Puppeteer 最佳实践 #
性能优化 #
浏览器启动优化 #
javascript
const puppeteer = require('puppeteer');
/**
 * Launch a headless Chromium instance with a flag set tuned for scraping.
 *
 * Relies on a `puppeteer` binding being in scope in the enclosing file.
 *
 * @returns {Promise<object>} The browser returned by `puppeteer.launch`.
 */
async function launchOptimized() {
  const browser = await puppeteer.launch({
    headless: 'new', // use the new headless mode
    args: [
      // Disable features that are not needed for scraping.
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage', // listed once; the original repeated this flag
      '--disable-gpu',
      '--disable-software-rasterizer',
      '--disable-extensions',
      '--disable-plugins',
      // Skip image loading. `--disable-images` is not a Chromium switch, so the
      // Blink setting is used instead.
      '--blink-settings=imagesEnabled=false',
      // Reduce memory usage.
      '--disable-accelerated-2d-canvas',
      '--disable-gl-drawing-for-tests',
      // Keep background pages running at full speed.
      '--disable-background-timer-throttling',
      '--disable-backgrounding-occluded-windows',
      '--disable-renderer-backgrounding',
      '--disable-features=IsolateOrigins,site-per-process',
      // Heap limit: `--max-old-space-size` is a V8 option, so it has to be
      // forwarded through `--js-flags` for Chromium to apply it.
      '--js-flags=--max-old-space-size=4096',
    ],
  });
  return browser;
}
页面资源优化 #
javascript
/**
 * Enable request interception on a page, block heavy resource types and
 * known tracking hosts, and set default timeouts.
 *
 * @param {object} page - Puppeteer page to configure.
 * @returns {Promise<void>}
 */
async function optimizePage(page) {
  // Interception must be enabled before requests can be aborted/continued.
  await page.setRequestInterception(true);

  const blockedTypes = new Set(['image', 'stylesheet', 'font', 'media']);
  const blockedDomains = [
    'google-analytics.com',
    'googletagmanager.com',
    'facebook.com',
    'doubleclick.net',
  ];

  // Match on the parsed hostname (exact or sub-domain) rather than on a
  // substring of the full URL, so that e.g. `?share=facebook.com` or
  // `notfacebook.com` are not blocked by accident.
  const isBlockedHost = (url) => {
    let hostname;
    try {
      hostname = new URL(url).hostname;
    } catch {
      return false; // unparsable URL: let the browser deal with it
    }
    return blockedDomains.some(
      (domain) => hostname === domain || hostname.endsWith(`.${domain}`),
    );
  };

  page.on('request', (request) => {
    // Another listener may already have resolved this request.
    if (request.isInterceptResolutionHandled?.()) {
      return;
    }

    const shouldBlock =
      blockedTypes.has(request.resourceType()) || isBlockedHost(request.url());
    const settled = shouldBlock ? request.abort() : request.continue();

    // abort()/continue() return promises that reject when the request was
    // already handled or the page went away. Blocking is best-effort, so the
    // rejection is deliberately ignored instead of becoming an unhandled one.
    Promise.resolve(settled).catch(() => {});
  });

  // Reasonable defaults: 30s for general waits, 60s for navigation.
  page.setDefaultTimeout(30000);
  page.setDefaultNavigationTimeout(60000);
}
内存管理 #
javascript
/**
 * Owns a single browser and caps the number of simultaneously open pages.
 *
 * Relies on a `puppeteer` binding being in scope in the enclosing file.
 */
class BrowserManager {
  /**
   * @param {number} [maxPages=10] - Maximum number of pages open at once.
   */
  constructor(maxPages = 10) {
    this.maxPages = maxPages;
    this.activePages = 0;
    this.browser = null;
  }

  /**
   * Launch the managed browser. Must be called before `newPage()`.
   * @returns {Promise<void>}
   */
  async init() {
    this.browser = await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
  }

  /**
   * Open a new page, counting it against the limit until it is closed.
   *
   * @returns {Promise<object>} The new page.
   * @throws {Error} If `init()` has not been called or the limit is reached.
   */
  async newPage() {
    if (!this.browser) {
      throw new Error('BrowserManager is not initialized; call init() first');
    }
    if (this.activePages >= this.maxPages) {
      throw new Error('Maximum page limit reached');
    }

    // Reserve the slot BEFORE awaiting. Incrementing after the await would let
    // several concurrent callers all pass the check above and exceed maxPages.
    this.activePages++;
    let page;
    try {
      page = await this.browser.newPage();
    } catch (error) {
      this.activePages--; // give the reserved slot back
      throw error;
    }

    // Free the slot when the page closes.
    page.on('close', () => {
      this.activePages--;
    });
    return page;
  }

  /**
   * Close the managed browser if one was launched.
   * @returns {Promise<void>}
   */
  async close() {
    if (this.browser) {
      await this.browser.close();
    }
  }
}
稳定性提升 #
错误处理模式 #
javascript
/**
 * Run `operation`; on failure capture a screenshot and the page HTML for
 * post-mortem debugging, then rethrow the ORIGINAL error.
 *
 * @param {object} page - Puppeteer page used to capture diagnostics.
 * @param {() => Promise<*>} operation - The work to run.
 * @returns {Promise<*>} Whatever `operation` resolves to.
 * @throws Whatever `operation` threw; diagnostic failures never replace it.
 */
async function safeOperation(page, operation) {
  try {
    return await operation();
  } catch (error) {
    console.error('Operation failed:', error?.message ?? error);

    const timestamp = Date.now();

    // Diagnostics are best-effort. The page may already be closed or crashed
    // (often the very reason the operation failed), and a failure here must
    // not mask the error the caller actually needs to see.
    try {
      await page.screenshot({
        path: `error-${timestamp}.png`,
      });
    } catch (screenshotError) {
      console.error('Failed to capture error screenshot:', screenshotError.message);
    }

    try {
      const html = await page.content();
      require('fs').writeFileSync(`error-${timestamp}.html`, html);
    } catch (dumpError) {
      console.error('Failed to save error HTML:', dumpError.message);
    }

    throw error;
  }
}
// Usage example: if goto/click throws, safeOperation saves error-<timestamp>.png/.html and rethrows
await safeOperation(page, async () => {
await page.goto('https://example.com');
await page.click('#button');
});
自动重试机制 #
javascript
/**
 * Run `operation`, retrying with exponential backoff when it throws.
 *
 * @param {() => Promise<*>} operation - The work to attempt.
 * @param {object} [options]
 * @param {number} [options.maxRetries=3] - Total number of attempts. Values
 *   below 1 (or non-numeric) are treated as 1 so the operation always runs.
 * @param {number} [options.delay=1000] - Wait before the first retry, in ms.
 * @param {number} [options.backoff=2] - Multiplier applied to the wait per retry.
 * @param {?function(number, Error, number): void} [options.onRetry] - Called
 *   with (attempt, error, waitTime) before each wait.
 * @returns {Promise<*>} The first successful result of `operation`.
 * @throws The error from the final failed attempt.
 */
async function withRetry(operation, options = {}) {
  const {
    maxRetries = 3,
    delay = 1000,
    backoff = 2,
    onRetry = null,
  } = options;

  // With maxRetries <= 0 the loop used to be skipped entirely, so the function
  // threw `undefined` without ever calling the operation.
  const attempts = Math.max(1, Math.floor(Number(maxRetries)) || 1);

  let lastError;
  for (let attempt = 1; attempt <= attempts; attempt++) {
    try {
      return await operation();
    } catch (error) {
      lastError = error;
      if (attempt === attempts) {
        break; // out of attempts: no point waiting again
      }
      const waitTime = delay * Math.pow(backoff, attempt - 1);
      if (onRetry) {
        onRetry(attempt, error, waitTime);
      }
      await new Promise((resolve) => setTimeout(resolve, waitTime));
    }
  }
  throw lastError;
}
// Usage example: up to 5 attempts; waits of 1s, 2s, 4s and 8s between them (default backoff factor is 2)
const result = await withRetry(
async () => {
await page.goto('https://example.com');
return await page.title();
},
{
maxRetries: 5,
delay: 1000,
onRetry: (attempt, error, waitTime) => {
console.log(`Retry ${attempt} after ${waitTime}ms: ${error.message}`);
}
}
);
页面状态检查 #
javascript
/**
 * Check whether a page is still usable.
 *
 * @param {object} page - Puppeteer page to probe.
 * @param {number} [timeoutMs=5000] - How long to wait for the page to answer
 *   a trivial evaluate() before declaring it unresponsive.
 * @returns {Promise<{healthy: boolean, reason?: string}>} Never rejects; any
 *   failure is reported through `reason`.
 */
async function checkPageHealth(page, timeoutMs = 5000) {
  let timer;
  try {
    // Is the page already closed?
    if (page.isClosed()) {
      return { healthy: false, reason: 'Page is closed' };
    }

    // Is the browser connection still alive?
    const browser = page.browser();
    if (!browser.isConnected()) {
      return { healthy: false, reason: 'Browser is disconnected' };
    }

    // Does the page respond? A hung renderer never settles evaluate(), so the
    // probe is raced against a timer; otherwise the health check itself hangs.
    const unresponsive = new Promise((_, reject) => {
      timer = setTimeout(
        () => reject(new Error(`Page did not respond within ${timeoutMs}ms`)),
        timeoutMs,
      );
    });
    await Promise.race([page.evaluate(() => 1 + 1), unresponsive]);

    return { healthy: true };
  } catch (error) {
    return { healthy: false, reason: error.message };
  } finally {
    // Always clear the timer so it cannot keep the process alive.
    clearTimeout(timer);
  }
}
反爬虫策略 #
隐藏自动化特征 #
javascript
/**
 * Register a script that runs before every document loads and masks common
 * automation fingerprints.
 *
 * The callback runs inside the browser, so it must be self-contained: it
 * cannot reference variables from this Node.js scope.
 *
 * @param {object} page - Puppeteer page to patch.
 * @returns {Promise<void>}
 */
async function stealthMode(page) {
  await page.evaluateOnNewDocument(() => {
    // Hide the webdriver flag.
    Object.defineProperty(navigator, 'webdriver', {
      get: () => false,
    });

    // Provide window.chrome, keeping any real properties that already exist
    // rather than replacing the whole object with a stub.
    window.chrome = { runtime: {}, ...window.chrome };

    // Fake plugin list.
    Object.defineProperty(navigator, 'plugins', {
      get: () => [
        {
          name: 'Chrome PDF Plugin',
          description: 'Portable Document Format',
          filename: 'internal-pdf-viewer',
          length: 1,
        },
        {
          name: 'Chrome PDF Viewer',
          description: '',
          filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai',
          length: 1,
        },
        {
          name: 'Native Client',
          description: '',
          filename: 'internal-nacl-plugin',
          length: 2,
        },
      ],
    });

    // Languages.
    Object.defineProperty(navigator, 'languages', {
      get: () => ['zh-CN', 'zh', 'en-US', 'en'],
    });

    // Platform. NOTE(review): this should agree with the User-Agent in use;
    // a macOS User-Agent combined with 'Win32' is itself a detectable mismatch.
    Object.defineProperty(navigator, 'platform', {
      get: () => 'Win32',
    });

    // Hardware concurrency.
    Object.defineProperty(navigator, 'hardwareConcurrency', {
      get: () => 8,
    });

    // Device memory.
    Object.defineProperty(navigator, 'deviceMemory', {
      get: () => 8,
    });

    // Permissions: report the real Notification permission and delegate
    // everything else to the native implementation.
    const permissions = window.navigator.permissions;
    if (permissions && typeof permissions.query === 'function') {
      const originalQuery = permissions.query;
      permissions.query = (parameters) => (
        parameters.name === 'notifications'
          ? Promise.resolve({ state: Notification.permission })
          // The native method must be invoked with its receiver; calling the
          // detached function throws "Illegal invocation".
          : originalQuery.call(permissions, parameters)
      );
    }
  });
}
模拟真实用户行为 #
javascript
/**
 * Perform a few randomized actions (mouse move, scroll, pause) so the session
 * looks less mechanical.
 *
 * @param {object} page - Puppeteer page to act on.
 * @returns {Promise<void>}
 */
async function humanLikeBehavior(page) {
  // Move the mouse to a random point in 20 intermediate steps.
  await page.mouse.move(
    Math.random() * 1000,
    Math.random() * 800,
    { steps: 20 },
  );

  // Scroll down by a random amount (runs in the page context).
  await page.evaluate(() => {
    window.scrollBy(0, Math.random() * 500);
  });

  // Pause for 500-2500 ms. `page.waitForTimeout()` was removed in Puppeteer
  // v22, so use the same timer idiom as the rest of this file.
  const pauseMs = Math.random() * 2000 + 500;
  await new Promise((resolve) => setTimeout(resolve, pauseMs));
}
/**
 * Focus an element by clicking it, then type the text one character at a
 * time, passing a random 50-150 ms delay option for each character.
 *
 * @param {object} page - Puppeteer page to type into.
 * @param {string} selector - Selector of the element to click first.
 * @param {string} text - Text to enter.
 * @returns {Promise<void>}
 */
async function humanType(page, selector, text) {
  const keystrokePause = () => Math.random() * 100 + 50;

  await page.click(selector);

  const glyphs = [...text];
  for (let index = 0; index < glyphs.length; index += 1) {
    await page.keyboard.type(glyphs[index], { delay: keystrokePause() });
  }
}
随机 User-Agent #
javascript
// Pool of desktop User-Agent strings to choose from.
const userAgents = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
];

/**
 * Pick one entry of `userAgents` uniformly at random.
 *
 * @returns {string} A User-Agent string from the pool.
 */
function getRandomUserAgent() {
  const slot = Math.floor(Math.random() * userAgents.length);
  return userAgents[slot];
}
// Usage: apply a randomly chosen User-Agent from the pool to the page
await page.setUserAgent(getRandomUserAgent());
代理轮换 #
javascript
/**
 * Hands out proxies in round-robin order and launches browsers through them.
 *
 * Relies on a `puppeteer` binding being in scope in the enclosing file.
 */
class ProxyRotator {
  /**
   * @param {Array<{host: string, port: number}>} proxies - Proxies to rotate through.
   */
  constructor(proxies) {
    this.proxies = proxies;
    this.currentIndex = 0;
  }

  /**
   * Return the next proxy and advance the cursor, wrapping at the end.
   *
   * @returns {{host: string, port: number}} The next proxy.
   * @throws {Error} If no proxies are configured. Previously this returned
   *   `undefined`, left the index as NaN, and failed later with an unrelated
   *   TypeError.
   */
  getNext() {
    if (!Array.isArray(this.proxies) || this.proxies.length === 0) {
      throw new Error('ProxyRotator has no proxies configured');
    }
    // The modulo on read keeps the index valid even if the list shrank.
    const proxy = this.proxies[this.currentIndex % this.proxies.length];
    this.currentIndex = (this.currentIndex + 1) % this.proxies.length;
    return proxy;
  }

  /**
   * Launch a browser that routes traffic through the next proxy.
   *
   * @param {object} [launchOptions={}] - Extra `puppeteer.launch` options.
   *   Any `args` given are kept and the proxy flag is appended to them.
   * @returns {Promise<object>} The launched browser.
   */
  async launchWithProxy(launchOptions = {}) {
    const proxy = this.getNext();
    const { args = [], ...rest } = launchOptions;
    return await puppeteer.launch({
      ...rest,
      args: [
        ...args,
        `--proxy-server=${proxy.host}:${proxy.port}`,
      ],
    });
  }
}
// Usage example: each launchWithProxy() call uses the next proxy in the list, wrapping around at the end
const rotator = new ProxyRotator([
{ host: 'proxy1.example.com', port: 8080 },
{ host: 'proxy2.example.com', port: 8080 },
{ host: 'proxy3.example.com', port: 8080 }
]);
const browser = await rotator.launchWithProxy();
资源管理 #
连接池管理 #
javascript
/**
 * A fixed-size pool of browsers that hands out pages and periodically
 * restarts browsers that have been idle for too long.
 *
 * Relies on a `puppeteer` binding being in scope in the enclosing file.
 */
class BrowserPool {
  /**
   * @param {object} [options]
   * @param {number} [options.maxBrowsers=5] - Browsers launched by `init()`.
   * @param {number} [options.maxPagesPerBrowser=10] - Page cap per browser.
   * @param {number} [options.idleTimeout=300000] - Idle time (ms) after which
   *   an unused browser is restarted.
   */
  constructor(options = {}) {
    this.maxBrowsers = options.maxBrowsers || 5;
    this.maxPagesPerBrowser = options.maxPagesPerBrowser || 10;
    this.idleTimeout = options.idleTimeout || 300000; // 5 minutes
    this.browsers = [];
    this.availablePages = []; // unused by this class; kept for compatibility
    this.cleanupTimer = null;
  }

  /**
   * Launch all browsers and start the periodic idle cleanup.
   * @returns {Promise<void>}
   */
  async init() {
    for (let i = 0; i < this.maxBrowsers; i++) {
      const browser = await this.createBrowser();
      this.browsers.push({
        browser,
        pages: 0,
        lastUsed: Date.now(),
        recycling: false,
      });
    }
    this.startCleanup();
  }

  /**
   * Launch one browser with the pool's standard options.
   * @returns {Promise<object>} The launched browser.
   */
  async createBrowser() {
    return await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
  }

  /**
   * Open a page on the first browser that has a free slot.
   *
   * @returns {Promise<{page: object, release: function(): Promise<void>}>}
   *   The page plus a `release` function that closes it and frees the slot.
   *   `release` is safe to call more than once.
   * @throws {Error} If every browser is full or being restarted.
   */
  async getPage() {
    const browserInfo = this.browsers.find(
      (b) => !b.recycling && b.pages < this.maxPagesPerBrowser,
    );
    if (!browserInfo) {
      throw new Error('No available browser slots');
    }

    // Count the slot BEFORE awaiting so concurrent callers cannot all pass the
    // capacity check and so the cleanup pass never sees this browser as idle.
    browserInfo.pages++;
    browserInfo.lastUsed = Date.now();

    let page;
    try {
      page = await browserInfo.browser.newPage();
    } catch (error) {
      browserInfo.pages--; // give the reserved slot back
      throw error;
    }

    let released = false;
    const releasePage = async () => {
      if (released) {
        return; // a second release must not decrement the counter again
      }
      released = true;
      try {
        if (!page.isClosed()) {
          await page.close();
        }
      } finally {
        // Free the slot even when close() fails, otherwise it leaks forever.
        browserInfo.pages--;
        browserInfo.lastUsed = Date.now();
      }
    };

    return { page, release: releasePage };
  }

  /**
   * Start the once-a-minute idle check. Calling it again while a timer is
   * active does nothing.
   */
  startCleanup() {
    if (this.cleanupTimer) {
      return;
    }
    this.cleanupTimer = setInterval(() => {
      this.recycleIdleBrowsers().catch((error) => {
        console.error('Browser pool cleanup failed:', error.message);
      });
    }, 60000); // check once per minute
  }

  /**
   * Restart every browser that has no open pages and has been idle longer
   * than `idleTimeout`.
   * @returns {Promise<void>}
   */
  async recycleIdleBrowsers() {
    const now = Date.now();
    for (const info of this.browsers) {
      const isIdle = info.pages === 0 && now - info.lastUsed > this.idleTimeout;
      if (info.recycling || !isIdle) {
        continue;
      }
      // Hide the entry from getPage() while it is closed and relaunched.
      info.recycling = true;
      try {
        await info.browser.close();
        info.browser = await this.createBrowser();
        info.lastUsed = Date.now();
      } finally {
        info.recycling = false;
      }
    }
  }

  /**
   * Stop the cleanup timer and close every browser.
   * @returns {Promise<void>}
   */
  async close() {
    // Without this the interval keeps the Node.js process alive indefinitely.
    if (this.cleanupTimer) {
      clearInterval(this.cleanupTimer);
      this.cleanupTimer = null;
    }
    await Promise.all(this.browsers.map((info) => info.browser.close()));
    this.browsers = [];
  }
}
任务队列 #
javascript
/**
 * Runs async tasks in FIFO order with at most `concurrency` running at once.
 */
class TaskQueue {
  /**
   * @param {number} [concurrency=5] - Maximum number of tasks in flight.
   * @throws {RangeError} If `concurrency` is not greater than zero. Such a
   *   value used to be accepted and made every added task hang forever.
   */
  constructor(concurrency = 5) {
    if (!(concurrency > 0)) {
      throw new RangeError(`concurrency must be greater than 0, got ${concurrency}`);
    }
    this.concurrency = concurrency;
    this.running = 0;
    this.queue = [];
  }

  /**
   * Queue a task.
   *
   * @param {() => Promise<*>} task - Function that performs the work.
   * @returns {Promise<*>} Settles with the task's own result or error.
   */
  add(task) {
    return new Promise((resolve, reject) => {
      this.queue.push({ task, resolve, reject });
      this.run();
    });
  }

  /**
   * Start queued tasks until the queue is empty or the limit is reached.
   * Called on every add() and whenever a task finishes.
   */
  run() {
    while (this.queue.length > 0 && this.running < this.concurrency) {
      this.running++;
      // Deliberately not awaited: execute() reports the outcome through the
      // job's own resolve/reject and never rejects itself.
      void this.execute(this.queue.shift());
    }
  }

  /**
   * Run one job, settle its promise, then free the slot and refill.
   *
   * @param {{task: function, resolve: function, reject: function}} job
   * @returns {Promise<void>}
   */
  async execute({ task, resolve, reject }) {
    try {
      resolve(await task());
    } catch (error) {
      reject(error);
    } finally {
      this.running--;
      this.run();
    }
  }
}
// Usage example: fetch the title of every URL, at most 5 pages at a time
const queue = new TaskQueue(5);
const results = await Promise.all(urls.map((url) =>
  queue.add(async () => {
    const page = await browser.newPage();
    try {
      await page.goto(url);
      return await page.evaluate(() => document.title);
    } finally {
      // Close the tab even when navigation or evaluation throws; otherwise
      // every failed URL leaves an open page behind.
      await page.close();
    }
  })
));
日志与监控 #
结构化日志 #
javascript
/**
 * Minimal structured logger: each call prints one JSON object via
 * console.log.
 */
class Logger {
  /**
   * @param {string} name - Logger name included in every entry.
   */
  constructor(name) {
    this.name = name;
  }

  /**
   * Print one entry. Keys in `data` are spread last, so they are appended to
   * (and can override) the standard fields.
   *
   * @param {string} level - Severity label.
   * @param {string} message - Log message.
   * @param {object} [data={}] - Extra fields merged into the entry.
   */
  log(level, message, data = {}) {
    const timestamp = new Date().toISOString();
    const record = {
      timestamp,
      level,
      name: this.name,
      message,
      ...data,
    };
    const line = JSON.stringify(record);
    console.log(line);
  }

  /** Log at INFO level. */
  info(message, data) {
    this.log('INFO', message, data);
  }

  /** Log at ERROR level. */
  error(message, data) {
    this.log('ERROR', message, data);
  }

  /** Log at WARN level. */
  warn(message, data) {
    this.log('WARN', message, data);
  }

  /** Log at DEBUG level. */
  debug(message, data) {
    this.log('DEBUG', message, data);
  }
}
// Usage example: each call prints one JSON line via console.log
const logger = new Logger('Crawler');
logger.info('Page loaded', { url: page.url() });
logger.error('Navigation failed', { url, error: error.message });
性能监控 #
javascript
/**
 * Collects simple crawl counters and timing totals.
 */
class PerformanceMonitor {
  constructor() {
    this.metrics = {
      pagesVisited: 0,
      requests: 0,
      errors: 0,
      totalDuration: 0,
      avgDuration: 0,
    };
  }

  /**
   * @returns {number} The current time in ms, to pass to `endTimer` later.
   */
  startTimer() {
    return Date.now();
  }

  /**
   * Record one finished page visit and refresh the running average.
   *
   * @param {number} startTime - Value previously returned by `startTimer`.
   * @returns {number} Elapsed milliseconds for this visit.
   */
  endTimer(startTime) {
    const elapsed = Date.now() - startTime;
    const stats = this.metrics;
    stats.totalDuration += elapsed;
    stats.pagesVisited += 1;
    stats.avgDuration = stats.totalDuration / stats.pagesVisited;
    return elapsed;
  }

  /** Count one request. */
  recordRequest() {
    this.metrics.requests += 1;
  }

  /** Count one error. */
  recordError() {
    this.metrics.errors += 1;
  }

  /**
   * @returns {object} A shallow copy of the current metrics.
   */
  getMetrics() {
    return Object.assign({}, this.metrics);
  }

  /** Print a human-readable summary with console.log. */
  report() {
    const { pagesVisited, requests, errors, totalDuration, avgDuration } = this.metrics;
    console.log('Performance Report:');
    console.log(` Pages visited: ${pagesVisited}`);
    console.log(` Total requests: ${requests}`);
    console.log(` Errors: ${errors}`);
    console.log(` Total duration: ${totalDuration}ms`);
    console.log(` Average duration: ${avgDuration.toFixed(2)}ms`);
  }
}
安全考虑 #
敏感数据处理 #
javascript
// Never hard-code secrets in source code.
// NOTE: the three snippets below are alternatives shown side by side; use only one,
// since declaring `const password` more than once in the same scope is a SyntaxError.
// ❌ Wrong: secret embedded in the source
const password = 'my-password-123';
// ✅ Right: read it from an environment variable
const password = process.env.PASSWORD;
// ✅ Right: read it from a config file that is not committed to version control
const config = require('./config.local.json');
const password = config.password;
安全的浏览器配置 #
javascript
// Security-oriented launch configuration.
//
// Deliberately NOT passed here, because each one removes a browser security
// boundary and contradicts the purpose of this example:
//   --no-sandbox / --disable-setuid-sandbox   turn off the renderer sandbox
//   --disable-web-security                    turns off the same-origin policy
//   --disable-features=IsolateOrigins,site-per-process   turns off site isolation
// Add one only for a specific, understood reason, and never when visiting
// untrusted pages.
const browser = await puppeteer.launch({
  headless: 'new',
  args: [
    // Stability only: use /tmp instead of /dev/shm, which is small in many containers.
    '--disable-dev-shm-usage',
  ],
  ignoreHTTPSErrors: false, // do not ignore HTTPS certificate errors
  defaultViewport: null,
});
完整示例 #
生产级爬虫 #
javascript
const puppeteer = require('puppeteer');
const fs = require('fs');
/**
 * Crawler that combines a browser pool, retries with exponential backoff,
 * structured logging and basic metrics.
 *
 * Depends on `Logger`, `PerformanceMonitor`, `BrowserPool`, `TaskQueue` and
 * `stealthMode` being in scope in the enclosing file.
 */
class ProductionCrawler {
  /**
   * @param {object} [options]
   * @param {number} [options.concurrency=5] - Parallel crawls / pool size.
   * @param {number} [options.timeout=30000] - Page and navigation timeout, ms.
   * @param {number} [options.retries=3] - Attempts per URL.
   */
  constructor(options = {}) {
    this.concurrency = options.concurrency || 5;
    this.timeout = options.timeout || 30000;
    this.retries = options.retries || 3;
    this.logger = new Logger('Crawler');
    this.monitor = new PerformanceMonitor();
    this.browserPool = null;
  }

  /**
   * Create and start the browser pool. Call before `crawl`.
   * @returns {Promise<void>}
   */
  async init() {
    this.browserPool = new BrowserPool({
      maxBrowsers: this.concurrency,
      maxPagesPerBrowser: 5,
    });
    await this.browserPool.init();
    this.logger.info('Crawler initialized');
  }

  /**
   * Crawl one URL, retrying on failure.
   *
   * @param {string} url - Page to crawl.
   * @returns {Promise<{success: true, data: object, duration: number}
   *   | {success: false, error: string}>} Always resolves with a result
   *   object; crawl errors are reported through `success: false`.
   */
  async crawl(url) {
    const startTime = this.monitor.startTimer();
    let lastError = null;

    for (let attempt = 1; attempt <= this.retries; attempt++) {
      let pageHandle;
      try {
        pageHandle = await this.browserPool.getPage();
        const page = pageHandle.page;

        // Mask automation fingerprints before any document loads.
        await stealthMode(page);

        page.setDefaultTimeout(this.timeout);

        await page.goto(url, {
          waitUntil: 'domcontentloaded',
          timeout: this.timeout,
        });

        // Wait for the main content to be present.
        await page.waitForSelector('.content', { timeout: 10000 });

        const data = await page.evaluate(() => ({
          title: document.querySelector('h1')?.textContent,
          content: document.querySelector('.content')?.textContent,
          url: window.location.href,
        }));

        const duration = this.monitor.endTimer(startTime);
        this.logger.info('Page crawled successfully', { url, duration, attempt });
        return { success: true, data, duration };
      } catch (error) {
        lastError = error;
        this.monitor.recordError();
        this.logger.warn('Crawl attempt failed', {
          url,
          attempt,
          error: error.message,
        });
      } finally {
        // Runs on success and failure alike, and BEFORE the backoff below, so
        // the pool slot is not held while this task is merely sleeping.
        await this.releasePage(pageHandle, url);
      }

      if (attempt < this.retries) {
        // Exponential backoff: 2s, 4s, 8s, ...
        await new Promise((resolve) => setTimeout(resolve, 1000 * Math.pow(2, attempt)));
      }
    }

    this.logger.error('Crawl failed after all retries', { url });
    return {
      success: false,
      error: lastError ? lastError.message : 'No crawl attempts were made',
    };
  }

  /**
   * Release a pooled page without letting a release failure replace the
   * crawl result.
   *
   * @param {?{release: function(): Promise<void>}} pageHandle - Handle from
   *   the pool, or undefined when no page was obtained.
   * @param {string} url - URL being crawled, for log context.
   * @returns {Promise<void>}
   */
  async releasePage(pageHandle, url) {
    if (!pageHandle) {
      return;
    }
    try {
      await pageHandle.release();
    } catch (error) {
      this.logger.warn('Failed to release page', { url, error: error.message });
    }
  }

  /**
   * Crawl many URLs with bounded concurrency and print a metrics report.
   *
   * @param {string[]} urls - Pages to crawl.
   * @returns {Promise<object[]>} One result per URL, in input order.
   */
  async crawlMultiple(urls) {
    const queue = new TaskQueue(this.concurrency);
    const results = await Promise.all(
      urls.map((url) => queue.add(() => this.crawl(url))),
    );
    this.monitor.report();
    return results;
  }

  /**
   * Shut down the browser pool. Safe to call even if `init()` never ran.
   * @returns {Promise<void>}
   */
  async close() {
    if (this.browserPool) {
      await this.browserPool.close();
    }
    this.logger.info('Crawler closed');
  }
}
// Usage example
/**
 * Crawl a fixed list of URLs and write the results to results.json.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const crawler = new ProductionCrawler({
    concurrency: 5,
    timeout: 30000,
    retries: 3,
  });

  try {
    await crawler.init();

    const urls = [
      'https://example1.com',
      'https://example2.com',
      // ... more URLs
    ];

    const results = await crawler.crawlMultiple(urls);

    // Save the results.
    fs.writeFileSync('results.json', JSON.stringify(results, null, 2));
  } finally {
    // Always shut the browsers down, including when init, crawling or writing
    // the file throws; otherwise Chromium processes are left running.
    await crawler.close();
  }
}
// Run the example; a rejection from main() is printed with console.error
main().catch(console.error);
总结 #
最佳实践清单 #
text
性能优化
├── 使用新的无头模式(Puppeteer v22 之前写作 headless: 'new';v22 起 headless: true 即为新模式)
├── 阻止不必要的资源加载
├── 合理设置超时时间
├── 及时关闭页面释放资源
└── 使用连接池管理浏览器实例
稳定性
├── 实现自动重试机制
├── 捕获并记录错误
├── 保存错误现场(截图、HTML)
├── 检查页面健康状态
└── 使用任务队列控制并发
反爬虫
├── 隐藏 webdriver 特征
├── 模拟真实用户行为
├── 随机化请求特征
├── 使用代理轮换
└── 合理的请求频率
资源管理
├── 限制并发数量
├── 实现连接池
├── 定期清理空闲资源
├── 监控内存使用
└── 优雅关闭
安全
├── 不硬编码敏感信息
├── 使用环境变量
├── 不忽略 HTTPS 错误
├── 限制权限
└── 审计依赖
恭喜你完成了 Puppeteer 的学习之旅!现在你已经掌握了从基础到高级的所有知识,可以在实际项目中高效使用 Puppeteer 了。
最后更新:2026-03-28