Puppeteer 最佳实践 #

性能优化 #

浏览器启动优化 #

javascript
const puppeteer = require('puppeteer');

/**
 * Launches a headless browser with flags tuned for scraping workloads.
 *
 * Compared with a naive flag list this version:
 *  - lists `--disable-dev-shm-usage` only once,
 *  - forwards the V8 heap limit through `--js-flags` (a bare
 *    `--max-old-space-size` is a V8/Node option, not a Chromium switch),
 *  - disables image loading through the Blink setting instead of the
 *    non-existent `--disable-images` switch.
 *
 * @returns {Promise<object>} Resolves with the browser returned by `puppeteer.launch`.
 */
async function launchOptimized() {
  const browser = await puppeteer.launch({
    headless: 'new', // use the new headless mode
    args: [
      // Container compatibility. Disabling the sandbox is only acceptable
      // when every page that gets loaded is trusted.
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',

      // Turn off features a scraper does not need
      '--disable-gpu',
      '--disable-software-rasterizer',
      '--disable-extensions',
      '--disable-plugins',
      '--blink-settings=imagesEnabled=false', // skip image loading

      // Reduce memory use
      '--disable-accelerated-2d-canvas',
      '--disable-gl-drawing-for-tests',

      // Keep background tabs running at full speed
      '--disable-background-timer-throttling',
      '--disable-backgrounding-occluded-windows',
      '--disable-renderer-backgrounding',
      '--disable-features=IsolateOrigins,site-per-process',

      // Cap the V8 old-space heap of the renderer (in MB)
      '--js-flags=--max-old-space-size=4096'
    ]
  });

  return browser;
}

页面资源优化 #

javascript
/**
 * Enables request interception on `page`, drops heavy resource types and
 * known tracker hosts, and installs default timeouts.
 *
 * Tracker matching is done on the request's hostname (exact match or
 * subdomain), so a URL that merely mentions a blocked domain in its path or
 * query string is not aborted.
 *
 * @param {object} page - Puppeteer page to configure.
 * @returns {Promise<void>}
 */
async function optimizePage(page) {
  // Turn on request interception
  await page.setRequestInterception(true);

  // Resources that are not needed for data extraction
  const blockedTypes = new Set(['image', 'stylesheet', 'font', 'media']);
  const blockedDomains = [
    'google-analytics.com',
    'googletagmanager.com',
    'facebook.com',
    'doubleclick.net'
  ];

  // True when the URL's host is a blocked domain or one of its subdomains.
  const isBlockedHost = (url) => {
    let hostname;
    try {
      hostname = new URL(url).hostname;
    } catch {
      return false; // unparsable URL: let the browser deal with it
    }
    return blockedDomains.some(
      (domain) => hostname === domain || hostname.endsWith(`.${domain}`)
    );
  };

  // abort()/continue() return promises; report a rejection instead of
  // leaving it unhandled.
  const settle = (pending) =>
    Promise.resolve(pending).catch((error) => {
      console.warn('Request interception failed:', error.message);
    });

  page.on('request', (request) => {
    const shouldBlock =
      blockedTypes.has(request.resourceType()) || isBlockedHost(request.url());

    settle(shouldBlock ? request.abort() : request.continue());
  });

  // Reasonable default timeouts
  page.setDefaultTimeout(30000);
  page.setDefaultNavigationTimeout(60000);
}

内存管理 #

javascript
/**
 * Owns a single browser and caps the number of pages open at once.
 */
class BrowserManager {
  /**
   * @param {number} [maxPages=10] - Maximum number of simultaneously open pages.
   */
  constructor(maxPages = 10) {
    this.maxPages = maxPages;
    this.activePages = 0;
    this.browser = null;
  }

  /**
   * Launches the underlying browser.
   * @returns {Promise<void>}
   */
  async init() {
    this.browser = await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
  }

  /**
   * Opens a new page, counting it against `maxPages`.
   *
   * @returns {Promise<object>} The newly opened page.
   * @throws {Error} If `init()` has not been called or the page limit is reached.
   */
  async newPage() {
    if (!this.browser) {
      throw new Error('BrowserManager is not initialized; call init() first');
    }

    if (this.activePages >= this.maxPages) {
      throw new Error('Maximum page limit reached');
    }

    // Reserve the slot before awaiting so concurrent callers cannot all pass
    // the limit check and overshoot maxPages.
    this.activePages++;

    let page;
    try {
      page = await this.browser.newPage();
    } catch (error) {
      this.activePages--; // give the slot back when the page never opened
      throw error;
    }

    // Free the slot when the page closes
    page.on('close', () => {
      this.activePages--;
    });

    return page;
  }

  /**
   * Closes the browser if one was launched.
   * @returns {Promise<void>}
   */
  async close() {
    if (this.browser) {
      const browser = this.browser;
      // Clear the reference first so later newPage() calls fail clearly.
      this.browser = null;
      await browser.close();
    }
  }
}

稳定性提升 #

错误处理模式 #

javascript
/**
 * Runs `operation` and, when it fails, tries to save a screenshot and the
 * page HTML before rethrowing the original error.
 *
 * Capturing diagnostics is best effort: if the page is already closed or the
 * files cannot be written, that secondary failure is logged and the original
 * error is still the one that propagates.
 *
 * @param {object} page - Puppeteer page used to capture diagnostics.
 * @param {() => Promise<*>} operation - Work to run.
 * @returns {Promise<*>} Whatever `operation` resolves with.
 * @throws Rethrows the error raised by `operation`.
 */
async function safeOperation(page, operation) {
  try {
    return await operation();
  } catch (error) {
    // Log the failure
    console.error('Operation failed:', error?.message ?? error);

    const timestamp = Date.now();

    // Each capture step is isolated so one failing does not prevent the other.
    const capture = async (label, step) => {
      try {
        await step();
      } catch (captureError) {
        console.error(`Failed to save ${label}:`, captureError?.message ?? captureError);
      }
    };

    // Screenshot of the failing state
    await capture('screenshot', () =>
      page.screenshot({ path: `error-${timestamp}.png` })
    );

    // HTML of the failing state
    await capture('HTML snapshot', async () => {
      const html = await page.content();
      await require('fs').promises.writeFile(`error-${timestamp}.html`, html);
    });

    throw error;
  }
}

// Usage example: run a navigation followed by a click through safeOperation
const openAndClick = async () => {
  await page.goto('https://example.com');
  await page.click('#button');
};

await safeOperation(page, openAndClick);

自动重试机制 #

javascript
/**
 * Runs `operation`, retrying with exponential backoff when it throws.
 *
 * @param {() => Promise<*>} operation - Work to attempt.
 * @param {object} [options]
 * @param {number} [options.maxRetries=3] - Total number of attempts. Values
 *   below 1 (or non-numeric values) are treated as 1 so the operation always
 *   runs at least once.
 * @param {number} [options.delay=1000] - Wait before the first retry, in ms.
 * @param {number} [options.backoff=2] - Multiplier applied to the wait after each failure.
 * @param {?function(number, Error, number): void} [options.onRetry] - Called
 *   with (attempt, error, waitTime) before each wait.
 * @returns {Promise<*>} The first successful result of `operation`.
 * @throws The error from the last failed attempt.
 */
async function withRetry(operation, options = {}) {
  const {
    maxRetries = 3,
    delay = 1000,
    backoff = 2,
    onRetry = null
  } = options;

  // Guarantee one attempt; otherwise the loop is skipped and `undefined`
  // would be thrown.
  const attempts = Math.max(1, Math.floor(Number(maxRetries)) || 1);

  let lastError;

  for (let attempt = 1; attempt <= attempts; attempt++) {
    try {
      return await operation();
    } catch (error) {
      lastError = error;

      // No wait after the final attempt
      if (attempt < attempts) {
        const waitTime = delay * Math.pow(backoff, attempt - 1);

        if (onRetry) {
          onRetry(attempt, error, waitTime);
        }

        await new Promise(resolve => setTimeout(resolve, waitTime));
      }
    }
  }

  throw lastError;
}

// Usage example: load the page and read its title, with up to five attempts
const loadTitle = async () => {
  await page.goto('https://example.com');
  return await page.title();
};

const logRetry = (attempt, error, waitTime) => {
  console.log(`Retry ${attempt} after ${waitTime}ms: ${error.message}`);
};

const result = await withRetry(loadTitle, {
  maxRetries: 5,
  delay: 1000,
  onRetry: logRetry
});

页面状态检查 #

javascript
/**
 * Reports whether `page` is still usable.
 *
 * The check fails when the page is closed, its browser is disconnected, a
 * trivial in-page evaluation throws, or that evaluation does not finish
 * within `timeoutMs` (so a hung renderer cannot hang the health check).
 *
 * @param {object} page - Puppeteer page to probe.
 * @param {number} [timeoutMs=5000] - Maximum time to wait for the page to respond.
 * @returns {Promise<{healthy: boolean, reason?: string}>}
 */
async function checkPageHealth(page, timeoutMs = 5000) {
  let timer;

  try {
    // Is the page already closed?
    if (page.isClosed()) {
      return { healthy: false, reason: 'Page is closed' };
    }

    // Is the browser still connected?
    const browser = page.browser();
    if (!browser.isConnected()) {
      return { healthy: false, reason: 'Browser is disconnected' };
    }

    // Does the page still answer? Race the probe against a timer.
    const timeout = new Promise((_, reject) => {
      timer = setTimeout(
        () => reject(new Error(`Page did not respond within ${timeoutMs}ms`)),
        timeoutMs
      );
    });
    await Promise.race([page.evaluate(() => 1 + 1), timeout]);

    return { healthy: true };
  } catch (error) {
    return { healthy: false, reason: error.message };
  } finally {
    clearTimeout(timer);
  }
}

反爬虫策略 #

隐藏自动化特征 #

javascript
/**
 * Registers a script that runs before any page script on every new document
 * and overrides properties commonly used to detect automation.
 *
 * The `permissions.query` override keeps the original method bound to the
 * `Permissions` object; calling it detached throws "Illegal invocation" for
 * every non-notification query.
 *
 * @param {object} page - Puppeteer page to patch.
 * @returns {Promise<void>}
 */
async function stealthMode(page) {
  // Inject before the page loads. The callback is executed in the page, so it
  // must not reference anything from this Node.js scope.
  await page.evaluateOnNewDocument(() => {
    const nav = window.navigator;

    // Defines a read-only property on navigator that always yields `value`.
    const override = (name, value) => {
      Object.defineProperty(nav, name, { get: () => value });
    };

    // Hide the webdriver flag
    override('webdriver', false);

    // Provide a chrome object
    window.chrome = {
      runtime: {}
    };

    // Fake plugin list
    override('plugins', [
      {
        name: 'Chrome PDF Plugin',
        description: 'Portable Document Format',
        filename: 'internal-pdf-viewer',
        length: 1
      },
      {
        name: 'Chrome PDF Viewer',
        description: '',
        filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai',
        length: 1
      },
      {
        name: 'Native Client',
        description: '',
        filename: 'internal-nacl-plugin',
        length: 2
      }
    ]);

    // Languages, platform and hardware hints
    override('languages', ['zh-CN', 'zh', 'en-US', 'en']);
    override('platform', 'Win32');
    override('hardwareConcurrency', 8);
    override('deviceMemory', 8);

    // Make the notifications permission agree with Notification.permission.
    // The guard covers documents where the Permissions API is unavailable.
    const permissions = nav.permissions;
    if (permissions && typeof permissions.query === 'function') {
      const originalQuery = permissions.query.bind(permissions);
      permissions.query = (parameters) => (
        parameters?.name === 'notifications' ?
          Promise.resolve({ state: Notification.permission }) :
          originalQuery(parameters)
      );
    }
  });
}

模拟真实用户行为 #

javascript
/**
 * Performs a few randomized interactions: a mouse move, a scroll and a pause.
 *
 * The pause uses a plain timer rather than `page.waitForTimeout`, which is
 * deprecated and no longer available in newer Puppeteer releases.
 *
 * @param {object} page - Puppeteer page to interact with.
 * @returns {Promise<void>}
 */
async function humanLikeBehavior(page) {
  // Move the mouse to a random point in 20 intermediate steps
  await page.mouse.move(
    Math.random() * 1000,
    Math.random() * 800,
    { steps: 20 }
  );

  // Scroll down by a random amount (runs inside the page)
  await page.evaluate(() => {
    window.scrollBy(0, Math.random() * 500);
  });

  // Pause for a random 500-2500 ms
  const pauseMs = Math.random() * 2000 + 500;
  await new Promise((resolve) => setTimeout(resolve, pauseMs));
}

/**
 * Types `text` into the element matched by `selector` one character at a
 * time, with a random 50-150 ms delay passed to each keystroke.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} selector - Element to click before typing.
 * @param {string} text - Text to type.
 * @returns {Promise<void>}
 */
async function humanType(page, selector, text) {
  const randomKeyDelay = () => 50 + Math.random() * 100;

  // Click the target element first
  await page.click(selector);

  // Keystrokes are sent strictly one after another
  const characters = Array.from(text);
  for (let index = 0; index < characters.length; index += 1) {
    const options = { delay: randomKeyDelay() };
    await page.keyboard.type(characters[index], options);
  }
}

随机 User-Agent #

javascript
// Pool of desktop user-agent strings to choose from.
const userAgents = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15'
];

/**
 * Picks one entry of `userAgents` uniformly at random.
 *
 * @returns {string} A user-agent string from the pool.
 */
function getRandomUserAgent() {
  const position = Math.floor(Math.random() * userAgents.length);
  return userAgents[position];
}

// Usage: pick a user agent at random and apply it to the page
const chosenUserAgent = getRandomUserAgent();
await page.setUserAgent(chosenUserAgent);

代理轮换 #

javascript
/**
 * Hands out proxies from a fixed list in round-robin order.
 */
class ProxyRotator {
  /**
   * @param {Array<{host: string, port: number}>} proxies - Non-empty proxy list.
   * @throws {TypeError} If `proxies` is not a non-empty array. An empty list
   *   would otherwise yield `undefined` proxies and a NaN index.
   */
  constructor(proxies) {
    if (!Array.isArray(proxies) || proxies.length === 0) {
      throw new TypeError('ProxyRotator requires a non-empty array of proxies');
    }

    this.proxies = proxies;
    this.currentIndex = 0;
  }

  /**
   * Returns the next proxy and advances the cursor, wrapping at the end.
   * @returns {{host: string, port: number}}
   */
  getNext() {
    const proxy = this.proxies[this.currentIndex];
    this.currentIndex = (this.currentIndex + 1) % this.proxies.length;
    return proxy;
  }

  /**
   * Launches a browser routed through the next proxy.
   *
   * @param {object} [options={}] - Extra `puppeteer.launch` options. Any
   *   `args` given are kept and the proxy flag is appended after them.
   * @returns {Promise<object>} The launched browser.
   */
  async launchWithProxy(options = {}) {
    const proxy = this.getNext();

    return await puppeteer.launch({
      ...options,
      args: [
        ...(options.args ?? []),
        `--proxy-server=${proxy.host}:${proxy.port}`
      ]
    });
  }
}

// Usage example: rotate across three proxies that all listen on port 8080
const proxyHosts = ['proxy1.example.com', 'proxy2.example.com', 'proxy3.example.com'];
const proxyList = proxyHosts.map((host) => ({ host, port: 8080 }));

const rotator = new ProxyRotator(proxyList);

const browser = await rotator.launchWithProxy();

资源管理 #

连接池管理 #

javascript
/**
 * Pool of browsers that hands out pages and recycles idle browsers.
 */
class BrowserPool {
  /**
   * @param {object} [options]
   * @param {number} [options.maxBrowsers=5] - Browsers launched by `init()`.
   * @param {number} [options.maxPagesPerBrowser=10] - Page cap per browser.
   * @param {number} [options.idleTimeout=300000] - Idle time in ms before a browser is recycled.
   */
  constructor(options = {}) {
    this.maxBrowsers = options.maxBrowsers || 5;
    this.maxPagesPerBrowser = options.maxPagesPerBrowser || 10;
    this.idleTimeout = options.idleTimeout || 300000; // 5 minutes
    this.browsers = [];
    this.availablePages = [];
    this.cleanupTimer = null;
  }

  /**
   * Launches `maxBrowsers` browsers and starts the idle-cleanup timer.
   * @returns {Promise<void>}
   */
  async init() {
    for (let i = 0; i < this.maxBrowsers; i++) {
      const browser = await this.createBrowser();
      this.browsers.push({
        browser,
        pages: 0,
        lastUsed: Date.now(),
        recycling: false
      });
    }

    // Periodically recycle idle browsers
    this.startCleanup();
  }

  /**
   * Launches one browser.
   * @returns {Promise<object>}
   */
  async createBrowser() {
    return await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
  }

  /**
   * Opens a page on the first browser that has a free slot.
   *
   * @returns {Promise<{page: object, release: function(): Promise<void>}>}
   *   The page and a `release` function that closes it and frees the slot.
   *   `release` may be called more than once; only the first call has an effect.
   * @throws {Error} If every browser is at its page limit.
   */
  async getPage() {
    // Find a browser with a free slot that is not being recycled
    const browserInfo = this.browsers.find(
      b => !b.recycling && b.pages < this.maxPagesPerBrowser
    );

    if (!browserInfo) {
      throw new Error('No available browser slots');
    }

    // Reserve the slot before awaiting so concurrent callers cannot overshoot
    // the per-browser limit.
    browserInfo.pages++;
    browserInfo.lastUsed = Date.now();

    let page;
    try {
      page = await browserInfo.browser.newPage();
    } catch (error) {
      browserInfo.pages--; // the page never opened: give the slot back
      throw error;
    }

    let released = false;
    const releasePage = async () => {
      if (released) {
        return;
      }
      released = true;

      try {
        if (!page.isClosed()) {
          await page.close();
        }
      } finally {
        // Free the slot even when closing the page fails
        browserInfo.pages--;
        browserInfo.lastUsed = Date.now();
      }
    };

    return { page, release: releasePage };
  }

  /**
   * Starts the once-a-minute sweep that replaces browsers which have had no
   * pages for longer than `idleTimeout`. Calling it again while a sweep timer
   * exists does nothing.
   */
  startCleanup() {
    if (this.cleanupTimer) {
      return;
    }

    this.cleanupTimer = setInterval(async () => {
      const now = Date.now();

      for (const info of this.browsers) {
        const idle = info.pages === 0 && now - info.lastUsed > this.idleTimeout;
        if (info.recycling || !idle) {
          continue;
        }

        // Hide the entry from getPage() while its browser is being swapped
        info.recycling = true;
        try {
          await info.browser.close();
          info.browser = await this.createBrowser();
          info.lastUsed = Date.now();
        } catch (error) {
          console.error('Failed to recycle idle browser:', error.message);
        } finally {
          info.recycling = false;
        }
      }
    }, 60000);  // check once a minute

    // Do not let the sweep timer alone keep the process running
    this.cleanupTimer.unref?.();
  }

  /**
   * Stops the cleanup timer and closes every browser in the pool.
   * @returns {Promise<void>}
   */
  async close() {
    if (this.cleanupTimer) {
      clearInterval(this.cleanupTimer);
      this.cleanupTimer = null;
    }

    const entries = this.browsers;
    this.browsers = [];
    await Promise.all(entries.map(info => info.browser.close()));
  }
}

任务队列 #

javascript
/**
 * FIFO queue that runs async tasks with a cap on how many run at once.
 */
class TaskQueue {
  /**
   * @param {number} [concurrency=5] - Maximum number of tasks in flight.
   */
  constructor(concurrency = 5) {
    this.concurrency = concurrency;
    this.running = 0;
    this.queue = [];
  }

  /**
   * Enqueues `task` and kicks the queue.
   *
   * @param {() => Promise<*>} task - Function producing the work to run.
   * @returns {Promise<*>} Settles with the outcome of `task`.
   */
  async add(task) {
    return new Promise((resolve, reject) => {
      const job = { task, resolve, reject };
      this.queue.push(job);
      this.run();
    });
  }

  /**
   * Starts queued tasks while there is both pending work and spare capacity.
   * Each finished task triggers another pass.
   *
   * @returns {Promise<void>}
   */
  async run() {
    while (this.running < this.concurrency && this.queue.length > 0) {
      this.running += 1;
      const job = this.queue.shift();

      try {
        job.resolve(await job.task());
      } catch (failure) {
        job.reject(failure);
      } finally {
        this.running -= 1;
        this.run();
      }
    }
  }
}

// Usage example: fetch the title of every URL with at most five pages open.
// The page is closed in `finally` so a failed navigation or evaluation does
// not leave it open.
const queue = new TaskQueue(5);

const results = await Promise.all(urls.map(url => 
  queue.add(async () => {
    const page = await browser.newPage();
    try {
      await page.goto(url);
      return await page.evaluate(() => document.title);
    } finally {
      await page.close();
    }
  })
));

日志与监控 #

结构化日志 #

javascript
/**
 * Writes one JSON object per log call to standard output.
 */
class Logger {
  /**
   * @param {string} name - Logger name included in every entry.
   */
  constructor(name) {
    this.name = name;
  }

  /**
   * Emits a single log line.
   *
   * The core fields (`timestamp`, `level`, `name`, `message`) always win over
   * same-named keys in `data`, so a payload cannot overwrite the level or
   * message. If `data` cannot be serialized (for example it contains a
   * circular reference), the core fields are still logged together with a
   * `logError` field instead of throwing.
   *
   * @param {string} level - Severity label.
   * @param {string} message - Human-readable message.
   * @param {object} [data={}] - Extra fields merged into the entry.
   */
  log(level, message, data = {}) {
    const core = {
      timestamp: new Date().toISOString(),
      level,
      name: this.name,
      message
    };

    // Core keys come first in the output and are re-applied last so they
    // keep both their position and their value.
    const logEntry = { ...core, ...data, ...core };

    let line;
    try {
      line = JSON.stringify(logEntry);
    } catch (error) {
      line = JSON.stringify({
        ...core,
        logError: `Unserializable log data: ${error.message}`
      });
    }

    console.log(line);
  }

  /** Logs at INFO level. */
  info(message, data) {
    this.log('INFO', message, data);
  }

  /** Logs at ERROR level. */
  error(message, data) {
    this.log('ERROR', message, data);
  }

  /** Logs at WARN level. */
  warn(message, data) {
    this.log('WARN', message, data);
  }

  /** Logs at DEBUG level. */
  debug(message, data) {
    this.log('DEBUG', message, data);
  }
}

// Usage example: one logger, an informational entry and an error entry
const logger = new Logger('Crawler');

const loadedUrl = page.url();
logger.info('Page loaded', { url: loadedUrl });

const failureMessage = error.message;
logger.error('Navigation failed', { url, error: failureMessage });

性能监控 #

javascript
/**
 * Accumulates simple crawl counters and timing statistics.
 */
class PerformanceMonitor {
  constructor() {
    // All counters start at zero; durations are in milliseconds.
    this.metrics = {
      pagesVisited: 0,
      requests: 0,
      errors: 0,
      totalDuration: 0,
      avgDuration: 0
    };
  }

  /**
   * @returns {number} The current time in ms, to be passed to `endTimer`.
   */
  startTimer() {
    return Date.now();
  }

  /**
   * Records one finished page visit that began at `startTime`.
   *
   * @param {number} startTime - Value previously returned by `startTimer`.
   * @returns {number} Elapsed time in ms.
   */
  endTimer(startTime) {
    const elapsed = Date.now() - startTime;
    const stats = this.metrics;

    stats.totalDuration += elapsed;
    stats.pagesVisited += 1;
    stats.avgDuration = stats.totalDuration / stats.pagesVisited;

    return elapsed;
  }

  /** Counts one request. */
  recordRequest() {
    this.metrics.requests += 1;
  }

  /** Counts one error. */
  recordError() {
    this.metrics.errors += 1;
  }

  /**
   * @returns {object} A shallow copy of the current metrics.
   */
  getMetrics() {
    return Object.assign({}, this.metrics);
  }

  /** Prints the metrics to the console, one line per figure. */
  report() {
    const { pagesVisited, requests, errors, totalDuration } = this.metrics;

    console.log('Performance Report:');
    console.log(`  Pages visited: ${pagesVisited}`);
    console.log(`  Total requests: ${requests}`);
    console.log(`  Errors: ${errors}`);
    console.log(`  Total duration: ${totalDuration}ms`);
    console.log(`  Average duration: ${this.metrics.avgDuration.toFixed(2)}ms`);
  }
}

安全考虑 #

敏感数据处理 #

javascript
// Never hard-code secrets in source code.
// ❌ Wrong: a literal secret ends up in version control.
// (Left commented out: `password` can only be declared once per scope.)
// const password = 'my-password-123';

// ✅ Right (option 1): read the secret from an environment variable
const password = process.env.PASSWORD;

// ✅ Right (option 2): read it from a config file that is not committed to
// version control. Use this in place of option 1, not together with it.
// const config = require('./config.local.json');
// const password = config.password;

安全的浏览器配置 #

javascript
// Hardened launch configuration: Chromium's own protections stay enabled.
const browser = await puppeteer.launch({
  headless: 'new',
  args: [
    // Works around a small /dev/shm in containers
    '--disable-dev-shm-usage'

    // Deliberately NOT passed, because each one weakens browser security.
    // Add them only for a specific, understood need and only when every
    // page being loaded is trusted:
    //   '--no-sandbox', '--disable-setuid-sandbox'
    //       -> turns off the process sandbox
    //   '--disable-web-security'
    //       -> turns off the same-origin policy
    //   '--disable-features=IsolateOrigins,site-per-process'
    //       -> turns off site isolation
  ],
  ignoreHTTPSErrors: false,  // do not ignore HTTPS errors
  defaultViewport: null
});

完整示例 #

生产级爬虫 #

javascript
const puppeteer = require('puppeteer');
const fs = require('fs');

/**
 * Crawler that combines the browser pool, retries with backoff, structured
 * logging and metrics.
 */
class ProductionCrawler {
  /**
   * @param {object} [options]
   * @param {number} [options.concurrency=5] - Parallel crawl tasks (also the number of pooled browsers).
   * @param {number} [options.timeout=30000] - Page and navigation timeout in ms.
   * @param {number} [options.retries=3] - Attempts per URL.
   */
  constructor(options = {}) {
    this.concurrency = options.concurrency || 5;
    this.timeout = options.timeout || 30000;
    this.retries = options.retries || 3;
    this.logger = new Logger('Crawler');
    this.monitor = new PerformanceMonitor();
    this.browserPool = null;
  }

  /**
   * Creates and initializes the browser pool.
   * @returns {Promise<void>}
   */
  async init() {
    this.browserPool = new BrowserPool({
      maxBrowsers: this.concurrency,
      maxPagesPerBrowser: 5
    });
    await this.browserPool.init();
    this.logger.info('Crawler initialized');
  }

  /**
   * Crawls one URL, retrying with exponential backoff.
   *
   * The page is released before the backoff wait, so a failing URL does not
   * hold a pool slot while sleeping. A failure while releasing the page is
   * logged rather than allowed to replace the crawl result.
   *
   * @param {string} url - Address to crawl.
   * @returns {Promise<{success: true, data: object, duration: number} | {success: false, error: string}>}
   */
  async crawl(url) {
    const startTime = this.monitor.startTimer();
    const maxAttempts = Math.max(1, this.retries);
    let lastError;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      let pageHandle;

      try {
        pageHandle = await this.browserPool.getPage();
        const page = pageHandle.page;

        // Apply the stealth overrides
        await stealthMode(page);

        // Default timeout for page operations
        page.setDefaultTimeout(this.timeout);

        // Navigate
        await page.goto(url, {
          waitUntil: 'domcontentloaded',
          timeout: this.timeout
        });

        // Wait for the main content to appear
        await page.waitForSelector('.content', { timeout: 10000 });

        // Extract the data (runs inside the page)
        const data = await page.evaluate(() => {
          return {
            title: document.querySelector('h1')?.textContent,
            content: document.querySelector('.content')?.textContent,
            url: window.location.href
          };
        });

        // Record the success
        const duration = this.monitor.endTimer(startTime);
        this.logger.info('Page crawled successfully', { url, duration, attempt });

        return { success: true, data, duration };

      } catch (error) {
        lastError = error;
        this.monitor.recordError();
        this.logger.warn('Crawl attempt failed', { 
          url, 
          attempt, 
          error: error.message 
        });

      } finally {
        if (pageHandle) {
          try {
            await pageHandle.release();
          } catch (releaseError) {
            this.logger.warn('Failed to release page', {
              url,
              error: releaseError.message
            });
          }
        }
      }

      // Exponential backoff, only when another attempt follows
      if (attempt < maxAttempts) {
        await new Promise(r => setTimeout(r, 1000 * Math.pow(2, attempt)));
      }
    }

    this.logger.error('Crawl failed after all retries', { url });
    return { success: false, error: lastError.message };
  }

  /**
   * Crawls every URL with at most `concurrency` crawls in flight and prints
   * the metrics report afterwards.
   *
   * @param {string[]} urls - Addresses to crawl.
   * @returns {Promise<object[]>} One result per URL, in input order.
   */
  async crawlMultiple(urls) {
    const queue = new TaskQueue(this.concurrency);

    const results = await Promise.all(
      urls.map(url => queue.add(() => this.crawl(url)))
    );

    this.monitor.report();
    return results;
  }

  /**
   * Closes the browser pool if it was created.
   * @returns {Promise<void>}
   */
  async close() {
    if (this.browserPool) {
      await this.browserPool.close();
    }
    this.logger.info('Crawler closed');
  }
}

// Usage example
/**
 * Crawls a list of URLs and writes the results to `results.json`.
 *
 * The crawler is closed in `finally`, so its browsers are shut down even
 * when initialization, crawling or writing the results throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const crawler = new ProductionCrawler({
    concurrency: 5,
    timeout: 30000,
    retries: 3
  });

  try {
    await crawler.init();

    const urls = [
      'https://example1.com',
      'https://example2.com',
      // ... more URLs
    ];

    const results = await crawler.crawlMultiple(urls);

    // Save the results
    fs.writeFileSync('results.json', JSON.stringify(results, null, 2));
  } finally {
    await crawler.close();
  }
}

main().catch(console.error);

总结 #

最佳实践清单 #

text
性能优化
├── 使用新的无头模式 (headless: 'new')
├── 阻止不必要的资源加载
├── 合理设置超时时间
├── 及时关闭页面释放资源
└── 使用连接池管理浏览器实例

稳定性
├── 实现自动重试机制
├── 捕获并记录错误
├── 保存错误现场(截图、HTML)
├── 检查页面健康状态
└── 使用任务队列控制并发

反爬虫
├── 隐藏 webdriver 特征
├── 模拟真实用户行为
├── 随机化请求特征
├── 使用代理轮换
└── 合理的请求频率

资源管理
├── 限制并发数量
├── 实现连接池
├── 定期清理空闲资源
├── 监控内存使用
└── 优雅关闭

安全
├── 不硬编码敏感信息
├── 使用环境变量
├── 不忽略 HTTPS 错误
├── 限制权限
└── 审计依赖

恭喜你完成了 Puppeteer 的学习之旅!现在你已经掌握了从基础到高级的所有知识,可以在实际项目中高效使用 Puppeteer 了。

最后更新:2026-03-28