Puppeteer 高级功能 #

多页面管理 #

创建和管理页面 #

javascript
const puppeteer = require('puppeteer');

/**
 * Opens three pages in a single browser, navigates them in parallel, and
 * logs how many pages the browser reports.
 *
 * The browser is closed in a `finally` block so that a failed navigation
 * does not leave the browser process running.
 *
 * @returns {Promise<void>}
 */
async function multiPageExample() {
  const browser = await puppeteer.launch();

  try {
    // Create several pages
    const page1 = await browser.newPage();
    const page2 = await browser.newPage();
    const page3 = await browser.newPage();

    // Navigate all pages concurrently; rejects as soon as any navigation fails
    await Promise.all([
      page1.goto('https://example1.com'),
      page2.goto('https://example2.com'),
      page3.goto('https://example3.com'),
    ]);

    // List every page the browser currently reports
    const pages = await browser.pages();
    console.log(`Total pages: ${pages.length}`);
  } finally {
    await browser.close();
  }
}

页面池管理 #

javascript
/**
 * Fixed-size pool of pre-created pages that callers check out with
 * `acquire()` and hand back with `release()`.
 */
class PagePool {
  /**
   * @param {object} browser - Object exposing `newPage()`; used by `init()` to create pages.
   * @param {number} [poolSize=5] - Number of pages `init()` creates.
   */
  constructor(browser, poolSize = 5) {
    this.browser = browser;
    this.poolSize = poolSize;
    this.pool = [];
    this.available = [];
  }

  /**
   * Creates `poolSize` pages one after another and marks them all available.
   * @returns {Promise<void>}
   */
  async init() {
    for (let i = 0; i < this.poolSize; i++) {
      const page = await this.browser.newPage();
      this.pool.push(page);
      this.available.push(page);
    }
  }

  /**
   * Checks out one available page.
   * @returns {Promise<object>} The page taken from the pool.
   * @throws {Error} When no page is currently available.
   */
  async acquire() {
    if (this.available.length === 0) {
      throw new Error('No available pages in pool');
    }
    return this.available.pop();
  }

  /**
   * Returns a previously acquired page to the pool.
   * @param {object} page - A page created by this pool.
   * @throws {Error} When the page does not belong to this pool, or is
   *   already available (releasing twice would let two callers acquire the
   *   same page).
   */
  release(page) {
    if (!this.pool.includes(page)) {
      throw new Error('Page not from this pool');
    }
    if (this.available.includes(page)) {
      throw new Error('Page already released');
    }
    this.available.push(page);
  }

  /**
   * Closes every pooled page and empties the pool so closed pages can no
   * longer be acquired or released.
   * @returns {Promise<void>}
   */
  async close() {
    const pages = this.pool;
    this.pool = [];
    this.available = [];
    await Promise.all(pages.map((page) => page.close()));
  }
}

// Usage example
/**
 * Demonstrates the PagePool life cycle: create the pool, borrow a page,
 * navigate, give the page back, then shut everything down.
 *
 * The page is released and the browser closed in `finally` blocks so a
 * failed navigation neither strands the page nor leaks the browser.
 *
 * @returns {Promise<void>}
 */
async function usePagePool() {
  const browser = await puppeteer.launch();

  try {
    const pool = new PagePool(browser, 5);
    await pool.init();

    const page = await pool.acquire();
    try {
      await page.goto('https://example.com');
      // ... work with the page
    } finally {
      pool.release(page);
    }

    await pool.close();
  } finally {
    await browser.close();
  }
}

页面事件监听 #

javascript
const browser = await puppeteer.launch();

// Log the URL of each newly created target whose type is 'page'
browser.on('targetcreated', async (target) => {
  if (target.type() !== 'page') {
    return;
  }

  const createdPage = await target.page();
  console.log('New page created:', createdPage.url());
});

// Log the URL of every destroyed target, whatever its type
browser.on('targetdestroyed', (target) => console.log('Target destroyed:', target.url()));

浏览器上下文 #

创建隔离上下文 #

javascript
const puppeteer = require('puppeteer');

/**
 * Opens the same URL in two separate browser contexts, then closes both
 * contexts.
 *
 * The browser is closed in a `finally` block so a failure in any step does
 * not leave the browser process running.
 *
 * @returns {Promise<void>}
 */
async function contextExample() {
  const browser = await puppeteer.launch();

  try {
    // Create two independent browser contexts
    const context1 = await browser.createBrowserContext();
    const context2 = await browser.createBrowserContext();

    // One page per context
    const page1 = await context1.newPage();
    const page2 = await context2.newPage();

    // Same URL, loaded once in each context
    await page1.goto('https://example.com');
    await page2.goto('https://example.com');

    // Close the contexts explicitly before shutting the browser down
    await context1.close();
    await context2.close();
  } finally {
    await browser.close();
  }
}

多用户登录 #

javascript
/**
 * Creates a new browser context, opens the login page in it and submits
 * the given credentials.
 *
 * @param {object} browser - Browser used to create the context.
 * @param {string} username - Value typed into `#username`.
 * @param {string} password - Value typed into `#password`.
 * @returns {Promise<object>} The page on which the login form was submitted.
 */
async function loginInNewContext(browser, username, password) {
  const context = await browser.createBrowserContext();
  const page = await context.newPage();
  await page.goto('https://example.com/login');
  await page.type('#username', username);
  await page.type('#password', password);
  await page.click('#login');
  return page;
}

/**
 * Logs two users in one after another, each in its own browser context.
 *
 * The browser is closed in a `finally` block so a failed login step does
 * not leave the browser process running.
 *
 * @returns {Promise<void>}
 */
async function multiUserLogin() {
  const browser = await puppeteer.launch();

  try {
    // User A, in its own context
    await loginInNewContext(browser, 'userA', 'passwordA');

    // User B, in a second context
    await loginInNewContext(browser, 'userB', 'passwordB');

    // Both sessions exist side by side here; work with the returned pages
    // before the browser is closed below.
  } finally {
    await browser.close();
  }
}

隐身模式 #

javascript
// Create a separate browser context via createBrowserContext()
const context = await browser.createBrowserContext();

// Grant the 'geolocation' permission to https://example.com in this context.
// NOTE: this call only overrides permissions; it does not itself switch the
// context into incognito mode.
await context.overridePermissions('https://example.com', ['geolocation']);

// Open a page inside that context
const page = await context.newPage();

iframe 处理 #

获取 iframe #

javascript
// Main frame of the page
const mainFrame = page.mainFrame();

// All frames the page reports
const frames = page.frames();

// Look up a frame by name. Puppeteer's Page has no `frame()` lookup method,
// so search the frame list instead; the result is undefined when nothing matches.
const frameByName = frames.find((candidate) => candidate.name() === 'myframe');

// Look up a frame by URL pattern (distinct variable name: the original
// declared `frame` twice, which is a SyntaxError)
const frameByUrl = frames.find((candidate) => /.*iframe.*/.test(candidate.url()));

iframe 操作 #

javascript
// Work inside an iframe: locate it by name first
const frame = page.frames().find((candidate) => candidate.name() === 'myframe');

if (frame) {
  // Wait for the button, then click it
  await frame.waitForSelector('.button');
  await frame.click('.button');

  // Type into the input field
  await frame.type('#input', 'text');

  // Read text out of the frame's document
  const text = await frame.evaluate(() => document.querySelector('.content').textContent);
}

嵌套 iframe #

javascript
/**
 * Resolves a frame nested inside a chain of iframes.
 *
 * Starting from the page's main frame, each selector is waited for and
 * queried in the current frame, and the matched element's content frame
 * becomes the frame used for the next selector.
 *
 * @param {object} page - Page whose main frame is the starting point.
 * @param {string[]} selectors - iframe selectors, outermost first.
 * @returns {Promise<object>} The innermost frame (the main frame when
 *   `selectors` is empty).
 * @throws {Error} When a matched element has no content frame.
 */
async function getNestedFrame(page, selectors) {
  let current = page.mainFrame();

  for (const selector of selectors) {
    await current.waitForSelector(selector);
    const handle = await current.$(selector);
    const inner = await handle.contentFrame();

    if (!inner) {
      throw new Error(`Frame not found for selector: ${selector}`);
    }

    current = inner;
  }

  return current;
}

// Usage example: walk three levels of nested iframes, outermost selector first
const framePath = ['iframe#level1', 'iframe#level2', 'iframe#level3'];
const nestedFrame = await getNestedFrame(page, framePath);

性能分析 #

性能追踪 #

javascript
const puppeteer = require('puppeteer');
const fs = require('fs');

/**
 * Records a trace (with screenshots) while loading `url`.
 *
 * The browser is closed in a `finally` block so a failed navigation does
 * not leave the browser process running.
 *
 * @param {string} url - Page to load while tracing.
 * @param {string} [tracePath='trace.json'] - File path passed to `tracing.start`.
 * @returns {Promise<void>}
 */
async function tracePerformance(url, tracePath = 'trace.json') {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    // Start tracing
    await page.tracing.start({
      path: tracePath,
      screenshots: true,
    });

    await page.goto(url);

    // Stop tracing
    await page.tracing.stop();
  } finally {
    await browser.close();
  }
}

性能指标 #

javascript
/**
 * Loads `url`, then logs and returns the page metrics together with
 * durations derived from `window.performance.timing`.
 *
 * The browser is closed in a `finally` block so a failure in any step does
 * not leave the browser process running.
 *
 * @param {string} url - Page to load.
 * @returns {Promise<{metrics: object, timing: object}>} The value of
 *   `page.metrics()` and the computed timing durations.
 */
async function getPerformanceMetrics(url) {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    await page.goto(url);

    // Page-level metrics
    const metrics = await page.metrics();
    console.log('Page metrics:', metrics);

    // Durations computed in the page from the navigation timing marks
    const timing = await page.evaluate(() => {
      const marks = window.performance.timing;
      return {
        dns: marks.domainLookupEnd - marks.domainLookupStart,
        tcp: marks.connectEnd - marks.connectStart,
        request: marks.responseStart - marks.requestStart,
        response: marks.responseEnd - marks.responseStart,
        domProcessing: marks.domComplete - marks.domInteractive,
        total: marks.loadEventEnd - marks.navigationStart,
      };
    });

    console.log('Performance timing:', timing);

    return { metrics, timing };
  } finally {
    await browser.close();
  }
}

Web Vitals #

javascript
/**
 * Loads `url` and collects LCP, FID and a layout-shift total by observing
 * performance entries in the page for a fixed time window.
 *
 * Notes on the reported values:
 * - `lcp` is the start time of the latest largest-contentful-paint entry seen.
 * - `fid` is only present if a first-input entry was observed.
 * - `cls` is the running sum of all layout-shift values seen during the
 *   window, skipping shifts flagged `hadRecentInput`. It is accumulated
 *   across observer callbacks rather than overwritten by the last batch.
 *
 * The browser is closed in a `finally` block so a failure in any step does
 * not leave the browser process running.
 *
 * @param {string} url - Page to load.
 * @param {number} [collectMs=3000] - How long to keep observing before
 *   the results are returned.
 * @returns {Promise<{lcp?: number, fid?: number, cls?: number}>}
 */
async function measureWebVitals(url, collectMs = 3000) {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    await page.goto(url);

    // The callback runs inside the page, so the wait time is passed in as
    // an argument instead of being captured from this scope.
    const vitals = await page.evaluate((waitMs) => {
      return new Promise((resolve) => {
        const results = {};
        const observers = [];

        const watch = (type, onEntries) => {
          const observer = new PerformanceObserver((list) => onEntries(list.getEntries()));
          observer.observe({ type, buffered: true });
          observers.push(observer);
        };

        // LCP (Largest Contentful Paint): keep the most recent candidate
        watch('largest-contentful-paint', (entries) => {
          results.lcp = entries[entries.length - 1].startTime;
        });

        // FID (First Input Delay)
        watch('first-input', (entries) => {
          results.fid = entries[0].processingStart - entries[0].startTime;
        });

        // CLS (Cumulative Layout Shift): accumulate across batches
        watch('layout-shift', (entries) => {
          for (const entry of entries) {
            if (!entry.hadRecentInput) {
              results.cls = (results.cls || 0) + entry.value;
            }
          }
        });

        // Stop observing and report once the collection window has elapsed
        setTimeout(() => {
          observers.forEach((observer) => observer.disconnect());
          resolve(results);
        }, waitMs);
      });
    }, collectMs);

    console.log('Web Vitals:', vitals);
    return vitals;
  } finally {
    await browser.close();
  }
}

Chrome DevTools Protocol (CDP) #

创建 CDP Session #

javascript
const puppeteer = require('puppeteer');

/**
 * Opens a CDP session on a page, enables the Network domain and logs the
 * URL of every request announced while loading https://example.com.
 *
 * The browser is closed in a `finally` block so a failure in any step does
 * not leave the browser process running.
 *
 * @returns {Promise<void>}
 */
async function cdpExample() {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    // Create the CDP session
    const client = await page.createCDPSession();

    // Send a CDP command
    await client.send('Network.enable');

    // Subscribe to a CDP event
    client.on('Network.requestWillBeSent', (params) => {
      console.log('Request:', params.request.url);
    });

    await page.goto('https://example.com');
  } finally {
    await browser.close();
  }
}

常用 CDP 功能 #

javascript
/**
 * Tours several CDP commands on one page: console and exception logging,
 * geolocation and device-metrics overrides, and clearing the browser
 * cache and cookies, before navigating to https://example.com.
 *
 * The browser is closed in a `finally` block so a failed command or
 * navigation does not leave the browser process running.
 *
 * @returns {Promise<void>}
 */
async function cdpFeatures() {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();
    const client = await page.createCDPSession();

    // Console output from the page
    await client.send('Runtime.enable');
    client.on('Runtime.consoleAPICalled', (params) => {
      console.log('Console:', params.args);
    });

    // Uncaught JS exceptions
    client.on('Runtime.exceptionThrown', (params) => {
      console.error('Exception:', params.exceptionDetails);
    });

    // Override geolocation
    await client.send('Emulation.setGeolocationOverride', {
      latitude: 37.7749,
      longitude: -122.4194,
      accuracy: 100,
    });

    // Override device metrics (viewport size and scale factor)
    await client.send('Emulation.setDeviceMetricsOverride', {
      width: 1920,
      height: 1080,
      deviceScaleFactor: 2,
      mobile: false,
    });

    // Clear the browser cache
    await client.send('Network.clearBrowserCache');

    // Clear cookies
    await client.send('Network.clearBrowserCookies');

    await page.goto('https://example.com');
  } finally {
    await browser.close();
  }
}

性能分析 CDP #

javascript
/**
 * Enables the CDP Performance domain, loads https://example.com, then
 * logs and returns the metrics reported by `Performance.getMetrics`.
 *
 * The browser is closed in a `finally` block so a failure in any step does
 * not leave the browser process running.
 *
 * @returns {Promise<*>} The `metrics` field of the `Performance.getMetrics` response.
 */
async function cdpPerformance() {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();
    const client = await page.createCDPSession();

    // Enable performance metric collection
    await client.send('Performance.enable');

    await page.goto('https://example.com');

    // Fetch the collected metrics
    const response = await client.send('Performance.getMetrics');
    console.log('Performance metrics:', response.metrics);

    return response.metrics;
  } finally {
    await browser.close();
  }
}

并发控制 #

并发爬虫 #

javascript
const puppeteer = require('puppeteer');

/**
 * Crawls a list of URLs with a bounded number of pages working in parallel.
 */
class ConcurrentCrawler {
  /**
   * @param {number} [concurrency=5] - Maximum number of pages used at once.
   */
  constructor(concurrency = 5) {
    this.concurrency = concurrency;
    this.browser = null;
  }

  /**
   * Launches the browser. Must be called before `crawl()`.
   * @returns {Promise<void>}
   */
  async init() {
    this.browser = await puppeteer.launch();
  }

  /**
   * Visits every URL and collects its title and the first 500 characters
   * of its body text.
   *
   * Workers pull URLs from a shared cursor, so no more pages are opened
   * than there are URLs. Results are stored by input index, so the output
   * order matches `urls` regardless of which page finishes first. A failed
   * URL is recorded as `{ url, error, success: false }` and does not stop
   * the crawl.
   *
   * @param {string[]} urls - URLs to visit.
   * @returns {Promise<object[]>} One result per URL, in input order.
   * @throws {Error} When called before `init()`.
   */
  async crawl(urls) {
    if (!this.browser) {
      throw new Error('Crawler not initialized; call init() first');
    }

    // Snapshot the input so later mutation by the caller cannot affect the crawl
    const queue = [...urls];
    if (queue.length === 0) {
      return [];
    }

    const results = new Array(queue.length);
    let nextIndex = 0;

    const runWorker = async () => {
      const page = await this.browser.newPage();

      try {
        // Claiming an index is synchronous, so two workers never take the same URL
        while (nextIndex < queue.length) {
          const index = nextIndex++;
          const url = queue[index];

          try {
            await page.goto(url, { waitUntil: 'domcontentloaded' });
            const data = await page.evaluate(() => ({
              title: document.title,
              content: document.body.innerText.slice(0, 500),
            }));
            results[index] = { url, data, success: true };
          } catch (error) {
            results[index] = { url, error: error.message, success: false };
          }
        }
      } finally {
        await page.close();
      }
    };

    // At least one worker, at most one per URL
    const workerCount = Math.max(1, Math.min(this.concurrency, queue.length));
    await Promise.all(Array.from({ length: workerCount }, () => runWorker()));

    return results;
  }

  /**
   * Closes the browser if one was launched. Safe to call more than once.
   * @returns {Promise<void>}
   */
  async close() {
    if (!this.browser) {
      return;
    }
    const browser = this.browser;
    this.browser = null;
    await browser.close();
  }
}

// Usage example
/**
 * Crawls a small list of URLs with a ConcurrentCrawler and logs the results.
 *
 * The crawler is closed in a `finally` block so a failed crawl does not
 * leave its browser running.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const crawler = new ConcurrentCrawler(5);
  await crawler.init();

  try {
    const urls = [
      'https://example1.com',
      'https://example2.com',
      // ... more URLs
    ];

    const results = await crawler.crawl(urls);
    console.log(results);
  } finally {
    await crawler.close();
  }
}

限流控制 #

javascript
/**
 * Fixed-window rate limiter: up to `rateLimit` acquisitions are granted per
 * `interval` milliseconds; further callers wait for the next window.
 */
class RateLimiter {
  /**
   * @param {number} rateLimit - Acquisitions allowed per interval; must be at least 1.
   * @param {number} [interval=1000] - Window length in milliseconds.
   * @throws {RangeError} When `rateLimit` is below 1 (a token could never
   *   be granted, so `acquire()` would wait forever).
   */
  constructor(rateLimit, interval = 1000) {
    if (!(rateLimit >= 1)) {
      throw new RangeError('rateLimit must be at least 1');
    }
    this.rateLimit = rateLimit;
    this.interval = interval;
    this.tokens = rateLimit;
    this.lastRefill = Date.now();
  }

  /**
   * Resolves once a token has been taken.
   *
   * When no token is left, waits only for the time remaining until the
   * next refill instead of a full interval, then tries again.
   *
   * @returns {Promise<void>}
   */
  async acquire() {
    for (;;) {
      await this.refill();

      if (this.tokens >= 1) {
        this.tokens--;
        return;
      }

      // Wait at least 1 ms so an early-firing timer cannot cause a busy loop
      const remaining = this.lastRefill + this.interval - Date.now();
      await new Promise((resolve) => setTimeout(resolve, Math.max(1, remaining)));
    }
  }

  /**
   * Restores the full token count when at least one interval has elapsed
   * since the last refill.
   * @returns {Promise<void>}
   */
  async refill() {
    const now = Date.now();

    if (now - this.lastRefill >= this.interval) {
      this.tokens = this.rateLimit;
      this.lastRefill = now;
    }
  }
}

// Usage example
const limiter = new RateLimiter(10, 1000);  // 10 requests per 1000 ms interval

/**
 * Navigates `page` to `url` after obtaining a token from the shared limiter.
 *
 * @param {object} page - Page to navigate.
 * @param {string} url - Destination URL.
 * @returns {Promise<void>}
 */
async function crawlWithRateLimit(page, url) {
  // Wait for the limiter before issuing the navigation
  await limiter.acquire();
  await page.goto(url);
}

错误处理与重试 #

自动重试 #

javascript
/**
 * Runs `operation`, retrying with a linearly growing delay when it throws.
 *
 * The operation is always attempted at least once: a `maxRetries` below 1
 * (or NaN) is treated as 1 instead of skipping the call and resolving
 * `undefined`.
 *
 * @param {() => Promise<*>} operation - Async function to run.
 * @param {number} [maxRetries=3] - Maximum number of attempts in total.
 * @param {number} [delay=1000] - Base delay in ms; attempt N waits `delay * N`
 *   before the next try.
 * @returns {Promise<*>} Whatever `operation` resolves to.
 * @throws Rethrows the error of the final failed attempt.
 */
async function retryOperation(operation, maxRetries = 3, delay = 1000) {
  const attempts = maxRetries >= 1 ? maxRetries : 1;

  for (let attempt = 1; ; attempt++) {
    try {
      return await operation();
    } catch (error) {
      // Thrown values need not be Error objects (they may even be null)
      console.log(`Attempt ${attempt} failed:`, error?.message ?? error);

      if (attempt >= attempts) {
        throw error;
      }

      await new Promise((resolve) => setTimeout(resolve, delay * attempt));
    }
  }
}

// Usage example: load the page and read its title, retrying on failure
const result = await retryOperation(async () => {
  await page.goto('https://example.com');
  return page.evaluate(() => document.title);
});

错误监控 #

javascript
/**
 * In-memory collector of captured errors.
 */
class ErrorMonitor {
  constructor() {
    this.errors = [];
  }

  /**
   * Records an error with a timestamp and caller-supplied context.
   *
   * Accepts any thrown value: when it has no `message` (a string, `null`,
   * ...) its string form is stored instead of `undefined`, and `stack` is
   * left undefined.
   *
   * @param {*} error - The error or other thrown value.
   * @param {object} [context={}] - Extra data stored alongside the error.
   * @returns {void}
   */
  capture(error, context = {}) {
    this.errors.push({
      timestamp: new Date().toISOString(),
      error: error?.message ?? String(error),
      stack: error?.stack,
      context,
    });
  }

  /**
   * @returns {object[]} The recorded entries, oldest first (the live
   *   internal array, not a copy).
   */
  getErrors() {
    return this.errors;
  }

  /**
   * @returns {boolean} True when at least one error has been captured.
   */
  hasErrors() {
    return this.errors.length > 0;
  }

  /**
   * Discards all recorded entries.
   * @returns {void}
   */
  clear() {
    this.errors = [];
  }
}

// Usage example: forward both page error events to one monitor,
// tagging each captured error with the kind of event it came from
const monitor = new ErrorMonitor();

const monitoredEvents = {
  error: 'page_error',
  pageerror: 'page_script_error',
};

for (const [eventName, type] of Object.entries(monitoredEvents)) {
  page.on(eventName, (error) => {
    monitor.capture(error, { type });
  });
}

完整示例 #

分布式爬虫 #

javascript
const puppeteer = require('puppeteer');
const { Worker, isMainThread, parentPort, workerData } = require('worker_threads');

/**
 * Splits a URL list across worker threads, each running this same file,
 * and merges the results they post back.
 */
class DistributedCrawler {
  /**
   * @param {number} [workerCount=4] - Maximum number of worker threads.
   */
  constructor(workerCount = 4) {
    this.workerCount = workerCount;
  }

  /**
   * Crawls `urls` in worker threads.
   *
   * Empty chunks are skipped, so no worker is started without URLs to
   * process. Each worker's promise settles on its first message, on an
   * error, or on exit, so a worker that exits without posting results
   * rejects instead of hanging forever.
   *
   * @param {string[]} urls - URLs to distribute.
   * @returns {Promise<object[]>} All worker results, flattened in chunk order.
   */
  async crawl(urls) {
    // Empty chunks are always trailing, so worker ids stay 0..n-1
    const chunks = this.chunkArray(urls, this.workerCount)
      .filter((chunk) => chunk.length > 0);

    const workers = chunks.map((chunk, index) => {
      return new Promise((resolve, reject) => {
        const worker = new Worker(__filename, {
          workerData: { urls: chunk, workerId: index },
        });

        worker.on('message', resolve);
        worker.on('error', reject);
        worker.on('exit', (code) => {
          // No effect when a message or error already settled the promise
          reject(new Error(
            code !== 0
              ? `Worker stopped with code ${code}`
              : 'Worker exited without posting results',
          ));
        });
      });
    });

    const results = await Promise.all(workers);
    return results.flat();
  }

  /**
   * Splits `array` into `chunks` consecutive slices of
   * `ceil(array.length / chunks)` elements; trailing slices may be shorter
   * or empty.
   *
   * @param {Array} array - Items to split.
   * @param {number} chunks - Number of slices to produce.
   * @returns {Array[]} The slices, in order.
   */
  chunkArray(array, chunks) {
    const result = [];
    const chunkSize = Math.ceil(array.length / chunks);

    for (let i = 0; i < chunks; i++) {
      result.push(array.slice(i * chunkSize, (i + 1) * chunkSize));
    }

    return result;
  }
}

// Worker thread code
/**
 * Crawls the URLs given in `workerData` on a single page and posts the
 * collected results to the parent thread.
 *
 * A URL that fails is recorded as `{ url, workerId, error, success: false }`
 * and the loop continues. The browser is closed in a `finally` block so an
 * unexpected failure (for example while opening the page) does not leave
 * the browser process running; in that case nothing is posted and the
 * returned promise rejects.
 *
 * @returns {Promise<void>}
 */
async function workerCrawl() {
  const { urls, workerId } = workerData;
  const browser = await puppeteer.launch();
  const results = [];

  try {
    const page = await browser.newPage();

    for (const url of urls) {
      try {
        await page.goto(url, { waitUntil: 'domcontentloaded' });
        const data = await page.evaluate(() => ({
          title: document.title,
          url: window.location.href,
        }));
        results.push({ ...data, workerId, success: true });
      } catch (error) {
        results.push({ url, workerId, error: error.message, success: false });
      }
    }
  } finally {
    await browser.close();
  }

  parentPort.postMessage(results);
}

// Entry point: a worker thread runs the crawl, the main thread exports the class
if (!isMainThread) {
  workerCrawl();
} else {
  module.exports = DistributedCrawler;
}

自动化测试框架 #

javascript
const puppeteer = require('puppeteer');

/**
 * Minimal sequential test runner that hands one shared page to every test.
 */
class TestRunner {
  /**
   * @param {object} [launchOptions={ headless: false }] - Options passed to
   *   `puppeteer.launch()` by `init()`.
   */
  constructor(launchOptions = { headless: false }) {
    this.launchOptions = launchOptions;
    this.browser = null;
    this.page = null;
    this.tests = [];
    this.results = [];
  }

  /**
   * Launches the browser and opens the page shared by all tests.
   * @returns {Promise<void>}
   */
  async init() {
    this.browser = await puppeteer.launch(this.launchOptions);
    this.page = await this.browser.newPage();
  }

  /**
   * Registers a test.
   * @param {string} name - Name used in the log output and results.
   * @param {(page: object) => Promise<void>} fn - Test body; throwing marks
   *   the test as failed.
   * @returns {void}
   */
  test(name, fn) {
    this.tests.push({ name, fn });
  }

  /**
   * Runs the registered tests in registration order. A failing test is
   * recorded and logged; it does not stop the run.
   *
   * @returns {Promise<object[]>} `this.results`, which also keeps entries
   *   from earlier `run()` calls on the same runner.
   */
  async run() {
    for (const test of this.tests) {
      const startTime = Date.now();

      try {
        await test.fn(this.page);
        this.results.push({
          name: test.name,
          status: 'passed',
          duration: Date.now() - startTime,
        });
        console.log(`✅ ${test.name}`);
      } catch (error) {
        // Tests may throw non-Error values; fall back to their string form
        const message = error?.message ?? String(error);
        this.results.push({
          name: test.name,
          status: 'failed',
          error: message,
          duration: Date.now() - startTime,
        });
        console.log(`❌ ${test.name}: ${message}`);
      }
    }

    return this.results;
  }

  /**
   * Closes the browser if one was launched. Safe to call before `init()`
   * and more than once.
   * @returns {Promise<void>}
   */
  async close() {
    if (!this.browser) {
      return;
    }
    const browser = this.browser;
    this.browser = null;
    this.page = null;
    await browser.close();
  }
}

// Usage example
/**
 * Registers two tests on a TestRunner, runs them and logs the results.
 *
 * The runner is closed in a `finally` block so its browser is shut down
 * even if registering or running the tests throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const runner = new TestRunner();
  await runner.init();

  try {
    runner.test('should load homepage', async (page) => {
      await page.goto('https://example.com');
      const title = await page.title();
      if (!title) throw new Error('Title is empty');
    });

    // Runs on the same page, after the test above has loaded it
    runner.test('should find element', async (page) => {
      await page.waitForSelector('h1');
      const text = await page.$eval('h1', (el) => el.textContent);
      if (!text) throw new Error('H1 text is empty');
    });

    const results = await runner.run();
    console.log('Results:', results);
  } finally {
    await runner.close();
  }
}

下一步 #

现在你已经掌握了 Puppeteer 的高级功能。接下来请学习「最佳实践」,了解如何在生产环境中高效使用 Puppeteer!

最后更新:2026-03-28