Puppeteer 高级功能 #
多页面管理 #
创建和管理页面 #
javascript
const puppeteer = require('puppeteer');
/**
 * Opens three tabs, navigates them in parallel, and logs how many pages the
 * browser reports.
 *
 * The browser is closed in a `finally` block so a failed navigation does not
 * leave a Chromium process running.
 *
 * @returns {Promise<void>}
 */
async function multiPageExample() {
  const browser = await puppeteer.launch();
  try {
    // Create several pages.
    const page1 = await browser.newPage();
    const page2 = await browser.newPage();
    const page3 = await browser.newPage();

    // Visit a different site in each page, in parallel.
    await Promise.all([
      page1.goto('https://example1.com'),
      page2.goto('https://example2.com'),
      page3.goto('https://example3.com'),
    ]);

    // List every page the browser currently has open.
    const pages = await browser.pages();
    console.log(`Total pages: ${pages.length}`);
  } finally {
    await browser.close();
  }
}
页面池管理 #
javascript
/**
 * A fixed-size pool of pre-opened pages that callers borrow and give back.
 */
class PagePool {
  /**
   * @param {object} browser - Object exposing `newPage()`, used to open the pooled pages.
   * @param {number} [poolSize=5] - Number of pages opened by `init()`.
   */
  constructor(browser, poolSize = 5) {
    this.browser = browser;
    this.poolSize = poolSize;
    this.pool = [];
    this.available = [];
  }

  /**
   * Opens `poolSize` pages and marks all of them as available.
   * @returns {Promise<void>}
   */
  async init() {
    for (let i = 0; i < this.poolSize; i++) {
      const page = await this.browser.newPage();
      this.pool.push(page);
      this.available.push(page);
    }
  }

  /**
   * Borrows a page from the pool.
   * @returns {Promise<object>} An available page.
   * @throws {Error} When every page is currently borrowed.
   */
  async acquire() {
    if (this.available.length === 0) {
      throw new Error('No available pages in pool');
    }
    return this.available.pop();
  }

  /**
   * Returns a borrowed page to the pool.
   * @param {object} page - A page previously handed out by `acquire()`.
   * @throws {Error} When the page does not belong to this pool.
   */
  release(page) {
    if (!this.pool.includes(page)) {
      throw new Error('Page not from this pool');
    }
    // Releasing the same page twice must not queue it twice, otherwise two
    // callers could later acquire the same page at the same time.
    if (this.available.includes(page)) {
      return;
    }
    this.available.push(page);
  }

  /**
   * Closes every pooled page and empties the pool.
   * @returns {Promise<void>}
   */
  async close() {
    const pages = this.pool;
    // Reset state first so closed pages can no longer be acquired.
    this.pool = [];
    this.available = [];
    await Promise.all(pages.map((page) => page.close()));
  }
}
// Usage example
/**
 * Demonstrates borrowing a page from a PagePool and returning it.
 *
 * Each resource is released in its own `finally` block, so a failed
 * navigation still returns the page, closes the pool, and closes the browser.
 *
 * @returns {Promise<void>}
 */
async function usePagePool() {
  const browser = await puppeteer.launch();
  try {
    const pool = new PagePool(browser, 5);
    try {
      await pool.init();
      const page = await pool.acquire();
      try {
        await page.goto('https://example.com');
        // ... work with the page
      } finally {
        pool.release(page);
      }
    } finally {
      await pool.close();
    }
  } finally {
    await browser.close();
  }
}
页面事件监听 #
javascript
const browser = await puppeteer.launch();

// Log the URL of each newly created page; other target types are ignored.
browser.on('targetcreated', async (target) => {
  if (target.type() !== 'page') {
    return;
  }
  const createdPage = await target.page();
  console.log('New page created:', createdPage.url());
});

// Log the URL of every target that goes away.
browser.on('targetdestroyed', (target) => console.log('Target destroyed:', target.url()));
浏览器上下文 #
创建隔离上下文 #
javascript
const puppeteer = require('puppeteer');
/**
 * Opens the same URL in two separate browser contexts, then closes both
 * contexts.
 *
 * The browser is closed in a `finally` block so a failure in any step does
 * not leave it running.
 *
 * @returns {Promise<void>}
 */
async function contextExample() {
  const browser = await puppeteer.launch();
  try {
    // Create two independent browser contexts.
    const context1 = await browser.createBrowserContext();
    const context2 = await browser.createBrowserContext();

    // One page per context; the tutorial relies on each context keeping its
    // own cookies and storage.
    const page1 = await context1.newPage();
    const page2 = await context2.newPage();

    await page1.goto('https://example.com');
    await page2.goto('https://example.com');

    // Closing a context is expected to close the pages opened in it.
    await context1.close();
    await context2.close();
  } finally {
    await browser.close();
  }
}
多用户登录 #
javascript
/**
 * Logs two users in, each inside its own browser context.
 *
 * The login steps are shared through a local helper, and the browser is
 * closed in a `finally` block so a failed login does not leave it running.
 *
 * @returns {Promise<void>}
 */
async function multiUserLogin() {
  const browser = await puppeteer.launch();

  // Opens a fresh context, submits the login form, and returns the page.
  const loginInNewContext = async (username, password) => {
    const context = await browser.createBrowserContext();
    const page = await context.newPage();
    await page.goto('https://example.com/login');
    await page.type('#username', username);
    await page.type('#password', password);
    await page.click('#login');
    return page;
  };

  try {
    // User A context
    const pageA = await loginInNewContext('userA', 'passwordA');
    // User B context
    const pageB = await loginInNewContext('userB', 'passwordB');
    // Both sessions can now be driven through pageA / pageB independently.
  } finally {
    await browser.close();
  }
}
隐身模式 #
javascript
// Create a new browser context (this is what the tutorial uses as its
// "incognito" context).
const context = await browser.createBrowserContext();
// Override permissions for https://example.com in this context, passing
// 'geolocation'. NOTE(review): this call configures permissions only; nothing
// here switches a mode on, so it is not what makes the context "incognito".
await context.overridePermissions('https://example.com', ['geolocation']);
const page = await context.newPage();
iframe 处理 #
获取 iframe #
javascript
// Get the main frame.
const mainFrame = page.mainFrame();
// Get every frame attached to the page.
const frames = page.frames();
// Puppeteer's Page has no `frame({ name | url })` lookup (that is Playwright's
// API), so search the frame list instead. Distinct variable names are used
// because declaring `const frame` twice in one scope is a SyntaxError.
// Find a frame by name.
const frameByName = frames.find((candidate) => candidate.name() === 'myframe');
// Find a frame by URL.
const frameByUrl = frames.find((candidate) => /.*iframe.*/.test(candidate.url()));
iframe 操作 #
javascript
// Work inside an iframe: pick the frame named 'myframe' and, when it exists,
// drive it with the same calls used on a page.
const frame = page.frames().find((candidate) => candidate.name() === 'myframe');
if (frame) {
  // Wait for the button, click it, then fill the input.
  await frame.waitForSelector('.button');
  await frame.click('.button');
  await frame.type('#input', 'text');
  // Run a script inside the frame to read the content text.
  const text = await frame.evaluate(() => document.querySelector('.content').textContent);
}
嵌套 iframe #
javascript
// Resolve nested iframes
/**
 * Walks down a chain of iframes, one selector per nesting level, and returns
 * the innermost frame.
 *
 * @param {object} page - Object exposing `mainFrame()`.
 * @param {string[]} selectors - Selectors for each iframe element, outermost first.
 * @returns {Promise<object>} The frame reached after the last selector, or the
 *   main frame when `selectors` is empty.
 * @throws {Error} When a selector matches nothing usable or the matched
 *   element has no content frame.
 */
async function getNestedFrame(page, selectors) {
  let frame = page.mainFrame();
  for (const selector of selectors) {
    // Use the handle returned by waitForSelector directly; re-querying with
    // $() is redundant and can return null if the element detaches in between.
    const element = await frame.waitForSelector(selector);
    frame = element ? await element.contentFrame() : null;
    if (!frame) {
      throw new Error(`Frame not found for selector: ${selector}`);
    }
  }
  return frame;
}
// Usage example: one selector per nesting level, outermost iframe first.
const nestedFrameSelectors = ['iframe#level1', 'iframe#level2', 'iframe#level3'];
const nestedFrame = await getNestedFrame(page, nestedFrameSelectors);
性能分析 #
性能追踪 #
javascript
const puppeteer = require('puppeteer');
const fs = require('fs');
/**
 * Records a trace (with screenshots) of loading `url` into `trace.json`.
 *
 * Tracing is stopped and the browser is closed even when navigation fails, so
 * the trace is finalized and no browser is left running.
 *
 * @param {string} url - Address to load while tracing.
 * @returns {Promise<void>}
 */
async function tracePerformance(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Start tracing.
    await page.tracing.start({
      path: 'trace.json',
      screenshots: true,
    });
    try {
      await page.goto(url);
    } finally {
      // Stop tracing.
      await page.tracing.stop();
    }
  } finally {
    await browser.close();
  }
}
性能指标 #
javascript
/**
 * Loads `url` and logs two sets of numbers: the page metrics reported by
 * `page.metrics()` and phase durations derived from `window.performance.timing`.
 *
 * The browser is closed in a `finally` block so a failed navigation does not
 * leave it running.
 *
 * @param {string} url - Address to measure.
 * @returns {Promise<void>}
 */
async function getPerformanceMetrics(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    // Page-level metrics.
    const metrics = await page.metrics();
    console.log('Page metrics:', metrics);

    // Navigation phase durations, computed inside the page.
    // NOTE(review): `performance.timing` is the legacy Navigation Timing API;
    // it is kept here so the logged fields stay the same.
    const timing = await page.evaluate(() => {
      const t = window.performance.timing;
      return {
        dns: t.domainLookupEnd - t.domainLookupStart,
        tcp: t.connectEnd - t.connectStart,
        request: t.responseStart - t.requestStart,
        response: t.responseEnd - t.responseStart,
        domProcessing: t.domComplete - t.domInteractive,
        total: t.loadEventEnd - t.navigationStart,
      };
    });
    console.log('Performance timing:', timing);
  } finally {
    await browser.close();
  }
}
Web Vitals #
javascript
/**
 * Loads `url`, collects LCP, FID and CLS inside the page for three seconds,
 * and logs the result.
 *
 * CLS is accumulated across observer callbacks and skips shifts flagged with
 * `hadRecentInput`. The browser is closed in a `finally` block.
 *
 * @param {string} url - Address to measure.
 * @returns {Promise<void>}
 */
async function measureWebVitals(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    const vitals = await page.evaluate(() => {
      return new Promise((resolve) => {
        const results = {};

        // LCP (Largest Contentful Paint): the latest entry wins.
        new PerformanceObserver((list) => {
          const entries = list.getEntries();
          results.lcp = entries[entries.length - 1].startTime;
        }).observe({ type: 'largest-contentful-paint', buffered: true });

        // FID (First Input Delay)
        new PerformanceObserver((list) => {
          const [firstInput] = list.getEntries();
          results.fid = firstInput.processingStart - firstInput.startTime;
        }).observe({ type: 'first-input', buffered: true });

        // CLS (Cumulative Layout Shift): the callback can fire several times,
        // so add to the running total instead of overwriting it, and ignore
        // shifts that follow recent user input.
        new PerformanceObserver((list) => {
          let total = results.cls ?? 0;
          for (const entry of list.getEntries()) {
            if (!entry.hadRecentInput) {
              total += entry.value;
            }
          }
          results.cls = total;
        }).observe({ type: 'layout-shift', buffered: true });

        // Give the observers time to report before resolving.
        setTimeout(() => resolve(results), 3000);
      });
    });

    console.log('Web Vitals:', vitals);
  } finally {
    await browser.close();
  }
}
Chrome DevTools Protocol (CDP) #
创建 CDP Session #
javascript
const puppeteer = require('puppeteer');
/**
 * Opens a CDP session on a page, enables the Network domain, and logs the URL
 * of every request announced while loading https://example.com.
 *
 * The browser is closed in a `finally` block so a failure does not leave it
 * running.
 *
 * @returns {Promise<void>}
 */
async function cdpExample() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Create the CDP session.
    const client = await page.createCDPSession();
    // Send a CDP command.
    await client.send('Network.enable');
    // Listen for a CDP event.
    client.on('Network.requestWillBeSent', (params) => {
      console.log('Request:', params.request.url);
    });
    await page.goto('https://example.com');
  } finally {
    await browser.close();
  }
}
常用 CDP 功能 #
javascript
/**
 * Demonstrates several CDP commands on one session: console and exception
 * logging, geolocation and device-metrics overrides, and clearing the browser
 * cache and cookies, before loading https://example.com.
 *
 * The browser is closed in a `finally` block so a failure does not leave it
 * running.
 *
 * @returns {Promise<void>}
 */
async function cdpFeatures() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    const client = await page.createCDPSession();

    // Console output
    await client.send('Runtime.enable');
    client.on('Runtime.consoleAPICalled', (params) => {
      console.log('Console:', params.args);
    });

    // JavaScript exceptions
    client.on('Runtime.exceptionThrown', (params) => {
      console.error('Exception:', params.exceptionDetails);
    });

    // Override geolocation
    await client.send('Emulation.setGeolocationOverride', {
      latitude: 37.7749,
      longitude: -122.4194,
      accuracy: 100,
    });

    // Override device metrics (viewport size and scale factor)
    await client.send('Emulation.setDeviceMetricsOverride', {
      width: 1920,
      height: 1080,
      deviceScaleFactor: 2,
      mobile: false,
    });

    // Clear the browser cache
    await client.send('Network.clearBrowserCache');

    // Clear cookies
    await client.send('Network.clearBrowserCookies');

    await page.goto('https://example.com');
  } finally {
    await browser.close();
  }
}
性能分析 CDP #
javascript
/**
 * Enables the CDP Performance domain, loads https://example.com, and logs the
 * metrics returned by `Performance.getMetrics`.
 *
 * The browser is closed in a `finally` block so a failure does not leave it
 * running.
 *
 * @returns {Promise<void>}
 */
async function cdpPerformance() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    const client = await page.createCDPSession();
    // Enable performance collection.
    await client.send('Performance.enable');
    await page.goto('https://example.com');
    // Read the collected metrics.
    const metrics = await client.send('Performance.getMetrics');
    console.log('Performance metrics:', metrics.metrics);
  } finally {
    await browser.close();
  }
}
并发控制 #
并发爬虫 #
javascript
const puppeteer = require('puppeteer');
/**
 * Crawls a list of URLs with a bounded number of tabs that pull work from a
 * shared queue.
 */
class ConcurrentCrawler {
  /**
   * @param {number} [concurrency=5] - Maximum number of tabs used at once.
   */
  constructor(concurrency = 5) {
    this.concurrency = concurrency;
    this.browser = null;
  }

  /**
   * Launches the browser used by `crawl()`.
   * @returns {Promise<void>}
   */
  async init() {
    this.browser = await puppeteer.launch();
  }

  /**
   * Visits every URL and collects its title and the first 500 characters of
   * body text. Failures are recorded per URL instead of aborting the crawl.
   *
   * @param {string[]} urls - Addresses to visit.
   * @returns {Promise<object[]>} One `{ url, data, success: true }` or
   *   `{ url, error, success: false }` entry per visited URL, in completion order.
   * @throws {Error} When called before `init()`.
   */
  async crawl(urls) {
    if (!this.browser) {
      throw new Error('Crawler not initialized: call init() before crawl()');
    }
    const results = [];
    const queue = [...urls];
    // Never open more tabs than there are URLs to visit.
    const workerCount = Math.min(this.concurrency, queue.length);

    const workers = Array.from({ length: workerCount }, async () => {
      const page = await this.browser.newPage();
      try {
        while (queue.length > 0) {
          const url = queue.shift();
          // Skip empty entries rather than stopping this worker early.
          if (!url) continue;
          try {
            await page.goto(url, { waitUntil: 'domcontentloaded' });
            const data = await page.evaluate(() => ({
              title: document.title,
              content: document.body.innerText.slice(0, 500),
            }));
            results.push({ url, data, success: true });
          } catch (error) {
            results.push({ url, error: error.message, success: false });
          }
        }
      } finally {
        await page.close();
      }
    });

    await Promise.all(workers);
    return results;
  }

  /**
   * Closes the browser if one was launched; safe to call before `init()`.
   * @returns {Promise<void>}
   */
  async close() {
    if (!this.browser) {
      return;
    }
    const browser = this.browser;
    this.browser = null;
    await browser.close();
  }
}
// Usage example
/**
 * Crawls a short list of URLs with a ConcurrentCrawler and logs the results.
 *
 * The crawler is closed in a `finally` block so a failed crawl does not leave
 * the browser running.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const crawler = new ConcurrentCrawler(5);
  await crawler.init();
  try {
    const urls = [
      'https://example1.com',
      'https://example2.com',
      // ... more URLs
    ];
    const results = await crawler.crawl(urls);
    console.log(results);
  } finally {
    await crawler.close();
  }
}
限流控制 #
javascript
/**
 * Fixed-window rate limiter: at most `rateLimit` acquisitions per `interval`
 * milliseconds.
 */
class RateLimiter {
  /**
   * @param {number} rateLimit - Tokens available in each window.
   * @param {number} [interval=1000] - Window length in milliseconds.
   */
  constructor(rateLimit, interval = 1000) {
    this.rateLimit = rateLimit;
    this.interval = interval;
    this.tokens = rateLimit;
    this.lastRefill = Date.now();
  }

  /**
   * Resolves once a token has been taken, waiting for the next window when
   * the current one is exhausted.
   * @returns {Promise<void>}
   */
  async acquire() {
    await this.refill();
    while (this.tokens < 1) {
      // Sleep only until the current window ends; waiting a full interval
      // every time would throttle harder than the configured rate.
      const remaining = Math.max(0, this.interval - (Date.now() - this.lastRefill));
      await new Promise((resolve) => setTimeout(resolve, remaining));
      await this.refill();
    }
    this.tokens--;
  }

  /**
   * Restores the token count when at least one interval has elapsed since the
   * last refill.
   * @returns {Promise<void>}
   */
  async refill() {
    const now = Date.now();
    const elapsed = now - this.lastRefill;
    if (elapsed >= this.interval) {
      this.tokens = Math.min(this.rateLimit, this.tokens + this.rateLimit);
      this.lastRefill = now;
    }
  }
}
// Usage example: a shared limiter allowing 10 requests per second.
const limiter = new RateLimiter(10, 1000);

/**
 * Navigates `page` to `url` after taking a token from the shared limiter.
 *
 * @param {object} page - Object exposing `goto(url)`.
 * @param {string} url - Address to open.
 * @returns {Promise<void>}
 */
async function crawlWithRateLimit(page, url) {
  // Take a token first so navigation never outruns the limiter.
  await limiter.acquire();
  await page.goto(url);
}
错误处理与重试 #
自动重试 #
javascript
/**
 * Runs `operation`, retrying with a linearly growing delay when it throws.
 *
 * The operation always runs at least once: a `maxRetries` that is zero,
 * negative, or not a number counts as a single attempt, and a fractional
 * value is rounded up. The error from the final attempt is rethrown.
 *
 * @param {() => Promise<*>} operation - Async work to attempt.
 * @param {number} [maxRetries=3] - Maximum number of attempts.
 * @param {number} [delay=1000] - Base delay in ms; attempt N waits `delay * N`.
 * @returns {Promise<*>} Whatever `operation` resolves with.
 * @throws The error thrown by the last failed attempt.
 */
async function retryOperation(operation, maxRetries = 3, delay = 1000) {
  const attempts = Math.max(1, Math.ceil(maxRetries) || 1);
  for (let attempt = 1; ; attempt++) {
    try {
      return await operation();
    } catch (error) {
      console.log(`Attempt ${attempt} failed:`, error?.message ?? error);
      if (attempt >= attempts) {
        throw error;
      }
      await new Promise((resolve) => setTimeout(resolve, delay * attempt));
    }
  }
}
// Usage example: load the page and read its title, retrying on failure.
const loadTitle = async () => {
  await page.goto('https://example.com');
  return await page.evaluate(() => document.title);
};
const result = await retryOperation(loadTitle);
错误监控 #
javascript
/**
 * In-memory collector for errors, each stored with a timestamp and caller
 * supplied context.
 */
class ErrorMonitor {
  constructor() {
    this.errors = [];
  }

  /**
   * Records an error.
   *
   * Values that are not error-like objects (strings, null, undefined, ...)
   * are stored as their string form, so a non-Error throw is still captured
   * instead of crashing or being recorded as `undefined`.
   *
   * @param {*} error - The thrown value.
   * @param {object} [context={}] - Extra details stored alongside the error.
   */
  capture(error, context = {}) {
    const isErrorLike = error !== null && typeof error === 'object' && 'message' in error;
    this.errors.push({
      timestamp: new Date().toISOString(),
      error: isErrorLike ? error.message : String(error),
      stack: isErrorLike ? error.stack : undefined,
      context,
    });
  }

  /**
   * @returns {object[]} The recorded entries (the live internal array).
   */
  getErrors() {
    return this.errors;
  }

  /**
   * @returns {boolean} True when at least one error has been recorded.
   */
  hasErrors() {
    return this.errors.length > 0;
  }

  /** Discards every recorded error. */
  clear() {
    this.errors = [];
  }
}
// Usage example: record both page-level events in one monitor, tagging each
// entry with the kind of event it came from.
const monitor = new ErrorMonitor();
const monitoredEvents = [
  ['error', 'page_error'],
  ['pageerror', 'page_script_error'],
];
for (const [eventName, type] of monitoredEvents) {
  page.on(eventName, (error) => {
    monitor.capture(error, { type });
  });
}
完整示例 #
分布式爬虫 #
javascript
const puppeteer = require('puppeteer');
const { Worker, isMainThread, parentPort, workerData } = require('worker_threads');
/**
 * Splits a URL list across worker threads and merges their results.
 */
class DistributedCrawler {
  /**
   * @param {number} [workerCount=4] - Maximum number of worker threads.
   */
  constructor(workerCount = 4) {
    this.workerCount = workerCount;
  }

  /**
   * Crawls `urls` by starting one worker per non-empty chunk.
   *
   * @param {string[]} urls - Addresses to distribute.
   * @returns {Promise<object[]>} The flattened results posted by the workers.
   * @throws {Error} When a worker errors, exits with a non-zero code, or
   *   exits without posting results.
   */
  async crawl(urls) {
    // Drop empty chunks: when there are fewer URLs than workers, starting a
    // worker for an empty list would do no useful work.
    const chunks = this.chunkArray(urls, this.workerCount).filter((chunk) => chunk.length > 0);

    const workers = chunks.map((chunk, index) => {
      return new Promise((resolve, reject) => {
        const worker = new Worker(__filename, {
          workerData: { urls: chunk, workerId: index },
        });
        worker.on('message', resolve);
        worker.on('error', reject);
        worker.on('exit', (code) => {
          if (code !== 0) {
            reject(new Error(`Worker stopped with code ${code}`));
            return;
          }
          // A clean exit without a message would otherwise leave this promise
          // pending forever. When results were already delivered the promise
          // is settled and this rejection has no effect.
          reject(new Error('Worker exited without posting results'));
        });
      });
    });

    const results = await Promise.all(workers);
    return results.flat();
  }

  /**
   * Splits `array` into `chunks` consecutive slices of equal target size.
   * Trailing slices may be shorter or empty.
   *
   * @param {Array} array - Items to split.
   * @param {number} chunks - Number of slices to produce.
   * @returns {Array[]} The slices, in order.
   */
  chunkArray(array, chunks) {
    const result = [];
    const chunkSize = Math.ceil(array.length / chunks);
    for (let i = 0; i < chunks; i++) {
      result.push(array.slice(i * chunkSize, (i + 1) * chunkSize));
    }
    return result;
  }
}
// Worker thread code
/**
 * Worker entry point: visits every URL from `workerData` in one page and
 * posts the collected results to the parent thread.
 *
 * Per-URL failures are recorded in the results. The browser is closed in a
 * `finally` block so an unexpected error does not leave it running.
 *
 * @returns {Promise<void>}
 */
async function workerCrawl() {
  const { urls, workerId } = workerData;
  const results = [];
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    for (const url of urls) {
      try {
        await page.goto(url, { waitUntil: 'domcontentloaded' });
        const data = await page.evaluate(() => ({
          title: document.title,
          url: window.location.href,
        }));
        results.push({ ...data, workerId, success: true });
      } catch (error) {
        results.push({ url, workerId, error: error.message, success: false });
      }
    }
  } finally {
    await browser.close();
  }
  parentPort.postMessage(results);
}
// Entry point: this file is loaded both as the module and as the worker script.
if (!isMainThread) {
  // Inside a worker thread: run the crawl for this worker's chunk.
  workerCrawl();
} else {
  // On the main thread: expose the crawler class.
  module.exports = DistributedCrawler;
}
自动化测试框架 #
javascript
const puppeteer = require('puppeteer');
/**
 * Minimal sequential test runner: register named async checks, run them all
 * against one shared page, and collect pass/fail results with durations.
 */
class TestRunner {
  constructor() {
    this.browser = null;
    this.page = null;
    this.tests = [];
    this.results = [];
  }

  /**
   * Launches a visible (non-headless) browser and opens the shared page.
   * @returns {Promise<void>}
   */
  async init() {
    this.browser = await puppeteer.launch({ headless: false });
    this.page = await this.browser.newPage();
  }

  /**
   * Registers a test case.
   * @param {string} name - Label used in results and log output.
   * @param {(page: object) => Promise<void>} fn - Check to run; throwing marks it failed.
   */
  test(name, fn) {
    const testCase = { name, fn };
    this.tests.push(testCase);
  }

  /**
   * Runs every registered test in registration order.
   * @returns {Promise<object[]>} The accumulated results array.
   */
  async run() {
    for (const testCase of this.tests) {
      const startedAt = Date.now();
      const label = testCase.name;
      try {
        await testCase.fn(this.page);
        const passed = {
          name: label,
          status: 'passed',
          duration: Date.now() - startedAt,
        };
        this.results.push(passed);
        console.log(`✅ ${label}`);
      } catch (error) {
        const failed = {
          name: label,
          status: 'failed',
          error: error.message,
          duration: Date.now() - startedAt,
        };
        this.results.push(failed);
        console.log(`❌ ${label}: ${error.message}`);
      }
    }
    return this.results;
  }

  /**
   * Closes the browser launched by `init()`.
   * @returns {Promise<void>}
   */
  async close() {
    await this.browser.close();
  }
}
// Usage example
/**
 * Registers two checks against https://example.com, runs them with a
 * TestRunner, and logs the results.
 *
 * The runner is closed in a `finally` block so an unexpected failure does not
 * leave the browser open.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const runner = new TestRunner();
  await runner.init();
  try {
    runner.test('should load homepage', async (page) => {
      await page.goto('https://example.com');
      const title = await page.title();
      if (!title) throw new Error('Title is empty');
    });

    // Runs on the page already loaded by the previous test.
    runner.test('should find element', async (page) => {
      await page.waitForSelector('h1');
      const text = await page.$eval('h1', (el) => el.textContent);
      if (!text) throw new Error('H1 text is empty');
    });

    const results = await runner.run();
    console.log('Results:', results);
  } finally {
    await runner.close();
  }
}
下一步 #
现在你已经掌握了 Puppeteer 的高级功能,接下来学习 最佳实践 了解如何在生产环境中高效使用 Puppeteer!
最后更新:2026-03-28