Puppeteer JavaScript 执行与页面评估 #
基本概念 #
执行环境 #
text
┌─────────────────────────────────────────────────────────────┐
│ Puppeteer 执行环境 │
├─────────────────────────────────────────────────────────────┤
│ │
│ Node.js 环境 浏览器环境 │
│ ┌──────────────┐ ┌──────────────┐ │
│ │ Puppeteer │ │ 页面 DOM │ │
│ │ 脚本代码 │ ◄──────────► │ JavaScript │ │
│ │ │ 序列化传输 │ window │ │
│ └──────────────┘ └──────────────┘ │
│ │
│ page.evaluate() 用于在浏览器环境中执行代码 │
│ │
└─────────────────────────────────────────────────────────────┘
数据传递 #
javascript
// Node.js -> 浏览器:通过参数传递
const result = await page.evaluate((a, b) => {
return a + b; // 在浏览器环境中执行
}, 1, 2);
// 浏览器 -> Node.js:通过返回值传递
const title = await page.evaluate(() => {
return document.title; // 返回到 Node.js
});
page.evaluate() #
基本用法 #
javascript
// 执行简单表达式
const result = await page.evaluate(() => {
return 1 + 1;
});
console.log(result); // 2
// 获取页面信息
const title = await page.evaluate(() => document.title);
const url = await page.evaluate(() => window.location.href);
// 获取 DOM 信息
const text = await page.evaluate(() => {
return document.querySelector('.title').textContent;
});
传递参数 #
javascript
// Pass a single argument: values listed after the page function are handed
// to it as parameters inside the browser.
const doubled = await page.evaluate((num) => {
  return num * 2;
}, 5);
console.log(doubled); // 10
// Pass several arguments: they map positionally onto (a, b, c).
const sum = await page.evaluate((a, b, c) => {
  return a + b + c;
}, 1, 2, 3);
console.log(sum); // 6
// Pass an object: its fields are readable inside the page function.
// Each example uses its own variable name so the whole snippet can be pasted
// and run as one script (redeclaring the same `const` is a SyntaxError).
const label = await page.evaluate((config) => {
  return `${config.name}: ${config.value}`;
}, { name: 'test', value: 123 });
console.log(label); // 'test: 123'
返回复杂数据 #
javascript
// 返回对象
const pageInfo = await page.evaluate(() => {
return {
title: document.title,
url: window.location.href,
width: window.innerWidth,
height: window.innerHeight
};
});
console.log(pageInfo);
// 返回数组
const links = await page.evaluate(() => {
return Array.from(document.querySelectorAll('a')).map(a => ({
text: a.textContent,
href: a.href
}));
});
console.log(links);
// 返回嵌套数据
const data = await page.evaluate(() => {
return {
user: {
name: document.querySelector('.username').textContent,
email: document.querySelector('.email').textContent
},
items: Array.from(document.querySelectorAll('.item')).map(el => ({
id: el.dataset.id,
name: el.querySelector('.name').textContent
}))
};
});
注意事项 #
javascript
// ❌ 错误:不能直接使用外部变量
const selector = '.title';
const text = await page.evaluate(() => {
return document.querySelector(selector).textContent; // selector 未定义
});
// ✅ 正确:通过参数传递
const selector = '.title';
const text = await page.evaluate((sel) => {
return document.querySelector(sel).textContent;
}, selector);
// ❌ 错误:不能返回 DOM 元素
const element = await page.evaluate(() => {
return document.querySelector('.title'); // 无法序列化
});
// ✅ 正确:返回 ElementHandle
const element = await page.$('.title');
page.evaluateHandle() #
基本用法 #
javascript
// 返回 JSHandle
const handle = await page.evaluateHandle(() => {
return document.body;
});
// JSHandle 可以在后续 evaluate 中使用
const tagName = await page.evaluate((body) => {
return body.tagName;
}, handle);
console.log(tagName); // 'BODY'
获取属性 #
javascript
// 获取对象属性
const handle = await page.evaluateHandle(() => ({
name: 'John',
age: 30
}));
const properties = await handle.getProperties();
for (const [key, value] of properties) {
console.log(key, await value.jsonValue());
}
// 获取单个属性
const nameHandle = await handle.getProperty('name');
const name = await nameHandle.jsonValue();
console.log(name); // 'John'
jsonValue() #
javascript
// 将 JSHandle 转换为 JSON 值
const handle = await page.evaluateHandle(() => ({
name: 'John',
age: 30
}));
const value = await handle.jsonValue();
console.log(value); // { name: 'John', age: 30 }
dispose() #
javascript
// Acquire a JSHandle, then release it once it is no longer needed.
const handle = await page.evaluateHandle(() => document.body);
try {
  // ... use handle
} finally {
  // Dispose in `finally` so the handle is released even if the code that
  // uses it throws.
  await handle.dispose();
}
$eval 和 $$eval #
page.$eval() #
对单个元素执行操作:
javascript
// 获取元素文本
const text = await page.$eval('.title', el => el.textContent);
// 获取属性
const href = await page.$eval('a.link', el => el.href);
const src = await page.$eval('img', el => el.getAttribute('src'));
// 传递参数
const result = await page.$eval('.container', (el, className) => {
el.classList.add(className);
return el.className;
}, 'active');
// 修改元素
await page.$eval('#input', el => el.value = 'new value');
await page.$eval('#button', el => el.click());
page.$$eval() #
对多个元素执行操作:
javascript
// 获取所有链接文本
const texts = await page.$$eval('a', elements =>
elements.map(el => el.textContent)
);
// 获取所有图片 src
const srcs = await page.$$eval('img', elements =>
elements.map(el => el.src)
);
// 传递参数
const count = await page.$$eval('.item', (elements, className) => {
elements.forEach(el => el.classList.add(className));
return elements.length;
}, 'highlight');
// 过滤元素
const visibleItems = await page.$$eval('.item', elements =>
elements.filter(el => {
const style = window.getComputedStyle(el);
return style.display !== 'none';
}).map(el => el.textContent)
);
暴露函数到页面 #
page.exposeFunction() #
将 Node.js 函数暴露到浏览器环境(在页面中调用暴露的函数时返回 Promise,需要 await 才能取得结果):
javascript
// 暴露简单函数
await page.exposeFunction('md5', (text) => {
const crypto = require('crypto');
return crypto.createHash('md5').update(text).digest('hex');
});
// 在页面中使用
const hash = await page.evaluate(async () => {
return await window.md5('hello world');
});
console.log(hash);
实际应用示例 #
javascript
const puppeteer = require('puppeteer');
const fs = require('fs');
/**
 * Demo: expose Node.js file and logging helpers to a page, then call them
 * from browser-side code.
 *
 * NOTE(review): `readFile` and `writeFile` pass page-supplied paths straight
 * to `fs`, so any script running in the page can read or overwrite local
 * files. Only do this with pages you trust, or validate the paths first.
 *
 * @returns {Promise<void>} Resolves once the browser has been closed.
 */
async function main() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Expose a file-reading function.
    await page.exposeFunction('readFile', (path) => {
      return fs.readFileSync(path, 'utf8');
    });
    // Expose a file-writing function.
    await page.exposeFunction('writeFile', (path, content) => {
      fs.writeFileSync(path, content);
      return true;
    });
    // Expose a logging function that prints on the Node.js side.
    await page.exposeFunction('log', (...args) => {
      console.log('[Page]', ...args);
    });
    await page.goto('https://example.com');
    // Use the exposed functions from inside the page.
    await page.evaluate(async () => {
      const content = await window.readFile('./data.txt');
      // Exposed functions return a Promise in the page; await it so a
      // failure surfaces here instead of becoming an unhandled rejection.
      await window.log('File content:', content);
      await window.writeFile('./output.txt', 'Hello from page!');
    });
  } finally {
    // Always close the browser, even when a step above throws.
    await browser.close();
  }
}
暴露 API 客户端 #
javascript
const puppeteer = require('puppeteer');
const axios = require('axios');
/**
 * Demo: expose a Node.js-side API client (`fetchAPI`) to a page and call it
 * from browser-side code.
 *
 * @returns {Promise<void>} Resolves once the browser has been closed.
 */
async function main() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Expose an API request function. On failure it resolves to
    // `{ error: message }` instead of throwing into the page.
    await page.exposeFunction('fetchAPI', async (endpoint, options = {}) => {
      try {
        const response = await axios({
          ...options,
          // `url` comes after the spread so a `url` key in the page-supplied
          // options cannot override the fixed API endpoint.
          url: `https://api.example.com${endpoint}`
        });
        return response.data;
      } catch (error) {
        return { error: error.message };
      }
    });
    await page.goto('https://example.com');
    // Call the API from inside the page.
    const data = await page.evaluate(async () => {
      return await window.fetchAPI('/users', {
        method: 'GET'
      });
    });
    console.log(data);
  } finally {
    // Always close the browser, even when a step above throws.
    await browser.close();
  }
}
在页面中注入脚本 #
page.evaluateOnNewDocument() #
在每个新文档创建之后、页面自身脚本执行之前注入脚本(需在 page.goto() 之前调用,对之后的每次导航都会生效):
javascript
// 注入全局变量
await page.evaluateOnNewDocument(() => {
window.myCustomVar = 'injected value';
});
// 覆盖原生对象
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', {
get: () => false
});
});
// 注入自定义方法
await page.evaluateOnNewDocument(() => {
window.getCustomData = () => {
return {
timestamp: Date.now(),
url: window.location.href
};
};
});
await page.goto('https://example.com');
隐藏自动化特征 #
javascript
await page.evaluateOnNewDocument(() => {
// 隐藏 webdriver
Object.defineProperty(navigator, 'webdriver', {
get: () => false
});
// 添加 chrome 对象
window.chrome = {
runtime: {}
};
// 修改 plugins
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5]
});
// 修改 languages
Object.defineProperty(navigator, 'languages', {
get: () => ['zh-CN', 'zh', 'en']
});
});
page.addScriptTag() #
添加外部脚本:
javascript
// 添加本地脚本
await page.addScriptTag({ path: './inject.js' });
// 添加远程脚本
await page.addScriptTag({ url: 'https://code.jquery.com/jquery-3.6.0.min.js' });
// 添加内联脚本
await page.addScriptTag({ content: 'console.log("Injected!");' });
// 使用注入的库
await page.addScriptTag({ url: 'https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js' });
const data = await page.evaluate(async () => {
const response = await axios.get('/api/data');
return response.data;
});
page.addStyleTag() #
添加样式:
javascript
// 添加本地样式
await page.addStyleTag({ path: './styles.css' });
// 添加远程样式
await page.addStyleTag({ url: 'https://cdn.jsdelivr.net/npm/bootstrap/dist/css/bootstrap.min.css' });
// 添加内联样式
await page.addStyleTag({ content: 'body { background: #f0f0f0; }' });
高级用法 #
执行异步代码 #
javascript
// 在 evaluate 中使用 async/await
const data = await page.evaluate(async () => {
const response = await fetch('/api/data');
return response.json();
});
// 等待 Promise
const result = await page.evaluate(() => {
return new Promise((resolve) => {
setTimeout(() => resolve('done'), 1000);
});
});
使用 ElementHandle #
javascript
// Get an ElementHandle for the first element matching the selector.
const element = await page.$('.card');
// Pass the handle to evaluate(); the page function receives the element.
const text = await page.evaluate((el) => {
  return el.textContent;
}, element);
// evaluateHandle() returns a handle to the in-page value instead of a copy.
const handle = await element.evaluateHandle((el) => {
  return el.querySelector('.title');
});
// A DOM node is not JSON data, so handle.jsonValue() would not yield its
// text. Read the text inside the page instead; `?.` covers the case where
// no `.title` element was found.
const title = await handle.evaluate((node) => node?.textContent ?? null);
执行上下文隔离 #
javascript
// Page has no createExecutionContext() method; code is evaluated per frame.
// Select the frame explicitly to choose where the code runs.
const mainFrame = page.mainFrame();
// Evaluate in that specific frame.
const result = await mainFrame.evaluate(() => {
  return window.location.href;
});
// Evaluate inside an iframe, looked up by its name.
const frame = page.frames().find(f => f.name() === 'myframe');
if (!frame) {
  // find() returns undefined when no frame matches; fail with a clear
  // message instead of a TypeError on `frame.evaluate`.
  throw new Error('Frame "myframe" not found');
}
const frameResult = await frame.evaluate(() => {
  return document.title;
});
错误处理 #
javascript
// 捕获 evaluate 中的错误
try {
const result = await page.evaluate(() => {
throw new Error('Custom error');
});
} catch (error) {
console.error('Evaluate error:', error.message);
}
// 安全访问元素
const text = await page.evaluate(() => {
const el = document.querySelector('.maybe-not-exist');
return el ? el.textContent : null;
});
性能优化 #
减少序列化开销 #
javascript
// ❌ 多次调用 evaluate
const title = await page.evaluate(() => document.title);
const url = await page.evaluate(() => window.location.href);
const width = await page.evaluate(() => window.innerWidth);
// ✅ 一次调用获取多个值
const pageInfo = await page.evaluate(() => ({
title: document.title,
url: window.location.href,
width: window.innerWidth
}));
批量操作 #
javascript
// ❌ 逐个获取元素属性
const items = [];
for (const selector of selectors) {
const text = await page.$eval(selector, el => el.textContent);
items.push(text);
}
// ✅ 批量获取
const items = await page.evaluate((selectors) => {
return selectors.map(sel => {
const el = document.querySelector(sel);
return el ? el.textContent : null;
});
}, selectors);
完整示例 #
数据抓取 #
javascript
const puppeteer = require('puppeteer');
/**
 * Scrape product entries from a listing page.
 *
 * Launches a browser, opens `url`, and builds one plain object per
 * `.product` element with the keys `name`, `price`, `image`, `rating` and
 * `reviews`. The browser is closed before the function settles, whether it
 * succeeds or fails.
 *
 * @param {string} url - Address of the page to open.
 * @returns {Promise<Object[]>} One object per `.product` element found.
 */
async function scrapeProducts(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle0' });
    // Expose a Node.js-side helper that strips everything except digits and
    // dots from a price string before parsing it.
    await page.exposeFunction('processPrice', (priceStr) => {
      return parseFloat(priceStr.replace(/[^0-9.]/g, ''));
    });
    // `return await` keeps the evaluation inside the try block, so the
    // browser is closed only after the data has been collected.
    return await page.evaluate(async () => {
      const items = Array.from(document.querySelectorAll('.product'));
      return Promise.all(items.map(async (item) => {
        const priceText = item.querySelector('.price')?.textContent || '0';
        // Exposed functions return a Promise inside the page.
        const price = await window.processPrice(priceText);
        return {
          name: item.querySelector('.name')?.textContent.trim(),
          price: price,
          image: item.querySelector('img')?.src,
          rating: parseFloat(item.querySelector('.rating')?.textContent || '0'),
          // Explicit radix 10 so the count is always parsed as decimal.
          reviews: parseInt(item.querySelector('.review-count')?.textContent || '0', 10)
        };
      }));
    });
  } finally {
    // Always close the browser, even when navigation or scraping throws.
    await browser.close();
  }
}
scrapeProducts('https://example.com/products').then(console.log);
页面状态检查 #
javascript
const puppeteer = require('puppeteer');
/**
 * Open `url` and collect a snapshot of the page's state.
 *
 * `hasErrors` reports whether the page raised any uncaught exception while
 * it was being loaded and inspected. `loadTime` is the time in milliseconds
 * from navigation start to the end of the load event, or `null` when the
 * browser exposes no navigation timing entry.
 *
 * @param {string} url - Address of the page to inspect.
 * @returns {Promise<Object>} Snapshot with document, DOM, error, timing,
 *   visibility and storage fields.
 */
async function checkPageState(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Count uncaught page exceptions. The listener must be attached before
    // navigation so that errors thrown during load are seen.
    let pageErrorCount = 0;
    page.on('pageerror', () => {
      pageErrorCount += 1;
    });
    await page.goto(url);
    // `hasErrors` is computed in Node.js and passed in as an argument,
    // because `window.onerror !== null` only tells whether a handler is
    // installed, not whether an error happened.
    return await page.evaluate((hasErrors) => {
      // Navigation Timing entry; its timestamps are relative to the start
      // of the navigation, unlike the deprecated `performance.timing`.
      const [navigation] = window.performance.getEntriesByType('navigation');
      return {
        // Basic state
        readyState: document.readyState,
        title: document.title,
        url: window.location.href,
        // DOM state
        elementCount: document.querySelectorAll('*').length,
        imageCount: document.querySelectorAll('img').length,
        scriptCount: document.querySelectorAll('script').length,
        // Error state
        hasErrors: hasErrors,
        // Performance state
        loadTime: navigation ? navigation.loadEventEnd - navigation.startTime : null,
        // Visibility
        visibilityState: document.visibilityState,
        hidden: document.hidden,
        // Storage state
        localStorageKeys: Object.keys(localStorage),
        sessionStorageKeys: Object.keys(sessionStorage),
        cookies: document.cookie
      };
    }, pageErrorCount > 0);
  } finally {
    // Always close the browser, even when navigation or evaluation throws.
    await browser.close();
  }
}
表单验证 #
javascript
const puppeteer = require('puppeteer');
/**
 * Fill in a login-style form and report the browser's built-in validation
 * result for the `#email` and `#password` fields.
 *
 * Runs with a visible browser window (`headless: false`). The browser is
 * closed before the function settles, whether it succeeds or fails.
 *
 * @param {string} url - Address of the page that contains the form.
 * @param {{email: string, password: string}} formData - Text typed into the
 *   `#email` and `#password` fields.
 * @returns {Promise<Object>} Object with `emailValid`, `passwordValid`,
 *   `emailError` and `passwordError`, taken from each field's
 *   `checkValidity()` and `validationMessage`.
 */
async function validateForm(url, formData) {
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await page.goto(url);
    // Fill in the form.
    await page.type('#email', formData.email);
    await page.type('#password', formData.password);
    // Ask the browser for its client-side validation verdict.
    const validation = await page.evaluate(() => {
      const emailInput = document.querySelector('#email');
      const passwordInput = document.querySelector('#password');
      return {
        emailValid: emailInput.checkValidity(),
        passwordValid: passwordInput.checkValidity(),
        emailError: emailInput.validationMessage,
        passwordError: passwordInput.validationMessage
      };
    });
    console.log('Validation:', validation);
    return validation;
  } finally {
    // Always close the browser, even when a selector is missing or
    // navigation fails.
    await browser.close();
  }
}
下一步 #
现在你已经掌握了 Puppeteer 的 JavaScript 执行与页面评估功能,接下来学习 网络请求处理 了解如何拦截和修改网络请求!
最后更新:2026-03-28