TensorFlow.js Optimization Algorithms #
Optimizer Overview #
Optimizers are the core component of neural-network training: they update the model's parameters based on the gradients of the loss.
text
┌─────────────────────────────────────────────────────────────┐
│                     What an optimizer does                   │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  ┌──────────────┐    ┌──────────────┐    ┌──────────────┐   │
│  │ Compute      │ -> │ Optimizer    │ -> │ Update       │   │
│  │ gradient     │    │ step         │    │ parameters   │   │
│  │ ∂L/∂w        │    │ w = w - η∇L  │    │ (new weights)│   │
│  └──────────────┘    └──────────────┘    └──────────────┘   │
│                                                             │
│  Key factors:                                               │
│  - Learning rate (η): step size                             │
│  - Momentum: accelerates convergence                        │
│  - Adaptivity: per-parameter learning-rate adjustment       │
│                                                             │
└─────────────────────────────────────────────────────────────┘
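This cycle can be run directly with the Core API. The sketch below is a toy example (a single scalar variable w with loss L = w²) showing optimizer.minimize performing one compute-gradient-and-update step:
javascript
// Toy example: one optimization step on a single scalar parameter
const w = tf.variable(tf.scalar(5.0));   // the parameter to optimize
const optimizer = tf.train.sgd(0.1);     // η = 0.1

// minimize() computes ∂L/∂w and applies w = w - η∇L in one call
optimizer.minimize(() => w.square());    // loss L = w²
w.print();                               // prints 4, moving toward the minimum at 0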
Common Optimizers #
SGD (Stochastic Gradient Descent) #
The most basic optimizer: it updates parameters directly along the negative gradient direction.
javascript
const sgd = tf.train.sgd(0.01);
model.compile({
optimizer: sgd,
loss: 'meanSquaredError'
});
SGD with Momentum #
javascript
const sgdMomentum = tf.train.momentum(0.01, 0.9);
model.compile({
optimizer: sgdMomentum,
loss: 'meanSquaredError'
});
Parameters #
| Parameter | Description | Typical value |
|---|---|---|
| learningRate | Learning rate | 0.01 |
| momentum | Momentum coefficient | 0.9 |
| useNesterov | Whether to use Nesterov momentum | false |
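Nesterov momentum evaluates the gradient at the look-ahead position and often converges slightly faster; it is enabled through the third argument:
javascript
// Same optimizer with Nesterov momentum enabled
const nesterov = tf.train.momentum(0.01, 0.9, true);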
Adam #
The most widely used adaptive optimizer; it combines momentum with per-parameter adaptive learning rates.
javascript
const adam = tf.train.adam(0.001);
model.compile({
optimizer: adam,
loss: 'categoricalCrossentropy'
});
Full Configuration #
javascript
const adam = tf.train.adam(
  0.001,   // learningRate
  0.9,     // beta1
  0.999,   // beta2
  1e-8     // epsilon
);
Parameters #
| Parameter | Description | Default |
|---|---|---|
| learningRate | Learning rate | 0.001 |
| beta1 | Decay rate for the first-moment estimates | 0.9 |
| beta2 | Decay rate for the second-moment estimates | 0.999 |
| epsilon | Numerical-stability constant | 1e-8 |
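If these defaults are acceptable, a Layers model also accepts the optimizer by name; the shorthand below is equivalent to passing tf.train.adam() with its default arguments:
javascript
// Shorthand: optimizer name instead of an instance (uses the defaults above)
model.compile({
  optimizer: 'adam',
  loss: 'categoricalCrossentropy'
});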
Adamax #
A variant of Adam based on the infinity norm of the gradients; it is less sensitive to the choice of learning rate.
javascript
const adamax = tf.train.adamax(0.002, 0.9, 0.999, 1e-8);
model.compile({
optimizer: adamax,
loss: 'meanSquaredError'
});
RMSProp #
An adaptive-learning-rate optimizer well suited to non-stationary objectives.
javascript
const rmsprop = tf.train.rmsprop(0.001, 0.9, 0.0, 1e-8);
model.compile({
optimizer: rmsprop,
loss: 'meanSquaredError'
});
Parameters #
| Parameter | Description | Default |
|---|---|---|
| learningRate | Learning rate | 0.001 |
| decay | Decay rate | 0.9 |
| momentum | Momentum | 0.0 |
| epsilon | Numerical-stability constant | 1e-8 |
Adagrad #
An adaptive-gradient algorithm, well suited to sparse data.
javascript
const adagrad = tf.train.adagrad(0.01);
model.compile({
optimizer: adagrad,
loss: 'meanSquaredError'
});
Adadelta #
An improved version of Adagrad that fixes its ever-shrinking learning rate.
javascript
const adadelta = tf.train.adadelta(1.0, 0.95, 1e-6);
model.compile({
optimizer: adadelta,
loss: 'meanSquaredError'
});
Nadam #
Nadam combines Adam with Nesterov momentum. Note that tf.train in TensorFlow.js currently provides no built-in Nadam optimizer (unlike Keras in Python); the closest built-in option for Nesterov-style updates is momentum SGD with useNesterov enabled:
javascript
// tf.train has no nadam(); Nesterov momentum via tf.train.momentum is the closest built-in
const nesterovSgd = tf.train.momentum(0.001, 0.9, true);
model.compile({
  optimizer: nesterovSgd,
  loss: 'meanSquaredError'
});
Optimizer Comparison #
Performance Comparison #
text
┌─────────────────────────────────────────────────────────────┐
│               Optimizer performance comparison               │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  Convergence speed: Adam > Nadam > RMSProp > SGD+Momentum   │
│                                                             │
│  Memory footprint:  Adam ≈ RMSProp > SGD                    │
│                                                             │
│  Generalization:    SGD > Adam > RMSProp                    │
│                                                             │
│  Typical use cases:                                         │
│  - SGD: simple models, best generalization                  │
│  - Adam: deep networks, fast convergence                    │
│  - RMSProp: RNNs, non-stationary objectives                 │
│  - Adagrad: sparse data                                     │
│                                                             │
└─────────────────────────────────────────────────────────────┘
Selection Guide #
| Scenario | Recommended optimizer | Learning rate |
|---|---|---|
| Image-classification CNN | Adam | 0.001 |
| NLP tasks | Adam | 0.0001 |
| RNN/LSTM | RMSProp | 0.001 |
| Simple MLP | SGD + Momentum | 0.01 |
| Transfer learning | SGD + Momentum | 0.001 |
Learning-Rate Scheduling #
Constant Learning Rate #
javascript
const optimizer = tf.train.adam(0.001);
Manual Learning-Rate Decay #
javascript
let learningRate = 0.1;
const optimizer = tf.train.adam(learningRate);
model.compile({ optimizer, loss: 'meanSquaredError' });  // compile with this optimizer first
const history = await model.fit(xs, ys, {
  epochs: 50,
  callbacks: {
    onEpochEnd: (epoch) => {
      if (epoch === 20 || epoch === 35) {
        learningRate *= 0.1;
        // Built-in optimizers expose learningRate as a mutable property
        optimizer.learningRate = learningRate;
        console.log(`Learning rate set to: ${learningRate}`);
      }
    }
  }
});
Exponential Decay #
javascript
// Returns a function mapping a global step to the decayed learning rate
function exponentialDecay(initialLR, decayRate, decaySteps) {
  return (step) => initialLR * Math.pow(decayRate, step / decaySteps);
}
const lrSchedule = exponentialDecay(0.1, 0.96, 1000);
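A step-indexed schedule like this can be wired into model.fit with a batch callback. A sketch, assuming the model is already compiled and relying on learningRate being a mutable property of the built-in optimizers:
javascript
// Apply the schedule during fit() via a per-batch callback
let step = 0;
await model.fit(xs, ys, {
  epochs: 10,
  callbacks: {
    onBatchEnd: async () => {
      step += 1;
      model.optimizer.learningRate = lrSchedule(step);
    }
  }
});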
Cosine Annealing #
javascript
function cosineAnnealing(initialLR, minLR, totalSteps) {
return (step) => {
const progress = step / totalSteps;
return minLR + 0.5 * (initialLR - minLR) * (1 + Math.cos(Math.PI * progress));
};
}
Learning-Rate Warmup #
javascript
// Assumes the model is already compiled; LR ramps up linearly for warmupEpochs, then holds
async function trainWithWarmup(model, xs, ys, config) {
  const { warmupEpochs, targetLR, totalEpochs } = config;
for (let epoch = 0; epoch < totalEpochs; epoch++) {
let lr;
if (epoch < warmupEpochs) {
lr = targetLR * (epoch + 1) / warmupEpochs;
} else {
lr = targetLR;
}
model.optimizer.learningRate = lr;
await model.fit(xs, ys, {
epochs: 1,
verbose: 0
});
console.log(`Epoch ${epoch + 1}, LR: ${lr.toFixed(6)}`);
}
}
Cyclical Learning Rate #
javascript
// Triangular cyclical schedule: the LR bounces linearly between minLR and maxLR
function cyclicLR(minLR, maxLR, stepSize) {
return (step) => {
const cycle = Math.floor(1 + step / (2 * stepSize));
const x = Math.abs(step / stepSize - 2 * cycle + 1);
return minLR + (maxLR - minLR) * Math.max(0, 1 - x);
};
}
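A quick sanity check is to print a few values; with stepSize = 500 the learning rate should climb from minLR to maxLR over 500 steps and fall back over the next 500:
javascript
const schedule = cyclicLR(1e-4, 1e-2, 500);
[0, 250, 500, 750, 1000].forEach((s) => {
  console.log(`step ${s}: lr = ${schedule(s).toExponential(2)}`);
});
// step 0 -> 1e-4, step 500 -> 1e-2, step 1000 -> back to 1e-4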
Gradient Clipping #
Clipping by Value #
Unlike Keras in Python, TensorFlow.js optimizer constructors take no clipValue/clipNorm arguments (tf.train.adam accepts only learningRate, beta1, beta2 and epsilon), so clipping must be applied to the gradients yourself. Clipping by value bounds each gradient element; given a gradient tensor grad:
javascript
// Bound every gradient element to [-1, 1]
const clippedGrad = tf.clipByValue(grad, -1.0, 1.0);
Clipping by Norm #
tfjs-core has no built-in clipByNorm op, but norm clipping is only a few lines: rescale the gradient whenever its L2 norm exceeds a threshold, which preserves its direction:
javascript
// Rescale g so that its L2 norm is at most maxNorm (direction is preserved)
function clipByNorm(g, maxNorm) {
  return tf.tidy(() => {
    const norm = g.norm();  // ||g||₂
    const scale = tf.minimum(tf.scalar(1), tf.scalar(maxNorm).div(norm.add(1e-12)));
    return g.mul(scale);
  });
}
Manual Gradient Clipping #
javascript
const optimizer = tf.train.adam(0.001);

function trainStep(x, y) {
  tf.tidy(() => {
    // Compute the loss and the per-variable gradients
    const { grads } = tf.variableGrads(() => {
      const pred = model.predict(x);
      return tf.losses.meanSquaredError(y, pred);
    });
    // Clip every gradient by norm before applying it (clipByNorm defined above)
    const clippedGrads = {};
    for (const name in grads) {
      clippedGrads[name] = clipByNorm(grads[name], 1.0);
    }
    optimizer.applyGradients(clippedGrads);
  });
}
Weight Decay #
L2 Regularization #
javascript
const model = tf.sequential({
layers: [
tf.layers.dense({
units: 64,
inputShape: [10],
activation: 'relu',
kernelRegularizer: tf.regularizers.l2({ l2: 0.01 })
}),
tf.layers.dense({
units: 1,
kernelRegularizer: tf.regularizers.l2({ l2: 0.01 })
})
]
});
L1 Regularization #
javascript
tf.layers.dense({
units: 64,
kernelRegularizer: tf.regularizers.l1({ l1: 0.01 })
});
L1-L2 Regularization #
javascript
tf.layers.dense({
units: 64,
kernelRegularizer: tf.regularizers.l1l2({ l1: 0.01, l2: 0.01 })
});
Custom Optimizers #
Creating a Custom Optimizer #
javascript
class CustomOptimizer extends tf.Optimizer {
  constructor(learningRate) {
    super();
    this.learningRate = learningRate;
  }

  // variableGradients maps registered-variable names to gradient tensors
  applyGradients(variableGradients) {
    for (const name in variableGradients) {
      const gradient = variableGradients[name];
      // Look up the variable by name (there is no tf.Variable.find)
      const variable = tf.engine().registeredVariables[name];
      tf.tidy(() => {
        // Plain gradient descent: w = w - η∇L
        variable.assign(variable.sub(gradient.mul(this.learningRate)));
      });
    }
  }

  getConfig() {
    return { learningRate: this.learningRate };
  }

  static get className() {
    return 'CustomOptimizer';
  }
}
tf.serialization.registerClass(CustomOptimizer);
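A custom optimizer plugs into a Layers model just like a built-in one (this sketch assumes model, xs and ys already exist):
javascript
// Use the custom optimizer exactly like a built-in one
const customOpt = new CustomOptimizer(0.01);
model.compile({ optimizer: customOpt, loss: 'meanSquaredError' });
await model.fit(xs, ys, { epochs: 5 });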
Optimizer State #
Getting Optimizer State #
javascript
const optimizer = tf.train.adam(0.001);
// getWeights() is async; once training has run, the state includes the
// iteration count and moment estimates
const weights = await optimizer.getWeights();
console.log(weights);
Setting Optimizer State #
javascript
await optimizer.setWeights(weights);  // setWeights() is async as well
Training Tips #
Learning-Rate Finder #
javascript
async function findLearningRate(model, xs, ys, minLR = 1e-6, maxLR = 10, steps = 100) {
  const lrMult = Math.pow(maxLR / minLR, 1 / steps);
  let lr = minLR;
  const losses = [];
  const learningRates = [];
  const batchSize = 32;
  const numBatches = Math.floor(xs.shape[0] / batchSize);
  for (let i = 0; i < steps; i++) {
    model.optimizer.learningRate = lr;
    const start = (i % numBatches) * batchSize;  // wrap around instead of slicing past the data
    const loss = await model.trainOnBatch(       // trainOnBatch is async
      xs.slice(start, batchSize),
      ys.slice(start, batchSize)
    );
    losses.push(loss);
    learningRates.push(lr);
    lr *= lrMult;
  }
  return { learningRates, losses };
}
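A usage sketch (assuming xs/ys hold enough examples): run the finder, print or plot the curve, and pick a learning rate from the region where the loss drops fastest, before it blows up:
javascript
const { learningRates, losses } = await findLearningRate(model, xs, ys);
learningRates.forEach((lr, i) => {
  console.log(`lr=${lr.toExponential(2)}  loss=${losses[i]}`);
});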
Differential Learning Rates #
javascript
// Sketch: apply smaller updates to the base (earlier) layers than to the head
// by scaling down the base layers' gradients before applying them.
const baseLayers = model.layers.slice(0, -2);
const optimizer = tf.train.adam(0.001);

function trainStepWithDifferentialLR(x, y, baseScale = 0.1) {
  tf.tidy(() => {
    const { grads } = tf.variableGrads(() => {
      const pred = model.predict(x);
      return tf.losses.meanSquaredError(y, pred);
    });
    const scaledGrads = {};
    for (const name in grads) {
      // Variable names are prefixed with their layer's name, e.g. "dense_Dense1/kernel"
      const isBase = baseLayers.some((layer) => name.startsWith(layer.name));
      scaledGrads[name] = isBase ? grads[name].mul(baseScale) : grads[name];
    }
    optimizer.applyGradients(scaledGrads);
  });
}
Freezing Layers #
javascript
// Freeze the first two layers, then recompile so the change takes effect
model.layers[0].trainable = false;
model.layers[1].trainable = false;
model.compile({
optimizer: tf.train.adam(0.001),
loss: 'categoricalCrossentropy'
});
Optimizer Configuration Best Practices #
Configuration Template #
javascript
const config = {
optimizer: tf.train.adam(0.001),
loss: 'categoricalCrossentropy',
metrics: ['accuracy']
};
model.compile(config);
Training Configuration #
javascript
const trainConfig = {
epochs: 100,
batchSize: 32,
validationSplit: 0.2,
callbacks: [
tf.callbacks.earlyStopping({
monitor: 'val_loss',
patience: 10
})
]
};
await model.fit(xs, ys, trainConfig);
Monitoring the Optimizer #
Monitoring Gradients #
javascript
function monitorGradients(model, x, y) {
  // tf.tidy frees the loss, gradients and intermediate scalars when done
  tf.tidy(() => {
    const { grads } = tf.variableGrads(() => {
      const pred = model.predict(x);
      return tf.losses.meanSquaredError(y, pred);
    });
    for (const name in grads) {
      const grad = grads[name];
      const { mean, variance } = tf.moments(grad);  // tfjs tensors have no std() method
      console.log(`${name}:`);
      console.log(`  mean: ${mean.dataSync()[0]}`);
      console.log(`  std:  ${Math.sqrt(variance.dataSync()[0])}`);
      console.log(`  max:  ${grad.max().dataSync()[0]}`);
      console.log(`  min:  ${grad.min().dataSync()[0]}`);
    }
  });
}
Monitoring Parameter Updates #
javascript
function monitorUpdates(model, beforeWeights) {
  const afterWeights = model.getWeights();
  for (let i = 0; i < beforeWeights.length; i++) {
    tf.tidy(() => {
      const diff = afterWeights[i].sub(beforeWeights[i]);
      console.log(`Layer ${i} update magnitude: ${diff.norm().dataSync()[0]}`);
    });
  }
}
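A usage sketch: snapshot the weights before a training step (clone them, since training mutates the variables in place), then measure the update. xBatch/yBatch are assumed batch tensors:
javascript
const before = model.getWeights().map((w) => w.clone());
await model.trainOnBatch(xBatch, yBatch);
monitorUpdates(model, before);
tf.dispose(before);  // free the snapshots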
Next Steps #
Now that you have a handle on optimization algorithms, continue with Convolutional Neural Networks to dig deeper into image processing!
Last updated: 2026-03-29