Ray Tune 超参数调优 #
什么是 Ray Tune? #
Ray Tune 是 Ray 提供的可扩展超参数优化框架,用于自动化机器学习模型的超参数调优。它支持多种搜索算法和调度策略,可以高效地找到最优超参数。
text
┌─────────────────────────────────────────────────────────────┐
│ Ray Tune 架构 │
├─────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ Tuner │ │
│ │ ├── 配置搜索空间 │ │
│ │ ├── 管理试验 │ │
│ │ └── 收集结果 │ │
│ └─────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ Search Algorithm │ │
│ │ ├── Grid Search │ │
│ │ ├── Random Search │ │
│ │ ├── Bayesian Optimization │ │
│ │ └── Optuna / Hyperopt │ │
│ └─────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ Scheduler │ │
│ │ ├── FIFO │ │
│ │ ├── Async HyperBand │ │
│ │ └── Population Based Training │ │
│ └─────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ Trials │ │
│ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │
│ │ │ Trial 1 │ │ Trial 2 │ │ Trial 3 │ ... │ │
│ │ └─────────┘ └─────────┘ └─────────┘ │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────┘
基本用法 #
简单调优 #
python
from ray import tune


def train_fn(config):
    """Toy trainable: reports a loss that decays with each epoch."""
    learning_rate = config["lr"]
    for epoch in range(10):
        tune.report({"loss": learning_rate / (epoch + 1)})


# A single trial with a fixed learning rate.
tuner = tune.Tuner(train_fn, param_space={"lr": 0.01})
results = tuner.fit()
print(f"Best result: {results.get_best_result().metrics}")
搜索空间 #
python
from ray import tune

# A mix of continuous and categorical search dimensions.
search_space = {
    "lr": tune.loguniform(1e-4, 1e-1),            # sampled on a log scale
    "batch_size": tune.choice([16, 32, 64, 128]),
    "hidden_size": tune.randint(32, 256),          # integer range
    "dropout": tune.uniform(0.0, 0.5),
    "optimizer": tune.choice(["adam", "sgd", "rmsprop"]),
}

# Draw 10 random configurations from the space above.
tuner = tune.Tuner(
    train_fn,
    param_space=search_space,
    tune_config=tune.TuneConfig(num_samples=10),
)
results = tuner.fit()
搜索空间类型 #
text
┌─────────────────────────────────────────────────────────────┐
│ 搜索空间类型 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 连续型: │
│ ├── tune.uniform(a, b): 均匀分布 │
│ ├── tune.loguniform(a, b): 对数均匀分布 │
│ ├── tune.randn(mean, std): 正态分布 │
│ └── tune.quniform(a, b, q): 量化均匀分布 │
│ │
│ 离散型: │
│ ├── tune.choice(options): 从选项中选择 │
│ ├── tune.randint(a, b): 整数范围 │
│ └── tune.qrandint(a, b, q): 量化整数 │
│ │
│ 条件型: │
│ └── tune.choice() + 字典嵌套 │
│ │
└─────────────────────────────────────────────────────────────┘
搜索算法 #
网格搜索 #
python
from ray import tune

# grid_search enumerates every combination: 3 lrs x 3 batch sizes = 9 trials.
grid = {
    "lr": tune.grid_search([0.001, 0.01, 0.1]),
    "batch_size": tune.grid_search([16, 32, 64]),
}

tuner = tune.Tuner(train_fn, param_space=grid)
results = tuner.fit()
随机搜索 #
python
from ray import tune

# BasicVariantGenerator is Tune's default random/grid sampler.
sampler = tune.search.BasicVariantGenerator()

# Draw 50 independent samples from the log-uniform lr distribution.
tuner = tune.Tuner(
    train_fn,
    param_space={"lr": tune.loguniform(1e-4, 1e-1)},
    tune_config=tune.TuneConfig(num_samples=50, search_alg=sampler),
)
results = tuner.fit()
贝叶斯优化 #
python
from ray import tune
from ray.tune.search.bayesopt import BayesOptSearch

# Bayesian optimization models the metric surface and proposes
# promising configurations instead of sampling blindly.
bayes = BayesOptSearch()

tuner = tune.Tuner(
    train_fn,
    param_space={"lr": tune.loguniform(1e-4, 1e-1)},
    tune_config=tune.TuneConfig(num_samples=20, search_alg=bayes),
)
results = tuner.fit()
Optuna 集成 #
python
from ray import tune
from ray.tune.search.optuna import OptunaSearch

# Delegate configuration sampling to Optuna.
optuna_alg = OptunaSearch()

tuner = tune.Tuner(
    train_fn,
    param_space={"lr": tune.loguniform(1e-4, 1e-1)},
    tune_config=tune.TuneConfig(num_samples=20, search_alg=optuna_alg),
)
results = tuner.fit()
Hyperopt 集成 #
python
from ray import tune
from ray.tune.search.hyperopt import HyperOptSearch

# Delegate configuration sampling to Hyperopt (TPE).
hyperopt_alg = HyperOptSearch()

tuner = tune.Tuner(
    train_fn,
    param_space={"lr": tune.loguniform(1e-4, 1e-1)},
    tune_config=tune.TuneConfig(num_samples=20, search_alg=hyperopt_alg),
)
results = tuner.fit()
调度策略 #
FIFO 调度器 #
python
from ray import tune

# FIFO is the default scheduler: trials run to completion
# in submission order, with no early stopping.
tuner = tune.Tuner(
    train_fn,
    tune_config=tune.TuneConfig(scheduler=tune.schedulers.FIFOScheduler()),
)
results = tuner.fit()
Async HyperBand #
python
from ray import tune

# Asynchronous HyperBand: terminates under-performing trials early
# so their resources go to more promising ones.
asha = tune.schedulers.AsyncHyperBandScheduler(
    metric="loss",
    mode="min",
    max_t=100,            # maximum training iterations per trial
    grace_period=10,      # never stop a trial before 10 iterations
    reduction_factor=3,   # roughly the top 1/3 survive each rung
)

tuner = tune.Tuner(
    train_fn,
    tune_config=tune.TuneConfig(num_samples=50, scheduler=asha),
)
results = tuner.fit()
Population Based Training #
python
from ray import tune

# Population Based Training: periodically clones strong trials and
# perturbs their hyperparameters while training continues.
pbt = tune.schedulers.PopulationBasedTraining(
    time_attr="training_iteration",
    metric="loss",
    mode="min",
    perturbation_interval=5,  # consider exploit/explore every 5 iterations
    hyperparam_mutations={
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": [16, 32, 64],
    },
)

tuner = tune.Tuner(
    train_fn,
    tune_config=tune.TuneConfig(num_samples=10, scheduler=pbt),
)
results = tuner.fit()
调度器对比 #
text
┌─────────────────────────────────────────────────────────────┐
│ 调度器对比 │
├─────────────────────────────────────────────────────────────┤
│ │
│ FIFOScheduler │
│ ├── 先进先出 │
│ ├── 简单可靠 │
│ └── 不支持早停 │
│ │
│ AsyncHyperBandScheduler │
│ ├── 异步 HyperBand │
│ ├── 支持早停 │
│ └── 高效资源利用 │
│ │
│ PopulationBasedTraining │
│ ├── 基于种群训练 │
│ ├── 动态调整超参数 │
│ └── 适合长期训练 │
│ │
│ MedianStoppingRule │
│ ├── 中位数停止规则 │
│ ├── 简单早停 │
│ └── 低开销 │
│ │
└─────────────────────────────────────────────────────────────┘
早停策略 #
基于阈值 #
python
from ray import tune


def train_fn(config):
    """Report the loss each epoch; end the trial once it is low enough."""
    for epoch in range(100):
        loss = compute_loss(config)
        tune.report({"loss": loss})
        if loss < 0.01:
            break  # good enough -- stop this trial early


results = tune.Tuner(train_fn).fit()
基于调度器 #
python
from ray import tune

# Let the scheduler decide when to terminate weak trials.
early_stopper = tune.schedulers.AsyncHyperBandScheduler(
    metric="loss",
    mode="min",
    max_t=100,
    grace_period=10,
)

tuner = tune.Tuner(
    train_fn,
    tune_config=tune.TuneConfig(scheduler=early_stopper),
)
results = tuner.fit()
结果分析 #
获取最佳结果 #
python
from ray import tune

results = tuner.fit()

# Pick the trial that minimized the reported loss.
best_result = results.get_best_result(metric="loss", mode="min")
print(f"Best config: {best_result.config}")
print(f"Best loss: {best_result.metrics['loss']}")
best_config = best_result.config  # reuse the result; no need to query twice
结果 DataFrame #
python
import pandas as pd

results = tuner.fit()

# One row per trial, with final metrics and the sampled config.
trials_df = results.get_dataframe()
print(trials_df.head())

# Keep only the trials that reached a low loss.
low_loss = trials_df[trials_df["loss"] < 0.5]
print(low_loss)
结果可视化 #
python
from ray import tune

results = tuner.fit()

# Plot the best trial's loss curve (via pandas' plotting backend).
best = results.get_best_result()
best.metrics_dataframe.plot(x="training_iteration", y="loss")
分布式执行 #
并行试验 #
python
from ray import tune

# Run at most 10 trials concurrently until all 100 samples finish.
tuner = tune.Tuner(
    train_fn,
    tune_config=tune.TuneConfig(
        num_samples=100,
        max_concurrent_trials=10,
    ),
)
results = tuner.fit()
资源配置 #
python
from ray import tune


def train_fn(config):
    pass


# Per-trial resources are attached to the trainable itself with
# tune.with_resources(); `resources_per_trial` is NOT a valid
# tune.RunConfig argument in Ray 2.x and would raise a TypeError.
trainable = tune.with_resources(
    train_fn,
    tune.PlacementGroupFactory([{"CPU": 2, "GPU": 1}]),
)

tuner = tune.Tuner(
    trainable,
    tune_config=tune.TuneConfig(num_samples=20),
)
results = tuner.fit()
实验管理 #
恢复实验 #
python
from ray import tune

# Name the experiment and give it a persistent storage path so it
# can be restored after an interruption.
tuner = tune.Tuner(
    train_fn,
    run_config=tune.RunConfig(
        name="my_experiment",
        storage_path="/path/to/storage",
    ),
)
results = tuner.fit()

# Later: pick the experiment up exactly where it left off.
restored_tuner = tune.Tuner.restore(
    "/path/to/storage/my_experiment",
    trainable=train_fn,
)
restored_results = restored_tuner.fit()
实验对比 #
python
from ray import tune

# Run the same trainable under two fixed learning rates and
# compare the best loss each experiment achieved.
losses = []
for lr in (0.01, 0.001):
    outcome = tune.Tuner(train_fn, param_space={"lr": lr}).fit()
    losses.append(outcome.get_best_result().metrics["loss"])

best1, best2 = losses
print(f"Experiment 1: {best1}, Experiment 2: {best2}")
最佳实践 #
1. 合理设置搜索空间 #
python
from ray import tune

# Prefer a log scale for rates and small discrete menus for sizes --
# keeps the space broad while staying tractable.
config = {
    "lr": tune.loguniform(1e-5, 1e-1),
    "batch_size": tune.choice([16, 32, 64, 128]),
    "hidden_size": tune.choice([64, 128, 256, 512]),
}
2. 使用早停节省资源 #
python
from ray import tune

# Early stopping frees resources for more promising trials.
scheduler = tune.schedulers.AsyncHyperBandScheduler(
    metric="loss",
    mode="min",
    max_t=100,           # hard cap on iterations per trial
    grace_period=5,      # minimum iterations before a trial may be stopped
    reduction_factor=3,
)
3. 并行执行加速 #
python
from ray import tune

# More concurrent trials shorten wall-clock time,
# bounded by the cluster's available resources.
tuner = tune.Tuner(
    train_fn,
    tune_config=tune.TuneConfig(
        num_samples=100,
        max_concurrent_trials=20,
    ),
)
4. 保存检查点 #
python
import tempfile

from ray import tune


def train_fn(config):
    """Report metrics each epoch and attach a checkpoint every 10 epochs.

    `tune.save_checkpoint` does not exist in the Ray Tune API. The
    supported pattern is to write checkpoint files into a directory and
    pass a `tune.Checkpoint` alongside the metrics via `tune.report`.
    """
    for epoch in range(100):
        loss = train_one_epoch(config)
        if epoch % 10 == 0:
            # Persist state periodically so the trial can be resumed.
            with tempfile.TemporaryDirectory() as tmpdir:
                # Write model state into tmpdir here, e.g.:
                # torch.save(model.state_dict(), os.path.join(tmpdir, "model.pt"))
                tune.report(
                    {"loss": loss},
                    checkpoint=tune.Checkpoint.from_directory(tmpdir),
                )
        else:
            tune.report({"loss": loss})
下一步 #
掌握了 Ray Tune 之后,继续学习 集群管理,了解如何管理 Ray 集群!
最后更新:2026-04-05