Ray Tune 超参数调优 #

什么是 Ray Tune? #

Ray Tune 是 Ray 提供的可扩展超参数优化框架,用于自动化机器学习模型的超参数调优。它支持多种搜索算法和调度策略,可以高效地找到最优超参数。

text
┌─────────────────────────────────────────────────────────────┐
│                    Ray Tune 架构                             │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  ┌─────────────────────────────────────────────────────┐   │
│  │                   Tuner                              │   │
│  │  ├── 配置搜索空间                                    │   │
│  │  ├── 管理试验                                        │   │
│  │  └── 收集结果                                        │   │
│  └─────────────────────────────────────────────────────┘   │
│                          │                                  │
│                          ▼                                  │
│  ┌─────────────────────────────────────────────────────┐   │
│  │                   Search Algorithm                   │   │
│  │  ├── Grid Search                                    │   │
│  │  ├── Random Search                                  │   │
│  │  ├── Bayesian Optimization                          │   │
│  │  └── Optuna / Hyperopt                              │   │
│  └─────────────────────────────────────────────────────┘   │
│                          │                                  │
│                          ▼                                  │
│  ┌─────────────────────────────────────────────────────┐   │
│  │                   Scheduler                          │   │
│  │  ├── FIFO                                           │   │
│  │  ├── Async HyperBand                                │   │
│  │  └── Population Based Training                      │   │
│  └─────────────────────────────────────────────────────┘   │
│                          │                                  │
│                          ▼                                  │
│  ┌─────────────────────────────────────────────────────┐   │
│  │                   Trials                             │   │
│  │  ┌─────────┐  ┌─────────┐  ┌─────────┐             │   │
│  │  │ Trial 1 │  │ Trial 2 │  │ Trial 3 │  ...        │   │
│  │  └─────────┘  └─────────┘  └─────────┘             │   │
│  └─────────────────────────────────────────────────────┘   │
│                                                             │
└─────────────────────────────────────────────────────────────┘

基本用法 #

简单调优 #

python
from ray import tune

# Trainable function: Tune calls it once per trial with a config dict
# holding that trial's hyperparameter values.
def train_fn(config):
    lr = config["lr"]
    for epoch in range(10):
        # Dummy "loss" that shrinks each epoch; stands in for real training.
        loss = lr / (epoch + 1)
        tune.report({"loss": loss})  # report metrics back to Tune each epoch

# A fixed (non-searched) param_space runs a single trial with lr=0.01.
tuner = tune.Tuner(train_fn, param_space={"lr": 0.01})
results = tuner.fit()

print(f"Best result: {results.get_best_result().metrics}")

搜索空间 #

python
from ray import tune

# Search space: each key maps a hyperparameter name to a sampling
# distribution that Tune draws from once per trial.
config = {
    "lr": tune.loguniform(1e-4, 1e-1),       # log-uniform: good for learning rates
    "batch_size": tune.choice([16, 32, 64, 128]),  # pick one of the listed values
    "hidden_size": tune.randint(32, 256),    # integer in [32, 256)
    "dropout": tune.uniform(0.0, 0.5),       # uniform float in [0.0, 0.5)
    "optimizer": tune.choice(["adam", "sgd", "rmsprop"])
}

tuner = tune.Tuner(
    train_fn,
    param_space=config,
    # num_samples: how many configurations to sample from the space.
    tune_config=tune.TuneConfig(num_samples=10)
)

results = tuner.fit()

搜索空间类型 #

text
┌─────────────────────────────────────────────────────────────┐
│                    搜索空间类型                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  连续型:                                                    │
│  ├── tune.uniform(a, b): 均匀分布                           │
│  ├── tune.loguniform(a, b): 对数均匀分布                    │
│  ├── tune.randn(mean, std): 正态分布                        │
│  └── tune.quniform(a, b, q): 量化均匀分布                   │
│                                                             │
│  离散型:                                                    │
│  ├── tune.choice(options): 从选项中选择                     │
│  ├── tune.randint(a, b): 整数范围                          │
│  └── tune.qrandint(a, b, q): 量化整数                      │
│                                                             │
│  条件型:                                                    │
│  └── tune.choice() + 字典嵌套                               │
│                                                             │
└─────────────────────────────────────────────────────────────┘

搜索算法 #

网格搜索 #

python
from ray import tune

# grid_search() enumerates every listed value; the full cross product is
# tried, so this runs 3 (lr) x 3 (batch_size) = 9 trials.
config = {
    "lr": tune.grid_search([0.001, 0.01, 0.1]),
    "batch_size": tune.grid_search([16, 32, 64])
}

tuner = tune.Tuner(train_fn, param_space=config)
results = tuner.fit()

随机搜索 #

python
from ray import tune

tuner = tune.Tuner(
    train_fn,
    param_space={"lr": tune.loguniform(1e-4, 1e-1)},
    tune_config=tune.TuneConfig(
        num_samples=50,
        # BasicVariantGenerator is Tune's default random/grid sampler;
        # listing it explicitly here just makes the choice visible.
        search_alg=tune.search.BasicVariantGenerator()
    )
)

results = tuner.fit()

贝叶斯优化 #

python
from ray import tune
# Requires the `bayesian-optimization` package to be installed.
from ray.tune.search.bayesopt import BayesOptSearch

tuner = tune.Tuner(
    train_fn,
    param_space={"lr": tune.loguniform(1e-4, 1e-1)},
    tune_config=tune.TuneConfig(
        num_samples=20,
        # Bayesian optimization: models the metric surface and proposes
        # promising configurations instead of sampling blindly.
        search_alg=BayesOptSearch()
    )
)

results = tuner.fit()

Optuna 集成 #

python
from ray import tune
# Requires the `optuna` package to be installed.
from ray.tune.search.optuna import OptunaSearch

tuner = tune.Tuner(
    train_fn,
    param_space={"lr": tune.loguniform(1e-4, 1e-1)},
    tune_config=tune.TuneConfig(
        num_samples=20,
        # Delegates configuration suggestion to Optuna (TPE by default).
        search_alg=OptunaSearch()
    )
)

results = tuner.fit()

Hyperopt 集成 #

python
from ray import tune
# Requires the `hyperopt` package to be installed.
from ray.tune.search.hyperopt import HyperOptSearch

tuner = tune.Tuner(
    train_fn,
    param_space={"lr": tune.loguniform(1e-4, 1e-1)},
    tune_config=tune.TuneConfig(
        num_samples=20,
        # Delegates configuration suggestion to Hyperopt (TPE).
        search_alg=HyperOptSearch()
    )
)

results = tuner.fit()

调度策略 #

FIFO 调度器 #

python
from ray import tune
# Import schedulers from their documented public module path;
# `tune.schedulers` is not a guaranteed attribute of `ray.tune`.
from ray.tune.schedulers import FIFOScheduler

# FIFO is Tune's default scheduler: trials run to completion in
# submission order, with no early stopping.
tuner = tune.Tuner(
    train_fn,
    tune_config=tune.TuneConfig(
        scheduler=FIFOScheduler()
    )
)

results = tuner.fit()

Async HyperBand #

python
from ray import tune
# Documented public import path for schedulers.
from ray.tune.schedulers import AsyncHyperBandScheduler

# ASHA: asynchronously stops under-performing trials early so the
# budget concentrates on promising configurations.
scheduler = AsyncHyperBandScheduler(
    metric="loss",        # name of the metric passed to tune.report()
    mode="min",           # minimize the metric
    max_t=100,            # maximum training iterations per trial
    grace_period=10,      # let every trial run at least this long
    reduction_factor=3    # keep ~1/3 of trials at each halving rung
)

tuner = tune.Tuner(
    train_fn,
    tune_config=tune.TuneConfig(
        num_samples=50,
        scheduler=scheduler
    )
)

results = tuner.fit()

Population Based Training #

python
from ray import tune
# Documented public import path for schedulers.
from ray.tune.schedulers import PopulationBasedTraining

# PBT: periodically copies weights from strong trials to weak ones and
# perturbs the listed hyperparameters, so they adapt during training.
scheduler = PopulationBasedTraining(
    time_attr="training_iteration",  # clock used for perturbation timing
    metric="loss",
    mode="min",
    perturbation_interval=5,         # perturb every 5 training iterations
    hyperparam_mutations={
        # Distributions / value lists to resample or perturb from.
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": [16, 32, 64]
    }
)

tuner = tune.Tuner(
    train_fn,
    tune_config=tune.TuneConfig(
        num_samples=10,
        scheduler=scheduler
    )
)

results = tuner.fit()

调度器对比 #

text
┌─────────────────────────────────────────────────────────────┐
│                    调度器对比                                │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  FIFOScheduler                                              │
│  ├── 先进先出                                               │
│  ├── 简单可靠                                               │
│  └── 不支持早停                                             │
│                                                             │
│  AsyncHyperBandScheduler                                    │
│  ├── 异步 HyperBand                                         │
│  ├── 支持早停                                               │
│  └── 高效资源利用                                           │
│                                                             │
│  PopulationBasedTraining                                    │
│  ├── 基于种群训练                                           │
│  ├── 动态调整超参数                                         │
│  └── 适合长期训练                                           │
│                                                             │
│  MedianStoppingRule                                         │
│  ├── 中位数停止规则                                         │
│  ├── 简单早停                                               │
│  └── 低开销                                                 │
│                                                             │
└─────────────────────────────────────────────────────────────┘

早停策略 #

基于阈值 #

python
from ray import tune

# Manual early stopping inside the trainable: the trial simply returns
# once the metric crosses a threshold.
def train_fn(config):
    for epoch in range(100):
        loss = compute_loss(config)  # placeholder for real training work
        tune.report({"loss": loss})
        
        # Stop this trial early once the loss is good enough.
        if loss < 0.01:
            break

tuner = tune.Tuner(train_fn)
results = tuner.fit()

基于调度器 #

python
from ray import tune
# Documented public import path for schedulers.
from ray.tune.schedulers import AsyncHyperBandScheduler

# Scheduler-driven early stopping: Tune itself terminates trials whose
# reported "loss" lags behind, after a grace period.
scheduler = AsyncHyperBandScheduler(
    metric="loss",
    mode="min",
    max_t=100,
    grace_period=10
)

tuner = tune.Tuner(
    train_fn,
    tune_config=tune.TuneConfig(scheduler=scheduler)
)

results = tuner.fit()

结果分析 #

获取最佳结果 #

python
from ray import tune

results = tuner.fit()

# Pick the trial with the lowest reported "loss".
best_result = results.get_best_result(metric="loss", mode="min")
print(f"Best config: {best_result.config}")
print(f"Best loss: {best_result.metrics['loss']}")

# The winning hyperparameter dict, e.g. to retrain a final model with.
best_config = results.get_best_result(metric="loss", mode="min").config

结果 DataFrame #

python
import pandas as pd

results = tuner.fit()

# One row per trial: final metrics plus the sampled config columns.
df = results.get_dataframe()
print(df.head())

# Standard pandas filtering works on the result table.
df_filtered = df[df["loss"] < 0.5]
print(df_filtered)

结果可视化 #

python
from ray import tune

results = tuner.fit()

# metrics_dataframe holds the per-iteration history of the best trial;
# plot the loss curve over training iterations (uses pandas/matplotlib).
results.get_best_result().metrics_dataframe.plot(
    x="training_iteration",
    y="loss"
)

分布式执行 #

并行试验 #

python
from ray import tune

tuner = tune.Tuner(
    train_fn,
    tune_config=tune.TuneConfig(
        num_samples=100,
        # Cap how many trials run simultaneously; remaining trials queue
        # until cluster resources free up.
        max_concurrent_trials=10
    )
)

results = tuner.fit()

资源配置 #

python
from ray import tune

def train_fn(config):
    pass

# Per-trial resources are attached to the trainable with
# tune.with_resources(); tune.RunConfig has no `resources_per_trial`
# parameter, so passing it there raises a TypeError.
trainable = tune.with_resources(
    train_fn,
    # PlacementGroupFactory reserves 2 CPUs and 1 GPU for each trial.
    tune.PlacementGroupFactory(
        [{"CPU": 2, "GPU": 1}]
    )
)

tuner = tune.Tuner(
    trainable,
    tune_config=tune.TuneConfig(
        num_samples=20
    )
)

results = tuner.fit()

实验管理 #

恢复实验 #

python
from ray import tune

tuner = tune.Tuner(
    train_fn,
    # name + storage_path determine where experiment state is persisted,
    # which is what makes the experiment restorable later.
    run_config=tune.RunConfig(
        name="my_experiment",
        storage_path="/path/to/storage"
    )
)

results = tuner.fit()

# Resume an interrupted/finished experiment from its storage directory;
# the trainable must be passed in again (it is not serialized).
restored_tuner = tune.Tuner.restore(
    "/path/to/storage/my_experiment",
    trainable=train_fn
)
restored_results = restored_tuner.fit()

实验对比 #

python
from ray import tune

# Run two independent experiments with different fixed hyperparameters.
results1 = tune.Tuner(train_fn, param_space={"lr": 0.01}).fit()
results2 = tune.Tuner(train_fn, param_space={"lr": 0.001}).fit()

# Compare their best reported losses side by side.
best1 = results1.get_best_result().metrics["loss"]
best2 = results2.get_best_result().metrics["loss"]

print(f"Experiment 1: {best1}, Experiment 2: {best2}")

最佳实践 #

1. 合理设置搜索空间 #

python
from ray import tune

# Practical search space: log-uniform for the learning rate (spans
# orders of magnitude), discrete choices for sizes.
config = {
    "lr": tune.loguniform(1e-5, 1e-1),
    "batch_size": tune.choice([16, 32, 64, 128]),
    "hidden_size": tune.choice([64, 128, 256, 512])
}

2. 使用早停节省资源 #

python
from ray import tune
# Documented public import path for schedulers.
from ray.tune.schedulers import AsyncHyperBandScheduler

# Aggressive early stopping to save compute: short grace period, then
# only the best ~1/3 of trials survive each rung.
scheduler = AsyncHyperBandScheduler(
    metric="loss",
    mode="min",
    max_t=100,
    grace_period=5,
    reduction_factor=3
)

3. 并行执行加速 #

python
from ray import tune

tuner = tune.Tuner(
    train_fn,
    tune_config=tune.TuneConfig(
        num_samples=100,
        # Run up to 20 trials at once to shorten wall-clock time.
        max_concurrent_trials=20
    )
)

4. 保存检查点 #

python
import os
import tempfile

import torch
from ray import tune

# There is no tune.save_checkpoint() API; checkpoints are attached to a
# tune.report() call as a tune.Checkpoint object.
def train_fn(config):
    for epoch in range(100):
        loss = train_one_epoch(config)  # placeholder for real training

        if epoch % 10 == 0:
            # Write checkpoint files to a temp dir, then hand the dir to
            # Tune; Tune copies it into experiment storage.
            with tempfile.TemporaryDirectory() as tmpdir:
                torch.save(model.state_dict(), os.path.join(tmpdir, "model.pt"))
                tune.report(
                    {"loss": loss},
                    checkpoint=tune.Checkpoint.from_directory(tmpdir),
                )
        else:
            tune.report({"loss": loss})

下一步 #

掌握了 Ray Tune 之后,继续学习 集群管理,了解如何管理 Ray 集群!

最后更新:2026-04-05