实验跟踪 #

MLflow Tracking 概述 #

MLflow Tracking 是 MLflow 的核心组件之一，用于记录、组织和查询机器学习实验。

text

┌─────────────────────────────────────────────────────────────┐
│                   MLflow Tracking 架构                       │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  ┌─────────────────────────────────────────────────────┐   │
│  │                   客户端 API                         │   │
│  │  ├── Python API (mlflow.*)                         │   │
│  │  ├── Java API                                      │   │
│  │  ├── R API                                         │   │
│  │  └── REST API                                      │   │
│  └─────────────────────────────────────────────────────┘   │
│                          │                                  │
│                          ▼                                  │
│  ┌─────────────────────────────────────────────────────┐   │
│  │                  Tracking Server                     │   │
│  │  ├── 接收实验数据                                   │   │
│  │  ├── 存储元数据                                     │   │
│  │  └── 提供 UI 和 API                                │   │
│  └─────────────────────────────────────────────────────┘   │
│                          │                                  │
│                          ▼                                  │
│  ┌─────────────────────────────────────────────────────┐   │
│  │                    存储层                            │   │
│  │  ├── Backend Store (元数据)                         │   │
│  │  └── Artifact Store (工件)                          │   │
│  └─────────────────────────────────────────────────────┘   │
│                                                             │
└─────────────────────────────────────────────────────────────┘

核心概念 #

Experiment（实验） #

python

import mlflow

experiment_id = mlflow.create_experiment(
    name="my-experiment",
    artifact_location="s3://my-bucket/experiments",
    tags={
        "project": "customer-churn",
        "team": "data-science"
    }
)

print(f"Experiment ID: {experiment_id}")

Run（运行） #

text

┌─────────────────────────────────────────────────────────────┐
│                      Run 结构                                │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  Run ID: 唯一标识符                                         │
│  ├── abc123def456...                                       │
│                                                             │
│  Parameters: 超参数                                         │
│  ├── learning_rate: 0.01                                   │
│  ├── batch_size: 32                                        │
│  └── epochs: 100                                           │
│                                                             │
│  Metrics: 评估指标                                          │
│  ├── accuracy: 0.95                                        │
│  ├── loss: 0.05                                            │
│  └── f1_score: 0.94                                        │
│                                                             │
│  Artifacts: 输出文件                                        │
│  ├── model/ (模型文件)                                      │
│  ├── plots/ (图表)                                          │
│  └── data/ (数据)                                           │
│                                                             │
│  Tags: 标签                                                 │
│  ├── model_type: RandomForest                              │
│  └── version: 1.0                                          │
│                                                             │
│  Metadata: 元数据                                           │
│  ├── start_time: 2024-01-01 10:00:00                       │
│  ├── end_time: 2024-01-01 10:05:00                         │
│  ├── status: FINISHED                                      │
│  └── user: data-scientist                                  │
│                                                             │
└─────────────────────────────────────────────────────────────┘

参数记录 #

基本参数记录 #

python

import mlflow

with mlflow.start_run():
    mlflow.log_param("learning_rate", 0.01)
    mlflow.log_param("batch_size", 32)
    mlflow.log_param("epochs", 100)
    mlflow.log_param("optimizer", "adam")

批量记录参数 #

python

import mlflow

with mlflow.start_run():
    params = {
        "learning_rate": 0.01,
        "batch_size": 32,
        "epochs": 100,
        "optimizer": "adam",
        "dropout": 0.5,
        "hidden_units": 128
    }
    mlflow.log_params(params)

参数命名规范 #

text

┌─────────────────────────────────────────────────────────────┐
│                    参数命名最佳实践                          │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  ✅ 推荐做法：                                               │
│  ├── 使用小写字母和下划线                                   │
│  │   learning_rate, batch_size, hidden_units               │
│  │                                                         │
│  ├── 使用层级命名                                           │
│  │   model.learning_rate, optimizer.beta1                  │
│  │                                                         │
│  └── 保持一致性                                             │
│      同类参数使用相同命名风格                               │
│                                                             │
│  ❌ 避免做法：                                               │
│  ├── 使用特殊字符                                           │
│  │   learning-rate, batch size                             │
│  │                                                         │
│  ├── 过长的名称                                             │
│  │   this_is_a_very_long_parameter_name                    │
│  │                                                         │
│  └── 不一致的命名                                           │
│      lr, learningRate, Learning_Rate                       │
│                                                             │
└─────────────────────────────────────────────────────────────┘

指标追踪 #

基本指标记录 #

python

import mlflow

with mlflow.start_run():
    mlflow.log_metric("accuracy", 0.95)
    mlflow.log_metric("precision", 0.94)
    mlflow.log_metric("recall", 0.96)
    mlflow.log_metric("f1_score", 0.95)

时间序列指标 #

python

import mlflow

with mlflow.start_run():
    for epoch in range(100):
        train_loss = calculate_train_loss(epoch)
        val_loss = calculate_val_loss(epoch)
        
        mlflow.log_metric("train_loss", train_loss, step=epoch)
        mlflow.log_metric("val_loss", val_loss, step=epoch)

指标历史记录 #

python

import mlflow

with mlflow.start_run():
    mlflow.log_metric("accuracy", 0.80, step=1)
    mlflow.log_metric("accuracy", 0.85, step=2)
    mlflow.log_metric("accuracy", 0.90, step=3)
    mlflow.log_metric("accuracy", 0.95, step=4)

指标类型 #

text

┌─────────────────────────────────────────────────────────────┐
│                      指标类型                                │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  单值指标：                                                  │
│  ├── 最终结果指标                                           │
│  ├── accuracy, f1_score, auc                               │
│  └── 只记录最终值                                           │
│                                                             │
│  时间序列指标：                                              │
│  ├── 训练过程指标                                           │
│  ├── loss, learning_rate, gradient_norm                    │
│  └── 每个 step 记录一次                                     │
│                                                             │
│  系统指标：                                                  │
│  ├── 资源使用情况                                           │
│  ├── cpu_usage, memory_usage, gpu_memory                   │
│  └── 自动或手动记录                                         │
│                                                             │
└─────────────────────────────────────────────────────────────┘

工件存储 #

记录单个工件 #

python

import mlflow

with mlflow.start_run():
    mlflow.log_artifact("config.yaml")
    mlflow.log_artifact("model.pkl")
    mlflow.log_artifact("results.json")

记录目录 #

python

import mlflow

with mlflow.start_run():
    mlflow.log_artifacts("./outputs", artifact_path="outputs")
    mlflow.log_artifacts("./plots", artifact_path="plots")

记录图表 #

python

import mlflow
import matplotlib.pyplot as plt
import seaborn as sns

with mlflow.start_run():
    fig, ax = plt.subplots()
    ax.plot([1, 2, 3, 4], [1, 4, 2, 3])
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    fig.savefig("training_curve.png")
    mlflow.log_artifact("training_curve.png")
    
    plt.close(fig)

记录数据文件 #

python

import mlflow
import pandas as pd

with mlflow.start_run():
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    df.to_csv("sample_data.csv", index=False)
    mlflow.log_artifact("sample_data.csv")

工件类型 #

text

┌─────────────────────────────────────────────────────────────┐
│                      工件类型                                │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  模型文件：                                                  │
│  ├── model.pkl, model.joblib                               │
│  ├── model.h5, model.pt                                    │
│  └── MLflow 模型目录                                        │
│                                                             │
│  配置文件：                                                  │
│  ├── config.yaml, config.json                              │
│  ├── hyperparameters.json                                  │
│  └── requirements.txt                                      │
│                                                             │
│  可视化文件：                                                │
│  ├── training_curve.png                                    │
│  ├── confusion_matrix.png                                  │
│  └── feature_importance.png                                │
│                                                             │
│  数据文件：                                                  │
│  ├── sample_data.csv                                       │
│  ├── predictions.json                                      │
│  └── evaluation_results.json                               │
│                                                             │
│  文档文件：                                                  │
│  ├── README.md                                             │
│  ├── model_card.md                                         │
│  └── experiment_notes.txt                                  │
│                                                             │
└─────────────────────────────────────────────────────────────┘

标签管理 #

设置标签 #

python

import mlflow

with mlflow.start_run():
    mlflow.set_tag("model_type", "RandomForest")
    mlflow.set_tag("dataset", "customer-churn")
    mlflow.set_tag("team", "data-science")
    mlflow.set_tag("priority", "high")

批量设置标签 #

python

import mlflow

with mlflow.start_run():
    tags = {
        "model_type": "RandomForest",
        "dataset": "customer-churn",
        "team": "data-science",
        "priority": "high",
        "version": "1.0.0"
    }
    mlflow.set_tags(tags)

标签用途 #

text

┌─────────────────────────────────────────────────────────────┐
│                      标签用途                                │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  分类和过滤：                                                │
│  ├── 按团队筛选实验                                         │
│  ├── 按项目分组                                             │
│  └── 按优先级排序                                           │
│                                                             │
│  元数据记录：                                                │
│  ├── 数据集版本                                             │
│  ├── 模型类型                                               │
│  └── 实验目的                                               │
│                                                             │
│  团队协作：                                                  │
│  ├── 负责人信息                                             │
│  ├── 审核状态                                               │
│  └── 备注说明                                               │
│                                                             │
└─────────────────────────────────────────────────────────────┘

实验组织 #

实验命名规范 #

python

import mlflow

mlflow.set_experiment("customer-churn/prediction/v1")
mlflow.set_experiment("image-classification/resnet/experiment-1")
mlflow.set_experiment("nlp/sentiment-analysis/production")

实验层级结构 #

text

项目层级结构：
├── customer-churn/
│   ├── baseline/
│   ├── feature-engineering/
│   └── hyperparameter-tuning/
│
├── image-classification/
│   ├── resnet/
│   ├── vgg/
│   └── efficientnet/
│
└── nlp/
    ├── sentiment-analysis/
    └── text-classification/

搜索实验 #

python

import mlflow

experiments = mlflow.search_experiments(
    filter_string="tags.project = 'customer-churn'"
)

for exp in experiments:
    print(f"Name: {exp.name}")
    print(f"Tags: {exp.tags}")

运行查询 #

搜索运行 #

python

import mlflow

runs = mlflow.search_runs(
    experiment_ids=["1", "2"],
    filter_string="metrics.accuracy > 0.9 AND params.model_type = 'RandomForest'",
    order_by=["metrics.accuracy DESC"],
    max_results=100
)

print(runs[["run_id", "metrics.accuracy", "params.model_type"]])

搜索语法 #

text

┌─────────────────────────────────────────────────────────────┐
│                    搜索语法                                  │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  参数过滤：                                                  │
│  params.key = 'value'                                      │
│  params.key != 'value'                                     │
│                                                             │
│  指标过滤：                                                  │
│  metrics.key > 0.9                                         │
│  metrics.key >= 0.9                                        │
│  metrics.key < 0.1                                         │
│  metrics.key <= 0.1                                        │
│                                                             │
│  标签过滤：                                                  │
│  tags.key = 'value'                                        │
│  tags.key != 'value'                                       │
│                                                             │
│  组合条件：                                                  │
│  condition1 AND condition2                                 │
│  （不支持 OR）                                               │
│                                                             │
│  示例：                                                     │
│  metrics.accuracy > 0.9 AND params.model = 'rf'            │
│  tags.team = 'ml' AND tags.priority = 'high'               │
│                                                             │
└─────────────────────────────────────────────────────────────┘

获取运行详情 #

python

import mlflow

run = mlflow.get_run("run_id")

print(f"Status: {run.info.status}")
print(f"Start Time: {run.info.start_time}")
print(f"Parameters: {run.data.params}")
print(f"Metrics: {run.data.metrics}")
print(f"Tags: {run.data.tags}")

列出运行工件 #

python

import mlflow

client = mlflow.tracking.MlflowClient()

artifacts = client.list_artifacts("run_id")
for artifact in artifacts:
    print(f"Path: {artifact.path}")
    print(f"Is Directory: {artifact.is_dir}")

嵌套运行 #

父子运行 #

python

import mlflow

with mlflow.start_run(run_name="parent-run") as parent_run:
    mlflow.log_param("dataset", "iris")
    
    for fold in range(5):
        with mlflow.start_run(
            run_name=f"fold-{fold}",
            nested=True
        ) as child_run:
            mlflow.log_metric("fold_accuracy", 0.95 - fold * 0.01)
            mlflow.log_param("fold", fold)

使用场景 #

text

┌─────────────────────────────────────────────────────────────┐
│                    嵌套运行场景                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  交叉验证：                                                  │
│  ├── 父运行：整体实验                                       │
│  └── 子运行：每个 fold 的结果                               │
│                                                             │
│  超参数搜索：                                                │
│  ├── 父运行：搜索配置                                       │
│  └── 子运行：每组参数的结果                                 │
│                                                             │
│  集成学习：                                                  │
│  ├── 父运行：集成模型                                       │
│  └── 子运行：每个基学习器                                   │
│                                                             │
│  多阶段训练：                                                │
│  ├── 父运行：完整训练流程                                   │
│  └── 子运行：每个训练阶段                                   │
│                                                             │
└─────────────────────────────────────────────────────────────┘

自动记录 #

Scikit-learn 自动记录 #

python

import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

mlflow.sklearn.autolog(
    log_models=True,
    log_datasets=True,
    disable=False,
    exclusive=False,
    disable_for_unsupported_versions=False,
    silent=False,
    registered_model_name=None,
    extra_tags=None
)

with mlflow.start_run():
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)

TensorFlow 自动记录 #

python

import mlflow
import tensorflow as tf

mlflow.tensorflow.autolog(
    log_models=True,
    log_datasets=True,
    disable=False,
    exclusive=False,
    disable_for_unsupported_versions=False,
    silent=False,
    registered_model_name=None,
    every_n_iter=1
)

with mlflow.start_run():
    model.fit(X_train, y_train, epochs=10)

PyTorch Lightning 自动记录 #

python

import mlflow
import pytorch_lightning as pl

mlflow.pytorch.autolog(
    log_models=True,
    disable=False,
    exclusive=False,
    silent=False,
    registered_model_name=None
)

trainer = pl.Trainer(max_epochs=10)
trainer.fit(model)

系统指标记录 #

启用系统指标 #

python

import mlflow

mlflow.enable_system_metrics_logging()

with mlflow.start_run():
    pass

自定义系统指标 #

python

import mlflow
import psutil
import time

with mlflow.start_run():
    for i in range(100):
        cpu_percent = psutil.cpu_percent()
        memory_percent = psutil.virtual_memory().percent
        
        mlflow.log_metric("system/cpu_percent", cpu_percent, step=i)
        mlflow.log_metric("system/memory_percent", memory_percent, step=i)
        
        time.sleep(1)

最佳实践 #

1. 实验命名 #

python

import mlflow

mlflow.set_experiment(f"{project}/{task}/{version}")

2. 完整记录 #

python

import mlflow

with mlflow.start_run():
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(model, "model")
    mlflow.log_artifact("config.yaml")
    mlflow.set_tags(tags)

3. 异常处理 #

python

import mlflow

with mlflow.start_run() as run:
    try:
        train_model()
        mlflow.log_metric("status", 1)
    except Exception as e:
        mlflow.set_tag("error", str(e))
        mlflow.log_metric("status", 0)
        raise

4. 条件记录 #

python

import mlflow

with mlflow.start_run():
    if mlflow.active_run():
        mlflow.log_metric("accuracy", accuracy)

下一步 #

现在你已经掌握了 MLflow Tracking 的核心功能，接下来学习模型管理，了解如何管理和部署模型！