生产环境部署 #
生产架构概述 #
text
┌─────────────────────────────────────────────────────────────┐
│ MLflow 生产架构 │
├─────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ 负载均衡层 │ │
│ │ ┌─────────────────────────────────────────────┐ │ │
│ │ │ Nginx / Load Balancer │ │ │
│ │ └─────────────────────────────────────────────┘ │ │
│ └─────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ 应用层 │ │
│ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │
│ │ │MLflow-1 │ │MLflow-2 │ │MLflow-3 │ │ │
│ │ │ Server │ │ Server │ │ Server │ │ │
│ │ └─────────┘ └─────────┘ └─────────┘ │ │
│ └─────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ 数据层 │ │
│ │ ┌─────────────┐ ┌─────────────┐ │ │
│ │ │ PostgreSQL │ │ S3/ │ │ │
│ │ │ (主从复制) │ │ MinIO │ │ │
│ │ └─────────────┘ └─────────────┘ │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────┘
高可用部署 #
Docker Compose 部署 #
yaml
version: '3.8'

# Shared definition for the stateless MLflow server replicas.
# Expanded into mlflow-1 / mlflow-2 below via a YAML anchor.
x-mlflow-common: &mlflow-common
  image: ghcr.io/mlflow/mlflow:v2.10.0
  # NOTE(review): the DB password is hardcoded in the URI; in production inject
  # it via an env var or secret instead of committing it to the compose file.
  command: >
    mlflow server
    --backend-store-uri postgresql://mlflow:password@postgres:5432/mlflow
    --default-artifact-root s3://mlflow-artifacts/mlruns
    --host 0.0.0.0
    --port 5000
    --workers 4
    --gunicorn-opts "--timeout 120"
  environment:
    - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
    - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
    - MLFLOW_S3_ENDPOINT_URL=${MLFLOW_S3_ENDPOINT_URL}
  depends_on:
    - postgres
    - minio
  networks:
    - mlflow-network
  healthcheck:
    # The official mlflow image is python-slim based and does not ship curl,
    # so a curl-based check would always fail; probe with the bundled Python.
    test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health')"]
    interval: 30s
    timeout: 10s
    retries: 3

services:
  # TLS-terminating reverse proxy / load balancer in front of the replicas.
  nginx:
    image: nginx:latest
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf
      - ./ssl:/etc/nginx/ssl
    depends_on:
      - mlflow-1
      - mlflow-2
    networks:
      - mlflow-network

  # Two identical tracking-server replicas behind nginx.
  mlflow-1: *mlflow-common
  mlflow-2: *mlflow-common

  # Primary backend store.
  postgres:
    image: postgres:14
    environment:
      - POSTGRES_USER=mlflow
      - POSTGRES_PASSWORD=password
      - POSTGRES_DB=mlflow
    volumes:
      - postgres_data:/var/lib/postgresql/data
    networks:
      - mlflow-network
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U mlflow"]
      interval: 10s
      timeout: 5s
      retries: 5

  # NOTE(review): despite the name, no streaming replication is configured
  # here -- this is an independent instance. Real read replicas need
  # primary_conninfo / standby configuration or an operator-managed setup.
  postgres-replica:
    image: postgres:14
    environment:
      - POSTGRES_USER=mlflow
      - POSTGRES_PASSWORD=password
      - POSTGRES_DB=mlflow
    volumes:
      - postgres_replica_data:/var/lib/postgresql/data
    networks:
      - mlflow-network

  # S3-compatible artifact store.
  # NOTE(review): default minioadmin credentials -- override in production.
  minio:
    image: minio/minio:latest
    command: server /data --console-address ":9001"
    environment:
      - MINIO_ROOT_USER=minioadmin
      - MINIO_ROOT_PASSWORD=minioadmin
    volumes:
      - minio_data:/data
    ports:
      - "9000:9000"
      - "9001:9001"
    networks:
      - mlflow-network

networks:
  mlflow-network:
    driver: bridge

volumes:
  postgres_data:
  postgres_replica_data:
  minio_data:
Kubernetes 部署 #
yaml
# MLflow tracking server: Deployment + Service + HPA + Ingress.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mlflow-server
  namespace: mlflow
spec:
  replicas: 3
  selector:
    matchLabels:
      app: mlflow
  template:
    metadata:
      labels:
        app: mlflow
    spec:
      containers:
        - name: mlflow
          image: ghcr.io/mlflow/mlflow:v2.10.0
          command: ["mlflow", "server"]
          args:
            # $(VAR) is expanded by Kubernetes from the env section below,
            # so DB credentials never appear literally in the manifest.
            - --backend-store-uri
            - "postgresql://$(DB_USER):$(DB_PASSWORD)@postgres:5432/mlflow"
            - --default-artifact-root
            - "s3://mlflow-artifacts/mlruns"
            - --host
            - "0.0.0.0"
            - --port
            - "5000"
            - --workers
            - "4"
          env:
            - name: DB_USER
              valueFrom:
                secretKeyRef:
                  name: mlflow-secrets
                  key: db-user
            - name: DB_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: mlflow-secrets
                  key: db-password
            - name: AWS_ACCESS_KEY_ID
              valueFrom:
                secretKeyRef:
                  name: mlflow-secrets
                  key: aws-access-key
            - name: AWS_SECRET_ACCESS_KEY
              valueFrom:
                secretKeyRef:
                  name: mlflow-secrets
                  key: aws-secret-key
          ports:
            - containerPort: 5000
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "1000m"
          livenessProbe:
            httpGet:
              path: /health
              port: 5000
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /health
              port: 5000
            initialDelaySeconds: 5
            periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
  name: mlflow-service
  namespace: mlflow
spec:
  selector:
    app: mlflow
  ports:
    - port: 5000
      targetPort: 5000
  type: ClusterIP
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: mlflow-hpa
  namespace: mlflow
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: mlflow-server
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: mlflow-ingress
  namespace: mlflow
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
spec:
  tls:
    - hosts:
        - mlflow.example.com
      secretName: mlflow-tls
  rules:
    - host: mlflow.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: mlflow-service
                port:
                  number: 5000
CI/CD 集成 #
GitHub Actions #
yaml
name: ML Pipeline

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  # Train on every push/PR; register only on main.
  train:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install mlflow
      - name: Train model
        env:
          MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_TRACKING_URI }}
          MLFLOW_TRACKING_USERNAME: ${{ secrets.MLFLOW_USERNAME }}
          MLFLOW_TRACKING_PASSWORD: ${{ secrets.MLFLOW_PASSWORD }}
        run: |
          python train.py
      - name: Register model
        if: github.ref == 'refs/heads/main'
        env:
          MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_TRACKING_URI }}
        run: |
          python register_model.py

  # Deploy only after a successful training run on main.
  deploy:
    needs: train
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Deploy to production
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |
          python deploy.py --stage production
GitLab CI #
yaml
stages:
  - train
  - validate
  - deploy

train_model:
  stage: train
  image: python:3.10
  script:
    - pip install -r requirements.txt
    - pip install mlflow
    - python train.py
  artifacts:
    paths:
      - models/
    expire_in: 1 week

validate_model:
  stage: validate
  image: python:3.10
  script:
    - pip install mlflow
    - python validate.py
  needs:
    - train_model

# Manual gate: production deploys run only from main and require a click.
deploy_production:
  stage: deploy
  image: python:3.10
  script:
    - pip install mlflow
    - python deploy.py --stage production
  needs:
    - validate_model
  only:
    - main
  when: manual
监控与告警 #
Prometheus 配置 #
yaml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # NOTE(review): the MLflow server does not expose /metrics out of the box;
  # this assumes the custom exporter defined below is mounted on the service.
  - job_name: 'mlflow'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['mlflow-service:5000']
自定义指标导出 #
python
from prometheus_client import Counter, Histogram, Gauge, start_http_server
import mlflow
# Total predictions served, partitioned by model name/version.
PREDICTION_COUNT = Counter(
    'mlflow_predictions_total',
    'Total number of predictions',
    ['model_name', 'model_version']
)

# Distribution of per-prediction latency, per model.
PREDICTION_LATENCY = Histogram(
    'mlflow_prediction_latency_seconds',
    'Prediction latency in seconds',
    ['model_name']
)

# Most recently reported accuracy, per model name/version.
MODEL_ACCURACY = Gauge(
    'mlflow_model_accuracy',
    'Model accuracy',
    ['model_name', 'model_version']
)

# Expose the Prometheus scrape endpoint on :9090 (module-level side effect).
start_http_server(9090)


def track_prediction(model_name, model_version, latency, accuracy=None):
    """Record one prediction event in the Prometheus metrics above.

    Args:
        model_name: Registered model name, used as a metric label.
        model_version: Model version, used as a metric label.
        latency: Observed prediction latency in seconds.
        accuracy: Optional accuracy value; recorded only when provided.
    """
    PREDICTION_COUNT.labels(
        model_name=model_name,
        model_version=model_version
    ).inc()
    PREDICTION_LATENCY.labels(model_name=model_name).observe(latency)
    # Fix: explicit None check so a legitimate accuracy of 0.0 is still
    # recorded (the previous truthiness test silently skipped it).
    if accuracy is not None:
        MODEL_ACCURACY.labels(
            model_name=model_name,
            model_version=model_version
        ).set(accuracy)
告警规则 #
yaml
groups:
  - name: mlflow_alerts
    rules:
      # Page immediately if the tracking server stops answering scrapes.
      - alert: MLflowServerDown
        expr: up{job="mlflow"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "MLflow server is down"
          description: "MLflow server has been down for more than 1 minute."
      # p95 latency over the custom exporter's histogram.
      - alert: HighPredictionLatency
        expr: histogram_quantile(0.95, rate(mlflow_prediction_latency_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High prediction latency"
          description: "95th percentile latency is above 1 second."
      - alert: ModelAccuracyDrop
        expr: mlflow_model_accuracy < 0.8
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Model accuracy dropped"
          description: "Model accuracy is below 80%."
安全配置 #
认证配置 #
python
import os
# NOTE(review): credentials are hardcoded for illustration only; in production
# read them from a secret store or the process environment, never from source.
os.environ["MLFLOW_TRACKING_USERNAME"] = "admin"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "secure_password"
import mlflow
# The MLflow client picks up the basic-auth credentials set above from the
# environment when talking to the TLS-terminated tracking server.
mlflow.set_tracking_uri("https://mlflow.example.com")
RBAC 配置 #
python
# NOTE(review): verify these helpers against the installed MLflow version --
# the documented auth API is exposed via AuthServiceClient
# (mlflow.server.get_app_client("basic-auth", ...)); these module-level
# functions may not exist as imported here. TODO confirm.
from mlflow.server.auth import create_user, create_experiment_permission
# Create an account, then grant it EDIT on experiment 1.
create_user("data_scientist", "password123")
create_experiment_permission(
experiment_id="1",
username="data_scientist",
permission="EDIT"
)
网络安全 #
yaml
# Restrict the MLflow pods: accept traffic only from the production namespace
# or ml-client pods, and allow egress only to the backend store and MinIO.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: mlflow-network-policy
  namespace: mlflow
spec:
  podSelector:
    matchLabels:
      app: mlflow
  policyTypes:
    - Ingress
    - Egress
  ingress:
    # Two peers in one rule = OR: production namespace OR ml-client pods.
    - from:
        - namespaceSelector:
            matchLabels:
              name: production
        - podSelector:
            matchLabels:
              role: ml-client
      ports:
        - protocol: TCP
          port: 5000
  # NOTE(review): this egress allowlist does not include DNS (UDP/TCP 53);
  # if the pods resolve hostnames (e.g. "postgres") via cluster DNS, add a
  # rule permitting kube-dns or lookups will fail. TODO confirm.
  egress:
    - to:
        - podSelector:
            matchLabels:
              app: postgres
      ports:
        - protocol: TCP
          port: 5432
    - to:
        - podSelector:
            matchLabels:
              app: minio
      ports:
        - protocol: TCP
          port: 9000
备份策略 #
数据库备份 #
bash
#!/bin/bash
# Dump the MLflow backend store, compress it, prune local copies older than
# 30 days, and ship the fresh archive to S3.
set -euo pipefail

BACKUP_DIR="/backups/postgres"
DATE="$(date +%Y%m%d_%H%M%S)"
BACKUP_FILE="${BACKUP_DIR}/mlflow_${DATE}.sql"

mkdir -p "$BACKUP_DIR"
# Assumes PGPASSWORD (or .pgpass) provides credentials non-interactively.
pg_dump -h postgres -U mlflow -d mlflow > "$BACKUP_FILE"
gzip "$BACKUP_FILE"

# Local retention: keep 30 days of compressed dumps.
find "$BACKUP_DIR" -name "*.sql.gz" -mtime +30 -delete

# Off-site copy of the archive produced above (gzip renamed it to .gz).
aws s3 cp "${BACKUP_FILE}.gz" s3://mlflow-backups/postgres/
工件备份 #
bash
#!/bin/bash
# Mirror the artifact bucket into a dated backup prefix and keep only the
# 7 newest dated prefixes (YYYYMMDD names sort chronologically).
set -euo pipefail

DATE="$(date +%Y%m%d)"

aws s3 sync "s3://mlflow-artifacts/mlruns" "s3://mlflow-backups/artifacts/${DATE}/"

# `head -n -7` drops the last (newest) 7 entries, leaving the older prefixes
# to delete. `read -r` avoids backslash mangling in prefix names.
aws s3 ls s3://mlflow-backups/artifacts/ | awk '{print $2}' | head -n -7 | while read -r prefix; do
  aws s3 rm "s3://mlflow-backups/artifacts/${prefix}" --recursive
done
灾难恢复 #
恢复流程 #
bash
#!/bin/bash
# Restore the backend store and artifacts from the most recent backup.
# Abort on the first failed step so we never load a partial dump.
set -euo pipefail

# NOTE(review): the backup script writes timestamped names (mlflow_<DATE>.sql.gz);
# a "mlflow_latest" alias must be maintained separately for this to resolve.
aws s3 cp s3://mlflow-backups/postgres/mlflow_latest.sql.gz .
gunzip mlflow_latest.sql.gz
psql -h postgres -U mlflow -d mlflow < mlflow_latest.sql

# Same caveat applies to the "latest" artifacts prefix.
aws s3 sync s3://mlflow-backups/artifacts/latest/ s3://mlflow-artifacts/mlruns/
健康检查脚本 #
python
import requests
import logging
from datetime import datetime
# Module-level logger shared by all health-check functions below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def check_mlflow_health(url):
    """Return True iff the MLflow server's /health endpoint answers 200.

    Args:
        url: Base URL of the tracking server (no trailing slash).

    Any exception (connection refused, timeout, DNS failure) is logged and
    treated as unhealthy rather than propagated.
    """
    try:
        response = requests.get(f"{url}/health", timeout=10)
        if response.status_code == 200:
            logger.info(f"MLflow is healthy at {datetime.now()}")
            return True
        else:
            logger.error(f"MLflow returned status {response.status_code}")
            return False
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        return False
def check_database_connection():
    """Return True iff a connection to the backend Postgres can be opened.

    Imports psycopg2 lazily so the script still loads where the driver is
    absent; connection parameters match the compose/K8s deployment above.
    NOTE(review): the password is hardcoded -- read it from the environment
    in production.
    """
    try:
        import psycopg2
        conn = psycopg2.connect(
            host="postgres",
            database="mlflow",
            user="mlflow",
            password="password"
        )
        conn.close()
        logger.info("Database connection successful")
        return True
    except Exception as e:
        logger.error(f"Database connection failed: {e}")
        return False
def check_storage_connection():
    """Return True iff the artifact bucket is reachable with current AWS creds.

    Uses a HEAD-bucket request, which is cheap and verifies both credentials
    and bucket existence. boto3 is imported lazily like the DB check.
    """
    try:
        import boto3
        s3 = boto3.client('s3')
        s3.head_bucket(Bucket='mlflow-artifacts')
        logger.info("Storage connection successful")
        return True
    except Exception as e:
        logger.error(f"Storage connection failed: {e}")
        return False
if __name__ == "__main__":
    # Run every check; exit non-zero if any fails so cron/K8s can alert.
    checks = [
        ("MLflow Server", lambda: check_mlflow_health("http://localhost:5000")),
        ("Database", check_database_connection),
        ("Storage", check_storage_connection)
    ]
    all_healthy = True
    for name, check in checks:
        if not check():
            all_healthy = False
            logger.error(f"{name} check failed")
    if all_healthy:
        logger.info("All health checks passed")
    else:
        logger.error("Some health checks failed")
        exit(1)
最佳实践清单 #
text
┌─────────────────────────────────────────────────────────────┐
│ 生产部署检查清单 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 架构设计: │
│ ├── 高可用部署(多副本) │
│ ├── 负载均衡配置 │
│ ├── 数据库主从复制 │
│ └── 工件存储冗余 │
│ │
│ 安全配置: │
│ ├── HTTPS 加密 │
│ ├── 认证授权 │
│ ├── 网络隔离 │
│ └── 敏感信息加密存储 │
│ │
│ 监控告警: │
│ ├── 服务健康检查 │
│ ├── 性能指标监控 │
│ ├── 错误日志收集 │
│ └── 告警通知配置 │
│ │
│ 备份恢复: │
│ ├── 数据库定期备份 │
│ ├── 工件定期备份 │
│ ├── 恢复流程测试 │
│ └── 灾难恢复计划 │
│ │
│ CI/CD: │
│ ├── 自动化训练流水线 │
│ ├── 模型自动注册 │
│ ├── 自动化部署 │
│ └── 回滚机制 │
│ │
└─────────────────────────────────────────────────────────────┘
总结 #
恭喜你完成了 MLflow 文档的学习!你现在应该已经掌握了:
- MLflow 的核心概念和架构
- 实验跟踪和模型管理
- 项目打包和模型部署
- 生产环境部署和运维
继续实践,将 MLflow 应用到你的机器学习项目中!
最后更新:2026-04-04