生产环境最佳实践 #

生产环境要求 #

核心要求 #

text
┌─────────────────────────────────────────────────────┐
│                 生产环境要求                         │
├─────────────────────────────────────────────────────┤
│                                                     │
│  1. 高可用 - 服务持续可用                           │
│  2. 可扩展 - 支持水平扩展                           │
│  3. 安全性 - 数据和访问安全                         │
│  4. 可监控 - 全面的监控和告警                       │
│  5. 可恢复 - 快速故障恢复                           │
│  6. 可追溯 - 完整的日志和审计                       │
│                                                     │
└─────────────────────────────────────────────────────┘

镜像管理 #

镜像版本策略 #

bash
# 语义化版本
myapp:v1.0.0
myapp:v1.0.1
myapp:v1.1.0

# 环境标签
myapp:dev
myapp:staging
myapp:prod

# 时间戳标签
myapp:20240101-120000

镜像仓库策略 #

yaml
# Option A: a dedicated registry per environment.
dev: registry.dev.example.com/myapp
staging: registry.staging.example.com/myapp
prod: registry.prod.example.com/myapp

# Option B: a single registry with one project per environment.
# Kept commented out because these keys would duplicate the ones above.
# dev: registry.example.com/dev/myapp
# staging: registry.example.com/staging/myapp
# prod: registry.example.com/prod/myapp

镜像安全扫描 #

bash
# Scan inside CI/CD: --exit-code 1 makes trivy return a failing status when
# HIGH or CRITICAL findings are reported, which fails the pipeline step.
trivy image --exit-code 1 --severity HIGH,CRITICAL myapp:v1.0

# Periodically scan every local image.
# `docker images -q` prints one ID per repository:tag row, so an image with
# several tags is listed several times; sort -u scans each ID only once.
for image in $(docker images -q | sort -u); do
  trivy image "$image"
done

容器配置 #

资源限制 #

yaml
# docker-compose.yml
# The app service reserves 0.5 CPU / 512M and is capped at 1 CPU / 1G.
services:
  app:
    image: "myapp:v1.0"
    deploy:
      resources:
        reservations:
          cpus: "0.5"
          memory: 512M
        limits:
          cpus: "1"
          memory: 1G

健康检查 #

yaml
services:
  app:
    image: "myapp:v1.0"
    healthcheck:
      # Probe command: curl against the local /health endpoint on port 8080.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:8080/health"
      start_period: 40s
      interval: 30s
      timeout: 10s
      retries: 3

重启策略 #

yaml
# Two restart settings are declared for the app service: the service-level
# `restart` key and the `deploy.restart_policy` mapping.
services:
  app:
    image: "myapp:v1.0"
    deploy:
      restart_policy:
        condition: on-failure
        max_attempts: 3
        delay: 5s
    restart: unless-stopped

高可用配置 #

多副本部署 #

yaml
services:
  app:
    image: "myapp:v1.0"
    deploy:
      replicas: 3
      # Rollback settings: parallelism of 1.
      rollback_config:
        parallelism: 1
      # Update settings: parallelism of 1, 10s delay, roll back on failure.
      update_config:
        failure_action: rollback
        parallelism: 1
        delay: 10s

负载均衡 #

yaml
# nginx publishes port 80 and is declared to depend on three app services
# that all run the same image.
services:
  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
    volumes:
      # Mounted read-only so the proxy container cannot modify the host file.
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - app1
      - app2
      - app3

  app1:
    image: myapp:v1.0
  app2:
    image: myapp:v1.0
  app3:
    image: myapp:v1.0

数据库高可用 #

yaml
services:
  mysql-master:
    image: mysql:8.0
    # Every server taking part in replication needs a distinct server-id.
    command: ["--server-id=1"]
    environment:
      # Read from the shell or an .env file; Compose aborts when it is unset,
      # so no password is committed alongside this file.
      MYSQL_ROOT_PASSWORD: "${MYSQL_ROOT_PASSWORD:?MYSQL_ROOT_PASSWORD must be set}"
    volumes:
      - mysql-master-data:/var/lib/mysql

  mysql-slave:
    image: mysql:8.0
    command: ["--server-id=2"]
    environment:
      MYSQL_ROOT_PASSWORD: "${MYSQL_ROOT_PASSWORD:?MYSQL_ROOT_PASSWORD must be set}"
      # NOTE(review): the official mysql image does not appear to act on this
      # variable; replication presumably still has to be configured on the
      # replica itself — confirm against the image documentation.
      MYSQL_MASTER_HOST: mysql-master
    volumes:
      - mysql-slave-data:/var/lib/mysql
    depends_on:
      - mysql-master

# Named volumes used by the services above must be declared at the top level.
volumes:
  mysql-master-data:
  mysql-slave-data:

监控告警 #

Prometheus监控 #

yaml
# docker-compose.yml
# Monitoring stack: Prometheus, Grafana and Alertmanager.
# Images are pinned to explicit releases instead of the implicit :latest tag.
services:
  prometheus:
    image: prom/prometheus:v2.47.0
    ports:
      - "9090:9090"
    volumes:
      # Configuration is mounted read-only; only the data volume is writable.
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time=30d'

  grafana:
    image: grafana/grafana:10.1.5
    ports:
      - "3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
    environment:
      # Read from the shell or an .env file; Compose aborts when it is unset,
      # so the admin password is never the well-known default.
      - "GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"

  alertmanager:
    image: prom/alertmanager:v0.26.0
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro

volumes:
  prometheus-data:
  grafana-data:

告警规则 #

yaml
# alert_rules.yml
groups:
  - name: container_alerts
    rules:
      # Fires when a named container has not been seen for more than 60s.
      # container_last_seen is a timestamp in the past, so the age is
      # time() minus that timestamp (the reverse is always negative).
      - alert: ContainerDown
        expr: 'time() - container_last_seen{name!=""} > 60'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} is down"

      # Fires when the 5m CPU usage rate stays above 0.8 for 5 minutes.
      - alert: HighCPU
        expr: 'rate(container_cpu_usage_seconds_total{name!=""}[5m]) > 0.8'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high CPU usage"

      # Fires when memory usage exceeds 90% of the configured limit.
      # The `> 0` filter drops series whose limit is 0, which would otherwise
      # divide by zero and produce +Inf, matching the threshold.
      - alert: HighMemory
        expr: >-
          container_memory_usage_bytes{name!=""}
          / (container_spec_memory_limit_bytes{name!=""} > 0) > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high memory usage"

日志管理 #

日志配置 #

json
// /etc/docker/daemon.json
{
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "10m",
    "max-file": "5",
    "labels": "app,environment"
  }
}

ELK日志收集 #

yaml
# Log collection stack: Elasticsearch, Logstash and Kibana, all on 8.10.0.
services:
  elasticsearch:
    image: "elasticsearch:8.10.0"
    environment:
      discovery.type: "single-node"
      # NOTE(review): security is switched off here — confirm this is
      # acceptable for the target environment.
      xpack.security.enabled: "false"
    volumes:
      - "elasticsearch-data:/usr/share/elasticsearch/data"

  logstash:
    image: "logstash:8.10.0"
    volumes:
      - "./logstash.conf:/usr/share/logstash/pipeline/logstash.conf"

  kibana:
    image: "kibana:8.10.0"
    ports:
      - "5601:5601"
    environment:
      ELASTICSEARCH_HOSTS: "http://elasticsearch:9200"

volumes:
  elasticsearch-data:

备份恢复 #

数据备份 #

bash
#!/bin/bash
# backup.sh
# Archives the mysql-data volume and /etc/docker into $BACKUP_DIR, then
# deletes archives older than 30 days.

# Stop at the first failed command, unset variable or failed pipeline stage,
# so a broken backup is never followed by the cleanup step.
set -euo pipefail

DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_DIR="/backup"

# Make sure the target directory exists before anything writes to it.
mkdir -p "$BACKUP_DIR"

# Back up the data volume. It is mounted read-only; only /backup is written.
# NOTE(review): the volume name must match the one that actually exists
# (`docker volume ls`), and archiving a live database directory may capture
# an inconsistent state — confirm the backup procedure.
docker run --rm \
  -v mysql-data:/data:ro \
  -v "$BACKUP_DIR":/backup \
  alpine tar czf "/backup/mysql-$DATE.tar.gz" -C /data .

# Back up the Docker configuration.
tar czf "$BACKUP_DIR/config-$DATE.tar.gz" /etc/docker

# Remove archives older than 30 days.
find "$BACKUP_DIR" -name "*.tar.gz" -mtime +30 -delete

自动备份 #

bash
# crontab -e
# Runs the backup script every day at 02:00; stdout and stderr are both
# appended to /var/log/backup.log.
0 2 * * * /path/to/backup.sh >> /var/log/backup.log 2>&1

安全加固 #

网络隔离 #

yaml
# Three bridge networks; backend and database are marked internal.
networks:
  frontend:
    driver: bridge
  database:
    internal: true
    driver: bridge
  backend:
    internal: true
    driver: bridge

访问控制 #

yaml
services:
  app:
    image: "myapp:v1.0"
    # The container filesystem is declared read-only.
    read_only: true
    security_opt: ["no-new-privileges:true"]
    # Drop every capability, then add back only NET_BIND_SERVICE.
    cap_drop: [ALL]
    cap_add: [NET_BIND_SERVICE]

运维清单 #

部署前检查 #

text
┌─────────────────────────────────────────────────────┐
│                 部署前检查清单                       │
├─────────────────────────────────────────────────────┤
│                                                     │
│  [ ] 镜像安全扫描通过                               │
│  [ ] 资源限制配置正确                               │
│  [ ] 健康检查配置正确                               │
│  [ ] 日志配置正确                                   │
│  [ ] 监控配置正确                                   │
│  [ ] 告警规则配置                                   │
│  [ ] 备份策略配置                                   │
│  [ ] 回滚方案准备                                   │
│                                                     │
└─────────────────────────────────────────────────────┘

日常运维 #

text
┌─────────────────────────────────────────────────────┐
│                   日常运维清单                       │
├─────────────────────────────────────────────────────┤
│                                                     │
│  每日:                                              │
│  [ ] 检查服务状态                                   │
│  [ ] 检查资源使用                                   │
│  [ ] 检查告警信息                                   │
│                                                     │
│  每周:                                              │
│  [ ] 检查日志大小                                   │
│  [ ] 检查备份完整性                                 │
│  [ ] 检查安全更新                                   │
│                                                     │
│  每月:                                              │
│  [ ] 清理旧镜像                                     │
│  [ ] 清理旧备份                                     │
│  [ ] 安全审计                                       │
│                                                     │
└─────────────────────────────────────────────────────┘

小结 #

本节学习了生产环境的最佳实践:

  • 镜像管理策略
  • 容器配置优化
  • 高可用配置
  • 监控告警配置
  • 日志管理
  • 备份恢复
  • 安全加固

总结 #

恭喜你完成了Docker容器化技术完全指南的学习!你已经掌握了:

  • Docker基础概念和操作
  • 镜像管理和Dockerfile编写
  • 容器管理和资源限制
  • 数据持久化和网络配置
  • Docker Compose多容器编排
  • 镜像仓库管理
  • 高级特性和性能优化
  • 生产环境最佳实践

继续实践,不断探索Docker的更多可能性!