# Production Deployment

## Deployment Architecture

### Overall Architecture
```text
LoRA production deployment architecture

Client layer:
├── Web applications
├── Mobile applications
└── API calls

Gateway layer:
├── Load balancing
├── Request routing
└── Authentication / authorization

Service layer:
├── Inference service (vLLM / TGI)
├── LoRA management service
└── Cache service

Storage layer:
├── Model storage (S3 / OSS)
├── LoRA weight storage
└── Log / monitoring storage
```
### Deployment Modes

```text
Mode 1: single-LoRA deployment
├── Fits: single-task scenarios
├── Pros: simple and efficient
└── Cons: no multi-task support

Mode 2: multi-LoRA dynamic loading (see the sketch below)
├── Fits: multi-task scenarios
├── Pros: flexible switching
└── Cons: switching latency

Mode 3: multi-LoRA parallel serving
├── Fits: high-concurrency, multi-task workloads
├── Pros: no switching latency
└── Cons: high resource usage

Mode 4: merged-weight deployment
├── Fits: fixed task combinations
├── Pros: fastest inference
└── Cons: least flexible
```
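As a concrete illustration of mode 2, the sketch below keeps a single base model in memory and switches adapters with PEFT's `load_adapter`/`set_adapter`. The adapter names and paths (`medical`, `code`, `./loras/...`) are placeholders, not part of the original text.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Hypothetical base model and adapter paths, for illustration only.
base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf", torch_dtype="auto", device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# Wrap the base model once, registering the first adapter.
model = PeftModel.from_pretrained(base, "./loras/medical", adapter_name="medical")
# Register additional adapters on the same wrapper.
model.load_adapter("./loras/code", adapter_name="code")

# Switching is cheap (no reload from disk), but each switch still costs a
# little latency and only one adapter is active per forward pass.
model.set_adapter("medical")
# ... serve medical requests ...
model.set_adapter("code")
# ... serve code requests ...
```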
## Inference Optimization

### Weight Merging

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer


def merge_and_deploy(base_model_path, lora_path, output_path):
    print("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype="auto",
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)

    print("Loading LoRA weights...")
    model = PeftModel.from_pretrained(base_model, lora_path)

    print("Merging weights...")
    merged_model = model.merge_and_unload()

    print("Saving merged model...")
    merged_model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)
    print(f"Deployment model saved to: {output_path}")


merge_and_deploy(
    "meta-llama/Llama-2-7b-hf",
    "./my-lora",
    "./deployed-model",
)
```
### Quantized Deployment

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    "./deployed-model",
    quantization_config=quantization_config,
    device_map="auto",
)
```
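A quick smoke test of the quantized model, assuming the merged model directory from the previous step; the prompt is a placeholder.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./deployed-model")

# Generate a few tokens to confirm the 4-bit model loads and runs.
inputs = tokenizer("Hello, world", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```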
### Batch Inference

```python
import torch
from typing import List


class BatchInference:
    def __init__(self, model, tokenizer, batch_size=8):
        self.model = model
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        # Llama tokenizers ship without a pad token; batched padding needs one.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def generate(self, prompts: List[str], max_new_tokens=256):
        results = []
        for i in range(0, len(prompts), self.batch_size):
            batch = prompts[i:i + self.batch_size]
            inputs = self.tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            ).to(self.model.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                )
            batch_results = self.tokenizer.batch_decode(
                outputs,
                skip_special_tokens=True,
            )
            results.extend(batch_results)
        return results


inference = BatchInference(model, tokenizer, batch_size=8)
results = inference.generate(["Question 1", "Question 2", "Question 3"])
```
## Service Architecture

### FastAPI Service

```python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import uvicorn

app = FastAPI(title="LoRA Inference API")


class GenerateRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.9
    lora_name: Optional[str] = None


class GenerateResponse(BaseModel):
    generated_text: str
    lora_used: str


class LoRAService:
    def __init__(self, base_model_path: str):
        self.base_model = AutoModelForCausalLM.from_pretrained(
            base_model_path,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_path)
        self.loras = {}
        self.current_lora = None

    def load_lora(self, lora_name: str, lora_path: str):
        # Note: each call wraps the shared base model in a new PeftModel;
        # with many adapters, prefer load_adapter/set_adapter on one wrapper.
        model = PeftModel.from_pretrained(self.base_model, lora_path)
        self.loras[lora_name] = model
        print(f"Loaded LoRA: {lora_name}")

    def switch_lora(self, lora_name: str):
        if lora_name in self.loras:
            self.current_lora = lora_name
            return True
        return False

    def generate(self, request: GenerateRequest) -> GenerateResponse:
        if request.lora_name and request.lora_name in self.loras:
            model = self.loras[request.lora_name]
            lora_used = request.lora_name
        else:
            model = self.base_model
            lora_used = "base"

        inputs = self.tokenizer(
            request.prompt,
            return_tensors="pt",
        ).to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=request.max_new_tokens,
                temperature=request.temperature,
                top_p=request.top_p,
                do_sample=True,
            )

        generated_text = self.tokenizer.decode(
            outputs[0],
            skip_special_tokens=True,
        )
        return GenerateResponse(
            generated_text=generated_text,
            lora_used=lora_used,
        )


service = LoRAService("meta-llama/Llama-2-7b-hf")


@app.post("/generate", response_model=GenerateResponse)
async def generate(request: GenerateRequest):
    try:
        return service.generate(request)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/lora/{lora_name}/load")
async def load_lora(lora_name: str, lora_path: str):
    service.load_lora(lora_name, lora_path)
    return {"status": "loaded", "lora": lora_name}


@app.get("/lora/list")
async def list_loras():
    return {"loras": list(service.loras.keys())}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
```
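A minimal client call against the service above, assuming it is running locally on port 8000; the prompt and adapter name are placeholders.

```python
import requests

resp = requests.post(
    "http://localhost:8000/generate",
    json={
        "prompt": "What is diabetes?",
        "max_new_tokens": 128,
        "lora_name": "medical",  # falls back to the base model if not loaded
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["generated_text"])
```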
### Docker Deployment

```dockerfile
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04

WORKDIR /app

RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 8000

CMD ["python3", "server.py"]
```
```yaml
version: '3.8'

services:
  lora-api:
    build: .
    ports:
      - "8000:8000"
    volumes:
      - ./models:/app/models
      - ./loras:/app/loras
    environment:
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
```
## vLLM Deployment

### Multi-LoRA Serving

```python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    enable_lora=True,
    max_loras=4,
    max_lora_rank=64,
    max_cpu_loras=8,
)

lora_medical = LoRARequest(
    "medical",
    lora_int_id=1,
    lora_local_path="./loras/medical",
)
lora_code = LoRARequest(
    "code",
    lora_int_id=2,
    lora_local_path="./loras/code",
)

sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=256,
)

outputs_medical = llm.generate(
    ["What is diabetes?"],
    sampling_params,
    lora_request=lora_medical,
)
outputs_code = llm.generate(
    ["Write a Python function"],
    sampling_params,
    lora_request=lora_code,
)
```
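Each `llm.generate` call returns a list of `RequestOutput` objects; a short sketch of reading the generated text back:

```python
for out in outputs_medical:
    # Each RequestOutput carries the prompt and one or more completions.
    print(out.prompt)
    print(out.outputs[0].text)
```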
### Launching the vLLM Server

```bash
python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Llama-2-7b-hf \
    --enable-lora \
    --lora-modules medical=./loras/medical code=./loras/code \
    --max-loras 4 \
    --max-lora-rank 64 \
    --host 0.0.0.0 \
    --port 8000
```
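With the OpenAI-compatible entrypoint above, each LoRA module registered via `--lora-modules` can be addressed by name in the `model` field. A minimal client sketch, assuming the server runs locally:

```python
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "medical",            # LoRA module name from --lora-modules
        "prompt": "What is diabetes?",
        "max_tokens": 128,
        "temperature": 0.7,
    },
    timeout=60,
)
print(resp.json()["choices"][0]["text"])
```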
## Monitoring and Operations

### Performance Monitoring

```python
import time
from dataclasses import dataclass
from typing import List

import torch


@dataclass
class InferenceMetrics:
    latency_ms: float
    tokens_generated: int
    tokens_per_second: float
    gpu_memory_used_gb: float
    gpu_memory_total_gb: float


class MetricsCollector:
    def __init__(self):
        self.metrics: List[InferenceMetrics] = []

    def collect(self, start_time: float, end_time: float,
                tokens_generated: int):
        latency = (end_time - start_time) * 1000
        tps = tokens_generated / (end_time - start_time)
        gpu_memory_used = torch.cuda.memory_allocated() / 1e9
        gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / 1e9
        metric = InferenceMetrics(
            latency_ms=latency,
            tokens_generated=tokens_generated,
            tokens_per_second=tps,
            gpu_memory_used_gb=gpu_memory_used,
            gpu_memory_total_gb=gpu_memory_total,
        )
        self.metrics.append(metric)
        return metric

    def get_summary(self):
        if not self.metrics:
            return {}
        latencies = [m.latency_ms for m in self.metrics]
        tps_list = [m.tokens_per_second for m in self.metrics]
        return {
            "total_requests": len(self.metrics),
            "avg_latency_ms": sum(latencies) / len(latencies),
            "p50_latency_ms": sorted(latencies)[len(latencies) // 2],
            "p99_latency_ms": sorted(latencies)[int(len(latencies) * 0.99)],
            "avg_tokens_per_second": sum(tps_list) / len(tps_list),
        }

    def export_prometheus(self):
        summary = self.get_summary()
        metrics_text = f"""
# HELP lora_inference_latency_ms Inference latency in milliseconds
# TYPE lora_inference_latency_ms gauge
lora_inference_latency_ms{{quantile="avg"}} {summary.get('avg_latency_ms', 0)}
lora_inference_latency_ms{{quantile="p50"}} {summary.get('p50_latency_ms', 0)}
lora_inference_latency_ms{{quantile="p99"}} {summary.get('p99_latency_ms', 0)}
# HELP lora_tokens_per_second Tokens generated per second
# TYPE lora_tokens_per_second gauge
lora_tokens_per_second {summary.get('avg_tokens_per_second', 0)}
# HELP lora_total_requests Total number of inference requests
# TYPE lora_total_requests counter
lora_total_requests {summary.get('total_requests', 0)}
"""
        return metrics_text
```
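A sketch of how the collector might wrap a generation call; `model`, `tokenizer`, and the prompt are placeholders carried over from earlier sections.

```python
import time

collector = MetricsCollector()

start = time.time()
inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
end = time.time()

# Count only newly generated tokens, not the prompt tokens.
new_tokens = outputs.shape[-1] - inputs["input_ids"].shape[-1]
collector.collect(start, end, tokens_generated=new_tokens)
print(collector.get_summary())
```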
### Health Checks

```python
from fastapi import FastAPI
import torch

app = FastAPI()


@app.get("/health")
async def health_check():
    checks = {
        "status": "healthy",
        "gpu_available": torch.cuda.is_available(),
        "gpu_memory": None,
    }
    if torch.cuda.is_available():
        checks["gpu_memory"] = {
            "used_gb": torch.cuda.memory_allocated() / 1e9,
            "total_gb": torch.cuda.get_device_properties(0).total_memory / 1e9,
        }
    return checks


@app.get("/ready")
async def readiness_check():
    try:
        # In a real service, verify here that the model is loaded
        # (e.g. run a trivial forward pass) before reporting ready.
        return {"status": "ready"}
    except Exception as e:
        return {"status": "not ready", "error": str(e)}
```
## Cost Optimization

### Resource Planning

```text
GPU selection guidelines:
├── Development / testing: RTX 3090/4090 (24GB)
├── Small-scale production: A10 (24GB) / L4 (24GB)
├── Mid-scale production: A100 (40GB/80GB)
└── Large-scale production: H100 (80GB)

Cost optimization strategies:
├── Use spot / preemptible instances
├── Autoscaling
├── Request batching
├── Model quantization
└── Multi-tenant sharing
```
### Autoscaling

```yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: lora-inference-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: lora-inference
  minReplicas: 1
  maxReplicas: 10
  metrics:
    # GPU utilization is not a built-in Resource metric; exposing it as a Pods
    # metric assumes a DCGM exporter plus a custom-metrics adapter is installed.
    - type: Pods
      pods:
        metric:
          name: gpu_utilization
        target:
          type: AverageValue
          averageValue: "70"
    - type: External
      external:
        metric:
          name: requests_per_second
        target:
          type: AverageValue
          averageValue: "100"
```
## Security Considerations

### API Security

```python
from fastapi import FastAPI, HTTPException, Depends
from fastapi.security import APIKeyHeader
import time
from collections import defaultdict

app = FastAPI()

api_key_header = APIKeyHeader(name="X-API-Key")

# Placeholder keys; load real keys from a secret store in production.
VALID_API_KEYS = {"key1", "key2", "key3"}

rate_limits = defaultdict(list)


async def verify_api_key(api_key: str = Depends(api_key_header)):
    if api_key not in VALID_API_KEYS:
        raise HTTPException(status_code=401, detail="Invalid API Key")
    return api_key


async def rate_limit(api_key: str = Depends(verify_api_key)):
    now = time.time()
    requests = rate_limits[api_key]
    # Keep only requests from the last 60 seconds, allow at most 60 per minute.
    requests[:] = [t for t in requests if now - t < 60]
    if len(requests) >= 60:
        raise HTTPException(status_code=429, detail="Rate limit exceeded")
    requests.append(now)
    return api_key


# GenerateRequest and service come from the FastAPI service defined earlier.
@app.post("/generate")
async def generate(request: GenerateRequest, api_key: str = Depends(rate_limit)):
    return service.generate(request)
```
### Input Validation

```python
from pydantic import BaseModel, validator


# Pydantic v1-style validators; with pydantic v2, use @field_validator instead.
class GenerateRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.9

    @validator('prompt')
    def validate_prompt(cls, v):
        if len(v) > 4096:
            raise ValueError('Prompt too long (max 4096 characters)')
        if not v.strip():
            raise ValueError('Prompt cannot be empty')
        return v

    @validator('max_new_tokens')
    def validate_max_tokens(cls, v):
        if v < 1 or v > 2048:
            raise ValueError('max_new_tokens must be between 1 and 2048')
        return v

    @validator('temperature')
    def validate_temperature(cls, v):
        if v < 0 or v > 2:
            raise ValueError('temperature must be between 0 and 2')
        return v
```
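A quick check that the validators reject bad input; the values are arbitrary.

```python
from pydantic import ValidationError

try:
    GenerateRequest(prompt="", max_new_tokens=10)
except ValidationError as e:
    print(e)  # reports that the prompt cannot be empty

req = GenerateRequest(prompt="What is LoRA?", temperature=0.5)
print(req.max_new_tokens)  # 256 (default)
```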
## Summary

Congratulations on completing the LoRA / PEFT guide! You have covered:

- The core principles and mathematical foundations of LoRA
- Implementing LoRA fine-tuning quickly with the PEFT library
- Advanced techniques such as hyperparameter tuning and model merging
- Variants such as QLoRA and AdaLoRA
- The broader ecosystem and tooling
- Best practices for production deployment

Keep exploring and practicing, and apply LoRA to your own projects!

Last updated: 2026-04-05