文本分类实战 #
项目概述 #
text
┌─────────────────────────────────────────────────────────────┐
│ 项目目标 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 任务:电商评论情感分析 │
│ ├── 输入:用户评论文本 │
│ ├── 输出:情感标签(正面/负面/中性) │
│ └── 目标:准确率 > 90% │
│ │
│ 技术栈: │
│ ├── 模型:Qwen2-1.5B │
│ ├── 方法:LoRA │
│ ├── 框架:Transformers + PEFT │
│ └── 部署:FastAPI + vLLM │
│ │
└─────────────────────────────────────────────────────────────┘
项目结构 #
text
sentiment-analysis/
├── data/
│ ├── raw/ # 原始数据
│ ├── processed/ # 处理后数据
│ └── train.json # 训练数据
├── src/
│ ├── data_preparation.py # 数据准备
│ ├── train.py # 训练脚本
│ ├── evaluate.py # 评估脚本
│ └── inference.py # 推理脚本
├── configs/
│ └── config.yaml # 配置文件
├── models/
│ └── lora/ # LoRA 权重
├── requirements.txt
└── README.md
数据准备 #
数据格式 #
json
{
"text": "这个产品质量很好,物流也很快,非常满意!",
"label": "正面",
"category": "商品质量"
}
数据准备脚本 #
python
import json
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
def load_and_process_data(file_path):
    """Load labeled review data and split it into train/val/test datasets.

    Split is 70/15/15, stratified by label, with a fixed seed for
    reproducibility.

    Args:
        file_path: Path to a JSON file containing a list of records with at
            least ``text`` and ``label`` fields.

    Returns:
        Tuple of (train_dataset, val_dataset, test_dataset) HF ``Dataset``s.

    Raises:
        ValueError: If any record carries a label outside the known set.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    label_map = {"正面": 0, "中性": 1, "负面": 2}
    df['label_id'] = df['label'].map(label_map)
    # Unknown labels would silently become NaN and then break the stratified
    # split with a confusing sklearn error — fail fast with a clear message.
    if df['label_id'].isna().any():
        bad = df.loc[df['label_id'].isna(), 'label'].unique().tolist()
        raise ValueError(f"Unknown labels in {file_path}: {bad}")
    df['label_id'] = df['label_id'].astype(int)
    # 70% train; the remaining 30% is split evenly into validation and test.
    train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df['label_id'], random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label_id'], random_state=42)
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)
    test_dataset = Dataset.from_pandas(test_df)
    return train_dataset, val_dataset, test_dataset
def preprocess_function(examples, tokenizer, max_length=256):
    """Tokenize a batch of reviews for causal-LM fine-tuning.

    Builds "prompt + label" sequences and produces ``labels`` in which prompt
    tokens and padding tokens are set to -100, so the loss is computed only
    on the sentiment-label tokens.

    Args:
        examples: Batch dict with ``text`` (list[str]) and ``label_id``
            (list[int], 0=positive 1=neutral 2=negative per the label map).
        tokenizer: HF tokenizer. NOTE(review): prompt lengths are computed
            with ``add_special_tokens=False`` — assumes the tokenizer adds no
            BOS to the full texts either; confirm for the chosen checkpoint.
        max_length: Pad/truncate length for every sequence.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
    """
    prompts = [
        f"分析以下电商评论的情感,回答'正面'、'中性'或'负面':\n评论:{text}\n情感:"
        for text in examples['text']
    ]
    labels_text = [
        "正面" if label == 0 else "中性" if label == 1 else "负面"
        for label in examples['label_id']
    ]
    full_texts = [p + l for p, l in zip(prompts, labels_text)]
    model_inputs = tokenizer(
        full_texts,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )
    labels = model_inputs['input_ids'].clone()
    # Mask the prompt region so only the label tokens contribute to the loss.
    prompt_lengths = [len(tokenizer.encode(p, add_special_tokens=False)) for p in prompts]
    for i, prompt_len in enumerate(prompt_lengths):
        labels[i, :prompt_len] = -100
    # Mask padding positions via the attention mask instead of comparing
    # against pad_token_id: this pipeline sets pad_token = eos_token, so an
    # id-based comparison would also mask genuine EOS tokens in the target.
    labels[model_inputs['attention_mask'] == 0] = -100
    model_inputs['labels'] = labels
    return model_inputs
模型训练 #
训练配置 #
python
import yaml
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import torch
# --- Model & LoRA setup (runs at import time) ---
# Load training configuration from YAML.
with open('configs/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

model_name = config['model']['name']

# Tokenizer; the checkpoint may ship no pad token, so fall back to EOS
# for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Base model in fp16, placed automatically across available devices.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# LoRA adapter configuration; rank/alpha/targets come from config.yaml.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=config['lora']['r'],
    lora_alpha=config['lora']['alpha'],
    target_modules=config['lora']['target_modules'],
    lora_dropout=config['lora']['dropout'],
    bias="none"
)
model = get_peft_model(model, lora_config)
# Sanity check: prints trainable vs. total parameter counts.
model.print_trainable_parameters()
训练脚本 #
python
def train():
    """Fine-tune the LoRA-adapted model on the sentiment dataset.

    Relies on the module-level ``config``, ``tokenizer`` and ``model``.
    Saves the trained adapter plus tokenizer files to
    ``config['output']['model_dir']`` and returns the fitted ``Trainer``.
    """
    train_dataset, val_dataset, test_dataset = load_and_process_data(config['data']['path'])

    max_len = config['model']['max_length']

    def _tokenize(dataset):
        # Replace the raw columns with tokenized model inputs.
        return dataset.map(
            lambda batch: preprocess_function(batch, tokenizer, max_len),
            batched=True,
            remove_columns=dataset.column_names,
        )

    train_dataset = _tokenize(train_dataset)
    val_dataset = _tokenize(val_dataset)

    train_cfg = config['training']
    out_cfg = config['output']
    training_args = TrainingArguments(
        output_dir=out_cfg['dir'],
        num_train_epochs=train_cfg['epochs'],
        per_device_train_batch_size=train_cfg['batch_size'],
        per_device_eval_batch_size=train_cfg['batch_size'],
        gradient_accumulation_steps=train_cfg['gradient_accumulation'],
        learning_rate=train_cfg['learning_rate'],
        weight_decay=train_cfg['weight_decay'],
        warmup_ratio=train_cfg['warmup_ratio'],
        lr_scheduler_type=train_cfg['lr_scheduler'],
        logging_dir=out_cfg['log_dir'],
        logging_steps=train_cfg['logging_steps'],
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        fp16=True,
        gradient_checkpointing=True,
        optim="adamw_8bit",
        report_to="tensorboard",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )
    trainer.train()

    # Persist the LoRA weights together with the tokenizer files.
    trainer.save_model(out_cfg['model_dir'])
    tokenizer.save_pretrained(out_cfg['model_dir'])
    return trainer


if __name__ == "__main__":
    train()
配置文件 #
yaml
model:
  name: "Qwen/Qwen2-1.5B"   # base checkpoint
  max_length: 256           # tokenized sequence length (pad/truncate)
lora:
  r: 16                     # adapter rank
  alpha: 32                 # scaling factor (alpha / r = 2.0)
  target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]  # attention projections
  dropout: 0.05
training:
  epochs: 3
  batch_size: 8             # per device; effective batch = 8 * 4 = 32
  gradient_accumulation: 4
  learning_rate: 2e-4
  weight_decay: 0.01
  warmup_ratio: 0.1
  lr_scheduler: "cosine"
  logging_steps: 10
data:
  path: "data/train.json"
output:
  dir: "outputs"            # training checkpoints
  log_dir: "logs"           # tensorboard logs
  model_dir: "models/lora"  # final LoRA weights
模型评估 #
评估脚本 #
python
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import numpy as np
def evaluate(model, tokenizer, test_dataset):
    """Evaluate sentiment predictions against a labeled test dataset.

    Runs greedy (deterministic) generation per example, parses the predicted
    sentiment label out of the generated continuation, and reports accuracy
    plus weighted precision/recall/F1.

    Args:
        model: Causal LM (base or PEFT-wrapped) exposing ``generate``.
        tokenizer: Matching tokenizer.
        test_dataset: Iterable of dicts with ``text`` and ``label`` fields.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    valid_labels = ("正面", "中性", "负面")
    predictions = []
    labels = []
    for example in test_dataset:
        text = example['text']
        true_label = example['label']
        prompt = f"分析以下电商评论的情感,回答'正面'、'中性'或'负面':\n评论:{text}\n情感:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            # Greedy decoding: evaluation must be reproducible. The original
            # do_sample=True with low temperature made metrics vary run-to-run.
            outputs = model.generate(
                **inputs,
                max_new_tokens=10,
                do_sample=False
            )
        # Decode only the newly generated tokens, not the echoed prompt.
        gen_tokens = outputs[0][inputs['input_ids'].shape[1]:]
        response = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()
        # Normalize to a known label; an unparseable response is kept as-is
        # so it counts as a miss instead of crashing the metric computation.
        pred_label = next((l for l in valid_labels if l in response), response)
        predictions.append(pred_label)
        labels.append(true_label)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted'
    )
    print(f"准确率: {accuracy:.4f}")
    print(f"精确率: {precision:.4f}")
    print(f"召回率: {recall:.4f}")
    print(f"F1 分数: {f1:.4f}")
    print("\n分类报告:")
    print(classification_report(labels, predictions))
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
模型推理 #
推理脚本 #
python
from peft import PeftModel
import torch
class SentimentAnalyzer:
    """Prompt-based sentiment classifier: base causal LM + LoRA adapter."""

    def __init__(self, base_model_path, lora_model_path):
        """Load tokenizer, base model (fp16, auto device map) and LoRA weights.

        Args:
            base_model_path: HF model id or local path of the base model.
            lora_model_path: Directory holding the trained LoRA adapter.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(
            base_model_path,
            trust_remote_code=True
        )
        # The checkpoint may ship no pad token (Qwen2 does not); without this
        # fallback, padding=True and generate(pad_token_id=...) both break.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # Decoder-only models must be padded on the LEFT for batched
        # generation; right padding puts pad tokens between the prompt and
        # the continuation and corrupts the generated text.
        self.tokenizer.padding_side = "left"
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_path,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )
        self.model = PeftModel.from_pretrained(base_model, lora_model_path)
        self.model.eval()

    def predict(self, text):
        """Return the predicted sentiment label for a single review."""
        return self.batch_predict([text])[0]

    def batch_predict(self, texts, batch_size=8):
        """Return predicted sentiment labels for a list of reviews.

        Args:
            texts: Review strings to classify.
            batch_size: Number of prompts generated per forward pass.

        Returns:
            List of model-emitted label strings, one per input text.
        """
        results = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            prompts = [
                f"分析以下电商评论的情感,回答'正面'、'中性'或'负面':\n评论:{text}\n情感:"
                for text in batch
            ]
            inputs = self.tokenizer(
                prompts,
                return_tensors="pt",
                padding=True,
                truncation=True
            ).to(self.model.device)
            with torch.no_grad():
                # Greedy decoding: classification should be deterministic
                # (low-temperature sampling still flips labels occasionally).
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=10,
                    do_sample=False,
                    pad_token_id=self.tokenizer.pad_token_id
                )
            # With left padding every row has the same prompt length; decode
            # only the generated continuation, not the echoed prompt.
            prompt_len = inputs['input_ids'].shape[1]
            for output in outputs:
                response = self.tokenizer.decode(
                    output[prompt_len:], skip_special_tokens=True
                )
                results.append(response.strip())
        return results
# --- Usage example (loads model weights at import time; needs GPU/weights) ---
analyzer = SentimentAnalyzer(
    base_model_path="Qwen/Qwen2-1.5B",
    lora_model_path="models/lora"
)
result = analyzer.predict("这个产品质量很好,物流也很快,非常满意!")
print(f"情感: {result}")
模型部署 #
FastAPI 服务 #
python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Optional
import uvicorn

app = FastAPI(title="情感分析 API")


class TextInput(BaseModel):
    # Single review to classify.
    text: str


class BatchTextInput(BaseModel):
    # Multiple reviews classified in one request.
    texts: List[str]


class SentimentResponse(BaseModel):
    sentiment: str
    # Optional[...] is required here: a bare `float = None` default is
    # rejected by pydantic v2's type validation.
    confidence: Optional[float] = None


# Populated on startup; stays None until the model finishes loading.
analyzer = None


@app.on_event("startup")
async def startup():
    """Load the model once when the server starts."""
    global analyzer
    analyzer = SentimentAnalyzer(
        base_model_path="Qwen/Qwen2-1.5B",
        lora_model_path="models/lora"
    )


@app.post("/predict", response_model=SentimentResponse)
async def predict_sentiment(input_data: TextInput):
    """Classify a single review; returns 503 until the model is loaded."""
    if analyzer is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    sentiment = analyzer.predict(input_data.text)
    return SentimentResponse(sentiment=sentiment)


@app.post("/batch_predict")
async def batch_predict_sentiment(input_data: BatchTextInput):
    """Classify a batch of reviews; returns 503 until the model is loaded."""
    if analyzer is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    sentiments = analyzer.batch_predict(input_data.texts)
    return {"sentiments": sentiments}


@app.get("/health")
async def health_check():
    """Liveness probe."""
    return {"status": "healthy"}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
Docker 部署 #
dockerfile
# Minimal image for the FastAPI sentiment service.
# NOTE(review): this is a CPU base image — GPU inference would need a CUDA
# base; confirm the deployment target.
FROM python:3.10-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy sources after installing deps so code edits don't bust the pip layer cache.
COPY . .
EXPOSE 8000
CMD ["python", "src/api.py"]
yaml
# docker-compose service definition for the sentiment API.
version: '3.8'
services:
  sentiment-api:
    build: .
    ports:
      - "8000:8000"           # host:container
    volumes:
      - ./models:/app/models  # mount trained weights from the host
    environment:
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia  # reserve one GPU via the NVIDIA runtime
              count: 1
              capabilities: [gpu]
性能优化 #
模型合并 #
python
def merge_and_save(model, tokenizer, output_path):
    """Fold the LoRA weights into the base model and save the result.

    Args:
        model: PEFT-wrapped model exposing ``merge_and_unload``.
        tokenizer: Tokenizer saved alongside the merged weights.
        output_path: Destination directory for both artifacts.
    """
    # merge_and_unload() bakes the adapter deltas into the base weights and
    # returns a plain (non-PEFT) model.
    fused = model.merge_and_unload()
    for artifact in (fused, tokenizer):
        artifact.save_pretrained(output_path)
    print(f"合并后的模型已保存到: {output_path}")
# Merge the adapter into the base weights for standalone deployment.
merge_and_save(model, tokenizer, "models/merged")
量化部署 #
python
# --- Load the merged model in 4-bit for cheaper inference ---
from transformers import BitsAndBytesConfig

# 4-bit weight quantization with fp16 compute; other BitsAndBytesConfig
# options (quant type, double quantization) are left at their defaults.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(
    "models/merged",
    quantization_config=bnb_config,
    device_map="auto"
)
项目总结 #
text
项目成果:
├── 模型:Qwen2-1.5B + LoRA
├── 准确率:> 92%
├── 推理速度:< 50ms/请求
└── 部署:FastAPI + Docker
技术要点:
├── 数据质量是关键
├── LoRA 参数调优
├── 评估指标选择
└── 部署优化
可改进方向:
├── 数据增强
├── 模型集成
├── 在线学习
└── A/B 测试
下一步 #
接下来学习 对话模型微调,了解对话系统的微调实战!
最后更新:2026-04-05