Quick Implementation #
Environment Setup #
Install Dependencies #
bash
pip install torch transformers peft datasets accelerate bitsandbytes
Verify the Installation #
python
import torch
import transformers
import peft
print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"PEFT: {peft.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
Basic Example #
1. Load the Model and Tokenizer #
python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Llama has no pad token; reuse EOS

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

print(f"Model parameters: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B")
2. Configure LoRA #
python
from peft import LoraConfig, get_peft_model
lora_config = LoraConfig(
    r=8,                      # rank of the update matrices
    lora_alpha=16,            # effective scaling = lora_alpha / r
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
Output:
text
trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.1243
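The count follows directly from the adapter shapes: each targeted projection in Llama-2-7B is 4096×4096, a rank-8 adapter adds r·(d_in + d_out) parameters per module, and there are 4 target modules in each of the 32 layers. A quick back-of-envelope check:
python
# Back-of-envelope check of print_trainable_parameters() for the config above
d = 4096                          # q/k/v/o_proj are all 4096x4096 in Llama-2-7B
r = 8
per_module = r * d + d * r        # lora_A (r x d_in) + lora_B (d_out x r)
trainable = per_module * 4 * 32   # 4 target modules x 32 layers
print(f"{trainable:,}")           # 8,388,608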
3. Prepare the Data #
python
from datasets import load_dataset
dataset = load_dataset("tatsu-lab/alpaca", split="train[:1000]")
def format_instruction(sample):
    return f"""### Instruction:
{sample['instruction']}
### Input:
{sample['input']}
### Response:
{sample['output']}"""

def tokenize_function(example):
    text = format_instruction(example)
    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    # Causal LM: the labels are the input ids themselves.
    # Note: no return_tensors="pt" here -- dataset.map expects plain lists,
    # and a (1, 512)-shaped tensor per example would break training.
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized
tokenized_dataset = dataset.map(tokenize_function, remove_columns=dataset.column_names)
print(f"数据集大小: {len(tokenized_dataset)}")
4. Train the Model #
python
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="./lora-output",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,   # effective batch size: 4 x 4 = 16
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)
trainer.train()
5. Save and Load #
python
# Save only the adapter weights (a few MB) plus the tokenizer
model.save_pretrained("./my-lora-model")
tokenizer.save_pretrained("./my-lora-model")

# To load later: restore the base model, then attach the adapter
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
loaded_model = PeftModel.from_pretrained(base_model, "./my-lora-model")
Complete Training Script #
python
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
def main():
    model_name = "meta-llama/Llama-2-7b-hf"
    output_dir = "./lora-output"

    print("Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    print("Configuring LoRA...")
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    # Needed when gradient checkpointing is enabled, so the checkpointed
    # inputs carry gradients through to the adapter weights
    model.enable_input_require_grads()
    model.print_trainable_parameters()

    print("Loading dataset...")
    dataset = load_dataset("tatsu-lab/alpaca", split="train[:1000]")

    def format_instruction(sample):
        if sample["input"]:
            return f"""### Instruction:
{sample['instruction']}
### Input:
{sample['input']}
### Response:
{sample['output']}"""
        else:
            return f"""### Instruction:
{sample['instruction']}
### Response:
{sample['output']}"""

    def tokenize_function(example):
        text = format_instruction(example)
        result = tokenizer(
            text,
            truncation=True,
            max_length=512,
            padding="max_length",
        )
        # The collator below rebuilds labels from input_ids anyway, but setting
        # them here keeps the function usable without a collator too
        result["labels"] = result["input_ids"].copy()
        return result

    tokenized_dataset = dataset.map(
        tokenize_function,
        remove_columns=dataset.column_names,
        desc="Tokenizing",
    )

    print("Configuring training arguments...")
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        save_steps=100,
        save_total_limit=2,
        warmup_steps=50,
        report_to="none",
        gradient_checkpointing=True,
        optim="adamw_torch",
    )

    # mlm=False -> causal LM objective; padding positions are masked to -100
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    print("Starting training...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )
    trainer.train()

    print("Saving model...")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Training complete!")

if __name__ == "__main__":
    main()
QLoRA Implementation #
QLoRA combines 4-bit quantization with LoRA to further reduce GPU memory requirements.
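A rough back-of-envelope estimate of why this helps, counting only the frozen weights (activations, adapter weights, and optimizer state come on top, and exact figures vary with the double-quantization constants):
python
# Approximate weight memory for a 7B model: fp16 vs. 4-bit NF4
n_params = 6.74e9

fp16_gib = n_params * 2 / 2**30    # 2 bytes per weight
nf4_gib = n_params * 0.5 / 2**30   # ~4 bits per weight

print(f"fp16 weights: {fp16_gib:.1f} GiB")  # ~12.6 GiB
print(f"NF4 weights:  {nf4_gib:.1f} GiB")   # ~3.1 GiB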
Install Dependencies #
bash
pip install bitsandbytes
QLoRA Configuration #
python
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
model_name = "meta-llama/Llama-2-7b-hf"

# 4-bit NF4 quantization, with the quantization constants quantized as well
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Prepares the quantized model for training (norm casting, input grads, etc.)
model = prepare_model_for_kbit_training(model)

# QLoRA typically uses a higher rank and also targets the MLP projections
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
QLoRA Training Arguments #
python
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./qlora-output",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    optim="paged_adamw_8bit",      # paged 8-bit optimizer states (bitsandbytes)
    gradient_checkpointing=True,
    report_to="none",
)
Inference Examples #
Basic Inference #
python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
base_model_name = "meta-llama/Llama-2-7b-hf"
lora_model_path = "./my-lora-model"

print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

print("Loading LoRA weights...")
model = PeftModel.from_pretrained(base_model, lora_model_path)

def generate_response(instruction, input_text="", max_new_tokens=256):
    prompt = f"""### Instruction:
{instruction}
### Input:
{input_text}
### Response:
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text generated after the prompt's Response marker
    response = response.split("### Response:")[-1].strip()
    return response

instruction = "Explain what machine learning is"
response = generate_response(instruction)
print(f"Response: {response}")
Inference with Merged Weights #
python
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, lora_model_path)

# Fold the adapter into the base weights: B.A is added to W once, up front
merged_model = model.merge_and_unload()

merged_model.save_pretrained("./merged-model")
tokenizer.save_pretrained("./merged-model")
print("Weights merged; inference now has no adapter overhead")
Custom LoRA Module #
Custom LoRA Layer #
python
import torch
import torch.nn as nn
import math
class LoRALinear(nn.Module):
    def __init__(
        self,
        in_features,
        out_features,
        r=8,
        lora_alpha=16,
        lora_dropout=0.0,
    ):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.r = r
        self.lora_alpha = lora_alpha
        self.scaling = lora_alpha / r

        # Frozen base weight; a bias can be attached later if the original layer had one
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        self.weight.requires_grad = False
        self.bias = None

        if r > 0:
            self.lora_A = nn.Parameter(torch.empty(r, in_features))
            self.lora_B = nn.Parameter(torch.empty(out_features, r))
            self.dropout = nn.Dropout(p=lora_dropout) if lora_dropout > 0 else nn.Identity()
            # A gets a standard init, B starts at zero so delta-W = B.A is zero at step 0
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)

        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, x):
        result = nn.functional.linear(x, self.weight, self.bias)
        if self.r > 0:
            # x @ A^T @ B^T == x @ (B.A)^T, without materializing the d x d matrix
            lora_out = self.dropout(x) @ self.lora_A.T @ self.lora_B.T
            result = result + lora_out * self.scaling
        return result

    def merge_weights(self):
        if self.r > 0:
            self.weight.data += self.scaling * (self.lora_B @ self.lora_A)
            self.lora_A = None
            self.lora_B = None
            self.r = 0  # forward() now skips the (deleted) adapter path

lora_linear = LoRALinear(4096, 4096, r=8, lora_alpha=16)
print(f"LoRA parameters: {sum(p.numel() for p in lora_linear.parameters() if p.requires_grad):,}")
Replacing Linear Layers in a Model #
python
import torch.nn as nn
from typing import List

def replace_linear_with_lora(
    model: nn.Module,
    target_modules: List[str],
    r: int = 8,
    lora_alpha: int = 16,
    lora_dropout: float = 0.0,
) -> nn.Module:
    # Collect matches first: swapping modules while iterating named_modules()
    # can skip or revisit entries
    targets = [
        (name, module)
        for name, module in model.named_modules()
        if isinstance(module, nn.Linear) and any(t in name for t in target_modules)
    ]
    for name, module in targets:
        parent_name = ".".join(name.split(".")[:-1])
        child_name = name.split(".")[-1]
        lora_layer = LoRALinear(
            module.in_features,
            module.out_features,
            r=r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
        )
        # Carry over the pretrained weight (and bias, if any)
        lora_layer.weight.data = module.weight.data.clone()
        if module.bias is not None:
            lora_layer.bias = module.bias
        parent = model.get_submodule(parent_name) if parent_name else model
        setattr(parent, child_name, lora_layer)
        print(f"Replaced layer: {name}")
    return model
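A minimal usage sketch on a toy module, just to show the mechanics. Freezing everything except the injected adapter matrices is left explicit here (PEFT handles this for you):
python
import torch.nn as nn

toy = nn.Sequential()
toy.add_module("q_proj", nn.Linear(64, 64))
toy.add_module("mlp", nn.Linear(64, 64))

toy = replace_linear_with_lora(toy, target_modules=["q_proj"], r=4)
# -> Replaced layer: q_proj

# Freeze everything except the adapter matrices
for name, p in toy.named_parameters():
    p.requires_grad = "lora_" in name

trainable = sum(p.numel() for p in toy.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable:,}")  # 512 = 2 * 4 * 64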
Multi-GPU Training #
Distributed Training Configuration #
python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./lora-output",
    num_train_epochs=3,
    per_device_train_batch_size=4,   # per GPU
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=100,
    # Every trainable (LoRA) parameter receives a gradient each step,
    # so skip DDP's costly unused-parameter scan
    ddp_find_unused_parameters=False,
    report_to="none",
)
Launch Command #
bash
torchrun --nproc_per_node=4 train_lora.py
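Hugging Face Accelerate (installed in the environment setup above) works as an equivalent launcher for the same script:
bash
accelerate launch --num_processes=4 train_lora.py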
Troubleshooting Common Issues #
1. Out of GPU Memory #
python
training_args = TrainingArguments(
    per_device_train_batch_size=1,    # smallest micro-batch
    gradient_accumulation_steps=16,   # keep the effective batch size at 16
    gradient_checkpointing=True,      # trade compute for activation memory
    optim="paged_adamw_8bit",         # 8-bit optimizer states (bitsandbytes)
    fp16=True,
)
2. Unstable Training #
python
training_args = TrainingArguments(
    learning_rate=1e-4,               # lower the learning rate first
    warmup_ratio=0.1,                 # warm up over the first 10% of steps
    weight_decay=0.01,
    max_grad_norm=1.0,                # gradient clipping
    lr_scheduler_type="cosine",
)
3. LoRA Weights Fail to Load #
python
import json
import os

from peft import PeftModel

# Most load failures come from a base-model mismatch: check which checkpoint
# the adapter was trained against before loading it
with open(os.path.join(lora_path, "adapter_config.json")) as f:
    adapter_config = json.load(f)
print(f"Adapter expects base model: {adapter_config['base_model_name_or_path']}")

model = PeftModel.from_pretrained(base_model, lora_path)
Next Steps #
Now that you have the basics of implementing LoRA down, continue to Advanced Techniques to learn about hyperparameter tuning, multi-task learning, model merging, and other advanced topics!
Last updated: 2026-04-05