Ecosystem Tools #
Tool Overview #
text
┌──────────────────────────────────────────────────────────────┐
│                   LoRA Ecosystem Toolchain                    │
├──────────────────────────────────────────────────────────────┤
│                                                               │
│  Core libraries:                                              │
│  ├── PEFT: Hugging Face parameter-efficient fine-tuning       │
│  ├── bitsandbytes: quantization library                       │
│  └── safetensors: safe tensor storage                         │
│                                                               │
│  Training frameworks:                                         │
│  ├── Transformers: Hugging Face core library                  │
│  ├── Axolotl: config-driven training tool                     │
│  ├── LLaMA-Factory: unified fine-tuning framework             │
│  └── Unsloth: high-speed training library                     │
│                                                               │
│  Merging tools:                                               │
│  ├── mergekit: model merging tool                             │
│  └── LM Studio: visualization tool                            │
│                                                               │
│  Inference engines:                                           │
│  ├── vLLM: high-performance inference                         │
│  ├── TGI: Hugging Face inference                              │
│  ├── TensorRT-LLM: NVIDIA-optimized inference                 │
│  └── llama.cpp: CPU inference                                 │
│                                                               │
└──────────────────────────────────────────────────────────────┘
PEFT Library #
Introduction #
PEFT (Parameter-Efficient Fine-Tuning) is Hugging Face's official library for parameter-efficient fine-tuning and supports a range of PEFT methods.
Installation #
bash
pip install peft
Basic Usage #
python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype="auto",
    device_map="auto",
)

lora_config = LoraConfig(
    r=8,                                  # rank of the update matrices
    lora_alpha=16,                        # scaling factor is alpha / r
    target_modules=["q_proj", "v_proj"],  # which projections get adapters
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
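For Llama-2-7B with r=8 on q_proj and v_proj, the last line should report roughly 4.2M trainable parameters out of about 6.7B total, i.e. around 0.06% (32 layers × 2 modules × 2 × 8 × 4096).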
Methods Supported by PEFT #
python
from peft import (
    LoraConfig,
    AdaLoraConfig,
    PrefixTuningConfig,
    PromptTuningConfig,
    PromptEncoderConfig,
)

methods = {
    "lora": LoraConfig(r=8, lora_alpha=16),
    "adalora": AdaLoraConfig(init_r=12, target_r=8),
    "prefix_tuning": PrefixTuningConfig(num_virtual_tokens=20),
    "prompt_tuning": PromptTuningConfig(num_virtual_tokens=20),
    "p_tuning": PromptEncoderConfig(num_virtual_tokens=20),
}
Saving and Loading #
python
model.save_pretrained("./my-lora")
from peft import PeftModel
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
model = PeftModel.from_pretrained(base_model, "./my-lora")
merged_model = model.merge_and_unload()
merged_model.save_pretrained("./merged-model")
bitsandbytes #
Introduction #
bitsandbytes provides 8-bit optimizers and quantization routines for NVIDIA GPUs and is the core dependency of QLoRA.
Installation #
bash
pip install bitsandbytes
Quantization Configuration #
python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # NormalFloat4, used by QLoRA
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for matmuls
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map="auto",
)
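The 4-bit model loaded above is the starting point for QLoRA: LoRA adapters are trained on top of the frozen, quantized weights. A minimal sketch using PEFT's documented helper for k-bit bases:

python
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Casts norm layers to fp32 and enables input grads for stable k-bit training
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)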
8-bit Optimizer #
python
import bitsandbytes as bnb

optimizer = bnb.optim.AdamW8bit(
    model.parameters(),
    lr=2e-4,
    betas=(0.9, 0.999),
    weight_decay=0.01,
)
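If you train through the Transformers Trainer, recent versions can select the same bitsandbytes optimizer by name instead of constructing it by hand; a sketch:

python
from transformers import TrainingArguments

# "paged_adamw_8bit" pages optimizer state out of GPU memory when it is tight
training_args = TrainingArguments(
    output_dir="outputs",
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
)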
Training Frameworks #
Axolotl #
yaml
base_model: meta-llama/Llama-2-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: false
load_in_4bit: true
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: ./last_run_prepared
val_set_size: 0.05
output_dir: ./lora-output
adapter: qlora
lora_model_dir:
sequence_len: 512
sample_packing: false
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
  - q_proj
  - v_proj
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 3
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 10
xformers_attention:
flash_attention: true
warmup_steps: 50
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.01
fsdp:
fsdp_config:
resize_token_embeddings_to_32x: false
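With the config saved (as config.yaml here, a filename assumed for the example), training is launched through Axolotl's CLI entry point:

bash
accelerate launch -m axolotl.cli.train config.yaml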
LLaMA-Factory #
bash
pip install llmtuner  # PyPI package that provides the llmtuner API used below
python
from llmtuner import run_exp

args = dict(
    stage="sft",
    do_train=True,
    model_name_or_path="meta-llama/Llama-2-7b-hf",
    dataset="alpaca_en_demo",
    template="llama2",
    finetuning_type="lora",
    lora_target="q_proj,v_proj",
    output_dir="lora_output",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_steps=100,
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=True,
)
run_exp(args)
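llmtuner also exposes an export_model helper for merging the trained adapter back into the base model; argument names vary across versions, so treat this as an assumption-laden sketch rather than a fixed API:

python
from llmtuner import export_model

# Hypothetical sketch: merge the adapter in lora_output into the base model
# and write a standalone checkpoint (argument names may differ by version).
export_model(dict(
    model_name_or_path="meta-llama/Llama-2-7b-hf",
    adapter_name_or_path="lora_output",
    template="llama2",
    finetuning_type="lora",
    export_dir="llama2-merged",
))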
Unsloth #
bash
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
python
from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-2-7b-hf",
    max_seq_length=2048,
    dtype=None,  # auto-detect: bf16 on Ampere+, fp16 otherwise
    load_in_4bit=True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# Any dataset with a "text" column works; alpaca is used here as an example
dataset = load_dataset("tatsu-lab/alpaca", split="train")

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=50,
        max_steps=100,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,
        output_dir="outputs",
    ),
)
trainer.train()
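After training, the adapter can be saved in standard PEFT format; the directory holds only the LoRA weights, so it stays small:

python
model.save_pretrained("lora_model")      # adapter weights only
tokenizer.save_pretrained("lora_model")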
Merging Tools #
mergekit #
bash
pip install mergekit
yaml
models:
  - model: meta-llama/Llama-2-7b-hf
  - model: ./lora-medical
    parameters:
      weight: 0.7
  - model: ./lora-code
    parameters:
      weight: 0.3
merge_method: linear
dtype: float16
bash
mergekit-yaml config.yaml ./merged-model
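Besides linear, mergekit also ships other strategies such as slerp, ties, and dare_ties; changing merge_method in the config is enough to switch between them.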
Manual Merge Script #
python
import torch
from safetensors.torch import load_file
from transformers import AutoModelForCausalLM

def merge_lora_weights(base_model_path, lora_path, output_path, scaling=1.0):
    """Merge LoRA deltas into the base weights: W' = W + scaling * (B @ A).

    Set scaling to lora_alpha / r to match PEFT's merge_and_unload().
    """
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
    )
    lora_weights = load_file(f"{lora_path}/adapter_model.safetensors")
    # PEFT prefixes adapter keys with "base_model.model."; strip it so the
    # keys line up with the base model's state_dict.
    lora_weights = {
        k.removeprefix("base_model.model."): v for k, v in lora_weights.items()
    }
    state_dict = base_model.state_dict()
    for name, param in state_dict.items():
        lora_a_key = name.replace(".weight", ".lora_A.weight")
        lora_b_key = name.replace(".weight", ".lora_B.weight")
        if lora_a_key in lora_weights and lora_b_key in lora_weights:
            lora_a = lora_weights[lora_a_key].to(param.dtype)
            lora_b = lora_weights[lora_b_key].to(param.dtype)
            delta = torch.matmul(lora_b, lora_a)  # (out, r) @ (r, in)
            # In-place update so the change is reflected in the saved model
            param += scaling * delta
    base_model.save_pretrained(output_path)
    print(f"Merge complete, saved to: {output_path}")
Inference Engines #
vLLM #
bash
pip install vllm
python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    enable_lora=True,
    max_loras=4,       # adapters kept resident at once
    max_lora_rank=64,  # highest rank any adapter may use
)

sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=256,
)

outputs = llm.generate(
    prompts=["Explain what machine learning is"],
    sampling_params=sampling_params,
    lora_request=LoRARequest("medical", 1, "./lora-medical"),
)
for output in outputs:
    print(output.outputs[0].text)
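The same multi-LoRA setup can be served over an OpenAI-compatible API with recent vLLM versions; adapter name and path below come from the example above:

bash
vllm serve meta-llama/Llama-2-7b-hf \
    --enable-lora \
    --lora-modules medical=./lora-medical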
TGI (Text Generation Inference) #
bash
# Mount the local adapter directories so the paths resolve inside the container
docker run --gpus all --shm-size 1g -p 8080:80 \
    -v $PWD:/data \
    ghcr.io/huggingface/text-generation-inference:latest \
    --model-id meta-llama/Llama-2-7b-hf \
    --lora-adapters /data/lora-medical,/data/lora-code
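Once running, a request can target a specific adapter through the adapter_id parameter (multi-LoRA support in recent TGI versions; the id must match how the adapter was registered at startup):

bash
curl http://localhost:8080/generate \
    -X POST \
    -H "Content-Type: application/json" \
    -d '{"inputs": "Explain what machine learning is", "parameters": {"adapter_id": "/data/lora-medical", "max_new_tokens": 256}}'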
llama.cpp #
bash
pip install llama-cpp-python
python
from llama_cpp import Llama

llm = Llama(
    model_path="./merged-model.gguf",
    n_ctx=2048,       # context window
    n_gpu_layers=32,  # layers to offload to GPU (0 for CPU-only)
)

output = llm(
    "Explain what machine learning is",
    max_tokens=256,
    temperature=0.7,
)
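The merged-model.gguf referenced above can be produced from the merged HF checkpoint with llama.cpp's conversion script, then optionally quantized; script and binary names follow the current llama.cpp layout and may differ across versions:

bash
python convert_hf_to_gguf.py ./merged-model --outfile ./merged-model.gguf
./llama-quantize ./merged-model.gguf ./merged-model-q4_k_m.gguf q4_k_m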
Tool Comparison #
| Tool | Type | Highlights | Best For |
|---|---|---|---|
| PEFT | Core library | Official support, comprehensive methods | All PEFT tasks |
| bitsandbytes | Quantization library | Efficient quantization | QLoRA training |
| Axolotl | Training framework | Config-driven, easy to use | Rapid experimentation |
| LLaMA-Factory | Training framework | Full-featured | Production training |
| Unsloth | Training framework | Fastest training speed | Rapid iteration |
| mergekit | Merging tool | Multiple merge strategies | Model fusion |
| vLLM | Inference engine | High throughput | Production inference |
| TGI | Inference engine | Docker deployment | Cloud deployment |
| llama.cpp | Inference engine | CPU support | Edge deployment |
Next Steps #
Now that you know your way around the LoRA tool ecosystem, continue to Production Deployment to learn best practices for enterprise-grade deployment!
Last updated: 2026-04-05