# LLM Fine-Tuning: LoRA and Parameter-Efficient Methods
## 1. Technical Analysis

### 1.1 Overview of Parameter-Efficient Fine-Tuning

Traditional full fine-tuning updates every parameter of the model, which is expensive in compute and memory. Parameter-efficient fine-tuning (PEFT) methods instead train only a small number of added parameters:

- LoRA: low-rank adaptation
- Adapter: small bottleneck adapter layers
- Prefix Tuning: trainable prefix vectors
- IA³: injected vectors that rescale inner activations

### 1.2 Comparison of Fine-Tuning Methods

| Method | Parameter efficiency (trainable share) | Quality | Training speed |
| --- | --- | --- | --- |
| Full fine-tuning | Low (100%) | Good | Slow |
| LoRA | High (~1%) | Very good | Fast |
| Adapter | Medium (2-5%) | Good | Medium |
| Prefix Tuning | High (~1%) | Fair | Fast |

### 1.3 How LoRA Works

LoRA injects low-rank matrices into the attention layers of a Transformer. The pretrained weight W_0 stays frozen and only a low-rank update is learned:

W = W_0 + A · B

- A: d_model × r, randomly initialized
- B: r × d_model, initialized to zero
- r: the rank, typically 8-64

Because B starts at zero, the update A · B is zero at the start of training, so the adapted model initially behaves exactly like the base model. In the implementation below the update is additionally scaled by α / r.

## 2. Core Implementation

### 2.1 A LoRA Layer

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class LoRALayer(nn.Module):
    """Low-rank update: x -> (x A B) * (alpha / rank)."""

    def __init__(self, in_dim, out_dim, rank=8, alpha=16):
        super().__init__()
        self.rank = rank
        self.alpha = alpha
        # A is random, B is zero, so the initial update is zero
        self.A = nn.Parameter(torch.randn(in_dim, rank) / rank)
        self.B = nn.Parameter(torch.zeros(rank, out_dim))
        self.scaling = alpha / rank

    def forward(self, x):
        return (x @ self.A @ self.B) * self.scaling


class LoRAAttention(nn.Module):
    """Attention block whose Q/K/V projections carry parallel LoRA updates.

    Heads are not split here; attention is computed over the full d_model for brevity.
    """

    def __init__(self, d_model, num_heads, rank=8, alpha=16):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.lora_q = LoRALayer(d_model, d_model, rank, alpha)
        self.lora_k = LoRALayer(d_model, d_model, rank, alpha)
        self.lora_v = LoRALayer(d_model, d_model, rank, alpha)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, Q, K, V, mask=None):
        Q_orig = self.W_q(Q)
        K_orig = self.W_k(K)
        V_orig = self.W_v(V)
        Q_lora = self.lora_q(Q)
        K_lora = self.lora_k(K)
        V_lora = self.lora_v(V)
        # Frozen projection plus trainable low-rank delta
        Q = Q_orig + Q_lora
        K = K_orig + K_lora
        V = V_orig + V_lora
        scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(
            torch.tensor(self.d_k, dtype=torch.float32))
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn_weights = F.softmax(scores, dim=-1)
        output = torch.matmul(attn_weights, V)
        return self.W_o(output), attn_weights
```

### 2.2 Wrapping a Model with LoRA

The wrapper below locates the target projection layers (for example `q_proj` and `v_proj`), keeps each pretrained linear layer frozen, adds a LoRA delta in parallel, and leaves only the LoRA parameters trainable.

```python
class LinearWithLoRA(nn.Module):
    """Keeps the pretrained linear layer and adds a trainable low-rank update in parallel."""

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)

    def forward(self, x):
        return self.linear(x) + self.lora(x)


class LoRAModelWrapper:
    def __init__(self, base_model, rank=8, alpha=16, target_modules=("q_proj", "v_proj")):
        self.base_model = base_model
        self.rank = rank
        self.alpha = alpha
        self.target_modules = list(target_modules)
        self._inject_lora()

    def _inject_lora(self):
        # Collect matches first so the module tree is not mutated while iterating over it
        targets = [
            (name, module)
            for name, module in self.base_model.named_modules()
            if isinstance(module, nn.Linear)
            and any(target in name for target in self.target_modules)
        ]
        for name, module in targets:
            self._replace_with_lora(module, name)

    def _replace_with_lora(self, module, name):
        lora_layer = LinearWithLoRA(module, self.rank, self.alpha)
        parent_name = ".".join(name.split(".")[:-1])
        module_name = name.split(".")[-1]
        parent = self.base_model
        if parent_name:
            for part in parent_name.split("."):
                parent = getattr(parent, part)
        setattr(parent, module_name, lora_layer)

    def freeze_base(self):
        # Freeze everything, then re-enable gradients only for the LoRA parameters
        for param in self.base_model.parameters():
            param.requires_grad = False
        for name, param in self.base_model.named_parameters():
            if "lora" in name:
                param.requires_grad = True

    def get_trainable_params(self):
        return [p for n, p in self.base_model.named_parameters() if p.requires_grad]


class LoRATrainer:
    def __init__(self, model, tokenizer, config):
        self.model = model
        self.tokenizer = tokenizer
        self.config = config
        self.lora_wrapper = LoRAModelWrapper(
            model,
            rank=config.get("rank", 8),
            alpha=config.get("alpha", 16),
            target_modules=config.get("target_modules", ["q_proj", "v_proj"]),
        )
        self.lora_wrapper.freeze_base()

    def train(self, data):
        optimizer = torch.optim.Adam(
            self.lora_wrapper.get_trainable_params(),
            lr=self.config["lr"],
        )
        device = next(self.model.parameters()).device
        for epoch in range(self.config["epochs"]):
            for batch in data:
                optimizer.zero_grad()
                input_ids = batch["input_ids"].to(device)
                labels = batch["labels"].to(device)
                outputs = self.model(input_ids, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
```
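As a quick sanity check, the minimal sketch below wraps a toy model and counts how many parameters remain trainable after `freeze_base()`. The `TinyBlock` class, its dimensions, and the LLaMA-style projection names are illustrative assumptions, not part of any real model; on an actual LLM the trainable fraction is far smaller, because most weights live outside the adapted projections.

```python
import torch.nn as nn


class TinyBlock(nn.Module):
    """Illustrative stand-in for a Transformer layer, using LLaMA-style projection names."""

    def __init__(self, d_model=256):
        super().__init__()
        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)


toy_model = nn.ModuleList([TinyBlock() for _ in range(4)])

# Reuses LoRAModelWrapper from section 2.2: inject LoRA into q_proj/v_proj, then freeze the rest
wrapper = LoRAModelWrapper(toy_model, rank=8, alpha=16, target_modules=["q_proj", "v_proj"])
wrapper.freeze_base()

trainable = sum(p.numel() for p in toy_model.parameters() if p.requires_grad)
total = sum(p.numel() for p in toy_model.parameters())
print(f"trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
```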
### 2.3 Other Parameter-Efficient Methods

```python
class AdapterLayer(nn.Module):
    """Bottleneck adapter: down-project, non-linearity, up-project, plus a residual connection."""

    def __init__(self, d_model, adapter_size=64):
        super().__init__()
        self.down_proj = nn.Linear(d_model, adapter_size)
        self.up_proj = nn.Linear(adapter_size, d_model)
        self.activation = nn.ReLU()

    def forward(self, x):
        residual = x
        x = self.down_proj(x)
        x = self.activation(x)
        x = self.up_proj(x)
        return x + residual


class PrefixTuning(nn.Module):
    """Prepends a block of learnable prefix embeddings to the input sequence."""

    def __init__(self, d_model, prefix_len=10):
        super().__init__()
        self.prefix_len = prefix_len
        self.prefix = nn.Parameter(torch.randn(prefix_len, d_model))

    def forward(self, x):
        batch_size = x.size(0)
        prefix = self.prefix.unsqueeze(0).expand(batch_size, -1, -1)
        return torch.cat([prefix, x], dim=1)


class IA3Layer(nn.Module):
    """IA³: rescales activations element-wise with a learned vector.

    Named IA3Layer because the superscript in IA³ is not a valid Python identifier.
    """

    def __init__(self, d_model):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(d_model))

    def forward(self, x):
        return x * self.scale
```

## 3. Performance Comparison

### 3.1 Method Comparison

| Method | Trainable parameters | Training speed | Memory footprint | Quality |
| --- | --- | --- | --- | --- |
| Full fine-tuning | 100% | 1x | High | Best |
| LoRA | 0.1-1% | 2-3x | Low | Very good |
| Adapter | 2-5% | 1.5x | Medium | Good |
| Prefix Tuning | 0.1-0.5% | 2x | Low | Fair |

### 3.2 Effect of the LoRA Rank

| Rank (r) | Parameter share | Quality | Training speed |
| --- | --- | --- | --- |
| 8 | 0.1% | Good | Fastest |
| 16 | 0.2% | Very good | Fast |
| 32 | 0.4% | Very good | Medium |
| 64 | 0.8% | Best | Slow |

### 3.3 Results by Task

Quality relative to full fine-tuning (= 100%):

| Task | LoRA | Adapter | Full fine-tuning |
| --- | --- | --- | --- |
| Text classification | 95% | 92% | 100% |
| Question answering | 94% | 91% | 100% |
| Generation | 96% | 93% | 100% |

## 4. Best Practices

### 4.1 Choosing a LoRA Configuration

```python
def select_lora_config(task_type):
    """Returns a per-task LoRA configuration, falling back to a conservative default."""
    configs = {
        "classification": {"rank": 8, "alpha": 16, "modules": ["q_proj", "v_proj"]},
        "generation": {"rank": 16, "alpha": 32, "modules": ["q_proj", "k_proj", "v_proj"]},
        "qa": {"rank": 16, "alpha": 32, "modules": ["q_proj", "v_proj"]},
    }
    return configs.get(task_type, {"rank": 8, "alpha": 16, "modules": ["q_proj", "v_proj"]})


class LoRAConfigGenerator:
    @staticmethod
    def from_task(task_type, model_size="base"):
        base_config = {"rank": 8, "alpha": 16}
        if model_size == "large":
            base_config["rank"] = 16
            base_config["alpha"] = 32
        if task_type == "generation":
            base_config["target_modules"] = ["q_proj", "k_proj", "v_proj"]
        else:
            base_config["target_modules"] = ["q_proj", "v_proj"]
        return base_config
```

### 4.2 LoRA Training Pipeline

```python
class LoRATrainingPipeline:
    def __init__(self, config):
        self.config = config

    def run(self):
        model = self._load_base_model()
        tokenizer = self._load_tokenizer()
        lora_trainer = LoRATrainer(model, tokenizer, self.config)
        data = self._prepare_data()
        lora_trainer.train(data)
        self._save_model(model)

    def _load_base_model(self):
        from transformers import AutoModelForCausalLM
        return AutoModelForCausalLM.from_pretrained(self.config["model_name"])

    def _load_tokenizer(self):
        from transformers import AutoTokenizer
        return AutoTokenizer.from_pretrained(self.config["model_name"])

    def _prepare_data(self):
        return self.config["data"]

    def _save_model(self, model):
        model.save_pretrained(self.config["output_dir"])
```

## 5. Summary

Parameter-efficient fine-tuning is a key technique for adapting LLMs to downstream tasks:

- LoRA offers the best balance between quality and efficiency.
- Adapters are well suited to multi-task scenarios.
- Prefix Tuning is well suited to generation tasks.
- The configuration should be adjusted to the task and the model size.

Key figures from the comparisons above:

- LoRA reaches roughly 95% of full fine-tuning quality on most tasks.
- Training is about 2-3x faster, and memory usage drops by about 50%.
- Recommended starting point: rank = 8-16 with alpha = 2 × rank.
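As a closing reference point, the recommended configuration can also be expressed with the Hugging Face `peft` library instead of the hand-rolled wrapper above. This is a minimal sketch, assuming `peft` and `transformers` are installed; the checkpoint name and the `q_proj`/`v_proj` module names are illustrative and depend on the model architecture.

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, TaskType, get_peft_model

# Illustrative base model; any causal LM checkpoint with q_proj/v_proj modules works similarly
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                                   # rank, per the 8-16 recommendation above
    lora_alpha=16,                         # alpha = 2 * rank
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()         # reports the trainable-parameter fraction
```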