0


Qwen2-1.5B-Instruct Lora微调

Qwen2-1.5B-Instruct Lora微调

最近做了一个基于Qwen2-1.5B-Instruct模型的比赛,记录一下自己的微调过程。怕自己以后忘了我就手把手一步一步来记录了。
大多数内容都是写给小白看的,如果你是小白建议你用jupyter运行,按照我这个模块一块一块运行;如果你是高手,单纯想找一份训练代码,直接看模块10,我在那里提供了完整代码。

1. 模型下载

一般模型尽量在modelscope上先搜一下,毕竟这个下载速度真的快。

  1. import torch
  2. from modelscope import snapshot_download, AutoModel, AutoTokenizer
  3. import os
  4. # 第一个参数表示下载模型的型号,第二个参数是下载后存放的缓存地址,第三个表示版本号,默认 master
  5. model_dir = snapshot_download('Qwen/Qwen2-1.5B-Instruct', cache_dir='./', revision='master')

2. 准备工作(高手请忽略,没啥用)

微调的主要工作其实就是数据处理,其他部分基本就是把现成的框架往里一放就行。
接下来是一份官网给出的推理的代码,借助这个代码我们来看输入模型的数据格式长什么样。

  1. from modelscope import AutoModelForCausalLM, AutoTokenizer
  2. device ="cuda"# the device to load the model onto
  3. model = AutoModelForCausalLM.from_pretrained("./Qwen2-1.5B-Instruct",
  4. torch_dtype="auto",
  5. device_map="auto")
  6. tokenizer = AutoTokenizer.from_pretrained("qwen/Qwen2-1.5B-Instruct")
  7. prompt ="你好"
  8. messages =[{"role":"system","content":'你是医疗问答助手章鱼哥,你将帮助用户解答基础的医疗问题。'},{"role":"user","content": prompt}]
  9. text = tokenizer.apply_chat_template(
  10. messages,
  11. tokenize=False,
  12. add_generation_prompt=True)
  13. model_inputs = tokenizer([text], return_tensors="pt").to(device)
  14. generated_ids = model.generate(
  15. model_inputs.input_ids,
  16. max_new_tokens=512)
  17. response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]print(response)

可以打印看看编码后的输入数据长什么样:

  1. '<|im_start|>system\n你是医疗问答助手章鱼哥,你将帮助用户解答基础的医疗问题。<|im_end|>\n<|im_start|>user\n你好<|im_end|>\n<|im_start|>assistant\n'

这里可以看到其实apply_chat_template方法在编码输入的时候没有给出attention mask等内容(它和智谱清言GLM的apply_chat_template差距很大,在这卡了我半天),所以在数据处理的时候就不能直接用它这个模板。

3. 接下来进入正题吧(导包)

  1. from datasets import Dataset, load_dataset
  2. from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

4. 加载数据

我这里用了一份医疗问答的csv数据。能看到这里的读者,数据处理应该就不需要细说了吧。

  1. dataset = load_dataset("csv", data_files="./问答.csv", split="train")
  2. dataset = dataset.filter(lambda x: x["answer"]isnotNone)
  3. datasets = dataset.train_test_split(test_size=0.1)

5. 数据预处理

  1. tokenizer = AutoTokenizer.from_pretrained("./Qwen2-1.5B-Instruct", trust_remote_code=True)defprocess_func(example):
  2. MAX_LENGTH =768# 最大输入长度,根据你的显存和数据自己调整
  3. input_ids, attention_mask, labels =[],[],[]
  4. instruction = example["question"].strip()# query# instruction = tokenizer.apply_chat_template([{"role": "user", "content": instruction}],# add_generation_prompt=True,# tokenize=True,# ) # '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nquery<|im_end|>\n<|im_start|>assistant\n'
  5. instruction = tokenizer(f"<|im_start|>system\n你是医学领域的人工助手章鱼哥<|im_end|>\n<|im_start|>user\n{example['question']}<|im_end|>\n<|im_start|>assistant\n",
  6. add_special_tokens=False,)
  7. response = tokenizer(f"{example['answer']}", add_special_tokens=False)# \n response, 缺少eos token
  8. input_ids = instruction["input_ids"]+ response["input_ids"]+[tokenizer.pad_token_id]
  9. attention_mask =(instruction["attention_mask"]+ response["attention_mask"]+[1])
  10. labels =[-100]*len(instruction["input_ids"])+ response["input_ids"]+[tokenizer.pad_token_id]iflen(input_ids)> MAX_LENGTH:
  11. input_ids = input_ids[:MAX_LENGTH]
  12. attention_mask = attention_mask[:MAX_LENGTH]
  13. labels = labels[:MAX_LENGTH]return{"input_ids": input_ids,"attention_mask": attention_mask,"labels": labels
  14. }
  15. tokenized_ds = datasets['train'].map(process_func, remove_columns=['id','question','answer'])
  16. tokenized_ts = datasets['test'].map(process_func, remove_columns=['id','question','answer'])

6. 创建模型

  1. import torch
  2. model = AutoModelForCausalLM.from_pretrained("./Qwen2-1.5B-Instruct", trust_remote_code=True)from peft import LoraConfig, TaskType, get_peft_model, PeftModel
  3. config = LoraConfig(target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"], modules_to_save=["post_attention_layernorm"])# 配置Lora参数
  4. model = get_peft_model(model, config)# 创建Lora模型

7. 配置训练参数

  1. args = TrainingArguments(
  2. output_dir="./chatbot",
  3. per_device_train_batch_size=4,
  4. gradient_accumulation_steps=8,
  5. gradient_checkpointing=True,
  6. logging_steps=300,
  7. num_train_epochs=10,
  8. learning_rate=1e-4,
  9. remove_unused_columns=False,
  10. save_strategy="epoch")# 在这里如果你开起了梯度检查点gradient_checkpointing=True,就必须加上model.enable_input_require_grads(),否则会报一个很难受的错误
  11. model.enable_input_require_grads()

8. 创建训练器

  1. trainer = Trainer(
  2. model=model,
  3. args=args,
  4. train_dataset=tokenized_ds.select(range(5000)),# 我这个数据量很大,我随机抽取5000条训练不然太慢了
  5. data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),)

9. 开始训练!!!

祝你成功

  1. trainer.train()  # start fine-tuning; checkpoints are saved to args.output_dir each epoch

10. 完整的.py代码

  1. import torch
  2. from datasets import Dataset, load_dataset
  3. from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
  4. from peft import LoraConfig, TaskType, get_peft_model, PeftModel
  5. dataset = load_dataset("csv", data_files="./问答.csv", split="train")
  6. dataset = dataset.filter(lambda x: x["answer"]isnotNone)
  7. datasets = dataset.train_test_split(test_size=0.1)
  8. tokenizer = AutoTokenizer.from_pretrained("./Qwen2-1.5B-Instruct", trust_remote_code=True)defprocess_func(example):
  9. MAX_LENGTH =768
  10. input_ids, attention_mask, labels =[],[],[]
  11. instruction = example["question"].strip()# query
  12. instruction = tokenizer(f"<|im_start|>system\n你是医学领域的人工助手章鱼哥<|im_end|>\n<|im_start|>user\n{example['question']}<|im_end|>\n<|im_start|>assistant\n",
  13. add_special_tokens=False,)
  14. response = tokenizer(f"{example['answer']}", add_special_tokens=False)# \n response, 缺少eos token
  15. input_ids = instruction["input_ids"]+ response["input_ids"]+[tokenizer.pad_token_id]
  16. attention_mask =(instruction["attention_mask"]+ response["attention_mask"]+[1])
  17. labels =[-100]*len(instruction["input_ids"])+ response["input_ids"]+[tokenizer.pad_token_id]iflen(input_ids)> MAX_LENGTH:
  18. input_ids = input_ids[:MAX_LENGTH]
  19. attention_mask = attention_mask[:MAX_LENGTH]
  20. labels = labels[:MAX_LENGTH]return{"input_ids": input_ids,"attention_mask": attention_mask,"labels": labels
  21. }
  22. tokenized_ds = datasets['train'].map(process_func, remove_columns=['id','question','answer'])
  23. tokenized_ts = datasets['test'].map(process_func, remove_columns=['id','question','answer'])
  24. model = AutoModelForCausalLM.from_pretrained("./Qwen2-1.5B-Instruct", trust_remote_code=True)
  25. config = LoraConfig(target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"], modules_to_save=["post_attention_layernorm"])
  26. model = get_peft_model(model, config)
  27. args = TrainingArguments(
  28. output_dir="./law",
  29. per_device_train_batch_size=4,
  30. gradient_accumulation_steps=16,
  31. gradient_checkpointing=True,
  32. logging_steps=6,
  33. num_train_epochs=10,
  34. learning_rate=1e-4,
  35. remove_unused_columns=False,
  36. save_strategy="epoch")
  37. model.enable_input_require_grads()
  38. trainer = Trainer(
  39. model=model,
  40. args=args,
  41. train_dataset=tokenized_ds.select(range(400)),
  42. data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),)
  43. trainer.train()

11. 合并Lora推理预测代码

  1. import torch
  2. from transformers import AutoModelForCausalLM, AutoTokenizer
  3. from peft import PeftModel
  4. defpredict(messages, model, tokenizer):
  5. device ="cuda"
  6. text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
  7. model_inputs = tokenizer([text], return_tensors="pt").to(device)
  8. generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)
  9. generated_ids =[output_ids[len(input_ids):]for input_ids, output_ids inzip(model_inputs.input_ids, generated_ids)]
  10. response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]return response
  11. # 加载原下载路径的tokenizer和model
  12. tokenizer = AutoTokenizer.from_pretrained("./Qwen2-1.5B-Instruct/", use_fast=False, trust_remote_code=True)
  13. model = AutoModelForCausalLM.from_pretrained("./Qwen2-1.5B-Instruct/", device_map="auto", torch_dtype=torch.bfloat16)# 加载训练好的Lora模型,将下面的checkpointXXX替换为实际的checkpoint文件名名称
  14. model = PeftModel.from_pretrained(model, model_id="./chatbot/checkpoint-1560")
  15. test_texts ={'instruction':"你是医学领域的人工助手章鱼哥",'input':"嗓子疼,是不是得了流感了"}
  16. instruction = test_texts['instruction']
  17. input_value = test_texts['input']
  18. messages =[{"role":"system","content":f"{instruction}"},{"role":"user","content":f"{input_value}"}]
  19. response = predict(messages, model, tokenizer)print(response)

本文转载自: https://blog.csdn.net/m0_50972200/article/details/140667653
版权归原作者 十分钟ll 所有, 如有侵权,请联系我们删除。

“Qwen2-1.5B-Instruct Lora微调”的评论:

还没有评论