Gemma

1. Usage
2. RAG
3. LoRA
3.1 LoRA classification task
3.2 LoRA Chinese language-modeling task

1. Usage

The first step was downloading the model from HF. The download kept failing, so I switched to the HF mirror site; downloading Gemma requires an HF token, and following the steps there works. The code is mostly adapted from write-ups shared on the Kaggle forums.
huggingface-cli download --token hf_XXX --resume-download google/gemma-7b --local-dir gemma-7b-mirror

Here I switch between the 2b and 7b checkpoints depending on the run.
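If the CLI route keeps failing, the same download can also be done from Python through the mirror endpoint. This is only a minimal sketch: it assumes the hf-mirror.com endpoint, and the local folder name is hypothetical.

# Sketch: assumes the hf-mirror.com endpoint; the local folder name is hypothetical.
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"  # must be set before importing huggingface_hub

from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="google/gemma-2b",
    token="hf_XXX",               # the same HF token as above
    local_dir="gemma-2b-mirror",  # hypothetical local directory
    resume_download=True,
)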
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("D:/Gemma/gemma-2b-int-mirror2")
Gemma = AutoModelForCausalLM.from_pretrained("D:/Gemma/gemma-2b-int-mirror2")

def answer_the_question(question):
    input_ids = tokenizer(question, return_tensors="pt")
    generated_text = Gemma.generate(**input_ids, max_length=256)
    answer = tokenizer.decode(generated_text[0], skip_special_tokens=True)
    return answer

question = "给我写一首优美的诗歌?"
answer = answer_the_question(question)
print(answer)
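If the local checkpoint is an instruction-tuned Gemma variant (an assumption here, the folder name does not say), wrapping the question in the chat template usually gives a cleaner answer than feeding raw text; a minimal sketch:

# Sketch: assumes an instruction-tuned Gemma checkpoint that ships a chat template.
messages = [{"role": "user", "content": "给我写一首优美的诗歌?"}]
chat_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
chat_outputs = Gemma.generate(chat_inputs, max_new_tokens=256)
print(tokenizer.decode(chat_outputs[0], skip_special_tokens=True))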
2. RAG

Reference:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
## 2.1 Retrieve sentence chunks for the question
import os
def get_all_pdfs(directory):
    pdf_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                pdf_files.append(os.path.join(root, file))
    return pdf_files

class RAG:
    def __init__(self, num_retrieved_docs=5, pdf_folder_path="D:/Gemma/PDF"):
        pdf_files = get_all_pdfs(pdf_folder_path)
        print("Documents used", pdf_files)
        loaders = [PyPDFLoader(pdf_file) for pdf_file in pdf_files]
        all_documents = []
        for loader in loaders:
            raw_documents = loader.load()
            text_splitter = CharacterTextSplitter(
                separator="\n\n",
                chunk_size=10,
                chunk_overlap=1,
                # length_function=len,
            )
            documents = text_splitter.split_documents(raw_documents)
            all_documents.extend(documents)
        embeddings = HuggingFaceEmbeddings(model_name="D:/Projects/model/m3e-base")
        self.db = FAISS.from_documents(all_documents, embeddings)
        self.retriever = self.db.as_retriever(search_kwargs={"k": num_retrieved_docs})

    def search(self, query):
        docs = self.retriever.get_relevant_documents(query)
        return docs
retriever = RAG()
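Each retrieved item is a LangChain Document, so its text and source PDF can be inspected directly; a quick sketch, reusing the query from section 2.3:

# Quick check of what the retriever returns.
docs = retriever.search("存在的不足及后续的优化工作")
for d in docs:
    print(d.metadata.get("source"), d.metadata.get("page"))  # source PDF and page number
    print(d.page_content[:80])                               # first characters of the chunk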
## 2.2 Answer based on the question and the retrieved sentence chunks
class Assistant:
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("D:/Gemma/gemma-2b-int-mirror2")
        self.Gemma = AutoModelForCausalLM.from_pretrained("D:/Gemma/gemma-2b-int-mirror2")

    def create_prompt(self, query, retrieved_info):
        prompt = (
            "你是人工智能助手，需要根据Relevant information里面的相关内容回答用户的Instruction，其中相关信息如下：\n"
            f"Instruction: {query}\n"
            f"Relevant information: {retrieved_info}\n"
            "Output:"
        )
        print(prompt)
        return prompt

    def reply(self, query, retrieved_info):
        prompt = self.create_prompt(query, retrieved_info)
        # Feed the full prompt (question plus retrieved context) to the model
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
        # Generate text with a focus on factual responses
        generated_text = self.Gemma.generate(
            input_ids,
            do_sample=True,
            max_length=500,
            temperature=0.7,  # adjust per task; for code generation it can be 0.9
        )
        # Decode and return the answer
        answer = self.tokenizer.decode(generated_text[0], skip_special_tokens=True)
        return answer
chatbot = Assistant()
## 2.3 Put the RAG pipeline to use
def generate_reply(query):
    related_docs = retriever.search(query)
    # print("related docs", related_docs)
    reply = chatbot.reply(query, related_docs)
    return reply
reply = generate_reply("存在的不足及后续的优化工作")
for s in reply.split("\n"):
    print(s)

3. LoRA
3.1 LoRA classification task
Reference: train the model on the nlp-getting-started dataset for a binary classification task. First, get the base model.
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments, pipeline
from peft import prepare_model_for_int8_training, LoraConfig, TaskType, get_peft_model
import numpy as np
NUM_CLASSES = 2  # number of output classes
BATCH_SIZE, EPOCHS, R, LORA_ALPHA, LORA_DROPOUT = 8, 5, 64, 32, 0.1  # LoRA training hyperparameters
MODEL_PATH = "D:/Gemma/gemma-2b-int-mirror2"  # model path
# 1. Base model, configured with a binary-classification head
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=NUM_CLASSES)
print(model)

Process the CSV data and encode the input text with the tokenizer.
# 2. Process the dataset (tokenize, truncating over-long inputs)
dataset = load_dataset("csv", data_files="D:/Gemma/nlp-getting-started/train.csv")
dataset["test"] = dataset["train"]
dataset = dataset.remove_columns(["id", "keyword", "location"])
dataset = dataset.rename_column("target", "label")  # only the text and label columns remain
tokenized_dataset = {}  # train and test splits
for split in dataset.keys():
    tokenized_dataset[split] = dataset[split].map(lambda x: tokenizer(x["text"], truncation=True), batched=True)
print(tokenized_dataset["train"])
print(tokenized_dataset["train"][1])
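Note that dataset["test"] = dataset["train"] above evaluates on the training data itself. If a real held-out split is wanted, the datasets library can create one; this sketch is not part of the original post and would replace that assignment before tokenization:

# Optional alternative to reusing the training split as the test split.
split = dataset["train"].train_test_split(test_size=0.2, seed=42)
dataset["train"], dataset["test"] = split["train"], split["test"]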
Configure the LoRA parameters on top of the base model to obtain lora_model.

# 3. LoRA model parameter settings
model = prepare_model_for_int8_training(model)
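# Note (assumption about the installed peft version): prepare_model_for_int8_training has been
# deprecated in newer peft releases in favor of prepare_model_for_kbit_training (used in 3.2);
# if the import above fails, switching to prepare_model_for_kbit_training is the usual replacement.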
lora_config = LoraConfig(
    r=R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    task_type=TaskType.SEQ_CLS,  # SEQ_CLS: sequence classification; TOKEN_CLS: named-entity recognition; SEQ_2_SEQ_LM: machine translation; CAUSAL_LM: language modeling
    target_modules="all-linear",  # "all-linear": all linear layers (other options target embedding or convolution layers)
)
lora_model = get_peft_model(model, lora_config)
print(lora_model)
lora_model.print_trainable_parameters()  # trainable parameters of the LoRA model

Configure the training arguments for lora_model.
# 4. LoRA training arguments (metric computation, etc.)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./LoAR_data/",
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        load_best_model_at_end=True,
        logging_steps=10,
        report_to="none",
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

Start training, then save and use the model.
# 5. Train and evaluate
print("Evaluating the Model Before Training!")
trainer.evaluate()
print("Training the Model")
trainer.train()
print("Evaluating the trained model")
trainer.evaluate()
# 6. Save and use the model
lora_model.save_pretrained("fine-tuned-model")
clf = pipeline("text-classification", lora_model, tokenizer=MODEL_PATH)  # the LoRA fine-tuned model
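A quick sanity check of the fine-tuned classifier, plus how the saved adapter can be reloaded later. The input tweet is invented, and the reload assumes the base checkpoint is loaded fresh; this is a sketch, not part of the original post.

# Hypothetical input; in this dataset label 1 is usually read as "a real disaster tweet".
print(clf("There is a wildfire spreading near the town, please evacuate"))

# Reload the saved LoRA adapter onto a freshly loaded base model.
from peft import PeftModel
base = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=NUM_CLASSES)
restored = PeftModel.from_pretrained(base, "fine-tuned-model")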
3.2 LoRA Chinese language-modeling task

Reference: first, get the base model and its config.
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorWithPadding, PreTrainedTokenizer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
import torch
import datasets
from tqdm import tqdm
import json
BATCH_SIZE, EPOCHS, R, LORA_ALPHA, LORA_DROPOUT = 8, 5, 64, 32, 0.1  # LoRA training hyperparameters
MODEL_PATH = "D:/Gemma/gemma-2b-int-mirror2"  # model path
device = torch.device("cuda:0")
# 1. Base model and its config
config = AutoConfig.from_pretrained(MODEL_PATH, trust_remote_code=True)
config.is_causal = True  # ensure the model only sees left-hand context when generating
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", config=config, trust_remote_code=True)

Process the JSON data according to the model and its config.
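Before running the preprocessing, it can help to confirm that the JSON-lines file really carries the "q"/"a" keys that preprocess() below expects; a small sketch using the same file path as below:

# Peek at the first line of the JSON-lines file; only the "q"/"a" keys matter for preprocess().
import json
with open("D:/Gemma/try/hc3_chatgpt_zh_specific_qa.json", "r", encoding="utf8") as f:
    first = json.loads(f.readline())
print(list(first.keys()))          # expected to contain "q" and "a"
print(first["q"][:50], "->", first["a"][:50])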
# 2. Process the dataset according to the model config (tokenize, then save and reload)
def preprocess(tokenizer: PreTrainedTokenizer, config, file_path, max_seq_length,
               prompt_key, target_key, skip_overlength=False):
    # Data preprocessing
    pad_token_id = tokenizer.pad_token_id  # ID of the padding token
    with open(file_path, "r", encoding="utf8") as f:
        for line in tqdm(f.readlines()):
            example = json.loads(line)
            prompt_ids = tokenizer.encode(example[prompt_key], max_length=max_seq_length, truncation=True)
            target_ids = tokenizer.encode(example[target_key], max_length=max_seq_length, truncation=True)
            # Check whether prompt + target exceeds the maximum length; skip if requested
            total_length = len(prompt_ids) + len(target_ids) + (1 if config.eos_token_id is not None else 0)
            if skip_overlength and total_length > max_seq_length:
                continue
            # Concatenate prompt and target, appending the EOS token if one is defined
            input_ids = prompt_ids + target_ids
            if config.eos_token_id is not None:
                input_ids.append(config.eos_token_id)
            # Truncate the sequence to the maximum length
            input_ids = input_ids[:max_seq_length]
            # Pad the sequence to the maximum length
            input_ids.extend([pad_token_id] * (max_seq_length - len(input_ids)))
            assert len(input_ids) == max_seq_length, "the sequence length must equal max_seq_length"
            yield {
                "input_ids": input_ids,
                "seq_len": len(prompt_ids)  # seq_len is the length of the original prompt, excluding padding
            }
dataset = datasets.Dataset.from_generator(
    lambda: preprocess(tokenizer, config, "D:/Gemma/try/hc3_chatgpt_zh_specific_qa.json",
                       max_seq_length=2000, prompt_key="q", target_key="a")
)
dataset.save_to_disk("h3c-chinese")  # save the processed dataset
train_set = datasets.load_from_disk("h3c-chinese")  # load the processed dataset

Configure the LoRA parameters.
# 3. LoRA model parameter settings
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    task_type="CAUSAL_LM",
    target_modules="all-linear",
)
lora_model = get_peft_model(model, lora_config)
print(lora_model)
lora_model.print_trainable_parameters()  # trainable parameters of the LoRA model

Configure the LoRA training arguments, including metric computation (compute_metrics), and collate the input_ids into batches of training samples.
trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./LoAR_data2/",
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        save_strategy="epoch",
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        logging_steps=10,
        report_to="none",
    ),
    train_dataset=train_set,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    # compute_metrics=compute_metrics,
)
trainer.train()
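One caveat about the batching step mentioned above: DataCollatorWithPadding does not create labels, so the Trainer has no target for a causal-LM loss on these input_ids. A common alternative (an assumption here, not from the original post) is a collator that copies input_ids into labels, for example DataCollatorForLanguageModeling with mlm=False:

from transformers import DataCollatorForLanguageModeling

# Copies each example's input_ids into labels (the model shifts them internally),
# so a causal-LM loss can be computed; padding positions are set to -100 and ignored.
causal_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# trainer = Trainer(..., data_collator=causal_collator, ...)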