使用 LoRA 在 viggo 数据集上微调 Microsoft phi-2 小语言模型

本文介绍：下面我们将分步给出 Python 代码，演示如何使用 HuggingFace 上的 phi-2 进行提示（prompting），然后在 viggo 数据集上对其进行微调。我在 Google Colab 免费版的 T4 GPU 上运行了这份代码笔记本。

Phi-2 是 Microsoft 推出的基于 Transformer 的小语言模型，以 MIT 许可证发布，可以在 HuggingFace 上获取和使用。

它在 96 块 A100 GPU 上使用 1.4T 个 token 训练了 14 天。Phi-2 是一个拥有 27 亿参数的预训练 Transformer，没有经过 RLHF 或指令微调。它执行的是下一个 token 的预测，可用于问答、聊天格式以及代码生成等场景中的文本生成。

事实证明，phi-2 在多个基准测试以及编码、数学等任务上的表现优于许多具有 7B 和 13B 参数的模型。

小语言模型之所以具有优异的性能，是因为使用了经过精心筛选的高质量训练数据，即“教科书质量”的数据。小语言模型使用知识蒸馏，也就是说，它们是用从 LLM 中提取的核心/基本知识训练出来的。然后采用剪枝和量化技术来删除模型的非必要部分。训练数据通常是多个合成数据集的混合，这些数据集是专门创建的，旨在教会模型在科学、日常活动、心智理论等领域进行常识推理并掌握一般知识。训练数据还可能包含经过筛选的、具有较高教育价值和质量的网络数据。小语言模型使用创新技术进行扩展。

#@title Install required libraries
# Notebook shell commands: the leading "!" hands each line to the shell, so
# this cell only runs inside Jupyter/Colab, not as a plain Python script.
# Every package is pinned to an exact version except huggingface_hub, which
# is installed at whatever version pip resolves.
!pip install accelerate==0.25.0
!pip install bitsandbytes==0.41.1
!pip install datasets==2.14.6
!pip install peft==0.6.2
!pip install transformers==4.36.2
!pip install torch==2.1.0
!pip install einops==0.4.1  
!pip install huggingface_hub

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, logging
from datasets import Dataset

# Make CUDA the default device for tensors created from here on, as in the
# usage example on the phi-2 model card.
torch.set_default_device("cuda")

# Create the model object and the corresponding tokenizer.
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)

# Demo 1 - code completion (see https://huggingface.co/microsoft/phi-2).
# Here the prompt literal is written directly inside the tokenizer() call.
inputs = tokenizer('''def fibonacci(n):
   """
   This function prints the terms in Fibonacci series upto n
   """''', return_tensors="pt", return_attention_mask=False)

# max_length caps the total sequence length (prompt plus generated tokens).
outputs = model.generate(**inputs, max_length=100)
# generate() returns a batch; only one prompt was passed, so take item 0.
text = tokenizer.batch_decode(outputs)[0]
print(text)

# Demo 2 - the same code-completion prompt (https://huggingface.co/microsoft/phi-2),
# but the prompt string is defined separately and then passed to tokenizer().
prompt = '''def fibonacci(n):
   """
   This function prints the terms in Fibonacci series upto n
   """'''
inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
outputs = model.generate(**inputs, max_length=100)
text = tokenizer.batch_decode(outputs)[0]
print(text)

# Demo 3 - a question-answering prompt, with a larger length budget.
# (The original prompt contained the typo "thee"; it is corrected here so the
# model is asked a well-formed question.)
prompt = 'What is the relevance of mathematics for understanding physics?'
inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
outputs = model.generate(**inputs, max_length=200)
text = tokenizer.batch_decode(outputs)[0]
print(text)

#@title Set up accelerator to speed up the training/finetuning
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

# The model and optimizer state-dict configs use identical settings:
# offload to CPU, and do not limit the state dict to rank 0.
model_state_cfg = FullStateDictConfig(rank0_only=False, offload_to_cpu=True)
optim_state_cfg = FullOptimStateDictConfig(rank0_only=False, offload_to_cpu=True)

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=model_state_cfg,
    optim_state_dict_config=optim_state_cfg,
)

# The accelerator built here is used further down to prepare the PEFT model.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

#@title login to your huggingface account using your access token
# you can find your access token at https://huggingface.co/settings/tokens
# The training cell further down sets push_to_hub=True, so an authenticated
# Hugging Face session is set up here before training starts.
from huggingface_hub import notebook_login
notebook_login()

#@title load viggo dataset
from datasets import load_dataset

# Fetch the three gem/viggo splits in order; the names on the left line up
# one-to-one with the split names on the right.
train_dataset, eval_dataset, test_dataset = (
    load_dataset('gem/viggo', split=split_name)
    for split_name in ('train', 'validation', 'test')
)

#@title load base model microsoft/phi-2
import gc

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorForLanguageModeling

# Release whatever model is still bound to `model` (the fp16 copy used for the
# prompting demo earlier in the notebook) BEFORE loading the 8-bit copy.
# Without this, the old weights stay referenced until from_pretrained()
# returns, so both copies occupy GPU memory at the same time.
model = None
gc.collect()
torch.cuda.empty_cache()

base_model_id = "microsoft/phi-2"

# Load the base checkpoint quantized to 8 bits. The quantization is requested
# through an explicit BitsAndBytesConfig rather than the bare `load_in_8bit`
# keyword, which later transformers releases deprecate.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

#@title set up the tokenizer for base model
# Slow (Python) tokenizer for the base checkpoint, configured with the
# add-BOS and add-EOS options. No padding is configured at this stage.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    use_fast=False,  # needed for now, should be fixed soon
    add_bos_token=True,
    add_eos_token=True,
)

#@title setup tokenize function to make labels and input_ids the same for the self-supervised fine-tuning.
def tokenize(prompt):
    """Encode ``prompt`` and attach labels for self-supervised fine-tuning.

    The labels are a copy of the input ids, so the training target is the
    prompt text itself.

    Args:
        prompt: Text to encode with the module-level ``tokenizer``.

    Returns:
        The tokenizer's encoding with an additional ``"labels"`` entry.
    """
    encoded = tokenizer(prompt)
    label_ids = encoded["input_ids"].copy()
    encoded["labels"] = label_ids
    return encoded

#@title convert each sample into a prompt
 
def generate_and_tokenize_prompt(data_point):
    """Build the training prompt for one viggo sample and tokenize it.

    The prompt consists of the task instructions, the sample's target
    sentence, and its meaning representation as the expected completion.

    The template is written flush-left on purpose: text inside a
    triple-quoted string is kept verbatim, so indenting it to match the
    surrounding code would embed long runs of spaces in every training
    prompt. Keeping it flush-left gives the same layout as the inference
    prompt (``eval_prompt``) used later in this notebook.

    Args:
        data_point: Mapping with ``"target"`` (the sentence) and
            ``"meaning_representation"`` (its structured form) entries.

    Returns:
        Whatever ``tokenize`` returns for the assembled prompt.
    """
    full_prompt = f"""Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']

### Target sentence:
{data_point["target"]}

### Meaning representation:
{data_point["meaning_representation"]}
"""
    return tokenize(full_prompt)




#@title Reformat the prompt and tokenize each sample:

# First pass: map every train/validation sample through
# generate_and_tokenize_prompt using the tokenize() defined at this point.
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(generate_and_tokenize_prompt)
    for split in (train_dataset, eval_dataset)
)

max_length = 320  # appropriate max length for this dataset

# Rebuild the tokenizer, this time with left-side padding. tokenize() is
# redefined right after this cell to use it.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    use_fast=False,  # needed for now, should be fixed soon
    trust_remote_code=True,
    add_bos_token=True,
    add_eos_token=True,
    padding_side="left",
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


def tokenize(prompt):
    """Encode ``prompt`` to a fixed length and attach labels.

    The encoding is truncated and padded to the module-level ``max_length``.
    The labels are a copy of the resulting input ids.

    Args:
        prompt: Text to encode with the module-level ``tokenizer``.

    Returns:
        The tokenizer's encoding with an additional ``"labels"`` entry.
    """
    encoding_options = {
        "truncation": True,
        "max_length": max_length,
        "padding": "max_length",
    }
    encoded = tokenizer(prompt, **encoding_options)
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded


#@title tokenize train and validation datasets using generate_and_tokenize_prompt function
# Train split first, then validation; both go through the same mapping.
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(generate_and_tokenize_prompt)
    for split in (train_dataset, eval_dataset)
)

from peft import LoraConfig, get_peft_model

# LoRA adapter settings for causal language modelling.
# NOTE(review): the target_modules names presumably have to match module names
# inside the loaded phi-2 implementation - confirm against the checkpoint
# revision in use.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,  # Conventional
    bias="none",
    target_modules=["Wqkv", "fc1", "fc2"],
)

# Wrap the base model with the LoRA adapters described above.
model = get_peft_model(model, config)

# Apply the accelerator to the model for faster training.
model = accelerator.prepare_model(model)

# Train the model and push checkpoints to the Hugging Face Hub.
import transformers

# Padding uses the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

training_args = transformers.TrainingArguments(
    output_dir="./phi2-finetunedonviggodataset",
    warmup_steps=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=500,
    learning_rate=2.5e-5,
    logging_steps=50,                # Log every 50 steps
    optim="paged_adamw_8bit",
    logging_dir="./logs",            # Directory for storing logs
    save_strategy="steps",           # Checkpoint on a step schedule...
    save_steps=50,                   # ...every 50 steps
    evaluation_strategy="steps",     # Evaluate on a step schedule...
    eval_steps=50,                   # ...every 50 steps
    do_eval=True,                    # Enable evaluation on eval_dataset
    push_to_hub=True,
)

# mlm=False selects causal (next-token) language modelling, not masked LM.
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=training_args,
    data_collator=data_collator,
)

# Turn off the generation cache on the model config before training starts.
model.config.use_cache = False
trainer.train()

# Load a fresh copy of the base model (and a tokenizer) for evaluation.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "microsoft/phi-2"

# 8-bit quantization is requested through an explicit BitsAndBytesConfig
# rather than the bare `load_in_8bit` keyword, which later transformers
# releases deprecate. device_map="auto" lets the library place the weights.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

# Tokenizer used only for evaluation prompts. Unlike the training tokenizer,
# it sets no padding options and does not pass add_eos_token.
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    trust_remote_code=True,
    use_fast=False,
)

# Sample evaluation prompt: the instructions plus one target sentence, ending
# right after the "Meaning representation" header so the model fills it in.
eval_prompt = """Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']

### Target sentence:
Earlier, you stated that you didn't have strong feelings about PlayStation's Little Big Adventure. Is your opinion true for all games which don't have multiplayer?

### Meaning representation:
"""

# Tokenize the prompt, then print the base model's response.
model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to('cuda')
base_model.eval()
with torch.no_grad():
    base_output_ids = base_model.generate(**model_input, max_new_tokens=100)
    # A single prompt was passed, so decode the first (only) sequence.
    base_response = eval_tokenizer.decode(base_output_ids[0], skip_special_tokens=True)
    print(base_response)

from peft import PeftModel

# Attach the fine-tuned LoRA adapter from the Hub repo to the base model.
# force_download=True is passed so the adapter files are fetched again.
ft_model = PeftModel.from_pretrained(base_model, "nimrita/phi2-finetunedonviggodataset", force_download=True)

# Same evaluation prompt as for the base model, so the outputs can be compared.
eval_prompt = """Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']

### Target sentence:
Earlier, you stated that you didn't have strong feelings about PlayStation's Little Big Adventure. Is your opinion true for all games which don't have multiplayer?

### Meaning representation:
"""

model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to('cuda')
ft_model = ft_model.to('cuda')
ft_model.eval()
with torch.no_grad():
    ft_output_ids = ft_model.generate(**model_input, max_new_tokens=100)
    # A single prompt was passed, so decode the first (only) sequence.
    ft_response = eval_tokenizer.decode(ft_output_ids[0], skip_special_tokens=True)
    print(ft_response)