ChatGLM 6B 部署及微调【干货】

本文介绍: ChatGLM 6B 小模型微调训练及部署、垂直领域智能助理、接口实现

cd /opt
git clone https://github.com/THUDM/ChatGLM2-6B

将模型下载后放到项目文件夹内

git lfs install # 确认安装了lfs，或者直接到项目地址点击下载
git clone https://huggingface.co/THUDM/chatglm2-6b

pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install tensorboard gradio mdtex2html -i https://pypi.tuna.tsinghua.edu.cn/simple

nvidia-smi # 输入指令>>>
NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.

uname -r

sudo apt-get install linux-generic-hwe-20.04 #这个是下载新的内核版本

$ sudo apt-get purge libnvidia*
$ sudo apt-get purge nvidia*

$ sudo dpkg --list | grep nvidia      #找版本
$ ubuntu-drivers devices #这样也行。

$ sudo add-apt-repository ppa:graphics-drivers/ppa
$ sudo apt-get update
$ sudo apt-get install nvidia-driver-535
$ sudo prime-select nvidia
# ------------- 如果上面有报错的话 -----------------------
$ sudo apt-get install --reinstall nvidia-driver-535
# -------------------------------------------------------
$ reboot # 重启

$ nvidia-smi

将web_demo.py中的"THUDM/chatglm2-6b"模型地址指向保存的模型地址"/opt/ChatGlm6B2/model"

# 全精度全加载
model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True).cuda()
# 半精度全加载
model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True).half().cuda()
# 半精度8bit量化
model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True).quantize(8).half().cuda()
# 半精度4bit量化
model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True).quantize(4).half().cuda()

# demo.queue().launch(share=False, inbrowser=True)
demo.queue().launch(share=True, inbrowser=True, server_name='127.0.0.1', server_port=8080)

wget https://cdn-media.huggingface.co/frpc-gradio-0.2/frpc_linux_amd64

mv frpc_linux_amd64 frpc_linux_amd64_v0.2

mv frpc_linux_amd64_v0.2 /root/.virtualenvs/glm/lib/python3.8/site-packages/gradio

cd /root/.virtualenvs/glm/lib/python3.8/site-packages/gradio

chmod 777 frpc_linux_amd64_v0.2

# Gradio 启动
workon glm
cd /opt/ChatGlm6B2
python3 web_demo.py
# --------------------------------------------------------------------
python3 /opt/ChatGlm6B2/web_demo.py

# 或者 Streamlit 启动
workon glm
cd /opt/ChatGlm6B2
streamlit run web_demo2.py --server.port 8080
# --------------------------------------------------------------------
streamlit run /opt/ChatGlm6B2/web_demo2.py --server.port 8080

# 除了部署需要的依赖之外，还需要安装以下依赖
pip install rouge_chinese nltk jieba datasets transformers[torch] -i https://pypi.douban.com/simple/

参考 answers.xlsx 、 dev.xlsx 编写自己的数据

使用 GetPrompts.py、或者使用打包好的 GetPrompts.exe 将数据转化为json，其中history默认按,分隔

将整理好的数据集移入文件夹，参考执行脚本中即将配置的路径../answers.json和../dev.json,将文件放到以下位置

/opt/ChatGlm6B2/answers.json
/opt/ChatGlm6B2/dev.json

完整文件结构

/opt/ChatGlm6B2
	/model
	/output
	answers.json
	dev.json
	/ptuning
		main.py
		train_chat.sh
		…
	…

# /opt/ChatGlm6B2/ptuning/train_chat.sh

# torchrun命令运行PyTorch训练脚本：
# torchrun: 运行PyTorch --standalone以独立模式运行(即在单个节点上运行，而不是在分布式环境中运行) --nnodes=1 指定节点的数量为1(即单节点运行) --nproc-per-node=$NUM_GPUS 指定每个节点上的进程数(GPU的数量) main.py 运行的PyTorch训练脚本
# do_train: 指定进行训练。
# train_file $CHAT_TRAIN_DATA: 指定训练数据文件的路径。
# validation_file $CHAT_VAL_DATA: 指定验证数据文件的路径。
# preprocessing_num_workers 10: 指定预处理数据时使用的工作进程数。
# prompt_column prompt: 指定数据中包含提示的列名。
# response_column response: 指定数据中包含响应的列名。
# history_column history: 指定数据中包含历史对话的列名。
# overwrite_cache: 指定是否覆盖缓存的预处理数据。
# model_name_or_path THUDM/chatglm2-6b: 指定模型的名称或路径，这里使用了一个预训练的聊天生成模型。
# output_dir $CHECKPOINT_NAME: 指定输出目录的路径。
# overwrite_output_dir: 指定是否覆盖输出目录。
# max_source_length 256: 指定输入序列的最大长度。
# max_target_length 256: 指定目标序列的最大长度。
# per_device_train_batch_size 1: 指定每个设备的训练批次大小。
# per_device_eval_batch_size 1: 指定每个设备的评估批次大小。
# gradient_accumulation_steps 16: 指定梯度累积的步数。
# predict_with_generate: 指定在生成模式下进行预测。
# max_steps 3000: 指定训练的最大步数。
# logging_steps 10: 指定每隔多少步打印一次日志。
# save_steps 1000: 指定每隔多少步保存一次模型。
# learning_rate $LR: 指定学习率，其中`$LR`是一个环境变量，用于指定学习率的值。
# pre_seq_len $PRE_SEQ_LEN: 指定 P-Tuning 前缀(soft prompt)的长度；推理加载 checkpoint 时必须使用与训练时相同的值。
# quantization_bit 4: 指定量化的位数。

# Value passed to --pre_seq_len; it is also part of the output directory name.
PRE_SEQ_LEN=128
# Learning rate passed to --learning_rate; it is also part of the output directory name.
LR=1e-2
# Number of processes torchrun starts on this node (--nproc-per-node).
NUM_GPUS=1
# Conversation (training) data
CHAT_TRAIN_DATA=../answers.json
# Validation data
CHAT_VAL_DATA=../dev.json
# Directory the checkpoints are written under
CHECKPOINT_NAME=../output
# Base model directory
MODEL_PATH=../model
# Maximum number of training steps (the upstream default is 3000)
MAX_STEP=500
# Pitfall: MAX_STEP was once lowered to speed training up while the save interval stayed at 1000;
# the run ended before any save point was reached, no checkpoint was written, and training had
# to be redone. Keep SAVE_STEP <= MAX_STEP.
# Save interval, in steps
SAVE_STEP=100

# Every line of the command except the last must end with a backslash; without the
# continuations the shell would run torchrun with no options and then try to execute
# each "--flag" line as a separate command.
torchrun --standalone --nnodes=1 --nproc-per-node="$NUM_GPUS" main.py \
    --do_train \
    --train_file "$CHAT_TRAIN_DATA" \
    --validation_file "$CHAT_VAL_DATA" \
    --preprocessing_num_workers 10 \
    --prompt_column prompt \
    --response_column response \
    --history_column history \
    --overwrite_cache \
    --model_name_or_path "$MODEL_PATH" \
    --output_dir "$CHECKPOINT_NAME/chatglm-6b-pt-$PRE_SEQ_LEN-$LR" \
    --overwrite_output_dir \
    --max_source_length 512 \
    --max_target_length 512 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --predict_with_generate \
    --max_steps "$MAX_STEP" \
    --logging_steps 10 \
    --save_steps "$SAVE_STEP" \
    --learning_rate "$LR" \
    --pre_seq_len "$PRE_SEQ_LEN" \
    --quantization_bit 4

workon glm
cd /opt/ChatGlm6B2/ptuning
sh train_chat.sh                      # or: bash train_chat.sh
# ---------------------------------------------------------------
# Alternative to the foreground run above: start the training in the background.
# The trailing `&` is what detaches the job (without it nohup keeps the terminal busy),
# and `2>&1` sends stderr to the same log file as stdout.
nohup sh train_chat.sh > output.log 2>&1 &

# /opt/ChatGlm6B2/ptuning/evaluate.sh
# model_name_or_path：原始ChatGLM-6B模型文件路径
# ptuning_checkpoint：训练完成后，生成的文件目录

# LR and PRE_SEQ_LEN are only used here to rebuild the directory name that training produced,
# plus --pre_seq_len; they must match the values used in train_chat.sh.
LR=1e-2
PRE_SEQ_LEN=128
# Number of processes torchrun starts on this node (--nproc-per-node).
NUM_GPUS=1
# Validation data (also passed as the test file below)
CHAT_VAL_DATA=../dev.json
# Directory the checkpoints were written under
CHECKPOINT_NAME=../output
# Base model directory
MODEL_PATH=../model
# Step number of the checkpoint to evaluate (checkpoint-$MAX_STEP)
MAX_STEP=500

# Every line of the command except the last must end with a backslash; without the
# continuations the shell would run torchrun with no options and then try to execute
# each "--flag" line as a separate command.
torchrun --standalone --nnodes=1 --nproc-per-node="$NUM_GPUS" main.py \
    --do_predict \
    --validation_file "$CHAT_VAL_DATA" \
    --test_file "$CHAT_VAL_DATA" \
    --overwrite_cache \
    --prompt_column prompt \
    --response_column response \
    --history_column history \
    --model_name_or_path "$MODEL_PATH" \
    --ptuning_checkpoint "$CHECKPOINT_NAME/chatglm-6b-pt-$PRE_SEQ_LEN-$LR/checkpoint-$MAX_STEP" \
    --output_dir "$CHECKPOINT_NAME/chatglm-6b-pt-$PRE_SEQ_LEN-$LR" \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 64 \
    --per_device_eval_batch_size 1 \
    --predict_with_generate \
    --pre_seq_len "$PRE_SEQ_LEN" \
    --quantization_bit 4

workon glm
cd /opt/ChatGlm6B2/ptuning
sh evaluate.sh                      # bash evaluate.sh

修改web_demo.py或web_demo2.py中的模型引入

tokenizer = AutoTokenizer.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True)                             # 载入Tokenizer
model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True).quantize(8).cuda()                  # 全精度8bit量化
# model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True).cuda()                            # 全精度全加载
# model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True).half().cuda()                     # 半精度全加载
# model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True).quantize(8).half().cuda()         # 半精度8bit量化
# model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True).quantize(4).half().cuda()         # 半精度4bit量化
## 多显卡支持，使用下面两行代替上面一行，将num_gpus改为你实际的显卡数量
## from utils import load_model_on_gpus
## model = load_model_on_gpus("/opt/ChatGlm6B2/model", num_gpus=2)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True)                             # 载入Tokenizer

# 加载的 P-Tuning 的 checkpoint：
# 注意: 将 pre_seq_len 改成训练时的实际值。
# /opt/ChatGlm6B2/model 指向本地的模型路径（注意不是checkpoint路径）
# /opt/ChatGlm6B2/output/chatglm-6b-pt-128-1e-2/checkpoint-500 指向生成的微调模型文件路径
config = AutoConfig.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True, pre_seq_len=128)
model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", config=config, trust_remote_code=True)
prefix_state_dict = torch.load(os.path.join("/opt/ChatGlm6B2/output/chatglm-6b-pt-128-1e-2/checkpoint-500", "pytorch_model.bin"))
new_prefix_state_dict = {}
for k, v in prefix_state_dict.items():
    if k.startswith("transformer.prefix_encoder."):
        new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)

## 如果需要加载的是全参数微调的 checkpoint，则直接加载整个 checkpoint：
## model = AutoModel.from_pretrained("/opt/ChatGlm6B2/output/chatglm-6b-pt-128-1e-2/checkpoint-500", trust_remote_code=True)

## 之后根据需求可以进行量化，也可以直接使用：
model = model.quantize(8)
model = model.cuda()
model = model.eval()

workon glm
python3 /opt/ChatGlm6B2/web_demo.py                           # Gradio 启动
streamlit run /opt/ChatGlm6B2/web_demo2.py --server.port 8080 # 或者 Streamlit 启动

from fastapi import FastAPI, Request
from transformers import AutoConfig, AutoModel, AutoTokenizer
import uvicorn, json, datetime
import torch
import os

# Device selection: CUDA_DEVICE is "<DEVICE>:<DEVICE_ID>" when DEVICE_ID is non-empty,
# otherwise just DEVICE.
DEVICE = "cuda"
DEVICE_ID = "0"
CUDA_DEVICE = DEVICE if not DEVICE_ID else f"{DEVICE}:{DEVICE_ID}"


def torch_gc():
    """Empty the CUDA cache and collect CUDA IPC memory on CUDA_DEVICE.

    Does nothing when torch reports that CUDA is not available.
    """
    if not torch.cuda.is_available():
        return
    with torch.cuda.device(CUDA_DEVICE):
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

app = FastAPI()

@app.post("/")
async def create_item(request: Request):
    """Answer one chat request with the module-level model and tokenizer.

    Reads these keys from the JSON request body: 'prompt', 'history', 'max_length',
    'top_p' and 'temperature'. Missing or falsy generation settings fall back to
    2048 / 0.7 / 0.95 respectively.

    Returns a dict with the model's 'response', the updated 'history', a 'status'
    field (always 200 on this path) and a 'time' stamp formatted as
    YYYY-mm-dd HH:MM:SS.
    """
    global model, tokenizer
    # request.json() already returns parsed Python objects, so the former
    # json.dumps()/json.loads() round trip was redundant.
    payload = await request.json()
    prompt = payload.get('prompt')
    history = payload.get('history')
    max_length = payload.get('max_length')
    top_p = payload.get('top_p')
    temperature = payload.get('temperature')
    response, history = model.chat(tokenizer,
                                   prompt,
                                   history=history,
                                   max_length=max_length if max_length else 2048,
                                   top_p=top_p if top_p else 0.7,
                                   temperature=temperature if temperature else 0.95)
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    answer = {
        "response": response,
        "history": history,
        "status": 200,
        "time": timestamp
    }
    # The old concatenation emitted a stray '", ' after the timestamp and raised
    # TypeError for a non-string prompt; repr() keeps the reply on one log line.
    print(f'[{timestamp}] prompt:"{prompt}", response:"{response!r}"')
    # Free cached GPU memory after every request.
    torch_gc()
    return answer


if __name__ == '__main__':
    # ------------------------------------------------------------------------------------------
    # Option A: serve the original model weights (kept commented out as a reference).
    # tokenizer = AutoTokenizer.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True)
    # # model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True).cuda()                            # full precision, fully loaded
    # model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True).quantize(8).cuda()                  # full precision, 8-bit quantized
    # # model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True).half().cuda()                     # half precision, fully loaded
    # # model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True).quantize(8).half().cuda()         # half precision, 8-bit quantized
    # # model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True).quantize(4).half().cuda()         # half precision, 4-bit quantized
    # # Multi-GPU: replace the single model line above with the two lines below and set
    # # num_gpus to the number of cards actually present.
    # # from utils import load_model_on_gpus
    # # model = load_model_on_gpus("/opt/ChatGlm6B2/model", num_gpus=2)
    # ------------------------------------------------------------------------------------------
    # Option B (active): serve the fine-tuned parameters.
    tokenizer = AutoTokenizer.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True)

    # Load the P-Tuning checkpoint on top of the base model.
    # Note: pre_seq_len must be the value that was used for training.
    # /opt/ChatGlm6B2/model is the local base-model directory (not the checkpoint directory);
    # /opt/ChatGlm6B2/output/chatglm-6b-pt-128-1e-2/checkpoint-500 is the fine-tuning output.
    config = AutoConfig.from_pretrained("/opt/ChatGlm6B2/model", trust_remote_code=True, pre_seq_len=128)
    model = AutoModel.from_pretrained("/opt/ChatGlm6B2/model", config=config, trust_remote_code=True)
    prefix_state_dict = torch.load(os.path.join("/opt/ChatGlm6B2/output/chatglm-6b-pt-128-1e-2/checkpoint-500", "pytorch_model.bin"))

    # Keep only the prefix-encoder weights and strip their key prefix so the keys match
    # what model.transformer.prefix_encoder.load_state_dict() is given.
    encoder_key_prefix = "transformer.prefix_encoder."
    new_prefix_state_dict = {
        name[len(encoder_key_prefix):]: tensor
        for name, tensor in prefix_state_dict.items()
        if name.startswith(encoder_key_prefix)
    }
    model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)

    ## For a full-parameter fine-tuning checkpoint, load the whole checkpoint directly instead:
    ## model = AutoModel.from_pretrained("/opt/ChatGlm6B2/output/chatglm-6b-pt-128-1e-2/checkpoint-500", trust_remote_code=True)

    # Quantize if desired (the model can also be used as-is), then move it to the GPU.
    model = model.quantize(8)
    model = model.cuda()
    # ------------------------------------------------------------------------------------------
    model.eval()
    uvicorn.run(app, host='0.0.0.0', port=8080, workers=1)

workon glm
python /opt/ChatGlm6B2/web_demo.py
# 挂载后台执行
nohup /root/.virtualenvs/glm/bin/python /opt/ChatGlm6B2/api.py > /opt/ChatGlm6B2/output.log 2>&1 &
nohup /root/.virtualenvs/glm/bin/python /opt/ChatGlm6B2/openai_api.py > /opt/ChatGlm6B2/apilog.log 2>&1 &
# 终止
ps aux | grep api.py
kill PID

const path = require('path')
const { defineConfig } = require('@vue/cli-service')
// Vue CLI configuration: the dev server forwards every request under /chatglm
// to the API server, with the /chatglm prefix removed.
module.exports = defineConfig({
  devServer: {
    proxy: {
      '/chatglm': { // original note: "uncomment before deploying"
        target: 'http://127.0.0.1:8080/', // address of the API server
        // ws: true, // set this if websockets should be proxied as well
        changeOrigin: true, // original note: "whether the request is cross-origin"
        pathRewrite: {
          // strip the leading /chatglm before the request is forwarded
          '^/chatglm': ''
        }
      },
    }
  }
})

# vi /etc/nginx/nginx.conf

server {
    # NOTE(review): this server listens on 8080 and /chatglm below proxies to 127.0.0.1:8080,
    # which is also the port api.py passes to uvicorn. If nginx and the API run on the same
    # host, one of the two has to move to a different port — confirm the intended layout.
    listen 8080;                                # port the front-end is served on
    server_name 127.0.0.1;                      # IP / domain the front-end is served under
    location / {                                # default root path
        root   /var/www/chat/;                  # location of the built front-end files
        try_files $uri $uri/ /index.html;       # fall back to index.html when no file matches
    }
    # API prefix. The original read "location / chatglm" (with a space), which nginx rejects;
    # the prefix must be written as a single token.
    location /chatglm {
        proxy_redirect off;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
        # Generous timeouts: a reply is only sent once the model has finished generating.
        proxy_connect_timeout 1800s;
        proxy_send_timeout 1800s;
        proxy_read_timeout 1800s;
        # The trailing slash makes nginx replace the matched /chatglm prefix with "/".
        proxy_pass http://127.0.0.1:8080/;
    }
}

import axios from 'axios'
import { message } from 'element-ui'

/**
 * Send one chat turn to the ChatGLM API through the `/chatglm` proxy path.
 *
 * @param {string} prompt - the user's message
 * @param {Array} [history=[]] - previous turns, forwarded unchanged
 * @param {number} [max_length=1000] - forwarded as `max_length`
 * @param {number} [top_p=0.8] - forwarded as `top_p`
 * @param {number} [temperature=0.9] - forwarded as `temperature`
 * @returns {Promise<object>} the response body when its `status` field is 200;
 *   otherwise whatever `message.error(...)` returns after showing the error toast.
 *   Rejects when the axios request itself fails.
 */
export async function postChat(prompt, history=[], max_length=1000, top_p=0.8, temperature=0.9) {
    const { data: res } = await axios.post('/chatglm', {
        prompt,
        history,
        max_length,
        top_p,
        temperature,
    })
    console.log(res)
    if (res.status !== 200) {
        return message.error('接口异常，请重新尝试或联系管理员！')
    }
    // Previously the function fell off the end here, so callers could never read the reply.
    return res
}

<script>
import { postChat } from '@/api/ChatApi.js'

// Demo component: sends a single greeting to the chat API as soon as the instance is created.
export default {
    data() {
        return {}
    },
    created() {
        // postChat returns a promise; report a rejected request instead of
        // leaving the promise floating with an unhandled rejection.
        postChat("你好").catch((err) => {
            console.error(err)
        })
    },
}
</script>