Dockerfile:
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
# Install python/pip/git (sources.list is switched to the TUNA mirror first)
RUN sed -i 's@/archive.ubuntu.com/@/mirrors.tuna.tsinghua.edu.cn/@g' /etc/apt/sources.list \
    && apt-get update \
    && apt-get install -y -qq \
        python3 \
        python3-pip \
        git
# Install PyTorch (pip is also pointed at the TUNA index)
RUN pip3 config --user set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple \
    && pip3 install torch torchvision torchaudio
WORKDIR /Langchain-Chatchat
COPY ./Langchain-Chatchat /Langchain-Chatchat
RUN pip3 install -r requirements.txt
# Initialize the config files (copies the *_example.py configs to their live names)
RUN python3 copy_config_example.py
COPY start.sh .
RUN chmod +x ./start.sh
# System libraries that opencv-python needs at runtime (libGL.so.1, libgthread)
RUN apt-get install -y \
    libgl1-mesa-glx \
    libglib2.0-0
# ZhipuAI SDK, required by the zhipu-api entry in model_config.py
RUN pip3 install zhipuai
# Support Tongyi Qianwen (Qwen)
RUN pip3 install \
    transformers==4.32.0 \
    accelerate tiktoken einops scipy \
    transformers_stream_generator==0.0.4 \
    peft deepspeed
# Build FlashAttention from source (recommended by Qwen; compilation takes a while)
RUN git clone https://github.com/Dao-AILab/flash-attention \
    && cd flash-attention && pip3 install .
# GPTQ support, needed to load the Int4 quantized weights
RUN pip3 install optimum auto-gptq
CMD ["/Langchain-Chatchat/start.sh"]
model_config.py:
..
LLM_MODELS = ["Qwen-14B-Chat-Int4","zhipu-api", "openai-api"]
..
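For the Qwen-14B-Chat-Int4 name above to resolve, model_config.py must also map it to a model directory. A minimal sketch, assuming the 0.2.x-style MODEL_PATH table and a hypothetical local path:
MODEL_PATH = {
    "llm_model": {
        # model name from LLM_MODELS -> local directory or Hugging Face repo id
        "Qwen-14B-Chat-Int4": "/models/Qwen-14B-Chat-Int4",
    },
}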
./config.json of the Qwen-14B-Chat-Int4 model:
{
  "architectures": [
    "QWenLMHeadModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_qwen.QWenConfig",
    "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel"
  },
  "attn_dropout_prob": 0.0,
  "bf16": false,
  "emb_dropout_prob": 0.0,
  "fp16": true,
  "fp32": false,
  "hidden_size": 5120,
  "intermediate_size": 27392,
  "initializer_range": 0.02,
  "kv_channels": 128,
  "layer_norm_epsilon": 1e-06,
  "max_position_embeddings": 8192,
  "model_type": "qwen",
  "no_bias": true,
  "num_attention_heads": 40,
  "num_hidden_layers": 40,
  "onnx_safe": null,
  "quantization_config": {
    "bits": 4,
    "group_size": 128,
    "damp_percent": 0.01,
    "desc_act": false,
    "static_groups": false,
    "sym": true,
    "true_sequential": true,
    "model_name_or_path": null,
    "model_file_base_name": "model",
    "quant_method": "gptq",
    "disable_exllama": true  // add this line
  },
  ..
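The exllama kernels do not work in every environment (for example when layers are offloaded to CPU), which is why the guide disables them in config.json. The same override can also be passed at load time instead of editing the file; a minimal sketch using the GPTQConfig API from transformers 4.32 (the local path is hypothetical):
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_dir = "/models/Qwen-14B-Chat-Int4"  # hypothetical local path

tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    device_map="auto",
    trust_remote_code=True,  # Qwen ships custom modeling code (see auto_map above)
    # same effect as setting "disable_exllama": true in quantization_config
    quantization_config=GPTQConfig(bits=4, disable_exllama=True),
)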