Dockerfile:
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
# Install python/pip/git (sources.list is switched to the TUNA mirror first)
RUN sed -i 's@/archive.ubuntu.com/@/mirrors.tuna.tsinghua.edu.cn/@g' /etc/apt/sources.list \
    && apt-get update \
    && apt-get install -y -qq \
        python3 \
        python3-pip \
        git
# Install PyTorch (pip is also pointed at the TUNA index)
RUN pip3 config --user set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple \
    && pip3 install torch torchvision torchaudio
WORKDIR /Langchain-Chatchat
COPY ./Langchain-Chatchat /Langchain-Chatchat
RUN pip3 install -r requirements.txt
# Initialize the config files (copies the *_example.py configs to their live names)
RUN python3 copy_config_example.py
COPY start.sh .
RUN chmod +x ./start.sh
# System libraries that opencv-python needs at runtime (libGL.so.1, libgthread)
RUN apt-get install -y \
    libgl1-mesa-glx \
    libglib2.0-0
# ZhipuAI SDK, required by the zhipu-api entry in model_config.py
RUN pip3 install zhipuai
# Support Tongyi Qianwen (Qwen)
RUN pip3 install \
    transformers==4.32.0 \
    accelerate tiktoken einops scipy \
    transformers_stream_generator==0.0.4 \
    peft deepspeed
# Build FlashAttention from source (recommended by Qwen; compilation takes a while)
RUN git clone https://github.com/Dao-AILab/flash-attention \
    && cd flash-attention && pip3 install .
# GPTQ support, needed to load the Int4 quantized weights
RUN pip3 install optimum auto-gptq
CMD ["/Langchain-Chatchat/start.sh"]
model_config.py:
..
LLM_MODELS = ["Qwen-14B-Chat-Int4","zhipu-api", "openai-api"]
..
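For the Qwen-14B-Chat-Int4 name above to resolve, model_config.py must also map it to a model directory. A minimal sketch, assuming the 0.2.x-style MODEL_PATH table and a hypothetical local path:
MODEL_PATH = {
    "llm_model": {
        # model name from LLM_MODELS -> local directory or Hugging Face repo id
        "Qwen-14B-Chat-Int4": "/models/Qwen-14B-Chat-Int4",
    },
}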
./config.json of the Qwen-14B-Chat-Int4 model:
{
  "architectures": [
    "QWenLMHeadModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_qwen.QWenConfig",
    "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel"
  },
  "attn_dropout_prob": 0.0,
  "bf16": false,
  "emb_dropout_prob": 0.0,
  "fp16": true,
  "fp32": false,
  "hidden_size": 5120,
  "intermediate_size": 27392,
  "initializer_range": 0.02,
  "kv_channels": 128,
  "layer_norm_epsilon": 1e-06,
  "max_position_embeddings": 8192,
  "model_type": "qwen",
  "no_bias": true,
  "num_attention_heads": 40,
  "num_hidden_layers": 40,
  "onnx_safe": null,
  "quantization_config": {
    "bits": 4,
    "group_size": 128,
    "damp_percent": 0.01,
    "desc_act": false,
    "static_groups": false,
    "sym": true,
    "true_sequential": true,
    "model_name_or_path": null,
    "model_file_base_name": "model",
    "quant_method": "gptq",
    "disable_exllama": true  // add this line
  },
  ..
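The exllama kernels do not work in every environment (for example when layers are offloaded to CPU), which is why the guide disables them in config.json. The same override can also be passed at load time instead of editing the file; a minimal sketch using the GPTQConfig API from transformers 4.32 (the local path is hypothetical):
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_dir = "/models/Qwen-14B-Chat-Int4"  # hypothetical local path

tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    device_map="auto",
    trust_remote_code=True,  # Qwen ships custom modeling code (see auto_map above)
    # same effect as setting "disable_exllama": true in quantization_config
    quantization_config=GPTQConfig(bits=4, disable_exllama=True),
)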