References:
- https://huggingface.co/decapoda-research/llama-7b-hf/discussions/3
- tatsu-lab/stanford_alpaca#64
- huggingface/transformers#21955
conda activate llama
conda install python=3.9
pip install sentencepiece
pip install torch
pip install fairscale
pip install fire
pip install git+https://github.com/huggingface/transformers.git
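Before going further, a quick import check (a minimal sketch, nothing model-specific) confirms the environment is usable:

# Minimal sanity check for the freshly set-up environment
import sentencepiece
import torch
import fairscale
import fire
import transformers

print(transformers.__version__)  # installed from source above, so likely a .dev version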
Download the source files and tokenizer/config files from:
- https://github.com/facebookresearch/llama/tree/main/llama
- https://huggingface.co/decapoda-research/llama-7b-hf/tree/main
Folder Structure:
.
├── llama
│   ├── __init__.py
│   ├── config.json
│   ├── generation.py
│   ├── generation_config.json
│   ├── model.py
│   ├── special_tokens_map.json
│   ├── tokenizer.model
│   └── tokenizer.py
└── token.ipynb
import transformers

device = "cpu"
model_path = ""  # path to the folder containing tokenizer.model
tokenizer = transformers.LlamaTokenizer.from_pretrained(model_path)
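A quick aside (my own sanity check, not part of the original notes): LLaMA uses a 32,000-entry SentencePiece vocabulary, which is the denominator for the count computed below.

print(tokenizer.vocab_size)  # 32000 for the LLaMA tokenizer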
import re

def contains_chinese(text):
    # Regular expression covering the common CJK Unified Ideographs range
    # (not every Chinese character, but enough for a vocabulary scan)
    pattern = re.compile(r'[\u4e00-\u9fa5]')
    # Check whether the string contains any Chinese character
    match = pattern.search(text)
    return match is not None

tokens = list(tokenizer.get_vocab().keys())
zh_tokens = []
cnt = 0
for i in tokens:
    if contains_chinese(i):
        cnt += 1
        zh_tokens.append(i)
print(cnt)
>>> 700
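So only about 700 of the 32,000 vocabulary entries contain a Chinese character, roughly 2%. As a follow-up sketch (the sample sentence and variable names are mine, not from the original notebook), any character outside those entries is handled by SentencePiece's byte fallback, so Chinese text tends to fragment into many tokens:

# Peek at a few of the matched vocabulary entries
print(zh_tokens[:10])

# Characters missing from the vocabulary fall back to raw UTF-8 byte pieces,
# so a short Chinese sentence can produce a surprisingly long token sequence
text = "你好,世界"
ids = tokenizer(text)["input_ids"]
print(len(ids))
print(tokenizer.convert_ids_to_tokens(ids))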