This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import accuracy_score, confusion_matrix | |
from sklearn.metrics import classification_report | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn.pipeline import Pipeline | |
from sklearn.feature_extraction.text import TfidfTransformer | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
# These are the tags we currently use for tagging the sentences |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!pip install transformers | |
# Taken from https://huggingface.co/transformers/model_doc/auto.html | |
#=====Ways to initiate a tokenizer and embedding model===== | |
### Three ways to initiate a tokenizer ### | |
from transformers import AutoTokenizer | |
# From official models hosted on HuggingFace (just the model name) | |
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese') |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Method 1 | |
# 須先下載wqy-microhei.ttc,因中文顯示需做特殊處理 | |
zhfont = matplotlib.font_manager.FontProperties(fname='/Users/youngmihuang/Downloads/wqy-microhei.ttc') | |
# ================================== | |
# Method 2 | |
# install Korean font | |
!apt install fonts-nanum |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reference source of the codes: https://github.com/ckiplab/ckiptagger. | |
# Install dependencies | |
## !pip install -U ckiptagger[gdown] | |
## !pip install -U ckiptagger[tfgpu] # for Tensorflow GPU | |
from ckiptagger import data_utils, WS, POS, NER | |
# Download model files to ./data.zip (2GB) and extract to ./data/ | |
## data_utils.download_data_url("./") # iis-ckip (Option 1) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!pip install zhon | |
import re | |
import zhon.hanzi as hanzi | |
from string import punctuation as en_punc | |
from string import ascii_letters as roman_letters | |
# Combined punctuation inventory: the characters exposed by
# `zhon.hanzi.punctuation` plus those in `string.punctuation`.
zh_punc = hanzi.punctuation
punc_set = set(zh_punc) | set(en_punc)  # puncs in both English and Chinese
punc_list = [*punc_set]  # same members as punc_set, as a list
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io | |
import pandas as pd | |
from sklearn.model_selection import train_test_split | |
# fasttext.train_unsupervised for word embeddings | |
def CreateTxt(df, text_col="text", out_path="output.txt"):
    """Write one document per line to a plain-text corpus file.

    Each cell in ``df[text_col]`` is expected to hold a single tokenized
    document, with tokens separated by spaces.

    The file is written directly instead of through ``Series.to_csv``
    because the CSV writer emits the column name as a header line (on
    current pandas versions) and wraps any document containing a double
    quote or a line break in quotes, both of which would end up in the
    corpus as spurious tokens.

    Args:
        df: DataFrame holding the documents.
        text_col: Name of the column that contains the document strings.
        out_path: Destination file; defaults to ``output.txt`` in the
            current working directory, as before.

    Raises:
        KeyError: If ``text_col`` is not a column of ``df``.
    """
    # Missing cells become empty lines so that line i still corresponds
    # to row i of the DataFrame.
    documents = df[text_col].fillna("").astype(str)
    with open(out_path, "w", encoding="utf-8") as handle:
        # Embedded line breaks are flattened to spaces to keep the
        # one-document-per-line layout intact.
        handle.writelines(
            " ".join(doc.splitlines()) + "\n" for doc in documents
        )
def CreateVecAndMeta(): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reference: kstathou/acl-search-engine | |
# !pip install faiss-cpu --no-cache | |
# !pip install sentence_transformers | |
import faiss | |
import numpy as np | |
import pandas as pd | |
import pickle | |
import torch |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Taken from https://clay-atlas.com/blog/2020/06/30/pytorch-如何使用-hugging-face-所提供的-transformers-以-bert-為例/ | |
# coding: utf-8 | |
import torch | |
from transformers import AutoTokenizer, AutoModel | |
from keras.preprocessing.sequence import pad_sequences | |
# Tokenizer and Bert Model | |
MODEL_NAME = '' # e.g. 'bert-base-chinese' | |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) |
OlderNewer