
Haowen Jiang (howard-haowen)

AI Engineer at IBM 💻 | Crafting solutions for client success! 🚀
# Taken from https://clay-atlas.com/blog/2020/06/30/pytorch-如何使用-hugging-face-所提供的-transformers-以-bert-為例/
# coding: utf-8
import torch
from transformers import AutoTokenizer, AutoModel
from keras.preprocessing.sequence import pad_sequences
# Tokenizer and BERT model
MODEL_NAME = 'bert-base-chinese'  # the example name suggested in the original snippet
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
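The preview ends here; below is a minimal sketch of the tokenize-pad-encode flow the referenced blog post walks through (the example sentences and max length are placeholders, not part of the gist):
sentences = ['今天天氣很好。', '我想去散步。']  # hypothetical inputs
ids = [tokenizer.encode(s, add_special_tokens=True) for s in sentences]
padded = pad_sequences(ids, maxlen=32, dtype='int64',
                       padding='post', truncating='post',
                       value=tokenizer.pad_token_id)
input_ids = torch.tensor(padded)
attention_mask = (input_ids != tokenizer.pad_token_id).long()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
last_hidden = outputs[0]  # shape: (batch, seq_len, hidden_size)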
@howard-haowen
howard-haowen / VectorSearch.py
Last active April 4, 2024 16:41
Text similarity search using sentence_transformers and faiss
# Reference: kstathou/acl-search-engine
# !pip install faiss-cpu --no-cache-dir
# !pip install sentence_transformers
import faiss
import numpy as np
import pandas as pd
import pickle
import torch
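The preview cuts off after the imports; here is a hedged sketch of the encode-index-search flow the description advertises (the model name and documents are placeholders, not necessarily what the gist uses):
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer('all-MiniLM-L6-v2')  # placeholder model
docs = ['first document', 'second document', 'third document']
embeddings = encoder.encode(docs, convert_to_numpy=True).astype('float32')

index = faiss.IndexFlatL2(embeddings.shape[1])  # exact L2 search over the vectors
index.add(embeddings)

query_vec = encoder.encode(['a query string'], convert_to_numpy=True).astype('float32')
distances, indices = index.search(query_vec, 2)  # top-2 nearest documents
print([docs[i] for i in indices[0]])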
@howard-haowen
howard-haowen / PrepareDataForFasttext.py
Last active January 12, 2021 09:53
Some utility functions for the fastText library
import io
import pandas as pd
from sklearn.model_selection import train_test_split
# fasttext.train_unsupervised for word embeddings
def CreateTxt(df, text_col="text"):
    # Each cell in "text_col" contains a single tokenized document, with tokens separated by spaces.
    # fastText expects one document per line with no header row, so write the column out directly.
    with io.open('output.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(df[text_col].astype(str)) + '\n')
def CreateVecAndMeta():
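    # (Preview truncated here.) A hedged reconstruction: judging by the name,
    # this likely writes the vectors.tsv/metadata.tsv pair consumed by the
    # TensorFlow Embedding Projector (https://projector.tensorflow.org).
    # The fastText model loading below is an assumption, not the gist's code.
    import fasttext  # assumes the fasttext pip package is installed
    model = fasttext.load_model('model.bin')  # hypothetical model path
    with io.open('vecs.tsv', 'w', encoding='utf-8') as vec_f, \
         io.open('meta.tsv', 'w', encoding='utf-8') as meta_f:
        for word in model.get_words():
            vector = model.get_word_vector(word)
            vec_f.write('\t'.join(str(x) for x in vector) + '\n')
            meta_f.write(word + '\n')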
@howard-haowen
howard-haowen / CleanTokens.py
Last active January 7, 2021 10:05
Clean a list of tokens
#!pip install zhon
import re
import zhon.hanzi as hanzi
from string import punctuation as en_punc
from string import ascii_letters as roman_letters
zh_punc = hanzi.punctuation
punc_set = set(zh_punc).union(set(en_punc))  # punctuation from both English and Chinese
punc_list = list(punc_set)
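The preview stops before the cleaning function itself; here is a hedged sketch of what it might look like given the building blocks above (the function name and filtering rules are my assumptions):
punc_pattern = re.compile('[{}]'.format(re.escape(''.join(punc_list))))

def clean_tokens(tokens):
    # Strip punctuation, then drop tokens that are empty or purely Roman letters
    cleaned = []
    for tok in tokens:
        tok = punc_pattern.sub('', tok).strip()
        if tok and not all(ch in roman_letters for ch in tok):
            cleaned.append(tok)
    return cleaned

print(clean_tokens(['你好!', 'NLP', '世界。']))  # -> ['你好', '世界']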
@howard-haowen
howard-haowen / CkipTagger_CPU.py
Last active January 13, 2021 03:20
Tokenize Chinese with CkipTagger
# Reference source of the code: https://github.com/ckiplab/ckiptagger
# Install dependencies
## !pip install -U ckiptagger[gdown]
## !pip install -U ckiptagger[tfgpu] # for TensorFlow GPU
from ckiptagger import data_utils, WS, POS, NER
# Download model files to ./data.zip (2GB) and extract to ./data/
## data_utils.download_data_url("./") # iis-ckip (Option 1)
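The preview ends before the taggers are used; a minimal sketch following the ckiptagger README (the example sentence comes from the README; paths assume the ./data/ directory above):
ws = WS("./data")    # word segmentation model
pos = POS("./data")  # part-of-speech model
ner = NER("./data")  # named-entity model

sentences = ["傅達仁今將執行安樂死,卻突然爆出自己20年前遭緯來體育台封殺。"]
word_lists = ws(sentences)
pos_lists = pos(word_lists)
entity_lists = ner(word_lists, pos_lists)

print(list(zip(word_lists[0], pos_lists[0])))
print(entity_lists[0])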
@howard-haowen
howard-haowen / Matplotlib-Set-Font-for-zh-TW.ipynb
Created December 7, 2020 15:57 — forked from scottt/Matplotlib-Set-Font-for-zh-TW.ipynb
Matplotlib set suitable font for zh_TW text
@howard-haowen
howard-haowen / Matplotlib-font.py
Last active December 21, 2020 15:15
Fix font display for Matplotlib
# Method 1
# Download wqy-microhei.ttc first; displaying Chinese requires special handling
import matplotlib.font_manager
zhfont = matplotlib.font_manager.FontProperties(fname='/Users/youngmihuang/Downloads/wqy-microhei.ttc')
# ==================================
# Method 2
# Install the Nanum fonts (Korean)
!apt install fonts-nanum
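Neither method shows the font actually being applied; a hedged sketch of the usual follow-through (the installed font's family name is an assumption):
import matplotlib.pyplot as plt

# Method 1: pass the FontProperties object to individual text elements
plt.title('中文標題', fontproperties=zhfont)

# Method 2: after installing the font, point matplotlib at it globally
# (may require rebuilding matplotlib's font cache first)
plt.rcParams['font.family'] = 'NanumGothic'
plt.rcParams['axes.unicode_minus'] = False  # keep minus signs rendering correctly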
@howard-haowen
howard-haowen / create-streamlit-app.ipynb
Created December 5, 2020 04:13
Create-streamlit-app.ipynb
@howard-haowen
howard-haowen / HuggingFace_models.py
Last active January 9, 2021 02:12
Load a pretrained model from Hugging Face
!pip install transformers
# Taken from https://huggingface.co/transformers/model_doc/auto.html
#=====Ways to initialize a tokenizer and embedding model=====
### Three ways to initialize a tokenizer ###
from transformers import AutoTokenizer
# From official models hosted on HuggingFace (just the model name)
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
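The preview cuts off after the first way; the other two, per the linked transformers documentation (the example identifiers below come from that page):
# From a user-contributed model on the Hub (namespace/model-name)
tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')

# From a local directory containing previously saved vocabulary files
tokenizer = AutoTokenizer.from_pretrained('./test/saved_model/')

# The embedding model loads the same way
from transformers import AutoModel
model = AutoModel.from_pretrained('bert-base-chinese')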
@howard-haowen
howard-haowen / text_classification.py
Last active November 18, 2020 05:16 — forked from nicksyna01/text_classification.py
Text Classification using sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Tags currently used for labeling the sentences
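The preview stops at the imports; a minimal sketch of the pipeline these imports suggest (the data and column names are placeholders):
# Hypothetical data: a DataFrame with 'text' and 'tag' columns
df = pd.DataFrame({
    'text': ['great product', 'terrible service', 'loved it', 'awful experience'],
    'tag':  ['pos', 'neg', 'pos', 'neg'],
})
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['tag'], test_size=0.25, random_state=42)

clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('nb', MultinomialNB()),
])
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
print(accuracy_score(y_test, preds))
print(classification_report(y_test, preds))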