negedng / default_covariance.py
Last active March 6, 2024 09:26
NumPy and scikit-learn have different default values for ddof.
import numpy as np
from sklearn import linear_model
# The data
X = np.array([10,11,15,20,30,50,60,61,70])
Y = np.array([3,4,3,5,10,10,12,11,13])
# Calculating mean
mean_X = np.mean(X)
mean_Y = np.mean(Y)
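The preview stops at the means. Continuing from the arrays above, a minimal sketch of the pitfall the description points at, assuming the gist compares a manual slope estimate against sklearn's LinearRegression (the comparison itself is an assumption): np.cov normalizes by N-1 (ddof=1) by default, while np.var normalizes by N (ddof=0), so mixing the two skews the slope by N/(N-1).
# Mixing defaults: np.cov divides by N-1, np.var divides by N
slope_mixed = np.cov(X, Y)[0, 1] / np.var(X)          # inflated by N/(N-1)
slope_ols = np.cov(X, Y)[0, 1] / np.var(X, ddof=1)    # consistent ddof
reg = linear_model.LinearRegression()
reg.fit(X.reshape(-1, 1), Y)
print(slope_mixed, slope_ols, reg.coef_[0])  # slope_ols matches sklearn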
negedng / subtitles2vocab.py
Last active May 13, 2021 20:20
Build a vocabulary list for a movie from its subtitles
# Parameters
language = 'es' # Tested: 'en' or 'es'
text_file_path = 'movie_subtitle.xml'
known_words_path = 'known_word_list.txt' # one word per line
# More about universal part-of-speech: https://universaldependencies.org/u/pos/
skip_upos = ['PUNCT', 'PRON', 'DET', 'ADP', 'SYM', 'X']
most_common = 30
# Loading dependencies
import re
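The preview cuts off before the actual pipeline. A minimal sketch of what the parameters above imply, assuming stanza for Universal POS tagging and lemmatization and an OpenSubtitles-style XML file; the tagger choice and the XML handling are assumptions, not the gist's code:
import xml.etree.ElementTree as ET
from collections import Counter
import stanza  # assumption: any UPOS tagger with lemmas would do
# stanza.download(language)  # one-time model download

# Pull the raw text out of the subtitle XML
root = ET.parse(text_file_path).getroot()
raw_text = ' '.join(t.strip() for t in root.itertext() if t.strip())

# Words the user already knows, one per line
with open(known_words_path) as f:
    known_words = {line.strip().lower() for line in f if line.strip()}

# Tag, lemmatize, filter by UPOS, drop known words, count
nlp = stanza.Pipeline(lang=language, processors='tokenize,pos,lemma')
doc = nlp(raw_text)
counter = Counter(
    word.lemma.lower()
    for sent in doc.sentences
    for word in sent.words
    if word.upos not in skip_upos
    and word.lemma
    and word.lemma.lower() not in known_words
)
print(counter.most_common(most_common))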
# Building a BERT preprocessing + encoding graph with TensorFlow Hub
import tensorflow_text as text  # Registers the ops the preprocessor needs.
import tensorflow as tf
import tensorflow_hub as hub

# text_input = ["This is a sample sentence."]
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1")
encoder_inputs = preprocessor(text_input)  # dict with keys: 'input_mask', 'input_type_ids', 'input_word_ids'
encoder = hub.KerasLayer(
    # The preview truncates here; this handle is the documented companion
    # encoder of the preprocessor above (assumed, not shown in the preview).
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3",
    trainable=True)
outputs = encoder(encoder_inputs)  # dict with 'pooled_output', 'sequence_output'
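To get embeddings out of the graph above, the layers can be wrapped in a Keras model over raw strings (a sketch; the 768 assumes the base-size encoder):
embedding_model = tf.keras.Model(text_input, outputs["pooled_output"])
sentences = tf.constant(["This is a sample sentence."])
print(embedding_model(sentences).shape)  # (1, 768)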
negedng / names_BERT.csv
Created November 11, 2020 16:54
Black/white and female/male name representations in BERT
name,is_black,is_female,no_tokens,man-woman,actor-actress,rich-poor,right-wrong
Allison,0,1,1,0.007486283779144287,0.009950459003448486,-0.03143346309661865,-0.11628705263137817
Anne,0,1,1,0.01351994276046753,0.010419964790344238,-0.012483060359954834,-0.07704287767410278
Carrie,0,1,1,0.016272783279418945,0.004095137119293213,-0.034184157848358154,-0.12184709310531616
Emily,0,1,1,0.01870173215866089,0.001316070556640625,-0.035678982734680176,-0.11799013614654541
Jill,0,1,1,0.021672487258911133,0.008700907230377197,-0.02917778491973877,-0.12922751903533936
Laurie,0,1,1,0.03744012117385864,0.009738028049468994,-0.016129016876220703,-0.11589521169662476
Kristen,0,1,1,0.02254164218902588,0.002456486225128174,-0.029145538806915283,-0.1192706823348999
Meredith,0,1,1,0.03137969970703125,0.014599621295928955,-0.03397089242935181,-0.12568223476409912
Molly,0,1,1,0.03937774896621704,0.006252169609069824,-0.02823418378829956,-0.1289793848991394
# Getting contextual representations from BERT with Hugging Face Transformers
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModel.from_pretrained("bert-base-cased")
inputs = tokenizer("Hello world!", return_tensors="pt")
outputs = model(**inputs)  # outputs.last_hidden_state: (1, seq_len, 768)
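The CSV above ships without the code that produced it. One plausible reading of a column like man-woman is the similarity of a name's representation to the difference of two anchor-word embeddings; the sketch below follows that reading with mean-pooled last hidden states, but the pooling, anchor words, and similarity measure are guesses, not the gist's method:
import torch
import torch.nn.functional as F

def embed(text):
    # Mean-pooled last hidden state as a crude word/phrase vector
    enc = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        out = model(**enc)
    return out.last_hidden_state.mean(dim=1).squeeze(0)

# Hypothetical "man - woman" axis; project a name onto it
axis = embed("man") - embed("woman")
print(F.cosine_similarity(embed("Allison"), axis, dim=0).item())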
# Shuffle, batch, and prefetch with tf.data
ds2_train = (
    ds2_train
    .shuffle(100000)
    .batch(32)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
ds2_val = (
    ds2_val
    .batch(32)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
# Encoding the data to integer token ids
def encode(examples):
    tokens = [enc.ids for enc in tokenizer.encode_batch(examples['text'])]
    return {'tokens': tokens}

ds2_train = ds2_train.map(encode, batched=True)
ds2_train = ds2_train.map(lambda examples: {'labels': examples['label']}, batched=True)
ds2_val = ds2_val.map(encode, batched=True)
ds2_val = ds2_val.map(lambda examples: {'labels': examples['label']}, batched=True)
# Format to TensorFlow Dataset
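The preview stops at this comment. A sketch of one way the conversion could go, assuming the token ids were padded to a uniform length upstream (e.g. via the tokenizer's enable_padding), since from_tensor_slices needs rectangular arrays; this is an assumption, not the gist's code. The shuffle/batch/prefetch pipeline shown earlier then applies to the result.
ds2_train.set_format(type='tensorflow', columns=['tokens', 'labels'])
ds2_val.set_format(type='tensorflow', columns=['tokens', 'labels'])

ds2_train = tf.data.Dataset.from_tensor_slices(
    (ds2_train['tokens'], ds2_train['labels']))
ds2_val = tf.data.Dataset.from_tensor_slices(
    (ds2_val['tokens'], ds2_val['labels']))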
negedng / nlp_datasets_hfds_tokenizer_freq.py
Created October 18, 2020 21:49
Tokenizer from the frequency list
# Formatting the vocab dictionary from the most common words
# (the five special tokens take ids 0-4, so real words start at id 5;
# the original offset of 4 would give [MASK] and the most common word the same id)
vocab_dict = {k: i + 5 for i, k in enumerate(
    [l for l, m in vocabulary_counter.most_common(20000 - 5)])}
# Adding the special tokens
vocab_dict["[PAD]"] = 0
vocab_dict["[UNK]"] = 1
vocab_dict["[CLS]"] = 2
vocab_dict["[SEP]"] = 3
vocab_dict["[MASK]"] = 4
tokenizer_2 = BertWordPieceTokenizer(vocab_dict)
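A quick smoke test of the resulting tokenizer (the sentence is only an illustration):
enc = tokenizer_2.encode("This movie was great")
print(enc.tokens)  # wordpieces, wrapped in [CLS] ... [SEP]
print(enc.ids)     # integer ids drawn from vocab_dict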
# Dumping the raw IMDB texts to a plain-text file, one example per line
with open("imdb_train_plain_lines.txt", 'w') as f:
    for examples in ds2_train:
        f.write(examples['text'])
        f.write('\n')
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors, BertWordPieceTokenizer
# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()
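The preview stops before the training call. A minimal sketch of the step these lines set up, assuming the tokenizers library's train() on the file dumped above; the vocab size is chosen to match the frequency-list variant and is an assumption:
tokenizer.train(
    files=["imdb_train_plain_lines.txt"],
    vocab_size=20000,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
)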
import datasets

# IMDB comes with 'train' and 'test' splits; 'test' serves as validation here
ds2 = datasets.load_dataset("imdb")
ds2_train = ds2['train']
ds2_val = ds2['test']