Skip to content

Instantly share code, notes, and snippets.

View GeorgeSeif's full-sized avatar

George GeorgeSeif

View GitHub Profile
# Reduce several inflections of "cook" to a common stem with NLTK's
# Snowball stemmer for English.
import nltk

stemmer = nltk.stem.SnowballStemmer('english')
s_1, s_2, s_3, s_4 = (
    stemmer.stem(word) for word in ("cook", "cooks", "cooked", "cooking")
)
# s_1, s_2, s_3, s_4 all have the same result
# Remove English stop words from a sentence using NLTK.
import nltk
from nltk.corpus import stopwords

sentence = "This is a sentence for removing stop words"
tokens = nltk.word_tokenize(sentence)
# Build a set for O(1) membership tests, and compare case-insensitively:
# NLTK's stop-word list is all lowercase, so without .lower() the leading
# capitalized "This" would slip through the filter.
stop_words = set(stopwords.words('english'))
filtered_tokens = [w for w in tokens if w.lower() not in stop_words]
print(filtered_tokens)
# Split a sentence into word tokens with NLTK's default tokenizer.
import nltk

sentence = "My name is George and I love NLP"
print(nltk.word_tokenize(sentence))
# Prints out ['My', 'name', 'is', 'George', 'and', 'I', 'love', 'NLP']
# Time how long plain pandas takes to fill NaNs in the CS:GO demos dataset.
import time

import pandas as pd

df = pd.read_csv("esea_master_dmg_demos.part1.csv")
s = time.time()
df = df.fillna(value=0)
e = time.time()
# The original label said "Concat" (copy-paste slip); this snippet times fillna.
print("Pandas Fillna Time = {}".format(e - s))
# Modin mirrors the pandas API, so swapping the import is enough to
# parallelize the same read_csv call across cores.
import modin.pandas as pd
df = pd.read_csv("esea_master_dmg_demos.part1.csv")
Operation Pandas Time Modin Time Speedup
pd.read_csv('esea_master_dmg_demos.part1.csv') 8.38 3.22 2.6
pd.concat([df for _ in range(5)]) 3.56 0.041 86.83
df.groupby(by='wp_type') 0.00029 0.059 0.0049
df.fillna(value=0) 1.8 0.21 8.57
df.dropna() 1.24 1.71 0.73
df.count() 1.09 0.046 23.70
df.drop_duplicates() 7.68 13.38 0.57
df.describe() 1.30 4.69 0.28
df['seconds'].max() 0.015 0.26 0.058
# Time how long plain pandas takes to concatenate five copies of the dataset.
import time

import pandas as pd

df = pd.read_csv("esea_master_dmg_demos.part1.csv")
s = time.time()
df = pd.concat([df for _ in range(5)])
e = time.time()
print("Pandas Concat Time = {}".format(e - s))
# Same load as the pandas version above, but through Modin's drop-in
# pandas-compatible API, which distributes the work across cores.
import modin.pandas as pd
df = pd.read_csv("esea_master_dmg_demos.part1.csv")
### Read in the data with Pandas
# Time the CSV load with plain pandas as the baseline.
import time

import pandas as pd

s = time.time()
df = pd.read_csv("esea_master_dmg_demos.part1.csv")
e = time.time()
print("Pandas Loading Time = {}".format(e - s))
### Read in the data with Modin
# Drop-in replacement: the identical read_csv call runs in parallel under Modin.
import modin.pandas as pd
Operation Pandas on CPU Time (ms) Dask on GPU Time (ms) Speedup
df['price'].mean() 2.6 0.3 8.7
df['price'].max() 2.2 0.2 11
df[df['price'] > 250] 13 0.7 18.6
df + df 163 2.6 62.7
# Convert every Dask partition (each a pandas DataFrame) into a cuDF
# DataFrame so subsequent operations run on the GPU.
# NOTE(review): dask_df must already be a Dask DataFrame — it is defined
# outside this snippet; confirm it was created with dask.dataframe.
import cudf
dask_df = dask_df.map_partitions(cudf.from_pandas)
Operation Pandas Time (ms) Dask Time (ms) Speedup
df['price'].mean() 2.6 1.0 2.6
df['price'].max() 2.2 0.6 3.7
df[df['price'] > 250] 13 0.7 18.6
df + df 163 3.4 48.5
df['price'].drop_duplicates() 4.3 0.8 5.4
df['price'].value_counts() 3.8 0.9 4.2