-
-
Save admariner/79eb3cf755dfc6462b3c9353c693f398 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
@author: satyam.kumar | |
""" | |
''' | |
Import necessary packages | |
''' | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import os | |
import gc | |
import re | |
from nltk.corpus import stopwords | |
from nltk.stem import PorterStemmer | |
from bs4 import BeautifulSoup | |
from datetime import datetime | |
import multiprocessing | |
from functools import partial | |
SAFE_DIV=0.0001 | |
BUCKET_SIZE = 5000 | |
STOP_WORDS=stopwords.words('english') | |
def preprocess(x): | |
x=str(x).lower() | |
x=x.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\ | |
.replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\ | |
.replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\ | |
.replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\ | |
.replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\ | |
.replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\ | |
.replace("€", " euro ").replace("'ll", " will") | |
x = re.sub(r"([0-9]+)000000", r"lm", x) | |
x = re.sub(r"([0-9]+)000", r"lk", x) | |
porter=PorterStemmer() | |
pattern=re.compile('\W') | |
if type(x) == type(''): | |
x = re.sub(pattern, ' ', x) | |
if type(x) == type(''): | |
x = porter.stem(x) | |
example1 = BeautifulSoup(x, 'lxml') | |
x = example1.get_text() | |
return x | |
def run_process(df, start): | |
df = df[start:start+BUCKET_SIZE] | |
print(start, "to ",start+BUCKET_SIZE) | |
temp = df["question"].apply(preprocess) | |
if __name__=="__main__": | |
df = pd.read_csv("train/train.csv") | |
list_of_questions = set(list(df['question1'])+list(df['question2'])) | |
print(len(list_of_questions)) | |
df_temp = pd.DataFrame() | |
df_temp['question'] = list(list_of_questions) | |
df_temp = df_temp.fillna("") | |
sample_size = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] | |
bucket = [1000, 2500, 5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000] | |
single_process_time = [] | |
multi_process_time = [] | |
for x,b in zip(sample_size, bucket): | |
df = df_temp.sample(frac=x) | |
BUCKET_SIZE = b | |
print("Data Shape: ", df.shape) | |
print("BUCKET SIZE: ", BUCKET_SIZE) | |
print("Single Processing Starting") | |
st = datetime.now() | |
temp = df["question"].apply(preprocess) | |
end = datetime.now() | |
print("Single processing Time: ",end-st) | |
single_process_time.append(end-st) | |
print("Multiprocessing Starting") | |
st = datetime.now() | |
chunks = [x for x in range(0,df.shape[0],BUCKET_SIZE)] | |
pool = multiprocessing.Pool() | |
func = partial(run_process, df) | |
temp = pool.map(func,chunks) | |
pool.close() | |
pool.join() | |
end = datetime.now() | |
print("Multiprocessing Time: ",end-st) | |
multi_process_time.append(end-st) | |
print() | |
print() | |
out_time = pd.DataFrame() | |
out_time['sample size'] = sample_size | |
out_time['bucket size'] = bucket | |
out_time['single_process_time'] = single_process_time | |
out_time['multi_process_time'] = multi_process_time | |
out_time.to_csv("Benchmark.csv", index=False) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment