# Source: GitHub gist anandsaha/a200085fda9f85d57dd33ed814151416
# (author @anandsaha, last active February 3, 2023 15:44).
# The lines originally here were GitHub page navigation text ("Skip to content",
# "Instantly share code, notes, and snippets.", "Show Gist options",
# "Save anandsaha/a200085fda9f85d57dd33ed814151416 to your computer and use it
# in GitHub Desktop."); they are kept as comments so the file parses as Python.
import os
import pandas as pd
import numpy as np
from pathlib import Path
import random
import multiprocessing as mp
from tqdm.notebook import tqdm
import time
import json
# Count table; its index (the first CSV column) is the vocabulary that the
# rest of the script iterates over.
count_df = pd.read_csv(
    os.path.join('data/vsmdata/', "giga_window5-scaled.csv.gz"),
    index_col=0,
)

# Plain-text corpus; each line of the file is treated as one sentence.
corpus_path = '/home/anand/learn/cs224u-stanford/data/bert-extract/gutenberg2/corpus/all.txt'
sentences = []
with open(corpus_path, 'r') as corpus_file:
    # Keep every line (blank ones included) with surrounding whitespace removed.
    sentences.extend(map(str.strip, corpus_file))

# Sentences shorter than this many characters are skipped when matching.
min_length = 10
def _sample_up_to(matches, k):
    """Return ``k`` randomly sampled items of ``matches``, or ``matches`` itself if it has fewer than ``k``."""
    if len(matches) >= k:
        return random.sample(matches, k)
    return matches


def process_element(symbol, corpus=None, min_len=None):
    """Collect corpus sentences containing ``symbol`` and sample them at three sizes.

    Matching is plain substring containment (``symbol in sentence``), so a
    symbol such as ``"cat"`` also matches a sentence containing
    ``"concatenate"``.  Assumes ``symbol`` is a ``str`` -- TODO confirm for
    every entry of the vocabulary index.

    Args:
        symbol: The text to look for in each sentence.
        corpus: Iterable of sentence strings to search.  Defaults to the
            module-level ``sentences`` list.
        min_len: Sentences with fewer than this many characters are skipped.
            Defaults to the module-level ``min_length``.

    Returns:
        A tuple ``(N_10, N_50, N_500)``.  Each element is a random sample of
        at most 10, 50 and 500 matching sentences respectively; when fewer
        matches exist than the sample size, the full list of matches (in
        corpus order) is returned for that element instead.
    """
    # Fall back to the module-level data so the single-argument call used by
    # ``pool.imap`` keeps working unchanged.
    if corpus is None:
        corpus = sentences
    if min_len is None:
        min_len = min_length

    matches = [
        sentence
        for sentence in corpus
        if len(sentence) >= min_len and symbol in sentence
    ]

    # The three samples are drawn independently of one another, largest first.
    n_500 = _sample_up_to(matches, 500)
    n_50 = _sample_up_to(matches, 50)
    n_10 = _sample_up_to(matches, 10)
    return (n_10, n_50, n_500)
# Guard the driver: with the "spawn" or "forkserver" start methods each worker
# re-imports this module, and without the guard every worker would try to
# create its own pool and rewrite the output files.
if __name__ == "__main__":
    vocab = count_df.index

    # pool.imap yields results in input order, so results[i] belongs to vocab[i].
    with mp.Pool(processes=8) as pool:
        results = list(tqdm(pool.imap(process_element, vocab), total=len(vocab)))

    # One word -> sentences mapping per sample size.
    N_10_dict = dict()
    N_50_dict = dict()
    N_500_dict = dict()
    for vocab_word, (N_10, N_50, N_500) in zip(vocab, results):
        N_10_dict[vocab_word] = N_10
        N_50_dict[vocab_word] = N_50
        N_500_dict[vocab_word] = N_500

    # Write each mapping to its own JSON file.
    outputs = (
        ('/home/anand/learn/cs224u-hw1/data/sentences_n10_gutenberg.json', N_10_dict),
        ('/home/anand/learn/cs224u-hw1/data/sentences_n50_gutenberg.json', N_50_dict),
        ('/home/anand/learn/cs224u-hw1/data/sentences_n500_gutenberg.json', N_500_dict),
    )
    for out_path, payload in outputs:
        with open(out_path, 'w') as out_file:
            json.dump(payload, out_file)
# (GitHub page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")