Last active
February 3, 2023 15:44
-
-
Save anandsaha/a200085fda9f85d57dd33ed814151416 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pandas as pd | |
import numpy as np | |
from pathlib import Path | |
import random | |
import multiprocessing as mp | |
from tqdm.notebook import tqdm | |
import time | |
import json | |
# Load the count matrix; the first CSV column becomes the DataFrame index,
# which is what the script later iterates over as its vocabulary.
count_df = pd.read_csv(
    os.path.join('data/vsmdata/', "giga_window5-scaled.csv.gz"),
    index_col=0,
)

# Plain-text corpus, read one entry per line.
corpus_path = '/home/anand/learn/cs224u-stanford/data/bert-extract/gutenberg2/corpus/all.txt'

# Keep every line (including ones that become empty) with surrounding
# whitespace removed; length filtering happens later in process_element.
with open(corpus_path, 'r') as corpus_file:
    sentences = [raw_line.strip() for raw_line in corpus_file]

# Sentences shorter than this many characters are skipped when matching.
min_length = 10
def _sample_up_to(items, k):
    """Return `k` items drawn without replacement, or `items` itself if it holds fewer than `k`."""
    if len(items) >= k:
        return random.sample(items, k)
    return items


def process_element(symbol, corpus=None, min_len=None):
    """Collect the sentences containing `symbol` and sample them at three sizes.

    Args:
        symbol: String to look for. The check is a plain substring test, so
            "cat" also matches a sentence containing "concatenate".
        corpus: Iterable of sentence strings to scan. Defaults to the
            module-level `sentences` list.
        min_len: Sentences with fewer characters than this are ignored.
            Defaults to the module-level `min_length`.

    Returns:
        A tuple `(N_10, N_50, N_500)`. Each element is a random sample of the
        matching sentences of at most 10, 50 and 500 items respectively; when
        fewer matches exist than the requested size, the full list of matches
        (in corpus order) is returned instead. The three samples are drawn
        independently, so a smaller sample is not necessarily a subset of a
        larger one.
    """
    # Fall back to module state only when the caller did not supply data,
    # which keeps `pool.imap(process_element, vocab)` working unchanged.
    if corpus is None:
        corpus = sentences
    if min_len is None:
        min_len = min_length

    matches = [
        sentence
        for sentence in corpus
        if len(sentence) >= min_len and symbol in sentence
    ]

    # Draw largest-first to keep the same order of RNG consumption as before,
    # so runs with a fixed random seed stay reproducible.
    n_500 = _sample_up_to(matches, 500)
    n_50 = _sample_up_to(matches, 50)
    n_10 = _sample_up_to(matches, 10)
    return (n_10, n_50, n_500)
# Fan the vocabulary out over 8 worker processes. imap yields results in
# input order, so results[i] belongs to vocab[i].
vocab = count_df.index
with mp.Pool(processes=8) as pool:
    results = list(tqdm(pool.imap(process_element, vocab), total=len(vocab)))

# One mapping per sample size: vocabulary word -> list of sampled sentences.
N_10_dict = {}
N_50_dict = {}
N_500_dict = {}
for word, (sample_10, sample_50, sample_500) in zip(count_df.index, results):
    N_10_dict[word] = sample_10
    N_50_dict[word] = sample_50
    N_500_dict[word] = sample_500

# Write each mapping to its own JSON file, in the order n10, n50, n500.
outputs = (
    ('/home/anand/learn/cs224u-hw1/data/sentences_n10_gutenberg.json', N_10_dict),
    ('/home/anand/learn/cs224u-hw1/data/sentences_n50_gutenberg.json', N_50_dict),
    ('/home/anand/learn/cs224u-hw1/data/sentences_n500_gutenberg.json', N_500_dict),
)
for out_path, payload in outputs:
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment