anandsaha · February 3, 2023 15:44
diff --git a/create_json.py b/create_json.py
 import os
 import pandas as pd
 import numpy as np
 from pathlib import Path
 import random
 import multiprocessing as mp
 from tqdm.notebook import tqdm
 import time
 import json


 # Table whose row index supplies the vocabulary that is searched for below.
 count_df = pd.read_csv(
     os.path.join('data/vsmdata/', "giga_window5-scaled.csv.gz"),
     index_col=0,
 )

 # Read the whole corpus into memory, one whitespace-stripped line per entry.
 corpus_path = '/home/anand/learn/cs224u-stanford/data/bert-extract/gutenberg2/corpus/all.txt'
 with open(corpus_path, 'r') as corpus_file:
     sentences = [raw_line.strip() for raw_line in corpus_file]

 # Corpus lines shorter than this many characters are ignored by process_element.
 min_length = 10

 def process_element(symbol):
     """Collect corpus sentences containing ``symbol`` and draw random samples.

     Scans the module-level ``sentences`` list, skips entries shorter than
     ``min_length`` characters, and keeps every sentence in which ``symbol``
     occurs.

     Args:
         symbol: Text to look for; tested with ``symbol in sentence``.

     Returns:
         A tuple ``(N_10, N_50, N_500)``. Each element is a random sample
         (without replacement) of 10, 50 and 500 matching sentences
         respectively. When fewer matches than the requested size exist, the
         element is the full list of matches itself (the same list object,
         not a copy). The three samples are drawn independently of each other.
     """
     # NOTE(review): this is a substring test, so a short symbol also matches
     # inside longer words -- confirm that is intended.
     matches = [
         sentence
         for sentence in sentences
         if len(sentence) >= min_length and symbol in sentence
     ]

     # Draw in the 500 -> 50 -> 10 order so that a seeded RNG keeps producing
     # the same samples as the earlier branch-per-size implementation.
     N_500, N_50, N_10 = (
         random.sample(matches, size) if len(matches) >= size else matches
         for size in (500, 50, 10)
     )
     return (N_10, N_50, N_500)
    

 # Guard the driver so that worker processes which re-import this module
 # (spawn / forkserver start methods) do not try to start pools of their own.
 if __name__ == "__main__":
     vocab = count_df.index

     # Search the corpus for every vocabulary entry across 8 worker processes.
     # imap yields results in input order, so results[i] belongs to vocab[i].
     with mp.Pool(processes=8) as pool:
         results = list(tqdm(pool.imap(process_element, vocab), total=len(vocab)))

     # One mapping per sample size: vocabulary entry -> sampled sentences.
     N_10_dict = dict()
     N_50_dict = dict()
     N_500_dict = dict()

     for vocab_word, (N_10, N_50, N_500) in zip(vocab, results):
         N_10_dict[vocab_word] = N_10
         N_50_dict[vocab_word] = N_50
         N_500_dict[vocab_word] = N_500

     # Persist each mapping as its own JSON file.
     with open('/home/anand/learn/cs224u-hw1/data/sentences_n10_gutenberg.json', 'w') as n10f:
         json.dump(N_10_dict, n10f)

     with open('/home/anand/learn/cs224u-hw1/data/sentences_n50_gutenberg.json', 'w') as n50f:
         json.dump(N_50_dict, n50f)

     with open('/home/anand/learn/cs224u-hw1/data/sentences_n500_gutenberg.json', 'w') as n500f:
         json.dump(N_500_dict, n500f)
	import os
	import pandas as pd
	import numpy as np
	from pathlib import Path
	import random
	import multiprocessing as mp
	from tqdm.notebook import tqdm
	import time
	import json


	# Table whose row index supplies the vocabulary that is searched for below.
	count_df = pd.read_csv(
	    os.path.join('data/vsmdata/', "giga_window5-scaled.csv.gz"),
	    index_col=0,
	)

	# Read the whole corpus into memory, one whitespace-stripped line per entry.
	# (The with/for bodies had lost their indentation; it is restored here.)
	corpus_path = '/home/anand/learn/cs224u-stanford/data/bert-extract/gutenberg2/corpus/all.txt'
	with open(corpus_path, 'r') as corpus_file:
	    sentences = [raw_line.strip() for raw_line in corpus_file]

	# Corpus lines shorter than this many characters are ignored by process_element.
	min_length = 10

	def process_element(symbol):
	    """Collect corpus sentences containing ``symbol`` and draw random samples.

	    Scans the module-level ``sentences`` list, skips entries shorter than
	    ``min_length`` characters, and keeps every sentence in which ``symbol``
	    occurs.

	    Args:
	        symbol: Text to look for; tested with ``symbol in sentence``.

	    Returns:
	        A tuple ``(N_10, N_50, N_500)``. Each element is a random sample
	        (without replacement) of 10, 50 and 500 matching sentences
	        respectively. When fewer matches than the requested size exist, the
	        element is the full list of matches itself (the same list object,
	        not a copy). The three samples are drawn independently of each other.
	    """
	    # NOTE(review): this is a substring test, so a short symbol also matches
	    # inside longer words -- confirm that is intended.
	    matches = [
	        sentence
	        for sentence in sentences
	        if len(sentence) >= min_length and symbol in sentence
	    ]

	    # Draw in the 500 -> 50 -> 10 order, matching the order of the
	    # original per-size branches, so a seeded RNG gives the same samples.
	    N_500, N_50, N_10 = (
	        random.sample(matches, size) if len(matches) >= size else matches
	        for size in (500, 50, 10)
	    )
	    return (N_10, N_50, N_500)


	# Guard the driver so that worker processes which re-import this module
	# (spawn / forkserver start methods) do not try to start pools of their own.
	if __name__ == "__main__":
	    vocab = count_df.index

	    # Search the corpus for every vocabulary entry across 8 worker processes.
	    # imap yields results in input order, so results[i] belongs to vocab[i].
	    with mp.Pool(processes=8) as pool:
	        results = list(tqdm(pool.imap(process_element, vocab), total=len(vocab)))

	    # One mapping per sample size: vocabulary entry -> sampled sentences.
	    N_10_dict = dict()
	    N_50_dict = dict()
	    N_500_dict = dict()

	    for vocab_word, (N_10, N_50, N_500) in zip(vocab, results):
	        N_10_dict[vocab_word] = N_10
	        N_50_dict[vocab_word] = N_50
	        N_500_dict[vocab_word] = N_500

	    # Persist each mapping as its own JSON file.
	    with open('/home/anand/learn/cs224u-hw1/data/sentences_n10_gutenberg.json', 'w') as n10f:
	        json.dump(N_10_dict, n10f)

	    with open('/home/anand/learn/cs224u-hw1/data/sentences_n50_gutenberg.json', 'w') as n50f:
	        json.dump(N_50_dict, n50f)

	    with open('/home/anand/learn/cs224u-hw1/data/sentences_n500_gutenberg.json', 'w') as n500f:
	        json.dump(N_500_dict, n500f)