Skip to content

Instantly share code, notes, and snippets.

@anandsaha
Created February 3, 2023 14:52
Show Gist options
  • Save anandsaha/4db838aeb200ec85779655f53d09c118 to your computer and use it in GitHub Desktop.
Save anandsaha/4db838aeb200ec85779655f53d09c118 to your computer and use it in GitHub Desktop.
# Towards homework 1 of XCS224U
from multiprocessing import Pool
import pandas as pd
import spacy
from pathlib import Path
# Book table loaded once at import time; func() reads its 'title' and 'text'
# columns by index label.
df = pd.read_csv('books_and_genres.csv')

# The 'en_core_web_sm' spaCy pipeline; func() uses it only through `.sents`.
nlp = spacy.load('en_core_web_sm')

# Bookkeeping values; nothing in the visible script reads them afterwards.
total_rows = df.shape[0]
count = 0

# Show the available column names as a quick sanity check.
print(df.columns)
def func(x):
    """Sentence-split one book and write it to ``corpus/<title>.txt``.

    Looks up row ``x`` of the module-level ``df``, runs the module-level
    spaCy pipeline ``nlp`` over its ``title`` and ``text`` columns, and
    writes one sentence per line (title sentences first).  A row whose
    output file already exists is skipped, so an interrupted run can simply
    be started again.

    Args:
        x: Index label of the row in ``df`` to process.

    Returns:
        None. The result is the file written to disk.
    """
    row = df.loc[x]
    # Coerce the title the same way as the text so a non-string cell
    # (e.g. a missing value) does not make spaCy reject the input.
    title = str(row['title'])
    text = str(row['text']).replace('\n', ' ')

    # A "/" inside a title would otherwise be read as a directory separator.
    safe_title = title.replace('/', '_')
    out_path = Path('corpus') / f'{safe_title}.txt'
    if out_path.exists():
        print(f"Already processed {title} ")
        return

    print(f"Processing {title} ")

    # Run spaCy BEFORE creating the file: if segmentation raises, no empty
    # file is left behind to be mistaken for a finished book on the next run.
    lines = [f"{s.text}\n" for s in nlp(title).sents]
    lines.extend(f"{s.text}\n" for s in nlp(text).sents if s.text.strip())

    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)
# Number of worker processes used to split the books in parallel.
n = 10

# Guard the pool start-up: under the "spawn" start method every worker
# re-imports this script, and without the guard each of them would try to
# create its own pool instead of just picking up `func`.
if __name__ == "__main__":
    with Pool(n) as p:
        # One task per row label; func() writes files and returns None.
        p.map(func, df.index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment