anandsaha · February 3, 2023 14:52
diff --git a/extract.py b/extract.py
 # Towards homework 1 of XCS224U
 from multiprocessing import Pool

 import pandas as pd
 import spacy

 from pathlib import Path

 # Source table; func() below reads the 'title' and 'text' columns of each row.
 df = pd.read_csv('books_and_genres.csv')
 # spaCy pipeline; func() below uses it to split strings into sentences (.sents).
 nlp = spacy.load('en_core_web_sm')

 # NOTE(review): total_rows and count are not referenced anywhere in the
 # visible code - presumably leftover progress counters; confirm before removing.
 total_rows = len(df)
 count = 0
 # Show the column labels of the loaded table.
 print(df.columns)


 def func(x):
     """Write one book from ``df`` to ``corpus/<title>.txt``, one sentence per line.

     The row labelled ``x`` is looked up in the module-level ``df``. Its
     ``title`` and ``text`` values are split into sentences with the
     module-level spaCy pipeline ``nlp``. The title sentences are written
     first, followed by every non-blank sentence of the text. If the output
     file already exists the row is skipped.

     Args:
         x: Index label of the row in ``df`` to process.

     Returns:
         None. The result is the file written under ``corpus/``.
     """
     row = df.loc[x]

     # str() so that a non-string cell (e.g. a missing value) can still be
     # used in the file name and handed to the pipeline.
     title = str(row['title'])
     # Collapse newlines so a sentence never spans several output lines.
     text = str(row['text']).replace('\n', ' ')

     # A path separator inside the title would make the file name point into
     # a sub-directory that does not exist, so neutralise it.
     safe_title = title.replace('/', '_').replace('\\', '_')
     out_path = Path(f'corpus/{safe_title}.txt')

     if out_path.exists():
         print(f"Already processed {title} ")
         return

     print(f"Processing {title} ")

     # Segment BEFORE opening the file: if the pipeline raises, no stub file
     # is left behind to be mistaken for "already processed" on the next run.
     # Title sentences are newline-terminated so the title does not run into
     # the first sentence of the text.
     title_lines = [f"{s.text}\n" for s in nlp(title).sents]
     text_lines = [f"{s.text}\n" for s in nlp(text).sents if s.text.strip()]

     out_path.parent.mkdir(parents=True, exist_ok=True)
     # Explicit UTF-8: the platform default encoding may not be able to
     # represent every character of the book text.
     with open(out_path, 'w', encoding='utf-8') as f:
         f.writelines(title_lines)
         f.writelines(text_lines)


 # Number of worker processes in the pool.
 n = 10

 # Only the process started as the script may create the pool. With the
 # "spawn" start method every worker re-imports this module, and an unguarded
 # Pool(...) at import time would make each worker try to start its own pool.
 if __name__ == '__main__':
     with Pool(n) as p:
         # One task per row label of df; func() writes a file per row.
         p.map(func, df.index)
	# Towards homework 1 of XCS224U
	from multiprocessing import Pool

	import pandas as pd
	import spacy

	from pathlib import Path

	# Source table; func() below reads the 'title' and 'text' columns of each row.
	df = pd.read_csv('books_and_genres.csv')
	# spaCy pipeline; func() below uses it to split strings into sentences (.sents).
	nlp = spacy.load('en_core_web_sm')

	# NOTE(review): total_rows and count are not referenced anywhere in the
	# visible code - presumably leftover progress counters; confirm before removing.
	total_rows = len(df)
	count = 0
	# Show the column labels of the loaded table.
	print(df.columns)


	def func(x):
	    """Write one book from ``df`` to ``corpus/<title>.txt``, one sentence per line.

	    The row labelled ``x`` is looked up in the module-level ``df``. Its
	    ``title`` and ``text`` values are split into sentences with the
	    module-level spaCy pipeline ``nlp``. The title sentences are written
	    first, followed by every non-blank sentence of the text. If the output
	    file already exists the row is skipped.

	    Args:
	        x: Index label of the row in ``df`` to process.

	    Returns:
	        None. The result is the file written under ``corpus/``.
	    """
	    row = df.loc[x]

	    # str() so that a non-string cell (e.g. a missing value) can still be
	    # used in the file name and handed to the pipeline.
	    title = str(row['title'])
	    # Collapse newlines so a sentence never spans several output lines.
	    text = str(row['text']).replace('\n', ' ')

	    # A path separator inside the title would make the file name point into
	    # a sub-directory that does not exist, so neutralise it.
	    safe_title = title.replace('/', '_').replace('\\', '_')
	    out_path = Path(f'corpus/{safe_title}.txt')

	    if out_path.exists():
	        print(f"Already processed {title} ")
	        return

	    print(f"Processing {title} ")

	    # Segment BEFORE opening the file: if the pipeline raises, no stub file
	    # is left behind to be mistaken for "already processed" on the next run.
	    # Title sentences are newline-terminated so the title does not run into
	    # the first sentence of the text.
	    title_lines = [f"{s.text}\n" for s in nlp(title).sents]
	    text_lines = [f"{s.text}\n" for s in nlp(text).sents if s.text.strip()]

	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    # Explicit UTF-8: the platform default encoding may not be able to
	    # represent every character of the book text.
	    with open(out_path, 'w', encoding='utf-8') as f:
	        f.writelines(title_lines)
	        f.writelines(text_lines)


	# Number of worker processes in the pool.
	n = 10

	# Only the process started as the script may create the pool. With the
	# "spawn" start method every worker re-imports this module, and an unguarded
	# Pool(...) at import time would make each worker try to start its own pool.
	if __name__ == '__main__':
	    with Pool(n) as p:
	        # One task per row label of df; func() writes a file per row.
	        p.map(func, df.index)