# Data source: https://storage.googleapis.com/books/ngrams/books/datasetsv2.html
# extraction pattern: ngram TAB year TAB match_count TAB volume_count NEWLINE
# out: unique_ngram TAB sum(match_count) NEWLINE
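# For illustration (hypothetical values), an input row such as
#   circumvallate TAB 1978 TAB 335 TAB 91
# contributes 335 to the summed match_count of "circumvallate" in the output.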
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import freeze_support
import polars as pl
from humanfriendly import format_size
import time
t0 = 0

def eta(t=None):
    global t0
    if t is not None:
        t0 = time.time()
        return
    else:
        t1 = time.time()
        t = t1 - t0
        t0 = t1
        hours, rem = divmod(t, 3600)
        minutes, seconds = divmod(rem, 60)
        return "Elapsed time {:0>2}:{:0>2}:{:06.3f}".format(int(hours), int(minutes), seconds)
def process_file(file):
    global base, stopwords
    not_word = r'(_|[^\w])'  # underscore or any non-word character
    # the raw ngram files are headerless TSV; keep only ngram (col 0) and match_count (col 2)
    df = pl.read_csv(base + file, separator="\t", has_header=False,
                     columns=[0, 2], new_columns=["word", "count"])
    fsize = Path(base + file).stat().st_size
    print(f"{file} ({format_size(fsize)}), {len(df)} records")
    # filter out terms with underscores or other non-word characters ...
    df = df.filter(~pl.col("word").str.contains(not_word))
    # ... and terms shorter than 3 chars
    df = df.filter(pl.col("word").str.len_chars() > 2)
    # ... and stop words (compared case-insensitively)
    df = df.with_columns(pl.col("word").str.to_lowercase())
    df = df.filter(~pl.col("word").is_in(stopwords))
    # sum the per-year counts into one total per unique term
    df = df.group_by("word").agg(pl.col("count").sum().alias("count_sum")).sort("count_sum", descending=True)
    # select only terms that appear more than 20,000 times in the books
    good = df.filter(pl.col("count_sum") > 20000)
    # output a tab-separated file, one "term TAB total" row per term
    print(f"out_{file}, {len(good)} terms")
    good.write_csv(f"out_{file}.csv", separator="\t", include_header=False)
    # df.filter(pl.col("count_sum") < 20000).write_csv(f"bad_{file}.csv", separator="\t", include_header=False)
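# Illustrative effect of the filters above (hypothetical tokens):
#   "don't", "burnt_VERB"  -> dropped by the not_word pattern
#   "at", "of"             -> dropped by the length filter (< 3 chars)
#   "The" -> lowercased to "the" -> dropped if "the" is a stop word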
base = "googlebooks-eng-all-1gram-20120701/googlebooks-eng-all-1gram-20120701-" | |
files = ['a','b','c','d','e','f','g','h','i','j',\ | |
'k','l','m','n','o','p','q','r','s',\ | |
't','u','v','w','x','y','z'] | |
with open('stopwords.txt') as f: | |
stopwords = f.read().splitlines() | |
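# stopwords.txt is assumed to hold one word per line, e.g.:
#   the
#   and
#   of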
def main():
    with ProcessPoolExecutor() as procs:
        procs.map(process_file, files)

def one():
    # single-file run, useful for testing
    process_file('a')

# the __main__ guard is needed for the process pool to initialize properly
if __name__ == '__main__':
    freeze_support()
    eta(0)  # start the clock
    main()
    # one()
    print(eta())
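# Run directly, e.g. `python extract_1grams.py` (the script name is arbitrary);
# each worker writes its out_<letter>.csv next to the script.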