Created
March 14, 2022 21:22
-
-
Save ZhangChengX/da0e3fa7ae56e8e2bdd5ffa28f95a42b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*-
# Reference | |
# https://github.com/zslwyuan/google-ngrams/blob/master/getngrams.py | |
from nltk.corpus import words | |
from ast import literal_eval | |
from tqdm import tqdm | |
import requests | |
import time | |
import re | |
# Corpus name -> numeric identifier. get_ngrams() looks the name up here and
# sends the number as the `corpus` request parameter.
corpora = {
    'eng_us_2012': 17, 'eng_us_2009': 5, 'eng_us_2019': 28,
    'eng_gb_2012': 18, 'eng_gb_2009': 6, 'eng_gb_2019': 26,
    'chi_sim_2019': 34, 'chi_sim_2012': 23, 'chi_sim_2009': 11,
    'eng_2012': 15, 'eng_2009': 0,
    'eng_fiction_2012': 16, 'eng_fiction_2009': 4, 'eng_1m_2009': 1,
    'fre_2019': 30, 'fre_2012': 19, 'fre_2009': 7,
    'ger_2019': 31, 'ger_2012': 20, 'ger_2009': 8,
    'heb_2012': 24,
    'heb_2009': 9,
    'spa_2019': 32, 'spa_2012': 21, 'spa_2009': 10,
    'rus_2019': 36, 'rus_2012': 25, 'rus_2009': 12,
    'ita_2019': 33, 'ita_2012': 22,
}
# Matches the inline `ngrams.data = [...];` assignment in the returned page and
# captures only the list literal, so no prefix/suffix stripping is needed.
_NGRAM_DATA_RE = re.compile(r'ngrams\.data = (.*\]);')


def get_ngrams(query, corpus, startYear, endYear, smoothing=3,
               caseInsensitive=False, retryDelay=300):
    """Fetch ngram time series for *query* from the Google Books Ngram graph page.

    The page is requested repeatedly until its text contains exactly one
    ``ngrams.data = [...];`` assignment; there is no upper bound on the
    number of attempts.

    Args:
        query: Ngram query string. Every ``?`` is replaced with ``*`` and
            every ``@`` with ``=>`` before the request is sent.
        corpus: Key of the module-level ``corpora`` mapping.
        startYear: Sent as the ``year_start`` request parameter.
        endYear: Sent as the ``year_end`` request parameter.
        smoothing: Sent as the ``smoothing`` request parameter.
        caseInsensitive: Sent as ``case_insensitive``; the parameter is left
            out of the request entirely when this is ``False``.
        retryDelay: Seconds to wait before retrying after a failed request
            or an unparseable response.

    Returns:
        A dict mapping each returned entry's ``'ngram'`` value to its
        ``'timeseries'`` value (empty when the page carries an empty list).

    Raises:
        KeyError: If *corpus* is not a key of ``corpora``.
    """
    params = dict(content=query.replace('?', '*').replace('@', '=>'),
                  year_start=startYear, year_end=endYear,
                  corpus=corpora[corpus], smoothing=smoothing)
    if caseInsensitive is not False:
        params['case_insensitive'] = caseInsensitive

    while True:
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            req = requests.get('https://books.google.com/ngrams/graph',
                               params=params, timeout=60)
        except requests.RequestException as exc:
            # Transient network failures are retried like a bad response.
            print('Try again: ', query)
            print('Request error ', exc)
            time.sleep(retryDelay)
            continue

        matches = _NGRAM_DATA_RE.findall(req.text)
        if len(matches) == 1:
            break

        print('Try again: ', query)
        if 'Please try again later.' == req.text:
            print('Try again error')
        else:
            print('Unknown error ', matches)
        time.sleep(retryDelay)

    # literal_eval only accepts Python literals, so the remote payload is
    # parsed without being executed.
    # NOTE(review): it would reject JSON-only tokens such as null/true/false -
    # confirm the payload never contains them.
    entries = literal_eval(matches[0])
    return {entry['ngram']: entry['timeseries'] for entry in entries}
if __name__ == '__main__':
    # Looks up every word of the NLTK `words` corpus, stores a score per word
    # and finally writes all scores to a CSV-like text file.

    # Sentinel scores for words without a usable time series.
    NO_DATA_SCORE = 1000    # get_ngrams() returned nothing for the word
    NOT_FOUND_SCORE = 999   # results came back, but none keyed by the exact word

    d = {}
    for word in tqdm(words.words()):
        print('Looking up: ', word)
        ngrams = get_ngrams(query=word, corpus='eng_us_2019', startYear=2000,
                            endYear=2019, caseInsensitive=True)
        if not ngrams:
            d[word] = NO_DATA_SCORE
            print(ngrams)
        elif word in ngrams:
            series = ngrams[word]
            # Mean of the yearly values, scaled by 100.
            d[word] = round(sum(series) / len(series) * 100, 6)
        else:
            d[word] = NOT_FOUND_SCORE
            print(ngrams)
        # Throttle after every lookup, including the no-data case, so that
        # runs of unknown words do not hit the server back-to-back.
        time.sleep(30)

    # Explicit encoding keeps the output independent of the platform default.
    with open('google_ngrims.csv', 'w', encoding='utf-8') as f:
        f.writelines(f"{k}, {v}\n" for k, v in d.items())
    print('Done')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment