Skip to content

Instantly share code, notes, and snippets.

@gary136
Last active December 18, 2019 05:58
Show Gist options
  • Save gary136/46b502dd19cca83985ff36e943211dc5 to your computer and use it in GitHub Desktop.
Save gary136/46b502dd19cca83985ff36e943211dc5 to your computer and use it in GitHub Desktop.
word_frequency.py
import time
import numpy as np
import requests
import pandas as pd
# prepare
# Download the word list and build `df`: one row per line of the file, with the
# row index doubling as the word's rank (vocabulary() reads it that way).
url = 'https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/en/en_50k.txt'
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as a word list.
r = requests.get(url, timeout=30)
r.raise_for_status()
d = r.text.split('\n')
# Drop only a trailing empty string (from a final newline); an unconditional
# [:-1] would discard the last real entry if the file had no final newline.
if d and d[-1] == '':
    d.pop()
# Lines are expected to split into exactly two space-separated fields, of which
# the first is the word; anything else becomes None so row positions stay aligned.
word = [t[0] if len(t) == 2 else None for t in (i.split(' ') for i in d)]
df = pd.DataFrame({'word': word})
# Per-word dictionary page URL (NaN where the word is None).
df['dic'] = 'https://dictionary.cambridge.org/us/dictionary/english/' + df['word']
def flag(url, timeout=10):
    """Return True if the page at *url* contains the expected ``<h1>`` marker.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection, which would freeze a whole ``Series.apply`` run.

    Returns:
        bool: Whether the marker string occurs in the response body.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    # NOTE(review): this looks like the headword heading of a dictionary entry
    # page -- confirm the markup is still current on the target site.
    marker = '''<h1 class="ti fs fs12 lmb-0 hw superentry">'''
    response = requests.get(url, timeout=timeout)
    # One-second pause after every request; presumably to throttle bulk lookups.
    time.sleep(1)
    return marker in response.text
def flt(d):
    """Return the rows of *d* whose ``dic`` URL passes ``flag``.

    Args:
        d: DataFrame with a ``dic`` column of URLs.

    Returns:
        A new DataFrame holding only the rows for which ``flag(url)`` is
        truthy, with every column of *d* except one named ``flag``.
    """
    # Keep the check results in a separate mask instead of writing a temporary
    # 'flag' column into `d`: that mutated the caller's frame and raised
    # SettingWithCopyWarning when `d` was itself a filtered slice.
    has_entry = d['dic'].apply(flag).astype(bool)
    kept_columns = [col for col in d.columns if col != 'flag']
    return d.loc[has_entry, kept_columns]
# Widen pandas' console output so long dictionary URLs are not truncated.
pd.set_option('display.max_colwidth', 75)
pd.set_option('display.width', 1000)
def vocabulary(sample, start=2500, end=20000, freq=None):
    """List the distinct words of *sample* whose rank lies in ``[start, end)``.

    The text is tokenised by turning hyphens into spaces, stripping every
    character in ``string.punctuation``, splitting on whitespace and
    lower-casing. Each distinct token is then looked up in the frequency
    table; its rank is the index label of the first row with that word.

    Args:
        sample: Text to analyse.
        start: Lowest rank to keep (inclusive).
        end: Highest rank to keep (exclusive).
        freq: Optional DataFrame with ``word`` and ``dic`` columns to use as
            the frequency table. Defaults to the module-level ``df``.

    Returns:
        DataFrame with columns ``word``, ``rank`` and ``dic``, ordered by
        rank and then by word. Words missing from the table get rank 99999
        and ``dic`` 'N', so they only appear when ``end`` exceeds 99999.
        The index is each word's position among all distinct tokens of
        *sample* in that ordering, before the rank filter is applied.
    """
    import string

    table = df if freq is None else freq

    cleaned = sample.replace('-', ' ').translate(
        str.maketrans('', '', string.punctuation))
    tokens = {token.lower() for token in cleaned.split()}

    # One pass over the table instead of rescanning the whole frame per token.
    # Only the first row per word is recorded, matching the former `.index[0]`.
    lookup = {}
    for rank, entry, link in zip(table.index, table['word'], table['dic']):
        if entry in tokens and entry not in lookup:
            lookup[entry] = (int(rank), link)

    rows = []
    for token in tokens:
        rank, link = lookup.get(token, (99999, 'N'))
        rows.append((token, rank, link))
    # Break rank ties by word so the unknown words (all 99999) come out in a
    # reproducible order rather than in set-iteration order.
    rows.sort(key=lambda row: (row[1], row[0]))

    result = pd.DataFrame(rows, columns=['word', 'rank', 'dic'])
    # An empty frame has object columns; force an integer rank so the range
    # comparison below is well-defined even when *sample* has no words.
    result['rank'] = result['rank'].astype(int)
    return result[(result['rank'] >= start) & (result['rank'] < end)]
# e.g.
# sample = \
# '''
# xxx
# '''
# d = vocabulary(sample, start=5000, end=20000)
# d = flt(d)
# print(d)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment