dict3 MW learners: scrape word entries (inflected forms, part of speech, run-on phrases, and definition text) from Merriam-Webster's Learner's Dictionary and pickle them, one file per letter.
#!/usr/bin/env python
#encoding: utf-8
#__author__ = 'actor2019'
import urllib
from bs4 import BeautifulSoup as bs
import cPickle
import sys
# Deeply nested HTML parse trees can exceed the default recursion limit.
sys.setrecursionlimit(99999)
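
# Flow: for each letter, fetch the browse index page, follow each of its page
# links, parse every word entry on that page with getWord(), then pickle the
# letter's words to dict3_learners_<letter>.data.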
def getWord(urlWord, failURLList):
    '''Fetch one entry page and parse it into a word dict.

    :param urlWord: URL of the word's entry page
    :param failURLList: list collecting URLs that could not be parsed
    :return: dict with keys spell, forms, phrases, attrs, text
    '''
    bsWord = bs(urllib.urlopen(urlWord).read(), 'lxml')

    # Inflected forms, e.g. tells; told; tell·ing
    # [u'gets', u'got', u'got', u'got\xb7ten', u'get\xb7ting']
    forms = []
    formList = bsWord.select('div.entry span.if')
    for form in formList:
        forms.append(unicode(form.string))

    # Part of speech (adj./n.), or a cross-reference like ':get' for 'got'
    attrs = []
    try:
        attrs.append(unicode(bsWord.select('div.headword span.fl')[0].string))
    except IndexError:  # seldom happens: no part-of-speech label
        try:
            attrs.append(unicode(':' + bsWord.select('div.entry span.cx a.dxt')[0].string))
        except IndexError:  # rarely happens: no cross-reference either
            failURLList.append(urlWord)

    # Run-on phrases, if the entry has any (like "get up")
    phrases = []
    droList = bsWord.select('div.dro')
    for dro in droList:
        # raw_sp_mn = re.split(u'\d|—', raw)  # todo: give up the meaning and examples
        phrases.append(unicode(dro.find('span', class_='dre').string))
        phrases += [unicode(pva.string) for pva in dro.select('span.pva')]
    phrases = list(set(phrases))

    # Plain-text backup of the whole entry, minus the leading script residue
    raw_text = bsWord.find('div', class_='entry').get_text()
    text = unicode(raw_text[raw_text.find('};') + 2:])

    return dict(
        spell=u'',  # filled in by the caller
        forms=forms,
        phrases=phrases,
        attrs=attrs,
        text=text,
    )
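
# Hedged sketch of the dict getWord returns for an entry like "get" (values
# illustrative, not captured output; u'verb' is an assumption for span.fl):
#   {'spell': u'', 'forms': [u'gets', u'got', u'got\xb7ten', u'get\xb7ting'],
#    'phrases': [u'get up', ...], 'attrs': [u'verb'], 'text': u'...'}
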
if __name__ == '__main__':
    urlPattern = 'http://www.learnersdictionary.com/browse/learners/%s.htm'
    urlBASE = r'http://www.learnersdictionary.com'
    failURLList = []
    import string

    # for pg in 'b':  # single-letter test run
    for pg in string.ascii_lowercase[0:1]:  # a, b, c, ...
        url_lowercase = urlPattern % pg
        pageLinks = bs(urllib.urlopen(url_lowercase).read(), 'lxml').select('ol.browse a')
        wordsList = []
        for pl in pageLinks:  # a: page1, page2, ...
            urlPage = urlBASE + pl['href']
            print urlPage
            wdLinks = bs(urllib.urlopen(urlPage).read(), 'lxml').select('ol.browse a')
            for wd in wdLinks:  # -able, blue, get, ...
                urlWord = wd['href']
                bsWord = bs(urllib.urlopen(urlWord).read(), 'lxml')
                # No headword means the browse link probably dropped a dot
                # (/search/approx should be /search/approx.; /search/atty+gen
                # should be atty. gen.), so try the suggestions in
                # ol.franklin-spelling-help.
                if bsWord.select('div.headword span.hw') == []:
                    spelling_help = bsWord.select('ol.franklin-spelling-help a')
                    # Accept a suggestion only if it differs from the browse
                    # text by dots alone.
                    if spelling_help != [] and set(unicode(spelling_help[0].string)) - set(unicode(wd.string)) == set([u'.']):
                        urlWord = urlBASE + spelling_help[0]['href']
                    else:  # no usable suggestion; skip to the next word
                        failURLList.append(urlWord)
                        continue
                # (An earlier fallback simply appended '.' to the URL; the
                # spelling-help lookup above replaced it.)
                print urlWord  # just before the getWord call
                word = getWord(urlWord, failURLList)
                word['spell'] = unicode(wd.string)  # important: keep as unicode
                wordsList.append(word)
                # Pick up homograph entries like /search/blue[2]
                ol_results = bsWord.select('ol.results li.learners a')
                for ol in ol_results:
                    olh = ol['href']
                    olPath = olh[olh.rfind('/') + 1:]
                    if olPath.find(wd.string + '[') >= 0:  # e.g. 'blue['
                        url_ol = urlBASE + ol['href']
                        print url_ol
                        word2 = getWord(url_ol, failURLList)
                        word2['spell'] = unicode(olPath)
                        wordsList.append(word2)
                del bsWord
            # eof - for wd in wdLinks
        # eof - for pl in pageLinks
        # Pickle one file per letter
        data = open(u'dict3_learners_' + unicode(pg) + u'.data', 'wb')
        cPickle.dump(wordsList, data)
        data.close()
    # eof - for pg in string.ascii_lowercase
    log = open('dict3_learners_.log', 'w')
    log.write(unicode(failURLList))
    log.close()
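
# Hedged sketch for reading a pickled letter file back later (assumes the run
# above produced dict3_learners_a.data):
#   import cPickle
#   with open('dict3_learners_a.data', 'rb') as data:
#       words = cPickle.load(data)
#   print words[0]['spell'], words[0]['attrs']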