Last active
          December 16, 2015 14:39 
        
      - 
      
- 
        Save maowug/5450355 to your computer and use it in GitHub Desktop. 
    dict3 MW learners
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | #!/usr/bin/env python | |
| #encoding: utf-8 | |
| #__author__ = 'actor2019' | |
| import urllib | |
| from bs4 import BeautifulSoup as bs | |
| import cPickle | |
| import sys | |
| sys.setrecursionlimit(99999) | |
def getWord(urlWord, failURLList):
    '''
    Scrape one word-entry page from learnersdictionary.com.

    :param urlWord: absolute URL of the word's entry page
    :param failURLList: list of URLs that could not be parsed; mutated in
                        place when this page yields no part-of-speech info
    :return: dict with keys ``spell`` (left empty for the caller to fill),
             ``forms``, ``phrases``, ``attrs``, ``text``
    '''
    bsWord = bs(urllib.urlopen(urlWord).read(), 'lxml')

    # Inflected forms, e.g. [u'gets', u'got', u'got\xb7ten', u'get\xb7ting']
    forms = [unicode(form.string) for form in bsWord.select('div.entry span.if')]

    # Part of speech ("adj.", "n.", ...) OR a cross-reference (":get" when
    # the looked-up word is "got").
    attrs = []
    try:
        attrs.append(unicode(bsWord.select('div.headword span.fl')[0].string))
    except IndexError:  # seldom happens: no span.fl under div.headword
        try:
            attrs.append(unicode(':' + bsWord.select('div.entry span.cx a.dxt')[0].string))
        except (IndexError, TypeError):  # rarely happens: no cross-ref either
            # BUG FIX: was `failURLList += urlWord`, which extends the list
            # with the URL's individual characters, not the URL itself.
            failURLList.append(urlWord)

    # Run-on phrases, if any — like "get up".
    phrases = []
    for dro in bsWord.select('div.dro'):
        phrases.append(unicode(dro.find('span', class_='dre').string))
        phrases += [unicode(pva.string) for pva in dro.select('span.pva')]
    phrases = list(set(phrases))  # de-duplicate

    # Plain-text backup of the whole entry, minus the leading JS fragment
    # that ends at the first '};'.
    raw_text = bsWord.find('div', class_='entry').get_text()
    text = unicode(raw_text[raw_text.find('};') + 2:])

    return dict(
        spell=u'',          # filled in by the caller
        forms=forms,
        phrases=phrases,
        attrs=attrs,
        text=text,
    )
if __name__ == '__main__':
    urlPattern = 'http://www.learnersdictionary.com/browse/learners/%s.htm'
    urlBASE = r'http://www.learnersdictionary.com'  # hoisted: loop-invariant
    failURLList = []  # URLs that could not be parsed anywhere in the run
    import string
    # One letter per pickle file; widen the slice to cover the full a-z.
    for pg in string.ascii_lowercase[0:1]:
        url_lowercase = urlPattern % pg
        # Links to the paginated word lists for this letter.
        pageLinks = bs(urllib.urlopen(url_lowercase).read(), 'lxml').select('ol.browse a')
        wordsList = []
        for pl in pageLinks:  # a: page1, page2, ...
            urlPage = urlBASE + pl['href']
            print(urlPage)
            wdLinks = bs(urllib.urlopen(urlPage).read(), 'lxml').select('ol.browse a')
            for wd in wdLinks:  # -able, blue, get, ...
                urlWord = wd['href']
                bsWord = bs(urllib.urlopen(urlWord).read(), 'lxml')
                # No headword on the page: possibly an abbreviation, e.g.
                # /search/atty+gen -> "atty. gen."; try the spelling-help links.
                if bsWord.select('div.headword span.hw') == []:
                    spelling_help = bsWord.select('ol.franklin-spelling-help a')
                    # Accept the suggestion only when it differs from the
                    # original spelling by dots alone.
                    if spelling_help != [] and set(unicode(spelling_help[0].string)) - set(unicode(wd.string)) == set([u'.']):
                        urlWord = urlBASE + spelling_help[0]['href']
                    else:
                        # BUG FIX: was `failURLList += urlWord`, which adds
                        # the URL's characters one by one instead of the URL.
                        failURLList.append(urlWord)
                        continue
                print(urlWord)  # just before getWord
                word = getWord(urlWord, failURLList)
                word['spell'] = unicode(wd.string)
                wordsList.append(word)
                # Other homograph entries of the same word, e.g. /search/blue[2].
                ol_results = bsWord.select('ol.results li.learners a')
                for ol in ol_results:
                    olh = ol['href']
                    olPath = olh[olh.rfind('/') + 1:]
                    if olPath.find(wd.string + '[') >= 0:  # e.g. finds 'blue['
                        url_ol = urlBASE + ol['href']
                        print(url_ol)
                        word2 = getWord(url_ol, failURLList)
                        word2['spell'] = unicode(olPath)
                        wordsList.append(word2)
                del bsWord  # drop the parse tree before the next fetch
            # eof: for wd in wdLinks
        # eof: for pl in pageLinks
        # Pickle one letter's words per file.
        data = open(u'dict3_learners_' + unicode(pg) + u'.data', 'wb')
        cPickle.dump(wordsList, data)
        data.close()
    # eof: for pg in string.ascii_lowercase
    log = open('dict3_learners_.log', 'wb')
    # BUG FIX: encode explicitly — the log is opened in binary mode and
    # writing a unicode object would fail on any non-ASCII URL.
    log.write(unicode(failURLList).encode('utf-8'))
    log.close()
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment