Last active
          December 16, 2015 14:39 
        
      - 
      
- 
        Save maowug/5450355 to your computer and use it in GitHub Desktop. 
    dict3 MW learners
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | #!/usr/bin/env python | |
| #encoding: utf-8 | |
| #__author__ = 'actor2019' | |
| import urllib | |
| from bs4 import BeautifulSoup as bs | |
| import cPickle | |
| import sys | |
| sys.setrecursionlimit(99999) | |
def getWord(urlWord, failURLList):
    '''
    Scrape one word-entry page from learnersdictionary.com.

    :param urlWord: absolute URL of the word's entry page
    :param failURLList: list of URLs that could not be parsed; mutated in
                        place when this page yields no part-of-speech info
    :return: dict with keys ``spell`` (left empty for the caller to fill),
             ``forms``, ``phrases``, ``attrs``, ``text``
    '''
    bsWord = bs(urllib.urlopen(urlWord).read(), 'lxml')

    # Inflected forms, e.g. [u'gets', u'got', u'got\xb7ten', u'get\xb7ting']
    forms = [unicode(form.string) for form in bsWord.select('div.entry span.if')]

    # Part of speech ("adj.", "n.", ...) OR a cross-reference (":get" when
    # the looked-up word is "got").
    attrs = []
    try:
        attrs.append(unicode(bsWord.select('div.headword span.fl')[0].string))
    except IndexError:  # seldom happens: no span.fl under div.headword
        try:
            attrs.append(unicode(':' + bsWord.select('div.entry span.cx a.dxt')[0].string))
        except (IndexError, TypeError):  # rarely happens: no cross-ref either
            # BUG FIX: was `failURLList += urlWord`, which extends the list
            # with the URL's individual characters, not the URL itself.
            failURLList.append(urlWord)

    # Run-on phrases, if any — like "get up".
    phrases = []
    for dro in bsWord.select('div.dro'):
        phrases.append(unicode(dro.find('span', class_='dre').string))
        phrases += [unicode(pva.string) for pva in dro.select('span.pva')]
    phrases = list(set(phrases))  # de-duplicate

    # Plain-text backup of the whole entry, minus the leading JS fragment
    # that ends at the first '};'.
    raw_text = bsWord.find('div', class_='entry').get_text()
    text = unicode(raw_text[raw_text.find('};') + 2:])

    return dict(
        spell=u'',          # filled in by the caller
        forms=forms,
        phrases=phrases,
        attrs=attrs,
        text=text,
    )
if __name__ == '__main__':
    urlPattern = 'http://www.learnersdictionary.com/browse/learners/%s.htm'
    urlBASE = r'http://www.learnersdictionary.com'  # hoisted: loop-invariant
    failURLList = []  # URLs that could not be parsed anywhere in the run
    import string
    # One letter per pickle file; widen the slice to cover the full a-z.
    for pg in string.ascii_lowercase[0:1]:
        url_lowercase = urlPattern % pg
        # Links to the paginated word lists for this letter.
        pageLinks = bs(urllib.urlopen(url_lowercase).read(), 'lxml').select('ol.browse a')
        wordsList = []
        for pl in pageLinks:  # a: page1, page2, ...
            urlPage = urlBASE + pl['href']
            print(urlPage)
            wdLinks = bs(urllib.urlopen(urlPage).read(), 'lxml').select('ol.browse a')
            for wd in wdLinks:  # -able, blue, get, ...
                urlWord = wd['href']
                bsWord = bs(urllib.urlopen(urlWord).read(), 'lxml')
                # No headword on the page: possibly an abbreviation, e.g.
                # /search/atty+gen -> "atty. gen."; try the spelling-help links.
                if bsWord.select('div.headword span.hw') == []:
                    spelling_help = bsWord.select('ol.franklin-spelling-help a')
                    # Accept the suggestion only when it differs from the
                    # original spelling by dots alone.
                    if spelling_help != [] and set(unicode(spelling_help[0].string)) - set(unicode(wd.string)) == set([u'.']):
                        urlWord = urlBASE + spelling_help[0]['href']
                    else:
                        # BUG FIX: was `failURLList += urlWord`, which adds
                        # the URL's characters one by one instead of the URL.
                        failURLList.append(urlWord)
                        continue
                print(urlWord)  # just before getWord
                word = getWord(urlWord, failURLList)
                word['spell'] = unicode(wd.string)
                wordsList.append(word)
                # Other homograph entries of the same word, e.g. /search/blue[2].
                ol_results = bsWord.select('ol.results li.learners a')
                for ol in ol_results:
                    olh = ol['href']
                    olPath = olh[olh.rfind('/') + 1:]
                    if olPath.find(wd.string + '[') >= 0:  # e.g. finds 'blue['
                        url_ol = urlBASE + ol['href']
                        print(url_ol)
                        word2 = getWord(url_ol, failURLList)
                        word2['spell'] = unicode(olPath)
                        wordsList.append(word2)
                del bsWord  # drop the parse tree before the next fetch
            # eof: for wd in wdLinks
        # eof: for pl in pageLinks
        # Pickle one letter's words per file.
        data = open(u'dict3_learners_' + unicode(pg) + u'.data', 'wb')
        cPickle.dump(wordsList, data)
        data.close()
    # eof: for pg in string.ascii_lowercase
    log = open('dict3_learners_.log', 'wb')
    # BUG FIX: encode explicitly — the log is opened in binary mode and
    # writing a unicode object would fail on any non-ASCII URL.
    log.write(unicode(failURLList).encode('utf-8'))
    log.close()
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment