dict3 yeeyan: two Python 2 scrapers that collect bilingual (English-Chinese and Japanese-Chinese) article pairs from yeeyan.org and pickle them by genre.
  
        
  
    
    
  
  
    
#!/usr/bin/env python
# encoding: utf-8
# Scrape English-Chinese bilingual articles from yeeyan.org, genre by genre,
# and pickle each batch as a list of dicts (Python 2).
import urllib
import re
import cPickle
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString


def getArticle(str_id):
    """Fetch one bilingual article page.

    :param str_id: numeric article id as a string
    :return: (user, contentC, contentE)
    """
    url = r'http://article.yeeyan.org/bilingual/' + str(str_id)
    print url
    bsArticle = bs(urllib.urlopen(url).read(), 'lxml')
    # title = unicode(bsArticle.find('title').get_text())
    try:
        user = unicode(bsArticle.find('span', id='user_info').find('a').get_text())
    except:
        user = u''
    # Earlier <p>-based extraction, kept for reference:
    # left_raw = bsArticle.find('div', id='left').find_all('p')
    # if len(left_raw) == 1:  # keep <br/>
    #     left_raw = left_raw.contents
    #     contentC = u''
    #     for node in left_raw:
    #         if isinstance(node, NavigableString):
    #             contentC += node.get_text()
    #         elif node == bs('<br/>'):
    #             contentC += u'<br/>'
    #         else:
    #             contentC += node.get_text()
    # else:
    #     contentC = u'<br/>'.join([para for para in [unicode(p.get_text()) for p in left_raw] if para.strip()])
    # right_raw = bsArticle.find('div', id='right')
    # contentE = u'<br/>'.join([para for para in [unicode(p.get_text()) for p in right_raw.find_all('p')] if para.strip()])
    # todo: the 1st sentence of contentC may be the title.
    contentC = u''
    contentE = u''
    try:
        flagAuthor = True  # skip the one-off translator credit line ("译者")
        for node in bsArticle.find('div', id='left').contents:
            node_text = node.string if isinstance(node, NavigableString) else unicode(node.get_text())
            if flagAuthor and node_text.find(u'译者') != -1:
                flagAuthor = False
                continue
            # real paragraphs (longer than 5 chars) get a <br/> separator
            contentC += (node_text.strip() + u'<br/>') if len(node_text) > 5 else node_text.strip()
        for node in bsArticle.find('div', id='right').contents:
            node_text = node.string if isinstance(node, NavigableString) else unicode(node.get_text())
            contentE += (node_text.strip() + u'<br/>') if len(node_text) > 5 else node_text.strip()
    except:
        pass
    return user, contentC, contentE


def getPage(url, genre):
    """Scrape one listing page and return a list of article dicts."""
    bsPage = bs(urllib.urlopen(url).read(), 'lxml')
    boxes = bsPage.find_all('div', class_='box clearfix')
    pageList = []
    for box in boxes:
        try:
            # EN: some boxes lack titleC, e.g. http://source.yeeyan.org/index/en/business/published/all/16
            titleE = unicode(box.find('h4').get_text())
            titleC = unicode(box.find('div', class_='y_blue articles').find('a').get_text())
            date_raw = re.search(r'\s\d{4}-\d{2}-\d{2}\s',
                                 box.find('div', class_='publicMiddleLine').find('span').get_text())
            date = unicode(date_raw.group(0).strip())
            url = box.find('div', class_='y_blue articles').find('a')['href']
            str_id = url[url.rfind('/') + 1:]
            try:
                user, contentC, contentE = getArticle(str_id)
            except:
                failURLList.append(unicode(str_id) + u'\n')
                continue
            # EN: early articles have no English original; skip them
            if len(contentC) < 28 or len(contentE) < 28:
                continue
            article = dict(
                titleE=titleE,
                titleC=titleC,
                date=date,
                str_id=str_id,
                user=user,
                genre=genre,
                contentC=contentC,
                contentE=contentE,
            )
            pageList.append(article)
            # print article['contentE'][1:50]
        except:
            continue
    return pageList


if __name__ == "__main__":
    # genres = ['business', 'sport', 'nature', 'tech', 'society', 'life', 'culture', 'health']
    genres = ['tech']
    url = r'http://source.yeeyan.org/index/en/%s/published/all/'
    failURLList = []
    for genre in genres:
        genreList = []
        genre_url = url % genre
        bsGenre = bs(urllib.urlopen(genre_url).read(), 'lxml')
        # the second-to-last <li> of the pager holds the total page count
        liList = bsGenre.find('ul', class_='y_page').find_all('li')
        NPage = int(liList[-2].get_text().strip())
        NpercPickle = 300
        for page in range(1, NPage + 1):  # [1, ..., NPage]
            urlPage = genre_url + str(page)
            print '-------------page:' + str(page) + '----------------'
            print urlPage
            print '-----------------------------------------'
            pageList = getPage(urlPage, unicode(genre))
            genreList += pageList
            if page % NpercPickle == 0:
                # EN: pickle every NpercPickle pages, then start a fresh batch
                genreList2del = genreList
                data_yeeyan = open(u'dict3_yeeyan_' + unicode(genre) + u'_' + unicode(page)
                                   + u'_all_' + unicode(NPage) + u'.data', 'wb')
                cPickle.dump(genreList, data_yeeyan)
                data_yeeyan.close()
                genreList = []
                # EN: seems not needed since its reference count becomes 0
                del genreList2del
        # pickle whatever is left after the last full batch
        data_yeeyan = open(u'dict3_yeeyan_' + unicode(genre) + u'_' + unicode(page + 1)
                           + u'_all_' + unicode(NPage) + u'.data', 'wb')
        cPickle.dump(genreList, data_yeeyan)
        data_yeeyan.close()
    # one failed article id per line (each entry already ends with u'\n')
    log = open('dict3_yeeyan_EN.log', 'w+')
    log.write(u''.join(failURLList))
    log.close()
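
Each `.data` file the script writes holds one cPickle'd list of article dicts (keys: titleE, titleC, date, str_id, user, genre, contentC, contentE). A minimal sketch of loading a genre back for later processing, assuming the default filename pattern produced by the open(...) calls above; the glob pattern is the only name introduced here:

import cPickle
import glob

# gather every batch file the EN scraper wrote for one genre
articles = []
for fname in sorted(glob.glob('dict3_yeeyan_tech_*.data')):
    with open(fname, 'rb') as f:
        articles += cPickle.load(f)  # each file holds a single pickled list

# quick sanity check on the first few records
for a in articles[:3]:
    print a['date'], a['titleE'][:40]
    print a['contentC'][:60]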
  
    
    
  
  
    
#!/usr/bin/env python
# encoding: utf-8
# Same scraper, for Japanese-Chinese bilingual articles on yeeyan.org (Python 2).
import urllib
import re
import cPickle
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString


def getArticle(str_id):
    """Fetch one bilingual article page.

    :param str_id: numeric article id as a string
    :return: (user, contentC, contentJ)
    """
    url = r'http://article.yeeyan.org/bilingual/' + str(str_id)
    print url
    bsArticle = bs(urllib.urlopen(url).read(), 'lxml')
    # title = unicode(bsArticle.find('title').get_text())
    try:
        user = unicode(bsArticle.find('span', id='user_info').find('a').get_text())
    except:
        user = u''
    # Earlier <p>-based extraction, kept for reference:
    # left_raw = bsArticle.find('div', id='left').find_all('p')
    # if len(left_raw) == 1:  # keep <br/>
    #     left_raw = left_raw.contents
    #     contentC = u''
    #     for node in left_raw:
    #         if isinstance(node, NavigableString):
    #             contentC += node.get_text()
    #         elif node == bs('<br/>'):
    #             contentC += u'<br/>'
    #         else:
    #             contentC += node.get_text()
    # else:
    #     contentC = u'<br/>'.join([para for para in [unicode(p.get_text()) for p in left_raw] if para.strip()])
    # right_raw = bsArticle.find('div', id='right')
    # contentJ = u'<br/>'.join([para for para in [unicode(p.get_text()) for p in right_raw.find_all('p')] if para.strip()])
    # todo: the 1st sentence of contentC may be the title.
    contentC = u''
    contentJ = u''
    try:
        flagAuthor = True  # skip the one-off translator credit line ("译者")
        for node in bsArticle.find('div', id='left').contents:
            node_text = node.string if isinstance(node, NavigableString) else unicode(node.get_text())
            if flagAuthor and node_text.find(u'译者') != -1:
                flagAuthor = False
                continue
            # real paragraphs (longer than 5 chars) get a <br/> separator
            contentC += (node_text.strip() + u'<br/>') if len(node_text) > 5 else node_text.strip()
        for node in bsArticle.find('div', id='right').contents:
            node_text = node.string if isinstance(node, NavigableString) else unicode(node.get_text())
            contentJ += (node_text.strip() + u'<br/>') if len(node_text) > 5 else node_text.strip()
    except:
        pass
    return user, contentC, contentJ


def getPage(url, genre):
    """Scrape one listing page and return a list of article dicts."""
    bsPage = bs(urllib.urlopen(url).read(), 'lxml')
    boxes = bsPage.find_all('div', class_='box clearfix')
    pageList = []
    for box in boxes:
        try:
            # JP: some boxes lack titleC, e.g. http://source.yeeyan.org/index/ja/business/published/all/33
            titleJ = unicode(box.find('h4').get_text())
            titleC = unicode(box.find('div', class_='y_blue articles').find('a').get_text())
            date_raw = re.search(r'\s\d{4}-\d{2}-\d{2}\s',
                                 box.find('div', class_='publicMiddleLine').find('span').get_text())
            date = unicode(date_raw.group(0).strip())
            url = box.find('div', class_='y_blue articles').find('a')['href']
            str_id = url[url.rfind('/') + 1:]
            try:
                user, contentC, contentJ = getArticle(str_id)
            except:
                failURLList.append(unicode(str_id) + u'\n')
                continue
            article = dict(
                titleJ=titleJ,
                titleC=titleC,
                date=date,
                str_id=str_id,
                user=user,
                genre=genre,
                contentC=contentC,
                contentJ=contentJ,
            )
            pageList.append(article)
            # print article['contentJ'][1:50]
        except:
            continue
    return pageList


if __name__ == "__main__":
    # genres = ['culture', 'health']  # others: 'business', 'sport', 'nature', 'life', 'tech'
    genres = ['society']
    url = r'http://source.yeeyan.org/index/ja/%s/published/all/'
    failURLList = []
    for genre in genres:
        genreList = []
        genre_url = url % genre
        bsGenre = bs(urllib.urlopen(genre_url).read(), 'lxml')
        # the second-to-last <li> of the pager holds the total page count
        liList = bsGenre.find('ul', class_='y_page').find_all('li')
        NPage = int(liList[-2].get_text().strip())
        for page in range(1, NPage + 1):  # [1, ..., NPage]
            urlPage = genre_url + str(page)
            print '-------------page:' + str(page) + '----------------'
            print urlPage
            print '-----------------------------------------'
            pageList = getPage(urlPage, unicode(genre))
            genreList += pageList
        # pickle the whole genre in one file
        data_yeeyan = open(u'dict3_yeeyan_JP_' + unicode(genre) + u'.data', 'wb')
        cPickle.dump(genreList, data_yeeyan)
        data_yeeyan.close()
    # one failed article id per line (each entry already ends with u'\n')
    log = open('dict3_yeeyan_JP.log', 'w')
    log.write(u''.join(failURLList))
    log.close()
    # test_str_id = '332739'
    # getArticle(test_str_id)
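
The JP log ends up with one failed article id per line, so a later pass can retry just those instead of re-crawling whole listing pages. A hedged sketch, reusing getArticle from the script above (it assumes this snippet runs in the same module or after importing it; the retry output filename is hypothetical):

import cPickle

# re-fetch the articles that failed during the main crawl
retried = []
with open('dict3_yeeyan_JP.log') as log:
    for line in log:
        str_id = line.strip()
        if not str_id:
            continue
        try:
            user, contentC, contentJ = getArticle(str_id)
        except:
            continue  # still failing; leave it for the next pass
        retried.append(dict(str_id=str_id, user=user,
                            contentC=contentC, contentJ=contentJ))

with open('dict3_yeeyan_JP_retry.data', 'wb') as f:  # hypothetical output name
    cPickle.dump(retried, f)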
  