dict3 yeeyan: two Python 2 scrapers for yeeyan.org bilingual articles. The first pairs English originals with their Chinese translations; the second pairs Japanese originals with their Chinese translations.
#!/usr/bin/env python
# encoding: utf-8
import re
import urllib
import cPickle
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString
def getArticle(str_id):
    """Fetch one bilingual article page.

    :param str_id: numeric article id, as a string
    :return: (user, contentC, contentE), i.e. translator name, Chinese text, English text
    """
    url = r'http://article.yeeyan.org/bilingual/' + str(str_id)
    print url
    bsArticle = bs(urllib.urlopen(url).read())
    try:
        user = unicode(bsArticle.find('span', id='user_info').find('a').get_text())
    except AttributeError:
        user = u''
    # todo: the 1st sentence of contentC may be the title.
    contentC = u''
    contentE = u''
    try:
        # Skip only the first "译者" (translator) credit line, then join the
        # remaining text nodes, appending <br/> after anything longer than
        # a few characters.
        flagAuthor = True
        for node in bsArticle.find('div', id='left').contents:
            node_text = unicode(node) if isinstance(node, NavigableString) else unicode(node.get_text())
            if flagAuthor and node_text.find(u'译者') != -1:
                flagAuthor = False
                continue
            contentC += (node_text.strip() + u'<br/>') if len(node_text) > 5 else node_text.strip()
        for node in bsArticle.find('div', id='right').contents:
            node_text = unicode(node) if isinstance(node, NavigableString) else unicode(node.get_text())
            contentE += (node_text.strip() + u'<br/>') if len(node_text) > 5 else node_text.strip()
    except AttributeError:
        # page without the bilingual left/right layout
        pass
    return user, contentC, contentE
def getPage(url, genre):
    """Scrape one listing page; return a list of article dicts."""
    bsPage = bs(urllib.urlopen(url).read())
    boxes = bsPage.find_all('div', class_='box clearfix')
    pageList = []
    for box in boxes:
        try:
            # EN: some boxes have no titleC and raise here, e.g. on
            # http://source.yeeyan.org/index/en/business/published/all/16
            titleE = unicode(box.find('h4').get_text())
            titleC = unicode(box.find('div', class_='y_blue articles').find('a').get_text())
            date_raw = re.search(r'\s\d{4}-\d{2}-\d{2}\s',
                                 box.find('div', class_='publicMiddleLine').find('span').get_text())
            date = unicode(date_raw.group(0).strip())
            url = box.find('div', class_='y_blue articles').find('a')['href']
            str_id = url[url.rfind('/') + 1:]
            try:
                user, contentC, contentE = getArticle(str_id)
            except Exception:
                failURLList.append(unicode(str_id) + u'\n')
                continue
            # EN: early articles have no English original; skip them
            if len(contentC) < 28 or len(contentE) < 28:
                continue
            article = dict(
                titleE=titleE,
                titleC=titleC,
                date=date,
                str_id=str_id,
                user=user,
                genre=genre,
                contentC=contentC,
                contentE=contentE,
            )
            pageList.append(article)
        except Exception:
            continue
    return pageList
if __name__ == "__main__":
    # genres=['business','sport','nature','tech','society','life','culture','health']
    genres = ['tech']
    url = r'http://source.yeeyan.org/index/en/%s/published/all/'
    failURLList = []
    for genre in genres:
        genreList = []
        genre_url = url % genre
        bsGenre = bs(urllib.urlopen(genre_url).read())
        # the next-to-last <li> of the pager holds the last page number
        liList = bsGenre.find('ul', class_='y_page').find_all('li')
        NPage = int(liList[-2].get_text().strip())
        NpercPickle = 300
        for page in range(1, NPage + 1):  # [1, ..., NPage]
            urlPage = genre_url + str(page)
            print '-------------page:' + str(page) + '----------------'
            print urlPage
            print '-----------------------------------------'
            pageList = getPage(urlPage, unicode(genre))
            genreList += pageList
            if page % NpercPickle == 0:
                # EN: cPickle a chunk every NpercPickle pages, then start a
                # fresh list so the dumped one can be garbage-collected
                data_yeeyan = open(u'dict3_yeeyan_' + unicode(genre) + u'_' + unicode(page) + u'_all_' + unicode(NPage) + u'.data', 'wb')
                cPickle.dump(genreList, data_yeeyan)
                data_yeeyan.close()
                genreList = []
        # cPickle the remaining tail of this genre
        data_yeeyan = open(u'dict3_yeeyan_' + unicode(genre) + u'_' + unicode(page + 1) + u'_all_' + unicode(NPage) + u'.data', 'wb')
        cPickle.dump(genreList, data_yeeyan)
        data_yeeyan.close()
    log = open('dict3_yeeyan_EN.log', 'w+')
    log.write(unicode(failURLList))
    log.close()
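For reference, a minimal sketch of reading one of the pickled chunks back. The filename below is an assumption that follows the dict3_yeeyan_<genre>_<page>_all_<NPage>.data pattern written above; substitute a file your run actually produced.

#!/usr/bin/env python
# encoding: utf-8
import cPickle

# hypothetical filename: use a real chunk written by the EN scraper above
with open('dict3_yeeyan_tech_300_all_415.data', 'rb') as f:
    articles = cPickle.load(f)
# each entry is the dict built in getPage()
for a in articles[:3]:
    print a['date'], a['str_id'], a['titleE']
    print a['contentC'][:80].encode('utf-8')

The Japanese edition of the same scraper follows. It differs mainly in the listing URL (ja instead of en), the explicit 'lxml' parser, the titleJ/contentJ keys, and one pickle per genre instead of 300-page chunks.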
#!/usr/bin/env python
# encoding: utf-8
import re
import urllib
import cPickle
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString
def getArticle(str_id):
    """Fetch one bilingual article page.

    :param str_id: numeric article id, as a string
    :return: (user, contentC, contentJ), i.e. translator name, Chinese text, Japanese text
    """
    url = r'http://article.yeeyan.org/bilingual/' + str(str_id)
    print url
    bsArticle = bs(urllib.urlopen(url).read(), 'lxml')
    try:
        user = unicode(bsArticle.find('span', id='user_info').find('a').get_text())
    except AttributeError:
        user = u''
    # todo: the 1st sentence of contentC may be the title.
    contentC = u''
    contentJ = u''
    try:
        # Skip only the first "译者" (translator) credit line, then join the
        # remaining text nodes, appending <br/> after anything longer than
        # a few characters.
        flagAuthor = True
        for node in bsArticle.find('div', id='left').contents:
            node_text = unicode(node) if isinstance(node, NavigableString) else unicode(node.get_text())
            if flagAuthor and node_text.find(u'译者') != -1:
                flagAuthor = False
                continue
            contentC += (node_text.strip() + u'<br/>') if len(node_text) > 5 else node_text.strip()
        for node in bsArticle.find('div', id='right').contents:
            node_text = unicode(node) if isinstance(node, NavigableString) else unicode(node.get_text())
            contentJ += (node_text.strip() + u'<br/>') if len(node_text) > 5 else node_text.strip()
    except AttributeError:
        # page without the bilingual left/right layout
        pass
    return user, contentC, contentJ
def getPage(url, genre):
    """Scrape one listing page; return a list of article dicts."""
    bsPage = bs(urllib.urlopen(url).read(), 'lxml')
    boxes = bsPage.find_all('div', class_='box clearfix')
    pageList = []
    for box in boxes:
        try:
            # JP: some boxes have no titleC and raise here, e.g. on
            # http://source.yeeyan.org/index/ja/business/published/all/33
            titleJ = unicode(box.find('h4').get_text())
            titleC = unicode(box.find('div', class_='y_blue articles').find('a').get_text())
            date_raw = re.search(r'\s\d{4}-\d{2}-\d{2}\s',
                                 box.find('div', class_='publicMiddleLine').find('span').get_text())
            date = unicode(date_raw.group(0).strip())
            url = box.find('div', class_='y_blue articles').find('a')['href']
            str_id = url[url.rfind('/') + 1:]
            try:
                user, contentC, contentJ = getArticle(str_id)
            except Exception:
                failURLList.append(unicode(str_id) + u'\n')
                continue
            article = dict(
                titleJ=titleJ,
                titleC=titleC,
                date=date,
                str_id=str_id,
                user=user,
                genre=genre,
                contentC=contentC,
                contentJ=contentJ,
            )
            pageList.append(article)
        except Exception:
            continue
    return pageList
# genres=['business','sport','nature','life','tech','culture','health']
genres = ['society']
url = r'http://source.yeeyan.org/index/ja/%s/published/all/'
failURLList = []
for genre in genres:
    genreList = []
    genre_url = url % genre
    bsGenre = bs(urllib.urlopen(genre_url).read(), 'lxml')
    # the next-to-last <li> of the pager holds the last page number
    liList = bsGenre.find('ul', class_='y_page').find_all('li')
    NPage = int(liList[-2].get_text().strip())
    for page in range(1, NPage + 1):  # [1, ..., NPage]
        urlPage = genre_url + str(page)
        print '-------------page:' + str(page) + '----------------'
        print urlPage
        print '-----------------------------------------'
        pageList = getPage(urlPage, unicode(genre))
        genreList += pageList
    # cPickle one file per genre
    data_yeeyan = open(u'dict3_yeeyan_JP_' + unicode(genre) + u'.data', 'wb')
    cPickle.dump(genreList, data_yeeyan)
    data_yeeyan.close()
log = open('dict3_yeeyan_JP.log', 'w')
log.write(unicode(failURLList))
log.close()

# test_str_id='332739'
# getArticle(test_str_id)
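Since the log is written as unicode(failURLList), the .log file holds the repr of a Python list of id strings. A minimal sketch of reading the failed ids back for a retry pass, assuming the log was produced by the JP run above:

#!/usr/bin/env python
# encoding: utf-8
import ast

# the log body looks like [u'332739\n', u'332740\n'], i.e. a list repr
with open('dict3_yeeyan_JP.log') as f:
    failed_ids = [s.strip() for s in ast.literal_eval(f.read())]
print failed_ids  # feed these back into getArticle() from the script above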