wfwei · December 24, 2015 04:09
diff --git a/crawler.py b/crawler.py
 #!/usr/bin/python
 # coding:utf-8

 import urllib2
 import BeautifulSoup
 import re
 import os

 # Pattern for the ``class`` attribute of the element wrapping an article's photo.
 image_patt = re.compile('.*photo-wrapper')
 # Site root and default ``url_host`` of fetch_news; root-relative links are
 # appended to it, so it ends with a slash.
 HOST = u'http://www.washingtonpost.com/'

 def _read_url(url, timeout):
 	"""Return the raw body of ``url``, closing the connection even on error."""
 	response = urllib2.urlopen(url, timeout=timeout)
 	try:
 		return response.read()
 	finally:
 		# Python 2's urllib2 responses do not support ``with``; close explicitly.
 		response.close()


 def _absolute_url(url_host, link):
 	"""Resolve a root-relative or protocol-relative ``link`` against ``url_host``."""
 	if link.startswith('//'):
 		# Protocol-relative link: reuse the scheme of url_host.
 		scheme = url_host.split('://', 1)[0] if '://' in url_host else 'http'
 		return scheme + ':' + link
 	if link.startswith('/'):
 		# Works whether or not url_host ends with a slash.
 		return url_host.rstrip('/') + link
 	return link


 def _safe_filename(title):
 	"""Turn a page title into a single path component."""
 	# Path separators, characters reserved on common filesystems and control
 	# characters would either break open() or escape the target directory.
 	cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', '_', title).strip(' ._')
 	return cleaned or u'untitled'


 def _image_extension(image_link):
 	"""Return the extension of the image URL's path without the dot ('' if none)."""
 	# Drop the query string and fragment so 'a.jpg?uuid=1' yields 'jpg'.
 	path = image_link.split('?', 1)[0].split('#', 1)[0]
 	return os.path.splitext(path)[1][1:]


 def _save_image(article, file_stem, image_dir, url_host, timeout):
 	"""Download the article's photo, if it has one, into ``image_dir``."""
 	wrapper = article.find(attrs={'class': image_patt})
 	if wrapper is None:
 		return
 	img = wrapper.find('img')
 	image_link = img.get('src') if img is not None else None
 	if not image_link:
 		return
 	image_link = _absolute_url(url_host, image_link)
 	print('images: %s' % image_link)
 	file_type = _image_extension(image_link)
 	if not file_type:
 		return
 	image_file = image_dir + '%s.%s' % (file_stem, file_type)
 	try:
 		# Download first so a failed request does not leave an empty file behind.
 		image_data = _read_url(image_link, timeout)
 		with open(image_file, 'wb') as f:
 			f.write(image_data)
 		print('save image:%s' % image_file)
 	except Exception as e:
 		print('fail to fetch image:%s' % image_link)
 		print(e)


 def _save_text(article, file_stem, news_dir):
 	"""Write the article's paragraphs, one per line and UTF-8 encoded, into ``news_dir``."""
 	body = article.find(id='article')
 	if body is None:
 		print('Invalid page')
 		return
 	news_file = news_dir + '%s.txt' % file_stem
 	print('save news:%s' % news_file)
 	try:
 		with open(news_file, 'wb') as f:
 			for paragraph in body.findAll('p'):
 				# Paragraph text is unicode; encode explicitly instead of relying
 				# on the implicit ASCII conversion, which fails on non-ASCII text.
 				f.write(paragraph.text.encode('utf-8'))
 				f.write(b'\n')
 	except (IOError, OSError) as e:
 		print('fail to save news:%s' % news_file)
 		print(e)


 def fetch_news(url_host=HOST, url_path='politics', local_dir='/home/plex/washingtonpost', timeout=30):
 	"""Crawl one section page and archive the text and photo of each linked article.

 	The page at ``url_host + url_path`` is fetched and every element with class
 	``no-left`` is inspected. Links that contain ``'/' + url_path`` are
 	downloaded; the paragraphs under ``#article`` are written to
 	``<local_dir>/<url_path>/news/<title>.txt`` and the first photo to
 	``<local_dir>/<url_path>/image/<title>.<ext>``. Problems with a single
 	article are printed and the crawl moves on to the next link.

 	Args:
 		url_host: Site root used to resolve relative links.
 		url_path: Section path appended to ``url_host``; also used to filter links.
 		local_dir: Root directory of the local archive; sub-directories are created.
 		timeout: Timeout in seconds applied to every HTTP request.

 	Raises:
 		Whatever ``urllib2.urlopen`` raises if the section page itself cannot
 		be fetched.
 	"""
 	page = _read_url(url_host + url_path, timeout)
 	soup = BeautifulSoup.BeautifulSoup(page)

 	image_dir = '%s/%s/image/' % (local_dir, url_path)
 	news_dir = '%s/%s/news/' % (local_dir, url_path)
 	for directory in (image_dir, news_dir):
 		if not os.path.exists(directory):
 			os.makedirs(directory)

 	for item in soup.findAll(attrs={u'class': u'no-left'}):
 		# Teasers without a usable anchor are skipped instead of crashing the crawl.
 		anchor = item.a
 		href = anchor.get('href') if anchor is not None else None
 		if not href:
 			continue
 		news_link = _absolute_url(url_host, href)
 		if '/' + url_path not in news_link:
 			print('skip news link: %s' % news_link)
 			continue
 		print('processing news link: %s' % news_link)

 		try:
 			news_page = _read_url(news_link, timeout)
 		except Exception as e:
 			print('Fail to fetch news:%s' % news_link)
 			print(e)
 			continue
 		news_soup = BeautifulSoup.BeautifulSoup(news_page)

 		title_tag = news_soup.title
 		news_title = title_tag.text if title_tag is not None else u''
 		print('news title: %s' % news_title)

 		_article = news_soup.find(id=u'article-leaf-page')
 		if _article is None:
 			print('Invalid page')
 			continue

 		# The raw title may contain '/' and similar characters; sanitise it once
 		# and use the result for both output files.
 		file_stem = _safe_filename(news_title)
 		_save_image(_article, file_stem, image_dir, url_host, timeout)
 		_save_text(_article, file_stem, news_dir)

 if __name__ == '__main__':
 	# Script entry point: crawl the politics section into the local archive.
 	fetch_news(
 		HOST,
 		'politics',
 		'/home/plex/washingtonpost',
 	)
	#!/usr/bin/python
	# coding:utf-8

	import urllib2
	import BeautifulSoup
	import re
	import os

	# Pattern for the ``class`` attribute of the element wrapping an article's photo.
	image_patt = re.compile('.*photo-wrapper')
	# Site root and default ``url_host`` of fetch_news; root-relative links are
	# appended to it, so it ends with a slash.
	HOST = u'http://www.washingtonpost.com/'

	def fetch_news(url_host=HOST, url_path='politics', local_dir='/home/plex/washingtonpost', timeout=30):
		"""Crawl one section page and archive the text and photo of each linked article.

		The page at ``url_host + url_path`` is fetched and every element with class
		``no-left`` is inspected. Links that contain ``'/' + url_path`` are
		downloaded; the paragraphs under ``#article`` are written (UTF-8, one per
		line) to ``<local_dir>/<url_path>/news/<title>.txt`` and the first photo to
		``<local_dir>/<url_path>/image/<title>.<ext>``. Problems with a single
		article are printed and the crawl moves on to the next link.

		Args:
			url_host: Site root used to resolve relative links.
			url_path: Section path appended to ``url_host``; also used to filter links.
			local_dir: Root directory of the local archive; sub-directories are created.
			timeout: Timeout in seconds applied to every HTTP request.

		Raises:
			Whatever ``urllib2.urlopen`` raises if the section page itself cannot
			be fetched.
		"""
		def download(url):
			# Read the whole response and always release the connection;
			# Python 2's urllib2 responses do not support ``with``.
			response = urllib2.urlopen(url, timeout=timeout)
			try:
				return response.read()
			finally:
				response.close()

		def absolute(link):
			# Resolve protocol-relative ('//host/x') and root-relative ('/x') links.
			if link.startswith('//'):
				scheme = url_host.split('://', 1)[0] if '://' in url_host else 'http'
				return scheme + ':' + link
			if link.startswith('/'):
				return url_host.rstrip('/') + link
			return link

		soup = BeautifulSoup.BeautifulSoup(download(url_host + url_path))

		image_dir = '%s/%s/image/' % (local_dir, url_path)
		news_dir = '%s/%s/news/' % (local_dir, url_path)
		for directory in (image_dir, news_dir):
			if not os.path.exists(directory):
				os.makedirs(directory)

		for item in soup.findAll(attrs={u'class': u'no-left'}):
			# Teasers without a usable anchor are skipped instead of crashing the crawl.
			anchor = item.a
			href = anchor.get('href') if anchor is not None else None
			if not href:
				continue
			news_link = absolute(href)
			if '/' + url_path not in news_link:
				print('skip news link: %s' % news_link)
				continue
			print('processing news link: %s' % news_link)

			try:
				news_soup = BeautifulSoup.BeautifulSoup(download(news_link))
			except Exception as e:
				print('Fail to fetch news:%s' % news_link)
				print(e)
				continue

			title_tag = news_soup.title
			news_title = title_tag.text if title_tag is not None else u''
			print('news title: %s' % news_title)

			_article = news_soup.find(id=u'article-leaf-page')
			if _article is None:
				print('Invalid page')
				continue

			# Titles may contain '/' and other characters that are unsafe in a
			# file name; replace them before building any path.
			file_stem = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', '_', news_title).strip(' ._') or u'untitled'

			image = _article.find(attrs={'class': image_patt})
			img = image.find('img') if image is not None else None
			image_link = img.get('src') if img is not None else None
			if image_link:
				image_link = absolute(image_link)
				print('images: %s' % image_link)
				# Take the extension from the URL path only, ignoring any query
				# string or fragment (e.g. 'photo.jpg?uuid=1' -> 'jpg').
				url_without_query = image_link.split('?', 1)[0].split('#', 1)[0]
				file_type = os.path.splitext(url_without_query)[1][1:]
				if file_type:
					image_file = image_dir + '%s.%s' % (file_stem, file_type)
					try:
						# Download before opening the file so a failed request
						# does not leave an empty image behind.
						image_data = download(image_link)
						with open(image_file, 'wb') as f:
							f.write(image_data)
						print('save image:%s' % image_file)
					except Exception as e:
						print('fail to fetch image:%s' % image_link)
						print(e)

			body = _article.find(id='article')
			if body is None:
				print('Invalid page')
				continue
			news_file = news_dir + '%s.txt' % file_stem
			print('save news:%s' % news_file)
			try:
				with open(news_file, 'wb') as f:
					for content in body.findAll('p'):
						# Paragraph text is unicode; encode explicitly instead of
						# relying on the implicit ASCII conversion.
						f.write(content.text.encode('utf-8'))
						f.write(b'\n')
			except (IOError, OSError) as e:
				print('fail to save news:%s' % news_file)
				print(e)

	if __name__ == '__main__':
		# Script entry point: crawl the politics section into the local archive.
		fetch_news(
			url_host=HOST,
			url_path='politics',
			local_dir='/home/plex/washingtonpost',
		)
No results found