taesiri · July 8, 2017 11:19
diff --git a/farsnews_harvester.py b/farsnews_harvester.py
 from __future__ import print_function
 import urllib2
 import HTMLParser
 import os
 import sys
 import httplib2
 import io
 from bs4 import BeautifulSoup

 # Site root; extract_news() prepends it to the article href it is given.
 fars_news_website =  'http://www.farsnews.com'
 # Listing URL prefix; extract_links() appends the page number to it.
 top_pages = 'http://www.farsnews.com/topnews?i='

 def extract_news(nurl):
    """Download one article page and return its headline and body as text.

    Args:
        nurl: Article href; it is appended to ``fars_news_website`` to form
            the URL that is fetched.

    Returns:
        The text of the first ``h1.nwstxtinfotitle`` element, a separator
        line, the text of every ``p.rtejustify`` element (one per line) and
        four trailing newlines.  Returns ``''`` when the page has no such
        headline.
    """
    req = urllib2.urlopen(fars_news_website + nurl)
    try:
        content = req.read()
        content_type = req.headers.get('content-type') or ''
    finally:
        # Release the connection even when the read fails.
        req.close()

    # Pick the charset parameter out of the Content-Type header.  When the
    # server omits it, fall back to UTF-8 instead of handing the whole header
    # value (e.g. "text/html") to the decoder, which would raise LookupError.
    encoding = 'utf-8'
    for param in content_type.split(';')[1:]:
        key, _, value = param.partition('=')
        if key.strip().lower() == 'charset' and value.strip():
            encoding = value.strip().strip('"\'')
            break

    try:
        # bytes.decode() is the equivalent of unicode(content, encoding).
        ucontent = content.decode(encoding)
    except LookupError:
        # The advertised codec name is unknown to Python.
        ucontent = content.decode('utf-8')

    soup = BeautifulSoup(ucontent, "html.parser")
    news_header = soup.findAll("h1", {"class": "nwstxtinfotitle"})
    if not news_header:
        # Not an article page (or the markup changed): contribute nothing.
        return ''
    all_ps = soup.findAll("p", {"class": "rtejustify"})

    pieces = [news_header[0].text + "\n" + " \n"]
    pieces.extend(ptag.text + "\n" for ptag in all_ps)
    # Four newlines separate consecutive articles in the output file.
    pieces.append("\n" * 4)
    return "".join(pieces)


 def extract_links(page_nuber):
    """Collect the article hrefs found on one listing page.

    ``page_nuber`` is converted with ``str`` and appended to ``top_pages`` to
    build the URL.  Every ``<a>`` whose ``href`` contains ``newstext.php`` is
    returned, in document order, as a list of href strings.
    """
    client = httplib2.Http()
    _, body = client.request(top_pages + str(page_nuber))
    anchors = BeautifulSoup(body, "html.parser").findAll('a', href=True)
    return [anchor['href'] for anchor in anchors
            if 'newstext.php' in anchor['href']]


 # Gather article links from listing pages 1..9.  A listing may link the same
 # article more than once, so each href is kept only the first time it is
 # seen; otherwise the article would be downloaded and written repeatedly.
 all_links = []
 _seen_links = set()
 for i in range(1, 10):
    for link in extract_links(i):
        if link not in _seen_links:
            _seen_links.add(link)
            all_links.append(link)

 # Download every article, collecting the pieces for a single join.
 _news_parts = []
 for link in all_links:
    print('working on', link)
    _news_parts.append(extract_news(link))

 # Joining on u"" keeps the result a unicode string even when nothing was
 # harvested; under Python 2, io.open(...).write() rejects a byte string.
 all_news = u"".join(_news_parts)

 with io.open("all_news.txt", 'w', encoding='utf8') as f:
    f.write(all_news)
	from __future__ import print_function
	import urllib2
	import HTMLParser
	import os
	import sys
	import httplib2
	import io
	from bs4 import BeautifulSoup

	# Site root; extract_news() prepends it to the article href it is given.
	fars_news_website = 'http://www.farsnews.com'
	# Listing URL prefix; extract_links() appends the page number to it.
	top_pages = 'http://www.farsnews.com/topnews?i='

	def extract_news(nurl):
	    """Download one article page and return its headline and body as text.

	    Args:
	        nurl: Article href; it is appended to ``fars_news_website`` to
	            form the URL that is fetched.

	    Returns:
	        The text of the first ``h1.nwstxtinfotitle`` element, a separator
	        line, the text of every ``p.rtejustify`` element (one per line)
	        and four trailing newlines.  Returns ``''`` when the page has no
	        such headline.
	    """
	    req = urllib2.urlopen(fars_news_website + nurl)
	    try:
	        content = req.read()
	        content_type = req.headers.get('content-type') or ''
	    finally:
	        # Release the connection even when the read fails.
	        req.close()

	    # Pick the charset parameter out of the Content-Type header.  When the
	    # server omits it, fall back to UTF-8 instead of handing the whole
	    # header value (e.g. "text/html") to the decoder.
	    encoding = 'utf-8'
	    for param in content_type.split(';')[1:]:
	        key, _, value = param.partition('=')
	        if key.strip().lower() == 'charset' and value.strip():
	            encoding = value.strip().strip('"\'')
	            break

	    try:
	        # bytes.decode() is the equivalent of unicode(content, encoding).
	        ucontent = content.decode(encoding)
	    except LookupError:
	        # The advertised codec name is unknown to Python.
	        ucontent = content.decode('utf-8')

	    soup = BeautifulSoup(ucontent, "html.parser")
	    news_header = soup.findAll("h1", {"class": "nwstxtinfotitle"})
	    if not news_header:
	        # Not an article page (or the markup changed): contribute nothing.
	        return ''
	    all_ps = soup.findAll("p", {"class": "rtejustify"})

	    pieces = [news_header[0].text + "\n" + " \n"]
	    pieces.extend(ptag.text + "\n" for ptag in all_ps)
	    # Four newlines separate consecutive articles in the output file.
	    pieces.append("\n" * 4)
	    return "".join(pieces)


	def extract_links(page_nuber):
	    """Collect the article hrefs found on one listing page.

	    ``page_nuber`` is converted with ``str`` and appended to ``top_pages``
	    to build the URL.  Every ``<a>`` whose ``href`` contains
	    ``newstext.php`` is returned, in document order, as a list of hrefs.
	    """
	    http = httplib2.Http()
	    # Only the response body is needed; the status/headers object is unused.
	    _, response = http.request(top_pages + str(page_nuber))
	    soup = BeautifulSoup(response, "html.parser")
	    return [a['href'] for a in soup.findAll('a', href=True)
	            if 'newstext.php' in a['href']]


	# Gather article links from listing pages 1..9.  A listing may link the
	# same article more than once, so each href is kept only the first time it
	# is seen; otherwise the article would be downloaded and written repeatedly.
	all_links = []
	_seen_links = set()
	for i in range(1, 10):
	    for link in extract_links(i):
	        if link not in _seen_links:
	            _seen_links.add(link)
	            all_links.append(link)

	# Download every article, collecting the pieces for a single join.
	_news_parts = []
	for link in all_links:
	    print('working on', link)
	    _news_parts.append(extract_news(link))

	# Joining on u"" keeps the result a unicode string even when nothing was
	# harvested; under Python 2, io.open(...).write() rejects a byte string.
	all_news = u"".join(_news_parts)

	with io.open("all_news.txt", 'w', encoding='utf8') as f:
	    f.write(all_news)