@taesiri, last active July 8, 2017
Hacky script for gathering some textual data!
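In outline: the script walks the first nine pages of the farsnews.com top-news listing, collects every link that points at newstext.php, downloads each article, pulls out the headline (h1.nwstxtinfotitle) and body paragraphs (p.rtejustify) with BeautifulSoup, and appends everything to all_news.txt. It targets Python 2 (urllib2, the unicode() builtin); a Python 3 sketch follows the script.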
# Python 2 script: it relies on urllib2 and the unicode() builtin.
from __future__ import print_function
import io
import urllib2
import httplib2
from bs4 import BeautifulSoup
fars_news_website = 'http://www.farsnews.com'
top_pages = 'http://www.farsnews.com/topnews?i='  # paginated top-news listing
def extract_news(nurl):
    """Fetch one article page and return its headline plus body text."""
    req = urllib2.urlopen(fars_news_website + nurl)
    content = req.read()
    # The page charset is declared in the Content-Type response header.
    encoding = req.headers['content-type'].split('charset=')[-1]
    ucontent = unicode(content, encoding)
    soup = BeautifulSoup(ucontent, "html.parser")
    news_header = soup.findAll("h1", {"class": "nwstxtinfotitle"})
    all_ps = soup.findAll("p", {"class": "rtejustify"})
    try:
        news_item = news_header[0].text + "\n" + " \n"
    except IndexError:
        # No headline on this page, skip it.
        return ''
    for ptag in all_ps:
        news_item += ptag.text + "\n"
    news_item += "\n\n\n\n"  # blank lines separate consecutive articles
    return news_item
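# Example (the query string here is hypothetical; real links come from
# extract_links below):
#   text = extract_news('/newstext.php?nn=...')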
def extract_links(page_number):
    """Collect the article links found on one top-news listing page."""
    links = []
    http = httplib2.Http()
    # httplib2 returns a (response, content) pair; the body is the second item.
    resp, content = http.request(top_pages + str(page_number))
    soup = BeautifulSoup(content, "html.parser")
    # Keep only anchors that point at an article page.
    for a in soup.findAll('a', href=True):
        if 'newstext.php' in a['href']:
            links.append(a['href'])
    return links
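# Example: extract_links(1) scrapes http://www.farsnews.com/topnews?i=1 and
# returns the hrefs of its 'newstext.php' anchors.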
# Crawl the first nine listing pages, then download every article found.
all_links = []
all_news = ""
for i in range(1, 10):
    all_links += extract_links(i)
for link in all_links:
    print('working on', link)
    all_news += extract_news(link)
with io.open("all_news.txt", 'w', encoding='utf8') as f:
    f.write(all_news)
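For anyone running this today: here is a minimal sketch of the same fetch-and-decode step on Python 3, assuming the server still declares a charset in its Content-Type header (the fetch_soup name is mine, not from the original script). urllib2 becomes urllib.request, and unicode(content, encoding) becomes bytes.decode():

import urllib.request
from bs4 import BeautifulSoup

def fetch_soup(url):
    # Fall back to UTF-8 when no charset is declared.
    with urllib.request.urlopen(url) as resp:
        charset = resp.headers.get_content_charset() or 'utf-8'
        return BeautifulSoup(resp.read().decode(charset, errors='replace'),
                             "html.parser")

Note that the CSS classes the script matches (nwstxtinfotitle, rtejustify) reflect the 2017 farsnews.com markup and may no longer exist on the live site.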