@vchahun
Created April 14, 2011 22:36
Scrape links from Google News
import sys
import multiprocessing
from urllib2 import urlopen, URLError
import chardet
import urlparse, urllib
def fixurl(url):
    # turn string into unicode
    if not isinstance(url, unicode):
        url = url.decode('utf8')
    # parse it
    parsed = urlparse.urlsplit(url)
    # divide the netloc further (rpartition so that a URL without '@'
    # keeps the whole netloc in hostport rather than in userpass)
    userpass, at, hostport = parsed.netloc.rpartition('@')
    user, colon1, pass_ = userpass.partition(':')
    host, colon2, port = hostport.partition(':')
    # encode each component
    scheme = parsed.scheme.encode('utf8')
    user = urllib.quote(user.encode('utf8'))
    colon1 = colon1.encode('utf8')
    pass_ = urllib.quote(pass_.encode('utf8'))
    at = at.encode('utf8')
    host = host.encode('idna')
    colon2 = colon2.encode('utf8')
    port = port.encode('utf8')
    path = '/'.join(  # could be encoded slashes!
        urllib.quote(urllib.unquote(pce).encode('utf8'), '')
        for pce in parsed.path.split('/')
    )
    query = urllib.quote(urllib.unquote(parsed.query).encode('utf8'), '=&?/')
    fragment = urllib.quote(urllib.unquote(parsed.fragment).encode('utf8'))
    # put it back together
    netloc = ''.join((user, colon1, pass_, at, host, colon2, port))
    return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
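
# For illustration (not part of the original gist): fixurl() turns a unicode URL
# containing non-ASCII characters into a plain ASCII byte string, e.g.
#   fixurl(u'http://www.example.com/caf\xe9?q=na\xefve')
#   -> 'http://www.example.com/caf%C3%A9?q=na%C3%AFve'
# The path and query are percent-encoded as UTF-8 and the hostname is IDNA-encoded.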

def getArticle(link):
    (i, _, _, _, url) = link
    print "Requesting #%d [%s]..." % (i, url)
    html = u""
    try:
        article = urlopen(fixurl(url), timeout=10).read()
        if article:
            # guess the page encoding and decode to unicode
            encoding = chardet.detect(article)['encoding']
            if encoding:
                html = unicode(article, encoding)
    except UnicodeEncodeError as e:
        print "Failed encoding #%d (%s)" % (i, str(e))
    except URLError as e:
        print "Failed download #%d (%s)" % (i, str(e))
    except Exception as e:
        print "Unknown exception #%d (%s)" % (i, str(e))
    return (link, html)

def scrape(fileName):
    links = []
    with open(fileName) as f:
        for l in f:
            # tab-separated line: id, _, title, source, _, date, _, url
            (i, _, title, source, _, date, _, url) = l[:-1].decode("utf8").split('\t')
            i = int(i)
            links.append((i, title, source, date, url))
    # fetch the articles in parallel
    pool = multiprocessing.Pool(10)
    articles = pool.map(getArticle, links)
    print "##### Inserting into DB..."
    import sqlite3
    # assumes articles.db already contains an articles table
    cnx = sqlite3.connect("articles.db")
    cur = cnx.cursor()
    cur.executemany("""insert into articles(id, date, title, source, url, html)
                       values(?, ?, ?, ?, ?, ?)""",
                    [(i, date, title, source, url, html)
                     for ((i, title, source, date, url), html) in articles])
    cnx.commit()
    cur.close()

if __name__ == '__main__':
    scrape(sys.argv[1])
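
For reference, a minimal sketch of how the script might be set up and run. It assumes the input is a tab-separated file whose columns match the unpacking in scrape() (id, _, title, source, _, date, _, url) and that articles.db already contains an articles table; only the column names come from the insert statement above, the column types and file names below are assumptions.

# one-time setup: create the table the insert statement expects
import sqlite3
cnx = sqlite3.connect("articles.db")
cnx.execute("""create table if not exists articles(
                   id integer primary key, date text, title text,
                   source text, url text, html text)""")
cnx.commit()
cnx.close()

# then, from the shell (file names are illustrative):
#   python scrape_gnews.py links.tsv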