@masahitojp
Created January 18, 2011 12:02
Scraping page titles with multiprocessing (Python 2, urllib2 + pyquery)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Test: scraping with multiprocessing.
import multiprocessing
import urllib2
from pyquery import PyQuery as pq
'''
# Uncomment to force UTF-8 output on stdout:
import sys,codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
'''
class Worker(multiprocessing.Process):
    """Worker process that fetches one URL and reports its <title>."""

    def __init__(self, url, timeout, result_queue):
        multiprocessing.Process.__init__(self)
        self.url = url
        self.timeout = timeout
        self.result_queue = result_queue

    def run(self):
        try:
            print 'Connecting URL: {url} ...'.format(url=self.url)
            res = urllib2.urlopen(url=self.url, timeout=self.timeout)
        except Exception:
            raise
        else:
            # Parse the response and push the page title onto the shared queue.
            data = res.read()
            d = pq(data)
            self.result_queue.put(d("title").text())
        return
if __name__ == '__main__':
    jobs = []
    results = multiprocessing.Queue()
    urls = ['http://www.google.com/', 'http://www.yahoo.com/', 'http://www.python.org/']

    # Start one worker process per URL.
    for url in urls:
        p = Worker(url, 10, results)
        jobs.append(p)
        p.start()

    # Collect one result per job as the workers finish.
    num_jobs = len(urls)
    while num_jobs:
        result = results.get()
        print unicode(result)
        num_jobs -= 1

    results.close()
    results.join_thread()
    for j in jobs:
        j.join()
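
For newer interpreters, a minimal Python 3 sketch of the same pattern could look like the following. It assumes pyquery is installed; urllib2 becomes urllib.request, print becomes a function, and a plain worker function replaces the Process subclass.

import multiprocessing
from urllib.request import urlopen

from pyquery import PyQuery as pq


def fetch_title(url, timeout, result_queue):
    # Fetch the page and push its <title> text onto the shared queue.
    data = urlopen(url, timeout=timeout).read()
    result_queue.put(pq(data)("title").text())


if __name__ == '__main__':
    urls = ['http://www.google.com/', 'http://www.python.org/']
    results = multiprocessing.Queue()
    jobs = [multiprocessing.Process(target=fetch_title, args=(u, 10, results))
            for u in urls]
    for p in jobs:
        p.start()
    # One result is expected per URL.
    for _ in urls:
        print(results.get())
    for p in jobs:
        p.join()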