sirex · May 29, 2016 15:31
diff --git a/run.py b/run.py
 import io
 import csv
 import gzip
 import urllib.parse
 import urllib.request

 import scoop
 import scoop.futures


 def read_gzip(url):
    """Yield the decompressed lines of a gzip file fetched from ``url``.

    Args:
        url: Anything ``urllib.request.urlopen`` accepts.

    Yields:
        Each line of the decompressed stream as ``bytes``, with its line
        ending (if any) kept.
    """
    with urllib.request.urlopen(url) as response:
        # Close the gzip reader deterministically together with the
        # response, also when the generator is closed before exhaustion.
        with gzip.open(io.BufferedReader(response)) as lines:
            yield from lines


 def get_themes(url):
    """Collect ``(domain, theme)`` pairs from one gzipped WARC file.

    The file at ``url`` is scanned line by line. Every line starting with
    ``WARC-Target-URI:`` updates the current domain to the network location
    of the URL it carries. Every other line containing ``wp-content/themes/``
    contributes the path segment that follows the first such marker, paired
    with the current domain.

    Args:
        url: Location of the gzip-compressed file, passed to ``read_gzip``.

    Returns:
        A list of ``(domain, theme)`` tuples in file order. ``domain`` is
        ``None`` for matches seen before the first header line. Duplicates
        are kept.
    """
    uri_header = b'WARC-Target-URI:'
    theme_path = b'wp-content/themes/'
    themes = []
    domain = None
    scoop.logger.info('parsing: %s' % url)
    for line in read_gzip(url):
        if line.startswith(uri_header):
            # Slice by the header's length instead of a magic offset;
            # strip() drops the separating space and the line ending.
            target = line[len(uri_header):].decode('utf-8', errors='replace')
            domain = urllib.parse.urlparse(target.strip()).netloc
        elif theme_path in line:
            start = line.index(theme_path) + len(theme_path)
            end = line.find(b'/', start)
            # Without a closing slash the theme name has no delimiter.
            if end >= 0:
                # The bytes are not guaranteed to be valid UTF-8; replace
                # undecodable bytes rather than raising UnicodeDecodeError
                # and losing every result collected from this file.
                theme = line[start:end].decode('utf-8', errors='replace')
                themes.append((domain, theme))
    return themes


 def main():
    """Scan the WARC files listed for CC-MAIN-2016-07 and write results.csv.

    Reads the crawl's ``warc.paths.gz`` index with ``read_gzip``, turns each
    listed path into a full URL, maps ``get_themes`` over those URLs with
    ``scoop.futures.map`` and writes every ``(domain, theme)`` pair to
    ``results.csv`` in the current directory, below a ``domain,theme``
    header row.
    """
    base_url = 'https://commoncrawl.s3.amazonaws.com/'
    index_url = base_url + 'crawl-data/CC-MAIN-2016-07/warc.paths.gz'
    entries = (raw.decode('utf-8').strip() for raw in read_gzip(index_url))
    # Skip blank index lines so they do not turn into a job for the bare
    # base URL.
    paths = [base_url + entry for entry in entries if entry]
    # Explicit UTF-8 keeps the output independent of the locale's default
    # encoding, which may be unable to represent non-ASCII names.
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['domain', 'theme'])
        for result in scoop.futures.map(get_themes, paths):
            writer.writerows(result)


 # Run the crawl only when this file is executed as a script, not when it
 # is imported as a module.
 if __name__ == "__main__":
    main()
	import io
	import csv
	import gzip
	import urllib.parse
	import urllib.request

	import scoop
	import scoop.futures


	def read_gzip(url):
	    """Yield the decompressed lines of a gzip file fetched from ``url``.

	    Args:
	        url: Anything ``urllib.request.urlopen`` accepts.

	    Yields:
	        Each line of the decompressed stream as ``bytes``, with its line
	        ending (if any) kept.
	    """
	    with urllib.request.urlopen(url) as response:
	        # Close the gzip reader deterministically together with the
	        # response, also when the generator is closed before exhaustion.
	        with gzip.open(io.BufferedReader(response)) as lines:
	            yield from lines


	def get_themes(url):
	    """Collect ``(domain, theme)`` pairs from one gzipped WARC file.

	    The file at ``url`` is scanned line by line. Every line starting with
	    ``WARC-Target-URI:`` updates the current domain to the network location
	    of the URL it carries. Every other line containing ``wp-content/themes/``
	    contributes the path segment that follows the first such marker, paired
	    with the current domain.

	    Args:
	        url: Location of the gzip-compressed file, passed to ``read_gzip``.

	    Returns:
	        A list of ``(domain, theme)`` tuples in file order. ``domain`` is
	        ``None`` for matches seen before the first header line. Duplicates
	        are kept.
	    """
	    uri_header = b'WARC-Target-URI:'
	    theme_path = b'wp-content/themes/'
	    themes = []
	    domain = None
	    scoop.logger.info('parsing: %s' % url)
	    for line in read_gzip(url):
	        if line.startswith(uri_header):
	            # Slice by the header's length instead of a magic offset;
	            # strip() drops the separating space and the line ending.
	            target = line[len(uri_header):].decode('utf-8', errors='replace')
	            domain = urllib.parse.urlparse(target.strip()).netloc
	        elif theme_path in line:
	            start = line.index(theme_path) + len(theme_path)
	            end = line.find(b'/', start)
	            # Without a closing slash the theme name has no delimiter.
	            if end >= 0:
	                # The bytes are not guaranteed to be valid UTF-8; replace
	                # undecodable bytes rather than raising UnicodeDecodeError
	                # and losing every result collected from this file.
	                theme = line[start:end].decode('utf-8', errors='replace')
	                themes.append((domain, theme))
	    return themes


	def main():
	    """Scan the WARC files listed for CC-MAIN-2016-07 and write results.csv.

	    Reads the crawl's ``warc.paths.gz`` index with ``read_gzip``, turns each
	    listed path into a full URL, maps ``get_themes`` over those URLs with
	    ``scoop.futures.map`` and writes every ``(domain, theme)`` pair to
	    ``results.csv`` in the current directory, below a ``domain,theme``
	    header row.
	    """
	    base_url = 'https://commoncrawl.s3.amazonaws.com/'
	    index_url = base_url + 'crawl-data/CC-MAIN-2016-07/warc.paths.gz'
	    entries = (raw.decode('utf-8').strip() for raw in read_gzip(index_url))
	    # Skip blank index lines so they do not turn into a job for the bare
	    # base URL.
	    paths = [base_url + entry for entry in entries if entry]
	    # Explicit UTF-8 keeps the output independent of the locale's default
	    # encoding, which may be unable to represent non-ASCII names.
	    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
	        writer = csv.writer(f)
	        writer.writerow(['domain', 'theme'])
	        for result in scoop.futures.map(get_themes, paths):
	            writer.writerows(result)


	# Run the crawl only when this file is executed as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
	    main()