Skip to content

Instantly share code, notes, and snippets.

@sirex
Created May 29, 2016 15:31
Show Gist options
  • Save sirex/25922c5b557606106095b4e917ce9216 to your computer and use it in GitHub Desktop.
Save sirex/25922c5b557606106095b4e917ce9216 to your computer and use it in GitHub Desktop.
import io
import csv
import gzip
import urllib.parse
import urllib.request
import scoop
import scoop.futures
def read_gzip(url):
    """Yield the decompressed lines of the gzip-compressed resource at *url*.

    The resource is opened with ``urllib.request.urlopen`` and decompressed
    while it is being read; lines are produced lazily, one at a time.

    Args:
        url: Location of a gzip-compressed resource, in any scheme that
            ``urllib.request.urlopen`` accepts.

    Yields:
        bytes: Each decompressed line, including its line terminator when
        one is present.
    """
    with urllib.request.urlopen(url) as response:
        # Manage the gzip reader with ``with`` so it is closed as soon as the
        # generator is exhausted or closed early, instead of lingering until
        # garbage collection.
        with gzip.open(io.BufferedReader(response)) as stream:
            yield from stream
def get_themes(url):
    """Collect ``(domain, theme)`` pairs from one gzip-compressed WARC file.

    The file is scanned line by line. A ``WARC-Target-URI:`` header line sets
    the current domain (the network location of that URI). Any other line
    containing ``wp-content/themes/`` contributes the path segment that
    follows it, paired with the current domain. Only the first occurrence on
    a line is examined, and it is ignored unless a ``/`` terminates the
    segment.

    Args:
        url: Location of the gzip-compressed WARC file, passed to
            ``read_gzip``.

    Returns:
        list: ``(domain, theme)`` tuples in the order they were found.
        ``domain`` is ``None`` for matches seen before any
        ``WARC-Target-URI:`` header.
    """
    themes = []
    domain = None
    uri_header = b'WARC-Target-URI:'
    theme_path = b'wp-content/themes/'
    scoop.logger.info('parsing: %s' % url)
    for line in read_gzip(url):
        if line.startswith(uri_header):
            # Slice by the header's own length and let strip() remove the
            # separating whitespace, so a header written without a space
            # after the colon does not lose its first character.
            # errors='replace' keeps a malformed URI from aborting the scan.
            target = line[len(uri_header):].decode('utf-8', errors='replace').strip()
            domain = urllib.parse.urlparse(target).netloc
        elif theme_path in line:
            start = line.index(theme_path) + len(theme_path)
            end = line.find(b'/', start)
            if end >= 0:
                # Crawled pages come in arbitrary encodings; substitute
                # undecodable bytes rather than raising UnicodeDecodeError
                # and discarding every result from this file.
                theme = line[start:end].decode('utf-8', errors='replace')
                themes.append((domain, theme))
    return themes
def main():
    """Scan every WARC file of the CC-MAIN-2016-07 crawl and save the themes.

    Downloads the crawl's ``warc.paths.gz`` listing, turns each listed path
    into a full URL, maps ``get_themes`` over those URLs with
    ``scoop.futures.map`` and writes the resulting ``domain``/``theme`` rows
    to ``results.csv`` in the current working directory.
    """
    lines = read_gzip('https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2016-07/warc.paths.gz')
    paths = [
        'https://commoncrawl.s3.amazonaws.com/' + name
        for name in (raw.decode('utf-8').strip() for raw in lines)
        if name  # a blank line would otherwise become a bogus base-URL task
    ]
    # Pin the output encoding: theme names are decoded from UTF-8 and may
    # contain non-ASCII characters that the platform default cannot encode.
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['domain', 'theme'])
        for result in scoop.futures.map(get_themes, paths):
            # Each result is the list of (domain, theme) tuples for one file.
            writer.writerows(result)
# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment