Correlation metrics between character set and content language
- data: Common Crawl September 2019 data set
- method: SQL query on the columnar index, executed via AWS Athena
# -*- coding: utf-8 -*-
"""
common-crawl-cdx.py

A simple example program to analyze the Common Crawl index.

This is implemented as a single stream job which accesses S3 via HTTP,
so that it can easily be run from any laptop, but it could easily be
converted to an EMR job which processes the 300 index files in parallel.
"""

# standard library
import fileinput
import sys
from collections import defaultdict  # was `_collections` (private CPython accelerator module)
from math import log

# third-party
import tldextract

# Thresholds used to flag hosts whose metrics diverge from the corpus norm.
RANK_DIVERGENCE_THR = 0.02
HOST_LENGTH_DIVERGENCE_THR = 0.15
#!/bin/bash
#### extract news sites from DMOZ.org ####
# dependencies:
#   - Linux
#   - bash
#   - wget
#   - perl
#   - regexp-assemble
| # hanging executor on Spark 2.1.0 and Python 2.7 | |
| from pyspark import SparkContext | |
class BadEncodedException(Exception):
    """Raised when a record's content cannot be decoded as expected."""

    def __init__(self, reason):
        # Normalize the reason (which may be another exception) to a plain
        # string so that both `exc.msg` and `str(exc)` are always safe.
        message = str(reason)
        self.msg = message
        super(BadEncodedException, self).__init__(message)
| import fileinput | |
| import sys | |
| import boto3 | |
| import botocore | |
| import ujson as json | |
| no_sign_request = botocore.client.Config( | 
| #% zgrep '^{"Container' .../CC-MAIN-XXX-XXX.warc.wat.gz \ | |
| # | jq --raw-output '."Envelope"."Payload-Metadata"."HTTP-Response-Metadata"."HTML-Metadata"."Links"[]?.path' \ | |
| # | sort | uniq -c | sort -k1,1nr | |
| # see also: | |
| # https://github.com/commoncrawl/ia-web-commons/issues/9 | |
| # https://github.com/commoncrawl/ia-web-commons/issues/8 | |
| # https://github.com/iipc/webarchive-commons/pull/72 | |
| 7777908 A@/href | |
| 1266284 IMG@/src | |
| 90022 STYLE/#text | 
| <?xml version="1.0" encoding="UTF-8"?> | |
| <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> | |
| <sitemap> | |
| <loc> | |
| <![CDATA[ http://www.example.com/sitemap1.xml ]]> | |
| </loc> | |
| <lastmod> | |
| <![CDATA[ 2018-12-12 02:06:56 ]]> | |
| </lastmod> | |
| </sitemap> | 
from warcio.archiveiterator import ArchiveIterator

# Stream a WET (extracted plain text) archive and decode each
# 'conversion' record; other record types are skipped.
with open('path/to/file.wet.gz', 'rb') as stream:
    for record in ArchiveIterator(stream):
        if record.rec_type != 'conversion':
            continue
        url = record.rec_headers.get_header('WARC-Target-URI')
        text = record.content_stream().read().decode('utf-8')
correlation metrics between character set and content language
### Jython setup for py-web-graph ###

# Fetch the standalone Jython jar (see https://www.jython.org/download).
wget https://repo1.maven.org/maven2/org/python/jython-standalone/2.7.2/jython-standalone-2.7.2.jar

# Clone the Common Crawl fork of pywebgraph (carries local modifications).
git clone https://github.com/commoncrawl/py-web-graph.git
cd py-web-graph

# Copy console.py into the current working directory so that the
# "pywebgraph" package is visible on the import path.
cp pywebgraph/console.py .