Sebastian Nagel sebastian-nagel

correlation metrics between character set and content language

	### Jython
	# install Jython (see https://www.jython.org/download)
	wget https://repo1.maven.org/maven2/org/python/jython-standalone/2.7.2/jython-standalone-2.7.2.jar

	# clone pywebgraph (fork with modifications)
	git clone https://github.com/commoncrawl/py-web-graph.git
	cd py-web-graph
	# copy console.py into current working directory so that "pywebgraph" is visible as package
	cp pywebgraph/console.py .

	from warcio.archiveiterator import ArchiveIterator

	# Iterate over the records of a gzipped WET file and pull the target URL
	# and the text payload out of every 'conversion' record.
	with open('path/to/file.wet.gz', 'rb') as stream:
	    for record in ArchiveIterator(stream):
	        # Records of any other type are skipped.
	        if record.rec_type == 'conversion':
	            url = record.rec_headers.get_header('WARC-Target-URI')
	            # The payload is read completely and decoded as UTF-8.
	            text = record.content_stream().read().decode('utf-8')

	<?xml version="1.0" encoding="UTF-8"?>
	<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
	<sitemap>
	<loc>
	<![CDATA[ http://www.example.com/sitemap1.xml ]]>
	</loc>
	<lastmod>
	<![CDATA[ 2018-12-12 02:06:56 ]]>
	</lastmod>
	</sitemap>

	#% zgrep '^{"Container' .../CC-MAIN-XXX-XXX.warc.wat.gz \
	# | jq --raw-output '."Envelope"."Payload-Metadata"."HTTP-Response-Metadata"."HTML-Metadata"."Links"[]?.path' \
	# | sort | uniq -c | sort -k1,1nr
	# see also:
	# https://github.com/commoncrawl/ia-web-commons/issues/9
	# https://github.com/commoncrawl/ia-web-commons/issues/8
	# https://github.com/iipc/webarchive-commons/pull/72
	7777908 A@/href
	1266284 IMG@/src
	90022 STYLE/#text

	import fileinput
	import sys

	import boto3
	import botocore

	import ujson as json


	no_sign_request = botocore.client.Config(

	# hanging executor on Spark 2.1.0 and Python 2.7

	from pyspark import SparkContext


	class BadEncodedException(Exception):
	    """Exception carrying the string form of the reason it was raised.

	    The reason is converted with ``str()`` and kept in ``self.msg``; the
	    same string is passed on to ``Exception`` as the message.
	    """

	    def __init__(self, reason):
	        """Store ``str(reason)`` as ``msg`` and initialise the base class.

	        :param reason: any object describing the failure (e.g. another
	            exception); only its string representation is retained.
	        """
	        self.msg = str(reason)
	        # Explicit two-argument super() keeps the class usable on Python 2.7.
	        super(BadEncodedException, self).__init__(self.msg)

	#!/bin/bash

	#### extract news sites from DMOZ.org ####

	# dependencies
	# Linux
	# bash
	# wget
	# perl
	# regexp-assemble

	import fileinput
	import sys
	import tldextract
	from _collections import defaultdict
	from math import log


	RANK_DIVERGENCE_THR = 0.02
	HOST_LENGTH_DIVERGENCE_THR = 0.15

	# -*- coding: utf-8 -*-
	"""
	common-crawl-cdx.py

	A simple example program to analyze the Common Crawl index.

	This is implemented as a single stream job which accesses S3 via HTTP,
	so that it can easily be run from any laptop, but it could easily be
	converted to an EMR job which processes the 300 index files in parallel.