Correlation metrics between character set and content language
- data: Common Crawl September 2019 data set
- method: SQL query on the columnar index, executed via AWS Athena
# -*- coding: utf-8 -*-
"""
common-crawl-cdx.py

A simple example program to analyze the Common Crawl index.

This is implemented as a single stream job which accesses S3 via HTTP,
so that it can easily be run from any laptop, but it could easily be
converted to an EMR job which processes the 300 index files in parallel.
"""

# standard library
import fileinput
import sys
from collections import defaultdict  # was `_collections` (private CPython accelerator module)
from math import log

# third-party
import tldextract

# Thresholds used to flag hosts whose metrics diverge from the corpus norm.
RANK_DIVERGENCE_THR = 0.02
HOST_LENGTH_DIVERGENCE_THR = 0.15
#!/bin/bash
#### extract news sites from DMOZ.org ####
# dependencies:
#   - Linux
#   - bash
#   - wget
#   - perl
#   - regexp-assemble
| # hanging executor on Spark 2.1.0 and Python 2.7 | |
| from pyspark import SparkContext | |
class BadEncodedException(Exception):
    """Raised when a record's content cannot be decoded as expected."""

    def __init__(self, reason):
        # Normalize the reason (which may be another exception) to a plain
        # string so that both `exc.msg` and `str(exc)` are always safe.
        message = str(reason)
        self.msg = message
        super(BadEncodedException, self).__init__(message)
| import fileinput | |
| import sys | |
| import boto3 | |
| import botocore | |
| import ujson as json | |
| no_sign_request = botocore.client.Config( | 
| #% zgrep '^{"Container' .../CC-MAIN-XXX-XXX.warc.wat.gz \ | |
| # | jq --raw-output '."Envelope"."Payload-Metadata"."HTTP-Response-Metadata"."HTML-Metadata"."Links"[]?.path' \ | |
| # | sort | uniq -c | sort -k1,1nr | |
| # see also: | |
| # https://github.com/commoncrawl/ia-web-commons/issues/9 | |
| # https://github.com/commoncrawl/ia-web-commons/issues/8 | |
| # https://github.com/iipc/webarchive-commons/pull/72 | |
| 7777908 A@/href | |
| 1266284 IMG@/src | |
| 90022 STYLE/#text | 
| <?xml version="1.0" encoding="UTF-8"?> | |
| <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> | |
| <sitemap> | |
| <loc> | |
| <![CDATA[ http://www.example.com/sitemap1.xml ]]> | |
| </loc> | |
| <lastmod> | |
| <![CDATA[ 2018-12-12 02:06:56 ]]> | |
| </lastmod> | |
| </sitemap> | 
from warcio.archiveiterator import ArchiveIterator

# Stream a WET (extracted plain text) archive and decode each
# 'conversion' record; other record types are skipped.
with open('path/to/file.wet.gz', 'rb') as stream:
    for record in ArchiveIterator(stream):
        if record.rec_type != 'conversion':
            continue
        url = record.rec_headers.get_header('WARC-Target-URI')
        text = record.content_stream().read().decode('utf-8')
correlation metrics between character set and content language
### Jython setup for py-web-graph ###

# Fetch the standalone Jython jar (see https://www.jython.org/download).
wget https://repo1.maven.org/maven2/org/python/jython-standalone/2.7.2/jython-standalone-2.7.2.jar

# Clone the Common Crawl fork of pywebgraph (carries local modifications).
git clone https://github.com/commoncrawl/py-web-graph.git
cd py-web-graph

# Copy console.py into the current working directory so that the
# "pywebgraph" package is visible on the import path.
cp pywebgraph/console.py .