correlation metrics between character set and content language
- data: Common Crawl September 2019 data set
- SQL query on columnar index executed via AWS Athena
### Jython
# install Jython (see https://www.jython.org/download)
wget https://repo1.maven.org/maven2/org/python/jython-standalone/2.7.2/jython-standalone-2.7.2.jar
# clone pywebgraph (fork with modifications)
git clone https://github.com/commoncrawl/py-web-graph.git
cd py-web-graph
# copy console.py into current working directory so that "pywebgraph" is visible as package
cp pywebgraph/console.py .
correlation metrics between character set and content language
from warcio.archiveiterator import ArchiveIterator

# Iterate over a WET (extracted-text) archive and pull out, for every
# text record, the target URL and the UTF-8 plain-text payload.
with open('path/to/file.wet.gz', 'rb') as stream:
    for record in ArchiveIterator(stream):
        # WET text payloads are stored as records of type 'conversion'
        if record.rec_type == 'conversion':
            url = record.rec_headers.get_header('WARC-Target-URI')
            text = record.content_stream().read().decode('utf-8')
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>
      <![CDATA[ http://www.example.com/sitemap1.xml ]]>
    </loc>
    <lastmod>
      <![CDATA[ 2018-12-12 02:06:56 ]]>
    </lastmod>
  </sitemap>
#% zgrep '^{"Container' .../CC-MAIN-XXX-XXX.warc.wat.gz \
# | jq --raw-output '."Envelope"."Payload-Metadata"."HTTP-Response-Metadata"."HTML-Metadata"."Links"[]?.path' \
# | sort | uniq -c | sort -k1,1nr
# see also:
# https://github.com/commoncrawl/ia-web-commons/issues/9
# https://github.com/commoncrawl/ia-web-commons/issues/8
# https://github.com/iipc/webarchive-commons/pull/72
7777908 A@/href
1266284 IMG@/src
90022 STYLE/#text
import fileinput
import sys
import boto3
import botocore
import ujson as json
no_sign_request = botocore.client.Config( |
# hanging executor on Spark 2.1.0 and Python 2.7
from pyspark import SparkContext
class BadEncodedException(Exception):
    """Raised when record content cannot be decoded as expected.

    The original *reason* (an exception or any object) is stringified and
    stored on ``msg`` as well as passed to the base ``Exception``, so both
    ``exc.msg`` and ``str(exc)`` yield the same human-readable message.
    """

    def __init__(self, reason):
        # Normalize to a string so the message survives even when `reason`
        # is itself an exception instance.
        self.msg = str(reason)
        super(BadEncodedException, self).__init__(self.msg)
#!/bin/bash
#### extract news sites from DMOZ.org ####
# dependencies
#  Linux
#  bash
#  wget
#  perl
#  regexp-assemble
import fileinput
import sys
import tldextract
from _collections import defaultdict
from math import log
# Tuning thresholds for the divergence checks below.
# NOTE(review): exact semantics not visible in this chunk — presumably the
# maximum tolerated relative divergence for rank and host-name length;
# confirm against the code that consumes these constants.
RANK_DIVERGENCE_THR = 0.02
HOST_LENGTH_DIVERGENCE_THR = 0.15
# -*- coding: utf-8 -*-
"""
common-crawl-cdx.py
A simple example program to analyze the Common Crawl index.
This is implemented as a single stream job which accesses S3 via HTTP,
so that it can easily be run from any laptop, but it could easily be
converted to an EMR job which processed the 300 index files in parallel. |