correlation metrics between character set and content language
- data: Common Crawl September 2019 data set
- SQL query on columnar index executed via AWS Athena
| WARC/1.0 | |
| WARC-Type: metadata | |
| WARC-Target-URI: https://en.wikipedia.org/wiki/Saturn | |
| WARC-Date: 2024-12-11T20:20:04Z | |
| WARC-Record-ID: <urn:uuid:74b1614e-97bb-4a19-b02f-defc603ab81c> | |
| WARC-Refers-To: <urn:uuid:90f1a666-d5ba-4e8d-806d-4d848e77a0f8> | |
| Content-Type: application/json | |
| Content-Length: 1910 | |
| { |
| ### Jython | |
| # install Jython (see https://www.jython.org/download) | |
| wget https://repo1.maven.org/maven2/org/python/jython-standalone/2.7.2/jython-standalone-2.7.2.jar | |
| # clone pywebgraph (fork with modifications) | |
| git clone https://github.com/commoncrawl/py-web-graph.git | |
| cd py-web-graph | |
| # copy console.py into current working directory so that "pywebgraph" is visible as package | |
| cp pywebgraph/console.py . |
correlation metrics between character set and content language
| from warcio.archiveiterator import ArchiveIterator | |
| with open('path/to/file.wet.gz', 'rb') as stream: | |
|     for record in ArchiveIterator(stream): | |
|         if record.rec_type == 'conversion': | |
|             url = record.rec_headers.get_header('WARC-Target-URI') | |
|             text = record.content_stream().read().decode('utf-8') |
| <?xml version="1.0" encoding="UTF-8"?> | |
| <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> | |
| <sitemap> | |
| <loc> | |
| <![CDATA[ http://www.example.com/sitemap1.xml ]]> | |
| </loc> | |
| <lastmod> | |
| <![CDATA[ 2018-12-12 02:06:56 ]]> | |
| </lastmod> | |
| </sitemap> |
| #% zgrep '^{"Container' .../CC-MAIN-XXX-XXX.warc.wat.gz \ | |
| # | jq --raw-output '."Envelope"."Payload-Metadata"."HTTP-Response-Metadata"."HTML-Metadata"."Links"[]?.path' \ | |
| # | sort | uniq -c | sort -k1,1nr | |
| # see also: | |
| # https://github.com/commoncrawl/ia-web-commons/issues/9 | |
| # https://github.com/commoncrawl/ia-web-commons/issues/8 | |
| # https://github.com/iipc/webarchive-commons/pull/72 | |
| 7777908 A@/href | |
| 1266284 IMG@/src | |
| 90022 STYLE/#text |
| import fileinput | |
| import sys | |
| import boto3 | |
| import botocore | |
| import ujson as json | |
| no_sign_request = botocore.client.Config( |
| # hanging executor on Spark 2.1.0 and Python 2.7 | |
| from pyspark import SparkContext | |
| class BadEncodedException(Exception): | |
|     def __init__(self, reason): | |
|         self.msg = str(reason) | |
|         super(BadEncodedException, self).__init__(self.msg) |
| #!/bin/bash | |
| #### extract news sites from DMOZ.org #### | |
| # dependencies | |
| # Linux | |
| # bash | |
| # wget | |
| # perl | |
| # regexp-assemble |
| import fileinput | |
| import sys | |
| import tldextract | |
| from _collections import defaultdict | |
| from math import log | |
| RANK_DIVERGENCE_THR = 0.02 | |
| HOST_LENGTH_DIVERGENCE_THR = 0.15 |