Created
November 12, 2024 19:37
-
-
Save thunderpoot/f53aa6f1ae97437d7419973a70a75012 to your computer and use it in GitHub Desktop.
SQL script used to create an external table in Amazon Athena (and so on). Contains the schema for CC's columnar index
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE EXTERNAL TABLE IF NOT EXISTS commoncrawl_index -- let’s create a new table with the following columns: | |
( | |
url_surtkey STRING, -- Sort-friendly URI Reordering Transform | |
url STRING, -- the URL (duh) including protocol (http or https) | |
url_host_name STRING, -- the hostname, including subdomain(s) | |
url_host_tld STRING, -- the top-level domain such as `.org` | |
url_host_registered_domain STRING, -- the registered domain name | |
url_host_private_domain STRING, -- private domain such as `example.com` | |
url_host_public_suffix STRING, -- public suffix of the domain such as `.co.uk` or `.edu` | |
url_protocol STRING, -- the transfer protocol used, (http or https) | |
url_port INT, -- the port used, typically 80 for http or 443 for https | |
url_path STRING, -- the stuff after the hostname, like `/cool/stuff.html` | |
url_query STRING, -- the URL query such as `?foo=bar` | |
fetch_time TIMESTAMP, -- when the page was fetched, `ISO 8601` | |
fetch_status SMALLINT, -- the HTTP status returned, like 200 or 404 etc | |
content_digest STRING, -- the SHA-1 hash of the content | |
content_mime_type STRING, -- the media type, such as `text/html` or `application/pdf` | |
content_mime_detected STRING, -- the _detected_ mime type (in case it differs) | |
content_charset STRING, -- like `UTF-8` or `ISO-8859-1` and so on | |
content_languages STRING, -- ISO 639-3 of the detected lang(s) (up to three) | |
warc_filename STRING, -- the S3 path e.g. `/over/here/foo.warc.gz` | |
warc_record_offset INT, -- the offset within the WARC file (bytes) | |
warc_record_length INT -- the content length (bytes) | |
) | |
PARTITIONED BY (crawl STRING, subset STRING) -- group by crawl ID and subset | |
STORED AS PARQUET -- columnar format | |
LOCATION 's3://commoncrawl/cc-index/table/cc-main/warc/'; -- just WARC stuff in `s3://commoncrawl/` | |
MSCK REPAIR TABLE commoncrawl_index; -- then add the partitions to the metastore |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment