thunderpoot · November 12, 2024 19:37
diff --git a/create_cc_index_table.sql b/create_cc_index_table.sql
 CREATE EXTERNAL TABLE IF NOT EXISTS commoncrawl_index       -- let’s create a new table with the following columns:
 (
  url_surtkey                 STRING,                       -- Sort-friendly URI Reordering Transform
  url                         STRING,                       -- the URL (duh) including protocol (http or https)
  url_host_name               STRING,                       -- the hostname, including subdomain(s)
  url_host_tld                STRING,                       -- the top-level domain such as `.org`
  url_host_registered_domain  STRING,                       -- the registered domain name
  url_host_private_domain     STRING,                       -- private domain such as `example.com`
  url_host_public_suffix      STRING,                       -- public suffix of the domain such as `.co.uk` or `.edu`
  url_protocol                STRING,                       -- the transfer protocol used, (http or https)
  url_port                    INT,                          -- the port used, typically 80 for http or 443 for https
  url_path                    STRING,                       -- the stuff after the hostname, like `/cool/stuff.html`
  url_query                   STRING,                       -- the URL query such as `?foo=bar`
  fetch_time                  TIMESTAMP,                    -- when the page was fetched, `ISO 8601`
  fetch_status                SMALLINT,                     -- the HTTP status returned, like 200 or 404 etc
  content_digest              STRING,                       -- the SHA-1 hash of the content
  content_mime_type           STRING,                       -- the media type, such as `text/html` or `application/pdf`
  content_mime_detected       STRING,                       -- the _detected_ mime type (in case it differs)
  content_charset             STRING,                       -- like `UTF-8` or `ISO-8859-1` and so on
  content_languages           STRING,                       -- ISO 639-3 of the detected lang(s) (up to three)
  warc_filename               STRING,                       -- the S3 path e.g. `/over/here/foo.warc.gz`
  warc_record_offset          INT,                          -- the offset within the WARC file (bytes)
  warc_record_length          INT                           -- the content length (bytes)
 )
 PARTITIONED BY (crawl STRING, subset STRING)                -- group by crawl ID and subset
 STORED AS PARQUET                                           -- columnar format
 LOCATION 's3://commoncrawl/cc-index/table/cc-main/warc/';   -- just WARC stuff in `s3://commoncrawl/`

 MSCK REPAIR TABLE commoncrawl_index;                        -- then add the partitions to the metastore
	CREATE EXTERNAL TABLE IF NOT EXISTS commoncrawl_index -- let’s create a new table with the following columns:
	(
	url_surtkey STRING, -- Sort-friendly URI Reordering Transform
	url STRING, -- the URL (duh) including protocol (http or https)
	url_host_name STRING, -- the hostname, including subdomain(s)
	url_host_tld STRING, -- the top-level domain such as `.org`
	url_host_registered_domain STRING, -- the registered domain name
	url_host_private_domain STRING, -- private domain such as `example.com`
	url_host_public_suffix STRING, -- public suffix of the domain such as `.co.uk` or `.edu`
	url_protocol STRING, -- the transfer protocol used, (http or https)
	url_port INT, -- the port used, typically 80 for http or 443 for https
	url_path STRING, -- the stuff after the hostname, like `/cool/stuff.html`
	url_query STRING, -- the URL query such as `?foo=bar`
	fetch_time TIMESTAMP, -- when the page was fetched, `ISO 8601`
	fetch_status SMALLINT, -- the HTTP status returned, like 200 or 404 etc
	content_digest STRING, -- the SHA-1 hash of the content
	content_mime_type STRING, -- the media type, such as `text/html` or `application/pdf`
	content_mime_detected STRING, -- the _detected_ mime type (in case it differs)
	content_charset STRING, -- like `UTF-8` or `ISO-8859-1` and so on
	content_languages STRING, -- ISO 639-3 of the detected lang(s) (up to three)
	warc_filename STRING, -- the S3 path e.g. `/over/here/foo.warc.gz`
	warc_record_offset INT, -- the offset within the WARC file (bytes)
	warc_record_length INT -- the content length (bytes)
	)
	PARTITIONED BY (crawl STRING, subset STRING) -- group by crawl ID and subset
	STORED AS PARQUET -- columnar format
	LOCATION 's3://commoncrawl/cc-index/table/cc-main/warc/'; -- just WARC stuff in `s3://commoncrawl/`

	MSCK REPAIR TABLE commoncrawl_index; -- then add the partitions to the metastore