A quick gist of the SQL statements used in the blog: Querying TB-sized External Tables with Snowflake
/*
Block : #STAGE
DDL for defining the external stage
*/
-- Step 1 : Create a database and schema for the experiment
create database ccrawl;
create schema ccrawl.raw;
use schema ccrawl.raw;

-- Create the stage and ensure the directory table is enabled
create or replace stage stg_common_crawl_index
    url = 's3://commoncrawl/cc-index/table/cc-main/warc/'
    directory = ( enable = true )
    file_format = ( type = parquet )
    COMMENT = 'stage to s3 bucket for common crawl. https://registry.opendata.aws/commoncrawl/'
;

-- The stage must be refreshed to populate the directory table metadata
alter stage stg_common_crawl_index refresh;
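
-- Optional sanity check (not part of the original gist): confirm the
-- directory table is populated after the refresh.
select count(*) as file_count
from directory( @stg_common_crawl_index );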
-- =================================================================
/*
Block : #LIST_DIRECTORY
Find the number of files and the total size for each crawl and subset
*/
with base as (
    select
        replace(SPLIT_PART(relative_path, '/', 1), 'crawl=', '') as crawl,
        replace(SPLIT_PART(relative_path, '/', 2), 'subset=', '') as subset,
        size
    from directory( @stg_common_crawl_index )
)
select crawl, subset
    ,count(*) as file_count
    ,round(sum(size) / 1024 / 1024 / 1024) as total_size_gb
from base
-- where subset = 'warc'
-- and crawl in ('CC-MAIN-2020-50', 'CC-MAIN-2020-45', 'CC-MAIN-2020-16', 'CC-MAIN-2020-10')
group by crawl, subset
order by crawl desc
;
-- =================================================================
/*
Block : #CREATE_EXTERNAL_TABLE
Create the external table
*/
CREATE OR REPLACE EXTERNAL TABLE COMMON_CRAWL_EXTRNL (
    crawl varchar as SPLIT_PART(metadata$filename, '/', 5),
    subset varchar as SPLIT_PART(metadata$filename, '/', 6)
)
PARTITION BY ( crawl, subset )
WITH LOCATION = @stg_common_crawl_index
REFRESH_ON_CREATE = TRUE
FILE_FORMAT = ( TYPE = PARQUET )
COMMENT = 'common crawl index dataset. Ref https://commoncrawl.org'
;
-- Refresh so the external table metadata registers the staged files
alter external table COMMON_CRAWL_EXTRNL refresh;
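
-- Optional sanity check (not part of the original gist): list a few of the
-- files the external table registered after the refresh. Assumes the
-- INFORMATION_SCHEMA.EXTERNAL_TABLE_FILES table function is available in
-- your account.
select *
from table(information_schema.external_table_files(table_name => 'COMMON_CRAWL_EXTRNL'))
limit 10;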
-- =================================================================
/*
Block : #LOAD_LOCAL_TABLE
*/
-- Create the table definition for the Snowflake local copy
create or replace transient table common_crawl_local (
    crawl varchar,
    subset varchar,
    content_charset varchar,
    content_languages varchar,
    content_mime_detected varchar,
    content_mime_type varchar,
    fetch_status varchar,
    url varchar,
    url_host_name varchar,
    url_host_registered_domain varchar,
    url_host_registry_suffix varchar,
    url_host_tld varchar,
    url_path varchar,
    url_protocol varchar,
    warc_filename varchar
)
cluster by ( crawl, subset )
comment = 'common crawl index dataset, local copy. Ref https://commoncrawl.org'
;
-- Insert into the table using a select on the external table.
insert into COMMON_CRAWL_LOCAL
select
    replace(crawl, 'crawl=', '') as crawl,
    replace(subset, 'subset=', '') as subset,
    value:content_charset::varchar as content_charset,
    value:content_languages::varchar as content_languages,
    value:content_mime_detected::varchar as content_mime_detected,
    value:content_mime_type::varchar as content_mime_type,
    value:fetch_status::varchar as fetch_status,
    value:url::varchar as url,
    value:url_host_name::varchar as url_host_name,
    value:url_host_registered_domain::varchar as url_host_registered_domain,
    value:url_host_registry_suffix::varchar as url_host_registry_suffix,
    value:url_host_tld::varchar as url_host_tld,
    value:url_path::varchar as url_path,
    value:url_protocol::varchar as url_protocol,
    value:warc_filename::varchar as warc_filename
from COMMON_CRAWL_EXTRNL
where subset = 'subset=warc'
and crawl in ('crawl=CC-MAIN-2020-50', 'crawl=CC-MAIN-2020-45', 'crawl=CC-MAIN-2020-16', 'crawl=CC-MAIN-2020-10')
;
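
-- Optional sanity check (not part of the original gist): row counts per
-- crawl/subset partition in the local copy.
select crawl, subset, count(*) as n_rows
from common_crawl_local
group by crawl, subset
order by crawl desc;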
-- =================================================================
/*
Block : #QUERY_EXTERNAL
*/
-- Most crawled TLD
SELECT
    value:url_host_tld::varchar as url_host_tld,
    approx_count_distinct( value:url_host_registered_domain::varchar ) as n_domains,
    count(*) as n_captures
    ,div0(sum(1e0), n_domains) as avg_captures_per_domain
FROM COMMON_CRAWL_EXTRNL
WHERE crawl = 'crawl=CC-MAIN-2020-16'
AND subset = 'subset=warc'
group by url_host_tld
order by n_captures desc
;
-- Australian domains with the most pages archived
SELECT COUNT(*) AS rcount,
    value:url_host_registered_domain::varchar as url_host_registered_domain
FROM COMMON_CRAWL_EXTRNL
WHERE crawl = 'crawl=CC-MAIN-2020-16'
AND subset = 'subset=warc'
AND value:url_host_tld::varchar = 'au'
GROUP BY url_host_registered_domain
HAVING rcount >= 100
ORDER BY rcount DESC
LIMIT 500
;
-- Domains with the most subdomains
SELECT value:url_host_registered_domain::varchar as url_host_registered_domain,
    approx_count_distinct(value:url_host_name::varchar) AS num_subdomains
FROM COMMON_CRAWL_EXTRNL
WHERE crawl = 'crawl=CC-MAIN-2020-16'
AND subset = 'subset=warc'
GROUP BY url_host_registered_domain
ORDER BY num_subdomains DESC
LIMIT 100
;
-- Content types of public Dropbox-hosted pages
SELECT value:content_mime_detected::varchar as content_mime_detected,
    count(*) as n
FROM COMMON_CRAWL_EXTRNL
WHERE crawl = 'crawl=CC-MAIN-2020-16'
AND subset = 'subset=warc'
AND value:url_host_registered_domain::varchar = 'dropboxusercontent.com'
GROUP BY 1
ORDER BY n DESC
LIMIT 100
;
-- =================================================================
/*
Block : #QUERY_LOCAL
*/
-- Most crawled TLD
SELECT
    url_host_tld,
    approx_count_distinct( url_host_registered_domain ) as n_domains,
    count(*) as n_captures
    ,div0(sum(1e0), n_domains) as avg_captures_per_domain
FROM COMMON_CRAWL_LOCAL
WHERE crawl = 'CC-MAIN-2020-16'
AND subset = 'warc'
group by url_host_tld
order by n_captures desc
;
-- Australian domains with the most pages archived
SELECT COUNT(*) AS rcount,
    url_host_registered_domain
FROM COMMON_CRAWL_LOCAL
WHERE crawl = 'CC-MAIN-2020-16'
AND subset = 'warc'
AND url_host_tld = 'au'
GROUP BY url_host_registered_domain
HAVING rcount >= 100
ORDER BY rcount DESC
LIMIT 500
;
-- Domains with the most subdomains
SELECT url_host_registered_domain,
    approx_count_distinct(url_host_name) AS num_subdomains
FROM COMMON_CRAWL_LOCAL
WHERE crawl = 'CC-MAIN-2020-16'
AND subset = 'warc'
GROUP BY url_host_registered_domain
ORDER BY num_subdomains DESC
LIMIT 100
;
-- Content types of public Dropbox-hosted pages
SELECT content_mime_detected,
    count(*) as n
FROM COMMON_CRAWL_LOCAL
WHERE crawl = 'CC-MAIN-2020-16'
AND subset = 'warc'
AND url_host_registered_domain = 'dropboxusercontent.com'
GROUP BY 1
ORDER BY n DESC
LIMIT 100
;
-- =================================================================
/*
Block : #QUERY_STAGE_DATA
*/
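-- The named file format used below is not defined elsewhere in this gist;
-- a minimal definition (an assumption, mirroring the stage's inline Parquet
-- format) would be:
create or replace file format ccrawl_parquet_format type = parquet;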
-- Sample query to investigate the files directly on the stage
SELECT
    METADATA$FILENAME,
    t.$1
FROM @stg_common_crawl_index (
    file_format => 'ccrawl_parquet_format'
    ,PATTERN => '.*crawl=CC-MAIN-2020-16.*subset=warc.*' ) as t
limit 5
;
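
-- A variant (not part of the original gist): Parquet rows arrive in $1 as a
-- semi-structured object, so individual fields can be cast out directly.
SELECT
    t.$1:url_host_tld::varchar as url_host_tld,
    t.$1:url::varchar as url
FROM @stg_common_crawl_index (
    file_format => 'ccrawl_parquet_format'
    ,PATTERN => '.*crawl=CC-MAIN-2020-16.*subset=warc.*' ) as t
limit 5
;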