A quick gist for the SQL used in the blog: Querying TB-sized External Tables with Snowflake
/*
Block : #STAGE
DDL for defining the external stage
*/
-- Step 1: Create a database and schema for the experiment
create database ccrawl;
create schema ccrawl.raw;
use schema ccrawl.raw;
-- Create the stage and ensure the directory table is enabled
create or replace stage stg_common_crawl_index
url = 's3://commoncrawl/cc-index/table/cc-main/warc/'
directory = ( enable = true )
file_format=(type=parquet)
COMMENT = 'stage to s3 bucket for common crawl. https://registry.opendata.aws/commoncrawl/'
;
-- The stage must be refreshed so the directory table picks up the file metadata
alter stage stg_common_crawl_index refresh;
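-- Optional sanity check (not part of the original gist): confirm the refresh
-- populated the directory table before moving on.
select count(*) as file_count
from directory( @stg_common_crawl_index );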
-- =================================================================
/*
Block : #LIST_DIRECTORY
Find the file size and the number of partitions for each crawl
*/
with base as (
select
replace(SPLIT_PART(relative_path, '/', 1), 'crawl=', '') as crawl,
replace(SPLIT_PART(relative_path, '/', 2), 'subset=', '') as subset,
size
from directory( @stg_common_crawl_index )
)
select crawl, subset
,count(*) as file_count
,round(sum(size) / 1024 / 1024 / 1024) as total_size_gb
from base
-- where subset = 'warc'
-- and crawl in ('CC-MAIN-2020-50', 'CC-MAIN-2020-45', 'CC-MAIN-2020-16', 'CC-MAIN-2020-10')
group by crawl, subset
order by crawl desc
;
-- =================================================================
/*
Block : #CREATE_EXTERNAL_TABLE
Create the external table
*/
CREATE OR REPLACE EXTERNAL TABLE COMMON_CRAWL_EXTRNL (
crawl varchar as SPLIT_PART(metadata$filename, '/', 5),
subset varchar as SPLIT_PART(metadata$filename, '/', 6)
)
PARTITION BY ( crawl, subset )
WITH LOCATION = @stg_common_crawl_index
REFRESH_ON_CREATE = TRUE
FILE_FORMAT = ( TYPE = PARQUET )
COMMENT = 'common crawl index dataset. Ref https://commoncrawl.org '
;
-- Be sure to run this: the refresh registers the existing files in the external table metadata.
alter external table COMMON_CRAWL_EXTRNL refresh;
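-- Optional check (not part of the original gist): list a few of the files now
-- registered with the external table, via the INFORMATION_SCHEMA table function.
select *
from table( information_schema.external_table_files( table_name => 'COMMON_CRAWL_EXTRNL' ) )
limit 10;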
-- =================================================================
/*
Block : #LOAD_LOCAL_TABLE
*/
-- Create the table definition for the Snowflake local copy
create or replace transient table common_crawl_local (
crawl varchar,
subset varchar,
content_charset varchar,
content_languages varchar,
content_mime_detected varchar,
content_mime_type varchar,
fetch_status varchar,
url varchar,
url_host_name varchar,
url_host_registered_domain varchar,
url_host_registry_suffix varchar,
url_host_tld varchar,
url_path varchar,
url_protocol varchar,
warc_filename varchar
)
cluster by ( crawl, subset )
comment = 'common crawl index dataset, local copy. Ref https://commoncrawl.org '
;
-- Insert into the table using a select on the external table.
insert into COMMON_CRAWL_LOCAL
select
replace(crawl, 'crawl=', '') as crawl,
replace(subset, 'subset=', '') as subset,
value:content_charset::varchar as content_charset,
value:content_languages::varchar as content_languages,
value:content_mime_detected::varchar as content_mime_detected,
value:content_mime_type::varchar as content_mime_type,
value:fetch_status::varchar as fetch_status,
value:url::varchar as url,
value:url_host_name::varchar as url_host_name,
value:url_host_registered_domain::varchar as url_host_registered_domain,
value:url_host_registry_suffix::varchar as url_host_registry_suffix,
value:url_host_tld::varchar as url_host_tld,
value:url_path::varchar as url_path,
value:url_protocol::varchar as url_protocol,
value:warc_filename::varchar as warc_filename
from COMMON_CRAWL_EXTRNL
where subset = 'subset=warc'
and crawl in ('crawl=CC-MAIN-2020-50', 'crawl=CC-MAIN-2020-45', 'crawl=CC-MAIN-2020-16', 'crawl=CC-MAIN-2020-10')
;
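-- Optional verification (not part of the original gist): row counts per
-- crawl/subset after the load; these should line up with the partitions
-- selected in the insert above.
select crawl, subset, count(*) as n_rows
from common_crawl_local
group by crawl, subset
order by crawl desc;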
-- =================================================================
/*
Block : #QUERY_EXTERNAL
*/
-- Most crawled TLDs
SELECT
value:url_host_tld::varchar as url_host_tld,
approx_count_distinct( value:url_host_registered_domain::varchar ) as n_domains,
count(*) as n_captures,
div0(sum(1e0), n_domains) as avg_captures_per_domain
FROM COMMON_CRAWL_EXTRNL
WHERE crawl = 'crawl=CC-MAIN-2020-16'
AND subset = 'subset=warc'
group by url_host_tld
order by n_captures desc
;
-- Australian domains with the most pages archived
SELECT COUNT(*) AS rcount,
value:url_host_registered_domain::varchar as url_host_registered_domain
FROM COMMON_CRAWL_EXTRNL
WHERE crawl = 'crawl=CC-MAIN-2020-16'
AND subset = 'subset=warc'
AND value:url_host_tld::varchar = 'au'
GROUP BY url_host_registered_domain
HAVING rcount >= 100
ORDER BY rcount DESC
limit 500
;
-- Domains with the most subdomains
SELECT value:url_host_registered_domain::varchar as url_host_registered_domain,
approx_count_distinct(value:url_host_name::varchar) AS num_subdomains
FROM COMMON_CRAWL_EXTRNL
WHERE crawl = 'crawl=CC-MAIN-2020-16'
AND subset = 'subset=warc'
GROUP BY url_host_registered_domain
ORDER BY num_subdomains DESC
LIMIT 100
;
-- Public Dropbox content types
SELECT value:content_mime_detected::varchar as content_mime_detected,
count(*) as n
FROM COMMON_CRAWL_EXTRNL
WHERE crawl = 'crawl=CC-MAIN-2020-16'
AND subset = 'subset=warc'
AND value:url_host_registered_domain::varchar = 'dropboxusercontent.com'
GROUP BY 1
ORDER BY n DESC
LIMIT 100
;
-- =================================================================
/*
Block : #QUERY_LOCAL
*/
-- Most crawled TLDs
SELECT
url_host_tld,
approx_count_distinct( url_host_registered_domain ) as n_domains,
count(*) as n_captures,
div0(sum(1e0), n_domains) as avg_captures_per_domain
FROM COMMON_CRAWL_LOCAL
WHERE crawl = 'CC-MAIN-2020-16'
AND subset = 'warc'
group by url_host_tld
order by n_captures desc
;
-- Australian domains with the most pages archived
SELECT COUNT(*) AS rcount,
url_host_registered_domain
FROM COMMON_CRAWL_LOCAL
WHERE crawl = 'CC-MAIN-2020-16'
AND subset = 'warc'
AND url_host_tld = 'au'
GROUP BY url_host_registered_domain
HAVING rcount >= 100
ORDER BY rcount DESC
limit 500
;
-- Domains with the most subdomains
SELECT url_host_registered_domain,
approx_count_distinct(url_host_name) AS num_subdomains
FROM COMMON_CRAWL_LOCAL
WHERE crawl = 'CC-MAIN-2020-16'
AND subset = 'warc'
GROUP BY url_host_registered_domain
ORDER BY num_subdomains DESC
LIMIT 100
;
-- Public Dropbox content types
SELECT content_mime_detected,
count(*) as n
FROM COMMON_CRAWL_LOCAL
WHERE crawl = 'CC-MAIN-2020-16'
AND subset = 'warc'
AND url_host_registered_domain = 'dropboxusercontent.com'
GROUP BY 1
ORDER BY n DESC
LIMIT 100
;
-- =================================================================
/*
Block : #QUERY_STAGE_DATA
*/
-- Sample query to inspect the files directly on the stage.
-- The query below references a named file format, which must exist first.
create or replace file format ccrawl_parquet_format
type = parquet
;
SELECT
METADATA$FILENAME,
t.$1
FROM @stg_common_crawl_index (
file_format => 'ccrawl_parquet_format'
,PATTERN => '.*crawl=CC-MAIN-2020-16.*subset=warc.*' ) as t
limit 5
;
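-- A variant of the query above (not part of the original gist): Parquet columns
-- are returned inside the $1 variant, so individual fields can be pulled out
-- with the same path-and-cast syntax used against the external table.
SELECT
t.$1:url::varchar as url,
t.$1:fetch_status::varchar as fetch_status
FROM @stg_common_crawl_index (
file_format => 'ccrawl_parquet_format'
,PATTERN => '.*crawl=CC-MAIN-2020-16.*subset=warc.*' ) as t
limit 5
;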