A quick gist for the SQL used in the blog: Querying TB-sized External Tables with Snowflake
/*
Block : #STAGE
DDL for defining the external stage
*/
-- Step 1: Create a database and schema for the experiment
create database ccrawl;
create schema ccrawl.raw;
use schema ccrawl.raw;
-- Create the stage and ensure the directory table is enabled
create or replace stage stg_common_crawl_index
url = 's3://commoncrawl/cc-index/table/cc-main/warc/'
directory = ( enable = true )
file_format=(type=parquet)
COMMENT = 'stage to s3 bucket for common crawl. https://registry.opendata.aws/commoncrawl/'
;
-- The stage must be refreshed so the directory table picks up the file metadata
alter stage stg_common_crawl_index refresh;
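-- Optional sanity check (not part of the original gist): confirm the refresh
-- populated the directory table before moving on.
select count(*) as file_count
from directory( @stg_common_crawl_index );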
-- =================================================================
/*
Block : #LIST_DIRECTORY
Find the file size and the number of partitions for each crawl
*/
with base as (
select
replace(SPLIT_PART(relative_path, '/', 1), 'crawl=', '') as crawl,
replace(SPLIT_PART(relative_path, '/', 2), 'subset=', '') as subset,
size
from directory( @stg_common_crawl_index )
)
select crawl, subset
,count(*) as file_count
,round(sum(size) / 1024 / 1024 / 1024) as total_size_gb
from base
-- where subset = 'warc'
-- and crawl in ('CC-MAIN-2020-50', 'CC-MAIN-2020-45', 'CC-MAIN-2020-16', 'CC-MAIN-2020-10')
group by crawl, subset
order by crawl desc
;
-- =================================================================
/*
Block : #CREATE_EXTERNAL_TABLE
Create the external table
*/
CREATE OR REPLACE EXTERNAL TABLE COMMON_CRAWL_EXTRNL (
crawl varchar as SPLIT_PART(metadata$filename, '/', 5),
subset varchar as SPLIT_PART(metadata$filename, '/', 6)
)
PARTITION BY ( crawl, subset )
WITH LOCATION = @stg_common_crawl_index
REFRESH_ON_CREATE = TRUE
FILE_FORMAT = ( TYPE = PARQUET )
COMMENT = 'common crawl index dataset. Ref https://commoncrawl.org '
;
-- Be sure to run this: the refresh registers the existing files in the external table metadata.
alter external table COMMON_CRAWL_EXTRNL refresh;
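-- Optional check (not part of the original gist): list a few of the files now
-- registered with the external table, via the INFORMATION_SCHEMA table function.
select *
from table( information_schema.external_table_files( table_name => 'COMMON_CRAWL_EXTRNL' ) )
limit 10;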
-- =================================================================
/*
Block : #LOAD_LOCAL_TABLE
*/
-- Create the table definition for the Snowflake local copy
create or replace transient table common_crawl_local (
crawl varchar,
subset varchar,
content_charset varchar,
content_languages varchar,
content_mime_detected varchar,
content_mime_type varchar,
fetch_status varchar,
url varchar,
url_host_name varchar,
url_host_registered_domain varchar,
url_host_registry_suffix varchar,
url_host_tld varchar,
url_path varchar,
url_protocol varchar,
warc_filename varchar
)
cluster by ( crawl, subset )
comment = 'common crawl index dataset, local copy. Ref https://commoncrawl.org '
;
-- Insert into the table using a select on the external table.
insert into COMMON_CRAWL_LOCAL
select
replace(crawl, 'crawl=', '') as crawl,
replace(subset, 'subset=', '') as subset,
value:content_charset::varchar as content_charset,
value:content_languages::varchar as content_languages,
value:content_mime_detected::varchar as content_mime_detected,
value:content_mime_type::varchar as content_mime_type,
value:fetch_status::varchar as fetch_status,
value:url::varchar as url,
value:url_host_name::varchar as url_host_name,
value:url_host_registered_domain::varchar as url_host_registered_domain,
value:url_host_registry_suffix::varchar as url_host_registry_suffix,
value:url_host_tld::varchar as url_host_tld,
value:url_path::varchar as url_path,
value:url_protocol::varchar as url_protocol,
value:warc_filename::varchar as warc_filename
from COMMON_CRAWL_EXTRNL
where subset = 'subset=warc'
and crawl in ('crawl=CC-MAIN-2020-50', 'crawl=CC-MAIN-2020-45', 'crawl=CC-MAIN-2020-16', 'crawl=CC-MAIN-2020-10')
;
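-- Optional verification (not part of the original gist): row counts per
-- crawl/subset after the load; these should line up with the partitions
-- selected in the insert above.
select crawl, subset, count(*) as n_rows
from common_crawl_local
group by crawl, subset
order by crawl desc;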
-- =================================================================
/*
Block : #QUERY_EXTERNAL
*/
-- Most crawled TLDs
SELECT
value:url_host_tld::varchar as url_host_tld,
approx_count_distinct( value:url_host_registered_domain::varchar ) as n_domains,
count(*) as n_captures,
div0(sum(1e0), n_domains) as avg_captures_per_domain
FROM COMMON_CRAWL_EXTRNL
WHERE crawl = 'crawl=CC-MAIN-2020-16'
AND subset = 'subset=warc'
group by url_host_tld
order by n_captures desc
;
-- Australian domains with the most pages archived
SELECT COUNT(*) AS rcount,
value:url_host_registered_domain::varchar as url_host_registered_domain
FROM COMMON_CRAWL_EXTRNL
WHERE crawl = 'crawl=CC-MAIN-2020-16'
AND subset = 'subset=warc'
AND value:url_host_tld::varchar = 'au'
GROUP BY url_host_registered_domain
HAVING rcount >= 100
ORDER BY rcount DESC
limit 500
;
-- Domains with the most subdomains
SELECT value:url_host_registered_domain::varchar as url_host_registered_domain,
approx_count_distinct(value:url_host_name::varchar) AS num_subdomains
FROM COMMON_CRAWL_EXTRNL
WHERE crawl = 'crawl=CC-MAIN-2020-16'
AND subset = 'subset=warc'
GROUP BY url_host_registered_domain
ORDER BY num_subdomains DESC
LIMIT 100
;
-- Public Dropbox content types
SELECT value:content_mime_detected::varchar as content_mime_detected,
count(*) as n
FROM COMMON_CRAWL_EXTRNL
WHERE crawl = 'crawl=CC-MAIN-2020-16'
AND subset = 'subset=warc'
AND value:url_host_registered_domain::varchar = 'dropboxusercontent.com'
GROUP BY 1
ORDER BY n DESC
LIMIT 100
;
-- =================================================================
/*
Block : #QUERY_LOCAL
*/
-- Most crawled TLDs
SELECT
url_host_tld,
approx_count_distinct( url_host_registered_domain ) as n_domains,
count(*) as n_captures,
div0(sum(1e0), n_domains) as avg_captures_per_domain
FROM COMMON_CRAWL_LOCAL
WHERE crawl = 'CC-MAIN-2020-16'
AND subset = 'warc'
group by url_host_tld
order by n_captures desc
;
-- Australian domains with the most pages archived
SELECT COUNT(*) AS rcount,
url_host_registered_domain
FROM COMMON_CRAWL_LOCAL
WHERE crawl = 'CC-MAIN-2020-16'
AND subset = 'warc'
AND url_host_tld = 'au'
GROUP BY url_host_registered_domain
HAVING rcount >= 100
ORDER BY rcount DESC
limit 500
;
-- Domains with the most subdomains
SELECT url_host_registered_domain,
approx_count_distinct(url_host_name) AS num_subdomains
FROM COMMON_CRAWL_LOCAL
WHERE crawl = 'CC-MAIN-2020-16'
AND subset = 'warc'
GROUP BY url_host_registered_domain
ORDER BY num_subdomains DESC
LIMIT 100
;
-- Public Dropbox content types
SELECT content_mime_detected,
count(*) as n
FROM COMMON_CRAWL_LOCAL
WHERE crawl = 'CC-MAIN-2020-16'
AND subset = 'warc'
AND url_host_registered_domain = 'dropboxusercontent.com'
GROUP BY 1
ORDER BY n DESC
LIMIT 100
;
-- =================================================================
/*
Block : #QUERY_STAGE_DATA
*/
-- Sample query to inspect the files directly on the stage.
-- The query below references a named file format, which must exist first.
create or replace file format ccrawl_parquet_format
type = parquet
;
SELECT
METADATA$FILENAME,
t.$1
FROM @stg_common_crawl_index (
file_format => 'ccrawl_parquet_format'
,PATTERN => '.*crawl=CC-MAIN-2020-16.*subset=warc.*' ) as t
limit 5
;
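-- A variant of the query above (not part of the original gist): Parquet columns
-- are returned inside the $1 variant, so individual fields can be pulled out
-- with the same path-and-cast syntax used against the external table.
SELECT
t.$1:url::varchar as url,
t.$1:fetch_status::varchar as fetch_status
FROM @stg_common_crawl_index (
file_format => 'ccrawl_parquet_format'
,PATTERN => '.*crawl=CC-MAIN-2020-16.*subset=warc.*' ) as t
limit 5
;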