This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# fetch_subdomains.sh — retrieve all subdomains for a given domain from
# Common Crawl's CDX index using curl and jq.
#
# Behavior (per this header; script body not visible in this chunk):
#   - If no crawl ID is supplied, dynamically resolves the most recent
#     Common Crawl index (crawl ID).
#   - Fetches index data across multiple result pages when necessary.
#   - Retries failed HTTP requests.
#   - Extracts and de-duplicates subdomains from the results.
#
# Usage:
#   bash fetch_subdomains.sh <domain> [crawl_id]
#     <domain>   – registered domain to enumerate subdomains for (required)
#     [crawl_id] – optional Common Crawl index ID, e.g. CC-MAIN-2024-10;
#                  defaults to the latest available crawl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE EXTERNAL TABLE IF NOT EXISTS commoncrawl_index -- let’s create a new table with the following columns: | |
( | |
url_surtkey STRING, -- Sort-friendly URI Reordering Transform | |
url STRING, -- the URL (duh) including protocol (http or https) | |
url_host_name STRING, -- the hostname, including subdomain(s) | |
url_host_tld STRING, -- the top-level domain such as `.org` | |
url_host_registered_domain STRING, -- the registered domain name | |
url_host_private_domain STRING, -- private domain such as `example.com` | |
url_host_public_suffix STRING, -- public suffix of the domain such as `.co.uk` or `.edu` | |
url_protocol STRING, -- the transfer protocol used, (http or https) |
OlderNewer