Skip to content

Instantly share code, notes, and snippets.

@rafapolo
Last active March 24, 2026 22:20
Show Gist options
  • Select an option

  • Save rafapolo/7289bf291e13ceef5c84c06f7aa25cfd to your computer and use it in GitHub Desktop.

Select an option

Save rafapolo/7289bf291e13ceef5c84c06f7aa25cfd to your computer and use it in GitHub Desktop.
Exports all basedosdados BigQuery tables → GCS (Parquet+zstd) → Hetzner
#!/usr/bin/env bash
# =============================================================================
# export_basedosdados.sh
# Exports all basedosdados BigQuery tables → GCS (Parquet+zstd) → Hetzner
#
# Prerequisites (run once before this script):
# gcloud auth login
# gcloud auth application-default login
# gcloud config set project YOUR_PROJECT_ID
#
# Usage:
# chmod +x export_basedosdados.sh
# ./export_basedosdados.sh
# =============================================================================
set -euo pipefail
# -----------------------------------------------------------------------------
# CONFIGURATION — edit these before running, or override via the environment
# (e.g. BUCKET_NAME=bd-export-123 ./export_basedosdados.sh).
#
# NOTE: the default BUCKET_NAME embeds a timestamp, so every fresh invocation
# targets a brand-new bucket and orphans a previous run's exports. To resume
# a run (the script's done/failed files support this), re-run with the SAME
# BUCKET_NAME exported in the environment.
# -----------------------------------------------------------------------------
YOUR_PROJECT="${YOUR_PROJECT:-your-gcp-project-id}"     # Your GCP billing project (not basedosdados)
BUCKET_NAME="${BUCKET_NAME:-bd-export-$(date +%s)}"     # GCS bucket name (must be globally unique)
BUCKET_REGION="${BUCKET_REGION:-US}"                    # Must be US — basedosdados is in US multi-region
HETZNER_USER="${HETZNER_USER:-root}"                    # Your Hetzner server SSH user
HETZNER_HOST="${HETZNER_HOST:-your.hetzner.ip}"         # Your Hetzner server IP or hostname
HETZNER_DEST="${HETZNER_DEST:-/mnt/block/basedosdados}" # Destination path on Hetzner
PARALLEL_EXPORTS="${PARALLEL_EXPORTS:-8}"               # Number of parallel bq extract jobs
PARALLEL_UPLOADS="${PARALLEL_UPLOADS:-4}"               # Number of parallel rsync/scp jobs
SOURCE_PROJECT="basedosdados"                           # The public basedosdados BQ project
readonly SOURCE_PROJECT
# -----------------------------------------------------------------------------
# LOGGING
# -----------------------------------------------------------------------------
LOG_FILE="export_$(date +%Y%m%d_%H%M%S).log"
FAILED_FILE="failed_tables.txt"
DONE_FILE="done_tables.txt"

# Timestamped logger: prints to stdout and appends the same line to $LOG_FILE.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}

# Same as log(), but tags the message with "ERROR:" and sends the console
# copy to stderr (the file copy still lands in $LOG_FILE via tee -a).
log_err() {
  printf '[%s] ERROR: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE" >&2
}
# -----------------------------------------------------------------------------
# STEP 0 — Verify dependencies
# -----------------------------------------------------------------------------
log "Checking dependencies..."
required_cmds=(bq gcloud gsutil parallel rsync ssh)
for cmd in "${required_cmds[@]}"; do
  # Stop at the first missing tool, exactly like the original loop.
  command -v "$cmd" &>/dev/null && continue
  log_err "'$cmd' not found. Install google-cloud-sdk and GNU parallel."
  exit 1
done
# -----------------------------------------------------------------------------
# STEP 1 — Create GCS bucket in US region (same as basedosdados)
# -----------------------------------------------------------------------------
log "Creating GCS bucket: gs://$BUCKET_NAME in region $BUCKET_REGION"
if gsutil ls "gs://$BUCKET_NAME" &>/dev/null; then
  log "Bucket already exists, skipping creation."
else
  # -b on: uniform bucket-level access (no per-object ACLs).
  gsutil mb \
    -p "$YOUR_PROJECT" \
    -l "$BUCKET_REGION" \
    -b on \
    "gs://$BUCKET_NAME" || { log_err "Failed to create gs://$BUCKET_NAME"; exit 1; }
  log "Bucket created: gs://$BUCKET_NAME"
fi

# Create destination dir on Hetzner. The remote path is single-quoted so it
# reaches mkdir as one word even if it contains spaces — previously it was
# expanded unquoted by the remote shell.
log "Preparing destination directory on Hetzner..."
ssh "$HETZNER_USER@$HETZNER_HOST" "mkdir -p '$HETZNER_DEST'" \
  || { log_err "Cannot reach $HETZNER_HOST or create $HETZNER_DEST"; exit 1; }

# Resume support: these files persist completed/failed table names across runs.
touch "$DONE_FILE" "$FAILED_FILE"
# -----------------------------------------------------------------------------
# STEP 2 — Build the full table list from the basedosdados project
#
# We auto-discover all datasets and tables via the BQ API so we don't rely
# on a hardcoded list. This also detects any new tables added since the
# tables-summary.md was written.
# -----------------------------------------------------------------------------
log "Discovering all datasets in project: $SOURCE_PROJECT ..."
TABLE_LIST_FILE="all_tables.txt"
if [[ ! -f "$TABLE_LIST_FILE" ]]; then
  # mktemp instead of a fixed /tmp path: avoids clobbering between concurrent
  # runs and symlink attacks on a predictable name.
  DATASETS_TMP="$(mktemp)"
  # --max_results is explicit: bq ls truncates output at a small default
  # (50 entries), which would silently drop datasets in a project this large.
  bq ls --project_id="$SOURCE_PROJECT" --max_results=10000 --format=json 2>/dev/null \
    | python3 -c "
import json, sys
datasets = json.load(sys.stdin)
for ds in datasets:
    print(ds['datasetReference']['datasetId'])
" > "$DATASETS_TMP"
  log "Found $(wc -l < "$DATASETS_TMP") datasets. Listing tables..."
  > "$TABLE_LIST_FILE"
  while IFS= read -r dataset; do
    # Same truncation concern applies per dataset.
    bq ls \
      --project_id="$SOURCE_PROJECT" \
      --dataset_id="$SOURCE_PROJECT:$dataset" \
      --max_results=10000 \
      --format=json 2>/dev/null \
      | python3 -c "
import json, sys
tables = json.load(sys.stdin)
for t in tables:
    ref = t.get('tableReference', {})
    if t.get('type') in ('TABLE', 'EXTERNAL'):  # skip views
        print(ref['datasetId'] + '.' + ref['tableId'])
" >> "$TABLE_LIST_FILE"
  done < "$DATASETS_TMP"
  rm -f "$DATASETS_TMP"
  log "Total tables discovered: $(wc -l < "$TABLE_LIST_FILE")"
else
  log "Reusing existing table list: $TABLE_LIST_FILE ($(wc -l < "$TABLE_LIST_FILE") tables)"
fi
# -----------------------------------------------------------------------------
# STEP 3 — Export function (called in parallel)
# -----------------------------------------------------------------------------
# export_table TABLE BUCKET PROJECT SOURCE DONE_FILE FAILED_FILE LOG_FILE
#   TABLE is "dataset.table". Extracts the table to ZSTD-compressed Parquet
#   under gs://BUCKET/dataset/table/*.parquet, retrying up to 3 times with
#   linear backoff. Records TABLE in DONE_FILE on success or FAILED_FILE on
#   failure. Runs inside GNU parallel (fresh bash, no set -e), so it never
#   relies on errexit and always returns 0 after recording the outcome.
export_table() {
  local table="$1"
  local bucket="$2"
  local project="$3"
  local source="$4"
  local done_file="$5"
  local failed_file="$6"
  local log_file="$7"

  # Skip if already done (exact whole-line fixed-string match).
  if grep -qxF "$table" "$done_file" 2>/dev/null; then
    echo "[SKIP] $table (already exported)" >> "$log_file"
    return 0
  fi

  local dataset table_id gcs_prefix
  dataset="${table%%.*}"   # text before the first dot
  table_id="${table#*.}"   # text after the first dot
  gcs_prefix="gs://$bucket/$dataset/$table_id"
  echo "[START] Exporting $source:$table → $gcs_prefix/*.parquet" >> "$log_file"

  # Run bq extract with retry (up to 3 attempts).
  # FIX: --source_format is a `bq load` flag and is not accepted by
  # `bq extract`; passing it made every extract fail at flag parsing.
  # Only --destination_format/--compression apply to extract.
  local attempt=0
  local success=false
  while [[ $attempt -lt 3 ]]; do
    attempt=$((attempt + 1))
    if bq extract \
      --project_id="$project" \
      --location=US \
      --destination_format=PARQUET \
      --compression=ZSTD \
      "${source}:${dataset}.${table_id}" \
      "${gcs_prefix}/*.parquet" \
      >> "$log_file" 2>&1; then
      success=true
      break
    else
      echo "[RETRY $attempt/3] $table" >> "$log_file"
      sleep $((attempt * 10))  # linear backoff: 10s, 20s
    fi
  done

  if $success; then
    echo "$table" >> "$done_file"
    echo "[DONE] $table" >> "$log_file"
  else
    echo "$table" >> "$failed_file"
    echo "[FAIL] $table after 3 attempts" >> "$log_file"
  fi
}
export -f export_table
# -----------------------------------------------------------------------------
# STEP 4 — Run exports in parallel
# -----------------------------------------------------------------------------
log "Starting parallel exports ($PARALLEL_EXPORTS workers)..."
log "Progress is logged to: $LOG_FILE"
log "Failed tables will be written to: $FAILED_FILE"

# comm -23 keeps lines unique to the first sorted input — i.e. tables that
# are not yet recorded in $DONE_FILE — so re-runs resume where they left off.
comm -23 \
  <(sort "$TABLE_LIST_FILE") \
  <(sort "$DONE_FILE") \
  | parallel \
    --jobs "$PARALLEL_EXPORTS" \
    --progress \
    --bar \
    export_table {} "$BUCKET_NAME" "$YOUR_PROJECT" "$SOURCE_PROJECT" \
    "$DONE_FILE" "$FAILED_FILE" "$LOG_FILE"

TOTAL=$(wc -l < "$TABLE_LIST_FILE")
DONE=$(wc -l < "$DONE_FILE")
FAILED=$(wc -l < "$FAILED_FILE")
log "Export phase complete: $DONE/$TOTAL done, $FAILED failed"
if [[ $FAILED -gt 0 ]]; then
  log "Failed tables:"
  tee -a "$LOG_FILE" < "$FAILED_FILE"
  # FIX: the previous message advertised a "--retry-failed" flag this script
  # never implements. Re-running the script IS the retry mechanism: tables in
  # $DONE_FILE are skipped, everything else is attempted again.
  log "To retry failed tables, simply re-run: bash $0 (completed tables are skipped)"
fi
# -----------------------------------------------------------------------------
# STEP 5 — Download from GCS to Hetzner via rsync over SSH
#
# Strategy: download dataset-by-dataset to keep memory usage low and allow
# resuming. gsutil cannot rsync directly to an SSH destination, so each
# dataset is staged locally, rsync'ed to Hetzner, then the staging dir removed.
# -----------------------------------------------------------------------------
log "Starting transfer to Hetzner ($HETZNER_HOST)..."

# download_dataset DATASET BUCKET USER HOST DEST LOG_FILE
#   Copies gs://BUCKET/DATASET/ to USER@HOST:DEST/DATASET/ through a local
#   staging directory. Runs under GNU parallel (fresh bash, no set -e), so
#   every step is checked explicitly; the staging dir is deleted only after
#   a clean upload.
download_dataset() {
  local dataset="$1"
  local bucket="$2"
  local hetzner_user="$3"
  local hetzner_host="$4"
  local hetzner_dest="$5"
  local log_file="$6"

  echo "[TRANSFER] gs://$bucket/$dataset/ → $hetzner_host:$hetzner_dest/$dataset/" >> "$log_file"

  local local_stage="/tmp/bd_stage/$dataset"
  mkdir -p "$local_stage"

  # FIX: without these checks a failed download/upload still fell through to
  # rm -rf and a success log line, because set -e is not active inside
  # parallel's worker shells.
  if ! gsutil -m rsync -r "gs://$bucket/$dataset/" "$local_stage/" >> "$log_file" 2>&1; then
    echo "[TRANSFER-FAIL] $dataset (gsutil rsync from GCS failed)" >> "$log_file"
    return 1
  fi
  # Remote path single-quoted so the remote shell treats it as one word.
  if ! ssh "$hetzner_user@$hetzner_host" "mkdir -p '$hetzner_dest/$dataset'"; then
    echo "[TRANSFER-FAIL] $dataset (remote mkdir failed)" >> "$log_file"
    return 1
  fi
  if ! rsync -az --progress \
    "$local_stage/" \
    "$hetzner_user@$hetzner_host:$hetzner_dest/$dataset/" \
    >> "$log_file" 2>&1; then
    echo "[TRANSFER-FAIL] $dataset (rsync to Hetzner failed)" >> "$log_file"
    return 1
  fi

  # Clean up local staging only after the upload succeeded.
  rm -rf "$local_stage"
  echo "[TRANSFERRED] $dataset" >> "$log_file"
}
export -f download_dataset
# Get list of exported datasets
# Enumerate the top-level "directory" entries in the bucket (one per dataset),
# strip the "gs://<bucket>/" prefix and the trailing slash with sed, dedupe,
# and fan the per-dataset transfers out across $PARALLEL_UPLOADS workers.
gsutil ls "gs://$BUCKET_NAME/" \
| sed 's|gs://[^/]*/||;s|/||' \
| sort -u \
| parallel \
--jobs "$PARALLEL_UPLOADS" \
--progress \
download_dataset {} "$BUCKET_NAME" "$HETZNER_USER" "$HETZNER_HOST" \
"$HETZNER_DEST" "$LOG_FILE"
log "Transfer complete."
# -----------------------------------------------------------------------------
# STEP 6 — Verify file counts on Hetzner vs GCS
# -----------------------------------------------------------------------------
log "Verifying file counts..."
# FIX: the previous `grep '\.parquet$' | wc -l` made the pipeline exit
# non-zero when zero files matched (grep exits 1 on no match), which under
# `set -e -o pipefail` aborted the whole script right at verification.
# grep -c prints the count (including 0) and the `|| true` guard absorbs the
# no-match exit status.
GCS_COUNT=$(gsutil ls -r "gs://$BUCKET_NAME/**" | { grep -c '\.parquet$' || true; })
# Remote path single-quoted so the remote shell treats it as one word.
HETZNER_COUNT=$(ssh "$HETZNER_USER@$HETZNER_HOST" "find '$HETZNER_DEST' -name '*.parquet' | wc -l")
log "GCS parquet files: $GCS_COUNT"
log "Hetzner parquet files: $HETZNER_COUNT"
if [[ "$GCS_COUNT" -eq "$HETZNER_COUNT" ]]; then
  log "File counts match. Transfer verified."
else
  log_err "Count mismatch! GCS=$GCS_COUNT Hetzner=$HETZNER_COUNT"
  log_err "Run rsync again or check $LOG_FILE for errors."
fi
# -----------------------------------------------------------------------------
# STEP 7 — Clean up GCS bucket to stop storage charges
# -----------------------------------------------------------------------------
# Interactive confirmation: only a single 'y' or 'Y' deletes the bucket;
# anything else (including just Enter) keeps it.
read -rp "Delete GCS bucket gs://$BUCKET_NAME to stop storage charges? [y/N] " confirm
case "$confirm" in
  [Yy])
    log "Deleting bucket gs://$BUCKET_NAME ..."
    gsutil -m rm -r "gs://$BUCKET_NAME"
    gsutil rb "gs://$BUCKET_NAME"
    log "Bucket deleted. Storage charges stopped."
    ;;
  *)
    log "Bucket kept. Remember to delete it later: gsutil -m rm -r gs://$BUCKET_NAME && gsutil rb gs://$BUCKET_NAME"
    ;;
esac

log "All done! Data is at $HETZNER_HOST:$HETZNER_DEST"
log "Total exported: $DONE tables | Failed: $FAILED tables"
log "See $LOG_FILE for full details."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment