Last active
March 24, 2026 22:20
-
-
Save rafapolo/7289bf291e13ceef5c84c06f7aa25cfd to your computer and use it in GitHub Desktop.
Exports all basedosdados BigQuery tables → GCS (Parquet+zstd) → Hetzner
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# =============================================================================
# export_basedosdados.sh
# Exports all basedosdados BigQuery tables → GCS (Parquet+zstd) → Hetzner
#
# One-time prerequisites (run before this script):
#   gcloud auth login
#   gcloud auth application-default login
#   gcloud config set project YOUR_PROJECT_ID
#
# Usage:
#   chmod +x export_basedosdados.sh
#   ./export_basedosdados.sh
# =============================================================================
set -euo pipefail

# -----------------------------------------------------------------------------
# CONFIGURATION — edit these before running
# -----------------------------------------------------------------------------
YOUR_PROJECT="your-gcp-project-id"      # GCP billing project (not basedosdados)
BUCKET_NAME="bd-export-$(date +%s)"     # GCS bucket name (must be globally unique)
BUCKET_REGION="US"                      # basedosdados lives in the US multi-region
HETZNER_USER="root"                     # SSH user on the Hetzner server
HETZNER_HOST="your.hetzner.ip"          # Hetzner server IP or hostname
HETZNER_DEST="/mnt/block/basedosdados"  # destination path on Hetzner
PARALLEL_EXPORTS=8                      # concurrent bq extract jobs
PARALLEL_UPLOADS=4                      # concurrent rsync/scp jobs
SOURCE_PROJECT="basedosdados"           # the public basedosdados BQ project

# -----------------------------------------------------------------------------
# LOGGING
# -----------------------------------------------------------------------------
LOG_FILE="export_$(date +%Y%m%d_%H%M%S).log"
FAILED_FILE="failed_tables.txt"
DONE_FILE="done_tables.txt"

# Print a timestamped message to stdout and append it to the log file.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo "[$stamp] $*" | tee -a "$LOG_FILE"
}

# Like log(), but tagged ERROR and emitted on stderr (still appended to log).
log_err() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo "[$stamp] ERROR: $*" | tee -a "$LOG_FILE" >&2
}
# -----------------------------------------------------------------------------
# STEP 0 — Verify dependencies
# -----------------------------------------------------------------------------
log "Checking dependencies..."
# Collect every missing tool before bailing out, so the user fixes the
# environment in one pass instead of rerunning once per missing command.
missing_deps=()
for cmd in bq gcloud gsutil parallel rsync ssh; do
  if ! command -v "$cmd" &>/dev/null; then
    missing_deps+=("$cmd")
  fi
done
if (( ${#missing_deps[@]} > 0 )); then
  log_err "Missing commands: ${missing_deps[*]}. Install google-cloud-sdk and GNU parallel."
  exit 1
fi
# -----------------------------------------------------------------------------
# STEP 1 — Create GCS bucket in US region (same as basedosdados)
# -----------------------------------------------------------------------------
log "Creating GCS bucket: gs://$BUCKET_NAME in region $BUCKET_REGION"
if gsutil ls "gs://$BUCKET_NAME" &>/dev/null; then
  log "Bucket already exists, skipping creation."
else
  # -b on enables uniform bucket-level access; -l must stay US because the
  # basedosdados tables live in the US multi-region (extract requires same location).
  gsutil mb \
    -p "$YOUR_PROJECT" \
    -l "$BUCKET_REGION" \
    -b on \
    "gs://$BUCKET_NAME"
  log "Bucket created: gs://$BUCKET_NAME"
fi

# Create destination dir on Hetzner. The path is single-quoted inside the
# remote command so spaces/metacharacters survive the remote shell.
log "Preparing destination directory on Hetzner..."
ssh "$HETZNER_USER@$HETZNER_HOST" "mkdir -p '$HETZNER_DEST'"

# Resume support: ensure the bookkeeping files exist before they are read.
touch "$DONE_FILE" "$FAILED_FILE"
# -----------------------------------------------------------------------------
# STEP 2 — Build the full table list from the basedosdados project
#
# Datasets and tables are auto-discovered via the BQ API, so no hardcoded
# list is needed and tables added after tables-summary.md was written are
# still picked up.
# -----------------------------------------------------------------------------
log "Discovering all datasets in project: $SOURCE_PROJECT ..."
TABLE_LIST_FILE="all_tables.txt"
if [[ ! -f "$TABLE_LIST_FILE" ]]; then
  # BUG FIX: `bq ls` returns at most 50 entries by default; without
  # --max_results most datasets/tables were silently dropped. stderr is no
  # longer discarded so auth/quota errors are visible.
  bq ls --project_id="$SOURCE_PROJECT" --max_results=100000 --format=json \
    | python3 -c "
import json, sys
raw = sys.stdin.read().strip()
for ds in (json.loads(raw) if raw else []):
    print(ds['datasetReference']['datasetId'])
" > /tmp/datasets.txt
  log "Found $(wc -l < /tmp/datasets.txt) datasets. Listing tables..."
  : > "$TABLE_LIST_FILE"
  while IFS= read -r dataset; do
    bq ls \
      --project_id="$SOURCE_PROJECT" \
      --dataset_id="$SOURCE_PROJECT:$dataset" \
      --max_results=100000 \
      --format=json \
      | python3 -c "
import json, sys
raw = sys.stdin.read().strip()
for t in (json.loads(raw) if raw else []):
    ref = t.get('tableReference', {})
    if t.get('type') in ('TABLE', 'EXTERNAL'):  # skip views
        print(ref['datasetId'] + '.' + ref['tableId'])
" >> "$TABLE_LIST_FILE"
  done < /tmp/datasets.txt
  log "Total tables discovered: $(wc -l < "$TABLE_LIST_FILE")"
else
  log "Reusing existing table list: $TABLE_LIST_FILE ($(wc -l < "$TABLE_LIST_FILE") tables)"
fi
# -----------------------------------------------------------------------------
# STEP 3 — Export function (called in parallel)
# -----------------------------------------------------------------------------
#######################################
# Export one BigQuery table to GCS as zstd-compressed Parquet, with retries.
# Arguments:
#   $1 - table as "dataset.table_id"
#   $2 - GCS bucket name
#   $3 - billing project id
#   $4 - source (public) project id
#   $5 - done-list file   $6 - failed-list file   $7 - log file
# Returns:
#   0 always; failures are recorded in the failed-list file so one bad table
#   does not abort the parallel run.
#######################################
export_table() {
  local table="$1"
  local bucket="$2"
  local project="$3"
  local source="$4"
  local done_file="$5"
  local failed_file="$6"
  local log_file="$7"

  # Resume support: skip tables already recorded as exported.
  if grep -qxF "$table" "$done_file" 2>/dev/null; then
    echo "[SKIP] $table (already exported)" >> "$log_file"
    return 0
  fi

  # Split "dataset.table" via parameter expansion (no cut subshells).
  local dataset="${table%%.*}"
  local table_id="${table#*.}"
  local gcs_prefix="gs://$bucket/$dataset/$table_id"
  echo "[START] Exporting $source:$table → $gcs_prefix/*.parquet" >> "$log_file"

  # Run bq extract with up to 3 attempts and linear backoff.
  # BUG FIX: the previous --source_format=PARQUET is not a `bq extract`
  # option (it belongs to `bq load`); the output format is controlled solely
  # by --destination_format.
  local attempt success=false
  for attempt in 1 2 3; do
    if bq extract \
      --project_id="$project" \
      --location=US \
      --destination_format=PARQUET \
      --compression=ZSTD \
      "${source}:${dataset}.${table_id}" \
      "${gcs_prefix}/*.parquet" \
      >> "$log_file" 2>&1; then
      success=true
      break
    fi
    echo "[RETRY $attempt/3] $table" >> "$log_file"
    sleep $((attempt * 10))
  done

  if $success; then
    echo "$table" >> "$done_file"
    echo "[DONE] $table" >> "$log_file"
  else
    echo "$table" >> "$failed_file"
    echo "[FAIL] $table after 3 attempts" >> "$log_file"
  fi
}
export -f export_table
# -----------------------------------------------------------------------------
# STEP 4 — Run exports in parallel
# -----------------------------------------------------------------------------
log "Starting parallel exports ($PARALLEL_EXPORTS workers)..."
log "Progress is logged to: $LOG_FILE"
log "Failed tables will be written to: $FAILED_FILE"

# Feed only not-yet-done tables to parallel (comm needs both inputs sorted).
# --bar alone: passing --progress as well prints two competing displays.
comm -23 \
  <(sort "$TABLE_LIST_FILE") \
  <(sort "$DONE_FILE") \
  | parallel \
      --jobs "$PARALLEL_EXPORTS" \
      --bar \
      export_table {} "$BUCKET_NAME" "$YOUR_PROJECT" "$SOURCE_PROJECT" \
        "$DONE_FILE" "$FAILED_FILE" "$LOG_FILE"

TOTAL=$(wc -l < "$TABLE_LIST_FILE")
DONE=$(wc -l < "$DONE_FILE")
FAILED=$(wc -l < "$FAILED_FILE")
log "Export phase complete: $DONE/$TOTAL done, $FAILED failed"
if [[ $FAILED -gt 0 ]]; then
  log "Failed tables:"
  tee -a "$LOG_FILE" < "$FAILED_FILE"
  # BUG FIX: the script has no --retry-failed flag; the accurate retry path
  # is simply re-running it — tables in DONE_FILE are skipped automatically.
  log "To retry failed tables, clear $FAILED_FILE and re-run: bash $0"
fi
# -----------------------------------------------------------------------------
# STEP 5 — Download from GCS to Hetzner via rsync over SSH
#
# Strategy: download dataset-by-dataset to keep memory usage low and allow
# resuming. gsutil cannot rsync directly over SSH, so each dataset is staged
# under /tmp and then rsynced to Hetzner.
# -----------------------------------------------------------------------------
log "Starting transfer to Hetzner ($HETZNER_HOST)..."

#######################################
# Mirror one dataset: GCS → local staging dir → Hetzner (rsync over SSH).
# Arguments:
#   $1 - dataset name   $2 - GCS bucket   $3 - ssh user   $4 - ssh host
#   $5 - remote destination dir   $6 - log file
# Returns:
#   0 on success; non-zero if either transfer leg fails, in which case the
#   staging dir is KEPT so a re-run can resume it.
#######################################
download_dataset() {
  local dataset="$1"
  local bucket="$2"
  local hetzner_user="$3"
  local hetzner_host="$4"
  local hetzner_dest="$5"
  local log_file="$6"

  echo "[TRANSFER] gs://$bucket/$dataset/ → $hetzner_host:$hetzner_dest/$dataset/" >> "$log_file"
  local local_stage="/tmp/bd_stage/$dataset"
  mkdir -p "$local_stage"

  # BUG FIX: parallel runs this function via `bash -c` without `set -e`, so
  # each step must be checked explicitly — previously a failed gsutil/rsync
  # still deleted the staging dir and logged [TRANSFERRED].
  if ! gsutil -m rsync -r "gs://$bucket/$dataset/" "$local_stage/" >> "$log_file" 2>&1; then
    echo "[FAIL] gsutil rsync for $dataset (staging kept for retry)" >> "$log_file"
    return 1
  fi

  ssh "$hetzner_user@$hetzner_host" "mkdir -p '$hetzner_dest/$dataset'"
  if ! rsync -az --progress \
      "$local_stage/" \
      "$hetzner_user@$hetzner_host:$hetzner_dest/$dataset/" \
      >> "$log_file" 2>&1; then
    echo "[FAIL] rsync to Hetzner for $dataset (staging kept for retry)" >> "$log_file"
    return 1
  fi

  # Remove staging only after both legs succeeded. The :? expansion aborts
  # instead of letting an empty variable become `rm -rf /`.
  rm -rf -- "${local_stage:?}"
  echo "[TRANSFERRED] $dataset" >> "$log_file"
}
export -f download_dataset
# Get the list of exported datasets and mirror each one in parallel.
# BUG FIX: `parallel` exits non-zero if any job fails; under `set -e` that
# aborted the script before the verification step ever ran. Handle the
# status explicitly so verification can still report what is missing.
if gsutil ls "gs://$BUCKET_NAME/" \
    | sed 's|gs://[^/]*/||;s|/||' \
    | sort -u \
    | parallel \
        --jobs "$PARALLEL_UPLOADS" \
        --progress \
        download_dataset {} "$BUCKET_NAME" "$HETZNER_USER" "$HETZNER_HOST" \
          "$HETZNER_DEST" "$LOG_FILE"; then
  log "Transfer complete."
else
  log_err "One or more dataset transfers failed — see $LOG_FILE; re-run to retry."
fi
# -----------------------------------------------------------------------------
# STEP 6 — Verify file counts on Hetzner vs GCS
# -----------------------------------------------------------------------------
log "Verifying file counts..."
# BUG FIX: grep exits 1 when nothing matches; combined with `set -e` and
# `pipefail` an empty bucket killed the script here. grep -c prints the count
# and `|| true` tolerates the zero-match status.
GCS_COUNT=$(gsutil ls -r "gs://$BUCKET_NAME/**" | grep -c '\.parquet$' || true)
HETZNER_COUNT=$(ssh "$HETZNER_USER@$HETZNER_HOST" "find '$HETZNER_DEST' -name '*.parquet' | wc -l")
log "GCS parquet files: $GCS_COUNT"
log "Hetzner parquet files: $HETZNER_COUNT"
if [[ "$GCS_COUNT" -eq "$HETZNER_COUNT" ]]; then
  log "File counts match. Transfer verified."
else
  log_err "Count mismatch! GCS=$GCS_COUNT Hetzner=$HETZNER_COUNT"
  log_err "Run rsync again or check $LOG_FILE for errors."
fi
# -----------------------------------------------------------------------------
# STEP 7 — Clean up GCS bucket to stop storage charges
# -----------------------------------------------------------------------------
# Default to keeping the bucket when stdin is not a TTY — otherwise `read`
# fails and `set -e` kills the script before the final summary (e.g. under
# cron/nohup).
confirm="n"
read -rp "Delete GCS bucket gs://$BUCKET_NAME to stop storage charges? [y/N] " confirm || confirm="n"
if [[ "$confirm" =~ ^[Yy]$ ]]; then
  log "Deleting bucket gs://$BUCKET_NAME ..."
  # BUG FIX: `gsutil rm -r` on a bucket URL removes all objects AND the
  # bucket itself; the old follow-up `gsutil rb` then failed on the missing
  # bucket and `set -e` aborted before the closing log lines.
  gsutil -m rm -r "gs://$BUCKET_NAME"
  log "Bucket deleted. Storage charges stopped."
else
  log "Bucket kept. Remember to delete it later: gsutil -m rm -r gs://$BUCKET_NAME"
fi
log "All done! Data is at $HETZNER_HOST:$HETZNER_DEST"
log "Total exported: $DONE tables | Failed: $FAILED tables"
log "See $LOG_FILE for full details."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment