Last active
March 24, 2026 22:20
-
-
Save rafapolo/7289bf291e13ceef5c84c06f7aa25cfd to your computer and use it in GitHub Desktop.
Exports all basedosdados BigQuery tables → GCS (Parquet+zstd) → Hetzner
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# =============================================================================
# export_basedosdados.sh
# Exports all basedosdados BigQuery tables → GCS (Parquet+zstd) → Hetzner
#
# One-time prerequisites (run before this script):
#   gcloud auth login
#   gcloud auth application-default login
#   gcloud config set project YOUR_PROJECT_ID
#
# Usage:
#   chmod +x export_basedosdados.sh
#   ./export_basedosdados.sh
# =============================================================================
set -euo pipefail

# -----------------------------------------------------------------------------
# CONFIGURATION — edit these before running
# -----------------------------------------------------------------------------
YOUR_PROJECT="your-gcp-project-id"      # GCP billing project (not basedosdados)
BUCKET_NAME="bd-export-$(date +%s)"     # GCS bucket name (must be globally unique)
BUCKET_REGION="US"                      # basedosdados lives in the US multi-region
HETZNER_USER="root"                     # SSH user on the Hetzner server
HETZNER_HOST="your.hetzner.ip"          # Hetzner server IP or hostname
HETZNER_DEST="/mnt/block/basedosdados"  # destination path on Hetzner
PARALLEL_EXPORTS=8                      # concurrent bq extract jobs
PARALLEL_UPLOADS=4                      # concurrent rsync/scp jobs
SOURCE_PROJECT="basedosdados"           # the public basedosdados BQ project

# -----------------------------------------------------------------------------
# LOGGING
# -----------------------------------------------------------------------------
LOG_FILE="export_$(date +%Y%m%d_%H%M%S).log"
FAILED_FILE="failed_tables.txt"
DONE_FILE="done_tables.txt"

# Print a timestamped message to stdout and append it to the log file.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo "[$stamp] $*" | tee -a "$LOG_FILE"
}

# Like log(), but tagged ERROR and emitted on stderr (still appended to log).
log_err() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo "[$stamp] ERROR: $*" | tee -a "$LOG_FILE" >&2
}
# -----------------------------------------------------------------------------
# STEP 0 — Verify dependencies
# -----------------------------------------------------------------------------
log "Checking dependencies..."
# Collect every missing tool before bailing out, so the user fixes the
# environment in one pass instead of rerunning once per missing command.
missing_deps=()
for cmd in bq gcloud gsutil parallel rsync ssh; do
  if ! command -v "$cmd" &>/dev/null; then
    missing_deps+=("$cmd")
  fi
done
if (( ${#missing_deps[@]} > 0 )); then
  log_err "Missing commands: ${missing_deps[*]}. Install google-cloud-sdk and GNU parallel."
  exit 1
fi
# -----------------------------------------------------------------------------
# STEP 1 — Create GCS bucket in US region (same as basedosdados)
# -----------------------------------------------------------------------------
log "Creating GCS bucket: gs://$BUCKET_NAME in region $BUCKET_REGION"
if gsutil ls "gs://$BUCKET_NAME" &>/dev/null; then
  log "Bucket already exists, skipping creation."
else
  # -b on enables uniform bucket-level access; -l must stay US because the
  # basedosdados tables live in the US multi-region (extract requires same location).
  gsutil mb \
    -p "$YOUR_PROJECT" \
    -l "$BUCKET_REGION" \
    -b on \
    "gs://$BUCKET_NAME"
  log "Bucket created: gs://$BUCKET_NAME"
fi

# Create destination dir on Hetzner. The path is single-quoted inside the
# remote command so spaces/metacharacters survive the remote shell.
log "Preparing destination directory on Hetzner..."
ssh "$HETZNER_USER@$HETZNER_HOST" "mkdir -p '$HETZNER_DEST'"

# Resume support: ensure the bookkeeping files exist before they are read.
touch "$DONE_FILE" "$FAILED_FILE"
# -----------------------------------------------------------------------------
# STEP 2 — Build the full table list from the basedosdados project
#
# Datasets and tables are auto-discovered via the BQ API, so no hardcoded
# list is needed and tables added after tables-summary.md was written are
# still picked up.
# -----------------------------------------------------------------------------
log "Discovering all datasets in project: $SOURCE_PROJECT ..."
TABLE_LIST_FILE="all_tables.txt"
if [[ ! -f "$TABLE_LIST_FILE" ]]; then
  # BUG FIX: `bq ls` returns at most 50 entries by default; without
  # --max_results most datasets/tables were silently dropped. stderr is no
  # longer discarded so auth/quota errors are visible.
  bq ls --project_id="$SOURCE_PROJECT" --max_results=100000 --format=json \
    | python3 -c "
import json, sys
raw = sys.stdin.read().strip()
for ds in (json.loads(raw) if raw else []):
    print(ds['datasetReference']['datasetId'])
" > /tmp/datasets.txt
  log "Found $(wc -l < /tmp/datasets.txt) datasets. Listing tables..."
  : > "$TABLE_LIST_FILE"
  while IFS= read -r dataset; do
    bq ls \
      --project_id="$SOURCE_PROJECT" \
      --dataset_id="$SOURCE_PROJECT:$dataset" \
      --max_results=100000 \
      --format=json \
      | python3 -c "
import json, sys
raw = sys.stdin.read().strip()
for t in (json.loads(raw) if raw else []):
    ref = t.get('tableReference', {})
    if t.get('type') in ('TABLE', 'EXTERNAL'):  # skip views
        print(ref['datasetId'] + '.' + ref['tableId'])
" >> "$TABLE_LIST_FILE"
  done < /tmp/datasets.txt
  log "Total tables discovered: $(wc -l < "$TABLE_LIST_FILE")"
else
  log "Reusing existing table list: $TABLE_LIST_FILE ($(wc -l < "$TABLE_LIST_FILE") tables)"
fi
# -----------------------------------------------------------------------------
# STEP 3 — Export function (called in parallel)
# -----------------------------------------------------------------------------
#######################################
# Export one BigQuery table to GCS as zstd-compressed Parquet, with retries.
# Arguments:
#   $1 - table as "dataset.table_id"
#   $2 - GCS bucket name
#   $3 - billing project id
#   $4 - source (public) project id
#   $5 - done-list file   $6 - failed-list file   $7 - log file
# Returns:
#   0 always; failures are recorded in the failed-list file so one bad table
#   does not abort the parallel run.
#######################################
export_table() {
  local table="$1"
  local bucket="$2"
  local project="$3"
  local source="$4"
  local done_file="$5"
  local failed_file="$6"
  local log_file="$7"

  # Resume support: skip tables already recorded as exported.
  if grep -qxF "$table" "$done_file" 2>/dev/null; then
    echo "[SKIP] $table (already exported)" >> "$log_file"
    return 0
  fi

  # Split "dataset.table" via parameter expansion (no cut subshells).
  local dataset="${table%%.*}"
  local table_id="${table#*.}"
  local gcs_prefix="gs://$bucket/$dataset/$table_id"
  echo "[START] Exporting $source:$table → $gcs_prefix/*.parquet" >> "$log_file"

  # Run bq extract with up to 3 attempts and linear backoff.
  # BUG FIX: the previous --source_format=PARQUET is not a `bq extract`
  # option (it belongs to `bq load`); the output format is controlled solely
  # by --destination_format.
  local attempt success=false
  for attempt in 1 2 3; do
    if bq extract \
      --project_id="$project" \
      --location=US \
      --destination_format=PARQUET \
      --compression=ZSTD \
      "${source}:${dataset}.${table_id}" \
      "${gcs_prefix}/*.parquet" \
      >> "$log_file" 2>&1; then
      success=true
      break
    fi
    echo "[RETRY $attempt/3] $table" >> "$log_file"
    sleep $((attempt * 10))
  done

  if $success; then
    echo "$table" >> "$done_file"
    echo "[DONE] $table" >> "$log_file"
  else
    echo "$table" >> "$failed_file"
    echo "[FAIL] $table after 3 attempts" >> "$log_file"
  fi
}
export -f export_table
# -----------------------------------------------------------------------------
# STEP 4 — Run exports in parallel
# -----------------------------------------------------------------------------
log "Starting parallel exports ($PARALLEL_EXPORTS workers)..."
log "Progress is logged to: $LOG_FILE"
log "Failed tables will be written to: $FAILED_FILE"

# Feed only not-yet-done tables to parallel (comm needs both inputs sorted).
# --bar alone: passing --progress as well prints two competing displays.
comm -23 \
  <(sort "$TABLE_LIST_FILE") \
  <(sort "$DONE_FILE") \
  | parallel \
      --jobs "$PARALLEL_EXPORTS" \
      --bar \
      export_table {} "$BUCKET_NAME" "$YOUR_PROJECT" "$SOURCE_PROJECT" \
        "$DONE_FILE" "$FAILED_FILE" "$LOG_FILE"

TOTAL=$(wc -l < "$TABLE_LIST_FILE")
DONE=$(wc -l < "$DONE_FILE")
FAILED=$(wc -l < "$FAILED_FILE")
log "Export phase complete: $DONE/$TOTAL done, $FAILED failed"
if [[ $FAILED -gt 0 ]]; then
  log "Failed tables:"
  tee -a "$LOG_FILE" < "$FAILED_FILE"
  # BUG FIX: the script has no --retry-failed flag; the accurate retry path
  # is simply re-running it — tables in DONE_FILE are skipped automatically.
  log "To retry failed tables, clear $FAILED_FILE and re-run: bash $0"
fi
# -----------------------------------------------------------------------------
# STEP 5 — Download from GCS to Hetzner via rsync over SSH
#
# Strategy: download dataset-by-dataset to keep memory usage low and allow
# resuming. gsutil cannot rsync directly over SSH, so each dataset is staged
# under /tmp and then rsynced to Hetzner.
# -----------------------------------------------------------------------------
log "Starting transfer to Hetzner ($HETZNER_HOST)..."

#######################################
# Mirror one dataset: GCS → local staging dir → Hetzner (rsync over SSH).
# Arguments:
#   $1 - dataset name   $2 - GCS bucket   $3 - ssh user   $4 - ssh host
#   $5 - remote destination dir   $6 - log file
# Returns:
#   0 on success; non-zero if either transfer leg fails, in which case the
#   staging dir is KEPT so a re-run can resume it.
#######################################
download_dataset() {
  local dataset="$1"
  local bucket="$2"
  local hetzner_user="$3"
  local hetzner_host="$4"
  local hetzner_dest="$5"
  local log_file="$6"

  echo "[TRANSFER] gs://$bucket/$dataset/ → $hetzner_host:$hetzner_dest/$dataset/" >> "$log_file"
  local local_stage="/tmp/bd_stage/$dataset"
  mkdir -p "$local_stage"

  # BUG FIX: parallel runs this function via `bash -c` without `set -e`, so
  # each step must be checked explicitly — previously a failed gsutil/rsync
  # still deleted the staging dir and logged [TRANSFERRED].
  if ! gsutil -m rsync -r "gs://$bucket/$dataset/" "$local_stage/" >> "$log_file" 2>&1; then
    echo "[FAIL] gsutil rsync for $dataset (staging kept for retry)" >> "$log_file"
    return 1
  fi

  ssh "$hetzner_user@$hetzner_host" "mkdir -p '$hetzner_dest/$dataset'"
  if ! rsync -az --progress \
      "$local_stage/" \
      "$hetzner_user@$hetzner_host:$hetzner_dest/$dataset/" \
      >> "$log_file" 2>&1; then
    echo "[FAIL] rsync to Hetzner for $dataset (staging kept for retry)" >> "$log_file"
    return 1
  fi

  # Remove staging only after both legs succeeded. The :? expansion aborts
  # instead of letting an empty variable become `rm -rf /`.
  rm -rf -- "${local_stage:?}"
  echo "[TRANSFERRED] $dataset" >> "$log_file"
}
export -f download_dataset
# Get the list of exported datasets and mirror each one in parallel.
# BUG FIX: `parallel` exits non-zero if any job fails; under `set -e` that
# aborted the script before the verification step ever ran. Handle the
# status explicitly so verification can still report what is missing.
if gsutil ls "gs://$BUCKET_NAME/" \
    | sed 's|gs://[^/]*/||;s|/||' \
    | sort -u \
    | parallel \
        --jobs "$PARALLEL_UPLOADS" \
        --progress \
        download_dataset {} "$BUCKET_NAME" "$HETZNER_USER" "$HETZNER_HOST" \
          "$HETZNER_DEST" "$LOG_FILE"; then
  log "Transfer complete."
else
  log_err "One or more dataset transfers failed — see $LOG_FILE; re-run to retry."
fi
# -----------------------------------------------------------------------------
# STEP 6 — Verify file counts on Hetzner vs GCS
# -----------------------------------------------------------------------------
log "Verifying file counts..."
# BUG FIX: grep exits 1 when nothing matches; combined with `set -e` and
# `pipefail` an empty bucket killed the script here. grep -c prints the count
# and `|| true` tolerates the zero-match status.
GCS_COUNT=$(gsutil ls -r "gs://$BUCKET_NAME/**" | grep -c '\.parquet$' || true)
HETZNER_COUNT=$(ssh "$HETZNER_USER@$HETZNER_HOST" "find '$HETZNER_DEST' -name '*.parquet' | wc -l")
log "GCS parquet files: $GCS_COUNT"
log "Hetzner parquet files: $HETZNER_COUNT"
if [[ "$GCS_COUNT" -eq "$HETZNER_COUNT" ]]; then
  log "File counts match. Transfer verified."
else
  log_err "Count mismatch! GCS=$GCS_COUNT Hetzner=$HETZNER_COUNT"
  log_err "Run rsync again or check $LOG_FILE for errors."
fi
# -----------------------------------------------------------------------------
# STEP 7 — Clean up GCS bucket to stop storage charges
# -----------------------------------------------------------------------------
# Default to keeping the bucket when stdin is not a TTY — otherwise `read`
# fails and `set -e` kills the script before the final summary (e.g. under
# cron/nohup).
confirm="n"
read -rp "Delete GCS bucket gs://$BUCKET_NAME to stop storage charges? [y/N] " confirm || confirm="n"
if [[ "$confirm" =~ ^[Yy]$ ]]; then
  log "Deleting bucket gs://$BUCKET_NAME ..."
  # BUG FIX: `gsutil rm -r` on a bucket URL removes all objects AND the
  # bucket itself; the old follow-up `gsutil rb` then failed on the missing
  # bucket and `set -e` aborted before the closing log lines.
  gsutil -m rm -r "gs://$BUCKET_NAME"
  log "Bucket deleted. Storage charges stopped."
else
  log "Bucket kept. Remember to delete it later: gsutil -m rm -r gs://$BUCKET_NAME"
fi
log "All done! Data is at $HETZNER_HOST:$HETZNER_DEST"
log "Total exported: $DONE tables | Failed: $FAILED tables"
log "See $LOG_FILE for full details."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment