Created
December 6, 2017 21:26
-
-
Save thekensta/104e665d3b79becbd57286d14f571998 to your computer and use it in GitHub Desktop.
Load and extract DATETIME and TIMESTAMP values in BigQuery.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Explore how BigQuery loads, queries, and extracts DATETIME vs
# TIMESTAMP values, including JSON and Avro round-trips.
# Requires the bq, gsutil and avro-tools CLIs on PATH.
#
# NOTE: 'set -e' is deliberately NOT used -- several load steps below
# are expected to fail, and the script must continue past them.
set -u

DATASET="<YOUR DATASET HERE>"       # target BigQuery dataset
TABLE="fun_with_time"               # scratch table, dropped and recreated
readonly SCHEMA_FILE="/tmp/ts-schema.json"
readonly DATA_FILE="/tmp/ts-data.ndjson"
readonly SQL_FILE="/tmp/query.sql"
BUCKET="<YOUR BUCKET HERE>"         # GCS bucket used by bq extract
# Write the table schema: a label column plus one DATETIME and one
# TIMESTAMP column, to compare how each type treats the sample values.
cat <<EOM >"${SCHEMA_FILE}"
[
{"name": "name", "type": "string"},
{"name": "dt", "type": "datetime"},
{"name": "ts", "type": "timestamp"}
]
EOM

# Recreate the demo table from scratch (-f: no prompt, -t: table).
bq rm -f "${DATASET}.${TABLE}"
bq mk -tf "${DATASET}.${TABLE}" "${SCHEMA_FILE}"
# Sample rows exercising the input formats BigQuery may (or may not)
# accept for DATETIME (dt) and TIMESTAMP (ts). Rejected rows are kept
# on purpose: they count against --max_bad_records at load time.
cat <<EOM >"${DATA_FILE}"
{"name": "dt string - 1", "dt": "2017-12-01T12:34:56.789012", "ts": null}
{"name": "dt string - 2", "dt": "2017-12-01T12:34:56.789012+06:00", "ts": null}
{"name": "dt string - 3", "dt": "2017-12-01 12:34:56.789012", "ts": null}
{"name": "dt float - 1", "dt": "1512131696.789012", "ts": null}
{"name": "ts string - 1", "dt": null, "ts": "2017-12-01T12:34:56.789012"}
{"name": "ts float seconds", "dt": null, "ts": 1512131696.789012}
{"name": "ts int seconds", "dt": null, "ts": 1512131696}
{"name": "ts string seconds", "dt": null, "ts": "1512131696"}
{"name": "ts float seconds", "dt": null, "ts": "1512131696.789"}
{"name": "ts string + tz", "dt": null, "ts": "2017-12-01T12:34:56.789012+06:00"}
{"name": "ts string + z", "dt": null, "ts": "2017-12-01T12:34:56.789012Z"}
{"name": "ts string + tz name", "dt": null, "ts": "2017-12-01T12:34:56.789012 Europe/London"}
{"name": "ts string + utc", "dt": null, "ts": "2017-12-01T12:34:56.789012 UTC"}
EOM

# Load-time rejections observed for the rows above:
# [query] Invalid datetime string "2017-12-01T12:34:56.789012+06:00" Field: dt; Value: 2017-12-01T12:34:56.789012+06:00
# [query] Invalid datetime string "1512131696.789012" Field: dt; Value: 1512131696.789012
# [mediaupload-snapshot] Error while reading data, error message: JSON parsing error in
# row starting at position 772: Couldn't convert value to timestamp:
# Unrecognized timezone: Europe/London Field: ts; Value: 2017-12-01T12:34:56.789012 Europe/London
echo "--- Following data manually excluded as crashed the load process ---"
# These rows are NOT in the data file: they crash the entire load
# (rather than counting as bad records). Millisecond/microsecond epoch
# integers are interpreted as SECONDS, producing out-of-range timestamps.
echo '{"name": "ts int millis", "dt": null, "ts": 1496316896789}'
echo '{"name": "ts int millis", "dt": null, "ts": 1512131696789}'
echo '{"name": "ts int micros", "dt": null, "ts": 1512131696789012}'
echo " --> The range of valid timestamp values is [0001-01-1 00:00:00, 9999-12-31 23:59:59.999999];"
echo " -------------------------------------------------------------------"
#
# Error in query string: Error processing job '<JOB_ID>':
# Cannot return an invalid timestamp value of -501317255171232512 microseconds relative to the Unix
# epoch. The range of valid timestamp values is [0001-01-1 00:00:00, 9999-12-31 23:59:59.999999];
# error in writing field ts
# Load the NDJSON sample data; tolerate up to 10 rejected rows so the
# known-bad DATETIME/timezone rows do not abort the whole load.
bq load --source_format=NEWLINE_DELIMITED_JSON \
  --replace \
  --max_bad_records=10 \
  "${DATASET}.${TABLE}" "${DATA_FILE}"
# Query the loaded rows, converting each TIMESTAMP back to epoch
# millis/micros to show what precision survived the load.
cat <<EOM >"${SQL_FILE}"
select name, dt, ts, unix_millis(ts) as ms,
unix_micros(ts) as micros
from ${DATASET}.${TABLE}
EOM
bq query --use_legacy_sql=False <"${SQL_FILE}"
# Observed output:
# +-------------------+----------------------------+---------------------+---------------+------------------+
# | name              | dt                         | ts                  | ms            | micros           |
# +-------------------+----------------------------+---------------------+---------------+------------------+
# | dt string - 3     | 2017-12-01T12:34:56.789012 | NULL                | NULL          | NULL             |
# | dt string - 1     | 2017-12-01T12:34:56.789012 | NULL                | NULL          | NULL             |
# | ts string seconds | NULL                       | 2017-12-01 12:34:56 | 1512131696000 | 1512131696000000 |
# | ts int seconds    | NULL                       | 2017-12-01 12:34:56 | 1512131696000 | 1512131696000000 |
# | ts float seconds  | NULL                       | 2017-12-01 12:34:56 | 1512131696789 | 1512131696789000 |
# | ts string + utc   | NULL                       | 2017-12-01 12:34:56 | 1512131696789 | 1512131696789012 |
# | ts float seconds  | NULL                       | 2017-12-01 12:34:56 | 1512131696789 | 1512131696789012 |
# | ts string + z     | NULL                       | 2017-12-01 12:34:56 | 1512131696789 | 1512131696789012 |
# | ts string - 1     | NULL                       | 2017-12-01 12:34:56 | 1512131696789 | 1512131696789012 |
# | ts string + tz    | NULL                       | 2017-12-01 06:34:56 | 1512110096789 | 1512110096789012 |
# +-------------------+----------------------------+---------------------+---------------+------------------+
# Export as NDJSON. Per the observed output below: DATETIME keeps its
# ISO form, TIMESTAMP is rendered as "YYYY-MM-DD hh:mm:ss[.ffffff] UTC",
# and NULL fields are omitted entirely.
bq extract --destination_format=NEWLINE_DELIMITED_JSON \
  "${DATASET}.${TABLE}" "gs://${BUCKET}/${TABLE}.json"
gsutil cp "gs://${BUCKET}/${TABLE}.json" /tmp/
# {"name":"dt string - 3","dt":"2017-12-01T12:34:56.789012"}
# {"name":"dt string - 1","dt":"2017-12-01T12:34:56.789012"}
# {"name":"ts string seconds","ts":"2017-12-01 12:34:56 UTC"}
# {"name":"ts int seconds","ts":"2017-12-01 12:34:56 UTC"}
# {"name":"ts float seconds","ts":"2017-12-01 12:34:56.789 UTC"}
# {"name":"ts float seconds","ts":"2017-12-01 12:34:56.789012 UTC"}
# {"name":"ts string + utc","ts":"2017-12-01 12:34:56.789012 UTC"}
# {"name":"ts string + z","ts":"2017-12-01 12:34:56.789012 UTC"}
# {"name":"ts string - 1","ts":"2017-12-01 12:34:56.789012 UTC"}
# {"name":"ts string + tz","ts":"2017-12-01 06:34:56.789012 UTC"}
# Avro timestamp work-around:
# https://stackoverflow.com/questions/40651878/compatibility-of-avro-dates-and-times-with-bigquery
bq extract --destination_format=AVRO \
  "${DATASET}.${TABLE}" "gs://${BUCKET}/${TABLE}.avro"
gsutil cp "gs://${BUCKET}/${TABLE}.avro" /tmp/
# Per the schema dump below, the Avro export loses the BigQuery types:
# DATETIME becomes a plain string and TIMESTAMP becomes a long
# (epoch microseconds, per the tojson output).
avro-tools getschema "/tmp/${TABLE}.avro"
# {
#   "type" : "record",
#   "name" : "Root",
#   "fields" : [ {
#     "name" : "name",
#     "type" : [ "null", "string" ]
#   }, {
#     "name" : "dt",
#     "type" : [ "null", "string" ]
#   }, {
#     "name" : "ts",
#     "type" : [ "null", "long" ]
#   } ]
# }
avro-tools tojson "/tmp/${TABLE}.avro"
# {"name":{"string":"dt string - 3"},"dt":{"string":"2017-12-01T12:34:56.789012"},"ts":null}
# {"name":{"string":"dt string - 1"},"dt":{"string":"2017-12-01T12:34:56.789012"},"ts":null}
# {"name":{"string":"ts string seconds"},"dt":null,"ts":{"long":1512131696000000}}
# {"name":{"string":"ts int seconds"},"dt":null,"ts":{"long":1512131696000000}}
# {"name":{"string":"ts float seconds"},"dt":null,"ts":{"long":1512131696789000}}
# {"name":{"string":"ts float seconds"},"dt":null,"ts":{"long":1512131696789012}}
# {"name":{"string":"ts string + utc"},"dt":null,"ts":{"long":1512131696789012}}
# {"name":{"string":"ts string + z"},"dt":null,"ts":{"long":1512131696789012}}
# {"name":{"string":"ts string - 1"},"dt":null,"ts":{"long":1512131696789012}}
# {"name":{"string":"ts string + tz"},"dt":null,"ts":{"long":1512110096789012}}
echo "-------- Trying to Reload Exported AVRO to Big Query with Replace -------"
echo " --- Current Schema ---"
bq show "${DATASET}.${TABLE}"
# Last modified      Schema             Total Rows   Total Bytes   Expiration   Time Partitioning   Labels   kmsKeyName
# ----------------- ------------------ ------------ ------------- ------------ ------------------- -------- ------------
# 06 Dec 21:16:06   |- name: string    0            0
#                   |- dt: datetime
#                   |- ts: timestamp

# Re-load the extracted Avro. With --replace, bq infers a NEW schema
# from the Avro file, silently changing dt -> string and ts -> integer
# (see the 'bq show' output below).
bq load --source_format=AVRO \
  --replace \
  --max_bad_records=10 \
  "${DATASET}.${TABLE}" "/tmp/${TABLE}.avro"
echo " --- Current Schema ---"
bq show "${DATASET}.${TABLE}"
# Schema has changed to string, string, integer from the AVRO
#
# Last modified      Schema            Total Rows   Total Bytes   Expiration   Time Partitioning   Labels   kmsKeyName
# ----------------- ----------------- ------------ ------------- ------------ ------------------- -------- ------------
# 06 Dec 21:00:38   |- name: string   10           284
#                   |- dt: string
#                   |- ts: integer
echo "-------- Querying Replaced Data ---------"
bq query --use_legacy_sql=False <"${SQL_FILE}"
# Error in query string: Error processing job '<JOB_ID>': No matching signature for
# function UNIX_MILLIS for argument types: INT64. Supported signature: UNIX_MILLIS(TIMESTAMP) at [1:22]
echo "-------- Recreating Table ---------"
# Start over with the original DATETIME/TIMESTAMP schema.
bq rm -f "${DATASET}.${TABLE}"
bq mk -t "${DATASET}.${TABLE}" "${SCHEMA_FILE}"

echo "-------- Reloading AVRO Data WITHOUT replace ---------"
# Expected to fail: the Avro-inferred schema (dt: string) conflicts
# with the existing table schema (dt: datetime).
bq load --source_format=AVRO \
  --max_bad_records=10 \
  "${DATASET}.${TABLE}" "/tmp/${TABLE}.avro"
# BigQuery error in load operation: Error processing job 'hx-trial:bqjob_ra05543495ed27af_000001602d98d9fd_1': Provided Schema
# does not match Table hx-trial:chrisk.fun_with_time. Field dt has changed type from DATETIME to STRING.

echo "-------- Reloading AVRO Data WITH EXPLICIT SCHEMA ---------"
# Also fails at query time: the Avro 'ts' long cannot be coerced, and
# 'dt' arrives as STRING where the table expects DATETIME.
bq load --source_format=AVRO \
  --max_bad_records=10 \
  "${DATASET}.${TABLE}" \
  "/tmp/${TABLE}.avro" \
  "${SCHEMA_FILE}"
# Error in query string: Error processing job '<JOB_ID>': Cannot read tablet :
# Incompatible types. 'dt' : STRING 'dt' : INT64
bq query --use_legacy_sql=False <"${SQL_FILE}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.