Last active
November 25, 2016 02:48
-
-
Save nownabe/e2ebcda4b576411d8aea31270d289c32 to your computer and use it in GitHub Desktop.
Guess BigQuery Schema from JSON Lines
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# frozen_string_literal: true | |
# Guess BigQuery table schema automatically from JSON Lines | |
# | |
# example: | |
# $ cat source.jsonl | |
# {"required":123,"optional":true,"nested":{"required":1234,"optional":"yes"},"array":[0,1,2,3,4]} | |
# {"required":456,"optional":false,"nested":{"required":1234,"optional":"yes","nested":{"prop":1}},"array":[5,6,7,8,9]} | |
# {"required":789,"nested":{"required":1234,"optional":"yes","additional":"added"},"array":[]} | |
# | |
# $ ./guess_bq_schema.rb source.jsonl | |
# [ | |
# { | |
# "name": "required", | |
# "type": "INTEGER", | |
# "mode": "REQUIRED" | |
# }, | |
# { | |
# "name": "optional", | |
# "type": "BOOLEAN", | |
# "mode": "NULLABLE" | |
# }, | |
# { | |
# "name": "nested", | |
# "type": "RECORD", | |
# "mode": "REQUIRED", | |
# "fields": [ | |
# { | |
# "name": "required", | |
# "type": "INTEGER", | |
# "mode": "REQUIRED" | |
# }, | |
# { | |
# "name": "optional", | |
# "type": "STRING", | |
# "mode": "REQUIRED" | |
# }, | |
# { | |
# "name": "nested", | |
# "type": "RECORD", | |
# "mode": "NULLABLE", | |
# "fields": [ | |
# { | |
# "name": "prop", | |
# "type": "INTEGER", | |
# "mode": "REQUIRED" | |
# } | |
# ] | |
# }, | |
# { | |
# "name": "additional", | |
# "type": "STRING", | |
# "mode": "NULLABLE" | |
# } | |
# ] | |
# }, | |
# { | |
# "name": "array", | |
# "type": "INTEGER", | |
# "mode": "REPEATED" | |
# } | |
# ] | |
require "json" | |
class Field | |
attr_reader :name, :mode, :type | |
def initialize(name) | |
@name = name | |
@mode = :required | |
end | |
def repeated! | |
@mode = :repeated | |
end | |
def repeated? | |
@mode == :repeated | |
end | |
def nullable! | |
@mode = :nullable | |
end | |
def to_schema | |
{ | |
name: name, | |
type: type.to_s.upcase, | |
mode: mode.to_s.upcase | |
} | |
end | |
end | |
class StringField < Field | |
def initialize(name) | |
super | |
@type = :string | |
end | |
end | |
class IntegerField < Field | |
def initialize(name) | |
super | |
@type = :integer | |
end | |
end | |
class FloatField < Field | |
def initialize(name) | |
super | |
@type = :float | |
end | |
end | |
class BooleanField < Field | |
def initialize(name) | |
super | |
@type = :boolean | |
end | |
end | |
class NullField < Field | |
def initialize(name) | |
super | |
@type = :string | |
end | |
end | |
class RecordField < Field | |
attr_reader :fields | |
def initialize(name, fields) | |
super(name) | |
@type = :record | |
@fields = fields | |
end | |
def nullable! | |
super | |
end | |
def to_schema | |
super.merge(fields: fields.to_schema) | |
end | |
end | |
class Schema | |
def initialize(record) | |
@schema = guess(record) | |
end | |
def guess(record) | |
record.each_with_object({}) do |(key, value), schema| | |
schema[key] = _guess(key, value) | |
end | |
end | |
def _guess(key, value) | |
case value | |
when String | |
StringField.new(key) | |
when Integer | |
IntegerField.new(key) | |
when Float | |
FloatField.new(key) | |
when TrueClass, FalseClass | |
BooleanField.new(key) | |
when Hash | |
RecordField.new(key, self.class.new(value)) | |
when Array | |
f = _guess(key, value.first) | |
f.repeated! | |
f | |
when NilClass | |
NullField.new(key) | |
end | |
end | |
def null?(key) | |
[email protected]?(key) || @schema[key].is_a?(NullField) | |
end | |
def key?(key) | |
@schema.key?(key) | |
end | |
def [](key) | |
@schema[key] | |
end | |
def keys | |
@schema.keys | |
end | |
def merge!(other) | |
@schema.each do |key, val| | |
if other.null?(key) | |
val.nullable! unless other[key]&.repeated? | |
else | |
if val.is_a?(NullField) | |
@schema[key] = other[key] | |
@schema[key].nullable! unless other[key].repeated? | |
end | |
@schema[key].fields.merge!(other[key].fields) if val.is_a?(RecordField) && other[key].is_a?(RecordField) | |
end | |
end | |
(other.keys - keys).each do |key| | |
@schema[key] = other[key] | |
@schema[key].nullable! | |
end | |
end | |
def nullable! | |
@schema.each_value(&:nullable!) | |
end | |
def to_schema | |
@schema.values.map(&:to_schema) | |
end | |
end | |
input_file = File.expand_path(ARGV[0]) | |
schema = nil | |
File.read(input_file).each_line do |line| | |
record = JSON.parse(line) | |
if schema | |
schema.merge!(Schema.new(record)) | |
else | |
schema = Schema.new(record) | |
end | |
end | |
puts JSON.pretty_generate(schema.to_schema) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment