Skip to content

Instantly share code, notes, and snippets.

@nownabe
Last active November 25, 2016 02:48
Show Gist options
  • Save nownabe/e2ebcda4b576411d8aea31270d289c32 to your computer and use it in GitHub Desktop.
Save nownabe/e2ebcda4b576411d8aea31270d289c32 to your computer and use it in GitHub Desktop.
Guess BigQuery Schema from JSON Lines
#!/usr/bin/env ruby
# frozen_string_literal: true
# Guess BigQuery table schema automatically from JSON Lines
#
# example:
# $ cat source.jsonl
# {"required":123,"optional":true,"nested":{"required":1234,"optional":"yes"},"array":[0,1,2,3,4]}
# {"required":456,"optional":false,"nested":{"required":1234,"optional":"yes","nested":{"prop":1}},"array":[5,6,7,8,9]}
# {"required":789,"nested":{"required":1234,"optional":"yes","additional":"added"},"array":[]}
#
# $ ./guess_bq_schema.rb source.jsonl
# [
# {
# "name": "required",
# "type": "INTEGER",
# "mode": "REQUIRED"
# },
# {
# "name": "optional",
# "type": "BOOLEAN",
# "mode": "NULLABLE"
# },
# {
# "name": "nested",
# "type": "RECORD",
# "mode": "REQUIRED",
# "fields": [
# {
# "name": "required",
# "type": "INTEGER",
# "mode": "REQUIRED"
# },
# {
# "name": "optional",
# "type": "STRING",
# "mode": "REQUIRED"
# },
# {
# "name": "nested",
# "type": "RECORD",
# "mode": "NULLABLE",
# "fields": [
# {
# "name": "prop",
# "type": "INTEGER",
# "mode": "REQUIRED"
# }
# ]
# },
# {
# "name": "additional",
# "type": "STRING",
# "mode": "NULLABLE"
# }
# ]
# },
# {
# "name": "array",
# "type": "INTEGER",
# "mode": "REPEATED"
# }
# ]
require "json"
class Field
attr_reader :name, :mode, :type
def initialize(name)
@name = name
@mode = :required
end
def repeated!
@mode = :repeated
end
def repeated?
@mode == :repeated
end
def nullable!
@mode = :nullable
end
def to_schema
{
name: name,
type: type.to_s.upcase,
mode: mode.to_s.upcase
}
end
end
class StringField < Field
def initialize(name)
super
@type = :string
end
end
class IntegerField < Field
def initialize(name)
super
@type = :integer
end
end
class FloatField < Field
def initialize(name)
super
@type = :float
end
end
class BooleanField < Field
def initialize(name)
super
@type = :boolean
end
end
class NullField < Field
def initialize(name)
super
@type = :string
end
end
class RecordField < Field
attr_reader :fields
def initialize(name, fields)
super(name)
@type = :record
@fields = fields
end
def nullable!
super
end
def to_schema
super.merge(fields: fields.to_schema)
end
end
class Schema
def initialize(record)
@schema = guess(record)
end
def guess(record)
record.each_with_object({}) do |(key, value), schema|
schema[key] = _guess(key, value)
end
end
def _guess(key, value)
case value
when String
StringField.new(key)
when Integer
IntegerField.new(key)
when Float
FloatField.new(key)
when TrueClass, FalseClass
BooleanField.new(key)
when Hash
RecordField.new(key, self.class.new(value))
when Array
f = _guess(key, value.first)
f.repeated!
f
when NilClass
NullField.new(key)
end
end
def null?(key)
[email protected]?(key) || @schema[key].is_a?(NullField)
end
def key?(key)
@schema.key?(key)
end
def [](key)
@schema[key]
end
def keys
@schema.keys
end
def merge!(other)
@schema.each do |key, val|
if other.null?(key)
val.nullable! unless other[key]&.repeated?
else
if val.is_a?(NullField)
@schema[key] = other[key]
@schema[key].nullable! unless other[key].repeated?
end
@schema[key].fields.merge!(other[key].fields) if val.is_a?(RecordField) && other[key].is_a?(RecordField)
end
end
(other.keys - keys).each do |key|
@schema[key] = other[key]
@schema[key].nullable!
end
end
def nullable!
@schema.each_value(&:nullable!)
end
def to_schema
@schema.values.map(&:to_schema)
end
end
input_file = File.expand_path(ARGV[0])
schema = nil
File.read(input_file).each_line do |line|
record = JSON.parse(line)
if schema
schema.merge!(Schema.new(record))
else
schema = Schema.new(record)
end
end
puts JSON.pretty_generate(schema.to_schema)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment