Skip to content

Instantly share code, notes, and snippets.

@oyvindholmstad
Last active September 21, 2018 06:54
Show Gist options
  • Save oyvindholmstad/f5ecd3dcf4e471f77907 to your computer and use it in GitHub Desktop.
Save oyvindholmstad/f5ecd3dcf4e471f77907 to your computer and use it in GitHub Desktop.
BigQuery JSON schema generator in Javascript and Scala
/*
A script to generate a Google BigQuery-complient JSON-schema from a JSON object.
Make sure the JSON object is complete before generating, null values will be skipped.
References:
https://cloud.google.com/bigquery/docs/data
https://cloud.google.com/bigquery/docs/personsDataSchema.json
https://gist.github.com/igrigorik/83334277835625916cd6
... and a couple of visits to StackOverflow
To run:
nodejs schema-generator.js
Output:
schema.json
*/
var fs = require('fs')
function readJson(filename, callback) {
fs.readFile(filename, 'utf8', function (err,data) {
if (err) {
return callback(undefined);
}
callback(data);
});
}
function writeJsonSchema(data) {
fs.writeFile("schema.json", data, function(err) {
if(err) {}
console.log("The schema was saved to schema.json");
});
}
function isObject(obj) { return obj.constructor === {}.constructor }
function isString(obj) { return obj.constructor === "test".constructor }
function isArray(obj) { return obj.constructor === [].constructor }
function isNumber(obj) { return !isNaN(parseFloat(obj)) }
function isBoolean(obj) { return typeof(obj) == "boolean" }
function getType(object) {
if (object === null) {
return "STRING";
} else if (object === undefined) {
return "undefined";
} else if (isString(object)) {
try {
/* Check if string is a date string
The length check is present cause we only want to accept ISO strings with both time and date.
This also avoids a string like "2014" to be interpreted as a timestamp */
var tryDate = new Date(object);
if (object.length > 18 && !isNaN(tryDate.getTime())) {
return "TIMESTAMP";
} else {
return "STRING";
}
}
catch(err) {
return "STRING";
}
} else if (isArray(object)) {
return "Array";
} else if (isObject(object)) {
return "Object";
} else if (isNumber(object)) {
if (object.toString().indexOf('.') > 0 && object.toString().indexOf(',') > 0) {
return "FLOAT";
} else {
return "INTEGER";
}
} else if (isBoolean(object)) {
return "BOOLEAN";
} else {
return undefined;
}
}
function createField(type, name, mode) {
if (mode) {
return {
name: name,
type: type,
mode: mode
};
}
return {
name: name,
type: type
};
}
function traverse(fields, o) {
for (i in o) {
var name = i;
var type = getType(o[i]);
if (type == 'null') {
// Skip empty fields.
}
else if (type == "Array")
{
var field = traverseArray(name, [], o[i]);
if (field != undefined)
fields.push(field);
}
else if (type == "Object")
{
var field = createField("record", name, undefined);
field.fields = traverse([], o[i]);
fields.push(field);
} else {
fields.push(createField(type, name, undefined));
}
}
return fields;
}
function traverseArray(name, fields, o) {
if (o.length > 0) {
// We only want to look at the first item in the array.
// The rest should be just like it, and will be expressed through mode=repeated.
var firstElement = o[0];
var type = getType(firstElement);
if (type == 'Object') {
// If the array has an object as the first element, we need to create a record type with more fields
var field = createField("record", name, "repeated");
field.fields = traverse(fields, firstElement);
return field;
} else {
// If the array only has native types, we only created a simple repeated field
var field = createField(type, name, "repeated");
return field;
}
}
return undefined;
}
// RUN
var filename = 'data.json'
var arguments = process.argv.slice(2);
if (arguments.length > 0) {
filename = arguments[0];
}
readJson(filename, function(data) {
if (data != undefined) {
data = JSON.parse(data);
fields = [];
traverse(fields, data);
writeJsonSchema(JSON.stringify(fields));
} else {
console.log("Unable to read file " + filename);
}
});
/*
Generate a Google BigQuery-complient JSON-schema from a JsValue (Play framework) object.
References:
https://cloud.google.com/bigquery/docs/data
https://cloud.google.com/bigquery/docs/personsDataSchema.json
https://gist.github.com/igrigorik/83334277835625916cd6
... and a couple of visits to StackOverflow
*/
import org.joda.time.DateTime
import play.api.libs.json._
import scala.collection.mutable.ArrayBuffer
/**
* Created by Oyvind Holmstad on 27.01.2016.
*/
object BigQuerySchemaGenerator {
def schemaFromJsValue(json: JsValue) : JsValue = {
val jsonObject = json.asInstanceOf[JsObject]
val result: Seq[JsValue] = traverse(jsonObject.fields)
Json.toJson(result)
}
def traverse(fields: Seq[(String, JsValue)]): Seq[JsValue] = {
fields.map(f => {
val name = f._1
val fieldType = f._2
fieldType match {
case x: JsArray => traverseArray(name, x)
case x: JsObject => createRecordField(name, "nullable", traverse(x.fields)).asInstanceOf[JsValue]
case default => createValueField(name, default, "nullable")
}
})
}
def traverseArray(name: String, array: JsArray) : JsValue = {
val first = array.productIterator.toList.head.asInstanceOf[ArrayBuffer[JsValue]].head
first match {
case x: JsArray => JsNull
case x: JsObject => createRecordField(name, "repeated", traverse(x.fields))
case default => createValueField(name, default, "nullable")
}
}
def createValueField(name: String, fieldType: JsValue, mode: String): JsValue = {
fieldType match {
case x: JsNumber => {
if (!x.value.toString().contains(".")) {
createField(name, "INTEGER", mode)
}
else {
createField(name, "FLOAT", mode)
}
}
case x: JsString => {
// Strings on ISO_8601 format will be interpreted as timestamps
if (x.value.length == 29) {
try {
val possibleDate = new DateTime(x.value)
createField(name, "TIMESTAMP", mode)
} catch {
case e: Exception => createField(name, "STRING", mode)
}
} else {
createField(name, "STRING", mode)
}
}
case x: JsBoolean => {
createField(name, "BOOLEAN", mode)
}
case _ => {
JsNull.asInstanceOf[JsValue]
}
}
}
def createField(name: String, fieldType: String, mode: String) = {
Json.obj(
"name" -> name,
"type" -> fieldType,
"mode" -> mode
).asInstanceOf[JsValue]
}
def createRecordField(name: String, mode: String, fields: Seq[JsValue]) = {
Json.obj(
"name" -> name,
"type" -> "record",
"mode" -> "nullable",
"fields" -> fields
).asInstanceOf[JsValue]
}
}
@seancolyer
Copy link

Thanks for this. FYI I think https://gist.github.com/oyvindholmstad/f5ecd3dcf4e471f77907#file-schema-generator-js-L68 should read:

if (object.toString().indexOf('.') > 0 || object.toString().indexOf(',') > 0) {

for OR instead of AND.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment