Last active
January 8, 2018 06:45
-
-
Save liancheng/19eac168295a907fbfd5fcb517c09580 to your computer and use it in GitHub Desktop.
Simple Scala DSL for constructing Apache Arrow schemas.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package example | |
import scala.collection.JavaConverters._ | |
import scala.language.implicitConversions | |
import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema} | |
/**
 * Base builder that turns a field name into an Arrow [[Field]].
 *
 * A builder starts out nullable. `!` and `?` switch the nullability flag and return the builder
 * itself, so either call can be chained directly onto a builder expression.
 */
trait FieldBuilder {
  // Nullability flag recorded by `!` / `?`; a builder is nullable until told otherwise.
  private var nullable: Boolean = true

  /** Builds the [[Field]] described by this builder under the given name. */
  def named(name: String): Field

  /** Marks the field as non-nullable and returns this builder. */
  def ! : this.type = withNullability(false)

  /** Marks the field as nullable and returns this builder. */
  def ? : this.type = withNullability(true)

  /** The current nullability flag, for use by implementations of `named`. */
  protected def isNullable: Boolean = nullable

  // The only place where the flag is mutated.
  private def withNullability(flag: Boolean): this.type = {
    nullable = flag
    this
  }
}
/**
 * Implicit syntax and the concrete [[FieldBuilder]] implementations behind the DSL.
 */
trait LowPriorityImplicits {
  /** Adds `~` to strings, so that `"name" ~ builder` asks the builder for a field with that name. */
  implicit class FieldDSL(name: String) {
    def ~ (builder: FieldBuilder): Field = builder named name
  }

  /**
   * Lets a primitive Arrow type be used wherever a [[FieldBuilder]] is expected.
   *
   * The resulting field is given no children (the children argument is null).
   */
  implicit class PrimitiveFieldBuilder(arrowType: ArrowType.PrimitiveType) extends FieldBuilder {
    override def named(name: String): Field =
      new Field(name, new FieldType(isNullable, arrowType, null), null)
  }

  /**
   * Builds a list field whose single child is produced by `elementTypeBuilder`.
   */
  class ListFieldBuilder(elementTypeBuilder: FieldBuilder) extends FieldBuilder {
    override def named(name: String): Field = {
      val listType = new FieldType(isNullable, ArrowType.List.INSTANCE, null)
      // The element field is built with a null name.
      val element = elementTypeBuilder.named(null)
      new Field(name, listType, Seq(element).asJava)
    }
  }

  /**
   * Builds a struct field whose children are the given, already built, fields.
   */
  class StructFieldBuilder(fields: Seq[Field]) extends FieldBuilder {
    override def named(name: String): Field = {
      val structType = new FieldType(isNullable, ArrowType.Struct.INSTANCE, null)
      val children = fields.asJava
      new Field(name, structType, children)
    }
  }
}
/**
 * Entry point of the DSL: `import example.dsl._` brings the implicit syntax inherited from
 * [[LowPriorityImplicits]] and the factory methods below into scope.
 */
object dsl extends LowPriorityImplicits {
  /** Creates a [[Schema]] from the given top-level fields. */
  def ArrowSchema(fields: Field*): Schema = {
    val topLevelFields = fields.asJava
    new Schema(topLevelFields)
  }

  /** Creates a builder for a struct field with the given child fields. */
  def ArrowStruct(fields: Field*): StructFieldBuilder = {
    new StructFieldBuilder(fields)
  }

  /** Creates a builder for a list field whose elements are described by `elementType`. */
  def ArrowList(elementType: FieldBuilder): ListFieldBuilder = {
    new ListFieldBuilder(elementType)
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.collection.JavaConverters._ | |
import example.dsl._ | |
import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema} | |
// Here we are using the address book example schema used in the Twitter Parquet blog post. | |
// https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet.html | |
// Using the Scala DSL | |
// Every string field below uses the same Arrow type instance.
val utf8 = ArrowType.Utf8.INSTANCE

// Element type of the "contacts" list: a non-nullable struct made of
// a non-nullable "name" string and a nullable "phoneNumber" string.
val contact = ArrowStruct(
  "name" ~ utf8.!,
  "phoneNumber" ~ utf8.?
).!

ArrowSchema(
  // A non-nullable string field.
  "owner" ~ utf8.!,
  // A non-nullable list field whose elements are non-nullable strings.
  "ownerPhoneNumbers" ~ ArrowList(utf8.!).!,
  // A non-nullable list field whose elements are non-nullable structs.
  "contacts" ~ ArrowList(contact).!
)
// Using the Apache Arrow Java API in Scala | |
// Creates a fresh non-nullable field type for the given Arrow type.
def requiredType(arrowType: ArrowType): FieldType = new FieldType(false, arrowType, null)

// A non-nullable string field.
val ownerField = new Field("owner", requiredType(ArrowType.Utf8.INSTANCE), null)

// A non-nullable list field whose elements are non-nullable strings.
val ownerPhoneNumbersField = new Field(
  "ownerPhoneNumbers",
  requiredType(ArrowType.List.INSTANCE),
  Seq(new Field(null, requiredType(ArrowType.Utf8.INSTANCE), null)).asJava
)

// A non-nullable struct with a non-nullable "name" and a nullable "phoneNumber".
val contactStructField = new Field(
  null,
  requiredType(ArrowType.Struct.INSTANCE),
  Seq(
    new Field("name", requiredType(ArrowType.Utf8.INSTANCE), null),
    new Field("phoneNumber", FieldType.nullable(ArrowType.Utf8.INSTANCE), null)
  ).asJava
)

// A non-nullable list field whose elements are non-nullable structs.
val contactsField = new Field(
  "contacts",
  requiredType(ArrowType.List.INSTANCE),
  Seq(contactStructField).asJava
)

new Schema(Seq(ownerField, ownerPhoneNumbersField, contactsField).asJava)
Nice writeup.
But I think more comments explaining how the ArrowType->FieldBuilder
and FieldDSL
implicits take effect should be added.
It took me a while to figure it out.
@advancedxy Refactored it a little bit and removed the `ArrowType->FieldBuilder`
implicit conversion. Hopefully, it's easier to reason about this time.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Field type metadata is not covered yet but should be easy to add.