Skip to content

Instantly share code, notes, and snippets.

@hamnis
Created April 17, 2015 11:08
Show Gist options
  • Save hamnis/b7defee6b73130048b12 to your computer and use it in GitHub Desktop.
Save hamnis/b7defee6b73130048b12 to your computer and use it in GitHub Desktop.
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.avro.mapred.{AvroOutputFormat, AvroWrapper, AvroKey, AvroJob}
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
/**
 * Syntax extensions that let an `RDD` of Avro records be saved through the
 * old-API (`org.apache.avro.mapred`) Avro output format.
 */
package object avro {

  /**
   * Enriches any `RDD` whose elements are Avro `GenericRecord`s with
   * `writeToAvroFile`.
   *
   * @param rdd the RDD being extended
   * @tparam A  the concrete record type held by the RDD
   */
  implicit class AvroRDDFunctions[A <: GenericRecord](val rdd: RDD[A]) extends AnyVal {

    /**
     * Saves every record of the wrapped RDD via `saveAsHadoopFile`, using
     * `AvroOutputFormat` as the output format.
     *
     * Each record is wrapped in an `AvroKey` and paired with the
     * `NullWritable` singleton as its value.
     *
     * @param path   output location handed to `saveAsHadoopFile`
     * @param schema Avro schema registered on the job configuration as the output schema
     * @param sc     Spark context whose Hadoop configuration seeds the job configuration
     */
    def writeToAvroFile(path: String, schema: Schema)(implicit sc: SparkContext): Unit = {
      // Start from a copy of the context's Hadoop settings, then register the output schema on it.
      val outputConf = new JobConf(sc.hadoopConfiguration)
      AvroJob.setOutputSchema(outputConf, schema)

      // Key = wrapped record, value = NullWritable placeholder.
      val keyedRecords = rdd.map { record => (new AvroKey(record), NullWritable.get()) }

      keyedRecords.saveAsHadoopFile(
        path,
        classOf[AvroWrapper[GenericRecord]],
        classOf[NullWritable],
        classOf[AvroOutputFormat[GenericRecord]],
        outputConf)
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment