Last active
November 8, 2016 21:42
-
-
Save dportabella/dd8886ebb8d5f0eddd1196e1c30e34f6 to your computer and use it in GitHub Desktop.
How to deserialize a Hadoop result sequence file outside Hadoop (or the output of a Spark saveAsObjectFile outside Spark)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "2.7.3"
import java.io.{ByteArrayInputStream, ObjectInputStream}
import org.apache.hadoop.conf._
import org.apache.hadoop.fs._
import org.apache.hadoop.io._

// Reads a Hadoop SequenceFile whose records are (NullWritable, BytesWritable)
// pairs, Java-deserializes each value into an array, and prints every element.
// No running Hadoop or Spark cluster is needed, only hadoop-common on the classpath.
val f = "/path/to/part-00000"
val reader = new SequenceFile.Reader(new Configuration(), SequenceFile.Reader.file(new Path(f)))
try {
  // The same key/value holders are refilled by every call to reader.next.
  val key = NullWritable.get
  val value = new BytesWritable
  while (reader.next(key, value)) {
    // getBytes exposes the reusable backing buffer, which may be larger than the
    // current record; only the first getLength bytes belong to this record.
    val in = new ObjectInputStream(new ByteArrayInputStream(value.getBytes, 0, value.getLength))
    try {
      val values = in.readObject().asInstanceOf[Array[_]] // or specify the type if you know it
      values.foreach(println)
    } finally in.close()
  }
} finally reader.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment