@AtlasPilotPuppy
Last active August 3, 2016 14:21
Accessing HBase from Apache Spark
import org.apache.spark.SparkContext
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import scala.collection.JavaConversions._
import scala.collection.JavaConverters._
val sc = new SparkContext("local", "Simple App")
// Build an HBase configuration that points the scan at the given table
val hbaseConfiguration = (tableName: String) => {
  val hbaseConfiguration = HBaseConfiguration.create()
  hbaseConfiguration.set(TableInputFormat.INPUT_TABLE, tableName)
  hbaseConfiguration
}

// Create an RDD of (row key, Result) pairs over the HBase table
val tableRDD = (table: String) => {
  val rdd = new NewHadoopRDD(
    sc,
    classOf[TableInputFormat],
    classOf[ImmutableBytesWritable],
    classOf[Result],
    hbaseConfiguration(table)
  )
  rdd
}
val rdd = tableRDD("table-with-data")
/** Convert the latest value of a single column to a string **/
val columns = rdd
  .map(tuple => tuple._2)
  .map(result => result.getColumn("Column Family".getBytes(), "ColumnQualifier".getBytes()))
  .map(keyValues => {
    // keep only the cell with the most recent timestamp, then decode its value
    new String(keyValues.asScala.reduceLeft {
      (a, b) => if (a.getTimestamp > b.getTimestamp) a else b
    }.getValue.map(_.toChar))
  })
/** Another way to get multiple columns at once */
val cols = rdd.map(tuple => tuple._2).map(result =>
  result.getColumn("CF".getBytes, "CQ1".getBytes) ::
  result.getColumn("CF".getBytes, "CQ2".getBytes) ::
  result.getColumn("CF".getBytes, "CQ3".getBytes) :: Nil)
// Drop rows where any of the requested columns is missing
val filtered = cols.filter(row => row.map(_.length > 0).reduce((acc, tip) => acc & tip))
/** convert all values to strings **/
val row_vals = filtered.map(row => row.map(ele => new String(ele.head.getValue.map(_.toChar))))
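
For a quick sanity check, a handful of the decoded rows can be pulled back to the driver. This is only an illustrative snippet against the placeholder table and column names used above:

// Print a small sample of the decoded rows (avoid collect() on large tables)
row_vals.take(5).foreach(row => println(row.mkString(", ")))
// Count how many rows survived the filter
println("rows with all requested columns: " + row_vals.count())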
@rain1024

Where is your HBase address pointing: to a local or a remote server?

@AtlasPilotPuppy
Author

@rain1024
I am reading from HBase on the same cluster, so it reads directly from HDFS.
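
If the table lived on a remote cluster instead, the same TableInputFormat setup works as long as the configuration points at that cluster's ZooKeeper quorum. A minimal sketch, with placeholder host names and the default client port (not values from this gist):

// Hypothetical remote setup: direct the scan at another cluster's ZooKeeper quorum
val remoteConf = HBaseConfiguration.create()
remoteConf.set("hbase.zookeeper.quorum", "zk-host-1,zk-host-2,zk-host-3") // placeholder hosts
remoteConf.set("hbase.zookeeper.property.clientPort", "2181")
remoteConf.set(TableInputFormat.INPUT_TABLE, "table-with-data")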

@lnicalo

lnicalo commented Jul 6, 2015

How would you do the same in Python with PySpark?

I have written this piece of code, but it does not work:

keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"

data_conf = {
    "hbase.mapreduce.inputtable": "raw_signals",
    "hbase.mapreduce.scan.columns": "family1:col1 family2:col2",
}

hbase_rdd = sc.newAPIHadoopRDD(
        "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
        "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
        "org.apache.hadoop.hbase.client.Result",
        keyConverter=keyConv,
        valueConverter=valueConv,
        conf=data_conf)
