Created
April 4, 2011 15:55
-
-
Save kzk/901873 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ORIG_SIZE: 18,826,943,581
// PACKED_SIZE: 13,203,414,226
package net.kzk9;

import java.io.*;
import java.util.*;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.*;
import org.apache.hadoop.hbase.io.*;
import org.apache.hadoop.hbase.mapreduce.*;
import org.apache.hadoop.hbase.util.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.util.*;

import org.json.simple.*;
import org.json.simple.parser.*;

import org.msgpack.*;
public class HBaseTwitterJSONtoMsgPack { | |
/** | |
* hBaseのテーブルを入力とするMapper | |
*/ | |
static class JSONParseMapper extends TableMapper<ImmutableBytesWritable, Put> { | |
public static enum Counters { VALID_ROWS, INVALID_ROWS, ORIG_SIZE, PACKED_SIZE } | |
protected JSONParser parser; | |
protected ContainerFactory containerFactory; | |
@Override | |
protected void setup(Mapper.Context context) { | |
// JSONパーサーを初期化 | |
parser = new JSONParser(); | |
ContainerFactory containerFactory = new ContainerFactory() { | |
public List creatArrayContainer() { return new ArrayList(); } | |
public Map createObjectContainer() { return new HashMap(); } | |
}; | |
} | |
// HBaseテーブルの1行がTwitterのJSON形式のデータ | |
@Override | |
public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException { | |
for (KeyValue value: values.list()) { | |
String json_str = Bytes.toString(value.getValue()); | |
try { | |
HashMap tweet = (HashMap)parser.parse(new StringReader(json_str), containerFactory); | |
Put put = new Put(row.get()); | |
byte[] packed = MessagePack.pack(tweet); | |
put.add(Bytes.toBytes("packed_data"), Bytes.toBytes("msgpack"), packed); | |
context.write(row, put); | |
// increment valid rows | |
context.getCounter(Counters.VALID_ROWS).increment(1); | |
context.getCounter(Counters.ORIG_SIZE).increment(value.getLength()); | |
context.getCounter(Counters.PACKED_SIZE).increment(packed.length); | |
} catch (Throwable e) { | |
// ignore format & parse error | |
context.getCounter(Counters.INVALID_ROWS).increment(1); | |
continue; | |
} | |
} | |
} | |
} | |
public static void main(String[] args) throws Exception { | |
// 設定情報の読み込み | |
Configuration conf = HBaseConfiguration.create(); | |
conf.addResource("/etc/hbase/conf/hbase-default.xml"); | |
conf.addResource("/etc/hbase/conf/hbase-site.xml"); | |
conf.set("hbase.client.scanner.caching", "300"); | |
// conf.set("mapred.job.tracker", "local"); | |
// 引数のパース | |
new GenericOptionsParser(conf, args); | |
// ジョブの作成 | |
String tableName = "twitter"; | |
Job job = new Job(conf, "HBaseTwitterJSONtoMsgPack_" + tableName); | |
job.setJarByClass(HBaseTwitterJSONtoMsgPack.class); | |
// 圧縮 | |
conf.setCompressMapOutput(true); | |
// Scan条件の指定 | |
Scan scan = new Scan(); | |
// 最初のデータのみを取得 | |
scan.setFilter(new FirstKeyOnlyFilter()); | |
// data:jsonのみを取得 | |
scan = scan.addColumn(Bytes.toBytes("data"), Bytes.toBytes("json")); | |
// 補助関数を利用したMapperの初期化 | |
TableMapReduceUtil.initTableMapperJob(tableName, // テーブル名 | |
scan, // Mapperに渡す前に使用するScan | |
JSONParseMapper.class, // Mapperクラス | |
ImmutableBytesWritable.class, // MapperのKeyの型 | |
Put.class, // MapperのValueの型 | |
job); | |
// 補助関数を利用したReducerの初期化 | |
TableMapReduceUtil.initTableReducerJob( | |
"twitter", // 出力テーブル名 | |
IdentityTableReducer.class, | |
job); | |
job.setNumReduceTasks(8); | |
// ジョブの実行 | |
System.exit(job.waitForCompletion(true) ? 0 : 1); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment