Created
April 4, 2011 17:42
-
-
Save kzk/902048 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package net.kzk9; | |
import java.io.*; | |
import java.util.*; | |
import org.apache.hadoop.conf.*; | |
import org.apache.hadoop.hbase.*; | |
import org.apache.hadoop.hbase.client.*; | |
import org.apache.hadoop.hbase.filter.*; | |
import org.apache.hadoop.hbase.io.*; | |
import org.apache.hadoop.hbase.util.*; | |
import org.apache.hadoop.hbase.mapreduce.*; | |
import org.apache.hadoop.mapreduce.*; | |
import org.apache.hadoop.mapreduce.lib.output.*; | |
import org.apache.hadoop.util.*; | |
import org.json.simple.*; | |
import org.json.simple.parser.*; | |
import org.msgpack.*; | |
import org.msgpack.object.*; | |
public class HBaseTwitterMsgPack { | |
/** | |
* hBaseのテーブルを入力とするMapper | |
*/ | |
static class MsgPackParseMapper extends TableMapper<ImmutableBytesWritable, Put> { | |
public static enum Counters { VALID_ROWS, INVALID_ROWS } | |
@Override | |
protected void setup(Mapper.Context context) {} | |
// HBaseテーブルの1行がTwitterのMessagePack形式のデータ | |
@Override | |
public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException { | |
for (KeyValue value: values.list()) { | |
String json_str = Bytes.toString(value.getValue()); | |
try { | |
byte[] raw = value.getValue(); | |
Map<MessagePackObject, MessagePackObject> tweet = MessagePack.unpack(raw).asMap(); | |
MessagePackObject id_obj = tweet.get(RawType.create("id_str")); | |
if (id_obj == null) | |
throw new IOException("id_str not found"); | |
String id_str = id_obj.asString(); | |
Map<MessagePackObject, MessagePackObject> user = tweet.get(RawType.create("user")).asMap(); | |
if (user == null) | |
throw new IOException("user not found"); | |
MessagePackObject screen_name_obj = user.get(RawType.create("screen_name")); | |
if (screen_name_obj == null) | |
throw new IOException("screen_name not found"); | |
String screen_name = screen_name_obj.asString(); | |
// increment valid rows | |
context.getCounter(Counters.VALID_ROWS).increment(1); | |
} catch (Throwable e) { | |
// ignore format & parse error | |
context.getCounter(Counters.INVALID_ROWS).increment(1); | |
continue; | |
} | |
} | |
} | |
} | |
public static void main(String[] args) throws Exception { | |
// 設定情報の読み込み | |
Configuration conf = HBaseConfiguration.create(); | |
conf.addResource("/etc/hbase/conf/hbase-default.xml"); | |
conf.addResource("/etc/hbase/conf/hbase-site.xml"); | |
conf.set("hbase.client.scanner.caching", "300"); | |
// conf.set("mapred.job.tracker", "local"); | |
// 引数のパース | |
new GenericOptionsParser(conf, args); | |
// ジョブの作成 | |
String tableName = "twitter"; | |
Job job = new Job(conf, "HBaseTwitterMsgPack_" + tableName); | |
job.setJarByClass(HBaseTwitterMsgPack.class); | |
// Scan条件の指定 | |
Scan scan = new Scan(); | |
// 最初のデータのみを取得 | |
scan.setFilter(new FirstKeyOnlyFilter()); | |
// data:jsonのみを取得 | |
scan = scan.addColumn(Bytes.toBytes("packed_data"), Bytes.toBytes("msgpack")); | |
// 補助関数を利用したMapperの初期化 | |
TableMapReduceUtil.initTableMapperJob(tableName, // テーブル名 | |
scan, // Mapperに渡す前に使用するScan | |
MsgPackParseMapper.class, // Mapperクラス | |
ImmutableBytesWritable.class, // MapperのKeyの型 | |
Put.class, // MapperのValueの型 | |
job); | |
// 補助関数を利用したReducerの初期化 | |
/* | |
TableMapReduceUtil.initTableReducerJob( | |
"twitter", // 出力テーブル名 | |
IdentityTableReducer.class, | |
job); | |
*/ | |
// Reducerは使用しない。Counterで行数を数える。 | |
job.setOutputFormatClass(NullOutputFormat.class); | |
job.setNumReduceTasks(0); | |
// ジョブの実行 | |
System.exit(job.waitForCompletion(true) ? 0 : 1); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment