-
-
Save maiha/2260678 to your computer and use it in GitHub Desktop.
| java-1.6.0-openjdk | |
| hadoop-1.0.1 | |
| hbase-0.92.1 | |
| OS: ubuntu-11.10 | |
| input src: csv(1億行/22GB) | |
| disk rest: 77GB | |
| memory: 24GB (内12GBはramdiskで利用) | |
| io.compression.codecs: なし(無圧縮で利用) |
| <?xml version="1.0"?> | |
| <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> | |
| <!-- Put site-specific property overrides in this file. --> | |
| <configuration> | |
| <property> | |
| <name>hadoop.tmp.dir</name> | |
| <value>/usr/local/lib/hadoop/tmp</value> | |
| <description>A base for other temporary directories.</description> | |
| </property> | |
| <property> | |
| <name>fs.default.name</name> | |
| <value>hdfs://localhost:9000</value> | |
| <description>primary NameNode</description> | |
| </property> | |
| </configuration> |
| # 以下、変更点のみ記載 | |
| export JAVA_HOME=/usr/lib/jvm/default-java | |
| export HADOOP_OPTS=-server | |
| export HADOOP_PID_DIR=/var/run/hadoop |
| <?xml version="1.0"?> | |
| <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> | |
| <!-- Put site-specific property overrides in this file. --> | |
| <configuration> | |
| <property> | |
| <name>dfs.replication</name> | |
| <value>1</value> | |
| <description>localhost only</description> | |
| </property> | |
| <property> | |
| <name>dfs.permissions</name> | |
| <value>false</value> | |
| </property> | |
| </configuration> |
| <?xml version="1.0"?> | |
| <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> | |
| <!-- Put site-specific property overrides in this file. --> | |
| <configuration> | |
| <property> | |
| <name>mapred.job.tracker</name> | |
| <value>localhost:9001</value> | |
| <description>JobTracker</description> | |
| </property> | |
| </configuration> |
| <?xml version="1.0"?> | |
| <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> | |
| <!-- Put site-specific property overrides in this file. --> | |
| <configuration> | |
| <property> | |
| <name>hadoop.tmp.dir</name> | |
| <value>/usr/local/lib/hadoop/tmp</value> | |
| <description>A base for other temporary directories.</description> | |
| </property> | |
| <property> | |
| <name>fs.default.name</name> | |
| <value>hdfs://localhost:9000</value> | |
| <description>primary NameNode</description> | |
| </property> | |
| </configuration> |
| # 以下、変更点のみ記載 | |
| export JAVA_HOME=/usr/lib/jvm/default-java | |
| export HADOOP_OPTS=-server | |
| export HADOOP_PID_DIR=/var/run/hadoop |
| <?xml version="1.0"?> | |
| <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> | |
| <!-- Put site-specific property overrides in this file. --> | |
| <configuration> | |
| <property> | |
| <name>dfs.replication</name> | |
| <value>1</value> | |
| <description>localhost only</description> | |
| </property> | |
| <property> | |
| <name>dfs.permissions</name> | |
| <value>false</value> | |
| </property> | |
| </configuration> |
| <?xml version="1.0"?> | |
| <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> | |
| <!-- Put site-specific property overrides in this file. --> | |
| <configuration> | |
| <property> | |
| <name>mapred.job.tracker</name> | |
| <value>localhost:9001</value> | |
| <description>JobTracker</description> | |
| </property> | |
| </configuration> |
| # 追加分のみ | |
| export JAVA_HOME=/usr/lib/jvm/default-java |
| <configuration> | |
| <property> | |
| <name>hbase.rootdir</name> | |
| <value>file:///usr/local/lib/hbase/data</value> | |
| </property> | |
| </configuration> |
| # 追加分のみ | |
| export JAVA_HOME=/usr/lib/jvm/default-java |
| <configuration> | |
| <property> | |
| <name>hbase.rootdir</name> | |
| <value>file:///usr/local/lib/hbase/data</value> | |
| </property> | |
| </configuration> |
| 12/03/31 15:29:54 INFO mapred.JobClient: map 100% reduce 10% | |
| 12/03/31 15:30:17 INFO mapred.JobClient: Task Id : attempt_201203310137_0010_r_000000_2, Status : FAILED | |
| java.io.IOException: Task: attempt_201203310137_0010_r_000000_2 - The reduce copier failed | |
| at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:389) | |
| at org.apache.hadoop.mapred.Child$4.run(Child.java:255) | |
| at java.security.AccessController.doPrivileged(Native Method) | |
| at javax.security.auth.Subject.doAs(Subject.java:416) | |
| at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1093) | |
| at org.apache.hadoop.mapred.Child.main(Child.java:249) | |
| Caused by: org.apache.hadoop.util.DiskChecker$DiskErrorException: Could not find any valid local directory for file:/usr/local/lib/hadoop/tmp/mapred/local/taskTracker/maiha/jobcache/job_201203310137_0010/attempt_201203310137_0010_r_000000_2/output/map_105.out | |
| at org.apache.hadoop.fs.LocalDirAllocator$AllocatorPerContext.getLocalPathForWrite(LocalDirAllocator.java:381) | |
| at org.apache.hadoop.fs.LocalDirAllocator.getLocalPathForWrite(LocalDirAllocator.java:146) | |
| at org.apache.hadoop.fs.LocalDirAllocator.getLocalPathForWrite(LocalDirAllocator.java:127) | |
| at org.apache.hadoop.mapred.ReduceTask$ReduceCopier$LocalFSMerger.run(ReduceTask.java:2639) | |
| attempt_201203310137_0010_r_000000_2: log4j:WARN No appenders could be found for logger (org.apache.hadoop.mapred.ReduceTask). | |
| attempt_201203310137_0010_r_000000_2: log4j:WARN Please initialize the log4j system properly. | |
| 12/03/31 15:30:19 INFO mapred.JobClient: map 100% reduce 0% |
あ、もしかすると、そのディレクトリーの容量が不足しているのかもしれません。
ご指摘の通り、ディスク不足でした。
bulk.output時は最終的なhtable使用量の4~5倍の中間ファイルを生成するようです。
速度は高々50%増しでディスク消費が最大5倍になるため、bulk.outputのメリットは薄そうです。
速度が必要なら結局M/Rを自作するでしょうし、お手軽なら生importtsvでよいという結論に。
参考までにimporttsv周りのディスク消費量の調査データを記載します。
<入力:tsv/10M行/2.2GB>
擬似分散(importtsv)
7:27.96 total
first: 122.33GB
max : 134.25GB
last : 134.19GB
peak: x 1.01
擬似分散(bulk.output)
5:24.85 total
first: 122.29GB
max : 138.41GB
last : 126.98GB
peak: x 3.44
<入力:tsv/25M行/5.5GB>
擬似分散(importtsv)
18:35.59 total
first: 98.31GB
max : 118.89GB
last : 113.12GB
peak: x 1.39
擬似分散(bulk.output)
14:41.34 total
first: 100.66GB
max : 135.98GB
last : 108.83GB
peak: x 4.32
<入力:tsv/50M行/11GB>
擬似分散(importtsv)
38:16.45 total
first: 125.07GB
max : 158.86GB
last : 158.65GB
peak: x 1.01
擬似分散(bulk.output)
29:31.15 total
first: 114.03GB
max : 192.11GB
last : 134.52GB
peak: x 3.81
「file:/usr/local/lib/hadoop/tmp/mapred~」にアクセスしようとして例外が出ているように見えます。
たぶん「/usr/local/lib/hadoop」はインストールした場所なので、普通はその下にテンポラリーは作らないような気がします。
MapReduceのテンポラリーディレクトリーはmapred-site.xmlのmapred.local.dirで指定するので、そこを確認してみるのがよいのではないかと思います。