Last active
August 29, 2015 14:17
-
-
Save nalingarg2/139984886140f37a783d to your computer and use it in GitHub Desktop.
Flume + Hive using Cloudwick LogGenerator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# Hadoop:: flumeHive | |
# Recipe:: Flume and Hive | |
# | |
# Copyright (C) 2015 Cloudwick labs | |
# Contact :: [email protected] | |
# All rights reserved - Do Not Redistribute | |
# | |
# Install sbt.
# The Bintray repository used by older guides has been shut down; the sbt
# project now publishes its yum repo definition on scala-sbt.org.
curl -L https://www.scala-sbt.org/sbt-rpm.repo | sudo tee /etc/yum.repos.d/sbt-rpm.repo
sudo yum install sbt

# Install Scala 2.11.6 under /usr/lib and expose it through a stable symlink.
wget http://downloads.typesafe.com/scala/2.11.6/scala-2.11.6.tgz
tar xvf scala-2.11.6.tgz
sudo mv scala-2.11.6 /usr/lib
sudo ln -s /usr/lib/scala-2.11.6 /usr/lib/scala
# This export only affects the current shell; to make it permanent, put the
# same line in a script under /etc/profile.d/.
export PATH=$PATH:/usr/lib/scala/bin
scala -version

# Clone and build the log generator.
git clone https://github.com/nalingarg2/generator.git
cd generator/
sbt assembly
# Make sure the build finishes with "success" before continuing.

# Sample of the mock Apache access log the generator produces:
# 95.22.50.11 - - [09/Sep/2013:16:36:44 -0700] "GET /test.php HTTP/1.1" 200 1832 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421 Firefox/6.0a1"
bin/generator --help

# Run ONE of the two commands below.
# Small run: 100,000 events written under /home/test.
bin/generator log --totalEvents 100000 --filePath /home/test
# Large run: 100,000,000 events, rolling to a new file every 64 MiB (67108864 bytes).
bin/generator log --totalEvents 100000000 --filePath /home/test --fileRollSize 67108864

# Install Flume and its agent service (package installs need root).
sudo yum install flume
sudo yum install flume-agent
# Prepare a *.conf file with the agent definition below.
# NOTE: the agent is named "agent"; this must match the -n argument given to flume-ng.

# Define one memory channel per sink.
# Sinks that share a channel compete for its events (each event is delivered
# to only one of them), so a logger sink on the same channel would take events
# away from the HDFS sink and the Hive table would be missing rows.
agent.channels.memory-channel.type = memory
agent.channels.log-channel.type = memory

# Define a source on the agent that tails the generated log file and fans each
# event out to both channels (the default channel selector replicates events).
agent.sources.tail-source.type = exec
agent.sources.tail-source.command = tail -F /home/test/mock_apache_pool-1-thread-1.data
agent.sources.tail-source.channels = memory-channel log-channel

# Two sinks are defined; the logger sink is only for debugging and can be
# removed (together with log-channel) without affecting the HDFS output.

# Define a sink that outputs to the Flume log.
agent.sinks.log-sink.channel = log-channel
agent.sinks.log-sink.type = logger

# Define a sink that outputs to HDFS.
# Make sure the host and port match your NameNode address (fs.defaultFS).
agent.sinks.hdfs-sink.channel = memory-channel
agent.sinks.hdfs-sink.type = hdfs
agent.sinks.hdfs-sink.hdfs.path = hdfs://localhost:8020/Flume/logs/
# DataStream writes the raw event text instead of a SequenceFile.
agent.sinks.hdfs-sink.hdfs.fileType = DataStream

# Finally, activate the components.
agent.channels = memory-channel log-channel
agent.sources = tail-source
agent.sinks = log-sink hdfs-sink
# Run flume-ng with log messages sent to the console. Use ONE of the two forms.
# The value after -n must be the agent name used in the configuration ("agent").

# From inside the Flume installation directory, with the config at conf/flume.conf:
bin/flume-ng agent --conf ./conf/ -f conf/flume.conf -Dflume.root.logger=DEBUG,console -n agent

# Preferred: flume-ng on the PATH, with the agent configuration saved as ex.conf:
flume-ng agent --conf conf --conf-file ex.conf -Dflume.root.logger=INFO,console -n agent
-- HIVE
-- Create an external table over the HDFS directory written by the Flume HDFS
-- sink. Each Apache access-log line is split into columns by RegexSerDe: the
-- N-th capture group of input.regex feeds the N-th column, so the regex must
-- have exactly as many capture groups as the table has columns (nine).
-- `user` and `time` are back-quoted because they are reserved words in newer
-- Hive releases.
CREATE EXTERNAL TABLE IF NOT EXISTS webser_logs (
    host     STRING,
    identity STRING,
    `user`   STRING,
    `time`   STRING,
    request  STRING,
    status   STRING,
    size     STRING,
    referer  STRING,
    agent    STRING
)
-- The SerDe class name is case-sensitive: RegexSerDe, not RegexSerDE.
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    -- Groups: host, identity, user, [time], "request", status, size,
    -- and optionally "referer" and "agent".
    "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
    -- NOTE(review): output.format.string appears to be honoured only by the
    -- older contrib RegexSerDe; kept here (with %1$s corrected) for reference.
    "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
STORED AS TEXTFILE
LOCATION '/Flume/logs/';

-- Query the parsed log lines. Append a LIMIT clause when only sampling a large data set.
SELECT
    host,
    identity,
    `user`,
    `time`,
    request,
    status,
    size,
    referer,
    agent
FROM webser_logs;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment