Skip to content

Instantly share code, notes, and snippets.

@nalingarg2
Last active August 29, 2015 14:17
Show Gist options
  • Save nalingarg2/139984886140f37a783d to your computer and use it in GitHub Desktop.
Save nalingarg2/139984886140f37a783d to your computer and use it in GitHub Desktop.
Flume + Hive using Cloudwick LogGenerator
#
# Hadoop:: flumeHive
# Recipe:: Flume and Hive
#
# Copyright (C) 2015 Cloudwick labs
# Contact :: [email protected]
# All rights reserved - Do Not Redistribute
#
#install sbt: add the Bintray yum repo, then install non-interactively
curl https://bintray.com/sbt/rpm/rpm | sudo tee /etc/yum.repos.d/bintray-sbt-rpm.repo
sudo yum install -y sbt
#install scala 2.11.6 under /usr/lib with a version-independent symlink
wget http://downloads.typesafe.com/scala/2.11.6/scala-2.11.6.tgz
tar xvf scala-2.11.6.tgz
sudo mv scala-2.11.6 /usr/lib
sudo ln -s /usr/lib/scala-2.11.6 /usr/lib/scala
# put scala on PATH for this shell; add this line to a script under
# /etc/profile.d/ to make it permanent for all users
export PATH=$PATH:/usr/lib/scala/bin
scala -version
#clone the Cloudwick log generator and build a fat jar with sbt-assembly
git clone https://github.com/nalingarg2/generator.git
cd generator/
sbt assembly
# make sure sbt reports [success] before continuing
# sample mock log line (Apache access-log format) emitted by the generator:
# 95.22.50.11 - - [09/Sep/2013:16:36:44 -0700] "GET /test.php HTTP/1.1" 200 1832 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421 Firefox/6.0a1"
bin/generator --help
#run either of the two commands below; the second rolls files at 64 MiB
bin/generator log --totalEvents 100000 --filePath /home/test
bin/generator log --totalEvents 100000000 --filePath /home/test --fileRollSize 67108864
#install flume and the flume agent service (requires root, like the
#sbt/scala installs above; -y keeps yum non-interactive)
sudo yum install -y flume
sudo yum install -y flume-agent
# Prepare a Flume configuration file (*.conf).
# NOTE: the agent name used in every key below is "agent"; it must match
# the -n/--name argument passed to flume-ng when starting the agent.
# Define a memory channel on agent called memory-channel.
agent.channels.memory-channel.type = memory
# Define an exec source that tails the generator's output file and
# feeds its lines into memory-channel.
agent.sources.tail-source.type = exec
agent.sources.tail-source.command = tail -F /home/test/mock_apache_pool-1-thread-1.data
agent.sources.tail-source.channels = memory-channel
# Two sinks are defined below, but a single one also works.
# Define a sink that outputs events to the logger (useful for debugging).
agent.sinks.log-sink.channel = memory-channel
agent.sinks.log-sink.type = logger
# Define a sink that writes events to HDFS.
# NOTE(review): confirm 8020 is the NameNode RPC port on your cluster
# before starting the agent.
agent.sinks.hdfs-sink.channel = memory-channel
agent.sinks.hdfs-sink.type = hdfs
agent.sinks.hdfs-sink.hdfs.path = hdfs://localhost:8020/Flume/logs/
# DataStream writes raw text (not the default SequenceFile), so the
# Hive table defined later can read the files directly.
agent.sinks.hdfs-sink.hdfs.fileType = DataStream
# Finally, activate the channel, source, and sinks on the agent.
agent.channels = memory-channel
agent.sources = tail-source
agent.sinks = log-sink hdfs-sink
# Run flume-ng with log messages printed to the console.
# Either form works; the second (the package-installed flume-ng already
# on PATH) is preferred. -n must match the agent name in the conf file.
bin/flume-ng agent --conf ./conf/ -f conf/flume.conf -Dflume.root.logger=DEBUG,console -n agent
flume-ng agent --conf conf --conf-file ex.conf -Dflume.root.logger=INFO,console -n agent
-- HIVE
-- Create an external table over the HDFS directory Flume writes to.
-- External table over the Apache access logs Flume lands in /Flume/logs/.
-- RegexSerDe parses each line with input.regex: one capture group per
-- column, in declaration order (host, identity, user, [time], "request",
-- status, size, then an optional referer + agent pair).
-- Fixes vs. the original: SerDe class name is case-sensitive
-- ('RegexSerDe', not 'RegexSerDE'); the regex needs 9 groups for the
-- 9 columns (the 'user' group was missing and group 7 was malformed);
-- output.format.string starts at %1$s, not %l$s.
CREATE EXTERNAL TABLE webser_logs(
host STRING,
identity STRING,
user STRING,
time STRING,
request STRING,
status STRING,
size STRING,
REFERER STRING,
agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES(
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
LOCATION '/Flume/logs/';
-- Query the ingested logs. (Original had invalid syntax:
-- 'SELECT * FROM TABLE webser_logs' — the TABLE keyword must go.)
SELECT * FROM webser_logs;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment