Created
May 12, 2016 23:03
-
-
Save tsusanto/1d96f17808b3f2b20edc2ab91c544613 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Below is my Flume config file, which pushes files dropped into a folder to HDFS.
The files are usually about 2 MB in size.
The deserializer.maxLineLength property defaults to 2048, which means that after 2048 characters
Flume truncates the line and treats the remainder as a new event. As a result, the file written to HDFS had a lot of extra newlines.
I changed it to 4096000, which is about 4 MB.
# Flume config file
#
# Agent "tier1": spooling-directory source -> memory channel -> HDFS sink.
# Files dropped into the spool directory are read line by line and written
# to date/hour-bucketed directories in HDFS.

tier1.sources = xml-source1
tier1.channels = channel1
tier1.sinks = hdfs-sink1

# ---------------------------------------------------------------------------
# Channel
# ---------------------------------------------------------------------------
tier1.channels.channel1.type = memory
# checkpointDir and dataDirs are file-channel properties; a memory channel
# does not read them. They are kept here, commented out, for the case where
# the channel is switched to "type = file".
#tier1.channels.channel1.checkpointDir = /mnt/data/tenny/checkpoint_xml
#tier1.channels.channel1.dataDirs = /mnt/data/tenny/kafka_source_xml
# NOTE(review): capacity is counted in events, not bytes. With
# maxLineLength = 4096000 below, a full channel could need far more heap
# than the agent has -- confirm the JVM heap size or lower this value.
tier1.channels.channel1.capacity = 10000000
tier1.channels.channel1.transactionCapacity = 100000

# ---------------------------------------------------------------------------
# Source: spooling directory
# ---------------------------------------------------------------------------
tier1.sources.xml-source1.type = spooldir
tier1.sources.xml-source1.channels = channel1
tier1.sources.xml-source1.spoolDir = /mnt/data/tenny/SpoolDir
tier1.sources.xml-source1.fileHeader = true
# Suffix appended to a file once it has been fully ingested.
tier1.sources.xml-source1.fileSuffix = .DONE
# Adds the file's base name as an event header; the sink's filePrefix
# (%{basename}) depends on it.
tier1.sources.xml-source1.basenameHeader = true
# Skip files still being written (*.tmp). The dot is written as [.] so that it
# matches a literal "." and needs no backslash escaping in a properties file.
tier1.sources.xml-source1.ignorePattern = ^.*[.]tmp$
# Default is 2048 characters; longer lines are truncated and the remainder
# becomes a new event. Raised to ~4 MB so large single-line files stay intact.
tier1.sources.xml-source1.deserializer.maxLineLength = 4096000

# ---------------------------------------------------------------------------
# Sink: HDFS
# ---------------------------------------------------------------------------
tier1.sinks.hdfs-sink1.type = hdfs
tier1.sinks.hdfs-sink1.channel = channel1
tier1.sinks.hdfs-sink1.hdfs.path = /user/tsusanto/salesfiles/%Y%m%d/%H00
# brokerList is a Kafka-sink property and has no effect on an HDFS sink.
#tier1.sinks.hdfs-sink1.brokerList = broker1_host:9092,broker2_host:9092,broker3_host:9092
# Resolve the %Y%m%d/%H escapes in hdfs.path from the agent's local clock
# rather than from a "timestamp" event header.
tier1.sinks.hdfs-sink1.hdfs.useLocalTimeStamp = true
# Roll output files on time only (every 5 seconds); 0 disables rolling by
# event count and by size.
tier1.sinks.hdfs-sink1.hdfs.rollCount = 0
tier1.sinks.hdfs-sink1.hdfs.rollInterval = 5
tier1.sinks.hdfs-sink1.hdfs.rollSize = 0
tier1.sinks.hdfs-sink1.hdfs.fileType = DataStream
tier1.sinks.hdfs-sink1.hdfs.writeFormat = Text
tier1.sinks.hdfs-sink1.hdfs.filePrefix = %{basename}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment