Created
February 20, 2017 20:17
-
-
Save alopresto/4414529f84f64ea6f270d630be1c6ada to your computer and use it in GitHub Desktop.
Generates a flowfile with content provided in StackOverflow question and splits into multiple CSV files. Split regexes are specific to prescribed input but can be modified to be generic.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" ?> | |
<template encoding-version="1.0"> | |
<description>Generates a flowfile with content provided in StackOverflow question and splits into multiple CSV files. Split regexes are specific to prescribed input but can be modified to be generic. </description> | |
<groupId>5d1d8d50-015a-1000-6eef-5c6f05d60cb7</groupId> | |
<name>Split multiple files from CSV</name> | |
<snippet> | |
<connections> | |
<id>015a1001-fcfd-1d1e-0000-000000000000</id> | |
<parentGroupId>5d1d8d50-015a-1000-0000-000000000000</parentGroupId> | |
<backPressureDataSizeThreshold>1 GB</backPressureDataSizeThreshold> | |
<backPressureObjectThreshold>10000</backPressureObjectThreshold> | |
<destination> | |
<groupId>5d1d8d50-015a-1000-0000-000000000000</groupId> | |
<id>5d1efcfd-015a-1000-0000-000000000000</id> | |
<type>PROCESSOR</type> | |
</destination> | |
<flowFileExpiration>0 sec</flowFileExpiration> | |
<labelIndex>1</labelIndex> | |
<name></name> | |
<selectedRelationships>success</selectedRelationships> | |
<source> | |
<groupId>5d1d8d50-015a-1000-0000-000000000000</groupId> | |
<id>5d1dbdc8-015a-1000-0000-000000000000</id> | |
<type>PROCESSOR</type> | |
</source> | |
<zIndex>0</zIndex> | |
</connections> | |
<connections> | |
<id>015a1002-fcfd-1d1e-0000-000000000000</id> | |
<parentGroupId>5d1d8d50-015a-1000-0000-000000000000</parentGroupId> | |
<backPressureDataSizeThreshold>1 GB</backPressureDataSizeThreshold> | |
<backPressureObjectThreshold>10000</backPressureObjectThreshold> | |
<destination> | |
<groupId>5d1d8d50-015a-1000-0000-000000000000</groupId> | |
<id>5d1dd2e7-015a-1000-0000-000000000000</id> | |
<type>PROCESSOR</type> | |
</destination> | |
<flowFileExpiration>0 sec</flowFileExpiration> | |
<labelIndex>1</labelIndex> | |
<name></name> | |
<selectedRelationships>success</selectedRelationships> | |
<source> | |
<groupId>5d1d8d50-015a-1000-0000-000000000000</groupId> | |
<id>5d1efcfd-015a-1000-0000-000000000000</id> | |
<type>PROCESSOR</type> | |
</source> | |
<zIndex>0</zIndex> | |
</connections> | |
<connections> | |
<id>015a1003-fcfd-1d1e-0000-000000000000</id> | |
<parentGroupId>5d1d8d50-015a-1000-0000-000000000000</parentGroupId> | |
<backPressureDataSizeThreshold>1 GB</backPressureDataSizeThreshold> | |
<backPressureObjectThreshold>10000</backPressureObjectThreshold> | |
<destination> | |
<groupId>5d1d8d50-015a-1000-0000-000000000000</groupId> | |
<id>015a1000-fcfd-1d1e-0000-000000000000</id> | |
<type>PROCESSOR</type> | |
</destination> | |
<flowFileExpiration>0 sec</flowFileExpiration> | |
<labelIndex>1</labelIndex> | |
<name></name> | |
<selectedRelationships>splits</selectedRelationships> | |
<source> | |
<groupId>5d1d8d50-015a-1000-0000-000000000000</groupId> | |
<id>5d1dd2e7-015a-1000-0000-000000000000</id> | |
<type>PROCESSOR</type> | |
</source> | |
<zIndex>0</zIndex> | |
</connections> | |
<connections> | |
<id>015a1005-fcfd-1d1e-0000-000000000000</id> | |
<parentGroupId>5d1d8d50-015a-1000-0000-000000000000</parentGroupId> | |
<backPressureDataSizeThreshold>1 GB</backPressureDataSizeThreshold> | |
<backPressureObjectThreshold>10000</backPressureObjectThreshold> | |
<destination> | |
<groupId>5d1d8d50-015a-1000-0000-000000000000</groupId> | |
<id>015a1004-fcfd-1d1e-0000-000000000000</id> | |
<type>PROCESSOR</type> | |
</destination> | |
<flowFileExpiration>0 sec</flowFileExpiration> | |
<labelIndex>1</labelIndex> | |
<name></name> | |
<selectedRelationships>success</selectedRelationships> | |
<source> | |
<groupId>5d1d8d50-015a-1000-0000-000000000000</groupId> | |
<id>015a1000-fcfd-1d1e-0000-000000000000</id> | |
<type>PROCESSOR</type> | |
</source> | |
<zIndex>0</zIndex> | |
</connections> | |
<processors> | |
<id>015a1000-fcfd-1d1e-0000-000000000000</id> | |
<parentGroupId>5d1d8d50-015a-1000-0000-000000000000</parentGroupId> | |
<position> | |
<x>3.0</x> | |
<y>564.0</y> | |
</position> | |
<config> | |
<bulletinLevel>WARN</bulletinLevel> | |
<comments></comments> | |
<concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount> | |
<descriptors> | |
<entry> | |
<key>Regular Expression</key> | |
<value> | |
<name>Regular Expression</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Replacement Value</key> | |
<value> | |
<name>Replacement Value</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Character Set</key> | |
<value> | |
<name>Character Set</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Maximum Buffer Size</key> | |
<value> | |
<name>Maximum Buffer Size</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Replacement Strategy</key> | |
<value> | |
<name>Replacement Strategy</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Evaluation Mode</key> | |
<value> | |
<name>Evaluation Mode</name> | |
</value> | |
</entry> | |
</descriptors> | |
<executionNode>ALL</executionNode> | |
<lossTolerant>false</lossTolerant> | |
<penaltyDuration>30 sec</penaltyDuration> | |
<properties> | |
<entry> | |
<key>Regular Expression</key> | |
<value>\d inside csv,*</value> | |
</entry> | |
<entry> | |
<key>Replacement Value</key> | |
<value></value> | |
</entry> | |
<entry> | |
<key>Character Set</key> | |
<value>UTF-8</value> | |
</entry> | |
<entry> | |
<key>Maximum Buffer Size</key> | |
<value>1 MB</value> | |
</entry> | |
<entry> | |
<key>Replacement Strategy</key> | |
<value>Regex Replace</value> | |
</entry> | |
<entry> | |
<key>Evaluation Mode</key> | |
<value>Entire text</value> | |
</entry> | |
</properties> | |
<runDurationMillis>0</runDurationMillis> | |
<schedulingPeriod>0 sec</schedulingPeriod> | |
<schedulingStrategy>TIMER_DRIVEN</schedulingStrategy> | |
<yieldDuration>1 sec</yieldDuration> | |
</config> | |
<name>Remove remaining per-file header</name> | |
<relationships> | |
<autoTerminate>true</autoTerminate> | |
<name>failure</name> | |
</relationships> | |
<relationships> | |
<autoTerminate>false</autoTerminate> | |
<name>success</name> | |
</relationships> | |
<style></style> | |
<type>org.apache.nifi.processors.standard.ReplaceText</type> | |
</processors> | |
<processors> | |
<id>015a1004-fcfd-1d1e-0000-000000000000</id> | |
<parentGroupId>5d1d8d50-015a-1000-0000-000000000000</parentGroupId> | |
<position> | |
<x>2.5</x> | |
<y>739.0</y> | |
</position> | |
<config> | |
<bulletinLevel>WARN</bulletinLevel> | |
<comments></comments> | |
<concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount> | |
<descriptors> | |
<entry> | |
<key>Log Level</key> | |
<value> | |
<name>Log Level</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Log Payload</key> | |
<value> | |
<name>Log Payload</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Attributes to Log</key> | |
<value> | |
<name>Attributes to Log</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Attributes to Ignore</key> | |
<value> | |
<name>Attributes to Ignore</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Log prefix</key> | |
<value> | |
<name>Log prefix</name> | |
</value> | |
</entry> | |
</descriptors> | |
<executionNode>ALL</executionNode> | |
<lossTolerant>false</lossTolerant> | |
<penaltyDuration>30 sec</penaltyDuration> | |
<properties> | |
<entry> | |
<key>Log Level</key> | |
<value>info</value> | |
</entry> | |
<entry> | |
<key>Log Payload</key> | |
<value>true</value> | |
</entry> | |
<entry> | |
<key>Attributes to Log</key> | |
</entry> | |
<entry> | |
<key>Attributes to Ignore</key> | |
</entry> | |
<entry> | |
<key>Log prefix</key> | |
</entry> | |
</properties> | |
<runDurationMillis>0</runDurationMillis> | |
<schedulingPeriod>0 sec</schedulingPeriod> | |
<schedulingStrategy>TIMER_DRIVEN</schedulingStrategy> | |
<yieldDuration>1 sec</yieldDuration> | |
</config> | |
<name>LogAttribute</name> | |
<relationships> | |
<autoTerminate>true</autoTerminate> | |
<name>success</name> | |
</relationships> | |
<style></style> | |
<type>org.apache.nifi.processors.standard.LogAttribute</type> | |
</processors> | |
<processors> | |
<id>5d1dbdc8-015a-1000-0000-000000000000</id> | |
<parentGroupId>5d1d8d50-015a-1000-0000-000000000000</parentGroupId> | |
<position> | |
<x>0.0</x> | |
<y>0.0</y> | |
</position> | |
<config> | |
<bulletinLevel>WARN</bulletinLevel> | |
<comments></comments> | |
<concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount> | |
<descriptors> | |
<entry> | |
<key>File Size</key> | |
<value> | |
<name>File Size</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Batch Size</key> | |
<value> | |
<name>Batch Size</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Data Format</key> | |
<value> | |
<name>Data Format</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Unique FlowFiles</key> | |
<value> | |
<name>Unique FlowFiles</name> | |
</value> | |
</entry> | |
<entry> | |
<key>generate-ff-custom-text</key> | |
<value> | |
<name>generate-ff-custom-text</name> | |
</value> | |
</entry> | |
</descriptors> | |
<executionNode>ALL</executionNode> | |
<lossTolerant>false</lossTolerant> | |
<penaltyDuration>30 sec</penaltyDuration> | |
<properties> | |
<entry> | |
<key>File Size</key> | |
<value>0B</value> | |
</entry> | |
<entry> | |
<key>Batch Size</key> | |
<value>1</value> | |
</entry> | |
<entry> | |
<key>Data Format</key> | |
<value>Text</value> | |
</entry> | |
<entry> | |
<key>Unique FlowFiles</key> | |
<value>false</value> | |
</entry> | |
<entry> | |
<key>generate-ff-custom-text</key> | |
<value> Sample NiFi Data demonstration for below | |
Due dates 20-02-2017,23-03-2017 | |
My Input No1 inside csv,,,,,, | |
Animals,Today-20.02.2017,Yesterday-19-02.2017 | |
Fox,21,32 | |
Lion,20,12 | |
My Input No2 inside csv,,,, | |
Name,ID,City | |
Mahi,12,UK | |
And,21,US | |
Prabh,32,LI</value> | |
</entry> | |
</properties> | |
<runDurationMillis>0</runDurationMillis> | |
<schedulingPeriod>3 sec</schedulingPeriod> | |
<schedulingStrategy>TIMER_DRIVEN</schedulingStrategy> | |
<yieldDuration>1 sec</yieldDuration> | |
</config> | |
<name>GenerateFlowFile</name> | |
<relationships> | |
<autoTerminate>false</autoTerminate> | |
<name>success</name> | |
</relationships> | |
<style></style> | |
<type>org.apache.nifi.processors.standard.GenerateFlowFile</type> | |
</processors> | |
<processors> | |
<id>5d1dd2e7-015a-1000-0000-000000000000</id> | |
<parentGroupId>5d1d8d50-015a-1000-0000-000000000000</parentGroupId> | |
<position> | |
<x>0.0</x> | |
<y>372.0</y> | |
</position> | |
<config> | |
<bulletinLevel>WARN</bulletinLevel> | |
<comments></comments> | |
<concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount> | |
<descriptors> | |
<entry> | |
<key>Byte Sequence Format</key> | |
<value> | |
<name>Byte Sequence Format</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Byte Sequence</key> | |
<value> | |
<name>Byte Sequence</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Keep Byte Sequence</key> | |
<value> | |
<name>Keep Byte Sequence</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Byte Sequence Location</key> | |
<value> | |
<name>Byte Sequence Location</name> | |
</value> | |
</entry> | |
</descriptors> | |
<executionNode>ALL</executionNode> | |
<lossTolerant>false</lossTolerant> | |
<penaltyDuration>30 sec</penaltyDuration> | |
<properties> | |
<entry> | |
<key>Byte Sequence Format</key> | |
<value>Text</value> | |
</entry> | |
<entry> | |
<key>Byte Sequence</key> | |
<value>My Input No</value> | |
</entry> | |
<entry> | |
<key>Keep Byte Sequence</key> | |
<value>false</value> | |
</entry> | |
<entry> | |
<key>Byte Sequence Location</key> | |
<value>Trailing</value> | |
</entry> | |
</properties> | |
<runDurationMillis>0</runDurationMillis> | |
<schedulingPeriod>0 sec</schedulingPeriod> | |
<schedulingStrategy>TIMER_DRIVEN</schedulingStrategy> | |
<yieldDuration>1 sec</yieldDuration> | |
</config> | |
<name>Split on input file</name> | |
<relationships> | |
<autoTerminate>true</autoTerminate> | |
<name>original</name> | |
</relationships> | |
<relationships> | |
<autoTerminate>false</autoTerminate> | |
<name>splits</name> | |
</relationships> | |
<style></style> | |
<type>org.apache.nifi.processors.standard.SplitContent</type> | |
</processors> | |
<processors> | |
<id>5d1efcfd-015a-1000-0000-000000000000</id> | |
<parentGroupId>5d1d8d50-015a-1000-0000-000000000000</parentGroupId> | |
<position> | |
<x>0.0</x> | |
<y>188.0</y> | |
</position> | |
<config> | |
<bulletinLevel>WARN</bulletinLevel> | |
<comments></comments> | |
<concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount> | |
<descriptors> | |
<entry> | |
<key>Regular Expression</key> | |
<value> | |
<name>Regular Expression</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Replacement Value</key> | |
<value> | |
<name>Replacement Value</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Character Set</key> | |
<value> | |
<name>Character Set</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Maximum Buffer Size</key> | |
<value> | |
<name>Maximum Buffer Size</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Replacement Strategy</key> | |
<value> | |
<name>Replacement Strategy</name> | |
</value> | |
</entry> | |
<entry> | |
<key>Evaluation Mode</key> | |
<value> | |
<name>Evaluation Mode</name> | |
</value> | |
</entry> | |
</descriptors> | |
<executionNode>ALL</executionNode> | |
<lossTolerant>false</lossTolerant> | |
<penaltyDuration>30 sec</penaltyDuration> | |
<properties> | |
<entry> | |
<key>Regular Expression</key> | |
<value>^[\s\S]*?\n{2}</value> | |
</entry> | |
<entry> | |
<key>Replacement Value</key> | |
<value></value> | |
</entry> | |
<entry> | |
<key>Character Set</key> | |
<value>UTF-8</value> | |
</entry> | |
<entry> | |
<key>Maximum Buffer Size</key> | |
<value>1 MB</value> | |
</entry> | |
<entry> | |
<key>Replacement Strategy</key> | |
<value>Regex Replace</value> | |
</entry> | |
<entry> | |
<key>Evaluation Mode</key> | |
<value>Entire text</value> | |
</entry> | |
</properties> | |
<runDurationMillis>0</runDurationMillis> | |
<schedulingPeriod>0 sec</schedulingPeriod> | |
<schedulingStrategy>TIMER_DRIVEN</schedulingStrategy> | |
<yieldDuration>1 sec</yieldDuration> | |
</config> | |
<name>Remove global header</name> | |
<relationships> | |
<autoTerminate>true</autoTerminate> | |
<name>failure</name> | |
</relationships> | |
<relationships> | |
<autoTerminate>false</autoTerminate> | |
<name>success</name> | |
</relationships> | |
<style></style> | |
<type>org.apache.nifi.processors.standard.ReplaceText</type> | |
</processors> | |
</snippet> | |
<timestamp>02/20/2017 12:15:58 PST</timestamp> | |
</template> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment