ceteri · October 26, 2012 18:19
diff --git a/gistfile1.txt b/gistfile1.txt
 bash-3.2$ ls
 README.md		build.gradle		cascading.pattern.ipr	data			model.log		src
 build			cascading.pattern.iml	cascading.pattern.iws	dot			output
 bash-3.2$ more output/
 classify/ measure/  
 bash-3.2$ more output/measure/
 output/measure/ is a directory
 bash-3.2$ more output/measure/part-00000 
 label   score   count
 0       0       73
 0       1       12
 1       0       13
 1       1       102
 bash-3.2$ head output/classify/part-00000 
 label	var0	var1	var2	order_id	predicted	score
 1	0	1	0	6f8e1014	1	1
 0	0	0	1	6f8ea22e	0	0
 1	0	1	0	6f8ea435	1	1
 0	0	0	1	6f8ea5e1	0	0
 1	0	1	0	6f8ea785	1	1
 1	0	1	0	6f8ea91e	1	1
 0	1	0	0	6f8eaaba	0	0
 1	0	1	0	6f8eac54	1	1
 0	1	1	0	6f8eade3	1	1
 bash-3.2$ more model.log 
 [1] 200   5
  label var0 var1 var2 order_id
 1     1    0    1    0 6f8e1014
 2     0    0    0    1 6f8ea22e
 3     1    0    1    0 6f8ea435
 4     0    0    0    1 6f8ea5e1
 5     1    0    1    0 6f8ea785
 6     1    0    1    0 6f8ea91e
 [1] 40  5

 0  1 
 19 21 
 [1] 160   4
     MeanDecreaseGini
 var0        0.6591701
 var1       33.8625179
 var2        8.0290020

 Call:
 randomForest(formula = f, data = data_train, ntree = 2) 
               Type of random forest: classification
                     Number of trees: 2
 No. of variables tried at each split: 1

        OOB estimate of  error rate: 13.83%
 Confusion matrix:
   0  1 class.error
 0 28  5   0.1515152
 1  8 53   0.1311475
    true
 pred   0   1
   0  73  13
   1  12 102
 [1] "./data/sample.xml"
 bash-3.2$ more README.md 
 cascading.pattern
 =================

 _Pattern_ sub-project for Cascading.org which uses flows as containers
 for machine learning models, importing
 [PMML](http://en.wikipedia.org/wiki/Predictive_Model_Markup_Language)
 model descriptions from _R_, _SAS_, _Weka_, _RapidMiner_, _SQL
 Server_, etc.

 Currently supported algorithms for PMML include:

 * [Random Forest](http://en.wikipedia.org/wiki/Random_forest)


 Build Instructions
 ------------------

 To build _Pattern_ and then run its unit tests:

    gradle --info --stacktrace clean test

 The following scripts generate a baseline for the _Random Forest_
 algorithm. This baseline includes a reference data set (simulated
 ecommerce orders) plus a predictive model in PMML:

    ./src/py/rf_sample.py 200 > data/orders.tsv
    R --vanilla --slave < src/r/rf_model.R > model.log

 To build _Pattern_ and then run this baseline test:

    gradle clean jar
    rm -rf output
    hadoop jar build/libs/pattern.jar data/sample.xml data/sample.tsv output/classify output/measure output/trap

 For each tuple in the data, a _stream assertion_ tests whether the
 `predicted` field matches the `score` field generated by the
 model. Tuples which fail that assertion get trapped into
 `output/trap/part*` for inspection.

 Also, the _confusion matrix_ shown in `output/measure/part*` should
 match the one logged in `model.log` from the baseline generated in _R_.
 bash-3.2$ more data/orders.tsv 
 label   var0    var1    var2    order_id
 1       0       1       0       6f8e1014
 0       0       0       1       6f8ea22e
 1       0       1       0       6f8ea435
 0       0       0       1       6f8ea5e1
 1       0       1       0       6f8ea785
 1       0       1       0       6f8ea91e
 0       1       0       0       6f8eaaba
 1       0       1       0       6f8eac54
 0       1       1       0       6f8eade3
 0       1       0       0       6f8eaf87
 0       0       1       0       6f8eb119
 1       0       1       0       6f8eb2e3
 1       1       1       0       6f8eb45e
 1       0       1       0       6f8eb5e6
 1       0       1       0       6f8eb761
 0       0       0       1       6f8eb8d4
 0       0       0       1       6f8eba59
 0       0       1       0       6f8ebbd4
 1       0       1       0       6f8ebd5c
 1       0       1       0       6f8ebec5
 0       1       0       0       6f8ec04a
 0       0       0       1       6f8ec1bd
 1       0       1       0       6f8ec319
 1       0       1       0       6f8ec480
 1       1       0       0       6f8ec5d7
 1       1       1       0       6f8ec72b
 0       0       0       1       6f8ec887
 0       0       0       1       6f8eca05
 1       1       1       0       6f8ecb94
 1       0       1       0       6f8ecd0f
 0       1       0       0       6f8ece82
 1       1       1       0       6f8ecfd7
 1       0       1       0       6f8ed135
 1       1       1       0       6f8ed27d
 0       0       0       1       6f8ed3c7
 0       0       0       1       6f8ed511
 0       0       0       1       6f8ed663
 0       0       0       1       6f8ed7eb
 1       0       1       0       6f8ed940
 0       1       0       0       6f8eda8a
 bash-3.2$ more 
 Missing filename ("less --help" for help)
 bash-3.2$ more src/
 main/ py/   r/    test/ 
 bash-3.2$ more src/r/rf_model.R
 ## Baseline generator for the Random Forest PMML test.
 ##
 ## Reads ./data/orders.tsv, trains a two-tree randomForest on the rows that
 ## follow the first 20% of the file, scores every row, then writes
 ## ./data/sample.tsv (the input plus a `predicted` column) and
 ## ./data/sample.xml (the fitted model as PMML).
 ##
 ## The bare top-level expressions (dim(), head(), saveXML()) are left
 ## unwrapped on purpose: they auto-print, and that output is part of the log.

 ## uncomment the following two lines to install the required libraries
 #install.packages("pmml")
 #install.packages("randomForest")

 library(pmml)
 library(randomForest)

 ## load the "baseline" reference data

 dat_folder <- "./data"
 data <- read.table(
   file = file.path(dat_folder, "orders.tsv"),
   sep = "\t", quote = "", na.strings = "NULL", header = TRUE,
   ## "UTF8" is not a value read.table recognises; "UTF-8" is
   encoding = "UTF-8"
 )

 dim(data)
 head(data)

 ## split data into test and train sets

 set.seed(71)
 split_ratio <- 2/10
 split <- round(nrow(data) * split_ratio)

 ## seq_len() and a logical mask stay correct when split is 0 or nrow(data);
 ## 1:split and (split + 1):nrow(data) would count backwards in those cases
 data_tests <- data[seq_len(split), ]
 dim(data_tests)
 print(table(data_tests[,"label"]))

 data_train <- data[seq_len(nrow(data)) > split, ]

 ## drop the identifier column by name; a logical mask is a no-op when the
 ## column is absent, whereas a negative empty index would drop every column
 data_train <- data_train[, colnames(data_train) != "order_id", drop = FALSE]
 dim(data_train)

 ## train a RandomForest model

 f <- as.formula("as.factor(label) ~ .")
 fit <- randomForest(f, data_train, ntree=2)

 ## report the fitted model

 print(fit$importance)
 print(fit)

 ## score every row (training and holdout alike) so that the exported TSV
 ## carries a prediction per order; the confusion matrix below therefore
 ## covers the full data set, not only the holdout rows

 predicted <- predict(fit, data)
 data$predicted <- predicted
 confuse <- table(pred = predicted, true = data[, "label"])
 print(confuse)

 ## export predicted labels to TSV

 write.table(data, file = file.path(dat_folder, "sample.tsv"), quote = FALSE, sep = "\t", row.names = FALSE)

 ## export model to PMML

 saveXML(pmml(fit), file = file.path(dat_folder, "sample.xml"))
 bash-3.2$ 
 bash-3.2$ more 
 .git/                  README.md              cascading.pattern.iml  data/                  output/                
 .gitignore             build/                 cascading.pattern.ipr  dot/                   src/                   
 .gradle/               build.gradle           cascading.pattern.iws  model.log              
 bash-3.2$ more src/
 main/ py/   r/    test/ 
 bash-3.2$ more src/main/java/pattern/
 Classifier.java           ClassifierFactory.java~   DataField.java            PatternException.java     rf/                       
 Classifier.java~          ClassifierFunction.java   Main.java                 PatternException.java~    
 ClassifierFactory.java    ClassifierFunction.java~  Main.java~                XPathReader.java          
 bash-3.2$ more src/main/java/pattern/ClassifierF
 ClassifierFactory.java    ClassifierFactory.java~   ClassifierFunction.java   ClassifierFunction.java~  
 bash-3.2$ more src/main/java/pattern/Main.java
diff --git a/main.java b/main.java
 /**
  * Builds and runs a Cascading flow named "classify" that applies a
  * PMML-defined classifier to a tab-delimited input and writes the result,
  * with an added "score" field, to a tab-delimited sink.
  *
  * Also writes a DOT description of the flow to "dot/classify.dot".
  * Exits with status -1 when too few arguments are given or when the
  * classifier cannot be built from the PMML file.
  *
  * @param args args[0] PMML model path, args[1] input (orders) path,
  *             args[2] output (classify) path; extra arguments are ignored
  */
 public static void main( String[] args ) {
  // fail with a usage message instead of an ArrayIndexOutOfBoundsException
  if( args.length < 3 ) {
    System.err.println( "usage: <pmml path> <orders path> <classify output path>" );
    System.exit( -1 );
  }

  String pmmlPath = args[ 0 ];
  String ordersPath = args[ 1 ];
  String classifyPath = args[ 2 ];

  Properties properties = new Properties();
  AppProps.setApplicationJarClass( properties, Main.class );
  HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );

  // create source and sink taps
  // NOTE(review): the `true` flag presumably means "has header row" - confirm
  Tap ordersTap = new Hfs( new TextDelimited( true, "\t" ), ordersPath );
  Tap classifyTap = new Hfs( new TextDelimited( true, "\t" ), classifyPath );

  // build the classifier model from PMML
  Classifier model = null;

  try {
    model = ClassifierFactory.getClassifier( pmmlPath );
  } catch ( PatternException e ) {
    // without a model there is nothing to run
    e.printStackTrace();
    System.exit( -1 );
  }

  // define a "Classifier" to evaluate the orders
  Pipe classifyPipe = new Pipe( "classify" );
  classifyPipe = new Each( classifyPipe, Fields.ALL, new ClassifierFunction( new Fields( "score" ), model ), Fields.ALL );

  // connect the taps, pipes, etc., into a flow
  FlowDef flowDef = FlowDef.flowDef().setName( "classify" )
   .addSource( classifyPipe, ordersTap )
   .addSink( classifyPipe, classifyTap );

  // write a DOT file and run the flow
  Flow classifyFlow = flowConnector.connect( flowDef );
  classifyFlow.writeDOT( "dot/classify.dot" );
  classifyFlow.complete();
 }
diff --git a/pmml.xml b/pmml.xml
 <?xml version="1.0"?>
 <PMML version="4.0" xmlns="http://www.dmg.org/PMML-4_0"
 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 xsi:schemaLocation="http://www.dmg.org/PMML-4_0  http://www.dmg.org/v4-0/pmml-4-0.xsd">
 <Header copyright="Copyright (c) 2012 ceteri" description="Random Forest Tree Model">
  <Extension name="user" value="ceteri" extender="Rattle/PMML"/>
  <Application name="Rattle/PMML" version="1.2.30"/>
  <Timestamp>2012-10-22 19:39:28</Timestamp>
 </Header>
 <DataDictionary numberOfFields="4">
  <DataField name="label" optype="categorical" dataType="string">
   <Value value="0"/>
   <Value value="1"/>
  </DataField>
  <DataField name="var0" optype="continuous" dataType="double"/>
  <DataField name="var1" optype="continuous" dataType="double"/>
  <DataField name="var2" optype="continuous" dataType="double"/>
 </DataDictionary>
 <MiningModel modelName="randomForest_Model" functionName="classification">
  <MiningSchema>
   <MiningField name="label" usageType="predicted"/>
   <MiningField name="var0" usageType="active"/>
   <MiningField name="var1" usageType="active"/>
   <MiningField name="var2" usageType="active"/>
  </MiningSchema>
  <Segmentation multipleModelMethod="majorityVote">
   <Segment id="1">
    <True/>
    <TreeModel modelName="randomForest_Model" functionName="classification" algorithmName="randomForest" splitCharacteristic="binarySplit">
     <MiningSchema>
      <MiningField name="label" usageType="predicted"/>
      <MiningField name="var0" usageType="active"/>
      <MiningField name="var1" usageType="active"/>
      <MiningField name="var2" usageType="active"/>
     </MiningSchema>
 ...
	bash-3.2$ ls
	README.md build.gradle cascading.pattern.ipr data model.log src
	build cascading.pattern.iml cascading.pattern.iws dot output
	bash-3.2$ more output/
	classify/ measure/
	bash-3.2$ more output/measure/
	output/measure/ is a directory
	bash-3.2$ more output/measure/part-00000
	label score count
	0 0 73
	0 1 12
	1 0 13
	1 1 102
	bash-3.2$ head output/classify/part-00000
	label var0 var1 var2 order_id predicted score
	1 0 1 0 6f8e1014 1 1
	0 0 0 1 6f8ea22e 0 0
	1 0 1 0 6f8ea435 1 1
	0 0 0 1 6f8ea5e1 0 0
	1 0 1 0 6f8ea785 1 1
	1 0 1 0 6f8ea91e 1 1
	0 1 0 0 6f8eaaba 0 0
	1 0 1 0 6f8eac54 1 1
	0 1 1 0 6f8eade3 1 1
	bash-3.2$ more model.log
	[1] 200 5
	label var0 var1 var2 order_id
	1 1 0 1 0 6f8e1014
	2 0 0 0 1 6f8ea22e
	3 1 0 1 0 6f8ea435
	4 0 0 0 1 6f8ea5e1
	5 1 0 1 0 6f8ea785
	6 1 0 1 0 6f8ea91e
	[1] 40 5

	0 1
	19 21
	[1] 160 4
	MeanDecreaseGini
	var0 0.6591701
	var1 33.8625179
	var2 8.0290020

	Call:
	randomForest(formula = f, data = data_train, ntree = 2)
	Type of random forest: classification
	Number of trees: 2
	No. of variables tried at each split: 1

	OOB estimate of error rate: 13.83%
	Confusion matrix:
	0 1 class.error
	0 28 5 0.1515152
	1 8 53 0.1311475
	true
	pred 0 1
	0 73 13
	1 12 102
	[1] "./data/sample.xml"
	bash-3.2$ more README.md
	cascading.pattern
	=================

	_Pattern_ sub-project for Cascading.org which uses flows as containers
	for machine learning models, importing
	[PMML](http://en.wikipedia.org/wiki/Predictive_Model_Markup_Language)
	model descriptions from _R_, _SAS_, _Weka_, _RapidMiner_, _SQL
	Server_, etc.

	Currently supported algorithms for PMML include:

	* [Random Forest](http://en.wikipedia.org/wiki/Random_forest)


	Build Instructions
	------------------

	To build _Pattern_ and then run its unit tests:

	gradle --info --stacktrace clean test

	The following scripts generate a baseline for the _Random Forest_
	algorithm. This baseline includes a reference data set (simulated
	ecommerce orders) plus a predictive model in PMML:

	./src/py/rf_sample.py 200 > data/orders.tsv
	R --vanilla --slave < src/r/rf_model.R > model.log

	To build _Pattern_ and then run this baseline test:

	gradle clean jar
	rm -rf output
	hadoop jar build/libs/pattern.jar data/sample.xml data/sample.tsv output/classify output/measure output/trap

	For each tuple in the data, a _stream assertion_ tests whether the
	`predicted` field matches the `score` field generated by the
	model. Tuples which fail that assertion get trapped into
	`output/trap/part*` for inspection.

	Also, the _confusion matrix_ shown in `output/measure/part*` should
	match the one logged in `model.log` from the baseline generated in _R_.
	bash-3.2$ more data/orders.tsv
	label var0 var1 var2 order_id
	1 0 1 0 6f8e1014
	0 0 0 1 6f8ea22e
	1 0 1 0 6f8ea435
	0 0 0 1 6f8ea5e1
	1 0 1 0 6f8ea785
	1 0 1 0 6f8ea91e
	0 1 0 0 6f8eaaba
	1 0 1 0 6f8eac54
	0 1 1 0 6f8eade3
	0 1 0 0 6f8eaf87
	0 0 1 0 6f8eb119
	1 0 1 0 6f8eb2e3
	1 1 1 0 6f8eb45e
	1 0 1 0 6f8eb5e6
	1 0 1 0 6f8eb761
	0 0 0 1 6f8eb8d4
	0 0 0 1 6f8eba59
	0 0 1 0 6f8ebbd4
	1 0 1 0 6f8ebd5c
	1 0 1 0 6f8ebec5
	0 1 0 0 6f8ec04a
	0 0 0 1 6f8ec1bd
	1 0 1 0 6f8ec319
	1 0 1 0 6f8ec480
	1 1 0 0 6f8ec5d7
	1 1 1 0 6f8ec72b
	0 0 0 1 6f8ec887
	0 0 0 1 6f8eca05
	1 1 1 0 6f8ecb94
	1 0 1 0 6f8ecd0f
	0 1 0 0 6f8ece82
	1 1 1 0 6f8ecfd7
	1 0 1 0 6f8ed135
	1 1 1 0 6f8ed27d
	0 0 0 1 6f8ed3c7
	0 0 0 1 6f8ed511
	0 0 0 1 6f8ed663
	0 0 0 1 6f8ed7eb
	1 0 1 0 6f8ed940
	0 1 0 0 6f8eda8a
	bash-3.2$ more
	Missing filename ("less --help" for help)
	bash-3.2$ more src/
	main/ py/ r/ test/
	bash-3.2$ more src/r/rf_model.R
	## Baseline generator for the Random Forest PMML test.
	##
	## Reads ./data/orders.tsv, trains a two-tree randomForest on the rows that
	## follow the first 20% of the file, scores every row, then writes
	## ./data/sample.tsv (the input plus a `predicted` column) and
	## ./data/sample.xml (the fitted model as PMML).
	##
	## The bare top-level expressions (dim(), head(), saveXML()) are left
	## unwrapped on purpose: they auto-print, and that output is part of the log.

	## uncomment the following two lines to install the required libraries
	#install.packages("pmml")
	#install.packages("randomForest")

	library(pmml)
	library(randomForest)

	## load the "baseline" reference data

	dat_folder <- "./data"
	data <- read.table(
	  file = file.path(dat_folder, "orders.tsv"),
	  sep = "\t", quote = "", na.strings = "NULL", header = TRUE,
	  ## "UTF8" is not a value read.table recognises; "UTF-8" is
	  encoding = "UTF-8"
	)

	dim(data)
	head(data)

	## split data into test and train sets

	set.seed(71)
	split_ratio <- 2/10
	split <- round(nrow(data) * split_ratio)

	## seq_len() and a logical mask stay correct when split is 0 or nrow(data);
	## 1:split and (split + 1):nrow(data) would count backwards in those cases
	data_tests <- data[seq_len(split), ]
	dim(data_tests)
	print(table(data_tests[,"label"]))

	data_train <- data[seq_len(nrow(data)) > split, ]

	## drop the identifier column by name; a logical mask is a no-op when the
	## column is absent, whereas a negative empty index would drop every column
	data_train <- data_train[, colnames(data_train) != "order_id", drop = FALSE]
	dim(data_train)

	## train a RandomForest model

	f <- as.formula("as.factor(label) ~ .")
	fit <- randomForest(f, data_train, ntree=2)

	## report the fitted model

	print(fit$importance)
	print(fit)

	## score every row (training and holdout alike) so that the exported TSV
	## carries a prediction per order; the confusion matrix below therefore
	## covers the full data set, not only the holdout rows

	predicted <- predict(fit, data)
	data$predicted <- predicted
	confuse <- table(pred = predicted, true = data[, "label"])
	print(confuse)

	## export predicted labels to TSV

	write.table(data, file = file.path(dat_folder, "sample.tsv"), quote = FALSE, sep = "\t", row.names = FALSE)

	## export model to PMML

	saveXML(pmml(fit), file = file.path(dat_folder, "sample.xml"))
	bash-3.2$
	bash-3.2$ more
	.git/ README.md cascading.pattern.iml data/ output/
	.gitignore build/ cascading.pattern.ipr dot/ src/
	.gradle/ build.gradle cascading.pattern.iws model.log
	bash-3.2$ more src/
	main/ py/ r/ test/
	bash-3.2$ more src/main/java/pattern/
	Classifier.java ClassifierFactory.java~ DataField.java PatternException.java rf/
	Classifier.java~ ClassifierFunction.java Main.java PatternException.java~
	ClassifierFactory.java ClassifierFunction.java~ Main.java~ XPathReader.java
	bash-3.2$ more src/main/java/pattern/ClassifierF
	ClassifierFactory.java ClassifierFactory.java~ ClassifierFunction.java ClassifierFunction.java~
	bash-3.2$ more src/main/java/pattern/Main.java
	/**
	 * Builds and runs a Cascading flow named "classify" that applies a
	 * PMML-defined classifier to a tab-delimited input and writes the result,
	 * with an added "score" field, to a tab-delimited sink.
	 *
	 * Also writes a DOT description of the flow to "dot/classify.dot".
	 * Exits with status -1 when too few arguments are given or when the
	 * classifier cannot be built from the PMML file.
	 *
	 * @param args args[0] PMML model path, args[1] input (orders) path,
	 *             args[2] output (classify) path; extra arguments are ignored
	 */
	public static void main( String[] args ) {
	  // fail with a usage message instead of an ArrayIndexOutOfBoundsException
	  if( args.length < 3 ) {
	    System.err.println( "usage: <pmml path> <orders path> <classify output path>" );
	    System.exit( -1 );
	  }

	  String pmmlPath = args[ 0 ];
	  String ordersPath = args[ 1 ];
	  String classifyPath = args[ 2 ];

	  Properties properties = new Properties();
	  AppProps.setApplicationJarClass( properties, Main.class );
	  HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );

	  // create source and sink taps
	  // NOTE(review): the `true` flag presumably means "has header row" - confirm
	  Tap ordersTap = new Hfs( new TextDelimited( true, "\t" ), ordersPath );
	  Tap classifyTap = new Hfs( new TextDelimited( true, "\t" ), classifyPath );

	  // build the classifier model from PMML
	  Classifier model = null;

	  try {
	    model = ClassifierFactory.getClassifier( pmmlPath );
	  } catch ( PatternException e ) {
	    // without a model there is nothing to run
	    e.printStackTrace();
	    System.exit( -1 );
	  }

	  // define a "Classifier" to evaluate the orders
	  Pipe classifyPipe = new Pipe( "classify" );
	  classifyPipe = new Each( classifyPipe, Fields.ALL, new ClassifierFunction( new Fields( "score" ), model ), Fields.ALL );

	  // connect the taps, pipes, etc., into a flow
	  FlowDef flowDef = FlowDef.flowDef().setName( "classify" )
	    .addSource( classifyPipe, ordersTap )
	    .addSink( classifyPipe, classifyTap );

	  // write a DOT file and run the flow
	  Flow classifyFlow = flowConnector.connect( flowDef );
	  classifyFlow.writeDOT( "dot/classify.dot" );
	  classifyFlow.complete();
	}
	<?xml version="1.0"?>
	<PMML version="4.0" xmlns="http://www.dmg.org/PMML-4_0"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://www.dmg.org/PMML-4_0  http://www.dmg.org/v4-0/pmml-4-0.xsd">
	<Header copyright="Copyright (c) 2012 ceteri" description="Random Forest Tree Model">
	<Extension name="user" value="ceteri" extender="Rattle/PMML"/>
	<Application name="Rattle/PMML" version="1.2.30"/>
	<Timestamp>2012-10-22 19:39:28</Timestamp>
	</Header>
	<DataDictionary numberOfFields="4">
	<DataField name="label" optype="categorical" dataType="string">
	<Value value="0"/>
	<Value value="1"/>
	</DataField>
	<DataField name="var0" optype="continuous" dataType="double"/>
	<DataField name="var1" optype="continuous" dataType="double"/>
	<DataField name="var2" optype="continuous" dataType="double"/>
	</DataDictionary>
	<MiningModel modelName="randomForest_Model" functionName="classification">
	<MiningSchema>
	<MiningField name="label" usageType="predicted"/>
	<MiningField name="var0" usageType="active"/>
	<MiningField name="var1" usageType="active"/>
	<MiningField name="var2" usageType="active"/>
	</MiningSchema>
	<Segmentation multipleModelMethod="majorityVote">
	<Segment id="1">
	<True/>
	<TreeModel modelName="randomForest_Model" functionName="classification" algorithmName="randomForest" splitCharacteristic="binarySplit">
	<MiningSchema>
	<MiningField name="label" usageType="predicted"/>
	<MiningField name="var0" usageType="active"/>
	<MiningField name="var1" usageType="active"/>
	<MiningField name="var2" usageType="active"/>
	</MiningSchema>
	...