Cascading for the Impatient, Part 4
Main.java
package impatient;

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.operation.aggregator.Count;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexSplitGenerator;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.HashJoin;
import cascading.pipe.Pipe;
import cascading.pipe.assembly.Retain;
import cascading.pipe.joiner.LeftJoin;
import cascading.property.AppProps;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;

public class Main
  {
  public static void main( String[] args )
    {
    String docPath = args[ 0 ];
    String wcPath = args[ 1 ];
    String stopPath = args[ 2 ];

    Properties properties = new Properties();
    AppProps.setApplicationJarClass( properties, Main.class );
    HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );

    // create source and sink taps
    Tap docTap = new Hfs( new TextDelimited( true, "\t" ), docPath );
    Tap wcTap = new Hfs( new TextDelimited( true, "\t" ), wcPath );

    Fields stop = new Fields( "stop" );
    Tap stopTap = new Hfs( new TextDelimited( stop, true, "\t" ), stopPath );

    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields( "token" );
    Fields text = new Fields( "text" );
    RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );
    Fields fieldSelector = new Fields( "doc_id", "token" );
    Pipe docPipe = new Each( "token", text, splitter, fieldSelector );

    // define "ScrubFunction" to clean up the token stream
    Fields scrubArguments = new Fields( "doc_id", "token" );
    docPipe = new Each( docPipe, scrubArguments, new ScrubFunction( scrubArguments ), Fields.RESULTS );

    // perform a left join to remove stop words, discarding the rows
    // which joined with stop words, i.e., were non-null after the left join
    Pipe stopPipe = new Pipe( "stop" );
    Pipe tokenPipe = new HashJoin( docPipe, token, stopPipe, stop, new LeftJoin() );
    tokenPipe = new Each( tokenPipe, stop, new RegexFilter( "^$" ) );

    // determine the word counts
    Pipe wcPipe = new Pipe( "wc", tokenPipe );
    wcPipe = new Retain( wcPipe, token );
    wcPipe = new GroupBy( wcPipe, token );
    wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL );

    // connect the taps, pipes, etc., into a flow
    FlowDef flowDef = FlowDef.flowDef()
      .setName( "wc" )
      .addSource( docPipe, docTap )
      .addSource( stopPipe, stopTap )
      .addTailSink( wcPipe, wcTap );

    // write a DOT file and run the flow
    Flow wcFlow = flowConnector.connect( flowDef );
    wcFlow.writeDOT( "dot/wc.dot" );
    wcFlow.complete();
    }
  }
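ScrubFunction is defined in its own source file elsewhere in the "Impatient" tutorial and is only referenced above. As a rough guide, a minimal sketch of such a Cascading Function, assuming it merely trims and lowercases each token and drops tokens that scrub down to nothing (the tutorial's actual cleanup rules may differ), could look like:

package impatient;

import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;

public class ScrubFunction extends BaseOperation implements Function
  {
  public ScrubFunction( Fields fieldDeclaration )
    {
    // expects two arguments per tuple: doc_id and token
    super( 2, fieldDeclaration );
    }

  public void operate( FlowProcess flowProcess, FunctionCall functionCall )
    {
    TupleEntry argument = functionCall.getArguments();
    String docId = argument.getString( 0 );
    String token = scrubText( argument.getString( 1 ) );

    // drop tokens that scrub down to the empty string
    if( token.length() > 0 )
      functionCall.getOutputCollector().add( new Tuple( docId, token ) );
    }

  // assumed cleanup rule: trim whitespace and lowercase
  public String scrubText( String text )
    {
    return text.trim().toLowerCase();
    }
  }

The fieldDeclaration passed to the constructor matches the scrubArguments fields used in Main.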
en.stop
stop
a
about
after
all
along
an
and
any
are
around
as
asked
at
away
back
be
been
before
between
both
but
by
can
could
did
do
even
few
for
from
get
got
had
hand
has
have
he
he
her
here
high
him
his
how
i
if
in
into
is
it
its
just
large
like
long
man
many
more
most
much
my
near
new
next
no
not
now
of
off
on
one
or
other
our
out
over
right
said
see
she
side
small
so
some
than
that
the
their
them
then
there
these
they
this
those
through
time
to
too
two
up
us
used
was
way
we
were
what
when
where
which
while
who
will
with
within
would
you
your
Hive session log
bash-3.2$ rm -rf derby.log metastore_db/
bash-3.2$ hive -hiveconf hive.metastore.warehouse.dir=/tmp/metadb < src/scripts/wc.q
WARNING: org.apache.hadoop.metrics.jvm.EventCounter is deprecated. Please use org.apache.hadoop.log.metrics.EventCounter in all the log4j.properties files.
Logging initialized using configuration in jar:file:/Users/ceteri/opt/hive-0.9.0-bin/lib/hive-common-0.9.0.jar!/hive-log4j.properties
Hive history file=/tmp/ceteri/hive_job_log_ceteri_201212231521_680816595.txt
2012-12-23 15:21:11.165 java[7881:1903] Unable to load realm info from SCDynamicStore
hive> -- prepare DDL for loading the raw data
    >
    > CREATE TABLE raw_docs (
    > doc_id STRING,
    > text STRING
    > )
    > ROW FORMAT DELIMITED
    > FIELDS TERMINATED BY '\t'
    > STORED AS TEXTFILE
    > ;
OK
Time taken: 3.619 seconds
hive>
    > CREATE TABLE raw_stop (
    > stop STRING
    > )
    > ROW FORMAT DELIMITED
    > FIELDS TERMINATED BY '\t'
    > STORED AS TEXTFILE
    > ;
OK
Time taken: 0.025 seconds
hive>
    > -- load the raw data
    >
    > LOAD DATA
    > LOCAL INPATH 'data/rain.txt'
    > OVERWRITE INTO TABLE raw_docs
    > ;
Copying data from file:/Users/ceteri/src/concur/Impatient/part4/data/rain.txt
Copying file: file:/Users/ceteri/src/concur/Impatient/part4/data/rain.txt
Loading data to table default.raw_docs
Deleted file:/tmp/metadb/raw_docs
OK
Time taken: 0.204 seconds
hive>
    > LOAD DATA
    > LOCAL INPATH 'data/en.stop'
    > OVERWRITE INTO TABLE raw_stop
    > ;
Copying data from file:/Users/ceteri/src/concur/Impatient/part4/data/en.stop
Copying file: file:/Users/ceteri/src/concur/Impatient/part4/data/en.stop
Loading data to table default.raw_stop
Deleted file:/tmp/metadb/raw_stop
OK
Time taken: 0.075 seconds
hive>
    > -- additional steps to remove headers, yay
    >
    > CREATE TABLE docs (
    > doc_id STRING,
    > text STRING
    > )
    > ;
OK
Time taken: 0.024 seconds
hive>
    > INSERT OVERWRITE TABLE docs
    > SELECT
    > *
    > FROM raw_docs
    > WHERE doc_id <> 'doc_id'
    > ;
Total MapReduce jobs = 2
Launching Job 1 out of 2
Number of reduce tasks is set to 0 since there's no reduce operator
12/12/23 15:21:16 WARN conf.HiveConf: DEPRECATED: Ignoring hive-default.xml found on the CLASSPATH at /Users/ceteri/opt/hive-0.9.0-bin/conf/hive-default.xml
12/12/23 15:21:16 WARN conf.HiveConf: hive-site.xml not found on CLASSPATH
WARNING: org.apache.hadoop.metrics.jvm.EventCounter is deprecated. Please use org.apache.hadoop.log.metrics.EventCounter in all the log4j.properties files.
Execution log at: /tmp/ceteri/ceteri_20121223152121_120b279e-0911-4dcc-9d7b-fc9d76ed0562.log
2012-12-23 15:21:16.918 java[7939:1903] Unable to load realm info from SCDynamicStore
Job running in-process (local Hadoop)
Hadoop job information for null: number of mappers: 0; number of reducers: 0
2012-12-23 15:21:19,333 null map = 0%, reduce = 0%
2012-12-23 15:21:22,338 null map = 100%, reduce = 0%
Ended Job = job_local_0001
Execution completed successfully
Mapred Local Task Succeeded . Convert the Join into MapJoin
Ended Job = 1640864005, job is filtered out (removed at runtime).
Moving data to: file:/tmp/hive-ceteri/hive_2012-12-23_15-21-15_366_7720940276006194670/-ext-10000
Loading data to table default.docs
Deleted file:/tmp/metadb/docs
Table default.docs stats: [num_partitions: 0, num_files: 1, num_rows: 5, total_size: 498, raw_data_size: 493]
OK
Time taken: 7.393 seconds
hive>
    > CREATE TABLE stop (
    > stop STRING
    > )
    > ;
OK
Time taken: 0.019 seconds
hive>
    > INSERT OVERWRITE TABLE stop
    > SELECT
    > *
    > FROM raw_stop
    > WHERE stop <> 'stop'
    > ;
Total MapReduce jobs = 2
Launching Job 1 out of 2
Number of reduce tasks is set to 0 since there's no reduce operator
12/12/23 15:21:23 WARN conf.HiveConf: DEPRECATED: Ignoring hive-default.xml found on the CLASSPATH at /Users/ceteri/opt/hive-0.9.0-bin/conf/hive-default.xml
12/12/23 15:21:23 WARN conf.HiveConf: hive-site.xml not found on CLASSPATH
WARNING: org.apache.hadoop.metrics.jvm.EventCounter is deprecated. Please use org.apache.hadoop.log.metrics.EventCounter in all the log4j.properties files.
Execution log at: /tmp/ceteri/ceteri_20121223152121_f1511b39-cffa-4197-8eb4-cdb017ea797e.log
2012-12-23 15:21:24.070 java[7966:1903] Unable to load realm info from SCDynamicStore
Job running in-process (local Hadoop)
Hadoop job information for null: number of mappers: 0; number of reducers: 0
2012-12-23 15:21:26,961 null map = 0%, reduce = 0%
2012-12-23 15:21:29,966 null map = 100%, reduce = 0%
Ended Job = job_local_0001
Execution completed successfully
Mapred Local Task Succeeded . Convert the Join into MapJoin
Ended Job = 1551628365, job is filtered out (removed at runtime).
Moving data to: file:/tmp/hive-ceteri/hive_2012-12-23_15-21-22_781_7709385169981999922/-ext-10000
Loading data to table default.stop
Deleted file:/tmp/metadb/stop
Table default.stop stats: [num_partitions: 0, num_files: 1, num_rows: 119, total_size: 539, raw_data_size: 420]
OK
Time taken: 7.571 seconds
hive>
    > -- tokenize using external Python script
    >
    > CREATE TABLE tokens (
    > token STRING
    > )
    > ;
OK
Time taken: 0.026 seconds
hive>
    > INSERT OVERWRITE TABLE tokens
    > SELECT
    > TRANSFORM(text) USING 'python ./src/scripts/tokenizer.py' AS token
    > FROM docs
    > ;
Total MapReduce jobs = 2
Launching Job 1 out of 2
Number of reduce tasks is set to 0 since there's no reduce operator
12/12/23 15:21:31 WARN conf.HiveConf: DEPRECATED: Ignoring hive-default.xml found on the CLASSPATH at /Users/ceteri/opt/hive-0.9.0-bin/conf/hive-default.xml
12/12/23 15:21:31 WARN conf.HiveConf: hive-site.xml not found on CLASSPATH
WARNING: org.apache.hadoop.metrics.jvm.EventCounter is deprecated. Please use org.apache.hadoop.log.metrics.EventCounter in all the log4j.properties files.
Execution log at: /tmp/ceteri/ceteri_20121223152121_43bba36d-b43f-4098-87f4-e2388633b086.log
2012-12-23 15:21:31.946 java[7994:1903] Unable to load realm info from SCDynamicStore
Job running in-process (local Hadoop)
Hadoop job information for null: number of mappers: 0; number of reducers: 0
2012-12-23 15:21:34,666 null map = 0%, reduce = 0%
2012-12-23 15:21:37,670 null map = 100%, reduce = 0%
Ended Job = job_local_0001
Execution completed successfully
Mapred Local Task Succeeded . Convert the Join into MapJoin
Ended Job = -2104034200, job is filtered out (removed at runtime).
Moving data to: file:/tmp/hive-ceteri/hive_2012-12-23_15-21-30_385_6822997415441284398/-ext-10000
Loading data to table default.tokens
Deleted file:/tmp/metadb/tokens
Table default.tokens stats: [num_partitions: 0, num_files: 1, num_rows: 89, total_size: 454, raw_data_size: 365]
OK
Time taken: 7.626 seconds
hive>
    > -- filter with a left join, then count
    >
    > SELECT token, COUNT(*) AS count
    > FROM (
    > SELECT
    > *
    > FROM tokens LEFT OUTER JOIN stop
    > ON (tokens.token = stop.stop)
    > WHERE stop IS NULL
    > ) t
    > GROUP BY token
    > ;
Total MapReduce jobs = 2
Launching Job 1 out of 2
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapred.reduce.tasks=<number>
12/12/23 15:21:39 WARN conf.HiveConf: DEPRECATED: Ignoring hive-default.xml found on the CLASSPATH at /Users/ceteri/opt/hive-0.9.0-bin/conf/hive-default.xml
12/12/23 15:21:39 WARN conf.HiveConf: hive-site.xml not found on CLASSPATH
WARNING: org.apache.hadoop.metrics.jvm.EventCounter is deprecated. Please use org.apache.hadoop.log.metrics.EventCounter in all the log4j.properties files.
Execution log at: /tmp/ceteri/ceteri_20121223152121_b14dcc61-a51b-4a18-b35e-6faf75706b82.log
2012-12-23 15:21:39.618 java[8022:1903] Unable to load realm info from SCDynamicStore
Job running in-process (local Hadoop)
Hadoop job information for null: number of mappers: 0; number of reducers: 0
2012-12-23 15:21:41,868 null map = 0%, reduce = 0%
2012-12-23 15:21:44,872 null map = 100%, reduce = 0%
2012-12-23 15:21:50,880 null map = 100%, reduce = 100%
Ended Job = job_local_0001
Execution completed successfully
Mapred Local Task Succeeded . Convert the Join into MapJoin
Launching Job 2 out of 2
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapred.reduce.tasks=<number>
12/12/23 15:21:52 WARN conf.HiveConf: DEPRECATED: Ignoring hive-default.xml found on the CLASSPATH at /Users/ceteri/opt/hive-0.9.0-bin/conf/hive-default.xml
12/12/23 15:21:52 WARN conf.HiveConf: hive-site.xml not found on CLASSPATH
WARNING: org.apache.hadoop.metrics.jvm.EventCounter is deprecated. Please use org.apache.hadoop.log.metrics.EventCounter in all the log4j.properties files.
Execution log at: /tmp/ceteri/ceteri_20121223152121_b14dcc61-a51b-4a18-b35e-6faf75706b82.log
2012-12-23 15:21:52.315 java[8049:1903] Unable to load realm info from SCDynamicStore
Job running in-process (local Hadoop)
Hadoop job information for null: number of mappers: 0; number of reducers: 0
2012-12-23 15:21:54,126 null map = 0%, reduce = 0%
2012-12-23 15:21:57,131 null map = 100%, reduce = 0%
2012-12-23 15:22:00,135 null map = 100%, reduce = 100%
Ended Job = job_local_0001
Execution completed successfully
Mapred Local Task Succeeded . Convert the Join into MapJoin
OK
air 1
area 4
australia 1
broken 1
california's 1
cause 1
cloudcover 1
death 1
deserts 1
downwind 1
dry 3
dvd 1
effect 1
known 1
land 2
lee 2
leeward 2
less 1
lies 1
mountain 3
mountainous 1
primary 1
produces 1
rain 5
ranges 1
secrets 1
shadow 4
sinking 1
such 1
valley 1
women 1
Time taken: 22.384 seconds
hive> bash-3.2$
Cascading build and run log
bash-3.2$ ls
LICENSE.txt README.md build.gradle data src
bash-3.2$ hadoop version
Warning: $HADOOP_HOME is deprecated.
Hadoop 1.0.3
Subversion https://svn.apache.org/repos/asf/hadoop/common/branches/branch-1.0 -r 1335192
Compiled by hortonfo on Tue May 8 20:31:25 UTC 2012
From source with checksum e6b0c1e23dcf76907c5fecb4b832f3be
bash-3.2$ gradle -version
------------------------------------------------------------
Gradle 1.0
------------------------------------------------------------
Gradle build time: Tuesday, June 12, 2012 12:56:21 AM UTC
Groovy: 1.8.6
Ant: Apache Ant(TM) version 1.8.2 compiled on December 20 2010
Ivy: 2.2.0
JVM: 1.6.0_33 (Apple Inc. 20.8-b03-424)
OS: Mac OS X 10.7.4 x86_64
bash-3.2$ gradle clean jar
:clean UP-TO-DATE
:compileJava
:processResources UP-TO-DATE
:classes
:jar
BUILD SUCCESSFUL
Total time: 7.836 secs
bash-3.2$ hadoop jar ./build/libs/impatient.jar data/rain.txt output/wc data/en.stop
Warning: $HADOOP_HOME is deprecated.
12/07/23 13:11:39 INFO util.HadoopUtil: resolving application jar from found main method on: impatient.Main
12/07/23 13:11:39 INFO planner.HadoopPlanner: using application jar: /Users/ceteri/src/concur/Impatient/part4/./build/libs/impatient.jar
12/07/23 13:11:39 INFO property.AppProps: using app.id: D22F09ABBCAB0AE1A6D24FFF0F6C64E3
2012-07-23 13:11:39.978 java[3209:1903] Unable to load realm info from SCDynamicStore
12/07/23 13:11:40 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
12/07/23 13:11:40 WARN snappy.LoadSnappy: Snappy native library not loaded
12/07/23 13:11:40 INFO mapred.FileInputFormat: Total input paths to process : 1
12/07/23 13:11:40 INFO util.Version: Concurrent, Inc - Cascading 2.0.1
12/07/23 13:11:40 INFO flow.Flow: [wc] starting
12/07/23 13:11:40 INFO flow.Flow: [wc] source: Hfs["TextDelimited[['stop']]"]["data/en.stop"]"]
12/07/23 13:11:40 INFO flow.Flow: [wc] source: Hfs["TextDelimited[['doc_id', 'text']->[ALL]]"]["data/rain.txt"]"]
12/07/23 13:11:40 INFO flow.Flow: [wc] sink: Hfs["TextDelimited[[UNKNOWN]->['token', 'count']]"]["output/wc"]"]
12/07/23 13:11:40 INFO flow.Flow: [wc] parallel execution is enabled: false
12/07/23 13:11:40 INFO flow.Flow: [wc] starting jobs: 1
12/07/23 13:11:40 INFO flow.Flow: [wc] allocating threads: 1
12/07/23 13:11:40 INFO flow.FlowStep: [wc] starting step: (1/1) output/wc
12/07/23 13:11:40 INFO mapred.FileInputFormat: Total input paths to process : 1
12/07/23 13:11:40 INFO flow.FlowStep: [wc] submitted hadoop job: job_local_0001
12/07/23 13:11:40 INFO mapred.Task: Using ResourceCalculatorPlugin : null
12/07/23 13:11:40 INFO io.MultiInputSplit: current split input path: file:/Users/ceteri/src/concur/Impatient/part4/data/rain.txt
12/07/23 13:11:40 INFO mapred.MapTask: numReduceTasks: 1
12/07/23 13:11:40 INFO mapred.MapTask: io.sort.mb = 100
12/07/23 13:11:40 INFO mapred.MapTask: data buffer = 79691776/99614720
12/07/23 13:11:40 INFO mapred.MapTask: record buffer = 262144/327680
12/07/23 13:11:40 INFO hadoop.FlowMapper: sourcing from: Hfs["TextDelimited[['doc_id', 'text']->[ALL]]"]["data/rain.txt"]"]
12/07/23 13:11:40 INFO hadoop.FlowMapper: sourcing from: Hfs["TextDelimited[['stop']]"]["data/en.stop"]"]
12/07/23 13:11:40 INFO hadoop.FlowMapper: sinking to: GroupBy(wc)[by:[{1}:'token']]
12/07/23 13:11:40 INFO collect.SpillableTupleList: attempting to load codec: org.apache.hadoop.io.compress.GzipCodec
12/07/23 13:11:40 INFO collect.SpillableTupleList: found codec: org.apache.hadoop.io.compress.GzipCodec
12/07/23 13:11:40 INFO mapred.FileInputFormat: Total input paths to process : 1
12/07/23 13:11:40 INFO collect.SpillableTupleList: attempting to load codec: org.apache.hadoop.io.compress.GzipCodec
12/07/23 13:11:40 INFO collect.SpillableTupleList: found codec: org.apache.hadoop.io.compress.GzipCodec
12/07/23 13:11:40 INFO mapred.MapTask: Starting flush of map output
12/07/23 13:11:40 INFO mapred.MapTask: Finished spill 0
12/07/23 13:11:40 INFO mapred.Task: Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting
12/07/23 13:11:43 INFO mapred.LocalJobRunner: file:/Users/ceteri/src/concur/Impatient/part4/data/rain.txt:0+510
12/07/23 13:11:43 INFO mapred.Task: Task 'attempt_local_0001_m_000000_0' done.
12/07/23 13:11:43 INFO mapred.Task: Using ResourceCalculatorPlugin : null
12/07/23 13:11:43 INFO mapred.LocalJobRunner:
12/07/23 13:11:43 INFO mapred.Merger: Merging 1 sorted segments
12/07/23 13:11:43 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 751 bytes
12/07/23 13:11:43 INFO mapred.LocalJobRunner:
12/07/23 13:11:43 INFO hadoop.FlowReducer: sourcing from: GroupBy(wc)[by:[{1}:'token']]
12/07/23 13:11:43 INFO hadoop.FlowReducer: sinking to: Hfs["TextDelimited[[UNKNOWN]->['token', 'count']]"]["output/wc"]"]
12/07/23 13:11:43 INFO mapred.Task: Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting
12/07/23 13:11:43 INFO mapred.LocalJobRunner:
12/07/23 13:11:43 INFO mapred.Task: Task attempt_local_0001_r_000000_0 is allowed to commit now
12/07/23 13:11:43 INFO mapred.FileOutputCommitter: Saved output of task 'attempt_local_0001_r_000000_0' to file:/Users/ceteri/src/concur/Impatient/part4/output/wc
12/07/23 13:11:46 INFO mapred.LocalJobRunner: reduce > reduce
12/07/23 13:11:46 INFO mapred.Task: Task 'attempt_local_0001_r_000000_0' done.
12/07/23 13:11:50 INFO util.Hadoop18TapUtil: deleting temp path output/wc/_temporary
bash-3.2$ more output/wc/part-00000
token count
air 1
area 4
australia 1
broken 1
california's 1
cause 1
cloudcover 1
death 1
deserts 1
downwind 1
dry 3
dvd 1
effect 1
known 1
land 2
lee 2
leeward 2
less 1
lies 1
mountain 3
mountainous 1
primary 1
produces 1
rain 5
ranges 1
secrets 1
shadow 4
sinking 1
such 1
valley 1
women 1
bash-3.2$
Pig run log
bash-3.2$ rm -rf output
bash-3.2$ mkdir -p dot
bash-3.2$ pig -version
Warning: $HADOOP_HOME is deprecated.
Apache Pig version 0.10.0 (r1328203)
compiled Apr 19 2012, 22:54:12
bash-3.2$ pig -p docPath=./data/rain.txt -p wcPath=./output/wc -p stopPath=./data/en.stop ./src/scripts/wc.pig
Warning: $HADOOP_HOME is deprecated.
2012-12-22 10:41:33,271 [main] INFO org.apache.pig.Main - Apache Pig version 0.10.0 (r1328203) compiled Apr 19 2012, 22:54:12
2012-12-22 10:41:33,272 [main] INFO org.apache.pig.Main - Logging error messages to: /Users/ceteri/src/concur/Impatient/part4/pig_1356201693269.log
2012-12-22 10:41:33.371 java[2020:1903] Unable to load realm info from SCDynamicStore
2012-12-22 10:41:33,579 [main] INFO org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: file:///
2012-12-22 10:41:34,152 [main] WARN org.apache.pig.PigServer - Encountered Warning IMPLICIT_CAST_TO_CHARARRAY 2 time(s).
2012-12-22 10:41:34,152 [main] WARN org.apache.pig.PigServer - Encountered Warning USING_OVERLOADED_FUNCTION 1 time(s).
2012-12-22 10:41:34,300 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler - File concatenation threshold: 100 optimistic? false
2012-12-22 10:41:34,309 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.CombinerOptimizer - Choosing to move algebraic foreach to combiner
2012-12-22 10:41:34,325 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size before optimization: 2
2012-12-22 10:41:34,325 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size after optimization: 2
2012-12-22 10:41:34,361 [main] WARN org.apache.pig.PigServer - Encountered Warning IMPLICIT_CAST_TO_CHARARRAY 2 time(s).
2012-12-22 10:41:34,362 [main] WARN org.apache.pig.PigServer - Encountered Warning USING_OVERLOADED_FUNCTION 1 time(s).
2012-12-22 10:41:34,364 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig features used in the script: HASH_JOIN,GROUP_BY,FILTER
2012-12-22 10:41:34,396 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler - File concatenation threshold: 100 optimistic? false
2012-12-22 10:41:34,399 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.CombinerOptimizer - Choosing to move algebraic foreach to combiner
2012-12-22 10:41:34,402 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size before optimization: 2
2012-12-22 10:41:34,402 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size after optimization: 2
2012-12-22 10:41:34,417 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job
2012-12-22 10:41:34,428 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2012-12-22 10:41:34,431 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job6009769361318502147.jar
2012-12-22 10:41:38,205 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job6009769361318502147.jar created
2012-12-22 10:41:38,215 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2012-12-22 10:41:38,221 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=1054
2012-12-22 10:41:38,221 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1
2012-12-22 10:41:38,274 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 1 map-reduce job(s) waiting for submission.
2012-12-22 10:41:38,282 [Thread-6] WARN org.apache.hadoop.util.NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2012-12-22 10:41:38,385 [Thread-6] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-12-22 10:41:38,385 [Thread-6] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-12-22 10:41:38,390 [Thread-6] WARN org.apache.hadoop.io.compress.snappy.LoadSnappy - Snappy native library not loaded
2012-12-22 10:41:38,392 [Thread-6] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-12-22 10:41:38,398 [Thread-6] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-12-22 10:41:38,398 [Thread-6] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-12-22 10:41:38,398 [Thread-6] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-12-22 10:41:38,583 [Thread-7] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-12-22 10:41:38,595 [Thread-7] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/Users/ceteri/src/concur/Impatient/part4/data/en.stop:0+544
2012-12-22 10:41:38,599 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-12-22 10:41:38,688 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-12-22 10:41:38,690 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-12-22 10:41:38,730 [Thread-7] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Created input record counter: Input records from _1_en.stop
2012-12-22 10:41:38,744 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-12-22 10:41:38,752 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-12-22 10:41:38,753 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting
2012-12-22 10:41:38,775 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0001
2012-12-22 10:41:38,776 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 0% complete
2012-12-22 10:41:41,568 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-12-22 10:41:41,568 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0001_m_000000_0' done.
2012-12-22 10:41:41,572 [Thread-7] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-12-22 10:41:41,576 [Thread-7] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/Users/ceteri/src/concur/Impatient/part4/data/rain.txt:0+510
2012-12-22 10:41:41,576 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-12-22 10:41:41,654 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-12-22 10:41:41,655 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-12-22 10:41:41,679 [Thread-7] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Created input record counter: Input records from _0_rain.txt
2012-12-22 10:41:41,691 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-12-22 10:41:41,694 [Thread-7] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-12-22 10:41:41,698 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0001_m_000001_0 is done. And is in the process of commiting
2012-12-22 10:41:44,571 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-12-22 10:41:44,572 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0001_m_000001_0' done.
2012-12-22 10:41:44,583 [Thread-7] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-12-22 10:41:44,583 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-12-22 10:41:44,588 [Thread-7] INFO org.apache.hadoop.mapred.Merger - Merging 2 sorted segments
2012-12-22 10:41:44,596 [Thread-7] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 2 segments left of total size: 3284 bytes
2012-12-22 10:41:44,596 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-12-22 10:41:44,635 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting
2012-12-22 10:41:44,636 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-12-22 10:41:44,636 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0001_r_000000_0 is allowed to commit now
2012-12-22 10:41:44,639 [Thread-7] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0001_r_000000_0' to file:/tmp/temp1482927600/tmp-1364727125
2012-12-22 10:41:47,587 [Thread-7] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce
2012-12-22 10:41:47,587 [Thread-7] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0001_r_000000_0' done.
2012-12-22 10:41:47,589 [Thread-7] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup
2012-12-22 10:41:48,796 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 50% complete
2012-12-22 10:41:48,799 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0001
2012-12-22 10:41:48,800 [main] INFO org.apache.pig.tools.pigstats.ScriptState - Pig script settings are added to the job
2012-12-22 10:41:48,801 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2012-12-22 10:41:48,801 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - creating jar file Job8611044961811192709.jar
2012-12-22 10:41:52,423 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - jar file Job8611044961811192709.jar created
2012-12-22 10:41:52,428 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2012-12-22 10:41:52,433 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - BytesPerReducer=1000000000 maxReducers=999 totalInputFileSize=1037
2012-12-22 10:41:52,433 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Neither PARALLEL nor default parallelism is set for this job. Setting number of reducers to 1
2012-12-22 10:41:52,445 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 1 map-reduce job(s) waiting for submission.
2012-12-22 10:41:52,505 [Thread-11] INFO org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2012-12-22 10:41:52,505 [Thread-11] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2012-12-22 10:41:52,505 [Thread-11] INFO org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2012-12-22 10:41:52,588 [Thread-12] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-12-22 10:41:52,592 [Thread-12] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigRecordReader - Current split being processed file:/tmp/temp1482927600/tmp-1364727125/part-r-00000:0+1037
2012-12-22 10:41:52,593 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - io.sort.mb = 100
2012-12-22 10:41:52,612 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - data buffer = 79691776/99614720
2012-12-22 10:41:52,612 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - record buffer = 262144/327680
2012-12-22 10:41:52,638 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - Starting flush of map output
2012-12-22 10:41:52,653 [Thread-12] INFO org.apache.hadoop.mapred.MapTask - Finished spill 0
2012-12-22 10:41:52,655 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0002_m_000000_0 is done. And is in the process of commiting
2012-12-22 10:41:52,946 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_local_0002
2012-12-22 10:41:55,586 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-12-22 10:41:55,586 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0002_m_000000_0' done.
2012-12-22 10:41:55,594 [Thread-12] INFO org.apache.hadoop.mapred.Task - Using ResourceCalculatorPlugin : null
2012-12-22 10:41:55,594 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-12-22 10:41:55,594 [Thread-12] INFO org.apache.hadoop.mapred.Merger - Merging 1 sorted segments
2012-12-22 10:41:55,595 [Thread-12] INFO org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 1 segments left of total size: 809 bytes
2012-12-22 10:41:55,595 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-12-22 10:41:55,608 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task:attempt_local_0002_r_000000_0 is done. And is in the process of commiting
2012-12-22 10:41:55,610 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner -
2012-12-22 10:41:55,610 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task attempt_local_0002_r_000000_0 is allowed to commit now
2012-12-22 10:41:55,613 [Thread-12] INFO org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local_0002_r_000000_0' to file:/Users/ceteri/src/concur/Impatient/part4/output/wc
2012-12-22 10:41:58,590 [Thread-12] INFO org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce
2012-12-22 10:41:58,591 [Thread-12] INFO org.apache.hadoop.mapred.Task - Task 'attempt_local_0002_r_000000_0' done.
2012-12-22 10:41:58,592 [Thread-12] WARN org.apache.hadoop.mapred.FileOutputCommitter - Output path is null in cleanup
2012-12-22 10:42:02,969 [main] WARN org.apache.pig.tools.pigstats.PigStatsUtil - Failed to get RunningJob for job job_local_0002
2012-12-22 10:42:02,971 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 100% complete
2012-12-22 10:42:02,973 [main] INFO org.apache.pig.tools.pigstats.SimplePigStats - Script Statistics:
HadoopVersion PigVersion UserId StartedAt FinishedAt Features
1.0.3 0.10.0 ceteri 2012-12-22 10:41:34 2012-12-22 10:42:02 HASH_JOIN,GROUP_BY,FILTER
Success!
Job Stats (time in seconds):
JobId Maps Reduces MaxMapTime MinMapTIme AvgMapTime MaxReduceTime MinReduceTime AvgReduceTime Alias Feature Outputs
job_local_0001 1 1 n/a n/a n/a n/a n/a n/a docPipe,stopPipe,tokenPipe HASH_JOIN
job_local_0002 1 1 n/a n/a n/a n/a n/a n/a tokenGroups,wcPipe GROUP_BY,COMBINER file:///Users/ceteri/src/concur/Impatient/part4/output/wc,
Input(s):
Successfully read 0 records from: "file:///Users/ceteri/src/concur/Impatient/part4/data/rain.txt"
Successfully read 0 records from: "file:///Users/ceteri/src/concur/Impatient/part4/data/en.stop"
Output(s):
Successfully stored 0 records in: "file:///Users/ceteri/src/concur/Impatient/part4/output/wc"
Counters:
Total records written : 0
Total bytes written : 0
Spillable Memory Manager spill count : 0
Total bags proactively spilled: 0
Total records proactively spilled: 0
Job DAG:
job_local_0001 -> job_local_0002,
job_local_0002
2012-12-22 10:42:02,973 [main] INFO org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success!
bash-3.2$ cat output/wc/part-r-00000
air 1
dry 3
dvd 1
lee 2
area 4
land 2
less 1
lies 1
rain 5
such 1
cause 1
death 1
known 1
women 1
broken 1
effect 1
ranges 1
shadow 4
valley 1
deserts 1
leeward 2
primary 1
secrets 1
sinking 1
downwind 1
mountain 3
produces 1
australia 1
cloudcover 1
mountainous 1
california's 1
bash-3.2$
wc.pig
docPipe = LOAD '$docPath' USING PigStorage('\t', 'tagsource') AS (doc_id, text);
docPipe = FILTER docPipe BY doc_id != 'doc_id';

stopPipe = LOAD '$stopPath' USING PigStorage('\t', 'tagsource') AS (stop:chararray);
stopPipe = FILTER stopPipe BY stop != 'stop';

-- specify a regex operation to split the "document" text lines into a token stream
tokenPipe = FOREACH docPipe GENERATE doc_id, FLATTEN(TOKENIZE(LOWER(text), ' [](),.')) AS token;
tokenPipe = FILTER tokenPipe BY token MATCHES '\\w.*';

-- perform a left join to remove stop words, discarding the rows
-- which joined with stop words, i.e., were non-null after the left join
tokenPipe = JOIN tokenPipe BY token LEFT, stopPipe BY stop;
tokenPipe = FILTER tokenPipe BY stopPipe::stop IS NULL;

-- determine the word counts
tokenGroups = GROUP tokenPipe BY token;
wcPipe = FOREACH tokenGroups GENERATE group AS token, COUNT(tokenPipe) AS count;

-- output
STORE wcPipe INTO '$wcPath' USING PigStorage('\t', 'tagsource');
EXPLAIN -out dot/wc_pig.dot -dot wcPipe;
wc.q
-- prepare DDL for loading the raw data

CREATE TABLE raw_docs (
  doc_id STRING,
  text STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
;

CREATE TABLE raw_stop (
  stop STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
;

-- load the raw data

LOAD DATA
LOCAL INPATH 'data/rain.txt'
OVERWRITE INTO TABLE raw_docs
;

LOAD DATA
LOCAL INPATH 'data/en.stop'
OVERWRITE INTO TABLE raw_stop
;

-- additional steps to remove headers, yay

CREATE TABLE docs (
  doc_id STRING,
  text STRING
)
;

INSERT OVERWRITE TABLE docs
SELECT
  *
FROM raw_docs
WHERE doc_id <> 'doc_id'
;

CREATE TABLE stop (
  stop STRING
)
;

INSERT OVERWRITE TABLE stop
SELECT
  *
FROM raw_stop
WHERE stop <> 'stop'
;

-- tokenize using external Python script

CREATE TABLE tokens (
  token STRING
)
;

INSERT OVERWRITE TABLE tokens
SELECT
  TRANSFORM(text) USING 'python ./src/scripts/tokenizer.py' AS token
FROM docs
;

-- filter with a left join, then count

SELECT token, COUNT(*) AS count
FROM (
  SELECT
    *
  FROM tokens LEFT OUTER JOIN stop
  ON (tokens.token = stop.stop)
  WHERE stop IS NULL
) t
GROUP BY token
;
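Note that src/scripts/tokenizer.py itself is not included in this gist; the TRANSFORM clause streams each row's text column to the script on stdin and reads back one token per line of stdout. Purely to illustrate that contract, here is a hypothetical stand-in, sketched in Java to match the rest of the code in this gist (the tutorial's actual script is Python and may differ), which lowercases its input and splits on the same punctuation set as Main.java:

// Hypothetical stand-in for src/scripts/tokenizer.py (not included here):
// reads text lines on stdin, emits one lowercased token per line on stdout,
// splitting on the same punctuation set as the Cascading RegexSplitGenerator.
import java.io.BufferedReader;
import java.io.InputStreamReader;

public class Tokenizer
  {
  public static void main( String[] args ) throws Exception
    {
    BufferedReader in = new BufferedReader( new InputStreamReader( System.in ) );
    String line;

    while( ( line = in.readLine() ) != null )
      for( String token : line.toLowerCase().split( "[ \\[\\]\\(\\),.]" ) )
        if( token.length() > 0 )
          System.out.println( token );
    }
  }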