chriswhite199 · September 29, 2013 18:34
diff --git a/AvroWordCountNewAPI.java b/AvroWordCountNewAPI.java
 /**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 package csw;

 import java.io.IOException;
 import java.util.StringTokenizer;

 import org.apache.avro.mapred.AvroKey;
 import org.apache.avro.mapreduce.AvroJob;
 import org.apache.avro.mapreduce.AvroKeyOutputFormat;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;

 import csw.avro.WordCount;

 /**
 * The classic WordCount example, modified to emit Avro {@code WordCount}
 * records (a word and its count) instead of text.
 * <p>
 * Adapted from
 * http://svn.apache.org/viewvc/avro/trunk/doc/examples/mr-example/src/main/java/example/AvroWordCount.java?view=co
 * and modified to use the new Hadoop mapreduce API.
 */
 public class AvroWordCountNewAPI extends Configured implements Tool {

    /**
     * Splits every input line into tokens (default {@link StringTokenizer}
     * delimiters) and emits {@code (token, 1)} for each one.
     */
    public static class Map extends
            Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        // Single instance, overwritten for every emitted token.
        private final Text word = new Text();

        /**
         * Tokenizes one line of text and writes each token with a count of 1.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   the line of text to tokenize
         * @param context sink for the {@code (word, 1)} pairs
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, ONE);
            }
        }
    }

    /**
     * Sums the counts received for a word and writes one Avro
     * {@code WordCount} record holding the word and its total.
     */
    public static class Reduce extends
            Reducer<Text, IntWritable, AvroKey<WordCount>, NullWritable> {
        // One key/datum pair, overwritten and re-emitted for each word.
        private final AvroKey<WordCount> outputKey =
                new AvroKey<WordCount>(new WordCount());

        /**
         * Adds up all values for {@code key} and emits the resulting record.
         *
         * @param key     the word being counted
         * @param values  the partial counts for that word
         * @param context sink for the {@code WordCount} record
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            WordCount datum = outputKey.datum();
            datum.setWord(key.toString());
            datum.setCount(sum);

            // The declared output value type is NullWritable: pass its
            // singleton instead of a bare null reference.
            context.write(outputKey, NullWritable.get());
        }
    }

    /**
     * Configures and runs the word-count job.
     *
     * @param args exactly two elements: the input path and the output path
     * @return -1 if the argument count is wrong, 0 if the job completes
     *         successfully, 1 otherwise
     * @throws Exception if job setup or execution fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err
                    .println("Usage: AvroWordCountNewAPI <input path> <output path>");
            return -1;
        }

        // NOTE(review): on Hadoop 2+ this constructor is deprecated in favor
        // of Job.getInstance(Configuration) - switch once the target Hadoop
        // version is confirmed.
        Job job = new Job(getConf());
        job.setJobName("wordcount");
        job.setJarByClass(AvroWordCountNewAPI.class);

        // Registers the Avro schema of the records written as output keys.
        AvroJob.setOutputKeySchema(job, WordCount.getClassSchema());

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(AvroKeyOutputFormat.class);

        // Map output types differ from the final output types, so both
        // pairs are declared explicitly.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(AvroKey.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point: runs the tool through {@link ToolRunner} and
     * exits with the code returned by {@link #run(String[])}.
     *
     * @param args command-line arguments handed to {@link ToolRunner}
     * @throws Exception if the tool fails
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(),
                new AvroWordCountNewAPI(), args);
        System.exit(res);
    }
 }
diff --git a/wc.avsc b/wc.avsc
 {"namespace": "csw.avro",
 "type": "record",
 "name": "WordCount",
 "fields": [
     {"name": "word", "type": "string"},
     {"name": "count",  "type": "int"}
 ]
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package csw;

	import java.io.IOException;
	import java.util.StringTokenizer;

	import org.apache.avro.mapred.AvroKey;
	import org.apache.avro.mapreduce.AvroJob;
	import org.apache.avro.mapreduce.AvroKeyOutputFormat;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.conf.Configured;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.io.IntWritable;
	import org.apache.hadoop.io.LongWritable;
	import org.apache.hadoop.io.NullWritable;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.mapreduce.Job;
	import org.apache.hadoop.mapreduce.Mapper;
	import org.apache.hadoop.mapreduce.Reducer;
	import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
	import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
	import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
	import org.apache.hadoop.util.Tool;
	import org.apache.hadoop.util.ToolRunner;

	import csw.avro.WordCount;

	/**
	 * The classic WordCount example, modified to emit Avro {@code WordCount}
	 * records (a word and its count) instead of text.
	 * <p>
	 * Adapted from
	 * http://svn.apache.org/viewvc/avro/trunk/doc/examples/mr-example/src/main/java/example/AvroWordCount.java?view=co
	 * and modified to use the new Hadoop mapreduce API.
	 */
	public class AvroWordCountNewAPI extends Configured implements Tool {

	    /**
	     * Splits every input line into tokens (default {@link StringTokenizer}
	     * delimiters) and emits {@code (token, 1)} for each one.
	     */
	    public static class Map extends
	            Mapper<LongWritable, Text, Text, IntWritable> {
	        private static final IntWritable ONE = new IntWritable(1);
	        // Single instance, overwritten for every emitted token.
	        private final Text word = new Text();

	        /**
	         * Tokenizes one line of text and writes each token with a count of 1.
	         *
	         * @param key     input key supplied by the input format (unused)
	         * @param value   the line of text to tokenize
	         * @param context sink for the {@code (word, 1)} pairs
	         */
	        @Override
	        public void map(LongWritable key, Text value, Context context)
	                throws IOException, InterruptedException {
	            StringTokenizer tokenizer = new StringTokenizer(value.toString());
	            while (tokenizer.hasMoreTokens()) {
	                word.set(tokenizer.nextToken());
	                context.write(word, ONE);
	            }
	        }
	    }

	    /**
	     * Sums the counts received for a word and writes one Avro
	     * {@code WordCount} record holding the word and its total.
	     */
	    public static class Reduce extends
	            Reducer<Text, IntWritable, AvroKey<WordCount>, NullWritable> {
	        // One key/datum pair, overwritten and re-emitted for each word.
	        private final AvroKey<WordCount> outputKey =
	                new AvroKey<WordCount>(new WordCount());

	        /**
	         * Adds up all values for {@code key} and emits the resulting record.
	         *
	         * @param key     the word being counted
	         * @param values  the partial counts for that word
	         * @param context sink for the {@code WordCount} record
	         */
	        @Override
	        protected void reduce(Text key, Iterable<IntWritable> values,
	                Context context) throws IOException, InterruptedException {
	            int sum = 0;
	            for (IntWritable value : values) {
	                sum += value.get();
	            }
	            WordCount datum = outputKey.datum();
	            datum.setWord(key.toString());
	            datum.setCount(sum);

	            // The declared output value type is NullWritable: pass its
	            // singleton instead of a bare null reference.
	            context.write(outputKey, NullWritable.get());
	        }
	    }

	    /**
	     * Configures and runs the word-count job.
	     *
	     * @param args exactly two elements: the input path and the output path
	     * @return -1 if the argument count is wrong, 0 if the job completes
	     *         successfully, 1 otherwise
	     * @throws Exception if job setup or execution fails
	     */
	    @Override
	    public int run(String[] args) throws Exception {
	        if (args.length != 2) {
	            System.err
	                    .println("Usage: AvroWordCountNewAPI <input path> <output path>");
	            return -1;
	        }

	        // NOTE(review): on Hadoop 2+ this constructor is deprecated in favor
	        // of Job.getInstance(Configuration) - switch once the target Hadoop
	        // version is confirmed.
	        Job job = new Job(getConf());
	        job.setJobName("wordcount");
	        job.setJarByClass(AvroWordCountNewAPI.class);

	        // Registers the Avro schema of the records written as output keys.
	        AvroJob.setOutputKeySchema(job, WordCount.getClassSchema());

	        job.setMapperClass(Map.class);
	        job.setReducerClass(Reduce.class);

	        job.setInputFormatClass(TextInputFormat.class);
	        job.setOutputFormatClass(AvroKeyOutputFormat.class);

	        // Map output types differ from the final output types, so both
	        // pairs are declared explicitly.
	        job.setMapOutputKeyClass(Text.class);
	        job.setMapOutputValueClass(IntWritable.class);

	        job.setOutputKeyClass(AvroKey.class);
	        job.setOutputValueClass(NullWritable.class);

	        FileInputFormat.setInputPaths(job, new Path(args[0]));
	        FileOutputFormat.setOutputPath(job, new Path(args[1]));

	        return job.waitForCompletion(true) ? 0 : 1;
	    }

	    /**
	     * Command-line entry point: runs the tool through {@link ToolRunner} and
	     * exits with the code returned by {@link #run(String[])}.
	     *
	     * @param args command-line arguments handed to {@link ToolRunner}
	     * @throws Exception if the tool fails
	     */
	    public static void main(String[] args) throws Exception {
	        int res = ToolRunner.run(new Configuration(),
	                new AvroWordCountNewAPI(), args);
	        System.exit(res);
	    }
	}
	{"namespace": "csw.avro",
	"type": "record",
	"name": "WordCount",
	"fields": [
	{"name": "word", "type": "string"},
	{"name": "count", "type": "int"}
	]
	}