kimsterv · September 28, 2010 16:42 · sanjayvacharya · Feb 9, 2011 · alexmc6 · Feb 9, 2011
diff --git a/PigJsonLoader.java b/PigJsonLoader.java
 import java.io.IOException;
 import java.util.Map;

 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.RecordReader;
 import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
 import org.apache.pig.LoadFunc;
 import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigFileInputFormat;
 import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
 import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextInputFormat;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.data.TupleFactory;
 import org.json.simple.JSONObject;
 import org.json.simple.parser.JSONParser;
 import org.json.simple.parser.ParseException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import com.google.common.collect.Maps;

 /**
  * Pig {@link LoadFunc} that reads text input one line at a time and decodes
  * each line as a JSON object.
  *
  * <p>Every emitted tuple has a single field: a map from each top-level JSON key
  * to the {@code toString()} form of its value (or {@code null} for JSON null).
  * Lines that are empty, are not valid JSON, or are not a JSON object are logged
  * (except empty lines) and skipped.
  */
 public class PigJsonLoader extends LoadFunc {
 	private static final Logger LOG = LoggerFactory.getLogger(PigJsonLoader.class);
 	private static final TupleFactory tupleFactory_ = TupleFactory.getInstance();
 	private final JSONParser jsonParser_ = new JSONParser();
 	private LineRecordReader in = null;

 	public PigJsonLoader() {

 	}

 	/**
 	 * @return a new {@link PigTextInputFormat}, so input is delivered line by line
 	 */
 	@SuppressWarnings("unchecked")
 	@Override
 	public InputFormat getInputFormat() throws IOException {
 		return new PigTextInputFormat();
 	}

 	/**
 	 * Returns the tuple for the next line that decodes to a JSON object.
 	 *
 	 * <p>A {@code null} return tells Pig that the input is exhausted, so empty
 	 * or undecodable lines must be skipped here instead of being reported as
 	 * {@code null}; otherwise one bad line would drop the rest of the split.
 	 *
 	 * @return the next tuple, or {@code null} only when the reader has no more records
 	 * @throws IOException if the underlying reader fails
 	 */
 	@Override
 	public Tuple getNext() throws IOException {
 		while (in.nextKeyValue()) {
 			Text val = in.getCurrentValue();
 			if (val == null) {
 				continue;
 			}

 			String line = val.toString();
 			if (line.length() == 0) {
 				continue;
 			}

 			Tuple t = parseStringToTuple(line);
 			if (t != null) {
 				return t;
 			}
 			// Unparseable line: already logged by parseStringToTuple, move on.
 		}

 		return null;
 	}

 	/**
 	 * Decodes one line of JSON into a single-field tuple holding a map of the
 	 * object's top-level keys to the string form of their values.
 	 *
 	 * @param line the raw text of one input record
 	 * @return the tuple, or {@code null} if the line is not valid JSON, is not a
 	 *         JSON object, or contains a number the parser cannot represent
 	 */
 	protected Tuple parseStringToTuple(String line) {
 		try {
 			Object parsed = jsonParser_.parse(line);
 			// Arrays, scalars and the literal null are valid JSON but cannot be
 			// mapped to key/value pairs; skip them rather than fail on the cast.
 			if (!(parsed instanceof JSONObject)) {
 				LOG.warn("Skipping JSON value that is not an object: " + line);
 				return null;
 			}

 			JSONObject jsonObj = (JSONObject) parsed;
 			Map<String, String> values = Maps.newHashMap();
 			for (Object entryObj : jsonObj.entrySet()) {
 				Map.Entry<?, ?> entry = (Map.Entry<?, ?>) entryObj;
 				Object value = entry.getValue();
 				values.put(entry.getKey().toString(),
 						value != null ? value.toString() : null);
 			}
 			return tupleFactory_.newTuple(values);
 		} catch (ParseException e) {
 			LOG.warn("Could not json-decode string: " + line, e);
 			return null;
 		} catch (NumberFormatException e) {
 			LOG.warn("Very big number exceeds the scale of long: " + line, e);
 			return null;
 		}
 	}

 	/**
 	 * Stores the reader that {@link #getNext()} pulls lines from.
 	 *
 	 * @param reader the record reader for the current split; cast to
 	 *               {@link LineRecordReader}
 	 * @param split  the split being read (unused)
 	 */
 	@SuppressWarnings("unchecked")
 	@Override
 	public void prepareToRead(RecordReader reader, PigSplit split)
 			throws IOException {
 		// NOTE(review): assumes the input format hands back a LineRecordReader;
 		// confirm this holds for the Pig version in use.
 		in = (LineRecordReader) reader;
 	}

 	/**
 	 * Registers {@code location} as the input path(s) of the job.
 	 *
 	 * @param location the input location string passed by Pig
 	 * @param job      the job whose input paths are set
 	 */
 	@Override
 	public void setLocation(String location, Job job) throws IOException {
 		PigFileInputFormat.setInputPaths(job, location);
 	}
 }
	import java.io.IOException;
	import java.util.Map;

	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.mapreduce.InputFormat;
	import org.apache.hadoop.mapreduce.Job;
	import org.apache.hadoop.mapreduce.RecordReader;
	import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
	import org.apache.pig.LoadFunc;
	import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigFileInputFormat;
	import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
	import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextInputFormat;
	import org.apache.pig.data.Tuple;
	import org.apache.pig.data.TupleFactory;
	import org.json.simple.JSONObject;
	import org.json.simple.parser.JSONParser;
	import org.json.simple.parser.ParseException;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import com.google.common.collect.Maps;

	/**
	 * Pig {@link LoadFunc} that reads text input one line at a time and decodes
	 * each line as a JSON object.
	 *
	 * <p>Every emitted tuple has a single field: a map from each top-level JSON key
	 * to the {@code toString()} form of its value (or {@code null} for JSON null).
	 * Lines that are empty, are not valid JSON, or are not a JSON object are logged
	 * (except empty lines) and skipped.
	 */
	public class PigJsonLoader extends LoadFunc {
		private static final Logger LOG = LoggerFactory.getLogger(PigJsonLoader.class);
		private static final TupleFactory tupleFactory_ = TupleFactory.getInstance();
		private final JSONParser jsonParser_ = new JSONParser();
		private LineRecordReader in = null;

		public PigJsonLoader() {

		}

		/**
		 * @return a new {@link PigTextInputFormat}, so input is delivered line by line
		 */
		@SuppressWarnings("unchecked")
		@Override
		public InputFormat getInputFormat() throws IOException {
			return new PigTextInputFormat();
		}

		/**
		 * Returns the tuple for the next line that decodes to a JSON object.
		 *
		 * <p>A {@code null} return tells Pig that the input is exhausted, so empty
		 * or undecodable lines must be skipped here instead of being reported as
		 * {@code null}; otherwise one bad line would drop the rest of the split.
		 *
		 * @return the next tuple, or {@code null} only when the reader has no more records
		 * @throws IOException if the underlying reader fails
		 */
		@Override
		public Tuple getNext() throws IOException {
			while (in.nextKeyValue()) {
				Text val = in.getCurrentValue();
				if (val == null) {
					continue;
				}

				String line = val.toString();
				if (line.length() == 0) {
					continue;
				}

				Tuple t = parseStringToTuple(line);
				if (t != null) {
					return t;
				}
				// Unparseable line: already logged by parseStringToTuple, move on.
			}

			return null;
		}

		/**
		 * Decodes one line of JSON into a single-field tuple holding a map of the
		 * object's top-level keys to the string form of their values.
		 *
		 * @param line the raw text of one input record
		 * @return the tuple, or {@code null} if the line is not valid JSON, is not a
		 *         JSON object, or contains a number the parser cannot represent
		 */
		protected Tuple parseStringToTuple(String line) {
			try {
				Object parsed = jsonParser_.parse(line);
				// Arrays, scalars and the literal null are valid JSON but cannot be
				// mapped to key/value pairs; skip them rather than fail on the cast.
				if (!(parsed instanceof JSONObject)) {
					LOG.warn("Skipping JSON value that is not an object: " + line);
					return null;
				}

				JSONObject jsonObj = (JSONObject) parsed;
				Map<String, String> values = Maps.newHashMap();
				for (Object entryObj : jsonObj.entrySet()) {
					Map.Entry<?, ?> entry = (Map.Entry<?, ?>) entryObj;
					Object value = entry.getValue();
					values.put(entry.getKey().toString(),
							value != null ? value.toString() : null);
				}
				return tupleFactory_.newTuple(values);
			} catch (ParseException e) {
				LOG.warn("Could not json-decode string: " + line, e);
				return null;
			} catch (NumberFormatException e) {
				LOG.warn("Very big number exceeds the scale of long: " + line, e);
				return null;
			}
		}

		/**
		 * Stores the reader that {@link #getNext()} pulls lines from.
		 *
		 * @param reader the record reader for the current split; cast to
		 *               {@link LineRecordReader}
		 * @param split  the split being read (unused)
		 */
		@SuppressWarnings("unchecked")
		@Override
		public void prepareToRead(RecordReader reader, PigSplit split)
				throws IOException {
			// NOTE(review): assumes the input format hands back a LineRecordReader;
			// confirm this holds for the Pig version in use.
			in = (LineRecordReader) reader;
		}

		/**
		 * Registers {@code location} as the input path(s) of the job.
		 *
		 * @param location the input location string passed by Pig
		 * @param job      the job whose input paths are set
		 */
		@Override
		public void setLocation(String location, Job job) throws IOException {
			PigFileInputFormat.setInputPaths(job, location);
		}
	}