Created
September 28, 2010 16:42
-
-
Save kimsterv/601331 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.IOException; | |
import java.util.Map; | |
import org.apache.hadoop.io.Text; | |
import org.apache.hadoop.mapreduce.InputFormat; | |
import org.apache.hadoop.mapreduce.Job; | |
import org.apache.hadoop.mapreduce.RecordReader; | |
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; | |
import org.apache.pig.LoadFunc; | |
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigFileInputFormat; | |
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit; | |
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextInputFormat; | |
import org.apache.pig.data.Tuple; | |
import org.apache.pig.data.TupleFactory; | |
import org.json.simple.JSONObject; | |
import org.json.simple.parser.JSONParser; | |
import org.json.simple.parser.ParseException; | |
import org.slf4j.Logger; | |
import org.slf4j.LoggerFactory; | |
import com.google.common.collect.Maps; | |
public class PigJsonLoader extends LoadFunc { | |
private static final Logger LOG = LoggerFactory.getLogger(PigJsonLoader.class); | |
private static final TupleFactory tupleFactory_ = TupleFactory.getInstance(); | |
private final JSONParser jsonParser_ = new JSONParser(); | |
private LineRecordReader in = null; | |
public PigJsonLoader() { | |
} | |
@SuppressWarnings("unchecked") | |
@Override | |
public InputFormat getInputFormat() throws IOException { | |
return new PigTextInputFormat(); | |
} | |
@Override | |
public Tuple getNext() throws IOException { | |
boolean notDone = in.nextKeyValue(); | |
if (!notDone) { | |
return null; | |
} | |
String line; | |
Text val = in.getCurrentValue(); | |
if (val == null) { | |
return null; | |
} | |
line = val.toString(); | |
if (line.length() > 0) { | |
Tuple t = parseStringToTuple(line); | |
if (t != null) { | |
return t; | |
} | |
} | |
return null; | |
} | |
protected Tuple parseStringToTuple(String line) { | |
try { | |
Map<String, String> values = Maps.newHashMap(); | |
JSONObject jsonObj = (JSONObject) jsonParser_.parse(line); | |
for (Object key : jsonObj.keySet()) { | |
Object value = jsonObj.get(key); | |
values.put(key.toString(), value != null ? value.toString() | |
: null); | |
} | |
return tupleFactory_.newTuple(values); | |
} catch (ParseException e) { | |
LOG.warn("Could not json-decode string: " + line, e); | |
return null; | |
} catch (NumberFormatException e) { | |
LOG.warn("Very big number exceeds the scale of long: " + line, e); | |
return null; | |
} | |
} | |
@SuppressWarnings("unchecked") | |
@Override | |
public void prepareToRead(RecordReader reader, PigSplit split) | |
throws IOException { | |
in = (LineRecordReader) reader; | |
} | |
@Override | |
public void setLocation(String location, Job job) throws IOException { | |
PigFileInputFormat.setInputPaths(job, location); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Yes, I have run into this issue, and the way I have gotten around it is by extracting each value in one step and flattening it. Each value is a Tuple that you have to flatten out.
e.g.
exp = foreach all_data generate flatten($0#'request_details') as recs, (long)$0#'timestamp' as ts, $0#'user_id' as uid;
exp = filter exp by ts is not null and ts != 0;
req = foreach exp generate recs#'request_type' as req_type, ts, uid, flatten(recs#'_checkin_request_details') as loc_details, flatten(recs#'award_details') as award;