Created
September 28, 2010 16:42
-
-
Save kimsterv/601331 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.IOException; | |
import java.util.Map; | |
import org.apache.hadoop.io.Text; | |
import org.apache.hadoop.mapreduce.InputFormat; | |
import org.apache.hadoop.mapreduce.Job; | |
import org.apache.hadoop.mapreduce.RecordReader; | |
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; | |
import org.apache.pig.LoadFunc; | |
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigFileInputFormat; | |
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit; | |
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextInputFormat; | |
import org.apache.pig.data.Tuple; | |
import org.apache.pig.data.TupleFactory; | |
import org.json.simple.JSONObject; | |
import org.json.simple.parser.JSONParser; | |
import org.json.simple.parser.ParseException; | |
import org.slf4j.Logger; | |
import org.slf4j.LoggerFactory; | |
import com.google.common.collect.Maps; | |
public class PigJsonLoader extends LoadFunc { | |
private static final Logger LOG = LoggerFactory.getLogger(PigJsonLoader.class); | |
private static final TupleFactory tupleFactory_ = TupleFactory.getInstance(); | |
private final JSONParser jsonParser_ = new JSONParser(); | |
private LineRecordReader in = null; | |
public PigJsonLoader() { | |
} | |
@SuppressWarnings("unchecked") | |
@Override | |
public InputFormat getInputFormat() throws IOException { | |
return new PigTextInputFormat(); | |
} | |
@Override | |
public Tuple getNext() throws IOException { | |
boolean notDone = in.nextKeyValue(); | |
if (!notDone) { | |
return null; | |
} | |
String line; | |
Text val = in.getCurrentValue(); | |
if (val == null) { | |
return null; | |
} | |
line = val.toString(); | |
if (line.length() > 0) { | |
Tuple t = parseStringToTuple(line); | |
if (t != null) { | |
return t; | |
} | |
} | |
return null; | |
} | |
protected Tuple parseStringToTuple(String line) { | |
try { | |
Map<String, String> values = Maps.newHashMap(); | |
JSONObject jsonObj = (JSONObject) jsonParser_.parse(line); | |
for (Object key : jsonObj.keySet()) { | |
Object value = jsonObj.get(key); | |
values.put(key.toString(), value != null ? value.toString() | |
: null); | |
} | |
return tupleFactory_.newTuple(values); | |
} catch (ParseException e) { | |
LOG.warn("Could not json-decode string: " + line, e); | |
return null; | |
} catch (NumberFormatException e) { | |
LOG.warn("Very big number exceeds the scale of long: " + line, e); | |
return null; | |
} | |
} | |
@SuppressWarnings("unchecked") | |
@Override | |
public void prepareToRead(RecordReader reader, PigSplit split) | |
throws IOException { | |
in = (LineRecordReader) reader; | |
} | |
@Override | |
public void setLocation(String location, Job job) throws IOException { | |
PigFileInputFormat.setInputPaths(job, location); | |
} | |
} |
Yes, I have run into this issue, and I got around it by extracting each value in one step and flattening it. Each value is a Tuple that you have to flatten out.
e.g.
exp = foreach all_data generate flatten($0#'request_details') as recs, (long)$0#'timestamp' as ts, $0#'user_id' as uid;
exp = filter exp by ts is not null and ts != 0;
req = foreach exp generate recs#'request_type' as req_type, ts, uid, flatten(recs#'_checkin_request_details') as loc_details, flatten(recs#'award_details') as award;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I'm running into an error when trying to access a field of nested JSON, maybe one of you can give me some insight. I'm using ayonsinha's version of the JSON parser which /should/ convert a nested structure to a map? Maybe I'm just missing something?
e.g.
grunt> x = load '/my_json_data' using ...JsonLoader() as (json:map[]);
grunt> y = foreach x generate json#'field1'#'field2' as field2;
grunt> describe y
2011-06-29 18:24:16,980 [main] WARN org.apache.pig.PigServer - Encountered Warning IMPLICIT_CAST_TO_MAP 1 time(s).
y: {user_id: bytearray}
grunt> dump y
...
ERROR 1081: Cannot cast to map. Expected bytearray but received: tuple
This error seems reasonable to me since getNext is returning a Tuple. Has anyone else run into this, and if so, how did you get around it?
Thanks.