Skip to content

Instantly share code, notes, and snippets.

@balshor
Created July 29, 2010 21:37
Show Gist options
  • Save balshor/499319 to your computer and use it in GitHub Desktop.
Save balshor/499319 to your computer and use it in GitHub Desktop.
package com.bizo.hive.udtf;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
/**
 * Hive table-generating function that splits a string document on whitespace
 * and forwards one {@code (word, cnt)} row per token, with {@code cnt} always 1.
 */
@Description(name = "tokenize", value = "_FUNC_(doc) - emits (token, 1) for each token in the input document")
public class TokenizeUDTF extends GenericUDTF {

  /** Token delimiter: one or more whitespace characters. Compiled once, reused for every row. */
  private static final Pattern WHITESPACE = Pattern.compile("\\s+");

  /** Inspector for the single string argument; assigned in {@link #initialize}. */
  private PrimitiveObjectInspector stringOI = null;

  /**
   * Validates the call signature and declares the output row shape.
   *
   * @param args inspectors for the call arguments; exactly one primitive string is accepted
   * @return a struct inspector with the columns {@code word} (string) and {@code cnt} (int)
   * @throws UDFArgumentException if the argument count is not one, or the argument is not a
   *     primitive string
   */
  @Override
  public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
    if (args.length != 1) {
      throw new UDFArgumentException("tokenize() takes exactly one argument");
    }

    // Must be "||": reject when the argument is not primitive OR is a primitive other than
    // STRING. Short-circuiting guarantees the cast only runs for primitive inspectors.
    if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE
        || ((PrimitiveObjectInspector) args[0]).getPrimitiveCategory()
            != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
      throw new UDFArgumentException("tokenize() takes a string as a parameter");
    }
    stringOI = (PrimitiveObjectInspector) args[0];

    List<String> fieldNames = new ArrayList<String>(2);
    List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>(2);
    fieldNames.add("word");
    fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    fieldNames.add("cnt");
    fieldOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
    return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
  }

  /**
   * Tokenizes the document in {@code record[0]} and forwards {@code (token, 1)} for each
   * non-empty token. A null document produces no rows.
   *
   * @param record the argument values for the current row; only the first element is read
   * @throws HiveException if forwarding a row fails
   */
  @Override
  public void process(Object[] record) throws HiveException {
    final String document = (String) stringOI.getPrimitiveJavaObject(record[0]);
    if (document == null) {
      return;
    }

    for (String token : WHITESPACE.split(document)) {
      // Splitting an empty document, or one with leading whitespace, yields an empty
      // first element; that is not a real token and must not be counted.
      if (token.isEmpty()) {
        continue;
      }
      forward(new Object[] { token, Integer.valueOf(1) });
    }
  }

  /** No resources are held, so there is nothing to release. */
  @Override
  public void close() throws HiveException {
    // do nothing
  }
}
@balshor
Copy link
Author

balshor commented May 11, 2012

Thanks, fixed.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment