Created
April 24, 2013 06:23
-
-
Save VJ310/5450026 to your computer and use it in GitHub Desktop.
Sentence Boundary Detection using Pig + Java UDF + OpenNLP
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Register the OpenNLP libraries and the project jar so their classes are
-- available to this script.
register 'opennlp-tools-1.5.1-incubating.jar';
register 'opennlp-maxent-3.0.1-incubating.jar';
register 'SentimentUDF-1.0-SNAPSHOT.jar';

-- Short alias for the Java sentence-detection UDF.
define getSentences com.Sentiment.udfSentence();

-- Load reviews from the JSON file, declaring the schema of each record.
raw_review = LOAD 'review.json' USING JsonLoader('votes:(funny:int,useful:int,cool:int),user_id:chararray,review_id:chararray,stars:int,date:chararray,text:chararray,type:chararray,business_id:chararray');

-- Separate the sentences of each review text using the Java UDF. The UDF
-- returns a bag of sentences; flatten turns that bag into one output record
-- per sentence, each keeping the review_id and business_id it came from.
sentences = FOREACH raw_review GENERATE review_id as review_id ,business_id as business_id, flatten(getSentences(text)) as sentence:chararray;

-- Store the separated sentences as JSON.
STORE sentences INTO 'sentences' USING JsonStorage();
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.IOException; | |
import java.io.InputStream; | |
import opennlp.tools.sentdetect.SentenceDetectorME; | |
import opennlp.tools.sentdetect.SentenceModel; | |
import org.apache.pig.EvalFunc; | |
import org.apache.pig.data.BagFactory; | |
import org.apache.pig.data.DataBag; | |
import org.apache.pig.data.DataType; | |
import org.apache.pig.data.DefaultBagFactory; | |
import org.apache.pig.data.Tuple; | |
import org.apache.pig.data.TupleFactory; | |
import org.apache.pig.impl.logicalLayer.FrontendException; | |
import org.apache.pig.impl.logicalLayer.schema.Schema; | |
/** | |
* | |
* Detects sentences from a given text using opennlp sentence tokenizer | |
* | |
*/ | |
public class udfSentence extends EvalFunc<DataBag> { | |
private static final String ENGLISH_SENTMODEL_PATH = "opennlp/en-sent.bin"; | |
private static TupleFactory tupleFactory = TupleFactory.getInstance(); | |
private static BagFactory bagFactory = BagFactory.getInstance(); | |
private SentenceModel model; | |
public udfSentence() throws IOException { | |
ClassLoader loader = getClass().getClassLoader(); | |
//loads sentence model using given file location | |
InputStream in = loader.getResourceAsStream(ENGLISH_SENTMODEL_PATH); | |
if (in == null) { | |
String message = String.format("Failed to find resource for model" | |
+ " sentence detection model: %s", path); | |
throw new IOException(message); | |
} | |
//loads sentence model from input stream | |
model = new SentenceModel(in); | |
} | |
@Override | |
public DataBag exec(Tuple input) throws IOException { | |
if (input == null || input.size() == 0 || input.isNull(0)) | |
return null; | |
DataBag bagOfSentences = bagFactory.newDefaultBag(); | |
if (model != null) { | |
SentenceDetectorME sentenceDetector = new SentenceDetectorME(model); | |
Object objText = input.get(0); | |
if (!(objText instanceof String)) { | |
throw new IOException( | |
"Expected input to be chararray, but got " | |
+ t0.getClass().getName()); | |
} | |
String text = (String) objText; | |
//seperates sentences from given text | |
String sentences[] = sentenceDetector.sentDetect(text); | |
for (String sentence : sentences) { | |
Tuple sentenceTuple = tupleFactory.newTuple(sentence); | |
bagOfSentences.add(sentenceTuple); | |
} | |
return bagOfSentences; | |
} else { | |
return null; | |
} | |
} | |
public Schema outputSchema(Schema input) { | |
Schema bagSchema = new Schema(); | |
bagSchema.add(new Schema.FieldSchema("sentences", DataType.TUPLE)); | |
try { | |
return new Schema(new Schema.FieldSchema(getSchemaName(this | |
.getClass().getName().toLowerCase(), input), bagSchema, | |
DataType.BAG)); | |
} catch (FrontendException e) { | |
return null; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you for the example! I made a version with input schema check at compile-time (https://gist.github.com/alfonsonishikawa/6494478).