Created
September 4, 2011 13:10
-
-
Save danbri/1192831 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
bash-3.2$ cat miglib.pig | |
-- Mahout Pig integration | |
-- Only proper Pig Latin can go in an imported macro; file management, jar registration etc. have | |
-- to be run via .pig files. | |
-- We need piggybank.jar for reading Mahout's Hadoop Sequence files, plus other utilities: | |
-- | |
-- NOTE(review): this local path says "bandri" while the HDFS path used in mig.pig says "danbri"; confirm it is not a typo.
REGISTER /Users/bandri/working/pig/pig-0.9.0/contrib/piggybank/java/piggybank.jar; | |
-- We import definitions for Mahout Macros. | |
-- Note: you can only do this once per Pig-environment, so don't re-invoke this script. | |
-- The macro will expose Mahout tasks as Pig functions: | |
-- | |
-- * collocations (path_to_seq_files, junk_dummy_dir) | |
-- * nothing else yet | |
IMPORT 'mig.macro'; | |
-- We load something pointless, since MAPREDUCE requires us to pass in data via Pig tuples, even when | |
-- the real Mahout input is already in hdfs from previous operations. Not clear how to improve this. | |
-- | |
-- IGNORE is the dummy relation that mig.pig later passes as the second argument of collocations().
IGNORE = load 'migtest/input.txt' using PigStorage('\t') AS (foo: chararray, bar:chararray); -- dummy | |
run dummy.pig; -- clean up from any previous dummy-input files left around in hdfs | |
bash-3.2$ cat mig.macro | |
-- MAHOUT PIG INTEGRATION MACROs | |
-- | |
-- use via: IMPORT 'mig.macro'; | |
-- | |
-- | |
-- REGISTER /Users/bandri/working/pig/pig-0.9.0/contrib/piggybank/java/piggybank.jar; -- not allowed in macro | |
-- collocations: run Mahout's CollocDriver over a directory of sequence files
-- and return the n-grams it produces, ordered by score (highest first).
--
-- Parameters:
--   SEQDIR  path passed to CollocDriver as its -i input
--   IGNORE  relation that the MAPREDUCE statement stores into
--           'migtest/dummy-input' (MAPREDUCE needs a relation to store)
-- Returns:
--   sorted_concepts  rows of (phrase: chararray, score: float), score descending
--
-- The job output directory (migtest/collocations_output) and the dummy store
-- location (migtest/dummy-input) are fixed inside the macro.
--
-- Macro parameters have to be referenced with a $ prefix. The relation to
-- store is therefore written as $IGNORE, so the macro uses whatever relation
-- the caller passes instead of depending on an alias literally named IGNORE.
DEFINE collocations (SEQDIR,IGNORE) RETURNS sorted_concepts {
    DEFINE SequenceFileLoader org.apache.pig.piggybank.storage.SequenceFileLoader();
    raw_concepts = MAPREDUCE '../../core/target/mahout-core-0.6-SNAPSHOT-job.jar' STORE $IGNORE INTO 'migtest/dummy-input' LOAD 'migtest/collocations_output/ngrams/part-r-*' USING SequenceFileLoader AS (phrase: chararray, score: float) `org.apache.mahout.driver.MahoutDriver org.apache.mahout.vectorizer.collocations.llr.CollocDriver -i $SEQDIR -o migtest/collocations_output --analyzerName org.apache.mahout.vectorizer.DefaultAnalyzer --maxNGramSize 2 --preprocess --overwrite `;
    $sorted_concepts = order raw_concepts by score desc;
};
-- see also http://www.hortonworks.com/new-apache-pig-features-part-1-macro/ | |
bash-3.2$ cat mig.pig | |
-- This Pig script shows basic Mahout integration | |
-- It is a kludge, because our source data is already in an hdfs seqdir. | |
-- Flow: run miglib.pig for setup, call the collocations macro on a seqdir,
-- then filter the returned (phrase, score) rows by topic keywords.
run miglib.pig; -- basic setup, including macro definitions | |
-- get collocated phrases from a seqdir | |
-- IGNORE is the dummy relation loaded in miglib.pig.
reuters_phrases = collocations('/user/danbri/migtest/reuters-out-seqdir', IGNORE); | |
-- Phrases mentioning market, exchange or finance, with a score above 10.
market_phrases = FILTER reuters_phrases BY phrase MATCHES '.*(market|exchange|finance).*' AND score > (float)10; | |
-- Phrases mentioning president, minister, government or election, with a score above 10.
political_phrases = FILTER reuters_phrases BY phrase MATCHES '.*(president|minister|government|election).*' AND score > (float)10; | |
-- NOTE(review): no STORE or DUMP follows the two filters in this script; presumably the relations are inspected interactively. Confirm.
run dummy.pig; -- cleans any dummy input files left in hdfs | |
danbri$ cat dummy.pig | |
-- Remove the dummy-input directory left behind by earlier runs.
-- rmf does not report an error when the path is absent, so the directory
-- does not need to be created first just to make the removal succeed.
rmf migtest/dummy-input/;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment