/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.bigtop.itest.mahout.smoke;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

import org.junit.After;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

import org.apache.bigtop.itest.JarContent;
import org.apache.bigtop.itest.shell.Shell;
/**
 * Tests the Mahout examples shipped with the distribution.
 */
public class TestMahoutExamples {
  public static final String TEMP_DIR = "/tmp/mahout.${(new Date().getTime())}";
  public static final String WORK_DIR = TEMP_DIR;

  /**
   * If MAHOUT_HOME is set, run the mahout executable under it; otherwise fall
   * back to "mahout" on the PATH. This eases testing of tarball installations
   * and other scenarios where more than one version of an ecosystem component
   * may be available.
   */
  public static String MAHOUT_HOME = System.getenv("MAHOUT_HOME");
  public static String MAHOUT = MAHOUT_HOME ? MAHOUT_HOME + "/bin/mahout" : "mahout";

  private static Shell sh = new Shell("/bin/bash -s");
  public static String download_dir = System.getProperty("mahout.examples.resources.download.path") ?: "/tmp";
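  // Note: point the property above at a persistent directory, e.g. (hypothetical
  // invocation) "mvn test -Dmahout.examples.resources.download.path=/var/cache/mahout",
  // so the large test fixtures are downloaded only once across runs.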
  @BeforeClass
  public static void setUp() {
    // download resources
    sh.exec(
      "if [ ! -f ${download_dir}/20news-bydate.tar.gz ]; then " +
      "curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${download_dir}/20news-bydate.tar.gz; " +
      "fi");
    sh.exec(
      "if [ ! -f ${download_dir}/reuters21578.tar.gz ]; then " +
      "curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${download_dir}/reuters21578.tar.gz; " +
      "fi");
    sh.exec(
      "if [ ! -f ${download_dir}/synthetic_control.data ]; then " +
      "curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data -o ${download_dir}/synthetic_control.data; " +
      "fi");
    sh.exec(
      "if [ ! -f ${download_dir}/ml-1m.zip ]; then " +
      "curl http://www.grouplens.org/system/files/ml-1m.zip -o ${download_dir}/ml-1m.zip; " +
      "fi");
    // uncompress archives:
    //   20news-bydate.tar.gz
    //   reuters21578.tar.gz
    //   ml-1m.zip
    sh.exec("mkdir ${TEMP_DIR}",
            "cd ${TEMP_DIR}",
            "mkdir 20news-bydate",
            "cd 20news-bydate",
            "tar xzf ${download_dir}/20news-bydate.tar.gz",
            "cd ..",
            "mkdir 20news-all",
            "cp -R 20news-bydate/*/* 20news-all",
            "mkdir reuters-sgm",
            "cd reuters-sgm",
            "tar xzf ${download_dir}/reuters21578.tar.gz",
            "cd ..",
            "mkdir movielens",
            "cd movielens",
            "unzip ${download_dir}/ml-1m.zip");
    assertEquals("Failed to uncompress archives", 0, sh.getRet());
    sh.exec("hadoop fs -mkdir ${WORK_DIR}");
    assertEquals("Unable to create work dir in hdfs", 0, sh.getRet());
rmr("temp");
}
  /**
   * Runs the given mahout job via the shell and asserts a zero return code,
   * including the full command in any failure message.
   */
  public void assertRun(String mahoutJob) {
    final String cmd = MAHOUT + " " + mahoutJob;
    sh.exec(cmd);
    assertEquals("Failed to run: " + cmd, 0, sh.getRet());
  }
  @AfterClass
  public static void tearDown() {
    sh.exec("rm -rf ${TEMP_DIR}",
            "hadoop fs -rmr ${WORK_DIR}");
  }

  private static void rmr(String path) {
    sh.exec("hadoop fs -test -e $path");
    if (sh.getRet() == 0) {
      sh.exec("hadoop fs -rmr -skipTrash $path");
      assertEquals("Deletion of $path from HDFS failed", 0, sh.getRet());
    }
  }
  @After
  public void killHangingProcess() {
    // grep exits 0 when 'Total jobs:0' is found, i.e. nothing is running;
    // only kill leftover jobs when that marker is absent.
    sh.exec("mapred job -list | grep 'Total jobs:0'");
    if (sh.getRet() != 0) {
      sh.exec("for jobid in `mapred job -list | grep 'RUNNING' | awk '{print \$1}'`;",
              "do mapred job -kill \${jobid};",
              "done");
    }
  }
  @Test(timeout = 12000000L)
  public void factorizeMovieLensRatings() {
    // convert the ratings to comma-separated userID,itemID,rating (drop the timestamp)
    sh.exec("cat ${TEMP_DIR}/movielens/ml-1m/ratings.dat | sed -e s/::/,/g | cut -d, -f1,2,3 > ${TEMP_DIR}/movielens/ratings.csv");
    assertEquals("Unexpected error from converting ratings", 0, sh.getRet());
    // put ratings in hdfs
    sh.exec("hadoop fs -mkdir ${WORK_DIR}/movielens",
            "hadoop fs -put ${TEMP_DIR}/movielens/ratings.csv ${WORK_DIR}/movielens/ratings.csv");
    assertEquals("Unable to put movielens/ratings.csv in hdfs", 0, sh.getRet());
    // create a 90% training set and a 10% probe set
    assertRun("splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset " +
        "--trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp");
    // run distributed ALS-WR to factorize the rating matrix based on the training set
    assertRun("parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out " +
        "--tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065");
    // compute predictions against the probe set, measure the error
    assertRun("evaluateFactorization --output ${WORK_DIR}/als/rmse --input ${WORK_DIR}/dataset/probeSet/ " +
        "--userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp");
    // compute top-N recommendations per user
    assertRun("recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations " +
        "--userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ " +
        "--numRecommendations 6 --maxRating 5");
    // check that the error has been calculated
    sh.exec("hadoop fs -test -e ${WORK_DIR}/als/rmse/rmse.txt");
    assertEquals("${WORK_DIR}/als/rmse/rmse.txt does not exist", 0, sh.getRet());
    // print the error
    sh.exec("hadoop fs -cat ${WORK_DIR}/als/rmse/rmse.txt");
    assertEquals("Unexpected error from running hadoop", 0, sh.getRet());
    // check that the recommendations have been calculated
    sh.exec("hadoop fs -test -e ${WORK_DIR}/recommendations/part-m-00000");
    assertEquals("${WORK_DIR}/recommendations/part-m-00000 does not exist", 0, sh.getRet());
  }
  // It's too much of a pain to use JUnit parameterized tests, so do it
  // the simple way.
  private void _clusterSyntheticControlData(String algorithm) {
    rmr("testdata");
    sh.exec("hadoop fs -mkdir testdata",
            "hadoop fs -put ${download_dir}/synthetic_control.data testdata");
    assertEquals("Unable to put data in hdfs", 0, sh.getRet());
sh.exec(MAHOUT+" org.apache.mahout.clustering.syntheticcontrol.${algorithm}.Job");
assertEquals("Unexpected error from running mahout", 0, sh.getRet());
}
  @Test(timeout = 900000L)
  public void clusterControlDataWithCanopy() {
    _clusterSyntheticControlData("canopy");
  }

  @Test(timeout = 9000000L)
  public void clusterControlDataWithKMeans() {
    _clusterSyntheticControlData("kmeans");
  }

  @Test(timeout = 9000000L)
  public void clusterControlDataWithFuzzyKMeans() {
    _clusterSyntheticControlData("fuzzykmeans");
  }

  @Test(timeout = 900000L)
  public void clusterControlDataWithDirichlet() {
    _clusterSyntheticControlData("dirichlet");
  }

  @Test(timeout = 900000L)
  public void clusterControlDataWithMeanShift() {
    _clusterSyntheticControlData("meanshift");
  }
  @Test(timeout = 7200000L)
  public void testReutersLDA() {
    // where does lda.algorithm come in?
    assertRun("org.apache.lucene.benchmark.utils.ExtractReuters ${TEMP_DIR}/reuters-sgm ${TEMP_DIR}/reuters-out");
    // put ${TEMP_DIR}/reuters-out into HDFS: seqdirectory runs in MapReduce
    // mode, so its input files need to be in HDFS
    sh.exec("hadoop fs -put ${TEMP_DIR}/reuters-out ${WORK_DIR}/reuters-out");
    assertEquals("Unable to put reuters-out in hdfs", 0, sh.getRet());
    // (WORK_DIR is the same path as TEMP_DIR, so this reads the directory
    // uploaded above and its output also lands in HDFS)
    assertRun("seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 5");
    /*
    // reuters-out-seqdir exists on a local disk at this point,
    // copy it to hdfs
    rmr("${WORK_DIR}/reuters-out-seqdir");
    sh.exec("hadoop fs -put ${TEMP_DIR}/reuters-out-seqdir ${WORK_DIR}/reuters-out-seqdir");
    assertEquals("Unable to put reuters-out-seqdir in hdfs", 0, sh.getRet());
    */
assertRun("""seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
-o ${WORK_DIR}/reuters-out-seqdir-sparse-lda \
-wt tf -seq -nr 3 --namedVector""");
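    // seq2sparse flags: -wt tf keeps raw term-frequency weights (LDA wants
    // counts, not tf-idf), -seq emits sequential-access sparse vectors,
    // -nr 3 uses three reducers, and --namedVector keys each vector by document.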
sh.exec("hadoop fs -mkdir ${WORK_DIR}/reuters-lda");
assertEquals("Unable to make dir reuters-lda in hdfs", 0, sh.getRet());
assertRun("""lda \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tf-vectors \
-o ${WORK_DIR}/reuters-lda -k 20 -x 20 \
&& \
mahout ldatopics \
-i ${WORK_DIR}/reuters-lda/state-20 \
-d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-0 \
-dt sequencefile""");
}
  @Test(timeout = 9000000L)
  public void testBayesNewsgroupClassifier() {
    // put the 20 newsgroups input in hdfs
    sh.exec("hadoop fs -mkdir ${WORK_DIR}/20news-vectors");
    sh.exec("hadoop fs -put ${TEMP_DIR}/20news-all ${WORK_DIR}/20news-all");
    assertEquals("Unable to put 20news-all in hdfs", 0, sh.getRet());
    sh.exec(MAHOUT + " seqdirectory -i ${WORK_DIR}/20news-all -o ${WORK_DIR}/20news-seq");
    assertEquals("Unexpected error from running mahout", 0, sh.getRet());
    sh.exec(MAHOUT + " seq2sparse -i ${WORK_DIR}/20news-seq -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf");
    assertEquals("Unexpected error from running mahout", 0, sh.getRet());
assertRun("""split \
-i ${WORK_DIR}/20news-vectors/tfidf-vectors \
--trainingOutput ${WORK_DIR}/20news-train-vectors \
--testOutput ${WORK_DIR}/20news-test-vectors \
--randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential""");
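    // --randomSelectionPct 40 holds out 40% of the vectors as the test set;
    // -xm sequential runs the split locally instead of as a MapReduce job.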
assertRun("""trainnb \
-i ${WORK_DIR}/20news-train-vectors -el \
-o ${WORK_DIR}/model \
-li ${WORK_DIR}/labelindex \
-ow""");
assertRun("""testnb \
-i ${WORK_DIR}/20news-train-vectors \
-m ${WORK_DIR}/model \
-l ${WORK_DIR}/labelindex \
-ow -o ${WORK_DIR}/20news-testing""");
assertRun("""testnb \
-i ${WORK_DIR}/20news-test-vectors \
-m ${WORK_DIR}/model \
-l ${WORK_DIR}/labelindex \
-ow -o ${WORK_DIR}/20news-testing""");
}
}