# for auc()
> library(pROC)
# for performance plots
> library(ROCR)
Loading required package: gplots
KernSmooth 2.23 loaded
Copyright M. P. Wand 1997-2009
Attaching package: ‘gplots’
package com.mapr.bench;

import com.google.caliper.Benchmark;
import com.google.caliper.runner.Running;
import org.apache.commons.math3.util.FastMath;
import org.junit.Assert;
import org.junit.Test;

public class Distance {
    @Benchmark
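The listing breaks off at the @Benchmark annotation. For orientation, a Caliper benchmark method takes a repetition count and should return a value computed from the work so the JIT cannot optimize the loop away. A minimal sketch, assuming the point of the benchmark is to compare FastMath against java.lang.Math; the method names and bodies are illustrative, not the original code:

package com.mapr.bench;

import com.google.caliper.Benchmark;
import org.apache.commons.math3.util.FastMath;

public class Distance {
    private final double x = 3.1, y = 4.2;

    // Caliper passes the repetition count; accumulating and returning a value
    // keeps the JIT from eliminating the computation as dead code
    @Benchmark
    double fastMathDistance(int reps) {
        double sum = 0;
        for (int i = 0; i < reps; i++) {
            sum += FastMath.sqrt(x * x + y * y);
        }
        return sum;
    }

    @Benchmark
    double jdkMathDistance(int reps) {
        double sum = 0;
        for (int i = 0; i < reps; i++) {
            sum += Math.sqrt(x * x + y * y);
        }
        return sum;
    }
}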
public static class BigDecimalWritable implements Writable {
    private BigDecimal value;

    public BigDecimalWritable(BigDecimal value) {
        this.value = value;
    }

    public BigDecimal value() {
        return value;
    }
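The excerpt stops before the two methods that the Writable interface requires. One way to complete it, assuming the value is encoded as its unscaled integer bytes plus the scale (the encoding is my choice, not necessarily the original):

// additional imports needed: java.io.DataInput, java.io.DataOutput,
// java.io.IOException, java.math.BigInteger

// Hadoop instantiates Writables reflectively, so a no-argument
// constructor is needed as well
public BigDecimalWritable() {
}

@Override
public void write(DataOutput out) throws IOException {
    // a BigDecimal is fully determined by its unscaled value and its scale
    byte[] bytes = value.unscaledValue().toByteArray();
    out.writeInt(bytes.length);
    out.write(bytes);
    out.writeInt(value.scale());
}

@Override
public void readFields(DataInput in) throws IOException {
    byte[] bytes = new byte[in.readInt()];
    in.readFully(bytes);
    value = new BigDecimal(new BigInteger(bytes), in.readInt());
}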
@Test
public void testStats() {
    // the reference limits here were derived using a numerical simulation where I took
    // 10,000 samples from the distribution in question and computed the stats from that
    // sample to get min, 25%-ile, median and so on. I did this 1000 times to get 5% and
    // 95% confidence limits for those values.

    // symmetrical, well behaved
    System.out.printf("normal\n");
    check(normal(10000));
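The simulation described in the comment can be reconstructed roughly as follows; the class name and the exact set of statistics are my reading of the comment, not the original script:

import java.util.Arrays;
import java.util.Random;

public class ReferenceLimits {
    public static void main(String[] args) {
        Random gen = new Random();
        int trials = 1000, n = 10000;
        double[] quantiles = {0.0, 0.25, 0.5, 0.75, 1.0};
        double[][] stats = new double[quantiles.length][trials];

        for (int trial = 0; trial < trials; trial++) {
            // draw one sample of 10,000 values from the distribution under test
            double[] sample = new double[n];
            for (int i = 0; i < n; i++) {
                sample[i] = gen.nextGaussian();
            }
            Arrays.sort(sample);
            // record min, 25%-ile, median, 75%-ile and max for this trial
            for (int j = 0; j < quantiles.length; j++) {
                stats[j][trial] = sample[(int) Math.min(n - 1, quantiles[j] * n)];
            }
        }

        // the 5% and 95% points of each statistic's distribution over the
        // 1000 trials become the reference limits used in the test
        for (int j = 0; j < quantiles.length; j++) {
            Arrays.sort(stats[j]);
            System.out.printf("q=%.2f limits = [%.4f, %.4f]%n",
                quantiles[j], stats[j][(int) (0.05 * trials)], stats[j][(int) (0.95 * trials)]);
        }
    }
}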
Set<String> common = Sets.newHashSet(firstListOfEmails);
common.retainAll(secondListOfEmails);
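For a self-contained picture of this intersection idiom, here is the same two lines with imports and made-up stand-ins for the two address lists:

import java.util.List;
import java.util.Set;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

public class CommonEmails {
    public static void main(String[] args) {
        // hypothetical inputs standing in for the two address lists
        List<String> firstListOfEmails = Lists.newArrayList("a@x.com", "b@x.com", "c@x.com");
        List<String> secondListOfEmails = Lists.newArrayList("b@x.com", "c@x.com", "d@x.com");

        // copy the first list into a set, then keep only what also appears in the second
        Set<String> common = Sets.newHashSet(firstListOfEmails);
        common.retainAll(secondListOfEmails);

        System.out.println(common);   // prints b@x.com and c@x.com in some order
    }
}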
public class HbaseLookup {
    static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(HbaseLookup.class);

    private HbaseLookup() {}

    @FunctionTemplate(name = "hLookup", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL)
    public static class Lookup implements DrillSimpleFunc {
        @Param VarCharHolder table;   // the table to read from
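The excerpt ends at the first parameter. A Drill simple function generally continues with the remaining @Param and @Output holders and the setup()/eval() pair that the DrillSimpleFunc interface requires; the sketch below shows that shape. The extra parameter names, the injected buffer, and the eval() body are assumptions for illustration, not the original implementation:

@Param VarCharHolder row;        // hypothetical: the row key to look up
@Output VarCharHolder out;       // where the looked-up value goes
@Inject DrillBuf buffer;         // scratch buffer for the output bytes

public void setup() {
    // one-time initialization, e.g. setting up the HBase connection
}

public void eval() {
    // stand-in for the actual HBase read
    byte[] result = "value".getBytes();
    // copy the result into the output holder's buffer
    out.buffer = buffer.reallocIfNeeded(result.length);
    out.buffer.setBytes(0, result);
    out.start = 0;
    out.end = result.length;
}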
# picking the corners of the hypercube at random usually gives us a good selection
d = 0
while (d == 0) {
    centers = matrix(runif(10*10) > 0.5, ncol=10) + 0
    # but occasionally we get a duplicate row, which is easily detected
    # because it makes the determinant zero
    d = det(centers)
}
# start x out by selecting clusters
x = data.frame(n = ceiling(runif(10000, 1e-10, 10)))
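The same corner-picking loop can be sketched in Java, using commons-math3 (already imported elsewhere in these listings) for the determinant test; the LU-based determinant and the class name are my choices:

import java.util.Random;

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.LUDecomposition;

public class RandomCorners {
    public static void main(String[] args) {
        Random gen = new Random();
        double[][] centers = new double[10][10];
        double det = 0;
        // keep drawing random corners of the 10-dimensional unit hypercube
        // until the determinant shows the rows are linearly independent
        while (det == 0) {
            for (int i = 0; i < 10; i++) {
                for (int j = 0; j < 10; j++) {
                    centers[i][j] = gen.nextBoolean() ? 1 : 0;
                }
            }
            det = new LUDecomposition(new Array2DRowRealMatrix(centers)).getDeterminant();
        }
        System.out.printf("det = %.0f%n", det);
    }
}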
# Experiments with t-digest in R

# the standard t-digest bound: cluster size scales with q*(1-q), so clusters
# near the extreme quantiles stay small and those quantiles stay accurate
standard.size.bound = function(n, q) {
    4 * n * q * (1-q)
}

# a constant bound for comparison: the same limit at every quantile
constant.size.bound = function(n, q) {
    n
}
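To make the difference in shape concrete, the two bounds can be evaluated at a few quantiles; this quick Java sketch mirrors the R functions above (the sample size is arbitrary):

public class SizeBounds {
    static double standardSizeBound(int n, double q) {
        return 4 * n * q * (1 - q);
    }

    static double constantSizeBound(int n, double q) {
        return n;   // flat: q is ignored
    }

    public static void main(String[] args) {
        int n = 10000;
        // the standard bound pinches toward zero at the extreme quantiles,
        // while the constant bound stays flat
        for (double q : new double[]{0.001, 0.01, 0.25, 0.5}) {
            System.out.printf("q=%.3f standard=%.1f constant=%.1f%n",
                q, standardSizeBound(n, q), constantSizeBound(n, q));
        }
    }
}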
Log in to the cluster:

ted:downloads$ ssh se-node10.se.lab
Last login: Mon Mar 23 17:35:37 2015 from 10.250.0.220

Please check the cluster reservation calendar:

https://www.google.com/calendar/embed?src=maprtech.com_2d38343133383836382d313737%40resource.calendar.google.com

Poke around looking for my volume and such:

[tdunning@se-node10 ~]$ ls /mapr/se1/user/t
import fileinput
from string import join   # Python 2 only; this script predates Python 3
import json
import csv

### read the output from MAHOUT and collect into hash ###
with open('x', 'rb') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter='\t')
    old_id = ""
    indicators = []