Skip to content

Instantly share code, notes, and snippets.

View myui's full-sized avatar

Makoto YUI myui

View GitHub Profile
-- Source documents table: one row per document.
-- Hive DDL (STRING is the Hive text type); casing made consistent with the
-- other SQL in this file, and IF NOT EXISTS added so the DDL is idempotent.
CREATE TABLE IF NOT EXISTS page (
  docid INT,       -- document identifier
  contents STRING  -- raw document text
);
INSERT OVERWRITE TABLE page_exploded
select
d.docid,
normalize_unicode(t.word) as word
from
# -*- coding: utf-8 -*-
# sort list.txt | uniq | grep -v '#' | grep -v 'noreply' | grep -v 'local' | grep -e '\.' | grep -v 'internal' | grep -v 'contact'
import os
import sys
import requests
import time
from github3 import login
from tqdm import tqdm
@myui
myui / auc.py
Last active June 8, 2018 08:08
def auc(num_positives, num_negatives, predicted):
l_sorted = sorted(range(len(predicted)),key=lambda i: predicted[i],
reverse=True)
fp_cur = 0.0
tp_cur = 0.0
fp_prev = 0.0
tp_prev = 0.0
fp_sum = 0.0
auc_tmp = 0.0
last_score = float("nan")
69.613 129.070 52.111
70.670 128.161 52.446
72.303 128.450 52.853
73.759 127.522 51.786
74.085 129.067 53.352
74.561 134.031 50.992
74.911 134.944 50.744
75.205 129.162 52.800
75.395 129.711 52.844
75.554 132.642 51.427
q = calibrated probability 
  = p / (p + (1-p) / w)

https://pdfs.semanticscholar.org/daf9/ed5dc6c6bad5367d7fd8561527da30e9b8dd.pdf

where 
    p = predicted probability 
    w = negative down-sampling rate
 = (Neg/(Neg+(Pos*k))) / (Neg/(Neg+Pos))
@myui
myui / lr.sql
Created October 29, 2016 01:41
-- Train a logistic-regression model with Hivemall.
-- Each map task trains on its own split via logress(); the reducers then
-- average the per-feature weights (simple model averaging).
-- FIX: the original wrote logress(features,label,..) -- the `..` placeholder
-- is not valid SQL; the hyperparameter string is optional, so it is dropped.
CREATE TABLE lr_model AS
SELECT
  feature, -- reducers perform model averaging in parallel
  AVG(weight) AS weight
FROM (
  SELECT logress(features, label) AS (feature, weight)
  FROM train
) t -- map-only task
GROUP BY feature; -- shuffled to reducers
// Streaming prediction against a pre-trained linear model (slide pseudo-code:
// the `...` placeholders stand for application-specific setup, and the
// predict { ... } block is not closed in this excerpt).
val testData =
ssc.textFileStream(...).map(LabeledPoint.parse) // Infinite stream
testData.predict { case testDf =>
// Explode features in input streams
val testDf_exploded = ...
// Score = per-row sum of (model weight * feature value).
// NOTE(review): LEFT_OUTER keeps rows whose features are missing from the
// model; their weight is null, so presumably those terms drop out of the
// sum -- confirm against the engine's null handling.
testDf_exploded
.join(model, testDf_exploded("feature") === model("feature"), "LEFT_OUTER")
.select($"rowid", ($"weight" * $"value").as("value"))
.groupby("rowid").sum("value")
# Model averaging expressed through SparkSQL / HiveContext (slide fragment:
# the sql() string literal is opened but never closed in this excerpt, and
# the outer GROUP BY/closing quote are missing -- not runnable as-is).
# No comments can be added below: every following line sits inside the
# unterminated SQL string.
context = HiveContext(sc)
context.sql("
SELECT
feature, avg(weight) as weight
FROM (
SELECT train_logregr(features, label)
as (feature, weight)
FROM trainTable
) model
// Batch training with the Hivemall-on-Spark DataFrame DSL: train per-feature
// weights with train_logregr, then average them per feature.
// FIX: line 2 originally used typographic quotes ("libsvm”).load(“a9a.train"),
// which do not compile in Scala; replaced with ASCII double quotes.
val trainDf =
spark.read.format("libsvm").load("a9a.train")
trainDf.train_logregr($"feature", $"label")
.groupby("feature")
.agg("weight"->"avg")
-- Pig Latin version of logistic-regression training with Hivemall UDFs:
-- load examples, train per-example (feature, weight) pairs, group by
-- feature, and average the weights into the final model.
a = load 'a9a.train'
    as (rowid:int, label:float, features:{(featurepair:chararray)});
-- FIX: original read `foreach c`, but c is only defined below as the grouping
-- of b (a circular/undefined reference); training iterates over relation a.
b = foreach a generate flatten(
    logress(features, label, '-total_steps ${total_steps}')
) as (feature, weight);
c = group b by feature;
-- FIX: after `group b by feature`, the inner bag in c is named b, so the
-- aggregate must reference b.weight (original wrote AVG(c.weight)).
d = foreach c generate group, AVG(b.weight);
store d into 'a9a_model1';