Skip to content

Instantly share code, notes, and snippets.

View myui's full-sized avatar

Makoto YUI myui

View GitHub Profile
-- Source documents table: one row per document.
-- Hive DDL (STRING is the Hive text type); casing made consistent with the
-- other SQL in this file, and IF NOT EXISTS added so the DDL is idempotent.
CREATE TABLE IF NOT EXISTS page (
  docid INT,       -- document identifier
  contents STRING  -- raw document text
);
INSERT OVERWRITE TABLE page_exploded
select
d.docid,
normalize_unicode(t.word) as word
from
# -*- coding: utf-8 -*-
# sort list.txt | uniq | grep -v '#' | grep -v 'noreply' | grep -v 'local' | grep -e '\.' | grep -v 'internal' | grep -v 'contact'
import os
import sys
import requests
import time
from github3 import login
from tqdm import tqdm
@myui
myui / auc.py
Last active June 8, 2018 08:08
def auc(num_positives, num_negatives, predicted):
l_sorted = sorted(range(len(predicted)),key=lambda i: predicted[i],
reverse=True)
fp_cur = 0.0
tp_cur = 0.0
fp_prev = 0.0
tp_prev = 0.0
fp_sum = 0.0
auc_tmp = 0.0
last_score = float("nan")
69.613 129.070 52.111
70.670 128.161 52.446
72.303 128.450 52.853
73.759 127.522 51.786
74.085 129.067 53.352
74.561 134.031 50.992
74.911 134.944 50.744
75.205 129.162 52.800
75.395 129.711 52.844
75.554 132.642 51.427
q = calibrated probability 
  = p / (p + (1-p) / w)

https://pdfs.semanticscholar.org/daf9/ed5dc6c6bad5367d7fd8561527da30e9b8dd.pdf

where 
    p = predicted probability 
    w = negative down-sampling rate
 = (Neg/(Neg+(Pos*k))) / (Neg/(Neg+Pos))
@myui
myui / lr.sql
Created October 29, 2016 01:41
-- Train a logistic-regression model with Hivemall.
-- Each map task trains on its own split via logress(); the reducers then
-- average the per-feature weights (simple model averaging).
-- FIX: the original wrote logress(features,label,..) -- the `..` placeholder
-- is not valid SQL; the hyperparameter string is optional, so it is dropped.
CREATE TABLE lr_model AS
SELECT
  feature, -- reducers perform model averaging in parallel
  AVG(weight) AS weight
FROM (
  SELECT logress(features, label) AS (feature, weight)
  FROM train
) t -- map-only task
GROUP BY feature; -- shuffled to reducers
// Streaming prediction against a pre-trained linear model (slide pseudo-code:
// the `...` placeholders stand for application-specific setup, and the
// predict { ... } block is not closed in this excerpt).
val testData =
ssc.textFileStream(...).map(LabeledPoint.parse) // Infinite stream
testData.predict { case testDf =>
// Explode features in input streams
val testDf_exploded = ...
// Score = per-row sum of (model weight * feature value).
// NOTE(review): LEFT_OUTER keeps rows whose features are missing from the
// model; their weight is null, so presumably those terms drop out of the
// sum -- confirm against the engine's null handling.
testDf_exploded
.join(model, testDf_exploded("feature") === model("feature"), "LEFT_OUTER")
.select($"rowid", ($"weight" * $"value").as("value"))
.groupby("rowid").sum("value")
# Model averaging expressed through SparkSQL / HiveContext (slide fragment:
# the sql() string literal is opened but never closed in this excerpt, and
# the outer GROUP BY/closing quote are missing -- not runnable as-is).
# No comments can be added below: every following line sits inside the
# unterminated SQL string.
context = HiveContext(sc)
context.sql("
SELECT
feature, avg(weight) as weight
FROM (
SELECT train_logregr(features, label)
as (feature, weight)
FROM trainTable
) model
// Batch training with the Hivemall-on-Spark DataFrame DSL: train per-feature
// weights with train_logregr, then average them per feature.
// FIX: line 2 originally used typographic quotes ("libsvm”).load(“a9a.train"),
// which do not compile in Scala; replaced with ASCII double quotes.
val trainDf =
spark.read.format("libsvm").load("a9a.train")
trainDf.train_logregr($"feature", $"label")
.groupby("feature")
.agg("weight"->"avg")
-- Pig Latin version of logistic-regression training with Hivemall UDFs:
-- load examples, train per-example (feature, weight) pairs, group by
-- feature, and average the weights into the final model.
a = load 'a9a.train'
    as (rowid:int, label:float, features:{(featurepair:chararray)});
-- FIX: original read `foreach c`, but c is only defined below as the grouping
-- of b (a circular/undefined reference); training iterates over relation a.
b = foreach a generate flatten(
    logress(features, label, '-total_steps ${total_steps}')
) as (feature, weight);
c = group b by feature;
-- FIX: after `group b by feature`, the inner bag in c is named b, so the
-- aggregate must reference b.weight (original wrote AVG(c.weight)).
d = foreach c generate group, AVG(b.weight);
store d into 'a9a_model1';