q = calibrated probability
= p / (p + (1-p) / w)
https://pdfs.semanticscholar.org/daf9/ed5dc6c6bad5367d7fd8561527da30e9b8dd.pdf
where
p = predicted probability
w = negative down-sampling rate
= (Neg / (Neg + Pos*k)) / (Neg / (Neg + Pos))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
create table page ( | |
docid int, | |
contents string | |
); | |
INSERT OVERWRITE TABLE page_exploded | |
select | |
d.docid, | |
normalize_unicode(t.word) as word | |
from |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# sort list.txt | uniq | grep -v '#' | grep -v 'noreply' | grep -v 'local' | grep -e '\.' | grep -v 'internal' | grep -v 'contact' | |
import os | |
import sys | |
import requests | |
import time | |
from github3 import login | |
from tqdm import tqdm |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def auc(num_positives, num_negatives, predicted): | |
l_sorted = sorted(range(len(predicted)),key=lambda i: predicted[i], | |
reverse=True) | |
fp_cur = 0.0 | |
tp_cur = 0.0 | |
fp_prev = 0.0 | |
tp_prev = 0.0 | |
fp_sum = 0.0 | |
auc_tmp = 0.0 | |
last_score = float("nan") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
69.613 129.070 52.111 | |
70.670 128.161 52.446 | |
72.303 128.450 52.853 | |
73.759 127.522 51.786 | |
74.085 129.067 53.352 | |
74.561 134.031 50.992 | |
74.911 134.944 50.744 | |
75.205 129.162 52.800 | |
75.395 129.711 52.844 | |
75.554 132.642 51.427 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Build the logistic-regression model table: the inner query emits
-- (feature, weight) rows from the `logress` table function over `train`, and
-- the outer query averages the weights per feature.
-- NOTE(review): `logress` is not defined in this snippet (presumably the
-- Hivemall UDTF -- confirm). The `..` argument is an elided-options
-- placeholder carried over from the original; replace it with real options
-- (or drop it) before running.
CREATE TABLE lr_model AS
SELECT
    feature,
    avg(weight) AS weight  -- reducers perform model averaging in parallel
FROM (
    -- map-only task
    SELECT logress(features, label, ..) AS (feature, weight)
    FROM train
) t
GROUP BY feature;  -- shuffled to reducers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
val testData = | |
ssc.textFileStream(...).map(LabeledPoint.parse) // Infinite stream | |
testData.predict { case testDf => | |
// Explode features in input streams | |
val testDf_exploded = ... | |
testDf_exploded | |
.join(model, testDf_exploded("feature") === model("feature"), "LEFT_OUTER") | |
.select($"rowid", ($"weight" * $"value").as("value")) | |
.groupby("rowid").sum("value") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
context = HiveContext(sc) | |
context.sql(" | |
SELECT | |
feature, avg(weight) as weight | |
FROM ( | |
SELECT train_logregr(features, label) | |
as (feature, weight) | |
FROM trainTable | |
) model |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Load the training set from "a9a.train" using the "libsvm" data source.
// (The original mixed typographic quotes with ASCII quotes, which does not
// compile; both string literals now use plain ASCII double quotes.)
val trainDf =
  spark.read.format("libsvm").load("a9a.train")

// Train on the (feature, label) columns, then average the emitted weights per
// feature to obtain the final model.
// NOTE(review): `train_logregr` and `groupby` are not defined in this snippet;
// presumably they come from an implicit DataFrame extension -- confirm.
trainDf.train_logregr($"feature", $"label")
  .groupby("feature")
  .agg("weight" -> "avg")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Train a model from 'a9a.train' and store the per-feature averaged weights
-- in 'a9a_model1'.

-- Each input row carries a row id, a label and a bag of feature strings.
a = load 'a9a.train'
    as (rowid:int, label:float, features:{(featurepair:chararray)});

-- Run the trainer over the loaded rows and flatten its output into
-- (feature, weight) tuples. The original iterated over `c`, which is not
-- defined until the GROUP below; the training input is relation `a`.
-- ${total_steps} must be supplied through Pig parameter substitution.
b = foreach a generate flatten(
        logress(features, label, '-total_steps ${total_steps}')
    ) as (feature, weight);

-- Collect all weights emitted for the same feature.
c = group b by feature;

-- After GROUP, the bag inside `c` is named after the grouped relation (`b`),
-- so the weights are projected as b.weight (the original used c.weight).
d = foreach c generate group, AVG(b.weight);

store d into 'a9a_model1';