Last active
February 4, 2021 19:35
-
-
Save ceshine/c0f9538c48beb2069f57 to your computer and use it in GitHub Desktop.
Kaggle Avazu Challenge: FTRL-Proximal with L1 & L2 implemented in Go (single-threaded)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Based on tinrtgu's Python script here:
// https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory
package main | |
import (
	"encoding/csv"
	"hash/fnv"
	"io"
	"log"
	"math"
	"os"
	"strconv"
	"time"
)
// ############################### | |
// parameters | |
//################################ | |
var train string | |
var test string | |
var submission string | |
var epoch int | |
var D uint32 | |
type FTRL struct { | |
alpha, beta, L1, L2 float64 | |
n map[uint32]float64 // squared sum of past gradients | |
z map[uint32]float64 // coefficients / weights | |
w map[uint32]float64 // tmp coefficients / weights | |
} | |
func (m *FTRL) predict(x []uint32) float64{ | |
wTx := 0.0 | |
for i := 0; i < len(x); i++{ | |
z, ok := m.z[x[i]] | |
if ok == false{ | |
m.z[x[i]] = 0 | |
m.n[x[i]] = 0 | |
m.w[x[i]] = 0 | |
z = 0 | |
} | |
sign := 1.0 | |
if z < 0 {sign = -1.0} | |
if sign * z <= m.L1{ | |
m.w[x[i]] = 0. | |
}else{ | |
m.w[x[i]] = (sign * m.L1 - z) / ((m.beta + math.Sqrt(m.n[x[i]])) / m.alpha + m.L2) | |
} | |
wTx += m.w[x[i]] | |
} | |
return 1.0 / (1.0 + math.Exp(-math.Max(math.Min(wTx, 35.0), -35.0))) | |
} | |
func (m *FTRL) update(x []uint32, p, y float64) { | |
// gradient under logloss | |
g := p - y | |
// update z and n | |
for i := 0; i< len(x); i++ { | |
sigma := (math.Sqrt(m.n[x[i]] + g * g) - math.Sqrt(m.n[x[i]])) / m.alpha | |
m.z[x[i]] += g - sigma * m.w[x[i]] | |
m.n[x[i]] += g * g | |
} | |
} | |
func hash(s string) uint32 { | |
h := fnv.New32a() | |
h.Write([]byte(s)) | |
return h.Sum32() | |
} | |
func nextRow(reader *csv.Reader, column_names map[string]int) (string, float64, int, []uint32){ | |
row, err := reader.Read() | |
if err != nil { | |
return "", 0, 0, nil | |
} | |
features_n := len(row) - 1 | |
ID := row[column_names["id"]] | |
//process clicks | |
y := 0.0 | |
_, click := column_names["click"] | |
if click == true { | |
if row[column_names["click"]] == "1"{ | |
y = 1.0 | |
} | |
features_n -= 1 | |
} | |
date, _ := strconv.Atoi(row[column_names["hour"]][4:6]) | |
date -= 20 | |
row[column_names["hour"]] = row[column_names["hour"]][6:] | |
features := make([]uint32, features_n) | |
count := 0 | |
for i := 0; i < len(row); i++ { | |
if i != column_names["id"]{ | |
if click == false || i != column_names["click"]{ | |
features[count] = hash(strconv.Itoa(count) + "_" + row[i]) % D | |
count += 1 | |
} | |
} | |
} | |
return ID, y, date, features | |
} | |
func logloss(p, y float64) float64{ | |
p = math.Max(math.Min(p, 1.0 - 10e-15), 10e-15) | |
if y == 1. { | |
return -math.Log(p) | |
}else{ | |
return -math.Log(1. - p) | |
} | |
} | |
func opencsv(filename string, create bool) *os.File{ | |
var err error | |
var csvfile *os.File | |
if create{ | |
csvfile, err = os.Create(filename) | |
}else{ | |
csvfile, err = os.Open(filename) | |
} | |
if err != nil{ | |
log.Fatal(err) | |
} | |
return csvfile | |
} | |
func main(){ | |
//Set up parameters | |
D = 1 << 20 | |
train = "head20" | |
test = "t20" | |
submission = "submission_go.csv" | |
holdout := 30 | |
epoch = 2 | |
start := time.Now() | |
model := FTRL{alpha: 0.15, beta: 1.1, L1: 1.1, L2:1.1, | |
n: make(map[uint32]float64), z: make(map[uint32]float64), w:make(map[uint32]float64)} | |
var trainfile *os.File | |
var reader *csv.Reader | |
var header []string | |
var elapsed time.Duration | |
var column_names map[string]int | |
for r := 0; r < epoch; r++ { | |
trainfile = opencsv(train, false) | |
reader= csv.NewReader(trainfile) | |
header, _ = reader.Read() | |
column_names = make(map[string]int) | |
for i, name := range header { | |
column_names[name] = i | |
} | |
count := 1 | |
l_count := 0.0 | |
loss := 0.0 | |
for { | |
_, y, _, features := nextRow(reader, column_names) | |
if features == nil { break } // reach EOF | |
p := model.predict(features) | |
if count % holdout== 0 { | |
l_count += 1 | |
loss += logloss(p, y) | |
if count % (holdout * 100000) == 0 { | |
log.Println(p, y, loss/l_count) | |
} | |
} | |
count += 1 | |
model.update(features, p, y) | |
} | |
trainfile.Close() | |
elapsed = time.Since(start) | |
log.Printf("Epoch %d took %s logloss %f", r+1, elapsed, loss/l_count) | |
start = time.Now() | |
} | |
//Start testing | |
testfile := opencsv(test, false) | |
outfile := opencsv(submission, true) | |
reader = csv.NewReader(testfile) | |
writer := csv.NewWriter(outfile) | |
header, _ = reader.Read() | |
column_names = make(map[string]int) | |
for i, name := range header { | |
column_names[name] = i | |
} | |
writer.Write([]string{"id","click"}) // add header to the submission file | |
for{ | |
ID, _, _, features := nextRow(reader, column_names) | |
if features == nil { break } // reach EOF | |
p := model.predict(features) | |
writer.Write([]string{ID, strconv.FormatFloat(p, 'f', -1, 64)}) | |
} | |
writer.Flush() | |
testfile.Close() | |
outfile.Close() | |
elapsed = time.Since(start) | |
log.Printf("Testing took %s", elapsed) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment