Last active
July 8, 2017 08:28
-
-
Save d4l3k/d76c1f63027bd404d3e7357c7d575cbd to your computer and use it in GitHub Desktop.
nwHacks Registration Machine Learning Pipeline
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"context" | |
"encoding/csv" | |
"flag" | |
"fmt" | |
"log" | |
"net/http" | |
"os" | |
"sort" | |
"strings" | |
"sync" | |
"unicode" | |
"github.com/d4l3k/docconv" | |
"github.com/google/go-github/github" | |
"github.com/jasonwinn/geocoder" | |
geo "github.com/kellydunn/golang-geo" | |
"github.com/xlvector/hector" | |
"github.com/xlvector/hector/core" | |
"../db" | |
) | |
// file is the -f flag: the path that generate opens and parses as CSV.
var file = flag.String("f", "", "the file to load")

// model is the -model flag. When it is non-empty, main loads that model and
// prints a prediction instead of generating datasets.
var model = flag.String("model", "", "path to the model to test")

// classifierName is the -classifier flag; main passes it to
// hector.GetClassifier.
var classifierName = flag.String("classifier", "rf", "the classifier to use")
// extractWords lower-cases words and splits it into tokens made up solely of
// Unicode letters and digits. Every other rune acts as a separator, and empty
// tokens are never returned.
func extractWords(words string) []string {
	isSeparator := func(r rune) bool {
		if unicode.IsLetter(r) {
			return false
		}
		return !unicode.IsDigit(r)
	}
	lowered := strings.ToLower(words)
	return strings.FieldsFunc(lowered, isSeparator)
}
// wordMap holds the integer ID assigned to each word seen so far. IDs are
// handed out by wordToID, starting at 1.
var wordMap = make(map[string]int)

// wordMapMu is held by wordToID while it reads or updates wordMap.
var wordMapMu sync.Mutex
func wordToID(word string) int { | |
word = clean(word) | |
wordMapMu.Lock() | |
id, ok := wordMap[word] | |
if !ok { | |
id = len(wordMap) + 1 | |
wordMap[word] = id | |
} | |
wordMapMu.Unlock() | |
return id | |
} | |
func wordsToIDs(words []string) []int { | |
var out []int | |
for _, word := range words { | |
out = append(out, wordToID(word)) | |
} | |
return out | |
} | |
// countIDs returns how many times each ID occurs in ids. The result is an
// empty, non-nil map when ids is empty.
func countIDs(ids []int) map[int]int {
	tally := make(map[int]int)
	for i := range ids {
		tally[ids[i]] = tally[ids[i]] + 1
	}
	return tally
}
// openDataset creates the dataset file name for writing, truncating it if it
// already exists, and returns the open handle. The process exits via
// log.Fatalf if the file cannot be opened.
//
// The file is created with mode 0644: datasets are plain data and should not
// carry executable bits.
func openDataset(name string) *os.File {
	f, err := os.OpenFile(name, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
	if err != nil {
		// log.Fatal(name, err) would print the two operands fused together
		// with no separator; format them explicitly instead.
		log.Fatalf("opening dataset %q: %v", name, err)
	}
	return f
}
// writeFeaturesToDataset appends one sample line to ds in the form
//
//	<label> <id>:<value> <id>:<value> ...\n
//
// where label is "1" when positive is true and "0" otherwise, and the
// features are emitted in ascending ID order with values formatted by %f.
//
// The whole line is assembled in memory and written with a single Write so
// that every write error is observed; any error terminates the process via
// log.Fatal.
func writeFeaturesToDataset(ds *os.File, positive bool, features map[int]float64) {
	featureIDs := make([]int, 0, len(features))
	for id := range features {
		featureIDs = append(featureIDs, id)
	}
	// Map iteration order is random; sort so the output is deterministic.
	sort.Ints(featureIDs)

	line := make([]byte, 0, 2+16*len(featureIDs))
	if positive {
		line = append(line, '1')
	} else {
		line = append(line, '0')
	}
	for _, id := range featureIDs {
		line = append(line, fmt.Sprintf(" %d:%f", id, features[id])...)
	}
	line = append(line, '\n')

	if _, err := ds.Write(line); err != nil {
		log.Fatal(err)
	}
}
// btof converts a boolean into a numeric feature value: 1 for true and 0 for
// false.
func btof(b bool) float64 {
	var v float64
	if b {
		v = 1
	}
	return v
}
func getGithubFeatures(username string, features map[int]float64) (string, error) { | |
t := &github.UnauthenticatedRateLimitedTransport{ | |
ClientID: "", | |
ClientSecret: "", | |
} | |
ctx := context.Background() | |
client := github.NewClient(t.Client()) | |
username = strings.TrimPrefix(username, "https://github.com/") | |
username = strings.TrimPrefix(username, "http://github.com/") | |
user, _, err := client.Users.Get(ctx, username) | |
if err != nil { | |
return "", err | |
} | |
features[30] = float64(user.GetPublicRepos()) | |
features[31] = float64(user.GetPublicGists()) | |
features[32] = float64(user.GetFollowers()) | |
features[33] = float64(user.GetFollowing()) | |
words := user.GetBio() | |
repos, _, err := client.Repositories.List(ctx, username, nil) | |
if err != nil { | |
return "", err | |
} | |
stars := 0 | |
for _, repo := range repos { | |
stars += repo.GetStargazersCount() | |
words += " " + repo.GetDescription() | |
words += " " + repo.GetName() | |
} | |
features[34] = float64(stars) | |
return words, nil | |
} | |
func getFeatures(reg *db.Registration) map[int]float64 { | |
words := extractWords(reg.Reason) | |
resume := reg.ResumeLink() | |
if len(resume) > 0 { | |
resp, err := http.Get(resume) | |
if err != nil { | |
log.Fatal(err) | |
} | |
res, _, err := docconv.ConvertPDF(resp.Body) | |
if err != nil { | |
log.Fatal(err) | |
} | |
resp.Body.Close() | |
words = append(words, extractWords(res)...) | |
} | |
features := map[int]float64{} | |
features[0] = btof(reg.FirstHackathon) | |
features[1] = btof(reg.Mentor) | |
features[2] = btof(len(reg.GitHub) > 0) | |
features[3] = btof(len(reg.PersonalSite) > 0) | |
features[4] = btof(len(reg.LinkedIn) > 0) | |
features[5] = float64(len(strings.Split(reg.Teammates, ","))) | |
if len(reg.GitHub) > 0 { | |
out, err := getGithubFeatures(reg.GitHub, features) | |
if err != nil { | |
log.Println(err) | |
} else { | |
words = append(words, extractWords(out)...) | |
} | |
} | |
if len(reg.School) > 0 { | |
features[1000+wordToID("school:"+reg.School)] = 1.0 | |
} | |
if len(reg.City) > 0 { | |
lat, lng, err := geocoder.Geocode(reg.City) | |
if err != nil { | |
log.Println(err) | |
} | |
city := geo.NewPoint(lat, lng) | |
features[6] = vancouver.GreatCircleDistance(city) | |
} | |
wordIDs := wordsToIDs(words) | |
counts := countIDs(wordIDs) | |
for id, count := range counts { | |
features[id+1000] = float64(count) / float64(len(words)) | |
} | |
return features | |
} | |
func featuresToSample(features map[int]float64) *core.Sample { | |
sample := core.Sample{} | |
for id, val := range features { | |
sample.Features = append(sample.Features, core.Feature{ | |
Id: int64(id), | |
Value: val, | |
}) | |
} | |
return &sample | |
} | |
var vancouver *geo.Point | |
const workers = 32 | |
func main() { | |
flag.Parse() | |
geocoder.SetAPIKey("") | |
vanLat, vanLng, err := geocoder.Geocode("Vancouver, BC, Canada") | |
if err != nil { | |
log.Fatal(err) | |
} | |
vancouver = geo.NewPoint(vanLat, vanLng) | |
if len(*model) > 0 { | |
classifier := hector.GetClassifier(*classifierName) | |
classifier.LoadModel(*model) | |
reg := &db.Registration{ | |
Name: "Tristan Rice", | |
School: "University of British Columbia", | |
City: "Vancouver", | |
GitHub: "d4l3k", | |
LinkedIn: "d4l3k", | |
Reason: "I really want to come to nwHacks and make some cool stuff! I've gone that past couple of years and really enjoyed it.", | |
Resume: "https://fn.lc/resume.pdf", | |
Mentor: true, | |
FirstHackathon: false, | |
Teammates: "jinny, roy", | |
PersonalSite: "https://fn.lc", | |
Email: "[email protected]", | |
} | |
sample := featuresToSample(getFeatures(reg)) | |
log.Printf("Predicted value = %f", classifier.Predict(sample)) | |
} else { | |
generate() | |
} | |
} | |
func generate() { | |
accepted := openDataset("./accepted.data") | |
checkin := openDataset("./checkin.data") | |
holisticCheckin := openDataset("./holisticCheckin.data") | |
submit := openDataset("./submit.data") | |
holisticSubmit := openDataset("./holisticSubmit.data") | |
submitted := map[string]bool{} | |
input, err := os.Open(*file) | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer input.Close() | |
reader := csv.NewReader(input) | |
reader.FieldsPerRecord = -1 | |
records, err := reader.ReadAll() | |
if err != nil { | |
log.Fatal(err) | |
} | |
for _, record := range records { | |
for _, entry := range record[10:] { | |
submitted[clean(entry)] = true | |
} | |
} | |
firebase := db.NewDB() | |
regs, err := firebase.AllRegistrations() | |
if err != nil { | |
log.Fatal(err) | |
} | |
regChan := make(chan *db.Registration, workers) | |
go func() { | |
defer close(regChan) | |
for _, reg := range regs { | |
regChan <- reg | |
} | |
}() | |
type fetchedReg struct { | |
reg *db.Registration | |
features map[int]float64 | |
} | |
fetchedChan := make(chan *fetchedReg, workers) | |
var wg sync.WaitGroup | |
for i := 0; i < workers; i++ { | |
wg.Add(1) | |
go func() { | |
defer wg.Done() | |
for reg := range regChan { | |
fetchedChan <- &fetchedReg{ | |
reg: reg, | |
features: getFeatures(reg), | |
} | |
} | |
}() | |
} | |
go func() { | |
wg.Wait() | |
close(fetchedChan) | |
}() | |
count := 0 | |
for fetched := range fetchedChan { | |
count++ | |
log.Printf("Fetched %d/%d", count, len(regs)) | |
reg := fetched.reg | |
features := fetched.features | |
isAccepted := reg.Status == db.StatusAccepted | |
writeFeaturesToDataset(accepted, isAccepted, features) | |
writeFeaturesToDataset(holisticCheckin, reg.CheckedIn, features) | |
didSubmit := submitted[clean(reg.Name)] || submitted[clean(reg.Email)] | |
writeFeaturesToDataset( | |
holisticSubmit, | |
didSubmit, | |
features, | |
) | |
if len(reg.RSVP) > 0 || reg.CheckedIn { | |
writeFeaturesToDataset(checkin, reg.CheckedIn, features) | |
} | |
if reg.CheckedIn { | |
writeFeaturesToDataset(submit, didSubmit, features) | |
} | |
} | |
} | |
// clean normalizes str for use as a lookup key by stripping leading and
// trailing whitespace and lower-casing the result.
func clean(str string) string {
	trimmed := strings.TrimSpace(str)
	return strings.ToLower(trimmed)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment