Skip to content

Instantly share code, notes, and snippets.

View Yankim's full-sized avatar

Yannick Kimmel Yankim

View GitHub Profile
@Yankim
Yankim / clean_up_corrupt_pics.py
Created February 25, 2019 23:24
clean up corrupt images
import os
from PIL import Image
import pathlib
PATH_ = pathlib.Path(__file__).parent
broken_images=[]
def check_images(PATH_):
for pic_class in os.listdir(PATH_):
for pic in os.listdir(f'{PATH_}/{pic_class}'):
try:
img = Image.open(f'{PATH_}/{pic_class}/{pic}')
#Generate a county map for a few variables of interest
percent_map <- function(var, color, legend.title, min = 0, max = 100, name = "") {
# generate vector of fill colors for map
shades <- colorRampPalette(c("white", color))(100)
# constrain gradient to percents that occur between min and max
var <- pmax(var, min)
var <- pmin(var, max)
percents <- as.integer(cut(var, 100,
include.lowest = TRUE, ordered = TRUE))
fills <- shades[percents]
# Predict the adult obesity rate (PCT_OBESE_ADULTS10) with stepwise regression:
# start from the saturated model and let backward elimination drop predictors.

# The first three columns of fulldb are excluded from modelling.
data <- fulldb[, -(1:3)]
# Only rows with no missing values are used for fitting.
complete.data <- data[complete.cases(data), ]

# Intercept-only model: lower bound of the search scope.
model.empty <- lm(PCT_OBESE_ADULTS10 ~ 1, data = complete.data)
# Every remaining column as a predictor: upper bound and starting point.
model.saturated <- lm(PCT_OBESE_ADULTS10 ~ ., data = complete.data)

scope <- list(
  lower = formula(model.empty),
  upper = formula(model.saturated)
)

# k = 2 is the AIC penalty per parameter.
backwardAIC <- step(
  object = model.saturated,
  scope = scope,
  direction = "backward",
  k = 2
)
#Used to predict obesity rate with multiple linear regression
#Generate a county map for a few variables of interest
percent_map <- function(var, color, legend.title, min = 0, max = 100, name = "") {
# generate vector of fill colors for map
shades <- colorRampPalette(c("white", color))(100)
# constrain gradient to percents that occur between min and max
var <- pmax(var, min)
var <- pmin(var, max)
percents <- as.integer(cut(var, 100,
include.lowest = TRUE, ordered = TRUE))
fills <- shades[percents]
# Predict on the test data and write the raw scores and event ids to disk.
# `bst`, `threshold`, `higgs.test.dummy` and `higgs.testId` are not defined in
# this snippet and must already exist when it runs.
xgmat.test <- xgb.DMatrix(as.matrix(higgs.test.dummy))
xgboostTestPred <- predict(bst, newdata = xgmat.test)

# Label every event "s" first, then relabel those whose score reaches the
# threshold as "b". The vector is sized from the predictions themselves (it was
# a hard-coded 550000) so the logical index below always lines up with it.
predicted <- rep("s", length(xgboostTestPred))
predicted[xgboostTestPred >= threshold] <- "b"

# Rank of each score; ties are broken at random, so this is not reproducible
# unless a seed is set beforehand.
weightRank <- rank(xgboostTestPred, ties.method = "random")

write.csv(as.data.frame(xgboostTestPred), "Submissions/xgboost_prob.csv")
write.csv(as.data.frame(higgs.testId), "Submissions/EventID.csv")
# Score the training data with the fitted booster and plot its ROC curve.
xgmat.train <- xgb.DMatrix(
  as.matrix(higgs.train.dummy),
  label = as.numeric(higgs.labels == "X0"),
  weight = scaled.weight
)
xgboostTrainPred <- predict(bst, newdata = xgmat.train)

# NOTE(review): the DMatrix label above marks "X0" as 1, while the ROC response
# here marks "X1" as 1 -- confirm this polarity flip is intended.
labels <- ifelse(as.character(higgs.labels) == "X1", 1, 0)

auc <- roc(labels, xgboostTrainPred)
# print.thres = TRUE annotates the curve with a threshold value.
plot(auc, print.thres = TRUE)
######## From the graph, we can tell the best threshold is 0.002
# Grid for the parameter search
#The guidelines for how to tune parameters are commented below and are taken from
# Owen Zhang http://www.slideshare.net/OwenZhang2/tips-for-data-science-competitions
xgb_grid_1 = expand.grid(
eta = c(.5, 1, 1.5), #[2-10]/num trees
max_depth = c(4, 6, 8), #Start with 6
nrounds = 100, #Fix at 100
gamma = 0, #Usually ok to leave at 0
colsample_bytree = c(.3, .5, .7), #.3 - .5
min_child_weight = 1 #start with 1/sqrt(eventrate)
library(xgboost); library(methods); library(pROC); library(caret); library(xgboost); library(readr); library(plyr); library(dplyr)
library(tidyr); library(dummies); library(doMC); registerDoMC(cores = 4)
# Read in the data.
# higgs.___.full holds the raw data as read from the CSV files.
# header = TRUE is spelled out: `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
higgs.train.full <- read.csv("./data/training.csv", header = TRUE)
higgs.test.full <- read.csv("./data/test.csv", header = TRUE)
# Keep the test-set event ids in their own variable.
higgs.testId <- higgs.test.full$EventId
#############################################
@Yankim
Yankim / ui.R
Last active September 19, 2016 13:13
library(shiny); library(shinydashboard); library(plotly); source("helpers.R"); library(DT)
shinyUI(dashboardPage(
dashboardHeader(title = "Food and health demographics in the USA", titleWidth = 400),
dashboardSidebar(
sidebarUserPanel("Yannick Kimmel", image = "Yannick.jpg"),
sidebarMenu(
menuItem("Map", tabName = "mappanel", icon = icon("map")),
menuItem("Trends", tabName = "trends", icon = icon("line-chart")),
@Yankim
Yankim / server.R
Last active September 19, 2016 14:10
# server.R
library(shiny); library(maps); library(mapproj); library(shinydashboard); library(plotly); library(DT); source("helpers.R")
shinyServer(
function(input, output) {
output$map <- renderPlot({
args <- switch(input$var,
"Percent Adult Obese 2009" = list(health2$PCT_OBESE_ADULTS09, "darkgreen", "% Obese"),
"Percent Adult Obese 2010" = list(health2$PCT_OBESE_ADULTS10, "darkgreen", "% Obese"),
"Percent Adult Diabetic 2009" = list(health2$PCT_DIABETES_ADULTS09, "darkred", "% Diabetic"),