Skip to content

Instantly share code, notes, and snippets.

View Yankim's full-sized avatar

Yannick Kimmel Yankim

View GitHub Profile
br = webdriver.Firefox() #open firefox
br.get('https://www.allrecipes.com/recipes/'+str(yearurls[i]))
###ID number for year, example 1997 has ID of 14486
html_list = br.find_element_by_id("grid")
urls = html_list.find_elements(By.CLASS_NAME, "favorite")
#All top 20 recipes have hearts associated with them. Each
#heart contains the unique ID number for the given recipe
for i, e in enumerate(urls):
def scrape_recipe(br, year, idnumber):
#This is called when user wants to scrape for specific recipe site
#Try functions were used to prevent any one element from stopping the operation
#recipe title
try:
rtitle = br.find_element_by_tag_name('h1').text
except:
rtitle = 'NA'
import numpy as np
import pymongo
import pandas as pd
# Connection to Mongo DB and import recipe and ingredients collections as Pandas
# Open a MongoDB connection using pymongo's MongoClient() with no arguments.
# conn is pre-set to None so that the bare `conn` expression at the end cannot
# raise NameError when the connection attempt fails.
# The print(...) and `except ... as e` forms are valid in both Python 2.6+ and 3.
# NOTE(review): recent pymongo versions may connect lazily, in which case
# MongoClient() would not raise ConnectionFailure here even when no server is
# reachable -- confirm against the installed pymongo version.
conn = None
try:
    conn = pymongo.MongoClient()
    print("Connected successfully!!!")
except pymongo.errors.ConnectionFailure as e:
    print("Could not connect to MongoDB: %s" % e)
conn
@Yankim
Yankim / ResAndDev
Last active September 19, 2016 13:10
#Loading packages
library(dplyr); library(ggplot2); library(RColorBrewer); library(rworldmap)
#Loading datasets
indicators = read.csv("subbeddata.csv", header = TRUE, stringsAsFactors = FALSE)
allindicators = read.csv("/Users/YannickMac/Dropbox/Applications/Data science/NYCDSA/Data_visualization_project/world-development-indicators/Indicators.csv")
counts <- indicators %>%
group_by(IndicatorCode, IndicatorName) %>%
summarise(NumCountries = n_distinct(CountryName),
@Yankim
Yankim / helpers.R
Last active September 19, 2016 13:49
library(shiny); library(maps); library(mapproj); library(dplyr); library(plotly); library(googleVis); library(car)
data("county.fips")
#Join data by county identification number (fips) to the county data in the maps package
#flipjoin: read the CSV file at path x and left-join it onto county.fips.
#  x: path to a CSV file; it must contain a FIPS column, which is matched
#     against the fips column of county.fips.
#  Returns the joined table; being a left join, every row of county.fips is kept.
flipjoin <- function(x) {
  csv_data <- read.csv(x)
  join_keys <- c("fips" = "FIPS")
  left_join(county.fips, csv_data, by = join_keys)
}
#Data wrangling for map plot
health2 = flipjoin("data/health.csv")
@Yankim
Yankim / server.R
Last active September 19, 2016 14:10
# server.R
library(shiny); library(maps); library(mapproj); library(shinydashboard); library(plotly); library(DT); source("helpers.R")
shinyServer(
function(input, output) {
output$map <- renderPlot({
args <- switch(input$var,
"Percent Adult Obese 2009" = list(health2$PCT_OBESE_ADULTS09, "darkgreen", "% Obese"),
"Percent Adult Obese 2010" = list(health2$PCT_OBESE_ADULTS10, "darkgreen", "% Obese"),
"Percent Adult Diabetic 2009" = list(health2$PCT_DIABETES_ADULTS09, "darkred", "% Diabetic"),
@Yankim
Yankim / ui.R
Last active September 19, 2016 13:13
library(shiny); library(shinydashboard); library(plotly); source("helpers.R"); library(DT)
shinyUI(dashboardPage(
dashboardHeader(title = "Food and health demographics in the USA", titleWidth = 400),
dashboardSidebar(
sidebarUserPanel("Yannick Kimmel", image = "Yannick.jpg"),
sidebarMenu(
menuItem("Map", tabName = "mappanel", icon = icon("map")),
menuItem("Trends", tabName = "trends", icon = icon("line-chart")),
library(xgboost); library(methods); library(pROC); library(caret); library(xgboost); library(readr); library(plyr); library(dplyr)
library(tidyr); library(dummies); library(doMC); registerDoMC(cores = 4)
#Read in the data
#higgs.___.full is raw data
#header is spelled TRUE rather than T: T is an ordinary variable that can be
#reassigned, whereas TRUE is a reserved word
higgs.train.full = read.csv('./data/training.csv', header=TRUE)
higgs.test.full = read.csv('./data/test.csv', header=TRUE)
#Keep the EventId column of the test set in its own variable
higgs.testId = higgs.test.full$EventId
#############################################
# Grid for the parameter search
#The guidelines for how to tune parameters are commented below and are taken from
# Owen Zhang http://www.slideshare.net/OwenZhang2/tips-for-data-science-competitions
xgb_grid_1 = expand.grid(
eta = c(.5, 1, 1.5), #[2-10]/num trees
max_depth = c(4, 6, 8), #Start with 6
nrounds = 100, #Fix at 100
gamma = 0, #Usually ok to leave at 0
colsample_bytree = c(.3, .5, .7), #.3 - .5
min_child_weight = 1 #start with 1/sqrt(eventrate)
#Predicting training data
#Wrap the training features (higgs.train.dummy, converted to a matrix) in an
#xgb.DMatrix, attaching a 0/1 label and scaled.weight as the per-row weight.
# NOTE(review): the label here codes "X0" as 1, while the ROC labels below code
# "X1" as 1 -- confirm which class is meant to be the positive one.
xgmat.train <- xgb.DMatrix(as.matrix(higgs.train.dummy),
label = as.numeric(higgs.labels == "X0"),
weight = scaled.weight)
#Score the training matrix with the fitted model bst (defined outside this excerpt)
xgboostTrainPred <- predict(bst, newdata = xgmat.train)
#Recode the labels as 1 for "X1" and 0 otherwise, for use as the ROC response
labels <- ifelse(as.character(higgs.labels)=="X1", 1, 0)
#Despite its name, auc holds the object returned by roc(), not a single number
# (roc presumably comes from pROC, which is loaded at the top of the script)
auc = roc(labels, xgboostTrainPred)
#Plot the ROC curve; print.thres=TRUE is passed so a threshold is shown on the plot
plot(auc, print.thres=TRUE)
######## From the graph, we can tell the best threshold is 0.002