Created
June 27, 2015 15:13
-
-
Save donarus/3b07c76d996c108dcb61 to your computer and use it in GitHub Desktop.
NBA example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(RCurl) | |
library(XML) | |
library(plyr) | |
library(doParallel) | |
######## SETTINGS BEGIN | |
defaultUserAgent <- paste0( | |
"AcademicAgent. Collecting historic performance and biographical data ", | |
"of individual players for student academic research purposes only, not for ", | |
"commercial use or sharing. Webpage containing description of academic research ", | |
"project: https://www.dropbox.com/sh/ywgcce11euj9iec/AADA7J6yoP5QZD8Tmkv6sBvja?dl=0. ", | |
"Contact info: [email protected]" | |
) | |
injuriesSearchPageTemplate <- "http://www.prosportstransactions.com/basketball/Search/SearchResults.php?Player=&Team=&BeginDate=&EndDate=&ILChkBx=yes&InjuriesChkBx=yes&submit=Search&start=%d" | |
######## SETTING END | |
######## FUNCTIONS BEGIN | |
trim <- function (x) { | |
gsub("^\\s+|\\s+$", "", x) | |
} | |
downloadPageTree <- function(url, userAgent = defaultUserAgent) { | |
page <- getURL(url, httpheader = c('User-Agent' = userAgent)) | |
pageLines <- readLines(tc <- textConnection(page)); close(tc) | |
htmlTreeParse(pageLines, error=function(...){}, useInternalNodes = TRUE) | |
} | |
######## FUNCTIONS END | |
pagetree <- downloadPageTree(sprintf(injuriesSearchPageTemplate, 0)) | |
pagesCnt <- as.numeric(xpathSApply(pagetree, '/html/body/div[3]/table[2]/tr/td[3]/p/a[last()]/text()', xmlValue)[[1]]) | |
print("## Downloading player injuries pages") | |
pages <- llply(0:(pagesCnt - 1), function(i) { | |
if((i+1) %% 100 == 0) { | |
tts <- sample(20,1) | |
print(sprintf("Sleeping for %d seconds after 100 downloaded pages", tts)) | |
Sys.sleep(tts) | |
} | |
tts <- sample(5,1) | |
print(sprintf('downloading page %d/%d and sleeping for %d seconds after download is finished', (i+1), pagesCnt, tts)) | |
pagetree <- downloadPageTree(sprintf(injuriesSearchPageTemplate, i)) | |
Sys.sleep(tts) # just random sleep after each request | |
pagetree | |
}, .progress = "time") | |
players <- unique(unlist(llply(pages, function(page) { | |
acquired <- xpathSApply(page, '/html/body/div[3]/table[1]/tr/td[3]', xmlValue) | |
relinquished <- xpathSApply(page, '/html/body/div[3]/table[1]/tr/td[4]', xmlValue) | |
res <- list() | |
if (length(acquired) > 1) { | |
res <- c(acquired[2:length(acquired)], relinquished[2:length(acquired)]) | |
} | |
res <- unlist(strsplit(res, "/")) | |
res <- unlist(llply(res, trim)) | |
res <- res[res != ""] | |
res | |
}))) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment