Last active
November 8, 2017 16:26
-
-
Save luisDVA/9c12fff91cf1df47645c03ad224db9bc to your computer and use it in GitHub Desktop.
ggpup function modified to take in a vector of dog breeds to search for
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# define function | |
# this code is just for fun and I do not have rights to the images on dogtime.com I just like dogs | |
ggpupBV <- function(ggplotObject,breedVector){ | |
# required packages | |
require(dplyr) | |
require(jpeg) | |
require(grid) | |
require(gridExtra) | |
require(RCurl) | |
require(rvest) | |
require(stringi) | |
require(extrafont) | |
require(fuzzyjoin) | |
# scrape a list of image URLS from the dogtime breed profiles homepage | |
if (!exists("imgurls")){ | |
# read page source | |
dogIndex <- read_html("http://dogtime.com/dog-breeds/profiles") | |
# keep node of interest (identified using Selectorgadget) | |
scrapedHtml <- dogIndex %>% html_nodes(".horizontal-group-listing") %>% toString() | |
# match image urls | |
## Find everything that starts with "http" | |
### No white spaces allowed | |
### Ends with jpg | |
#### Note: this regex is for PC | |
imgurlslist <- stri_match_all_regex(scrapedHtml,"(http[^\\s]+(jpg)\\b)") | |
# subset into character vector | |
imgurls <- imgurlslist[[1]][,1] | |
} | |
# convert list to df | |
imgurlsDF <- data.frame(imgurls) | |
# clean up urls (gradually) | |
imgurlsDF$breedname <- stri_extract_all_regex(imgurlsDF$imgurls,"(?<=\\_).*?(?=\\.)") | |
imgurlsDF$breedname <- stri_extract_all_regex(imgurlsDF$breedname,"(?<=\\_).*?(?=300)") | |
imgurlsDF$breedname <- stri_replace_all_fixed(imgurlsDF$breedname,pattern = "-"," ") | |
imgurlsDF$breedname <- trimws(imgurlsDF$breedname) | |
imgurlsDF$breedname <- stri_replace_all_fixed(imgurlsDF$breedname,"dog breed","") | |
imgurlsDF$breedname <- stri_replace_all_fixed(imgurlsDF$breedname,"what is a","") | |
imgurlsDF$breedname <- stri_replace_all_fixed(imgurlsDF$breedname,"what is the","") | |
#match provided breed vector with URLS | |
imgurlsDF <- imgurlsDF %>% filter(!is.na(breedname)) | |
breedVector <- data.frame(breeds=breedVector) | |
joinedMatches <- stringdist_left_join(breedVector,imgurlsDF, by=c("breeds"="breedname"),max_dist=3) | |
joinedMatches$imgurls <- as.character(joinedMatches$imgurls) | |
joinedMatches <- joinedMatches %>% filter(!is.na(imgurls)) | |
# scrape two dog breed photos | |
urlInd <- sample(nrow(joinedMatches),2,rep=F) | |
# for upper right corner | |
dogImg.URLU <- joinedMatches$imgurls[urlInd[1]] | |
# save as a raster object | |
dogImgU <- rasterGrob(readJPEG(getURLContent(dogImg.URLU))) | |
# label for grob | |
dogImgUlab <- textGrob(paste(stri_trans_totitle(joinedMatches$breedname[urlInd[1]]))) | |
# for lower right corner | |
dogImg.URLL <- joinedMatches$imgurls[urlInd[2]] | |
# save as a raster object | |
dogImgL <- rasterGrob(readJPEG(getURLContent(dogImg.URLL))) | |
# label for grob | |
dogImgLlab <- textGrob(paste(stri_trans_totitle(joinedMatches$breedname[urlInd[2]]))) | |
# graphical parameters | |
# define plot layout | |
lay <- rbind(c(1,1,2), | |
c(1,1,3), | |
c(1,1,4), | |
c(1,1,5)) | |
# set up some attribution text | |
# the fontfamily parameter is optional, erase or change to something else to match the fonts available for your system | |
rightText=textGrob("images from www.dogtime.com", rot=90, gp=gpar(fontfamily = "Roboto Condensed Light")) | |
# arrange the plot and the image side by side | |
grid.arrange(ggplotObject, dogImgU, dogImgUlab, dogImgL, dogImgLlab, layout_matrix=lay,widths=c(2,1,1),right=rightText) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment