Created
March 1, 2018 19:54
-
-
Save anonymous/88482173f7d66b916f206090932c7077 to your computer and use it in GitHub Desktop.
TuCarro.com Images Scrap
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "library(rvest)\nlibrary(dplyr)\n\n# Scrape TuCarro.com car listings, download their photos and write a CSV\n# that maps each downloaded image file to the brand found in its name.\n\n# Get some urls of relevant cars\nnpages <- 30\nfirsts <- seq(1, 48 * npages, 48) # There are 48 posts per page\npage_links <- vector(\"list\", npages)\nfor (i in seq_len(npages)) {\n  link <- paste0(\n    \"https://carros.tucarro.com.co/carros-camionetas/_Desde_\", firsts[i]\n  )\n  links <- read_html(link) %>% html_nodes('div a') %>% html_attr(\"href\")\n  # Keep only the hrefs that point at a listing (they carry an MCO- id)\n  page_links[[i]] <- links[grep(\"MCO-\", links)]\n  print(firsts[i])\n}\n# Flatten into a plain character vector. rbind() would build a matrix and\n# recycle or truncate every page to the width of the first one, silently\n# duplicating and dropping listings.\nurls <- unique(unlist(page_links))\n\n# Get images links\nlisting_images <- vector(\"list\", length(urls))\nfor (i in seq_along(urls)) {\n  srcs <- read_html(urls[i]) %>%\n    html_nodes('.gallery-trigger , img') %>%\n    html_attr(\"src\")\n  # fixed = TRUE matches the literal -O.jpg text, not -O<any char>jpg;\n  # missing src attributes (NA) never match and are dropped here.\n  listing_images[[i]] <- srcs[grepl(\"-O.jpg\", srcs, fixed = TRUE)]\n  print(paste(i, \"out of\", length(urls)))\n}\nimages <- unique(unlist(listing_images))\n\n# Download images\nsetwd(\"/Users/bernardo/Dropbox (ID)/CM Data Science/Car Photos\")\n# Local path of each image: its url with everything up to .com/ removed.\n# file.path() returns a zero-length vector when there are no images.\nimage_files <- file.path(\"Images/Originals\", gsub(\".*.com/\", \"\", images))\n# download.file() does not create missing directories\ndir.create(\"Images/Originals\", recursive = TRUE, showWarnings = FALSE)\nfor (i in seq_along(images)) {\n  download.file(images[i], image_files[i], quiet = TRUE, mode = 'wb')\n  print(paste(i, \"out of\", length(images)))\n}\n\n# Create CSV with file name and brand\nbrands <- c(\n  \"kia\", \"audi\", \"bmw\", \"chevrolet\", \"renault\", \"toyota\", \"citron\", \"citroen\",\n  \"ford\", \"jeep\", \"mazda\", \"mercedes-benz\", \"nissan\", \"peugeot\", \"suzuki\",\n  \"volkswagen\", \"zotye\", \"willyz\", \"volvo\", \"fiat\", \"subaru\", \"ssangyong\",\n  \"seat\", \"mini-cooper\", \"mg-gt\", \"lexus\", \"land-rover\", \"hyundai\", \"honda\",\n  \"great-wall-wingle\", \"foton-tunland\", \"dodge\", \"faw\", \"daihatsu\"\n)\n# Keep only the files whose name mentions at least one known brand\nfiles <- data.frame(name = image_files, stringsAsFactors = FALSE) %>%\n  filter(grepl(paste(brands, collapse = \"|\"), name))\n# Label each file from its own name. The previous if () tested the whole\n# name column, so only the first file decided the brand of every row (and\n# R >= 4.2 stops with an error on a condition of length > 1). Brands are\n# applied in list order, so the last matching brand wins, exactly as the\n# nested loop intended.\nfiles$brand <- rep(NA_character_, nrow(files))\nfor (brand in brands) {\n  files$brand[grepl(brand, files$name, fixed = TRUE)] <- brand\n}\nprint(paste(nrow(files), \"images labelled\"))\nwrite.csv(files, \"files_brands.csv\")", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"gist": { | |
"id": "", | |
"data": { | |
"description": "TuCarro.com Images Scrap", | |
"public": true | |
} | |
}, | |
"kernelspec": { | |
"name": "ir", | |
"display_name": "R", | |
"language": "R" | |
}, | |
"language_info": { | |
"name": "R", | |
"codemirror_mode": "r", | |
"pygments_lexer": "r", | |
"mimetype": "text/x-r-source", | |
"file_extension": ".r", | |
"version": "3.4.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment