Created
December 18, 2017 21:16
-
-
Save juananpe/2db2a60cd441a68011571a5f8a2eb6c0 to your computer and use it in GitHub Desktop.
films.R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library('rvest') | |
# Fetch the IMDb search-results page; the scraping steps in this script all
# read from `webpage`.
url <- "http://www.imdb.com/search/title?count=100&release_date=2018,2016&title_type=feature"
webpage <- read_html(url)

# Rankings: select the rank nodes, then extract their text and convert it to
# numbers in one step.
rank_data_html <- html_nodes(webpage, ".text-primary")
rank_data <- as.numeric(html_text(rank_data_html))
# Titles: select the links inside each list-item header and keep their text.
title_data_html <- html_nodes(webpage, ".lister-item-header a")
title_data <- html_text(title_data_html)
# Descriptions: the muted text element that directly follows the ratings bar.
description_data_html <- html_nodes(webpage, ".ratings-bar+ .text-muted")
description_data <- html_text(description_data_html)

# Strip the newlines embedded in the scraped text.
description_data <- gsub("\n", "", description_data)

# Insert a real missing value (not the string "NA") at each hard-coded
# position so that is.na() recognises the gap. Positions are applied in
# ascending order and each one refers to the already-extended vector.
# NOTE(review): presumably these are the list entries for which the selector
# matched nothing when the script was written -- verify against the live page.
for (i in c(20, 34, 51, 54, 66)) {
  description_data <- append(description_data, NA_character_, after = i - 1)
}
length(description_data)
# Runtimes: select the runtime nodes, drop the " min" suffix from their text
# and convert what remains to numbers.
runtime_data_html <- html_nodes(webpage, ".text-muted .runtime")
runtime_data <- as.numeric(gsub(" min", "", html_text(runtime_data_html)))
# Genres: select the genre nodes and extract their text.
genre_data_html <- html_nodes(webpage, ".genre")
genre_data <- html_text(genre_data_html)

# Clean-up, innermost call first: remove newlines, remove spaces, then cut
# everything from the first comma onwards so only the first genre remains.
genre_data <- gsub(
  ",.*", "",
  gsub(" ", "", gsub("\n", "", genre_data))
)

# Store the first genre of each film as a factor.
genre_data <- as.factor(genre_data)
# IMDb ratings: select the rating nodes and convert their text to numbers.
rating_data_html <- html_nodes(webpage, ".ratings-imdb-rating strong")
rating_data <- as.numeric(html_text(rating_data_html))
length(rating_data)

# Insert numeric missing values at the hard-coded positions. Using NA_real_
# keeps the vector numeric, so no second as.numeric() pass (and no
# "NAs introduced by coercion" warning) is needed. Positions are applied in
# ascending order and each one refers to the already-extended vector.
# NOTE(review): presumably the entries without a rating when the script was
# written -- verify against the live page.
for (i in c(20, 34, 51, 54, 66)) {
  rating_data <- append(rating_data, NA_real_, after = i - 1)
}
# Vote counts: select the vote nodes and extract their text.
votes_data_html <- html_nodes(webpage, ".sort-num_votes-visible span:nth-child(2)")
votes_data <- html_text(votes_data_html)

# Remove the thousands separators and convert to numbers before padding, so
# any coercion warning now points at genuinely unparseable text.
votes_data <- as.numeric(gsub(",", "", votes_data))
length(votes_data)

# Insert numeric missing values at the hard-coded positions (applied in
# ascending order; each index refers to the already-extended vector).
# NOTE(review): presumably the entries without a vote count when the script
# was written -- verify against the live page.
for (i in c(20, 34, 51, 54, 66)) {
  votes_data <- append(votes_data, NA_real_, after = i - 1)
}
# Directors: select the first link of the paragraph following the muted text
# and store the link text as a factor.
directors_data_html <- html_nodes(webpage, ".text-muted+ p a:nth-child(1)")
directors_data <- as.factor(html_text(directors_data_html))

# Drop entry 34 to work around an error in the source.
directors_data <- directors_data[-34]
# Actors: select the link that directly follows the ".ghost" separator inside
# each list item and store the link text as a factor.
actors_data_html <- html_nodes(webpage, ".lister-item-content .ghost+ a")
actors_data <- as.factor(html_text(actors_data_html))
# Metascores: select the metascore nodes and extract their text.
metascore_data_html <- html_nodes(webpage, ".metascore")
metascore_data <- html_text(metascore_data_html)

# Remove the padding spaces and convert to numbers before padding the vector,
# so any coercion warning now points at genuinely unparseable text.
metascore_data <- as.numeric(gsub(" ", "", metascore_data))
length(metascore_data)

# Insert numeric missing values at the hard-coded positions (applied in
# ascending order; each index refers to the already-extended vector).
# NOTE(review): presumably the entries without a metascore when the script
# was written -- verify against the live page.
for (i in c(20, 34, 51, 54, 55, 62, 65, 66, 72)) {
  metascore_data <- append(metascore_data, NA_real_, after = i - 1)
}

# Summary statistics of the padded metascores.
summary(metascore_data)
# Gross revenue: select the gross nodes and extract their text.
gross_data_html <- html_nodes(webpage, ".ghost~ .text-muted+ span")
gross_data <- html_text(gross_data_html)

# Remove the "$" and "M" signs explicitly. The previous substring(x, 2, 6)
# kept at most five characters, so a longer value such as "$532.18" would be
# truncated to "532.1".
# NOTE(review): assumes values look like "$532.18M" -- confirm against the page.
gross_data <- as.numeric(gsub("[$M]", "", gross_data))

# The first entry is missing, so the vector starts with a missing value.
gross_data <- c(NA_real_, gross_data)

# Fill the remaining gaps with numeric missing values at the hard-coded
# positions (applied in ascending order; each index refers to the
# already-extended vector). Keeping a plain numeric vector avoids the earlier
# detour through a list of strings.
# NOTE(review): the positions are tied to the page as it was when the script
# was written -- verify against the live page.
for (i in c(13, 16, 20, 34, 44, 48, 49, 51, 54, 55, 56, 62, 63, 64, 65, 66,
            69, 70, 72, 73, 74, 76, 84, 86, 87)) {
  gross_data <- append(gross_data, NA_real_, after = i - 1)
}

length(gross_data)
summary(gross_data)
# Collect the scraped vectors under their final column names.
movie_columns <- list(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)

# Show the length of every column, then refuse to continue when they differ:
# data.frame() silently recycles shorter vectors whose length divides the
# longest one, which would misalign films and their attributes.
column_lengths <- lengths(movie_columns)
print(column_lengths)
if (length(unique(column_lengths)) != 1L) {
  stop("Scraped columns have differing lengths; see the lengths printed above.",
       call. = FALSE)
}

# Combine all the vectors into one data frame (one row per film).
movies_df <- do.call(data.frame, movie_columns)
colnames(movies_df)
# The plotting functions below come from ggplot2, which the script never
# attached; load it here so this section runs on its own.
library(ggplot2)

theme_set(theme_gray())

# Histogram on a continuous (numeric) variable: runtime, stacked by genre.
g <- ggplot(movies_df, aes(Runtime)) # + scale_fill_brewer(palette = "Spectral")
g <- g + geom_histogram(aes(fill = Genre),
                        # binwidth = 5,
                        bins = 30,
                        col = "black",
                        size = .1) # change binwidth
# labs(title="Histogram of films' runtime ",
#      subtitle="(stacked by genre, in minutes)")

# Same plot, different command
# qplot(data = movies_df,Runtime,fill = Genre,bins = 30)
plot(g)

# Explicit print() calls: top-level ggplot expressions are only drawn by
# auto-printing, which does not happen when the script is run via source().

# Rating against runtime; point size shows votes, colour shows genre.
print(
  ggplot(movies_df, aes(x = Runtime, y = Rating)) +
    geom_point(aes(size = Votes, col = Genre))
)

# Gross earnings against runtime; point size shows rating, colour shows genre.
print(
  ggplot(movies_df, aes(x = Runtime, y = Gross_Earning_in_Mil)) +
    geom_point(aes(size = Rating, col = Genre))
)

# Source: https://www.analyticsvidhya.com/blog/2017/03/beginners-guide-on-web-scraping-in-r-using-rvest-with-hands-on-knowledge/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment