Created
February 20, 2022 00:30
-
-
Save caioertai/d94d2d80ae00dd74544e96f42b0b3e05 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "yaml" | |
require_relative "scraper" | |
# Runner script: collects movie page URLs, scrapes each page, and stores the
# results as YAML. Both helpers called here (top_5_links, movie_info) are
# presumably provided by the file loaded via require_relative "scraper".

# Ask the scraper for the list of movie page URLs
top_urls = top_5_links

# Visit every URL in order, printing a progress line before each scrape,
# and keep whatever movie_info returns for it
scraped_movies = top_urls.map do |url|
  puts "Scraping #{url}...."
  movie_info(url)
end

# Dump the collected results to movies.yml (file opened in binary write mode)
File.open("movies.yml", "wb") do |yaml_file|
  yaml_file.write(YAML.dump(scraped_movies))
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "nokogiri" | |
require "httparty" # gem install httparty | |
# Builds absolute IMDb URLs for the first five links found in the chart page.
#
# Fetches https://www.imdb.com/chart/top, looks up the <a> elements matching
# ".lister-list .titleColumn a", and prefixes each href with the IMDb host.
#
# @return [Array<String>] up to five absolute URLs
def top_5_links
  chart_url = "https://www.imdb.com/chart/top"

  # Download the chart page and parse the response into a searchable document
  chart_page = Nokogiri::HTML.parse(HTTParty.get(chart_url))

  # Keep only the first five matching anchors
  anchors = chart_page.search(".lister-list .titleColumn a").first(5)

  # Turn each anchor's href into an absolute URL by prefixing the host
  anchors.map { |anchor| "https://www.imdb.com#{anchor.attr("href")}" }
end
# Scrapes a single movie page and returns its details as a hash.
#
# @param url [String] address of the movie page to fetch
# @return [Hash] with keys :title, :cast, :storyline, :year and :director
def movie_info(url)
  # Download the page and parse the response into a searchable document
  page = Nokogiri::HTML.parse(HTTParty.get(url))

  # Child nodes of the first ".ipc-metadata-list" element; the director is read
  # from the first child and the cast from the last one
  metadata_rows = page.at(".ipc-metadata-list").children

  # Title: text of the first <h1>
  movie_title = page.at("h1").text

  # Director: text of the first link inside the first metadata row
  director_name = metadata_rows.first.at("a").text

  # Cast: text of every link nested in a list item of the last metadata row
  cast_names = metadata_rows.last.search("li a").map(&:text)

  # Storyline: text of the element tagged data-testid="plot"
  plot = page.at('[data-testid="plot"]').text

  # Year: integer parsed from the text of the first ".ipc-link" element
  # NOTE(review): assumes that element holds the release year; verify against the page markup
  release_year = page.at(".ipc-link").text.to_i

  # Assemble the result, keeping the original key order
  {
    title: movie_title,
    cast: cast_names,
    storyline: plot,
    year: release_year,
    director: director_name
  }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment