caioertai · February 20, 2022 00:30
diff --git a/interface.rb b/interface.rb
 require "yaml"
 require_relative "scraper"

 # Collect the page URLs returned by top_5_links.
 links = top_5_links

 # Visit each page in order, logging progress, and accumulate the scraped hashes.
 scraped = links.each_with_object([]) do |link, infos|
   puts "Scraping #{link}...."
   infos << movie_info(link)
 end

 # Serialize everything as YAML into movies.yml (binary write mode, truncating).
 File.binwrite("movies.yml", YAML.dump(scraped))
diff --git a/scraper.rb b/scraper.rb
 require "nokogiri"
 require "httparty" # gem install httparty

 # Fetches the IMDb top chart page and builds absolute URLs for its first
 # five title links.
 #
 # @return [Array<String>] "https://www.imdb.com" followed by each matched
 #   link's href, in page order (fewer than five when fewer links match)
 def top_5_links
   top_url = "https://www.imdb.com/chart/top"
   # Pass the raw body String to Nokogiri instead of the HTTParty response
   # wrapper, so parsing does not depend on implicit string conversion.
   html_string = HTTParty.get(top_url).body
   doc = Nokogiri::HTML.parse(html_string)

   # Keep only the first 5 <a> elements of the chart listing and turn each
   # relative href into a full URL.
   doc.search(".lister-list .titleColumn a").first(5).map do |element|
     "https://www.imdb.com#{element.attr("href")}"
   end
 end

 # Downloads one movie page and extracts its details.
 #
 # @param url [String] address of the movie page to fetch
 # @return [Hash] with keys :title, :cast (Array of link texts),
 #   :storyline, :year (Integer) and :director
 # @raise [NoMethodError] when an expected element is missing, because
 #   Nokogiri's `at` returns nil when nothing matches the selector
 def movie_info(url)
   # Pass the raw body String to Nokogiri instead of the HTTParty response
   # wrapper, so parsing does not depend on implicit string conversion.
   html_string = HTTParty.get(url).body
   doc = Nokogiri::HTML.parse(html_string)
   meta_elements = doc.at(".ipc-metadata-list").children

   title = doc.at("h1").text
   # Director: text of the first link inside the first metadata entry.
   director = meta_elements.first.at("a").text
   # Cast: text of every list-item link inside the last metadata entry.
   cast = meta_elements.last.search("li a").map(&:text)
   storyline = doc.at('[data-testid="plot"]').text
   # to_i keeps the leading digits of the first .ipc-link text (0 if there are none).
   year = doc.at('.ipc-link').text.to_i

   { title: title, cast: cast, storyline: storyline, year: year, director: director }
 end
	require "yaml"
	require_relative "scraper"

	# Collect the page URLs returned by top_5_links.
	urls = top_5_links

	# Scrape each page in order, logging progress; the block's last value
	# (the movie info hash) becomes the mapped element.
	movie_infos = urls.map do |url|
	  puts "Scraping #{url}...."
	  movie_info(url)
	end

	# Serialize the collected hashes as YAML into movies.yml (binary write mode).
	File.open("movies.yml", "wb") do |file|
	  file << YAML.dump(movie_infos)
	end
	require "nokogiri"
	require "httparty" # gem install httparty

	# Fetches the IMDb top chart page and builds absolute URLs for its first
	# five title links.
	#
	# @return [Array<String>] "https://www.imdb.com" followed by each matched
	#   link's href, in page order (fewer than five when fewer links match)
	def top_5_links
	  top_url = "https://www.imdb.com/chart/top"
	  # Pass the raw body String to Nokogiri instead of the HTTParty response
	  # wrapper, so parsing does not depend on implicit string conversion.
	  html_string = HTTParty.get(top_url).body
	  doc = Nokogiri::HTML.parse(html_string)

	  # Keep only the first 5 <a> elements of the chart listing and turn each
	  # relative href into a full URL.
	  doc.search(".lister-list .titleColumn a").first(5).map do |element|
	    "https://www.imdb.com#{element.attr("href")}"
	  end
	end

	# Downloads one movie page and extracts its details.
	#
	# @param url [String] address of the movie page to fetch
	# @return [Hash] with keys :title, :cast (Array of link texts),
	#   :storyline, :year (Integer) and :director
	# @raise [NoMethodError] when an expected element is missing, because
	#   Nokogiri's `at` returns nil when nothing matches the selector
	def movie_info(url)
	  # Pass the raw body String to Nokogiri instead of the HTTParty response
	  # wrapper, so parsing does not depend on implicit string conversion.
	  html_string = HTTParty.get(url).body
	  doc = Nokogiri::HTML.parse(html_string)
	  meta_elements = doc.at(".ipc-metadata-list").children

	  title = doc.at("h1").text
	  # Director: text of the first link inside the first metadata entry.
	  director = meta_elements.first.at("a").text
	  # Cast: text of every list-item link inside the last metadata entry.
	  cast = meta_elements.last.search("li a").map(&:text)
	  storyline = doc.at('[data-testid="plot"]').text
	  # to_i keeps the leading digits of the first .ipc-link text (0 if there are none).
	  year = doc.at('.ipc-link').text.to_i

	  { title: title, cast: cast, storyline: storyline, year: year, director: director }
	end