caioertai · January 29, 2019 21:00
diff --git a/interface.rb b/interface.rb
 require_relative 'scraper'

 # Ask the scraper for the urls of the top 5 movies
 movies_urls = movie_getter

 # Turn every url into a movie: the block announces the url and its
 # last expression (the hash returned by #scrape_movie) becomes the
 # matching entry of the resulting array
 movies = movies_urls.map do |url|
   puts "Scraping #{url}"
   scrape_movie(url)
 end

 # Print a small report per movie, numbering them from 1
 movies.each.with_index(1) do |details, rank|
   puts "------------------------------------------",
        "Top #{rank}",
        "  Title:     #{details[:title]}",
        "  Year:      #{details[:year]}",
        "  Director:  #{details[:director]}",
        "  Cast:      #{details[:cast]}",
        "  Storyline: #{details[:storyline]}"
 end
diff --git a/scraper.rb b/scraper.rb
 # require open-uri so remote pages can be opened and read
 require 'open-uri'

 # require nokogiri
 require 'nokogiri'

 # Fetches the IMDb top chart page and returns the absolute urls of the
 # first five links matched by the `.titleColumn a` selector.
 #
 # @return [Array<String>] up to five movie urls; when an href carries a
 #   query string, everything from the last "?" on is removed
 def movie_getter
   # set the imdb url
   base_url = 'https://www.imdb.com'
   top_movies_url = "#{base_url}/chart/top"

   # Kernel#open stopped handling urls in Ruby 3.0 (deprecated in 2.7);
   # URI.open is the open-uri entry point. The block form also closes
   # the underlying IO once the page has been read.
   page_string = URI.open(top_movies_url, &:read)

   # parse the page into a nokogiri doc
   page = Nokogiri::HTML(page_string)

   # search the page for the movie links and limit to the top 5
   top_links = page.search('.titleColumn a').first(5)

   # map to return movie urls in an array
   top_links.map do |movie_row|
     href = movie_row.attr('href')
     # An href without "?" is used whole instead of raising on a
     # failed match
     base_url + (href[/(.*)\?/, 1] || href)
   end
 end

 # Downloads one movie page and extracts its basic details.
 #
 # @param movie_url [String] url of the movie page to scrape
 # @return [Hash] with the keys
 #   :title     [String]  header text before "(YYYY", trimmed
 #   :year      [Integer, nil] nil when the header holds no "(YYYY"
 #   :cast      [Array<String>] text of up to three links of the third
 #                              `.credit_summary_item` ([] when absent)
 #   :director  [String]  link text of the first `.credit_summary_item`
 #                        ("" when absent)
 #   :storyline [String]  trimmed text of `.plot_summary .summary_text`
 def scrape_movie(movie_url)
   # Kernel#open stopped handling urls in Ruby 3.0 (deprecated in 2.7);
   # URI.open is the open-uri entry point and its block form closes the
   # IO after reading. The header asks for the English page.
   page_string = URI.open(movie_url, 'Accept-Language' => 'en', &:read)

   # parse the page string into a nokogiri object
   page = Nokogiri::HTML(page_string)

   # title and year are together in the same string
   # so we use a regex on a #match to separate them
   header_text = page.search('h1').text
   header_data = header_text.match(/(?<title>.*)\((?<year>\d{4})/)

   # Fall back to the whole header when it does not contain "(YYYY"
   raw_title = header_data ? header_data[:title] : header_text
   # String#strip leaves a non-breaking space in place, so trim with
   # [[:space:]] instead of blindly dropping the last character
   title = raw_title.gsub(/\A[[:space:]]+|[[:space:]]+\z/, '')
   year = header_data[:year].to_i if header_data

   credit_summary_items = page.search('.credit_summary_item')

   #   cast: first three links of the third credit item, if it exists
   cast_item = credit_summary_items[2]
   cast = cast_item ? cast_item.search('a').first(3).map(&:text) : []

   #   director: link text of the first credit item, if it exists
   director_item = credit_summary_items[0]
   director = director_item ? director_item.search('a').text : ''

   #   storyline
   storyline = page.search('.plot_summary .summary_text').text.strip

   # return a hash of the movie info
   {
     title: title,
     year: year,
     cast: cast,
     director: director,
     storyline: storyline
   }
 end
	require_relative 'scraper'

	# Using movie getter to scrape top 5 urls
	movies_urls = movie_getter

	# Iterate over the urls to scrape movies. Each url is announced and
	# then replaced by the hash #scrape_movie returns for it, so the
	# result is an array of movies.
	movies = movies_urls.map do |movie_url|
	  puts "Scraping #{movie_url}"
	  scrape_movie(movie_url)
	end

	# Iterate on our array of movies to print their information,
	# numbering them from 1
	movies.each_with_index do |movie, index|
	  puts "------------------------------------------"
	  puts "Top #{index + 1}"
	  puts " Title: #{movie[:title]}"
	  puts " Year: #{movie[:year]}"
	  puts " Director: #{movie[:director]}"
	  puts " Cast: #{movie[:cast]}"
	  puts " Storyline: #{movie[:storyline]}"
	end
	# require open-uri so remote pages can be opened and read
	require 'open-uri'

	# require nokogiri
	require 'nokogiri'

	# Fetches the IMDb top chart page and returns the absolute urls of the
	# first five links matched by the `.titleColumn a` selector.
	#
	# @return [Array<String>] up to five movie urls; when an href carries a
	#   query string, everything from the last "?" on is removed
	def movie_getter
	  # set the imdb url
	  base_url = 'https://www.imdb.com'
	  top_movies_url = "#{base_url}/chart/top"

	  # Kernel#open stopped handling urls in Ruby 3.0 (deprecated in 2.7);
	  # URI.open is the open-uri entry point. The block form also closes
	  # the underlying IO once the page has been read.
	  page_string = URI.open(top_movies_url, &:read)

	  # parse the page into a nokogiri doc
	  page = Nokogiri::HTML(page_string)

	  # search the page for the movie links and limit to the top 5
	  top_links = page.search('.titleColumn a').first(5)

	  # map to return movie urls in an array
	  top_links.map do |movie_row|
	    href = movie_row.attr('href')
	    # An href without "?" is used whole instead of raising on a
	    # failed match
	    base_url + (href[/(.*)\?/, 1] || href)
	  end
	end

	# Downloads one movie page and extracts its basic details.
	#
	# @param movie_url [String] url of the movie page to scrape
	# @return [Hash] with the keys
	#   :title     [String]  header text before "(YYYY", trimmed
	#   :year      [Integer, nil] nil when the header holds no "(YYYY"
	#   :cast      [Array<String>] text of up to three links of the third
	#                              `.credit_summary_item` ([] when absent)
	#   :director  [String]  link text of the first `.credit_summary_item`
	#                        ("" when absent)
	#   :storyline [String]  trimmed text of `.plot_summary .summary_text`
	def scrape_movie(movie_url)
	  # Kernel#open stopped handling urls in Ruby 3.0 (deprecated in 2.7);
	  # URI.open is the open-uri entry point and its block form closes the
	  # IO after reading. The header asks for the English page.
	  page_string = URI.open(movie_url, 'Accept-Language' => 'en', &:read)

	  # parse the page string into a nokogiri object
	  page = Nokogiri::HTML(page_string)

	  # title and year are together in the same string
	  # so we use a regex on a #match to separate them
	  header_text = page.search('h1').text
	  header_data = header_text.match(/(?<title>.*)\((?<year>\d{4})/)

	  # Fall back to the whole header when it does not contain "(YYYY"
	  raw_title = header_data ? header_data[:title] : header_text
	  # String#strip leaves a non-breaking space in place, so trim with
	  # [[:space:]] instead of blindly dropping the last character
	  title = raw_title.gsub(/\A[[:space:]]+|[[:space:]]+\z/, '')
	  year = header_data[:year].to_i if header_data

	  credit_summary_items = page.search('.credit_summary_item')

	  # cast: first three links of the third credit item, if it exists
	  cast_item = credit_summary_items[2]
	  cast = cast_item ? cast_item.search('a').first(3).map(&:text) : []

	  # director: link text of the first credit item, if it exists
	  director_item = credit_summary_items[0]
	  director = director_item ? director_item.search('a').text : ''

	  # storyline
	  storyline = page.search('.plot_summary .summary_text').text.strip

	  # return a hash of the movie info
	  {
	    title: title,
	    year: year,
	    cast: cast,
	    director: director,
	    storyline: storyline
	  }
	end