caioertai · July 11, 2018 09:56
diff --git a/interface.rb b/interface.rb
 require_relative "scraper.rb"

 # Command-line front end: collects the movie URLs, scrapes every page and
 # prints one summary card per movie.
 separator = "------------------------------------------"

 puts "Fetching urls..."
 movies = movie_getter.map do |movie_url|
   puts "Scraping #{movie_url}"
   scrape_movie(movie_url)
 end

 # with_index(1) yields 1-based positions, so the rank needs no arithmetic.
 movies.each.with_index(1) do |details, rank|
   puts separator
   puts "Top #{rank}"
   puts "  Title:     #{details[:title]}"
   puts "  Year:      #{details[:year]}"
   puts "  Director:  #{details[:director]}"
   puts "  Cast:      #{details[:cast]}"
   puts "  Storyline: #{details[:storyline]}"
 end
 puts separator
diff --git a/scraper.rb b/scraper.rb
 require "open-uri"
 require "nokogiri"

 # Fetches the IMDb top chart and returns the URLs of its first five movies.
 #
 # The chart is requested with an English Accept-Language header. Every link
 # matched by ".titleColumn a" has its query string removed before being
 # appended to the IMDb host.
 #
 # @return [Array<String>] at most five absolute movie page URLs, in page order
 def movie_getter
   chart_url = "http://www.imdb.com/chart/top"

   # The block form of #open closes the downloaded stream once it has been
   # parsed, instead of leaving the handle open for the garbage collector.
   chart = URI.parse(chart_url).open("Accept-Language" => "en") do |html_file|
     Nokogiri::HTML(html_file)
   end

   chart.search(".titleColumn a").first(5).map do |link|
     # Drop everything from the "?" onwards so only the path remains.
     "http://www.imdb.com#{link["href"].gsub(/\?.*/, "")}"
   end
 end

 # Scrapes one IMDb movie page into a hash of its headline details.
 #
 # The page is requested with an English Accept-Language header.
 #
 # @param url [String] absolute URL of the movie page
 # @return [Hash] with keys :cast (Array of three Strings), :director,
 #   :storyline, :title (Strings) and :year (Integer)
 # @raise [NoMethodError] when the cast text does not match the expected
 #   "Stars: a, b, c" layout, because the regex match is then nil
 def scrape_movie(url)
   # The block form of #open closes the downloaded stream once it has been
   # parsed, instead of leaving the handle open for the garbage collector.
   doc = URI.parse(url).open("Accept-Language" => "en") do |file|
     Nokogiri::HTML(file)
   end

   # Remove everything from the character preceding the first "(" onwards.
   title = doc.search(".title_wrapper h1").text.gsub(/.\(.*/, "")

   # Strip non-word characters (e.g. surrounding parentheses) from the year.
   year = doc.search("#titleYear").text.gsub(/\W/, "")

   storyline = doc.search(".summary_text").text.strip

   # Only the first matching element is used for the director.
   director = doc.at(".credit_summary_item .itemprop").text

   # The fourth child of the plot summary holds the "Stars:" line; the regex
   # captures the three comma-separated names that follow the label.
   cast = doc.at("#title-overview-widget > div.plot_summary_wrapper > div.plot_summary > div:nth-child(4)").text
   match_data = cast.match(/\A\s+Stars:\s+(\w.+),\s+(\w.+),\s+(\w.+?\s\w.+)\s/)
   stars = [match_data[1], match_data[2], match_data[3].strip]

   {
     cast: stars,
     director: director,
     storyline: storyline,
     title: title,
     year: year.to_i
   }
 end
diff --git a/scraper_spec.rb b/scraper_spec.rb
 require_relative '../scraper.rb'
 # Spec for movie_getter: compares its return value with a pinned list of five
 # IMDb title URLs.
 describe '#movie_getter' do
   it 'should scrape imdb for top 5 movies' do
     movies_expected = %w[
       http://www.imdb.com/title/tt0111161/
       http://www.imdb.com/title/tt0068646/
       http://www.imdb.com/title/tt0071562/
       http://www.imdb.com/title/tt0468569/
       http://www.imdb.com/title/tt0050083/
     ]

     expect(movie_getter).to eq(movies_expected)
   end
 end

 # Spec for scrape_movie: scrapes one pinned title page and compares the
 # resulting hash with hard-coded expectations.
 describe '#scrape_movie' do
   it 'should scrape movie url for expected data' do
     dark_knight_url = 'http://www.imdb.com/title/tt0468569/'
     storyline = 'When the menace known as the Joker emerges from his mysterious past, he wreaks havoc and chaos on the people of Gotham. The Dark Knight must accept one of the greatest psychological and physical tests of his ability to fight injustice.'
     expected_data = {
       cast: ['Christian Bale', 'Heath Ledger', 'Aaron Eckhart'],
       director: 'Christopher Nolan',
       storyline: storyline,
       title: 'The Dark Knight',
       year: 2008
     }

     expect(scrape_movie(dark_knight_url)).to eq(expected_data)
   end
 end
	require_relative "scraper.rb"

	# Command-line front end: collects the movie URLs, scrapes every page and
	# prints one summary card per movie.
	puts "Fetching urls..."
	urls = movie_getter

	movies = urls.map do |url|
	  puts "Scraping #{url}"
	  scrape_movie(url)
	end

	# index is zero-based, hence the + 1 when printing the rank.
	movies.each_with_index do |movie, index|
	  puts "------------------------------------------"
	  puts "Top #{index + 1}"
	  puts " Title: #{movie[:title]}"
	  puts " Year: #{movie[:year]}"
	  puts " Director: #{movie[:director]}"
	  puts " Cast: #{movie[:cast]}"
	  puts " Storyline: #{movie[:storyline]}"
	end
	puts "------------------------------------------"
	require "open-uri"
	require "nokogiri"

	# Fetches the IMDb top chart and returns the URLs of its first five movies.
	#
	# The chart is requested with an English Accept-Language header. Every link
	# matched by ".titleColumn a" has its query string removed before being
	# appended to the IMDb host.
	#
	# @return [Array<String>] at most five absolute movie page URLs, in page order
	def movie_getter
	  chart_url = "http://www.imdb.com/chart/top"

	  # The block form of #open closes the downloaded stream once it has been
	  # parsed, instead of leaving the handle open for the garbage collector.
	  chart = URI.parse(chart_url).open("Accept-Language" => "en") do |html_file|
	    Nokogiri::HTML(html_file)
	  end

	  chart.search(".titleColumn a").first(5).map do |link|
	    # Drop everything from the "?" onwards so only the path remains.
	    "http://www.imdb.com#{link["href"].gsub(/\?.*/, "")}"
	  end
	end

	# Scrapes one IMDb movie page into a hash of its headline details.
	#
	# The page is requested with an English Accept-Language header.
	#
	# @param url [String] absolute URL of the movie page
	# @return [Hash] with keys :cast (Array of three Strings), :director,
	#   :storyline, :title (Strings) and :year (Integer)
	# @raise [NoMethodError] when the cast text does not match the expected
	#   "Stars: a, b, c" layout, because the regex match is then nil
	def scrape_movie(url)
	  # The block form of #open closes the downloaded stream once it has been
	  # parsed, instead of leaving the handle open for the garbage collector.
	  doc = URI.parse(url).open("Accept-Language" => "en") do |file|
	    Nokogiri::HTML(file)
	  end

	  # Remove everything from the character preceding the first "(" onwards.
	  title = doc.search(".title_wrapper h1").text.gsub(/.\(.*/, "")

	  # Strip non-word characters (e.g. surrounding parentheses) from the year.
	  year = doc.search("#titleYear").text.gsub(/\W/, "")

	  storyline = doc.search(".summary_text").text.strip

	  # Only the first matching element is used for the director.
	  director = doc.at(".credit_summary_item .itemprop").text

	  # The fourth child of the plot summary holds the "Stars:" line; the regex
	  # captures the three comma-separated names that follow the label.
	  cast = doc.at("#title-overview-widget > div.plot_summary_wrapper > div.plot_summary > div:nth-child(4)").text
	  match_data = cast.match(/\A\s+Stars:\s+(\w.+),\s+(\w.+),\s+(\w.+?\s\w.+)\s/)
	  stars = [match_data[1], match_data[2], match_data[3].strip]

	  {
	    cast: stars,
	    director: director,
	    storyline: storyline,
	    title: title,
	    year: year.to_i
	  }
	end
	require_relative '../scraper.rb'
	# Spec for movie_getter: compares its return value with a pinned list of five
	# IMDb title URLs.
	describe '#movie_getter' do
	  it 'should scrape imdb for top 5 movies' do
	    movies_expected = %w[
	      http://www.imdb.com/title/tt0111161/
	      http://www.imdb.com/title/tt0068646/
	      http://www.imdb.com/title/tt0071562/
	      http://www.imdb.com/title/tt0468569/
	      http://www.imdb.com/title/tt0050083/
	    ]

	    expect(movie_getter).to eq(movies_expected)
	  end
	end

	# Spec for scrape_movie: scrapes one pinned title page and compares the
	# resulting hash with hard-coded expectations.
	describe '#scrape_movie' do
	  it 'should scrape movie url for expected data' do
	    dark_knight_url = 'http://www.imdb.com/title/tt0468569/'
	    storyline = 'When the menace known as the Joker emerges from his mysterious past, he wreaks havoc and chaos on the people of Gotham. The Dark Knight must accept one of the greatest psychological and physical tests of his ability to fight injustice.'
	    expected_data = {
	      cast: ['Christian Bale', 'Heath Ledger', 'Aaron Eckhart'],
	      director: 'Christopher Nolan',
	      storyline: storyline,
	      title: 'The Dark Knight',
	      year: 2008
	    }

	    expect(scrape_movie(dark_knight_url)).to eq(expected_data)
	  end
	end