Skip to content

Instantly share code, notes, and snippets.

@Haumer
Last active July 14, 2020 21:39
Show Gist options
  • Select an option

  • Save Haumer/8a9cb24294c954418815d15b61cbb1bf to your computer and use it in GitHub Desktop.

Select an option

Save Haumer/8a9cb24294c954418815d15b61cbb1bf to your computer and use it in GitHub Desktop.
require_relative 'scraper'
# Command-line entry point: look up the movie URLs, scrape each one while
# reporting progress on stdout, then dump the collected results.
puts 'Fetching URLs'
movies = get_urls.map do |movie_url|
  puts "Scraping #{movie_url}"
  scrape_movie(movie_url)
end
p movies
require 'open-uri'
require 'nokogiri'
# Collects the detail-page URLs of the first movies listed on IMDb's Top chart.
#
# Downloads https://www.imdb.com/chart/top, parses it with Nokogiri and reads
# the href of each link matched by the `.titleColumn a` selector.
#
# @param limit [Integer] how many links to take from the top of the chart
#   (defaults to 5)
# @return [Array<String>] URLs built by prefixing each href with
#   "http://www.imdb.com"
def get_urls(limit = 5)
  chart_url = 'https://www.imdb.com/chart/top'
  # Use URI.open explicitly: Kernel#open no longer accepts URLs on Ruby 3.0+.
  # The block form also closes the response IO once it has been read.
  chart_html = URI.open(chart_url, &:read)
  chart = Nokogiri::HTML(chart_html)
  chart.search('.titleColumn a').first(limit).map do |link|
    "http://www.imdb.com" + link.attributes['href'].value
  end
end
# Scrapes the details of a single movie from its page.
#
# @param url [String] address of the movie page to download
# @return [Hash] with the keys :title (String), :year (Integer),
#   :storyline (String), :director (String) and :cast (Array<String>, at most
#   three names)
def scrape_movie(url)
  # Use URI.open explicitly: Kernel#open no longer accepts URLs on Ruby 3.0+.
  # The block form also closes the response IO once it has been read.
  page_html = URI.open(url, &:read)
  page = Nokogiri::HTML(page_html)

  # The <h1> text is cut at "(" into a title part and a year part, and the
  # final character of each part is dropped. NOTE(review): presumably that is
  # the separator before "(" and the closing ")" -- confirm against the markup.
  heading_parts = page.search('h1').text.strip.split('(')

  {
    title: heading_parts.first[0..-2],
    year: heading_parts.last[0..-2].to_i,
    storyline: page.search('.summary_text').text.strip,
    director: page.search("h4:contains('Director') + a").text,
    # Cast names come from the title attribute of the first three photo images.
    cast: page.search(".primary_photo a img").take(3).map do |photo|
      photo.attributes["title"].value
    end
  }
end
# scrape_movie('https://www.imdb.com/title/tt0468569/')
# rubocop:disable all
require_relative '../scraper'
# Checks get_urls against the five URLs the chart is expected to yield.
describe '#get_urls' do
  it 'should return the correct array of urls' do
    top_five = %w[
      http://www.imdb.com/title/tt0111161/
      http://www.imdb.com/title/tt0068646/
      http://www.imdb.com/title/tt0071562/
      http://www.imdb.com/title/tt0468569/
      http://www.imdb.com/title/tt0050083/
    ]
    expect(get_urls).to eq(top_five)
  end
end
# Checks scrape_movie against the expected hash for a single known movie page.
describe '#scrape_movie' do
  it 'should return the correct movie hash' do
    dark_knight = {
      title: 'The Dark Knight',
      year: 2008,
      storyline: 'When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.',
      director: 'Christopher Nolan',
      cast: ['Christian Bale', 'Heath Ledger', 'Aaron Eckhart']
    }
    scraped = scrape_movie('https://www.imdb.com/title/tt0468569/')
    expect(scraped).to eq(dark_knight)
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment