Skip to content

Instantly share code, notes, and snippets.

@Haumer
Created July 14, 2020 17:26
Show Gist options
  • Select an option

  • Save Haumer/856362cca54a657a91d7b76d9aacb09c to your computer and use it in GitHub Desktop.

Select an option

Save Haumer/856362cca54a657a91d7b76d9aacb09c to your computer and use it in GitHub Desktop.
require_relative 'scraper'
# Entry point: collects the movie URLs, scrapes each one, and prints the results.
puts "Fetching URLs"
# The required scraper file defines this helper as get_urls.
urls = get_urls
movies = urls.map do |url|
  puts "Scraping #{url}"
  scrape_movie(url)
end
p movies
require 'open-uri'
require 'nokogiri'
# Fetches the IMDb top chart page and collects links to the first five movies.
#
# @return [Array<String>] absolute movie page URLs, in the order the links
#   appear on the chart page (at most five)
def get_urls
  url = 'https://www.imdb.com/chart/top'
  # URI.open is the explicit open-uri entry point (Kernel#open no longer
  # accepts URLs on Ruby 3.0+); the block form closes the handle after reading.
  imdb_html = URI.open(url, &:read)
  imdb = Nokogiri::HTML(imdb_html)
  imdb.search('.titleColumn a').first(5).map do |movie|
    "http://www.imdb.com#{movie.attributes['href'].value}"
  end
end
# Scrapes the details of a single movie from its IMDb page.
#
# @param url [String] address of the movie page
# @return [Hash] with keys :title (String), :year (Integer),
#   :storyline (String), :director (String) and :cast (Array, up to three entries)
def scrape_movie(url)
  # URI.open is the explicit open-uri entry point (Kernel#open no longer
  # accepts URLs on Ruby 3.0+); the block form closes the handle after reading.
  imdb = Nokogiri::HTML(URI.open(url, &:read))
  # The heading text is split on "(" into a title part and a year part. Each
  # part then drops its final character -- presumably a separator after the
  # title and the closing ")" after the year; verify against the live markup.
  movie_info = imdb.search('h1').text.strip.split('(')
  {
    title: movie_info.first[0..-2],
    year: movie_info.last[0..-2].to_i,
    storyline: imdb.search('.summary_text').text.strip,
    director: imdb.search("h4:contains('Director') + a").text,
    # Node#attribute requires an attribute name, so the previous bare call
    # raised ArgumentError. NOTE(review): assumes the photo's alt text carries
    # the cast member's name -- confirm against the page.
    cast: imdb.search('.primary_photo a img').take(3).map { |member| member['alt'] }
  }
end
# Manual smoke run: only scrape when this file is executed directly, so that
# requiring it from another file does not trigger a network request.
scrape_movie('https://www.imdb.com/title/tt0468569/') if __FILE__ == $PROGRAM_NAME
# rubocop:disable all
require_relative '../scraper'
# Checks that get_urls yields the five expected IMDb title URLs, in order.
describe '#get_urls' do
  it 'should return the correct array of urls' do
    top_five = %w[
      http://www.imdb.com/title/tt0111161/
      http://www.imdb.com/title/tt0068646/
      http://www.imdb.com/title/tt0071562/
      http://www.imdb.com/title/tt0468569/
      http://www.imdb.com/title/tt0050083/
    ]
    expect(get_urls).to eq(top_five)
  end
end
# Checks the hash that scrape_movie builds for the title page tt0468569.
describe '#scrape_movie' do
  it 'should return the correct movie hash' do
    dark_knight = {
      title: 'The Dark Knight',
      year: 2008,
      storyline: 'When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.',
      director: 'Christopher Nolan',
      cast: ['Christian Bale', 'Heath Ledger', 'Aaron Eckhart']
    }
    expect(scrape_movie('https://www.imdb.com/title/tt0468569/')).to eq(dark_knight)
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment