Skip to content

Instantly share code, notes, and snippets.

@themacmarketer
Forked from rodloboz/scraper.rb
Created September 10, 2018 09:42
Show Gist options
  • Save themacmarketer/42fdaad2b7eae5c80354c4f44f75bbf0 to your computer and use it in GitHub Desktop.
Save themacmarketer/42fdaad2b7eae5c80354c4f44f75bbf0 to your computer and use it in GitHub Desktop.
livecode scraping
require 'open-uri'
require 'nokogiri'
require 'pry-byebug'
# Scheme and host that relative IMDb paths are appended to.
# Frozen so the shared constant cannot be mutated in place by callers.
BASE_URL = "https://www.imdb.com".freeze
# Scrapes the IMDb Top chart page and returns absolute URLs for the first
# five movie links found under the `.titleColumn a` selector.
#
# Performs an HTTP request through open-uri and parses the body with Nokogiri.
#
# @return [Array<String>] at most five URLs, each BASE_URL followed by the
#   link's href with any query string removed
def fetch_urls
  url = "https://www.imdb.com/chart/top"
  # Kernel#open no longer accepts URLs on Ruby 3.0+; URI.open is the open-uri
  # entry point. The block form closes the underlying stream after reading.
  html_file = URI.open(url, &:read)
  html_doc = Nokogiri::HTML(html_file)

  # Only five links are returned, so the remaining links are never processed.
  html_doc.search('.titleColumn a').take(5).map do |element|
    # Keep everything before the first "?"; an href without a query string is
    # used whole instead of raising on a failed regex match.
    suburl = element['href'].to_s.split('?', 2).first.to_s
    BASE_URL + suburl # string concatenation
  end
end
# Fetches a movie page and extracts its basic details.
#
# Performs an HTTP request through open-uri (asking for English content via
# the Accept-Language header) and parses the body with Nokogiri.
#
# @param url [String, nil] absolute URL of the movie page
# @return [Hash] an empty hash when +url+ is nil or empty; otherwise a hash
#   with the keys :title (String), :year (Integer, or nil when the h1 text has
#   no "(YYYY" part the pattern can match), :storyline (String),
#   :director (String) and :cast (Array<String>)
def get_movie_info(url)
  return {} if url.nil? || url.empty?

  # Kernel#open no longer accepts URLs on Ruby 3.0+; URI.open is the open-uri
  # entry point. The block form closes the underlying stream after reading.
  html_file = URI.open(url, "Accept-Language" => "en", &:read)
  html_doc = Nokogiri::HTML(html_file)

  title_string = html_doc.search('h1').text
  match_data = title_string.match(/(?<title>.+)\((?<year>\d{4}).+[[:space:]]/)
  if match_data
    # The captured title runs right up to the "(" before the year, so it ends
    # with a separator (presumably a non-breaking space, which String#strip
    # leaves in place). Remove trailing whitespace only, rather than chopping
    # the last character unconditionally.
    title = match_data[:title].sub(/[[:space:]]+\z/, '')
    year = match_data[:year].to_i
  else
    # Heading did not match the expected "Title (YYYY) " shape: fall back to
    # the raw heading text instead of raising NoMethodError on nil.
    title = title_string.strip
    year = nil
  end

  storyline = html_doc.search('.summary_text').text.strip
  director = html_doc.search('span[itemprop="director"]').text.strip
  # Each matched span contributes the text before its first comma.
  cast = html_doc.search('span[itemprop="actors"]').map do |actor|
    actor.text.strip.split(',').first
  end

  {
    title: title,
    year: year,
    storyline: storyline,
    director: director,
    cast: cast
  }
end
# get_movie_info('https://www.imdb.com/title/tt0111161/')
require_relative '../scraper.rb'
# Spec for fetch_urls. It calls the real method, so it depends on the live
# chart page still listing these five titles in this order.
describe '#fetch_urls' do
  it "should return an array of the top 5 urls" do
    top_five = [
      "https://www.imdb.com/title/tt0111161/",
      "https://www.imdb.com/title/tt0068646/",
      "https://www.imdb.com/title/tt0071562/",
      "https://www.imdb.com/title/tt0468569/",
      "https://www.imdb.com/title/tt0050083/"
    ]

    expect(fetch_urls).to eq(top_five)
  end
end
# Spec for get_movie_info: the empty-url guard, and a full extraction
# against a real movie page.
describe '#get_movie_info' do
  it "should return an empty hash if passed invalid url" do
    url = ""
    actual = get_movie_info(url)
    expected = {}
    expect(actual).to eq(expected)
  end

  # Calls the real method with a real URL, so the expected values depend on
  # the page content at the time the spec runs.
  it "should return a hash with the correct movie info" do
    url = 'https://www.imdb.com/title/tt0111161/'
    actual = get_movie_info(url)
    expected = {
      title: "The Shawshank Redemption",
      year: 1994,
      cast: ['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'],
      storyline: 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
      director: 'Frank Darabont'
    }
    expect(actual).to eq(expected)
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment