Last active
December 24, 2015 16:49
-
-
Save gregeng/6831157 to your computer and use it in GitHub Desktop.
scraping student pages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scraping Flatiron School student profile pages
require 'pry' | |
require 'nokogiri' | |
require 'open-uri' | |
# Fetch the index page that lists all the students
# student_profile = Nokogiri::HTML(open('http://students.flatironschool.com/students/greg_eng.html')) | |
# Fetch and parse the index page that links to every student profile.
# Kernel#open stopped accepting URLs in Ruby 3.0, so the request goes through
# the URI object instead; the block form closes the response as soon as
# Nokogiri has finished reading it.
student_index_page =
  URI.parse('http://students.flatironschool.com').open { |page| Nokogiri::HTML(page) }
# Work out the selectors for the data on each page
# individual student pages | |
# social_links = student_profile.css('div.social-icons a').collect do |link| | |
# link['href'] | |
# end | |
# quote = student_profile.css("div#testimonial-slider").text | |
# name = student_profile.css("h4.ib_main_header").text | |
# biography = student_profile.css("div.services p")[0].text | |
#index page | |
# Absolute profile URL for every student, built from the href of each name
# link on the index page. Nokogiri nodes expose their attributes through #[],
# which is why anchor['href'] reads like a hash lookup.
student_links = student_index_page
  .css("div.blog-title div.big-comment h3 a")
  .map { |anchor| "http://students.flatironschool.com/#{anchor['href']}" }
# Text of the tag line paragraph shown for each student on the index page.
student_hysterical_tag_lines = student_index_page
  .css("div.blog-title p.home-blog-post-meta")
  .map(&:text)
##### Collect data for students at all the links. | |
# first i will need to know how to change the link to a new student each time. | |
# i = 0 | |
# while i < student_links.size | |
# p "#{student_links[i+1]}" | |
# i += 1 | |
# end | |
# Scrape every student profile page into a nested hash keyed by the student's
# name (as a Symbol):
#
#   { name => { quote: String,
#               biography: String,
#               social_links: { twitter:, linkedin:, github:, blog: } } }
#
# A page that cannot be fetched or parsed is reported on stdout and skipped,
# so one bad profile does not abort the whole run.
student_profiles_hash = {}
student_links.each do |student|
  begin
    # Kernel#open no longer follows URLs on Ruby 3+, so go through the URI
    # object; the block form closes the response once Nokogiri has read it.
    student_profile = URI.parse(student).open { |page| Nokogiri::HTML(page) }

    name = student_profile.css("h4.ib_main_header").text.to_sym

    # String#strip! returns nil when there is nothing to strip, which used to
    # store nil for text that was already clean; #strip always returns a String.
    quote = student_profile.css("div#testimonial-slider").text.delete("\n").strip
    biography = student_profile.css("div.services p")[0].text.delete("\n").strip

    # The social icons are read by position.
    # NOTE(review): assumes every page lists them as twitter, linkedin,
    # github, blog in that order -- confirm against the site markup.
    # A missing anchor (or one without an href) falls back to "no <network>"
    # instead of raising and discarding the whole student.
    anchors = student_profile.css('div.social-icons a')
    social_links = {}
    [:twitter, :linkedin, :github, :blog].each_with_index do |network, position|
      anchor = anchors[position]
      href = anchor && anchor['href']
      social_links[network] = href || "no #{network}"
    end

    student_profiles_hash[name] = {
      quote: quote,
      biography: biography,
      social_links: social_links
    }
  rescue StandardError => e
    # Keep going, but say what went wrong so the failure can be diagnosed.
    puts "#{student} just created an error: #{e.class}: #{e.message}"
  end
end
# student_profile.css('div.social-icons a')[0] link['href'] | |
# student_profile = Nokogiri::HTML(open(student_links[0])) | |
# collect_individual_student_data(student_links) | |
# Debugging breakpoint: pauses the script in an interactive Pry session so the
# scraped data (e.g. student_profiles_hash) can be inspected by hand.
binding.pry | |
# favorite_cities = student_profile.css("h4.ib_main_header").text | |
# we will probably want links | |
# quotes | |
# Education | |
# Hysterical tag lines | |
# Biography | |
# index Biography | |
# favorite cities | |
# quotes | |
# favorite cities | |
# name | |
# tag lines | |
# Biography | |
# links |
# Build a Student from one scraped profile. All four links live under
# :social_links, so twitter is read from there like the others (looking it up
# directly on the profile hash always returned nil).
# NOTE(review): the first argument is the whole profile hash, not the name --
# confirm against Student#initialize which one is intended.
student = Student.new(
  student_profiles_hash[name],
  *student_profiles_hash[name][:social_links].values_at(:twitter, :linkedin, :github, :blog)
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
require 'nokogiri'
require 'open-uri'
# Fetch one profile page and list the href of every social-icon link.
# The URL is opened through the URI object because Kernel#open no longer
# accepts URLs on Ruby 3+; the block form closes the response after parsing.
student_profile = URI.parse('http://students.flatironschool.com/students/emily_xie.html')
                     .open { |page| Nokogiri::HTML(page) }
student_profile.css('div.social-icons a').map { |link| link['href'] }
# Collect the href attribute of every link inside div.heat.
doc.css('div.heat a').map { |anchor| anchor['href'] }