Last active
December 24, 2015 19:49
-
-
Save mecampbellsoup/6853829 to your computer and use it in GitHub Desktop.
student profile scraper!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require_relative 'student_profiles_rake' | |
| require 'pry' | |
# Scrapes every student profile and stores each one in the database.
#
# Asks Student to create its table, then walks the profile paths returned by
# Student.get_students: each path becomes a Student that is scraped and saved.
# Returns whatever `each` returns for the list of profile paths.
def students_scrape_loop
  Student.make_schema

  profile_paths = Student.get_students
  profile_paths.each do |profile_path|
    # tap runs the scrape for its side effects and hands the same object on to be saved
    Student.new(profile_path).tap(&:scrape_data).save_to_database
  end
end
# Script entry point: run the full scrape, then open the sqlite3 command-line
# shell on the database file named by Student::DatabaseExtension.
students_scrape_loop

# Program and argument are passed separately so the filename reaches sqlite3
# verbatim and is never parsed by a shell.
system('sqlite3', Student::DatabaseExtension)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require 'nokogiri' | |
| require 'open-uri' | |
| require 'pry' | |
| require 'awesome_print' | |
# Scrapes one student's profile page from students.flatironschool.com.
#
# A Student is built from a piece of text expected to appear in the student's
# profile href. #scrape_data resolves that text against the index page,
# downloads the profile and fills in the attributes; #show_data prints them.
class Student
  attr_accessor :student_ext, :student_url, :student_doc, :name,
                :social_links, :education, :quote, :full_name,
                :personal_projects, :work, :biography, :code_profiles

  # @param name [String] text that must occur in the student's profile href
  def initialize(name)
    @name = name
  end

  # Downloads the index page and collects each profile link's href.
  #
  # @return [Array<String>] the href of every `div.blog-thumb a` element
  def get_students
    fetch_document('http://students.flatironschool.com/')
      .css('div.blog-thumb a')
      .map { |link| link.attr('href') }
  end

  # Resolves this student's profile page, downloads it and fills in every
  # scraped attribute.
  #
  # @raise [ArgumentError] when no href on the index page contains +name+
  # @return [String, nil] the value assigned to +code_profiles+
  def scrape_data
    # `find` takes the first matching href. The earlier select + join glued
    # several matches into one bogus path and produced "" (never nil) when
    # nothing matched, so the index page itself ended up being scraped.
    self.student_ext = get_students.find { |href| href.to_s.include?(name) }
    raise ArgumentError, "no student profile matches #{name.inspect}" unless student_ext

    self.student_url = "http://students.flatironschool.com/#{student_ext}"
    self.student_doc = fetch_document(student_url)

    self.full_name = scrape_full_name
    self.social_links = scrape_social_links
    self.education = scrape_education
    self.personal_projects = scrape_personal_projects
    self.quote = scrape_quote
    self.work = scrape_work
    self.biography = scrape_biography
    self.code_profiles = scrape_code_profiles
  end

  # Prints the scraped attributes, one labelled line each, via awesome_print.
  def show_data
    ap "Name: #{full_name}"
    ap "Social: #{social_links}"
    ap "Education: #{education}"
    ap "Projects: #{personal_projects}"
    ap "Quote: #{quote}"
    ap "Work: #{work}"
    ap "Bio: #{biography}"
    ap "Coder Cred: #{code_profiles}"
  end

  # @return [String] text of the children of `h4.ib_main_header`
  def scrape_full_name
    student_doc.css('h4.ib_main_header').children.text
  end

  # @return [String] comma-separated social hrefs, skipping missing hrefs and "#"
  def scrape_social_links
    student_doc.css('div.social-icons a')
               .map { |link| link.attr('href') }
               .reject { |href| href.nil? || href == '#' }
               .join(', ')
  end

  # Stores and returns the "personal projects" paragraphs joined by ", ".
  # When the page has no such section the current value is returned unchanged.
  def scrape_personal_projects
    section = section_container('personal projects')
    self.personal_projects = joined_text(section, 'p') if section
    personal_projects
  end

  # Stores and returns the stripped text of the "education" section's lists.
  # When the page has no such section the current value is returned unchanged.
  def scrape_education
    section = section_container('education')
    self.education = section.css('ul').text.strip if section
    education
  end

  # @return [String] text of the `h3` elements inside `div.textwidget`
  def scrape_quote
    student_doc.css('div.textwidget h3').text
  end

  # Stores and returns the "work" section: its paragraphs, or its list items
  # when the paragraphs are blank. Unchanged when the section is missing.
  def scrape_work
    section = section_container('work')
    self.work = paragraphs_or_list_items(section) if section
    work
  end

  # Stores and returns the "biography" section: its paragraphs, or its list
  # items when the paragraphs are blank. Unchanged when the section is missing.
  def scrape_biography
    section = section_container('biography')
    self.biography = paragraphs_or_list_items(section) if section
    biography
  end

  # Stores and returns the hrefs linked from the "coding profiles" section,
  # joined by ", ". Unchanged when the section is missing.
  def scrape_code_profiles
    section = section_container('coding profiles')
    self.code_profiles = section.css('p a').map { |link| link.attr('href') }.join(', ') if section
    code_profiles
  end

  private

  # Downloads +url+ and parses it as HTML. The block form closes the download
  # as soon as parsing finishes; URI#open comes from open-uri and, unlike
  # Kernel#open, still accepts URLs on Ruby 3.0 and later.
  def fetch_document(url)
    URI.parse(url).open { |page| Nokogiri::HTML(page) }
  end

  # Finds the grandparent of the last `h3` whose stripped, lowercased text
  # equals +title+; returns nil when no heading matches.
  def section_container(title)
    heading = student_doc.css('h3').select { |h3| h3.text.strip.downcase == title }.last
    heading && heading.parent.parent
  end

  # Joins the text of every +selector+ match inside +container+ with ", ".
  def joined_text(container, selector)
    container.css(selector).map(&:text).join(', ')
  end

  # Returns the stripped paragraph text, or the list-item text when the
  # paragraphs are blank.
  def paragraphs_or_list_items(container)
    paragraphs = joined_text(container, 'p').strip
    paragraphs.empty? ? joined_text(container, 'li') : paragraphs
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require 'nokogiri' | |
| require 'open-uri' | |
| require 'pry' | |
| require 'awesome_print' | |
| require 'sqlite3' | |
# Scrapes one student's profile page from students.flatironschool.com and
# stores the result as a row in a local SQLite database.
#
# Loading this class opens (creating if necessary) the database file named by
# DatabaseExtension in the current working directory.
class Student
  DatabaseExtension = 'flatiron_student_profiles.db'
  StudentsDatabase = SQLite3::Database.new(DatabaseExtension)

  attr_accessor :student_doc, :url,
                :social_links, :education, :quote, :full_name,
                :personal_projects, :work, :biography, :code_profiles

  # Creates the `students` table unless it already exists.
  def self.make_schema
    StudentsDatabase.execute <<-SQL
      CREATE TABLE IF NOT EXISTS students(
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT,
        social_links TEXT,
        education TEXT,
        quote TEXT,
        work TEXT,
        biography TEXT,
        code_profiles TEXT,
        personal_projects TEXT
      );
    SQL
  end

  # Downloads the index page and collects each profile link's href.
  #
  # @return [Array<String>] the href of every `div.blog-thumb a` element
  def self.get_students
    fetch_document('http://students.flatironschool.com/')
      .css('div.blog-thumb a')
      .map { |link| link.attr('href') }
  end

  # Downloads +url+ and parses it as HTML. The block form closes the download
  # as soon as parsing finishes; URI#open comes from open-uri and, unlike
  # Kernel#open, still accepts URLs on Ruby 3.0 and later.
  def self.fetch_document(url)
    URI.parse(url).open { |page| Nokogiri::HTML(page) }
  end

  # @param url [String] profile path relative to the site root, as returned
  #   by Student.get_students
  def initialize(url)
    # One listed path is remapped to a different page
    # (presumably a broken link on the index page - TODO confirm).
    @url = if url == 'students/stephanie_name.html'
             'http://students.flatironschool.com/students/stephanie_oh.html'
           else
             "http://students.flatironschool.com/#{url}"
           end
  end

  # Inserts the scraped attributes as one row of the `students` table.
  #
  # Values are bound through `?` placeholders rather than interpolated into
  # the statement, so quotes or SQL fragments in scraped text can neither
  # break nor alter the query. `to_s` keeps unset attributes stored as empty
  # strings, which is what interpolating nil used to produce.
  def save_to_database
    row = [full_name, social_links, education, personal_projects,
           quote, work, biography, code_profiles].map(&:to_s)

    StudentsDatabase.execute(
      'INSERT INTO students ' \
      '(name, social_links, education, personal_projects, quote, work, biography, code_profiles) ' \
      'VALUES (?, ?, ?, ?, ?, ?, ?, ?);',
      row
    )
  end

  # Downloads the profile page at +url+ and fills in every scraped attribute.
  #
  # @return [String, nil] the value assigned to +code_profiles+
  def scrape_data
    self.student_doc = self.class.fetch_document(url)

    self.full_name = scrape_full_name
    self.social_links = scrape_social_links
    self.education = scrape_education
    self.personal_projects = scrape_personal_projects
    self.quote = scrape_quote
    self.work = scrape_work
    self.biography = scrape_biography
    self.code_profiles = scrape_code_profiles
  end

  # Prints the scraped attributes, one labelled line each, via awesome_print.
  def show_data
    ap "Name: #{full_name}"
    ap "Social: #{social_links}"
    ap "Education: #{education}"
    ap "Projects: #{personal_projects}"
    ap "Quote: #{quote}"
    ap "Work: #{work}"
    ap "Bio: #{biography}"
    ap "Coder Cred: #{code_profiles}"
  end

  # Double quotes are removed from the name. Binding parameters makes this
  # unnecessary for SQL safety; it is kept so stored values do not change.
  #
  # @return [String] text of the children of `h4.ib_main_header`, without `"`
  def scrape_full_name
    student_doc.css('h4.ib_main_header').children.text.delete('"')
  end

  # @return [String] comma-separated social hrefs, skipping missing hrefs and "#"
  def scrape_social_links
    student_doc.css('div.social-icons a')
               .map { |link| link.attr('href') }
               .reject { |href| href.nil? || href == '#' }
               .join(', ')
  end

  # Stores and returns the "personal projects" paragraphs joined by ", ".
  # When the page has no such section the current value is returned unchanged.
  def scrape_personal_projects
    section = section_container('personal projects')
    self.personal_projects = joined_text(section, 'p') if section
    personal_projects
  end

  # Stores and returns the stripped text of the "education" section's lists.
  # When the page has no such section the current value is returned unchanged.
  def scrape_education
    section = section_container('education')
    self.education = section.css('ul').text.strip if section
    education
  end

  # Double quotes are removed, as in #scrape_full_name.
  #
  # @return [String] text of the `h3` elements inside `div.textwidget`, without `"`
  def scrape_quote
    student_doc.css('div.textwidget h3').text.delete('"')
  end

  # Stores and returns the "work" section: its paragraphs, or its list items
  # when the paragraphs are blank. Unchanged when the section is missing.
  def scrape_work
    section = section_container('work')
    self.work = paragraphs_or_list_items(section) if section
    work
  end

  # Stores and returns the "biography" section: its paragraphs, or its list
  # items when the paragraphs are blank. Unchanged when the section is missing.
  def scrape_biography
    section = section_container('biography')
    self.biography = paragraphs_or_list_items(section) if section
    biography
  end

  # Stores and returns the hrefs linked from the "coding profiles" section,
  # joined by ", ". Unchanged when the section is missing.
  def scrape_code_profiles
    section = section_container('coding profiles')
    self.code_profiles = section.css('p a').map { |link| link.attr('href') }.join(', ') if section
    code_profiles
  end

  private

  # Finds the grandparent of the last `h3` whose stripped, lowercased text
  # equals +title+; returns nil when no heading matches.
  def section_container(title)
    heading = student_doc.css('h3').select { |h3| h3.text.strip.downcase == title }.last
    heading && heading.parent.parent
  end

  # Joins the text of every +selector+ match inside +container+ with ", ".
  def joined_text(container, selector)
    container.css(selector).map(&:text).join(', ')
  end

  # Returns the stripped paragraph text, or the list-item text when the
  # paragraphs are blank.
  def paragraphs_or_list_items(container)
    paragraphs = joined_text(container, 'p').strip
    paragraphs.empty? ? joined_text(container, 'li') : paragraphs
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment