Created
February 5, 2012 07:51
-
-
Save MelanieS/1743882 to your computer and use it in GitHub Desktop.
Lists the names, URLs, and text of all of a HubPages user's hubs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#http://www.codegurl.com/2012/02/disguising-load-times-w-information.html | |
require 'nokogiri' | |
require 'open-uri' | |
# Builds the URL of a HubPages user's "latest" hubs listing.
#
# @param username [String] HubPages username, used as the subdomain
# @return [String] URL of the user's latest-hubs page
def create_base_url(username)
  "http://#{username}.hubpages.com/hubs/latest"
end
# Lists the URLs of every index page of a user's "latest" hubs listing.
#
# Downloads base_url, takes the hub count from the third whitespace-separated
# token of the text inside the span with class "range", and builds one
# "?page=N" URL per index page.
#
# @param base_url [String] URL of the first listing page
# @param username [String] HubPages username, used as the subdomain
# @return [Array<String>] index page URLs, highest page number first
def get_index_pages(base_url, username)
  hubs_per_page = 10
  # URI.open instead of Kernel#open: open-uri no longer hooks Kernel#open on
  # Ruby 3.0+. The block form closes the downloaded stream after parsing.
  doc = URI.open(base_url) { |page| Nokogiri::HTML(page) }
  range = doc.xpath('//span[@class="range"]').inner_text
  # A missing third token yields nil, and nil.to_i is 0.
  number_of_hubs = range.split(' ')[2].to_i
  # Ceiling division, so an exact multiple of the page size does not add an
  # empty trailing page. Page 1 is always kept in case no count could be read.
  number_of_index_pages = [(number_of_hubs + hubs_per_page - 1) / hubs_per_page, 1].max
  number_of_index_pages.downto(1).map do |page_number|
    "http://#{username}.hubpages.com/hubs/latest?page=#{page_number}"
  end
end
# Collects the hub links found on each index page.
#
# For every index page URL, downloads the page and takes the href of each
# <a> directly inside a div with class "hub_pic".
#
# @param index_list [Array<String>] index page URLs to scan
# @return [Array<String>] hrefs in the order encountered, page by page
def get_hub_urls(index_list)
  index_list.flat_map do |index_url|
    # URI.open instead of Kernel#open: open-uri no longer hooks Kernel#open on
    # Ruby 3.0+. The block form closes the downloaded stream after parsing.
    doc = URI.open(index_url) { |page| Nokogiri::HTML(page) }
    doc.xpath('//div[@class="hub_pic"]/a').map { |link| link['href'] }
  end
end
# Downloads each hub and gathers the text of its content modules.
#
# @param hub_urls [Array<String>] hub page URLs
# @return [Hash{String => String}] page <title> text mapped to the
#   concatenated module text; hubs sharing a title overwrite each other
def pull_text(hub_urls)
  # Concatenation order: main text (color0), blue text (color2),
  # grey text (color1), then table text (color0).
  text_queries = [
    '//div[@class="module moduleText color0"]',
    '//div[@class="module moduleText color2"]',
    '//div[@class="module moduleText color1"]',
    '//div[@class="module moduleTable color0"]'
  ]

  hub_urls.each_with_object({}) do |hub_url, hubs|
    # URI.open instead of Kernel#open: open-uri no longer hooks Kernel#open on
    # Ruby 3.0+. The block form closes the downloaded stream after parsing.
    doc = URI.open(hub_url) { |page| Nokogiri::HTML(page) }
    title = doc.search('title').inner_text
    hubs[title] = text_queries.map { |query| doc.xpath(query).inner_text }.join
  end
end
# Splits the text of every hub into a list of words.
#
# For each value: commas are removed, spaces become the separator, newlines
# are removed, and the result is split on the separator. The input hash is
# left untouched; a new hash is returned.
#
# NOTE(review): newlines are deleted rather than treated as separators, so
# words on either side of a line break are joined together. Confirm this is
# intended.
#
# @param hubtxt_hash [Hash{String => String}] hub title mapped to hub text
# @return [Hash{String => Array<String>}] hub title mapped to its words
def clean_text(hubtxt_hash)
  # Hash#each yields [key, value] pairs; destructure them so the string
  # operations run on the text, and store the result instead of only
  # re-assigning the block variable.
  hubtxt_hash.each_with_object({}) do |(title, text), cleaned|
    cleaned[title] = text.delete(",").gsub(" ", ",").delete("\n").split(",")
  end
end
# Script entry point: asks for a HubPages username, then gathers the text of
# every hub listed for that user.
puts "Enter HubPages username:"
# Read from $stdin explicitly (Kernel#gets reads from files named in ARGV
# when arguments are given) and tolerate end-of-input, where gets returns nil.
username = $stdin.gets.to_s.chomp
abort "No username entered." if username.empty?

base_url = create_base_url(username)
index_pages = get_index_pages(base_url, username)
hub_urls = get_hub_urls(index_pages)
# Hash of hub title => hub text; the script does not print it.
text = pull_text(hub_urls)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment