Created
September 11, 2012 20:32
-
-
Save 0xGGGGG/3701780 to your computer and use it in GitHub Desktop.
Simple script to scrape Hurriyet's posts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "rubygems" | |
require "mechanize" | |
require "cgi" | |
require "csv" | |
require "active_record" | |
require "./lib/colorize_output" | |
# Builds the hurriyet.com.tr index URL for one author.
#
# @param yazar [#text] object whose +text+ is the author's display name
#   (the script passes options of the "ddYazarlar" dropdown)
# @return [String] index URL whose last segment is the name lower-cased,
#   with spaces replaced by underscores, then CGI-escaped
def url_for(yazar)
  # Non-bang downcase on purpose: downcase! returns nil when the text is
  # already lower-case (breaking the chain) and mutates the caller's string.
  slug = yazar.text.downcase.tr(' ', '_')
  "http://www.hurriyet.com.tr/index/#{CGI.escape(slug)}"
end
# Collects the links on the agent's current page that point at archived posts.
#
# @param agent [#page] agent whose current page responds to +links_with+
# @return whatever +links_with+ returns for hrefs matching the archive
#   article URL (hurarsiv.hurriyet.com.tr/goster/haber.aspx)
def post_links_of(agent)
  # Dots are escaped so they only match a literal "."; unescaped they would
  # match any character and accept look-alike hosts/paths.
  agent.page.links_with(href: %r{http://hurarsiv\.hurriyet\.com\.tr/goster/haber\.aspx})
end
# Tells whether the agent's current page looks like a complete post.
#
# All three selectors (post body, author name, post title) are looked up;
# the page validates only when none of the lookups came back nil.
#
# @param agent [#page] agent whose current page responds to +at+
# @return [Boolean] true when every required element is present
def post_validates_with?(agent)
  required_selectors = [".hurtextverdana", ".yazarisim", ".haberdevambaslik"]
  found_nodes = required_selectors.map { |selector| agent.page.at(selector) }
  found_nodes.none?(&:nil?)
end
# Extracts the fields of the post shown on the agent's current page.
#
# The author, title and body elements are expected to exist (the script
# checks them with post_validates_with? first). The date element
# (".muhabir") is not part of that check, so it is treated as optional here.
#
# @param agent [#page] agent whose current page responds to +at+
# @return [Array] [author_name, post_date, post_title, post_body];
#   post_date is nil when the page has no ".muhabir" element
def current_post_array_from(agent)
  page = agent.page

  author_name = page.at(".yazarisim").text
  post_title = page.at(".haberdevambaslik").text

  # A missing date no longer raises NoMethodError on nil.
  date_node = page.at(".muhabir")
  post_date = date_node ? date_node.text : nil

  # Look the body container up once; the body is the text of its <b>
  # elements followed by the text of its <p> elements.
  body_node = page.at(".hurtextverdana")
  post_body = body_node.search("b").text + body_node.search("p").text

  [author_name, post_date, post_title, post_body]
end
# Scrape driver: reads the author dropdown from the archive page, visits each
# author's index page, follows every archived-post link found there, and
# appends each post that validates to data/posts.csv.
agent = Mechanize.new

authors_url = "http://hurarsiv.hurriyet.com.tr/yazarlar/"
# Log the URL that is actually requested.
puts green("http get > #{authors_url}")
# Get the list of all authors from the "ddYazarlar" dropdown on that page.
agent.get(authors_url)
# The first option is skipped -- presumably a placeholder entry; TODO confirm.
yazarlar = agent.page.forms.first.field("ddYazarlar").options.drop(1)

post_count = 0

# NOTE(review): ISO8859-1 may not be able to represent Turkish letters such as
# "ş", "ğ" and "ı"; confirm the intended output encoding (ISO-8859-9 / UTF-8?).
CSV.open("data/posts.csv", "w", encoding: "ISO8859-1") do |csv|
  yazarlar.each do |yazar|
    begin
      agent.get url_for(yazar)

      post_links_of(agent).each do |link|
        puts yellow("http get > #{link.href}")
        link.click

        if post_validates_with?(agent)
          post = current_post_array_from(agent)
          post_count += 1
          puts " >> [#{post_count}]: #{post[0]}, #{post[1]}, #{post[2]}, #{post[3]}"
          csv << post
        end

        # Drop the post page from the history so it does not pile up.
        agent.history.pop
      end
    rescue StandardError => e
      # Still best-effort per author (a failure skips the rest of this
      # author's posts), but the reason is reported instead of being
      # silently swallowed.
      warn "skipping remaining posts of #{yazar.text}: #{e.class}: #{e.message}"
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment