Created
November 4, 2012 20:58
-
-
Save krtx/4013751 to your computer and use it in GitHub Desktop.
langrid scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
# Fetches the Japanese data of users who posted in Japanese and
# the English data of users who posted in English.
# The data consists of two things: the sentence as typed (the one displayed)
# and its back-translation.
# language = 'ja'
# $ ruby langrid.rb > ja
# language = 'en'
# $ ruby langrid.rb > en
# $ paste -d , ja en | cut -d ',' -f 1-3,6- | column -s , -t
# (column is optional)
require 'mechanize' # 2.5.1 | |
require 'pp' | |
# !!! edit here !!!
# Settings the user must fill in before running the script.
# uid = '772' # unused
userid = ''
password = ''
language = '' # 'ja' or 'en'
# Must be an explicit nil until filled in: a bare `topic_id =` with nothing on
# its right-hand side makes Ruby continue the assignment onto the next
# statement, silently binding topic_id to whatever that statement evaluates to.
topic_id = nil # integer
# !!! end of editable section !!!

# Report settings that were left blank so that a failed login or an empty
# result can be traced back to the configuration. This only warns on stderr;
# the script still proceeds as before.
missing = [['userid', userid], ['password', password],
           ['language', language], ['topic_id', topic_id]]
missing = missing.select { |_, value| value.nil? || value.to_s.empty? }
missing = missing.map { |name, _| name }
warn "langrid scraper: unset settings: #{missing.join(', ')}" unless missing.empty?
# Submit the site's login form with the configured credentials. The same
# `agent` is reused for the forum requests later in the script.
agent = Mechanize.new
top = agent.get('http://langrid.org/tools/toolbox')
login_form = top.form_with(:name => 'login_form')
# Fail with a clear message instead of a NoMethodError on nil when the form
# cannot be found (e.g. the page layout changed).
raise "login form 'login_form' not found on #{top.uri}" if login_form.nil?
login_form.field_with(:name => 'uname').value = userid
login_form.field_with(:name => 'pass').value = password
login_form.click_button
# Extracts one post from a 'tr.bbs-post-header' row.
#
# Returns [name, date, text]:
#   name - inner HTML of the link matched in the header row, stripped
#   date - first "YYYY/M/D H:M"-shaped substring of the node following that
#          link, or nil when none is found
#   text - inner HTML of the cells in the row following the header, with
#          CRLF sequences removed
# Returns nil when the header has no such link or no following row, so the
# caller can skip the row instead of crashing on nil.
def extract_post(header_row)
  info = header_row.search('td.list_line01:first > a:eq(1)')
  author_link = info[0]
  body_row = header_row.next_element
  return nil if author_link.nil? || body_row.nil?

  name = info.inner_html.to_s.strip
  date = author_link.next_sibling.to_s.strip[/[\d]{4}\/[\d]{1,2}\/[\d]{1,2} [\d]{1,2}:[\d]{1,2}/]
  text = body_row.search('td').inner_html.to_s.strip.gsub(/\r\n/, '')
  [name, date, text]
end

# Returns the href of the last pager link whose inner HTML contains "Next",
# or nil when the page has no such link.
def next_page_href(page)
  next_links = page.search('div.page_index li a').select { |a| /Next/ =~ a.inner_html }
  next_links.empty? ? nil : next_links.last['href']
end

# TODO: back-translation retrieval is not implemented. The abandoned draft
# extracted the poster's uid from the author link's href
# (.../userinfo.php?uid=N), compared it with the configured uid and, for
# matching posts, clicked the form whose name starts with
# 'editMessageLinkForm'.

# Walk the topic page by page, printing one "name, date, text" line per post,
# until a page has no "Next" link.
page = agent.get("http://langrid.org/tools/toolbox/modules/forum/?topicId=#{topic_id}&lang=#{language}&ml_lang=#{language}")
loop do
  page.search('tr.bbs-post-header').each do |header_row|
    post = extract_post(header_row)
    next if post.nil?

    name, date, text = post
    puts "#{name}, #{date}, #{text}"
  end

  next_href = next_page_href(page)
  break unless next_href

  page = agent.get(next_href)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment