Skip to content

Instantly share code, notes, and snippets.

@krtx
Created November 4, 2012 20:58
Show Gist options
  • Save krtx/4013751 to your computer and use it in GitHub Desktop.
Save krtx/4013751 to your computer and use it in GitHub Desktop.
langrid scraper
# coding: utf-8
# 日本語でやった人の日本語のデータと、
# 英語でやった人の英語のデータを取得する
# データは打ち込んだ文(表示されてるやつ)と
# 折り返し翻訳されたものの2つ
# language = 'ja'
# $ ruby langrid.rb > ja
# language = 'en'
# $ ruby langrid.rb > en
# $ paste -d , ja en | cut -d ',' -f 1-3,6- | column -s , -t
# (column はオプション)
require 'mechanize' # 2.5.1
require 'pp'
# !!! edit here !!!
# Fill in every value below before running the script.
# uid = '772' # not used (only the disabled back-translation snippet refers to it)
userid = ''   # value submitted as 'uname' in the toolbox login form
password = '' # value submitted as 'pass' in the toolbox login form
language = '' # 'ja' or 'en'; sent as both the lang and ml_lang query parameters
# Id of the forum topic to scrape. It must have an explicit value: a bare
# `topic_id =` followed only by comments makes Ruby continue the assignment
# onto the next statement, silently binding topic_id to whatever that
# statement evaluates to.
topic_id = nil # integer
# !!! end of the section to edit !!!
agent = Mechanize.new
# Load the toolbox top page and sign in through its login form. The response
# to the form submission is not kept; later requests reuse the same agent.
top = agent.get('http://langrid.org/tools/toolbox')
login_form = top.form_with(:name => 'login_form')
# Form field name => value to submit.
{ 'uname' => userid, 'pass' => password }.each do |field_name, field_value|
  login_form.field_with(:name => field_name).value = field_value
end
login_form.click_button
# Refuse to run with an unset or malformed topic id instead of sending a
# garbage topicId to the server. Integers and all-digit strings are accepted.
unless topic_id.to_s =~ /\A\d+\z/
  abort 'topic_id must be set to the integer id of the forum topic to scrape'
end

# Open the first page of the topic in the selected language.
page = agent.get("http://langrid.org/tools/toolbox/modules/forum/?topicId=#{topic_id}&lang=#{language}&ml_lang=#{language}")

# Walk the topic page by page, printing one "name, date, text" line per post
# to stdout. Diagnostics go to stderr so redirected output stays clean.
loop do
  page.search('tr.bbs-post-header').each do |header_row|
    info = header_row.search('td.list_line01:first > a:eq(1)')
    author_link = info[0]
    # The message text is read from the row that follows the header row.
    body_row = header_row.next_element
    if author_link.nil? || body_row.nil?
      # Unexpected markup: report it and move on instead of dying on nil.
      warn 'skipping a post whose author link or body row could not be found'
      next
    end

    sname = info.inner_html.to_s.strip
    # Remove every CR and LF (not only CRLF pairs) so each post occupies
    # exactly one output line.
    text = body_row.search('td').inner_html.to_s.strip.delete("\r\n")
    # The timestamp (YYYY/M/D H:M) is taken from the node right after the
    # author link; date is nil when nothing there matches.
    date = author_link.next_sibling.to_s.strip.scan(/[\d]{4}\/[\d]{1,2}\/[\d]{1,2} [\d]{1,2}:[\d]{1,2}/)[0]

    # --- disabled, unfinished: round-trip (back) translation ---
    # TODO: make this work. The snippet is kept as it was left: it is missing
    # a closing `end` and relies on the commented-out `uid` setting. The `sid`
    # lookup lives here because nothing else uses it, and its `[0][0]` raises
    # when the href does not match the pattern.
    #
    #   sid = author_link['href'].scan(/http:\/\/langrid\.org\/tools\/toolbox\/userinfo\.php\?uid=([\d]+)/)[0][0]
    #   if sid == uid
    #     formname = header_row.search('td.list_line01 form')[0]['name']
    #     if /^editMessageLinkForm/ =~ formname
    #       page.form_with(:name => formname) do |form|
    #         form.click_button
    #       end
    #     end

    puts "#{sname}, #{date}, #{text}"
  end

  # Follow the last pager link whose markup contains "Next"; stop when there
  # is no such link or it carries no href.
  next_anchor = page.search('div.page_index li a').select { |a| /Next/ =~ a.inner_html }.last
  next_link = next_anchor && next_anchor['href']
  break unless next_link

  page = agent.get(next_link)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment