Skip to content

Instantly share code, notes, and snippets.

@skuroki
Created December 6, 2012 12:12
Show Gist options
  • Save skuroki/4224022 to your computer and use it in GitHub Desktop.
Save skuroki/4224022 to your computer and use it in GitHub Desktop.
2ch crawler
# coding: utf-8
require 'nokogiri'
require 'mongo'
require 'open-uri'
require 'pp'
# Fetches the board's thread index page and returns the numeric ids of the
# threads to crawl.
#
# The downloaded bytes are treated as CP932 and transcoded to UTF-8. Bytes
# that cannot be converted are replaced rather than raising, the same way
# extract_posts handles them, so a single stray byte in the index does not
# abort the whole crawl cycle.
#
# Only links matched by the CSS selector '#trad a' whose first child node
# contains "スレ" are kept. Each id is the first '/'-separated segment of the
# link's href, converted with to_i.
#
# @return [Array<Integer>] thread ids in the order they appear on the page
def extract_threads
  # URI#read (provided by open-uri) closes the underlying IO itself, whereas a
  # bare open(url).read leaves it open, and it does not depend on Kernel#open
  # accepting URLs.
  raw = URI.parse('http://hayabusa3.2ch.net/appli/subback.html').read
  html = raw.force_encoding('cp932').encode('utf-8', invalid: :replace, undef: :replace)
  links = Nokogiri::HTML.parse(html).css('#trad a')
  wanted = links.select { |node| node.children.first.to_s =~ /スレ/ }
  wanted.map { |node| node['href'].split('/').first.to_i }
end
# Downloads one thread page, starting at post number `from`, and converts the
# posts on it into hashes.
#
# The response is treated as CP932 and transcoded to UTF-8 with unconvertible
# bytes replaced. Every element matched by '.thread dd' is paired with its
# previous sibling node, which is expected to be the <dt> header carrying the
# post number and a "YYYY/MM/DD ... HH:MM:SS" timestamp. Entries whose header
# does not match that pattern are dropped.
#
# Posts numbered 1 or lower are dropped as well.
# NOTE(review): presumably the server repeats post 1 in ranged responses and
# this avoids re-storing it; it also means post 1 is never returned, even
# when from == 1. Confirm that this is intended.
#
# @param thread_id [Integer] thread id, as returned by extract_threads
# @param from [Integer] first post number to request
# @return [Array<Hash>] one hash per post with keys :thread_id, :post_id,
#   :posted_at (Time) and :body (Array<String>, the post's inner HTML split
#   on '<br>')
def extract_posts(thread_id, from)
  url = "http://hayabusa3.2ch.net/test/read.cgi/appli/#{thread_id}/#{from}-"
  # URI#read (provided by open-uri) closes the underlying IO itself, whereas a
  # bare open(url).read leaves it open, and it does not depend on Kernel#open
  # accepting URLs.
  raw = URI.parse(url).read
  html = raw.force_encoding('cp932').encode('utf-8', invalid: :replace, undef: :replace)

  parsed_posts = Nokogiri::HTML.parse(html).css('.thread dd').map do |dd|
    header = dd.previous
    match_data = header.to_s.match(/<dt>(\d*).*(\d{4})\/(\d{2})\/(\d{2}).*(\d{2}):(\d{2}):(\d{2})/)
    next unless match_data

    post_id, year, month, day, hour, minute, second = match_data.captures.map(&:to_i)
    next unless post_id > 1

    # Strip the wrapping <dd> tags and newlines, then split into lines.
    body = dd.to_s.gsub(/<\/?dd>/, '').delete("\n").split('<br>')
    # Time.local interprets the fields in this process's time zone.
    # NOTE(review): the page timestamps are presumably JST; confirm the
    # crawler runs in that zone.
    posted_at = Time.local(year, month, day, hour, minute, second)
    { thread_id: thread_id, post_id: post_id, posted_at: posted_at, body: body }
  end
  parsed_posts.compact
end
# Main crawl loop. Each cycle:
#   1. connects to MongoDB (database "hogehoge", collection "posts"),
#   2. fetches the current thread ids once,
#   3. for every thread, looks up the highest post_id already stored and
#      requests everything after it,
#   4. inserts the newly extracted posts,
# then sleeps 300 seconds. Any StandardError raised during a cycle is printed
# and the loop carries on with the next cycle.
loop do
  begin
    puts "#{Time.now} crawling..."
    connection = Mongo::Connection.new
    begin
      posts = connection.db("hogehoge").collection("posts")
      # Fetch the thread list exactly once per cycle so every id is paired
      # with its own high-water mark; a second fetch could return a different
      # list and misalign the two.
      extract_threads.each do |thread_id|
        last = posts.find(thread_id: thread_id).map { |post| post['post_id'] }.max || 0
        # Threads that already have a stored post numbered 1000 or above are
        # skipped. NOTE(review): presumably 1000 is the board's per-thread
        # post cap; confirm.
        next if last >= 1000

        from = last + 1
        puts "extracting post thread:#{thread_id}, range:#{from}-"
        extracted_posts = extract_posts(thread_id, from)
        puts "got #{extracted_posts.size} posts"
        extracted_posts.each { |post| posts.insert(post) }
      end
    ensure
      # A new connection is opened every cycle, so release it every cycle.
      connection.close
    end
  rescue => e
    puts e.message
  end
  sleep 300
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment