Skip to content

Instantly share code, notes, and snippets.

@sasamijp
Created August 6, 2014 17:12
Show Gist options
  • Save sasamijp/702cd781aebdc97deae1 to your computer and use it in GitHub Desktop.
Save sasamijp/702cd781aebdc97deae1 to your computer and use it in GitHub Desktop.
http://ssimas.blog.fc2.com/ のスレッドの本文を抽出する
# -*- encoding: utf-8 -*-
require 'nokogiri'
require "open-uri"
require './SSparser.rb'
require './db_manager.rb'
# http://ssimas.blog.fc2.com/
#url = 'http://ssimas.blog.fc2.com/blog-entry-3218.html#more'
# Ask for the article URL on stdin. `gets` keeps the trailing newline and
# returns nil at EOF, so normalise the input before treating it as a URL.
puts "input url"
url = gets.to_s.strip
abort "no url given" if url.empty?

# Fetch through open-uri via the URI object rather than Kernel#open:
# Kernel#open runs input that starts with "|" as a shell command, and it no
# longer opens URLs at all on Ruby 3.0+.
uri = URI.parse(url)
abort "cannot open #{url.inspect}: not an http(s)/ftp url" unless uri.respond_to?(:open)

charset = nil
html = uri.open do |f|
  charset = f.charset # handed to Nokogiri below so the page is decoded correctly
  f.read
end
# Accumulates the body text of the thread with the post headers removed.
thread = ''
doc = Nokogiri::HTML.parse(html, nil, charset)

# The article text lives in <div class="ently_text"> (spelling as used by the
# page). Stop with a clear message instead of a NoMethodError on nil when the
# page does not contain it.
body = doc.at_xpath('//div[@class="ently_text"]')
abort 'no <div class="ently_text"> found in the page' if body.nil?

# Post header shape: "<digits> :<anything>:<d>/<d>/<d>(<c>) <d>:<d>:<d>.<d> ID:<9 chars>".
# One variant is anchored at the start of a segment, the other at its end.
header_at_start = /^\d+ :.*:\d+\/\d+\/\d+\(.\) \d+:\d+:\d+.\d+ ID:........./
header_at_end = /\d+ :.*:\d+\/\d+\/\d+\(.\) \d+:\d+:\d+.\d+ ID:.........$/

body.text.split("\n").each do |line|
  # Everything from the first line containing "document.write" onwards is ignored.
  break if line.include?("document.write")

  # Insert a "__split__" marker after the 10th character that follows each
  # "ID" (i.e. after "ID:" plus 9 characters), so a header and the text that
  # follows it end up in different segments. Seeing another "ID" before the
  # countdown finishes restarts the countdown.
  marked = ''
  previous = nil
  countdown = 0
  line.each_char do |char|
    countdown = 11 if char == 'D' && previous == 'I'
    countdown -= 1
    marked << char
    marked << "__split__" if countdown == 0
    previous = char
  end

  marked.split("__split__").each do |segment|
    # Drop the rest of this line once the "転載元:" marker shows up.
    break if segment.include?("転載元:")
    # A segment that begins with a header is discarded entirely.
    next if segment.match(header_at_start)

    # A header at the end of a segment is cut off; only the text before it is kept.
    trailing_header = segment.match(header_at_end)
    thread << (trailing_header ? trailing_header.pre_match : segment)
  end
end
# Tokenise the extracted text: split on whitespace, then on any remaining
# carriage returns, and drop empty tokens.
ss = thread.split(' ').flat_map { |token| token.split("\r") }.reject(&:empty?)

parser = SSparser.new
# Parsing is deferred so nothing is parsed when the answer below is not
# recognised. Only entries whose :name is '春香' are kept.
haruka_lines = -> { parser.parse(ss).delete_if { |entry| entry[:name] != '春香' } }

puts "puts or insert?"
# `gets` returns nil at EOF; treat that as an empty answer instead of crashing.
output = gets.to_s.chomp
case output
when "puts", "p"
  puts haruka_lines.call
when "insert", "i"
  db = DbManager.new
  db.insert "amami.db", haruka_lines.call
else
  # Previously an unrecognised answer ended the script without any feedback.
  warn "unknown choice #{output.inspect}: expected puts/p or insert/i"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment