Skip to content

Instantly share code, notes, and snippets.

@KitaitiMakoto
Last active September 27, 2015 17:31
Show Gist options
  • Save KitaitiMakoto/f493229009d168b87969 to your computer and use it in GitHub Desktop.
Save KitaitiMakoto/f493229009d168b87969 to your computer and use it in GitHub Desktop.
# coding: utf-8
require 'pp'
require 'pathname'
require 'open-uri'
require 'nokogiri'
require 'active_groonga'
require 'baby_erubis'
# Directory that contains this script; on-disk artifacts are placed beneath it.
script_dir = Pathname(__dir__).expand_path

# Site being crawled and the blog landing page on that site.
ORIGIN_URI = URI('http://sinap.jp')
INDEX_PAGE = ORIGIN_URI.merge('/blog/')

# Location of the Groonga database file and of the downloaded pages.
DATABASE_PATH = script_dir.join('database', 'sinaplog.db')
CONTENTS_DIR = script_dir.join('sinaplog')

# CSS selector for the element whose text is stored as a page's content.
MAIN_AREA_SELECTOR = 'article .entry-body'
# Entry point: runs each stage of the pipeline in order and returns the
# result of the last stage.
def main
  %i[
    setup_database
    download_contents
    load_contents
    list_similar_contents
  ].map { |stage| send(stage) }.last
end
# Prepares the Groonga database: creates its directory, configures
# ActiveGroonga for the 'development' environment, registers the stop-word
# token filter plugin and runs the migrations defined in this file.
#
# While the migrations run, $stdout is pointed at $stderr (presumably so that
# migration progress output does not end up in the HTML report this script
# prints to stdout). The previous stream is always restored, even when a
# migration raises.
def setup_database
  DATABASE_PATH.dirname.mkpath
  ActiveGroonga::Base.configurations = {
    'development' => {
      'database' => DATABASE_PATH,
      'encoding' => 'utf8'
    }
  }
  ActiveGroonga::Base.configure 'development'
  ActiveGroonga::Base.logger = Logger.new($stderr)
  ActiveGroonga::Base.database.ensure_available
  ActiveGroonga::Base.context.register_plugin 'token_filters/stop_word'

  # A fresh timestamp is used as the schema version on every run.
  version = Time.now.to_i
  migration_file_path = Pathname(__FILE__).expand_path

  previous_stdout = $stdout
  $stdout = $stderr
  begin
    ActiveGroonga::Schema.define version: version do |schema|
      [CreatePages, CreatePagesContentIndex, LoadStopWords].each do |migration|
        definitions = migration.new(version, migration_file_path, schema).migrate(:up)
        definitions.each do |definition|
          # Only some of the returned objects need an explicit #define call.
          definition.define if definition.respond_to? :define
        end
      end
    end
  ensure
    # Restore whatever stream was active before, not unconditionally STDOUT.
    $stdout = previous_stdout
  end
end
# Downloads the blog index page, follows the first link found in the entry
# sidebar to a starting entry, then keeps following the
# '.blog-previous-page a' link until a page has none.
#
# Raises RuntimeError when the index page contains no starting link.
def download_contents
  CONTENTS_DIR.mkpath
  start_link_selector = 'html body#pagetop div.wrapper div#page.section-content.home div#page-body.section-body article.entry div.section-content div.section-complementary.entry-complementary aside div.section-content div.section-body section.entry-detail div.section-content div.section-body ul li a'

  doc = fetch_document(INDEX_PAGE)
  start_link = doc.css(start_link_selector).first
  raise "No starting entry link found on #{INDEX_PAGE}" unless start_link

  doc = fetch_document(start_link['href'])
  while (prev_page_link = doc.css('.blog-previous-page a').first)
    doc = fetch_document(prev_page_link['href'])
  end
end

# Downloads +uri+ (or reuses the local copy) and parses it with Nokogiri.
# The file is opened in block form so the handle is closed once parsing ends.
def fetch_document(uri)
  download(uri).open { |io| Nokogiri.HTML(io) }
end
# Registers every downloaded entry page as a Page record keyed by its URI.
#
# Files named index.html are ignored. A file that lacks a <title> or the
# element matched by MAIN_AREA_SELECTOR is skipped with a warning on stderr
# instead of aborting the whole load.
def load_contents
  Pathname.glob("#{CONTENTS_DIR}/**/*").each do |path|
    next unless path.file?
    next if path.basename.to_path == 'index.html'

    relative_path = path.relative_path_from(CONTENTS_DIR)
    # Block form closes the file handle as soon as the document is parsed.
    doc = path.open { |io| Nokogiri.HTML(io) }
    title_node = doc.css('title').first
    body_node = doc.css(MAIN_AREA_SELECTOR).first
    unless title_node && body_node
      $stderr.puts "Skipping #{relative_path}: title or entry body not found"
      next
    end

    # The path below CONTENTS_DIR mirrors the URI path on the origin site.
    uri = ORIGIN_URI + "/#{relative_path}"
    Page.create key: uri.to_s, title: title_node.content, content: body_node.content
  end
end
# Renders the ERB template stored after __END__ with all Page records and
# prints the result to standard output.
def list_similar_contents
  template_source = DATA.read
  renderer = BabyErubis::Html.new.from_str(template_source)
  html = renderer.render(pages: Page.all)
  puts html
end
# Downloads +uri+ into CONTENTS_DIR, mirroring the URI path, unless a local
# copy already exists.
#
# uri - a String or URI. Relative references are resolved against ORIGIN_URI.
#       An empty path or a path ending in '/' is stored as 'index.html'.
#
# Returns the Pathname of the local file.
def download(uri)
  # URI#merge returns absolute URIs unchanged and resolves relative ones.
  uri = ORIGIN_URI.merge(uri)

  relative_path = uri.path.sub(%r{\A/}, '')
  relative_path += 'index.html' if relative_path.empty? || relative_path.end_with?('/')
  path = CONTENTS_DIR.join(relative_path)

  unless path.dirname.directory?
    $stderr.puts "Making directory: #{path.dirname}..."
    path.dirname.mkpath
  end

  if path.file?
    $stderr.puts "#{path.relative_path_from(CONTENTS_DIR)} exists. Skip downloading"
  else
    $stderr.puts "Downloading: #{uri} -> #{path}"
    path.write uri.read
    # Pause between requests.
    sleep 1
  end
  path
end
# Migration that creates the hash-type +pages+ table.
class CreatePages < ActiveGroonga::Migration
  # Creates +pages+ with a short-text +title+ column and a text +content+ column.
  def up
    create_table(:pages, type: :hash) do |pages|
      pages.short_text('title')
      pages.text('content')
    end
  end

  # Drops the +pages+ table.
  def down
    remove_table(:pages)
  end
end
# Migration that creates +pages_content_index+, the table holding the
# index over +pages.content+.
class CreatePagesContentIndex < ActiveGroonga::Migration
  # Creates the patricia-trie table (auto normalizer, MeCab tokenizer,
  # stop-word token filter) with a positional index on pages.content and a
  # boolean +is_stop_word+ column.
  def up
    index_table_options = {
      type: :patricia_trie,
      normalizer: :NormalizerAuto,
      default_tokenizer: :TokenMecab,
      token_filters: ['TokenFilterStopWord']
    }
    create_table(:pages_content_index, index_table_options) do |index_table|
      index_table.index('pages.content', with_position: true)
      index_table.boolean('is_stop_word', type: :scalar)
    end
  end

  # Drops the +pages_content_index+ table.
  def down
    remove_table(:pages_content_index)
  end
end
# Migration that flags a fixed list of tokens as stop words in the
# +pages_content_index+ table.
class LoadStopWords < ActiveGroonga::Migration
  # Tokens to flag. Each entry must be whitespace-separated: "、" and "です"
  # are two separate tokens (they were previously fused into "、です").
  STOP_WORDS = %w[の は 。 、 です ます , .].freeze

  # Adds every stop word to the index table with is_stop_word set to true.
  def up
    STOP_WORDS.each do |word|
      table.add word, is_stop_word: true
    end
  end

  # Removes every stop word entry from the index table.
  def down
    STOP_WORDS.each do |word|
      table.delete word
    end
  end

  private

  # The Groonga table that holds the content index and the stop-word flag.
  def table
    Groonga['pages_content_index']
  end
end
# A downloaded blog page; the record key is the page URI.
class Page < ActiveGroonga::Base
  alias uri key

  # Returns at most six other pages whose content is similar to this page's
  # content, ordered by descending score.
  def similar_pages
    candidates = Page.table.select do |record|
      # Similar content, excluding this page itself.
      record.content.similar_search(content) & (record.key != key)
    end
    candidates.sort([{ key: '_score', order: 'descending' }], limit: 6)
  end
end
# Run the pipeline only when this file is executed directly.
main if $PROGRAM_NAME == __FILE__
__END__
<!doctype html>
<title>SINAPlog similar contents</title>
<style>
body {
margin: 1em 2em;
line-height: 2em;
}
h2 > a {
color: rgb(60, 60, 60);
text-decoration: none;
}
h2 > a:hover, h2 > a:active {
text-decoration: underline;
}
li > a {
color: rgb(30, 30, 200);
}
li > a:hover, li > a:active {
color: rgba(30, 30, 200, 0.5);
text-decoration: none;
}
</style>
<h1>SINAPlog similar contents</h1>
<%- @pages.each do |page| -%>
<section>
<h2><a href="<%= page.uri %>"><%= page.title.gsub(' | ブログ | SINAP - 株式会社シナップ', '') %></a></h2>
<ol>
<%- page.similar_pages.each do |similar_page| -%>
<li><a href="<%= similar_page['_key'] %>"><%= similar_page.title.gsub(' | ブログ | SINAP - 株式会社シナップ', '') %></a> (score: <%= similar_page.score %>)
<%- end -%>
</ol>
</section>
<%- end -%>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment