Last active
September 27, 2015 17:31
-
-
Save KitaitiMakoto/f493229009d168b87969 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
require 'pp' | |
require 'pathname' | |
require 'open-uri' | |
require 'nokogiri' | |
require 'active_groonga' | |
require 'baby_erubis' | |
# Absolute path of the directory that contains this script; the local
# database and the downloaded pages are stored beneath it.
current_dir = Pathname.new(__dir__).expand_path

# Site to crawl and the blog index page the crawl starts from.
ORIGIN_URI = URI('http://sinap.jp')
INDEX_PAGE = ORIGIN_URI.merge('/blog/')

# Location of the database file and of the directory mirroring the site.
DATABASE_PATH = current_dir.join('database', 'sinaplog.db')
CONTENTS_DIR = current_dir.join('sinaplog')

# CSS selector used to pick the article body out of each downloaded page.
MAIN_AREA_SELECTOR = 'article .entry-body'
# Script entry point: prepares the database, mirrors the blog locally,
# loads the mirrored pages into the database and finally prints the report.
#
# @return whatever list_similar_contents returns
def main
  # The preparation steps must run in exactly this order.
  %i[setup_database download_contents load_contents].each { |step| send(step) }
  list_similar_contents
end
# Configures ActiveGroonga against DATABASE_PATH, registers the stop-word
# token filter plugin and runs the migrations defined in this file.
#
# While the schema is being defined $stdout is pointed at STDERR
# (presumably so migration output does not mix with the HTML report that is
# printed to standard output later). The previous $stdout is always restored,
# even when a migration raises.
def setup_database
  DATABASE_PATH.dirname.mkpath
  ActiveGroonga::Base.configurations = {
    'development' => {
      'database' => DATABASE_PATH,
      'encoding' => 'utf8'
    }
  }
  ActiveGroonga::Base.configure 'development'
  ActiveGroonga::Base.logger = Logger.new($stderr)
  ActiveGroonga::Base.database.ensure_available
  ActiveGroonga::Base.context.register_plugin 'token_filters/stop_word'

  # The current epoch second is used as the schema version.
  version = Time.now.to_i
  migration_file_path = Pathname(__FILE__).expand_path

  original_stdout = $stdout
  $stdout = STDERR
  begin
    ActiveGroonga::Schema.define version: version do |schema|
      [CreatePages, CreatePagesContentIndex, LoadStopWords].each do |migration|
        definitions = migration.new(version, migration_file_path, schema).migrate(:up)
        definitions.each do |definition|
          # Only some of the returned objects are definitions to apply.
          definition.define if definition.respond_to? :define
        end
      end
    end
  ensure
    # Undo the redirection even if schema definition failed.
    $stdout = original_stdout
  end
end
# Mirrors the blog into CONTENTS_DIR.
#
# Downloads the index page, follows the first entry link found on it and then
# keeps following the ".blog-previous-page" link of each entry until a page
# has no such link (or a page that was already visited in this run comes up
# again, which would otherwise loop forever because cached pages are skipped
# without downloading).
#
# @raise [RuntimeError] when the index page contains no entry link
def download_contents
  CONTENTS_DIR.mkpath
  index_path = download(INDEX_PAGE)
  # Block form of open so the file handle is closed once parsing is done.
  doc = index_path.open { |io| Nokogiri.HTML(io) }
  start_link = doc.css('html body#pagetop div.wrapper div#page.section-content.home div#page-body.section-body article.entry div.section-content div.section-complementary.entry-complementary aside div.section-content div.section-body section.entry-detail div.section-content div.section-body ul li a').first
  raise "No entry link found on #{INDEX_PAGE}" unless start_link

  path = download(start_link['href'])
  visited = { path => true }
  doc = path.open { |io| Nokogiri.HTML(io) }
  loop do
    prev_page_link = doc.css('.blog-previous-page a').first
    break unless prev_page_link

    path = download(prev_page_link['href'])
    # Guard against a cycle of "previous page" links.
    break if visited.key?(path)

    visited[path] = true
    doc = path.open { |io| Nokogiri.HTML(io) }
  end
end
# Loads every downloaded page under CONTENTS_DIR into the pages table.
#
# Files named index.html are ignored. For each remaining file a Page is
# created keyed by the page's URI on ORIGIN_URI, with the content of its
# <title> element and of the first MAIN_AREA_SELECTOR match. Files lacking
# either element are skipped with a note on standard error instead of
# aborting the whole load.
def load_contents
  Pathname.glob("#{CONTENTS_DIR}/**/*").each do |path|
    next unless path.file?
    next if path.basename.to_path == 'index.html'

    relative_path = path.relative_path_from(CONTENTS_DIR)
    # Block form of open so the file handle is closed once parsing is done.
    doc = path.open { |io| Nokogiri.HTML(io) }
    title = doc.css('title').first
    main_area = doc.css(MAIN_AREA_SELECTOR).first
    unless title && main_area
      $stderr.puts "#{relative_path} has no title or main area. Skip loading"
      next
    end

    uri = ORIGIN_URI + "/#{relative_path}"
    Page.create key: uri.to_s, title: title.content, content: main_area.content
  end
end
# Renders the template stored after __END__ with all pages and prints the
# resulting HTML to standard output.
def list_similar_contents
  template_source = DATA.read
  template = BabyErubis::Html.new.from_str(template_source)
  html = template.render(pages: Page.all)
  puts html
end
# Downloads +uri+ into CONTENTS_DIR unless a local copy already exists.
#
# The local file mirrors the URI path; a path that is empty or ends with "/"
# is stored as "index.html" inside the corresponding directory. After an
# actual download the method sleeps for one second.
#
# @param uri [String, URI] an absolute URI, or a reference that is resolved
#   against ORIGIN_URI (e.g. a relative href taken from a page)
# @return [Pathname] path of the local (downloaded or already existing) file
def download(uri)
  # Absolute URIs pass through unchanged; relative references become
  # fetchable HTTP URIs on the origin site.
  uri = ORIGIN_URI + uri.to_s
  relative_path = uri.path.sub(%r{\A/}, '')
  # No file name in the path (including the bare-host case): use index.html.
  relative_path += 'index.html' if relative_path.empty? || relative_path.end_with?('/')
  path = CONTENTS_DIR.join(relative_path)
  unless path.dirname.directory?
    $stderr.puts "Making directory: #{path.dirname}..."
    path.dirname.mkpath
  end
  if path.file?
    $stderr.puts "#{path.relative_path_from(CONTENTS_DIR)} exists. Skip downloading"
  else
    $stderr.puts "Downloading: #{uri} -> #{path}"
    path.write uri.read
    # Pause between requests to the remote server.
    sleep 1
  end
  path
end
# Migration that creates the hash-type "pages" table holding one record per
# downloaded page, with a short-text title and a text content column.
class CreatePages < ActiveGroonga::Migration
  # Creates the pages table and its columns.
  def up
    create_table(:pages, type: :hash) do |pages|
      pages.short_text('title')
      pages.text('content')
    end
  end

  # Drops the pages table.
  def down
    remove_table(:pages)
  end
end
# Migration that creates the "pages_content_index" lexicon table: an index
# over pages.content plus the boolean "is_stop_word" column that
# LoadStopWords fills in.
class CreatePagesContentIndex < ActiveGroonga::Migration
  # Creates the lexicon table with its index and stop-word flag column.
  def up
    table_options = {}
    table_options[:type] = :patricia_trie
    table_options[:normalizer] = :NormalizerAuto
    table_options[:default_tokenizer] = :TokenMecab
    table_options[:token_filters] = ['TokenFilterStopWord']

    create_table(:pages_content_index, table_options) do |lexicon|
      lexicon.index('pages.content', with_position: true)
      lexicon.boolean('is_stop_word', type: :scalar)
    end
  end

  # Drops the lexicon table.
  def down
    remove_table(:pages_content_index)
  end
end
# Migration that registers stop words in the pages_content_index table by
# adding each word as a key with is_stop_word set to true.
class LoadStopWords < ActiveGroonga::Migration
  # Tokens to flag as stop words. Each entry must be a single token, so the
  # Japanese comma and "です" are listed as two separate words.
  STOP_WORDS = %w[の は 。 、 です ます , .].freeze

  # Adds every stop word to the lexicon, flagged as a stop word.
  def up
    STOP_WORDS.each do |word|
      table.add word, is_stop_word: true
    end
  end

  # Removes every stop word from the lexicon.
  def down
    STOP_WORDS.each do |word|
      table.delete word
    end
  end

  private

  # @return the pages_content_index table looked up through Groonga
  def table
    Groonga['pages_content_index']
  end
end
# A downloaded blog page; the record key is the page URI.
class Page < ActiveGroonga::Base
  alias uri key

  # Finds the pages whose content is most similar to this page's content,
  # excluding this page itself, ordered by descending score.
  #
  # @param limit [Integer] maximum number of records to return (default 6)
  # @return the sorted result of the similarity search on the pages table
  def similar_pages(limit: 6)
    matches = Page.table.select do |page|
      page.content.similar_search(content) & (page.key != key)
    end
    matches.sort([{ key: '_score', order: 'descending' }], limit: limit)
  end
end
# Run the whole pipeline only when this file is executed directly.
main if $PROGRAM_NAME == __FILE__
__END__ | |
<!doctype html> | |
<title>SINAPlog similar contents</title> | |
<style> | |
body { | |
margin: 1em 2em; | |
line-height: 2em; | |
} | |
h2 > a { | |
color: rgb(60, 60, 60); | |
text-decoration: none; | |
} | |
h2 > a:hover, h2 > a:active { | |
text-decoration: underline; | |
} | |
li > a { | |
color: rgb(30, 30, 200); | |
} | |
li > a:hover, li > a:active { | |
color: rgba(30, 30, 200, 0.5); | |
text-decoration: none; | |
} | |
</style> | |
<h1>SINAPlog similar contents</h1> | |
<%- @pages.each do |page| -%> | |
<section> | |
<h2><a href="<%= page.uri %>"><%= page.title.gsub(' | ブログ | SINAP - 株式会社シナップ', '') %></a></h2>
<ol> | |
<%- page.similar_pages.each do |similar_page| -%> | |
<li><a href="<%= similar_page['_key'] %>"><%= similar_page.title.gsub(' | ブログ | SINAP - 株式会社シナップ', '') %></a> (score: <%= similar_page.score %>)</li>
<%- end -%> | |
</ol> | |
</section> | |
<%- end -%> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment