-
-
Save weirdpercent/7205512 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Runs with Picky 3.3.1.
#
# Note: You need some example HTML files in
# the same directory as this script.
#
# Index the data and start the server:
# 1. rake index
# 2. ruby app.rb
#
# Then search the index using:
# 3. curl localhost:4567/pages?query=text
#
require 'sinatra'
require 'picky'

# Mix Picky into the top level so that Index, Search and
# CharacterSubstituters can be referenced without the Picky:: prefix below.
include Picky
# Loads HTML files, strips most of their markup and hands each page to the
# caller (Picky's indexer) as a Website record.
#
# The source only needs to respond to #each; every yielded record exposes
# #id, #title and #body.
#
class WebsiteSource
  Website = Struct.new :id, :title, :body

  # Matches opening and closing tags (and <!...> declarations) that should be
  # dropped. Tags whose name is exactly "p" are not matched by this pattern,
  # so <p ...> and </p> stay in the cleaned text.
  TAG_PATTERN = /<\s*([a-oq-z]|p\w|\!)[^>]*>|<\s*\/\s*([a-oq-z]|p\w)[^>]*>/im

  # Case-insensitive and tolerant of attributes, e.g. <BODY class="home">.
  TITLE_PATTERN = %r{<title\b[^>]*>(.*?)</title>}mi
  BODY_PATTERN  = %r{<body\b[^>]*>(.*?)</body>}mi

  attr_reader :html_files

  # html_files - paths of the HTML files to load. Relative paths are resolved
  #              against the directory containing this script.
  def initialize(*html_files)
    @html_files = html_files
  end

  # Returns a copy of +html+ with every tag matched by TAG_PATTERN removed.
  # nil is treated as an empty string, so a missing section cleans to "".
  def remove_tags_and_clean(html)
    html.to_s.gsub(TAG_PATTERN, '')
  end

  # Yields one Website per file, with the id built from the file name as
  # given to the constructor. A file without a <title> or <body> element
  # yields an empty string for that field instead of raising.
  #
  # Returns an Enumerator when called without a block.
  def each
    return enum_for(:each) unless block_given?

    html_files.each do |html_file|
      absolute_html_file = File.expand_path(html_file, File.dirname(__FILE__))
      html = File.read(absolute_html_file)

      # String#[] with a capture index returns nil when nothing matches.
      title = remove_tags_and_clean(html[TITLE_PATTERN, 1])
      body  = remove_tags_and_clean(html[BODY_PATTERN, 1])

      yield Website.new("http://yourpage.com/#{html_file}", title, body)
    end
  end
end
# Define the :pages index over the HTML files that sit next to this script.
#
site_index = Index.new :pages do
  source do
    # List the files relative to the script directory rather than the current
    # working directory, because WebsiteSource resolves the names against the
    # script directory as well. Only the bare file names are passed on, so the
    # generated page ids stay "http://yourpage.com/<file name>".
    html_dir   = File.dirname(__FILE__)
    html_names = Dir.entries(html_dir).select { |name| File.fnmatch('*.html', name) }.sort
    WebsiteSource.new(*html_names)
  end
  key_format :to_sym
  indexing substitutes_characters_with: CharacterSubstituters::WestEuropean.new,
           removes_characters: /[^a-z0-9\s\"\~\*\:\,]/i
  category :title
  category :body
end
# Define a search over the pages index.
#
site_search = Search.new site_index do
  searching substitutes_characters_with: CharacterSubstituters::WestEuropean.new, # Normalizes special user input, Ä -> Ae, ñ -> n etc.
            removes_characters: /[^a-z0-9\s\/\-\_\&\.\"\~\*\:\,]/i, # Picky needs control chars *"~:, to pass through.
            stopwords: /\b(and|the|of|it|in|for)\b/i,
            splits_text_on: /[\s\/\-\&]+/
end
# Route /pages to the pages search.
#
# Query parameters:
#   query  - the search text (required; answered with 400 when absent)
#   ids    - maximum number of ids to return (defaults to 20)
#   offset - number of results to skip (defaults to 0)
#
# Responds with a JSON array of the ids of the matching pages.
#
get '/pages' do
  halt 400, 'Missing required parameter: query' unless params[:query]

  # Request parameters arrive as strings, so coerce the numeric ones.
  ids    = (params[:ids] || 20).to_i
  offset = (params[:offset] || 0).to_i

  content_type :json
  results = site_search.search params[:query], ids, offset
  results.ids.to_json
end

# Reload the index when the app boots. Per the usage notes at the top of this
# file, the index is expected to have been generated with `rake index` first.
site_index.reload
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE HTML>
<html lang="de">
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<title>Rechtsanwalt Michael Below</title>
<link rel="stylesheet" type="text/css" href="/style.css" media="screen">
<link href="/aktuelles.xml" type="application/atom+xml" rel="alternate" title="Rechtsanwalt Michael Below: Aktuelles ATOM Feed">
<link rel="canonical" href="http://judiz.de/">
<meta name="author" content="Michael Below">
<meta name="description" content="Engagierte Kanzlei in Berlin-Friedrichshain, besondere Interessen: Internet-/Medien-/Urheberrecht sowie Schadens- und Versicherungsrecht.">
<meta name="keywords" content="Anwalt, Berlin, Friedrichshain, Kreuzberg, IT-Recht, Medienrecht, Versicherungsrecht">
<meta name="google-site-verification" content="Jv1k7Pc2t1oTqTW5rJurLHoWe5hrdx0084hXxBMkfUE">
<meta name="geo.region" content="DE-BE">
<meta name="geo.placename" content="Berlin">
<meta name="geo.position" content="52.51448;13.45992">
</head>
<body>
<h1>RECHTSANWALT MICHAEL BELOW</h1>
<table class="full">
<tbody><tr>
<td class="nav">
<span class="active" title="You're here.">Start</span><br>
<a href="/schwerpunkte/">Schwerpunkte</a><br>
<a href="/aktuelles/">Aktuelles</a><br>
<a href="/kosten/">Kosten</a><br>
<a href="/links/">Links</a><br>
<a href="/kontakt/">Kontakt</a><br>
</td>
<td class="text">
<table class="pic"><tbody><tr><td><img src="/images/below.jpg" alt="Rechtsanwalt Michael Below" height="333" width="499"></td></tr><tr><td><p class="stat">© <a href="http://www.fischkatron.de/">Katha Schmidt</a> 2010</p></td></tr></tbody></table>
<p>Willkommen auf den Webseiten von Rechtsanwalt Michael Below, Berlin.</p>
<p>Seit 2005 bin ich überwiegend im Zivilrecht tätig, ein Schwerpunkt liegt dabei auf den Rechtsfragen rund um elektronische Datenverarbeitung, Medien und Internet. Ein weiterer Interessenschwerpunkt ist das Schadens- und Versicherungsrecht. Näheres erfahren Sie unter <a href="/schwerpunkte/index.html">Schwerpunkte</a>.</p>
<p>Neuigkeiten rund um Recht und Kanzlei finden Sie unter <a href="/aktuelles/index.html">Aktuelles</a>, zuletzt: »<a href="/aktuelles/stromnetz/">Erhöhung der Netzentgelte? Stromnetze in Bürgerhand!</a>«</p>
<p>Auf elektronischem und konventionellem Wege können Sie jederzeit <a href="/kontakt/index.html">Kontakt</a> mit mir aufnehmen. Während meiner Bürozeiten bin ich selbstverständlich auch persönlich ansprechbar.</p>
</td>
</tr>
<tr>
<td class="logo">
<p class="logo">Partner</p>
<A HREF="http://www.adiuro.de/"><img class="logo" src="/images/adiuro.png" width="90" height="24" alt="ADIURO. Rechtsanwälte"></a>
</td>
<td class="stat">
<p class="impressum"><a href="/datenschutz/">Datenschutz</a>
· <a href="/impressum/">Impressum</a></p>
<p class="stat">Letzte Änderung: 21.10.2011</p></td></tr></tbody>
</table>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment