Last active
December 16, 2015 01:09
-
-
Save AdolfVonKleist/5352560 to your computer and use it in GitHub Desktop.
More usable, simple, standalone WSGI interface to the Balanced Corpus of Contemporary Written Japanese: http://www.kotonoha.gr.jp/shonagon/ Run the script from a terminal: $ ./get-bccwj-examples.py then view in your browser at: localhost:8000. Run the script with '--help' to see additional options. Script will cache your vocabulary search results…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- mode: python; coding: utf-8 -*- | |
from wsgiref.simple_server import make_server | |
from cgi import parse_qs, escape | |
import urllib, urllib2, cookielib, sqlite3, re | |
# Page template used for every response.  It is filled in with the `%`
# operator, so the single `%s` near the bottom is the slot for the results
# table; any literal percent sign added to the markup or CSS later must be
# written as `%%`.
html = """
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<style type="text/css">
/* form css from: http://webdesignerwall.com/tutorials/beautiful-css3-search-form */
.searchform {
display: inline-block;
zoom: 1;
border: solid 1px #d2d2d2;
padding: 3px 5px;
box-shadow: 0 1px 0px rgba(0,0,0,.1);
background: #f1f1f1;
background: -webkit-gradient(linear, left top, left bottom, from(#fff), to(#ededed));
background: -moz-linear-gradient(top, #fff, #ededed);
border-radius: 2em;
filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffff', endColorstr='#ededed'); /* ie7 */
-webkit-border-radius: 2em;
-moz-border-radius: 2em;
-webkit-box-shadow: 0 1px 0px rgba(0,0,0,.1);
-moz-box-shadow: 0 1px 0px rgba(0,0,0,.1);
-ms-filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffff', endColorstr='#ededed'); /* ie8 */
*display: inline;
}
.searchform input {
font: normal 12px Arial, Helvetica, sans-serif;
}
.searchform .searchfield {
background: #fff;
padding: 6px 6px 6px 8px;
width: 202px;
border: solid 1px #bcbbbb;
outline: none;
-webkit-border-radius: 2em;
-moz-border-radius: 2em;
border-radius: 2em;
-moz-box-shadow: inset 0 1px 2px rgba(0,0,0,.2);
-webkit-box-shadow: inset 0 1px 2px rgba(0,0,0,.2);
box-shadow: inset 0 1px 2px rgba(0,0,0,.2);
}
.searchform .searchbutton {
color: #fff;
border: solid 1px #494949;
font-size: 11px;
height: 27px;
width: 57px;
text-shadow: 0 1px 1px rgba(0,0,0,.6);
-webkit-border-radius: 2em;
-moz-border-radius: 2em;
border-radius: 2em;
background: #5f5f5f;
background: -webkit-gradient(linear, left top, left bottom, from(#9e9e9e), to(#454545));
background: -moz-linear-gradient(top, #9e9e9e, #454545);
filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#9e9e9e', endColorstr='#454545'); /* ie7 */
-ms-filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#9e9e9e', endColorstr='#454545'); /* ie8 */
}
td.cell01 { width: 250px; border-bottom: 2px solid green; }
td.cell02 { width: 100px; font-weight: bold; color: navy; text-align: center; border-bottom: 2px solid green; }
td.cell03 { width: 250px; border-bottom: 2px solid green; }
</style>
</head>
<body>
<div id="searchdiv" style="padding-left: 30px; padding-top: 20px;">
<p>Enter a Japanese word that you would like to find example sentences for.</p>
<div style="font-size: .8em;">
<p>Intended for study purposes and individual use only. <br />
See the official <a href="http://www.kotonoha.gr.jp/shonagon/">BCCWJ</a> website for details.</p>
</div>
<form class="searchform" method="get" action="parsing_get.wsgi">
<input
type="text" name="tango" class="searchfield"
value=" 単語を入力して下さい"
onfocus="if(this.value==this.defaultValue)this.value='';"
onblur="if(this.value=='')this.value=this.defaultValue;" >
<input type="submit" value="検索する" class="searchbutton">
</form>
</div>
<div style="padding-left: 20px;">
%s
</div>
</body>
</html>"""

# Opening tag and header row of the results table.  The three column titles
# read "preceding context", "search string" and "following context".  The
# data rows and the closing </table> tag are appended by application().
table_header = """<table>
<tr>
<th class="sample cell01 reversetext">前文脈</th>
<th class="sample cell02 nosort">検索文字列</th>
<th class="sample cell03 text">後文脈</th>
</tr>"""
def retrieve_response( tango ):
    """
    Retrieve the table of example sentences for the
    desired vocabulary item. Searches and scrapes the
    results from the
    Balanced Corpus of Contemporary Written Japanese:
    http://www.kotonoha.gr.jp/shonagon

    tango -- the search word; it is handed unchanged to
             urllib.urlencode as the 'query_string' form field.

    Returns a list of three-item lists of decoded (unicode) strings.
    Each row holds the scraped cells in the order
    [second cell, first cell, third cell]; presumably that is
    [matched string, preceding context, following context].
    An empty list is returned when no result cells are found.

    Errors raised by urllib2 while talking to the site are not caught.
    """
    cookie_jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    opener.addheaders.append(('User-agent', 'Mozilla/5.0'))
    opener.addheaders.append(('Referer','http://www.kotonoha.gr.jp/shonagon/search_form'))
    urllib2.install_opener(opener)

    ## acquire cookie
    # The body of this first response is never used; the request is made
    # only so the cookie jar gets filled.  Close it right away instead of
    # leaving the connection open.
    url_1 = 'http://www.kotonoha.gr.jp/shonagon/search_form'
    rsp = urllib2.urlopen(urllib2.Request(url_1))
    rsp.close()

    # do POST
    url_2 = 'http://www.kotonoha.gr.jp/shonagon/search_result'
    # skip '書箱' and '韻文'
    types = [
        '雑誌','新聞','白書',
        '教科書','広報紙','Yahoo!知恵袋',
        'Yahoo!ブログ','法律','国会会議録'
    ]
    etypes = [ urllib.quote_plus(x) for x in types ]
    values = dict(
        query_string=tango,
        lcontext_regex='',
        rcontext_regex='',
        entire_period='1',
    )
    # 'media' is a repeated form field, one occurrence per media type, so
    # it is appended by hand after the single-valued fields are encoded.
    data = urllib.urlencode(values)
    for t in etypes:
        data += "&media="+t

    rsp = urllib2.urlopen(urllib2.Request(url_2, data))
    try:
        content = rsp.read()
    finally:
        # Release the connection even if reading the body fails.
        rsp.close()

    table = []
    entry = []
    for line in content.split("\n"):
        # Header rows are skipped before looking for data cells.
        if "<th" in line:
            continue
        if "cell0" not in line:
            continue
        # Reduce the cell to its inner text/markup.
        line = re.sub(r"<td[^>]*>","",line)
        line = line.replace("</td>","\t")
        entry.append(line.strip().decode("utf8"))
        # Three consecutive cells make up one result row.
        if len(entry)==3:
            table.append([entry[1], entry[0], entry[2]])
            entry = []
    return table
def fetch_result( tango ):
    """
    Return the example sentences for `tango`, using the local SQLite
    file "sentences.db" as a cache.

    tango -- the search word as a UTF-8 encoded byte string; it is
             decoded with .decode("utf8") before being used as the
             database key.

    If the database already holds rows for the word, those rows are
    returned as (tango, lc, rc) tuples.  Otherwise the sentences are
    fetched with retrieve_response(), stored in the database, and the
    list returned by retrieve_response() is handed back unchanged.
    """
    word = tango.decode("utf8")
    #Open the DB or create it if it doesn't exist
    conn = sqlite3.connect("sentences.db")
    try:
        cursor = conn.cursor()
        cursor.execute("""CREATE TABLE IF NOT EXISTS sentences (tango text, lc text, rc text)""")
        #Check whether we already have the entries for this word
        # stored in the local DB
        cursor.execute("SELECT * FROM sentences WHERE tango=?", (word,))
        sentences = cursor.fetchall()
        if sentences:
            return sentences
        #Didn't find anything. Try and fetch from online
        sentences = retrieve_response( tango )
        #Finally insert whatever we found online into the local
        # DB so we don't have to waste time and resources fetching
        # it again later on during review.
        # Bound parameters are used so that quotes or other special
        # characters inside the scraped text cannot break (or inject
        # into) the SQL statement.
        cursor.executemany(
            "INSERT INTO sentences VALUES (?, ?, ?)",
            [ (word, sent[1], sent[2]) for sent in sentences ] )
        conn.commit( )
        return sentences
    finally:
        # Always release the database handle, including on errors.
        conn.close()
def application(environ, start_response):
    """
    WSGI entry point: render the search page and, when the 'tango'
    query parameter is present and non-empty, a table of example
    sentences for that word.

    Purloined from:
    http://webpython.codepoint.net/wsgi_request_parsing_get
    and modified for the purpose of this app.

    environ        -- the WSGI environment dictionary.
    start_response -- the WSGI start_response callable.

    Returns a one-element list holding the response body.
    """
    # Returns a dictionary containing lists as values.
    # QUERY_STRING may be missing from the environment when the request
    # has no query part, so fall back to an empty string.
    d = parse_qs(environ.get('QUERY_STRING', ''))
    # In this idiom you must issue a list containing a default value.
    tango = d.get('tango', [''])[0]

    # Collect the pieces and join once instead of growing a string
    # inside the loop.
    parts = [table_header]
    if tango:
        for sent in fetch_result( tango ):
            # Each row is (keyword, left context, right context); the
            # keyword is shown in the middle column.
            parts.append(
                "<tr><td class='cell01'>"+sent[1].encode("utf8")+
                "</td><td class='cell02'>"+sent[0].encode("utf8")+
                "</td><td class='cell03'>"+sent[2].encode("utf8")+"</td></tr>" )
    parts.append("</table>")
    response_body = html % "".join(parts)

    status = '200 OK'
    # The body is UTF-8 encoded Japanese text, so say so explicitly;
    # otherwise the browser has to guess the encoding.
    response_headers = [('Content-Type', 'text/html; charset=utf-8'),
                        ('Content-Length', str(len(response_body)))]
    start_response(status, response_headers)
    return [response_body]
# Script entry point: parse the command line and serve `application`
# with the wsgiref reference server until interrupted.
if __name__=="__main__":
    import argparse
    parser = argparse.ArgumentParser(
        description="""Simple standalone WSGI script for
        retrieving example sentences from the Balanced Corpus of
        Contemporary Written Japanese.""" )
    parser.add_argument('--host', '-i', help="host name to use. Default to localhost.", default="localhost" )
    # The default matches the help text (it used to be 80).
    parser.add_argument('--port', '-p', help="port to use. Default to '8000'.", default=8000, type=int )
    parser.add_argument('--verbose', '-v', help="Verbose mode.", default=False, action="store_true" )
    args = parser.parse_args()
    if args.verbose:
        for k,v in args.__dict__.items():
            # Single-argument form prints the same text under the
            # print statement and the print function.
            print("%s = %s" % (k, v))
    # Honour --host/--port instead of a hard-coded address.
    httpd = make_server(args.host, args.port, application)
    httpd.serve_forever( )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment