@yoshikischmitz
Created October 12, 2014 05:33
Use Examples
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Python 2 script: uses urllib/urllib2/urlparse and the old print statement.
import os
import re
import sys
import urllib
import urllib2
from urlparse import urljoin

# bs4 is expected to live in a "beautifulsoup" directory next to this file.
sys.path.append(os.path.join(os.path.dirname(__file__), 'beautifulsoup'))
from bs4 import BeautifulSoup

# --- dict.cn lookup (used by the field-update hook below) ---

def build_dictcn_url(source):
    # dict.cn takes the percent-encoded UTF-8 query directly in the path.
    qdict = source.encode('utf-8')
    return "http://dict.cn/" + urllib.quote(qdict)

def get_html(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    return response.read()

# Field-update hook: rewrite the Usage field with example sentences from
# dict.cn whenever the Hanzi field changes.
def update_fields(field, updated_field, model_name, model_type):
    if updated_field == "Hanzi":
        get_url = build_dictcn_url(field["Hanzi"])
        html = get_html(get_url)
        # The example sentences sit inside the <ol slider="2"> element.
        r = re.compile(r'<ol slider="2">.+?</ol>', re.DOTALL)
        match = r.search(unicode(html, "utf-8"))
        if match:
            field["Usage"] = match.group()

# --- nciku.com example search (standalone command-line path) ---

def build_nciku_url(qstring):
    # qstring arrives as a byte string (sys.argv), so no extra encoding step.
    return urljoin("http://www.nciku.com/search/all/examples/", urllib.quote(qstring))

# Currently unused: concatenates the text of every <span> child of a node.
def strip_tags(node):
    a = ""
    for e in node.find_all('span'):
        a += e.string or ""
    return a

def extract_contents(node):
    # Join the node's text fragments and collapse whitespace runs to one space.
    node_text = ''.join(node.stripped_strings)
    return re.sub(r'\s+', ' ', node_text)
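# e.g. for a node whose text fragments join to u"We  are\n studying",
# extract_contents returns u"We are studying".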

# Takes a hanzi query string and returns a list of dicts with the structure:
# => {"hanzi": '', "english": ''}
def get_examples(hanzi):
    url = build_nciku_url(hanzi)
    response_html = get_html(url)
    html = BeautifulSoup(response_html, 'xml')
    # Each example pair is a <dt> (hanzi) next to a <dd class="txt"> (English).
    dts = html.select('dd[class~=txt] ~ dt')
    dds = html.select('dd[class~=txt]')
    arr = []
    # Pair them off, stopping at the shorter of the two lists.
    num_times = min(len(dds), len(dts))
    for i in range(num_times):
        pair = {}
        pair['hanzi'] = extract_contents(dts[i])
        pair['english'] = extract_contents(dds[i])
        arr.append(pair)
    return arr
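
# For example (return shape per the comment above; the sentence text depends
# on what nciku.com actually returns):
#   get_examples(u"\u5b66\u4e60".encode('utf-8'))
#   # -> [{'hanzi': u'...', 'english': u'...'}, ...]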
usage_example = ""
for i in get_examples(sys.argv[1]):
usage_example += i['hanzi'] + "\n" + i['english'] + "\n"
print usage_example
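
# Usage sketch (assumes this file is saved as use_examples.py and run under
# Python 2 with bs4 importable):
#
#   $ python use_examples.py 学习
#
# Each example prints as a hanzi line followed by its English translation.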