Created
October 12, 2014 05:33
-
-
Save yoshikischmitz/e31018925d1356d5a0ef to your computer and use it in GitHub Desktop.
Use Examples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import sys | |
from urlparse import urljoin | |
import urllib | |
import urllib2 | |
import xml.etree.ElementTree as ET | |
from lxml import etree | |
import sys, os | |
sys.path.append(os.path.join(os.path.dirname(__file__), 'beautifulsoup')) | |
from bs4 import BeautifulSoup | |
import re | |
def build_query_url(source):
    """Return a dict.cn lookup URL for the given unicode search term.

    NOTE(review): a later definition in this file reuses the name
    build_query_url (targeting nciku.com) and shadows this one at runtime.
    """
    return "http://dict.cn/" + urllib.quote(source.encode('utf-8'))
def get_html(get_url):
    """Fetch get_url over HTTP and return the raw response body."""
    return urllib2.urlopen(urllib2.Request(get_url)).read()
# Update the Usage area with examples from dict.cn
def update_fields(field, updated_field, model_name, model_type):
    """Populate field["Usage"] with dict.cn example sentences.

    Only acts when the edited field is "Hanzi"; any other edit is a no-op.
    field is a dict-like object holding the note's fields.
    """
    if updated_field != "Hanzi":
        return
    # Build the dict.cn URL inline: the module-level build_query_url that
    # targeted dict.cn is shadowed by a later definition pointing at
    # nciku.com, so calling it here queried the wrong site entirely.
    get_url = "http://dict.cn/" + urllib.quote(field["Hanzi"].encode('utf-8'))
    html = get_html(get_url)
    # dict.cn wraps its example sentences in an <ol slider="2"> ... </ol>.
    pattern = re.compile(r'<ol slider="2">.+?</ol>', re.DOTALL)
    match = pattern.search(unicode(html, "UTF-8"))
    # Guard against pages with no example block: the original code raised
    # AttributeError on .group() when the pattern was absent.
    if match is None:
        return
    field["Usage"] = unicode(match.group())
def build_query_url(qstring):
    """Return the nciku example-search URL for the given query string."""
    base = "http://www.nciku.com/search/all/examples/"
    return urljoin(base, urllib.quote(qstring))
def get_html(url):
    """GET url and return the raw response body as a byte string."""
    req = urllib2.Request(url)
    return urllib2.urlopen(req).read()
def strip_tags(node):
    """Concatenate the .string of every <span> tag found under node."""
    return "".join(span.string for span in node.find_all('span'))
def extract_contents(node):
    """Join node's stripped strings, collapsing whitespace runs to one space."""
    joined = ''.join(node.stripped_strings)
    return re.sub(r'\s+', ' ', joined)
# Takes the html document and returns an array of dicts with the structure:
# => {"hanzi": '', "english": ''}
def get_examples(hanzi):
    """Scrape nciku example-sentence pairs for the given hanzi term.

    Returns a list of {"hanzi": ..., "english": ...} dicts, one per
    matched <dt>/<dd class="txt"> pair on the results page.
    """
    page = get_html(build_query_url(hanzi))
    soup = BeautifulSoup(page, 'xml')
    dts = soup.select('dd[class~=txt] ~ dt')
    dds = soup.select('dd[class=txt]')
    # zip truncates to the shorter list, pairing each dt with its dd —
    # same effect as the original min(len(dds), len(dts)) index loop.
    return [{'hanzi': extract_contents(dt), 'english': extract_contents(dd)}
            for dt, dd in zip(dts, dds)]
usage_example = "" | |
for i in get_examples(sys.argv[1]): | |
usage_example += i['hanzi'] + "\n" + i['english'] + "\n" | |
print usage_example |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment