Skip to content

Instantly share code, notes, and snippets.

@callison-burch
Created January 27, 2010 18:46
Show Gist options
  • Select an option

  • Save callison-burch/288074 to your computer and use it in GitHub Desktop.

Select an option

Save callison-burch/288074 to your computer and use it in GitHub Desktop.
from BeautifulSoup import BeautifulSoup
from urllib import quote_plus, unquote_plus
from wt_articles.splitting import determine_splitter
from pyango_view import str2img
import wikipydia
# Language code handed to every wikipydia query in this script.
# NOTE(review): 'ur' is presumably Urdu, judging by the output file names -- confirm.
lang = 'ur'
# Title of the category whose member articles are fetched below, written
# with unicode escapes so the source file stays pure ASCII.
# NOTE(review): presumably the featured-articles category -- confirm.
category = u'\u0632\u0645\u0631\u06c1:\u0645\u0646\u062a\u062e\u0628_\u0645\u0642\u0627\u0644\u06d2'
# Output locations (hard-coded absolute paths).
sentence_filename = '/Users/ccb/Desktop/urdu_sentences'
csv_filename = '/Users/ccb/Desktop/wikipedia_article_to_translate-2.csv'
# Number of segment columns per CSV row (one row is one HIT).
num_sentences_per_hit = 10
# NOTE: this query runs as soon as the module is loaded.
# NOTE(review): presumably a network request to the Wikipedia API -- confirm.
articles = wikipydia.query_category_members(category, language=lang)
def write_sentences_to_file(output_filename, sentences):
    """
    Appends a list of sentences to file, one UTF-8 encoded sentence per line.

    output_filename -- path of the file to append to (created if missing).
    sentences -- iterable of text strings; each is written followed by '\n'.
    """
    # Binary append mode: we write already-encoded bytes, which works the
    # same way on Python 2 and Python 3.  The with-block guarantees the
    # handle is closed even if encoding or writing a sentence fails.
    newline = '\n'.encode('UTF-8')
    with open(output_filename, 'ab') as output_file:
        for sentence in sentences:
            output_file.write(sentence.encode('UTF-8'))
            output_file.write(newline)
def generate_images(output_dir, article_id, sentences):
    """
    Saves image representations of the sentences to the output dir.
    The file name contains the article ID and the segment number:
    <output_dir>/<article_id>_<segment index>.png

    output_dir -- directory the images are written into.
    article_id -- identifier used as the file name prefix.
    sentences -- sequence of sentences; the index in this sequence is the
                 segment number.
    """
    width = 350
    extension = 'png'
    for i, sentence in enumerate(sentences):
        img_outputfile = '%s/%s_%s.%s' % (output_dir, article_id, i, extension)
        # Use the shared clean-up helper (double quotes and non-breaking
        # spaces) instead of a private copy of the same replacements, so
        # the rendering rules live in exactly one place.
        str2img(format_for_image(sentence), font='Nafees',
                output=img_outputfile, width=width)
def get_sentences_for_article(article, lang, short_sentence_char_cutoff=7):
    """
    Fetches the rendered HTML of the article, takes the text of each <p>
    element, splits it into sentences with the splitter chosen for lang,
    and returns the sentences longer than short_sentence_char_cutoff
    characters, in document order.
    """
    rendered = wikipydia.query_text_rendered(article, language=lang)
    document = BeautifulSoup(rendered['html'])
    splitter = determine_splitter(lang)
    kept = []
    for node in document.findAll('p'):
        # Join all text nodes below the paragraph, dropping the markup.
        text = ''.join(node.findAll(text=True)).strip()
        kept.extend(
            candidate for candidate in splitter(text)
            if len(candidate) > short_sentence_char_cutoff
        )
    return kept
def write_csv_file(csv_filename, articles, lang, num_sentences_per_hit, img_output_dir, img_url_dir):
    """
    Generates a comma separated value file and associated image files
    so that a Mechanical Turk translation HIT can be created.

    csv_filename -- path of the CSV file to (over)write.
    articles -- iterable of article titles to process.
    lang -- language code passed to the wikipydia queries and the splitter.
    num_sentences_per_hit -- number of (seg_id, seg, img_url) column triples
                             per row.
    img_output_dir -- directory the sentence images are written into.
    img_url_dir -- URL prefix under which those images will be served.

    Layout: a header row, then rows of the form
        article_id,seg_id1,seg1,img_url1,...,seg_idN,segN,img_urlN
    Sentences are packed into rows across article boundaries; a row carries
    the page id of the article whose sentence opened it.  The last row is
    padded with empty segments pointing at a "do not translate" image.
    """
    import os  # local import: only needed for the placeholder path

    width = 350
    font = 'Nafees'
    extension = "png"

    header = "article_id" + "".join(
        ",seg_id%d,seg%d,img_url%d" % (n, n, n)
        for n in range(1, num_sentences_per_hit + 1)
    )

    # Binary mode because every write below is already UTF-8 encoded; the
    # with-block closes the file even if a query or rendering step fails.
    with open(csv_filename, 'wb') as csv_output_file:
        csv_output_file.write(header.encode('UTF-8'))

        # create a placeholder image used to pad an incomplete last row.
        # os.path.join works whether or not img_output_dir ends in a slash
        # (plain concatenation silently required the trailing slash).
        dnt_filename = os.path.join(img_output_dir, 'do-not-translate.png')
        str2img("Do not translate this sentence.", font=font,
                output=dnt_filename, width=width)

        # Number of segments already written to the current row.
        offset = 0
        for article in articles:
            article_id = wikipydia.query_page_id(article, language=lang)
            sentences = get_sentences_for_article(article, lang)
            for i, sentence in enumerate(sentences):
                # Start a new row whenever the current one is full.
                if (i + offset) % num_sentences_per_hit == 0:
                    row_start = "\n" + str(article_id)
                    csv_output_file.write(row_start.encode('UTF-8'))
                img_filename = '%s/%s_%s.%s' % (img_output_dir, article_id, i, extension)
                str2img(format_for_image(sentence), font=font,
                        output=img_filename, width=width)
                img_url = '%s/%s_%s.%s' % (img_url_dir, article_id, i, extension)
                line = "," + str(i) + "," + format_for_csv(sentence) + "," + img_url
                csv_output_file.write(line.encode('UTF-8'))
            # Advance by the number of sentences written, not by the last
            # index: using the index left the count one short, so rows
            # overflowed and the final padding was wrong.  This also copes
            # with articles that yield no sentences.
            offset = (offset + len(sentences)) % num_sentences_per_hit

        # if the last row is incomplete,
        # then fill out the rest of it with placeholder segments
        if offset != 0:
            dnt_url = ",,," + img_url_dir + "/do-not-translate.png"
            padding = dnt_url * (num_sentences_per_hit - offset)
            csv_output_file.write(padding.encode('UTF-8'))
        csv_output_file.write('\n'.encode('UTF-8'))
def format_for_image(string):
    """
    Cleans up the string so that it can be better rendered as an image.

    Non-breaking spaces (the HTML entities &nbsp; and &#160; as well as the
    U+00A0 character itself) become ordinary spaces, and double quotes
    become two single quotes.  Expects text (unicode) input and returns the
    cleaned text.
    """
    # Spell the entities out explicitly: replacing a blank with a blank
    # does nothing, and text extracted from HTML may still carry them.
    string = string.replace("&nbsp;", " ")
    string = string.replace("&#160;", " ")
    string = string.replace(u'\u00a0', u' ')
    # double quotes have to be converted to two single quotes,
    # because the string to image converter wraps the string in double quotes
    string = string.replace('"', "''")
    return string
def format_for_csv(string):
    """
    Replaces special characters used by comma separated value (CSV) files
    with their HTML equivalents.

    Surrounding whitespace is stripped and newlines become spaces, so the
    result always fits in a single unquoted CSV field.
    """
    string = string.strip()
    string = string.replace('\n', ' ')
    # The ampersand must be escaped first, otherwise the '&' introduced by
    # the entities below would be escaped a second time.
    string = string.replace('&', "&amp;")
    # Commas are the field separator; left unescaped they shift every
    # following column of the row.
    string = string.replace(',', "&#44;")
    string = string.replace('>', "&gt;")
    string = string.replace('<', "&lt;")
    string = string.replace('"', "&quot;")
    string = string.replace("'", "&#39;")
    return string
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment