Created
January 27, 2010 18:46
-
-
Save callison-burch/288074 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from BeautifulSoup import BeautifulSoup | |
| from urllib import quote_plus, unquote_plus | |
| from wt_articles.splitting import determine_splitter | |
| from pyango_view import str2img | |
| import wikipydia | |
# Wikipedia language edition to pull from (Urdu).
lang = 'ur'
# Category title, escaped Unicode — appears to be Urdu for
# "selected/featured articles"; TODO confirm against ur.wikipedia.org.
category = u'\u0632\u0645\u0631\u06c1:\u0645\u0646\u062a\u062e\u0628_\u0645\u0642\u0627\u0644\u06d2'
# Output path for the extracted sentences (one per line).
sentence_filename = '/Users/ccb/Desktop/urdu_sentences'
# Output path for the Mechanical Turk HIT input file.
csv_filename = '/Users/ccb/Desktop/wikipedia_article_to_translate-2.csv'
# Number of sentence segments bundled into each HIT row.
num_sentences_per_hit = 10
# NOTE: network call at module level — fetches every article title in the category.
articles = wikipydia.query_category_members(category, language=lang)
def write_sentences_to_file(output_filename, sentences):
    """
    Append a list of sentences to a file, one sentence per line.

    output_filename -- path of the file to append to (created if absent)
    sentences       -- iterable of unicode strings; each is UTF-8 encoded
    """
    # Binary append mode: we write explicitly UTF-8-encoded bytes, which
    # behaves identically on Python 2 and 3.  The 'with' block guarantees
    # the file is closed even if a write fails (the original leaked the
    # handle on error).
    with open(output_filename, 'ab') as output_file:
        for sentence in sentences:
            output_file.write(sentence.encode('UTF-8'))
            output_file.write('\n'.encode('UTF-8'))
def generate_images(output_dir, article_id, sentences):
    """
    Save image renderings of the sentences into output_dir.

    Each image is named '<article_id>_<segment_index>.png' so it can be
    matched back to its source segment.

    output_dir -- directory to write images into (no trailing slash)
    article_id -- identifier embedded in each filename
    sentences  -- iterable of unicode sentence strings
    """
    width = 350
    extension = 'png'
    for i, sentence in enumerate(sentences):
        # Delegate cleanup to format_for_image instead of duplicating it
        # here: it strips HTML non-breaking spaces and converts double
        # quotes to paired single quotes (str2img wraps its argument in
        # double quotes).  The inline copy this replaces had been mangled
        # into no-op space-for-space replacements.
        sentence = format_for_image(sentence)
        img_outputfile = '%s/%s_%s.%s' % (output_dir, article_id, i, extension)
        # NOTE(review): retval is never checked — presumably str2img
        # signals failure via its return value; confirm and handle.
        retval = str2img(sentence, font='Nafees', output=img_outputfile, width=width)
def get_sentences_for_article(article, lang, short_sentence_char_cutoff=7):
    """
    Render the article to HTML, extract its paragraph text, split it into
    sentences, and return only the sentences whose length exceeds
    short_sentence_char_cutoff characters.
    """
    rendered = wikipydia.query_text_rendered(article, language=lang)
    soup = BeautifulSoup(rendered['html'])
    split_sentences = determine_splitter(lang)
    kept = []
    for paragraph in soup.findAll('p'):
        # Concatenate all text nodes of the paragraph, then split.
        paragraph_text = ''.join(paragraph.findAll(text=True)).strip()
        kept.extend(s for s in split_sentences(paragraph_text)
                    if len(s) > short_sentence_char_cutoff)
    return kept
def write_csv_file(csv_filename, articles, lang, num_sentences_per_hit, img_output_dir, img_url_dir):
    """
    Generates a comma seperated value file and associated image files
    so that a Mechanical Turk translation HIT can be created.

    csv_filename          -- path of the CSV file to write
    articles              -- iterable of article titles to fetch and segment
    lang                  -- Wikipedia language code
    num_sentences_per_hit -- segments packed into each CSV row (HIT)
    img_output_dir        -- local directory where segment images are written
    img_url_dir           -- URL prefix under which those images are served
    """
    csv_output_file = open(csv_filename, 'w')
    # Header row: article_id plus (seg_id, seg, img_url) triples per segment.
    header = "article_id"
    for i in range(1, num_sentences_per_hit+1):
        header += ",seg_id" + str(i) + ",seg" + str(i) + ",img_url" + str(i)
    csv_output_file.write(header.encode('UTF-8'))
    # create a placeholder file in case we have an odd number of segments
    width = 350
    # NOTE(review): no '/' is inserted here, unlike img_filename below —
    # img_output_dir is presumably expected to end with a slash; confirm.
    dnt_filename = img_output_dir + 'do-not-translate.png'
    retval = str2img("Do not translate this sentence.", font='Nafees', output=dnt_filename, width=width)
    #
    # offset tracks how many segments of the current HIT row are already
    # filled, carried across article boundaries so rows are packed densely.
    offset = 0
    for article in articles:
        article_id = wikipydia.query_page_id(article, language=lang)
        sentences = get_sentences_for_article(article, lang)
        for i, sentence in enumerate(sentences):
            # Start a new CSV row every num_sentences_per_hit segments.
            # NOTE(review): 'i != len(sentences)' is always true inside
            # enumerate (i tops out at len-1) — likely intended to avoid
            # opening a row for a final segment; confirm original intent.
            if (i + offset) % num_sentences_per_hit == 0 and i != len(sentences):
                line = "\n" + str(article_id)
                csv_output_file.write(line.encode('UTF-8'))
            img_filename = '%s/%s_%s.%s' % (img_output_dir, article_id, i, "png")
            retval = str2img(format_for_image(sentence), font='Nafees', output=img_filename, width=width)
            img_url = '%s/%s_%s.%s' % (img_url_dir, article_id, i, "png")
            line = "," + str(i) + "," + format_for_csv(sentence) + "," + img_url
            csv_output_file.write(line.encode('UTF-8'))
        # Carry the partial-row count into the next article.
        # NOTE(review): uses the last loop index i, not i+1 — the count may
        # be off by one (and i is undefined for an empty article); verify.
        offset = (i + offset) % num_sentences_per_hit
    # if there are an odd number of sentences,
    # then fill out the rest of the CVS file
    if not offset % num_sentences_per_hit == 0:
        # Pad the final row with "do not translate" placeholder cells.
        dnt_url = ",,," + img_url_dir + "/do-not-translate.png"
        line = dnt_url * (num_sentences_per_hit - offset)
        csv_output_file.write(line.encode('UTF-8'))
    csv_output_file.write('\n'.encode('UTF-8'))
    csv_output_file.close()
def format_for_image(string):
    """
    Cleans up the string so that it can be better rendered as an image.

    Strips HTML non-breaking spaces (both the '&nbsp;' entity and the
    literal U+00A0 character) and converts double quotes to paired single
    quotes.
    """
    # The scraped original contained visibly no-op space-for-space
    # replacements here — the entity/character literals below restore the
    # evident intent stated in the block's own comments.
    string = string.replace("&nbsp;", " ")
    string = string.replace(u"\xa0", " ")
    # double quotes have to be converted to to two single quotes,
    # because the image to string converter wraps the string in double quotes
    string = string.replace('"', "''")
    return string
def format_for_csv(string):
    """
    Replaces special characters used by comma separated value (CSV) files
    with their HTML equivalents.

    Also strips surrounding whitespace and flattens newlines to spaces so
    each segment occupies a single CSV cell.
    """
    # The scraped original showed each character replaced by itself — the
    # HTML entities had been decoded by the scraper.  Restore them per the
    # docstring's stated intent.
    string = string.strip()
    string = string.replace('\n', ' ')
    # '&' must be escaped first so the entities added below are not
    # themselves double-escaped.
    string = string.replace('&', "&amp;")
    string = string.replace(',', "&#44;")
    string = string.replace('>', "&gt;")
    string = string.replace('<', "&lt;")
    string = string.replace('"', "&quot;")
    string = string.replace("'", "&#39;")
    return string
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment