Created
January 27, 2010 18:46
-
-
Save callison-burch/288074 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from BeautifulSoup import BeautifulSoup | |
| from urllib import quote_plus, unquote_plus | |
| from wt_articles.splitting import determine_splitter | |
| from pyango_view import str2img | |
| import wikipydia | |
# Wikipedia language edition to pull from (Urdu).
lang = 'ur'
# Category title, escaped Unicode — appears to be Urdu for
# "selected/featured articles"; TODO confirm against ur.wikipedia.org.
category = u'\u0632\u0645\u0631\u06c1:\u0645\u0646\u062a\u062e\u0628_\u0645\u0642\u0627\u0644\u06d2'
# Output path for the extracted sentences (one per line).
sentence_filename = '/Users/ccb/Desktop/urdu_sentences'
# Output path for the Mechanical Turk HIT input file.
csv_filename = '/Users/ccb/Desktop/wikipedia_article_to_translate-2.csv'
# Number of sentence segments bundled into each HIT row.
num_sentences_per_hit = 10
# NOTE: network call at module level — fetches every article title in the category.
articles = wikipydia.query_category_members(category, language=lang)
def write_sentences_to_file(output_filename, sentences):
    """
    Append a list of sentences to a file, one sentence per line.

    output_filename -- path of the file to append to (created if absent)
    sentences       -- iterable of unicode strings; each is UTF-8 encoded
    """
    # Binary append mode: we write explicitly UTF-8-encoded bytes, which
    # behaves identically on Python 2 and 3.  The 'with' block guarantees
    # the file is closed even if a write fails (the original leaked the
    # handle on error).
    with open(output_filename, 'ab') as output_file:
        for sentence in sentences:
            output_file.write(sentence.encode('UTF-8'))
            output_file.write('\n'.encode('UTF-8'))
def generate_images(output_dir, article_id, sentences):
    """
    Save image renderings of the sentences into output_dir.

    Each image is named '<article_id>_<segment_index>.png' so it can be
    matched back to its source segment.

    output_dir -- directory to write images into (no trailing slash)
    article_id -- identifier embedded in each filename
    sentences  -- iterable of unicode sentence strings
    """
    width = 350
    extension = 'png'
    for i, sentence in enumerate(sentences):
        # Delegate cleanup to format_for_image instead of duplicating it
        # here: it strips HTML non-breaking spaces and converts double
        # quotes to paired single quotes (str2img wraps its argument in
        # double quotes).  The inline copy this replaces had been mangled
        # into no-op space-for-space replacements.
        sentence = format_for_image(sentence)
        img_outputfile = '%s/%s_%s.%s' % (output_dir, article_id, i, extension)
        # NOTE(review): retval is never checked — presumably str2img
        # signals failure via its return value; confirm and handle.
        retval = str2img(sentence, font='Nafees', output=img_outputfile, width=width)
def get_sentences_for_article(article, lang, short_sentence_char_cutoff=7):
    """
    Render the article to HTML, extract its paragraph text, split it into
    sentences, and return only the sentences whose length exceeds
    short_sentence_char_cutoff characters.
    """
    rendered = wikipydia.query_text_rendered(article, language=lang)
    soup = BeautifulSoup(rendered['html'])
    split_sentences = determine_splitter(lang)
    kept = []
    for paragraph in soup.findAll('p'):
        # Concatenate all text nodes of the paragraph, then split.
        paragraph_text = ''.join(paragraph.findAll(text=True)).strip()
        kept.extend(s for s in split_sentences(paragraph_text)
                    if len(s) > short_sentence_char_cutoff)
    return kept
def write_csv_file(csv_filename, articles, lang, num_sentences_per_hit, img_output_dir, img_url_dir):
    """
    Generates a comma seperated value file and associated image files
    so that a Mechanical Turk translation HIT can be created.

    csv_filename          -- path of the CSV file to write
    articles              -- iterable of article titles to fetch and segment
    lang                  -- Wikipedia language code
    num_sentences_per_hit -- segments packed into each CSV row (HIT)
    img_output_dir        -- local directory where segment images are written
    img_url_dir           -- URL prefix under which those images are served
    """
    csv_output_file = open(csv_filename, 'w')
    # Header row: article_id plus (seg_id, seg, img_url) triples per segment.
    header = "article_id"
    for i in range(1, num_sentences_per_hit+1):
        header += ",seg_id" + str(i) + ",seg" + str(i) + ",img_url" + str(i)
    csv_output_file.write(header.encode('UTF-8'))
    # create a placeholder file in case we have an odd number of segments
    width = 350
    # NOTE(review): no '/' is inserted here, unlike img_filename below —
    # img_output_dir is presumably expected to end with a slash; confirm.
    dnt_filename = img_output_dir + 'do-not-translate.png'
    retval = str2img("Do not translate this sentence.", font='Nafees', output=dnt_filename, width=width)
    #
    # offset tracks how many segments of the current HIT row are already
    # filled, carried across article boundaries so rows are packed densely.
    offset = 0
    for article in articles:
        article_id = wikipydia.query_page_id(article, language=lang)
        sentences = get_sentences_for_article(article, lang)
        for i, sentence in enumerate(sentences):
            # Start a new CSV row every num_sentences_per_hit segments.
            # NOTE(review): 'i != len(sentences)' is always true inside
            # enumerate (i tops out at len-1) — likely intended to avoid
            # opening a row for a final segment; confirm original intent.
            if (i + offset) % num_sentences_per_hit == 0 and i != len(sentences):
                line = "\n" + str(article_id)
                csv_output_file.write(line.encode('UTF-8'))
            img_filename = '%s/%s_%s.%s' % (img_output_dir, article_id, i, "png")
            retval = str2img(format_for_image(sentence), font='Nafees', output=img_filename, width=width)
            img_url = '%s/%s_%s.%s' % (img_url_dir, article_id, i, "png")
            line = "," + str(i) + "," + format_for_csv(sentence) + "," + img_url
            csv_output_file.write(line.encode('UTF-8'))
        # Carry the partial-row count into the next article.
        # NOTE(review): uses the last loop index i, not i+1 — the count may
        # be off by one (and i is undefined for an empty article); verify.
        offset = (i + offset) % num_sentences_per_hit
    # if there are an odd number of sentences,
    # then fill out the rest of the CVS file
    if not offset % num_sentences_per_hit == 0:
        # Pad the final row with "do not translate" placeholder cells.
        dnt_url = ",,," + img_url_dir + "/do-not-translate.png"
        line = dnt_url * (num_sentences_per_hit - offset)
        csv_output_file.write(line.encode('UTF-8'))
    csv_output_file.write('\n'.encode('UTF-8'))
    csv_output_file.close()
def format_for_image(string):
    """
    Cleans up the string so that it can be better rendered as an image.

    Strips HTML non-breaking spaces (both the '&nbsp;' entity and the
    literal U+00A0 character) and converts double quotes to paired single
    quotes.
    """
    # The scraped original contained visibly no-op space-for-space
    # replacements here — the entity/character literals below restore the
    # evident intent stated in the block's own comments.
    string = string.replace("&nbsp;", " ")
    string = string.replace(u"\xa0", " ")
    # double quotes have to be converted to to two single quotes,
    # because the image to string converter wraps the string in double quotes
    string = string.replace('"', "''")
    return string
def format_for_csv(string):
    """
    Replaces special characters used by comma separated value (CSV) files
    with their HTML equivalents.

    Also strips surrounding whitespace and flattens newlines to spaces so
    each segment occupies a single CSV cell.
    """
    # The scraped original showed each character replaced by itself — the
    # HTML entities had been decoded by the scraper.  Restore them per the
    # docstring's stated intent.
    string = string.strip()
    string = string.replace('\n', ' ')
    # '&' must be escaped first so the entities added below are not
    # themselves double-escaped.
    string = string.replace('&', "&amp;")
    string = string.replace(',', "&#44;")
    string = string.replace('>', "&gt;")
    string = string.replace('<', "&lt;")
    string = string.replace('"', "&quot;")
    string = string.replace("'", "&#39;")
    return string
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment