Created
March 8, 2012 17:22
-
-
Save jergason/2002189 to your computer and use it in GitHub Desktop.
stack overflow data formatting task
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #Stack Overflow build script | |
| import codecs | |
| import os | |
| import re | |
| import json | |
| from nltk.tokenize import TreebankWordTokenizer | |
| from bs4 import BeautifulSoup | |
| from build import create_dirs_and_open | |
| from topic_modeling import anyjson | |
| from backend import config as c | |
| # Dataset-specific overrides written into the config object imported from backend; these run at import time. | |
| # NOTE(review): num_topics is presumably the topic count used by the topic model - confirm against the consumer. | |
| c['num_topics'] = 100 | |
| c['dataset_name'] = 'stack_overflow' | |
| c['dataset_readable_name'] = 'Stack Overflow' | |
def task_extract_data():
    """Describe the extraction task as a dict of targets, actions and clean.

    The raw HTML is read from the ``data`` folder under ``c['raw_data_dir']``
    and the extracted files are written to ``c['files_dir']``; the single
    action calls ``_extract`` with those two paths.
    """
    print('calling task_extract_data in stackoverflow.py!')
    source_dir = "%s/%s" % (c['raw_data_dir'], "data")
    target_dir = c['files_dir']
    return {
        'targets': [target_dir],
        'actions': [(_extract, [source_dir, target_dir])],
        # Cleaning removes the whole output directory.
        'clean': ['rm -rf ' + target_dir],
    }
def _extract(data_dir, result_dir):
    """Strip the HTML from every scraped question and answer under data_dir.

    ``data_dir`` is expected to contain one sub-directory per user, each
    holding a ``questions`` and/or an ``answers`` directory of HTML files.
    The plain text of every file is written into ``result_dir`` under a
    numeric file name.  One counter is shared across all users and both
    kinds of post, so no document overwrites another.

    Args:
        data_dir: directory whose immediate sub-directories are user dirs.
        result_dir: directory that receives the extracted text files.

    Returns:
        None.
    """
    print('getting stack overflow data! woot woot')
    print(data_dir)
    # Only the first level of the walk is needed: its directory list is the
    # set of user dirs.  Sorted so the output numbering is reproducible.
    user_dirs = sorted(next(os.walk(data_dir))[1])
    print(user_dirs)
    counter = 0
    for progress_counter, user in enumerate(user_dirs, 1):
        print(user)
        if user == '.':
            continue
        user_dir = os.path.join(data_dir, user)
        for q_or_a in ('questions', 'answers'):
            # Passing the running total keeps output names unique across
            # users and across questions/answers.
            counter += _clean_questions_and_answers(
                user_dir, q_or_a, result_dir, counter)
        print('Done with extracting stuff for user %d of %d'
              % (progress_counter, len(user_dirs)))


def _clean_questions_and_answers(base_dir, q_or_a, output_dir, start_index=0):
    """Write the text content of each HTML file in one posts directory.

    Args:
        base_dir: a single user's directory.
        q_or_a: name of the sub-directory to process ('questions' or
            'answers').
        output_dir: directory that receives the extracted text files.
        start_index: number used to name the first output file; later files
            count up from it.  Defaults to 0.

    Returns:
        The number of files written (0 when the sub-directory is missing).
    """
    source_dir = os.path.join(base_dir, q_or_a)
    # A user may have only questions or only answers; that is not an error.
    if not os.path.isdir(source_dir):
        return 0
    num_files_cleaned = 0
    for dat in sorted(os.listdir(source_dir)):
        with open(os.path.join(source_dir, dat), 'r') as f:
            text = f.read()
        # NOTE: no parser is named, so bs4 picks the best one installed and
        # the extracted text can differ slightly between machines.
        soup = BeautifulSoup(text)
        out_name = str(start_index + num_files_cleaned)
        w = create_dirs_and_open(os.path.join(output_dir, out_name))
        try:
            w.write(soup.get_text())
        finally:
            # Close the handle even when the write fails.
            w.close()
        num_files_cleaned += 1
    return num_files_cleaned
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment