Skip to content

Instantly share code, notes, and snippets.

@jergason
Created March 8, 2012 17:22
Show Gist options
  • Select an option

  • Save jergason/2002189 to your computer and use it in GitHub Desktop.

Select an option

Save jergason/2002189 to your computer and use it in GitHub Desktop.
stack overflow data formatting task
# Stack Overflow build script
import codecs
import os
import re
import json
from nltk.tokenize import TreebankWordTokenizer
from bs4 import BeautifulSoup
from build import create_dirs_and_open
from topic_modeling import anyjson
from backend import config as c
# Dataset-specific settings written into the configuration object imported
# from `backend`. These assignments run at import time.
# Number of topics requested for this dataset.
c['num_topics'] = 100
# Machine-friendly dataset identifier.
c['dataset_name'] = 'stack_overflow'
# Human-readable dataset label.
c['dataset_readable_name'] = 'Stack Overflow'
def task_extract_data():
    """Describe the task that extracts the raw Stack Overflow data.

    Reads ``raw_data_dir`` and ``files_dir`` from the imported config ``c``
    and returns a dict with ``targets``, ``actions`` and ``clean`` entries
    (presumably consumed by a doit-style task runner -- confirm against the
    build framework).

    Returns:
        dict: ``targets`` lists the destination directory, ``actions`` calls
        ``_extract(data_dir, dest_dir)``, and ``clean`` removes the
        destination directory with ``rm -rf``.
    """
    print('calling task_extract_data in stackoverflow.py!')
    source_dir = "%s/%s" % (c['raw_data_dir'], "data")
    output_dir = c['files_dir']
    return {
        'targets': [output_dir],
        'actions': [(_extract, [source_dir, output_dir])],
        'clean': ['rm -rf ' + output_dir],
    }
def _extract(data_dir, result_dir):
    """Extract plain text from every user's question and answer HTML files.

    Each immediate sub-directory of ``data_dir`` is treated as one user and
    is expected to contain ``questions`` and ``answers`` sub-directories.
    The text of every file found there is written to ``result_dir`` under a
    running document number that is unique across all users.

    Args:
        data_dir: Directory holding one sub-directory per user.
        result_dir: Directory that receives the extracted text files.

    Returns:
        None.
    """
    print('getting stack overflow data! woot woot')
    # Running number of documents written so far; doubles as the next free
    # output file name so that files from different users never collide.
    counter = 0
    # TODO: Do some other kind of tokenization and cleanup?
    print(data_dir)
    # The builtin next() works on Python 2.6+ and 3; the generator's
    # .next() method exists on Python 2 only.
    user_dirs = next(os.walk(data_dir))[1]
    print(user_dirs)
    progress_counter = 0
    for user in user_dirs:
        print(user)
        if user == '.':
            continue
        user_dir = os.path.join(data_dir, user)
        print(os.listdir(os.path.join(user_dir, 'questions')))
        counter += _clean_questions_and_answers(user_dir, 'questions', result_dir, counter)
        counter += _clean_questions_and_answers(user_dir, 'answers', result_dir, counter)
        progress_counter += 1
        # Use % formatting; the original passed the tuple as a second print
        # argument, so the placeholders were never filled in.
        print('Done with extracting stuff for user %d of %d' % (progress_counter, len(user_dirs)))


def _clean_questions_and_answers(base_dir, q_or_a, output_dir, start_index=0):
    """Strip the HTML from every file in ``base_dir/q_or_a``.

    Each input file is parsed with BeautifulSoup and its text content is
    written to ``output_dir`` in a file named after a running integer.

    Args:
        base_dir: Directory of a single user.
        q_or_a: Name of the sub-directory to process (the caller passes
            ``'questions'`` or ``'answers'``).
        output_dir: Directory that receives the extracted text files.
        start_index: Number used to name the first output file. Defaults to
            0, which reproduces the original naming; callers processing
            several directories into one ``output_dir`` should pass the
            count of files already written so earlier output is not
            overwritten.

    Returns:
        int: The number of files processed by this call.
    """
    source_dir = os.path.join(base_dir, q_or_a)
    num_files_cleaned = 0
    for dat in os.listdir(source_dir):
        with open(os.path.join(source_dir, dat), 'r') as f:
            text = f.read()
        soup = BeautifulSoup(text)
        out_name = str(start_index + num_files_cleaned)
        w = create_dirs_and_open(os.path.join(output_dir, out_name))
        try:
            w.write(soup.get_text())
        finally:
            # Always release the handle, even if the write fails.
            w.close()
        num_files_cleaned += 1
    return num_files_cleaned
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment