Created July 18, 2014 02:04
Reads HTML files containing WSJ articles mentioning S&P 500 companies and saves metadata.
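For each ticker, the script writes two CSV files to CLEAN_DATA_DIR: `<ticker>-search-docinfo.csv` with columns (id, docid, date, wordcount, country, page, section), and `<ticker>-search-subjects.csv` with columns (id, docid, date, subject), one row per subject tag.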
### NOTE: this is a Python 2 script (print statements, 'wb' csv mode).
import csv
import os
import re
from bs4 import BeautifulSoup
RAW_DATA_DIR = "/Users/alexchinco/Dropbox/Summer Work 2/"
CLEAN_DATA_DIR = "/Users/alexchinco/Dropbox/research/absolutely_small/data/wsj_data2/"
### Each regex pulls one metadata field out of the ProQuest result markup.
dateFinder = re.compile('Publication date: </strong>(.*)</p>')
subjectFinder = re.compile('Subject: </strong>(.*)</p>')
wordcountFinder = re.compile('<strong>Full text: </strong>(.*)">')
pageFinder = re.compile('<strong>Pages: </strong>(.*)</p>')
countryFinder = re.compile('<strong>Country of publication: </strong>(.*)</p>')
sectionFinder = re.compile('<strong>Publication subject: </strong>(.*)</p>')
locationFinder = re.compile('Location: </strong>(.*)</p>')
companyFinder = re.compile('Company / organization: </strong>(.*)</p>')
docidFinder = re.compile('ProQuest document ID: </strong>(.*)</p>')

### The metadata <p> tags are located by their exact inline style; the
### full-text line uses a slightly different one (padding vs. padding-left).
METADATA_P_STYLE = "margin-bottom:5pt; margin-top:0; margin-right:0; margin-left:0; padding-left:0;"
FULLTEXT_P_STYLE = "margin-bottom:5pt; margin-top:0; margin-right:0; margin-left:0; padding:0;"
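### For reference, a hypothetical metadata line in the ProQuest HTML export
### (inferred from the patterns above, not taken from an actual file):
###
###   <p style="..."><strong>Publication date: </strong>Jul 19, 2014</p>
###
### so re.findall(dateFinder, str(elem)) would return ['Jul 19, 2014'].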
### The first os.walk() result lists the ticker sub-directories sitting
### directly under RAW_DATA_DIR.
tickerList = next(os.walk(RAW_DATA_DIR))[1]
print tickerList
print " "
print " "
print " "
print " "
print " "
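### Assumed directory layout (one folder of saved ProQuest search-result
### pages per ticker): RAW_DATA_DIR/<TICKER>/<saved-search-page>.html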
for ticker in tickerList:
    ### Open one document-info file and one subject file per ticker.
    DOCINFO_FILE_NAME = ticker.lower() + "-search-docinfo.csv"
    docInfoFile = open(CLEAN_DATA_DIR + DOCINFO_FILE_NAME, 'wb')
    docInfoWriter = csv.writer(docInfoFile, delimiter = ",")
    docInfoWriter.writerow(["id", "docid", "date", "wordcount", "country", "page", "section"])

    SUBJECT_FILE_NAME = ticker.lower() + "-search-subjects.csv"
    subjectFile = open(CLEAN_DATA_DIR + SUBJECT_FILE_NAME, 'wb')
    subjectWriter = csv.writer(subjectFile, delimiter = ",")
    subjectWriter.writerow(["id", "docid", "date", "subject"])

    TICKER_DIR = ticker + '/'
    print ticker

    fileList = [f for f in os.listdir(RAW_DATA_DIR + TICKER_DIR)
                if os.path.isfile(os.path.join(RAW_DATA_DIR + TICKER_DIR, f))]
    print fileList

    articleCount = 0
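    ### NOTE: findAll() matches the style attribute as an exact string, so
    ### this parser assumes the saved pages preserve ProQuest's inline CSS
    ### verbatim; any change in the export markup breaks the extraction.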
    for f in fileList:
        soup = BeautifulSoup(open(RAW_DATA_DIR + TICKER_DIR + str(f)))

        ### Each search-result page wraps one article per <div> with this style.
        articleList = soup.findAll("div", {"style": "margin-bottom:20px;border-bottom:2px solid #ccc;padding-bottom:5px"})
        for article in articleList:
            articleId = articleCount

            ### Publication date: keep only the first match.
            date = "na"
            for elem in article.findAll("p", {"style": METADATA_P_STYLE}):
                if (date == "na"):
                    item = re.findall(dateFinder, str(elem))
                    if (item != []):
                        date = str(item[0])

            ### ProQuest document ID.
            docid = "na"
            for elem in article.findAll("p", {"style": METADATA_P_STYLE}):
                item = re.findall(docidFinder, str(elem))
                if (item != []):
                    docid = str(item[0])
            ### Write the document-info row.
            wordcount = "na"
            for elem in article.findAll("p", {"style": FULLTEXT_P_STYLE}):
                try:
                    item = re.findall(wordcountFinder, str(elem))
                    if (item != []):
                        ### The count sits inside the full-text link markup;
                        ### grab the fourth quote-delimited token.
                        wordcount = str(item[0]).split('"')[3]
                except Exception:
                    print "Word Count Read Error!"

            country = "na"
            for elem in article.findAll("p", {"style": METADATA_P_STYLE}):
                item = re.findall(countryFinder, str(elem))
                if (item != []):
                    country = str(item[0])

            page = "na"
            for elem in article.findAll("p", {"style": METADATA_P_STYLE}):
                item = re.findall(pageFinder, str(elem))
                if (item != []):
                    page = str(item[0])

            section = "na"
            for elem in article.findAll("p", {"style": METADATA_P_STYLE}):
                item = re.findall(sectionFinder, str(elem))
                if (item != []):
                    section = str(item[0])

            docInfoWriter.writerow([articleId, docid, date, wordcount, country, page, section])
            ### Write the subject rows: one row per semicolon-separated tag.
            for elem in article.findAll("p", {"style": METADATA_P_STYLE}):
                item = re.findall(subjectFinder, str(elem))
                if (item != []):
                    item = str(item[0])
                    ### str(elem) yields escaped HTML, so '&' arrives as '&amp;'.
                    item = item.replace("&amp;", "and")
                    item = item.replace("Editorials -- ", "")
                    for i in item.split(";"):
                        subjectWriter.writerow([articleId, docid, date, i.strip()])

            articleCount = articleCount + 1

    docInfoFile.close()
    subjectFile.close()
    print " "
    print " "
    print " "
    print " "
    print " "