@alexchinco · Created July 18, 2014
Reads HTML files containing WSJ articles mentioning S&P 500 companies and saves metadata.
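# NOTE: This is a Python 2 script (print statements, csv files opened in
# 'wb' mode). It assumes the raw data are search-result pages saved from
# ProQuest as static HTML; the inline-style selectors and regexes below are
# tied to that page layout and will need updating if the layout changes.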
import csv
import os
import re
from bs4 import BeautifulSoup
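# RAW_DATA_DIR holds one subdirectory per ticker, each containing the saved
# HTML pages; per-ticker CSVs are written to CLEAN_DATA_DIR. Both paths are
# machine-specific and should be edited before running.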
RAW_DATA_DIR = "/Users/alexchinco/Dropbox/Summer Work 2/"
CLEAN_DATA_DIR = "/Users/alexchinco/Dropbox/research/absolutely_small/data/wsj_data2/"
dateFinder = re.compile('Publication date: </strong>(.*)</p>')
subjectFinder = re.compile('Subject: </strong>(.*)</p>')
wordcountFinder = re.compile('<strong>Full text: </strong>(.*)">')
pageFinder = re.compile('<strong>Pages: </strong>(.*)</p>')
countryFinder = re.compile('<strong>Country of publication: </strong>(.*)</p>')
sectionFinder = re.compile('<strong>Publication subject: </strong>(.*)</p>')
locationFinder = re.compile('Location: </strong>(.*)</p>')
companyFinder = re.compile('Company / organization: </strong>(.*)</p>')
docidFinder = re.compile('ProQuest document ID: </strong>(.*)</p>')
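# Each pattern captures one metadata field from a ProQuest result block,
# e.g. a fragment like (illustrative, not taken from an actual page):
#   <strong>Publication date: </strong>Jul 17, 2014</p>
# wordcountFinder instead captures the full-text link markup, so the word
# count has to be split back out of its quoted attributes below.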
# The immediate subdirectories of RAW_DATA_DIR are the ticker folders.
tickerList = next(os.walk(RAW_DATA_DIR))[1]
print tickerList
print " "
for ticker in tickerList:

    # Document-info CSV: one row per article.
    DOCINFO_FILE_NAME = ticker.lower() + "-search-docinfo.csv"
    docInfoFile = open(CLEAN_DATA_DIR + DOCINFO_FILE_NAME, 'wb')
    docInfoWriter = csv.writer(docInfoFile, delimiter = ",")
    docInfoWriter.writerow(["id", "docid", "date", "wordcount", "country", "page", "section"])

    # Subject CSV: one row per (article, subject tag) pair.
    SUBJECT_FILE_NAME = ticker.lower() + "-search-subjects.csv"
    subjectFile = open(CLEAN_DATA_DIR + SUBJECT_FILE_NAME, 'wb')
    subjectWriter = csv.writer(subjectFile, delimiter = ",")
    subjectWriter.writerow(["id", "docid", "date", "subject"])

    TICKER_DIR = ticker + '/'
    print ticker

    fileList = [f for f in os.listdir(RAW_DATA_DIR + TICKER_DIR)
                if os.path.isfile(os.path.join(RAW_DATA_DIR + TICKER_DIR, f))]
    print fileList
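    # Parse every article block on every saved page for this ticker.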
    articleCount = 0
    for f in fileList:
        soup = BeautifulSoup(open(RAW_DATA_DIR + TICKER_DIR + str(f)))
        # Each search result sits in a <div> with this exact inline style.
        articleList = soup.findAll("div", {"style": "margin-bottom:20px;border-bottom:2px solid #ccc;padding-bottom:5px"})
        for article in articleList:
            articleId = articleCount

            # Publication date: keep the first match only.
            date = "na"
            for elem in article.findAll("p", {"style": "margin-bottom:5pt; margin-top:0; margin-right:0; margin-left:0; padding-left:0;"}):
                if (date == "na"):
                    item = re.findall(dateFinder, str(elem))
                    if (item != []):
                        date = str(item[0])

            # ProQuest document ID.
            docid = "na"
            for elem in article.findAll("p", {"style": "margin-bottom:5pt; margin-top:0; margin-right:0; margin-left:0; padding-left:0;"}):
                item = re.findall(docidFinder, str(elem))
                if (item != []):
                    docid = str(item[0])
            ### Write document file
            # The word count is buried in the full-text link's attributes, so
            # split on quotes and guard against malformed markup. (Note the
            # different style string: this <p> uses "padding:0" rather than
            # "padding-left:0".)
            wordcount = "na"
            for elem in article.findAll("p", {"style": "margin-bottom:5pt; margin-top:0; margin-right:0; margin-left:0; padding:0;"}):
                try:
                    item = re.findall(wordcountFinder, str(elem))
                    if (item != []):
                        wordcount = str(item[0]).split('"')[3]
                except IndexError:
                    print "Word Count Read Error!"

            country = "na"
            for elem in article.findAll("p", {"style": "margin-bottom:5pt; margin-top:0; margin-right:0; margin-left:0; padding-left:0;"}):
                item = re.findall(countryFinder, str(elem))
                if (item != []):
                    country = str(item[0])

            page = "na"
            for elem in article.findAll("p", {"style": "margin-bottom:5pt; margin-top:0; margin-right:0; margin-left:0; padding-left:0;"}):
                item = re.findall(pageFinder, str(elem))
                if (item != []):
                    page = str(item[0])

            section = "na"
            for elem in article.findAll("p", {"style": "margin-bottom:5pt; margin-top:0; margin-right:0; margin-left:0; padding-left:0;"}):
                item = re.findall(sectionFinder, str(elem))
                if (item != []):
                    section = str(item[0])

            docInfoWriter.writerow([articleId, docid, date, wordcount, country, page, section])
            ### Write subject file
            # The subject field holds semicolon-separated tags; write one row
            # per tag, normalizing "&amp;" and stripping the "Editorials -- "
            # prefix.
            for elem in article.findAll("p", {"style": "margin-bottom:5pt; margin-top:0; margin-right:0; margin-left:0; padding-left:0;"}):
                item = re.findall(subjectFinder, str(elem))
                if (item != []):
                    item = str(item[0])
                    item = item.replace("&amp;", "and")
                    item = item.replace("Editorials -- ", "")
                    for i in item.split(";"):
                        subjectWriter.writerow([articleId, docid, date, i.strip()])

            articleCount = articleCount + 1

    docInfoFile.close()
    subjectFile.close()
print " "
print " "
print " "
print " "
print " "