@hhuuggoo
Created October 7, 2014 01:49
sa scraping
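# Scrape Seeking Alpha news listings per ticker: download every page of
# /symbol/<ticker>/news into downloads/<ticker>/, then parse out article
# text and timestamps and store the results in a bsddb hash file. Work is
# fanned out across a kitchensink cluster.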
import requests
import sys
import datetime as dt
import os
from os.path import exists, join
import cPickle as pickle
from pyquery import PyQuery as PQ
from dateutil.parser import parse as _dparse
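
# Wrap dateutil's parser so a failed parse reports the offending string.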
def dparse(input_string):
    try:
        return _dparse(input_string)
    except ValueError:
        raise Exception("could not parse %s" % input_string)
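
# Read the highest page number out of the '#paging' widget, skipping the
# 'next' arrow and the '...' placeholder entries.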
def last_page(data, pq=None):
    if pq is None:
        pq = PQ(data)
    last = PQ(pq('#paging')('li:not(.next)')('li:not(.dots)')[-1]).text()
    return int(last)
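
# Fetch one URL and write the raw HTML plus a pickled metadata record
# (url and start/end timestamps) under basedir as <prefix>.html/.pkl.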
def _pull(url, basedir, prefix):
    st = dt.datetime.now().isoformat()
    user_agent = {'User-agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=user_agent)
    if resp.status_code != 200:
        raise Exception(url + resp.content)
    data = resp.content
    ed = dt.datetime.now().isoformat()
    meta = {'url': url, 'st': st, 'ed': ed}
    with open(join(basedir, '%s.html' % prefix), "w+") as f:
        f.write(data)
    with open(join(basedir, '%s.pkl' % prefix), "w+") as f:
        pickle.dump(meta, f, -1)
    return data
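
# Download every news page for a ticker into downloads/<ticker>/, writing a
# 'complete' marker file once all pages are saved.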
def pull(ticker, basedir):
    print '***TICKER', ticker
    path = join(basedir, ticker)
    if not exists(path):
        os.makedirs(path)
    if exists(join(path, 'complete')):
        print 'already complete', ticker
        return
    basedir = path
    url = "http://seekingalpha.com/symbol/%s/news/%s" % (ticker, 1)
    data = _pull(url, basedir, str(1))
    idx = last_page(data)
    print 'last_page', idx
    for c in range(2, idx + 1):
        print 'page', c
        url = "http://seekingalpha.com/symbol/%s/news/%s" % (ticker, c)
        _pull(url, basedir, str(c))
    with open(join(path, 'complete'), "w+") as f:
        f.write('complete')
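
# Turn a Seeking Alpha date string into a datetime. 'Today'/'Yesterday'
# entries are resolved against the download timestamps in meta; absolute
# dates go straight to dateutil.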
def parse_date(input_string, meta):
    if 'Today' in input_string or 'Yesterday' in input_string:
        time = input_string.split(",")[-1]
        time = dparse(time).time()
        st = meta['st']
        ed = meta['ed']
        if ed.date() != st.date():
            raise Exception('ambiguous')
        date = st.date()
        if 'Yesterday' in input_string:
            date = date - dt.timedelta(days=1)
        ts = dt.datetime.combine(date, time)
    else:
        # sometimes years aren't specified, but
        # dateutil defaults to 2014, which is correct
        ts = dparse(input_string)
    return ts
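
# Extract the article snippets from one saved HTML page; each result is the
# page's metadata dict plus the article text and parsed timestamp.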
def parse(data, meta):
    d = PQ(data)
    articles = [PQ(x) for x in d('.mc_list_texting')]
    rows = []
    for a in articles:
        text = a.text()
        date_string = text.rsplit("|")[-2]
        ts = parse_date(date_string, meta)
        temp = meta.copy()
        temp.update({'text': text, 'ts': ts})
        rows.append(temp)
    return rows
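
# Stage 1: queue a download job per ticker on the kitchensink cluster,
# skipping comment lines and tickers that already have a download directory.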
def grab_all(tickers):
    ### pulling down data
    import logging
    from os.path import join, abspath, exists
    from kitchensink import setup_client, client
    setup_client('http://power:6323/')
    c = client()
    logging.basicConfig(level=logging.INFO)
    basedir = abspath('downloads')

    def helper(ticker, basedir):
        try:
            pull(ticker, basedir)
        except Exception as e:
            logging.exception(e)

    for t in tickers:
        if '#' in t:
            continue
        if exists(join(basedir, t)):
            continue
        c.bc(helper, t, basedir)
    c.execute()
    c.br()
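
# Stage 2: parse every completed download on the cluster, save the parsed
# records under the 'parsing/' prefix, and mirror them into a local bsddb
# keyed by ticker.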
def process_all(tickers):
    import cPickle as pickle
    from os.path import join, splitext, abspath, exists
    import os
    import bsddb
    from kitchensink import setup_client, client, do
    setup_client('http://power:6323/')
    c = client()
    c.reducetree('*parsing*')
    db = bsddb.hashopen("data.db", "w")
    basedir = abspath('downloads')

    def helper(path):
        html_contents = os.listdir(path)
        html_contents = [splitext(x)[0] for x in html_contents if x.endswith('html')]
        vals = []
        for name in html_contents:
            html_path = join(path, name + ".html")
            pkl_path = join(path, name + ".pkl")
            with open(pkl_path) as f:
                meta = pickle.load(f)
            meta['st'] = dparse(meta['st'])
            meta['ed'] = dparse(meta['ed'])
            with open(html_path) as f:
                data = f.read()
            vals += parse(data, meta)
        obj = do(vals)
        obj.save(prefix="parsing/")
        return obj

    result_tickers = []
    for t in tickers:
        print t
        path = join(basedir, t)
        if not exists(path):
            continue
        if not exists(join(path, 'complete')):
            continue
        result_tickers.append(t)
        c.bc(helper, path)
    c.execute()
    results = c.br()
    for t, r in zip(result_tickers, results):
        print t
        db[t] = r.raw()
    db.sync()
if __name__ == "__main__":
    with open("tickers.pkl", "rb") as f:
        tickers = pickle.load(f)
    #grab_all(tickers)
    process_all(tickers)