sa scraping (Seeking Alpha news scraper)
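# Scrape Seeking Alpha news headlines for a list of tickers.
# grab_all() downloads each ticker's paginated news pages (fanned out to
# kitchensink workers) into downloads/<ticker>/<page>.html plus a .pkl of
# request metadata; process_all() parses the saved pages and stores the
# records in a bsddb hash file, data.db, keyed by ticker.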
import requests
import sys
import datetime as dt
import os
from os.path import exists, join
import cPickle as pickle
from pyquery import PyQuery as PQ
from dateutil.parser import parse as _dparse
def dparse(input_string):
    try:
        return _dparse(input_string)
    except ValueError:
        raise Exception("could not parse %s" % input_string)
def last_page(data, pq=None):
    if pq is None:
        pq = PQ(data)
    last_page = PQ(pq('#paging')('li:not(.next)')('li:not(.dots)')[-1]).text()
    last_page = int(last_page)
    return last_page
def _pull(url, basedir, prefix):
    # record request start/end times so relative dates ("Today"/"Yesterday")
    # can be resolved later
    st = dt.datetime.now().isoformat()
    user_agent = {'User-agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=user_agent)
    if resp.status_code != 200:
        raise Exception(url + resp.content)
    data = resp.content
    ed = dt.datetime.now().isoformat()
    meta = {'url': url, 'st': st, 'ed': ed}
    with open(join(basedir, '%s.html' % prefix), "w+") as f:
        f.write(data)
    with open(join(basedir, '%s.pkl' % prefix), "wb") as f:
        pickle.dump(meta, f, -1)
    return data
def pull(ticker, basedir):
    print '***TICKER', ticker
    path = join(basedir, ticker)
    if not exists(path):
        os.makedirs(path)
    if exists(join(path, 'complete')):
        print 'already complete', ticker
        return
    basedir = path
    url = "http://seekingalpha.com/symbol/%s/news/%s" % (ticker, 1)
    data = _pull(url, basedir, str(1))
    idx = last_page(data)
    print 'last_page', idx
    for c in range(2, idx + 1):
        print 'page', c
        url = "http://seekingalpha.com/symbol/%s/news/%s" % (ticker, c)
        _pull(url, basedir, str(c))
    # marker file so completed tickers are skipped on re-runs
    with open(join(path, 'complete'), "w+") as f:
        f.write('complete')
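# Date strings on the news pages are either relative ("Today, ..." /
# "Yesterday, ...", resolved against the request timestamps recorded in
# meta) or absolute strings handed straight to dateutil.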
def parse_date(input_string, meta):
    if 'Today' in input_string or 'Yesterday' in input_string:
        time = input_string.split(",")[-1]
        time = dparse(time).time()
        st = meta['st']
        ed = meta['ed']
        if ed.date() != st.date():
            raise Exception('ambiguous')
        date = st.date()
        if 'Yesterday' in input_string:
            date = date - dt.timedelta(days=1)
        ts = dt.datetime.combine(date, time)
    else:
        # sometimes years aren't specified, but
        # dateutil defaults to 2014 which is correct
        ts = dparse(input_string)
    return ts
def parse(data, meta):
    d = PQ(data)
    articles = [PQ(x) for x in d('.mc_list_texting')]
    data = []
    for a in articles:
        a = a.text()
        date_string = a.rsplit("|")[-2]
        ts = parse_date(date_string, meta)
        temp = meta.copy()
        temp.update({'text': a,
                     'ts': ts})
        data.append(temp)
    return data
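# Downloads are fanned out with kitchensink: judging by the usage below,
# c.bc() queues a remote call, c.execute() dispatches the queued batch,
# and c.br() collects the results.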
def grab_all(tickers):
    ### pulling down data
    import logging
    from os.path import exists, join, abspath
    from kitchensink import setup_client, client, do, du, dp
    setup_client('http://power:6323/')
    c = client()
    logging.basicConfig(level=logging.INFO)
    basedir = abspath('downloads')
    def helper(ticker, basedir):
        try:
            pull(ticker, basedir)
        except Exception as e:
            logging.exception(e)
    for t in tickers:
        if '#' in t:
            # skip entries marked with '#'
            continue
        if exists(join(basedir, t)):
            continue
        c.bc(helper, t, basedir)
    c.execute()
    c.br()
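# Parse the downloaded pages (again distributed via kitchensink), saving each
# ticker's parsed records as a remote data object under parsing/ and writing
# the serialized results into the local data.db bsddb file, keyed by ticker.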
def process_all(tickers):
    import cPickle as pickle
    from os.path import join, splitext, abspath, exists
    import os
    import bsddb
    from kitchensink import setup_client, client, do, du, dp
    setup_client('http://power:6323/')
    c = client()
    c.reducetree('*parsing*')
    db = bsddb.hashopen("data.db", "w")
    basedir = abspath('downloads')
    def helper(path):
        html_contents = os.listdir(path)
        html_contents = [splitext(x)[0] for x in html_contents if x.endswith('html')]
        vals = []
        for page in html_contents:
            html_path = join(path, page + ".html")
            pkl_path = join(path, page + ".pkl")
            with open(pkl_path) as f:
                meta = pickle.load(f)
            meta['st'] = dparse(meta['st'])
            meta['ed'] = dparse(meta['ed'])
            with open(html_path) as f:
                data = f.read()
            vals += parse(data, meta)
        obj = do(vals)
        obj.save(prefix="parsing/")
        return obj
    result_tickers = []
    for t in tickers:
        print t
        path = join(basedir, t)
        if not exists(path):
            continue
        if not exists(join(path, 'complete')):
            continue
        result_tickers.append(t)
        c.bc(helper, path)
    c.execute()
    results = c.br()
    for t, r in zip(result_tickers, results):
        print t
        db[t] = r.raw()
    db.sync()
if __name__ == "__main__":
    with open("tickers.pkl", "rb") as f:
        tickers = pickle.load(f)
    #grab_all(tickers)
    process_all(tickers)
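The script expects a tickers.pkl file alongside it holding a pickled list of ticker symbols (entries containing '#' are skipped). A minimal sketch of producing that file, assuming it is nothing more than a pickled list of strings; the symbols below are placeholders:

import cPickle as pickle

# hypothetical symbols -- replace with the real list to scrape
tickers = ["AAPL", "MSFT", "GOOG"]
with open("tickers.pkl", "wb") as f:
    pickle.dump(tickers, f, -1)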