$ brew install apache-spark
A Python shell with a preconfigured SparkContext (available as `sc`). It is
import collections | |
def update(d, other): | |
"""Recursively merge or update dict-like objects. | |
>>> from pprint import pprint | |
>>> pprint(update({'k1': {'k2': 2}}, {'k1': {'k2': {'k3': 3}}, 'k4': 4})) | |
{'k1': {'k2': {'k3': 3}}, 'k4': 4} | |
>>> pprint(update({'k1': {'k2': 2}}, {'k1': {'k3': 3}})) | |
{'k1': {'k2': 2, 'k3': 3}} | |
>>> pprint(update({'k1': {'k2': 2}}, dict())) |
from bokeh.plotting import figure, ColumnDataSource | |
from bokeh.models import HoverTool | |
def scatter_with_hover(df, x, y, | |
fig=None, cols=None, name=None, marker='x', | |
fig_width=500, fig_height=500, **kwargs): | |
""" | |
Plots an interactive scatter plot of `x` vs `y` using bokeh, with automatic | |
tooltips showing columns from `df`. |
import signal | |
class GracefulInterruptHandler(object): | |
def __init__(self, sig=signal.SIGINT): | |
self.sig = sig | |
def __enter__(self): | |
self.interrupted = False |
import json | |
import boto3 | |
# Download an object from S3 and decode its contents as JSON.
# NOTE: `bucket` and `key` are not defined in this snippet; they must already
# be in scope when it runs.
s3 = boto3.resource('s3')
obj = s3.Object(bucket, key)
response = obj.get()
# The response's 'Body' entry is read in full before decoding.
data = response['Body'].read()
d = json.loads(data)
def chunk_query(sql, cursor, chunksize=10):
    """Execute `sql` on `cursor` and yield the result rows in batches.

    Each yielded item is exactly what ``cursor.fetchmany(chunksize)``
    returned. Iteration ends at the first empty (falsy) batch. Because this
    is a generator, the query is not executed until the first item is
    requested.
    """
    cursor.execute(sql)
    batch = cursor.fetchmany(chunksize)
    while batch:
        yield batch
        batch = cursor.fetchmany(chunksize)
def iquery(sql, cursor, chunksize=10): |
from MySQLdb.cursors import SSDictCursor | |
def iterate_query(query, connection, arraysize=1): | |
c = connection.cursor(cursorclass=SSDictCursor) | |
c.execute(query) | |
while True: | |
nextrows = c.fetchmany(arraysize) | |
if not nextrows: | |
break |
import pandas as pd | |
# One row per (year, manager) pair with that period's return in percent.
records = [
    ['2012', 'A', 1],
    ['2012', 'B', 4],
    ['2011', 'A', 5],
    ['2011', 'B', 4],
]
df = pd.DataFrame(records, columns=['year', 'manager', 'return_pct'])

# Compound each manager's periodic returns: the last cumulative product of
# (1 + pct/100) is the growth factor over all of that manager's rows, and
# transform() broadcasts it back onto every row of the group.
by_manager = df.groupby('manager')['return_pct']
growth = by_manager.transform(lambda pct: (1 + pct / 100.).cumprod().iat[-1])
df['total_return'] = growth - 1

# Dense ranking, highest total return first (ties share a rank, no gaps).
df['ranking'] = df['total_return'].rank(ascending=False, method='dense')
#!/usr/bin/env python | |
from gevent import monkey | |
monkey.patch_all() # Patch everything | |
import gevent | |
import time | |
class Hub(object): | |
"""A simple reactor hub... In async!""" |