Skip to content

Instantly share code, notes, and snippets.

@ccerv1
ccerv1 / usda_coffee.py
Created January 27, 2019 16:58
Create a dataframe with the latest USDA coffee statistics
import pandas as pd
import requests, zipfile, io
# requests the data from the USDA url
response = requests.get('https://apps.fas.usda.gov/psdonline/downloads/psd_coffee_csv.zip')
# the response is a zipfile, which we unzip in memory rather than saving to the disk
zf = zipfile.ZipFile(io.BytesIO(response.content))
usda = pd.read_csv(zf.open('psd_coffee.csv'))
@ccerv1
ccerv1 / usda_country_query.py
Last active January 27, 2019 17:21
Queries the database for the data on a single country
def country_query(country, year_type='Market_Year', attribute=''):
df = usda[usda['Country_Name'] == country]
if attribute:
df = df[df['Attribute_Description'] == attribute]
return df.pivot(
index='Attribute_Description',
columns=year_type,
values='Value'
)
@ccerv1
ccerv1 / usda_attribute_query.py
Created January 27, 2019 17:30
Queries the database for the data on a single attribute
def attribute_query(attribute, year_type='Market_Year', countries=[]):
df = usda[usda['Attribute_Description'] == attribute]
if countries:
df = df[df['Country_Name'].isin(countries)]
return df.pivot(
index='Country_Name',
columns=year_type,
values='Value'
)
@ccerv1
ccerv1 / usda_chart_assums.py
Created February 16, 2019 11:57
Dependencies and assumptions for our analysis
%matplotlib inline
import matplotlib.pyplot as plt
# Some constant assumptions we'll use in our analysis
YEARS_IN_SERIES = 25
MAJOR_PRODUCER_THRESHOLD = 5000 # thousands of bags
MIDSIZE_PRODUCER_THRESHOLD = 500 # thousands of bags
@ccerv1
ccerv1 / is_producing_above.py
Created February 16, 2019 12:00
Module to decide whether a country belongs in a list of 'major', 'midsize', or 'minor' producers
def is_producing_above(country_data, threshold_value, num_years_back=5):
recent_production = country_data.iloc[0,-num_years_back:]
production_peak = recent_production.mean()
return production_peak > threshold_value
list_of_all_countries = usda['Country_Name'].unique()
major_producers = []
midsize_producers = []
minor_producers = []
@ccerv1
ccerv1 / other_producer_volumes.py
Created February 16, 2019 12:02
Put the volumes from smaller producing countries into a single table and plot it
minor_producers_volumes = attribute_query('Production', countries=minor_producers).sum()
midsize_producers_volumes = attribute_query('Production', countries=midsize_producers).sum()
other_producers_volumes = pd.DataFrame(
data=[minor_producers_volumes, midsize_producers_volumes],
index=['Minor producers', 'Mid-size producers']
)
other_producers_volumes.T.plot(kind='area', stacked=True, figsize=(12,8), alpha=.4)
@ccerv1
ccerv1 / major_producers.py
Created February 16, 2019 12:05
Plot the production of major coffee producing countries
major_producers_volumes = attribute_query('Production', countries=major_producers)
major_producers_volumes = major_producers_volumes.sort_values(2019)
major_producers_volumes.T.plot(kind='area', stacked=True, figsize=(12,8), alpha=.4)
@ccerv1
ccerv1 / production_table.py
Created February 16, 2019 12:11
Put our tables together, reduce the timeframe, and transpose
production_table = pd.concat([other_producers_volumes, major_producers_volumes], axis=0)
production_table = production_table.iloc[:,-YEARS_IN_SERIES:]
production_table = production_table.T
production_table
@ccerv1
ccerv1 / pretty_production_chart_v1.py
Created February 16, 2019 12:16
Chart of coffee production by origin
production_table /= 1000 # change to millions instead of thousands of bags
fig, ax = plt.subplots()
production_table.plot(
kind='area',
stacked=True,
figsize=(12,8),
alpha=0.4,
colormap='Blues',
legend=False,
@ccerv1
ccerv1 / pretty_production_chart_v2.py
Created February 16, 2019 12:18
Updated chart with annotated labels
labels = []
y_vals = []
cum_sum = 0
for col_num, country_name in enumerate(production_table.columns):
labels.append(country_name)
prod_val = production_table.iloc[-1,col_num]
y_vals.append((cum_sum * 2 + prod_val)/2)
cum_sum += prod_val
for i, label in enumerate(labels):