Skip to content

Instantly share code, notes, and snippets.

@idontgetoutmuch
Last active September 1, 2016 13:15
Show Gist options
  • Save idontgetoutmuch/51c45bb8839afcfe0351389f61ab2c9a to your computer and use it in GitHub Desktop.
Save idontgetoutmuch/51c45bb8839afcfe0351389f61ab2c9a to your computer and use it in GitHub Desktop.
An experiment with data analysis
{-# LANGUAGE OverloadedLabels, TypeOperators, DataKinds, FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
import Labels
import Labels.Cassava
import Data.Vector ( Vector )
import Data.Text ( Text )
import Data.Text.Lazy.Encoding
import qualified Data.Text.Lazy as LT
import Data.Csv hiding ( decodeByName )
import Data.Csv.Streaming
import qualified Data.ByteString.Lazy as LB
import qualified Data.Vector as V
import Data.Map ( Map )
--------------------------------------------------------------------------------
-- Basic
reformat ::
(Has "text" [Char] r, Has "style" [Char] r, Has "indent" Bool r) =>
r -> [Char]
reformat kw = "Style: " ++ get #style kw ++ "\n" ++ (if get #indent kw then " " else "") ++ get #text kw
demo :: ("foo" := (), "bar" := Char)
demo = (#foo := (), #bar := 'a')
--------------------------------------------------------------------------------
-- Consing
consed :: ("mu" := Char, "foo" := ())
consed = cons (#mu := 'x') (#foo := ())
consed' :: ("mu" := Char, "foo" := (), "bar" := Char)
consed' = cons (#mu := 'x') (#foo := (), #bar := 'a')
fi = cons (#mu := 123) (#foo := "hi")
bob = get #mu (set #mu 5 (cons (#blah := [1,2,3]) (cons (#mu := 123) (#foo := "hi"))))
--------------------------------------------------------------------------------
-- Abstraction
increment field record = set field (get field record + 1) record
foo = increment #bar (#bar := 123)
main = do
foo <- fmap decodeByName (LB.readFile "AAPL.csv")
case foo of
Right (header :: Vector Name,
rows :: Records ( "Date" := String
, "High" := Maybe Double
, "Low" := Maybe Double
)
) -> do
putStrLn $ show header
putStrLn $ show rows
Left err -> putStrLn err
main2 = do
foo <- fmap decodeByName (LB.readFile "examples/787338586_T_ONTIME.csv")
case foo of
Left err -> putStrLn err
-- Right (header :: Vector Name, rows :: Records (Map String String)) -> do
-- putStrLn $ show header
-- let header' = V.map (LT.toLower . decodeUtf8 . LB.fromStrict) header
-- putStrLn $ LT.unpack $ LT.concat $ V.toList $
-- V.map (LT.cons '(') $ V.map (LT.cons '"') $
-- V.zipWith LT.append
-- (V.map (\ x -> LT.snoc x '"') header')
-- (V.replicate (V.length header') (LT.pack " := String), "))
Right (header :: Vector Name,
rows :: Records (("FL_DATE" := String),
-- ("unique_carrier" := String),
-- ("airline_id" := String),
-- ("tail_num" := String),
-- ("fl_num" := String),
-- ("origin_airport_id" := String),
-- ("origin_airport_seq_id" := String),
-- ("origin_city_market_id" := String),
-- ("origin" := String),
-- ("origin_city_name" := String),
-- ("origin_state_nm" := String),
-- ("dest_airport_id" := String),
-- ("dest_airport_seq_id" := String),
-- ("dest_city_market_id" := String),
-- ("dest" := String),
-- ("dest_city_name" := String),
-- ("dest_state_nm" := String),
-- ("crs_dep_time" := String),
("DEP_TIME" := String),
-- ("dep_delay" := String),
-- ("taxi_out" := String),
-- ("wheels_off" := String),
-- ("wheels_on" := String),
-- ("taxi_in" := String),
-- ("crs_arr_time" := String),
-- ("arr_time" := String),
-- ("arr_delay" := String),
-- ("cancelled" := String),
-- ("cancellation_code" := String),
-- ("diverted" := String),
-- ("distance" := String),
-- ("carrier_delay" := String),
-- ("weather_delay" := String),
-- ("nas_delay" := String),
-- ("security_delay" := String),
("LATE_AIRCRAFT_DELAY" := String))) ->
do let (Cons x _) = rows
putStrLn $ show x
import zipfile
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
headers = {
'Pragma': 'no-cache',
'Origin': 'http://www.transtats.bts.gov',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-US,en;q=0.8',
'Upgrade-Insecure-Requests': '1',
'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'),
'Content-Type': 'application/x-www-form-urlencoded',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Cache-Control': 'no-cache',
'Referer': 'http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time',
'Connection': 'keep-alive',
'DNT': '1',
}
# Sorry about the long url
data = 'UserTableName=On_Time_Performance&DBShortName=On_Time&RawDataTable=T_ONTIME&sqlstr=+SELECT+FL_DATE%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CTAIL_NUM%2CFL_NUM%2CORIGIN_AIRPORT_ID%2CORIGIN_AIRPORT_SEQ_ID%2CORIGIN_CITY_MARKET_ID%2CORIGIN%2CORIGIN_CITY_NAME%2CORIGIN_STATE_NM%2CDEST_AIRPORT_ID%2CDEST_AIRPORT_SEQ_ID%2CDEST_CITY_MARKET_ID%2CDEST%2CDEST_CITY_NAME%2CDEST_STATE_NM%2CCRS_DEP_TIME%2CDEP_TIME%2CDEP_DELAY%2CTAXI_OUT%2CWHEELS_OFF%2CWHEELS_ON%2CTAXI_IN%2CCRS_ARR_TIME%2CARR_TIME%2CARR_DELAY%2CCANCELLED%2CCANCELLATION_CODE%2CDIVERTED%2CDISTANCE%2CCARRIER_DELAY%2CWEATHER_DELAY%2CNAS_DELAY%2CSECURITY_DELAY%2CLATE_AIRCRAFT_DELAY+FROM++T_ONTIME+WHERE+Month+%3D1+AND+YEAR%3D2014&varlist=FL_DATE%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CTAIL_NUM%2CFL_NUM%2CORIGIN_AIRPORT_ID%2CORIGIN_AIRPORT_SEQ_ID%2CORIGIN_CITY_MARKET_ID%2CORIGIN%2CORIGIN_CITY_NAME%2CORIGIN_STATE_NM%2CDEST_AIRPORT_ID%2CDEST_AIRPORT_SEQ_ID%2CDEST_CITY_MARKET_ID%2CDEST%2CDEST_CITY_NAME%2CDEST_STATE_NM%2CCRS_DEP_TIME%2CDEP_TIME%2CDEP_DELAY%2CTAXI_OUT%2CWHEELS_OFF%2CWHEELS_ON%2CTAXI_IN%2CCRS_ARR_TIME%2CARR_TIME%2CARR_DELAY%2CCANCELLED%2CCANCELLATION_CODE%2CDIVERTED%2CDISTANCE%2CCARRIER_DELAY%2CWEATHER_DELAY%2CNAS_DELAY%2CSECURITY_DELAY%2CLATE_AIRCRAFT_DELAY&grouplist=&suml=&sumRegion=&filter1=title%3D&filter2=title%3D&geo=All%A0&time=January&timename=Month&GEOGRAPHY=All&XYEAR=2014&FREQUENCY=1&VarDesc=Year&VarType=Num&VarDesc=Quarter&VarType=Num&VarDesc=Month&VarType=Num&VarDesc=DayofMonth&VarType=Num&VarDesc=DayOfWeek&VarType=Num&VarName=FL_DATE&VarDesc=FlightDate&VarType=Char&VarName=UNIQUE_CARRIER&VarDesc=UniqueCarrier&VarType=Char&VarName=AIRLINE_ID&VarDesc=AirlineID&VarType=Num&VarDesc=Carrier&VarType=Char&VarName=TAIL_NUM&VarDesc=TailNum&VarType=Char&VarName=FL_NUM&VarDesc=FlightNum&VarType=Char&VarName=ORIGIN_AIRPORT_ID&VarDesc=OriginAirportID&VarType=Num&VarName=ORIGIN_AIRPORT_SEQ_ID&VarDesc=OriginAirportSeqID&VarType=Num&VarName=ORIGIN_CITY_MARKET_ID&VarDesc=OriginCityMarketID&VarType=Num&VarName=ORIGIN&VarDesc=Origin&VarType=Char&VarName=ORIGIN_CITY_NAME&VarDesc=OriginCityName&VarType=Char&VarDesc=OriginState&VarType=Char&VarDesc=OriginStateFips&VarType=Char&VarName=ORIGIN_STATE_NM&VarDesc=OriginStateName&VarType=Char&VarDesc=OriginWac&VarType=Num&VarName=DEST_AIRPORT_ID&VarDesc=DestAirportID&VarType=Num&VarName=DEST_AIRPORT_SEQ_ID&VarDesc=DestAirportSeqID&VarType=Num&VarName=DEST_CITY_MARKET_ID&VarDesc=DestCityMarketID&VarType=Num&VarName=DEST&VarDesc=Dest&VarType=Char&VarName=DEST_CITY_NAME&VarDesc=DestCityName&VarType=Char&VarDesc=DestState&VarType=Char&VarDesc=DestStateFips&VarType=Char&VarName=DEST_STATE_NM&VarDesc=DestStateName&VarType=Char&VarDesc=DestWac&VarType=Num&VarName=CRS_DEP_TIME&VarDesc=CRSDepTime&VarType=Char&VarName=DEP_TIME&VarDesc=DepTime&VarType=Char&VarName=DEP_DELAY&VarDesc=DepDelay&VarType=Num&VarDesc=DepDelayMinutes&VarType=Num&VarDesc=DepDel15&VarType=Num&VarDesc=DepartureDelayGroups&VarType=Num&VarDesc=DepTimeBlk&VarType=Char&VarName=TAXI_OUT&VarDesc=TaxiOut&VarType=Num&VarName=WHEELS_OFF&VarDesc=WheelsOff&VarType=Char&VarName=WHEELS_ON&VarDesc=WheelsOn&VarType=Char&VarName=TAXI_IN&VarDesc=TaxiIn&VarType=Num&VarName=CRS_ARR_TIME&VarDesc=CRSArrTime&VarType=Char&VarName=ARR_TIME&VarDesc=ArrTime&VarType=Char&VarName=ARR_DELAY&VarDesc=ArrDelay&VarType=Num&VarDesc=ArrDelayMinutes&VarType=Num&VarDesc=ArrDel15&VarType=Num&VarDesc=ArrivalDelayGroups&VarType=Num&VarDesc=ArrTimeBlk&VarType=Char&VarName=CANCELLED&VarDesc=Cancelled&VarType=Num&VarName=CANCELLATION_CODE&VarDesc=CancellationCode&VarType=Char&VarName=DIVERTED&VarDesc=Diverted&VarType=Num&VarDesc=CRSElapsedTime&VarType=Num&VarDesc=ActualElapsedTime&VarType=Num&VarDesc=AirTime&VarType=Num&VarDesc=Flights&VarType=Num&VarName=DISTANCE&VarDesc=Distance&VarType=Num&VarDesc=DistanceGroup&VarType=Num&VarName=CARRIER_DELAY&VarDesc=CarrierDelay&VarType=Num&VarName=WEATHER_DELAY&VarDesc=WeatherDelay&VarType=Num&VarName=NAS_DELAY&VarDesc=NASDelay&VarType=Num&VarName=SECURITY_DELAY&VarDesc=SecurityDelay&VarType=Num&VarName=LATE_AIRCRAFT_DELAY&VarDesc=LateAircraftDelay&VarType=Num&VarDesc=FirstDepTime&VarType=Char&VarDesc=TotalAddGTime&VarType=Num&VarDesc=LongestAddGTime&VarType=Num&VarDesc=DivAirportLandings&VarType=Num&VarDesc=DivReachedDest&VarType=Num&VarDesc=DivActualElapsedTime&VarType=Num&VarDesc=DivArrDelay&VarType=Num&VarDesc=DivDistance&VarType=Num&VarDesc=Div1Airport&VarType=Char&VarDesc=Div1AirportID&VarType=Num&VarDesc=Div1AirportSeqID&VarType=Num&VarDesc=Div1WheelsOn&VarType=Char&VarDesc=Div1TotalGTime&VarType=Num&VarDesc=Div1LongestGTime&VarType=Num&VarDesc=Div1WheelsOff&VarType=Char&VarDesc=Div1TailNum&VarType=Char&VarDesc=Div2Airport&VarType=Char&VarDesc=Div2AirportID&VarType=Num&VarDesc=Div2AirportSeqID&VarType=Num&VarDesc=Div2WheelsOn&VarType=Char&VarDesc=Div2TotalGTime&VarType=Num&VarDesc=Div2LongestGTime&VarType=Num&VarDesc=Div2WheelsOff&VarType=Char&VarDesc=Div2TailNum&VarType=Char&VarDesc=Div3Airport&VarType=Char&VarDesc=Div3AirportID&VarType=Num&VarDesc=Div3AirportSeqID&VarType=Num&VarDesc=Div3WheelsOn&VarType=Char&VarDesc=Div3TotalGTime&VarType=Num&VarDesc=Div3LongestGTime&VarType=Num&VarDesc=Div3WheelsOff&VarType=Char&VarDesc=Div3TailNum&VarType=Char&VarDesc=Div4Airport&VarType=Char&VarDesc=Div4AirportID&VarType=Num&VarDesc=Div4AirportSeqID&VarType=Num&VarDesc=Div4WheelsOn&VarType=Char&VarDesc=Div4TotalGTime&VarType=Num&VarDesc=Div4LongestGTime&VarType=Num&VarDesc=Div4WheelsOff&VarType=Char&VarDesc=Div4TailNum&VarType=Char&VarDesc=Div5Airport&VarType=Char&VarDesc=Div5AirportID&VarType=Num&VarDesc=Div5AirportSeqID&VarType=Num&VarDesc=Div5WheelsOn&VarType=Char&VarDesc=Div5TotalGTime&VarType=Num&VarDesc=Div5LongestGTime&VarType=Num&VarDesc=Div5WheelsOff&VarType=Char&VarDesc=Div5TailNum&VarType=Char'
r = requests.post('http://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=236&Has_Group=3&Is_Zipped=0',
headers=headers, data=data, stream=True)
with open("flights.csv", 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
zf = zipfile.ZipFile("flights.csv")
filename = zf.filelist[0].filename
fp = zf.extract(filename)
df = pd.read_csv(fp, parse_dates="FL_DATE").rename(columns=str.lower)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment