Last active
September 1, 2016 13:15
-
-
Save idontgetoutmuch/51c45bb8839afcfe0351389f61ab2c9a to your computer and use it in GitHub Desktop.
An experiment with data analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedLabels, TypeOperators, DataKinds, FlexibleContexts #-} | |
{-# LANGUAGE OverloadedStrings #-} | |
import Labels | |
import Labels.Cassava | |
import Data.Vector ( Vector ) | |
import Data.Text ( Text ) | |
import Data.Text.Lazy.Encoding | |
import qualified Data.Text.Lazy as LT | |
import Data.Csv hiding ( decodeByName ) | |
import Data.Csv.Streaming | |
import qualified Data.ByteString.Lazy as LB | |
import qualified Data.Vector as V | |
import Data.Map ( Map ) | |
-------------------------------------------------------------------------------- | |
-- Basic | |
reformat :: | |
(Has "text" [Char] r, Has "style" [Char] r, Has "indent" Bool r) => | |
r -> [Char] | |
reformat kw = "Style: " ++ get #style kw ++ "\n" ++ (if get #indent kw then " " else "") ++ get #text kw | |
demo :: ("foo" := (), "bar" := Char) | |
demo = (#foo := (), #bar := 'a') | |
-------------------------------------------------------------------------------- | |
-- Consing | |
consed :: ("mu" := Char, "foo" := ()) | |
consed = cons (#mu := 'x') (#foo := ()) | |
consed' :: ("mu" := Char, "foo" := (), "bar" := Char) | |
consed' = cons (#mu := 'x') (#foo := (), #bar := 'a') | |
fi = cons (#mu := 123) (#foo := "hi") | |
bob = get #mu (set #mu 5 (cons (#blah := [1,2,3]) (cons (#mu := 123) (#foo := "hi")))) | |
-------------------------------------------------------------------------------- | |
-- Abstraction | |
increment field record = set field (get field record + 1) record | |
foo = increment #bar (#bar := 123) | |
main = do | |
foo <- fmap decodeByName (LB.readFile "AAPL.csv") | |
case foo of | |
Right (header :: Vector Name, | |
rows :: Records ( "Date" := String | |
, "High" := Maybe Double | |
, "Low" := Maybe Double | |
) | |
) -> do | |
putStrLn $ show header | |
putStrLn $ show rows | |
Left err -> putStrLn err | |
main2 = do | |
foo <- fmap decodeByName (LB.readFile "examples/787338586_T_ONTIME.csv") | |
case foo of | |
Left err -> putStrLn err | |
-- Right (header :: Vector Name, rows :: Records (Map String String)) -> do | |
-- putStrLn $ show header | |
-- let header' = V.map (LT.toLower . decodeUtf8 . LB.fromStrict) header | |
-- putStrLn $ LT.unpack $ LT.concat $ V.toList $ | |
-- V.map (LT.cons '(') $ V.map (LT.cons '"') $ | |
-- V.zipWith LT.append | |
-- (V.map (\ x -> LT.snoc x '"') header') | |
-- (V.replicate (V.length header') (LT.pack " := String), ")) | |
Right (header :: Vector Name, | |
rows :: Records (("FL_DATE" := String), | |
-- ("unique_carrier" := String), | |
-- ("airline_id" := String), | |
-- ("tail_num" := String), | |
-- ("fl_num" := String), | |
-- ("origin_airport_id" := String), | |
-- ("origin_airport_seq_id" := String), | |
-- ("origin_city_market_id" := String), | |
-- ("origin" := String), | |
-- ("origin_city_name" := String), | |
-- ("origin_state_nm" := String), | |
-- ("dest_airport_id" := String), | |
-- ("dest_airport_seq_id" := String), | |
-- ("dest_city_market_id" := String), | |
-- ("dest" := String), | |
-- ("dest_city_name" := String), | |
-- ("dest_state_nm" := String), | |
-- ("crs_dep_time" := String), | |
("DEP_TIME" := String), | |
-- ("dep_delay" := String), | |
-- ("taxi_out" := String), | |
-- ("wheels_off" := String), | |
-- ("wheels_on" := String), | |
-- ("taxi_in" := String), | |
-- ("crs_arr_time" := String), | |
-- ("arr_time" := String), | |
-- ("arr_delay" := String), | |
-- ("cancelled" := String), | |
-- ("cancellation_code" := String), | |
-- ("diverted" := String), | |
-- ("distance" := String), | |
-- ("carrier_delay" := String), | |
-- ("weather_delay" := String), | |
-- ("nas_delay" := String), | |
-- ("security_delay" := String), | |
("LATE_AIRCRAFT_DELAY" := String))) -> | |
do let (Cons x _) = rows | |
putStrLn $ show x |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import zipfile | |
import requests | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
headers = { | |
'Pragma': 'no-cache', | |
'Origin': 'http://www.transtats.bts.gov', | |
'Accept-Encoding': 'gzip, deflate', | |
'Accept-Language': 'en-US,en;q=0.8', | |
'Upgrade-Insecure-Requests': '1', | |
'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) ' | |
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'), | |
'Content-Type': 'application/x-www-form-urlencoded', | |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', | |
'Cache-Control': 'no-cache', | |
'Referer': 'http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time', | |
'Connection': 'keep-alive', | |
'DNT': '1', | |
} | |
# Sorry about the long url | |
data = 'UserTableName=On_Time_Performance&DBShortName=On_Time&RawDataTable=T_ONTIME&sqlstr=+SELECT+FL_DATE%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CTAIL_NUM%2CFL_NUM%2CORIGIN_AIRPORT_ID%2CORIGIN_AIRPORT_SEQ_ID%2CORIGIN_CITY_MARKET_ID%2CORIGIN%2CORIGIN_CITY_NAME%2CORIGIN_STATE_NM%2CDEST_AIRPORT_ID%2CDEST_AIRPORT_SEQ_ID%2CDEST_CITY_MARKET_ID%2CDEST%2CDEST_CITY_NAME%2CDEST_STATE_NM%2CCRS_DEP_TIME%2CDEP_TIME%2CDEP_DELAY%2CTAXI_OUT%2CWHEELS_OFF%2CWHEELS_ON%2CTAXI_IN%2CCRS_ARR_TIME%2CARR_TIME%2CARR_DELAY%2CCANCELLED%2CCANCELLATION_CODE%2CDIVERTED%2CDISTANCE%2CCARRIER_DELAY%2CWEATHER_DELAY%2CNAS_DELAY%2CSECURITY_DELAY%2CLATE_AIRCRAFT_DELAY+FROM++T_ONTIME+WHERE+Month+%3D1+AND+YEAR%3D2014&varlist=FL_DATE%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CTAIL_NUM%2CFL_NUM%2CORIGIN_AIRPORT_ID%2CORIGIN_AIRPORT_SEQ_ID%2CORIGIN_CITY_MARKET_ID%2CORIGIN%2CORIGIN_CITY_NAME%2CORIGIN_STATE_NM%2CDEST_AIRPORT_ID%2CDEST_AIRPORT_SEQ_ID%2CDEST_CITY_MARKET_ID%2CDEST%2CDEST_CITY_NAME%2CDEST_STATE_NM%2CCRS_DEP_TIME%2CDEP_TIME%2CDEP_DELAY%2CTAXI_OUT%2CWHEELS_OFF%2CWHEELS_ON%2CTAXI_IN%2CCRS_ARR_TIME%2CARR_TIME%2CARR_DELAY%2CCANCELLED%2CCANCELLATION_CODE%2CDIVERTED%2CDISTANCE%2CCARRIER_DELAY%2CWEATHER_DELAY%2CNAS_DELAY%2CSECURITY_DELAY%2CLATE_AIRCRAFT_DELAY&grouplist=&suml=&sumRegion=&filter1=title%3D&filter2=title%3D&geo=All%A0&time=January&timename=Month&GEOGRAPHY=All&XYEAR=2014&FREQUENCY=1&VarDesc=Year&VarType=Num&VarDesc=Quarter&VarType=Num&VarDesc=Month&VarType=Num&VarDesc=DayofMonth&VarType=Num&VarDesc=DayOfWeek&VarType=Num&VarName=FL_DATE&VarDesc=FlightDate&VarType=Char&VarName=UNIQUE_CARRIER&VarDesc=UniqueCarrier&VarType=Char&VarName=AIRLINE_ID&VarDesc=AirlineID&VarType=Num&VarDesc=Carrier&VarType=Char&VarName=TAIL_NUM&VarDesc=TailNum&VarType=Char&VarName=FL_NUM&VarDesc=FlightNum&VarType=Char&VarName=ORIGIN_AIRPORT_ID&VarDesc=OriginAirportID&VarType=Num&VarName=ORIGIN_AIRPORT_SEQ_ID&VarDesc=OriginAirportSeqID&VarType=Num&VarName=ORIGIN_CITY_MARKET_ID&VarDesc=OriginCityMarketID&VarType=Num&VarName=ORIGIN&VarDesc=Origin&VarType=Char&VarName=ORIGIN_CITY_NAME&VarDesc=OriginCityName&VarType=Char&VarDesc=OriginState&VarType=Char&VarDesc=OriginStateFips&VarType=Char&VarName=ORIGIN_STATE_NM&VarDesc=OriginStateName&VarType=Char&VarDesc=OriginWac&VarType=Num&VarName=DEST_AIRPORT_ID&VarDesc=DestAirportID&VarType=Num&VarName=DEST_AIRPORT_SEQ_ID&VarDesc=DestAirportSeqID&VarType=Num&VarName=DEST_CITY_MARKET_ID&VarDesc=DestCityMarketID&VarType=Num&VarName=DEST&VarDesc=Dest&VarType=Char&VarName=DEST_CITY_NAME&VarDesc=DestCityName&VarType=Char&VarDesc=DestState&VarType=Char&VarDesc=DestStateFips&VarType=Char&VarName=DEST_STATE_NM&VarDesc=DestStateName&VarType=Char&VarDesc=DestWac&VarType=Num&VarName=CRS_DEP_TIME&VarDesc=CRSDepTime&VarType=Char&VarName=DEP_TIME&VarDesc=DepTime&VarType=Char&VarName=DEP_DELAY&VarDesc=DepDelay&VarType=Num&VarDesc=DepDelayMinutes&VarType=Num&VarDesc=DepDel15&VarType=Num&VarDesc=DepartureDelayGroups&VarType=Num&VarDesc=DepTimeBlk&VarType=Char&VarName=TAXI_OUT&VarDesc=TaxiOut&VarType=Num&VarName=WHEELS_OFF&VarDesc=WheelsOff&VarType=Char&VarName=WHEELS_ON&VarDesc=WheelsOn&VarType=Char&VarName=TAXI_IN&VarDesc=TaxiIn&VarType=Num&VarName=CRS_ARR_TIME&VarDesc=CRSArrTime&VarType=Char&VarName=ARR_TIME&VarDesc=ArrTime&VarType=Char&VarName=ARR_DELAY&VarDesc=ArrDelay&VarType=Num&VarDesc=ArrDelayMinutes&VarType=Num&VarDesc=ArrDel15&VarType=Num&VarDesc=ArrivalDelayGroups&VarType=Num&VarDesc=ArrTimeBlk&VarType=Char&VarName=CANCELLED&VarDesc=Cancelled&VarType=Num&VarName=CANCELLATION_CODE&VarDesc=CancellationCode&VarType=Char&VarName=DIVERTED&VarDesc=Diverted&VarType=Num&VarDesc=CRSElapsedTime&VarType=Num&VarDesc=ActualElapsedTime&VarType=Num&VarDesc=AirTime&VarType=Num&VarDesc=Flights&VarType=Num&VarName=DISTANCE&VarDesc=Distance&VarType=Num&VarDesc=DistanceGroup&VarType=Num&VarName=CARRIER_DELAY&VarDesc=CarrierDelay&VarType=Num&VarName=WEATHER_DELAY&VarDesc=WeatherDelay&VarType=Num&VarName=NAS_DELAY&VarDesc=NASDelay&VarType=Num&VarName=SECURITY_DELAY&VarDesc=SecurityDelay&VarType=Num&VarName=LATE_AIRCRAFT_DELAY&VarDesc=LateAircraftDelay&VarType=Num&VarDesc=FirstDepTime&VarType=Char&VarDesc=TotalAddGTime&VarType=Num&VarDesc=LongestAddGTime&VarType=Num&VarDesc=DivAirportLandings&VarType=Num&VarDesc=DivReachedDest&VarType=Num&VarDesc=DivActualElapsedTime&VarType=Num&VarDesc=DivArrDelay&VarType=Num&VarDesc=DivDistance&VarType=Num&VarDesc=Div1Airport&VarType=Char&VarDesc=Div1AirportID&VarType=Num&VarDesc=Div1AirportSeqID&VarType=Num&VarDesc=Div1WheelsOn&VarType=Char&VarDesc=Div1TotalGTime&VarType=Num&VarDesc=Div1LongestGTime&VarType=Num&VarDesc=Div1WheelsOff&VarType=Char&VarDesc=Div1TailNum&VarType=Char&VarDesc=Div2Airport&VarType=Char&VarDesc=Div2AirportID&VarType=Num&VarDesc=Div2AirportSeqID&VarType=Num&VarDesc=Div2WheelsOn&VarType=Char&VarDesc=Div2TotalGTime&VarType=Num&VarDesc=Div2LongestGTime&VarType=Num&VarDesc=Div2WheelsOff&VarType=Char&VarDesc=Div2TailNum&VarType=Char&VarDesc=Div3Airport&VarType=Char&VarDesc=Div3AirportID&VarType=Num&VarDesc=Div3AirportSeqID&VarType=Num&VarDesc=Div3WheelsOn&VarType=Char&VarDesc=Div3TotalGTime&VarType=Num&VarDesc=Div3LongestGTime&VarType=Num&VarDesc=Div3WheelsOff&VarType=Char&VarDesc=Div3TailNum&VarType=Char&VarDesc=Div4Airport&VarType=Char&VarDesc=Div4AirportID&VarType=Num&VarDesc=Div4AirportSeqID&VarType=Num&VarDesc=Div4WheelsOn&VarType=Char&VarDesc=Div4TotalGTime&VarType=Num&VarDesc=Div4LongestGTime&VarType=Num&VarDesc=Div4WheelsOff&VarType=Char&VarDesc=Div4TailNum&VarType=Char&VarDesc=Div5Airport&VarType=Char&VarDesc=Div5AirportID&VarType=Num&VarDesc=Div5AirportSeqID&VarType=Num&VarDesc=Div5WheelsOn&VarType=Char&VarDesc=Div5TotalGTime&VarType=Num&VarDesc=Div5LongestGTime&VarType=Num&VarDesc=Div5WheelsOff&VarType=Char&VarDesc=Div5TailNum&VarType=Char' | |
r = requests.post('http://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=236&Has_Group=3&Is_Zipped=0', | |
headers=headers, data=data, stream=True) | |
with open("flights.csv", 'wb') as f: | |
for chunk in r.iter_content(chunk_size=1024): | |
if chunk: | |
f.write(chunk) | |
zf = zipfile.ZipFile("flights.csv") | |
filename = zf.filelist[0].filename | |
fp = zf.extract(filename) | |
df = pd.read_csv(fp, parse_dates="FL_DATE").rename(columns=str.lower) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment