Created
November 13, 2017 20:34
-
-
Save andypowe11/a8f8bd8d9c274d3735835fb9196a6922 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import utilities as ut | |
import transform_records as tr | |
import time as tm | |
# Import BANES car parking data | |
rawData = pd.read_csv('../../../data/BANES_Historic_Car_Park_Occupancy.csv') | |
# Transform the data so that it is in a format suitable for import into AWS | |
# Clean up raw records | |
# Select columns containing useful information only | |
# Remove any records with NA entries | |
# Remove records for "test car park" | |
# Remove records with negative occupancies | |
# Calculate Proportion column (Occupancy/Capacity) | |
# Remove records with Proportion greater than \code{max_prop} | |
# Remove duplicate records (see \code{first_upload}) | |
# This is what raw data looks like | |
# {Name}{The name of the car park where the record was taken.} | |
# {LastUpdate}{The time the record was taken (POSIXct date-time object).} | |
# {DateUploaded}{The time the record was uploaded to the Bath: Hacked | |
# database (POSIXct date-time object).} | |
# {Occupancy}{The total number of cars in the car park.} | |
# {Capacity}{The number of parking spaces in the car park.} | |
# {Status}{Description of the change in occupancy since the previous | |
# record from that car park.} | |
# {Proportion}{Calculated as (Occupancy/Capacity).} | |
refinedDF = pd.DataFrame(rawData, columns = ['LastUpdate', 'Name', | |
'Occupancy', 'Capacity']) | |
# Remove any records with NA entries | |
refinedDF = refinedDF.dropna() | |
# Remove records for test car park | |
refinedDF = refinedDF[refinedDF['Name'] != 'test car park'] | |
# Remove records with negative occupancies | |
refinedDF = refinedDF[refinedDF['Occupancy'] >0] | |
# Calculate Proportion column | |
refinedDF = refinedDF.assign( | |
Proportion = refinedDF['Occupancy']/refinedDF['Capacity'] | |
) | |
# Remove records with overly-full occupancies | |
# 1.1, or 110% full, to allow for circulating cars). | |
max_prop = 1.1 | |
refinedDF = refinedDF[refinedDF['Proportion'] <max_prop] | |
# Calculate Spaces column | |
refinedDF = refinedDF.assign( | |
Spaces = refinedDF['Capacity']-refinedDF['Occupancy'] | |
) | |
# Convert date strings to date objects (API provides date-time | |
# objects already) | |
refinedDF['LastUpdate'] = pd.to_datetime(refinedDF['LastUpdate'], format = '%d/%m/%Y %H:%M:%S %p') | |
# Extract Date and Time from LastUpdate | |
refinedDF['Date'] = refinedDF['LastUpdate'].dt.date | |
refinedDF['Time'] = refinedDF['LastUpdate'].dt.time | |
refinedDF['Month'] = refinedDF['LastUpdate'].dt.month | |
refinedDF['Hour'] = refinedDF['LastUpdate'].dt.hour | |
refinedDF['Minute'] = refinedDF['LastUpdate'].dt.minute | |
refinedDF.loc[refinedDF['Minute'].lt(15), 'MinuteSlot'] = 0.0 | |
refinedDF.loc[refinedDF['Minute'].gt(14) & refinedDF['Minute'].lt(30), 'MinuteSlot'] = 0.25 | |
refinedDF.loc[refinedDF['Minute'].gt(29) & refinedDF['Minute'].lt(45), 'MinuteSlot'] = 0.5 | |
refinedDF.loc[refinedDF['Minute'].gt(44), 'MinuteSlot'] = 0.75 | |
# Calculate TimeSlot column | |
refinedDF = refinedDF.assign( | |
TimeSlot = refinedDF['Hour']+refinedDF['MinuteSlot'] | |
) | |
#refinedDF['WeekDay'] = refinedDF['LastUpdate'].dt.weekday | |
# Weekday? | |
refinedDF = refinedDF.assign( | |
IsWeekday = refinedDF['LastUpdate'].dt.weekday < 5 | |
) | |
# Convert date strings to date objects (API provides date-time | |
refinedDF = refinedDF.assign( | |
Spaces = refinedDF['Capacity']-refinedDF['Occupancy'] | |
) | |
# Convert date strings to date objects (API provides date-time | |
# Get rid of various columns | |
refinedDF.drop('LastUpdate', axis=1, inplace=True) | |
refinedDF.drop('Proportion', axis=1, inplace=True) | |
refinedDF.drop('Occupancy', axis=1, inplace=True) | |
refinedDF.drop('Capacity', axis=1, inplace=True) | |
refinedDF.drop('Hour', axis=1, inplace=True) | |
refinedDF.drop('Minute', axis=1, inplace=True) | |
refinedDF.drop('MinuteSlot', axis=1, inplace=True) | |
refinedDF.drop('Date', axis=1, inplace=True) | |
refinedDF.drop('Time', axis=1, inplace=True) | |
# Save our work... | |
refinedDF.to_csv('clean_data.csv', index=False) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment