Skip to content

Instantly share code, notes, and snippets.

@andypowe11
Created November 13, 2017 20:34
Show Gist options
  • Save andypowe11/a8f8bd8d9c274d3735835fb9196a6922 to your computer and use it in GitHub Desktop.
Save andypowe11/a8f8bd8d9c274d3735835fb9196a6922 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import utilities as ut
import transform_records as tr
import time as tm
# Import BANES car parking data
rawData = pd.read_csv('../../../data/BANES_Historic_Car_Park_Occupancy.csv')
# Transform the data so that it is in a format suitable for import into AWS
# Clean up raw records
# Select columns containing useful information only
# Remove any records with NA entries
# Remove records for "test car park"
# Remove records with negative occupancies
# Calculate Proportion column (Occupancy/Capacity)
# Remove records with Proportion greater than \code{max_prop}
# Remove duplicate records (see \code{first_upload})
# This is what raw data looks like
# {Name}{The name of the car park where the record was taken.}
# {LastUpdate}{The time the record was taken (POSIXct date-time object).}
# {DateUploaded}{The time the record was uploaded to the Bath: Hacked
# database (POSIXct date-time object).}
# {Occupancy}{The total number of cars in the car park.}
# {Capacity}{The number of parking spaces in the car park.}
# {Status}{Description of the change in occupancy since the previous
# record from that car park.}
# {Proportion}{Calculated as (Occupancy/Capacity).}
refinedDF = pd.DataFrame(rawData, columns = ['LastUpdate', 'Name',
'Occupancy', 'Capacity'])
# Remove any records with NA entries
refinedDF = refinedDF.dropna()
# Remove records for test car park
refinedDF = refinedDF[refinedDF['Name'] != 'test car park']
# Remove records with negative occupancies
refinedDF = refinedDF[refinedDF['Occupancy'] >0]
# Calculate Proportion column
refinedDF = refinedDF.assign(
Proportion = refinedDF['Occupancy']/refinedDF['Capacity']
)
# Remove records with overly-full occupancies
# 1.1, or 110% full, to allow for circulating cars).
max_prop = 1.1
refinedDF = refinedDF[refinedDF['Proportion'] <max_prop]
# Calculate Spaces column
refinedDF = refinedDF.assign(
Spaces = refinedDF['Capacity']-refinedDF['Occupancy']
)
# Convert date strings to date objects (API provides date-time
# objects already)
refinedDF['LastUpdate'] = pd.to_datetime(refinedDF['LastUpdate'], format = '%d/%m/%Y %H:%M:%S %p')
# Extract Date and Time from LastUpdate
refinedDF['Date'] = refinedDF['LastUpdate'].dt.date
refinedDF['Time'] = refinedDF['LastUpdate'].dt.time
refinedDF['Month'] = refinedDF['LastUpdate'].dt.month
refinedDF['Hour'] = refinedDF['LastUpdate'].dt.hour
refinedDF['Minute'] = refinedDF['LastUpdate'].dt.minute
refinedDF.loc[refinedDF['Minute'].lt(15), 'MinuteSlot'] = 0.0
refinedDF.loc[refinedDF['Minute'].gt(14) & refinedDF['Minute'].lt(30), 'MinuteSlot'] = 0.25
refinedDF.loc[refinedDF['Minute'].gt(29) & refinedDF['Minute'].lt(45), 'MinuteSlot'] = 0.5
refinedDF.loc[refinedDF['Minute'].gt(44), 'MinuteSlot'] = 0.75
# Calculate TimeSlot column
refinedDF = refinedDF.assign(
TimeSlot = refinedDF['Hour']+refinedDF['MinuteSlot']
)
#refinedDF['WeekDay'] = refinedDF['LastUpdate'].dt.weekday
# Weekday?
refinedDF = refinedDF.assign(
IsWeekday = refinedDF['LastUpdate'].dt.weekday < 5
)
# Convert date strings to date objects (API provides date-time
refinedDF = refinedDF.assign(
Spaces = refinedDF['Capacity']-refinedDF['Occupancy']
)
# Convert date strings to date objects (API provides date-time
# Get rid of various columns
refinedDF.drop('LastUpdate', axis=1, inplace=True)
refinedDF.drop('Proportion', axis=1, inplace=True)
refinedDF.drop('Occupancy', axis=1, inplace=True)
refinedDF.drop('Capacity', axis=1, inplace=True)
refinedDF.drop('Hour', axis=1, inplace=True)
refinedDF.drop('Minute', axis=1, inplace=True)
refinedDF.drop('MinuteSlot', axis=1, inplace=True)
refinedDF.drop('Date', axis=1, inplace=True)
refinedDF.drop('Time', axis=1, inplace=True)
# Save our work...
refinedDF.to_csv('clean_data.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment