This gist can be used to create test files for the `bikedata` package. These
are ultimately stored in an `.rda` structure, but this removes a lot of the
original text-based idiosyncrasies of the raw `.csv` files. This script is thus
necessary to convert the `read.csv()` results back into the precise forms of
the individual `.csv` files for each city.
The first step is to create sample test files for each city and compress them
in `.zip` archives. These can then be unzipped and read into R with these lines:
# Sample archives, in extraction order, mapped to the `junkpaths` value passed
# to unzip(). Only the Divvy archive is extracted with its internal directory
# structure discarded; all others use junkpaths = FALSE (the unzip() default).
junk_paths <- c (
    "./tests/sample-cabi-dc-trips-history-data.zip" = FALSE,
    "./tests/sample-citibike-tripdata.zip" = FALSE,
    "./tests/sample-divvy-trips.zip" = TRUE,
    "./tests/sample-hubway-tripdata.zip" = FALSE,
    "./tests/sample-JourneyDataExtract-london.csv.zip" = FALSE,
    "./tests/sample-la-metro.zip" = FALSE
)
for (zip_file in names (junk_paths))
    unzip (zip_file, junkpaths = junk_paths [[zip_file]])

# Read one extracted .csv file (from the working directory) as a data frame,
# treating the first line as the header.
read_sample <- function (csv_file) read.csv (csv_file, header = TRUE)

dc <- read_sample ("2017-Q1-Trips-History-Data.csv")
lo <- read_sample ("01aJourneyDataExtract10Jan16-23Jan16.csv")
bo <- read_sample ("201604-hubway-tripdata.csv")
ny <- read_sample ("201612-citibike-tripdata.csv")
ch_st <- read_sample ("Divvy_Stations.csv")
ch_tr <- read_sample ("Divvy_Trips_sample.csv")
la <- read_sample ("la_metro_gbfs_trips_Q1_2017.csv")
Then each of these needs to be tidied to reflect the original text format:
# Coerce a column to character and strip leading/trailing whitespace.
# gsub is used rather than trimws (as.character (x), which = 'both') because
# trimws is only R >= 3.2.0, so gsub keeps the package more general.
as_trimmed_char <- function (x) gsub ("^\\s+|\\s+$", "", as.character (x))

# --- ny: replace the column names produced by read.csv()
names (ny) <- c (
    "Trip Duration", "Start Time", "Stop Time",
    "Start Station ID", "Start Station Name",
    "Start Station Latitude", "Start Station Longitude",
    "End Station ID", "End Station Name",
    "End Station Latitude", "End Station Longitude",
    "Bike ID", "User Type", "Birth Year", "Gender"
)

# --- dc: replace the column names produced by read.csv()
names (dc) <- c (
    "Duration", "Start date", "End date",
    "Start station number", "Start station",
    "End station number", "End station",
    "Bike number", "Member Type"
)

# --- bo: replace the column names, then convert fields to character
names (bo) <- c (
    "tripduration", "starttime", "stoptime",
    "start station id", "start station name",
    "start station latitude", "start station longitude",
    "end station id", "end station name",
    "end station latitude", "end station longitude",
    "bikeid", "usertype", "birth year", "gender"
)
# bo data are quoted, so non-char fields need to be converted to char
bo_char_cols <- c (
    "tripduration", "start station id", "start station latitude",
    "start station longitude", "end station id", "end station longitude",
    "end station latitude", "bikeid", "birth year", "gender"
)
for (col in intersect (names (bo), bo_char_cols))
    bo [[col]] <- as_trimmed_char (bo [[col]])

# --- ch_tr: same character conversion as for bo
ch_tr_char_cols <- c (
    "trip_id", "bikeid", "tripduration", "from_station_id",
    "to_station_id", "birthyear"
)
for (col in intersect (names (ch_tr), ch_tr_char_cols))
    ch_tr [[col]] <- as_trimmed_char (ch_tr [[col]])
# Missing birth years become empty strings rather than NA
no_birthyear <- is.na (ch_tr$birthyear)
ch_tr$birthyear [no_birthyear] <- ""

# --- lo: replace the column names, then wrap station names in double quotes
names (lo) <- c (
    "Rental Id", "Duration", "Bike Id", "End Date",
    "EndStation Id", "EndStation Name", "Start Date",
    "StartStation Id", "StartStation Name"
)
lo_quoted_cols <- c ("EndStation Name", "StartStation Name")
for (col in intersect (names (lo), lo_quoted_cols))
    lo [[col]] <- paste0 ('\"', lo [[col]], '\"')
The resultant structures can then be used to create the `bike-test-data.rda`
file:
# Bundle the per-city data frames into a single named list and save it to
# ./data/bike-test-data.rda.
bike_dat <- list (dc = dc, lo = lo, bo = bo, ny = ny,
                  ch_st = ch_st, ch_tr = ch_tr, la = la)
save (bike_dat, file = './data/bike-test-data.rda')
#rm (list = ls())
# Remove all .csv files from the working directory. unlink() + Sys.glob() is
# used instead of a shell call to `rm`, which is not available on every
# platform (e.g. Windows); the "*.csv" wildcard matches the same files.
unlink (Sys.glob ("*.csv"))