Script to create the `bikedata/data/bike-test-data.rda` file.

This gist can be used to create test files for the bikedata package. These are ultimately stored in an .rda structure, but reading the raw .csv files into R removes many of their original text-based idiosyncrasies. This script is therefore needed to convert the read.csv() results back into the precise forms of the individual .csv files for each city.

The first step is to create sample test files for each city and compress them in .zip archives. These can then be unzipped and read into R with these lines:

# Extract each sample archive into the working directory, in the order given.
# Only the divvy archive is extracted with junkpaths = TRUE, so that its files
# are written using just their base names.
zips <- c ("./tests/sample-cabi-dc-trips-history-data.zip",
           "./tests/sample-citibike-tripdata.zip",
           "./tests/sample-divvy-trips.zip",
           "./tests/sample-hubway-tripdata.zip",
           "./tests/sample-JourneyDataExtract-london.csv.zip",
           "./tests/sample-la-metro.zip")
for (z in zips)
    unzip (z, junkpaths = (z == "./tests/sample-divvy-trips.zip"))

# Read the extracted files; read.csv () treats the first line as a header by
# default. The divvy data come as two files: stations and trips.
ch_st <- read.csv ("Divvy_Stations.csv")
ch_tr <- read.csv ("Divvy_Trips_sample.csv")
dc <- read.csv ("2017-Q1-Trips-History-Data.csv")
lo <- read.csv ("01aJourneyDataExtract10Jan16-23Jan16.csv")
bo <- read.csv ("201604-hubway-tripdata.csv")
ny <- read.csv ("201612-citibike-tripdata.csv")
la <- read.csv ("la_metro_gbfs_trips_Q1_2017.csv")

Then each of these needs to be tidied to reflect the original text format:

# read.csv () alters column names containing spaces, so the original header
# text of each file is re-applied here, one name per column in file order.
ny <- setNames (ny, c ("Trip Duration",
                       "Start Time",
                       "Stop Time",
                       "Start Station ID",
                       "Start Station Name",
                       "Start Station Latitude",
                       "Start Station Longitude",
                       "End Station ID",
                       "End Station Name",
                       "End Station Latitude",
                       "End Station Longitude",
                       "Bike ID",
                       "User Type",
                       "Birth Year",
                       "Gender"))
dc <- setNames (dc, c ("Duration",
                       "Start date",
                       "End date",
                       "Start station number",
                       "Start station",
                       "End station number",
                       "End station",
                       "Bike number",
                       "Member Type"))
bo <- setNames (bo, c ("tripduration",
                       "starttime",
                       "stoptime",
                       "start station id",
                       "start station name",
                       "start station latitude",
                       "start station longitude",
                       "end station id",
                       "end station name",
                       "end station latitude",
                       "end station longitude",
                       "bikeid",
                       "usertype",
                       "birth year",
                       "gender"))
# Convert a column to character and strip leading and trailing whitespace.
# gsub () is used rather than trimws () because trimws () needs R >= 3.2.0,
# and avoiding it keeps the package more general.
strip_ws <- function (x) gsub ("^\\s+|\\s+$", "", as.character (x))

# The bo data are quoted, so their non-character fields have to be converted
# to character.
bo_cols <- c ('tripduration', 'start station id', 'start station latitude',
              'start station longitude', 'end station id',
              'end station longitude', 'end station latitude', 'bikeid',
              'birth year', 'gender')
for (col in names (bo) [names (bo) %in% bo_cols])
    bo [, col] <- strip_ws (bo [, col])

# The same conversion is applied to the ch_tr data.
ch_tr_cols <- c ('trip_id', 'bikeid', 'tripduration', 'from_station_id',
                 'to_station_id', 'birthyear')
for (col in names (ch_tr) [names (ch_tr) %in% ch_tr_cols])
    ch_tr [, col] <- strip_ws (ch_tr [, col])

# Missing birth years become empty strings instead of NA.
no_birthyear <- is.na (ch_tr$birthyear)
ch_tr$birthyear [no_birthyear] <- ""
# Re-apply the original header text of the lo file.
names (lo) <- c ("Rental Id",
                 "Duration",
                 "Bike Id",
                 "End Date",
                 "EndStation Id",
                 "EndStation Name",
                 "Start Date",
                 "StartStation Id",
                 "StartStation Name")
# Wrap the two station-name columns in literal double quotes.
for (station_col in c ('EndStation Name', 'StartStation Name'))
    lo [, station_col] <- paste0 ('\"', lo [, station_col], '\"')

The resultant structures can then be used to create the bike-test-data.rda file:

# Bundle the seven data frames into one named list and save it as the .rda
# test-data file.
bike_dat <- list (dc = dc, lo = lo, bo = bo, ny = ny,
                  ch_st = ch_st, ch_tr = ch_tr, la = la)
save (bike_dat, file = './data/bike-test-data.rda')
#rm (list = ls())
# Delete every .csv file in the working directory. This is done from within R
# instead of through a shell `rm` call so that the clean-up also works on
# systems without that command (e.g. Windows). Like `rm *.csv`, it removes ALL
# .csv files in the working directory, not only the ones extracted above.
unlink (Sys.glob ("*.csv"))

mpadge/bikedata-testfiles.md