Created
December 11, 2020 10:58
-
-
Save pietrocolombo/ecc70dda9353d084a373d5b51cb0d6c3 to your computer and use it in GitHub Desktop.
consistency check and add new features
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # let's add information to our dataset | |
#' Convert angles from degrees to radians.
#'
#' @param deg Numeric vector of angles expressed in degrees.
#' @return Numeric vector, same length as `deg`, with the angles in radians.
deg2rad <- function(deg) {
  scaled <- deg * pi
  scaled / 180
}
| if(!require(geosphere)){ | |
| install.packages("geosphere") | |
| library("geosphere") | |
| } | |
# Path to the CSV produced by the parsing_geolife_dataset.r script.
perc_csv <- "dataset_raw.csv"
dati <- read.csv(perc_csv, header = TRUE, sep = ",", quote = "\"", dec = ".")

# Report how many missing values each column contains.
print(colSums(is.na(dati)))

# Remove the output left by a previous run of this script, if there is one.
# The file.exists() guard avoids the warning file.remove() raises when the
# file is not there (e.g. on the very first run).
output_csv <- "dataset_with_add_features.csv"
if (file.exists(output_csv)) {
  file.remove(output_csv)
}

# Number of tuples in the dataset before the consistency check.
print(nrow(dati))

# Keep only the rows whose coordinates are present and inside the valid
# ranges: latitude in [-90, 90] and longitude in [-180, 180]. The bounds are
# inclusive because the poles and the antimeridian are legitimate positions.
# Rows with a missing coordinate are excluded explicitly: indexing a data
# frame with an NA condition would insert rows made entirely of NA.
valid_coord <- !is.na(dati$Latitude) & !is.na(dati$Longitude) &
  dati$Latitude >= -90 & dati$Latitude <= 90 &
  dati$Longitude >= -180 & dati$Longitude <= 180
dati <- dati[valid_coord, ]

# Number of tuples left after removing the inconsistent ones.
print(nrow(dati))
# Flag, for every row, whether it continues the path of the previous row:
# TRUE when Id_user, Id_perc and Label all match the row above, FALSE when any
# of them differs (a new path starts). The first row has no predecessor, so it
# is always FALSE. The flags are built with one vectorized comparison because
# in R that is faster than evaluating the condition row by row.
n_rows <- nrow(dati)
cond <- logical(n_rows)
if (n_rows > 1) {
  curr <- 2:n_rows
  prev <- curr - 1L
  cond[curr] <- dati$Id_user[prev] == dati$Id_user[curr] &
    dati$Id_perc[prev] == dati$Id_perc[curr] &
    dati$Label[prev] == dati$Label[curr]
}
# A comparison that involves a missing value yields NA; treat it as a path
# change so that `cond` can always be used as an if() condition.
cond[is.na(cond)] <- FALSE

# Copy the columns into support vectors to optimize the execution time.
longitudine <- dati$Longitude
latitudine <- dati$Latitude
date_time <- dati$Date_Time

# New features, initialised to 0. numeric(n_rows) is used instead of a scalar
# 0 so the assignment also works when the data frame has zero rows.
dati$distance <- numeric(n_rows)
dati$vel <- numeric(n_rows)
dati$delta_time <- numeric(n_rows)
dati$angle <- numeric(n_rows)

# Working copies of the feature columns, filled in by the feature computation.
distance <- dati$distance
vel <- dati$vel
delta_time <- dati$delta_time
angle <- dati$angle
# For every row that continues the path of the previous row (cond is TRUE),
# compute the movement features relative to the previous point. Rows that
# start a new path keep the initial value 0 in every feature. Everything is
# computed with vectorized calls instead of a row-by-row loop.
idx <- which(cond) # which() ignores NA flags
idx <- idx[idx > 1L] # the first row can never have a predecessor
prev_idx <- idx - 1L

if (length(idx) > 0) {
  # Distance in meters between the previous point and the current point.
  # distGeo() accepts two-column (longitude, latitude) matrices.
  distance[idx] <- distGeo(
    cbind(longitudine[prev_idx], latitudine[prev_idx]),
    cbind(longitudine[idx], latitudine[idx])
  )

  # Time in seconds between the previous point and the current point.
  # The timestamps are parsed once, with a fixed time zone, so the result does
  # not depend on the time zone (or DST transitions) of the machine running
  # the script.
  # NOTE(review): assumes Date_Time carries no zone information and can be
  # read as UTC -- confirm against the parsing script.
  timestamps <- as.POSIXct(date_time, tz = "UTC")
  delta_time[idx] <- as.numeric(
    difftime(timestamps[idx], timestamps[prev_idx], units = "secs")
  )

  # Speed in meters per second; 0 when either the distance or the elapsed
  # time is 0 (this also avoids dividing by zero).
  step_dist <- distance[idx]
  step_time <- delta_time[idx]
  vel[idx] <- ifelse(
    step_dist == 0 | step_time == 0,
    0,
    step_dist / step_time
  )

  # Initial bearing (angle from north, in radians) from the previous point to
  # the current point.
  lat_prev <- deg2rad(latitudine[prev_idx])
  lat_curr <- deg2rad(latitudine[idx])
  delta_lon <- deg2rad(longitudine[idx]) - deg2rad(longitudine[prev_idx])
  bearing <- atan2(
    sin(delta_lon) * cos(lat_curr),
    cos(lat_prev) * sin(lat_curr) -
      sin(lat_prev) * cos(lat_curr) * cos(delta_lon)
  )

  # atan2() returns values in (-pi, pi]; the modulo maps them to [0, 2 * pi),
  # so a bearing of exactly 0 stays 0 instead of becoming 2 * pi.
  angle[idx] <- bearing %% (2 * pi)
}
# Store the computed feature vectors back into the data frame.
dati[["distance"]] <- distance
dati[["vel"]] <- vel
dati[["delta_time"]] <- delta_time
dati[["angle"]] <- angle

# Write the enriched dataset to a new CSV file, without a row-names column.
write.csv(dati, file = "dataset_with_add_features.csv", row.names = FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment