Skip to content

Instantly share code, notes, and snippets.

@pietrocolombo
Created December 11, 2020 10:58
Show Gist options
  • Select an option

  • Save pietrocolombo/ecc70dda9353d084a373d5b51cb0d6c3 to your computer and use it in GitHub Desktop.

Select an option

Save pietrocolombo/ecc70dda9353d084a373d5b51cb0d6c3 to your computer and use it in GitHub Desktop.
consistency check and add new features
# let's add information to our dataset
# Convert an angle, or a numeric vector of angles, from degrees to radians.
deg2rad <- function(deg) {
  deg * pi / 180
}
if(!require(geosphere)){
install.packages("geosphere")
library("geosphere")
}
# Path to the raw input CSV
# (presumably the file produced by parsing_geolife_dataset.r -- confirm).
perc_csv <- "dataset_raw.csv"
dati <- read.csv(perc_csv, header = TRUE, sep = ",", quote = "\"", dec = ".")
# Report how many missing values each column contains.
print(colSums(is.na(dati)))
# Remove the output of a previous run of this script, if any. The existence
# check avoids the warning file.remove() raises for a missing file.
if (file.exists("dataset_with_add_features.csv")) {
  file.remove("dataset_with_add_features.csv")
}
# Number of rows before the consistency check.
print(nrow(dati))
# Keep only rows whose coordinates are present and inside the valid range:
# latitude in [-90, 90] and longitude in [-180, 180]. The boundary values are
# valid coordinates, so the comparison is inclusive. The explicit is.na()
# tests make the mask free of NA: indexing a data frame with an NA logical
# would insert an all-NA row instead of dropping the offending one.
valid_position <- !is.na(dati$Latitude) & !is.na(dati$Longitude) &
  dati$Latitude >= -90 & dati$Latitude <= 90 &
  dati$Longitude >= -180 & dati$Longitude <= 180
dati <- dati[valid_position, ]
# Number of rows left after removing the inconsistent ones.
print(nrow(dati))
# Derive per-point movement features and save the enriched dataset.
#
# New columns (all 0 for the first point of every path):
#   distance   - geodesic distance from the previous point (distGeo, meters)
#   delta_time - seconds elapsed since the previous point
#   vel        - distance / delta_time, or 0 when either of them is 0
#   angle      - initial bearing from the previous point, radians in [0, 2*pi)
#
# Everything is computed on whole vectors: distGeo() accepts two-column
# (longitude, latitude) matrices, so no row-by-row loop is needed.
n_rows <- nrow(dati)

# same_track[i] is TRUE when row i has the same Id_user, Id_perc and Label as
# row i - 1, i.e. it continues the same path. The first row has no predecessor.
same_track <- rep(FALSE, n_rows)
if (n_rows > 1L) {
  later <- seq.int(2L, n_rows)
  same_track[later] <-
    (dati$Id_user[later - 1L] == dati$Id_user[later]) &
    (dati$Id_perc[later - 1L] == dati$Id_perc[later]) &
    (dati$Label[later - 1L] == dati$Label[later])
}
# A comparison involving NA yields NA; treat it as the start of a new path
# rather than letting it propagate into the indexing below.
same_track[is.na(same_track)] <- FALSE

cur <- which(same_track)
prv <- cur - 1L

# Preallocate the feature vectors; rows that start a path keep the value 0.
distance <- numeric(n_rows)
vel <- numeric(n_rows)
delta_time <- numeric(n_rows)
angle <- numeric(n_rows)

if (length(cur) > 0L) {
  # Distance in meters between the previous and the current point.
  distance[cur] <- distGeo(
    cbind(dati$Longitude[prv], dati$Latitude[prv]),
    cbind(dati$Longitude[cur], dati$Latitude[cur])
  )

  # Parse the timestamps once instead of twice per row.
  # NOTE(review): parsed in the session time zone, exactly as difftime() did
  # on the raw values before; if Date_Time is stored in UTC consider passing
  # tz = "UTC" so daylight-saving shifts cannot skew delta_time -- confirm.
  timestamps <- as.POSIXct(dati$Date_Time)
  delta_time[cur] <- as.numeric(
    difftime(timestamps[cur], timestamps[prv], units = "secs")
  )

  # Speed is left at 0 when the point did not move or no time elapsed
  # (this also avoids dividing by zero). which() drops NA comparisons.
  moving <- cur[which(distance[cur] != 0 & delta_time[cur] != 0)]
  vel[moving] <- distance[moving] / delta_time[moving]

  # Initial bearing (angle from north) from the previous to the current point.
  lon_rad <- deg2rad(dati$Longitude)
  lat_rad <- deg2rad(dati$Latitude)
  d_lon <- lon_rad[cur] - lon_rad[prv]
  bearing <- atan2(
    sin(d_lon) * cos(lat_rad[cur]),
    cos(lat_rad[prv]) * sin(lat_rad[cur]) -
      sin(lat_rad[prv]) * cos(lat_rad[cur]) * cos(d_lon)
  )
  # atan2() returns values in (-pi, pi]; map them to [0, 2*pi) so that a
  # due-north heading is stored as 0 and not as 2*pi.
  angle[cur] <- (bearing + 2 * pi) %% (2 * pi)
}

# Attach the new features to the data frame, in the original column order.
dati$distance <- distance
dati$vel <- vel
dati$delta_time <- delta_time
dati$angle <- angle

# Save the enriched dataset.
write.csv(dati, file = "dataset_with_add_features.csv", row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment