Skip to content

Instantly share code, notes, and snippets.

@pietrocolombo
Last active December 11, 2020 10:25
Show Gist options
  • Select an option

  • Save pietrocolombo/6e974423445e36b8dc589b771d635e35 to your computer and use it in GitHub Desktop.

Select an option

Save pietrocolombo/6e974423445e36b8dc589b771d635e35 to your computer and use it in GitHub Desktop.
Parsing the GeoLife dataset to CSV with labels
# GeoLife trajectory labelling script.
# Data set: http://research.microsoft.com/en-us/projects/geolife/
#
# Every sub-folder of the data directory represents one user. For each user
# that has a labels.txt file, every .plt trajectory file is read and the points
# lying between a label's start and end timestamps are tagged with that label's
# transportation mode. Each labelled segment is appended to dataset_raw.csv
# with the columns Latitude, Longitude, Altitude, Date_Time, Id_user, Id_perc
# and Label.

data_root <- "../Geolife Trajectories 1.3/Data"
output_file <- "dataset_raw.csv"

# Row indices of `times` whose timestamp is exactly equal to `target`.
#
# times:  POSIXct vector of point timestamps.
# target: single POSIXct timestamp to look for.
# Returns an integer vector of matching positions; it is empty when nothing
# matches or when `target` is NA (which() drops NA comparisons).
# An exact comparison is used instead of a regular-expression search on the
# printed timestamps, so a date-only rendering of a midnight timestamp cannot
# match every point recorded on that day.
rows_at_time <- function(times, target) {
  which(times == target)
}

# Path of every user folder; the folder name itself is the user id. Deriving
# the names from the paths keeps both vectors aligned even when the data
# directory also contains plain files.
dirs_perc <- list.dirs(data_root, recursive = FALSE)
dirs_name <- basename(dirs_perc)

# Start from a clean output file; only remove it when it is really there.
if (file.exists(output_file)) {
  file.remove(output_file)
}
# TRUE until the first segment has been written (controls the CSV header).
first_time <- TRUE

# Show enough significant digits when coordinates are printed. Set once here
# rather than again for every trajectory file.
options(digits = 10)

# Loop over all user folders.
for (i_dirs in seq_along(dirs_perc)) {
  print(i_dirs)
  user_dir <- dirs_perc[i_dirs]

  # Only users that provide a labels.txt file are processed.
  label_file <- file.path(user_dir, "labels.txt")
  if (!file.exists(label_file)) {
    next
  }

  # Read the labels and convert their time range to POSIXct so that they can
  # be compared with the trajectory timestamps.
  label <- read.table(
    label_file,
    quote = "\"", sep = "\t", header = TRUE,
    colClasses = c("character", "character", "character")
  )
  label$Start.Time.Posix <- as.POSIXct(
    label$Start.Time, format = "%Y/%m/%d %H:%M:%OS", tz = "GMT"
  )
  label$End.Time.Posix <- as.POSIXct(
    label$End.Time, format = "%Y/%m/%d %H:%M:%OS", tz = "GMT"
  )

  # Only the .plt files of the user's Trajectory folder are considered.
  trajectory_perc <- file.path(user_dir, "Trajectory")
  file_trajectory <- list.files(trajectory_perc, pattern = "\\.plt$")

  for (plt_name in file_trajectory) {
    # Every label of this user has already been assigned: nothing left to do.
    if (nrow(label) == 0) {
      break
    }

    # Read one .plt file (the first 6 lines are skipped). Everything is read
    # as text and converted explicitly below.
    raw <- read.table(
      file.path(trajectory_perc, plt_name),
      header = FALSE, quote = "\"", skip = 6, sep = ",",
      colClasses = rep("character", 7)
    )

    # Keep latitude, longitude and altitude (V1, V2, V4) and merge the date
    # and time columns (V6, V7) into one timestamp. V3 and V5 are not needed:
    # V3 is always 0 and V5 is a day count that duplicates the date.
    dati <- data.frame(
      Latitude = as.double(raw$V1),
      Longitude = as.double(raw$V2),
      Altitude = as.double(raw$V4),
      Date_Time = as.POSIXct(
        paste(raw$V6, raw$V7, sep = " "),
        format = "%Y-%m-%d %H:%M:%OS", tz = "GMT"
      ),
      Id_user = dirs_name[i_dirs],  # user id
      Id_perc = plt_name,           # id of the path (the .plt file name)
      Label = "",                   # filled in for matched segments
      stringsAsFactors = FALSE
    )
    point_times <- dati$Date_Time

    # Marks the labels that were matched in this file so that they can be
    # dropped before the next file is processed.
    matched <- logical(nrow(label))

    # A .plt file can contain several labelled segments, so every remaining
    # label is checked against it.
    for (i_row_label in seq_len(nrow(label))) {
      find_label <- rows_at_time(
        point_times, label$Start.Time.Posix[i_row_label]
      )
      if (length(find_label) == 0) {
        next
      }

      find_label_end <- rows_at_time(
        point_times, label$End.Time.Posix[i_row_label]
      )
      if (length(find_label_end) == 0) {
        print("start timestamp matches but there is no end timestamp")
        next
      }

      if (length(find_label) > 1) {
        # Several points share the start timestamp (very dense sampling).
        print(paste(
          "I found more labels, the frequency of points very dense",
          length(find_label),
          " file name ", plt_name,
          " directory name ", dirs_name[i_dirs],
          " index for ", i_dirs
        ))
      }

      # The segment runs from the first point at the start time to the last
      # point at the end time.
      segment_rows <- find_label[1]:find_label_end[length(find_label_end)]
      dati$Label[segment_rows] <- label$Transportation.Mode[i_row_label]

      # Append the labelled segment to the CSV file; the header is written
      # only together with the very first segment.
      write.table(
        dati[segment_rows, ],
        file = output_file, append = !first_time, sep = ",",
        col.names = first_time, row.names = FALSE
      )
      first_time <- FALSE
      matched[i_row_label] <- TRUE
    }

    # Drop the labels that have been assigned in this file.
    label <- label[!matched, , drop = FALSE]
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment