Last active
December 11, 2020 10:25
-
-
Save pietrocolombo/6e974423445e36b8dc589b771d635e35 to your computer and use it in GitHub Desktop.
Parse the GeoLife trajectory dataset into a single CSV file, attaching the transportation-mode label to each GPS point.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Data set: GeoLife GPS Trajectories
# http://research.microsoft.com/en-us/projects/geolife/
#
# Script that reads the .plt trajectory files and the labels.txt file of every
# user and assigns the associated transportation-mode label to each point.
# Each folder under Data/ represents one user; only users that have a
# labels.txt file are considered. Every labelled segment that is found is
# appended to dataset_raw.csv.

data_dir <- "../Geolife Trajectories 1.3/Data"
output_file <- "dataset_raw.csv"

# Kept from the original script, but set once instead of once per file.
# It only controls how numbers are printed; the CSV is written by
# write.table(), which emits doubles at full precision on its own.
options(digits = 10)

# Read a user's labels.txt and add GMT-parsed start/end timestamp columns
# (Start.Time.Posix, End.Time.Posix) so they can be compared with the points.
read_labels <- function(user_dir) {
  labels <- read.table(
    file.path(user_dir, "labels.txt"),
    quote = "\"", sep = "\t", header = TRUE,
    colClasses = c("character", "character", "character")
  )
  label_format <- "%Y/%m/%d %H:%M:%OS"
  labels$Start.Time.Posix <- as.POSIXct(
    labels$Start.Time, format = label_format, tz = "GMT"
  )
  labels$End.Time.Posix <- as.POSIXct(
    labels$End.Time, format = label_format, tz = "GMT"
  )
  labels
}

# Read one .plt file (its first six lines are skipped) and return a data frame
# with the columns Latitude, Longitude, Altitude, Date_Time, Id_user, Id_perc
# and an empty Label.
# Columns V3 and V5 of the file are dropped: according to the original author
# V3 is always 0 and V5 is a day count that duplicates the date columns.
read_trajectory <- function(plt_path, user_id, trajectory_id) {
  raw <- read.table(
    plt_path,
    header = FALSE, quote = "\"", skip = 6, sep = ",",
    colClasses = rep("character", 7), numerals = "no.loss"
  )
  n_points <- nrow(raw)
  data.frame(
    # Everything is read as text and converted here so that no decimal places
    # of the coordinates are lost while parsing.
    Latitude = as.double(raw$V1),
    Longitude = as.double(raw$V2),
    Altitude = as.double(raw$V4),
    Date_Time = as.POSIXct(
      paste(raw$V6, raw$V7), format = "%Y-%m-%d %H:%M:%OS", tz = "GMT"
    ),
    Id_user = rep(user_id, n_points),
    Id_perc = rep(trajectory_id, n_points),
    Label = rep("", n_points),
    stringsAsFactors = FALSE
  )
}

# Locate the rows of `date_time` that one label spans.
# Returns a list with
#   n_start, n_end: number of points whose timestamp equals the label's start
#                   / end time,
#   rows:           the row range from the first start match to the last end
#                   match, or NULL when either timestamp is absent.
# Timestamps are compared for exact equality. Using them as grep() patterns
# (as the script did before) depends on how the date-times are formatted as
# text: a midnight label can match every point of that day, and an NA label
# produces NA indices.
locate_segment <- function(date_time, start_time, end_time) {
  start_idx <- which(date_time == start_time)
  end_idx <- which(date_time == end_time)
  rows <- NULL
  if (length(start_idx) > 0 && length(end_idx) > 0) {
    rows <- start_idx[1]:end_idx[length(end_idx)]
  }
  list(n_start = length(start_idx), n_end = length(end_idx), rows = rows)
}

# Path of each user folder, and the folder name that identifies the user.
# The ids are derived from the paths so both vectors always line up, even if
# Data/ also contains plain files.
user_dirs <- list.dirs(data_dir, recursive = FALSE)
user_ids <- basename(user_dirs)

# Start from a fresh output file (file.remove() warns if the file is missing).
if (file.exists(output_file)) {
  file.remove(output_file)
}
header_written <- FALSE

# Cycle over all the folders, i.e. over all the users.
for (i_user in seq_along(user_dirs)) {
  print(i_user)
  user_dir <- user_dirs[i_user]

  # Only users with a labels.txt file can be labelled.
  if (!file.exists(file.path(user_dir, "labels.txt"))) {
    next
  }
  labels <- read_labels(user_dir)

  # Only the .plt files of the Trajectory folder are read.
  trajectory_dir <- file.path(user_dir, "Trajectory")
  plt_files <- list.files(trajectory_dir, pattern = "\\.plt$")

  for (plt_file in plt_files) {
    # Labels are consumed as they are matched; once none are left the
    # remaining files of this user cannot contribute any output.
    if (nrow(labels) == 0) {
      break
    }

    points <- read_trajectory(
      file.path(trajectory_dir, plt_file), user_ids[i_user], plt_file
    )

    # One .plt file can contain several labelled segments, so every remaining
    # label is checked against it. `matched` marks the labels found here.
    matched <- logical(nrow(labels))
    for (i_label in seq_len(nrow(labels))) {
      segment <- locate_segment(
        points$Date_Time,
        labels$Start.Time.Posix[i_label],
        labels$End.Time.Posix[i_label]
      )
      if (segment$n_start == 0) {
        next
      }
      if (segment$n_end == 0) {
        print("start timestamp matches but there is no end timestamp")
        next
      }
      if (segment$n_start > 1) {
        # Happens when several points share the same timestamp.
        print(paste("I found more labels, the frequency of points very dense", segment$n_start, " file name ", plt_file, " directory name ", user_ids[i_user], " index for ", i_user))
      }

      # Keep only the stretch of the trajectory covered by this label.
      labelled <- points[segment$rows, ]
      labelled$Label <- labels$Transportation.Mode[i_label]

      # Segments are appended to the CSV straight away instead of being
      # rbind-ed in memory. The first write creates the file with the header;
      # later writes append rows only.
      write.table(
        labelled,
        file = output_file, append = header_written, sep = ",",
        col.names = !header_written, row.names = FALSE
      )
      header_written <- TRUE
      matched[i_label] <- TRUE
    }

    # Drop the labels already assigned so they are not searched again.
    labels <- labels[!matched, , drop = FALSE]
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment