pietrocolombo · December 11, 2020 10:25
diff --git a/parsing_geolife_dataset.r b/parsing_geolife_dataset.r
 # data-set
 # http://research.microsoft.com/en-us/projects/geolife/
 # script for reading .plt file and label file
 # assigns the associated label to each point
 # each folder represents a user, we only consider users who have the labels.txt file


 # Labelling pass over the Geolife dataset: for every user that ships a
 # labels.txt file, read each .plt trajectory, keep the points that fall inside
 # a labelled time window and append them, tagged with the transportation
 # mode, to dataset_raw.csv.

 # Root folder of the dataset (one sub-folder per user) and the output file.
 data_root <- "../Geolife Trajectories 1.3/Data"
 output_file <- "dataset_raw.csv"

 # Return the row indices of `times` that are exactly equal to `stamp`.
 #   times - POSIXct vector with the timestamp of every trajectory point
 #   stamp - a single POSIXct value to look for
 # The values are compared directly instead of calling grep() on their
 # character form: grep() does a substring match, so a midnight stamp that is
 # rendered as a bare date would match every point of that day. An NA stamp
 # gives integer(0) because which() drops NA.
 match_timestamp <- function(times, stamp) {
   which(times == stamp)
 }

 # Read one .plt file and return its points as a data frame with the columns
 # Latitude, Longitude, Altitude, Date_Time, Id_user, Id_perc and Label.
 #   path     - path of the .plt file
 #   user_id  - value stored in Id_user for every point
 #   track_id - value stored in Id_perc for every point
 read_trajectory <- function(path, user_id, track_id) {
   # the first 6 lines are skipped; every field is read as text and the
   # coordinates are converted explicitly afterwards
   raw <- read.table(path, header = FALSE, quote = "\"", skip = 6, sep = ",",
                     colClasses = rep("character", 7))
   # V3 (always 0) and V5 (days since a reference date) are not needed,
   # the date and the time are taken from V6 and V7
   data.frame(
     Latitude = as.double(raw$V1),
     Longitude = as.double(raw$V2),
     Altitude = as.double(raw$V4),
     Date_Time = as.POSIXct(paste(raw$V6, raw$V7, sep = " "),
                            format = "%Y-%m-%d %H:%M:%OS", tz = "GMT"),
     Id_user = user_id,
     Id_perc = track_id,
     # the label starts empty and is filled in for the matching sections
     Label = "",
     stringsAsFactors = FALSE
   )
 }

 # show the coordinates with all their decimals when they are printed
 # (set once; it does not need to be repeated for every file)
 options(digits = 10)

 # I take the path of each folder
 dirs_perc <- list.dirs(data_root, recursive = FALSE)
 # the folder name identifies the user; it is derived from the path so that
 # stray files in the root folder cannot shift the user ids
 dirs_name <- basename(dirs_perc)

 # start from a clean output file, without a warning when there is none
 if (file.exists(output_file)) {
   file.remove(output_file)
 }
 first_time <- TRUE

 # cycle on all the folders which indicate the user
 for (i_dirs in seq_along(dirs_perc)) {
   print(i_dirs)
   user_dir <- dirs_perc[i_dirs]
   label_path <- file.path(user_dir, "labels.txt")
   # only users who have the labels.txt file are considered
   if (!file.exists(label_path)) {
     next
   }

   # I read the label file
   label <- read.table(label_path, quote = "\"", sep = "\t", header = TRUE,
                       colClasses = c("character", "character", "character"))
   # I uniform the dates in order to be able to compare them
   label$Start.Time.Posix <- as.POSIXct(label$Start.Time,
                                        format = "%Y/%m/%d %H:%M:%OS", tz = "GMT")
   label$End.Time.Posix <- as.POSIXct(label$End.Time,
                                      format = "%Y/%m/%d %H:%M:%OS", tz = "GMT")

   # I take only the .plt files of the user (escaped and anchored pattern)
   trajectory_perc <- file.path(user_dir, "Trajectory")
   file_trajectory <- list.files(trajectory_perc, pattern = "\\.plt$")

   # cycle on the single user's .plt files
   for (plt_name in file_trajectory) {
     dati <- read_trajectory(file.path(trajectory_perc, plt_name),
                             dirs_name[i_dirs], plt_name)

     # marks the labels already associated with points of this file
     matched <- logical(nrow(label))

     # there can be multiple labels for each .plt file, so every remaining
     # label is checked against the timestamps of the file
     for (i_row_label in seq_len(nrow(label))) {
       find_label <- match_timestamp(dati$Date_Time,
                                     label$Start.Time.Posix[i_row_label])
       if (length(find_label) == 0) {
         next
       }
       find_label_end <- match_timestamp(dati$Date_Time,
                                         label$End.Time.Posix[i_row_label])
       if (length(find_label_end) == 0) {
         print("start timestamp matches but there is no end timestamp")
         next
       }
       if (length(find_label) > 1) {
         # happens when several points share the same timestamp
         print(paste("I found more labels, the frequency of points very dense", length(find_label), " file name ", plt_name, " directory name ", dirs_name[i_dirs], " index for ", i_dirs))
       }

       # the label is applied from the first start match to the last end match
       rows <- find_label[1]:tail(find_label_end, 1)
       dati$Label[rows] <- label$Transportation.Mode[i_row_label]
       dati2 <- dati[rows, ]

       # the header is written only with the first section; every later
       # section is appended, which avoids growing a data frame with rbind
       write.table(dati2, file = output_file, append = !first_time, sep = ",",
                   col.names = first_time, row.names = FALSE)
       first_time <- FALSE

       matched[i_row_label] <- TRUE
     }

     # I delete the labels I have already found so that they are not
     # searched again in the next files
     label <- label[!matched, ]
   }
 }
	# data-set
	# http://research.microsoft.com/en-us/projects/geolife/
	# script for reading .plt file and label file
	# assigns the associated label to each point
	# each folder represents a user, we only consider users who have the labels.txt file


	# Labelling pass over the Geolife dataset: for every user that ships a
	# labels.txt file, read each .plt trajectory, keep the points that fall inside
	# a labelled time window and append them, tagged with the transportation
	# mode, to dataset_raw.csv.

	# Root folder of the dataset (one sub-folder per user) and the output file.
	data_root <- "../Geolife Trajectories 1.3/Data"
	output_file <- "dataset_raw.csv"

	# Return the row indices of `times` that are exactly equal to `stamp`.
	#   times - POSIXct vector with the timestamp of every trajectory point
	#   stamp - a single POSIXct value to look for
	# The values are compared directly instead of calling grep() on their
	# character form: grep() does a substring match, so a midnight stamp that is
	# rendered as a bare date would match every point of that day. An NA stamp
	# gives integer(0) because which() drops NA.
	match_timestamp <- function(times, stamp) {
	  which(times == stamp)
	}

	# Read one .plt file and return its points as a data frame with the columns
	# Latitude, Longitude, Altitude, Date_Time, Id_user, Id_perc and Label.
	#   path     - path of the .plt file
	#   user_id  - value stored in Id_user for every point
	#   track_id - value stored in Id_perc for every point
	read_trajectory <- function(path, user_id, track_id) {
	  # the first 6 lines are skipped; every field is read as text and the
	  # coordinates are converted explicitly afterwards
	  raw <- read.table(path, header = FALSE, quote = "\"", skip = 6, sep = ",",
	                    colClasses = rep("character", 7))
	  # V3 (always 0) and V5 (days since a reference date) are not needed,
	  # the date and the time are taken from V6 and V7
	  data.frame(
	    Latitude = as.double(raw$V1),
	    Longitude = as.double(raw$V2),
	    Altitude = as.double(raw$V4),
	    Date_Time = as.POSIXct(paste(raw$V6, raw$V7, sep = " "),
	                           format = "%Y-%m-%d %H:%M:%OS", tz = "GMT"),
	    Id_user = user_id,
	    Id_perc = track_id,
	    # the label starts empty and is filled in for the matching sections
	    Label = "",
	    stringsAsFactors = FALSE
	  )
	}

	# show the coordinates with all their decimals when they are printed
	# (set once; it does not need to be repeated for every file)
	options(digits = 10)

	# I take the path of each folder
	dirs_perc <- list.dirs(data_root, recursive = FALSE)
	# the folder name identifies the user; it is derived from the path so that
	# stray files in the root folder cannot shift the user ids
	dirs_name <- basename(dirs_perc)

	# start from a clean output file, without a warning when there is none
	if (file.exists(output_file)) {
	  file.remove(output_file)
	}
	first_time <- TRUE

	# cycle on all the folders which indicate the user
	for (i_dirs in seq_along(dirs_perc)) {
	  print(i_dirs)
	  user_dir <- dirs_perc[i_dirs]
	  label_path <- file.path(user_dir, "labels.txt")
	  # only users who have the labels.txt file are considered
	  if (!file.exists(label_path)) {
	    next
	  }

	  # I read the label file
	  label <- read.table(label_path, quote = "\"", sep = "\t", header = TRUE,
	                      colClasses = c("character", "character", "character"))
	  # I uniform the dates in order to be able to compare them
	  label$Start.Time.Posix <- as.POSIXct(label$Start.Time,
	                                       format = "%Y/%m/%d %H:%M:%OS", tz = "GMT")
	  label$End.Time.Posix <- as.POSIXct(label$End.Time,
	                                     format = "%Y/%m/%d %H:%M:%OS", tz = "GMT")

	  # I take only the .plt files of the user (escaped and anchored pattern)
	  trajectory_perc <- file.path(user_dir, "Trajectory")
	  file_trajectory <- list.files(trajectory_perc, pattern = "\\.plt$")

	  # cycle on the single user's .plt files
	  for (plt_name in file_trajectory) {
	    dati <- read_trajectory(file.path(trajectory_perc, plt_name),
	                            dirs_name[i_dirs], plt_name)

	    # marks the labels already associated with points of this file
	    matched <- logical(nrow(label))

	    # there can be multiple labels for each .plt file, so every remaining
	    # label is checked against the timestamps of the file
	    for (i_row_label in seq_len(nrow(label))) {
	      find_label <- match_timestamp(dati$Date_Time,
	                                    label$Start.Time.Posix[i_row_label])
	      if (length(find_label) == 0) {
	        next
	      }
	      find_label_end <- match_timestamp(dati$Date_Time,
	                                        label$End.Time.Posix[i_row_label])
	      if (length(find_label_end) == 0) {
	        print("start timestamp matches but there is no end timestamp")
	        next
	      }
	      if (length(find_label) > 1) {
	        # happens when several points share the same timestamp
	        print(paste("I found more labels, the frequency of points very dense", length(find_label), " file name ", plt_name, " directory name ", dirs_name[i_dirs], " index for ", i_dirs))
	      }

	      # the label is applied from the first start match to the last end match
	      rows <- find_label[1]:tail(find_label_end, 1)
	      dati$Label[rows] <- label$Transportation.Mode[i_row_label]
	      dati2 <- dati[rows, ]

	      # the header is written only with the first section; every later
	      # section is appended, which avoids growing a data frame with rbind
	      write.table(dati2, file = output_file, append = !first_time, sep = ",",
	                  col.names = first_time, row.names = FALSE)
	      first_time <- FALSE

	      matched[i_row_label] <- TRUE
	    }

	    # I delete the labels I have already found so that they are not
	    # searched again in the next files
	    label <- label[!matched, ]
	  }
	}
No results found