Last active
February 13, 2025 13:18
-
-
Save AlexandraKapp/296f79e77e3b7772f8af486a72689da0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A script to:
# - download the GeoLife (https://www.microsoft.com/en-us/research/publication/geolife-gps-trajectory-dataset-user-guide/) data
# - transform it to a .csv file
# - and cut records to a given boundary
# Not very performant because of the preprocessing with pandas (takes about 30 min).
# Produces a ~1.8 GB output csv file.
import os | |
from pathlib import Path | |
import csv | |
import numpy as np | |
import pandas as pd | |
from io import BytesIO | |
from zipfile import ZipFile | |
from urllib.request import urlopen | |
from tqdm.auto import tqdm | |
##### INPUT VARIABLES #####
# Output locations for the raw download and the preprocessed CSV files.
RAW_DATA_PATH = "raw/geolife"
PROCESSED_DATA_PATH = "preprocessed"

# Bounding box applied when CUT_RECORDS_TO_BOUNDARY is enabled
# (covers the Beijing area of the GeoLife dataset).
CUT_RECORDS_TO_BOUNDARY = True
LNG_MIN = 116.08
LNG_MAX = 116.69
LAT_MIN = 39.66
LAT_MAX = 40.27
############ Download data ###############
# GEOLIFE
# Download and unpack the GeoLife archive only if the raw data directory is
# missing. NOTE: the whole archive (~300 MB — verify) is held in memory in a
# BytesIO before extraction; acceptable for a one-off script.
if not os.path.exists(RAW_DATA_PATH):
    url = (
        "https://download.microsoft.com/download/F/4/8/"
        "F4894AA5-FDBC-481E-9285-D5F8C4C4F039/Geolife%20Trajectories%201.3.zip"
    )
    with tqdm(total=1, desc="Download geolife data") as pbar:  # progress bar
        with urlopen(url) as zipresp:
            archive = BytesIO(zipresp.read())
        # Create the target directory only AFTER the download succeeded: the
        # original code created it first, so a failed/interrupted download left
        # an empty directory behind and every later run skipped the download.
        os.makedirs(RAW_DATA_PATH)
        with ZipFile(archive) as zfile:
            zfile.extractall(RAW_DATA_PATH)
        pbar.update()
else:
    print("Geolife data already exists. Download is skipped.")
############ Preprocess data ############### | |
#### FUNCTIONS #### | |
# clean header of plt files and collect all data rows
def geolife_clean_plt(root, user_id, input_filepath, traj_id):
    """Read one GeoLife .plt file and return its data rows.

    The six header lines of the .plt format are skipped; every remaining CSV
    row is prefixed with ``user_id`` and ``traj_id``.

    Parameters
    ----------
    root : str
        Directory containing the per-user folders.
    user_id : str
        Name of the user folder (e.g. "000").
    input_filepath : str
        File name of the .plt file inside ``<root>/<user_id>/Trajectory``.
    traj_id : int
        Sequential trajectory id assigned by the caller.

    Returns
    -------
    list[list]
        Rows of the form ``[user_id, traj_id, lat, lng, "-", alt, dayNo,
        date, time]`` with all .plt fields kept as strings.
    """
    # os.path.join instead of manual "/" concatenation (portable paths);
    # newline="" is the documented way to hand a file to csv.reader.
    path = os.path.join(root, user_id, "Trajectory", input_filepath)
    with open(path, "rt", newline="") as fin:
        reader = csv.reader(fin)
        # .plt files start with 6 header lines that carry no record data;
        # skip them lazily instead of materializing and slicing the list.
        for _ in range(6):
            next(reader, None)
        return [[user_id, traj_id, *row] for row in reader]
def geolife_data_to_df(dir):
    """Collect all users' .plt records under *dir* into one DataFrame.

    Parameters
    ----------
    dir : str
        The GeoLife ``Data`` directory containing one folder per user id.

    Returns
    -------
    pandas.DataFrame
        Columns: uid, tid, lat, lng, "-", Alt, dayNo, date, time. Values are
        the raw strings read from the .plt files (uid/tid excepted).
    """
    col_names = ["uid", "tid", "lat", "lng", "-", "Alt", "dayNo", "date", "time"]
    user_id_dirs = [
        name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir, name))
    ]
    data = []
    with tqdm(total=len(user_id_dirs), desc="Preprocess Geolife data") as pbar:  # progress bar
        # stdlib sorted() suffices for a list of directory-name strings;
        # no need for np.sort here.
        for user_id in sorted(user_id_dirs):
            traj_dir = os.path.join(dir, user_id, "Trajectory")
            # ignore macOS metadata files that may appear in the archive
            plt_files = [f for f in os.listdir(traj_dir) if not f.endswith(".DS_Store")]
            # trajectory ids are assigned per user in listing order
            for traj_id, plt_file in enumerate(plt_files):
                data += geolife_clean_plt(dir, user_id, plt_file, traj_id)
            pbar.update()
    return pd.DataFrame(data, columns=col_names)
#####
##### SCRIPT #####
# Build the preprocessed GeoLife CSV, or load it if a previous run cached it.
if Path(os.path.join(PROCESSED_DATA_PATH, "geolife.csv")).exists():
    print("Geolife data is already preprocessed. Processing is skipped.")
    df = pd.read_csv(os.path.join(PROCESSED_DATA_PATH, "geolife.csv"))
else:
    if not Path(PROCESSED_DATA_PATH).exists():
        os.makedirs(PROCESSED_DATA_PATH)
    geolife_dir = os.path.join(RAW_DATA_PATH, "Geolife Trajectories 1.3", "Data")
    df = geolife_data_to_df(geolife_dir)
    # Merge the separate date and time string columns into one datetime column,
    # then drop the originals in a single call (instead of two inplace drops).
    df["datetime"] = pd.to_datetime(df["date"] + " " + df["time"])
    df.drop(columns=["date", "time"], inplace=True)
    ## fix datetime timezone
    # GeoLife timestamps are recorded in GMT; convert to local Beijing time
    # and strip the tz info again so the stored CSV stays timezone-naive.
    df["datetime"] = (
        df["datetime"]
        .dt.tz_localize("GMT")
        .dt.tz_convert("Asia/Shanghai")
        .dt.tz_localize(None)
    )
    df.to_csv(os.path.join(PROCESSED_DATA_PATH, "geolife.csv"), index=False)
############ Cut to outline of given boundary ###############
if CUT_RECORDS_TO_BOUNDARY:
    print("Records are cut to outline of given boundary.")
    # Coordinates are strings when freshly parsed from the .plt files;
    # make them numeric before comparing against the bounding box.
    df["lat"] = df["lat"].astype(float)
    df["lng"] = df["lng"].astype(float)
    # Keep only records strictly inside the configured bounding box.
    inside = (
        (df["lat"] > LAT_MIN)
        & (df["lat"] < LAT_MAX)
        & (df["lng"] > LNG_MIN)
        & (df["lng"] < LNG_MAX)
    )
    df = df[inside]
    df.to_csv(os.path.join(PROCESSED_DATA_PATH, "geolife_in_boundary.csv"), index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment