Determine country entry/exit history using google location history
""" | |
For a visa application, I need to provide a list of all countries I've visited in the past 10 | |
years, along with the associated entry/exit dates. | |
This is a pain as I've visited 37 countries in that timeframe. | |
This script parses Google location history to provide a list of countries visited along with entry/exit dates. | |
Usage: | |
1) Download and unzip timeline history from https://takeout.google.com/ | |
2) Download and unzip shapefile of country boundaries from https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-admin-0-countries/ | |
3) Run this script. Output is provided in file 'country_visits.csv' | |
Limitations: | |
* I'm using semantic location history (place visits). This is quicker than parsing full location history, | |
but I'm wondering if in some cases a country visit could be missed, if for some reason an associated place | |
visit is not logged. Parsing the full Records.json may be more reliably but probably slower. | |
* I've noticed some country lookups fail due to limited resolution of the countries shape file (e.g. when near coastline). | |
The script should be extended to handle this case, or perhaps switch to use a reverse geocoding service which should be | |
more reliable. | |
""" | |
import json
import pandas as pd
import geopandas as gpd
import numpy as np
from pathlib import Path
from shapely.geometry import Point


def parse_file(json_filename: Path) -> pd.DataFrame:
    with open(json_filename) as f:
        timeline_objects = json.load(f)['timelineObjects']
    # timeline_objects contains 'placeVisit' and 'activitySegment' objects;
    # keep only the place visits
    place_visits = [obj['placeVisit'] for obj in timeline_objects if 'placeVisit' in obj]
    # drop visits without coordinates
    place_visits = [visit for visit in place_visits if 'latitudeE7' in visit['location']]
    visits = []
    for p in place_visits:
        lat = p['location']['latitudeE7'] * 1e-7  # E7 fields store degrees * 1e7
        lon = p['location']['longitudeE7'] * 1e-7
        time = pd.to_datetime(p['duration']['startTimestamp'])
        visits.append(dict(
            time=time,
            lat=lat,
            lon=lon,
        ))
    return pd.DataFrame(visits)
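As a quick illustration of the fixed-point format parsed above: `latitudeE7`/`longitudeE7` store decimal degrees scaled by 10^7. The toy record below uses invented values, but has the same structure `parse_file` expects from the Semantic Location History export:

```python
import pandas as pd

# Toy placeVisit record; values are made up, structure matches the export
visit = {
    "location": {"latitudeE7": -411234567, "longitudeE7": 1746543210},
    "duration": {"startTimestamp": "2023-05-01T08:30:00Z"},
}

lat = visit["location"]["latitudeE7"] * 1e-7   # E7 -> decimal degrees
lon = visit["location"]["longitudeE7"] * 1e-7
time = pd.to_datetime(visit["duration"]["startTimestamp"])

print(lat, lon, time.date())
```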
# read all json files into a single dataframe
json_files = Path("Takeout/Location History (Timeline)/Semantic Location History/").glob("**/*.json")
country_visits = pd.concat(
    [parse_file(json_file) for json_file in json_files],
    ignore_index=True,
)
country_visits = country_visits.sort_values(by="time").reset_index(drop=True)
# look up country name from lat/lon using geopandas + world shapefile
# shapefile source: https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-admin-0-countries/
gdf = gpd.GeoDataFrame(
    country_visits,
    geometry=gpd.points_from_xy(country_visits["lon"], country_visits["lat"]),
    crs="EPSG:4326",  # timeline coordinates are WGS84 lat/lon
)
world = gpd.read_file('ne_10m_admin_0_countries/ne_10m_admin_0_countries.shp')
# the Natural Earth shapefile ships with a .prj, so `world` already has a CRS;
# calling set_crs() on it would raise unless allow_override=True. Reproject
# instead, in case it differs from the points' CRS.
world = world.to_crs(gdf.crs)
gdf = gpd.sjoin(gdf, world, how='left', predicate='within')
country_visits["country"] = gdf["NAME"]
# some country lookups fail because the coords fall just outside the shapefile
# boundaries (e.g. near coastlines). Drop those rows for now.
country_visits = country_visits.dropna(subset=["country"]).reset_index(drop=True)
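One way to handle those near-coastline failures (a sketch, not wired into the script above) is to fall back to the nearest polygon via `gpd.sjoin_nearest`, available in geopandas >= 0.10. A toy example with a point just offshore of a square "country":

```python
import geopandas as gpd
from shapely.geometry import Point, Polygon

# a point just outside the polygon fails the 'within' test,
# but sjoin_nearest still assigns it the closest polygon
land = gpd.GeoDataFrame(
    {"NAME": ["Aland"]},
    geometry=[Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])],
)
offshore = gpd.GeoDataFrame(geometry=[Point(1.001, 0.5)])

within = gpd.sjoin(offshore, land, how="left", predicate="within")
nearest = gpd.sjoin_nearest(offshore, land, how="left")

print(within["NAME"].iloc[0], nearest["NAME"].iloc[0])
```

For real data a `max_distance` threshold would be sensible, so that genuinely unmatchable points aren't silently assigned to a distant country.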
# filter to only the rows where the country changes
country_change = country_visits["country"] != country_visits["country"].shift()
country_visits = country_visits[country_change].reset_index(drop=True)
# add date columns for easier reading; each stay's exit date is the next stay's entry date
country_visits['entry_date'] = country_visits['time'].dt.strftime('%Y-%m-%d')
country_visits['exit_date'] = country_visits['entry_date'].shift(-1)
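The change-of-country filter and the entry/exit pairing above both reduce to pandas `shift` operations, illustrated here on a toy itinerary (country codes and dates invented):

```python
import pandas as pd

countries = pd.Series(["NZ", "NZ", "AU", "AU", "NZ"])
dates = pd.Series(
    ["2023-01-01", "2023-01-05", "2023-02-01", "2023-02-10", "2023-03-01"]
)

# keep the first row plus every row whose country differs from the previous one
changed = countries != countries.shift()
visits = pd.DataFrame(
    {"country": countries[changed], "entry_date": dates[changed]}
).reset_index(drop=True)

# each stay ends when the next one begins; the final exit date is unknown (NaN)
visits["exit_date"] = visits["entry_date"].shift(-1)
print(visits)
```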
# save csv
country_visits.to_csv("country_visits.csv", index=False)
# alternate output - only the most recent visit to each country
most_recent_visits = country_visits.drop_duplicates('country', keep='last')
most_recent_visits.to_csv("most_recent_visits.csv", index=False)