Last active
September 19, 2023 12:00
-
-
Save HTenkanen/3b214be899f0d3885bad48577de48150 to your computer and use it in GitHub Desktop.
Proof of concept: Geopandas to PostGIS. See https://gist.github.com/HTenkanen/3b214be899f0d3885bad48577de48150#gistcomment-3187471.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import geopandas as gpd | |
from sqlalchemy import create_engine | |
from geoalchemy2 import Geometry | |
from shapely.geometry import MultiLineString, MultiPoint, MultiPolygon | |
from shapely.wkb import dumps | |
import io | |
from pyproj import CRS | |
import csv | |
import time | |
import pygeos | |
def timeit(method): | |
def timed(*args, **kw): | |
ts = time.time() | |
result = method(*args, **kw) | |
te = time.time() | |
if 'log_time' in kw: | |
name = kw.get('log_name', method.__name__.upper()) | |
kw['log_time'][name] = round((te - ts) / 60, 2) | |
else: | |
print('%r %.2f seconds' % \ | |
(method.__name__, (te - ts) )) | |
return result | |
return timed | |
@timeit | |
def get_geometry_type(gdf): | |
"""Get basic geometry type of a GeoDataFrame, and information if the gdf contains Geometry Collections.""" | |
geom_types = list(gdf.geometry.geom_type.unique()) | |
geom_collection = False | |
# Get the basic geometry type | |
basic_types = [] | |
for gt in geom_types: | |
if 'Multi' in gt: | |
geom_collection = True | |
basic_types.append(gt.replace('Multi', '')) | |
else: | |
basic_types.append(gt) | |
geom_types = list(set(basic_types)) | |
# Check for mixed geometry types | |
assert len(geom_types) < 2, "GeoDataFrame contains mixed geometry types, cannot proceed with mixed geometries." | |
geom_type = geom_types[0] | |
return (geom_type, geom_collection) | |
@timeit | |
def get_srid_from_crs(gdf): | |
""" | |
Get EPSG code from CRS if available. If not, return -1. | |
""" | |
if gdf.crs is not None: | |
try: | |
if isinstance(gdf.crs, dict): | |
# If CRS is in typical geopandas format take only the value to avoid pyproj Future warning | |
if 'init' in gdf.crs.keys(): | |
srid = CRS(gdf.crs['init']).to_epsg(min_confidence=25) | |
else: | |
srid = CRS(gdf.crs).to_epsg(min_confidence=25) | |
else: | |
srid = CRS(gdf).to_epsg(min_confidence=25) | |
if srid is None: | |
srid = -1 | |
except: | |
srid = -1 | |
if srid == -1: | |
print("Warning: Could not parse coordinate reference system from GeoDataFrame. Inserting data without defined CRS.") | |
return srid | |
@timeit | |
def convert_to_wkb(gdf, geom_name): | |
# Convert geometries to wkb | |
# With pygeos | |
gdf[geom_name] = pygeos.to_wkb(pygeos.from_shapely(gdf[geom_name].to_list()), hex=True) | |
# With Shapely | |
# gdf[geom_name] = gdf[geom_name].apply(lambda x: dumps(x, hex=True)) | |
return gdf | |
@timeit | |
def write_to_db(gdf, engine, index, tbl, table, schema, srid, geom_name): | |
# Convert columns to lists and make a generator | |
args = [list(gdf[i]) for i in gdf.columns] | |
if index: | |
args.insert(0,list(gdf.index)) | |
data_iter = zip(*args) | |
# get list of columns using pandas | |
keys = tbl.insert_data()[0] | |
columns = ', '.join('"{}"'.format(k) for k in list(keys)) | |
# borrowed from https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#insertion-method | |
s_buf = io.StringIO() | |
writer = csv.writer(s_buf) | |
writer.writerows(data_iter) | |
s_buf.seek(0) | |
conn = engine.raw_connection() | |
cur = conn.cursor() | |
sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format( | |
table, columns) | |
try: | |
cur.copy_expert(sql=sql, file=s_buf) | |
cur.execute("SELECT UpdateGeometrySRID('{table}', '{geometry}', {srid})".format( | |
schema=schema, table=table, geometry=geom_name, srid=srid)) | |
conn.commit() | |
except Exception as e: | |
conn.connection.rollback() | |
conn.close() | |
raise(e) | |
conn.close() | |
@timeit | |
def copy_to_postgis(gdf, engine, table, if_exists='fail', | |
schema=None, dtype=None, index=False, | |
): | |
""" | |
Fast upload of GeoDataFrame into PostGIS database using COPY. | |
Parameters | |
---------- | |
gdf : GeoDataFrame | |
GeoDataFrame containing the data for upload. | |
engine : SQLAclchemy engine. | |
Connection. | |
if_exists : str | |
What to do if table exists already: 'replace' | 'append' | 'fail'. | |
schema : db-schema | |
Database schema where the data will be uploaded (optional). | |
dtype : dict of column name to SQL type, default None | |
Optional specifying the datatype for columns. The SQL type should be a | |
SQLAlchemy type, or a string for sqlite3 fallback connection. | |
index : bool | |
Store DataFrame index to the database as well. | |
""" | |
gdf = gdf.copy() | |
geom_name = gdf.geometry.name | |
if schema is not None: | |
schema_name = schema | |
else: | |
schema_name = 'public' | |
# Get srid | |
srid = get_srid_from_crs(gdf) | |
# Check geometry types | |
geometry_type, contains_multi_geoms = get_geometry_type(gdf) | |
# Build target geometry type | |
if contains_multi_geoms: | |
target_geom_type = "Multi{geom_type}".format(geom_type=geometry_type) | |
else: | |
target_geom_type = geometry_type | |
# Build dtype with Geometry (srid is updated afterwards) | |
if dtype is not None: | |
dtype[geom_name] = Geometry(geometry_type=target_geom_type) | |
else: | |
dtype = {geom_name: Geometry(geometry_type=target_geom_type)} | |
# Get Pandas SQLTable object (ignore 'geometry') | |
# If dtypes is used, update table schema accordingly. | |
pandas_sql = pd.io.sql.SQLDatabase(engine) | |
tbl = pd.io.sql.SQLTable(name=table, pandas_sql_engine=pandas_sql, | |
frame=gdf, dtype=dtype, index=index) | |
# Check if table exists | |
if tbl.exists(): | |
# If it exists, check if should overwrite | |
if if_exists == 'replace': | |
pandas_sql.drop_table(table) | |
tbl.create() | |
elif if_exists == 'fail': | |
raise Exception("Table '{table}' exists in the database.".format(table=table)) | |
elif if_exists == 'append': | |
pass | |
else: | |
tbl.create() | |
# Ensure all geometries all Geometry collections if there were MultiGeometries in the table | |
if contains_multi_geoms: | |
mask = gdf[geom_name].geom_type==geometry_type | |
if geometry_type == 'Point': | |
gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(lambda geom: MultiPoint([geom])) | |
elif geometry_type == 'LineString': | |
gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(lambda geom: MultiLineString([geom])) | |
elif geometry_type == 'Polygon': | |
gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(lambda geom: MultiPolygon([geom])) | |
# Convert geometries to WKB | |
gdf = convert_to_wkb(gdf, geom_name) | |
# Write to database | |
write_to_db(gdf, engine, index, tbl, table, schema, srid, geom_name) | |
return | |
# ===================== | |
# TEST | |
# ===================== | |
data = gpd.read_file("https://gist.githubusercontent.com/HTenkanen/456ec4611a943955823a65729c9cf2aa/raw/be56f5e1e5c06c33cd51e89f823a7d770d8769b5/ykr_basegrid.geojson") | |
engine = create_engine("postgresql+psycopg2://myuser:mypwd@localhost:5432/mydb") | |
# Run with %timeit to get a proper time-profile | |
copy_to_postgis(data, engine, table='ykr_test', if_exists='replace', schema=None, dtype=None, index=True) |
@CibelesR Sorry for not answering to this before. Now 3D geometries are supported as well. What comes to your comment about mismatching indices, we have witnessed the same behavior and are currently looking into this. For some reason, the records seem to be inserted into the PostGIS table sometimes in a different order than they are in the original GeoDataFrame. All the values match, but it can be a bit confusing for the user as you reported.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Dear @HTenkanen, I found something and I think it could be an issue. When I upload a geodataframe with gdf.to_postgis() the information and the geometry do not match. An example:
I have a column with 'order' information. This is a calculated column. If I save to a .shp (gdf.to_file('path*.shp')) and I visualized in a GIS software I have 'order' in the right geometry. But If I do load the layer from Postgres this information has changed.
I found this on Windows 10 x64
The code is:
Maybe it is easier to understand with an image. Black is the number I get in the GeoPandasDataframe and blue is the number I get after uploading it.