HTenkanen · September 19, 2023 12:00 · CibelesR · Mar 5, 2020 · HTenkanen · May 1, 2020
diff --git a/gpd_to_postgis.py b/gpd_to_postgis.py
 import pandas as pd
 import geopandas as gpd
 from sqlalchemy import create_engine
 from geoalchemy2 import Geometry
 from shapely.geometry import MultiLineString, MultiPoint, MultiPolygon
 from shapely.wkb import dumps
 import io
 from pyproj import CRS
 import csv
 import time
 import pygeos

 def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = round((te - ts) / 60, 2)
        else:
            print('%r  %.2f seconds' % \
                  (method.__name__, (te - ts) ))
        return result

    return timed

 @timeit
 def get_geometry_type(gdf):
    """Get basic geometry type of a GeoDataFrame, and information if the gdf contains Geometry Collections."""
    geom_types = list(gdf.geometry.geom_type.unique())
    geom_collection = False
    
    # Get the basic geometry type 
    basic_types = []
    for gt in geom_types:
        if 'Multi' in gt:
            geom_collection = True
            basic_types.append(gt.replace('Multi', ''))
        else:
            basic_types.append(gt)
    geom_types = list(set(basic_types))
    
    # Check for mixed geometry types
    assert len(geom_types) < 2, "GeoDataFrame contains mixed geometry types, cannot proceed with mixed geometries."
    geom_type = geom_types[0]
    return (geom_type, geom_collection)

 @timeit
 def get_srid_from_crs(gdf):
        """
        Get EPSG code from CRS if available. If not, return -1. 
        """
        if gdf.crs is not None:
            try:
                if isinstance(gdf.crs, dict):
                    # If CRS is in typical geopandas format take only the value to avoid pyproj Future warning
                    if 'init' in gdf.crs.keys():
                        srid = CRS(gdf.crs['init']).to_epsg(min_confidence=25)
                    else:
                        srid = CRS(gdf.crs).to_epsg(min_confidence=25)
                else:
                    srid = CRS(gdf).to_epsg(min_confidence=25)
                if srid is None:
                    srid = -1
            except:
                srid = -1
        if srid == -1:
            print("Warning: Could not parse coordinate reference system from GeoDataFrame. Inserting data without defined CRS.")
    
        return srid
    
 @timeit
 def convert_to_wkb(gdf, geom_name):
    # Convert geometries to wkb 
    # With pygeos
    gdf[geom_name] = pygeos.to_wkb(pygeos.from_shapely(gdf[geom_name].to_list()), hex=True)
    
    # With Shapely
 #    gdf[geom_name] = gdf[geom_name].apply(lambda x: dumps(x, hex=True))
    return gdf
    
 @timeit
 def write_to_db(gdf, engine, index, tbl, table, schema, srid, geom_name):
    # Convert columns to lists and make a generator
    args = [list(gdf[i]) for i in gdf.columns]
    if index:
        args.insert(0,list(gdf.index))
    
    data_iter = zip(*args)
    
    # get list of columns using pandas
    keys = tbl.insert_data()[0]
    columns = ', '.join('"{}"'.format(k) for k in list(keys))
    # borrowed from https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#insertion-method
    s_buf = io.StringIO()
    writer = csv.writer(s_buf)
    writer.writerows(data_iter)
    s_buf.seek(0)
    
    conn = engine.raw_connection()
    cur = conn.cursor()
    
    sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
        table, columns)
    
    try:
        cur.copy_expert(sql=sql, file=s_buf)
        cur.execute("SELECT UpdateGeometrySRID('{table}', '{geometry}', {srid})".format(
                schema=schema, table=table, geometry=geom_name, srid=srid))
        conn.commit()
    except Exception as e:
        conn.connection.rollback()
        conn.close()
        raise(e)
    conn.close()

    
 @timeit
 def copy_to_postgis(gdf, engine, table, if_exists='fail',  
                    schema=None, dtype=None, index=False,
                    ):
    """
    Fast upload of GeoDataFrame into PostGIS database using COPY. 
    
    Parameters
    ----------
    
    gdf : GeoDataFrame
        GeoDataFrame containing the data for upload.
    engine : SQLAclchemy engine.
        Connection.
    if_exists : str
        What to do if table exists already: 'replace' | 'append' | 'fail'.
    schema : db-schema
        Database schema where the data will be uploaded (optional).
    dtype : dict of column name to SQL type, default None
        Optional specifying the datatype for columns. The SQL type should be a 
        SQLAlchemy type, or a string for sqlite3 fallback connection.
    index : bool
        Store DataFrame index to the database as well.
    """
    gdf = gdf.copy()
    geom_name = gdf.geometry.name
    if schema is not None:
        schema_name = schema
    else:
        schema_name = 'public'
    
    # Get srid
    srid = get_srid_from_crs(gdf)
    
    # Check geometry types
    geometry_type, contains_multi_geoms = get_geometry_type(gdf)
    
    # Build target geometry type
    if contains_multi_geoms:
        target_geom_type = "Multi{geom_type}".format(geom_type=geometry_type)
    else:
        target_geom_type = geometry_type
    
    # Build dtype with Geometry (srid is updated afterwards)
    if dtype is not None:
        dtype[geom_name] = Geometry(geometry_type=target_geom_type)
    else:
        dtype = {geom_name: Geometry(geometry_type=target_geom_type)}
        
    # Get Pandas SQLTable object (ignore 'geometry')
    # If dtypes is used, update table schema accordingly.
    pandas_sql = pd.io.sql.SQLDatabase(engine)
    tbl = pd.io.sql.SQLTable(name=table, pandas_sql_engine=pandas_sql, 
                             frame=gdf, dtype=dtype, index=index)
    
    # Check if table exists
    if tbl.exists():
        # If it exists, check if should overwrite    
        if if_exists == 'replace':
            pandas_sql.drop_table(table)
            tbl.create()
        elif if_exists == 'fail':
            raise Exception("Table '{table}' exists in the database.".format(table=table))
        elif if_exists == 'append':
            pass
    else:
        tbl.create()
    
    # Ensure all geometries all Geometry collections if there were MultiGeometries in the table
    if contains_multi_geoms:
        mask = gdf[geom_name].geom_type==geometry_type
        if geometry_type == 'Point':
            gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(lambda geom: MultiPoint([geom]))
        elif geometry_type == 'LineString':
            gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(lambda geom: MultiLineString([geom]))
        elif geometry_type == 'Polygon':
            gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(lambda geom: MultiPolygon([geom]))
            
    # Convert geometries to WKB
    gdf = convert_to_wkb(gdf, geom_name)
    
    # Write to database
    write_to_db(gdf, engine, index, tbl, table, schema, srid, geom_name)
    
    return

 # =====================
 # TEST
 # =====================
    
 data = gpd.read_file("https://gist.githubusercontent.com/HTenkanen/456ec4611a943955823a65729c9cf2aa/raw/be56f5e1e5c06c33cd51e89f823a7d770d8769b5/ykr_basegrid.geojson")
 engine = create_engine("postgresql+psycopg2://myuser:mypwd@localhost:5432/mydb")

 # Run with %timeit to get a proper time-profile
 copy_to_postgis(data, engine, table='ykr_test', if_exists='replace',  schema=None, dtype=None, index=True)
	import pandas as pd
	import geopandas as gpd
	from sqlalchemy import create_engine
	from geoalchemy2 import Geometry
	from shapely.geometry import MultiLineString, MultiPoint, MultiPolygon
	from shapely.wkb import dumps
	import io
	from pyproj import CRS
	import csv
	import time
	import pygeos

	def timeit(method):
	def timed(args, *kw):
	ts = time.time()
	result = method(args, *kw)
	te = time.time()
	if 'log_time' in kw:
	name = kw.get('log_name', method.__name__.upper())
	kw['log_time'][name] = round((te - ts) / 60, 2)
	else:
	print('%r %.2f seconds' % \
	(method.__name__, (te - ts) ))
	return result

	return timed

	@timeit
	def get_geometry_type(gdf):
	"""Get basic geometry type of a GeoDataFrame, and information if the gdf contains Geometry Collections."""
	geom_types = list(gdf.geometry.geom_type.unique())
	geom_collection = False

	# Get the basic geometry type
	basic_types = []
	for gt in geom_types:
	if 'Multi' in gt:
	geom_collection = True
	basic_types.append(gt.replace('Multi', ''))
	else:
	basic_types.append(gt)
	geom_types = list(set(basic_types))

	# Check for mixed geometry types
	assert len(geom_types) < 2, "GeoDataFrame contains mixed geometry types, cannot proceed with mixed geometries."
	geom_type = geom_types[0]
	return (geom_type, geom_collection)

	@timeit
	def get_srid_from_crs(gdf):
	"""
	Get EPSG code from CRS if available. If not, return -1.
	"""
	if gdf.crs is not None:
	try:
	if isinstance(gdf.crs, dict):
	# If CRS is in typical geopandas format take only the value to avoid pyproj Future warning
	if 'init' in gdf.crs.keys():
	srid = CRS(gdf.crs['init']).to_epsg(min_confidence=25)
	else:
	srid = CRS(gdf.crs).to_epsg(min_confidence=25)
	else:
	srid = CRS(gdf).to_epsg(min_confidence=25)
	if srid is None:
	srid = -1
	except:
	srid = -1
	if srid == -1:
	print("Warning: Could not parse coordinate reference system from GeoDataFrame. Inserting data without defined CRS.")

	return srid

	@timeit
	def convert_to_wkb(gdf, geom_name):
	# Convert geometries to wkb
	# With pygeos
	gdf[geom_name] = pygeos.to_wkb(pygeos.from_shapely(gdf[geom_name].to_list()), hex=True)

	# With Shapely
	# gdf[geom_name] = gdf[geom_name].apply(lambda x: dumps(x, hex=True))
	return gdf

	@timeit
	def write_to_db(gdf, engine, index, tbl, table, schema, srid, geom_name):
	# Convert columns to lists and make a generator
	args = [list(gdf[i]) for i in gdf.columns]
	if index:
	args.insert(0,list(gdf.index))

	data_iter = zip(*args)

	# get list of columns using pandas
	keys = tbl.insert_data()[0]
	columns = ', '.join('"{}"'.format(k) for k in list(keys))
	# borrowed from https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#insertion-method
	s_buf = io.StringIO()
	writer = csv.writer(s_buf)
	writer.writerows(data_iter)
	s_buf.seek(0)

	conn = engine.raw_connection()
	cur = conn.cursor()

	sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
	table, columns)

	try:
	cur.copy_expert(sql=sql, file=s_buf)
	cur.execute("SELECT UpdateGeometrySRID('{table}', '{geometry}', {srid})".format(
	schema=schema, table=table, geometry=geom_name, srid=srid))
	conn.commit()
	except Exception as e:
	conn.connection.rollback()
	conn.close()
	raise(e)
	conn.close()


	@timeit
	def copy_to_postgis(gdf, engine, table, if_exists='fail',
	schema=None, dtype=None, index=False,
	):
	"""
	Fast upload of GeoDataFrame into PostGIS database using COPY.

	Parameters
	----------

	gdf : GeoDataFrame
	GeoDataFrame containing the data for upload.
	engine : SQLAclchemy engine.
	Connection.
	if_exists : str
	What to do if table exists already: 'replace' \| 'append' \| 'fail'.
	schema : db-schema
	Database schema where the data will be uploaded (optional).
	dtype : dict of column name to SQL type, default None
	Optional specifying the datatype for columns. The SQL type should be a
	SQLAlchemy type, or a string for sqlite3 fallback connection.
	index : bool
	Store DataFrame index to the database as well.
	"""
	gdf = gdf.copy()
	geom_name = gdf.geometry.name
	if schema is not None:
	schema_name = schema
	else:
	schema_name = 'public'

	# Get srid
	srid = get_srid_from_crs(gdf)

	# Check geometry types
	geometry_type, contains_multi_geoms = get_geometry_type(gdf)

	# Build target geometry type
	if contains_multi_geoms:
	target_geom_type = "Multi{geom_type}".format(geom_type=geometry_type)
	else:
	target_geom_type = geometry_type

	# Build dtype with Geometry (srid is updated afterwards)
	if dtype is not None:
	dtype[geom_name] = Geometry(geometry_type=target_geom_type)
	else:
	dtype = {geom_name: Geometry(geometry_type=target_geom_type)}

	# Get Pandas SQLTable object (ignore 'geometry')
	# If dtypes is used, update table schema accordingly.
	pandas_sql = pd.io.sql.SQLDatabase(engine)
	tbl = pd.io.sql.SQLTable(name=table, pandas_sql_engine=pandas_sql,
	frame=gdf, dtype=dtype, index=index)

	# Check if table exists
	if tbl.exists():
	# If it exists, check if should overwrite
	if if_exists == 'replace':
	pandas_sql.drop_table(table)
	tbl.create()
	elif if_exists == 'fail':
	raise Exception("Table '{table}' exists in the database.".format(table=table))
	elif if_exists == 'append':
	pass
	else:
	tbl.create()

	# Ensure all geometries all Geometry collections if there were MultiGeometries in the table
	if contains_multi_geoms:
	mask = gdf[geom_name].geom_type==geometry_type
	if geometry_type == 'Point':
	gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(lambda geom: MultiPoint([geom]))
	elif geometry_type == 'LineString':
	gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(lambda geom: MultiLineString([geom]))
	elif geometry_type == 'Polygon':
	gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(lambda geom: MultiPolygon([geom]))

	# Convert geometries to WKB
	gdf = convert_to_wkb(gdf, geom_name)

	# Write to database
	write_to_db(gdf, engine, index, tbl, table, schema, srid, geom_name)

	return

	# =====================
	# TEST
	# =====================

	data = gpd.read_file("https://gist.githubusercontent.com/HTenkanen/456ec4611a943955823a65729c9cf2aa/raw/be56f5e1e5c06c33cd51e89f823a7d770d8769b5/ykr_basegrid.geojson")
	engine = create_engine("postgresql+psycopg2://myuser:mypwd@localhost:5432/mydb")

	# Run with %timeit to get a proper time-profile
	copy_to_postgis(data, engine, table='ykr_test', if_exists='replace', schema=None, dtype=None, index=True)