answerquest · October 10, 2022 06:22
diff --git a/_SOI Toponyms import b/_SOI Toponyms import
 SOI Toponyms data import to postgresql DB, csv making
diff --git a/dbschema.sql b/dbschema.sql
 DROP TABLE IF EXISTS soi_toponyms;
 CREATE TABLE soi_toponyms (
 	sr SERIAL PRIMARY KEY,
 	fid VARCHAR(100) NULL,
 	objectid VARCHAR(100) NULL,
 	objectid_1 VARCHAR(100) NULL,
 	feature VARCHAR(100) NULL,
 	texta VARCHAR(100) NULL,
 	gmrotation DECIMAL(9,3) NULL,
 	color SMALLINT NULL,
 	stylea SMALLINT NULL,
 	weight SMALLINT NULL,
 	font SMALLINT NULL,
 	unique_id VARCHAR(100) NULL,
 	svy_date VARCHAR(100) NULL,
 	orig_fid INT NULL,
 	fid_2 INT NULL,
 	objectid_12 INT NULL,
 	everest_sh VARCHAR(100) NULL,
 	osm_sheet_ VARCHAR(100) NULL,
 	updated VARCHAR(100) NULL,
 	restricted VARCHAR(100) NULL,
 	responsibi VARCHAR(100) NULL,
 	shape_leng DECIMAL(15,3) NULL,
 	fid_12 INT NULL,
 	feature_1 VARCHAR(100) NULL,
 	text_1 VARCHAR(100) NULL,
 	hindi VARCHAR(100) NULL,
 	bengali VARCHAR(100) NULL,
 	gujarati VARCHAR(100) NULL,
 	kannada VARCHAR(100) NULL,
 	malayalam VARCHAR(100) NULL,
 	marathi VARCHAR(100) NULL,
 	punjabi VARCHAR(100) NULL,
 	tamil VARCHAR(100) NULL,
 	telugu VARCHAR(100) NULL,
 	roman VARCHAR(100) NULL,
 	assamese VARCHAR(100) NULL,
 	objectid_2 VARCHAR(100) NULL,
 	bodo VARCHAR(100) NULL,
 	dogri VARCHAR(100) NULL,
 	konkani VARCHAR(100) NULL,
 	kashmiri VARCHAR(100) NULL,
 	maithili VARCHAR(100) NULL,
 	manipuri VARCHAR(100) NULL,
 	nepali VARCHAR(100) NULL,
 	oriya VARCHAR(100) NULL,
 	sindhi VARCHAR(100) NULL,
 	sanskrit VARCHAR(100) NULL,
 	santhali VARCHAR(100) NULL,
 	urdu VARCHAR(100) NULL,
 	geometry GEOMETRY(POINT,4326) NULL
 );
 CREATE INDEX soi_toponyms_geom_1 ON soi_toponyms USING GIST (geometry);
diff --git a/log.txt b/log.txt
 2022-10-09 14:57:00 : soi_toponyms table truncated and serial num reset
 2022-10-09 14:57:00 : Removing pre-existing toponyms/soi_toponyms.csv
 2022-10-09 14:57:01 : Batch size: 100000
 2022-10-09 14:58:06 : 1 batches sent
 2022-10-09 14:58:17 : Annoyingly huge field, zapping all spaces: text = 'D    O    D    D    A    P    P    A    N    Ā    Y    A    K    K    A    N    Ū    R  S    O    U    T    H  R   F'
 2022-10-09 14:58:17 : Annoyingly huge field, zapping all spaces: text_1 = 'D    O    D    D    A    P    P    A    N    Ā    Y    A    K    K    A    N    Ū    R  S    O    U    T    H  R   F'
 2022-10-09 14:58:17 : Annoyingly huge field, zapping all spaces: roman = 'D    O    D    D    A    P    P    A    N    A    Y    A    K    K    A    N    U    R  S    O    U    T    H  R   F'
 2022-10-09 14:58:18 : Annoyingly huge field, zapping all spaces: text = 'S    I    R    U    M    A    L    A    I    R    E    S    E    R    V    E    D    F    O    R    E    S    T'
 2022-10-09 14:58:18 : Annoyingly huge field, zapping all spaces: text_1 = 'S    I    R    U    M    A    L    A    I    R    E    S    E    R    V    E    D    F    O    R    E    S    T'
 2022-10-09 14:58:18 : Annoyingly huge field, zapping all spaces: roman = 'S    I    R    U    M    A    L    A    I    R    E    S    E    R    V    E    D    F    O    R    E    S    T'
 2022-10-09 14:58:21 : Annoyingly huge field, zapping all spaces: text = 'H   U   L   I   K   A   L   D   U   R   G   A   M   R   E   S   E   R   V   E   D   F   O   R   E   S   T'
 2022-10-09 14:58:21 : Annoyingly huge field, zapping all spaces: text_1 = 'H   U   L   I   K   A   L   D   U   R   G   A   M   R   E   S   E   R   V   E   D   F   O   R   E   S   T'
 2022-10-09 14:58:21 : Annoyingly huge field, zapping all spaces: roman = 'H   U   L   I   K   A   L   D   U   R   G   A   M   R   E   S   E   R   V   E   D   F   O   R   E   S   T'
 2022-10-09 14:58:24 : Annoyingly huge field, zapping all spaces: text = 'C    H    I    L    A    N    D    V    Ā    D    I   S    T    A    T    E   F    O    R    E    S    T'
 2022-10-09 14:58:24 : Annoyingly huge field, zapping all spaces: text_1 = 'C    H    I    L    A    N    D    V    Ā    D    I   S    T    A    T    E   F    O    R    E    S    T'
 2022-10-09 14:58:24 : Annoyingly huge field, zapping all spaces: roman = 'C    H    I    L    A    N    D    V    A    D    I   S    T    A    T    E   F    O    R    E    S    T'
 2022-10-09 14:59:08 : 2 batches sent
 2022-10-09 14:59:12 : Annoyingly huge field, zapping all spaces: text = 'K   A    R    V    E    T    N    A    G    A    R    P    R    O    T    E    C    T    E    D    F    O    R    E    S    T'
 2022-10-09 14:59:12 : Annoyingly huge field, zapping all spaces: text_1 = 'K   A    R    V    E    T    N    A    G    A    R    P    R    O    T    E    C    T    E    D    F    O    R    E    S    T'
 2022-10-09 14:59:12 : Annoyingly huge field, zapping all spaces: roman = 'K   A    R    V    E    T    N    A    G    A    R    P    R    O    T    E    C    T    E    D    F    O    R    E    S    T'
 2022-10-09 15:00:08 : 3 batches sent
 2022-10-09 15:01:22 : 4 batches sent
 2022-10-09 15:02:23 : 5 batches sent
 2022-10-09 15:02:31 : Annoyingly huge field, zapping all spaces: text = 'I  D  U  P  U  L  A  P   Ā  Y  A  &  R  E  D  L  A  C  H  E  R  U   V  U    R  E  S  E  R  V  E  D F  O  R  E  S  T'
 2022-10-09 15:02:31 : Annoyingly huge field, zapping all spaces: text_1 = 'I  D  U  P  U  L  A  P   Ā  Y  A  &  R  E  D  L  A  C  H  E  R  U   V  U    R  E  S  E  R  V  E  D F  O  R  E  S  T'
 2022-10-09 15:02:31 : Annoyingly huge field, zapping all spaces: roman = 'I  D  U  P  U  L  A  P   A  Y  A  &  R  E  D  L  A  C  H  E  R  U   V  U    R  E  S  E  R  V  E  D F  O  R  E  S  T'
 2022-10-09 15:02:32 : Annoyingly huge field, zapping all spaces: text = 'P    Ā    L    K   O    N D    A & V   A   N   G   A   M   A   L   L   A   R   E   S   E   R   V   E   D   F   O   R   E   S   T'
 2022-10-09 15:02:32 : Annoyingly huge field, zapping all spaces: text_1 = 'P    Ā    L    K   O    N D    A & V   A   N   G   A   M   A   L   L   A   R   E   S   E   R   V   E   D   F   O   R   E   S   T'
 2022-10-09 15:02:32 : Annoyingly huge field, zapping all spaces: roman = 'P    A    L    K   O    N D    A & V   A   N   G   A   M   A   L   L   A   R   E   S   E   R   V   E   D   F   O   R   E   S   T'
 2022-10-09 15:03:30 : 6 batches sent
 2022-10-09 15:04:33 : 7 batches sent
 2022-10-09 15:05:35 : 8 batches sent
 2022-10-09 15:05:40 : Annoyingly huge field, zapping all spaces: text = 'R    Ā    M    N    A    G    A    R  P    R    O    T    E    C    T    E    D  F    O    R    E    S    T'
 2022-10-09 15:05:40 : Annoyingly huge field, zapping all spaces: text_1 = 'R    Ā    M    N    A    G    A    R  P    R    O    T    E    C    T    E    D  F    O    R    E    S    T'
 2022-10-09 15:05:40 : Annoyingly huge field, zapping all spaces: roman = 'R    A    M    N    A    G    A    R  P    R    O    T    E    C    T    E    D  F    O    R    E    S    T'
 2022-10-09 15:06:36 : 9 batches sent
 2022-10-09 15:07:36 : 10 batches sent
 2022-10-09 15:08:38 : 11 batches sent
 2022-10-09 15:09:40 : 12 batches sent
 2022-10-09 15:09:54 : Annoyingly huge field, zapping all spaces: text = 'K   O   T   T   U   R   U B   A   T   T   i   L   i R   E   S   E   R   V   E   D F   O   R   E   S   T'
 2022-10-09 15:09:54 : Annoyingly huge field, zapping all spaces: roman = 'K   O   T   T   U   R   U B   A   T   T   i   L   i R   E   S   E   R   V   E   D F   O   R   E   S   T'
 2022-10-09 15:10:33 : 13 batches sent
 2022-10-09 15:10:33 : Toponyms import completed.
diff --git a/soi_toponyms_import.py b/soi_toponyms_import.py
 # soi_toponyms_import.py
 # 2022-10-09 by Nikhil VJ, https://nikhilvj.co.in

 # source data: .geojonl from https://www.kaggle.com/datasets/planemad/soi-india-map-toponyms/versions/3
 # Importing to a Postgresql DB AND outputting to a CSV (v2), with bad chars/lines fixed

 # to do: read in the geojsonl,
 # do find-replace of chars, 
 # ensure there are no \ slashes in each field
 # export to CSV AND to DB

 dataFile = 'toponyms/soi-osm-toponyms.geojsonl.json'
 batchSize = 100000
 skip2Batch = 0
 rename_cols = {"OBJECTID": "objectid", "text":"texta", "style":"stylea"}
 columns_order = ['latitude','longitude'] # initiate base list of cols
 csvFile = 'toponyms/soi_toponyms.csv'

 from sqlalchemy import create_engine
 import pandas as pd
 import geopandas as gpd
 import datetime, os, json, io, sys
 from itertools import islice # https://stackoverflow.com/a/6335876/4355695


 def logmessage( *content ):
    timeOffset = 5.5
    timestamp = '{:%Y-%m-%d %H:%M:%S} :'.format(datetime.datetime.utcnow() + datetime.timedelta(hours=timeOffset)) # from https://stackoverflow.com/a/26455617/4355695
    line = ' '.join(str(x) for x in list(content)) # from https://stackoverflow.com/a/3590168/4355695
    print(timestamp, line) # print to screen also

    logFilename = 'log.txt'
    
    with open(logFilename, 'a') as f:
        print(timestamp, line, file=f) # file=f argument at end writes to file. from https://stackoverflow.com/a/2918367/4355695


 def makegpd(x,lat='latitude',lon='longitude'):
    gdf = gpd.GeoDataFrame(x, geometry=gpd.points_from_xy(x[lon],x[lat]), crs="EPSG:4326")
    gdf.drop(columns=[lat,lon], inplace=True)
    return gdf



 #########


 creds = json.load(open('local_dbcreds.json','r'))
 engine = create_engine(f"postgresql://{creds['DB_USER']}:{creds['DB_PW']}@{creds['DB_SERVER']}:{creds['DB_PORT']}/{creds['DB_DBNAME']}")

 ########

 d1 = "TRUNCATE TABLE soi_toponyms RESTART IDENTITY"
 c = engine.connect()
 res = c.execute(d1)
 c.close()
 logmessage("soi_toponyms table truncated and serial num reset")

 if os.path.isfile(csvFile):
    logmessage(f"Removing pre-existing {csvFile}")
    os.remove(csvFile)

 ########

 logmessage(f"Batch size: {batchSize}")
 batchCounter = 0
 # Reading batches of lines, from https://stackoverflow.com/a/6335876/4355695 
 with open(dataFile,'r') as f:
    while True:
        next_n_lines = list(islice(f, batchSize))
        if not next_n_lines:
            break
        
        if (skip2Batch-1) > batchCounter:
            batchCounter += 1
            logmessage(f"skipping batch {batchCounter}")
            continue

        collector = []
        for g in next_n_lines:

            try:
                h = json.loads(g)
            except:
                print(g)
                raise
            props = h.get('properties',{})
            row = {}
            for key in props.keys():
                value = props[key]
                
                # do char replacement
                if isinstance(value,str):
                    if len(value) > 0:
                        value2 = value.replace('\n','īn').replace('\r','īr').replace('>','Ā').replace('<','ā').replace('|','Ī').replace('@','Ū').replace('#','ū').replace('\\','ī').replace(' '*5,' ').strip()
                        # also zapping silly spaces

                        # nuclear option for annoying spaces
                        if len(value2) > 100:
                            logmessage(f"Annoyingly huge field, zapping all spaces: {key} = '{value2}'")
                            value2 = value2.replace(' ','')
                            if len(value2) > 100:
                                value2 = value2[:100]
                                logmessage(f"Still huge after zapping spaces also, truncating it. Final: {key} = '{value2}'")
                        # if value2 != value:
                        #     print(f"value: {value} changed to: {value2}")
                    else:
                        value2 = ''
                else:
                    value2 = value

                if key in rename_cols.keys():
                    row[rename_cols[key]] = value2
                else:
                    row[key] = value2

            ll = h.get('geometry',{}).get('coordinates',[])
            row['longitude'] = ll[0]
            row['latitude'] = ll[1]
            collector.append(row)
        
        df1 = pd.DataFrame(collector)

        # data adjustment:
        def rounding1(x):
            try:
                x = float(x)
                return round(x,3)
            except:
                return x

        df1['gmrotation'] = df1['gmrotation'].apply(rounding1)
        df1['shape_leng'] = df1['shape_leng'].apply(rounding1)
        
        # export to csv
        if not os.path.isfile(csvFile):
            # logmessage("writing new file")
            df1.to_csv(csvFile, index=False)
            columns_order = df1.columns.tolist()

        else:
            # logmessage("Appending to file")
            df1[columns_order].to_csv(csvFile, index=False, header=False, mode='a')

        gdf1 = makegpd(df1)
        try:
            gdf1.to_postgis('soi_toponyms', engine, if_exists='append', index=False)
        except Exception as e:
            print(f"Errored out at batch {batchCounter-1}, saving to faulty.csv raw text to faulty.txt")
            df1.to_csv('toponyms/faulty.csv',index=False)
            with open('toponyms/faulty.txt','w') as f:
                f.write(''.join(next_n_lines))
            print('\n\n')
            logmessage(e)
            sys.exit()
        
        batchCounter += 1
        logmessage(f"{batchCounter} batches sent")
        

 logmessage(f"Toponyms import completed.")
	DROP TABLE IF EXISTS soi_toponyms;
	CREATE TABLE soi_toponyms (
	sr SERIAL PRIMARY KEY,
	fid VARCHAR(100) NULL,
	objectid VARCHAR(100) NULL,
	objectid_1 VARCHAR(100) NULL,
	feature VARCHAR(100) NULL,
	texta VARCHAR(100) NULL,
	gmrotation DECIMAL(9,3) NULL,
	color SMALLINT NULL,
	stylea SMALLINT NULL,
	weight SMALLINT NULL,
	font SMALLINT NULL,
	unique_id VARCHAR(100) NULL,
	svy_date VARCHAR(100) NULL,
	orig_fid INT NULL,
	fid_2 INT NULL,
	objectid_12 INT NULL,
	everest_sh VARCHAR(100) NULL,
	osm_sheet_ VARCHAR(100) NULL,
	updated VARCHAR(100) NULL,
	restricted VARCHAR(100) NULL,
	responsibi VARCHAR(100) NULL,
	shape_leng DECIMAL(15,3) NULL,
	fid_12 INT NULL,
	feature_1 VARCHAR(100) NULL,
	text_1 VARCHAR(100) NULL,
	hindi VARCHAR(100) NULL,
	bengali VARCHAR(100) NULL,
	gujarati VARCHAR(100) NULL,
	kannada VARCHAR(100) NULL,
	malayalam VARCHAR(100) NULL,
	marathi VARCHAR(100) NULL,
	punjabi VARCHAR(100) NULL,
	tamil VARCHAR(100) NULL,
	telugu VARCHAR(100) NULL,
	roman VARCHAR(100) NULL,
	assamese VARCHAR(100) NULL,
	objectid_2 VARCHAR(100) NULL,
	bodo VARCHAR(100) NULL,
	dogri VARCHAR(100) NULL,
	konkani VARCHAR(100) NULL,
	kashmiri VARCHAR(100) NULL,
	maithili VARCHAR(100) NULL,
	manipuri VARCHAR(100) NULL,
	nepali VARCHAR(100) NULL,
	oriya VARCHAR(100) NULL,
	sindhi VARCHAR(100) NULL,
	sanskrit VARCHAR(100) NULL,
	santhali VARCHAR(100) NULL,
	urdu VARCHAR(100) NULL,
	geometry GEOMETRY(POINT,4326) NULL
	);
	CREATE INDEX soi_toponyms_geom_1 ON soi_toponyms USING GIST (geometry);
	2022-10-09 14:57:00 : soi_toponyms table truncated and serial num reset
	2022-10-09 14:57:00 : Removing pre-existing toponyms/soi_toponyms.csv
	2022-10-09 14:57:01 : Batch size: 100000
	2022-10-09 14:58:06 : 1 batches sent
	2022-10-09 14:58:17 : Annoyingly huge field, zapping all spaces: text = 'D O D D A P P A N Ā Y A K K A N Ū R S O U T H R F'
	2022-10-09 14:58:17 : Annoyingly huge field, zapping all spaces: text_1 = 'D O D D A P P A N Ā Y A K K A N Ū R S O U T H R F'
	2022-10-09 14:58:17 : Annoyingly huge field, zapping all spaces: roman = 'D O D D A P P A N A Y A K K A N U R S O U T H R F'
	2022-10-09 14:58:18 : Annoyingly huge field, zapping all spaces: text = 'S I R U M A L A I R E S E R V E D F O R E S T'
	2022-10-09 14:58:18 : Annoyingly huge field, zapping all spaces: text_1 = 'S I R U M A L A I R E S E R V E D F O R E S T'
	2022-10-09 14:58:18 : Annoyingly huge field, zapping all spaces: roman = 'S I R U M A L A I R E S E R V E D F O R E S T'
	2022-10-09 14:58:21 : Annoyingly huge field, zapping all spaces: text = 'H U L I K A L D U R G A M R E S E R V E D F O R E S T'
	2022-10-09 14:58:21 : Annoyingly huge field, zapping all spaces: text_1 = 'H U L I K A L D U R G A M R E S E R V E D F O R E S T'
	2022-10-09 14:58:21 : Annoyingly huge field, zapping all spaces: roman = 'H U L I K A L D U R G A M R E S E R V E D F O R E S T'
	2022-10-09 14:58:24 : Annoyingly huge field, zapping all spaces: text = 'C H I L A N D V Ā D I S T A T E F O R E S T'
	2022-10-09 14:58:24 : Annoyingly huge field, zapping all spaces: text_1 = 'C H I L A N D V Ā D I S T A T E F O R E S T'
	2022-10-09 14:58:24 : Annoyingly huge field, zapping all spaces: roman = 'C H I L A N D V A D I S T A T E F O R E S T'
	2022-10-09 14:59:08 : 2 batches sent
	2022-10-09 14:59:12 : Annoyingly huge field, zapping all spaces: text = 'K A R V E T N A G A R P R O T E C T E D F O R E S T'
	2022-10-09 14:59:12 : Annoyingly huge field, zapping all spaces: text_1 = 'K A R V E T N A G A R P R O T E C T E D F O R E S T'
	2022-10-09 14:59:12 : Annoyingly huge field, zapping all spaces: roman = 'K A R V E T N A G A R P R O T E C T E D F O R E S T'
	2022-10-09 15:00:08 : 3 batches sent
	2022-10-09 15:01:22 : 4 batches sent
	2022-10-09 15:02:23 : 5 batches sent
	2022-10-09 15:02:31 : Annoyingly huge field, zapping all spaces: text = 'I D U P U L A P Ā Y A & R E D L A C H E R U V U R E S E R V E D F O R E S T'
	2022-10-09 15:02:31 : Annoyingly huge field, zapping all spaces: text_1 = 'I D U P U L A P Ā Y A & R E D L A C H E R U V U R E S E R V E D F O R E S T'
	2022-10-09 15:02:31 : Annoyingly huge field, zapping all spaces: roman = 'I D U P U L A P A Y A & R E D L A C H E R U V U R E S E R V E D F O R E S T'
	2022-10-09 15:02:32 : Annoyingly huge field, zapping all spaces: text = 'P Ā L K O N D A & V A N G A M A L L A R E S E R V E D F O R E S T'
	2022-10-09 15:02:32 : Annoyingly huge field, zapping all spaces: text_1 = 'P Ā L K O N D A & V A N G A M A L L A R E S E R V E D F O R E S T'
	2022-10-09 15:02:32 : Annoyingly huge field, zapping all spaces: roman = 'P A L K O N D A & V A N G A M A L L A R E S E R V E D F O R E S T'
	2022-10-09 15:03:30 : 6 batches sent
	2022-10-09 15:04:33 : 7 batches sent
	2022-10-09 15:05:35 : 8 batches sent
	2022-10-09 15:05:40 : Annoyingly huge field, zapping all spaces: text = 'R Ā M N A G A R P R O T E C T E D F O R E S T'
	2022-10-09 15:05:40 : Annoyingly huge field, zapping all spaces: text_1 = 'R Ā M N A G A R P R O T E C T E D F O R E S T'
	2022-10-09 15:05:40 : Annoyingly huge field, zapping all spaces: roman = 'R A M N A G A R P R O T E C T E D F O R E S T'
	2022-10-09 15:06:36 : 9 batches sent
	2022-10-09 15:07:36 : 10 batches sent
	2022-10-09 15:08:38 : 11 batches sent
	2022-10-09 15:09:40 : 12 batches sent
	2022-10-09 15:09:54 : Annoyingly huge field, zapping all spaces: text = 'K O T T U R U B A T T i L i R E S E R V E D F O R E S T'
	2022-10-09 15:09:54 : Annoyingly huge field, zapping all spaces: roman = 'K O T T U R U B A T T i L i R E S E R V E D F O R E S T'
	2022-10-09 15:10:33 : 13 batches sent
	2022-10-09 15:10:33 : Toponyms import completed.
	# soi_toponyms_import.py
	# 2022-10-09 by Nikhil VJ, https://nikhilvj.co.in

	# source data: .geojonl from https://www.kaggle.com/datasets/planemad/soi-india-map-toponyms/versions/3
	# Importing to a Postgresql DB AND outputting to a CSV (v2), with bad chars/lines fixed

	# to do: read in the geojsonl,
	# do find-replace of chars,
	# ensure there are no \ slashes in each field
	# export to CSV AND to DB

	dataFile = 'toponyms/soi-osm-toponyms.geojsonl.json'
	batchSize = 100000
	skip2Batch = 0
	rename_cols = {"OBJECTID": "objectid", "text":"texta", "style":"stylea"}
	columns_order = ['latitude','longitude'] # initiate base list of cols
	csvFile = 'toponyms/soi_toponyms.csv'

	from sqlalchemy import create_engine
	import pandas as pd
	import geopandas as gpd
	import datetime, os, json, io, sys
	from itertools import islice # https://stackoverflow.com/a/6335876/4355695


	def logmessage( *content ):
	timeOffset = 5.5
	timestamp = '{:%Y-%m-%d %H:%M:%S} :'.format(datetime.datetime.utcnow() + datetime.timedelta(hours=timeOffset)) # from https://stackoverflow.com/a/26455617/4355695
	line = ' '.join(str(x) for x in list(content)) # from https://stackoverflow.com/a/3590168/4355695
	print(timestamp, line) # print to screen also

	logFilename = 'log.txt'

	with open(logFilename, 'a') as f:
	print(timestamp, line, file=f) # file=f argument at end writes to file. from https://stackoverflow.com/a/2918367/4355695


	def makegpd(x,lat='latitude',lon='longitude'):
	gdf = gpd.GeoDataFrame(x, geometry=gpd.points_from_xy(x[lon],x[lat]), crs="EPSG:4326")
	gdf.drop(columns=[lat,lon], inplace=True)
	return gdf



	#########


	creds = json.load(open('local_dbcreds.json','r'))
	engine = create_engine(f"postgresql://{creds['DB_USER']}:{creds['DB_PW']}@{creds['DB_SERVER']}:{creds['DB_PORT']}/{creds['DB_DBNAME']}")

	########

	d1 = "TRUNCATE TABLE soi_toponyms RESTART IDENTITY"
	c = engine.connect()
	res = c.execute(d1)
	c.close()
	logmessage("soi_toponyms table truncated and serial num reset")

	if os.path.isfile(csvFile):
	logmessage(f"Removing pre-existing {csvFile}")
	os.remove(csvFile)

	########

	logmessage(f"Batch size: {batchSize}")
	batchCounter = 0
	# Reading batches of lines, from https://stackoverflow.com/a/6335876/4355695
	with open(dataFile,'r') as f:
	while True:
	next_n_lines = list(islice(f, batchSize))
	if not next_n_lines:
	break

	if (skip2Batch-1) > batchCounter:
	batchCounter += 1
	logmessage(f"skipping batch {batchCounter}")
	continue

	collector = []
	for g in next_n_lines:

	try:
	h = json.loads(g)
	except:
	print(g)
	raise
	props = h.get('properties',{})
	row = {}
	for key in props.keys():
	value = props[key]

	# do char replacement
	if isinstance(value,str):
	if len(value) > 0:
	value2 = value.replace('\n','īn').replace('\r','īr').replace('>','Ā').replace('<','ā').replace('\|','Ī').replace('@','Ū').replace('#','ū').replace('\\','ī').replace(' '*5,' ').strip()
	# also zapping silly spaces

	# nuclear option for annoying spaces
	if len(value2) > 100:
	logmessage(f"Annoyingly huge field, zapping all spaces: {key} = '{value2}'")
	value2 = value2.replace(' ','')
	if len(value2) > 100:
	value2 = value2[:100]
	logmessage(f"Still huge after zapping spaces also, truncating it. Final: {key} = '{value2}'")
	# if value2 != value:
	# print(f"value: {value} changed to: {value2}")
	else:
	value2 = ''
	else:
	value2 = value

	if key in rename_cols.keys():
	row[rename_cols[key]] = value2
	else:
	row[key] = value2

	ll = h.get('geometry',{}).get('coordinates',[])
	row['longitude'] = ll[0]
	row['latitude'] = ll[1]
	collector.append(row)

	df1 = pd.DataFrame(collector)

	# data adjustment:
	def rounding1(x):
	try:
	x = float(x)
	return round(x,3)
	except:
	return x

	df1['gmrotation'] = df1['gmrotation'].apply(rounding1)
	df1['shape_leng'] = df1['shape_leng'].apply(rounding1)

	# export to csv
	if not os.path.isfile(csvFile):
	# logmessage("writing new file")
	df1.to_csv(csvFile, index=False)
	columns_order = df1.columns.tolist()

	else:
	# logmessage("Appending to file")
	df1[columns_order].to_csv(csvFile, index=False, header=False, mode='a')

	gdf1 = makegpd(df1)
	try:
	gdf1.to_postgis('soi_toponyms', engine, if_exists='append', index=False)
	except Exception as e:
	print(f"Errored out at batch {batchCounter-1}, saving to faulty.csv raw text to faulty.txt")
	df1.to_csv('toponyms/faulty.csv',index=False)
	with open('toponyms/faulty.txt','w') as f:
	f.write(''.join(next_n_lines))
	print('\n\n')
	logmessage(e)
	sys.exit()

	batchCounter += 1
	logmessage(f"{batchCounter} batches sent")


	logmessage(f"Toponyms import completed.")