Last active
October 10, 2022 06:22
-
-
Save answerquest/04675b6ecc155e5dcf8a5024ec00b597 to your computer and use it in GitHub Desktop.
SOI Toponyms data import, csv making
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SOI Toponyms data import to postgresql DB, csv making |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
DROP TABLE IF EXISTS soi_toponyms; | |
CREATE TABLE soi_toponyms ( | |
sr SERIAL PRIMARY KEY, | |
fid VARCHAR(100) NULL, | |
objectid VARCHAR(100) NULL, | |
objectid_1 VARCHAR(100) NULL, | |
feature VARCHAR(100) NULL, | |
texta VARCHAR(100) NULL, | |
gmrotation DECIMAL(9,3) NULL, | |
color SMALLINT NULL, | |
stylea SMALLINT NULL, | |
weight SMALLINT NULL, | |
font SMALLINT NULL, | |
unique_id VARCHAR(100) NULL, | |
svy_date VARCHAR(100) NULL, | |
orig_fid INT NULL, | |
fid_2 INT NULL, | |
objectid_12 INT NULL, | |
everest_sh VARCHAR(100) NULL, | |
osm_sheet_ VARCHAR(100) NULL, | |
updated VARCHAR(100) NULL, | |
restricted VARCHAR(100) NULL, | |
responsibi VARCHAR(100) NULL, | |
shape_leng DECIMAL(15,3) NULL, | |
fid_12 INT NULL, | |
feature_1 VARCHAR(100) NULL, | |
text_1 VARCHAR(100) NULL, | |
hindi VARCHAR(100) NULL, | |
bengali VARCHAR(100) NULL, | |
gujarati VARCHAR(100) NULL, | |
kannada VARCHAR(100) NULL, | |
malayalam VARCHAR(100) NULL, | |
marathi VARCHAR(100) NULL, | |
punjabi VARCHAR(100) NULL, | |
tamil VARCHAR(100) NULL, | |
telugu VARCHAR(100) NULL, | |
roman VARCHAR(100) NULL, | |
assamese VARCHAR(100) NULL, | |
objectid_2 VARCHAR(100) NULL, | |
bodo VARCHAR(100) NULL, | |
dogri VARCHAR(100) NULL, | |
konkani VARCHAR(100) NULL, | |
kashmiri VARCHAR(100) NULL, | |
maithili VARCHAR(100) NULL, | |
manipuri VARCHAR(100) NULL, | |
nepali VARCHAR(100) NULL, | |
oriya VARCHAR(100) NULL, | |
sindhi VARCHAR(100) NULL, | |
sanskrit VARCHAR(100) NULL, | |
santhali VARCHAR(100) NULL, | |
urdu VARCHAR(100) NULL, | |
geometry GEOMETRY(POINT,4326) NULL | |
); | |
CREATE INDEX soi_toponyms_geom_1 ON soi_toponyms USING GIST (geometry); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2022-10-09 14:57:00 : soi_toponyms table truncated and serial num reset | |
2022-10-09 14:57:00 : Removing pre-existing toponyms/soi_toponyms.csv | |
2022-10-09 14:57:01 : Batch size: 100000 | |
2022-10-09 14:58:06 : 1 batches sent | |
2022-10-09 14:58:17 : Annoyingly huge field, zapping all spaces: text = 'D O D D A P P A N Ā Y A K K A N Ū R S O U T H R F' | |
2022-10-09 14:58:17 : Annoyingly huge field, zapping all spaces: text_1 = 'D O D D A P P A N Ā Y A K K A N Ū R S O U T H R F' | |
2022-10-09 14:58:17 : Annoyingly huge field, zapping all spaces: roman = 'D O D D A P P A N A Y A K K A N U R S O U T H R F' | |
2022-10-09 14:58:18 : Annoyingly huge field, zapping all spaces: text = 'S I R U M A L A I R E S E R V E D F O R E S T' | |
2022-10-09 14:58:18 : Annoyingly huge field, zapping all spaces: text_1 = 'S I R U M A L A I R E S E R V E D F O R E S T' | |
2022-10-09 14:58:18 : Annoyingly huge field, zapping all spaces: roman = 'S I R U M A L A I R E S E R V E D F O R E S T' | |
2022-10-09 14:58:21 : Annoyingly huge field, zapping all spaces: text = 'H U L I K A L D U R G A M R E S E R V E D F O R E S T' | |
2022-10-09 14:58:21 : Annoyingly huge field, zapping all spaces: text_1 = 'H U L I K A L D U R G A M R E S E R V E D F O R E S T' | |
2022-10-09 14:58:21 : Annoyingly huge field, zapping all spaces: roman = 'H U L I K A L D U R G A M R E S E R V E D F O R E S T' | |
2022-10-09 14:58:24 : Annoyingly huge field, zapping all spaces: text = 'C H I L A N D V Ā D I S T A T E F O R E S T' | |
2022-10-09 14:58:24 : Annoyingly huge field, zapping all spaces: text_1 = 'C H I L A N D V Ā D I S T A T E F O R E S T' | |
2022-10-09 14:58:24 : Annoyingly huge field, zapping all spaces: roman = 'C H I L A N D V A D I S T A T E F O R E S T' | |
2022-10-09 14:59:08 : 2 batches sent | |
2022-10-09 14:59:12 : Annoyingly huge field, zapping all spaces: text = 'K A R V E T N A G A R P R O T E C T E D F O R E S T' | |
2022-10-09 14:59:12 : Annoyingly huge field, zapping all spaces: text_1 = 'K A R V E T N A G A R P R O T E C T E D F O R E S T' | |
2022-10-09 14:59:12 : Annoyingly huge field, zapping all spaces: roman = 'K A R V E T N A G A R P R O T E C T E D F O R E S T' | |
2022-10-09 15:00:08 : 3 batches sent | |
2022-10-09 15:01:22 : 4 batches sent | |
2022-10-09 15:02:23 : 5 batches sent | |
2022-10-09 15:02:31 : Annoyingly huge field, zapping all spaces: text = 'I D U P U L A P Ā Y A & R E D L A C H E R U V U R E S E R V E D F O R E S T' | |
2022-10-09 15:02:31 : Annoyingly huge field, zapping all spaces: text_1 = 'I D U P U L A P Ā Y A & R E D L A C H E R U V U R E S E R V E D F O R E S T' | |
2022-10-09 15:02:31 : Annoyingly huge field, zapping all spaces: roman = 'I D U P U L A P A Y A & R E D L A C H E R U V U R E S E R V E D F O R E S T' | |
2022-10-09 15:02:32 : Annoyingly huge field, zapping all spaces: text = 'P Ā L K O N D A & V A N G A M A L L A R E S E R V E D F O R E S T' | |
2022-10-09 15:02:32 : Annoyingly huge field, zapping all spaces: text_1 = 'P Ā L K O N D A & V A N G A M A L L A R E S E R V E D F O R E S T' | |
2022-10-09 15:02:32 : Annoyingly huge field, zapping all spaces: roman = 'P A L K O N D A & V A N G A M A L L A R E S E R V E D F O R E S T' | |
2022-10-09 15:03:30 : 6 batches sent | |
2022-10-09 15:04:33 : 7 batches sent | |
2022-10-09 15:05:35 : 8 batches sent | |
2022-10-09 15:05:40 : Annoyingly huge field, zapping all spaces: text = 'R Ā M N A G A R P R O T E C T E D F O R E S T' | |
2022-10-09 15:05:40 : Annoyingly huge field, zapping all spaces: text_1 = 'R Ā M N A G A R P R O T E C T E D F O R E S T' | |
2022-10-09 15:05:40 : Annoyingly huge field, zapping all spaces: roman = 'R A M N A G A R P R O T E C T E D F O R E S T' | |
2022-10-09 15:06:36 : 9 batches sent | |
2022-10-09 15:07:36 : 10 batches sent | |
2022-10-09 15:08:38 : 11 batches sent | |
2022-10-09 15:09:40 : 12 batches sent | |
2022-10-09 15:09:54 : Annoyingly huge field, zapping all spaces: text = 'K O T T U R U B A T T i L i R E S E R V E D F O R E S T' | |
2022-10-09 15:09:54 : Annoyingly huge field, zapping all spaces: roman = 'K O T T U R U B A T T i L i R E S E R V E D F O R E S T' | |
2022-10-09 15:10:33 : 13 batches sent | |
2022-10-09 15:10:33 : Toponyms import completed. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# soi_toponyms_import.py | |
# 2022-10-09 by Nikhil VJ, https://nikhilvj.co.in | |
# source data: .geojonl from https://www.kaggle.com/datasets/planemad/soi-india-map-toponyms/versions/3 | |
# Importing to a Postgresql DB AND outputting to a CSV (v2), with bad chars/lines fixed | |
# to do: read in the geojsonl, | |
# do find-replace of chars, | |
# ensure there are no \ slashes in each field | |
# export to CSV AND to DB | |
dataFile = 'toponyms/soi-osm-toponyms.geojsonl.json' | |
batchSize = 100000 | |
skip2Batch = 0 | |
rename_cols = {"OBJECTID": "objectid", "text":"texta", "style":"stylea"} | |
columns_order = ['latitude','longitude'] # initiate base list of cols | |
csvFile = 'toponyms/soi_toponyms.csv' | |
from sqlalchemy import create_engine | |
import pandas as pd | |
import geopandas as gpd | |
import datetime, os, json, io, sys | |
from itertools import islice # https://stackoverflow.com/a/6335876/4355695 | |
def logmessage( *content ): | |
timeOffset = 5.5 | |
timestamp = '{:%Y-%m-%d %H:%M:%S} :'.format(datetime.datetime.utcnow() + datetime.timedelta(hours=timeOffset)) # from https://stackoverflow.com/a/26455617/4355695 | |
line = ' '.join(str(x) for x in list(content)) # from https://stackoverflow.com/a/3590168/4355695 | |
print(timestamp, line) # print to screen also | |
logFilename = 'log.txt' | |
with open(logFilename, 'a') as f: | |
print(timestamp, line, file=f) # file=f argument at end writes to file. from https://stackoverflow.com/a/2918367/4355695 | |
def makegpd(x,lat='latitude',lon='longitude'): | |
gdf = gpd.GeoDataFrame(x, geometry=gpd.points_from_xy(x[lon],x[lat]), crs="EPSG:4326") | |
gdf.drop(columns=[lat,lon], inplace=True) | |
return gdf | |
######### | |
creds = json.load(open('local_dbcreds.json','r')) | |
engine = create_engine(f"postgresql://{creds['DB_USER']}:{creds['DB_PW']}@{creds['DB_SERVER']}:{creds['DB_PORT']}/{creds['DB_DBNAME']}") | |
######## | |
d1 = "TRUNCATE TABLE soi_toponyms RESTART IDENTITY" | |
c = engine.connect() | |
res = c.execute(d1) | |
c.close() | |
logmessage("soi_toponyms table truncated and serial num reset") | |
if os.path.isfile(csvFile): | |
logmessage(f"Removing pre-existing {csvFile}") | |
os.remove(csvFile) | |
######## | |
logmessage(f"Batch size: {batchSize}") | |
batchCounter = 0 | |
# Reading batches of lines, from https://stackoverflow.com/a/6335876/4355695 | |
with open(dataFile,'r') as f: | |
while True: | |
next_n_lines = list(islice(f, batchSize)) | |
if not next_n_lines: | |
break | |
if (skip2Batch-1) > batchCounter: | |
batchCounter += 1 | |
logmessage(f"skipping batch {batchCounter}") | |
continue | |
collector = [] | |
for g in next_n_lines: | |
try: | |
h = json.loads(g) | |
except: | |
print(g) | |
raise | |
props = h.get('properties',{}) | |
row = {} | |
for key in props.keys(): | |
value = props[key] | |
# do char replacement | |
if isinstance(value,str): | |
if len(value) > 0: | |
value2 = value.replace('\n','īn').replace('\r','īr').replace('>','Ā').replace('<','ā').replace('|','Ī').replace('@','Ū').replace('#','ū').replace('\\','ī').replace(' '*5,' ').strip() | |
# also zapping silly spaces | |
# nuclear option for annoying spaces | |
if len(value2) > 100: | |
logmessage(f"Annoyingly huge field, zapping all spaces: {key} = '{value2}'") | |
value2 = value2.replace(' ','') | |
if len(value2) > 100: | |
value2 = value2[:100] | |
logmessage(f"Still huge after zapping spaces also, truncating it. Final: {key} = '{value2}'") | |
# if value2 != value: | |
# print(f"value: {value} changed to: {value2}") | |
else: | |
value2 = '' | |
else: | |
value2 = value | |
if key in rename_cols.keys(): | |
row[rename_cols[key]] = value2 | |
else: | |
row[key] = value2 | |
ll = h.get('geometry',{}).get('coordinates',[]) | |
row['longitude'] = ll[0] | |
row['latitude'] = ll[1] | |
collector.append(row) | |
df1 = pd.DataFrame(collector) | |
# data adjustment: | |
def rounding1(x): | |
try: | |
x = float(x) | |
return round(x,3) | |
except: | |
return x | |
df1['gmrotation'] = df1['gmrotation'].apply(rounding1) | |
df1['shape_leng'] = df1['shape_leng'].apply(rounding1) | |
# export to csv | |
if not os.path.isfile(csvFile): | |
# logmessage("writing new file") | |
df1.to_csv(csvFile, index=False) | |
columns_order = df1.columns.tolist() | |
else: | |
# logmessage("Appending to file") | |
df1[columns_order].to_csv(csvFile, index=False, header=False, mode='a') | |
gdf1 = makegpd(df1) | |
try: | |
gdf1.to_postgis('soi_toponyms', engine, if_exists='append', index=False) | |
except Exception as e: | |
print(f"Errored out at batch {batchCounter-1}, saving to faulty.csv raw text to faulty.txt") | |
df1.to_csv('toponyms/faulty.csv',index=False) | |
with open('toponyms/faulty.txt','w') as f: | |
f.write(''.join(next_n_lines)) | |
print('\n\n') | |
logmessage(e) | |
sys.exit() | |
batchCounter += 1 | |
logmessage(f"{batchCounter} batches sent") | |
logmessage(f"Toponyms import completed.") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment