buildings_import_osm_fortaleza.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/kauevestena/85f8e22eb5510cc7abaf521163f9661b/buildings_import_osm_fortaleza.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "I_wbXKuOExJ-",
"outputId": "6c94e200-81b2-412e-f762-aefba9c177ae"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount(\"/content/gdrive\", force_remount=True).\n"
]
}
],
"source": [
"# connectting to drive first\n",
"from google.colab import drive\n",
"drive.mount('/content/gdrive')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "__OOizznGZB-"
},
"outputs": [],
"source": [
"##########\n",
"########## SETUP\n",
"##########\n",
"\n",
"buildings_folderpath = '/content/gdrive/MyDrive/OSM_building_imports/building_parts_data'\n",
"parcels_folderpath = '/content/gdrive/MyDrive/OSM_building_imports/parcels_data'\n",
"\n",
"basepath = '/content/gdrive/MyDrive/OSM_building_imports/'\n",
"\n",
"proj_CRS = \"EPSG:31984\"\n",
"\n",
"latlon_CRS = 'EPSG:4326'\n",
"\n",
"cols_to_delete = ['fid','Bairro','']\n",
"\n",
"TOLERANCE = 0.12 # 12 centimeters"
]
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"id": "ESPAxoB-EsMp"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "TaR1A9XWK8my"
},
"outputs": [],
"source": [
"# renaming rules:\n",
"\n",
"renamings = {'building':'building:part', 'pmfsefin:i':'pmfsefin:idedif', 'building:l':'building:levels'}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "BUbF2SDjwn75"
},
"outputs": [],
"source": [
"# package installation\n",
"!pip install geopandas > install_log.txt #workaround to avoid all that big verbose output"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "kLRUUzEfTAsP"
},
"outputs": [],
"source": [
"!pip install pygeos > install_log2.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9Mx-dFTBqbJA"
},
"outputs": [],
"source": [
"# imports\n",
"import os, json\n",
"import geopandas as gpd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "iqGCxJHgXKPD"
},
"outputs": [],
"source": [
"!pip freeze > installed_packages.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YvTZJ5tHyEgX"
},
"outputs": [],
"source": [
"# setting output folder: \n",
"\n",
"outputfolder_path = os.path.join(basepath,'output')\n",
"\n",
"if not os.path.exists(outputfolder_path):\n",
" os.makedirs(outputfolder_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "m0v8Z7a6JoVd"
},
"outputs": [],
"source": [
"# get the file list for each:\n",
"building_filelist = [filename for filename in os.listdir(buildings_folderpath)]\n",
"parcels_filelist = [filename for filename in os.listdir(parcels_folderpath)]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MtEAWBk_qpIL"
},
"outputs": [],
"source": [
"if len(building_filelist) != len(parcels_filelist):\n",
" raise Exception('They must have the same size!!')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UkGKF6NdrKmO",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "dad15716-8284-427f-f768-0c5a747ed83f"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"00044 {'buildings_path': '/content/gdrive/MyDrive/OSM_building_imports/building_parts_data/buildings_00044.gpkg', 'parcels_path': '/content/gdrive/MyDrive/OSM_building_imports/parcels_data/parcels_00044.gpkg'}\n"
]
}
],
"source": [
"# creating the dictionary wich holds the between parcels and buildings:\n",
"file_records = {}\n",
"\n",
"for entry in building_filelist:\n",
" id = entry.split('.')[0].split('_')[-1]\n",
" file_records[id] = {}\n",
"\n",
" file_records[id]['buildings_path'] = os.path.join(buildings_folderpath,entry)\n",
"\n",
"for i,entry in enumerate(parcels_filelist):\n",
" id = entry.split('.')[0].split('_')[-1]\n",
" file_records[id]['parcels_path'] = os.path.join(parcels_folderpath,entry)\n",
"\n",
"\n",
"for key in file_records:\n",
" print(key,file_records[key])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "oCWjLmWbN_Gm"
},
"outputs": [],
"source": [
"def single_feature_to_gdf(input_geometry,crs=proj_CRS):\n",
" return gpd.GeoDataFrame({'id':[1],'geometry':[input_geometry]},crs=proj_CRS)\n",
"\n",
"def remove_not_really_inside(input_gdf,inputpolygon):\n",
"\n",
" single_polygon_gdf = single_feature_to_gdf(inputpolygon)\n",
"\n",
" in_gdf = input_gdf.copy() # it solves the warning of \"SettingWithCopyWarning\"\n",
"\n",
" in_gdf.loc[:,'orig_area'] = in_gdf.loc[:,'geometry'].area\n",
"\n",
"\n",
" clipped = in_gdf.overlay(single_polygon_gdf,how='intersection',keep_geom_type=True)\n",
"\n",
"\n",
" clipped.loc[:,'intersection_area'] = clipped.geometry.area\n",
"\n",
" clipped.loc[:,'areas_ratio'] = clipped['intersection_area'] / clipped['orig_area']\n",
"\n",
"\n",
" # selecting the remaining:\n",
"\n",
" selected_ids = clipped[clipped['areas_ratio'] > 0.5].set_index('pmfsefin:idedif').index\n",
"\n",
" result = in_gdf.set_index('pmfsefin:idedif').loc[selected_ids,:].reset_index().drop(columns=['orig_area'])\n",
"\n",
"\n",
" # selected_features = clipped['areas_ratio'] > 0.5\n",
"\n",
"\n",
" # # return clipped[clipped['areas_ratio'] > 0.5].drop(columns=['orig_area','intersection_area','areas_ratio'])\n",
" # result = in_gdf.reset_index().drop(columns=['index'])[selected_features]\n",
"\n",
" \n",
" if not result.empty:\n",
" return result,max(result['height']),max(result['building:levels'])\n",
" else:\n",
" return result,None,None\n",
"\n",
"def dump_json(inputdict,outputpath):\n",
" with open(outputpath,'w+') as json_handle:\n",
" json.dump(inputdict,json_handle,indent=4,ensure_ascii=False,sort_keys=True)\n",
"\n",
"def read_json(inputpath):\n",
" with open(inputpath) as reader:\n",
" data = reader.read()\n",
"\n",
" return json.loads(data)\n",
"\n",
"def merge_geojsons(input_pathlist,outputpath,delete_inputs=True):\n",
" '''\n",
" simple function to merge geojsons without using any library.\n",
" Same CRS for all files is assumed.\n",
" '''\n",
"\n",
" ref_dict = None\n",
"\n",
"\n",
" for i,path in enumerate(input_pathlist):\n",
"\n",
"\n",
" if i == 0:\n",
" ref_dict = read_json(path)\n",
"\n",
"\n",
" else:\n",
" ref_dict['features'] += read_json(path)['features']\n",
"\n",
"\n",
" dump_json(ref_dict,outputpath)\n",
"\n",
" if delete_inputs:\n",
" for inputpath in input_pathlist:\n",
" os.remove(inputpath)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XZj6MWJ2vPYl",
"outputId": "aea377aa-ca88-44de-e315-61ff8a05cbf9"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"ID: 00044\n",
"buildings: /content/gdrive/MyDrive/OSM_building_imports/building_parts_data/buildings_00044.gpkg\n",
"parcels: /content/gdrive/MyDrive/OSM_building_imports/parcels_data/parcels_00044.gpkg\n",
"Index(['building:part', 'pmfsefin:idedif', 'building:levels', 'height',\n",
" 'Bairro', 'geometry'],\n",
" dtype='object')\n",
"Bairro\n",
"Index(['building:part', 'pmfsefin:idedif', 'building:levels', 'height',\n",
" 'geometry'],\n",
" dtype='object')\n",
" parcel 0 file id: 00044\n",
" parcel 10 file id: 00044\n",
" parcel 20 file id: 00044\n",
" parcel 30 file id: 00044\n",
" parcel 40 file id: 00044\n",
" parcel 50 file id: 00044\n",
" parcel 60 file id: 00044\n",
" parcel 70 file id: 00044\n",
" parcel 80 file id: 00044\n",
" parcel 90 file id: 00044\n",
" parcel 100 file id: 00044\n",
" parcel 110 file id: 00044\n",
" parcel 120 file id: 00044\n",
" parcel 130 file id: 00044\n",
" parcel 140 file id: 00044\n",
" parcel 150 file id: 00044\n",
" parcel 160 file id: 00044\n",
" parcel 170 file id: 00044\n",
" parcel 180 file id: 00044\n",
" parcel 190 file id: 00044\n",
" parcel 200 file id: 00044\n",
" parcel 210 file id: 00044\n",
" parcel 220 file id: 00044\n",
" parcel 230 file id: 00044\n",
" parcel 240 file id: 00044\n",
" parcel 250 file id: 00044\n",
" parcel 260 file id: 00044\n",
" parcel 270 file id: 00044\n",
" parcel 280 file id: 00044\n",
" parcel 290 file id: 00044\n",
" parcel 300 file id: 00044\n",
" parcel 310 file id: 00044\n",
" parcel 320 file id: 00044\n",
" parcel 330 file id: 00044\n",
" parcel 340 file id: 00044\n",
" parcel 350 file id: 00044\n",
" parcel 360 file id: 00044\n",
" parcel 370 file id: 00044\n",
" parcel 380 file id: 00044\n",
" parcel 390 file id: 00044\n",
" parcel 400 file id: 00044\n",
" parcel 410 file id: 00044\n",
" parcel 420 file id: 00044\n",
" parcel 430 file id: 00044\n",
" parcel 440 file id: 00044\n",
" parcel 450 file id: 00044\n",
" parcel 460 file id: 00044\n",
" parcel 470 file id: 00044\n",
" parcel 480 file id: 00044\n",
" parcel 490 file id: 00044\n",
" parcel 500 file id: 00044\n",
" parcel 510 file id: 00044\n",
" parcel 520 file id: 00044\n",
" parcel 530 file id: 00044\n",
" parcel 540 file id: 00044\n",
" parcel 550 file id: 00044\n",
" parcel 560 file id: 00044\n",
" parcel 570 file id: 00044\n",
" parcel 580 file id: 00044\n",
" parcel 590 file id: 00044\n",
" parcel 600 file id: 00044\n",
" parcel 610 file id: 00044\n",
" parcel 620 file id: 00044\n",
" parcel 630 file id: 00044\n",
" parcel 640 file id: 00044\n",
" parcel 650 file id: 00044\n",
" parcel 660 file id: 00044\n",
" parcel 670 file id: 00044\n",
" parcel 680 file id: 00044\n",
" parcel 690 file id: 00044\n",
" parcel 700 file id: 00044\n",
" parcel 710 file id: 00044\n",
" parcel 720 file id: 00044\n",
" parcel 730 file id: 00044\n",
" parcel 740 file id: 00044\n",
" parcel 750 file id: 00044\n",
" parcel 760 file id: 00044\n",
" parcel 770 file id: 00044\n",
" parcel 780 file id: 00044\n",
" parcel 790 file id: 00044\n",
" parcel 800 file id: 00044\n",
" parcel 810 file id: 00044\n",
" parcel 820 file id: 00044\n",
" parcel 830 file id: 00044\n",
" parcel 840 file id: 00044\n",
" parcel 850 file id: 00044\n",
" parcel 860 file id: 00044\n",
" parcel 870 file id: 00044\n",
" parcel 880 file id: 00044\n",
" parcel 890 file id: 00044\n",
" parcel 900 file id: 00044\n",
" parcel 910 file id: 00044\n",
" parcel 920 file id: 00044\n",
" parcel 930 file id: 00044\n",
" parcel 940 file id: 00044\n",
" parcel 950 file id: 00044\n",
" parcel 960 file id: 00044\n",
" parcel 970 file id: 00044\n",
" parcel 980 file id: 00044\n",
" parcel 990 file id: 00044\n",
" parcel 1000 file id: 00044\n",
" parcel 1010 file id: 00044\n",
" parcel 1020 file id: 00044\n",
" parcel 1030 file id: 00044\n",
" parcel 1040 file id: 00044\n",
" parcel 1050 file id: 00044\n",
" parcel 1060 file id: 00044\n",
" parcel 1070 file id: 00044\n",
" parcel 1080 file id: 00044\n",
" parcel 1090 file id: 00044\n",
" parcel 1100 file id: 00044\n",
" parcel 1110 file id: 00044\n",
" parcel 1120 file id: 00044\n",
" parcel 1130 file id: 00044\n",
" parcel 1140 file id: 00044\n",
" parcel 1150 file id: 00044\n",
" parcel 1160 file id: 00044\n",
" parcel 1170 file id: 00044\n",
" parcel 1180 file id: 00044\n",
" parcel 1190 file id: 00044\n",
" parcel 1200 file id: 00044\n",
" parcel 1210 file id: 00044\n",
" parcel 1220 file id: 00044\n",
" parcel 1230 file id: 00044\n",
" parcel 1240 file id: 00044\n",
" parcel 1250 file id: 00044\n",
" parcel 1260 file id: 00044\n",
" parcel 1270 file id: 00044\n",
" parcel 1280 file id: 00044\n",
" parcel 1290 file id: 00044\n",
" parcel 1300 file id: 00044\n",
" parcel 1310 file id: 00044\n",
" parcel 1320 file id: 00044\n",
" parcel 1330 file id: 00044\n",
" parcel 1340 file id: 00044\n",
" parcel 1350 file id: 00044\n",
" parcel 1360 file id: 00044\n",
" parcel 1370 file id: 00044\n",
" parcel 1380 file id: 00044\n",
" parcel 1390 file id: 00044\n",
" parcel 1400 file id: 00044\n",
" parcel 1410 file id: 00044\n",
" parcel 1420 file id: 00044\n",
" parcel 1430 file id: 00044\n",
" parcel 1440 file id: 00044\n",
" parcel 1450 file id: 00044\n",
" parcel 1460 file id: 00044\n",
" parcel 1470 file id: 00044\n",
" parcel 1480 file id: 00044\n",
" parcel 1490 file id: 00044\n",
" parcel 1500 file id: 00044\n",
" parcel 1510 file id: 00044\n",
" parcel 1520 file id: 00044\n",
" parcel 1530 file id: 00044\n",
" parcel 1540 file id: 00044\n",
" parcel 1550 file id: 00044\n",
" parcel 1560 file id: 00044\n",
" parcel 1570 file id: 00044\n",
" parcel 1580 file id: 00044\n",
" parcel 1590 file id: 00044\n",
" parcel 1600 file id: 00044\n",
" parcel 1610 file id: 00044\n",
" parcel 1620 file id: 00044\n",
" parcel 1630 file id: 00044\n",
" parcel 1640 file id: 00044\n",
" parcel 1650 file id: 00044\n",
" parcel 1660 file id: 00044\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.8/dist-packages/geopandas/geodataframe.py:1443: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" super().__setitem__(key, value)\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
" parcel 1670 file id: 00044\n",
" parcel 1680 file id: 00044\n",
" parcel 1690 file id: 00044\n",
" parcel 1700 file id: 00044\n",
" parcel 1710 file id: 00044\n",
" parcel 1720 file id: 00044\n",
" parcel 1730 file id: 00044\n",
" parcel 1740 file id: 00044\n",
" parcel 1750 file id: 00044\n",
" parcel 1760 file id: 00044\n",
" parcel 1770 file id: 00044\n",
" parcel 1780 file id: 00044\n",
" parcel 1790 file id: 00044\n",
" parcel 1800 file id: 00044\n",
" parcel 1810 file id: 00044\n",
" parcel 1820 file id: 00044\n",
" parcel 1830 file id: 00044\n",
" parcel 1840 file id: 00044\n",
" parcel 1850 file id: 00044\n"
]
}
],
"source": [
"\n",
"# doing the actual work:\n",
"for id_key in file_records:\n",
" # storing already used building ids\n",
" inside_list = []\n",
" already_used = []\n",
"\n",
" united_polygons = {'building':[],'geometry':[],'building:levels':[],'height':[],'pmfsefin:idedif':[]}\n",
" print('ID: ',id_key)\n",
" print('buildings: ',file_records[id_key]['buildings_path'])\n",
" print('parcels: ',file_records[id_key]['parcels_path'])\n",
"\n",
"\n",
" # to remove the buildings with a single part afterwards\n",
" parts_to_remove = []\n",
"\n",
" buildings = gpd.read_file(file_records[id_key]['buildings_path'],encoding='latin1').rename(columns=renamings)\n",
" print(buildings.columns)\n",
"\n",
" if cols_to_delete:\n",
" for columname in cols_to_delete:\n",
" if columname in buildings.columns:\n",
" print(columname)\n",
" buildings.drop(columns=columname,inplace=True)\n",
"\n",
" print(buildings.columns)\n",
" \n",
"\n",
" parcels = gpd.read_file(file_records[id_key]['parcels_path'],encoding='latin1')\n",
"\n",
"\n",
"\n",
" # picking for each parcel\n",
" for i,parcel_geom in enumerate(parcels.geometry):\n",
"\n",
" try:\n",
" if i % 10 == 0:\n",
" print(' parcel ',i,' file id: ',id_key)\n",
"\n",
"\n",
" inside_df = buildings.geometry.intersects(parcel_geom.buffer(0.00001))\n",
"\n",
" if inside_df.empty:\n",
" print('empty')\n",
"\n",
" # buildings that are inside the parcel:\n",
" # selected_buildings = buildings[inside_df]\n",
"\n",
" selected_buildings,max_height,max_b_levels = remove_not_really_inside(buildings[inside_df],parcel_geom)\n",
" # print(max_height,max_b_levels)\n",
"\n",
" # unary union if is a polygon then only one building is on the parcel, if it multipolygon, there's more\n",
" unary = selected_buildings.geometry.unary_union\n",
"\n",
" if unary:\n",
"\n",
" inside_list = list(selected_buildings['pmfsefin:idedif'])\n",
"\n",
" repeated = [id for id in inside_list if id in already_used]\n",
"\n",
" if repeated:\n",
" # print(repeated,inside_list)\n",
"\n",
" inside_list = [id for id in inside_list if id not in repeated]\n",
"\n",
" if not inside_list:\n",
" continue\n",
" else:\n",
" # removing the repeated ones:\n",
" selected_buildings = selected_buildings.set_index('pmfsefin:idedif').drop(repeated).reset_index()\n",
"\n",
" if selected_buildings.empty:\n",
" continue\n",
" else:\n",
" unary = selected_buildings.geometry.unary_union\n",
"\n",
" if not unary:\n",
" continue\n",
"\n",
" \n",
" # print(inside_list)\n",
"\n",
"\n",
" g_type = unary.geom_type\n",
"\n",
" if g_type == 'Polygon':\n",
" united_polygons['building'].append('yes')\n",
" united_polygons['geometry'].append(unary)\n",
"\n",
" united_polygons['building:levels'].append(max_b_levels)\n",
" united_polygons['height'].append(max_height)\n",
"\n",
"\n",
"\n",
" if selected_buildings.shape[0] == 1:\n",
" curr_id = list(selected_buildings['pmfsefin:idedif'])[0]\n",
"\n",
" parts_to_remove.append(curr_id)\n",
" united_polygons['pmfsefin:idedif'].append(curr_id)\n",
"\n",
"\n",
" else:\n",
" united_polygons['pmfsefin:idedif'].append('')\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" if g_type == \"MultiPolygon\":\n",
" united_as_gdf = single_feature_to_gdf(unary).explode(index_parts=False)\n",
"\n",
" for polygon in united_as_gdf.geometry:\n",
" united_polygons['building'].append('yes')\n",
" united_polygons['geometry'].append(polygon)\n",
"\n",
" # search for the parts inside each baseline\n",
" remaining_buildings = selected_buildings[selected_buildings.intersects(polygon)]\n",
"\n",
" if remaining_buildings.shape[0] == 1:\n",
" curr_id = list(remaining_buildings['pmfsefin:idedif'])[0]\n",
"\n",
" united_polygons['pmfsefin:idedif'].append(curr_id)\n",
" parts_to_remove.append(curr_id)\n",
"\n",
" else:\n",
" united_polygons['pmfsefin:idedif'].append('')\n",
"\n",
"\n",
" # m1 = max(remaining_buildings['height'])\n",
" # m2 = max(remaining_buildings['building:levels'])\n",
" # print(m1,m2,max(selected_buildings['height']),max(selected_buildings['building:levels']))\n",
"\n",
" united_polygons['building:levels'].append(max(remaining_buildings['building:levels']))\n",
" united_polygons['height'].append(max(remaining_buildings['height']))\n",
"\n",
" already_used += inside_list\n",
" # removing repeated entries:\n",
" already_used = list(set(already_used))\n",
" inside_list = []\n",
" \n",
"\n",
" except:\n",
" with open(os.path.join(basepath,'skipped_parcels.txt'),'a+') as report_writer:\n",
" centroid_text = parcel_geom.representative_point().wkt\n",
" report_writer.write(centroid_text+'\\n')\n",
" print('skipped parcel ',i,' centroid: ',centroid_text)\n",
"\n",
"\n",
" # removing the ones that got only one part:\n",
" # print(buildings.shape)\n",
" # print(len(parts_to_remove))\n",
"\n",
" # removing buildings that have a single part\n",
" buildings = buildings.set_index('pmfsefin:idedif').drop(parts_to_remove).reset_index()\n",
"\n",
"\n",
"\n",
" # taking care of the buildings that are oustide the parcels:\n",
" not_inside_parcels = buildings.to_crs(proj_CRS).disjoint(parcels.geometry.unary_union)\n",
" buildings_outside_parcels = buildings[not_inside_parcels].rename(columns={'building:part':'building'}).to_crs(latlon_CRS)\n",
" outpath_outside_parcels = os.path.join(outputfolder_path,f'{id_key}_outside_parcels.geojson')\n",
"\n",
" if not buildings_outside_parcels.empty:\n",
" buildings_outside_parcels.to_file(outpath_outside_parcels)\n",
"\n",
"\n",
" # checking if there's overlapping parts, throwing an error \n",
" for outside_building in buildings_outside_parcels.geometry:\n",
" overlappers = buildings_outside_parcels.overlaps(outside_building)\n",
"\n",
" if not buildings_outside_parcels[overlappers].empty:\n",
" if not overlappers.empty: \n",
" if True in overlappers:\n",
" raise Exception('There are overlapping building parts!!')\n",
"\n",
" # exporting the parts of the buildings:\n",
" buildings = buildings[~not_inside_parcels]\n",
" buildings = buildings.to_crs(latlon_CRS)\n",
" buildings['pmfsefin:idedif'] = buildings['pmfsefin:idedif'].astype(str)\n",
" outpath_parts_geojson = os.path.join(outputfolder_path,f'{id_key}_building_parts.geojson')\n",
"\n",
" if not buildings.empty:\n",
" buildings.to_file(outpath_parts_geojson)\n",
"\n",
"\n",
" # exporting the baselines of the buildings:\n",
" united_as_gdf = gpd.GeoDataFrame(united_polygons,crs=proj_CRS).to_crs(latlon_CRS)\n",
"\n",
" outpath_united_geojson = os.path.join(outputfolder_path,f'{id_key}_baselines.geojson')\n",
"\n",
" if not united_as_gdf.empty:\n",
" united_as_gdf.to_file(outpath_united_geojson)\n",
"\n",
" # now exporting all to a single file\n",
" paths_to_merge = []\n",
"\n",
" for part_path in [outpath_united_geojson,outpath_parts_geojson,outpath_outside_parcels]:\n",
" if os.path.exists(part_path):\n",
" paths_to_merge.append(part_path)\n",
"\n",
"\n",
" merged_outpath = os.path.join(outputfolder_path,f'{id_key}_exported_buildings.geojson')\n",
"\n",
" merge_geojsons(paths_to_merge,merged_outpath)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GQL03ZR35Rj7",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "bd4f4e57-f593-45d1-aa0d-30639ddaccf8"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"00044\n",
"5393 6142 0\n"
]
}
],
"source": [
"# first iterating through outtut folder:\n",
"for filename in os.listdir(outputfolder_path):\n",
"\n",
" id_key = filename.split('_')[0]\n",
" print(id_key)\n",
"\n",
" # as was originally:\n",
" b_before = gpd.read_file(file_records[id_key]['buildings_path'],encoding='latin1').rename(columns=renamings)\n",
" b_before['pmfsefin:idedif'] = b_before['pmfsefin:idedif'].astype(str)\n",
"\n",
"\n",
"\n",
" # as it has been written:\n",
" filepath = os.path.join(outputfolder_path,filename)\n",
" b_after = gpd.read_file(filepath)\n",
"\n",
" # checking if there are missing 'pmfsefin:idedif'\n",
" l_before = list(b_before['pmfsefin:idedif'])\n",
"\n",
" l_after = list(b_after['pmfsefin:idedif'])\n",
"\n",
" l_missing = []\n",
"\n",
" for id in l_before:\n",
" if id != '':\n",
" if not id in l_after:\n",
" l_missing.append(id)\n",
"\n",
"\n",
" print(len(l_before),len(l_after),len(l_missing))\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "YFACQHGpAQKo"
},
"source": [
"<b style=\"color:blue;\">PART2: <br>\n",
"-------------- SPLITTING UP: <br>\n",
"\n",
" -ALREADY EXISTENT (will need a careful looking for each) <br>\n",
"\n",
" -NON-EXISTING (can be straightforwardly imported)</b>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2cxiXkJbAOP3"
},
"outputs": [],
"source": [
"# PART 2, starting setting up the second part output folder:\n",
"outputsplitted_folderpath = os.path.join(basepath,'output_splitted')\n",
"\n",
"if not os.path.exists(outputsplitted_folderpath):\n",
" os.makedirs(outputsplitted_folderpath)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xISg9U7uNSae"
},
"outputs": [],
"source": [
"!pip install osmnx > install_log3.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "sgK5Lki6QNTi"
},
"outputs": [],
"source": [
"import osmnx as ox\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"source": [
"## SET TOLERANCE FOR SIMPLIFYING "
],
"metadata": {
"id": "WN6FRXv_EFTP"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "LF_6_VLWNpe_"
},
"outputs": [],
"source": [
"# first iterating through outtut folder:\n",
"for filename in os.listdir(outputfolder_path):\n",
" id = filename.split('_')[0]\n",
"\n",
" filepath = os.path.join(outputfolder_path,filename)\n",
"\n",
" # reading as gdf to get its bounding box: \n",
" allbuildings = gpd.read_file(filepath)\n",
"\n",
" # doing the simplification:\n",
" allbuildings = allbuildings.to_crs(proj_CRS) #projecting\n",
"\n",
" allbuildings[\"geometry\"] = allbuildings.geometry.simplify(TOLERANCE)\n",
"\n",
" allbuildings = allbuildings.to_crs(latlon_CRS) #unprojecting\n",
"\n",
" # end of simplification part\n",
"\n",
" allbuildings['pmfsefin:idedif'] = allbuildings['pmfsefin:idedif'].replace('',np.nan)\n",
"\n",
" w,s,e,n = allbuildings.unary_union.bounds\n",
"\n",
" # THE HEAVY LIFT, THE DOWNLOAD OF OSM DATA:\n",
" already_existent = ox.geometries.geometries_from_bbox(n, s, e, w, {'building':True}).reset_index()\n",
" # creating as a single multipolygon to speed up things:\n",
" already_existent_unary_pol = already_existent.geometry.unary_union\n",
"\n",
"\n",
" # splitting buildings (baselines) from parts\n",
" building_baselines = allbuildings[allbuildings['building'] == 'yes'].drop(columns=['building:part'])\n",
"\n",
" building_parts = allbuildings[allbuildings['building:part'] == 'yes'].drop(columns=['building'])\n",
" building_parts['pmfsefin:idedif'] = building_parts['pmfsefin:idedif'].astype(int) # to avoind messing with wrong types at serialization\n",
"\n",
" baselines_with_correspondences_indexes = building_baselines.intersects(already_existent_unary_pol)\n",
"\n",
"\n",
" # splitting up to for analysis ease\n",
" baselines_with_correspondences = building_baselines[baselines_with_correspondences_indexes]\n",
" baselines_without_correspondences = building_baselines[~baselines_with_correspondences_indexes]\n",
"\n",
"\n",
" # doing an analogous thing to split parts among the ones with correspondences and without\n",
" baselines_with_correspondences_unary = baselines_with_correspondences.geometry.unary_union\n",
" parts_inside_with_correspondence_indexes = building_parts.intersects(baselines_with_correspondences_unary)\n",
"\n",
" # splitting up to for analysis ease, again\n",
" parts_with_correspondences = building_parts[parts_inside_with_correspondence_indexes]\n",
" parts_without_correspondences = building_parts[~parts_inside_with_correspondence_indexes]\n",
"\n",
" # outputting:\n",
" baselines_with_correspondences_path = os.path.join(outputsplitted_folderpath,'b_wc.geojson')\n",
" baselines_without_correspondences_path = os.path.join(outputsplitted_folderpath,'b_wo_c.geojson') \n",
" parts_with_correspondences_path = os.path.join(outputsplitted_folderpath,'p_w_c.geojson')\n",
" parts_without_correspondences_path = os.path.join(outputsplitted_folderpath,'p_wo_c.geojson')\n",
"\n",
" #outputting then merging:\n",
"\n",
" #### WITH CORRESPONDENCES\n",
" baselines_with_correspondences.to_file(baselines_with_correspondences_path)\n",
" parts_with_correspondences.to_file(parts_with_correspondences_path)\n",
"\n",
" merged_with_correspondences_path = os.path.join(outputsplitted_folderpath,f'b_{id}_with_correspondence.geojson')\n",
" merge_geojsons([baselines_with_correspondences_path,parts_with_correspondences_path],merged_with_correspondences_path)\n",
"\n",
" #### WITHOUT CORRESPONDENCES\n",
" baselines_without_correspondences.to_file(baselines_without_correspondences_path)\n",
" parts_without_correspondences.to_file(parts_without_correspondences_path)\n",
"\n",
" merged_without_correspondences_path = os.path.join(outputsplitted_folderpath,f'b_{id}_WITHOUT_correspondence.geojson')\n",
" merge_geojsons([baselines_without_correspondences_path,parts_without_correspondences_path],merged_without_correspondences_path)\n",
"\n",
"\n"
]
}
],
"metadata": {
"accelerator": "TPU",
"colab": {
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}