masitings · January 26, 2024 09:47
diff --git a/yt_ocr.ipynb b/yt_ocr.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyNEBoY32Js79QI9AsomXvp4",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/masitings/9dab604147dfe269d6f895f41f872991/yt_ocr.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YMViZtqR340x",
        "outputId": "402ab93d-5d98-492a-9e21-004c4effa942"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ],
      "source": [
        "from google.colab import drive\n",
        "import os\n",
        "\n",
        "drive.mount('/content/drive')"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!sudo apt install tesseract-ocr-ind\n",
        "!pip install pytesseract\n",
        "!pip install protobuf"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "JLCHyiEf4YyX",
        "outputId": "f96fe312-5c40-48da-ee6d-4f53b8a39aec"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Reading package lists... Done\n",
            "Building dependency tree... Done\n",
            "Reading state information... Done\n",
            "The following additional packages will be installed:\n",
            "  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd\n",
            "The following NEW packages will be installed:\n",
            "  tesseract-ocr tesseract-ocr-eng tesseract-ocr-ind tesseract-ocr-osd\n",
            "0 upgraded, 4 newly installed, 0 to remove and 30 not upgraded.\n",
            "Need to get 5,353 kB of archives.\n",
            "After this operation, 16.8 MB of additional disk space will be used.\n",
            "Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]\n",
            "Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]\n",
            "Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]\n",
            "Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-ind all 1:4.00~git30-7274cfa-1.1 [537 kB]\n",
            "Fetched 5,353 kB in 1s (8,532 kB/s)\n",
            "debconf: unable to initialize frontend: Dialog\n",
            "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 4.)\n",
            "debconf: falling back to frontend: Readline\n",
            "debconf: unable to initialize frontend: Readline\n",
            "debconf: (This frontend requires a controlling tty.)\n",
            "debconf: falling back to frontend: Teletype\n",
            "dpkg-preconfigure: unable to re-open stdin: \n",
            "Selecting previously unselected package tesseract-ocr-eng.\n",
            "(Reading database ... 121671 files and directories currently installed.)\n",
            "Preparing to unpack .../tesseract-ocr-eng_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
            "Unpacking tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n",
            "Selecting previously unselected package tesseract-ocr-osd.\n",
            "Preparing to unpack .../tesseract-ocr-osd_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
            "Unpacking tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n",
            "Selecting previously unselected package tesseract-ocr.\n",
            "Preparing to unpack .../tesseract-ocr_4.1.1-2.1build1_amd64.deb ...\n",
            "Unpacking tesseract-ocr (4.1.1-2.1build1) ...\n",
            "Selecting previously unselected package tesseract-ocr-ind.\n",
            "Preparing to unpack .../tesseract-ocr-ind_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
            "Unpacking tesseract-ocr-ind (1:4.00~git30-7274cfa-1.1) ...\n",
            "Setting up tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n",
            "Setting up tesseract-ocr-ind (1:4.00~git30-7274cfa-1.1) ...\n",
            "Setting up tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n",
            "Setting up tesseract-ocr (4.1.1-2.1build1) ...\n",
            "Processing triggers for man-db (2.10.2-1) ...\n",
            "Collecting pytesseract\n",
            "  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)\n",
            "Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (23.2)\n",
            "Requirement already satisfied: Pillow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (9.4.0)\n",
            "Installing collected packages: pytesseract\n",
            "Successfully installed pytesseract-0.3.10\n",
            "Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (3.20.3)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import cv2\n",
        "import numpy as np\n",
        "import pytesseract\n",
        "import pandas as pd\n",
        "from PIL import Image\n",
        "import matplotlib.pyplot as plt"
      ],
      "metadata": {
        "id": "EyApGvkG4wsy"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "FILE_PATH = '/content/drive/MyDrive/ocr/dataset'\n",
        "filePath = os.path.join(FILE_PATH, 'ktp.png')\n",
        "\n",
        "img = cv2.imread(filePath)\n",
        "gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
        "th, threshed = cv2.threshold(gray, 127, 255, cv2.THRESH_TRUNC)\n",
        "\n",
        "result = pytesseract.image_to_string((threshed), lang=\"ind\")\n",
        "\n",
        "for word in result.split(\"\\n\"):\n",
        "  if \"”—\" in word:\n",
        "    word = word.replace(\"”—\", \":\")\n",
        "  if \"NIK\" in word:\n",
        "    nik_char = word.split()\n",
        "    if \"?\" in word:\n",
        "      word = word.replace(\"?\", \"7\")\n",
        "    if \"D\" in word:\n",
        "      word = word.replace(\"D\", \"0\")\n",
        "\n",
        "  print(word)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "WTWGqHTT5NAh",
        "outputId": "07bff621-dd0a-4d71-80b4-1f8ac1bfa47a"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "em\n",
            "\n",
            "PROVINSI DAERAH ISTIMEWA YOGYAKARTA\n",
            "KABUPATEN SLEMAN\n",
            "\n",
            " \n",
            "\n",
            "NIK : 34711140209790001\n",
            "\n",
            "Nama :RIYANTO. SE\n",
            "\n",
            "Tempat/Tgl Lahir : GROBOGAN. 02-09-1979\n",
            "\n",
            "Jenis Kelamin : LAKI-LAKI Gol Darah : 0\n",
            "\n",
            "Alamat PRM PURI DOMAS D-3. SEMPU\n",
            "RTRW 1001 1024\n",
            "\n",
            "Kel/Desa : WEDOMARTANI!\n",
            "Kecamatan : NGEMPLAK\n",
            "\n",
            "Agama \"ISLAM\n",
            "Status Bean KAWIN SLEMAN\n",
            "Pekerjaan : PEDAGANG 05-06-2012\n",
            "\n",
            "Kewarganegaraan: WNI HI —\n",
            "Berlaku Hingga :02-09-2017 NIA\n",
            "\n",
            "   \n",
            " \n",
            "\n",
            " \n",
            "\f\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "class KTPInformation(object):\n",
        "    def __init__(self):\n",
        "        self.nik = \"\"\n",
        "        self.nama = \"\"\n",
        "        self.tempat_lahir = \"\"\n",
        "        self.tanggal_lahir = \"\"\n",
        "        self.jenis_kelamin = \"\"\n",
        "        self.golongan_darah = \"\"\n",
        "        self.alamat = \"\"\n",
        "        self.rt = \"\"\n",
        "        self.rw = \"\"\n",
        "        self.kelurahan_atau_desa = \"\"\n",
        "        self.kecamatan = \"\"\n",
        "        self.agama = \"\"\n",
        "        self.status_perkawinan = \"\"\n",
        "        self.pekerjaan = \"\"\n",
        "        self.kewarganegaraan = \"\"\n",
        "        berlaku_hingga = \"SEUMUR HIDUP\""
      ],
      "metadata": {
        "id": "TLhP9FJH7m5-"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import cv2\n",
        "import json\n",
        "import re\n",
        "import numpy as np\n",
        "import pytesseract\n",
        "import matplotlib.pyplot as plt\n",
        "from PIL import Image\n",
        "\n",
        "class KTPOCR(object):\n",
        "    def __init__(self, image):\n",
        "        self.image = cv2.imread(image)\n",
        "        self.gray = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)\n",
        "        self.th, self.threshed = cv2.threshold(self.gray, 127, 255, cv2.THRESH_TRUNC)\n",
        "        self.result = KTPInformation()\n",
        "        self.master_process()\n",
        "\n",
        "    def process(self, image):\n",
        "        raw_extracted_text = pytesseract.image_to_string((self.threshed), lang=\"ind\")\n",
        "        return raw_extracted_text\n",
        "\n",
        "    def word_to_number_converter(self, word):\n",
        "        word_dict = {\n",
        "            \"L\": \"1\",\n",
        "            \"l\": \"1\",\n",
        "            \"O\": \"0\",\n",
        "            \"o\": \"0\",\n",
        "            \"?\": \"7\",\n",
        "            \"A\": \"4\",\n",
        "            \"Z\": \"2\",\n",
        "            \"z\": \"2\",\n",
        "            \"S\": \"5\",\n",
        "            \"s\": \"5\",\n",
        "            \"b\": \"6\",\n",
        "            \"B\": \"8\",\n",
        "            \"G\": \"6\"\n",
        "        }\n",
        "        res = \"\"\n",
        "        for letter in word:\n",
        "            if letter in word_dict:\n",
        "                res += word_dict[letter]\n",
        "            else:\n",
        "                res += letter\n",
        "        return res\n",
        "\n",
        "    def extract(self, extracted_result):\n",
        "        #print(extracted_result.replace('\\n', ' -- '))\n",
        "        for word in extracted_result.split(\"\\n\"):\n",
        "            word = self.pun_rem(word)\n",
        "\n",
        "            if \"NIK\" in word:\n",
        "              word = word.split(':')\n",
        "              self.result.nik = self.word_to_number_converter(word[-1].replace(\" \", \"\"))\n",
        "              continue\n",
        "\n",
        "            if \"Nama\" in word:\n",
        "              word = word.split(':')\n",
        "              self.result.nama = word[-1]\n",
        "              continue\n",
        "\n",
        "            if \"Lahir\" in word:\n",
        "              word = word.split(':')\n",
        "              self.result.tanggal_lahir = re.search(\"([0-9]{2}\\-[0-9]{2}\\-[0-9]{4})\", word[-1])[0]\n",
        "              self.result.tempat_lahir = word[-1].replace(self.result.tanggal_lahir, '')\n",
        "              continue\n",
        "\n",
        "            if \"Gol\" in word:\n",
        "              word = word.split(':')\n",
        "              self.result.jenis_kelamin = re.search(\"(LAKI-LAKI|LAKI|LELAKI|PEREMPUAN)\", word[1])[0]\n",
        "              # gol = re.search(\"(O|A|B|AB)\", word[2])[0]\n",
        "\n",
        "              # if gol == \"0\":\n",
        "              #   gol = \"O\"\n",
        "\n",
        "              # self.result.golongan_darah = gol\n",
        "              continue\n",
        "\n",
        "            if \"Alamat\" in word:\n",
        "              self.result.alamat = re.sub(r'^\\W*\\w+\\W*', '', word)\n",
        "              continue\n",
        "\n",
        "            if \"RW\" in word:\n",
        "              word = re.sub(r'^\\W*\\w+\\W*', '', word)\n",
        "\n",
        "              if \" \" in word:\n",
        "                a = word.split(\" \")\n",
        "              elif \"/\" in word:\n",
        "                a = word.split(\"/\")\n",
        "\n",
        "              self.result.rt = a[0][-3:]\n",
        "              self.result.rw = a[1][-3:]\n",
        "              continue\n",
        "\n",
        "            if \"kel\" in word:\n",
        "              word = re.sub(r'^\\W*\\w+\\W*', '', word)\n",
        "              self.result.kecamatan = word\n",
        "              continue\n",
        "\n",
        "            if \"Agama\" in word:\n",
        "              self.result.agama = re.search(\"(ISLAM|KRISTEN|KATOLIK|HINDU|BUDDHA|KONG HU CU)\", word)[0]\n",
        "              continue\n",
        "\n",
        "            if \"Status\" in word:\n",
        "              self.result.status_perkawinan = re.search(\"(KAWIN|BELUM KAWIN|DUDA CERAI|DUDA MATI|JANDA CERAI|JANDA MATI)\", word)[0]\n",
        "              continue\n",
        "\n",
        "            if \"Pekerjaan\" in word:\n",
        "              word = re.sub(r'^\\W*\\w+\\W*', '', word)\n",
        "              word = word.split(\" \")\n",
        "              self.result.pekerjaan = word[0]\n",
        "              continue\n",
        "\n",
        "            if \"Kewarganegaraan\" in word:\n",
        "              self.result.kewarganegaraan = re.search(\"(WNI|WNA)\", word)[0]\n",
        "              continue\n",
        "\n",
        "    def pun_rem(self,text):\n",
        "      punctuations = '''!()[]{}'\"\\<>?@#$%^&*_~'''\n",
        "      no_punct = ''\n",
        "\n",
        "      for char in text:\n",
        "        if char not in punctuations:\n",
        "          no_punct = no_punct + char\n",
        "\n",
        "      return no_punct\n",
        "\n",
        "    def master_process(self):\n",
        "        raw_text = self.process(self.image)\n",
        "        self.extract(raw_text)\n",
        "\n",
        "    def to_json(self):\n",
        "        return json.dumps(self.result.__dict__, indent=4)"
      ],
      "metadata": {
        "id": "7rZCI9I47oC7"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "images = KTPOCR(filePath)\n",
        "print(images.to_json());"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "MFfCNv4L8MPZ",
        "outputId": "a6a0f2b0-13e7-4574-a3b9-489fc0b3c00f"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "{\n",
            "    \"nik\": \"34711140209790001\",\n",
            "    \"nama\": \"RIYANTO. SE\",\n",
            "    \"tempat_lahir\": \" GROBOGAN. \",\n",
            "    \"tanggal_lahir\": \"02-09-1979\",\n",
            "    \"jenis_kelamin\": \"LAKI-LAKI\",\n",
            "    \"golongan_darah\": \"\",\n",
            "    \"alamat\": \"PRM PURI DOMAS D-3. SEMPU\",\n",
            "    \"rt\": \"001\",\n",
            "    \"rw\": \"024\",\n",
            "    \"kelurahan_atau_desa\": \"\",\n",
            "    \"kecamatan\": \"\",\n",
            "    \"agama\": \"ISLAM\",\n",
            "    \"status_perkawinan\": \"KAWIN\",\n",
            "    \"pekerjaan\": \"PEDAGANG\",\n",
            "    \"kewarganegaraan\": \"WNI\"\n",
            "}\n"
          ]
        }
      ]
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"authorship_tag": "ABX9TyNEBoY32Js79QI9AsomXvp4",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/masitings/9dab604147dfe269d6f895f41f872991/yt_ocr.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "YMViZtqR340x",
	"outputId": "402ab93d-5d98-492a-9e21-004c4effa942"
	},
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Mounted at /content/drive\n"
	]
	}
	],
	"source": [
	"from google.colab import drive\n",
	"import os\n",
	"\n",
	"drive.mount('/content/drive')"
	]
	},
	{
	"cell_type": "code",
	"source": [
	"!sudo apt install tesseract-ocr-ind\n",
	"!pip install pytesseract\n",
	"!pip install protobuf"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "JLCHyiEf4YyX",
	"outputId": "f96fe312-5c40-48da-ee6d-4f53b8a39aec"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Reading package lists... Done\n",
	"Building dependency tree... Done\n",
	"Reading state information... Done\n",
	"The following additional packages will be installed:\n",
	" tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd\n",
	"The following NEW packages will be installed:\n",
	" tesseract-ocr tesseract-ocr-eng tesseract-ocr-ind tesseract-ocr-osd\n",
	"0 upgraded, 4 newly installed, 0 to remove and 30 not upgraded.\n",
	"Need to get 5,353 kB of archives.\n",
	"After this operation, 16.8 MB of additional disk space will be used.\n",
	"Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]\n",
	"Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]\n",
	"Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]\n",
	"Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-ind all 1:4.00~git30-7274cfa-1.1 [537 kB]\n",
	"Fetched 5,353 kB in 1s (8,532 kB/s)\n",
	"debconf: unable to initialize frontend: Dialog\n",
	"debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 4.)\n",
	"debconf: falling back to frontend: Readline\n",
	"debconf: unable to initialize frontend: Readline\n",
	"debconf: (This frontend requires a controlling tty.)\n",
	"debconf: falling back to frontend: Teletype\n",
	"dpkg-preconfigure: unable to re-open stdin: \n",
	"Selecting previously unselected package tesseract-ocr-eng.\n",
	"(Reading database ... 121671 files and directories currently installed.)\n",
	"Preparing to unpack .../tesseract-ocr-eng_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
	"Unpacking tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n",
	"Selecting previously unselected package tesseract-ocr-osd.\n",
	"Preparing to unpack .../tesseract-ocr-osd_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
	"Unpacking tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n",
	"Selecting previously unselected package tesseract-ocr.\n",
	"Preparing to unpack .../tesseract-ocr_4.1.1-2.1build1_amd64.deb ...\n",
	"Unpacking tesseract-ocr (4.1.1-2.1build1) ...\n",
	"Selecting previously unselected package tesseract-ocr-ind.\n",
	"Preparing to unpack .../tesseract-ocr-ind_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
	"Unpacking tesseract-ocr-ind (1:4.00~git30-7274cfa-1.1) ...\n",
	"Setting up tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n",
	"Setting up tesseract-ocr-ind (1:4.00~git30-7274cfa-1.1) ...\n",
	"Setting up tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n",
	"Setting up tesseract-ocr (4.1.1-2.1build1) ...\n",
	"Processing triggers for man-db (2.10.2-1) ...\n",
	"Collecting pytesseract\n",
	" Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)\n",
	"Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (23.2)\n",
	"Requirement already satisfied: Pillow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (9.4.0)\n",
	"Installing collected packages: pytesseract\n",
	"Successfully installed pytesseract-0.3.10\n",
	"Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (3.20.3)\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"import cv2\n",
	"import numpy as np\n",
	"import pytesseract\n",
	"import pandas as pd\n",
	"from PIL import Image\n",
	"import matplotlib.pyplot as plt"
	],
	"metadata": {
	"id": "EyApGvkG4wsy"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"FILE_PATH = '/content/drive/MyDrive/ocr/dataset'\n",
	"filePath = os.path.join(FILE_PATH, 'ktp.png')\n",
	"\n",
	"img = cv2.imread(filePath)\n",
	"gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
	"th, threshed = cv2.threshold(gray, 127, 255, cv2.THRESH_TRUNC)\n",
	"\n",
	"result = pytesseract.image_to_string((threshed), lang=\"ind\")\n",
	"\n",
	"for word in result.split(\"\\n\"):\n",
	" if \"”—\" in word:\n",
	" word = word.replace(\"”—\", \":\")\n",
	" if \"NIK\" in word:\n",
	" nik_char = word.split()\n",
	" if \"?\" in word:\n",
	" word = word.replace(\"?\", \"7\")\n",
	" if \"D\" in word:\n",
	" word = word.replace(\"D\", \"0\")\n",
	"\n",
	" print(word)\n"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "WTWGqHTT5NAh",
	"outputId": "07bff621-dd0a-4d71-80b4-1f8ac1bfa47a"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"em\n",
	"\n",
	"PROVINSI DAERAH ISTIMEWA YOGYAKARTA\n",
	"KABUPATEN SLEMAN\n",
	"\n",
	" \n",
	"\n",
	"NIK : 34711140209790001\n",
	"\n",
	"Nama :RIYANTO. SE\n",
	"\n",
	"Tempat/Tgl Lahir : GROBOGAN. 02-09-1979\n",
	"\n",
	"Jenis Kelamin : LAKI-LAKI Gol Darah : 0\n",
	"\n",
	"Alamat PRM PURI DOMAS D-3. SEMPU\n",
	"RTRW 1001 1024\n",
	"\n",
	"Kel/Desa : WEDOMARTANI!\n",
	"Kecamatan : NGEMPLAK\n",
	"\n",
	"Agama \"ISLAM\n",
	"Status Bean KAWIN SLEMAN\n",
	"Pekerjaan : PEDAGANG 05-06-2012\n",
	"\n",
	"Kewarganegaraan: WNI HI —\n",
	"Berlaku Hingga :02-09-2017 NIA\n",
	"\n",
	" \n",
	" \n",
	"\n",
	" \n",
	"\f\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"class KTPInformation(object):\n",
	" def __init__(self):\n",
	" self.nik = \"\"\n",
	" self.nama = \"\"\n",
	" self.tempat_lahir = \"\"\n",
	" self.tanggal_lahir = \"\"\n",
	" self.jenis_kelamin = \"\"\n",
	" self.golongan_darah = \"\"\n",
	" self.alamat = \"\"\n",
	" self.rt = \"\"\n",
	" self.rw = \"\"\n",
	" self.kelurahan_atau_desa = \"\"\n",
	" self.kecamatan = \"\"\n",
	" self.agama = \"\"\n",
	" self.status_perkawinan = \"\"\n",
	" self.pekerjaan = \"\"\n",
	" self.kewarganegaraan = \"\"\n",
	" berlaku_hingga = \"SEUMUR HIDUP\""
	],
	"metadata": {
	"id": "TLhP9FJH7m5-"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"import cv2\n",
	"import json\n",
	"import re\n",
	"import numpy as np\n",
	"import pytesseract\n",
	"import matplotlib.pyplot as plt\n",
	"from PIL import Image\n",
	"\n",
	"class KTPOCR(object):\n",
	" def __init__(self, image):\n",
	" self.image = cv2.imread(image)\n",
	" self.gray = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)\n",
	" self.th, self.threshed = cv2.threshold(self.gray, 127, 255, cv2.THRESH_TRUNC)\n",
	" self.result = KTPInformation()\n",
	" self.master_process()\n",
	"\n",
	" def process(self, image):\n",
	" raw_extracted_text = pytesseract.image_to_string((self.threshed), lang=\"ind\")\n",
	" return raw_extracted_text\n",
	"\n",
	" def word_to_number_converter(self, word):\n",
	" word_dict = {\n",
	" \"L\": \"1\",\n",
	" \"l\": \"1\",\n",
	" \"O\": \"0\",\n",
	" \"o\": \"0\",\n",
	" \"?\": \"7\",\n",
	" \"A\": \"4\",\n",
	" \"Z\": \"2\",\n",
	" \"z\": \"2\",\n",
	" \"S\": \"5\",\n",
	" \"s\": \"5\",\n",
	" \"b\": \"6\",\n",
	" \"B\": \"8\",\n",
	" \"G\": \"6\"\n",
	" }\n",
	" res = \"\"\n",
	" for letter in word:\n",
	" if letter in word_dict:\n",
	" res += word_dict[letter]\n",
	" else:\n",
	" res += letter\n",
	" return res\n",
	"\n",
	" def extract(self, extracted_result):\n",
	" #print(extracted_result.replace('\\n', ' -- '))\n",
	" for word in extracted_result.split(\"\\n\"):\n",
	" word = self.pun_rem(word)\n",
	"\n",
	" if \"NIK\" in word:\n",
	" word = word.split(':')\n",
	" self.result.nik = self.word_to_number_converter(word[-1].replace(\" \", \"\"))\n",
	" continue\n",
	"\n",
	" if \"Nama\" in word:\n",
	" word = word.split(':')\n",
	" self.result.nama = word[-1]\n",
	" continue\n",
	"\n",
	" if \"Lahir\" in word:\n",
	" word = word.split(':')\n",
	" self.result.tanggal_lahir = re.search(\"([0-9]{2}\\-[0-9]{2}\\-[0-9]{4})\", word[-1])[0]\n",
	" self.result.tempat_lahir = word[-1].replace(self.result.tanggal_lahir, '')\n",
	" continue\n",
	"\n",
	" if \"Gol\" in word:\n",
	" word = word.split(':')\n",
	" self.result.jenis_kelamin = re.search(\"(LAKI-LAKI\|LAKI\|LELAKI\|PEREMPUAN)\", word[1])[0]\n",
	" # gol = re.search(\"(O\|A\|B\|AB)\", word[2])[0]\n",
	"\n",
	" # if gol == \"0\":\n",
	" # gol = \"O\"\n",
	"\n",
	" # self.result.golongan_darah = gol\n",
	" continue\n",
	"\n",
	" if \"Alamat\" in word:\n",
	" self.result.alamat = re.sub(r'^\\W\\w+\\W', '', word)\n",
	" continue\n",
	"\n",
	" if \"RW\" in word:\n",
	" word = re.sub(r'^\\W\\w+\\W', '', word)\n",
	"\n",
	" if \" \" in word:\n",
	" a = word.split(\" \")\n",
	" elif \"/\" in word:\n",
	" a = word.split(\"/\")\n",
	"\n",
	" self.result.rt = a[0][-3:]\n",
	" self.result.rw = a[1][-3:]\n",
	" continue\n",
	"\n",
	" if \"kel\" in word:\n",
	" word = re.sub(r'^\\W\\w+\\W', '', word)\n",
	" self.result.kecamatan = word\n",
	" continue\n",
	"\n",
	" if \"Agama\" in word:\n",
	" self.result.agama = re.search(\"(ISLAM\|KRISTEN\|KATOLIK\|HINDU\|BUDDHA\|KONG HU CU)\", word)[0]\n",
	" continue\n",
	"\n",
	" if \"Status\" in word:\n",
	" self.result.status_perkawinan = re.search(\"(KAWIN\|BELUM KAWIN\|DUDA CERAI\|DUDA MATI\|JANDA CERAI\|JANDA MATI)\", word)[0]\n",
	" continue\n",
	"\n",
	" if \"Pekerjaan\" in word:\n",
	" word = re.sub(r'^\\W\\w+\\W', '', word)\n",
	" word = word.split(\" \")\n",
	" self.result.pekerjaan = word[0]\n",
	" continue\n",
	"\n",
	" if \"Kewarganegaraan\" in word:\n",
	" self.result.kewarganegaraan = re.search(\"(WNI\|WNA)\", word)[0]\n",
	" continue\n",
	"\n",
	" def pun_rem(self,text):\n",
	" punctuations = '''!()[]{}'\"\\<>?@#$%^&*_~'''\n",
	" no_punct = ''\n",
	"\n",
	" for char in text:\n",
	" if char not in punctuations:\n",
	" no_punct = no_punct + char\n",
	"\n",
	" return no_punct\n",
	"\n",
	" def master_process(self):\n",
	" raw_text = self.process(self.image)\n",
	" self.extract(raw_text)\n",
	"\n",
	" def to_json(self):\n",
	" return json.dumps(self.result.__dict__, indent=4)"
	],
	"metadata": {
	"id": "7rZCI9I47oC7"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"images = KTPOCR(filePath)\n",
	"print(images.to_json());"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "MFfCNv4L8MPZ",
	"outputId": "a6a0f2b0-13e7-4574-a3b9-489fc0b3c00f"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"{\n",
	" \"nik\": \"34711140209790001\",\n",
	" \"nama\": \"RIYANTO. SE\",\n",
	" \"tempat_lahir\": \" GROBOGAN. \",\n",
	" \"tanggal_lahir\": \"02-09-1979\",\n",
	" \"jenis_kelamin\": \"LAKI-LAKI\",\n",
	" \"golongan_darah\": \"\",\n",
	" \"alamat\": \"PRM PURI DOMAS D-3. SEMPU\",\n",
	" \"rt\": \"001\",\n",
	" \"rw\": \"024\",\n",
	" \"kelurahan_atau_desa\": \"\",\n",
	" \"kecamatan\": \"\",\n",
	" \"agama\": \"ISLAM\",\n",
	" \"status_perkawinan\": \"KAWIN\",\n",
	" \"pekerjaan\": \"PEDAGANG\",\n",
	" \"kewarganegaraan\": \"WNI\"\n",
	"}\n"
	]
	}
	]
	}
	]
	}