Created
January 26, 2024 09:47
-
-
Save masitings/9dab604147dfe269d6f895f41f872991 to your computer and use it in GitHub Desktop.
yt_ocr.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"authorship_tag": "ABX9TyNEBoY32Js79QI9AsomXvp4", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/masitings/9dab604147dfe269d6f895f41f872991/yt_ocr.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "YMViZtqR340x", | |
"outputId": "402ab93d-5d98-492a-9e21-004c4effa942" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Mounted at /content/drive\n" | |
] | |
} | |
], | |
"source": [ | |
"from google.colab import drive\n", | |
"import os\n", | |
"\n", | |
"drive.mount('/content/drive')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!sudo apt install tesseract-ocr-ind\n", | |
"!pip install pytesseract\n", | |
"!pip install protobuf" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "JLCHyiEf4YyX", | |
"outputId": "f96fe312-5c40-48da-ee6d-4f53b8a39aec" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Reading package lists... Done\n", | |
"Building dependency tree... Done\n", | |
"Reading state information... Done\n", | |
"The following additional packages will be installed:\n", | |
" tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd\n", | |
"The following NEW packages will be installed:\n", | |
" tesseract-ocr tesseract-ocr-eng tesseract-ocr-ind tesseract-ocr-osd\n", | |
"0 upgraded, 4 newly installed, 0 to remove and 30 not upgraded.\n", | |
"Need to get 5,353 kB of archives.\n", | |
"After this operation, 16.8 MB of additional disk space will be used.\n", | |
"Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]\n", | |
"Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]\n", | |
"Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]\n", | |
"Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-ind all 1:4.00~git30-7274cfa-1.1 [537 kB]\n", | |
"Fetched 5,353 kB in 1s (8,532 kB/s)\n", | |
"debconf: unable to initialize frontend: Dialog\n", | |
"debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 4.)\n", | |
"debconf: falling back to frontend: Readline\n", | |
"debconf: unable to initialize frontend: Readline\n", | |
"debconf: (This frontend requires a controlling tty.)\n", | |
"debconf: falling back to frontend: Teletype\n", | |
"dpkg-preconfigure: unable to re-open stdin: \n", | |
"Selecting previously unselected package tesseract-ocr-eng.\n", | |
"(Reading database ... 121671 files and directories currently installed.)\n", | |
"Preparing to unpack .../tesseract-ocr-eng_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n", | |
"Unpacking tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n", | |
"Selecting previously unselected package tesseract-ocr-osd.\n", | |
"Preparing to unpack .../tesseract-ocr-osd_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n", | |
"Unpacking tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n", | |
"Selecting previously unselected package tesseract-ocr.\n", | |
"Preparing to unpack .../tesseract-ocr_4.1.1-2.1build1_amd64.deb ...\n", | |
"Unpacking tesseract-ocr (4.1.1-2.1build1) ...\n", | |
"Selecting previously unselected package tesseract-ocr-ind.\n", | |
"Preparing to unpack .../tesseract-ocr-ind_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n", | |
"Unpacking tesseract-ocr-ind (1:4.00~git30-7274cfa-1.1) ...\n", | |
"Setting up tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n", | |
"Setting up tesseract-ocr-ind (1:4.00~git30-7274cfa-1.1) ...\n", | |
"Setting up tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n", | |
"Setting up tesseract-ocr (4.1.1-2.1build1) ...\n", | |
"Processing triggers for man-db (2.10.2-1) ...\n", | |
"Collecting pytesseract\n", | |
" Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)\n", | |
"Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (23.2)\n", | |
"Requirement already satisfied: Pillow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (9.4.0)\n", | |
"Installing collected packages: pytesseract\n", | |
"Successfully installed pytesseract-0.3.10\n", | |
"Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (3.20.3)\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import cv2\n", | |
"import numpy as np\n", | |
"import pytesseract\n", | |
"import pandas as pd\n", | |
"from PIL import Image\n", | |
"import matplotlib.pyplot as plt" | |
], | |
"metadata": { | |
"id": "EyApGvkG4wsy" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"FILE_PATH = '/content/drive/MyDrive/ocr/dataset'\n", | |
"filePath = os.path.join(FILE_PATH, 'ktp.png')\n", | |
"\n", | |
"img = cv2.imread(filePath)\n", | |
"gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n", | |
"th, threshed = cv2.threshold(gray, 127, 255, cv2.THRESH_TRUNC)\n", | |
"\n", | |
"result = pytesseract.image_to_string((threshed), lang=\"ind\")\n", | |
"\n", | |
"for word in result.split(\"\\n\"):\n", | |
" if \"”—\" in word:\n", | |
" word = word.replace(\"”—\", \":\")\n", | |
" if \"NIK\" in word:\n", | |
" nik_char = word.split()\n", | |
" if \"?\" in word:\n", | |
" word = word.replace(\"?\", \"7\")\n", | |
" if \"D\" in word:\n", | |
" word = word.replace(\"D\", \"0\")\n", | |
"\n", | |
" print(word)\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "WTWGqHTT5NAh", | |
"outputId": "07bff621-dd0a-4d71-80b4-1f8ac1bfa47a" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"em\n", | |
"\n", | |
"PROVINSI DAERAH ISTIMEWA YOGYAKARTA\n", | |
"KABUPATEN SLEMAN\n", | |
"\n", | |
" \n", | |
"\n", | |
"NIK : 34711140209790001\n", | |
"\n", | |
"Nama :RIYANTO. SE\n", | |
"\n", | |
"Tempat/Tgl Lahir : GROBOGAN. 02-09-1979\n", | |
"\n", | |
"Jenis Kelamin : LAKI-LAKI Gol Darah : 0\n", | |
"\n", | |
"Alamat PRM PURI DOMAS D-3. SEMPU\n", | |
"RTRW 1001 1024\n", | |
"\n", | |
"Kel/Desa : WEDOMARTANI!\n", | |
"Kecamatan : NGEMPLAK\n", | |
"\n", | |
"Agama \"ISLAM\n", | |
"Status Bean KAWIN SLEMAN\n", | |
"Pekerjaan : PEDAGANG 05-06-2012\n", | |
"\n", | |
"Kewarganegaraan: WNI HI —\n", | |
"Berlaku Hingga :02-09-2017 NIA\n", | |
"\n", | |
" \n", | |
" \n", | |
"\n", | |
" \n", | |
"\f\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"class KTPInformation(object):\n", | |
" def __init__(self):\n", | |
" self.nik = \"\"\n", | |
" self.nama = \"\"\n", | |
" self.tempat_lahir = \"\"\n", | |
" self.tanggal_lahir = \"\"\n", | |
" self.jenis_kelamin = \"\"\n", | |
" self.golongan_darah = \"\"\n", | |
" self.alamat = \"\"\n", | |
" self.rt = \"\"\n", | |
" self.rw = \"\"\n", | |
" self.kelurahan_atau_desa = \"\"\n", | |
" self.kecamatan = \"\"\n", | |
" self.agama = \"\"\n", | |
" self.status_perkawinan = \"\"\n", | |
" self.pekerjaan = \"\"\n", | |
" self.kewarganegaraan = \"\"\n", | |
" berlaku_hingga = \"SEUMUR HIDUP\"" | |
], | |
"metadata": { | |
"id": "TLhP9FJH7m5-" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import cv2\n", | |
"import json\n", | |
"import re\n", | |
"import numpy as np\n", | |
"import pytesseract\n", | |
"import matplotlib.pyplot as plt\n", | |
"from PIL import Image\n", | |
"\n", | |
"class KTPOCR(object):\n", | |
" def __init__(self, image):\n", | |
" self.image = cv2.imread(image)\n", | |
" self.gray = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)\n", | |
" self.th, self.threshed = cv2.threshold(self.gray, 127, 255, cv2.THRESH_TRUNC)\n", | |
" self.result = KTPInformation()\n", | |
" self.master_process()\n", | |
"\n", | |
" def process(self, image):\n", | |
" raw_extracted_text = pytesseract.image_to_string((self.threshed), lang=\"ind\")\n", | |
" return raw_extracted_text\n", | |
"\n", | |
" def word_to_number_converter(self, word):\n", | |
" word_dict = {\n", | |
" \"L\": \"1\",\n", | |
" \"l\": \"1\",\n", | |
" \"O\": \"0\",\n", | |
" \"o\": \"0\",\n", | |
" \"?\": \"7\",\n", | |
" \"A\": \"4\",\n", | |
" \"Z\": \"2\",\n", | |
" \"z\": \"2\",\n", | |
" \"S\": \"5\",\n", | |
" \"s\": \"5\",\n", | |
" \"b\": \"6\",\n", | |
" \"B\": \"8\",\n", | |
" \"G\": \"6\"\n", | |
" }\n", | |
" res = \"\"\n", | |
" for letter in word:\n", | |
" if letter in word_dict:\n", | |
" res += word_dict[letter]\n", | |
" else:\n", | |
" res += letter\n", | |
" return res\n", | |
"\n", | |
" def extract(self, extracted_result):\n", | |
" #print(extracted_result.replace('\\n', ' -- '))\n", | |
" for word in extracted_result.split(\"\\n\"):\n", | |
" word = self.pun_rem(word)\n", | |
"\n", | |
" if \"NIK\" in word:\n", | |
" word = word.split(':')\n", | |
" self.result.nik = self.word_to_number_converter(word[-1].replace(\" \", \"\"))\n", | |
" continue\n", | |
"\n", | |
" if \"Nama\" in word:\n", | |
" word = word.split(':')\n", | |
" self.result.nama = word[-1]\n", | |
" continue\n", | |
"\n", | |
" if \"Lahir\" in word:\n", | |
" word = word.split(':')\n", | |
" self.result.tanggal_lahir = re.search(\"([0-9]{2}\\-[0-9]{2}\\-[0-9]{4})\", word[-1])[0]\n", | |
" self.result.tempat_lahir = word[-1].replace(self.result.tanggal_lahir, '')\n", | |
" continue\n", | |
"\n", | |
" if \"Gol\" in word:\n", | |
" word = word.split(':')\n", | |
" self.result.jenis_kelamin = re.search(\"(LAKI-LAKI|LAKI|LELAKI|PEREMPUAN)\", word[1])[0]\n", | |
" # gol = re.search(\"(O|A|B|AB)\", word[2])[0]\n", | |
"\n", | |
" # if gol == \"0\":\n", | |
" # gol = \"O\"\n", | |
"\n", | |
" # self.result.golongan_darah = gol\n", | |
" continue\n", | |
"\n", | |
" if \"Alamat\" in word:\n", | |
" self.result.alamat = re.sub(r'^\\W*\\w+\\W*', '', word)\n", | |
" continue\n", | |
"\n", | |
" if \"RW\" in word:\n", | |
" word = re.sub(r'^\\W*\\w+\\W*', '', word)\n", | |
"\n", | |
" if \" \" in word:\n", | |
" a = word.split(\" \")\n", | |
" elif \"/\" in word:\n", | |
" a = word.split(\"/\")\n", | |
"\n", | |
" self.result.rt = a[0][-3:]\n", | |
" self.result.rw = a[1][-3:]\n", | |
" continue\n", | |
"\n", | |
" if \"kel\" in word:\n", | |
" word = re.sub(r'^\\W*\\w+\\W*', '', word)\n", | |
" self.result.kecamatan = word\n", | |
" continue\n", | |
"\n", | |
" if \"Agama\" in word:\n", | |
" self.result.agama = re.search(\"(ISLAM|KRISTEN|KATOLIK|HINDU|BUDDHA|KONG HU CU)\", word)[0]\n", | |
" continue\n", | |
"\n", | |
" if \"Status\" in word:\n", | |
" self.result.status_perkawinan = re.search(\"(KAWIN|BELUM KAWIN|DUDA CERAI|DUDA MATI|JANDA CERAI|JANDA MATI)\", word)[0]\n", | |
" continue\n", | |
"\n", | |
" if \"Pekerjaan\" in word:\n", | |
" word = re.sub(r'^\\W*\\w+\\W*', '', word)\n", | |
" word = word.split(\" \")\n", | |
" self.result.pekerjaan = word[0]\n", | |
" continue\n", | |
"\n", | |
" if \"Kewarganegaraan\" in word:\n", | |
" self.result.kewarganegaraan = re.search(\"(WNI|WNA)\", word)[0]\n", | |
" continue\n", | |
"\n", | |
" def pun_rem(self,text):\n", | |
" punctuations = '''!()[]{}'\"\\<>?@#$%^&*_~'''\n", | |
" no_punct = ''\n", | |
"\n", | |
" for char in text:\n", | |
" if char not in punctuations:\n", | |
" no_punct = no_punct + char\n", | |
"\n", | |
" return no_punct\n", | |
"\n", | |
" def master_process(self):\n", | |
" raw_text = self.process(self.image)\n", | |
" self.extract(raw_text)\n", | |
"\n", | |
" def to_json(self):\n", | |
" return json.dumps(self.result.__dict__, indent=4)" | |
], | |
"metadata": { | |
"id": "7rZCI9I47oC7" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"images = KTPOCR(filePath)\n", | |
"print(images.to_json());" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "MFfCNv4L8MPZ", | |
"outputId": "a6a0f2b0-13e7-4574-a3b9-489fc0b3c00f" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"{\n", | |
" \"nik\": \"34711140209790001\",\n", | |
" \"nama\": \"RIYANTO. SE\",\n", | |
" \"tempat_lahir\": \" GROBOGAN. \",\n", | |
" \"tanggal_lahir\": \"02-09-1979\",\n", | |
" \"jenis_kelamin\": \"LAKI-LAKI\",\n", | |
" \"golongan_darah\": \"\",\n", | |
" \"alamat\": \"PRM PURI DOMAS D-3. SEMPU\",\n", | |
" \"rt\": \"001\",\n", | |
" \"rw\": \"024\",\n", | |
" \"kelurahan_atau_desa\": \"\",\n", | |
" \"kecamatan\": \"\",\n", | |
" \"agama\": \"ISLAM\",\n", | |
" \"status_perkawinan\": \"KAWIN\",\n", | |
" \"pekerjaan\": \"PEDAGANG\",\n", | |
" \"kewarganegaraan\": \"WNI\"\n", | |
"}\n" | |
] | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment