Skip to content

Instantly share code, notes, and snippets.

@masitings
Created January 26, 2024 09:47
Show Gist options
  • Save masitings/9dab604147dfe269d6f895f41f872991 to your computer and use it in GitHub Desktop.
Save masitings/9dab604147dfe269d6f895f41f872991 to your computer and use it in GitHub Desktop.
yt_ocr.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyNEBoY32Js79QI9AsomXvp4",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/masitings/9dab604147dfe269d6f895f41f872991/yt_ocr.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YMViZtqR340x",
"outputId": "402ab93d-5d98-492a-9e21-004c4effa942"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Mounted at /content/drive\n"
]
}
],
"source": [
"from google.colab import drive\n",
"import os\n",
"\n",
"drive.mount('/content/drive')"
]
},
{
"cell_type": "code",
"source": [
"!sudo apt install tesseract-ocr-ind\n",
"!pip install pytesseract\n",
"!pip install protobuf"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "JLCHyiEf4YyX",
"outputId": "f96fe312-5c40-48da-ee6d-4f53b8a39aec"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following additional packages will be installed:\n",
" tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd\n",
"The following NEW packages will be installed:\n",
" tesseract-ocr tesseract-ocr-eng tesseract-ocr-ind tesseract-ocr-osd\n",
"0 upgraded, 4 newly installed, 0 to remove and 30 not upgraded.\n",
"Need to get 5,353 kB of archives.\n",
"After this operation, 16.8 MB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]\n",
"Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]\n",
"Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]\n",
"Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-ind all 1:4.00~git30-7274cfa-1.1 [537 kB]\n",
"Fetched 5,353 kB in 1s (8,532 kB/s)\n",
"debconf: unable to initialize frontend: Dialog\n",
"debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 4.)\n",
"debconf: falling back to frontend: Readline\n",
"debconf: unable to initialize frontend: Readline\n",
"debconf: (This frontend requires a controlling tty.)\n",
"debconf: falling back to frontend: Teletype\n",
"dpkg-preconfigure: unable to re-open stdin: \n",
"Selecting previously unselected package tesseract-ocr-eng.\n",
"(Reading database ... 121671 files and directories currently installed.)\n",
"Preparing to unpack .../tesseract-ocr-eng_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
"Unpacking tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n",
"Selecting previously unselected package tesseract-ocr-osd.\n",
"Preparing to unpack .../tesseract-ocr-osd_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
"Unpacking tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n",
"Selecting previously unselected package tesseract-ocr.\n",
"Preparing to unpack .../tesseract-ocr_4.1.1-2.1build1_amd64.deb ...\n",
"Unpacking tesseract-ocr (4.1.1-2.1build1) ...\n",
"Selecting previously unselected package tesseract-ocr-ind.\n",
"Preparing to unpack .../tesseract-ocr-ind_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
"Unpacking tesseract-ocr-ind (1:4.00~git30-7274cfa-1.1) ...\n",
"Setting up tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n",
"Setting up tesseract-ocr-ind (1:4.00~git30-7274cfa-1.1) ...\n",
"Setting up tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n",
"Setting up tesseract-ocr (4.1.1-2.1build1) ...\n",
"Processing triggers for man-db (2.10.2-1) ...\n",
"Collecting pytesseract\n",
" Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)\n",
"Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (23.2)\n",
"Requirement already satisfied: Pillow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (9.4.0)\n",
"Installing collected packages: pytesseract\n",
"Successfully installed pytesseract-0.3.10\n",
"Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (3.20.3)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import cv2\n",
"import numpy as np\n",
"import pytesseract\n",
"import pandas as pd\n",
"from PIL import Image\n",
"import matplotlib.pyplot as plt"
],
"metadata": {
"id": "EyApGvkG4wsy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"FILE_PATH = '/content/drive/MyDrive/ocr/dataset'\n",
"filePath = os.path.join(FILE_PATH, 'ktp.png')\n",
"\n",
"img = cv2.imread(filePath)\n",
"gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"th, threshed = cv2.threshold(gray, 127, 255, cv2.THRESH_TRUNC)\n",
"\n",
"result = pytesseract.image_to_string((threshed), lang=\"ind\")\n",
"\n",
"for word in result.split(\"\\n\"):\n",
" if \"”—\" in word:\n",
" word = word.replace(\"”—\", \":\")\n",
" if \"NIK\" in word:\n",
" nik_char = word.split()\n",
" if \"?\" in word:\n",
" word = word.replace(\"?\", \"7\")\n",
" if \"D\" in word:\n",
" word = word.replace(\"D\", \"0\")\n",
"\n",
" print(word)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "WTWGqHTT5NAh",
"outputId": "07bff621-dd0a-4d71-80b4-1f8ac1bfa47a"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"em\n",
"\n",
"PROVINSI DAERAH ISTIMEWA YOGYAKARTA\n",
"KABUPATEN SLEMAN\n",
"\n",
" \n",
"\n",
"NIK : 34711140209790001\n",
"\n",
"Nama :RIYANTO. SE\n",
"\n",
"Tempat/Tgl Lahir : GROBOGAN. 02-09-1979\n",
"\n",
"Jenis Kelamin : LAKI-LAKI Gol Darah : 0\n",
"\n",
"Alamat PRM PURI DOMAS D-3. SEMPU\n",
"RTRW 1001 1024\n",
"\n",
"Kel/Desa : WEDOMARTANI!\n",
"Kecamatan : NGEMPLAK\n",
"\n",
"Agama \"ISLAM\n",
"Status Bean KAWIN SLEMAN\n",
"Pekerjaan : PEDAGANG 05-06-2012\n",
"\n",
"Kewarganegaraan: WNI HI —\n",
"Berlaku Hingga :02-09-2017 NIA\n",
"\n",
" \n",
" \n",
"\n",
" \n",
"\f\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"class KTPInformation(object):\n",
" def __init__(self):\n",
" self.nik = \"\"\n",
" self.nama = \"\"\n",
" self.tempat_lahir = \"\"\n",
" self.tanggal_lahir = \"\"\n",
" self.jenis_kelamin = \"\"\n",
" self.golongan_darah = \"\"\n",
" self.alamat = \"\"\n",
" self.rt = \"\"\n",
" self.rw = \"\"\n",
" self.kelurahan_atau_desa = \"\"\n",
" self.kecamatan = \"\"\n",
" self.agama = \"\"\n",
" self.status_perkawinan = \"\"\n",
" self.pekerjaan = \"\"\n",
" self.kewarganegaraan = \"\"\n",
" berlaku_hingga = \"SEUMUR HIDUP\""
],
"metadata": {
"id": "TLhP9FJH7m5-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import cv2\n",
"import json\n",
"import re\n",
"import numpy as np\n",
"import pytesseract\n",
"import matplotlib.pyplot as plt\n",
"from PIL import Image\n",
"\n",
"class KTPOCR(object):\n",
" def __init__(self, image):\n",
" self.image = cv2.imread(image)\n",
" self.gray = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)\n",
" self.th, self.threshed = cv2.threshold(self.gray, 127, 255, cv2.THRESH_TRUNC)\n",
" self.result = KTPInformation()\n",
" self.master_process()\n",
"\n",
" def process(self, image):\n",
" raw_extracted_text = pytesseract.image_to_string((self.threshed), lang=\"ind\")\n",
" return raw_extracted_text\n",
"\n",
" def word_to_number_converter(self, word):\n",
" word_dict = {\n",
" \"L\": \"1\",\n",
" \"l\": \"1\",\n",
" \"O\": \"0\",\n",
" \"o\": \"0\",\n",
" \"?\": \"7\",\n",
" \"A\": \"4\",\n",
" \"Z\": \"2\",\n",
" \"z\": \"2\",\n",
" \"S\": \"5\",\n",
" \"s\": \"5\",\n",
" \"b\": \"6\",\n",
" \"B\": \"8\",\n",
" \"G\": \"6\"\n",
" }\n",
" res = \"\"\n",
" for letter in word:\n",
" if letter in word_dict:\n",
" res += word_dict[letter]\n",
" else:\n",
" res += letter\n",
" return res\n",
"\n",
" def extract(self, extracted_result):\n",
" #print(extracted_result.replace('\\n', ' -- '))\n",
" for word in extracted_result.split(\"\\n\"):\n",
" word = self.pun_rem(word)\n",
"\n",
" if \"NIK\" in word:\n",
" word = word.split(':')\n",
" self.result.nik = self.word_to_number_converter(word[-1].replace(\" \", \"\"))\n",
" continue\n",
"\n",
" if \"Nama\" in word:\n",
" word = word.split(':')\n",
" self.result.nama = word[-1]\n",
" continue\n",
"\n",
" if \"Lahir\" in word:\n",
" word = word.split(':')\n",
" self.result.tanggal_lahir = re.search(\"([0-9]{2}\\-[0-9]{2}\\-[0-9]{4})\", word[-1])[0]\n",
" self.result.tempat_lahir = word[-1].replace(self.result.tanggal_lahir, '')\n",
" continue\n",
"\n",
" if \"Gol\" in word:\n",
" word = word.split(':')\n",
" self.result.jenis_kelamin = re.search(\"(LAKI-LAKI|LAKI|LELAKI|PEREMPUAN)\", word[1])[0]\n",
" # gol = re.search(\"(O|A|B|AB)\", word[2])[0]\n",
"\n",
" # if gol == \"0\":\n",
" # gol = \"O\"\n",
"\n",
" # self.result.golongan_darah = gol\n",
" continue\n",
"\n",
" if \"Alamat\" in word:\n",
" self.result.alamat = re.sub(r'^\\W*\\w+\\W*', '', word)\n",
" continue\n",
"\n",
" if \"RW\" in word:\n",
" word = re.sub(r'^\\W*\\w+\\W*', '', word)\n",
"\n",
" if \" \" in word:\n",
" a = word.split(\" \")\n",
" elif \"/\" in word:\n",
" a = word.split(\"/\")\n",
"\n",
" self.result.rt = a[0][-3:]\n",
" self.result.rw = a[1][-3:]\n",
" continue\n",
"\n",
" if \"kel\" in word:\n",
" word = re.sub(r'^\\W*\\w+\\W*', '', word)\n",
" self.result.kecamatan = word\n",
" continue\n",
"\n",
" if \"Agama\" in word:\n",
" self.result.agama = re.search(\"(ISLAM|KRISTEN|KATOLIK|HINDU|BUDDHA|KONG HU CU)\", word)[0]\n",
" continue\n",
"\n",
" if \"Status\" in word:\n",
" self.result.status_perkawinan = re.search(\"(KAWIN|BELUM KAWIN|DUDA CERAI|DUDA MATI|JANDA CERAI|JANDA MATI)\", word)[0]\n",
" continue\n",
"\n",
" if \"Pekerjaan\" in word:\n",
" word = re.sub(r'^\\W*\\w+\\W*', '', word)\n",
" word = word.split(\" \")\n",
" self.result.pekerjaan = word[0]\n",
" continue\n",
"\n",
" if \"Kewarganegaraan\" in word:\n",
" self.result.kewarganegaraan = re.search(\"(WNI|WNA)\", word)[0]\n",
" continue\n",
"\n",
" def pun_rem(self,text):\n",
" punctuations = '''!()[]{}'\"\\<>?@#$%^&*_~'''\n",
" no_punct = ''\n",
"\n",
" for char in text:\n",
" if char not in punctuations:\n",
" no_punct = no_punct + char\n",
"\n",
" return no_punct\n",
"\n",
" def master_process(self):\n",
" raw_text = self.process(self.image)\n",
" self.extract(raw_text)\n",
"\n",
" def to_json(self):\n",
" return json.dumps(self.result.__dict__, indent=4)"
],
"metadata": {
"id": "7rZCI9I47oC7"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"images = KTPOCR(filePath)\n",
"print(images.to_json());"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "MFfCNv4L8MPZ",
"outputId": "a6a0f2b0-13e7-4574-a3b9-489fc0b3c00f"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{\n",
" \"nik\": \"34711140209790001\",\n",
" \"nama\": \"RIYANTO. SE\",\n",
" \"tempat_lahir\": \" GROBOGAN. \",\n",
" \"tanggal_lahir\": \"02-09-1979\",\n",
" \"jenis_kelamin\": \"LAKI-LAKI\",\n",
" \"golongan_darah\": \"\",\n",
" \"alamat\": \"PRM PURI DOMAS D-3. SEMPU\",\n",
" \"rt\": \"001\",\n",
" \"rw\": \"024\",\n",
" \"kelurahan_atau_desa\": \"\",\n",
" \"kecamatan\": \"\",\n",
" \"agama\": \"ISLAM\",\n",
" \"status_perkawinan\": \"KAWIN\",\n",
" \"pekerjaan\": \"PEDAGANG\",\n",
" \"kewarganegaraan\": \"WNI\"\n",
"}\n"
]
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment