- Install the required packages by running `pip install -r requirements.txt` in your terminal.
- Run the command `export TESSDATA_PREFIX=/usr/local/tessdata` in your terminal to set the Tesseract data path.
- Download the English language data (`eng.traineddata`) from the Tesseract tessdata repository and place it in the tessdata folder above.
- Run `main.py`.
Last active
April 16, 2023 15:23
-
-
Save tpai/7811231b5139d88a360a9674fdab77dc to your computer and use it in GitHub Desktop.
This script allows you to extract English text from images using Python.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64 | |
import cv2 | |
import numpy as np | |
import pytesseract | |
import requests | |
def download_image_from_url(url, timeout=10):
    """Download the resource at *url* and return its raw bytes.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors so an error page is never handed to the
    # image decoder as if it were image data.
    response.raise_for_status()
    return response.content
def extract_en_text(img_data):
    """Run English OCR over encoded image bytes and return the text found.

    Args:
        img_data: Encoded image file contents as a bytes-like object
            (for example the body of an HTTP response).

    Returns:
        The string produced by ``pytesseract.image_to_string``.

    Raises:
        ValueError: If *img_data* is empty or cannot be decoded as an image.
    """
    # An empty buffer would otherwise trip an opaque OpenCV assertion.
    if len(img_data) == 0:
        raise ValueError("img_data is empty; there is no image to decode")

    # Convert the image data to an OpenCV image
    np_arr = np.frombuffer(img_data, np.uint8)
    img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
    # imdecode signals failure by returning None rather than raising.
    if img is None:
        raise ValueError("img_data could not be decoded as an image")

    # "--oem 3" asks Tesseract for its default OCR engine mode.
    custom_config = r'--oem 3'
    return pytesseract.image_to_string(img, lang='eng', config=custom_config)
def main():
    """Prompt for an image URL, OCR the downloaded image, and print the text."""
    # demo: https://cdn.memes.com/up/2987451663882327/i/1663981590975.jpg
    url = input("Enter the image URL: ")
    img_data = download_image_from_url(url)
    text = extract_en_text(img_data)
    print(f"Output:\n{text}")


# Only run the interactive flow when executed as a script, so importing this
# module does not prompt for input or hit the network.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment