Created
November 29, 2024 15:07
-
-
Save Asif-Iqbal-Bhatti/10e231b3a124e10881eca8956e739c2d to your computer and use it in GitHub Desktop.
get_text_from_screenshot
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from PIL import Image | |
import pytesseract | |
from fpdf import FPDF | |
def extract_and_clean_text(image_path):
    """Run OCR on an image file and return its text as reflowed paragraphs.

    Args:
        image_path: Path of the image file to open and pass to Tesseract.

    Returns:
        The recognised text with leading/trailing whitespace stripped.
        Blocks separated by a blank line ("\\n\\n") are kept as separate
        paragraphs; single line breaks inside a block are replaced by a
        space so each paragraph becomes one continuous line.
    """
    # Use the image as a context manager so the underlying file handle is
    # closed as soon as OCR is done instead of lingering until garbage
    # collection.
    with Image.open(image_path) as image:
        raw_text = pytesseract.image_to_string(image)

    # Split on blank lines to find paragraphs, unwrap the hard line breaks
    # inside each one, then restore the blank-line separators.
    paragraphs = raw_text.strip().split("\n\n")
    return "\n\n".join(paragraph.replace("\n", " ") for paragraph in paragraphs)
def save_text_to_pdf(text, pdf_path):
    """Write text to an A4 PDF with book-like formatting.

    The page uses 20-unit margins on the left, top and right, automatic
    page breaks, 10 pt Times, and justified full-width lines of height 5.

    Args:
        text: The text to typeset. Characters that the Latin-1 encoding
            cannot represent are mapped to an ASCII look-alike where one
            is listed below, otherwise replaced by "?".
        pdf_path: Destination path handed to ``FPDF.output``.
    """
    # "Times" is one of FPDF's built-in fonts, which only cover Latin-1.
    # OCR output routinely contains typographic quotes, dashes and
    # ellipses outside that range, and FPDF raises an encoding error on
    # them. Map the common ones to ASCII first, then replace anything
    # still unencodable so the PDF is always produced.
    typographic_to_ascii = str.maketrans({
        "\u2018": "'",
        "\u2019": "'",
        "\u201c": '"',
        "\u201d": '"',
        "\u2013": "-",
        "\u2014": "-",
        "\u2026": "...",
    })
    printable = (
        text.translate(typographic_to_ascii)
        .encode("latin-1", "replace")
        .decode("latin-1")
    )

    pdf = FPDF(format='A4')
    pdf.set_margins(20, 20, 20)
    pdf.set_auto_page_break(auto=True)
    pdf.add_page()
    pdf.set_font("Times", size=10)
    # Width 0 lets the cell extend to the right margin.
    pdf.multi_cell(0, 5, printable, align="J")
    pdf.output(pdf_path)
# Input screenshot and destination PDF for this run.
image_path = '5922636369790880076.jpg'
pdf_path = 'output_book_style.pdf'

# OCR the screenshot into continuous paragraphs, then typeset them.
text = extract_and_clean_text(image_path)
save_text_to_pdf(text, pdf_path)

print(f"PDF saved to {pdf_path}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment