Skip to content

Instantly share code, notes, and snippets.

@Asif-Iqbal-Bhatti
Created November 29, 2024 15:07
Show Gist options
  • Save Asif-Iqbal-Bhatti/10e231b3a124e10881eca8956e739c2d to your computer and use it in GitHub Desktop.
Save Asif-Iqbal-Bhatti/10e231b3a124e10881eca8956e739c2d to your computer and use it in GitHub Desktop.
get_text_from_screenshot
from PIL import Image
import pytesseract
from fpdf import FPDF
def extract_and_clean_text(image_path):
    """Run OCR on an image and return its text with line wraps removed.

    Line breaks inside a paragraph are replaced by single spaces so the
    text can be re-flowed later; paragraphs stay separated by one blank
    line.

    Args:
        image_path: Path of the image file handed to ``Image.open``.

    Returns:
        The recognised text, paragraphs joined by ``"\\n\\n"``.
    """
    import re  # local import keeps this function self-contained

    # Open through a context manager so the underlying file handle is
    # released as soon as OCR is done instead of lingering until GC.
    with Image.open(image_path) as image:
        raw_text = pytesseract.image_to_string(image)

    # A paragraph break is any run of blank lines, including "blank" lines
    # that contain only spaces; splitting on the exact string "\n\n" would
    # leave stray leading spaces or empty paragraphs in those cases.
    paragraphs = re.split(r"\n\s*\n", raw_text.strip())
    return "\n\n".join(
        paragraph.replace("\n", " ") for paragraph in paragraphs
    )
# Function to save text to a PDF with book-like formatting
def save_text_to_pdf(text, pdf_path):
    """Write *text* to an A4 PDF set in justified 10 pt Times.

    Args:
        text: The text to typeset.
        pdf_path: Destination passed to ``FPDF.output``.

    Characters the built-in Times font cannot encode (anything outside
    Latin-1) are first mapped to a plain equivalent where one is listed
    below, otherwise replaced by ``?``, so unusual OCR output cannot abort
    the export.
    """
    margin = 20  # same margin on all four sides, in the FPDF instance's unit

    # The core "Times" font is limited to Latin-1. OCR output commonly
    # contains typographic punctuation and ligatures, so fold the usual
    # suspects to ASCII before falling back to "?" for everything else.
    typographic = str.maketrans({
        "\u2018": "'",
        "\u2019": "'",
        "\u201c": '"',
        "\u201d": '"',
        "\u2013": "-",
        "\u2014": "-",
        "\u2026": "...",
        "\ufb01": "fi",
        "\ufb02": "fl",
    })
    printable = (
        text.translate(typographic)
        .encode("latin-1", errors="replace")
        .decode("latin-1")
    )

    pdf = FPDF(format='A4')
    pdf.set_margins(margin, margin, margin)
    # set_margins covers left/top/right only; the bottom margin comes from
    # set_auto_page_break, whose margin defaults to 0 when not given.
    pdf.set_auto_page_break(auto=True, margin=margin)
    pdf.add_page()
    pdf.set_font("Times", size=10)
    # Width 0 extends each cell to the right margin; 5 is the line height.
    pdf.multi_cell(0, 5, printable, align="J")
    pdf.output(pdf_path)
# Input screenshot and destination PDF for this run
image_path, pdf_path = '5922636369790880076.jpg', 'output_book_style.pdf'

# OCR the screenshot, then typeset the recovered text as a PDF
text = extract_and_clean_text(image_path)
save_text_to_pdf(text, pdf_path)

print(f"PDF saved to {pdf_path}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment