Skip to content

Instantly share code, notes, and snippets.

@me-suzy
Created May 14, 2025 20:39
Show Gist options
  • Save me-suzy/63f3d138219b0a6a9e4fbbf603f1fcde to your computer and use it in GitHub Desktop.
Save me-suzy/63f3d138219b0a6a9e4fbbf603f1fcde to your computer and use it in GitHub Desktop.
processor_layout-parser 2
from google.cloud import documentai
from google.api_core.client_options import ClientOptions
from google.api_core import exceptions
from docx import Document
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
import os
def perform_ocr_with_documentai(image_path, project_id, location, processor_id):
    """Send one file to a Document AI processor and return tagged text.

    The layout blocks of the processed document are walked in order: the
    first text block is emitted under a ``[TITLE]`` marker, the second under
    a ``[PAGE]`` marker and every later text block under ``[BODY]``. Image
    blocks are collected separately.

    Args:
        image_path: Path of the file to process. Its extension selects the
            MIME type sent to the service; unknown extensions fall back to
            ``image/jpeg``, which is what this function always sent before.
        project_id: Google Cloud project that owns the processor.
        location: Processor region; also used to build the API endpoint
            ``{location}-documentai.googleapis.com``.
        processor_id: Identifier of the processor to call.

    Returns:
        A ``(extracted_text, image_blocks)`` tuple. ``extracted_text`` is the
        marker-tagged string described above; ``image_blocks`` is a list of
        dicts with the keys ``"page"`` and ``"coordinates"``.

    Raises:
        google.api_core.exceptions.GoogleAPIError: Re-raised (after being
            printed) when the processor cannot be fetched.
        OSError: If ``image_path`` cannot be opened or read.
    """
    # NOTE(review): confirm the processor accepts each of these types.
    mime_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".bmp": "image/bmp",
        ".webp": "image/webp",
        ".tif": "image/tiff",
        ".tiff": "image/tiff",
        ".pdf": "application/pdf",
    }

    print(f"Încep procesarea imaginii: {image_path}")
    client_options = ClientOptions(
        api_endpoint=f"{location}-documentai.googleapis.com"
    )
    client = documentai.DocumentProcessorServiceClient(
        client_options=client_options
    )
    name = client.processor_path(project_id, location, processor_id)
    print(f"Client configurat pentru {name}")

    # Fail early, with a readable message, if the processor is unreachable.
    try:
        client.get_processor(name=name)
    except exceptions.GoogleAPIError as e:
        print(f"Eroare la accesarea procesorului: {str(e)}")
        raise
    else:
        print(f"Procesorul {processor_id} este accesibil.")

    with open(image_path, "rb") as image_file:
        image_content = image_file.read()

    extension = os.path.splitext(image_path)[1].lower()
    mime_type = mime_types.get(extension, "image/jpeg")
    raw_document = documentai.RawDocument(
        content=image_content, mime_type=mime_type
    )
    request = documentai.ProcessRequest(name=name, raw_document=raw_document)

    print("Procesare document...")
    result = client.process_document(request=request)
    document = result.document
    print("Document procesat cu succes.")

    header_lines = ["[TITLE]"]
    body_lines = ["[BODY]"]
    image_blocks = []
    text_blocks_seen = 0

    # NOTE(review): confirm that layout blocks expose `type_`, `layout` and
    # `page_span.page` in the installed google-cloud-documentai version; the
    # published DocumentLayoutBlock schema may name these differently
    # (for example `page_span.page_start`). The accesses are left unchanged.
    for block in document.document_layout.blocks:
        if block.type_ == "text_block":
            block_text = block.text_block.text
            if text_blocks_seen == 0:
                # First text block is treated as the title.
                header_lines.append(block_text)
            elif text_blocks_seen == 1:
                # Second text block is treated as the page number.
                header_lines.append("[PAGE]")
                header_lines.append(block_text)
            else:
                body_lines.append(block_text)
            text_blocks_seen += 1
        elif block.type_ == "image_block":
            image_blocks.append({
                "page": block.page_span.page,
                "coordinates": block.layout.bounding_poly.normalized_vertices,
            })

    # Every entry, including the last one, is terminated by a newline.
    extracted_text = "\n".join(header_lines + body_lines) + "\n"
    print(f"Text extras: {extracted_text}")
    return extracted_text, image_blocks
def create_docx(output_path, text, image_path, image_blocks):
    """Write the tagged OCR text and the source image to a Word document.

    Args:
        output_path: Destination path of the ``.docx`` file.
        text: Marker-tagged text. A line containing ``[TITLE]``, ``[PAGE]``
            or ``[BODY]`` starts the corresponding section; the non-empty
            lines that follow belong to that section.
        image_path: Path of the original image, embedded at the end of the
            document for reference.
        image_blocks: List of dicts with the keys ``"page"`` and
            ``"coordinates"`` (an iterable of objects with ``x`` and ``y``
            attributes). May be empty.
    """
    # Imported locally: `Pt`, `Inches` and the alignment enum are used below
    # but are not provided by the module-level imports.
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.shared import Inches, Pt

    doc = Document()
    print(f"Creare fișier .docx la: {output_path}")

    # Split the tagged text into its three sections. Markers are checked in
    # this order, and a marker line resets its section to whatever text
    # shares the line with the marker.
    markers = {"[TITLE]": "title", "[PAGE]": "page", "[BODY]": "body"}
    sections = {"title": [], "page": [], "body": []}
    current_section = None
    for line in text.split("\n"):
        marker = next((m for m in markers if m in line), None)
        if marker is not None:
            current_section = markers[marker]
            remainder = line.replace(marker, "").strip()
            sections[current_section] = [remainder] if remainder else []
        elif current_section is not None and line.strip():
            sections[current_section].append(line.strip())

    # Joining the collected lines avoids the stray leading space that plain
    # string concatenation produced for the title and page number.
    title = " ".join(sections["title"])
    page = " ".join(sections["page"])

    if title:
        paragraph = doc.add_paragraph()
        run = paragraph.add_run(title)
        run.bold = True
        run.font.size = Pt(14)
        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT

    if page:
        paragraph = doc.add_paragraph()
        run = paragraph.add_run(page)
        run.italic = True
        run.font.size = Pt(12)
        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT

    # One Word paragraph per non-empty body line.
    for body_line in sections["body"]:
        paragraph = doc.add_paragraph(body_line)
        paragraph.style = doc.styles["Normal"]
        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT

    if image_blocks:
        paragraph = doc.add_paragraph(
            "Image Placeholders (coordinates from Layout Parser):"
        )
        for i, block in enumerate(image_blocks):
            coords = [(v.x, v.y) for v in block["coordinates"]]
            paragraph.add_run(
                f"\nImage {i+1}: Page {block['page']}, Coordinates: {coords}"
            )

    doc.add_paragraph("Original Image for Reference:")
    doc.add_picture(image_path, width=Inches(6.0))
    doc.save(output_path)
    print("Fișier .docx salvat.")
def create_pdf(output_path, text, image_path, image_blocks):
    """Write the tagged OCR text and the source image to an A4 PDF.

    Text is wrapped to the printable width and flows onto new pages as
    needed. The original image is placed below the text, on a fresh page
    when the remaining space is too small, and keeps its aspect ratio.

    Args:
        output_path: Destination path of the ``.pdf`` file.
        text: Marker-tagged text. A line containing ``[TITLE]``, ``[PAGE]``
            or ``[BODY]`` starts the corresponding section; the non-empty
            lines that follow belong to that section.
        image_path: Path of the original image, drawn for reference.
        image_blocks: List of dicts with the keys ``"page"`` and
            ``"coordinates"`` (an iterable of objects with ``x`` and ``y``
            attributes). May be empty.
    """
    # Imported locally: not provided by the module-level imports.
    from reportlab.lib.utils import simpleSplit

    page_width, page_height = A4
    margin = 40
    text_width = page_width - 2 * margin
    top_y = page_height - margin
    image_box_width = 515
    image_box_height = 400
    line_leading = 14

    pdf = canvas.Canvas(output_path, pagesize=A4)
    print(f"Creare fișier .pdf la: {output_path}")

    # Split the tagged text into its three sections. Markers are checked in
    # this order, and a marker line resets its section to whatever text
    # shares the line with the marker.
    markers = {"[TITLE]": "title", "[PAGE]": "page", "[BODY]": "body"}
    sections = {"title": [], "page": [], "body": []}
    current_section = None
    for line in text.split("\n"):
        marker = next((m for m in markers if m in line), None)
        if marker is not None:
            current_section = markers[marker]
            remainder = line.replace(marker, "").strip()
            sections[current_section] = [remainder] if remainder else []
        elif current_section is not None and line.strip():
            sections[current_section].append(line.strip())

    title = " ".join(sections["title"])
    page = " ".join(sections["page"])

    cursor_y = top_y  # baseline of the next line to be drawn

    def new_page():
        """Finish the current page and move the cursor to the top margin."""
        nonlocal cursor_y
        pdf.showPage()
        cursor_y = top_y

    def write(content, font_name, font_size, leading):
        """Draw `content` wrapped to the text width, paginating as needed."""
        nonlocal cursor_y
        for piece in simpleSplit(content, font_name, font_size, text_width):
            if cursor_y < margin:
                new_page()
            # showPage() resets the canvas state, so the font is applied
            # for every line instead of once per section.
            pdf.setFont(font_name, font_size)
            pdf.drawString(margin, cursor_y, piece)
            cursor_y -= leading

    # NOTE(review): the built-in Times fonts presumably lack glyphs for some
    # Romanian diacritics; confirm the rendering or register a TTF font.
    if title:
        write(title, "Times-Bold", 14, 20)
    if page:
        write(page, "Times-Italic", 12, 20)
    for body_line in sections["body"]:
        write(body_line, "Times-Roman", 12, line_leading)

    if image_blocks:
        cursor_y -= 6  # small gap between the body and the placeholder list
        write(
            "Image Placeholders (coordinates from Layout Parser):",
            "Times-Roman", 12, line_leading,
        )
        for i, block in enumerate(image_blocks):
            coords = [(v.x, v.y) for v in block["coordinates"]]
            write(
                f"Image {i+1}: Page {block['page']}, Coordinates: {coords}",
                "Times-Roman", 12, line_leading,
            )

    # Keep the caption and the image together and clear of the text above.
    if cursor_y - line_leading - image_box_height < margin:
        new_page()
    write("Original Image for Reference:", "Times-Roman", 12, line_leading)
    pdf.drawImage(
        image_path,
        margin,
        cursor_y - image_box_height,  # drawImage takes the lower-left corner
        width=image_box_width,
        height=image_box_height,
        preserveAspectRatio=True,
        anchor="nw",
    )
    pdf.save()
    print("Fișier .pdf salvat.")
def process_images(
    input_dir=r"e:\De pus pe FTP 2\Test",
    project_id="bebe-1s084",
    location="eu",
    processor_id="6a0sds3f52a2640a2bc",
    json_path=r"e:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\Pictory.AI - Text to video\bebe-1084-36a8dc3e8b5e.json",
    extensions=(".jpg",),
):
    """OCR every matching image in a directory and write .docx/.pdf copies.

    Each matching file is sent through ``perform_ocr_with_documentai`` and
    the result is written next to it with ``create_docx`` and ``create_pdf``.
    Calling the function without arguments uses the values that were
    previously hard-coded.

    Args:
        input_dir: Directory that is scanned (non-recursively) for images
            and that receives the generated files.
        project_id: Google Cloud project that owns the processor.
        location: Processor region.
        processor_id: Identifier of the Document AI processor.
        json_path: Service-account key file; its path is exported through
            the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable.
        extensions: File-name suffixes to process, matched
            case-insensitively.

    Raises:
        FileNotFoundError: If ``json_path`` does not exist.
    """
    print("Încep procesarea imaginilor...")

    if not os.path.exists(json_path):
        raise FileNotFoundError(f"Fișierul JSON nu a fost găsit la: {json_path}")
    print(f"Fișier JSON găsit la: {json_path}")
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = json_path

    if not os.path.exists(input_dir):
        print(f"Directorul {input_dir} nu există. Verifică calea!")
        return

    suffixes = tuple(ext.lower() for ext in extensions)
    # Sorted so that the processing order does not depend on the filesystem.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.lower().endswith(suffixes):
            continue

        image_path = os.path.join(input_dir, filename)
        print(f"Procesare fișier: {image_path}")
        # Deliberately broad: one failing file is reported and skipped so
        # the rest of the batch still gets processed.
        try:
            text, image_blocks = perform_ocr_with_documentai(
                image_path, project_id, location, processor_id
            )
            print(f"Text și layout extras din {filename}:\n{text}\n")
            print(f"Blocuri de imagini detectate: {image_blocks}\n")
            stem = os.path.splitext(filename)[0]
            docx_output = os.path.join(input_dir, stem + '.docx')
            pdf_output = os.path.join(input_dir, stem + '.pdf')
            create_docx(docx_output, text, image_path, image_blocks)
            create_pdf(pdf_output, text, image_path, image_blocks)
            print(f"Procesat {filename} -> {os.path.basename(docx_output)} și {os.path.basename(pdf_output)}")
        except Exception as e:
            print(f"Eroare la procesarea {filename}: {str(e)}")

    print("Procesare finalizată.")
# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    process_images()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment