Created
May 14, 2025 20:39
-
-
Save me-suzy/63f3d138219b0a6a9e4fbbf603f1fcde to your computer and use it in GitHub Desktop.
processor_layout-parser 2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Pt
from google.api_core import exceptions
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from reportlab.lib.pagesizes import A4
from reportlab.lib.utils import simpleSplit
from reportlab.pdfgen import canvas
def perform_ocr_with_documentai(image_path, project_id, location, processor_id):
    """Run a Document AI processor on one image and tag the extracted text.

    The file is always submitted with MIME type ``image/jpeg``. Text blocks
    are taken in document order: the first one is written under ``[TITLE]``,
    the second under ``[PAGE]`` and every later one under ``[BODY]``.

    Args:
        image_path: Path of the image file to read and submit.
        project_id: Google Cloud project that owns the processor.
        location: Processor region; also used to build the API endpoint.
        processor_id: Identifier of the Document AI processor.

    Returns:
        A ``(extracted_text, image_blocks)`` tuple. ``extracted_text`` is the
        marker-tagged string described above; ``image_blocks`` is a list of
        dicts with ``"page"`` and ``"coordinates"`` keys.

    Raises:
        google.api_core.exceptions.GoogleAPIError: If the processor cannot be
            fetched (logged, then re-raised) or the processing call fails.
        OSError: If ``image_path`` cannot be opened.
    """
    print(f"Încep procesarea imaginii: {image_path}")
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)
    name = client.processor_path(project_id, location, processor_id)
    print(f"Client configurat pentru {name}")

    # Fail early with a clear message if the processor is not reachable.
    try:
        client.get_processor(name=name)
    except exceptions.GoogleAPIError as e:
        print(f"Eroare la accesarea procesorului: {e}")
        raise
    print(f"Procesorul {processor_id} este accesibil.")

    with open(image_path, "rb") as image_file:
        image_content = image_file.read()

    raw_document = documentai.RawDocument(content=image_content, mime_type="image/jpeg")
    request = documentai.ProcessRequest(name=name, raw_document=raw_document)
    print("Procesare document...")
    document = client.process_document(request=request).document
    print("Document procesat cu succes.")

    texts = []
    image_blocks = []
    for block in _iter_layout_blocks(document.document_layout.blocks):
        # NOTE(review): per the Document AI DocumentLayout reference, a layout
        # block carries its payload in a oneof (text_block, table_block,
        # list_block, ...) and has no top-level ``type_`` field, so the payload
        # itself is inspected here. Confirm against the installed client.
        text = block.text_block.text
        if text:
            texts.append(text)
            continue

        # ``image_block`` and ``bounding_box`` only exist in newer client
        # versions; getattr keeps older versions from raising AttributeError.
        if getattr(block, "image_block", None):
            bounding_box = getattr(block, "bounding_box", None)
            image_blocks.append({
                # NOTE(review): the page span exposes page_start/page_end
                # rather than a single ``page`` field - confirm.
                "page": block.page_span.page_start,
                "coordinates": (
                    list(bounding_box.normalized_vertices)
                    if bounding_box is not None
                    else []
                ),
            })

    # Same tagged layout the docx/pdf writers parse: title, page, then body.
    parts = ["[TITLE]\n"]
    if texts:
        parts.append(texts[0] + "\n")
    if len(texts) > 1:
        parts.append("[PAGE]\n" + texts[1] + "\n")
    parts.append("[BODY]\n")
    parts.extend(body_text + "\n" for body_text in texts[2:])
    extracted_text = "".join(parts)

    print(f"Text extras: {extracted_text}")
    return extracted_text, image_blocks


def _iter_layout_blocks(blocks):
    """Yield layout blocks depth-first, each parent before its nested blocks."""
    for block in blocks:
        yield block
        # NOTE(review): text blocks (e.g. headings) may nest child blocks in
        # ``text_block.blocks``; blocks nested inside tables or lists are not
        # visited here.
        yield from _iter_layout_blocks(block.text_block.blocks)
def create_docx(output_path, text, image_path, image_blocks):
    """Write the tagged OCR text and the source image into a .docx file.

    Args:
        output_path: Destination path handed to ``Document.save``.
        text: String containing ``[TITLE]``, ``[PAGE]`` and ``[BODY]`` marker
            lines; non-empty lines after a marker belong to that section.
        image_path: Path of the original image, embedded at a width of 6 in.
        image_blocks: Sequence of dicts with ``"page"`` and ``"coordinates"``
            keys; every coordinate must expose ``x`` and ``y`` attributes.
    """
    doc = Document()
    print(f"Creare fișier .docx la: {output_path}")

    # Split the tagged text into its three sections. A marker line switches
    # the current section and restarts it with whatever else is on the line.
    sections = {"title": [], "page": [], "body": []}
    markers = (("[TITLE]", "title"), ("[PAGE]", "page"), ("[BODY]", "body"))
    current_section = None
    for line in text.split("\n"):
        for marker, section in markers:
            if marker in line:
                current_section = section
                remainder = line.replace(marker, "").strip()
                sections[section] = [remainder] if remainder else []
                break
        else:
            stripped = line.strip()
            if current_section is not None and stripped:
                sections[current_section].append(stripped)

    # Joining the collected parts avoids the stray leading space the previous
    # string concatenation put in front of the title and page text.
    title = " ".join(sections["title"])
    page = " ".join(sections["page"])

    if title:
        p = doc.add_paragraph()
        run = p.add_run(title)
        run.bold = True
        run.font.size = Pt(14)
        p.alignment = WD_ALIGN_PARAGRAPH.LEFT
    if page:
        p = doc.add_paragraph()
        run = p.add_run(page)
        run.italic = True
        run.font.size = Pt(12)
        p.alignment = WD_ALIGN_PARAGRAPH.LEFT
    for para in sections["body"]:
        p = doc.add_paragraph(para)
        p.style = doc.styles['Normal']
        p.alignment = WD_ALIGN_PARAGRAPH.LEFT

    if image_blocks:
        p = doc.add_paragraph("Image Placeholders (coordinates from Layout Parser):")
        for i, block in enumerate(image_blocks):
            coords = [(v.x, v.y) for v in block['coordinates']]
            p.add_run(f"\nImage {i+1}: Page {block['page']}, Coordinates: {coords}")

    # NOTE(review): the original indentation was lost; the reference image is
    # assumed to be added for every document, not only when image blocks exist.
    doc.add_paragraph("Original Image for Reference:")
    doc.add_picture(image_path, width=Inches(6.0))
    doc.save(output_path)
    print("Fișier .docx salvat.")
def create_pdf(output_path, text, image_path, image_blocks):
    """Write the tagged OCR text and the source image into an A4 PDF file.

    Args:
        output_path: Destination path of the PDF.
        text: String containing ``[TITLE]``, ``[PAGE]`` and ``[BODY]`` marker
            lines; non-empty lines after a marker belong to that section.
        image_path: Path of the original image, drawn inside a 515x400 pt box.
        image_blocks: Sequence of dicts with ``"page"`` and ``"coordinates"``
            keys; every coordinate must expose ``x`` and ``y`` attributes.
    """
    c = canvas.Canvas(output_path, pagesize=A4)
    width, height = A4
    print(f"Creare fișier .pdf la: {output_path}")

    # Split the tagged text into its three sections. A marker line switches
    # the current section and restarts it with whatever else is on the line.
    sections = {"title": [], "page": [], "body": []}
    markers = (("[TITLE]", "title"), ("[PAGE]", "page"), ("[BODY]", "body"))
    current_section = None
    for line in text.split("\n"):
        for marker, section in markers:
            if marker in line:
                current_section = section
                remainder = line.replace(marker, "").strip()
                sections[section] = [remainder] if remainder else []
                break
        else:
            stripped = line.strip()
            if current_section is not None and stripped:
                sections[current_section].append(stripped)
    title = " ".join(sections["title"])
    page = " ".join(sections["page"])

    left_margin = 40
    top_y = height - 40
    bottom_y = 40
    max_width = width - 2 * left_margin
    text_y = top_y

    def draw_wrapped(content, font_name, font_size, leading):
        """Draw content at the cursor, wrapping lines and breaking pages."""
        nonlocal text_y
        c.setFont(font_name, font_size)
        for wrapped in simpleSplit(content, font_name, font_size, max_width):
            if text_y < bottom_y:
                c.showPage()
                # showPage() resets the canvas state, so the font has to be
                # selected again on the new page.
                c.setFont(font_name, font_size)
                text_y = top_y
            c.drawString(left_margin, text_y, wrapped)
            text_y -= leading

    # NOTE(review): the built-in Times fonts are not full Unicode fonts, so
    # some Romanian diacritics may not render - confirm and register a
    # TrueType font if needed.
    if title:
        draw_wrapped(title, "Times-Bold", 14, 20)
    if page:
        draw_wrapped(page, "Times-Italic", 12, 20)
    for para in sections["body"]:
        draw_wrapped(para, "Times-Roman", 12, 14)

    if image_blocks:
        # Leave a gap, then advance the cursor after the header so the first
        # entry is no longer drawn on top of it.
        text_y -= 20
        draw_wrapped("Image Placeholders (coordinates from Layout Parser):", "Times-Roman", 12, 14)
        for i, block in enumerate(image_blocks):
            coords = [(v.x, v.y) for v in block['coordinates']]
            draw_wrapped(f"Image {i+1}: Page {block['page']}, Coordinates: {coords}", "Times-Roman", 12, 14)

    # NOTE(review): the original indentation was lost; the reference image is
    # assumed to be drawn for every document, not only when image blocks exist.
    image_x, image_y, image_width, image_height = 40, 100, 515, 400
    caption_y = text_y - 20
    if caption_y < image_y + image_height + 14:
        # The fixed image slot would cover text already on this page.
        c.showPage()
        caption_y = top_y
    c.setFont("Times-Roman", 12)
    c.drawString(left_margin, caption_y, "Original Image for Reference:")
    # Keep the scan's proportions instead of stretching it to fill the box.
    c.drawImage(
        image_path,
        image_x,
        image_y,
        width=image_width,
        height=image_height,
        preserveAspectRatio=True,
    )
    c.save()
    print("Fișier .pdf salvat.")
# Defaults preserved from the original hard-coded configuration.
# NOTE(review): machine-specific paths and project identifiers are embedded
# in source; consider moving them to configuration.
_DEFAULT_INPUT_DIR = r"e:\De pus pe FTP 2\Test"
_DEFAULT_PROJECT_ID = "bebe-1s084"
_DEFAULT_LOCATION = "eu"
_DEFAULT_PROCESSOR_ID = "6a0sds3f52a2640a2bc"
_DEFAULT_JSON_PATH = r"e:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\Pictory.AI - Text to video\bebe-1084-36a8dc3e8b5e.json"


def process_images(
    *,
    input_dir=_DEFAULT_INPUT_DIR,
    project_id=_DEFAULT_PROJECT_ID,
    location=_DEFAULT_LOCATION,
    processor_id=_DEFAULT_PROCESSOR_ID,
    json_path=_DEFAULT_JSON_PATH,
):
    """OCR every JPEG in a directory and write a .docx and .pdf next to it.

    Called without arguments it behaves as before; the keyword arguments
    allow the previously hard-coded configuration to be overridden.

    Args:
        input_dir: Directory scanned (non-recursively) for ``.jpg``/``.jpeg``
            files; the outputs are written into the same directory.
        project_id: Google Cloud project passed to the OCR step.
        location: Processor region passed to the OCR step.
        processor_id: Processor identifier passed to the OCR step.
        json_path: Credentials file; its path is exported through the
            ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable.

    Raises:
        FileNotFoundError: If ``json_path`` does not exist.
    """
    print("Încep procesarea imaginilor...")

    if not os.path.exists(json_path):
        raise FileNotFoundError(f"Fișierul JSON nu a fost găsit la: {json_path}")
    print(f"Fișier JSON găsit la: {json_path}")
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = json_path

    if not os.path.exists(input_dir):
        print(f"Directorul {input_dir} nu există. Verifică calea!")
        return

    # Sorted so the processing order does not depend on the file system.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.lower().endswith((".jpg", ".jpeg")):
            continue

        image_path = os.path.join(input_dir, filename)
        stem = os.path.splitext(filename)[0]
        print(f"Procesare fișier: {image_path}")
        # Deliberately broad: one failing image must not abort the batch.
        try:
            text, image_blocks = perform_ocr_with_documentai(image_path, project_id, location, processor_id)
            print(f"Text și layout extras din {filename}:\n{text}\n")
            print(f"Blocuri de imagini detectate: {image_blocks}\n")
            docx_output = os.path.join(input_dir, stem + '.docx')
            pdf_output = os.path.join(input_dir, stem + '.pdf')
            create_docx(docx_output, text, image_path, image_blocks)
            create_pdf(pdf_output, text, image_path, image_blocks)
            print(f"Procesat {filename} -> {os.path.basename(docx_output)} și {os.path.basename(pdf_output)}")
        except Exception as e:
            print(f"Eroare la procesarea {filename}: {e}")

    print("Procesare finalizată.")
if __name__ == "__main__":
    # Run the batch conversion only when executed as a script, not on import.
    process_images()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment