me-suzy · May 14, 2025 20:39
diff --git a/processor_layout-parser 2.py b/processor_layout-parser 2.py
 from google.cloud import documentai
 from google.api_core.client_options import ClientOptions
 from google.api_core import exceptions
 from docx import Document
 from reportlab.lib.pagesizes import A4
 from reportlab.pdfgen import canvas
 import os

 def perform_ocr_with_documentai(image_path, project_id, location, processor_id):
    """Run one image through a Document AI processor and tag the returned text.

    The file at ``image_path`` is read as bytes and submitted with the MIME
    type ``image/jpeg`` to the processor addressed by ``project_id``,
    ``location`` and ``processor_id``.

    Returns:
        A ``(text, image_blocks)`` tuple. ``text`` is one string that starts
        with a ``[TITLE]`` marker line followed by the first text block, then
        (if a second text block exists) a ``[PAGE]`` marker and that block,
        then a ``[BODY]`` marker and every remaining text block.
        ``image_blocks`` is a list of dicts with ``"page"`` and
        ``"coordinates"`` keys, one per image block.

    Raises:
        google.api_core.exceptions.GoogleAPIError: re-raised (after being
            printed) when the processor lookup fails. Errors from reading the
            file or from the processing call propagate unchanged.
    """
    print(f"Încep procesarea imaginii: {image_path}")
    endpoint_options = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=endpoint_options)

    processor_name = client.processor_path(project_id, location, processor_id)
    print(f"Client configurat pentru {processor_name}")

    # Look the processor up first so that a wrong id or missing permission is
    # reported before the image is read and uploaded.
    try:
        client.get_processor(name=processor_name)
    except exceptions.GoogleAPIError as api_error:
        print(f"Eroare la accesarea procesorului: {api_error}")
        raise
    else:
        print(f"Procesorul {processor_id} este accesibil.")

    with open(image_path, "rb") as image_file:
        payload = image_file.read()

    request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=documentai.RawDocument(content=payload, mime_type="image/jpeg"),
    )

    print("Procesare document...")
    document = client.process_document(request=request).document
    print("Document procesat cu succes.")

    header_parts = ["[TITLE]\n"]
    body_parts = ["[BODY]\n"]
    image_blocks = []
    text_blocks_seen = 0

    # NOTE(review): this relies on each layout block exposing `type_`,
    # `page_span.page` and `layout.bounding_poly` — TODO confirm against the
    # Document AI `DocumentLayoutBlock` reference for the installed version.
    for block in document.document_layout.blocks:
        kind = block.type_
        if kind == "image_block":
            image_blocks.append({
                "page": block.page_span.page,
                "coordinates": block.layout.bounding_poly.normalized_vertices
            })
            continue
        if kind != "text_block":
            continue

        # Position decides the role: 1st text block -> title, 2nd -> page
        # number, everything after that -> body.
        block_text = block.text_block.text
        if text_blocks_seen == 0:
            header_parts.append(block_text + "\n")
        elif text_blocks_seen == 1:
            header_parts.append("[PAGE]\n" + block_text + "\n")
        else:
            body_parts.append(block_text + "\n")
        text_blocks_seen += 1

    extracted_text = "".join(header_parts + body_parts)
    print(f"Text extras: {extracted_text}")
    return extracted_text, image_blocks

 def create_docx(output_path, text, image_path, image_blocks):
    """Write the tagged OCR text, image placeholders and source image to a .docx.

    Args:
        output_path: Destination path of the Word document.
        text: String containing ``[TITLE]``, ``[PAGE]`` and ``[BODY]`` marker
            lines; the non-empty lines after a marker belong to that section.
            Title and page lines are joined with spaces, body lines become
            separate paragraphs.
        image_path: Path of the original image, embedded at the end.
        image_blocks: Sequence of dicts with a ``"page"`` value and a
            ``"coordinates"`` iterable of objects exposing ``x`` and ``y``.
    """
    # Local imports: the visible module-level imports only bring in
    # ``Document``, so ``Pt``/``Inches`` were undefined and the function
    # failed with NameError as soon as a title, page or picture was written.
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.shared import Inches, Pt

    doc = Document()
    print(f"Creare fișier .docx la: {output_path}")

    sections = {"title": "", "page": "", "body": ""}
    markers = (("[TITLE]", "title"), ("[PAGE]", "page"), ("[BODY]", "body"))
    current = None
    for raw_line in text.split('\n'):
        for marker, key in markers:
            if marker in raw_line:
                # A marker line switches section and restarts its content.
                current = key
                sections[key] = raw_line.replace(marker, "").strip()
                break
        else:
            content = raw_line.strip()
            if current is None or not content:
                continue
            joiner = "\n" if current == "body" else " "
            previous = sections[current]
            # Only insert the joiner between pieces, so the title/page text
            # no longer starts with a stray space.
            sections[current] = f"{previous}{joiner}{content}" if previous else content

    if sections["title"]:
        paragraph = doc.add_paragraph()
        run = paragraph.add_run(sections["title"])
        run.bold = True
        run.font.size = Pt(14)
        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT

    if sections["page"]:
        paragraph = doc.add_paragraph()
        run = paragraph.add_run(sections["page"])
        run.italic = True
        run.font.size = Pt(12)
        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT

    for para in sections["body"].split('\n'):
        if para.strip():
            paragraph = doc.add_paragraph(para)
            paragraph.style = doc.styles['Normal']
            paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT

    if image_blocks:
        placeholders = doc.add_paragraph("Image Placeholders (coordinates from Layout Parser):")
        for index, block in enumerate(image_blocks, start=1):
            coords = [(v.x, v.y) for v in block['coordinates']]
            placeholders.add_run(f"\nImage {index}: Page {block['page']}, Coordinates: {coords}")

    doc.add_paragraph("Original Image for Reference:")
    doc.add_picture(image_path, width=Inches(6.0))
    doc.save(output_path)
    print("Fișier .docx salvat.")

 def create_pdf(output_path, text, image_path, image_blocks):
    """Write the tagged OCR text, image placeholders and source image to a PDF.

    Args:
        output_path: Destination path of the PDF file.
        text: String containing ``[TITLE]``, ``[PAGE]`` and ``[BODY]`` marker
            lines; the non-empty lines after a marker belong to that section.
        image_path: Path of the original image, drawn after the text.
        image_blocks: Sequence of dicts with a ``"page"`` value and a
            ``"coordinates"`` iterable of objects exposing ``x`` and ``y``.

    Layout fixes compared with the previous version: every line is wrapped to
    the printable width, the font is re-applied after each page break, the
    placeholder heading no longer shares a baseline with the first image
    line, and the reference image is placed below the text (on a new page if
    it does not fit) with its aspect ratio preserved.

    NOTE(review): the built-in Times fonts may not cover every Romanian
    diacritic — confirm the rendered output and register a TrueType font if
    glyphs are missing.
    """
    # Local import so the block does not depend on a new module-level import.
    from reportlab.lib.utils import simpleSplit

    c = canvas.Canvas(output_path, pagesize=A4)
    page_width, page_height = A4
    print(f"Creare fișier .pdf la: {output_path}")

    margin = 40
    top_y = page_height - margin
    text_width = page_width - 2 * margin
    image_height = 400
    text_y = top_y

    sections = {"title": "", "page": "", "body": ""}
    markers = (("[TITLE]", "title"), ("[PAGE]", "page"), ("[BODY]", "body"))
    current = None
    for raw_line in text.split('\n'):
        for marker, key in markers:
            if marker in raw_line:
                # A marker line switches section and restarts its content.
                current = key
                sections[key] = raw_line.replace(marker, "").strip()
                break
        else:
            content = raw_line.strip()
            if current is None or not content:
                continue
            joiner = "\n" if current == "body" else " "
            previous = sections[current]
            sections[current] = f"{previous}{joiner}{content}" if previous else content

    def write(content, font_name, font_size, leading):
        """Draw ``content`` at the cursor, wrapping and paginating as needed."""
        nonlocal text_y
        for piece in simpleSplit(content, font_name, font_size, text_width):
            if text_y < margin:
                c.showPage()
                text_y = top_y
            # showPage() discards the canvas font state, so set it per line.
            c.setFont(font_name, font_size)
            c.drawString(margin, text_y, piece)
            text_y -= leading

    if sections["title"]:
        write(sections["title"], "Times-Bold", 14, 20)
    if sections["page"]:
        write(sections["page"], "Times-Italic", 12, 20)
    for para in sections["body"].split('\n'):
        if para.strip():
            write(para, "Times-Roman", 12, 14)

    if image_blocks:
        text_y -= 20  # visual gap before the placeholder list
        write("Image Placeholders (coordinates from Layout Parser):", "Times-Roman", 12, 14)
        for index, block in enumerate(image_blocks, start=1):
            coords = [(v.x, v.y) for v in block['coordinates']]
            write(f"Image {index}: Page {block['page']}, Coordinates: {coords}", "Times-Roman", 12, 14)

    # Keep the caption and the picture together: if the 20pt gap, the 14pt
    # caption line and the image box do not fit above the bottom margin,
    # start a fresh page instead of drawing over the text.
    if text_y - 20 - 14 - image_height < margin:
        c.showPage()
        text_y = top_y
    else:
        text_y -= 20
    write("Original Image for Reference:", "Times-Roman", 12, 14)
    c.drawImage(
        image_path,
        margin,
        text_y - image_height,
        width=text_width,
        height=image_height,
        preserveAspectRatio=True,  # avoid stretching the scan to the box
        anchor="n",
    )
    c.save()
    print("Fișier .pdf salvat.")

 def process_images(
    input_dir=r"e:\De pus pe FTP 2\Test",
    project_id="bebe-1s084",
    location="eu",
    processor_id="6a0sds3f52a2640a2bc",
    json_path=r"e:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\Pictory.AI - Text to video\bebe-1084-36a8dc3e8b5e.json",
 ):
    """OCR every JPEG in ``input_dir`` and write a .docx and a .pdf next to it.

    All arguments default to the values that used to be hard-coded, so calling
    ``process_images()`` with no arguments behaves as before.

    Args:
        input_dir: Directory scanned (non-recursively) for ``.jpg``/``.jpeg``
            files; outputs are written into the same directory. Two inputs
            that differ only in extension share the same output names.
        project_id: Google Cloud project id passed to the OCR helper.
        location: Document AI location passed to the OCR helper.
        processor_id: Document AI processor id passed to the OCR helper.
        json_path: Credentials file exported through
            ``GOOGLE_APPLICATION_CREDENTIALS``.

    Raises:
        FileNotFoundError: if ``json_path`` does not exist.
    """
    print("Încep procesarea imaginilor...")

    if not os.path.exists(json_path):
        raise FileNotFoundError(f"Fișierul JSON nu a fost găsit la: {json_path}")
    print(f"Fișier JSON găsit la: {json_path}")
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = json_path

    # isdir (not exists): a regular file at this path would make listdir fail.
    if not os.path.isdir(input_dir):
        print(f"Directorul {input_dir} nu există. Verifică calea!")
        return

    # Sorted for a reproducible processing order; listdir order is arbitrary.
    for filename in sorted(os.listdir(input_dir)):
        # Both spellings are JPEG data, matching the MIME type sent for OCR.
        if not filename.lower().endswith(('.jpg', '.jpeg')):
            continue

        image_path = os.path.join(input_dir, filename)
        print(f"Procesare fișier: {image_path}")
        # Per-file boundary: one failing image is reported and skipped so the
        # rest of the batch still runs.
        try:
            text, image_blocks = perform_ocr_with_documentai(image_path, project_id, location, processor_id)
            print(f"Text și layout extras din {filename}:\n{text}\n")
            print(f"Blocuri de imagini detectate: {image_blocks}\n")

            stem = os.path.splitext(filename)[0]
            docx_output = os.path.join(input_dir, stem + '.docx')
            pdf_output = os.path.join(input_dir, stem + '.pdf')

            create_docx(docx_output, text, image_path, image_blocks)
            create_pdf(pdf_output, text, image_path, image_blocks)
            print(f"Procesat {filename} -> {os.path.basename(docx_output)} și {os.path.basename(pdf_output)}")
        except Exception as e:
            print(f"Eroare la procesarea {filename}: {str(e)}")
    print("Procesare finalizată.")

 # Run the batch conversion only when this file is executed as a script,
 # not when it is imported as a module.
 if __name__ == "__main__":
    process_images()
	from google.cloud import documentai
	from google.api_core.client_options import ClientOptions
	from google.api_core import exceptions
	from docx import Document
	from reportlab.lib.pagesizes import A4
	from reportlab.pdfgen import canvas
	import os

	def perform_ocr_with_documentai(image_path, project_id, location, processor_id):
	    """Run one image through a Document AI processor and tag the returned text.

	    The file at ``image_path`` is read as bytes and submitted with the MIME
	    type ``image/jpeg`` to the processor addressed by ``project_id``,
	    ``location`` and ``processor_id``.

	    Returns:
	        A ``(text, image_blocks)`` tuple. ``text`` is one string that starts
	        with a ``[TITLE]`` marker line followed by the first text block, then
	        (if a second text block exists) a ``[PAGE]`` marker and that block,
	        then a ``[BODY]`` marker and every remaining text block.
	        ``image_blocks`` is a list of dicts with ``"page"`` and
	        ``"coordinates"`` keys, one per image block.

	    Raises:
	        google.api_core.exceptions.GoogleAPIError: re-raised (after being
	            printed) when the processor lookup fails. Errors from reading the
	            file or from the processing call propagate unchanged.
	    """
	    print(f"Încep procesarea imaginii: {image_path}")
	    endpoint_options = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
	    client = documentai.DocumentProcessorServiceClient(client_options=endpoint_options)

	    processor_name = client.processor_path(project_id, location, processor_id)
	    print(f"Client configurat pentru {processor_name}")

	    # Look the processor up first so that a wrong id or missing permission is
	    # reported before the image is read and uploaded.
	    try:
	        client.get_processor(name=processor_name)
	    except exceptions.GoogleAPIError as api_error:
	        print(f"Eroare la accesarea procesorului: {api_error}")
	        raise
	    else:
	        print(f"Procesorul {processor_id} este accesibil.")

	    with open(image_path, "rb") as image_file:
	        payload = image_file.read()

	    request = documentai.ProcessRequest(
	        name=processor_name,
	        raw_document=documentai.RawDocument(content=payload, mime_type="image/jpeg"),
	    )

	    print("Procesare document...")
	    document = client.process_document(request=request).document
	    print("Document procesat cu succes.")

	    header_parts = ["[TITLE]\n"]
	    body_parts = ["[BODY]\n"]
	    image_blocks = []
	    text_blocks_seen = 0

	    # NOTE(review): this relies on each layout block exposing `type_`,
	    # `page_span.page` and `layout.bounding_poly` — TODO confirm against the
	    # Document AI `DocumentLayoutBlock` reference for the installed version.
	    for block in document.document_layout.blocks:
	        kind = block.type_
	        if kind == "image_block":
	            image_blocks.append({
	                "page": block.page_span.page,
	                "coordinates": block.layout.bounding_poly.normalized_vertices
	            })
	            continue
	        if kind != "text_block":
	            continue

	        # Position decides the role: 1st text block -> title, 2nd -> page
	        # number, everything after that -> body.
	        block_text = block.text_block.text
	        if text_blocks_seen == 0:
	            header_parts.append(block_text + "\n")
	        elif text_blocks_seen == 1:
	            header_parts.append("[PAGE]\n" + block_text + "\n")
	        else:
	            body_parts.append(block_text + "\n")
	        text_blocks_seen += 1

	    extracted_text = "".join(header_parts + body_parts)
	    print(f"Text extras: {extracted_text}")
	    return extracted_text, image_blocks

	def create_docx(output_path, text, image_path, image_blocks):
	    """Write the tagged OCR text, image placeholders and source image to a .docx.

	    Args:
	        output_path: Destination path of the Word document.
	        text: String containing ``[TITLE]``, ``[PAGE]`` and ``[BODY]`` marker
	            lines; the non-empty lines after a marker belong to that section.
	            Title and page lines are joined with spaces, body lines become
	            separate paragraphs.
	        image_path: Path of the original image, embedded at the end.
	        image_blocks: Sequence of dicts with a ``"page"`` value and a
	            ``"coordinates"`` iterable of objects exposing ``x`` and ``y``.
	    """
	    # Local imports: the visible module-level imports only bring in
	    # ``Document``, so ``Pt``/``Inches`` were undefined and the function
	    # failed with NameError as soon as a title, page or picture was written.
	    from docx.enum.text import WD_ALIGN_PARAGRAPH
	    from docx.shared import Inches, Pt

	    doc = Document()
	    print(f"Creare fișier .docx la: {output_path}")

	    sections = {"title": "", "page": "", "body": ""}
	    markers = (("[TITLE]", "title"), ("[PAGE]", "page"), ("[BODY]", "body"))
	    current = None
	    for raw_line in text.split('\n'):
	        for marker, key in markers:
	            if marker in raw_line:
	                # A marker line switches section and restarts its content.
	                current = key
	                sections[key] = raw_line.replace(marker, "").strip()
	                break
	        else:
	            content = raw_line.strip()
	            if current is None or not content:
	                continue
	            joiner = "\n" if current == "body" else " "
	            previous = sections[current]
	            # Only insert the joiner between pieces, so the title/page text
	            # no longer starts with a stray space.
	            sections[current] = f"{previous}{joiner}{content}" if previous else content

	    if sections["title"]:
	        paragraph = doc.add_paragraph()
	        run = paragraph.add_run(sections["title"])
	        run.bold = True
	        run.font.size = Pt(14)
	        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT

	    if sections["page"]:
	        paragraph = doc.add_paragraph()
	        run = paragraph.add_run(sections["page"])
	        run.italic = True
	        run.font.size = Pt(12)
	        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT

	    for para in sections["body"].split('\n'):
	        if para.strip():
	            paragraph = doc.add_paragraph(para)
	            paragraph.style = doc.styles['Normal']
	            paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT

	    if image_blocks:
	        placeholders = doc.add_paragraph("Image Placeholders (coordinates from Layout Parser):")
	        for index, block in enumerate(image_blocks, start=1):
	            coords = [(v.x, v.y) for v in block['coordinates']]
	            placeholders.add_run(f"\nImage {index}: Page {block['page']}, Coordinates: {coords}")

	    doc.add_paragraph("Original Image for Reference:")
	    doc.add_picture(image_path, width=Inches(6.0))
	    doc.save(output_path)
	    print("Fișier .docx salvat.")

	def create_pdf(output_path, text, image_path, image_blocks):
	    """Write the tagged OCR text, image placeholders and source image to a PDF.

	    Args:
	        output_path: Destination path of the PDF file.
	        text: String containing ``[TITLE]``, ``[PAGE]`` and ``[BODY]`` marker
	            lines; the non-empty lines after a marker belong to that section.
	        image_path: Path of the original image, drawn after the text.
	        image_blocks: Sequence of dicts with a ``"page"`` value and a
	            ``"coordinates"`` iterable of objects exposing ``x`` and ``y``.

	    Layout fixes compared with the previous version: every line is wrapped to
	    the printable width, the font is re-applied after each page break, the
	    placeholder heading no longer shares a baseline with the first image
	    line, and the reference image is placed below the text (on a new page if
	    it does not fit) with its aspect ratio preserved.

	    NOTE(review): the built-in Times fonts may not cover every Romanian
	    diacritic — confirm the rendered output and register a TrueType font if
	    glyphs are missing.
	    """
	    # Local import so the block does not depend on a new module-level import.
	    from reportlab.lib.utils import simpleSplit

	    c = canvas.Canvas(output_path, pagesize=A4)
	    page_width, page_height = A4
	    print(f"Creare fișier .pdf la: {output_path}")

	    margin = 40
	    top_y = page_height - margin
	    text_width = page_width - 2 * margin
	    image_height = 400
	    text_y = top_y

	    sections = {"title": "", "page": "", "body": ""}
	    markers = (("[TITLE]", "title"), ("[PAGE]", "page"), ("[BODY]", "body"))
	    current = None
	    for raw_line in text.split('\n'):
	        for marker, key in markers:
	            if marker in raw_line:
	                # A marker line switches section and restarts its content.
	                current = key
	                sections[key] = raw_line.replace(marker, "").strip()
	                break
	        else:
	            content = raw_line.strip()
	            if current is None or not content:
	                continue
	            joiner = "\n" if current == "body" else " "
	            previous = sections[current]
	            sections[current] = f"{previous}{joiner}{content}" if previous else content

	    def write(content, font_name, font_size, leading):
	        """Draw ``content`` at the cursor, wrapping and paginating as needed."""
	        nonlocal text_y
	        for piece in simpleSplit(content, font_name, font_size, text_width):
	            if text_y < margin:
	                c.showPage()
	                text_y = top_y
	            # showPage() discards the canvas font state, so set it per line.
	            c.setFont(font_name, font_size)
	            c.drawString(margin, text_y, piece)
	            text_y -= leading

	    if sections["title"]:
	        write(sections["title"], "Times-Bold", 14, 20)
	    if sections["page"]:
	        write(sections["page"], "Times-Italic", 12, 20)
	    for para in sections["body"].split('\n'):
	        if para.strip():
	            write(para, "Times-Roman", 12, 14)

	    if image_blocks:
	        text_y -= 20  # visual gap before the placeholder list
	        write("Image Placeholders (coordinates from Layout Parser):", "Times-Roman", 12, 14)
	        for index, block in enumerate(image_blocks, start=1):
	            coords = [(v.x, v.y) for v in block['coordinates']]
	            write(f"Image {index}: Page {block['page']}, Coordinates: {coords}", "Times-Roman", 12, 14)

	    # Keep the caption and the picture together: if the 20pt gap, the 14pt
	    # caption line and the image box do not fit above the bottom margin,
	    # start a fresh page instead of drawing over the text.
	    if text_y - 20 - 14 - image_height < margin:
	        c.showPage()
	        text_y = top_y
	    else:
	        text_y -= 20
	    write("Original Image for Reference:", "Times-Roman", 12, 14)
	    c.drawImage(
	        image_path,
	        margin,
	        text_y - image_height,
	        width=text_width,
	        height=image_height,
	        preserveAspectRatio=True,  # avoid stretching the scan to the box
	        anchor="n",
	    )
	    c.save()
	    print("Fișier .pdf salvat.")

	def process_images(
	    input_dir=r"e:\De pus pe FTP 2\Test",
	    project_id="bebe-1s084",
	    location="eu",
	    processor_id="6a0sds3f52a2640a2bc",
	    json_path=r"e:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\Pictory.AI - Text to video\bebe-1084-36a8dc3e8b5e.json",
	):
	    """OCR every JPEG in ``input_dir`` and write a .docx and a .pdf next to it.

	    All arguments default to the values that used to be hard-coded, so calling
	    ``process_images()`` with no arguments behaves as before.

	    Args:
	        input_dir: Directory scanned (non-recursively) for ``.jpg``/``.jpeg``
	            files; outputs are written into the same directory. Two inputs
	            that differ only in extension share the same output names.
	        project_id: Google Cloud project id passed to the OCR helper.
	        location: Document AI location passed to the OCR helper.
	        processor_id: Document AI processor id passed to the OCR helper.
	        json_path: Credentials file exported through
	            ``GOOGLE_APPLICATION_CREDENTIALS``.

	    Raises:
	        FileNotFoundError: if ``json_path`` does not exist.
	    """
	    print("Încep procesarea imaginilor...")

	    if not os.path.exists(json_path):
	        raise FileNotFoundError(f"Fișierul JSON nu a fost găsit la: {json_path}")
	    print(f"Fișier JSON găsit la: {json_path}")
	    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = json_path

	    # isdir (not exists): a regular file at this path would make listdir fail.
	    if not os.path.isdir(input_dir):
	        print(f"Directorul {input_dir} nu există. Verifică calea!")
	        return

	    # Sorted for a reproducible processing order; listdir order is arbitrary.
	    for filename in sorted(os.listdir(input_dir)):
	        # Both spellings are JPEG data, matching the MIME type sent for OCR.
	        if not filename.lower().endswith(('.jpg', '.jpeg')):
	            continue

	        image_path = os.path.join(input_dir, filename)
	        print(f"Procesare fișier: {image_path}")
	        # Per-file boundary: one failing image is reported and skipped so the
	        # rest of the batch still runs.
	        try:
	            text, image_blocks = perform_ocr_with_documentai(image_path, project_id, location, processor_id)
	            print(f"Text și layout extras din {filename}:\n{text}\n")
	            print(f"Blocuri de imagini detectate: {image_blocks}\n")

	            stem = os.path.splitext(filename)[0]
	            docx_output = os.path.join(input_dir, stem + '.docx')
	            pdf_output = os.path.join(input_dir, stem + '.pdf')

	            create_docx(docx_output, text, image_path, image_blocks)
	            create_pdf(pdf_output, text, image_path, image_blocks)
	            print(f"Procesat {filename} -> {os.path.basename(docx_output)} și {os.path.basename(pdf_output)}")
	        except Exception as e:
	            print(f"Eroare la procesarea {filename}: {str(e)}")
	    print("Procesare finalizată.")

	# Run the batch conversion only when this file is executed as a script,
	# not when it is imported as a module. The call is indented again so the
	# guard has a body (the flattened form was a syntax error).
	if __name__ == "__main__":
	    process_images()