me-suzy · February 16, 2026 06:44
diff --git a/OCR convert to text API ClaudeAI 3.py b/OCR convert to text API ClaudeAI 3.py
 # -*- coding: utf-8 -*-
 import cv2
 import numpy as np
 from pathlib import Path
 import sys
 import base64
 import anthropic
 import time
 from PIL import Image, ImageDraw, ImageFont
 import textwrap
 import re

 # Best-effort console setup: on Windows, re-wrap stdout/stderr as UTF-8 so
 # that text with diacritics can be printed; unencodable characters are
 # replaced instead of raising.
 try:
    if sys.platform == 'win32' and hasattr(sys.stdout, 'buffer'):
        import io
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
 except Exception:
    # Optional setup: keep the default streams on failure. Catching
    # Exception (not a bare except) lets Ctrl+C / SystemExit propagate.
    pass

 # Placeholder value; must be replaced with a real key before running.
 ANTHROPIC_API_KEY = "YOUR_API_KEY_HERE"


 def image_to_base64(image_path):
    """Return the bytes of the file at ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


 def get_image_media_type(image_path):
    """Map the file extension of ``image_path`` to a MIME type string.

    The extension is compared case-insensitively. Anything other than
    .png, .gif or .webp (including .jpg, .jpeg and unknown extensions)
    yields 'image/jpeg'.
    """
    suffix = Path(image_path).suffix.lower()
    if suffix == '.png':
        return 'image/png'
    if suffix == '.gif':
        return 'image/gif'
    if suffix == '.webp':
        return 'image/webp'
    # '.jpg', '.jpeg' and every unrecognised extension.
    return 'image/jpeg'


 def ocr_with_claude(image_path, client):
    """Transcribe one scanned page through ``client.messages.create``.

    The image is sent base64-encoded together with a Romanian prompt that
    asks for an exact transcription annotated with [P], [I], [B], [TABEL],
    [FIGURA] and [?] markers.

    Args:
        image_path: Path of the page image.
        client: Object exposing ``messages.create`` (process_folder passes an
            ``anthropic.Anthropic`` instance).

    Returns:
        The concatenated text blocks of the reply.

    Raises:
        ValueError: If the reply contains no text block.
    """
    # Extensions that get_image_media_type has an explicit media type for.
    known_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp')
    if Path(image_path).suffix.lower() in known_suffixes:
        image_data = image_to_base64(image_path)
        media_type = get_image_media_type(image_path)
    else:
        # Other files (e.g. .bmp/.tif picked up by process_folder) used to be
        # sent as raw bytes labelled image/jpeg. Re-encode them as real JPEG
        # so the payload matches its declared media type.
        import io
        buffer = io.BytesIO()
        with Image.open(image_path) as page:
            page.convert('RGB').save(buffer, format='JPEG', quality=95)
        image_data = base64.standard_b64encode(buffer.getvalue()).decode("utf-8")
        media_type = 'image/jpeg'

    message = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=8000,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_data,
                        },
                    },
                    {
                        "type": "text",
                        "text": """Transcrie EXACT textul din aceasta pagina de carte medicala romaneasca.

 REGULI DE FORMATARE:
 1. Marcheaza paragrafele noi cu [P]
 2. Marcheaza textul ITALIC cu [I]...[/I] (subtitluri, termeni medicali, nume autori)
 3. Marcheaza textul BOLD cu [B]...[/B] (titluri tabele, cuvinte-cheie, doze)
 4. Pentru TABELE, foloseste formatul:
   [TABEL]
   Coloana1 | Coloana2 | Coloana3
   Valoare1 | Valoare2 | Valoare3
   [/TABEL]
 5. Pastreaza diacriticele (ă, â, î, ș, ț)
 6. Pastreaza ortografia originala (sînt, cînd, vîrstă)
 7. Marcheaza [FIGURA] unde apar imagini/diagrame
 8. [?] doar pentru cuvinte complet ilizibile
 9. Numarul paginii la final

 Textul:"""
                    }
                ],
            }
        ],
    )

    # Collect every text block instead of assuming content[0] exists and is text.
    text_parts = [
        block.text
        for block in message.content
        if getattr(block, "type", None) == "text"
    ]
    if not text_parts:
        raise ValueError(f"Raspuns fara text pentru: {image_path}")
    return "".join(text_parts)


 def get_font(font_type, size):
    """Load a serif font of the requested style at ``size``.

    Candidate files are tried in order (Times New Roman, Georgia, DejaVu
    Serif); the first one that exists and loads is returned.

    Args:
        font_type: 'regular', 'bold', 'italic' or 'bolditalic'. Any other
            value only tries Times New Roman regular.
        size: Font size passed to ``ImageFont.truetype``.

    Returns:
        The loaded font, or Pillow's default font when no candidate loads.
    """
    candidates_by_style = {
        'regular': [
            "C:/Windows/Fonts/times.ttf",
            "C:/Windows/Fonts/georgia.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSerif.ttf",
        ],
        'bold': [
            "C:/Windows/Fonts/timesbd.ttf",
            "C:/Windows/Fonts/georgiab.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSerif-Bold.ttf",
        ],
        'italic': [
            "C:/Windows/Fonts/timesi.ttf",
            "C:/Windows/Fonts/georgiai.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSerif-Italic.ttf",
        ],
        'bolditalic': [
            "C:/Windows/Fonts/timesbi.ttf",
            "C:/Windows/Fonts/georgiaz.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSerif-BoldItalic.ttf",
        ],
    }
    font_paths = candidates_by_style.get(font_type, ["C:/Windows/Fonts/times.ttf"])

    for fp in font_paths:
        if not Path(fp).exists():
            continue
        try:
            return ImageFont.truetype(fp, size)
        except (OSError, ValueError):
            # Unreadable font file or rejected size: try the next candidate.
            continue

    # Newer Pillow releases accept a size for the default font; older ones
    # raise TypeError, in which case the fixed-size default is used as before.
    try:
        return ImageFont.load_default(size)
    except (TypeError, ValueError, OSError):
        return ImageFont.load_default()


 def get_background_color(img):
    """Estimate the background colour of an image given in BGR channel order.

    Pixels whose grayscale value exceeds 200 are treated as background and
    the mean of their RGB values (cast to int) is returned as a tuple.
    When no pixel is that bright, a light cream default is returned.
    """
    bright_mask = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) > 200
    if not np.any(bright_mask):
        return (250, 245, 230)  # light cream

    rgb_image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    mean_rgb = np.mean(rgb_image[bright_mask], axis=0)
    return tuple(mean_rgb.astype(int))


 def draw_justified_text(draw, text, font, x, y, max_width, fill):
    """Draw one line of text justified between ``x`` and ``x + max_width``.

    Words are measured with ``draw.textbbox`` and the leftover width is
    shared equally between the gaps. If the words alone are at least as
    wide as ``max_width``, the width of a single space is used as the gap.
    Text with fewer than two words is drawn unchanged at ``(x, y)``.
    """
    tokens = text.split()
    if len(tokens) < 2:
        draw.text((x, y), text, fill=fill, font=font)
        return

    def measure(token):
        box = draw.textbbox((0, 0), token, font=font)
        return box[2] - box[0]

    widths = [measure(token) for token in tokens]
    leftover = max_width - sum(widths)

    if leftover > 0:
        gap = leftover / (len(tokens) - 1)
    else:
        gap = draw.textbbox((0, 0), " ", font=font)[2]

    cursor = x
    for token, token_width in zip(tokens, widths):
        draw.text((cursor, y), token, fill=fill, font=font)
        cursor += token_width + gap


 def parse_table(table_text):
    """Split table text into rows of stripped cells.

    Blank lines are skipped. A line containing '|' becomes a list of its
    '|'-separated cells; any other line becomes a single-cell row.
    """
    rows = []
    for raw_line in table_text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if '|' not in line:
            rows.append([line])
            continue
        rows.append([cell.strip() for cell in line.split('|')])
    return rows


 def draw_table(draw, rows, fonts, x, y, max_width, line_height, text_color):
    """Draw ``rows`` as a grid of equal-width columns and return the next y.

    The first row uses ``fonts['bold']``, later rows ``fonts['regular']``.
    Column width is ``max_width`` divided by the longest row's cell count.
    Cell text is shortened from the right until it fits its column (with a
    10 px allowance) or only three characters remain.
    """
    if not rows:
        return y

    widest_row = max(len(row) for row in rows)
    col_width = max_width // widest_row if widest_row > 0 else max_width

    def fit(cell_text, font):
        box = draw.textbbox((0, 0), cell_text, font=font)
        while box[2] - box[0] > col_width - 10 and len(cell_text) > 3:
            cell_text = cell_text[:-1]
            box = draw.textbbox((0, 0), cell_text, font=font)
        return cell_text

    row_y = y
    for row_number, row in enumerate(rows):
        font = fonts['bold'] if row_number == 0 else fonts['regular']
        for col_number, cell in enumerate(row):
            position = (x + col_number * col_width, row_y)
            draw.text(position, fit(cell, font), fill=text_color, font=font)
        row_y += line_height

    return row_y


 def _split_content_blocks(ocr_text):
    """Split marked-up OCR text into ordered 'table' and 'paragraph' blocks."""
    blocks = []
    # With one capture group, re.split alternates: text, table body, text, ...
    parts = re.split(r'\[TABEL\](.*?)\[/TABEL\]', ocr_text, flags=re.DOTALL)
    for index, part in enumerate(parts):
        if index % 2 == 1:
            blocks.append({'type': 'table', 'content': part})
            continue
        for para in re.split(r'\[P\]', part):
            para = para.strip()
            if para:
                blocks.append({'type': 'paragraph', 'content': para})
    return blocks


 def create_clean_pages(original_img, ocr_text, output_path_base):
    """Render OCR text onto clean page images and save them as JPEG files.

    Pages have the same pixel size as ``original_img`` and are filled with
    its estimated background colour. Paragraphs are word-wrapped, indented
    on their first line and justified except for their last line. A
    paragraph containing [I] and/or [B] markers is drawn entirely in the
    matching italic/bold font. [TABEL] sections are drawn with draw_table.

    Args:
        original_img: Image array; only ``shape[:2]`` and the background
            colour estimate are used.
        ocr_text: Text containing [P], [I], [B] and [TABEL] markers.
        output_path_base: Output path without suffix. A single page is
            saved as ``<base>_clean.jpg``, several as ``<base>_clean_p<n>.jpg``.

    Returns:
        List of the saved file paths, in page order.
    """
    height, width = original_img.shape[:2]
    # Plain ints keep the fill colour independent of numpy scalar types.
    bg_color = tuple(int(channel) for channel in get_background_color(original_img))

    # Font size follows the page height but never drops below 48.
    base_font_size = max(48, int(height / 50))
    fonts = {
        style: get_font(style, base_font_size)
        for style in ('regular', 'bold', 'italic', 'bolditalic')
    }
    text_color = (20, 20, 20)

    margin_left = int(width * 0.05)
    margin_right = int(width * 0.05)
    margin_top = int(height * 0.04)
    margin_bottom = int(height * 0.04)
    bottom_limit = height - margin_bottom

    paragraph_indent = int(base_font_size * 1.5)
    line_height = int(base_font_size * 1.35)

    text_width = width - margin_left - margin_right
    # Wrapping relies on an estimated average glyph width, not measured text.
    avg_char_width = base_font_size * 0.42
    chars_per_line = int(text_width / avg_char_width)
    # textwrap.wrap rejects widths <= 0, which very narrow images would produce.
    wrap_width = max(1, chars_per_line - 3)

    def new_page():
        page = Image.new('RGB', (width, height), color=bg_color)
        return page, ImageDraw.Draw(page)

    pages = []
    current_page, draw = new_page()
    y_position = margin_top

    for block in _split_content_blocks(ocr_text):
        if block['type'] == 'table':
            rows = parse_table(block['content'])
            needed_height = len(rows) * line_height + line_height

            # Start a new page when the table does not fit, unless the
            # current page is still empty (that would only add a blank page).
            if y_position + needed_height > bottom_limit and y_position > margin_top:
                pages.append(current_page)
                current_page, draw = new_page()
                y_position = margin_top

            y_position = draw_table(draw, rows, fonts, margin_left, y_position,
                                    text_width, line_height, text_color)
            y_position += line_height // 2
            continue

        para = block['content']

        # The font is chosen once per paragraph from the markers it contains.
        has_italic = '[I]' in para
        has_bold = '[B]' in para
        if has_italic and has_bold:
            current_font = fonts['bolditalic']
        elif has_italic:
            current_font = fonts['italic']
        elif has_bold:
            current_font = fonts['bold']
        else:
            current_font = fonts['regular']

        # Strip the style markers before wrapping so they are not drawn.
        clean_para = re.sub(r'\[/?[BI]\]', '', para)
        wrapped = textwrap.wrap(clean_para, width=wrap_width)
        last_index = len(wrapped) - 1

        for i, line in enumerate(wrapped):
            if y_position + line_height > bottom_limit:
                pages.append(current_page)
                current_page, draw = new_page()
                y_position = margin_top

            # Only the first line of a paragraph is indented.
            if i == 0:
                x_pos = margin_left + paragraph_indent
                line_max_width = text_width - paragraph_indent
            else:
                x_pos = margin_left
                line_max_width = text_width

            # The last line stays left-aligned; the others are justified.
            if i == last_index:
                draw.text((x_pos, y_position), line, fill=text_color, font=current_font)
            else:
                draw_justified_text(draw, line, current_font, x_pos, y_position,
                                    line_max_width, text_color)

            y_position += line_height

        # Extra gap between paragraphs.
        y_position += line_height // 4

    pages.append(current_page)

    saved_paths = []
    if len(pages) == 1:
        output_path = Path(str(output_path_base) + "_clean.jpg")
        pages[0].save(output_path, quality=95, dpi=(300, 300))
        saved_paths.append(output_path)
    else:
        for i, page in enumerate(pages, 1):
            output_path = Path(str(output_path_base) + f"_clean_p{i}.jpg")
            page.save(output_path, quality=95, dpi=(300, 300))
            saved_paths.append(output_path)

    return saved_paths


 def process_image(image_path, output_folder, client):
    """OCR one image, save its text and clean page images, return the text.

    Writes ``<stem>.txt`` and the images produced by create_clean_pages
    into ``output_folder``, prints progress, then sleeps one second.

    Raises:
        ValueError: If ``cv2.imread`` cannot load the image.
    """
    img = cv2.imread(str(image_path))
    if img is None:
        raise ValueError(f"Nu s-a putut incarca: {image_path}")

    source = Path(image_path)
    filename = source.stem
    print(f"\n  Procesare: {source.name}")

    # Transcription
    print("    -> OCR cu Claude API...")
    ocr_text = ocr_with_claude(image_path, client)

    # Plain-text output
    output_text = output_folder / f"{filename}.txt"
    with open(output_text, 'w', encoding='utf-8') as text_file:
        text_file.write(ocr_text)
    print(f"    -> Text: {output_text.name}")

    # Rendered clean pages
    for saved_page in create_clean_pages(img, ocr_text, output_folder / filename):
        print(f"    -> Imagine: {saved_page.name}")

    time.sleep(1)

    return ocr_text


 def process_folder(input_folder, output_folder):
    """Process every image in ``input_folder`` and write results to ``output_folder``.

    Each image goes through process_image; failures are printed and counted
    without stopping the run. The texts of all successful images are also
    written to ``_TEXT_COMPLET.txt`` in the output folder, and a summary is
    printed at the end.

    Args:
        input_folder: Folder scanned (non-recursively) for image files.
        output_folder: Folder for the results; created if missing.
    """
    input_path = Path(input_folder)
    output_path = Path(output_folder)
    output_path.mkdir(parents=True, exist_ok=True)

    client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

    # Compare suffixes case-insensitively so that mixed-case names such as
    # "page.Jpg" are found too (the glob approach only saw lower/upper case).
    image_suffixes = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'}
    images = []
    if input_path.is_dir():
        images = sorted(
            entry for entry in input_path.iterdir()
            if entry.is_file() and entry.suffix.lower() in image_suffixes
        )

    if not images:
        print(f"Nu s-au gasit imagini in: {input_folder}")
        return

    print(f"Gasite {len(images)} imagini de procesat")

    success = 0
    failed = 0
    all_text = []

    for i, img_path in enumerate(images, 1):
        print(f"\n[{i}/{len(images)}]", end="")
        try:
            text = process_image(img_path, output_path, client)
            all_text.append(f"\n{'='*60}\n{img_path.name}\n{'='*60}\n\n{text}")
            success += 1
        except Exception as e:
            # One bad page must not abort the whole batch.
            print(f"    [EROARE] {img_path.name}: {e}")
            failed += 1

    combined_path = output_path / "_TEXT_COMPLET.txt"
    with open(combined_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(all_text))
    print(f"\n\nText complet: {combined_path}")

    print(f"\n{'='*60}")
    print(f"REZULTAT: {success} procesate, {failed} erori")
    print(f"{'='*60}")


 if __name__ == "__main__":
    # Default folders; both can be overridden on the command line:
    #   <script> [input_folder [output_folder]]
    input_folder = r"e:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\Edit Text Images (Remove shadows + Remove red LInes)\TTT"
    output_folder = r"e:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\Edit Text Images (Remove shadows + Remove red LInes)\Output"

    cli_args = sys.argv[1:]
    if len(cli_args) >= 1:
        input_folder = cli_args[0]
    if len(cli_args) >= 2:
        output_folder = cli_args[1]

    print("=" * 60)
    print("OCR + Pagini Curate - FORMAT CARTE MEDICALA")
    print("=" * 60)

    process_folder(input_folder, output_folder)
	# -*- coding: utf-8 -*-
	import cv2
	import numpy as np
	from pathlib import Path
	import sys
	import base64
	import anthropic
	import time
	from PIL import Image, ImageDraw, ImageFont
	import textwrap
	import re

	# Best-effort console setup: on Windows, re-wrap stdout/stderr as UTF-8 so
	# that text with diacritics can be printed; unencodable characters are
	# replaced instead of raising.
	try:
	    if sys.platform == 'win32' and hasattr(sys.stdout, 'buffer'):
	        import io
	        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
	        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
	except Exception:
	    # Optional setup: keep the default streams on failure. Catching
	    # Exception (not a bare except) lets Ctrl+C / SystemExit propagate.
	    pass

	# Placeholder value; must be replaced with a real key before running.
	ANTHROPIC_API_KEY = "YOUR_API_KEY_HERE"


	def image_to_base64(image_path):
	    """Return the bytes of the file at ``image_path`` as a base64 text string."""
	    # Indentation restored: the function body had been flattened.
	    with open(image_path, "rb") as f:
	        return base64.standard_b64encode(f.read()).decode("utf-8")


	def get_image_media_type(image_path):
	    """Map the file extension of ``image_path`` to a MIME type string.

	    The extension is compared case-insensitively; extensions missing
	    from the table fall back to 'image/jpeg'.
	    """
	    # Indentation restored: the function body had been flattened.
	    ext = Path(image_path).suffix.lower()
	    media_types = {
	        '.jpg': 'image/jpeg',
	        '.jpeg': 'image/jpeg',
	        '.png': 'image/png',
	        '.gif': 'image/gif',
	        '.webp': 'image/webp',
	    }
	    return media_types.get(ext, 'image/jpeg')


	def ocr_with_claude(image_path, client):
	    """Transcribe one scanned page through ``client.messages.create``.

	    The image is sent base64-encoded together with a Romanian prompt that
	    asks for an exact transcription annotated with [P], [I], [B], [TABEL],
	    [FIGURA] and [?] markers.

	    Args:
	        image_path: Path of the page image.
	        client: Object exposing ``messages.create`` (process_folder passes an
	            ``anthropic.Anthropic`` instance).

	    Returns:
	        The concatenated text blocks of the reply.

	    Raises:
	        ValueError: If the reply contains no text block.
	    """
	    # Extensions that get_image_media_type has an explicit media type for.
	    known_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp')
	    if Path(image_path).suffix.lower() in known_suffixes:
	        image_data = image_to_base64(image_path)
	        media_type = get_image_media_type(image_path)
	    else:
	        # Other files (e.g. .bmp/.tif picked up by process_folder) used to be
	        # sent as raw bytes labelled image/jpeg. Re-encode them as real JPEG
	        # so the payload matches its declared media type.
	        import io
	        buffer = io.BytesIO()
	        with Image.open(image_path) as page:
	            page.convert('RGB').save(buffer, format='JPEG', quality=95)
	        image_data = base64.standard_b64encode(buffer.getvalue()).decode("utf-8")
	        media_type = 'image/jpeg'

	    # The table example uses plain '|' separators (the '\|' escapes were
	    # formatting damage), matching what parse_table is meant to split on.
	    message = client.messages.create(
	        model="claude-sonnet-4-20250514",
	        max_tokens=8000,
	        messages=[
	            {
	                "role": "user",
	                "content": [
	                    {
	                        "type": "image",
	                        "source": {
	                            "type": "base64",
	                            "media_type": media_type,
	                            "data": image_data,
	                        },
	                    },
	                    {
	                        "type": "text",
	                        "text": """Transcrie EXACT textul din aceasta pagina de carte medicala romaneasca.

	REGULI DE FORMATARE:
	1. Marcheaza paragrafele noi cu [P]
	2. Marcheaza textul ITALIC cu [I]...[/I] (subtitluri, termeni medicali, nume autori)
	3. Marcheaza textul BOLD cu [B]...[/B] (titluri tabele, cuvinte-cheie, doze)
	4. Pentru TABELE, foloseste formatul:
	[TABEL]
	Coloana1 | Coloana2 | Coloana3
	Valoare1 | Valoare2 | Valoare3
	[/TABEL]
	5. Pastreaza diacriticele (ă, â, î, ș, ț)
	6. Pastreaza ortografia originala (sînt, cînd, vîrstă)
	7. Marcheaza [FIGURA] unde apar imagini/diagrame
	8. [?] doar pentru cuvinte complet ilizibile
	9. Numarul paginii la final

	Textul:"""
	                    }
	                ],
	            }
	        ],
	    )

	    # Collect every text block instead of assuming content[0] exists and is text.
	    text_parts = [
	        block.text
	        for block in message.content
	        if getattr(block, "type", None) == "text"
	    ]
	    if not text_parts:
	        raise ValueError(f"Raspuns fara text pentru: {image_path}")
	    return "".join(text_parts)


	def get_font(font_type, size):
	    """Load a serif font of the requested style at ``size``.

	    Candidate files are tried in order (Times New Roman, Georgia, DejaVu
	    Serif); the first one that exists and loads is returned.

	    Args:
	        font_type: 'regular', 'bold', 'italic' or 'bolditalic'. Any other
	            value only tries Times New Roman regular.
	        size: Font size passed to ``ImageFont.truetype``.

	    Returns:
	        The loaded font, or Pillow's default font when no candidate loads.
	    """
	    candidates_by_style = {
	        'regular': [
	            "C:/Windows/Fonts/times.ttf",
	            "C:/Windows/Fonts/georgia.ttf",
	            "/usr/share/fonts/truetype/dejavu/DejaVuSerif.ttf",
	        ],
	        'bold': [
	            "C:/Windows/Fonts/timesbd.ttf",
	            "C:/Windows/Fonts/georgiab.ttf",
	            "/usr/share/fonts/truetype/dejavu/DejaVuSerif-Bold.ttf",
	        ],
	        'italic': [
	            "C:/Windows/Fonts/timesi.ttf",
	            "C:/Windows/Fonts/georgiai.ttf",
	            "/usr/share/fonts/truetype/dejavu/DejaVuSerif-Italic.ttf",
	        ],
	        'bolditalic': [
	            "C:/Windows/Fonts/timesbi.ttf",
	            "C:/Windows/Fonts/georgiaz.ttf",
	            "/usr/share/fonts/truetype/dejavu/DejaVuSerif-BoldItalic.ttf",
	        ],
	    }
	    font_paths = candidates_by_style.get(font_type, ["C:/Windows/Fonts/times.ttf"])

	    for fp in font_paths:
	        if not Path(fp).exists():
	            continue
	        try:
	            return ImageFont.truetype(fp, size)
	        except (OSError, ValueError):
	            # Unreadable font file or rejected size: try the next candidate.
	            continue

	    # Newer Pillow releases accept a size for the default font; older ones
	    # raise TypeError, in which case the fixed-size default is used as before.
	    try:
	        return ImageFont.load_default(size)
	    except (TypeError, ValueError, OSError):
	        return ImageFont.load_default()


	def get_background_color(img):
	    """Estimate the background colour of an image given in BGR channel order.

	    Pixels whose grayscale value exceeds 200 are treated as background and
	    the mean of their RGB values (cast to int) is returned as a tuple.
	    When no pixel is that bright, a light cream default is returned.
	    """
	    # Indentation restored: the function body had been flattened.
	    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
	    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
	    bg_mask = gray > 200

	    if np.any(bg_mask):
	        bg_pixels = img_rgb[bg_mask]
	        bg_color = tuple(np.mean(bg_pixels, axis=0).astype(int))
	        return bg_color

	    return (250, 245, 230)  # light cream


	def draw_justified_text(draw, text, font, x, y, max_width, fill):
	    """Draw one line of text justified between ``x`` and ``x + max_width``.

	    Words are measured with ``draw.textbbox`` and the leftover width is
	    shared equally between the gaps. If the words alone are at least as
	    wide as ``max_width``, the width of a single space is used as the gap.
	    Text with fewer than two words is drawn unchanged at ``(x, y)``.
	    """
	    # Indentation restored: the function body had been flattened.
	    words = text.split()
	    if len(words) <= 1:
	        draw.text((x, y), text, fill=fill, font=font)
	        return

	    # Width of every word on its own.
	    word_widths = []
	    for word in words:
	        bbox = draw.textbbox((0, 0), word, font=font)
	        word_widths.append(bbox[2] - bbox[0])

	    # Width left over for the gaps between words.
	    total_space = max_width - sum(word_widths)

	    if total_space > 0:
	        space_width = total_space / (len(words) - 1)
	    else:
	        space_width = draw.textbbox((0, 0), " ", font=font)[2]

	    current_x = x
	    for word, word_width in zip(words, word_widths):
	        draw.text((current_x, y), word, fill=fill, font=font)
	        current_x += word_width + space_width


	def parse_table(table_text):
	    """Split table text into rows of stripped cells.

	    Blank lines are skipped. A line containing '|' becomes a list of its
	    '|'-separated cells; any other line becomes a single-cell row.
	    """
	    # The separator is a plain '|'. The flattened code tested for the
	    # two-character sequence backslash + pipe, which never matches the
	    # 'A | B' rows of a table.
	    lines = [l.strip() for l in table_text.strip().split('\n') if l.strip()]
	    rows = []
	    for line in lines:
	        if '|' in line:
	            cols = [c.strip() for c in line.split('|')]
	            rows.append(cols)
	        else:
	            rows.append([line])
	    return rows


	def draw_table(draw, rows, fonts, x, y, max_width, line_height, text_color):
	    """Draw ``rows`` as a grid of equal-width columns and return the next y.

	    The first row uses ``fonts['bold']``, later rows ``fonts['regular']``.
	    Column width is ``max_width`` divided by the longest row's cell count.
	    Cell text is shortened from the right until it fits its column (with a
	    10 px allowance) or only three characters remain.
	    """
	    # Indentation restored: the function body had been flattened.
	    if not rows:
	        return y

	    max_cols = max(len(row) for row in rows)
	    col_width = max_width // max_cols if max_cols > 0 else max_width

	    current_y = y
	    for row_idx, row in enumerate(rows):
	        # The first row is treated as the header.
	        font = fonts['bold'] if row_idx == 0 else fonts['regular']

	        for col_idx, cell in enumerate(row):
	            cell_x = x + col_idx * col_width
	            # Trim from the right until the text fits its column.
	            cell_text = cell
	            bbox = draw.textbbox((0, 0), cell_text, font=font)
	            while bbox[2] - bbox[0] > col_width - 10 and len(cell_text) > 3:
	                cell_text = cell_text[:-1]
	                bbox = draw.textbbox((0, 0), cell_text, font=font)

	            draw.text((cell_x, current_y), cell_text, fill=text_color, font=font)

	        current_y += line_height

	    return current_y


	def _split_content_blocks(ocr_text):
	    """Split marked-up OCR text into ordered 'table' and 'paragraph' blocks."""
	    blocks = []
	    # With one capture group, re.split alternates: text, table body, text, ...
	    parts = re.split(r'\[TABEL\](.*?)\[/TABEL\]', ocr_text, flags=re.DOTALL)
	    for index, part in enumerate(parts):
	        if index % 2 == 1:
	            blocks.append({'type': 'table', 'content': part})
	            continue
	        for para in re.split(r'\[P\]', part):
	            para = para.strip()
	            if para:
	                blocks.append({'type': 'paragraph', 'content': para})
	    return blocks


	def create_clean_pages(original_img, ocr_text, output_path_base):
	    """Render OCR text onto clean page images and save them as JPEG files.

	    Pages have the same pixel size as ``original_img`` and are filled with
	    its estimated background colour. Paragraphs are word-wrapped, indented
	    on their first line and justified except for their last line. A
	    paragraph containing [I] and/or [B] markers is drawn entirely in the
	    matching italic/bold font. [TABEL] sections are drawn with draw_table.

	    Args:
	        original_img: Image array; only ``shape[:2]`` and the background
	            colour estimate are used.
	        ocr_text: Text containing [P], [I], [B] and [TABEL] markers.
	        output_path_base: Output path without suffix. A single page is
	            saved as ``<base>_clean.jpg``, several as ``<base>_clean_p<n>.jpg``.

	    Returns:
	        List of the saved file paths, in page order.
	    """
	    height, width = original_img.shape[:2]
	    # Plain ints keep the fill colour independent of numpy scalar types.
	    bg_color = tuple(int(channel) for channel in get_background_color(original_img))

	    # Font size follows the page height but never drops below 48.
	    base_font_size = max(48, int(height / 50))
	    fonts = {
	        style: get_font(style, base_font_size)
	        for style in ('regular', 'bold', 'italic', 'bolditalic')
	    }
	    text_color = (20, 20, 20)

	    margin_left = int(width * 0.05)
	    margin_right = int(width * 0.05)
	    margin_top = int(height * 0.04)
	    margin_bottom = int(height * 0.04)
	    bottom_limit = height - margin_bottom

	    paragraph_indent = int(base_font_size * 1.5)
	    line_height = int(base_font_size * 1.35)

	    text_width = width - margin_left - margin_right
	    # Wrapping relies on an estimated average glyph width, not measured text.
	    avg_char_width = base_font_size * 0.42
	    chars_per_line = int(text_width / avg_char_width)
	    # textwrap.wrap rejects widths <= 0, which very narrow images would produce.
	    wrap_width = max(1, chars_per_line - 3)

	    def new_page():
	        page = Image.new('RGB', (width, height), color=bg_color)
	        return page, ImageDraw.Draw(page)

	    pages = []
	    current_page, draw = new_page()
	    y_position = margin_top

	    for block in _split_content_blocks(ocr_text):
	        if block['type'] == 'table':
	            rows = parse_table(block['content'])
	            needed_height = len(rows) * line_height + line_height

	            # Start a new page when the table does not fit, unless the
	            # current page is still empty (that would only add a blank page).
	            if y_position + needed_height > bottom_limit and y_position > margin_top:
	                pages.append(current_page)
	                current_page, draw = new_page()
	                y_position = margin_top

	            y_position = draw_table(draw, rows, fonts, margin_left, y_position,
	                                    text_width, line_height, text_color)
	            y_position += line_height // 2
	            continue

	        para = block['content']

	        # The font is chosen once per paragraph from the markers it contains.
	        has_italic = '[I]' in para
	        has_bold = '[B]' in para
	        if has_italic and has_bold:
	            current_font = fonts['bolditalic']
	        elif has_italic:
	            current_font = fonts['italic']
	        elif has_bold:
	            current_font = fonts['bold']
	        else:
	            current_font = fonts['regular']

	        # Strip the style markers before wrapping so they are not drawn.
	        clean_para = re.sub(r'\[/?[BI]\]', '', para)
	        wrapped = textwrap.wrap(clean_para, width=wrap_width)
	        last_index = len(wrapped) - 1

	        for i, line in enumerate(wrapped):
	            if y_position + line_height > bottom_limit:
	                pages.append(current_page)
	                current_page, draw = new_page()
	                y_position = margin_top

	            # Only the first line of a paragraph is indented.
	            if i == 0:
	                x_pos = margin_left + paragraph_indent
	                line_max_width = text_width - paragraph_indent
	            else:
	                x_pos = margin_left
	                line_max_width = text_width

	            # The last line stays left-aligned; the others are justified.
	            if i == last_index:
	                draw.text((x_pos, y_position), line, fill=text_color, font=current_font)
	            else:
	                draw_justified_text(draw, line, current_font, x_pos, y_position,
	                                    line_max_width, text_color)

	            y_position += line_height

	        # Extra gap between paragraphs.
	        y_position += line_height // 4

	    pages.append(current_page)

	    saved_paths = []
	    if len(pages) == 1:
	        output_path = Path(str(output_path_base) + "_clean.jpg")
	        pages[0].save(output_path, quality=95, dpi=(300, 300))
	        saved_paths.append(output_path)
	    else:
	        for i, page in enumerate(pages, 1):
	            output_path = Path(str(output_path_base) + f"_clean_p{i}.jpg")
	            page.save(output_path, quality=95, dpi=(300, 300))
	            saved_paths.append(output_path)

	    return saved_paths


	def process_image(image_path, output_folder, client):
	    """OCR one image, save its text and clean page images, return the text.

	    Writes ``<stem>.txt`` and the images produced by create_clean_pages
	    into ``output_folder``, prints progress, then sleeps one second.

	    Raises:
	        ValueError: If ``cv2.imread`` cannot load the image.
	    """
	    # Indentation restored: the function body had been flattened.
	    img = cv2.imread(str(image_path))
	    if img is None:
	        raise ValueError(f"Nu s-a putut incarca: {image_path}")

	    filename = Path(image_path).stem
	    print(f"\n Procesare: {Path(image_path).name}")

	    # Transcription
	    print(" -> OCR cu Claude API...")
	    ocr_text = ocr_with_claude(image_path, client)

	    # Plain-text output
	    output_text = output_folder / f"{filename}.txt"
	    with open(output_text, 'w', encoding='utf-8') as f:
	        f.write(ocr_text)
	    print(f" -> Text: {output_text.name}")

	    # Rendered clean pages
	    output_base = output_folder / filename
	    saved_pages = create_clean_pages(img, ocr_text, output_base)
	    for sp in saved_pages:
	        print(f" -> Imagine: {sp.name}")

	    time.sleep(1)

	    return ocr_text


	def process_folder(input_folder, output_folder):
	    """Process every image in ``input_folder`` and write results to ``output_folder``.

	    Each image goes through process_image; failures are printed and counted
	    without stopping the run. The texts of all successful images are also
	    written to ``_TEXT_COMPLET.txt`` in the output folder, and a summary is
	    printed at the end.

	    Args:
	        input_folder: Folder scanned (non-recursively) for image files.
	        output_folder: Folder for the results; created if missing.
	    """
	    input_path = Path(input_folder)
	    output_path = Path(output_folder)
	    output_path.mkdir(parents=True, exist_ok=True)

	    client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

	    # The glob patterns had lost their '*' (so '.jpg' matched nothing
	    # useful). Files are now selected by case-insensitive suffix, which
	    # also finds mixed-case names such as "page.Jpg".
	    image_suffixes = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'}
	    images = []
	    if input_path.is_dir():
	        images = sorted(
	            entry for entry in input_path.iterdir()
	            if entry.is_file() and entry.suffix.lower() in image_suffixes
	        )

	    if not images:
	        print(f"Nu s-au gasit imagini in: {input_folder}")
	        return

	    print(f"Gasite {len(images)} imagini de procesat")

	    success = 0
	    failed = 0
	    all_text = []

	    for i, img_path in enumerate(images, 1):
	        print(f"\n[{i}/{len(images)}]", end="")
	        try:
	            text = process_image(img_path, output_path, client)
	            # Separator restored to '=' * 60 (the '*' had been lost,
	            # leaving a syntax error).
	            all_text.append(f"\n{'='*60}\n{img_path.name}\n{'='*60}\n\n{text}")
	            success += 1
	        except Exception as e:
	            # One bad page must not abort the whole batch.
	            print(f" [EROARE] {img_path.name}: {e}")
	            failed += 1

	    combined_path = output_path / "_TEXT_COMPLET.txt"
	    with open(combined_path, 'w', encoding='utf-8') as f:
	        f.write("\n".join(all_text))
	    print(f"\n\nText complet: {combined_path}")

	    print(f"\n{'='*60}")
	    print(f"REZULTAT: {success} procesate, {failed} erori")
	    print(f"{'='*60}")


	if __name__ == "__main__":
	    # Default folders; both can be overridden on the command line:
	    #   <script> [input_folder [output_folder]]
	    input_folder = r"e:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\Edit Text Images (Remove shadows + Remove red LInes)\TTT"
	    output_folder = r"e:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\Edit Text Images (Remove shadows + Remove red LInes)\Output"

	    cli_args = sys.argv[1:]
	    if len(cli_args) >= 1:
	        input_folder = cli_args[0]
	    if len(cli_args) >= 2:
	        output_folder = cli_args[1]

	    print("=" * 60)
	    print("OCR + Pagini Curate - FORMAT CARTE MEDICALA")
	    print("=" * 60)

	    process_folder(input_folder, output_folder)
No results found