Created
February 16, 2026 06:44
-
-
Save me-suzy/343ea201710335a1ee0a3c0be548942e to your computer and use it in GitHub Desktop.
OCR convert to text API ClaudeAI 3.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| import cv2 | |
| import numpy as np | |
| from pathlib import Path | |
| import sys | |
| import base64 | |
| import anthropic | |
| import time | |
| from PIL import Image, ImageDraw, ImageFont | |
| import textwrap | |
| import re | |
# On Windows, re-wrap stdout/stderr as UTF-8 text streams so printing
# non-ASCII text cannot fail on the console encoding; characters that cannot
# be encoded are replaced instead of raising.
try:
    if sys.platform == 'win32' and hasattr(sys.stdout, 'buffer'):
        import io
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
except Exception:
    # Best effort only: keep the default streams if re-wrapping fails.
    # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
    pass

# Placeholder API key; it is passed to anthropic.Anthropic() in process_folder.
ANTHROPIC_API_KEY = "YOUR_API_KEY_HERE"
def image_to_base64(image_path):
    """Return the raw bytes of the file at *image_path* encoded as a base64 string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.standard_b64encode(raw_bytes)
    return encoded.decode("utf-8")
def get_image_media_type(image_path):
    """Map the file extension of *image_path* to an image MIME type.

    The extension is compared case-insensitively. PNG, GIF and WebP map to
    their own types; every other extension (including .jpg/.jpeg and
    unknown ones) yields 'image/jpeg'.
    """
    suffix = Path(image_path).suffix.lower()
    if suffix in ('.png', '.gif', '.webp'):
        return 'image/' + suffix[1:]
    return 'image/jpeg'
def ocr_with_claude(image_path, client):
    """Send one page image to the Messages API and return the transcription.

    The image file is embedded as base64 together with a Romanian prompt that
    asks for an exact transcription using the [P], [I], [B], [TABEL],
    [FIGURA] and [?] markers.

    Args:
        image_path: Path of the image file to transcribe.
        client: Object exposing ``messages.create(...)`` (an
            ``anthropic.Anthropic`` instance in this script).

    Returns:
        The ``text`` attribute of the first content item in the response.
    """
    prompt = """Transcrie EXACT textul din aceasta pagina de carte medicala romaneasca.
REGULI DE FORMATARE:
1. Marcheaza paragrafele noi cu [P]
2. Marcheaza textul ITALIC cu [I]...[/I] (subtitluri, termeni medicali, nume autori)
3. Marcheaza textul BOLD cu [B]...[/B] (titluri tabele, cuvinte-cheie, doze)
4. Pentru TABELE, foloseste formatul:
[TABEL]
Coloana1 | Coloana2 | Coloana3
Valoare1 | Valoare2 | Valoare3
[/TABEL]
5. Pastreaza diacriticele (ă, â, î, ș, ț)
6. Pastreaza ortografia originala (sînt, cînd, vîrstă)
7. Marcheaza [FIGURA] unde apar imagini/diagrame
8. [?] doar pentru cuvinte complet ilizibile
9. Numarul paginii la final
Textul:"""

    encoded_image = image_to_base64(image_path)
    mime_type = get_image_media_type(image_path)

    image_part = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": mime_type,
            "data": encoded_image,
        },
    }
    text_part = {"type": "text", "text": prompt}
    user_turn = {"role": "user", "content": [image_part, text_part]}

    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=8000,
        messages=[user_turn],
    )
    return response.content[0].text
def get_font(font_type, size):
    """Load a serif font in the requested style, trying several known files.

    Args:
        font_type: One of 'regular', 'bold', 'italic' or 'bolditalic'. Any
            other value falls back to the single path
            "C:/Windows/Fonts/times.ttf".
        size: Font size passed to ``ImageFont.truetype``.

    Returns:
        The first candidate font file that exists and loads, otherwise the
        result of ``ImageFont.load_default()``.
    """
    candidates_by_style = {
        'regular': [
            "C:/Windows/Fonts/times.ttf",
            "C:/Windows/Fonts/georgia.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSerif.ttf",
        ],
        'bold': [
            "C:/Windows/Fonts/timesbd.ttf",
            "C:/Windows/Fonts/georgiab.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSerif-Bold.ttf",
        ],
        'italic': [
            "C:/Windows/Fonts/timesi.ttf",
            "C:/Windows/Fonts/georgiai.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSerif-Italic.ttf",
        ],
        'bolditalic': [
            "C:/Windows/Fonts/timesbi.ttf",
            "C:/Windows/Fonts/georgiaz.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSerif-BoldItalic.ttf",
        ],
    }
    font_paths = candidates_by_style.get(font_type, ["C:/Windows/Fonts/times.ttf"])

    for fp in font_paths:
        if not Path(fp).exists():
            continue
        try:
            return ImageFont.truetype(fp, size)
        except Exception:
            # A file that exists but cannot be loaded is skipped; unlike a
            # bare except this does not swallow KeyboardInterrupt/SystemExit.
            continue
    return ImageFont.load_default()
def get_background_color(img):
    """Estimate the background colour of a BGR image as an (R, G, B) tuple.

    Pixels whose grayscale value is above 200 are treated as background and
    their per-channel mean (converted with ``astype(int)``) is returned. When
    no pixel is that bright, a fixed light-cream colour is returned instead.
    """
    fallback = (250, 245, 230)  # light cream
    rgb_view = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    bright_mask = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) > 200
    if not np.any(bright_mask):
        return fallback
    channel_means = np.mean(rgb_view[bright_mask], axis=0)
    return tuple(channel_means.astype(int))
def draw_justified_text(draw, text, font, x, y, max_width, fill):
    """Draw one line of text justified between x and x + max_width.

    The line is split on whitespace and the free horizontal space is shared
    equally between the words. A line with zero or one word is drawn as-is
    at (x, y). If the words alone are at least as wide as *max_width*, the
    gap falls back to the right edge of the bounding box of a single space.
    """
    tokens = text.split()
    if len(tokens) < 2:
        draw.text((x, y), text, fill=fill, font=font)
        return

    def measure(piece):
        # Width of the rendered piece according to its bounding box.
        box = draw.textbbox((0, 0), piece, font=font)
        return box[2] - box[0]

    widths = [measure(token) for token in tokens]
    slack = max_width - sum(widths)
    if slack > 0:
        gap = slack / (len(tokens) - 1)
    else:
        gap = draw.textbbox((0, 0), " ", font=font)[2]

    cursor = x
    for token, token_width in zip(tokens, widths):
        draw.text((cursor, y), token, fill=fill, font=font)
        cursor += token_width + gap
def parse_table(table_text):
    """Turn table text into a list of rows, each a list of cell strings.

    Blank lines are dropped. A line containing '|' is split on it and every
    cell is stripped; a line without '|' becomes a single-cell row.
    """
    parsed_rows = []
    for raw_line in table_text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if '|' not in line:
            parsed_rows.append([line])
            continue
        parsed_rows.append([cell.strip() for cell in line.split('|')])
    return parsed_rows
def draw_table(draw, rows, fonts, x, y, max_width, line_height, text_color):
    """Draw table rows in equal-width columns and return the next free y.

    The first row uses ``fonts['bold']``, all other rows ``fonts['regular']``.
    *max_width* is divided evenly by the largest cell count of any row. Cell
    text wider than its column (minus 10 px) is shortened one character at a
    time, but never below three characters. With no rows, *y* is returned
    unchanged.
    """
    if not rows:
        return y

    column_count = max(map(len, rows))
    col_width = max_width // column_count if column_count > 0 else max_width
    width_limit = col_width - 10

    row_y = y
    for row_number, cells in enumerate(rows):
        # Header row is bold, body rows are regular.
        row_font = fonts['regular'] if row_number else fonts['bold']
        for column_number, cell in enumerate(cells):
            shown = cell
            box = draw.textbbox((0, 0), shown, font=row_font)
            # Trim from the right until the text fits in the column.
            while box[2] - box[0] > width_limit and len(shown) > 3:
                shown = shown[:-1]
                box = draw.textbbox((0, 0), shown, font=row_font)
            cell_x = x + column_number * col_width
            draw.text((cell_x, row_y), shown, fill=text_color, font=row_font)
        row_y += line_height
    return row_y
def _split_content_blocks(ocr_text):
    """Split marked-up OCR text into an ordered list of block dicts.

    Text between [TABEL] and [/TABEL] becomes a {'type': 'table'} block. The
    remaining text is cut at every [P] marker and each non-empty piece
    becomes a {'type': 'paragraph'} block.
    """
    blocks = []
    # With one capture group, re.split alternates plain text (even indexes)
    # and captured table bodies (odd indexes).
    parts = re.split(r'\[TABEL\](.*?)\[/TABEL\]', ocr_text, flags=re.DOTALL)
    for index, part in enumerate(parts):
        if index % 2 == 1:
            blocks.append({'type': 'table', 'content': part})
            continue
        for para in re.split(r'\[P\]', part):
            para = para.strip()
            if para:
                blocks.append({'type': 'paragraph', 'content': para})
    return blocks


def _paragraph_font(para, fonts):
    """Pick one font for a whole paragraph from the [I]/[B] markers it contains."""
    has_italic = '[I]' in para
    has_bold = '[B]' in para
    if has_italic and has_bold:
        return fonts['bolditalic']
    if has_italic:
        return fonts['italic']
    if has_bold:
        return fonts['bold']
    return fonts['regular']


def _new_page(width, height, bg_color):
    """Create a blank RGB page and return it together with its drawing context."""
    page = Image.new('RGB', (width, height), color=bg_color)
    return page, ImageDraw.Draw(page)


def _save_pages(pages, output_path_base):
    """Save pages as JPEG files next to *output_path_base* and return their paths."""
    saved_paths = []
    if len(pages) == 1:
        output_path = Path(str(output_path_base) + "_clean.jpg")
        pages[0].save(output_path, quality=95, dpi=(300, 300))
        saved_paths.append(output_path)
    else:
        for i, page in enumerate(pages, 1):
            output_path = Path(str(output_path_base) + f"_clean_p{i}.jpg")
            page.save(output_path, quality=95, dpi=(300, 300))
            saved_paths.append(output_path)
    return saved_paths


def create_clean_pages(original_img, ocr_text, output_path_base):
    """Render OCR text onto clean pages sized like the original image.

    Args:
        original_img: Image array; its ``shape[:2]`` gives (height, width) and
            it is passed to ``get_background_color`` for the page colour.
        ocr_text: Transcription containing the [P], [I], [B] and
            [TABEL]...[/TABEL] markers.
        output_path_base: Path prefix for the output files. A single page is
            saved as ``<base>_clean.jpg``, several as ``<base>_clean_p<N>.jpg``.

    Returns:
        List of ``Path`` objects for the saved JPEG files.
    """
    height, width = original_img.shape[:2]
    # Plain ints so the colour does not rely on Pillow accepting NumPy scalars.
    bg_color = tuple(int(c) for c in get_background_color(original_img))

    # Font size proportional to the page height, never below 48.
    base_font_size = max(48, int(height / 50))
    fonts = {
        style: get_font(style, base_font_size)
        for style in ('regular', 'bold', 'italic', 'bolditalic')
    }
    text_color = (20, 20, 20)

    # Small margins so the text uses most of the page.
    margin_left = int(width * 0.05)
    margin_right = int(width * 0.05)
    margin_top = int(height * 0.04)
    margin_bottom = int(height * 0.04)
    paragraph_indent = int(base_font_size * 1.5)
    line_height = int(base_font_size * 1.35)
    text_width = width - margin_left - margin_right
    page_bottom = height - margin_bottom

    # Wrapping is done by character count using an estimated glyph width.
    avg_char_width = base_font_size * 0.42
    chars_per_line = int(text_width / avg_char_width)
    # textwrap.wrap rejects widths <= 0, which very narrow images would give.
    wrap_width = max(1, chars_per_line - 3)

    pages = []
    current_page, draw = _new_page(width, height, bg_color)
    y_position = margin_top

    for block in _split_content_blocks(ocr_text):
        if block['type'] == 'table':
            rows = parse_table(block['content'])
            needed_height = len(rows) * line_height + line_height
            # Start a new page if the table does not fit, unless the current
            # page is still empty (that would only emit a blank page).
            if y_position > margin_top and y_position + needed_height > page_bottom:
                pages.append(current_page)
                current_page, draw = _new_page(width, height, bg_color)
                y_position = margin_top
            y_position = draw_table(draw, rows, fonts, margin_left, y_position,
                                    text_width, line_height, text_color)
            y_position += line_height // 2
        elif block['type'] == 'paragraph':
            para = block['content']
            # One font for the whole paragraph, decided once per paragraph.
            current_font = _paragraph_font(para, fonts)
            # Formatting markers are removed before wrapping.
            clean_para = re.sub(r'\[/?[BI]\]', '', para)
            wrapped = textwrap.wrap(clean_para, width=wrap_width)
            last_index = len(wrapped) - 1
            for i, line in enumerate(wrapped):
                if y_position + line_height > page_bottom:
                    pages.append(current_page)
                    current_page, draw = _new_page(width, height, bg_color)
                    y_position = margin_top
                # Only the first line of a paragraph is indented.
                if i == 0:
                    x_pos = margin_left + paragraph_indent
                    line_max_width = text_width - paragraph_indent
                else:
                    x_pos = margin_left
                    line_max_width = text_width
                # The last line of a paragraph is left-aligned, not justified.
                if i == last_index:
                    draw.text((x_pos, y_position), line, fill=text_color, font=current_font)
                else:
                    draw_justified_text(draw, line, current_font, x_pos, y_position,
                                        line_max_width, text_color)
                y_position += line_height
            # Extra space between paragraphs.
            y_position += line_height // 4

    pages.append(current_page)
    return _save_pages(pages, output_path_base)
def process_image(image_path, output_folder, client):
    """OCR one image, save its text and clean page images, return the text.

    Args:
        image_path: Path of the image to process.
        output_folder: ``Path`` of the directory that receives ``<stem>.txt``
            and the rendered page images.
        client: API client forwarded to ``ocr_with_claude``.

    Returns:
        The OCR text returned by ``ocr_with_claude``.

    Raises:
        ValueError: If ``cv2.imread`` cannot load the image.
    """
    source = Path(image_path)
    page_image = cv2.imread(str(image_path))
    if page_image is None:
        raise ValueError(f"Nu s-a putut incarca: {image_path}")

    stem = source.stem
    print(f"\n Procesare: {source.name}")

    # OCR
    print(" -> OCR cu Claude API...")
    transcription = ocr_with_claude(image_path, client)

    # Save the raw transcription
    text_file = output_folder / f"{stem}.txt"
    text_file.write_text(transcription, encoding='utf-8')
    print(f" -> Text: {text_file.name}")

    # Clean pages
    for page_file in create_clean_pages(page_image, transcription, output_folder / stem):
        print(f" -> Imagine: {page_file.name}")

    # Pause one second after each image before the caller moves on.
    time.sleep(1)
    return transcription
def process_folder(input_folder, output_folder):
    """Process every image in *input_folder* and write results to *output_folder*.

    The output directory is created if needed. Files whose extension is
    .jpg, .jpeg, .png, .bmp, .tiff or .tif (compared case-insensitively) are
    processed in sorted order with ``process_image``. A failure on one image
    is printed and counted, and processing continues. The texts of all
    successful images are written to ``_TEXT_COMPLET.txt`` and a summary is
    printed.

    Args:
        input_folder: Directory containing the source images.
        output_folder: Directory that receives the outputs.

    Returns:
        None.
    """
    input_path = Path(input_folder)
    output_path = Path(output_folder)
    output_path.mkdir(parents=True, exist_ok=True)

    # Compare suffixes case-insensitively so mixed-case names such as
    # "scan.Jpg" are also found on case-sensitive file systems.
    image_suffixes = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'}
    if input_path.is_dir():
        images = sorted(
            entry for entry in input_path.iterdir()
            if entry.is_file() and entry.suffix.lower() in image_suffixes
        )
    else:
        # A missing input directory is reported like an empty one.
        images = []

    if not images:
        print(f"Nu s-au gasit imagini in: {input_folder}")
        return
    print(f"Gasite {len(images)} imagini de procesat")

    # The API client is only needed once there is something to process.
    client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

    success = 0
    failed = 0
    all_text = []
    for i, img_path in enumerate(images, 1):
        print(f"\n[{i}/{len(images)}]", end="")
        try:
            text = process_image(img_path, output_path, client)
        except Exception as e:
            # Per-image boundary: report the failure and keep going.
            print(f" [EROARE] {img_path.name}: {e}")
            failed += 1
        else:
            all_text.append(f"\n{'='*60}\n{img_path.name}\n{'='*60}\n\n{text}")
            success += 1

    combined_path = output_path / "_TEXT_COMPLET.txt"
    with open(combined_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(all_text))
    print(f"\n\nText complet: {combined_path}")
    print(f"\n{'='*60}")
    print(f"REZULTAT: {success} procesate, {failed} erori")
    print(f"{'='*60}")
if __name__ == "__main__":
    # Script entry point: hard-coded source and destination folders.
    input_folder = r"e:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\Edit Text Images (Remove shadows + Remove red LInes)\TTT"
    output_folder = r"e:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\Edit Text Images (Remove shadows + Remove red LInes)\Output"

    banner = "=" * 60
    print(banner)
    print("OCR + Pagini Curate - FORMAT CARTE MEDICALA")
    print(banner)

    process_folder(input_folder, output_folder)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment