Created
August 15, 2025 08:39
-
-
Save billmetangmo/96000a5c3238d0c8460f48db21b3b3af to your computer and use it in GitHub Desktop.
Get themes and chapters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import glob | |
from typing import List, Tuple, Dict, Any | |
from dotenv import load_dotenv | |
from llama_cloud_services import LlamaExtract | |
from llama_cloud.core.api_error import ApiError | |
# Load environment variables (e.g. LLAMA_CLOUD_API_KEY)
load_dotenv()
# Identifier (ID or name) of the extraction agent to use
EXTRACTION_AGENT_IDENTIFIER = "d9a0f69d-45a3-4590-a183-c9d1010ef36f"
def _pick_input_path(file_bulletin: str) -> str:
    """Return the path of the PDF to process.

    Accepts either a path to a single file or a directory. For a directory,
    the ``*.pdf`` files directly inside it are sorted by path and the first
    one is returned.

    Args:
        file_bulletin: Path to a file, or to a directory containing PDFs.

    Returns:
        ``file_bulletin`` itself when it is an existing file, otherwise the
        first matching PDF found in the directory.

    Raises:
        FileNotFoundError: If the directory contains no PDF file, or if the
            path is neither a directory nor an existing file.
    """
    if os.path.isdir(file_bulletin):
        # Escape the directory part so glob metacharacters in its name
        # ("[", "]", "*", "?") are matched literally instead of being
        # interpreted as a pattern.
        pattern = os.path.join(glob.escape(file_bulletin), "*.pdf")
        # Keep regular files only: a sub-directory named "x.pdf" is not a
        # document that can be processed.
        pdf_files = sorted(p for p in glob.glob(pattern) if os.path.isfile(p))
        if not pdf_files:
            raise FileNotFoundError(f"No PDFs found in directory: {file_bulletin}")
        return pdf_files[0]
    if not os.path.isfile(file_bulletin):
        raise FileNotFoundError(f"File not found: {file_bulletin}")
    return file_bulletin
def _resolve_agent(identifier: str, extractor: "LlamaExtract"):
    """Return the extraction agent designated by a name or an ID.

    The identifier is first handed directly to ``extractor.get_agent``. If
    that call raises ``ApiError``, the agents returned by
    ``extractor.list_agents()`` are scanned for one whose ``id`` or ``name``
    equals the identifier, and that agent is then fetched by its name.

    Args:
        identifier: Name or ID of the agent to look up.
        extractor: Client exposing ``get_agent`` and ``list_agents``.

    Returns:
        Whatever ``extractor.get_agent`` returns for the resolved agent.

    Raises:
        RuntimeError: If no listed agent matches the identifier. The
            ``ApiError`` from the direct lookup is attached as ``__cause__``.
    """
    try:
        return extractor.get_agent(identifier)
    except ApiError as exc:
        # Keep the failure: it is the best diagnostic if the fallback below
        # does not find the agent either.
        lookup_error = exc

    agents = extractor.list_agents()
    match = next(
        (
            a
            for a in agents
            if getattr(a, "id", None) == identifier
            or getattr(a, "name", None) == identifier
        ),
        None,
    )
    # Explicit None test: do not rely on the truthiness of an agent object.
    if match is None:
        raise RuntimeError(
            f"Extraction agent '{identifier}' not found. "
            f"Available: {[getattr(a, 'name', None) for a in agents]}"
        ) from lookup_error
    return extractor.get_agent(match.name)
def _extract_themes_payload(data: Dict[str, Any]) -> List[Tuple[str, str]]:
    """Flatten a "themes" payload into (theme_name, chapter_title) pairs.

    Expected (recent) schema::

        {
          "themes": [
            {"theme_name": "str",
             "chapters": [{"chapter_number": int|null, "chapter_title": "str"}]}
          ]
        }

    Extracted payloads may contain nulls or malformed entries, so anything
    that does not have the expected shape is skipped rather than raising:
    non-dict themes or chapters, non-list ``themes``/``chapters`` values, and
    chapters whose title is missing, not a string, or blank. A theme name
    that is missing, null, or not a string is reported as ``""``.

    Args:
        data: Extraction result as a dictionary.

    Returns:
        A list of ``(theme_name, chapter_title)`` tuples, both stripped of
        surrounding whitespace, in payload order.
    """
    out: List[Tuple[str, str]] = []
    themes = data.get("themes")
    if not isinstance(themes, (list, tuple)):
        return out
    for theme in themes:
        if not isinstance(theme, dict):
            continue
        raw_name = theme.get("theme_name")
        # An explicit null must not crash on .strip(); fall back to "".
        theme_name = raw_name.strip() if isinstance(raw_name, str) else ""
        chapters = theme.get("chapters")
        if not isinstance(chapters, (list, tuple)):
            continue
        for ch in chapters:
            if not isinstance(ch, dict):
                continue
            title = ch.get("chapter_title")
            if isinstance(title, str) and title.strip():
                out.append((theme_name, title.strip()))
    return out
def _extract_parts_payload(data: Dict[str, Any]) -> List[str]:
    """Collect the chapter titles of a legacy "parts" payload.

    Legacy schema::

        {
          "parts": [{"part_title": "str", "chapters": ["str", ...]}]
        }

    Chapters that are not strings, or that are blank once stripped, are
    ignored. Part titles are not used.

    Args:
        data: Extraction result as a dictionary.

    Returns:
        The stripped chapter titles, in payload order.
    """
    titles: List[str] = []
    parts = data.get("parts") or []
    for part in parts:
        # A null part, or a null/missing "chapters" value, counts as empty.
        chapters = (part or {}).get("chapters") or []
        titles.extend(
            chapter.strip()
            for chapter in chapters
            if isinstance(chapter, str) and chapter.strip()
        )
    return titles
def get_chapters_llamacloud(level: str, discipline: str, file_bulletin: str) -> List[Tuple[str, str]]:
    """Extract chapter titles from a PDF through LlamaExtract.

    Args:
        level: Not used by this function.
        discipline: Not used by this function.
        file_bulletin: Path to a PDF, or to a directory containing PDFs
            (the first one is used).

    Returns:
        A list of ``(theme_name, chapter_title)`` tuples. Results in the
        legacy "parts" schema carry an empty theme name. The list is empty
        when the result has neither a "themes" nor a "parts" key.

    Note:
        Requires LLAMA_CLOUD_API_KEY in the environment.
    """
    pdf_path = _pick_input_path(file_bulletin)

    # A fresh client is created on every call; none is shared between calls.
    client = LlamaExtract()

    # The agent is resolved by name or ID, then runs the extraction
    # (upload, job and polling are handled by the SDK).
    extraction = _resolve_agent(EXTRACTION_AGENT_IDENTIFIER, client).extract(pdf_path)
    payload = getattr(extraction, "data", {}) or {}

    if "themes" in payload:
        return _extract_themes_payload(payload)
    if "parts" not in payload:
        return []
    # Normalise the legacy schema to tuples with an empty theme name.
    return [("", title) for title in _extract_parts_payload(payload)]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment