Skip to content

Instantly share code, notes, and snippets.

@billmetangmo
Created August 15, 2025 08:39
Show Gist options
  • Save billmetangmo/96000a5c3238d0c8460f48db21b3b3af to your computer and use it in GitHub Desktop.
Save billmetangmo/96000a5c3238d0c8460f48db21b3b3af to your computer and use it in GitHub Desktop.
Get themes and chapters
import os
import glob
from typing import List, Tuple, Dict, Any
from dotenv import load_dotenv
from llama_cloud_services import LlamaExtract
from llama_cloud.core.api_error import ApiError
# Load environment variables (e.g. LLAMA_CLOUD_API_KEY).
# NOTE: this runs at import time, before any function in this module is called.
load_dotenv()
# Identifier (ID or name) of the extraction agent to use.
EXTRACTION_AGENT_IDENTIFIER = "d9a0f69d-45a3-4590-a183-c9d1010ef36f"
def _pick_input_path(file_bulletin: str) -> str:
    """Resolve ``file_bulletin`` to the path of a single input file.

    Accepts either a file path or a directory. For a directory, the first
    ``*.pdf`` entry directly inside it (in sorted path order) is returned.

    Args:
        file_bulletin: Path to an existing file, or to a directory that
            contains PDF files.

    Returns:
        ``file_bulletin`` unchanged when it is an existing file, otherwise
        the first PDF path found in the directory.

    Raises:
        FileNotFoundError: If the directory holds no ``*.pdf`` file, or if
            the path is neither a directory nor an existing file.
    """
    if os.path.isdir(file_bulletin):
        # Escape the directory part so glob metacharacters in its name
        # ("[", "]", "*", "?") are matched literally; otherwise a folder
        # such as "bulletins [2024]" would never yield any match.
        pattern = os.path.join(glob.escape(file_bulletin), "*.pdf")
        pdf_files = sorted(glob.glob(pattern))
        if not pdf_files:
            raise FileNotFoundError(f"No PDFs found in directory: {file_bulletin}")
        return pdf_files[0]
    if not os.path.isfile(file_bulletin):
        raise FileNotFoundError(f"File not found: {file_bulletin}")
    return file_bulletin
def _resolve_agent(identifier: str, extractor: LlamaExtract):
    """Return the extraction agent designated by ``identifier``.

    A direct ``extractor.get_agent(identifier)`` lookup is attempted first.
    If it raises ``ApiError``, the agents returned by
    ``extractor.list_agents()`` are scanned for the first one whose ``id``
    or ``name`` attribute equals ``identifier``; that agent is then fetched
    again through ``extractor.get_agent`` using its name.

    Args:
        identifier: Agent name or ID to look for.
        extractor: Client used to fetch and list agents.

    Returns:
        Whatever ``extractor.get_agent`` returns for the resolved agent.

    Raises:
        RuntimeError: If no listed agent matches ``identifier``; the message
            includes the names of the agents that were listed.
    """
    try:
        return extractor.get_agent(identifier)
    except ApiError:
        # Direct lookup failed: fall through to scanning the agent list.
        pass

    agents = extractor.list_agents()
    found = next(
        (
            agent
            for agent in agents
            if getattr(agent, "id", None) == identifier
            or getattr(agent, "name", None) == identifier
        ),
        None,
    )
    if found:
        return extractor.get_agent(found.name)

    available = [getattr(agent, "name", None) for agent in agents]
    raise RuntimeError(
        f"Extraction agent '{identifier}' not found. "
        f"Available: {available}"
    )
def _extract_themes_payload(data: Dict[str, Any]) -> List[Tuple[str, str]]:
    """Flatten a "themes" payload into ``(theme_name, chapter_title)`` pairs.

    Expected (current) schema::

        {
          "themes": [
            {"theme_name": "str",
             "chapters": [{"chapter_number": int|null, "chapter_title": "str"}]}
          ]
        }

    Malformed pieces are skipped instead of raising: a missing, null or
    non-list ``themes``/``chapters`` value, non-dict theme or chapter
    entries, and chapter titles that are not non-blank strings. A missing,
    null or non-string ``theme_name`` is reported as ``""``.

    Args:
        data: Extraction result payload.

    Returns:
        One ``(theme_name, chapter_title)`` tuple per usable chapter, in
        payload order, with both strings stripped of surrounding whitespace.
    """
    out: List[Tuple[str, str]] = []
    themes = data.get("themes")
    if not isinstance(themes, (list, tuple)):
        return out
    for theme in themes:
        if not isinstance(theme, dict):
            continue
        # "theme_name" may be present but null, so the .get() default alone
        # is not enough to guarantee a string here.
        raw_name = theme.get("theme_name")
        theme_name = raw_name.strip() if isinstance(raw_name, str) else ""
        chapters = theme.get("chapters")
        if not isinstance(chapters, (list, tuple)):
            continue
        for ch in chapters:
            if not isinstance(ch, dict):
                continue
            title = ch.get("chapter_title")
            if isinstance(title, str) and title.strip():
                out.append((theme_name, title.strip()))
    return out
def _extract_parts_payload(data: Dict[str, Any]) -> List[str]:
    """Flatten a legacy "parts" payload into a list of chapter titles.

    Expected (legacy) schema::

        {
          "parts": [{"part_title": "str", "chapters": ["str", ...]}]
        }

    Malformed pieces are skipped instead of raising: a missing, null or
    non-list ``parts`` value, non-dict part entries, and chapter entries
    that are not non-blank strings. A ``chapters`` value that is a single
    string is treated as one title rather than iterated character by
    character.

    Args:
        data: Extraction result payload.

    Returns:
        The stripped chapter titles, in payload order.
    """
    out: List[str] = []
    parts = data.get("parts")
    if not isinstance(parts, (list, tuple)):
        return out
    for part in parts:
        if not isinstance(part, dict):
            continue
        chapters = part.get("chapters")
        if isinstance(chapters, str):
            # A bare string would otherwise yield one "chapter" per letter.
            chapters = [chapters]
        elif not isinstance(chapters, (list, tuple)):
            continue
        out.extend(
            ch.strip() for ch in chapters if isinstance(ch, str) and ch.strip()
        )
    return out
def get_chapters_llamacloud(level: str, discipline: str, file_bulletin: str) -> List[Tuple[str, str]]:
    """Extract chapter titles from a PDF through LlamaExtract.

    Args:
        level: Accepted for the caller's interface; not read by this function.
        discipline: Accepted for the caller's interface; not read by this
            function.
        file_bulletin: Path to a PDF, or to a directory containing PDFs (the
            first one is used).

    Returns:
        ``[(theme_name, chapter_title), ...]``. Results in the legacy
        "parts" format carry an empty theme name. An empty list is returned
        when the result has neither a "themes" nor a "parts" key.

    Note:
        ``LlamaExtract()`` is built without arguments; per the original
        author's note it requires LLAMA_CLOUD_API_KEY in the environment.
    """
    pdf_path = _pick_input_path(file_bulletin)

    # A new client is created on every call instead of sharing one.
    client = LlamaExtract()

    # Resolve the configured agent (by name or ID).
    agent = _resolve_agent(EXTRACTION_AGENT_IDENTIFIER, client)

    # Run the extraction; the SDK is expected to handle upload, job
    # submission and polling.
    outcome = agent.extract(pdf_path)
    payload = getattr(outcome, "data", {}) or {}

    if "themes" in payload:
        return _extract_themes_payload(payload)
    if "parts" not in payload:
        return []
    # Normalise the legacy format to tuples with an empty theme name.
    return [("", title) for title in _extract_parts_payload(payload)]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment