billmetangmo · August 15, 2025 08:39
diff --git a/extract-chapters.py b/extract-chapters.py
 import os
 import glob
 from typing import List, Tuple, Dict, Any
 from dotenv import load_dotenv
 from llama_cloud_services import LlamaExtract
 from llama_cloud.core.api_error import ApiError

 # Load environment variables (e.g. LLAMA_CLOUD_API_KEY)
 load_dotenv()

 # Identifier (ID or name) of the extraction agent to use
 EXTRACTION_AGENT_IDENTIFIER = "d9a0f69d-45a3-4590-a183-c9d1010ef36f"


 def _pick_input_path(file_bulletin: str) -> str:
    """Accepte un chemin PDF OU un dossier (prend le premier *.pdf)."""
    if os.path.isdir(file_bulletin):
        pdf_files = sorted(glob.glob(os.path.join(file_bulletin, "*.pdf")))
        if not pdf_files:
            raise FileNotFoundError(f"No PDFs found in directory: {file_bulletin}")
        return pdf_files[0]
    if not os.path.isfile(file_bulletin):
        raise FileNotFoundError(f"File not found: {file_bulletin}")
    return file_bulletin


 def _resolve_agent(identifier: str, extractor: LlamaExtract):
    """Retourne un agent LlamaExtract par nom ou ID."""
    try:
        return extractor.get_agent(identifier)
    except ApiError:
        pass

    agents = extractor.list_agents()
    match = None
    for a in agents:
        if getattr(a, "id", None) == identifier or getattr(a, "name", None) == identifier:
            match = a
            break
    if not match:
        raise RuntimeError(
            f"Extraction agent '{identifier}' not found. "
            f"Available: {[getattr(a, 'name', None) for a in agents]}"
        )
    return extractor.get_agent(match.name)


 def _extract_themes_payload(data: Dict[str, Any]) -> List[Tuple[str, str]]:
    """
    Schéma récent:
    {
      "themes": [
        {"theme_name": "str", "chapters": [{"chapter_number": int|null, "chapter_title": "str"}]}
      ]
    }
    Retourne: [(theme_name, chapter_title), ...]
    """
    out: List[Tuple[str, str]] = []
    for theme in data.get("themes") or []:
        theme_name = (theme or {}).get("theme_name", "").strip()
        for ch in (theme or {}).get("chapters") or []:
            title = (ch or {}).get("chapter_title")
            if isinstance(title, str) and title.strip():
                out.append((theme_name, title.strip()))
    return out


 def _extract_parts_payload(data: Dict[str, Any]) -> List[str]:
    """
    Schéma legacy:
    {
      "parts": [{"part_title": "str", "chapters": ["str", ...]}]
    }
    Retourne: ["chapter title", ...]
    """
    out: List[str] = []
    for part in data.get("parts") or []:
        for ch in (part or {}).get("chapters") or []:
            if isinstance(ch, str) and ch.strip():
                out.append(ch.strip())
    return out


 def get_chapters_llamacloud(level: str, discipline: str, file_bulletin: str) -> List[Tuple[str, str]]:
     """Extract chapter titles through LlamaExtract.

     Args:
         level: Not used by this function.
         discipline: Not used by this function.
         file_bulletin: Path to a PDF, or to a directory containing PDFs
             (the first one is taken).

     Returns:
         [(theme_name, chapter_title), ...]. For a legacy "parts" payload
         the theme name is "". An empty list is returned when the payload
         has neither a "themes" nor a "parts" key.

     Note:
         Per the original author's note, LLAMA_CLOUD_API_KEY must be set in
         the environment.
     """
     pdf_path = _pick_input_path(file_bulletin)

     # A new client is created on every call rather than shared.
     client = LlamaExtract()

     # Resolve the configured agent (by name or ID).
     extraction_agent = _resolve_agent(EXTRACTION_AGENT_IDENTIFIER, client)

     # Run the extraction and read its payload; a missing/empty one becomes {}.
     payload = getattr(extraction_agent.extract(pdf_path), "data", {}) or {}

     if "themes" in payload:
         return _extract_themes_payload(payload)
     if "parts" not in payload:
         return []
     # Normalize legacy titles to tuples with an empty theme.
     return [("", title) for title in _extract_parts_payload(payload)]
	import os
	import glob
	from typing import List, Tuple, Dict, Any
	from dotenv import load_dotenv
	from llama_cloud_services import LlamaExtract
	from llama_cloud.core.api_error import ApiError

	# Load environment variables (e.g. LLAMA_CLOUD_API_KEY)
	load_dotenv()

	# Identifier (ID or name) of the extraction agent to use
	EXTRACTION_AGENT_IDENTIFIER = "d9a0f69d-45a3-4590-a183-c9d1010ef36f"


	def _pick_input_path(file_bulletin: str) -> str:
	"""Accepte un chemin PDF OU un dossier (prend le premier *.pdf)."""
	if os.path.isdir(file_bulletin):
	pdf_files = sorted(glob.glob(os.path.join(file_bulletin, "*.pdf")))
	if not pdf_files:
	raise FileNotFoundError(f"No PDFs found in directory: {file_bulletin}")
	return pdf_files[0]
	if not os.path.isfile(file_bulletin):
	raise FileNotFoundError(f"File not found: {file_bulletin}")
	return file_bulletin


	def _resolve_agent(identifier: str, extractor: LlamaExtract):
	"""Retourne un agent LlamaExtract par nom ou ID."""
	try:
	return extractor.get_agent(identifier)
	except ApiError:
	pass

	agents = extractor.list_agents()
	match = None
	for a in agents:
	if getattr(a, "id", None) == identifier or getattr(a, "name", None) == identifier:
	match = a
	break
	if not match:
	raise RuntimeError(
	f"Extraction agent '{identifier}' not found. "
	f"Available: {[getattr(a, 'name', None) for a in agents]}"
	)
	return extractor.get_agent(match.name)


	def _extract_themes_payload(data: Dict[str, Any]) -> List[Tuple[str, str]]:
	"""
	Schéma récent:
	{
	"themes": [
	{"theme_name": "str", "chapters": [{"chapter_number": int\|null, "chapter_title": "str"}]}
	]
	}
	Retourne: [(theme_name, chapter_title), ...]
	"""
	out: List[Tuple[str, str]] = []
	for theme in data.get("themes") or []:
	theme_name = (theme or {}).get("theme_name", "").strip()
	for ch in (theme or {}).get("chapters") or []:
	title = (ch or {}).get("chapter_title")
	if isinstance(title, str) and title.strip():
	out.append((theme_name, title.strip()))
	return out


	def _extract_parts_payload(data: Dict[str, Any]) -> List[str]:
	"""
	Schéma legacy:
	{
	"parts": [{"part_title": "str", "chapters": ["str", ...]}]
	}
	Retourne: ["chapter title", ...]
	"""
	out: List[str] = []
	for part in data.get("parts") or []:
	for ch in (part or {}).get("chapters") or []:
	if isinstance(ch, str) and ch.strip():
	out.append(ch.strip())
	return out


	def get_chapters_llamacloud(level: str, discipline: str, file_bulletin: str) -> List[Tuple[str, str]]:
	    """Extract chapter titles through LlamaExtract.

	    Args:
	        level: Not used by this function.
	        discipline: Not used by this function.
	        file_bulletin: Path to a PDF, or to a directory containing PDFs
	            (the first one is taken).

	    Returns:
	        [(theme_name, chapter_title), ...]. For a legacy "parts" payload
	        the theme name is "". An empty list is returned when the payload
	        has neither a "themes" nor a "parts" key.

	    Note:
	        Per the original author's note, LLAMA_CLOUD_API_KEY must be set in
	        the environment.
	    """
	    file_path = _pick_input_path(file_bulletin)

	    # A new client is created on every call rather than shared.
	    extractor = LlamaExtract()

	    # Resolve the configured agent (by name or ID).
	    agent = _resolve_agent(EXTRACTION_AGENT_IDENTIFIER, extractor)

	    # Run the extraction; a missing or empty payload becomes {}.
	    result = agent.extract(file_path)
	    data = getattr(result, "data", {}) or {}

	    if "themes" in data:
	        return _extract_themes_payload(data)
	    if "parts" in data:
	        # Normalize legacy titles to tuples with an empty theme.
	        return [("", ch) for ch in _extract_parts_payload(data)]
	    return []