celsowm · July 21, 2025 21:24
diff --git a/augmentation_legal.py b/augmentation_legal.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Gerador de pares (instrução, resposta) da CF‑88 – v5
 ▪ Grava direto em Parquet
 ▪ Descobre incisos/§ via LLM (resposta em JSON) e valida cada linha com JSON‑schema
 ▪ + PARÁFRASE (variante “paraphrase”)
 ▪ + BACK‑TRANSLATION (variante “back_translation”)

 © 2025 Celso + ChatGPT
 """

 # ───────────────────────── IMPORTS ───────────────────────── #
 import json, uuid, random
 from pathlib import Path
 from dataclasses import dataclass, asdict
 from typing import List, Dict, Iterable

 import requests
 import pyarrow as pa
 import pyarrow.parquet as pq
 from datasets import load_dataset
 from jsonschema import validate
 from tqdm import tqdm

 # ───────────────────────── CONSTANTES ────────────────────── #
 # Chat-completions endpoint that llm_call posts to (vLLM serving Qwen-3).
 API_URL = "http://10.120.191.11:8000/v1/chat/completions"
 # Dataset identifier handed to load_dataset in generate_rows.
 SOURCE_DS = "celsowm/constituicao_br_1988"
 # Parquet file that main() (re)creates with the generated rows.
 OUT_PARQUET = Path("constituicao_br_1988_instructions.parquet")

 # Default sampling parameters for llm_call.
 TEMPERATURE, TOP_P = 0.7, 0.95

 # Task label -> instruction template; "{art}" is filled in with the article
 # identifier via str.format.
 BASE_TASKS = {
    "Resumo conciso": (
        "Resuma de forma concisa o conteúdo do Art. {art} da CF/88."
    ),
    "Extração de Entidades": (
        "Extraia as entidades principais do Art. {art} da CF/88."
    ),
    "Simplificação (ELI5)": (
        "Explique de forma simples e acessível o conteúdo do Art. {art} da CF/88."
    ),
    "Cenário Hipotético": (
        "Crie um cenário hipotético baseado no Art. {art} da CF/88."
    ),
 }

 # ───────────────────────── ESQUEMAS JSON ─────────────────── #
 # JSON Schema for one chat message: an object whose "role" is either "user"
 # or "assistant" and whose "content" is a string; both keys are required.
 MESSAGE_SCHEMA = {
    "type": "object",
    "properties": {
        "role": {"type": "string", "enum": ["user", "assistant"]},
        "content": {"type": "string"},
    },
    "required": ["role", "content"],
 }

 # JSON Schema for one dataset row: a string "id", a "messages" array with at
 # least two MESSAGE_SCHEMA items, and a "meta" object in which "artigo" and
 # "task_type" are required while "augmentation" is optional.
 ROW_SCHEMA = {
    "type": "object",
    "properties": {
        "id": {"type": "string"},
        "messages": {"type": "array", "items": MESSAGE_SCHEMA, "minItems": 2},
        "meta": {
            "type": "object",
            "properties": {
                "artigo": {"type": "string"},
                "task_type": {"type": "string"},
                "augmentation": {"type": "string"},
            },
            "required": ["artigo", "task_type"],
        },
    },
    "required": ["id", "messages", "meta"],
 }

 # ───────────────────────── UTILIDADES LLM ────────────────── #
 def llm_call(messages: List[Dict[str, str]],
              temperature: float = TEMPERATURE,
              top_p: float = TOP_P,
              timeout: int = 60) -> str:
    """POST a chat-completion request to API_URL and return the reply text.

    Args:
        messages: Chat messages, sent as the "messages" field of the payload.
        temperature: Sampling temperature sent with the request.
        top_p: Nucleus-sampling value sent with the request.
        timeout: Timeout handed to requests.post.

    Returns:
        The "content" of the first choice's message, with surrounding
        whitespace stripped.

    Raises:
        Whatever requests.post / Response.raise_for_status raise on transport
        or HTTP errors; KeyError/IndexError if the JSON body lacks the
        expected "choices"[0]["message"]["content"] path.
    """
    request_body = dict(
        messages=messages,
        temperature=temperature,
        top_p=top_p,
        # Forwarded to the server's chat template with "thinking" switched off.
        chat_template_kwargs={"enable_thinking": False},
    )
    response = requests.post(API_URL, json=request_body, timeout=timeout)
    response.raise_for_status()
    first_choice = response.json()["choices"][0]
    return first_choice["message"]["content"].strip()

 def paraphrase(texto: str) -> str:
    """Ask the LLM for a single Portuguese paraphrase of an instruction.

    Args:
        texto: Instruction to be rewritten.

    Returns:
        The text llm_call returns for the rewrite prompt.
    """
    prompt = (
        "Reescreva a instrução abaixo em português, mudando a estrutura "
        "e o vocabulário, mas mantendo exatamente o mesmo significado. "
        "Devolva somente a versão reescrita, sem explicações.\n\n"
        # Quote the instruction with a matching «…» pair (same delimiters as
        # back_translate); a lone closing "»" would be sent to the model as
        # if it were part of the instruction.
        f"Instrução:\n«{texto}»"
    )
    # temperature=1.0 overrides the llm_call default for this request.
    return llm_call([{"role": "user", "content": prompt}], temperature=1.0)

 def back_translate(texto: str) -> str:
    """Request a PT -> EN -> PT back-translation in a single LLM call.

    Args:
        texto: Portuguese text to be back-translated.

    Returns:
        The text llm_call returns for the back-translation prompt.
    """
    instructions = (
        "Traduza o texto a seguir para o inglês e, em seguida, "
        "traduza‑o de volta para o português, usando palavras diferentes "
        "mas preservando o sentido jurídico. Retorne APENAS a versão final "
        "em português.\n\n"
    )
    quoted_text = f"Texto:\n«{texto}»"
    user_message = {"role": "user", "content": instructions + quoted_text}
    # temperature=1.0 overrides the llm_call default for this request.
    return llm_call([user_message], temperature=1.0)

 def detectar_partes(art: str, texto: str) -> Dict[str, List[str]]:
    """Ask the LLM which incisos and paragraphs an article contains.

    Args:
        art: Article identifier, embedded in the <ARTIGO ...> tag of the prompt.
        texto: Full text of the article.

    Returns:
        A dict that always has the keys "incisos" and "paragrafos", each
        mapped to a list of strings. Both lists are empty when the call
        fails or the reply is not a JSON object; a key whose value is not a
        list is replaced by an empty list.
    """
    system_msg = {
        "role": "system",
        "content": (
            "Você é um assistente que extrai metadados estruturados de textos jurídicos. "
            "Retorne SOMENTE um JSON válido no formato "
            '{"incisos": ["I","II"], "paragrafos": ["1º","2º"]}. '
            "Sem comentários adicionais."
        )
    }
    user_msg = {
        "role": "user",
        "content": f"Identifique os incisos e parágrafos do texto a seguir.\n<ARTIGO {art}>\n{texto}\n</ARTIGO>"
    }
    try:
        parts_json = llm_call([system_msg, user_msg])
        parsed = json.loads(parts_json)
    except Exception:
        # Deliberate best-effort: this step is optional, so any request or
        # parsing failure simply means "no parts detected".
        return {"incisos": [], "paragrafos": []}

    # The model may return valid JSON of the wrong shape (a list, a string,
    # a dict missing a key). Normalise it so callers can index both keys.
    if not isinstance(parsed, dict):
        return {"incisos": [], "paragrafos": []}
    partes: Dict[str, List[str]] = {}
    for chave in ("incisos", "paragrafos"):
        valores = parsed.get(chave)
        partes[chave] = [str(v) for v in valores] if isinstance(valores, list) else []
    return partes

 # ───────────────────────── DATACLASS ──────────────────────── #
 @dataclass
 class Row:
    """One generated dataset record: an id, its chat messages and metadata."""

    id: str
    messages: List[Dict[str, str]]
    meta: Dict[str, str]

    def as_dict(self) -> dict:
        """Return the row as a plain dict after checking it against ROW_SCHEMA.

        The dict is produced with dataclasses.asdict; jsonschema's validate
        raises if it does not conform to ROW_SCHEMA.
        """
        plain = asdict(self)
        validate(instance=plain, schema=ROW_SCHEMA)
        return plain

 # ───────────────────────── PIPELINE ───────────────────────── #
 def _make_row(art: str, id_tag: str, question: str, answer: str,
               task_type: str, augmentation: str) -> "Row":
    """Build a (user, assistant) Row whose id is art, id_tag and 8 random hex chars."""
    return Row(
        id=f"{art}_{id_tag}_{uuid.uuid4().hex[:8]}",
        messages=[
            {"role": "user",      "content": question},
            {"role": "assistant", "content": answer}
        ],
        meta={"artigo": art, "task_type": task_type, "augmentation": augmentation}
    )

 def generate_rows() -> Iterable[Row]:
    """Lazily yield instruction/answer rows for every article of SOURCE_DS.

    For each record (read through its "artigo" and "texto" fields) this yields:

    1. For every entry of BASE_TASKS, three rows: the original instruction,
       a paraphrase of it and a back-translation of it. The answer is obtained
       by sending "<instruction>\\n\\n<article text>" to llm_call, while the
       stored user message holds the instruction only.
    2. One question row per inciso and per paragraph reported by
       detectar_partes (no augmentation).

    Yields:
        Row objects, in the order described above.

    Raises:
        Errors from load_dataset, llm_call, paraphrase and back_translate
        propagate to the caller.
    """
    ds = load_dataset(SOURCE_DS, split="train")
    for rec in tqdm(ds, desc="Artigos"):
        art   = rec["artigo"]
        texto = rec["texto"]

        # 1) base tasks + augmentations
        for ttype, template in BASE_TASKS.items():
            instr_original = template.format(art=art)
            # (augmentation label, id tag, function that rewrites the instruction)
            variants = (
                ("original",         ttype,            None),
                ("paraphrase",       f"{ttype}_para",  paraphrase),
                ("back_translation", f"{ttype}_bt",    back_translate),
            )
            for augmentation, id_tag, rewrite in variants:
                instr = instr_original if rewrite is None else rewrite(instr_original)
                answer = llm_call([{"role": "user", "content": f"{instr}\n\n{texto}"}])
                yield _make_row(art, id_tag, instr, answer, ttype, augmentation)

        # 2) extra: direct questions about incisos / paragraphs (no augmentation)
        parts = detectar_partes(art, texto)
        # The parts come from LLM-produced JSON; tolerate a non-dict result or
        # a missing/empty key instead of failing with KeyError/TypeError.
        if not isinstance(parts, dict):
            parts = {}

        # NOTE(review): unlike the base tasks, the stored user message below
        # includes the full article text. Confirm this is intended.
        for inc in parts.get("incisos") or []:
            q = f"O que diz o inciso {inc} do Art. {art} da Constituição de 1988?\n\n{texto}"
            a = llm_call([{"role": "user", "content": q}])
            yield _make_row(art, f"inc_{inc}", q, a, "Inciso (extrativa)", "original")

        for par in parts.get("paragrafos") or []:
            q = f"O que diz o §{par} do Art. {art} da Constituição de 1988?\n\n{texto}"
            a = llm_call([{"role": "user", "content": q}])
            yield _make_row(art, f"par_{par}", q, a, "Parágrafo (extrativa)", "original")

 def main():
    """Generate every row and stream it into OUT_PARQUET.

    Any existing OUT_PARQUET is deleted first. Each row is validated through
    Row.as_dict, echoed to stdout as JSON and written as its own single-row
    table. The ParquetWriter is created lazily from the first row's schema
    (zstd compression, dictionary encoding) and is closed in a ``finally``
    block, so the file handle is released even when generation or validation
    raises part-way through.
    """
    if OUT_PARQUET.exists():
        OUT_PARQUET.unlink()

    writer = None
    try:
        for row in generate_rows():
            row_dict = row.as_dict()
            print("Adicionando linha:", json.dumps(row_dict, ensure_ascii=False))

            table = pa.Table.from_pylist([row_dict])
            if writer is None:
                # The schema is only known once the first row exists.
                writer = pq.ParquetWriter(
                    OUT_PARQUET,
                    table.schema,
                    compression="zstd",
                    use_dictionary=True
                )
            writer.write_table(table)
    finally:
        # Close the writer on failure as well as on success.
        if writer is not None:
            writer.close()

    print(f"✅ Dataset salvo em {OUT_PARQUET.resolve()}")

 # ──────────────────────────── RUN ─────────────────────────── #
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	"""
	Gerador de pares (instrução, resposta) da CF‑88 – v5
	▪ Grava direto em Parquet
	▪ Descobre incisos/§ via LLM (resposta em JSON) e valida cada linha com JSON‑schema
	▪ + PARÁFRASE (variante “paraphrase”)
	▪ + BACK‑TRANSLATION (variante “back_translation”)

	© 2025 Celso + ChatGPT
	"""

	# ───────────────────────── IMPORTS ───────────────────────── #
	import json, uuid, random
	from pathlib import Path
	from dataclasses import dataclass, asdict
	from typing import List, Dict, Iterable

	import requests
	import pyarrow as pa
	import pyarrow.parquet as pq
	from datasets import load_dataset
	from jsonschema import validate
	from tqdm import tqdm

	# ───────────────────────── CONSTANTES ────────────────────── #
	# Chat-completions endpoint that llm_call posts to (vLLM serving Qwen-3).
	API_URL = "http://10.120.191.11:8000/v1/chat/completions"
	# Dataset identifier handed to load_dataset in generate_rows.
	SOURCE_DS = "celsowm/constituicao_br_1988"
	# Parquet file that main() (re)creates with the generated rows.
	OUT_PARQUET = Path("constituicao_br_1988_instructions.parquet")

	# Default sampling parameters for llm_call.
	TEMPERATURE, TOP_P = 0.7, 0.95

	# Task label -> instruction template; "{art}" is filled in with the article
	# identifier via str.format.
	BASE_TASKS = {
	    "Resumo conciso": (
	        "Resuma de forma concisa o conteúdo do Art. {art} da CF/88."
	    ),
	    "Extração de Entidades": (
	        "Extraia as entidades principais do Art. {art} da CF/88."
	    ),
	    "Simplificação (ELI5)": (
	        "Explique de forma simples e acessível o conteúdo do Art. {art} da CF/88."
	    ),
	    "Cenário Hipotético": (
	        "Crie um cenário hipotético baseado no Art. {art} da CF/88."
	    ),
	}

	# ───────────────────────── ESQUEMAS JSON ─────────────────── #
	# JSON Schema for one chat message: an object whose "role" is either "user"
	# or "assistant" and whose "content" is a string; both keys are required.
	MESSAGE_SCHEMA = {
	    "type": "object",
	    "properties": {
	        "role": {"type": "string", "enum": ["user", "assistant"]},
	        "content": {"type": "string"},
	    },
	    "required": ["role", "content"],
	}

	# JSON Schema for one dataset row: a string "id", a "messages" array with at
	# least two MESSAGE_SCHEMA items, and a "meta" object in which "artigo" and
	# "task_type" are required while "augmentation" is optional.
	ROW_SCHEMA = {
	    "type": "object",
	    "properties": {
	        "id": {"type": "string"},
	        "messages": {"type": "array", "items": MESSAGE_SCHEMA, "minItems": 2},
	        "meta": {
	            "type": "object",
	            "properties": {
	                "artigo": {"type": "string"},
	                "task_type": {"type": "string"},
	                "augmentation": {"type": "string"},
	            },
	            "required": ["artigo", "task_type"],
	        },
	    },
	    "required": ["id", "messages", "meta"],
	}

	# ───────────────────────── UTILIDADES LLM ────────────────── #
	def llm_call(messages: List[Dict[str, str]],
	             temperature: float = TEMPERATURE,
	             top_p: float = TOP_P,
	             timeout: int = 60) -> str:
	    """POST a chat-completion request to API_URL and return the reply text.

	    Args:
	        messages: Chat messages, sent as the "messages" field of the payload.
	        temperature: Sampling temperature sent with the request.
	        top_p: Nucleus-sampling value sent with the request.
	        timeout: Timeout handed to requests.post.

	    Returns:
	        The "content" of the first choice's message, with surrounding
	        whitespace stripped.

	    Raises:
	        Whatever requests.post / Response.raise_for_status raise on transport
	        or HTTP errors; KeyError/IndexError if the JSON body lacks the
	        expected "choices"[0]["message"]["content"] path.
	    """
	    payload = {
	        "messages": messages,
	        "temperature": temperature,
	        "top_p": top_p,
	        # Forwarded to the server's chat template with "thinking" switched off.
	        "chat_template_kwargs": {"enable_thinking": False}
	    }
	    r = requests.post(API_URL, json=payload, timeout=timeout)
	    r.raise_for_status()
	    return r.json()["choices"][0]["message"]["content"].strip()

	def paraphrase(texto: str) -> str:
	    """Ask the LLM for a single Portuguese paraphrase of an instruction.

	    Args:
	        texto: Instruction to be rewritten.

	    Returns:
	        The text llm_call returns for the rewrite prompt.
	    """
	    prompt = (
	        "Reescreva a instrução abaixo em português, mudando a estrutura "
	        "e o vocabulário, mas mantendo exatamente o mesmo significado. "
	        "Devolva somente a versão reescrita, sem explicações.\n\n"
	        # Quote the instruction with a matching «…» pair (same delimiters as
	        # back_translate); a lone closing "»" would be sent to the model as
	        # if it were part of the instruction.
	        f"Instrução:\n«{texto}»"
	    )
	    # temperature=1.0 overrides the llm_call default for this request.
	    return llm_call([{"role": "user", "content": prompt}], temperature=1.0)

	def back_translate(texto: str) -> str:
	    """Request a PT -> EN -> PT back-translation in a single LLM call.

	    Args:
	        texto: Portuguese text to be back-translated.

	    Returns:
	        The text llm_call returns for the back-translation prompt.
	    """
	    prompt = (
	        "Traduza o texto a seguir para o inglês e, em seguida, "
	        "traduza‑o de volta para o português, usando palavras diferentes "
	        "mas preservando o sentido jurídico. Retorne APENAS a versão final "
	        "em português.\n\nTexto:\n«{texto}»"
	    ).format(texto=texto)
	    # temperature=1.0 overrides the llm_call default for this request.
	    return llm_call([{"role": "user", "content": prompt}], temperature=1.0)

	def detectar_partes(art: str, texto: str) -> Dict[str, List[str]]:
	    """Ask the LLM which incisos and paragraphs an article contains.

	    Args:
	        art: Article identifier, embedded in the <ARTIGO ...> tag of the prompt.
	        texto: Full text of the article.

	    Returns:
	        A dict that always has the keys "incisos" and "paragrafos", each
	        mapped to a list of strings. Both lists are empty when the call
	        fails or the reply is not a JSON object; a key whose value is not a
	        list is replaced by an empty list.
	    """
	    system_msg = {
	        "role": "system",
	        "content": (
	            "Você é um assistente que extrai metadados estruturados de textos jurídicos. "
	            "Retorne SOMENTE um JSON válido no formato "
	            '{"incisos": ["I","II"], "paragrafos": ["1º","2º"]}. '
	            "Sem comentários adicionais."
	        )
	    }
	    user_msg = {
	        "role": "user",
	        "content": f"Identifique os incisos e parágrafos do texto a seguir.\n<ARTIGO {art}>\n{texto}\n</ARTIGO>"
	    }
	    try:
	        parts_json = llm_call([system_msg, user_msg])
	        parsed = json.loads(parts_json)
	    except Exception:
	        # Deliberate best-effort: this step is optional, so any request or
	        # parsing failure simply means "no parts detected".
	        return {"incisos": [], "paragrafos": []}

	    # The model may return valid JSON of the wrong shape (a list, a string,
	    # a dict missing a key). Normalise it so callers can index both keys.
	    if not isinstance(parsed, dict):
	        return {"incisos": [], "paragrafos": []}
	    partes: Dict[str, List[str]] = {}
	    for chave in ("incisos", "paragrafos"):
	        valores = parsed.get(chave)
	        partes[chave] = [str(v) for v in valores] if isinstance(valores, list) else []
	    return partes

	# ───────────────────────── DATACLASS ──────────────────────── #
	@dataclass
	class Row:
	    """One generated dataset record: an id, its chat messages and metadata."""

	    id: str
	    messages: List[Dict[str, str]]
	    meta: Dict[str, str]

	    def as_dict(self) -> dict:
	        """Return the row as a plain dict after checking it against ROW_SCHEMA.

	        The dict is produced with dataclasses.asdict; jsonschema's validate
	        raises if it does not conform to ROW_SCHEMA.
	        """
	        obj = asdict(self)
	        validate(obj, ROW_SCHEMA)
	        return obj

	# ───────────────────────── PIPELINE ───────────────────────── #
	def _make_row(art: str, id_tag: str, question: str, answer: str,
	              task_type: str, augmentation: str) -> "Row":
	    """Build a (user, assistant) Row whose id is art, id_tag and 8 random hex chars."""
	    return Row(
	        id=f"{art}_{id_tag}_{uuid.uuid4().hex[:8]}",
	        messages=[
	            {"role": "user", "content": question},
	            {"role": "assistant", "content": answer}
	        ],
	        meta={"artigo": art, "task_type": task_type, "augmentation": augmentation}
	    )

	def generate_rows() -> Iterable[Row]:
	    """Lazily yield instruction/answer rows for every article of SOURCE_DS.

	    For each record (read through its "artigo" and "texto" fields) this yields:

	    1. For every entry of BASE_TASKS, three rows: the original instruction,
	       a paraphrase of it and a back-translation of it. The answer is obtained
	       by sending "<instruction>\\n\\n<article text>" to llm_call, while the
	       stored user message holds the instruction only.
	    2. One question row per inciso and per paragraph reported by
	       detectar_partes (no augmentation).

	    Yields:
	        Row objects, in the order described above.

	    Raises:
	        Errors from load_dataset, llm_call, paraphrase and back_translate
	        propagate to the caller.
	    """
	    ds = load_dataset(SOURCE_DS, split="train")
	    for rec in tqdm(ds, desc="Artigos"):
	        art = rec["artigo"]
	        texto = rec["texto"]

	        # 1) base tasks + augmentations
	        for ttype, template in BASE_TASKS.items():
	            instr_original = template.format(art=art)
	            # (augmentation label, id tag, function that rewrites the instruction)
	            variants = (
	                ("original", ttype, None),
	                ("paraphrase", f"{ttype}_para", paraphrase),
	                ("back_translation", f"{ttype}_bt", back_translate),
	            )
	            for augmentation, id_tag, rewrite in variants:
	                instr = instr_original if rewrite is None else rewrite(instr_original)
	                answer = llm_call([{"role": "user", "content": f"{instr}\n\n{texto}"}])
	                yield _make_row(art, id_tag, instr, answer, ttype, augmentation)

	        # 2) extra: direct questions about incisos / paragraphs (no augmentation)
	        parts = detectar_partes(art, texto)
	        # The parts come from LLM-produced JSON; tolerate a non-dict result or
	        # a missing/empty key instead of failing with KeyError/TypeError.
	        if not isinstance(parts, dict):
	            parts = {}

	        # NOTE(review): unlike the base tasks, the stored user message below
	        # includes the full article text. Confirm this is intended.
	        for inc in parts.get("incisos") or []:
	            q = f"O que diz o inciso {inc} do Art. {art} da Constituição de 1988?\n\n{texto}"
	            a = llm_call([{"role": "user", "content": q}])
	            yield _make_row(art, f"inc_{inc}", q, a, "Inciso (extrativa)", "original")

	        for par in parts.get("paragrafos") or []:
	            q = f"O que diz o §{par} do Art. {art} da Constituição de 1988?\n\n{texto}"
	            a = llm_call([{"role": "user", "content": q}])
	            yield _make_row(art, f"par_{par}", q, a, "Parágrafo (extrativa)", "original")

	def main():
	    """Generate every row and stream it into OUT_PARQUET.

	    Any existing OUT_PARQUET is deleted first. Each row is validated through
	    Row.as_dict, echoed to stdout as JSON and written as its own single-row
	    table. The ParquetWriter is created lazily from the first row's schema
	    (zstd compression, dictionary encoding) and is closed in a ``finally``
	    block, so the file handle is released even when generation or validation
	    raises part-way through.
	    """
	    if OUT_PARQUET.exists():
	        OUT_PARQUET.unlink()

	    writer = None
	    try:
	        for row in generate_rows():
	            row_dict = row.as_dict()
	            print("Adicionando linha:", json.dumps(row_dict, ensure_ascii=False))

	            table = pa.Table.from_pylist([row_dict])
	            if writer is None:
	                # The schema is only known once the first row exists.
	                writer = pq.ParquetWriter(
	                    OUT_PARQUET,
	                    table.schema,
	                    compression="zstd",
	                    use_dictionary=True
	                )
	            writer.write_table(table)
	    finally:
	        # Close the writer on failure as well as on success.
	        if writer is not None:
	            writer.close()

	    print(f"✅ Dataset salvo em {OUT_PARQUET.resolve()}")

	# ──────────────────────────── RUN ─────────────────────────── #
	if __name__ == "__main__":
	    main()