Created
July 8, 2025 11:36
-
-
Save arturocandela/d2c2a114475d559e18f599dd6330a32a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import re

import pandas as pd
import pdfplumber
# Working directory for this run; every input and output path is built from it.
ruta_rel = './250708/'
# Source PDF that the tables are extracted from.
ruta_pdf = ruta_rel + 'admitidos.pdf'
# Intermediate CSV holding every extracted table row.
archivo_intermedio = ruta_rel + 'todas_tablas_constante.csv'
# Final CSV with one row per student and the cycle they belong to.
archivo_salida = ruta_rel + 'alumnes_cicles.csv'
# Every extracted row is padded or truncated to exactly this many cells.
max_cols = 3
# Accumulates [page, table, col1..colN] rows while the PDF is scanned.
filas_combinadas = []
# Watermark residue: the "NO P" head of "NO PUBLICABLE" that leaks into cells.
# The leading \b is essential: without it the pattern also matches inside
# ordinary names ("MORENO PEREZ" -> "MOREEREZ") and silently corrupts them.
_MARCA_NO_P = re.compile(r'\bNO\s*P\s*')


def _limpiar_celda(celda):
    """Return the cell text on a single line with watermark residue removed.

    Empty or missing cells (None, '') are normalised to ''.
    """
    if not celda:
        return ""
    celda = _MARCA_NO_P.sub('', celda)
    return celda.replace("\n", " ").strip()


def _es_fila_marca_agua(celdas):
    """Return True when the row holds nothing but watermark fragments.

    The fragment tested is "ublicable" (not "publicable") because
    _limpiar_celda has already stripped the "NO P" prefix, so what is left of
    the watermark in a cell is usually just "UBLICABLE". A row with no text at
    all is not considered a watermark row and is kept.
    """
    con_texto = [c for c in celdas if c.strip()]
    return bool(con_texto) and all("ublicable" in c.lower() for c in con_texto)


# Walk every table of every page and collect its rows in a fixed-width layout.
with pdfplumber.open(ruta_pdf) as pdf:
    for n_pagina, page in enumerate(pdf.pages, start=1):
        print(f"Analitzant pàgina {n_pagina}...")
        tablas = page.extract_tables()
        for i, tabla in enumerate(tablas, start=1):
            for fila in tabla:
                # Pad with '' and truncate so every row has exactly max_cols cells.
                fila_normalizada = (fila + [''] * max_cols)[:max_cols]
                cel_clean = [_limpiar_celda(celda) for celda in fila_normalizada]
                if _es_fila_marca_agua(cel_clean):
                    continue  # Row is only watermark text; ignore it.
                # Prefix each row with its 1-based page and table numbers.
                filas_combinadas.append([n_pagina, i] + cel_clean)

# Column names: page, table, then col1..col<max_cols>.
columnas = ["pagina", "tabla"] + [f"col{i}" for i in range(1, max_cols + 1)]
df = pd.DataFrame(filas_combinadas, columns=columnas)
df.to_csv(archivo_intermedio, index=False)
print(f"\n✅ Exportació completada sense marques d’aigua, guardada en: {archivo_intermedio}")
import pandas as pd


def _valor_tras_dos_puntos(texto):
    """Return the stripped text after the first ':' in texto.

    Returns None when texto is not a string (empty CSV cells are read back by
    pandas as NaN) or contains no ':'. Splitting only on the first colon keeps
    values that themselves contain a colon intact.
    """
    if not isinstance(texto, str):
        return None
    _, separador, valor = texto.partition(":")
    return valor.strip() if separador else None


# Load the intermediate CSV produced by the extraction stage.
df = pd.read_csv(archivo_intermedio)

# Code and name of the cycle the rows currently being read belong to.
current_codi_cicle = None
current_nom_cicle = None
# One dict per student row found.
students_data = []

# NOTE(review): the logic assumes that on each page table 2 carries the
# "CODI CICLE: ..." header and table 3 the student list — confirm against the PDF.
for _, row in df.iterrows():
    col1 = row['col1']
    if not isinstance(col1, str):
        continue  # Empty cell (NaN after the CSV round trip): nothing to read.

    if row['tabla'] == 2 and col1.startswith("CODI CICLE"):
        # Header row: col1 holds the cycle code, col2 the cycle name.
        # A value is only replaced when the cell actually contains one.
        codi = _valor_tras_dos_puntos(col1)
        if codi is not None:
            current_codi_cicle = codi
        nom = _valor_tras_dos_puntos(row['col2'])
        if nom is not None:
            current_nom_cicle = nom
    elif row['tabla'] == 3 and col1.strip() not in ("", "DNI"):
        # Student row (the "DNI" column-title row is skipped).
        nom_alumne = row['col2']
        students_data.append({
            "dni": col1.strip(),
            "nom_alumne": nom_alumne.strip() if isinstance(nom_alumne, str) else "",
            "codi_cicle": current_codi_cicle,
            "nom_cicle": current_nom_cicle,
        })

# Explicit columns keep the CSV header present even when no student was found.
students_df = pd.DataFrame(
    students_data,
    columns=["dni", "nom_alumne", "codi_cicle", "nom_cicle"],
)
students_df.to_csv(archivo_salida, index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment