import os
import re
from urllib.parse import urljoin

import requests

# ===========================
# CONFIGURATION
# ===========================
# Base URL of the REST API (v4); adjust to your Mayan instance.
BASE_URL = 'http://YOURIP/api/v4'

# (username, password) pair sent as HTTP Basic auth credentials.
AUTH = ('YOURUSERNAME', 'YOURPASSWORD')

# Root directory for all downloads; created below if it does not exist yet.
BASE_DOWNLOAD_DIR = 'heruntergeladene_dokumente'

# Label of the document type at which the download should resume.
START_DOC_TYPE_LABEL = ''

# Make sure the download root exists before anything is written into it.
os.makedirs(BASE_DOWNLOAD_DIR, exist_ok=True)

# ===========================
# HELPER FUNCTIONS
# ===========================
# Characters that must not appear in a file name: the Windows-reserved set
# \ / : * ? " < > | plus ASCII control characters (a NUL byte is rejected by
# open() on every platform).
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]+')


def sanitize_filename(filename: str) -> str:
    """Return *filename* made safe for use as a single path component.

    Every run of problematic characters (``\\ / : * ? " < > |`` and ASCII
    control characters) is replaced by one underscore. This avoids errors
    such as FileNotFoundError when a label contains e.g. ``/`` or ``\\``.

    A name that is empty or consists only of dots and spaces after cleaning
    (``''``, ``'.'``, ``'..'``) is replaced by ``'_'``: joined onto a
    directory, such a name would refer to that directory itself or to its
    parent instead of a new entry inside it.
    """
    cleaned = _UNSAFE_FILENAME_CHARS.sub('_', filename)
    if not cleaned.strip('. '):
        return '_'
    return cleaned


def get_all_results(start_url: str, auth=None, *, session=None, timeout=30):
    """Collect the records of every page of a paginated endpoint.

    Requests *start_url*, then keeps following the ``'next'`` entry of each
    JSON response until it is missing or empty, concatenating the
    ``'results'`` lists of all pages.

    Args:
        start_url: URL of the first page. An empty value yields ``[]``
            without sending any request.
        auth: Passed through unchanged as ``auth=`` of every GET request.
        session: Optional object with a requests-compatible ``get()`` method
            (e.g. a ``requests.Session``) used for all page requests, which
            lets callers reuse one connection. Defaults to the ``requests``
            module itself.
        timeout: Passed as ``timeout=`` to every GET so that a server that
            stops answering cannot block the script forever. Pass ``None``
            to wait indefinitely.

    Returns:
        A list with the items of all pages, in page order.

    Raises:
        Whatever ``raise_for_status()`` raises for a 4xx/5xx response.
    """
    http = requests if session is None else session
    results = []
    next_url = start_url

    while next_url:
        response = http.get(next_url, auth=auth, timeout=timeout)
        response.raise_for_status()  # fail loudly on 4xx/5xx
        data = response.json()

        results.extend(data.get('results', []))

        # Resolve the link against the page it came from so that a relative
        # 'next' value still works; urljoin returns an absolute link as is.
        next_link = data.get('next')
        next_url = urljoin(next_url, next_link) if next_link else None

    return results


# ===========================
# MAIN LOGIC
# ===========================
def _download_file(url, dest_path, auth=None, timeout=60):
    """Stream *url* into *dest_path* without leaving a truncated file behind.

    The response body is written to ``<dest_path>.part`` and renamed onto
    *dest_path* only after it was received completely. If the request fails
    or the transfer is interrupted, the temporary file is removed and the
    error propagates to the caller.
    """
    tmp_path = dest_path + '.part'
    try:
        with requests.get(url, auth=auth, stream=True, timeout=timeout) as resp:
            resp.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(tmp_path, dest_path)
    finally:
        # After a successful rename the temp file no longer exists; it is
        # only still present here when something went wrong.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def main():
    """Download all files of all documents, one directory per document type.

    Steps:
      1. Load every document type from ``BASE_URL``.
      2. Skip document types until the one labelled ``START_DOC_TYPE_LABEL``
         is reached; an empty start label means "start with the first type".
      3. For each remaining type, create a sub-directory of
         ``BASE_DOWNLOAD_DIR`` and download every file of every document
         of that type into it.

    Any HTTP error aborts the run by propagating the exception.
    """
    # 1) Load all document types (paginated).
    doc_type_url = f"{BASE_URL}/document_types/"
    document_types = get_all_results(doc_type_url, auth=AUTH)

    # Without a start label there is nothing to wait for: begin immediately.
    # (Comparing against '' would otherwise skip every document type.)
    start_downloading = not START_DOC_TYPE_LABEL

    for doc_type in document_types:
        doc_type_id = doc_type['id']
        doc_type_label = doc_type['label']

        # Start type not reached yet? Skip this one.
        if not start_downloading:
            if doc_type_label != START_DOC_TYPE_LABEL:
                print(f"Überspringe Dokumententyp: {doc_type_label}")
                continue
            start_downloading = True

        # From here on we are at the start type or past it.
        safe_doc_type_label = sanitize_filename(doc_type_label)
        doc_type_dir = os.path.join(BASE_DOWNLOAD_DIR, safe_doc_type_label)
        os.makedirs(doc_type_dir, exist_ok=True)

        print(f"\n== Dokumententyp: {doc_type_label} ==")

        # 2) Load the documents of the current type.
        doc_url = f"{BASE_URL}/document_types/{doc_type_id}/documents/"
        documents = get_all_results(doc_url, auth=AUTH)

        # 3) Walk through the documents.
        for document in documents:
            document_id = document['id']
            document_label = document['label']
            safe_document_label = sanitize_filename(document_label)

            print(f" Dokument: {document_label} (ID: {document_id})")

            # 3a) Fetch the document's files (paginated).
            file_url = f"{BASE_URL}/documents/{document_id}/files/"
            files_data = get_all_results(file_url, auth=AUTH)

            # 3b) Download every file.
            for file_entry in files_data:
                file_id = file_entry['id']

                # NOTE(review): the extension is hard-coded, so non-PDF files
                # get a misleading suffix. If the file entry exposes the
                # original file name, its extension could be used instead;
                # confirm against the API before changing this.
                extension = ".pdf"

                file_name = f"{safe_document_label}_file{file_id}{extension}"
                file_path = os.path.join(doc_type_dir, file_name)

                # Download endpoint of this file.
                download_endpoint = f"documents/{document_id}/files/{file_id}/download/"
                full_download_url = urljoin(BASE_URL + '/', download_endpoint)

                _download_file(full_download_url, file_path, auth=AUTH)

                print(f" -> Heruntergeladen: {file_name}")

    # A start label that matches no document type silently skips everything;
    # tell the user instead of only reporting success.
    if not start_downloading:
        print(
            f"Warnung: Dokumententyp '{START_DOC_TYPE_LABEL}' wurde nicht "
            "gefunden; es wurde nichts heruntergeladen."
        )

    print("\nDownload abgeschlossen.")


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()