Last active
October 6, 2025 17:51
-
-
Save Vigrond/096f82d5b971bc47d646e012f49f67f8 to your computer and use it in GitHub Desktop.
Paperless-ngx 'Created Date' reprocessing script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
2025-10-05 | |
Paperless-ngx 'Created Date' reprocessing script | |
This script will reprocess and update Document created dates using either a custom provided regex, or the default Paperless | |
regex. It will only apply the regex to original filenames, not content (unless RESET_DEFAULTS is specified). | |
If the regex does not produce a match on a particular filename, the document is skipped. | |
This script assumes the PAPERLESS_FILENAME_DATE_ORDER configuration is set. | |
SETTINGS: | |
USE_CUSTOM_REGEX: Uses default regex if False, otherwise FILENAME_DATE_REGEX if True | |
FILENAME_DATE_REGEX: Your custom filename regex. | |
RESET_DEFAULTS: If the result is not desirable, set to True to reprocess 'created' dates the default Paperless-ngx way.
USAGE: | |
Set settings above to one's liking. | |
Set `doc_queryset` to your own filtered queryset if processing all documents is not desirable. | |
Run through Django's manage.py shell | |
Docker example: | |
``` | |
docker compose cp paperless_reprocess_created.py webserver:/usr/src/paperless/src | |
docker compose exec webserver sh -c "python3 manage.py shell < paperless_reprocess_created.py" | |
``` | |
NO GUARANTEES | |
USE AT YOUR OWN RISK. | |
IF YOU DO NOT UNDERSTAND THE CODE BELOW, DO NOT USE THIS SCRIPT. | |
""" | |
import datetime | |
import re | |
import sys | |
from pathlib import Path | |
from types import FunctionType | |
from django.conf import settings | |
from django.utils import timezone | |
from documents.models import Document | |
from documents.parsers import parse_date_generator | |
# The filename date parsing below is only attempted when this Paperless
# setting is present and truthy, so stop right away if it is not.
if not getattr(settings, 'FILENAME_DATE_ORDER', None):
    raise SystemExit("Please set PAPERLESS_FILENAME_DATE_ORDER to make this script useful.")

# True  -> filenames are matched against FILENAME_DATE_REGEX (below).
# False -> filenames are matched against the default regex located at
#          https://github.com/paperless-ngx/paperless-ngx/blob/d609b386fe72e65335f853f9de0da0d4c9d8746e/src/documents/parsers.py#L45
USE_CUSTOM_REGEX = False

# Custom date regex for filenames; consulted only when USE_CUSTOM_REGEX is True.
FILENAME_DATE_REGEX = re.compile(r"(.*)", flags=re.IGNORECASE)

# Escape hatch: set to True to recompute 'created' dates from filename and
# content with the stock parser instead of the filename-only pass.
RESET_DEFAULTS = False
# A simple progress bar
def progress_bar(current, total, bar_length=60):
    """Print a single-line text progress bar to stdout.

    Args:
        current: Number of items processed so far.
        total: Total number of items. A total of 0 is rendered as a
            completed bar instead of raising ZeroDivisionError.
        bar_length: Width, in characters, of the area between the brackets.

    The line ends with a carriage return so the next call overwrites it,
    except when ``current == total``, where it ends with a newline.
    """
    # Nothing to process counts as "done"; this also avoids dividing by zero.
    fraction = current / total if total else 1.0
    # The arrow is a run of dashes ending in '>', and padding fills the rest.
    arrow = int(fraction * bar_length - 1) * '-' + '>'
    padding = int(bar_length - len(arrow)) * ' '
    ending = '\n' if current == total else '\r'
    print(f'Progress: [{arrow}{padding}] {int(fraction*100)}% {current}/{total}', end=ending)
# Documents to update. Replace with a filtered queryset to process a subset.
doc_queryset = Document.objects.distinct()

# Build a clone of `documents.parsers.parse_date_generator` that runs the same
# code object against a copy of its module globals in which DATE_REGEX is
# replaced by our FILENAME_DATE_REGEX.
parse_date_globals = dict(parse_date_generator.__globals__, DATE_REGEX=FILENAME_DATE_REGEX)
parse_date_mimic = FunctionType(parse_date_generator.__code__, parse_date_globals)

# Bookkeeping for the progress bar and the final summary line.
progress_total = doc_queryset.count()
progress_count = 0
updated = 0
if not RESET_DEFAULTS:
    # Filename-only pass: look for a date in each document's original filename
    # and leave the document untouched when none is found.
    print(f'Using {"custom" if USE_CUSTOM_REGEX else "default"} date regular expression for filenames.')
    # The choice of parser does not change per document, so make it once.
    filename_parser = parse_date_mimic if USE_CUSTOM_REGEX else parse_date_generator
    for doc in doc_queryset.only("original_filename"):
        date = None
        # Guard against a missing original filename: there is nothing to
        # match, and passing None on to the parser's regex search would raise
        # TypeError and abort the whole run. Such documents are skipped.
        if doc.original_filename:
            # Empty text, so only the filename can produce a date.
            date = next(filename_parser(doc.original_filename, ""), None)
        if date:
            doc.created = date
            doc.save()
            updated += 1
        progress_count += 1
        progress_bar(progress_count, progress_total)
else:
    # Reset pass: take the first date found in filename or content, falling
    # back to the modification time of the document's source file.
    print('Reprocessing document created dates using default Paperless method. (RESET_DEFAULTS set to True).')
    for doc in doc_queryset.only("pk", "original_filename", "filename", "mime_type", "storage_type", "content"):
        # `or ""` keeps a missing filename/content from reaching the regex
        # search as None.
        date = next(parse_date_generator(doc.original_filename or "", doc.content or ""), None)
        if not date:
            stats = Path(doc.source_path).stat()
            date = timezone.make_aware(
                datetime.datetime.fromtimestamp(stats.st_mtime),
            )
        doc.created = date
        doc.save()
        updated += 1
        progress_count += 1
        progress_bar(progress_count, progress_total)
print(f'Processed {progress_total}. Updated {updated}.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment