Skip to content

Instantly share code, notes, and snippets.

@Vigrond
Last active October 6, 2025 17:51
Show Gist options
  • Save Vigrond/096f82d5b971bc47d646e012f49f67f8 to your computer and use it in GitHub Desktop.
Save Vigrond/096f82d5b971bc47d646e012f49f67f8 to your computer and use it in GitHub Desktop.
Paperless-ngx 'Created Date' reprocessing script
"""
2025-10-05
Paperless-ngx 'Created Date' reprocessing script
This script will reprocess and update Document created dates using either a custom provided regex, or the default Paperless
regex. It will only apply the regex to original filenames, not content (unless RESET_DEFAULTS is specified).
If the regex does not produce a match on a particular filename, the document is skipped.
This script assumes the PAPERLESS_FILENAME_DATE_ORDER configuration is set.
SETTINGS:
USE_CUSTOM_REGEX: Uses default regex if False, otherwise FILENAME_DATE_REGEX if True
FILENAME_DATE_REGEX: Your custom filename regex.
RESET_DEFAULTS: If the result is not desirable, set to True to reprocess 'created' dates the default Paperless-ngx way (filename and content, falling back to the source file's modification time when no date is found).
USAGE:
Set the settings below to your liking.
Set `doc_queryset` to your own filtered queryset if processing all documents is not desirable.
Run through Django's manage.py shell
Docker example:
```
docker compose cp paperless_reprocess_created.py webserver:/usr/src/paperless/src
docker compose exec webserver sh -c "python3 manage.py shell < paperless_reprocess_created.py"
```
NO GUARANTEES
USE AT YOUR OWN RISK.
IF YOU DO NOT UNDERSTAND THE CODE BELOW, DO NOT USE THIS SCRIPT.
"""
import datetime
import re
import sys
from pathlib import Path
from types import FunctionType
from django.conf import settings
from django.utils import timezone
from documents.models import Document
from documents.parsers import parse_date_generator
# Abort early when FILENAME_DATE_ORDER is missing or empty: this script assumes
# the PAPERLESS_FILENAME_DATE_ORDER configuration is set.
if not getattr(settings, 'FILENAME_DATE_ORDER', None):
    sys.exit("Please set PAPERLESS_FILENAME_DATE_ORDER to make this script useful.")
# True  -> filenames are matched with FILENAME_DATE_REGEX (defined just below).
# False -> the default regex is used, located at https://github.com/paperless-ngx/paperless-ngx/blob/d609b386fe72e65335f853f9de0da0d4c9d8746e/src/documents/parsers.py#L45
USE_CUSTOM_REGEX = False
# Custom date regex for filenames; consulted only when USE_CUSTOM_REGEX is True.
FILENAME_DATE_REGEX = re.compile(r"(.*)", flags=re.IGNORECASE)
# Escape hatch: if something went wrong, set to True to reprocess dates the default way.
RESET_DEFAULTS = False
# A simple progress bar
def progress_bar(current, total, bar_length=60):
    """Print a one-line text progress bar to stdout.

    The line ends with a carriage return so the next call overwrites it, and
    with a newline once ``current == total`` so later output starts on a
    fresh line.

    Args:
        current: Number of items processed so far.
        total: Total number of items to process.
        bar_length: Width of the bar in characters (between the brackets).
    """
    # With nothing to process there is no ratio to compute; show the bar as
    # complete instead of raising ZeroDivisionError.
    fraction = current / total if total else 1.0
    # The arrow head '>' takes one column, hence the "- 1" on the dash count.
    arrow = int(fraction * bar_length - 1) * '-' + '>'
    padding = int(bar_length - len(arrow)) * ' '
    ending = '\n' if current == total else '\r'
    print(f'Progress: [{arrow}{padding}] {int(fraction*100)}% {current}/{total}', end=ending)
# Queryset of documents we want to update.
doc_queryset = Document.objects.distinct()

# We mimic the `parse_date_generator` function in `documents.parsers` so that
# we can use our custom DATE_REGEX global variable: the copy reuses the original
# code object but looks up globals in a dict where DATE_REGEX is overridden.
parse_date_globals = {**parse_date_generator.__globals__, 'DATE_REGEX': FILENAME_DATE_REGEX}
parse_date_mimic = FunctionType(parse_date_generator.__code__, parse_date_globals)

progress_count = 0
progress_total = doc_queryset.count()
updated = 0

if not RESET_DEFAULTS:
    print(f'Using {"custom" if USE_CUSTOM_REGEX else "default"} date regular expression for filenames.')
    # The parser choice is the same for every document, so pick it once.
    filename_parser = parse_date_mimic if USE_CUSTOM_REGEX else parse_date_generator
    for doc in doc_queryset.only("original_filename"):
        date = None
        # Documents without a stored original filename have nothing to match
        # against; skip them rather than handing None to the regex-based parser.
        # NOTE(review): assumes original_filename may be NULL/empty - confirm
        # against the Document model.
        if doc.original_filename:
            # Empty content: only the filename is searched for a date here.
            date = next(filename_parser(doc.original_filename, ""), None)
        if date:
            doc.created = date
            doc.save()
            updated += 1
        progress_count += 1
        progress_bar(progress_count, progress_total)
else:
    print('Reprocessing document created dates using default Paperless method. (RESET_DEFAULTS set to True).')
    for doc in doc_queryset.only("pk", "original_filename", "filename", "mime_type", "storage_type", "content"):
        # A missing filename is replaced by "" so the content is still searched.
        date = next(parse_date_generator(doc.original_filename or "", doc.content), None)
        if not date:
            # No date in filename or content: fall back to the source file's
            # modification time, converted to a timezone-aware datetime.
            try:
                stats = Path(doc.source_path).stat()
            except OSError as err:
                # One missing or unreadable file should not abort the whole run;
                # report it and leave that document's created date untouched.
                print(f'\nSkipping document {doc.pk}: cannot stat source file ({err}).', file=sys.stderr)
            else:
                date = timezone.make_aware(
                    datetime.datetime.fromtimestamp(stats.st_mtime),
                )
        if date:
            doc.created = date
            doc.save()
            updated += 1
        progress_count += 1
        progress_bar(progress_count, progress_total)

print(f'Processed {progress_total}. Updated {updated}.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment