Skip to content

Instantly share code, notes, and snippets.

View mara004's full-sized avatar

mara004

View GitHub Profile
@mara004
mara004 / poppler_gtk.py
Last active September 6, 2024 18:40
PDF rendering with poppler-gtk
# SPDX-FileCopyrightText: 2024 geisserml <[email protected]>
# SPDX-License-Identifier: MPL-2.0
# Note that Poppler is GPL-licensed, so this code is altogether affected by copyleft
import math
from pathlib import Path
import PIL.Image
import cairo
import gi
@mara004
mara004 / pnp.py
Last active April 6, 2025 01:45
Page number spec parser [Draft]
# Four lines intentionally left blank
# SPDX-FileCopyrightText: 2025 geisserml <[email protected]>
# SPDX-License-Identifier: MPL-2.0
# Sophisticated parser for a page number mini-language
# Technically, this might be a use case for some parser generator like pyparsing or PLY, but this is a manual implementation based on common string operations.
@mara004
mara004 / argparse_compat.py
Last active July 21, 2024 22:34
Argparse compat extensions
# SPDX-FileCopyrightText: 2024 geisserml <[email protected]>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
import sys
import argparse
if sys.version_info >= (3, 9):
from argparse import BooleanOptionalAction
else:
@mara004
mara004 / tile.py
Last active February 18, 2025 20:52
JPEG to PDF N-up with pypdfium2
# Four lines intentionally left blank
# SPDX-FileCopyrightText: 2025 geisserml <[email protected]>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
import argparse
from pathlib import Path
@mara004
mara004 / parse_gh_release.py
Last active September 26, 2023 00:28
Extract information from GitHub release notes
# SPDX-FileCopyrightText: 2023 geisserml <[email protected]>
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 OR BSD-3-Clause
# Unlike repository files, there is no "raw view" for GH releases, but we can extract the plain markdown content using GH web API
# See also https://stackoverflow.com/q/76995969/15547292
# The following code snippet shows how to get a release title from pdfium-binaries to extract the full version
import re
import json
@mara004
mara004 / safer_tar_extract.py
Last active February 8, 2025 20:21
Safer tar extraction
# SPDX-FileCopyrightText: 2023 geisserml <[email protected]>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause OR MPL-2.0
# Safer tar extraction (hopefully) preventing CVE-2007-4559 etc.
# Tries to use the most elegant strategy available in the caller's python version (>= 3.6)
__all__ = ["safer_tar_unpack"]
import sys
if sys.version_info >= (3, 11, 4): # PEP 706
@mara004
mara004 / pypdfjs.py
Last active April 15, 2025 23:56
PDF rendering with pdf.js, from Python
# Four lines intentionally left blank
# SPDX-FileCopyrightText: 2025 geisserml <[email protected]>
# SPDX-License-Identifier: Apache-2.0 OR MPL-2.0
# See also https://github.com/extremeheat/JSPyBridge/blob/master/examples/python/pdfjs.py
@mara004
mara004 / pdfbox_version_parsing.py
Last active July 31, 2024 20:53
Parse pdfbox versions
# SPDX-FileCopyrightText: 2024 geisserml <[email protected]>
# SPDX-License-Identifier: Apache-2.0
import re
from datetime import datetime
from urllib.request import urlopen
from packaging.version import Version as PypaVersion
PB_RELEASE_URL = "https://archive.apache.org/dist/pdfbox/"
PB_DISTS_RE = r'<a href="([\d\.]+.+?)/">.+</a>\s+([\d\-]+ [\d:]+)'
@mara004
mara004 / pdfbox.py
Last active April 4, 2025 00:02
PDF rendering with PDFBox, from Python
# Four lines intentionally left blank
# SPDX-FileCopyrightText: 2025 geisserml <[email protected]>
# SPDX-License-Identifier: Apache-2.0 OR MPL-2.0
# Assuming you have an Apache PDFBox 3 jar in the same directory