Skip to content

Instantly share code, notes, and snippets.

@benoit74
Created October 28, 2024 08:17
Show Gist options
  • Save benoit74/63207a4bf14c7f98d7c6b02a9fd2843c to your computer and use it in GitHub Desktop.
Save benoit74/63207a4bf14c7f98d7c6b02a9fd2843c to your computer and use it in GitHub Desktop.
Zimls v2, computing aggregated path segment sizes and sorting display by path aggregated size
#!/usr/bin/env python3
import argparse
import logging
import sys
from typing import NamedTuple
from pydantic import BaseModel
import humanfriendly
from zimscraperlib.zim.archive import Archive
from zimscraperlib.logging import getLogger as lib_getLogger
# Tool identity: NAME is used both as the logger name and as the CLI program
# name; VERSION is what the --version option prints.
NAME = "zimls"
VERSION = "0.2"

# Module-wide logger. The level is fixed to DEBUG here, independently of the
# --debug command-line flag.
logger = lib_getLogger(
    NAME,
    level=logging.DEBUG,
    log_format="[%(asctime)s] %(levelname)s:%(message)s",
)
class ZimPath(BaseModel):
    """One node of the path tree: a single path segment and its aggregated size."""

    # Name of this segment only (not the full path); the root uses "".
    path: str
    # Size accumulated on this node, in bytes as far as format_size is concerned.
    size: int = 0
    # Direct child nodes, keyed by their segment name.
    children: dict[str, "ZimPath"] = {}

    def print(self, prefix: str = ""):
        """Print this node, then its descendants ordered by decreasing size.

        Args:
            prefix: Text prepended to this node's segment when displaying it;
                recursive calls pass the parent's displayed path followed by "/".
        """
        readable_size = humanfriendly.format_size(self.size)
        print(f"{readable_size} {prefix}{self.path}")

        # Every child is displayed below this node's own displayed path.
        child_prefix = f"{prefix}{self.path}/"
        largest_first = sorted(
            self.children.values(),
            key=lambda node: node.size,
            reverse=True,
        )
        for node in largest_first:
            node.print(prefix=child_prefix)
class ZimLs:
    """List the content of a ZIM file as a tree of path segments with sizes.

    Each entry path is split on "/". The entry's size is added to the root
    node and to every segment node along its path, so the size shown for a
    node is the total of everything located at or below it.
    """

    def __init__(self, fpath, **kwargs):
        """Remember which ZIM file to inspect.

        Args:
            fpath: Path of the ZIM file; handed to ``Archive`` by :meth:`run`.
            **kwargs: Accepted and ignored, so callers may pass extra options
                (e.g. a full set of parsed command-line arguments) unchanged.
        """
        self.fpath = fpath

    def run(self):
        """Build the size tree from every entry of the archive and print it.

        Returns:
            None.
        """
        zim = Archive(self.fpath)
        root = ZimPath(path="")
        for idx in range(zim.all_entry_count):
            entry = zim._get_entry_by_id(idx)
            # NOTE(review): get_item() presumably resolves redirects; if so, a
            # redirect entry adds its target's size a second time. Confirm
            # against the libzim API and skip redirects if that is unwanted.
            entry_size = entry.get_item().size

            # The root and each node on the way down to the entry's last
            # segment receive the entry size exactly once.
            node = root
            node.size += entry_size
            for segment in str(entry.path).split("/"):
                if segment not in node.children:
                    node.children[segment] = ZimPath(path=segment)
                node = node.children[segment]
                node.size += entry_size
        root.print()
def main():
    """Command-line entry point: parse arguments, run ZimLs, then exit.

    The process exits with the value returned by ``ZimLs.run()`` (``None``,
    i.e. status 0, on success). If an ``Exception`` is raised, the error is
    logged, with its traceback when ``--debug`` was given, and the process
    exits with status 1.
    """
    parser = argparse.ArgumentParser(
        prog=NAME,
        # The previous text described an unrelated scraper.
        description="List ZIM entries as a path tree with aggregated sizes",
    )
    parser.add_argument(
        "--debug", help="Enable verbose output", action="store_true", default=False
    )
    parser.add_argument(
        "--version",
        help=f"Display {NAME} version and exit",
        action="version",
        version=VERSION,
    )
    parser.add_argument("fpath", help="ZIM file to work off")
    args = parser.parse_args()

    try:
        # vars() is the public way to view the parsed namespace as a dict.
        tool = ZimLs(**vars(args))
        # SystemExit is not an Exception subclass, so the handler below does
        # not intercept this normal exit.
        sys.exit(tool.run())
    except Exception as exc:
        logger.error(f"FAILED. An error occurred: {exc}")
        if args.debug:
            logger.exception(exc)
        raise SystemExit(1)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment