SebDeclercq · September 29, 2023 15:09
diff --git a/jorf.py b/jorf.py
 from __future__ import annotations
 import re
 from dataclasses import dataclass
 from datetime import datetime
 from typing import ClassVar, Final
 from urllib.parse import urljoin
 import httpx


 @dataclass
 class MatchedParts:
    filename: str | None
    stem: str | None
    type: str | None
    timestamp: str | None
    size: str | None
    other_filename: str | None
    other_type: str | None
    other_timestamp: str | None
    other_size: str | None

    def __getitem__(self, attrname: str) -> str | None:
       try:
           return getattr(self, attrname) or getattr(self, f"other_{attrname}")
       except AttributeError:
           return None


 @dataclass
 class Archive:
    name: str
    type: type
    url: str
    size: str
    created_date: datetime

    HTML_PARSING_REGEXP: ClassVar[Final[re.Pattern]] = re.compile(
        r'''
        (
            <td><a\shref="(?P<filename>(?P<stem> # Get the entire filename
                (?P<type>JORF|LEGI|Freemium)     # Get the type
                [-0-9_]++                        # Match the timestamp in the filename
            )\.tar\.gz)">(?P=stem)\.\.&gt;</a></td>         # The filename is displayed as text for <a>
            <td\s+align="right">(?P<timestamp>[^<]++)</td>  # The next cell contains the timestamp
            <td\s+align="right">(?P<size>[^<]++)</td> # and the last one the file size
        )
        |  # OR other page style
        (
            <a\shref="(?P<other_filename>           # Get the entire filename
                (?P<other_type>JORF|LEGI|Freemium)  # Get the type
                [-0-9_]++                           # Match the timestamp in the filename
            \.tar\.gz)">(?P=other_filename)</a>     # The filename is displayed as text for <a>
            \s+
            (?P<other_timestamp>\d{4}-\d{2}-\d{2}\s\d{2}:\d{2})  # The timestamp is matched after a looot of spaces
            \s+
            (?P<other_size>\S++)  # And finally the file size
        )
        ''',
        re.X
    )

    @classmethod
    def get_from_source(cls, url: str) -> list[Archive]:
        resp: httpx.Response = httpx.get(url)
        archives: list[Archive] = []
        for archive in cls.HTML_PARSING_REGEXP.finditer(resp.text):
            archives.append(cls.parse_matched_data(archive, url))
        return archives

    @classmethod
    def parse_matched_data(cls, match: re.Match, url: str) -> Archive:
        values: MatchedParts = MatchedParts(**match.groupdict())
        return cls(
            name=(name := values["filename"]),
            type=values["type"],
            url=urljoin(url, name),
            size=values["size"],
            created_date=datetime.strptime(
                values["timestamp"].strip(), r"%Y-%m-%d %H:%M"
            ),
        )

 # Module-level call: fetches and parses the JORF listing; the result is discarded.
 Archive.get_from_source(
     url="https://echanges.dila.gouv.fr/OPENDATA/JORF/",
 )
	from __future__ import annotations
	import re
	from dataclasses import dataclass
	from datetime import datetime
	from typing import ClassVar, Final
	from urllib.parse import urljoin
	import httpx


	@dataclass
	class MatchedParts:
	filename: str \| None
	stem: str \| None
	type: str \| None
	timestamp: str \| None
	size: str \| None
	other_filename: str \| None
	other_type: str \| None
	other_timestamp: str \| None
	other_size: str \| None

	def __getitem__(self, attrname: str) -> str \| None:
	try:
	return getattr(self, attrname) or getattr(self, f"other_{attrname}")
	except AttributeError:
	return None


	@dataclass
	class Archive:
	name: str
	type: type
	url: str
	size: str
	created_date: datetime

	HTML_PARSING_REGEXP: ClassVar[Final[re.Pattern]] = re.compile(
	r'''
	(
	<td><a\shref="(?P<filename>(?P<stem> # Get the entire filename
	(?P<type>JORF\|LEGI\|Freemium) # Get the type
	[-0-9_]++ # Match the timestamp in the filename
	)\.tar\.gz)">(?P=stem)\.\.></a></td> # The filename is displayed as text for <a>
	<td\s+align="right">(?P<timestamp>[^<]++)</td> # The next cell contains the timestamp
	<td\s+align="right">(?P<size>[^<]++)</td> # and the last one the file size
	)
	\| # OR other page style
	(
	<a\shref="(?P<other_filename> # Get the entire filename
	(?P<other_type>JORF\|LEGI\|Freemium) # Get the type
	[-0-9_]++ # Match the timestamp in the filename
	\.tar\.gz)">(?P=other_filename)</a> # The filename is displayed as text for <a>
	\s+
	(?P<other_timestamp>\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}) # The timestamp is matched after a looot of spaces
	\s+
	(?P<other_size>\S++) # And finally the file size
	)
	''',
	re.X
	)

	@classmethod
	def get_from_source(cls, url: str) -> list[Archive]:
	resp: httpx.Response = httpx.get(url)
	archives: list[Archive] = []
	for archive in cls.HTML_PARSING_REGEXP.finditer(resp.text):
	archives.append(cls.parse_matched_data(archive, url))
	return archives

	@classmethod
	def parse_matched_data(cls, match: re.Match, url: str) -> Archive:
	values: MatchedParts = MatchedParts(**match.groupdict())
	return cls(
	name=(name := values["filename"]),
	type=values["type"],
	url=urljoin(url, name),
	size=values["size"],
	created_date=datetime.strptime(
	values["timestamp"].strip(), r"%Y-%m-%d %H:%M"
	),
	)

	# Module-level call: fetches and parses the JORF listing; the result is discarded.
	Archive.get_from_source(
	    url="https://echanges.dila.gouv.fr/OPENDATA/JORF/",
	)
No results found