Skip to content

Instantly share code, notes, and snippets.

@SebDeclercq
Created September 29, 2023 15:09
Show Gist options
  • Save SebDeclercq/caffbe102bfccbf9c61995ceab89584c to your computer and use it in GitHub Desktop.
Save SebDeclercq/caffbe102bfccbf9c61995ceab89584c to your computer and use it in GitHub Desktop.
from __future__ import annotations
import re
from dataclasses import dataclass
from datetime import datetime
from typing import ClassVar, Final
from urllib.parse import urljoin
import httpx
@dataclass
class MatchedParts:
    """Named groups captured by one match of the archive-listing pattern.

    Two families of fields exist: the plain ones and their ``other_``
    counterparts. Any of them may be ``None``; item access (``parts["size"]``)
    returns whichever of the pair is set.
    """

    # Plain fields.
    filename: str | None
    stem: str | None
    type: str | None
    timestamp: str | None
    size: str | None
    # ``other_``-prefixed counterparts (there is no ``other_stem``).
    other_filename: str | None
    other_type: str | None
    other_timestamp: str | None
    other_size: str | None

    def __getitem__(self, attrname: str) -> str | None:
        """Return the value stored under *attrname*, or under ``other_<attrname>``.

        The plain attribute wins when it is truthy; otherwise the
        ``other_``-prefixed attribute is returned. If either lookup hits an
        attribute that does not exist, ``None`` is returned instead of raising.
        """
        fallback_name = f"other_{attrname}"
        try:
            primary = getattr(self, attrname)
            if primary:
                return primary
            return getattr(self, fallback_name)
        except AttributeError:
            return None
@dataclass
class Archive:
name: str
type: type
url: str
size: str
created_date: datetime
HTML_PARSING_REGEXP: ClassVar[Final[re.Pattern]] = re.compile(
r'''
(
<td><a\shref="(?P<filename>(?P<stem> # Get the entire filename
(?P<type>JORF|LEGI|Freemium) # Get the type
[-0-9_]++ # Match the timestamp in the filename
)\.tar\.gz)">(?P=stem)\.\.&gt;</a></td> # The filename is displayed as text for <a>
<td\s+align="right">(?P<timestamp>[^<]++)</td> # The next cell contains the timestamp
<td\s+align="right">(?P<size>[^<]++)</td> # and the last one the file size
)
| # OR other page style
(
<a\shref="(?P<other_filename> # Get the entire filename
(?P<other_type>JORF|LEGI|Freemium) # Get the type
[-0-9_]++ # Match the timestamp in the filename
\.tar\.gz)">(?P=other_filename)</a> # The filename is displayed as text for <a>
\s+
(?P<other_timestamp>\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}) # The timestamp is matched after a looot of spaces
\s+
(?P<other_size>\S++) # And finally the file size
)
''',
re.X
)
@classmethod
def get_from_source(cls, url: str) -> list[Archive]:
resp: httpx.Response = httpx.get(url)
archives: list[Archive] = []
for archive in cls.HTML_PARSING_REGEXP.finditer(resp.text):
archives.append(cls.parse_matched_data(archive, url))
return archives
@classmethod
def parse_matched_data(cls, match: re.Match, url: str) -> Archive:
values: MatchedParts = MatchedParts(**match.groupdict())
return cls(
name=(name := values["filename"]),
type=values["type"],
url=urljoin(url, name),
size=values["size"],
created_date=datetime.strptime(
values["timestamp"].strip(), r"%Y-%m-%d %H:%M"
),
)
# Fetch and parse the JORF listing when the file runs; the result is not kept.
Archive.get_from_source(
    "https://echanges.dila.gouv.fr/OPENDATA/JORF/",
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment