Created
September 30, 2020 13:07
-
-
Save rebane2001/9683e476ad361c2b949f239c15a10928 to your computer and use it in GitHub Desktop.
Laeb alla fotoalbum.ee albumeid
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib.request | |
import re | |
# Root of the site; the photo-page links scraped from an album are
# site-relative paths that get appended to this.
base_url = "http://fotoalbum.ee"

# Ask for the album link and normalise it. Surrounding whitespace from a
# copy-paste, a "#fragment" and any query string are dropped, because the
# crawl appends its own "?page=N" to this value.
album = input("Sisesta fotoalbum.ee albumi link:").strip()
album = album.split("#")[0].split("?")[0]
if "/sets/" not in album:
    # Only warn: the crawl is still attempted with whatever was entered.
    print("Hoiatus: Link pole album ning võib seetõttu valesti toimida")

# Photo-page links collected while crawling the album pages.
pildid = []
# Transliteration table used by sanitize_filename() in restricted mode: every
# accented character listed here maps to a plain-ASCII replacement. It must
# exist at module level, otherwise restricted mode fails with a NameError.
ACCENT_CHARS = {
    accented: plain
    for plain, accented_group in (
        ('A', 'ÂÃÄÀÁÅ'), ('AE', 'Æ'), ('C', 'Ç'), ('E', 'ÈÉÊË'),
        ('I', 'ÌÍÎÏ'), ('D', 'Ð'), ('N', 'Ñ'), ('O', 'ÒÓÔÕÖŐØ'),
        ('OE', 'Œ'), ('S', 'Š'), ('U', 'ÙÚÛÜŰ'), ('Y', 'Ý'), ('Z', 'Ž'),
        ('TH', 'Þ'), ('ss', 'ß'),
        ('a', 'âãäàáå'), ('ae', 'æ'), ('c', 'ç'), ('e', 'èéêë'),
        ('i', 'ìíîï'), ('d', 'ð'), ('n', 'ñ'), ('o', 'òóôõöőø'),
        ('oe', 'œ'), ('s', 'š'), ('u', 'ùúûüű'), ('y', 'ýÿ'), ('z', 'ž'),
        ('th', 'þ'),
    )
    for accented in accented_group
}


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitize a string so it can be used as part of a filename.

    Args:
        s: The string to clean up.
        restricted: If true, use a stricter subset of allowed characters:
            accented letters found in ACCENT_CHARS are transliterated,
            whitespace and shell-unfriendly punctuation become '_', and any
            remaining non-ASCII character becomes '_'.
        is_id: Set this if ``s`` is not an arbitrary string but an ID that
            should be kept as intact as possible; the underscore/dash/dot
            clean-up pass is then skipped.

    Returns:
        The sanitized string. It is never empty: when nothing usable is left
        (and ``is_id`` is false) the result is '_'.
    """
    def replace_insane(char):
        # Map one character to its filename-safe replacement.
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            # Question marks and control characters are dropped entirely.
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps: "12:34:56" becomes "12_34_56" before the generic
    # ':' replacement gets a chance to turn the colons into dashes.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of underscores, then trim them from both ends.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        # A leading dash is replaced by an underscore.
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        # Leading dots are removed.
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
# --- Step 1: collect the photo-page links from every album page ------------
# Pages are requested with an increasing "?page=N". The crawl stops at the
# first page that yields no link that has not been seen before; this covers
# both an empty page and a page that merely repeats earlier links.
seen_links = set(pildid)
page = 1
while True:
    print(f"Laeb albumi lehte {page}")
    # "with" closes the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(f"{album}?page={page}") as response:
        page_html = response.read().decode("utf-8")
    new_links = []
    for link in re.findall(r"/photos/[^\/]+/[0-9]+", page_html):
        # The same photo link can match several times on one page; keep the
        # first occurrence only so each photo is downloaded once.
        if link not in seen_links:
            seen_links.add(link)
            new_links.append(link)
    if not new_links:
        break
    pildid += new_links
    page += 1
print(f"Leitud {len(pildid)} pilti")

# --- Step 2: open every photo page and download the image it shows ---------
used_filenames = set()
for pilt in pildid:
    photo_url = f"{base_url}{pilt}"
    with urllib.request.urlopen(photo_url) as response:
        photo_html = response.read().decode("utf-8")
    # Group 1 is the image address, group 2 the alt text used as the title.
    results = re.search(r'<img src="([^"]*)" border="[0-9]*" alt="([^"]*)" vspace="[0-9]*">', photo_html)
    if results is None:
        # Page layout did not match: skip this photo instead of crashing.
        print(f"Hoiatus: lehelt {photo_url} ei leitud pilti, jätan vahele")
        continue
    # The src value is prefixed with "http:" to form the download URL
    # (presumably it is protocol-relative, i.e. starts with "//").
    dlurl = f"http:{results.group(1)}"
    filename = sanitize_filename(results.group(2))
    print(filename)
    # Make sure the saved file carries the extension of the download URL.
    ext = dlurl.split(".")[-1]
    if not filename.endswith(f".{ext}"):
        filename += f".{ext}"
    if filename in used_filenames:
        # Two photos share a title (or have none): append the numeric photo
        # id from the link so the earlier download is not overwritten.
        photo_id = pilt.rsplit("/", 1)[-1]
        filename = f"{filename[:-len(ext) - 1]}_{photo_id}.{ext}"
    used_filenames.add(filename)
    urllib.request.urlretrieve(dlurl, filename)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment