Created
January 19, 2018 13:40
-
-
Save ap-Codkelden/1ec3a5baa733805e8937a50392b0a4fb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import lxml.html as html | |
import time | |
import requests | |
import re | |
import os | |
from string import ascii_lowercase, digits | |
from random import choice | |
# Value of the `Start` query parameter for the first index page requested.
START = 1
# Scheme and host that the relative document links are appended to.
SERVER = "https://www8.city-adm.lviv.ua"
# Amount added to `Start` after each index page
# (presumably the number of entries shown per page -- confirm against the site).
PLUS = 29
# Upper bound for `Start`; the main loop adds a margin of 100 on top of it.
END = 1277
# Seconds to sleep between consecutive HTTP requests.
PAUSE = 0.7
# Directory the downloaded documents are written to.
DIRECTORY = 'lmr_des'
# Index page URL; `{}` is replaced with the `Start` value.
URL_TEMPLATE = SERVER + '/pool/info/doclmr_1.nsf/(RishenniaWeb)?OpenView&Start={}'
# Captures (relative link, link text) for every document anchor on an index
# page.  Written as a raw string so the regex escapes are not treated as
# (invalid) string escapes; `\\` matches the literal backslashes in the path.
LINK_RE = re.compile(
    r'(/Pool\\Info\\doclmr_1\.NSF/\(SearchForWeb\)/(?:.+?)\?OpenDocument)">(.+?)</a>'
)
# Alphabet used for the random suffix of output file names.
SYMBOLS = ascii_lowercase + digits
def rand_part(length=8):
    """Return a random string of `length` characters drawn from SYMBOLS."""
    picked = [choice(SYMBOLS) for _ in range(length)]
    return ''.join(picked)
def get_page(start=None, timeout=30):
    """Fetch one index page and return the document links found on it.

    Args:
        start: Value for the `Start` query parameter.  When omitted,
            START is used instead of sending the literal text "None".
        timeout: Seconds to wait for the server before `requests` raises
            a timeout error, so a stalled connection cannot hang the run.

    Returns:
        A list of `(relative_link, link_text)` tuples as captured by
        LINK_RE (possibly empty), or None when the page contains the
        "No documents found" heading.
    """
    if start is None:
        start = START
    r = requests.get(URL_TEMPLATE.format(start), timeout=timeout)
    print(r.url)
    # The server signals the end of the listing with this heading.
    if re.search(r'<h2>No documents found', r.text, re.I):
        return None
    return LINK_RE.findall(r.text)
def _title_to_name_part(title):
    """Reduce a link title to the fragment used at the start of a file name."""
    # Keep only letters, digits and spaces so the result is safe in a path.
    kept = "".join(
        ch for ch in title if ch.isalpha() or ch.isdigit() or ch == ' '
    )
    # Very long titles are shortened to their first and last 15 characters.
    if len(kept) > 100:
        kept = kept[:15] + "_CUT_" + kept[-15:]
    return kept.rstrip().replace(' ', '_')


def get_r(*args, timeout=30):
    """Download one document and save its text content under DIRECTORY.

    Args:
        *args: `args[0]` is the link relative to SERVER and `args[1]` is
            the link title the file name is derived from.
        timeout: Seconds to wait for the server before `requests` raises
            a timeout error.

    The file name is the sanitised title, an underscore, a 16-character
    random suffix and the `.txt` extension.  A failure to write the file
    is reported on stdout and does not abort the run; any other error
    propagates to the caller.
    """
    link, title = args[0], args[1]
    r = requests.get(SERVER + link, timeout=timeout)
    document = html.document_fromstring(r.text)
    # Drop anything that still looks like an HTML comment in the extracted text.
    content = re.sub(
        "(<!--.*?-->)", "", document.text_content(),
        flags=re.MULTILINE | re.DOTALL,
    )
    fname = _title_to_name_part(title) + '_' + rand_part(length=16) + '.txt'
    print(fname)
    try:
        # Explicit UTF-8 so non-ASCII text is written correctly regardless
        # of the platform's default encoding.
        with open(os.path.join(DIRECTORY, fname), 'w', encoding='utf-8') as f:
            f.write(content)
    except OSError as e:
        print(f'Cannot write {fname}: {e}')
def main():
    """Walk the index pages and download every linked document.

    Starts at START and advances by PLUS per page until `Start` exceeds
    END + 100 or a page yields no links, sleeping PAUSE seconds after
    every document and after every page.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(DIRECTORY, exist_ok=True)
    start = START
    while start <= END + 100:
        links = get_page(start=start)
        if not links:
            print('Page not found')
            break
        for link in links:
            get_r(*link)
            time.sleep(PAUSE)
        start += PLUS
        print(start)
        time.sleep(PAUSE)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment