Skip to content

Instantly share code, notes, and snippets.

@ap-Codkelden
Created January 19, 2018 13:40
Show Gist options
  • Save ap-Codkelden/1ec3a5baa733805e8937a50392b0a4fb to your computer and use it in GitHub Desktop.
Save ap-Codkelden/1ec3a5baa733805e8937a50392b0a4fb to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import lxml.html as html
import time
import requests
import re
import os
from string import ascii_lowercase, digits
from random import choice
# Value of the "Start" query parameter used for the first page request.
START = 1
# Scheme and host prepended to the relative document links found on a page.
SERVER = "https://www8.city-adm.lviv.ua"
# Amount added to the "Start" value after each processed page.
PLUS = 29
# Upper bound for the "Start" value (the main loop adds a margin on top).
END = 1277
# Delay, in seconds, passed to time.sleep() between requests.
PAUSE = 0.7
# Directory that receives the downloaded documents as text files.
DIRECTORY = 'lmr_des'
# Listing-page URL; "{}" is filled with the "Start" value via str.format().
URL_TEMPLATE = SERVER + '/pool/info/doclmr_1.nsf/(RishenniaWeb)?OpenView&Start={}'
# Captures (href, link text) pairs for document links in a listing page.
# Raw string: the pattern value is unchanged, but "\/", "\.", "\(" and "\?"
# are no longer invalid string escapes, and "\\" is a literal backslash.
LINK_RE = re.compile(r"""(\/Pool\\Info\\doclmr_1\.NSF\/\(SearchForWeb\)\/(?:.+?)\?OpenDocument)">(.+?)<\/a>""")
# Alphabet used for the random file-name suffix.
SYMBOLS = ascii_lowercase + digits
def rand_part(length=8):
    """Return a random string of ``length`` characters taken from ``SYMBOLS``.

    Each character is picked independently with ``random.choice``.
    """
    picked = []
    for _ in range(length):
        picked.append(choice(SYMBOLS))
    return ''.join(picked)
def get_page(start=None):
    """Fetch one listing page and extract the document links it contains.

    Args:
        start: Value substituted into the ``Start`` query parameter of
            ``URL_TEMPLATE``.

    Returns:
        ``None`` when the response text contains a "No documents found"
        ``<h2>`` heading (matched case-insensitively); otherwise the list of
        ``(href, link_text)`` tuples returned by ``LINK_RE.findall``, which
        may be empty.
    """
    # Build the URL from the shared template rather than repeating it here.
    response = requests.get(URL_TEMPLATE.format(start))
    print(response.url)
    # Raw string: "\<" and "\>" are invalid escapes in a plain literal and
    # "<" / ">" need no escaping in a regular expression anyway.
    if re.search(r'<h2>No documents found', response.text, re.I):
        return None
    return LINK_RE.findall(response.text)
def get_r(*args):
    """Download one document and save its text content under ``DIRECTORY``.

    Args:
        *args: ``args[0]`` is the document path appended to ``SERVER``;
            ``args[1]`` is the link title used to build the file name.

    The file name is the title reduced to letters, digits and spaces
    (spaces become underscores). Titles whose reduced form is longer than
    100 characters are shortened to their first and last 15 characters
    joined by ``_CUT_``. A 16-character random suffix and ``.txt`` are
    appended. The chosen file name is printed.

    An ``OSError`` while writing the file is not propagated; only its
    ``errno`` is printed. Any other exception propagates to the caller.
    """
    href, title = args[0], args[1]
    response = requests.get(SERVER + href)
    document = html.document_fromstring(response.text)
    text = document.text_content()
    # Drop anything that still looks like an HTML comment in the text.
    content = re.sub(r"(<!--.*?-->)", "", text, flags=re.MULTILINE | re.DOTALL)

    # Keep only characters that are safe in a file name.
    cleaned = "".join(
        ch for ch in title if ch.isalpha() or ch.isdigit() or ch == ' '
    )
    if len(cleaned) > 100:
        name_part = cleaned[:15] + "_CUT_" + cleaned[-15:]
    else:
        name_part = cleaned
    fname = name_part.rstrip().replace(' ', '_') + '_' + rand_part(length=16) + '.txt'
    print(fname)

    try:
        # Explicit UTF-8 so non-ASCII document text is written correctly
        # regardless of the platform's default locale encoding.
        with open(os.path.join(DIRECTORY, fname), 'w', encoding='utf-8') as f:
            f.write(content)
    except OSError as e:
        print(e.errno)
if __name__ == '__main__':
    # NOTE(review): nesting below is reconstructed from a paste that lost its
    # indentation -- confirm that the first sleep belongs inside the
    # per-document loop and the second one at the end of each page.

    # Create the output directory; a no-op when it already exists.
    os.makedirs(DIRECTORY, exist_ok=True)

    start = START
    while start <= END + 100:
        links = get_page(start=start)
        # get_page() returns None or an empty list when nothing is left.
        if not links:
            print('Page not found')
            break
        for link in links:
            get_r(*link)
            # Pause between document downloads.
            time.sleep(PAUSE)
        start += PLUS
        print(start)
        # Pause before requesting the next listing page.
        time.sleep(PAUSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment