Created
April 18, 2022 20:52
-
-
Save zajdee/dc0404b7a9a7bfb2e23fb30db0f3f97a to your computer and use it in GitHub Desktop.
RUIAN - byty parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Prefix -> namespace URI mapping handed to every findall() call in this
# script so that XPath expressions can use the short prefixes.
namespaces = dict(
    gml='http://www.opengis.net/gml/3.2',
    xlink='http://www.w3.org/1999/xlink',
    xsi='http://www.w3.org/2001/XMLSchema-instance',
    ami='urn:cz:isvs:ruian:schemas:AdrMistoIntTypy:v1',
    base='urn:cz:isvs:ruian:schemas:BaseTypy:v1',
    coi='urn:cz:isvs:ruian:schemas:CastObceIntTypy:v1',
    com='urn:cz:isvs:ruian:schemas:CommonTypy:v1',
    kui='urn:cz:isvs:ruian:schemas:KatUzIntTypy:v1',
    kri='urn:cz:isvs:ruian:schemas:KrajIntTypy:v1',
    mci='urn:cz:isvs:ruian:schemas:MomcIntTypy:v1',
    mpi='urn:cz:isvs:ruian:schemas:MopIntTypy:v1',
    obi='urn:cz:isvs:ruian:schemas:ObecIntTypy:v1',
    oki='urn:cz:isvs:ruian:schemas:OkresIntTypy:v1',
    opi='urn:cz:isvs:ruian:schemas:OrpIntTypy:v1',
    pai='urn:cz:isvs:ruian:schemas:ParcelaIntTypy:v1',
    pui='urn:cz:isvs:ruian:schemas:PouIntTypy:v1',
    rsi='urn:cz:isvs:ruian:schemas:RegSouIntiTypy:v1',
    spi='urn:cz:isvs:ruian:schemas:SpravObvIntTypy:v1',
    sti='urn:cz:isvs:ruian:schemas:StatIntTypy:v1',
    soi='urn:cz:isvs:ruian:schemas:StavObjIntTypy:v1',
    uli='urn:cz:isvs:ruian:schemas:UliceIntTypy:v1',
    vci='urn:cz:isvs:ruian:schemas:VuscIntTypy:v1',
    vf='urn:cz:isvs:ruian:schemas:VymennyFormatTypy:v1',
    zji='urn:cz:isvs:ruian:schemas:ZsjIntTypy:v1',
    voi='urn:cz:isvs:ruian:schemas:VOIntTypy:v1',
)
""" | |
Get a new base URL at https://vdp.cuzk.cz/vdp/ruian/vymennyformat/vyhledej
In the search form select:
- Platnost údajů (data validity): Platné (valid)
- Časový rozsah (time range): Úplná kopie (full copy)
- Územní prvky (territorial elements): Obec a podřazené (municipality and subordinate elements)
- Datová sada (data set): Základní (basic)
- Výběr z údajů (data selection): e.g. Základní údaje (basic data)
- Územní omezení (territorial restriction): ČR (Czech Republic)
Save the list of municipalities to a file named `id_obce`; one line = one municipality ID.
- Use the CSU (Czech Statistical Office) code lists, e.g. 554782 = Praha.
The script produces a tab-separated file `ruian-byty.tsv` with the columns:
- id_obce (municipality code from the CSU code list)
- ruian (RUIAN ID of the address)
- stavebni_objekt (one building object can have several RUIAN IDs)
- byty_dum (total number of flats in one building object)
- byty_vchod (number of flats in one entrance/RUIAN ID; sometimes missing in the data)
- zpusob_vyuziti (usage type)
""" | |
# Download URL template; the single `{}` placeholder is filled with the
# municipality ID via BASE_URL.format(id_obce).
# NOTE(review): the `20220331` part looks like an export date that has to be
# refreshed for newer data - confirm against the instructions above.
BASE_URL = (
    'https://vdp.cuzk.cz/vymenny_format/soucasna/'
    '20220331_OB_{}_UZSZ.xml.zip'
)

# Path of the tab-separated result file.
OUTFILE = "ruian-byty.tsv"
import gzip | |
import sys | |
import os | |
import io | |
import zipfile | |
from urllib.request import urlopen | |
from io import StringIO | |
from xml.dom.minidom import parse, parseString | |
from xml.etree.ElementTree import ElementTree | |
from pprint import pprint | |
def parse_tea(id_objektu, vchod, vchody):
    """Store the flat count of one entrance under each of its RUIAN codes.

    Args:
        id_objektu: Code of the building object the entrance belongs to.
        vchod: XML element searched for ``soi:PocetBytu`` and
            ``soi:AdresniMistoKod/base:Kod`` children.
        vchody: Mapping ``{id_objektu: {ruian: flat count}}``, updated in place.

    Prints a diagnostic and exits the process with status 2 when the entrance
    has no ``soi:AdresniMistoKod/base:Kod`` element.
    """
    flat_counts = vchod.findall('./soi:PocetBytu', namespaces)
    # Some entrances carry no flat count in RUIAN (e.g. 22055843 in Prague);
    # they are recorded with the placeholder -1.
    byty = int(flat_counts[0].text) if flat_counts else -1

    address_codes = vchod.findall('./soi:AdresniMistoKod/base:Kod', namespaces)
    if not address_codes:
        print('ruian NENALEZEN')
        pprint(id_objektu)
        sys.exit(2)

    # An entrance may list several RUIAN codes, so register every one of them.
    for code_element in address_codes:
        ruian = int(code_element.text)
        vchody.setdefault(id_objektu, {})[ruian] = byty
def parse_so(e, vchody, domy):
    """Extract flat counts and usage code from one building-object element.

    Args:
        e: XML element searched for ``soi:Kod``, nested ``soi:DetailniTEA``,
            ``soi:PocetBytu`` and ``soi:ZpusobVyuzitiKod`` children.
        vchody: Per-entrance mapping filled by ``parse_tea``; updated in place.
        domy: Mapping ``{id_objektu: {'byty': int, 'vyuziti': int}}``; updated
            in place. A missing value is stored as -1.

    Elements without a ``soi:Kod`` child are skipped. More than one direct
    ``soi:PocetBytu`` child prints a message and exits with status 1.
    """
    code_elements = e.findall('./soi:Kod', namespaces)
    if not code_elements:
        return
    id_objektu = int(code_elements[0].text)

    # Every nested DetailniTEA element is handled as one entrance.
    for vchod in e.findall('./soi:DetailniTEA/soi:DetailniTEA', namespaces):
        parse_tea(id_objektu, vchod, vchody)

    dum = domy.setdefault(id_objektu, {})

    flat_counts = e.findall('./soi:PocetBytu', namespaces)
    if len(flat_counts) > 1:
        print('vice nez jeden zaznam o poctu bytu bez TEA')
        sys.exit(1)
    dum['byty'] = int(flat_counts[0].text) if flat_counts else -1

    usage_codes = e.findall('./soi:ZpusobVyuzitiKod', namespaces)
    dum['vyuziti'] = int(usage_codes[0].text) if usage_codes else -1
def parse_am(e, adresnimista):
    """Map one address-point code to the code of its building object.

    Args:
        e: XML element searched for ``ami:Kod`` and
            ``ami:StavebniObjekt/soi:Kod`` children.
        adresnimista: Mapping ``{address code: building object code}``;
            updated in place. -1 is stored when no building object is given.

    Elements without an ``ami:Kod`` child are skipped.
    """
    code_elements = e.findall('./ami:Kod', namespaces)
    if not code_elements:
        return
    id_objektu = int(code_elements[0].text)

    building_codes = e.findall('./ami:StavebniObjekt/soi:Kod', namespaces)
    adresnimista[id_objektu] = int(building_codes[0].text) if building_codes else -1
def parse_file(file_name):
    """Parse one exchange-format XML file into three lookup tables.

    Args:
        file_name: Path of the XML file to load.

    Returns:
        Tuple ``(vchody, domy, adresnimista)`` as filled by ``parse_so`` and
        ``parse_am``.
    """
    building_path = './vf:Data/vf:StavebniObjekty/vf:StavebniObjekt'
    address_path = './vf:Data/vf:AdresniMista/vf:AdresniMisto'

    print('Opening file...')
    tree = ElementTree(file=file_name)
    vchody, domy, adresnimista = {}, {}, {}

    print('Parsing...')
    for building in tree.findall(building_path, namespaces):
        parse_so(building, vchody, domy)
    for address in tree.findall(address_path, namespaces):
        parse_am(address, adresnimista)
    print('Parsing complete.')

    return (vchody, domy, adresnimista)
def pocet_bytu(ruian, so_id, vchody, domy):
    """Look up the flat counts and usage code for one address.

    Args:
        ruian: Address code used as key inside ``vchody[so_id]``.
        so_id: Building object code used as key in ``vchody`` and ``domy``.
        vchody: Mapping ``{so_id: {ruian: flat count}}``.
        domy: Mapping ``{so_id: {'byty': ..., 'vyuziti': ...}}``.

    Returns:
        Tuple ``(byty_dum, byty_vchod, vyuziti)``. A component is -3 when
        ``so_id`` is absent from the respective table and -2 when ``so_id`` is
        present but the inner key is missing.
    """
    byty_vchod = vchody[so_id].get(ruian, -2) if so_id in vchody else -3

    if so_id not in domy:
        return -3, byty_vchod, -3

    dum = domy[so_id]
    return dum.get('byty', -2), byty_vchod, dum.get('vyuziti', -2)
def byty(id_obce, adresnimista, vchody, domy, outfile):
    """Write one tab-separated row per address point to ``outfile``.

    Args:
        id_obce: Municipality ID written in the first column of every row.
        adresnimista: Mapping ``{address code: building object code}``.
        vchody: Per-entrance flat counts, passed through to ``pocet_bytu``.
        domy: Per-building data, passed through to ``pocet_bytu``.
        outfile: Object with a ``write`` method that accepts text.
    """
    row_format = "{}\t{}\t{}\t{}\t{}\t{}\n"
    print('Writing...')
    for ruian, so_id in adresnimista.items():
        # pocet_bytu yields (byty_dum, byty_vchod, vyuziti), which is already
        # the order of the last three columns.
        outfile.write(row_format.format(
            id_obce, ruian, so_id, *pocet_bytu(ruian, so_id, vchody, domy)))
    print('Writing complete.')
def download_gz(id_obce, outfile_path):
    """Download a gzip-compressed export and store it decompressed.

    Args:
        id_obce: Municipality ID substituted into ``BASE_URL``.
        outfile_path: Path the decompressed bytes are written to.

    NOTE(review): ``BASE_URL`` in this file ends in ``.xml.zip`` and
    ``xml_to_bytlist`` calls ``download_zip`` instead, so this helper looks
    like a leftover for gzip-served exports - confirm before relying on it.
    """
    url = BASE_URL.format(id_obce)
    print('Downloading {}...'.format(url))
    # Close the HTTP response deterministically once the payload is read.
    with urlopen(url) as response:
        compressed = response.read()
    # The payload and the decompressed result are bytes, so the buffer and the
    # output file must both be binary (the previous text-mode StringIO / 'w'
    # combination could not work on Python 3).
    with gzip.GzipFile(fileobj=io.BytesIO(compressed), mode='rb') as archive:
        with open(outfile_path, 'wb') as outfile:
            outfile.write(archive.read())
    print('Download complete.')
def download_zip(id_obce, outfile_path):
    """Download a zip-compressed export and extract its first member.

    Args:
        id_obce: Municipality ID substituted into ``BASE_URL``.
        outfile_path: Path the extracted member is written to (binary mode).

    Raises:
        IndexError: If the downloaded archive contains no members.
    """
    url = BASE_URL.format(id_obce)
    print('Downloading {}...'.format(url))
    # Close the HTTP response as soon as the archive is fully in memory.
    with urlopen(url) as response:
        archive_bytes = io.BytesIO(response.read())
    with zipfile.ZipFile(archive_bytes) as archive:
        # Only the first archive member is extracted.
        first_member = archive.namelist()[0]
        with open(outfile_path, 'wb') as outfile:
            outfile.write(archive.read(first_member))
    print('Download complete.')
def xml_to_bytlist(id_obce, outfile_path):
    """Download, parse and append the rows of one municipality.

    Args:
        id_obce: Municipality ID to download and process.
        outfile_path: Path of the TSV file the rows are appended to.
    """
    # Local import: the file-level import block does not provide this module.
    import tempfile

    # A unique temporary file replaces the fixed '/tmp/parse.xml' path, which
    # was not portable and collided between concurrent runs.
    fd, xml_path = tempfile.mkstemp(suffix='.xml')
    os.close(fd)  # download_zip reopens the file by path
    try:
        download_zip(id_obce, xml_path)
        (vchody, domy, adresnimista) = parse_file(xml_path)
    finally:
        # Remove the temporary file even when download or parsing fails.
        os.unlink(xml_path)

    with open(outfile_path, 'a') as outfile:
        byty(id_obce, adresnimista, vchody, domy, outfile)
    print('Job complete.')
def load_obce():
    """Read the municipality IDs to process from the file ``id_obce``.

    The file is looked up in the current working directory and holds one
    integer ID per line (e.g. 500496 = Olomouc, 554782 = Praha). Blank and
    whitespace-only lines are skipped.

    Returns:
        List of municipality IDs as integers, in file order.

    Raises:
        ValueError: If a non-blank line is not a valid integer.
    """
    with open('id_obce', 'r') as obcefile:
        # int() tolerates surrounding whitespace but not an empty string, so
        # blank lines (such as a trailing empty line) have to be filtered out.
        return [int(line) for line in obcefile if line.strip()]
def main():
    """Rebuild ``OUTFILE`` from scratch for every ID returned by ``load_obce``."""
    print('Start processing...')
    # Start from a clean file; everything below only appends to it.
    if os.path.isfile(OUTFILE):
        os.unlink(OUTFILE)
    # write header
    with open(OUTFILE, 'a') as outfile:
        outfile.write('id_obce\truian\tstavebni_objekt\tbyty_dum\tbyty_vchod\tzpusob_vyuziti\n')
    # write data
    for id_obce in load_obce():
        xml_to_bytlist(id_obce, OUTFILE)
    print('Processing complete.')


# Run the job only when executed as a script, so that importing this module
# (e.g. to reuse the parsers) does not start downloading.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment