Created
December 23, 2018 19:00
-
-
Save ap-Codkelden/4cf8c1792047bf0cf7b32d7e09735b64 to your computer and use it in GitHub Desktop.
gergstat_merger
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
import csv | |
import sys | |
import re | |
from os import scandir | |
# Rows collected from every processed file; the script at the bottom of this
# file extends it per input file and writes it out at the end.
main_container = []

# Leading text that marks an input file as table No. 45.
TABL45 = ''';;;;;"Табл.45.1 (дол)"'''


def check_45(path):
    """Report whether the file at *path* starts with the ``TABL45`` marker.

    Only the first ``len(TABL45)`` characters are read, decoded with the
    platform default text encoding.  A file whose leading bytes cannot be
    decoded cannot start with the marker, so it is reported as a non-match
    rather than aborting the caller's directory scan.

    Args:
        path: Path of the text file to inspect.

    Returns:
        ``True`` if the file begins with ``TABL45``, otherwise ``False``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    try:
        with open(path) as f:
            # Read exactly as many characters as the marker has instead of
            # repeating its length as a magic number.
            return f.read(len(TABL45)) == TABL45
    except UnicodeDecodeError:
        return False
# Single-cell rows consisting solely of one of these fragments are dropped.
_FRAGMENT_CELLS = ('а', 'в', 'з', 'о', 'ч', ', р')
# First-cell values that mark a heading row.
_GROUP_HEADINGS = ('ВСЬОГО', 'АЗІЯ', 'АФРИКА', 'ЄВРОПА', 'АМЕРИКА')
# Raw strings keep ``\d`` and ``\.`` from being parsed as (invalid) string
# escapes; the patterns themselves are unchanged.
_NUMBERED_HEADING_RE = re.compile(r'^II?\. .+?$')
_ZERO_TOTAL_RE = re.compile(r"^0\nВсього$")
_SECTION_RE = re.compile(r"^(\d{9,10})\n.+?$")


def process_45(path):
    """Parse one table-45 file into a list of normalised data rows.

    The file is read as ``;``-delimited CSV with ``"`` as the quote
    character, and every cell is stripped of surrounding whitespace.  Rows
    are then classified by their first cell:

    * heading rows (a name from ``_GROUP_HEADINGS`` or text starting with
      ``I. `` / ``II. ``) are skipped, and so is the row right after one
      if its first cell is empty;
    * a cell of the form ``<9-10 digits>\\n<text>`` starts a new section;
      the digits, left-padded with zeros to 10 characters, become the
      section code for the rows that follow;
    * rows with no non-empty cell are skipped;
    * every other row is a data row.  An empty first cell is filled with
      the most recent non-empty first cell seen on a data row.

    Args:
        path: ``/``-separated path of the file.  The last four characters
            of its second component are used as the year (presumably a
            directory name ending in the year -- confirm against callers).

    Returns:
        A list of rows, each ``[year, section, *cells]``.

    Raises:
        IndexError: If *path* contains no ``/``.
        AttributeError: If a data row has an empty first cell before any
            row with a non-empty first cell was seen; the offending row
            is printed first.
    """
    year = path.split('/')[1][-4:]
    with open(path) as cf:
        rows = list(csv.reader(cf, quotechar='"', delimiter=';'))

    container = []
    section = None
    next_empty = False
    country = None
    for r in rows:
        stripped = [x.strip() for x in r]
        if not stripped:
            # A blank line yields an empty row; there is no first cell.
            continue
        # NOTE(review): this code was restored from a copy that had lost
        # its indentation.  Only the fragment check is assumed to be
        # limited to single-cell rows -- confirm against real input files.
        if len(stripped) == 1 and stripped[0] in _FRAGMENT_CELLS:
            continue
        if next_empty and stripped[0] == '':
            # Continuation row of the heading that was just skipped.
            next_empty = False
            continue
        if (stripped[0] in _GROUP_HEADINGS
                or _NUMBERED_HEADING_RE.match(stripped[0])):
            next_empty = True
            continue
        if _ZERO_TOTAL_RE.match(stripped[0]):
            continue
        section_match = _SECTION_RE.match(stripped[0])
        if section_match:
            section = section_match.group(1).zfill(10)
            continue
        if not any(stripped):
            continue

        # The carried-over name is updated *before* the header filters
        # below, so continuation rows of a skipped header row inherit the
        # header text and are filtered out as well.
        if stripped[0]:
            country = stripped[0]
        else:
            stripped[0] = country
        new_row = [year, section]
        new_row.extend(stripped)
        try:
            if new_row[-1] == 'Табл.45.1 (дол)':
                continue
            elif new_row[2] == 'Найменування' or new_row[2].startswith('Зовнішня торгівля'):
                continue
        except AttributeError:
            # new_row[2] is None: show the row that could not be attributed.
            print(new_row)
            raise
        container.append(new_row)
    return container
def scantree(path):
    """Recursively yield DirEntry objects for given directory.

    Real sub-directories are descended into; every other entry, including
    a symbolic link that points at a directory, is yielded as-is.

    Args:
        path: Directory to walk.

    Yields:
        ``os.DirEntry`` objects for each non-directory entry below *path*.
    """
    # The context manager closes the directory handle even when the
    # consumer stops iterating before the listing is exhausted.
    with scandir(path) as entries:
        for entry in entries:
            if entry.is_dir(follow_symlinks=False):
                yield from scantree(entry.path)
            else:
                yield entry
# Collect the rows of every table-45 ``.txt`` file found under the directory
# given as the first command-line argument (the current directory if absent).
for entry in scantree(sys.argv[1] if len(sys.argv) > 1 else '.'):
    if not entry.path.endswith('.txt'):
        continue
    if not check_45(entry.path):
        print(f"File {entry.path} is not table No. 45\n")
        continue
    main_container.extend(process_45(entry.path))

# newline='' leaves line endings to the csv writer; without it, platforms
# that translate '\n' on write would turn each row ending into '\r\r\n'.
with open('merged.csv', 'w', newline='') as mf:
    writer = csv.writer(mf, quotechar='"', delimiter=';', quoting=csv.QUOTE_MINIMAL)
    writer.writerows(main_container)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment