# Source: GitHub Gist "gergstat_merger" by @ap-Codkelden
# (gist ap-Codkelden/4cf8c1792047bf0cf7b32d7e09735b64), created December 23, 2018 19:00.
# The lines of GitHub page navigation that preceded the code ("Skip to content",
# "Instantly share code, notes, and snippets.", "Show Gist options",
# "Save ... to your computer and use it in GitHub Desktop.") are not part of the script.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import csv
import sys
import re
from os import scandir
# Module-level accumulator: the script section at the end of the file extends
# it with the rows returned by process_45() and then writes it to merged.csv.
main_container = list()

# Exact leading text that check_45() compares against the start of a file.
TABL45 = ';;;;;"Табл.45.1 (дол)"'
def check_45(path):
    """Return True when the file at *path* starts with the TABL45 marker.

    Only as many characters as the marker contains are read, so a file is
    never loaded in full just to be rejected.

    Args:
        path: Path of the text file to probe.

    Returns:
        True if the leading characters of the file equal ``TABL45`` exactly,
        False otherwise (including files shorter than the marker).
    """
    # NOTE(review): the file is decoded with the platform default encoding,
    # exactly as before -- confirm that matches the input files.
    with open(path) as f:
        # Take the read size from the marker itself so the two cannot drift
        # apart if TABL45 is ever edited (it used to be a hard-coded 22).
        return f.read(len(TABL45)) == TABL45
# Cell values and patterns that identify the single-cell, non-data rows
# handled by process_45().  The patterns are raw strings, so ``\.`` and ``\d``
# reach the regex engine without relying on invalid string escapes, and they
# are compiled once instead of being looked up again for every row.
_STRAY_CELLS = ('а', 'в', 'з', 'о', 'ч', ', р')
_BLOCK_HEADINGS = ('ВСЬОГО', 'АЗІЯ', 'АФРИКА', 'ЄВРОПА', 'АМЕРИКА')
_PART_HEADING_RE = re.compile(r'^II?\. .+?$')
_ZERO_TOTAL_RE = re.compile(r'^0\nВсього$')
_SECTION_RE = re.compile(r'^(\d{9,10})\n.+?$')


def process_45(path):
    """Parse one ';'-delimited table file into flat rows.

    Every returned row has the form ``[year, section, *cells]`` where

    * ``year`` is the last four characters of the second '/'-separated
      component of *path*;
    * ``section`` is the most recent 9-10 digit code seen in a single-cell
      row of the form ``"<digits>\\n<text>"``, left-padded with zeros to
      10 characters (``None`` until the first such row);
    * ``cells`` are the whitespace-stripped cells of the row, with an empty
      first cell replaced by the last non-empty first cell seen so far.

    Heading rows, stray single-cell rows, completely empty rows and the
    table caption / column-header rows are dropped.

    Args:
        path: Path of the table file; must contain at least one '/'.

    Returns:
        list[list]: The collected rows, in file order.

    Raises:
        IndexError: If *path* contains no '/'.
        AttributeError: If a data row with an empty first cell occurs before
            any row with a non-empty first cell (the offending row is
            printed first).
    """
    container = []
    # NOTE(review): this takes the year from the second path component, e.g.
    # './stat2018/file.txt' -> '2018'.  It presumably expects a relative
    # root directory; an absolute path would yield something else -- confirm.
    year = path.split('/')[1][-4:]
    # NOTE(review): the file is opened with default newline handling on
    # purpose: the patterns above expect '\n' inside multi-line cells, which
    # universal-newline translation provides.
    with open(path) as cf:
        rows = list(csv.reader(cf, quotechar='"', delimiter=';'))

    section = None
    next_empty = False  # True right after a heading row: swallow one blank row
    country = None      # last non-empty first cell, used to fill blanks below it

    for r in rows:
        stripped = [x.strip() for x in r]

        # NOTE(review): nesting reconstructed from a whitespace-stripped copy
        # of the script; all marker tests are assumed to apply to single-cell
        # rows only -- confirm against the original.
        if len(stripped) == 1:
            cell = stripped[0]
            if cell in _STRAY_CELLS:
                continue
            if next_empty and cell == '':
                next_empty = False
                continue
            if cell in _BLOCK_HEADINGS or _PART_HEADING_RE.match(cell):
                next_empty = True
                continue
            if _ZERO_TOTAL_RE.match(cell):
                continue
            section_match = _SECTION_RE.match(cell)
            if section_match:
                # The capture group is exactly the digits before the line
                # break, so no second search over the cell is needed.
                section = section_match.group(1).zfill(10)
                continue
            # Any other single-cell row falls through and is treated as data.

        if not any(stripped):
            continue

        if stripped[0]:
            country = stripped[0]
        else:
            stripped[0] = country
        new_row = [year, section, *stripped]

        try:
            if new_row[-1] == 'Табл.45.1 (дол)':
                continue
            if new_row[2] == 'Найменування' or new_row[2].startswith('Зовнішня торгівля'):
                continue
        except AttributeError:
            # new_row[2] is None when no non-empty first cell has been seen
            # yet; show the row for diagnosis, then let the error propagate.
            print(new_row)
            raise
        container.append(new_row)
    return container
def scantree(path):
    """Recursively yield DirEntry objects for given directory.

    Real sub-directories are descended into; every other entry (files, and
    symbolic links, which are not followed) is yielded as-is.

    Args:
        path: Directory to walk.

    Yields:
        os.DirEntry: One entry per non-directory item found under *path*.
    """
    # Use scandir() as a context manager so the underlying directory handle
    # is released as soon as this level is finished, or when the generator
    # is closed early, instead of waiting for garbage collection.
    with scandir(path) as entries:
        for entry in entries:
            if entry.is_dir(follow_symlinks=False):
                yield from scantree(entry.path)
            else:
                yield entry
# Walk the directory given as the first command-line argument (the current
# directory when none is given), collect the rows of every ``.txt`` file that
# check_45() accepts, and write them all to ``merged.csv``.
for entry in scantree(sys.argv[1] if len(sys.argv) > 1 else '.'):
    if not entry.path.endswith('.txt'):
        continue
    if not check_45(entry.path):
        print(f"File {entry.path} is not table No. 45\n")
        continue
    main_container.extend(process_45(entry.path))

# newline='' leaves line endings to the csv writer; without it, platforms
# that translate '\n' on write turn the writer's '\r\n' into '\r\r\n'.
with open('merged.csv', 'w', newline='') as mf:
    writer = csv.writer(mf, quotechar='"', delimiter=';', quoting=csv.QUOTE_MINIMAL)
    writer.writerows(main_container)
# (GitHub page footer, not part of the script: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")