Last active
July 25, 2016 14:57
-
-
Save tilacog/2e8463a4a1ef8abbb347d5bf4f49f9f9 to your computer and use it in GitHub Desktop.
Parse DCTF CHM files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys | |
import yaml | |
from bs4 import BeautifulSoup | |
# Lookup table from the raw "Formato" cell text to the type name emitted in
# the output.  Formats mapped to ``None`` carry no output type
# (presumably fillers / line terminators -- confirm against the DCTF layout).
field_types = {
    # no output type
    "Branco(s)": None,
    "Zero(s)": None,
    # monetary / signed numeric formats
    "R$": "decimal",
    "*R$": "decimal",
    "R+": "decimal",
    # text and integer formats
    "X": "string",
    "XN": "integer",
    "N": "integer",
    # date formats
    "DATA": "date",
    "DATA1": "date",
    "DATA2": "date",
    # fixed-precision decimal formats
    "DEC1": "decimal",
    "*DEC1": "decimal",
    "DEC2": "decimal",
    "*DEC2": "decimal",
    # identifiers, kept as strings
    "CPF": "string",
    "CNPJ": "string",
    "CPF/CNPJ": "string",
    "NPED": "string",
    "NPADM": "string",
    "CNPJ/CEI": "string",
    # no output type
    "EOL": None,
}
# Matches any run of whitespace (spaces, tabs, newlines).
clean_field_rgx = re.compile(r'\s+')


def clean_field(text):
    """Collapse every run of whitespace in *text* into a single space.

    Args:
        text: The string to normalise.

    Returns:
        A copy of *text* where each whitespace run is replaced by one space.
        Leading and trailing whitespace is collapsed, not stripped.
    """
    # The substitution has to go through the compiled pattern.  The previous
    # code called ``.sub`` on this function object, which raised
    # AttributeError on every call.
    return clean_field_rgx.sub(' ', text)
def parse_field(field):
    """Convert one raw table row into a normalised field description.

    Args:
        field: Mapping with the keys 'Campo', 'Tamanho', 'Ordem' and
            'Formato', each holding the cell text of that column.

    Returns:
        A dict with 'field_name' (whitespace-normalised 'Campo'), 'length'
        and 'order' (the 'Tamanho' and 'Ordem' cells as ints) and 'type'
        (the ``field_types`` entry for the 'Formato' cell).

    Raises:
        KeyError: If a column is missing or 'Formato' is not a key of
            ``field_types``.
        ValueError: If 'Tamanho' or 'Ordem' is not an integer literal.
    """
    name = clean_field(field['Campo'])
    size = int(field['Tamanho'])
    position = int(field['Ordem'])
    kind = field_types[field['Formato']]
    return {
        'field_name': name,
        'length': size,
        'order': position,
        'type': kind,
    }
def parse(f):
    """Parse one HTML page describing a record layout.

    The record title is read from the third ``<td>`` of the first table and
    the field rows are read from the second table.

    Args:
        f: Path of the HTML file to read (decoded as latin1).

    Returns:
        A dict with the keys 'record' (record name), 'description' and
        'fields' (a list of dicts produced by ``parse_field``).

    Raises:
        ValueError: If the record title is neither of the form
            "<description> - <name>" nor the known header title.
    """
    # BeautifulSoup consumes the file while it is being constructed, so the
    # handle can be closed right after instead of being leaked.
    with open(f, encoding='latin1') as handle:
        soup = BeautifulSoup(handle, 'lxml')

    # get record name
    record = soup.find('table').find_all('td')[2].text
    record = record.replace('\n ', '')
    try:
        record_description, record_name = [i.strip() for i in record.split('-')]
    except ValueError as err:
        # The title did not split into exactly two parts.  Only the
        # declaration header is a known exception; anything else used to
        # surface later as an UnboundLocalError, so fail explicitly here.
        if record != 'Header da Declaração':
            raise ValueError(
                'unrecognised record title: {!r}'.format(record)
            ) from err
        record_name = 'H0'
        record_description = record
    else:
        record_name = record_name.replace('Tipo ', '')

    results = {
        'record': record_name,
        'description': record_description,
        'fields': [],
    }

    # get keys: the cells of the first row name the columns
    main_table = soup.find_all('table')[1]
    keys = [cell.text for cell in main_table.find('tr').find_all('td')]

    # get data: the first two rows are skipped (the first holds the keys;
    # the second is presumably a sub-header -- TODO confirm)
    for row in main_table.find_all('tr')[2:]:
        values = [cell.text for cell in row.find_all('td')]
        field = dict(zip(keys, values))
        results['fields'].append(parse_field(field))

    return results
if __name__ == '__main__':
    # Parse every file given on the command line, order the records by
    # name, and print them as one YAML document.
    records = [parse(path) for path in sys.argv[1:]]
    records.sort(key=lambda rec: rec['record'])
    document = yaml.safe_dump(records, allow_unicode=True)
    print(document)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment