Skip to content

Instantly share code, notes, and snippets.

@tilacog
Last active July 25, 2016 14:57
Show Gist options
  • Save tilacog/2e8463a4a1ef8abbb347d5bf4f49f9f9 to your computer and use it in GitHub Desktop.
Save tilacog/2e8463a4a1ef8abbb347d5bf4f49f9f9 to your computer and use it in GitHub Desktop.
Parse DCTF CHM files
import re
import sys
import yaml
from bs4 import BeautifulSoup
# Lookup from the layout table's "Formato" code to a generic type name.
# Codes are grouped by the type they resolve to; ``None`` is used for codes
# that carry no data type (blank/zero filler and the end-of-line marker,
# judging by their names).
field_types = {
    **dict.fromkeys(('Branco(s)', 'Zero(s)'), None),
    **dict.fromkeys(('R$', '*R$', 'R+'), 'decimal'),
    **dict.fromkeys(('X',), 'string'),
    **dict.fromkeys(('XN', 'N'), 'integer'),
    **dict.fromkeys(('DATA', 'DATA1', 'DATA2'), 'date'),
    **dict.fromkeys(('DEC1', '*DEC1', 'DEC2', '*DEC2'), 'decimal'),
    **dict.fromkeys(
        ('CPF', 'CNPJ', 'CPF/CNPJ', 'NPED', 'NPADM', 'CNPJ/CEI'),
        'string',
    ),
    **dict.fromkeys(('EOL',), None),
}
# Matches any run of whitespace characters (spaces, tabs, newlines, ...).
clean_field_rgx = re.compile(r'\s+')


def clean_field(text):
    """Collapse every run of whitespace in *text* into a single space.

    Args:
        text: The string to normalise.

    Returns:
        A copy of *text* in which each whitespace run is replaced by one
        space. Leading and trailing whitespace is collapsed, not removed.
    """
    # Substitute through the compiled pattern; calling ``.sub`` on the
    # function object itself raises AttributeError.
    return clean_field_rgx.sub(' ', text)
def parse_field(field):
    """Convert one raw layout-table row into a normalised field description.

    Args:
        field: Mapping with the keys 'Campo', 'Tamanho', 'Ordem' and
            'Formato', as read from the table.

    Returns:
        A dict with 'field_name' (whitespace-normalised 'Campo'), 'length'
        and 'order' (the 'Tamanho' and 'Ordem' values as ints) and 'type'
        (the ``field_types`` entry for 'Formato').

    Raises:
        KeyError: If a key is missing or 'Formato' is not in ``field_types``.
        ValueError: If 'Tamanho' or 'Ordem' is not an integer literal.
    """
    name = clean_field(field['Campo'])
    size = int(field['Tamanho'])
    position = int(field['Ordem'])
    kind = field_types[field['Formato']]
    return {
        'field_name': name,
        'length': size,
        'order': position,
        'type': kind,
    }
def parse(f):
    """Parse one record-layout HTML page into a plain dict.

    The page is expected to hold at least two tables: the first carries the
    record title in its third cell, the second lists the record's fields.

    Args:
        f: Path of the HTML file. It is decoded as Latin-1.

    Returns:
        A dict with the keys 'record' (record name), 'description' and
        'fields' (one ``parse_field`` result per data row, in table order).

    Raises:
        ValueError: If the title is neither of the form
            "<description> - Tipo <name>" nor the declaration header title.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(f, encoding='latin1') as handle:
        soup = BeautifulSoup(handle, 'lxml')

    # get record name
    record = soup.find('table').find_all('td')[2].text
    record = record.replace('\n ', '')
    try:
        record_description, record_name = [
            part.strip() for part in record.split('-')
        ]
    except ValueError:
        # The title did not split into exactly two parts. Only the header
        # page is a known exception; anything else used to fall through and
        # fail later with an unrelated UnboundLocalError.
        if record != 'Header da Declaração':
            raise ValueError(
                'unrecognised record title {!r} in {}'.format(record, f)
            ) from None
        record_name = 'H0'
        record_description = record
    else:
        record_name = record_name.replace('Tipo ', '')

    # get keys: the cells of the first row name the columns
    main_table = soup.find_all('table')[1]
    keys = [cell.text for cell in main_table.find('tr').find_all('td')]

    # get data: rows 0 and 1 are skipped (row 0 is the column header read
    # above; row 1 is presumably a sub-header/spacer -- confirm against a
    # sample page)
    fields = []
    for row in main_table.find_all('tr')[2:]:
        values = [cell.text for cell in row.find_all('td')]
        fields.append(parse_field(dict(zip(keys, values))))

    return {
        'record': record_name,
        'description': record_description,
        'fields': fields,
    }
if __name__ == '__main__':
    # Parse every file named on the command line, order the results by
    # record name and emit them as a single YAML document on stdout.
    records = [parse(path) for path in sys.argv[1:]]
    records.sort(key=lambda entry: entry['record'])
    document = yaml.safe_dump(records, allow_unicode=True)
    print(document)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment