- 
      
 - 
        
Save sergiolucero/b93163a7e86e67dc16203ae8450e3483 to your computer and use it in GitHub Desktop.  
import csv
import glob
import os

import fitz
def pdf2csv(fn):
    """Extract the per-person records of a PDF into a CSV file.

    The CSV is written next to the PDF, under the same name with a ``.csv``
    extension, and starts with the header row
    ``nombre, rut, genero, direccion, mesa, region, comuna``.

    Pages that do not contain the ``'de'`` marker, or that hold less than one
    complete five-line record after it, are skipped instead of aborting the
    whole conversion.  Requires Python 3 (``open`` is called with ``newline``
    and ``encoding``).

    Args:
        fn: Path to the PDF.  The two characters that follow the first
            character of the file name are parsed as the region number
            (``A04101.pdf`` -> ``4``).

    Raises:
        ValueError: If those two characters are not a valid integer.
    """
    # splitext swaps only the real extension.  str.replace left names such
    # as 'X.PDF' untouched, so the "CSV" was opened on top of the input PDF.
    csv_fn = os.path.splitext(fn)[0] + '.csv'
    # Slice the bare file name so a directory prefix cannot shift the digits.
    region = int(os.path.basename(fn)[1:3])  # A04101.pdf -> 4

    # newline='' is what the csv module asks for (otherwise Windows gets a
    # blank line after every row); an explicit UTF-8 encoding keeps
    # characters such as 'Ñ' writable whatever the platform default is.
    with open(csv_fn, 'w', newline='', encoding='utf-8') as fw:
        writer = csv.writer(fw)
        writer.writerow(['nombre', 'rut', 'genero', 'direccion',
                         'mesa', 'region', 'comuna'])

        doc = fitz.open(fn)
        try:
            for page in doc:
                # PyMuPDF renamed getText() to get_text(); accept either so
                # both old and current releases work.
                get_text = getattr(page, 'get_text', None) or page.getText
                text = get_text()

                start = text.find('de')
                if start == -1:
                    continue  # no marker on this page, nothing to parse

                # Drop the 14 lines that follow the marker (presumably the
                # page/column headers - TODO confirm against a sample PDF).
                data = text[start:].split('\n')[14:]
                if len(data) < 5:
                    continue  # not even one complete five-line record

                # Records are laid out as repeating groups of five lines:
                # name, rut, gender+address, comuna, mesa.
                # NOTE(review): only the name column is filtered, so a short
                # entry in the middle of a page would shift the alignment
                # with the other columns - confirm this only trims the tail.
                nombres = [n for n in data[::5] if len(n) > 1]
                ruts = data[1::5]
                genedire = data[2::5]
                mesas = data[4::5]
                # The comuna is read once, from the first record of the page.
                comuna = data[3]

                for nombre, rut, gd, mesa in zip(nombres, ruts,
                                                 genedire, mesas):
                    # The first five characters are the gender and the text
                    # from index 6 on is the address (index 5 is presumably
                    # a separator - confirm).
                    writer.writerow([nombre, rut, gd[:5], gd[6:],
                                     mesa, region, comuna])
        finally:
            # The original never closed the document.
            doc.close()
if __name__ == '__main__':
    # Script entry point: convert at most five of the PDFs in the current
    # directory whose names start with 'A', echoing each name as it goes.
    pdf_paths = glob.glob('A*.pdf')
    for pdf_path in pdf_paths[:5]:
        print(pdf_path)
        pdf2csv(pdf_path)
      
          
      
      
            sergiolucero
  
      
      
      commented 
        Feb 29, 2020 
      
    
  

Hola, da el siguiente error:
Traceback (most recent call last):
File "servel_plebiscito_pdfToText.py", line 32, in <module>
pdf2csv(fn)
File "servel_plebiscito_pdfToText.py", line 27, in pdf2csv
writer.writerow([nombre,rut,gene,dire,mesa,region,comuna])
UnicodeEncodeError: 'ascii' codec can't encode character u'\xd1' in position 3: ordinal not in range(128)
Hola, da el siguiente error:
Traceback (most recent call last):
File "servel_plebiscito_pdfToText.py", line 32, in <module>
pdf2csv(fn)
File "servel_plebiscito_pdfToText.py", line 27, in pdf2csv
writer.writerow([nombre,rut,gene,dire,mesa,region,comuna])
UnicodeEncodeError: 'ascii' codec can't encode character u'\xd1' in position 3: ordinal not in range(128)
Lo solucioné agregando
text = page.getText().encode('utf-8')
El problema que da ahora es que no está colocando un registro por línea y el RUT lo divide en 2 campos. ¿Es posible solucionarlo?

Hola, da el siguiente error:
Traceback (most recent call last):
File "servel_plebiscito_pdfToText.py", line 32, in <module>
pdf2csv(fn)
File "servel_plebiscito_pdfToText.py", line 27, in pdf2csv
writer.writerow([nombre,rut,gene,dire,mesa,region,comuna])
UnicodeEncodeError: 'ascii' codec can't encode character u'\xd1' in position 3: ordinal not in range(128)
Lo solucioné agregando
text = page.getText().encode('utf-8')
El problema que da es que no está colocando un registro por línea y el rut lo divide en 2 campos, es posible solucionarlo?
Acá está el parseo solucionado, ¡pero es una solución muy distinta!