Skip to content

Instantly share code, notes, and snippets.

@marcwittke
Last active January 16, 2023 10:32
Show Gist options
  • Save marcwittke/61bcfaa069d6c92c8d1467c09c6e264f to your computer and use it in GitHub Desktop.
Save marcwittke/61bcfaa069d6c92c8d1467c09c6e264f to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
from collections import OrderedDict
from PyPDF2 import PdfReader
import os
def _getFields(obj, tree=None, retval=None, fileobj=None):
"""
Extracts field data if this PDF contains interactive form fields.
The *tree* and *retval* parameters are for recursive use.
:param fileobj: A file object (usually a text file) to write
a report to on all interactive form fields found.
:return: A dictionary where each key is a field name, and each
value is a :class:`Field<PyPDF2.generic.Field>` object. By
default, the mapping name is used for keys.
:rtype: dict, or ``None`` if form data could not be located.
"""
fieldAttributes = {'/FT': 'Field Type', '/Parent': 'Parent', '/T': 'Field Name', '/TU': 'Alternate Field Name',
'/TM': 'Mapping Name', '/Ff': 'Field Flags', '/V': 'Value', '/DV': 'Default Value'}
if retval is None:
retval = OrderedDict()
catalog = obj.trailer["/Root"]
# get the AcroForm tree
if "/AcroForm" in catalog:
tree = catalog["/AcroForm"]
else:
return None
if tree is None:
return retval
obj._check_kids(tree, retval, fileobj)
for attr in fieldAttributes:
if attr in tree:
# Tree is a field
obj._buildField(tree, retval, fileobj, fieldAttributes)
break
if "/Fields" in tree:
fields = tree["/Fields"]
for f in fields:
field = f.get_object()
obj._build_field(field, retval, fileobj, fieldAttributes)
return retval
def get_form_fields(infile):
    """
    Read the interactive form fields of a PDF file and return their values.

    :param infile: Path of the PDF file to read.
    :return: An ``OrderedDict`` mapping each field key reported by
        ``_getFields`` to its ``/V`` entry, with surrounding whitespace
        stripped and every space character removed. Fields without ``/V``
        map to ``''``. The mapping is empty when the PDF has no form.
    """
    # Keep the stream open while the fields are read, then close it
    # deterministically instead of leaking one handle per processed PDF.
    with open(infile, 'rb') as pdf_stream:
        fields = _getFields(PdfReader(pdf_stream))
        if fields is None:
            # No /AcroForm in this document: nothing to extract.
            return OrderedDict()
        return OrderedDict(
            (k, v.get('/V', '').strip().replace(' ', '')) for k, v in fields.items()
        )
if __name__ == '__main__':
    # Convert every ``*.pdf`` form in the current directory into one
    # semicolon-separated, fully quoted CSV row written to stdout.
    # A file that cannot be converted yields a row holding only its name.
    import csv
    import sys

    # Spaces and punctuation dropped from the phone number in a single pass.
    phone_noise = str.maketrans('', '', ' -().')
    # QUOTE_ALL keeps the "a";"b" layout and additionally doubles any quote
    # character embedded in a value, so the row stays parseable.
    writer = csv.writer(sys.stdout, delimiter=';', quoting=csv.QUOTE_ALL,
                        lineterminator='\n')

    nextNoEmailIndex = 1
    for x in sorted(os.listdir()):
        if not x.endswith(".pdf"):
            continue
        try:
            formFields = get_form_fields(x)

            email = formFields['900_5_Text_C'].replace(' ', '').lower()
            if email == '':
                # Use the counter first and advance it afterwards, so the
                # placeholder addresses start at 01 as the name "next" implies.
                email = 'no-email-' + str(nextNoEmailIndex).zfill(2) + '@example.com'
                nextNoEmailIndex += 1

            if formFields['900_30_CheckBox'] == "/Yes":
                pioneerLevel = "Special"
            elif formFields['900_32_CheckBox'] == "/Yes":
                pioneerLevel = "Regular"
            else:
                pioneerLevel = "None"

            if formFields['900_34_CheckBox'] == "/Yes":
                privilege = "Elder"
            elif formFields['900_35_CheckBox'] == "/Yes":
                privilege = "MinisterialServant"
            else:
                privilege = "None"

            csvCells = [
                "Circuito",  # circuit
                formFields['900_11_Text_C'] + ' ' + formFields['900_12_Text_C'],  # congregation
                formFields['900_2_Text_C'],  # first name
                formFields['900_3_Text_C'],  # middle name
                formFields['900_1_Text_C'],  # last name
                "TRUE" if formFields['900_22_CheckBox'] == "/Yes" else "FALSE",  # is female
                email,  # email
                '+549' + formFields['900_10_Text_C'].translate(phone_noise),  # phone
                "S",  # language
                pioneerLevel,  # pioneer level
                privilege,  # privilege
                # participated before: yes -> "Common", otherwise "Newbie"
                "Common" if formFields['900_38_CheckBox'] == "/Yes" else "Newbie",  # experience
                "FALSE",  # IsShiftLeader
                "FALSE",  # IsVip
                "FALSE",  # IsAdmin
                "Activate",  # ImportAction
                "",  # placeholder for concatenation of availabilities
            ]
            # Seven free-text fields following the placeholder column
            # (presumably the availabilities themselves - confirm with the form).
            csvCells.extend(
                formFields[name] for name in (
                    '900_41_Text_C', '900_42_Text_C', '900_43_Text_C',
                    '900_44_Text_C', '900_45_Text_C', '900_46_Text_C',
                    '900_47_Text_C',
                )
            )
            writer.writerow([cell.strip() for cell in csvCells])
        except Exception as exc:
            # Best effort: keep going with the next file, but say why this one
            # failed (on stderr, so the CSV on stdout stays clean) and let
            # KeyboardInterrupt / SystemExit propagate.
            print(x + ": " + repr(exc), file=sys.stderr)
            writer.writerow([x])
@marcwittke
Copy link
Author

prerequisite:
pip install PyPDF2

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment