Created
September 1, 2023 17:23
-
-
Save metadaddy/ceaf352d2a9c834bb8a11ae37734df92 to your computer and use it in GitHub Desktop.
This script uses the PDF file at https://github.com/Snowflake-Labs/snowpark-python-demos/blob/main/pdf-analysis/prod_review10.pdf as a template for creating more PDFs with random names, dates, product selections and recommendations. This PDF file is used in the Snowflake demo "How To Analyze PDF Docs Using Snowpark" at https://www.youtube.com/wa…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Backblaze wants developers and organizations to copy and re-use our
# code examples, so we make the samples available by several different | |
# licenses. One option is the MIT license (below). Other options are | |
# available here: | |
# | |
# https://www.backblaze.com/using_b2_code.html | |
# | |
# | |
# The MIT License (MIT) | |
# | |
# Copyright (c) 2023 Backblaze | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in all | |
# copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
# SOFTWARE. | |
# This script uses the PDF file at | |
# https://github.com/Snowflake-Labs/snowpark-python-demos/blob/main/pdf-analysis/prod_review10.pdf | |
# as a template for creating more PDFs with random names, dates, product | |
# selections and recommendations. This PDF file is used in the Snowflake demo | |
# "How To Analyze PDF Docs Using Snowpark" at https://www.youtube.com/watch?v=NqZzUACUzm8 | |
# | |
# The code is somewhat tailored to that template PDF, but may be a useful guide | |
# to manipulating fields in other documents. | |
from PyPDF2 import PdfWriter, PdfReader | |
from PyPDF2.constants import FieldDictionaryAttributes as FD | |
from PyPDF2.constants import AnnotationDictionaryAttributes as AD | |
from PyPDF2.generic import NameObject, TextStringObject | |
from random import choice, random | |
from datetime import datetime | |
import censusname | |
# Pools of values that the generator picks from at random
genders = [
    "male",
    "female",
]
products = [
    "Red Skateboard",
    "Blue Skateboard",
    "Tennis Shoes",
    "Basket Ball",
    "Boat",
]
recommendations = [
    "Yes",
    "No",
    "MayBe",
]

# Path to the template PDF - change this as necessary
template_file_path = "data/prod_review10.pdf"

# PDF field flags: bit masks tested against a field's /Ff entry
RADIO_BUTTON = 0x0000_8000  # same value as 1 << 15
MULTI_SELECT = 0x0020_0000  # same value as 1 << 21
# This is an extended version of the update_page_form_field_values method in
# PdfWriter that handles radio buttons, and removes outdated appearances
def update_field_values(page, fields):
    """Write new form values into the widget annotations of a page.

    Args:
        page: Page object whose '/Annots' entries are updated in place.
        fields: Dict mapping a field name (the field's /T entry) to the value
            to set. Names that match no field, and falsy values, are skipped.
    """
    # A page without any annotations has no '/Annots' entry - nothing to do
    if '/Annots' not in page:
        return

    for annot_ref in page['/Annots']:
        annot = annot_ref.get_object()
        if annot.get('/Subtype') != '/Widget':
            continue

        # Radio button field is not itself an annotation, but contains an annotation
        # for each individual button. Hence, we have to find annotations that have
        # a parent that is a radio button field - i.e. it has the radio button bit
        # set in its field flags - and set the value of the parent
        parent = annot.get(FD.Parent)
        # Dereference the IndirectObject if there is one
        parent = parent and parent.get_object()
        # /Ff is optional; treat a missing entry as "no flags set" rather than
        # letting `None & RADIO_BUTTON` raise a TypeError
        parent_field_flag = (parent.get(FD.Ff) or 0) if parent else 0

        if parent and (parent_field_flag & RADIO_BUTTON):
            # The annotation is one of the individual buttons, parent is the radio button
            # field
            if field_value := fields.get(parent.get(FD.T)):
                update_radio_button_field(annot, parent, field_value)
        elif field_value := fields.get(annot.get(FD.T)):
            # We found the annotation for a text/choice field - they seem to work the same way
            update_text_field(annot, field_value)

            # Fix issue with single-select choice containing an "/I" array.
            # This should only be present for multi-select choice fields and
            # seems to confuse Adobe Acrobat.
            # As above, a missing /Ff means no flags are set
            field_flag = annot.get(FD.Ff) or 0
            is_single_select_choice = (
                annot.get(FD.FT) == '/Ch' and not (field_flag & MULTI_SELECT)
            )
            if is_single_select_choice and annot.get('/I'):
                del annot['/I']
def update_text_field(annot, field_value):
    """Store `field_value` as the /V entry of a text or choice annotation.

    Args:
        annot: The widget annotation dictionary, modified in place.
        field_value: The new value; it is wrapped in a TextStringObject.
    """
    new_entry = {NameObject(FD.V): TextStringObject(field_value)}
    annot.update(new_entry)

    # An appearance left over from the old value would still be drawn by the
    # viewer, so drop it when one is present
    stale_appearance = annot.get(AD.AP)
    if stale_appearance:
        del annot[AD.AP]
def update_radio_button_field(annot, parent, field_value):
    """Select one option of a radio button group.

    Sets the value of the radio button field and the appearance state of one
    individual button so the selection shows correctly when the PDF is viewed.

    Args:
        annot: Widget annotation of one individual button, modified in place.
        parent: The radio button field dictionary that owns `annot`, modified
            in place.
        field_value: Name of the option to select, without a leading '/'.
    """
    # PDF names carry a leading '/', the values passed in do not
    state_name = "/" + field_value

    # Set the parent value - it must be preceded by a '/'
    parent.update({
        NameObject(FD.V): NameObject(state_name)
    })

    # Look up the appearance states this button defines. '/N' (the normal
    # appearance) is a key of the appearance dictionary, so it is spelled
    # literally. Both /AP and /N may be absent or stored as indirect
    # references, and dict.get() does not dereference, so resolve each step
    # explicitly instead of chaining .get() calls.
    appearance = annot.get(AD.AP)
    if appearance is not None:
        appearance = appearance.get_object()
    normal = appearance.get('/N') if appearance is not None else None
    if normal is not None:
        normal = normal.get_object()
    states = normal if normal is not None else ()

    # State names are PDF names, so match the '/'-prefixed form; the bare form
    # is still accepted so anything that matched before keeps matching
    if state_name in states or field_value in states:
        # This is the 'on' button
        new_state = state_name
    else:
        # Section 12.7.4.2.3 of the PDF 1.7 spec defines "Off" as the name of the off state
        # appearance for check boxes, and this seems to also apply to radio buttons
        new_state = "/Off"

    annot.update({
        NameObject(AD.AS): NameObject(new_state)
    })
def main():
    """Create review PDFs from the template, each with random field values.

    For every n in the range below, picks a random name, product,
    recommendation and purchase date, prints the chosen values, fills them
    into a copy of the template and writes data/prod_review{n}.pdf.
    """
    reader = PdfReader(template_file_path)

    # Set range to suit your requirements
    for n in range(11, 31):
        # One gender per person, so that first and middle names are consistent
        person_gender = choice(genders)
        given_name, second_name = [
            censusname.generate(nameformat='{given}', given=person_gender)
            for _ in range(2)
        ]
        family_name = censusname.generate(nameformat='{surname}')

        chosen_product = choice(products)
        chosen_recommendation = choice(recommendations)

        # Pick a random moment between January 1st of this year and now
        now = datetime.today()
        year_start = datetime(now.year, 1, 1)
        picked_moment = year_start + (now - year_start) * random()

        fields = {
            'FirstName': given_name,
            'Middle Name': second_name,
            'LastName': family_name,
            'Product': chosen_product,
            # Date looks like 03/25/2023
            'Purchase Date': picked_moment.strftime("%m/%d/%Y"),
            'Recommend': chosen_recommendation,
        }
        print(fields)

        # Need to clone the document rather than just add the page, since we
        # need the fields on the document as well as the annotations on the
        # page
        writer = PdfWriter()
        writer.clone_document_from_reader(reader)

        # This is *part* of what's needed for new field values to show up in
        # PDF viewers like Mac Preview and Adobe Acrobat
        writer.set_need_appearances_writer()

        first_page = writer.get_page(0)
        update_field_values(first_page, fields)

        writer.write(f"data/prod_review{n}.pdf")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment