Skip to content

Instantly share code, notes, and snippets.

@manzke
Last active October 18, 2024 10:51
Show Gist options
  • Save manzke/eb2d63f68927fab94f276e86764af1a1 to your computer and use it in GitHub Desktop.
Save manzke/eb2d63f68927fab94f276e86764af1a1 to your computer and use it in GitHub Desktop.
Python script using PyPDF2 to analyse a suspicious PDF for Actions, Links, ...
# Install the dependency:  pip install --upgrade PyPDF2
# Run the analysis:        python analyze_pdf.py <pdf_file>
import sys
import PyPDF2
from PyPDF2.generic import IndirectObject, DictionaryObject, ArrayObject, NameObject
import struct
# Stream filters that the report flags as suspicious when a page content
# stream declares them.
_SUSPICIOUS_FILTERS = ('/JBIG2Decode', '/CCITTFaxDecode', '/DCTDecode')

# (counter key, summary label) pairs, in the order the summary prints them.
_SUMMARY_COUNT_LABELS = (
    ('open_actions', "Open actions (/OpenAction)"),
    ('additional_actions', "Additional actions (/AA)"),
    ('embedded_files', "Embedded files"),
    ('javascript', "JavaScript sections"),
    ('external_links', "External links found"),
    ('acroforms', "AcroForms detected"),
    ('xfa_forms', "XFA forms detected"),
    ('launch_actions', "Launch actions detected"),
    ('embedded_multimedia', "Embedded multimedia content"),
    ('embedded_fonts', "Embedded fonts"),
    ('embedded_images', "Embedded images"),
    ('suspicious_filters', "Suspicious filters used"),
)


def _resolve(obj):
    """Return the target of an indirect reference; pass direct objects through."""
    if isinstance(obj, IndirectObject):
        return obj.get_object()
    return obj


def _resolve_dict(obj):
    """Resolve ``obj`` and return it if it is a dictionary, otherwise ``{}``."""
    resolved = _resolve(obj)
    return resolved if isinstance(resolved, dict) else {}


def _resolve_list(obj):
    """Resolve ``obj`` into a list of entries.

    A missing value gives ``[]``, an array is returned as-is, and any other
    single object is wrapped in a one-element list.
    """
    resolved = _resolve(obj)
    if resolved is None:
        return []
    if isinstance(resolved, list):
        return resolved
    return [resolved]


def _scan_catalog(root, counters, findings, critical_findings):
    """Record document-level findings from the catalog (``/Root``) dictionary."""
    if '/OpenAction' in root:
        findings.append("Potential malicious action found: /OpenAction in /Root")
        counters['open_actions'] += 1
        critical_findings.append("Document contains /OpenAction, which can execute actions on open.")

    # .get() does not dereference indirect references, so resolve explicitly
    # before looking inside the form dictionary.
    acro_form = _resolve_dict(root.get('/AcroForm'))
    if acro_form:
        findings.append("AcroForm detected in the PDF.")
        counters['acroforms'] += 1
        if acro_form.get('/XFA'):
            findings.append("XFA form detected in the PDF.")
            counters['xfa_forms'] += 1
            critical_findings.append("Document contains XFA forms, which can include dynamic content.")

    names = _resolve_dict(root.get('/Names'))
    if names.get('/EmbeddedFiles'):
        findings.append("Embedded files detected in the PDF.")
        counters['embedded_files'] += 1
        critical_findings.append("Document contains embedded files.")
    if names.get('/JavaScript'):
        findings.append("JavaScript detected in the PDF.")
        counters['javascript'] += 1
        critical_findings.append("Document contains JavaScript code.")

    if '/RichMedia' in root or '/Media' in root:
        findings.append("Embedded multimedia content detected in the PDF.")
        counters['embedded_multimedia'] += 1
        critical_findings.append("Document contains embedded multimedia content.")


def _scan_page(page_obj, page_no, counters, findings, critical_findings,
               fonts_checked, images_checked):
    """Record findings for one page; ``page_no`` is 1-based.

    ``fonts_checked`` and ``images_checked`` are sets of resource names shared
    across pages so that each name is reported only once per document.
    """
    if page_obj.get('/AA'):
        findings.append(f"Potential malicious action found: /AA (Additional Actions) on page {page_no}.")
        counters['additional_actions'] += 1
        critical_findings.append(f"Page {page_no} contains /AA (Additional Actions).")

    resources = _resolve_dict(page_obj.get('/Resources'))

    for font_key in _resolve_dict(resources.get('/Font')):
        if font_key not in fonts_checked:
            fonts_checked.add(font_key)
            findings.append(f"Embedded font detected: {font_key} on page {page_no}")
            counters['embedded_fonts'] += 1

    for xobj_key, xobj in _resolve_dict(resources.get('/XObject')).items():
        xobj_obj = _resolve_dict(xobj)
        if xobj_obj.get('/Subtype') == '/Image' and xobj_key not in images_checked:
            images_checked.add(xobj_key)
            findings.append(f"Embedded image detected: {xobj_key} on page {page_no}")
            counters['embedded_images'] += 1

    for annot_ref in _resolve_list(page_obj.get('/Annots')):
        annot = _resolve_dict(annot_ref)
        if annot.get('/Subtype') == '/Link':
            action = _resolve_dict(annot.get('/A'))
            uri = action.get('/URI')
            if uri:
                findings.append(f"External link found on page {page_no}: {uri}")
                counters['external_links'] += 1
            if action.get('/S') == '/Launch':
                findings.append(f"Launch action detected on page {page_no}.")
                counters['launch_actions'] += 1
                critical_findings.append(f"Page {page_no} contains a /Launch action.")
        if '/AA' in annot:
            findings.append(f"Annotation with additional actions found on page {page_no}.")
            counters['additional_actions'] += 1
            critical_findings.append(f"Annotation on page {page_no} contains /AA (Additional Actions).")

    # /Contents may be a stream, an array of streams, or a reference to
    # either, so resolve it before deciding whether it is a list.
    for content in _resolve_list(page_obj.get('/Contents')):
        content_obj = _resolve_dict(content)
        for filter_name in _resolve_list(content_obj.get('/Filter')):
            if filter_name in _SUSPICIOUS_FILTERS:
                findings.append(f"Suspicious filter {filter_name} used on page {page_no}.")
                counters['suspicious_filters'] += 1
                critical_findings.append(f"Page {page_no} uses suspicious filter {filter_name}.")


def _print_report(counters, findings, critical_findings):
    """Print the summary, the detailed findings and the warnings to stdout."""
    print("\nSummary of Findings:")
    print(f" - Total pages: {counters['total_pages']}")
    if counters['encrypted']:
        print(" - The PDF is encrypted.")
    for key, label in _SUMMARY_COUNT_LABELS:
        if counters[key]:
            print(f" - {label}: {counters[key]}")
    if counters['incremental_updates']:
        print(" - PDF contains incremental updates.")

    if findings:
        print("\nDetailed Findings:")
        for finding in findings:
            print(f" - {finding}")
    else:
        print(" - No suspicious content detected.")

    if critical_findings:
        print("\nWarnings:")
        print("The following critical issues were found:")
        for critical in critical_findings:
            print(f" - {critical}")
        print("\nRecommendation: It's highly recommended to avoid opening this PDF or to open it in a secure, isolated environment.")
    else:
        print("\nNo critical issues detected.")


def analyze_pdf(file_path):
    """Inspect a PDF for potentially dangerous features and print a report.

    The document catalog is checked for /OpenAction, /AcroForm (and /XFA),
    /Names entries (/EmbeddedFiles, /JavaScript) and multimedia keys. Each
    page is checked for /AA, fonts, image XObjects, link annotations
    (/URI and /Launch actions) and content-stream filters.

    Args:
        file_path: Path of the PDF file to open in binary mode.

    Returns:
        None. All results are written to stdout. Any exception raised while
        opening or parsing the file is caught and reported as
        ``An error occurred: ...`` instead of propagating.
    """
    findings = []
    critical_findings = []
    counters = {
        'total_pages': 0,
        'open_actions': 0,
        'additional_actions': 0,
        'embedded_files': 0,
        'javascript': 0,
        'external_links': 0,
        'acroforms': 0,
        'xfa_forms': 0,
        'launch_actions': 0,
        'embedded_multimedia': 0,
        'suspicious_filters': 0,
        'embedded_fonts': 0,
        'embedded_images': 0,
        'encrypted': False,
        'incremental_updates': False,
    }
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file, strict=False)

            if reader.is_encrypted:
                findings.append("The PDF is encrypted.")
                counters['encrypted'] = True
                critical_findings.append("PDF is encrypted, analysis may be limited.")
                try:
                    reader.decrypt('')
                except Exception as exc:
                    # Best effort: keep going with limited analysis, but do
                    # not hide why the decryption attempt failed.
                    findings.append(f"Decryption with an empty password failed: {exc}")

            num_pages = len(reader.pages)
            counters['total_pages'] = num_pages
            print(f"Number of pages: {num_pages}")

            # /Prev holds an integer byte offset, so test for the key's
            # presence; calling len() on the value raises TypeError.
            if '/Prev' in reader.trailer:
                findings.append("PDF contains incremental updates.")
                counters['incremental_updates'] = True
                critical_findings.append("PDF contains incremental updates, which may hide malicious content.")

            root = _resolve_dict(reader.trailer['/Root'])
            _scan_catalog(root, counters, findings, critical_findings)

            fonts_checked = set()
            images_checked = set()
            for page_no, page in enumerate(reader.pages, start=1):
                print(f"\nAnalyzing Page {page_no}")
                _scan_page(page.get_object(), page_no, counters, findings,
                           critical_findings, fonts_checked, images_checked)

            print("\nPDF analysis completed.")

        _print_report(counters, findings, critical_findings)
    except Exception as e:
        # Top-level boundary of the tool: report the failure rather than
        # letting a malformed or unreadable file end in a traceback.
        print(f"An error occurred: {e}")
if __name__ == "__main__":
    # Exactly one positional argument (the PDF path) is expected; anything
    # else prints the usage line instead of running the analysis.
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        analyze_pdf(cli_args[0])
    else:
        print("Usage: python analyze_pdf.py <pdf_file>")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment