Last active
October 18, 2024 10:51
-
-
Save manzke/eb2d63f68927fab94f276e86764af1a1 to your computer and use it in GitHub Desktop.
Python script using PyPDF2 to analyse a suspicious PDF for Actions, Links, ...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install --upgrade PyPDF2
# python analyze_pdf.py <pdf_file>
import sys | |
import PyPDF2 | |
from PyPDF2.generic import IndirectObject, DictionaryObject, ArrayObject, NameObject | |
import struct | |
# Stream filters that the analysis reports as suspicious on page content streams.
_SUSPICIOUS_FILTERS = ('/JBIG2Decode', '/CCITTFaxDecode', '/DCTDecode')

# (counter key, label) pairs printed in the summary, in display order.
_COUNTED_SECTIONS = (
    ('open_actions', "Open actions (/OpenAction)"),
    ('additional_actions', "Additional actions (/AA)"),
    ('embedded_files', "Embedded files"),
    ('javascript', "JavaScript sections"),
    ('external_links', "External links found"),
    ('acroforms', "AcroForms detected"),
    ('xfa_forms', "XFA forms detected"),
    ('launch_actions', "Launch actions detected"),
    ('embedded_multimedia', "Embedded multimedia content"),
    ('embedded_fonts', "Embedded fonts"),
    ('embedded_images', "Embedded images"),
    ('suspicious_filters', "Suspicious filters used"),
)


def _resolve(obj):
    """Return the target of an indirect reference; pass anything else through."""
    if isinstance(obj, IndirectObject):
        return obj.get_object()
    return obj


def _as_list(obj):
    """Return lists (including PDF arrays) unchanged; wrap a single object."""
    return obj if isinstance(obj, list) else [obj]


def _print_report(counters, findings, critical_findings):
    """Print the summary, the detailed findings and the critical warnings."""
    print("\nSummary of Findings:")
    print(f" - Total pages: {counters['total_pages']}")
    if counters['encrypted']:
        print(" - The PDF is encrypted.")
    for key, label in _COUNTED_SECTIONS:
        if counters[key]:
            print(f" - {label}: {counters[key]}")
    if counters['incremental_updates']:
        print(" - PDF contains incremental updates.")

    if findings:
        print("\nDetailed Findings:")
        for finding in findings:
            print(f" - {finding}")
    else:
        print(" - No suspicious content detected.")

    if critical_findings:
        print("\nWarnings:")
        print("The following critical issues were found:")
        for critical in critical_findings:
            print(f" - {critical}")
        print("\nRecommendation: It's highly recommended to avoid opening this PDF or to open it in a secure, isolated environment.")
    else:
        print("\nNo critical issues detected.")


def analyze_pdf(file_path):
    """Inspect a PDF for potentially dangerous features and print a report.

    The document catalog is checked for /OpenAction, /AcroForm (and /XFA),
    the /Names entries /EmbeddedFiles and /JavaScript, and /RichMedia or
    /Media. Every page is then checked for /AA, font and image resources,
    link annotations (URIs and /Launch actions), annotation /AA entries and
    the filters used by its content streams.

    Args:
        file_path: Path of the PDF file to open in binary mode.

    Returns:
        None. All results are written to standard output. Any exception
        raised while opening or analysing the file is caught and reported
        as "An error occurred: ...".
    """
    findings = []
    critical_findings = []
    counters = {
        'total_pages': 0,
        'open_actions': 0,
        'additional_actions': 0,
        'embedded_files': 0,
        'javascript': 0,
        'external_links': 0,
        'acroforms': 0,
        'xfa_forms': 0,
        'launch_actions': 0,
        'embedded_multimedia': 0,
        'suspicious_filters': 0,
        'embedded_fonts': 0,
        'embedded_images': 0,
        'encrypted': False,
        'incremental_updates': False,
    }

    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file, strict=False)

            # Check if the PDF is encrypted
            if reader.is_encrypted:
                findings.append("The PDF is encrypted.")
                counters['encrypted'] = True
                critical_findings.append("PDF is encrypted, analysis may be limited.")
                try:
                    reader.decrypt('')
                except Exception as exc:
                    # Best effort: keep going with limited analysis, but do
                    # not hide the reason the empty password was rejected.
                    findings.append(f"Decryption with an empty password failed: {exc}")

            num_pages = len(reader.pages)
            counters['total_pages'] = num_pages
            print(f"Number of pages: {num_pages}")

            # Check for incremental updates. /Prev holds an integer byte
            # offset, so test for the key's presence rather than a length.
            # NOTE(review): linearized files may also carry /Prev - confirm
            # whether those should be reported differently.
            if '/Prev' in reader.trailer:
                findings.append("PDF contains incremental updates.")
                counters['incremental_updates'] = True
                critical_findings.append("PDF contains incremental updates, which may hide malicious content.")

            # Check for /OpenAction in /Root
            root = _resolve(reader.trailer['/Root'])
            if '/OpenAction' in root:
                findings.append("Potential malicious action found: /OpenAction in /Root")
                counters['open_actions'] += 1
                critical_findings.append("Document contains /OpenAction, which can execute actions on open.")

            # Check for AcroForms and XFA Forms. dict.get() hands back the
            # raw entry, so dereference it before looking inside.
            acro_form_entry = root.get('/AcroForm')
            if acro_form_entry:
                findings.append("AcroForm detected in the PDF.")
                counters['acroforms'] += 1
                acro_form = _resolve(acro_form_entry)
                if isinstance(acro_form, DictionaryObject) and acro_form.get('/XFA'):
                    findings.append("XFA form detected in the PDF.")
                    counters['xfa_forms'] += 1
                    critical_findings.append("Document contains XFA forms, which can include dynamic content.")

            # Check for Names dictionary in /Root
            names_entry = root.get('/Names')
            if names_entry:
                names = _resolve(names_entry)
                # Check for embedded files
                if names.get('/EmbeddedFiles'):
                    findings.append("Embedded files detected in the PDF.")
                    counters['embedded_files'] += 1
                    critical_findings.append("Document contains embedded files.")
                # Check for JavaScript
                if names.get('/JavaScript'):
                    findings.append("JavaScript detected in the PDF.")
                    counters['javascript'] += 1
                    critical_findings.append("Document contains JavaScript code.")

            # Check for embedded multimedia content
            if '/RichMedia' in root or '/Media' in root:
                findings.append("Embedded multimedia content detected in the PDF.")
                counters['embedded_multimedia'] += 1
                critical_findings.append("Document contains embedded multimedia content.")

            # Fonts and images are de-duplicated by resource name across
            # the whole document, so each name is reported only once.
            fonts_checked = set()
            images_checked = set()

            for page_number, page in enumerate(reader.pages, start=1):
                print(f"\nAnalyzing Page {page_number}")
                page_obj = page.get_object()

                # Check for /AA (Additional Actions)
                if page_obj.get('/AA'):
                    findings.append(f"Potential malicious action found: /AA (Additional Actions) on page {page_number}.")
                    counters['additional_actions'] += 1
                    critical_findings.append(f"Page {page_number} contains /AA (Additional Actions).")

                resources = _resolve(page_obj.get('/Resources', {}))

                # Check for embedded fonts
                fonts = _resolve(resources.get('/Font', {}))
                for font_key in fonts:
                    if font_key not in fonts_checked:
                        fonts_checked.add(font_key)
                        findings.append(f"Embedded font detected: {font_key} on page {page_number}")
                        counters['embedded_fonts'] += 1

                # Check for embedded images
                xobjects = _resolve(resources.get('/XObject', {}))
                for xobj_key, xobj in xobjects.items():
                    xobj_obj = _resolve(xobj)
                    if not isinstance(xobj_obj, DictionaryObject):
                        continue  # dangling or malformed reference
                    if xobj_obj.get('/Subtype') == '/Image' and xobj_key not in images_checked:
                        images_checked.add(xobj_key)
                        findings.append(f"Embedded image detected: {xobj_key} on page {page_number}")
                        counters['embedded_images'] += 1

                # Check for annotations
                annots_entry = page_obj.get('/Annots')
                if annots_entry:
                    for annot_ref in _as_list(_resolve(annots_entry)):
                        annot = _resolve(annot_ref)
                        if not isinstance(annot, DictionaryObject):
                            continue  # dangling or malformed reference

                        # Check for external links
                        if annot.get('/Subtype') == '/Link':
                            action_entry = annot.get('/A')
                            if action_entry:
                                action = _resolve(action_entry)
                                uri = action.get('/URI')
                                if uri:
                                    findings.append(f"External link found on page {page_number}: {uri}")
                                    counters['external_links'] += 1
                                # Check for /Launch actions
                                if action.get('/S') == '/Launch':
                                    findings.append(f"Launch action detected on page {page_number}.")
                                    counters['launch_actions'] += 1
                                    critical_findings.append(f"Page {page_number} contains a /Launch action.")

                        # Check for actions in annotations
                        if '/AA' in annot:
                            findings.append(f"Annotation with additional actions found on page {page_number}.")
                            counters['additional_actions'] += 1
                            critical_findings.append(f"Annotation on page {page_number} contains /AA (Additional Actions).")

                # Check for suspicious filters in content streams. Resolve
                # /Contents first: an indirect reference may point at an
                # array of streams rather than at a single stream.
                contents_entry = page_obj.get('/Contents')
                if contents_entry:
                    for content in _as_list(_resolve(contents_entry)):
                        content_obj = _resolve(content)
                        if not isinstance(content_obj, DictionaryObject):
                            continue  # dangling or malformed reference
                        filters = _resolve(content_obj.get('/Filter'))
                        if not filters:
                            continue
                        for filter_name in _as_list(filters):
                            if filter_name in _SUSPICIOUS_FILTERS:
                                findings.append(f"Suspicious filter {filter_name} used on page {page_number}.")
                                counters['suspicious_filters'] += 1
                                critical_findings.append(f"Page {page_number} uses suspicious filter {filter_name}.")

            print("\nPDF analysis completed.")

        _print_report(counters, findings, critical_findings)
    except Exception as e:
        # Top-level boundary of the tool: report the failure instead of
        # crashing with a traceback.
        print(f"An error occurred: {e}")
if __name__ == "__main__":
    # Exactly one command-line argument is expected: the PDF to analyse.
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        analyze_pdf(cli_args[0])
    else:
        print("Usage: python analyze_pdf.py <pdf_file>")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment