Skip to content

Instantly share code, notes, and snippets.

@scruss
Created April 20, 2017 02:52
Show Gist options
  • Save scruss/d07545f8d71ed7ef9ca064d8e3075626 to your computer and use it in GitHub Desktop.
Save scruss/d07545f8d71ed7ef9ca064d8e3075626 to your computer and use it in GitHub Desktop.
pdf-meta.py - extract metadata from first PDF file given as argument
#!/usr/bin/env python3
# extract metadata from first PDF file given as argument
# scruss - 2017
# -*- coding: utf-8 -*-
import sys
from PyPDF2 import PdfFileReader
# Attributes read from the XMP metadata object, in the order they are printed.
XMP_FIELDS = (
    "dc_contributor",
    "dc_coverage",
    "dc_creator",
    "dc_date",
    "dc_description",
    "dc_format",
    "dc_identifier",
    "dc_language",
    "dc_publisher",
    "dc_relation",
    "dc_rights",
    "dc_source",
    "dc_subject",  # tags/keywords
    "dc_title",
    "dc_type",
    "pdf_keywords",
    "pdf_pdfversion",
    "pdf_producer",
    "xmp_createDate",
    "xmp_creatorTool",
    "xmp_metadataDate",
    "xmpmm_documentId",
    "xmpmm_instanceId",
    "xmp_modifyDate",
)


def main(argv=None):
    """Print the metadata of the PDF file named by the first argument.

    Args:
        argv: Argument list laid out like ``sys.argv`` (program name first,
            then the PDF path). Defaults to ``sys.argv`` when omitted.

    Returns:
        0 after the metadata has been printed, or 2 when no PDF path was
        supplied (a usage line is written to stderr in that case).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        prog = argv[0] if argv else "pdf-meta.py"
        print("usage: " + prog + " file.pdf", file=sys.stderr)
        return 2

    # Keep the file open for as long as the reader is queried; the with
    # block closes it afterwards instead of leaking the handle.
    with open(argv[1], "rb") as pdf_file:
        reader = PdfFileReader(pdf_file)
        doc_info = reader.getDocumentInfo()
        # PyPDF2 has very limited docinfo fields;
        # not much more than these three. getDocumentInfo() may hand back
        # None (no info dictionary), so fall back to None per field rather
        # than failing on the attribute lookup.
        print('Title: ', getattr(doc_info, 'title', None))
        print('Subject: ', getattr(doc_info, 'subject', None))
        print('Author: ', getattr(doc_info, 'author', None))

        # XMP stream, if present, can have much more.
        # mostly under Dublin Core Metadata standards:
        # http://dublincore.org/
        xmp_info = reader.getXmpMetadata()
        if xmp_info is not None:
            print("\n*** XMP data -\n")
            for field in XMP_FIELDS:
                print(field + ": ", getattr(xmp_info, field))
    return 0


if __name__ == "__main__":
    sys.exit(main())
# for
# https://www.epa.gov/sites/production/files/2016-09/documents/climate-change-hi.pdf
# this returns:
# dc_contributor: []
# dc_coverage: None
# dc_creator: ['US EPA', 'OAR', 'Climate Change Division']
# dc_date: []
# dc_description: {'x-default': 'This fact sheet provides a concise overview of the observed and projected effects and impacts of climate change on Hawaii.'}
# dc_format: application/pdf
# dc_identifier: None
# dc_language: []
# dc_publisher: []
# dc_relation: []
# dc_rights: {}
# dc_source: None
# dc_subject: ['EPA', 'climate change', 'state', 'impacts', 'fact sheet', 'summary']
# dc_title: {'x-default': 'What Climate Change Means for Hawaii'}
# dc_type: []
# pdf_keywords: EPA; climate change; state; impacts; fact sheet; summary
# pdf_pdfversion: None
# pdf_producer: Adobe PDF Library 15.0
# xmp_createDate: 2016-04-08 14:41:23
# xmp_creatorTool: Adobe InDesign CC 2015 (Windows)
# xmp_metadataDate: 2016-08-24 15:05:46
# xmpmm_documentId: xmp.id:43618963-6411-e44a-8538-2301fb5e6821
# xmpmm_instanceId: uuid:196fd5fa-8ecf-44b0-870b-086d08707697
# xmp_modifyDate: 2016-08-24 15:05:46
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment