Created
April 20, 2017 02:52
-
-
Save scruss/d07545f8d71ed7ef9ca064d8e3075626 to your computer and use it in GitHub Desktop.
pdf-meta.py - extract metadata from first PDF file given as argument
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# extract metadata from first PDF file given as argument
# scruss - 2017
# -*- coding: utf-8 -*-
import sys

from PyPDF2 import PdfFileReader

# XMP attributes read from the object returned by getXmpMetadata(),
# listed in the order they are printed.  Mostly Dublin Core Metadata
# fields: http://dublincore.org/
XMP_FIELDS = (
    "dc_contributor",
    "dc_coverage",
    "dc_creator",
    "dc_date",
    "dc_description",
    "dc_format",
    "dc_identifier",
    "dc_language",
    "dc_publisher",
    "dc_relation",
    "dc_rights",
    "dc_source",
    "dc_subject",  # tags/keywords
    "dc_title",
    "dc_type",
    "pdf_keywords",
    "pdf_pdfversion",
    "pdf_producer",
    "xmp_createDate",
    "xmp_creatorTool",
    "xmp_metadataDate",
    "xmpmm_documentId",
    "xmpmm_instanceId",
    "xmp_modifyDate",
)


def main():
    """Print the metadata of the PDF file named by the first argument.

    Prints the document-info title, subject and author, then, if the
    file carries an XMP metadata stream, every attribute named in
    XMP_FIELDS.  Exits with a usage message on stderr when no file
    argument is given.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: {} FILE.pdf".format(sys.argv[0]))

    # Keep the file open for the whole run: the reader pulls objects
    # from the underlying stream on demand, and the context manager
    # guarantees the handle is closed afterwards.
    with open(sys.argv[1], "rb") as pdfFile:
        inputPdf = PdfFileReader(pdfFile)

        # PyPDF2 has very limited docinfo fields; not much more than
        # these three.  getDocumentInfo() may return None when the PDF
        # has no info dictionary, so fall back to None for each field
        # instead of failing on attribute access.
        docInfo = inputPdf.getDocumentInfo()
        title = subject = author = None
        if docInfo is not None:
            title = docInfo.title
            subject = docInfo.subject
            author = docInfo.author
        print('Title: ', title)
        print('Subject: ', subject)
        print('Author: ', author)

        # XMP stream, if present, can have much more.
        xmpInfo = inputPdf.getXmpMetadata()
        if xmpInfo is not None:
            print("\n*** XMP data -\n")
            for field in XMP_FIELDS:
                print(field + ": ", getattr(xmpInfo, field))


if __name__ == "__main__":
    main()
# for
# https://www.epa.gov/sites/production/files/2016-09/documents/climate-change-hi.pdf
# this returns:
# dc_contributor: []
# dc_coverage: None
# dc_creator: ['US EPA', 'OAR', 'Climate Change Division']
# dc_date: []
# dc_description: {'x-default': 'This fact sheet provides a concise overview of the observed and projected effects and impacts of climate change on Hawaii.'}
# dc_format: application/pdf
# dc_identifier: None
# dc_language: []
# dc_publisher: []
# dc_relation: []
# dc_rights: {}
# dc_source: None
# dc_subject: ['EPA', 'climate change', 'state', 'impacts', 'fact sheet', 'summary']
# dc_title: {'x-default': 'What Climate Change Means for Hawaii'}
# dc_type: []
# pdf_keywords: EPA; climate change; state; impacts; fact sheet; summary
# pdf_pdfversion: None
# pdf_producer: Adobe PDF Library 15.0
# xmp_createDate: 2016-04-08 14:41:23
# xmp_creatorTool: Adobe InDesign CC 2015 (Windows)
# xmp_metadataDate: 2016-08-24 15:05:46
# xmpmm_documentId: xmp.id:43618963-6411-e44a-8538-2301fb5e6821
# xmpmm_instanceId: uuid:196fd5fa-8ecf-44b0-870b-086d08707697
# xmp_modifyDate: 2016-08-24 15:05:46
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment