Last active
October 20, 2015 10:29
-
-
Save mitya57/f616f32514e1f13da495 to your computer and use it in GitHub Desktop.
Statistical information about PDFs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get statistical information about PDFs | |
# Author: 2015 Dmitry Shachnev <[email protected]> | |
# Required packages (in Debian/Ubuntu): | |
# - gir1.2-poppler-0.18 | |
# - python3-gi or python-gi | |
import gi | |
gi.require_version('Poppler', '0.18') | |
from gi.repository import GLib, Poppler | |
# Length conversion: one point is 1/72 of an inch.
CMS_IN_INCH = 2.54
CMS_IN_PT = CMS_IN_INCH / 72.0

# Size of one quire, as printed area (square centimeters) and as characters.
SQUARE_CMS_IN_QUIRE = 3000
CHARS_IN_QUIRE = 25000

# Per-page character threshold; not used by the functions visible in this
# part of the file -- presumably consumed by callers (TODO confirm).
CHARS_IN_PAGE_THRESHOLD = 1000
class PdfInfo(object):
    """Plain container for the statistics collected about one PDF.

    All fields are class-level defaults; code that fills in a report
    overwrites them on the instance.
    """

    # Integer counters.
    pages_count = characters_count = images_count = 0

    # Quire estimate and area totals, stored as floats.
    quires_count = total_area = images_area = 0.0
def _PopplerDocument_get_pages(document): | |
n_pages = document.get_n_pages() | |
for i in range(n_pages): | |
yield document.get_page(i) | |
# Monkey-patch: attach the generator as a method of Poppler.Document so that
# the rest of this module can simply call document.get_pages().
Poppler.Document.get_pages = _PopplerDocument_get_pages
def get_area(document):
    """Return the combined area of all pages in the document, in square
    centimeters, rounded to four decimal places."""
    def page_area_sqcm(page):
        # Each side is converted from points and rounded on its own before
        # the two are multiplied.
        first_side_pt, second_side_pt = page.get_size()
        first_side_cm = round(first_side_pt * CMS_IN_PT, 4)
        second_side_cm = round(second_side_pt * CMS_IN_PT, 4)
        return first_side_cm * second_side_cm

    return round(sum(map(page_area_sqcm, document.get_pages())), 4)
def get_quires_count(document, use_square=False):
    """Return the approximate number of quires needed to print the document.

    With ``use_square`` true, the estimate is the total page area divided by
    SQUARE_CMS_IN_QUIRE.  Otherwise it is the sum of a text part
    (character count / CHARS_IN_QUIRE) and an image part
    (total image area / SQUARE_CMS_IN_QUIRE).

    The result is always a float.
    """
    if use_square:
        return float(get_area(document)) / SQUARE_CMS_IN_QUIRE
    images_area = get_images_info(document)[1]
    characters_count = get_characters_count(document)
    # The character count is an int and the image area is an int 0 when the
    # document has no images.  Convert explicitly so that "/" is a true
    # division under Python 2 as well, instead of flooring the text part to
    # a whole number of quires.
    quires_from_text = float(characters_count) / CHARS_IN_QUIRE
    quires_from_images = float(images_area) / SQUARE_CMS_IN_QUIRE
    return quires_from_text + quires_from_images
def get_characters_count(document):
    """Return the number of characters in the document.

    The text of every page (``page.get_text()``) is measured and the
    lengths are added up.  Text handed back as a byte string is decoded
    from UTF-8 first, so the result counts characters rather than bytes.
    """
    total = 0
    for page in document.get_pages():
        text = page.get_text()
        if isinstance(text, bytes):
            # Python 2 bindings return UTF-8 encoded byte strings; without
            # decoding, every multi-byte character would be counted once
            # per byte.  Undecodable bytes are replaced rather than raised.
            text = text.decode('utf-8', 'replace')
        total += len(text)
    return total
def get_images_info(document):
    """Return a ``(count, area)`` tuple: the number of images in the
    document and their summed area in square centimeters."""
    areas_sqcm = []
    for page in document.get_pages():
        for mapping in page.get_image_mapping():
            box = mapping.area
            # Box corners are converted with the points-to-centimeters factor.
            width_cm = (box.x2 - box.x1) * CMS_IN_PT
            height_cm = (box.y2 - box.y1) * CMS_IN_PT
            areas_sqcm.append(width_cm * height_cm)
    return len(areas_sqcm), sum(areas_sqcm)
def get_info(document):
    """Build and return a PdfInfo instance describing *document*."""
    summary = PdfInfo()

    # Gather every figure first (same order as the fields are reported).
    page_total = document.get_n_pages()
    character_total = get_characters_count(document)
    quire_estimate = get_quires_count(document)
    image_total, image_area = get_images_info(document)
    page_area = get_area(document)

    summary.pages_count = page_total
    summary.characters_count = character_total
    summary.quires_count = quire_estimate
    summary.images_count = image_total
    summary.images_area = image_area
    summary.total_area = page_area
    return summary
def document_from_file(filename):
    """Open the PDF at *filename* and return it as a Poppler.Document.

    *filename* may be relative; it is resolved against the current working
    directory first, because GLib.filename_to_uri() only accepts absolute
    paths.  Errors raised by GLib or Poppler propagate to the caller.
    """
    import os.path  # local import keeps this function self-contained

    uri = GLib.filename_to_uri(os.path.abspath(filename))
    return Poppler.Document.new_from_file(uri)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Пример использования: