Last active
October 14, 2015 23:35
-
-
Save jg-you/ccdb007c92e9cad1bbf3 to your computer and use it in GitHub Desktop.
Extract parts of a pdf file as a png (dirty hack)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Author: Jean-Gabriel Young | |
# Email: [email protected] | |
# -*- coding: utf-8 -*- | |
import argparse | |
import subprocess | |
import os | |
from PIL import Image | |
def extract(upper_bound=1, lower_bound=0, margin_x=0.075, margin_y=0.055,
            page=1, onecolumn=False, column=1, dpi=300,
            infile=None, outfile=None):
    """Rasterise one page of a PDF and save a cropped region of it.

    The requested page is isolated with ``pdfseparate``, rendered to PNG
    with ``pdftocairo`` and cropped with PIL. Intermediate files live in a
    private temporary directory that is removed even when a step fails.

    Args:
        upper_bound: One vertical edge of the crop box, as a fraction of the
            page height left once the top and bottom margins are removed
            (0 is the top of that area, 1 the bottom).
        lower_bound: The other vertical edge, in the same units. The two
            edges are ordered before cropping, so they may be given in
            either order.
        margin_x: Horizontal margin, as a fraction of the page width.
        margin_y: Vertical margin, as a fraction of the page height.
        page: Page number handed to ``pdfseparate`` as both ``-f`` and
            ``-l``.
        onecolumn: If true, crop from the left margin to the right margin.
        column: Only used when ``onecolumn`` is false. ``1`` selects the
            left half of the page; any other value selects the right half.
        dpi: Rendering resolution passed to ``pdftocairo -r``.
        infile: Path to the input PDF.
        outfile: Path the cropped image is saved to.

    Raises:
        TypeError: If ``infile`` or ``outfile`` is ``None``.
        subprocess.CalledProcessError: If ``pdfseparate`` or ``pdftocairo``
            exits with a non-zero status.
    """
    import tempfile  # Local import: nothing else in the module needs it.

    # Debug trace, kept verbatim so the script's stdout is unchanged.
    print(dpi, margin_x, margin_y, upper_bound, lower_bound, page, onecolumn, dpi, infile, outfile)

    # Fail before spawning any external tool rather than after rendering.
    if infile is None or outfile is None:
        raise TypeError("extract() requires both 'infile' and 'outfile'")

    # A private directory avoids clashes between concurrent runs and the
    # predictable /tmp file names, and guarantees cleanup on every exit path.
    with tempfile.TemporaryDirectory(prefix="extraction_") as tmp_dir:
        tmp_pdf_path = os.path.join(tmp_dir, "extraction_tmp.pdf")
        tmp_png_root = os.path.join(tmp_dir, "extraction_tmp")
        # With -singlefile, pdftocairo writes exactly "<root>.png".
        tmp_png_path = tmp_png_root + ".png"

        # Process pdf and create a PNG of the page that contains the figure.
        # check_call surfaces tool failures instead of carrying on without
        # the expected intermediate file.
        subprocess.check_call(["pdfseparate",
                               "-f", str(page),   # First page
                               "-l", str(page),   # Last page
                               infile,            # Input file
                               tmp_pdf_path])     # Output file
        subprocess.check_call(["pdftocairo",
                               "-png",            # Output in cairo png
                               "-r", str(dpi),    # DPI
                               "-singlefile",     # Do not rename
                               tmp_pdf_path,      # Input file
                               tmp_png_root])     # Output file (root name)

        # Load page in PIL and crop it. Crop and save while the source image
        # is still open; the name no longer shadows the `page` parameter.
        with Image.open(tmp_png_path) as page_image:
            width, height = page_image.size
            bounding_box = _crop_box(width, height, upper_bound, lower_bound,
                                     margin_x, margin_y, onecolumn, column)
            page_image.crop(bounding_box).save(outfile)


def _crop_box(width, height, upper_bound, lower_bound, margin_x, margin_y,
              onecolumn, column):
    """Return the (left, top, right, bottom) pixel box used by extract()."""
    usable_height = height * (1 - 2 * margin_y)
    top = int(height * margin_y + usable_height * upper_bound)
    bottom = int(height * margin_y + usable_height * lower_bound)
    # The signature defaults (upper_bound=1, lower_bound=0) put the "upper"
    # edge below the "lower" one; order them so the box is always valid.
    if top > bottom:
        top, bottom = bottom, top

    if onecolumn:
        left, right = int(width * margin_x), int(width * (1 - margin_x))
    elif column == 1:
        # Left column: stops just short of the page centre.
        left, right = int(width * margin_x), int(width * 0.49)
    else:
        # Right column: starts just past the page centre.
        left, right = int(width * 0.51), int(width * (1 - margin_x))
    return (left, top, right, bottom)
if __name__ == '__main__':
    # Command-line front end: every option maps one-to-one onto a keyword
    # argument of extract().
    prs = argparse.ArgumentParser(
        description='Crop parts of a PDF file and output to PNG.')
    prs.add_argument('--upper', '-u', type=float, default=0.0,
                     help='Upper bound of the crop box, as a fraction of '
                          'the page height (excluding margins). The origin '
                          'is the top left-hand corner of the page.')
    prs.add_argument('--lower', '-l', type=float, default=1.0,
                     help='Lower bound of the crop box, as a fraction of '
                          'the page height (excluding margins). The origin '
                          'is the top left-hand corner of the page.')
    prs.add_argument('--margin_x', '-mx', type=float, default=0.075,
                     help='Width of the margin (x), as a fraction of the '
                          'page width.')
    prs.add_argument('--margin_y', '-my', type=float, default=0.055,
                     help='Height of the margin (y), as a fraction of the '
                          'page height.')
    prs.add_argument('--page', '-p', type=int, default=1,
                     help='Page of the pdf.')
    prs.add_argument('--onecolumn', '-t', action='store_true',
                     help='Crop across the full text width (the cropped '
                          'area spans both columns).')
    prs.add_argument('--column', '-c', type=int, default=1,
                     help='Column index when the crop box spans a single '
                          'column: 1 for the left column, any other value '
                          'for the right column.')
    prs.add_argument('--dpi', '-d', type=int, default=300,
                     help='Resolution of the image.')
    prs.add_argument('infile', type=str,
                     help='Path to input pdf.')
    # The tool writes a PNG, not a PDF.
    prs.add_argument('outfile', type=str,
                     help='Path to output png.')
    args = prs.parse_args()
    extract(upper_bound=args.upper,
            lower_bound=args.lower,
            margin_x=args.margin_x,
            margin_y=args.margin_y,
            page=args.page,
            onecolumn=args.onecolumn,
            column=args.column,
            dpi=args.dpi,
            infile=args.infile,
            outfile=args.outfile)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Author: Jean-Gabriel Young | |
# Email: [email protected] | |
# -*- coding: utf-8 -*- | |
import argparse | |
from extraction import extract | |
def simple_extract(anchor=0, page=1, onecolumn=False, column=1,
                   infile=None, outfile=None):
    """Call extract() with one of two fixed crop presets.

    The crop box starts at ``anchor`` and has a preset height; margin and
    resolution are also preset. ``onecolumn`` picks the full-width preset
    (in which case ``column`` is ignored and 0 is forwarded); otherwise the
    single-column preset is used and ``column`` is forwarded unchanged.

    Args:
        anchor: Upper bound of the crop box, forwarded as ``upper_bound``.
        page: Page of the pdf.
        onecolumn: Select the full-width preset when true.
        column: Column index for the single-column preset.
        infile: Path to the input pdf.
        outfile: Path the cropped image is written to.
    """
    # (box height, margin_x, onecolumn flag, column, dpi) for each layout.
    if onecolumn:
        box_height, x_margin, full_width, col_index, resolution = (
            0.0975, 0.063, True, 0, 210)
    else:
        box_height, x_margin, full_width, col_index, resolution = (
            0.0457, 0.0803, False, column, 448)

    # Round so the lower bound stays free of float noise from the addition.
    bottom = round(anchor + box_height, 4)
    extract(upper_bound=anchor,
            lower_bound=bottom,
            margin_x=x_margin,
            page=page,
            onecolumn=full_width,
            column=col_index,
            dpi=resolution,
            infile=infile,
            outfile=outfile)
if __name__ == '__main__':
    # Command-line front end: options map one-to-one onto the keyword
    # arguments of simple_extract().
    prs = argparse.ArgumentParser(description='Wrapper around extraction.py.')
    prs.add_argument('--anchor', '-a', type=float, default=0.0,
                     help='y anchor of the cropbox.')
    prs.add_argument('--page', '-p', type=int, default=1,
                     help='Page of the pdf.')
    prs.add_argument('--onecolumn', '-t', action='store_true',
                     help='Crop across the full text width (the cropped '
                          'area spans both columns).')
    prs.add_argument('--column', '-c', type=int, default=1,
                     help='Column index (used when the crop box spans a '
                          'single column).')
    prs.add_argument('infile', type=str,
                     help='Path to input pdf.')
    # The wrapped tool writes a PNG, not a PDF.
    prs.add_argument('outfile', type=str,
                     help='Path to output png.')
    args = prs.parse_args()
    simple_extract(anchor=args.anchor,
                   page=args.page,
                   onecolumn=args.onecolumn,
                   column=args.column,
                   infile=args.infile,
                   outfile=args.outfile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment