Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save kabeer11000/0bbbceaed66df29814f4dd509a133c59 to your computer and use it in GitHub Desktop.
Save kabeer11000/0bbbceaed66df29814f4dd509a133c59 to your computer and use it in GitHub Desktop.
Extract the bounding box of the first, leftmost rectangle dividing labels and annotations in a PDF-formatted UCSC genome browser shot
#!/usr/bin/env python
import sys
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams, LTFigure, LTCurve, LTRect
from pdfminer.converter import PDFPageAggregator
# Module-level state shared through the set_*/get_* helpers below.
# g_fn holds the path of the PDF to parse; g_bbox holds the bbox tuple found.
g_fn = g_bbox = None
def parse_lt_objs(lt_objs, page_number, found_rect, text=None):
    """
    Walk the LT* layout objects in document order, descending into
    LTFigure containers, and capture the bbox of the first LTRect met.

    lt_objs     -- iterable of layout objects (a page layout or an LTFigure)
    page_number -- 1-based page index; accepted for callers, not used here
    found_rect  -- when true, a rectangle was already found and nothing
                   more is collected
    text        -- unused; kept so existing call sites keep working

    Returns a list holding at most one bbox tuple (empty if no LTRect
    was found or found_rect was already true).
    """
    content = []
    for lt_obj in lt_objs:
        if found_rect:
            # Only the first rectangle is wanted; stop scanning.
            break
        if isinstance(lt_obj, LTRect):
            content.append(lt_obj.bbox)
            found_rect = True
        elif isinstance(lt_obj, LTFigure):
            # LTFigure objects are containers for other LT* objects, so
            # recurse through the children.  found_rect is passed by value,
            # so a hit inside the figure must be propagated back explicitly,
            # otherwise the loop would go on and collect further rectangles.
            nested = parse_lt_objs(lt_obj, page_number, found_rect)
            if nested:
                content.extend(nested)
                found_rect = True
    return content
def set_bbox(b=None):
    """Record b as the module-wide bounding box (None clears it)."""
    global g_bbox
    g_bbox = b
def get_bbox():
    """Return the bounding box last stored in the module-level g_bbox."""
    current_bbox = g_bbox
    return current_bbox
def set_fn(f=None):
    """
    Record the PDF filename in the module-level g_fn.

    A falsy f (None, empty string) falls back to the first
    command-line argument, sys.argv[1].
    """
    global g_fn
    if f:
        g_fn = f
    else:
        g_fn = sys.argv[1]
def get_fn():
    """Return the PDF filename last stored in the module-level g_fn."""
    current_fn = g_fn
    return current_fn
def main():
    """
    Command-line entry point.

    Takes the PDF path from sys.argv[1], stores it with set_fn() and
    runs parse() with debug output enabled.  Exits with a usage message
    (non-zero status) when no path was given, instead of failing with a
    bare IndexError.
    """
    if len(sys.argv) < 2:
        prog = sys.argv[0] if sys.argv else "script"
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("usage: %s <browser-shot.pdf>" % (prog,))
    set_fn(sys.argv[1])
    parse(True)
def parse(debug=False):
    """
    Locate the first LTRect in the PDF named by get_fn() and store its
    bbox with set_bbox().

    Pages are laid out one at a time and scanned with parse_lt_objs();
    scanning stops at the first page that yields a rectangle.

    debug -- when true, write the bbox that was found to stderr.

    Raises PDFTextExtractionNotAllowed if the document is not
    extractable, and IndexError if no rectangle is found on any page.
    """
    content = []
    # The context manager closes the PDF even when parsing raises.
    with open(get_fn(), 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, None)
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for i, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            layout = device.get_result()
            content.extend(parse_lt_objs(layout, (i+1), False))
            if content:
                # Only content[0] is used, so later pages need no layout pass.
                break
    if not content:
        # Same exception type the bare content[0] lookup would raise,
        # but with a message that says what went wrong.
        raise IndexError("no LTRect found in %s" % (get_fn(),))
    set_bbox(content[0])
    # print first LTRect object bbox tuple
    if debug:
        sys.stderr.write("First LTRect bbox in PDF -> %s\n" % (get_bbox(),))
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment