Skip to content

Instantly share code, notes, and snippets.

@kylebutts
Last active November 24, 2021 15:21
Show Gist options
  • Select an option

  • Save kylebutts/97f1f1368a19fa188ff457c5b4e76642 to your computer and use it in GitHub Desktop.

Select an option

Save kylebutts/97f1f1368a19fa188ff457c5b4e76642 to your computer and use it in GitHub Desktop.
Extract table from pdf pages using Layout.Parser
# Extract images from PDFs
import pdfplumber
# Layout Parser
import layoutparser as lp
import cv2
# regex
import re
# Plotting
import matplotlib.pyplot as plt
%matplotlib inline
# Working with dataframes
from statistics import mode
import numpy as np
import pandas as pd
# ---- Convert pdf to png ------------------------------------------------------
# Create a high-resolution (300 dpi) PNG for each page of interest.
# `pdf.pages` is indexed from 0, so index i is written out as page i + 1.
with pdfplumber.open("raw_data/nat64.pdf") as pdf:
    # Pages 107-117, then pages 125-135
    for i in [*range(106, 117), *range(124, 135)]:
        file_path = f"raw_data/nat64_page_{i + 1}.png"
        # The rendered image is only needed long enough to be written to
        # disk, so the result of `save` is not kept.
        pdf.pages[i].to_image(resolution=300).save(file_path, format="PNG")
# ---- Load Parser -------------------------------------------------------------
# Load the table-detection layout model through the layoutparser API.
# The supported models are listed on the Model Zoo page:
# https://layout-parser.readthedocs.io/en/latest/notes/modelzoo.html
model = lp.Detectron2LayoutModel(
    "lp://TableBank/faster_rcnn_R_101_FPN_3x/config",
    # The only label this script filters on later is "Table"
    label_map={0: "Table"},
    # Sets MODEL.ROI_HEADS.SCORE_THRESH_TEST to 0.8
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
)
# Helper function for extracting table image
def extract_table(file_path, debug=False):
    """
    Extract every detected table from a page image using layout-parser.

    Each detected table is cropped (with a little padding) and written next
    to the input image. The first table is saved as ``<name>_table<ext>``;
    any further tables on the same page are saved as ``<name>_table_2<ext>``,
    ``<name>_table_3<ext>``, ... so that they do not overwrite one another.

    Parameters
    ----------
    file_path : str
        Path of the page image to read.
    debug : bool, default False
        When true, plot the full detected layout and the table boxes.

    Returns
    -------
    list of str
        Paths of the table images that were written, in detection order
        (empty when no table was detected).

    Raises
    ------
    FileNotFoundError
        If the image at ``file_path`` cannot be read.
    """
    import os

    image = cv2.imread(file_path)
    # cv2.imread signals failure by returning None instead of raising
    if image is None:
        raise FileNotFoundError(f"Could not read image: {file_path}")

    # Detect the layout of the input image
    layout = model.detect(image)
    if debug:
        # Show the detected layout of the input image. The drawing has to be
        # plotted explicitly: a bare expression inside a function is not
        # displayed by the notebook.
        plt.imshow(lp.draw_box(image, layout, box_width=3))
        plt.axis("off")
        plt.show()

    # Extract the table(s) from the layout
    tables = lp.Layout([b for b in layout if b.type == 'Table'])
    if debug:
        # Visualize table(s) on image
        plt.imshow(
            lp.draw_box(image, tables, box_width=3, show_element_id=True)
        )
        plt.axis("off")
        plt.show()

    # Split on the real extension only, so dots elsewhere in the path (or a
    # missing extension) cannot corrupt the name or overwrite the input file.
    root, ext = os.path.splitext(file_path)

    exported = []
    for index, block in enumerate(tables, start=1):
        # Crop image to the table `TextBlock`
        # add padding in each image segment can help improve robustness
        segment_image = (block
                         .pad(left=20, right=20, top=10, bottom=10)
                         .crop_image(image))
        # First table keeps the plain "_table" suffix; later ones are numbered
        suffix = "_table" if index == 1 else f"_table_{index}"
        export_file_path = f"{root}{suffix}{ext}"
        cv2.imwrite(export_file_path, segment_image)
        exported.append(export_file_path)
    return exported
# ---- Debugging ---------------------------------------------------------------
# Manual check on a single page image (page 108): run the detector and draw
# every detected block on top of the page.
file_path = "raw_data/nat64_page_108.png"
image = cv2.imread(file_path)
# Detect the layout of the input image
layout = model.detect(image)
# Show the detected layout of the input image
# NOTE(review): the drawing is presumably only displayed when this is the last
# expression of a notebook cell — confirm
lp.draw_box(image, layout, box_width=3)
# ---- Extract All -------------------------------------------------------------
# Run the table extraction on every rendered page image:
# pages 107-117 first, then pages 125-135.
for i in [*range(106, 117), *range(124, 135)]:
    file_path = f"raw_data/nat64_page_{i + 1}.png"
    extract_table(file_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment