kylebutts · November 24, 2021 15:21
diff --git a/extract_table.py b/extract_table.py
 # Extract images from PDFs
 import pdfplumber

 # Layout Parser
 import layoutparser as lp 
 import cv2

 # regex
 import re 

 # Plotting
 import matplotlib.pyplot as plt
 %matplotlib inline 

 # Working with dataframes
 from statistics import mode
 import numpy as np
 import pandas as pd


 # ---- Convert pdf to png ------------------------------------------------------

 # Render the PDF pages of interest (pages 107-117 and 125-135) to PNG files,
 # one image per page, at resolution=300.
 with pdfplumber.open("raw_data/nat64.pdf") as pdf:
    # `pdf.pages` is indexed from 0 while the output file names use the
    # 1-based page number, hence `i + 1` in the file name.
    for i in list(range(106, 117)) + list(range(124, 135)):
        file_path = "raw_data/nat64_page_" + str(i + 1) + ".png"
        # `.save()` is called for its side effect only; its result is not kept.
        pdf.pages[i].to_image(resolution=300).save(file_path, format="PNG")


 # ---- Load Parser -------------------------------------------------------------

 # Build the table detector from layoutparser's model catalog using the
 # TableBank `faster_rcnn_R_101_FPN_3x` configuration. The only label this
 # configuration maps is 0 -> "Table".
 # Other supported models are listed on the Model Zoo page:
 # https://layout-parser.readthedocs.io/en/latest/notes/modelzoo.html
 model = lp.Detectron2LayoutModel(
    "lp://TableBank/faster_rcnn_R_101_FPN_3x/config",
    label_map={0: "Table"},
    # Overrides MODEL.ROI_HEADS.SCORE_THRESH_TEST with 0.8
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
 )

 # Helper function for extracting table image 

 def extract_table(file_path, debug=False):
    """
    Crop each detected table out of a page image and save it beside the source.

    The image at ``file_path`` is read with OpenCV and passed to the
    module-level layout-parser ``model``. Every detected block whose type is
    ``"Table"`` is padded, cropped from the page and written to disk.

    Output naming: the first table goes to ``<name>_table<ext>`` (for example
    ``nat64_page_108.png`` -> ``nat64_page_108_table.png``); any further
    tables on the same page go to ``<name>_table_2<ext>``,
    ``<name>_table_3<ext>``, ... so that they do not overwrite one another.

    Parameters
    ----------
    file_path : str
        Path to the page image.
    debug : bool, optional
        If true, also call ``lp.draw_box`` for the full layout and for the
        table-only layout.

    Returns
    -------
    list of str
        Paths of the table images that were written, in the order the table
        blocks appear in the layout (empty when no table was found).

    Raises
    ------
    OSError
        If the image cannot be read or a cropped table cannot be written.
    """
    # Local import so the path handling below does not depend on the
    # file-level import block.
    import os

    image = cv2.imread(file_path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise OSError("Could not read image: " + str(file_path))

    # Detect the layout of the input image
    layout = model.detect(image)

    if debug:
        # NOTE(review): the value returned by lp.draw_box is not used here, so
        # nothing is displayed from inside this function -- confirm intent.
        lp.draw_box(image, layout, box_width=3)

    # Keep only the blocks labelled as tables
    tables = lp.Layout([b for b in layout if b.type == 'Table'])

    if debug:
        # Same as above, restricted to the table blocks and with element ids
        lp.draw_box(image, tables, box_width=3, show_element_id=True)

    # Split on the final extension only, so dots elsewhere in the path
    # (e.g. "./raw_data/page.png") are left untouched and a path without an
    # extension never maps back onto the source file.
    root, ext = os.path.splitext(file_path)

    written = []
    for index, block in enumerate(tables):
        # Crop image to the table `TextBlock`; a little padding around each
        # segment can help improve robustness
        segment_image = (block
            .pad(left=20, right=20, top=10, bottom=10)
            .crop_image(image))

        # First table keeps the plain "_table" name; later ones are numbered
        suffix = "_table" if index == 0 else "_table_" + str(index + 1)
        export_file_path = root + suffix + ext

        # cv2.imwrite reports failure through its boolean return value.
        if not cv2.imwrite(export_file_path, segment_image):
            raise OSError("Could not write image: " + export_file_path)
        written.append(export_file_path)

    return written



 # ---- Debugging ---------------------------------------------------------------

 # Run the detector on a single page image (page 108) for inspection.
 file_path = "raw_data/nat64_page_108.png"

 # Read the page image from disk with OpenCV
 image = cv2.imread(file_path)

 # Detect the layout of the input image
 layout = model.detect(image)


 # Draw the detected layout boxes for the input image.
 # NOTE(review): the value returned by `lp.draw_box` is not assigned;
 # presumably this relied on notebook cell output (the file uses
 # `%matplotlib inline`) -- confirm.
 lp.draw_box(image, layout, box_width=3)



 # ---- Extract All -------------------------------------------------------------

 # Extract tables from every rendered page image:
 # pages 107-117 first, then pages 125-135.
 for i in list(range(106, 117)) + list(range(124, 135)):
    # Indices are 0-based; the image file names carry the 1-based page number
    file_path = "raw_data/nat64_page_" + str(i + 1) + ".png"
    extract_table(file_path)
	# Extract images from PDFs
	import pdfplumber

	# Layout Parser
	import layoutparser as lp
	import cv2

	# regex
	import re

	# Plotting
	import matplotlib.pyplot as plt
	%matplotlib inline

	# Working with dataframes
	from statistics import mode
	import numpy as np
	import pandas as pd


	# ---- Convert pdf to png ------------------------------------------------------

	# Render the PDF pages of interest (pages 107-117 and 125-135) to PNG files,
	# one image per page, at resolution=300.
	with pdfplumber.open("raw_data/nat64.pdf") as pdf:
	    # `pdf.pages` is indexed from 0 while the output file names use the
	    # 1-based page number, hence `i + 1` in the file name.
	    for i in list(range(106, 117)) + list(range(124, 135)):
	        file_path = "raw_data/nat64_page_" + str(i + 1) + ".png"
	        # `.save()` is called for its side effect only; its result is not kept.
	        pdf.pages[i].to_image(resolution=300).save(file_path, format="PNG")


	# ---- Load Parser -------------------------------------------------------------

	# Build the table detector from layoutparser's model catalog using the
	# TableBank `faster_rcnn_R_101_FPN_3x` configuration. The only label this
	# configuration maps is 0 -> "Table".
	# Other supported models are listed on the Model Zoo page:
	# https://layout-parser.readthedocs.io/en/latest/notes/modelzoo.html
	model = lp.Detectron2LayoutModel(
	    "lp://TableBank/faster_rcnn_R_101_FPN_3x/config",
	    label_map={0: "Table"},
	    # Overrides MODEL.ROI_HEADS.SCORE_THRESH_TEST with 0.8
	    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
	)

	# Helper function for extracting table image

	def extract_table(file_path, debug=False):
	    """
	    Crop each detected table out of a page image and save it beside the source.

	    The image at ``file_path`` is read with OpenCV and passed to the
	    module-level layout-parser ``model``. Every detected block whose type is
	    ``"Table"`` is padded, cropped from the page and written to disk.

	    Output naming: the first table goes to ``<name>_table<ext>`` (for example
	    ``nat64_page_108.png`` -> ``nat64_page_108_table.png``); any further
	    tables on the same page go to ``<name>_table_2<ext>``,
	    ``<name>_table_3<ext>``, ... so that they do not overwrite one another.

	    Parameters
	    ----------
	    file_path : str
	        Path to the page image.
	    debug : bool, optional
	        If true, also call ``lp.draw_box`` for the full layout and for the
	        table-only layout.

	    Returns
	    -------
	    list of str
	        Paths of the table images that were written, in the order the table
	        blocks appear in the layout (empty when no table was found).

	    Raises
	    ------
	    OSError
	        If the image cannot be read or a cropped table cannot be written.
	    """
	    # Local import so the path handling below does not depend on the
	    # file-level import block.
	    import os

	    image = cv2.imread(file_path)
	    # cv2.imread signals failure by returning None instead of raising.
	    if image is None:
	        raise OSError("Could not read image: " + str(file_path))

	    # Detect the layout of the input image
	    layout = model.detect(image)

	    if debug:
	        # NOTE(review): the value returned by lp.draw_box is not used here, so
	        # nothing is displayed from inside this function -- confirm intent.
	        lp.draw_box(image, layout, box_width=3)

	    # Keep only the blocks labelled as tables
	    tables = lp.Layout([b for b in layout if b.type == 'Table'])

	    if debug:
	        # Same as above, restricted to the table blocks and with element ids
	        lp.draw_box(image, tables, box_width=3, show_element_id=True)

	    # Split on the final extension only, so dots elsewhere in the path
	    # (e.g. "./raw_data/page.png") are left untouched and a path without an
	    # extension never maps back onto the source file.
	    root, ext = os.path.splitext(file_path)

	    written = []
	    for index, block in enumerate(tables):
	        # Crop image to the table `TextBlock`; a little padding around each
	        # segment can help improve robustness
	        segment_image = (block
	            .pad(left=20, right=20, top=10, bottom=10)
	            .crop_image(image))

	        # First table keeps the plain "_table" name; later ones are numbered
	        suffix = "_table" if index == 0 else "_table_" + str(index + 1)
	        export_file_path = root + suffix + ext

	        # cv2.imwrite reports failure through its boolean return value.
	        if not cv2.imwrite(export_file_path, segment_image):
	            raise OSError("Could not write image: " + export_file_path)
	        written.append(export_file_path)

	    return written



	# ---- Debugging ---------------------------------------------------------------

	# Run the detector on a single page image (page 108) for inspection.
	file_path = "raw_data/nat64_page_108.png"

	# Read the page image from disk with OpenCV
	image = cv2.imread(file_path)

	# Detect the layout of the input image
	layout = model.detect(image)


	# Draw the detected layout boxes for the input image.
	# NOTE(review): the value returned by `lp.draw_box` is not assigned;
	# presumably this relied on notebook cell output (the file uses
	# `%matplotlib inline`) -- confirm.
	lp.draw_box(image, layout, box_width=3)



	# ---- Extract All -------------------------------------------------------------

	# Extract tables from every rendered page image:
	# pages 107-117 first, then pages 125-135.
	for i in list(range(106, 117)) + list(range(124, 135)):
	    # Indices are 0-based; the image file names carry the 1-based page number
	    file_path = "raw_data/nat64_page_" + str(i + 1) + ".png"
	    extract_table(file_path)
No results found