Last active
November 24, 2021 15:21
-
-
Save kylebutts/97f1f1368a19fa188ff457c5b4e76642 to your computer and use it in GitHub Desktop.
Extract table from pdf pages using Layout.Parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Extract images from PDFs | |
| import pdfplumber | |
| # Layout Parser | |
| import layoutparser as lp | |
| import cv2 | |
| # regex | |
| import re | |
| # Plotting | |
| import matplotlib.pyplot as plt | |
| %matplotlib inline | |
| # Working with dataframes | |
| from statistics import mode | |
| import numpy as np | |
| import pandas as pd | |
# ---- Convert pdf to png ------------------------------------------------------
# Render every PDF page of interest to a 300-dpi PNG, one file per page.
# The ranges below are 0-based, end-exclusive indices into `pdf.pages`; the
# output file name carries the 1-based page number (index + 1).
pdf_page_index_ranges = (
    range(106, 117),  # Pages 107-117
    range(124, 135),  # Pages 125-135
)
with pdfplumber.open("raw_data/nat64.pdf") as pdf:
    for page_index_range in pdf_page_index_ranges:
        for i in page_index_range:
            file_path = "raw_data/nat64_page_" + str(i + 1) + ".png"
            # The chain is run only for its side effect of writing the PNG, so
            # its result is not bound to a name.
            pdf.pages[i].to_image(resolution=300).save(file_path, format="PNG")
# ---- Load Parser -------------------------------------------------------------
# Build the table-detection model through the layoutparser API.
# All supported models are listed on the Model Zoo page:
# https://layout-parser.readthedocs.io/en/latest/notes/modelzoo.html
table_model_config = 'lp://TableBank/faster_rcnn_R_101_FPN_3x/config'
# Flat [key, value] override list handed to the model configuration
# (presumably the minimum detection score to keep -- confirm in Detectron2 docs).
table_model_overrides = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8]
# Class id 0 is reported as a "Table" block.
table_label_map = {0: "Table"}
model = lp.Detectron2LayoutModel(
    table_model_config,
    extra_config=table_model_overrides,
    label_map=table_label_map,
)
# Helper function for extracting table image
def extract_table(file_path, debug=False):
    """
    Detect the tables on a page image and save each one as a cropped image.

    The page is read with OpenCV, passed to the module-level layout-parser
    `model`, and every detected block of type 'Table' is padded, cropped and
    written next to the input file.

    Parameters
    ----------
    file_path : str
        Path of the page image to process.
    debug : bool, default False
        When true, plot the detected layout and the detected tables with
        matplotlib before cropping.

    Returns
    -------
    list of str
        Paths of the cropped table images that were written, in detection
        order. The first table goes to ``<name>_table<ext>``; any further
        table on the same page goes to ``<name>_table_2<ext>``,
        ``<name>_table_3<ext>``, ... so that it does not overwrite the first.
        Empty when no table was detected.

    Raises
    ------
    FileNotFoundError
        If the image at `file_path` cannot be read.
    OSError
        If a cropped table image cannot be written.
    """
    import os

    image = cv2.imread(file_path)
    # cv2.imread reports failure by returning None instead of raising.
    if image is None:
        raise FileNotFoundError("Could not read image: " + str(file_path))

    # Detect the layout of the input image
    layout = model.detect(image)
    # Extract the table blocks from the layout
    tables = lp.Layout([b for b in layout if b.type == 'Table'])

    if debug:
        # lp.draw_box only returns the annotated image; inside a function it
        # has to be plotted explicitly to become visible.
        debug_views = (
            ("Detected layout", lp.draw_box(image, layout, box_width=3)),
            ("Detected tables", lp.draw_box(image, tables, box_width=3,
                                            show_element_id=True)),
        )
        for title, annotated in debug_views:
            plt.figure()
            plt.imshow(annotated)
            plt.title(title)
            plt.axis("off")
            plt.show()

    # Split on the real file extension so dots elsewhere in the path
    # (e.g. "./raw_data/page.png") do not end up inside the output name.
    root, ext = os.path.splitext(file_path)
    if not ext:
        # Without an extension the output would overwrite the input and
        # OpenCV could not pick an encoder; fall back to PNG.
        ext = ".png"

    export_file_paths = []
    for index, block in enumerate(tables):
        # Crop image to the table `TextBlock`
        # add padding in each image segment can help improve robustness
        segment_image = (block
                         .pad(left=20, right=20, top=10, bottom=10)
                         .crop_image(image))
        # First table keeps the historical "_table" name; later ones are numbered.
        suffix = "_table" if index == 0 else "_table_" + str(index + 1)
        export_file_path = root + suffix + ext
        # cv2.imwrite returns False when the image could not be saved.
        if not cv2.imwrite(export_file_path, segment_image):
            raise OSError("Could not write table image: " + export_file_path)
        export_file_paths.append(export_file_path)
    return export_file_paths
# ---- Debugging ---------------------------------------------------------------
# Spot-check the detector on a single page (page 108) before the full run.
file_path = "raw_data/nat64_page_108.png"
image = cv2.imread(file_path)
# Run layout detection on that page
layout = model.detect(image)
# Draw the detected boxes on the page; left as the final bare expression so
# that a notebook cell renders the returned image.
lp.draw_box(image, layout, box_width=3)
# ---- Extract All -------------------------------------------------------------
# Run the table extraction on every page image, in page order.
# Each pair is (first page, last page), 1-based and inclusive:
# pages 107-117, then pages 125-135.
for first_page, last_page in ((107, 117), (125, 135)):
    for page_number in range(first_page, last_page + 1):
        file_path = "raw_data/nat64_page_" + str(page_number) + ".png"
        extract_table(file_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment