Created
October 14, 2017 22:09
-
-
Save beldaz/8ed6e7473bd228fcee8d4a3e4525be11 to your computer and use it in GitHub Desktop.
Class to extract tabular PDF text using PDFBox
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright 2017 Beldaz (https://github.com/beldaz) | |
* Licensed to the Apache Software Foundation (ASF) under one or more | |
* contributor license agreements. See the NOTICE file distributed with | |
* this work for additional information regarding copyright ownership. | |
* The ASF licenses this file to You under the Apache License, Version 2.0 | |
* (the "License"); you may not use this file except in compliance with | |
* the License. You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
import java.awt.Rectangle; | |
import java.awt.Shape; | |
import java.awt.geom.AffineTransform; | |
import java.awt.geom.Rectangle2D; | |
import java.io.ByteArrayOutputStream; | |
import java.io.File; | |
import java.io.IOException; | |
import java.io.OutputStreamWriter; | |
import java.io.Writer; | |
import java.util.ArrayList; | |
import java.util.HashSet; | |
import java.util.Iterator; | |
import java.util.LinkedList; | |
import java.util.List; | |
import java.util.Set; | |
import org.apache.fontbox.util.BoundingBox; | |
import org.apache.pdfbox.pdmodel.PDDocument; | |
import org.apache.pdfbox.pdmodel.PDPage; | |
import org.apache.pdfbox.pdmodel.common.PDRectangle; | |
import org.apache.pdfbox.pdmodel.font.PDFont; | |
import org.apache.pdfbox.pdmodel.font.PDType3Font; | |
import org.apache.pdfbox.text.PDFTextStripper; | |
import org.apache.pdfbox.text.PDFTextStripperByArea; | |
import org.apache.pdfbox.text.TextPosition; | |
/** | |
* | |
* Class to extract tabular data from a PDF. | |
* Works by making a first pass of the page to group all nearby text items | |
* together, and then inferring a 2D grid from these regions. Each table cell | |
* is then extracted using a PDFTextStripperByArea object. | |
* | |
* Works best when | |
* headers are included in the detected region, to ensure representative text | |
* in every column. | |
* | |
* Based upon DrawPrintTextLocations PDFBox example | |
* (https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/util/DrawPrintTextLocations.java) | |
* | |
* @author Beldaz | |
*/ | |
public class PDFTableStripper extends PDFTextStripper | |
{ | |
/** | |
* This will print the documents data, for each table cell. | |
* | |
* @param args The command line arguments. | |
* | |
* @throws IOException If there is an error parsing the document. | |
*/ | |
public static void main(String[] args) throws IOException | |
{ | |
try (PDDocument document = PDDocument.load(new File(args[0]))) | |
{ | |
final double res = 72; // PDF units are at 72 DPI | |
PDFTableStripper stripper = new PDFTableStripper(); | |
stripper.setSortByPosition(true); | |
// Choose a region in which to extract a table (here a 6"wide, 9" high rectangle offset 1" from top left of page) | |
stripper.setRegion(new Rectangle((int) Math.round(1.0*res), (int) Math.round(1*res), (int) Math.round(6*res), (int) Math.round(9.0*res))); | |
// Repeat for each page of PDF | |
for (int page = 0; page < document.getNumberOfPages(); ++page) | |
{ | |
System.out.println("Page " + page); | |
PDPage pdPage = document.getPage(page); | |
stripper.extractTable(pdPage); | |
for(int c=0; c<stripper.getColumns(); ++c) { | |
System.out.println("Column " + c); | |
for(int r=0; r<stripper.getRows(); ++r) { | |
System.out.println("Row " + r); | |
System.out.println(stripper.getText(r, c)); | |
} | |
} | |
} | |
} | |
} | |
/* | |
* Used in methods derived from DrawPrintTextLocations | |
*/ | |
private AffineTransform flipAT; | |
private AffineTransform rotateAT; | |
/** | |
* Regions updated by calls to writeString | |
*/ | |
private Set<Rectangle2D> boxes; | |
// Border to allow when finding intersections | |
private double dx = 1.0; // This value works for me, feel free to tweak (or add setter) | |
private double dy = 0.000; // Rows of text tend to overlap, so need to extend | |
/** | |
* Region in which to find table (otherwise whole page) | |
*/ | |
private Rectangle2D regionArea; | |
/** | |
* Number of rows in inferred table | |
*/ | |
private int nRows=0; | |
/** | |
* Number of columns in inferred table | |
*/ | |
private int nCols=0; | |
/** | |
* This is the object that does the text extraction | |
*/ | |
private PDFTextStripperByArea regionStripper; | |
/** | |
* 1D intervals - used for calculateTableRegions() | |
* @author Beldaz | |
* | |
*/ | |
public static class Interval { | |
double start; | |
double end; | |
public Interval(double start, double end) { | |
this.start=start; this.end = end; | |
} | |
public void add(Interval col) { | |
if(col.start<start) | |
start = col.start; | |
if(col.end>end) | |
end = col.end; | |
} | |
public static void addTo(Interval x, LinkedList<Interval> columns) { | |
int p = 0; | |
Iterator<Interval> it = columns.iterator(); | |
// Find where x should go | |
while(it.hasNext()) { | |
Interval col = it.next(); | |
if(x.end>=col.start) { | |
if(x.start<=col.end) { // overlaps | |
x.add(col); | |
it.remove(); | |
} | |
break; | |
} | |
++p; | |
} | |
while(it.hasNext()) { | |
Interval col = it.next(); | |
if(x.start>col.end) | |
break; | |
x.add(col); | |
it.remove(); | |
} | |
columns.add(p, x); | |
} | |
} | |
/** | |
* Instantiate a new PDFTableStripper object. | |
* | |
* @param document | |
* @throws IOException If there is an error loading the properties. | |
*/ | |
public PDFTableStripper() throws IOException | |
{ | |
super.setShouldSeparateByBeads(false); | |
regionStripper = new PDFTextStripperByArea(); | |
regionStripper.setSortByPosition( true ); | |
} | |
/** | |
* Define the region to group text by. | |
* | |
* @param rect The rectangle area to retrieve the text from. | |
*/ | |
public void setRegion(Rectangle2D rect ) | |
{ | |
regionArea = rect; | |
} | |
public int getRows() | |
{ | |
return nRows; | |
} | |
public int getColumns() | |
{ | |
return nCols; | |
} | |
/** | |
* Get the text for the region, this should be called after extractTable(). | |
* | |
* @return The text that was identified in that region. | |
*/ | |
public String getText(int row, int col) | |
{ | |
return regionStripper.getTextForRegion("el"+col+"x"+row); | |
} | |
public void extractTable(PDPage pdPage) throws IOException | |
{ | |
setStartPage(getCurrentPageNo()); | |
setEndPage(getCurrentPageNo()); | |
boxes = new HashSet<Rectangle2D>(); | |
// flip y-axis | |
flipAT = new AffineTransform(); | |
flipAT.translate(0, pdPage.getBBox().getHeight()); | |
flipAT.scale(1, -1); | |
// page may be rotated | |
rotateAT = new AffineTransform(); | |
int rotation = pdPage.getRotation(); | |
if (rotation != 0) | |
{ | |
PDRectangle mediaBox = pdPage.getMediaBox(); | |
switch (rotation) | |
{ | |
case 90: | |
rotateAT.translate(mediaBox.getHeight(), 0); | |
break; | |
case 270: | |
rotateAT.translate(0, mediaBox.getWidth()); | |
break; | |
case 180: | |
rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight()); | |
break; | |
default: | |
break; | |
} | |
rotateAT.rotate(Math.toRadians(rotation)); | |
} | |
// Trigger processing of the document so that writeString is called. | |
try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) { | |
super.output = dummy; | |
super.processPage(pdPage); | |
} | |
Rectangle2D[][] regions = calculateTableRegions(); | |
// System.err.println("Drawing " + nCols + "x" + nRows + "="+ nRows*nCols + " regions"); | |
for(int i=0; i<nCols; ++i) { | |
for(int j=0; j<nRows; ++j) { | |
final Rectangle2D region = regions[i][j]; | |
regionStripper.addRegion("el"+i+"x"+j, region); | |
} | |
} | |
regionStripper.extractRegions(pdPage); | |
} | |
/** | |
* Infer a rectangular grid of regions from the boxes field. | |
* | |
* @return 2D array of table regions (as Rectangle2D objects). Note that | |
* some of these regions may have no content. | |
*/ | |
private Rectangle2D[][] calculateTableRegions() { | |
// Build up a list of all table regions, based upon the populated | |
// regions of boxes field. Treats the horizontal and vertical extents | |
// of each box as distinct | |
LinkedList<Interval> columns = new LinkedList<Interval>(); | |
LinkedList<Interval> rows = new LinkedList<Interval>(); | |
for(Rectangle2D box: boxes) { | |
Interval x = new Interval(box.getMinX(), box.getMaxX()); | |
Interval y = new Interval(box.getMinY(), box.getMaxY()); | |
Interval.addTo(x, columns); | |
Interval.addTo(y, rows); | |
} | |
nRows = rows.size(); | |
nCols = columns.size(); | |
Rectangle2D[][] regions = new Rectangle2D[nCols][nRows]; | |
int i=0; | |
// Label regions from top left, rather than the transformed orientation | |
for(Interval column: columns) { | |
int j=0; | |
for(Interval row: rows) { | |
regions[nCols-i-1][nRows-j-1] = new Rectangle2D.Double(column.start, row.start, column.end - column.start, row.end - row.start); | |
++j; | |
} | |
++i; | |
} | |
return regions; | |
} | |
/** | |
* Register each character's bounding box, updating boxes field to maintain | |
* a list of all distinct groups of characters. | |
* | |
* Overrides the default functionality of PDFTextStripper. | |
* Most of this is taken from DrawPrintTextLocations.java, with extra steps | |
* at end of main loop | |
*/ | |
@Override | |
protected void writeString(String string, List<TextPosition> textPositions) throws IOException | |
{ | |
for (TextPosition text : textPositions) | |
{ | |
// glyph space -> user space | |
// note: text.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix | |
AffineTransform at = text.getTextMatrix().createAffineTransform(); | |
PDFont font = text.getFont(); | |
BoundingBox bbox = font.getBoundingBox(); | |
// advance width, bbox height (glyph space) | |
float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars | |
Rectangle2D.Float rect = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight()); | |
if (font instanceof PDType3Font) | |
{ | |
// bbox and font matrix are unscaled | |
at.concatenate(font.getFontMatrix().createAffineTransform()); | |
} | |
else | |
{ | |
// bbox and font matrix are already scaled to 1000 | |
at.scale(1/1000f, 1/1000f); | |
} | |
Shape s = at.createTransformedShape(rect); | |
s = flipAT.createTransformedShape(s); | |
s = rotateAT.createTransformedShape(s); | |
// | |
// Merge character's bounding box with boxes field | |
// | |
Rectangle2D bounds = s.getBounds2D(); | |
// Pad sides to detect almost touching boxes | |
Rectangle2D hitbox = bounds.getBounds2D(); | |
hitbox.add(bounds.getMinX() - dx , bounds.getMinY() - dy); | |
hitbox.add(bounds.getMaxX() + dx , bounds.getMaxY() + dy); | |
// Find all overlapping boxes | |
List<Rectangle2D> intersectList = new ArrayList<Rectangle2D>(); | |
for(Rectangle2D box: boxes) { | |
if(box.intersects(hitbox)) { | |
intersectList.add(box); | |
} | |
} | |
// Combine all touching boxes and update | |
// (NOTE: Potentially this could leave some overlapping boxes un-merged, | |
// but it's sufficient for now and get's fixed up in calculateTableRegions) | |
for(Rectangle2D box: intersectList) { | |
bounds.add(box); | |
boxes.remove(box); | |
} | |
boxes.add(bounds); | |
} | |
} | |
/** | |
* This method does nothing in this derived class, because beads and regions are incompatible. Beads are | |
* ignored when stripping by area. | |
* | |
* @param aShouldSeparateByBeads The new grouping of beads. | |
*/ | |
@Override | |
public final void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) | |
{ | |
} | |
/** | |
* Adapted from PDFTextStripperByArea | |
* {@inheritDoc} | |
*/ | |
@Override | |
protected void processTextPosition( TextPosition text ) | |
{ | |
if(regionArea!=null && !regionArea.contains( text.getX(), text.getY() ) ) { | |
// skip character | |
} else { | |
super.processTextPosition( text ); | |
} | |
} | |
} |
Great God!
Please take my knees!
大神,您太厉害了,我佩服的五体投地,跪了,跪了!
uhm Im confused on how to use this D;
Great program. Worked flawlessly.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you so much for sharing this! Worked for my use case perfectly when PDFTextStripper wasn't getting the job done. You are the best!