Created
July 28, 2016 00:31
-
-
Save JoelGeraci-Datalogics/dd5b6e91531e3a79f6e296c4f3f91da9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Copyright Datalogics, Inc. 2015
 */
package pdfjt.cookbook.document; | |
import java.awt.Color;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;

import com.adobe.pdfjt.core.types.ASString;
import com.adobe.pdfjt.pdf.document.PDFDocument;
import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
import com.adobe.pdfjt.pdf.interactive.action.PDFActionURI;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationLink;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFBorder;
import com.adobe.pdfjt.pdf.page.PDFPage;
import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor;
import com.adobe.pdfjt.services.textextraction.Word;
import com.adobe.pdfjt.services.textextraction.WordsIterator;
import com.datalogics.pdf.document.DocumentHelper;
import com.datalogics.pdf.document.FontSetLoader;
import com.datalogics.pdf.layout.LayoutEngine;
import com.datalogics.pdf.text.Dimension;
import com.datalogics.pdf.text.Heading;
import com.datalogics.pdf.text.Length;
import com.datalogics.pdf.text.Paragraph;
import com.datalogics.pdf.text.Span;
/** | |
* This sample reads an HTML file and adds text to a new PDF file. | |
*/ | |
public class jsoupWithPDF_Java_Toolkit { | |
private static final String inputURL = "https://jsoup.org/"; | |
private static final String outputDir = "cookbook/Document/output/"; | |
static public void main(String[] args) throws Exception { | |
/* | |
* We're using JSoup so we'll read the HTML file from their home page | |
* and select one of the elements that has some content we want to | |
* layout. | |
*/ | |
Document doc = Jsoup.connect(inputURL).userAgent("Mozilla").get(); | |
Element col1 = doc.select(".col1").first(); | |
// Create a new blank PDF file. | |
PDFDocument pdfDocument = PDFDocument.newInstance(PDFOpenOptions.newInstance()); | |
/* | |
* Create a new LayoutEngine object that will actually perform the | |
* addition of text to the page. | |
*/ | |
try (LayoutEngine layout = new LayoutEngine(pdfDocument)) { | |
// Set the font for the entire document | |
layout.getStyle().setFontFamily("Helvetica"); | |
// Work our way through the HTML Element by Element | |
for (Element element : col1.children()) { | |
if (element.isBlock() && element.text().isEmpty() == false) { | |
String nodeName = element.nodeName().toLowerCase(); | |
switch (nodeName) { | |
case "h1": | |
case "h2": | |
case "h3": | |
case "h4": | |
/* | |
* Add the text content of the <Hx> Element to the page | |
* as a Heading object. | |
* | |
*/ | |
layout.add(new Heading(element.text())); | |
break; | |
case "p": | |
if (element.childNodes().size() == 1) { | |
/* | |
* If there are no children in the <p>, just add the | |
* text to the page as a Paragraph. | |
*/ | |
layout.add(new Paragraph(element.text())); | |
} else { | |
Paragraph para = new Paragraph(); | |
/* | |
* If there are children in the <p> create a | |
* Paragraph object and add Span objects to it. If | |
* they are links, make them blue. | |
*/ | |
for (Node childNode : element.childNodes()) { | |
if (childNode.nodeName().matches("a")) { | |
Element nodeAsElement = (Element) childNode; | |
Span span = new Span(nodeAsElement.text()); | |
span.getStyle().setColor(Color.BLUE); | |
para.add(span); | |
} else { | |
if (childNode.nodeName().matches("#text")) { | |
para.add(new Span(childNode.toString())); | |
} else { | |
Element nodeAsElement = (Element) childNode; | |
para.add(new Span(nodeAsElement.text())); | |
} | |
} | |
} | |
layout.add(para); | |
} | |
break; | |
case "ul": | |
/* | |
* Talkeetna doesn't do lists yet so we'll fake it and | |
* just append a bullet or number as needed. | |
*/ | |
for (Element li : element.children()) { | |
Paragraph para = new Paragraph("\u2022 " + li.text()); | |
para.getStyle().setTextIndent(new Length(18, Dimension.PT)); | |
para.getStyle().setFontSize(new Length(10, Dimension.PT)); | |
para.getStyle().setMarginBottom(new Length(0, Dimension.PT)); | |
layout.add(para); | |
} | |
Paragraph para = new Paragraph(""); | |
para.getStyle().setTextIndent(new Length(18, Dimension.PT)); | |
para.getStyle().setFontSize(new Length(10, Dimension.PT)); | |
para.getStyle().setMarginBottom(new Length(0, Dimension.PT)); | |
layout.add(para); | |
break; | |
case "ol": | |
int i = 1; | |
for (Element li : element.children()) { | |
para = new Paragraph(String.valueOf(i) + " " + li.text()); | |
para.getStyle().setTextIndent(new Length(18, Dimension.PT)); | |
para.getStyle().setFontSize(new Length(10, Dimension.PT)); | |
para.getStyle().setMarginBottom(new Length(0, Dimension.PT)); | |
layout.add(para); | |
i++; | |
} | |
para = new Paragraph(""); | |
para.getStyle().setTextIndent(new Length(18, Dimension.PT)); | |
para.getStyle().setFontSize(new Length(10, Dimension.PT)); | |
para.getStyle().setMarginBottom(new Length(0, Dimension.PT)); | |
layout.add(para); | |
break; | |
case "pre": | |
String[] lines = StringUtils.split(element.text(), System.lineSeparator()); | |
for (String line : lines) { | |
Paragraph pre = new Paragraph(line); | |
pre.getStyle().setColor(Color.GRAY); | |
pre.getStyle().setFontFamily("Courier"); | |
pre.getStyle().setTextIndent(new Length(18, Dimension.PT)); | |
pre.getStyle().setFontSize(new Length(10, Dimension.PT)); | |
pre.getStyle().setMarginBottom(new Length(0, Dimension.PT)); | |
layout.add(pre); | |
} | |
para = new Paragraph(""); | |
para.getStyle().setTextIndent(new Length(18, Dimension.PT)); | |
para.getStyle().setFontSize(new Length(10, Dimension.PT)); | |
para.getStyle().setMarginBottom(new Length(0, Dimension.PT)); | |
layout.add(para); | |
break; | |
} | |
} | |
} | |
} | |
/* | |
* Now collect the words that we just layed out. We'll use the list to locate where the text inside <a> tags are on the page. | |
*/ | |
ReadingOrderTextExtractor textExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument, | |
FontSetLoader.newInstance().getFontSet()); | |
WordsIterator wordsIterator = textExtractor.getWordsIterator(); | |
List<Word> wordsArray = new ArrayList<Word>(); | |
List<String> stringsArray = new ArrayList<String>(); | |
while (wordsIterator.hasNext()) { | |
Word word = wordsIterator.next(); | |
if (word.toString().matches(" ") == false && word.toString().contains("\n") == false) { | |
wordsArray.add(word); | |
// Strip punctuation | |
stringsArray.add(word.toString().replaceAll("[^a-zA-Z0-9 ]", "")); | |
} | |
} | |
/* | |
* Select <a> Elements and then locate the text inside them on the page. | |
*/ | |
Elements links = col1.select("a[href]"); | |
int start = 0; | |
for (Element link : links) { | |
if (link.parent().nodeName().contains("li") == false) { | |
if (link.hasText()) { | |
for (String linkWord : link.text().split(" ")) { | |
int position = stringsArray.subList(start, stringsArray.size()).indexOf(linkWord); | |
Word word = wordsArray.get(position + start); | |
/* | |
* Add a link to the page based on the bounding quads of | |
* the Word. Set the destination to be the same as the | |
* href in the <a> tag. | |
*/ | |
PDFAnnotationLink pdfAnnotationLink = PDFAnnotationLink.newInstance(pdfDocument); | |
pdfAnnotationLink.setRect(word.getBoundingQuads().get(0).p1().x(), word.getBoundingQuads().get(0).p1().y(), | |
word.getBoundingQuads().get(0).p3().x(), word.getBoundingQuads().get(0).p3().y()); | |
PDFActionURI pdfActionURI = PDFActionURI.newInstance(pdfDocument); | |
pdfActionURI.setURI(new ASString(link.absUrl("href"))); | |
pdfAnnotationLink.setAction(pdfActionURI); | |
PDFBorder pdfBorder = PDFBorder.newInstance(pdfDocument); | |
pdfBorder.setWidth(0); | |
pdfAnnotationLink.setBorder(pdfBorder); | |
PDFPage pdfPage = pdfDocument.requirePages().getPage(word.getPageNumber() - 1); | |
pdfPage.addAnnotation(pdfAnnotationLink); | |
if (position > 0) { | |
position += start; | |
start = position; | |
} | |
} | |
} | |
} | |
} | |
// Save and close | |
DocumentHelper.saveFullAndClose(pdfDocument, outputDir + "jsoup_Output.pdf"); | |
// Save the file. | |
System.out.println("Done!"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment