Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save JoelGeraci-Datalogics/dd5b6e91531e3a79f6e296c4f3f91da9 to your computer and use it in GitHub Desktop.
Save JoelGeraci-Datalogics/dd5b6e91531e3a79f6e296c4f3f91da9 to your computer and use it in GitHub Desktop.
/*
* Copyright Datalogics, Inc. 2015
*/
package pdfjt.cookbook.document;
import java.awt.Color;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import com.adobe.pdfjt.core.types.ASString;
import com.adobe.pdfjt.pdf.document.PDFDocument;
import com.adobe.pdfjt.pdf.document.PDFOpenOptions;
import com.adobe.pdfjt.pdf.interactive.action.PDFActionURI;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFAnnotationLink;
import com.adobe.pdfjt.pdf.interactive.annotation.PDFBorder;
import com.adobe.pdfjt.pdf.page.PDFPage;
import com.adobe.pdfjt.services.readingorder.ReadingOrderTextExtractor;
import com.adobe.pdfjt.services.textextraction.Word;
import com.adobe.pdfjt.services.textextraction.WordsIterator;
import com.datalogics.pdf.document.DocumentHelper;
import com.datalogics.pdf.document.FontSetLoader;
import com.datalogics.pdf.layout.LayoutEngine;
import com.datalogics.pdf.text.Dimension;
import com.datalogics.pdf.text.Heading;
import com.datalogics.pdf.text.Length;
import com.datalogics.pdf.text.Paragraph;
import com.datalogics.pdf.text.Span;
/**
 * This sample downloads an HTML page with jsoup, lays the text of one of its
 * elements out into a new PDF file and then adds a link annotation over every
 * word that came from an {@code <a href>} element (links whose parent is an
 * {@code <li>} are not annotated).
 */
public class jsoupWithPDF_Java_Toolkit {

    /** Page that is downloaded and converted. */
    private static final String INPUT_URL = "https://jsoup.org/";

    /** Directory the resulting PDF is written to. */
    private static final String OUTPUT_DIR = "cookbook/Document/output/";

    /** CSS selector of the element whose children are laid out. */
    private static final String CONTENT_SELECTOR = ".col1";

    /**
     * Regular expression matching every character that is removed from a word
     * before words from the page and words from the HTML are compared.
     */
    private static final String NON_WORD_CHARACTERS = "[^a-zA-Z0-9 ]";

    /**
     * Runs the sample: fetch the HTML, lay it out, add the link annotations
     * and save the PDF.
     *
     * @param args command line arguments (not used)
     * @throws IllegalStateException if the downloaded page has no element
     *         matching {@link #CONTENT_SELECTOR}
     * @throws Exception if downloading, layout, text extraction or saving fails
     */
    public static void main(String[] args) throws Exception {
        /*
         * We're using JSoup so we'll read the HTML file from their home page
         * and select one of the elements that has some content we want to
         * layout.
         */
        Document doc = Jsoup.connect(INPUT_URL).userAgent("Mozilla").get();
        Element col1 = doc.select(CONTENT_SELECTOR).first();
        if (col1 == null) {
            // first() yields null for an empty selection; fail with a clear
            // message instead of a NullPointerException further down.
            throw new IllegalStateException(
                    "No element matching '" + CONTENT_SELECTOR + "' found at " + INPUT_URL);
        }

        // Create a new blank PDF file.
        PDFDocument pdfDocument = PDFDocument.newInstance(PDFOpenOptions.newInstance());

        /*
         * Create a new LayoutEngine object that will actually perform the
         * addition of text to the page. The engine is closed (end of the try
         * block) before the text is read back from the document.
         */
        try (LayoutEngine layout = new LayoutEngine(pdfDocument)) {
            // Set the font for the entire document
            layout.getStyle().setFontFamily("Helvetica");

            // Work our way through the HTML Element by Element
            for (Element element : col1.children()) {
                if (element.isBlock() && !element.text().isEmpty()) {
                    layoutBlock(layout, element);
                }
            }
        }

        addLinkAnnotations(pdfDocument, col1);

        // Save and close
        DocumentHelper.saveFullAndClose(pdfDocument, OUTPUT_DIR + "jsoup_Output.pdf");
        System.out.println("Done!");
    }

    /**
     * Adds one block level HTML element to the layout. Headings (h1-h4),
     * paragraphs, unordered/ordered lists and preformatted text are handled;
     * every other element is ignored.
     *
     * @param layout the engine the content is added to
     * @param element the block element to lay out
     * @throws Exception if the layout engine rejects the content
     */
    private static void layoutBlock(LayoutEngine layout, Element element) throws Exception {
        switch (element.nodeName().toLowerCase()) {
            case "h1":
            case "h2":
            case "h3":
            case "h4":
                // Add the text content of the <Hx> Element as a Heading object.
                layout.add(new Heading(element.text()));
                break;
            case "p":
                layout.add(toParagraph(element));
                break;
            case "ul":
                /*
                 * Talkeetna doesn't do lists yet so we'll fake it and just
                 * prepend a bullet or number as needed.
                 */
                for (Element li : element.children()) {
                    layout.add(indentedParagraph("\u2022 " + li.text()));
                }
                // Empty paragraph after the list.
                layout.add(indentedParagraph(""));
                break;
            case "ol": {
                int number = 1;
                for (Element li : element.children()) {
                    layout.add(indentedParagraph(number + " " + li.text()));
                    number++;
                }
                layout.add(indentedParagraph(""));
                break;
            }
            case "pre":
                /*
                 * StringUtils.split treats its second argument as a set of
                 * separator characters, so "\r\n" splits on either character
                 * regardless of the platform the sample runs on.
                 */
                for (String line : StringUtils.split(element.text(), "\r\n")) {
                    Paragraph pre = indentedParagraph(line);
                    pre.getStyle().setColor(Color.GRAY);
                    pre.getStyle().setFontFamily("Courier");
                    layout.add(pre);
                }
                layout.add(indentedParagraph(""));
                break;
            default:
                // Not a block type this sample knows how to lay out.
                break;
        }
    }

    /**
     * Converts a {@code <p>} element into a Paragraph.
     *
     * @param element the {@code <p>} element
     * @return the paragraph holding the element's text
     * @throws Exception if a text object cannot be created
     */
    private static Paragraph toParagraph(Element element) throws Exception {
        if (element.childNodes().size() == 1) {
            // A single child node: just add the text as one Paragraph.
            return new Paragraph(element.text());
        }

        /*
         * Several child nodes: create a Paragraph and add one Span per child.
         * If a child is a link, make it blue.
         */
        Paragraph para = new Paragraph();
        for (Node childNode : element.childNodes()) {
            if (childNode instanceof Element) {
                Element child = (Element) childNode;
                Span span = new Span(child.text());
                if ("a".equals(child.nodeName())) {
                    span.getStyle().setColor(Color.BLUE);
                }
                para.add(span);
            } else if ("#text".equals(childNode.nodeName())) {
                // NOTE(review): toString() may return the entity-escaped HTML
                // form of the text rather than the decoded text - confirm.
                para.add(new Span(childNode.toString()));
            }
            // Any other node kind (for example a comment) carries no text to
            // lay out and must not be cast to Element, so it is skipped.
        }
        return para;
    }

    /**
     * Creates the small, indented paragraph used for list items, preformatted
     * lines and the empty paragraph that follows them: 18pt text indent, 10pt
     * font size and no bottom margin.
     *
     * @param text the paragraph text, may be empty
     * @return the styled paragraph
     * @throws Exception if the paragraph cannot be created
     */
    private static Paragraph indentedParagraph(String text) throws Exception {
        Paragraph para = new Paragraph(text);
        para.getStyle().setTextIndent(new Length(18, Dimension.PT));
        para.getStyle().setFontSize(new Length(10, Dimension.PT));
        para.getStyle().setMarginBottom(new Length(0, Dimension.PT));
        return para;
    }

    /**
     * Collects the words that were just laid out and uses that list to locate
     * where the text inside {@code <a>} tags is on the page, adding a link
     * annotation over each located word.
     *
     * @param pdfDocument the document that was laid out
     * @param content the HTML element the document was built from
     * @throws Exception if text extraction or annotation creation fails
     */
    private static void addLinkAnnotations(PDFDocument pdfDocument, Element content) throws Exception {
        ReadingOrderTextExtractor textExtractor = ReadingOrderTextExtractor.newInstance(pdfDocument,
                FontSetLoader.newInstance().getFontSet());
        WordsIterator wordsIterator = textExtractor.getWordsIterator();

        // Parallel lists: the extracted Word and its comparison form.
        List<Word> words = new ArrayList<Word>();
        List<String> normalizedWords = new ArrayList<String>();
        while (wordsIterator.hasNext()) {
            Word word = wordsIterator.next();
            String text = word.toString();
            if (!" ".equals(text) && !text.contains("\n")) {
                words.add(word);
                normalizedWords.add(normalize(text));
            }
        }

        /*
         * Select <a> Elements and then locate the text inside them on the
         * page. The search only moves forward: 'start' is the first word that
         * has not been matched yet.
         */
        int start = 0;
        for (Element link : content.select("a[href]")) {
            if ("li".equals(link.parent().nodeName()) || !link.hasText()) {
                continue;
            }
            for (String linkWord : link.text().split(" ")) {
                // Compare like with like: the page words had their
                // punctuation stripped, so strip it from the link word too.
                String wanted = normalize(linkWord);
                if (wanted.isEmpty()) {
                    continue;
                }
                int offset = normalizedWords.subList(start, normalizedWords.size()).indexOf(wanted);
                if (offset < 0) {
                    // Word not found on the page; indexing with -1 would
                    // throw or annotate an unrelated word, so skip it.
                    continue;
                }
                int index = start + offset;
                addLink(pdfDocument, words.get(index), link.absUrl("href"));
                // Continue after the matched word so it is not matched twice.
                start = index + 1;
            }
        }
    }

    /**
     * Removes every character matched by {@link #NON_WORD_CHARACTERS}.
     *
     * @param text the raw word
     * @return the word without punctuation, possibly empty
     */
    private static String normalize(String text) {
        return text.replaceAll(NON_WORD_CHARACTERS, "");
    }

    /**
     * Adds a borderless link annotation over a word. The rectangle is taken
     * from points p1 and p3 of the word's first bounding quad and the
     * destination is the given URI.
     *
     * @param pdfDocument the document the annotation is added to
     * @param word the word to cover
     * @param uri the link destination (the absolute form of the href)
     * @throws Exception if the annotation cannot be created or added
     */
    private static void addLink(PDFDocument pdfDocument, Word word, String uri) throws Exception {
        PDFAnnotationLink pdfAnnotationLink = PDFAnnotationLink.newInstance(pdfDocument);
        pdfAnnotationLink.setRect(
                word.getBoundingQuads().get(0).p1().x(), word.getBoundingQuads().get(0).p1().y(),
                word.getBoundingQuads().get(0).p3().x(), word.getBoundingQuads().get(0).p3().y());

        PDFActionURI pdfActionURI = PDFActionURI.newInstance(pdfDocument);
        pdfActionURI.setURI(new ASString(uri));
        pdfAnnotationLink.setAction(pdfActionURI);

        PDFBorder pdfBorder = PDFBorder.newInstance(pdfDocument);
        pdfBorder.setWidth(0);
        pdfAnnotationLink.setBorder(pdfBorder);

        // NOTE(review): the "- 1" presumably converts a one-based page number
        // into the zero-based index getPage expects - confirm.
        PDFPage pdfPage = pdfDocument.requirePages().getPage(word.getPageNumber() - 1);
        pdfPage.addAnnotation(pdfAnnotationLink);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment