Last active
April 5, 2018 11:47
-
-
Save elmuerte/f22db2ecd86bbaf0fef1 to your computer and use it in GitHub Desktop.
Alternative renderer for Jericho HTML Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Copyright (c) 2015 Michiel Hendriks | |
This software is provided 'as-is', without any express or implied | |
warranty. In no event will the authors be held liable for any damages | |
arising from the use of this software. | |
Permission is granted to anyone to use this software for any purpose, | |
including commercial applications, and to alter it and redistribute it | |
freely, subject to the following restrictions: | |
1. The origin of this software must not be misrepresented; you must not | |
claim that you wrote the original software. If you use this software | |
in a product, an acknowledgment in the product documentation would be | |
appreciated but is not required. | |
2. Altered source versions must be plainly marked as such, and must not be | |
misrepresented as being the original software. | |
3. This notice may not be removed or altered from any source distribution. | |
*/ | |
package net.htmlparser.jericho; | |
import java.io.IOException; | |
import java.net.URI; | |
import java.net.URISyntaxException; | |
import java.util.LinkedHashMap; | |
import java.util.Map; | |
import java.util.Map.Entry; | |
public class UrlMappedRenderer extends Renderer { | |
private Map<String, Integer> urlMap; | |
public UrlMappedRenderer(final Segment segment) { | |
super(segment); | |
} | |
public String renderHyperlinkURL(final StartTag startTag) { | |
final String href = startTag.getAttributeValue("href"); | |
if (href == null || href.startsWith("javascript:")) { | |
return null; | |
} | |
try { | |
URI uri = new URI(href); | |
if (!uri.isAbsolute()) { | |
return null; | |
} | |
} catch (URISyntaxException ex) { | |
return null; | |
} | |
if (urlMap == null) { | |
urlMap = new LinkedHashMap<String, Integer>(); | |
} | |
if (!urlMap.containsKey(href)) { | |
urlMap.put(href, Integer.valueOf(urlMap.size() + 1)); | |
} | |
return String.format("[%d]", urlMap.get(href)); | |
} | |
public void appendTo(final Appendable appendable) throws IOException { | |
super.appendTo(appendable); | |
printUrlMap(appendable); | |
urlMap = null; | |
} | |
private void printUrlMap(final Appendable appendable) throws IOException { | |
if (urlMap == null || urlMap.isEmpty()) { | |
return; | |
} | |
appendable.append(getNewLine()); | |
appendable.append(getNewLine()); | |
final int digitLength = Integer.toString(urlMap.size()).length(); | |
final String fmtString = " %" + digitLength + "d. "; | |
for (Entry<String, Integer> entry : urlMap.entrySet()) { | |
appendable.append(String.format(fmtString, entry.getValue())); | |
appendable.append(entry.getKey()); | |
appendable.append(getNewLine()); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Instead of adding the URL right after the content where it was linked, it creates references like [1], which are printed at the end of the content.
Links with the same URL get the same reference.