@ararog
Created October 23, 2012 19:20
package com.atlantbh.nutch.filter.xpath;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.regex.PatternSyntaxException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.avro.util.Utf8;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Logger;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseFilter;
import org.apache.nutch.parse.ParseStatusCodes;
import org.apache.nutch.parse.ParseStatusUtils;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.EncodingDetector;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.jaxen.JaxenException;
import org.jaxen.XPath;
import org.jaxen.dom.DOMXPath;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import com.atlantbh.nutch.filter.xpath.config.XPathFilterConfiguration;
import com.atlantbh.nutch.filter.xpath.config.XPathIndexerProperties;
import com.atlantbh.nutch.filter.xpath.config.XPathIndexerPropertiesField;
/**
 * An XML/HTML XPath filter implementation that extracts data
 * from the page content, depending on the supplied XPath,
 * and prepares it for the {@link XPathIndexingFilter} to
 * index into Solr.
 *
 * @author Emir Dizdarevic
 * @version 1.4
 * @since Apache Nutch 1.4
 */
public class XPathHtmlParserFilter implements ParseFilter {

    // Constants
    private static final Logger log = Logger.getLogger(XPathHtmlParserFilter.class);
    private static final List<String> htmlMimeTypes = Arrays.asList(new String[] {"text/html", "application/xhtml+xml"});

    // OLD WAY TO DETERMINE IF IT'S AN XML FORMAT
    //private static final List<String> xmlMimeTypes = Arrays.asList(new String[] {"text/xml", "application/xml"});

    // Configuration
    private Configuration configuration;
    private XPathFilterConfiguration xpathFilterConfiguration;
    private String defaultEncoding;

    private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
    static {
        FIELDS.add(WebPage.Field.METADATA);
    }

    // Internal data
    private HtmlCleaner cleaner;
    private DomSerializer domSerializer;
    private DocumentBuilder documentBuilder;

    public XPathHtmlParserFilter() {
        init();
    }
    private void init() {

        // Initialize HtmlCleaner
        cleaner = new HtmlCleaner();
        CleanerProperties props = cleaner.getProperties();
        props.setAllowHtmlInsideAttributes(true);
        props.setAllowMultiWordAttributes(true);
        props.setRecognizeUnicodeChars(true);
        props.setOmitComments(true);
        props.setNamespacesAware(false);

        // Initialize DomSerializer
        domSerializer = new DomSerializer(props);

        // Initialize xml parser
        try {
            DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
            documentBuilder = documentBuilderFactory.newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            // Practically unreachable with the default factory, but don't swallow it silently
            log.error("Failed to initialize the XML parser: " + e.getMessage());
        }
    }
    private void initConfig() {

        // Initialize configuration
        xpathFilterConfiguration = XPathFilterConfiguration.getInstance(configuration);
        defaultEncoding = configuration.get("parser.character.encoding.default", "UTF-8");
    }

    @Override
    public Configuration getConf() {
        return configuration;
    }

    @Override
    public void setConf(Configuration configuration) {
        this.configuration = configuration;
        initConfig();
    }
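
    /**
     * Recursively removes every node of the given DOM node type (and, optionally,
     * matching node name) from the subtree rooted at {@code node}.
     * Note: not referenced anywhere in this filter as posted; kept as a utility.
     */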
    private void removeAll(Node node, short nodeType, String name) {
        if (node.getNodeType() == nodeType && (name == null || node.getNodeName().equals(name))) {
            node.getParentNode().removeChild(node);
        } else {
            NodeList list = node.getChildNodes();
            for (int i = 0; i < list.getLength(); i++) {
                removeAll(list.item(i), nodeType, name);
            }
        }
    }
    @Override
    public Parse filter(String url, WebPage page, Parse parse,
            HTMLMetaTags metaTags, DocumentFragment doc) {

        byte[] rawContent = page.getContent().array();

        try {
            Document cleanedXmlHtml = documentBuilder.newDocument();
            if (htmlMimeTypes.contains(page.getContentType().toString())) {

                // Determine the encoding, falling back to the configured default
                String encoding = defaultEncoding;
                ByteBuffer buffer = page.getFromMetadata(new Utf8(Metadata.ORIGINAL_CHAR_ENCODING));
                if (buffer != null)
                    encoding = Bytes.toString(buffer.array());

                // Create a reader so the input can be read with the detected encoding
                Reader rawContentReader = new InputStreamReader(new ByteArrayInputStream(rawContent), encoding);

                // Use the cleaner to "clean" the HTML and return it as a TagNode object
                TagNode tagNode = cleaner.clean(rawContentReader);
                cleanedXmlHtml = domSerializer.createDOM(tagNode);
            } else if (page.getContentType().toString().contains("/xml") || page.getContentType().toString().contains("+xml")) {

                // Parse as XML - don't clean
                cleanedXmlHtml = documentBuilder.parse(new InputSource(new ByteArrayInputStream(rawContent)));
            }
            // Once the HTML is cleaned, run the configured XPath expressions on the
            // document; each expression returns a list of matching DOM nodes
            List<XPathIndexerProperties> xPathIndexerPropertiesList = xpathFilterConfiguration.getXPathIndexerPropertiesList();
            for (XPathIndexerProperties xPathIndexerProperties : xPathIndexerPropertiesList) {

                //****************************
                // CORE XPATH EVALUATION
                //****************************
                if (pageToProcess(xPathIndexerProperties, cleanedXmlHtml, page.getBaseUrl().toString())) {
                    List<XPathIndexerPropertiesField> xPathIndexerPropertiesFieldList = xPathIndexerProperties.getXPathIndexerPropertiesFieldList();
                    for (XPathIndexerPropertiesField xPathIndexerPropertiesField : xPathIndexerPropertiesFieldList) {

                        // Evaluate xpath
                        XPath xPath = new DOMXPath(xPathIndexerPropertiesField.getXPath());
                        List nodeList = xPath.selectNodes(cleanedXmlHtml);

                        // Trim?
                        boolean trim = FilterUtils.getNullSafe(xPathIndexerPropertiesField.getTrimXPathData(), true);

                        if (FilterUtils.getNullSafe(xPathIndexerPropertiesField.isConcat(), false)) {

                            // Iterate through all found nodes
                            String value = "";
                            String concatDelimiter = FilterUtils.getNullSafe(xPathIndexerPropertiesField.getConcatDelimiter(), "");
                            for (Object node : nodeList) {

                                // Extract data
                                String tempValue = FilterUtils.extractTextContentFromRawNode(node);
                                tempValue = filterValue(tempValue, trim);

                                // Concatenate tempValue to value
                                if (tempValue != null) {
                                    if (value.isEmpty()) {
                                        value = tempValue;
                                    } else {
                                        value = value + concatDelimiter + tempValue;
                                    }
                                }
                            }

                            // Add the extracted data to the metadata (value is never null
                            // here, so check for emptiness instead)
                            if (!value.isEmpty()) {
                                page.putToMetadata(new Utf8(xPathIndexerPropertiesField.getName()), ByteBuffer.wrap(value.getBytes()));
                            }
                        } else {

                            // Iterate through all found nodes
                            for (Object node : nodeList) {

                                // Add the extracted data to the metadata
                                String value = FilterUtils.extractTextContentFromRawNode(node);
                                value = filterValue(value, trim);
                                if (value != null) {
                                    page.putToMetadata(new Utf8(xPathIndexerPropertiesField.getName()), ByteBuffer.wrap(value.getBytes()));
                                }
                            }
                        }
                    }
                }
            }
        } catch (IOException e) {
            // Unlikely for an in-memory stream, but an unsupported encoding name
            // still surfaces here; log it rather than swallowing it
            log.error("Error reading content: " + e.getMessage());
        } catch (PatternSyntaxException e) {
            log.error("Error parsing urlRegex: " + e.getMessage());
            return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED, page.getBaseUrl().toString(), configuration);
        } catch (ParserConfigurationException e) {
            log.error("HTML cleaning error: " + e.getMessage());
            return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED, page.getBaseUrl().toString(), configuration);
        } catch (SAXException e) {
            log.error("XML parsing error: " + e.getMessage());
            return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED, page.getBaseUrl().toString(), configuration);
        } catch (JaxenException e) {
            log.error("XPath error: " + e.getMessage());
            return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED, page.getBaseUrl().toString(), configuration);
        }

        return parse;
    }
    private boolean pageToProcess(XPathIndexerProperties xPathIndexerProperties, Document cleanedXmlHtml, String url) throws JaxenException {

        boolean processPage = true;

        // *************************************
        // URL REGEX CONTENT PAGE FILTERING
        // *************************************
        processPage = processPage && FilterUtils.isMatch(xPathIndexerProperties.getPageUrlFilterRegex(), url);

        // Check return status
        if (!processPage) {
            return false;
        }

        // *************************************
        // XPATH CONTENT PAGE FILTERING
        // *************************************
        if (xPathIndexerProperties.getPageContentFilterXPath() != null) {
            XPath xPathPageContentFilter = new DOMXPath(xPathIndexerProperties.getPageContentFilterXPath());
            List pageContentFilterNodeList = xPathPageContentFilter.selectNodes(cleanedXmlHtml);
            boolean trim = FilterUtils.getNullSafe(xPathIndexerProperties.isTrimPageContentFilterXPathData(), true);

            if (FilterUtils.getNullSafe(xPathIndexerProperties.isConcatPageContentFilterXPathData(), false)) {

                // Iterate through all found nodes
                String value = "";
                String concatDelimiter = FilterUtils.getNullSafe(xPathIndexerProperties.getConcatPageContentFilterXPathDataDelimiter(), "");
                for (Object node : pageContentFilterNodeList) {

                    // Extract data
                    String tempValue = FilterUtils.extractTextContentFromRawNode(node);
                    tempValue = filterValue(tempValue, trim);

                    // Concatenate tempValue to value
                    if (tempValue != null) {
                        if (value.isEmpty()) {
                            value = tempValue;
                        } else {
                            value = value + concatDelimiter + tempValue;
                        }
                    }
                }
                processPage = processPage && FilterUtils.isMatch(xPathIndexerProperties.getPageContentFilterRegex(), value);
            } else {
                for (Object node : pageContentFilterNodeList) {

                    // Match the extracted data against the content filter regex
                    String value = FilterUtils.extractTextContentFromRawNode(node);
                    value = filterValue(value, trim);
                    if (value != null) {
                        processPage = processPage && FilterUtils.isMatch(xPathIndexerProperties.getPageContentFilterRegex(), value);
                    }
                }
            }
        }

        return processPage;
    }
    private String filterValue(String value, boolean trim) {

        String returnValue = null;

        // Filter out empty strings and strings made of space, newline and tab characters
        if (!value.isEmpty() && !FilterUtils.isMadeOf(value, " \n\t")) {

            // Trim data?
            returnValue = trimValue(value, trim);
        }

        return returnValue == null ? null : StringEscapeUtils.unescapeHtml(returnValue);
    }

    private String trimValue(String value, boolean trim) {
        String returnValue;
        if (trim) {
            returnValue = value.trim();
        } else {
            returnValue = value;
        }
        return returnValue;
    }

    @Override
    public Collection<Field> getFields() {
        return FIELDS;
    }
}
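
For context, the core pipeline the parse filter runs — HtmlCleaner to repair the markup, DomSerializer to turn the TagNode tree into a W3C Document, and Jaxen's DOMXPath to evaluate expressions against it — can be exercised standalone. A minimal sketch, independent of Nutch; the sample HTML and the //h1 expression are illustrative only:

import java.util.List;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.jaxen.XPath;
import org.jaxen.dom.DOMXPath;
import org.w3c.dom.Document;
import org.w3c.dom.Node;

public class XPathPipelineSketch {
    public static void main(String[] args) throws Exception {
        // Dirty, non-well-formed HTML that a strict XML parser would reject
        String html = "<html><body><h1>First title<h1>Second title</body>";

        // Clean the HTML into a TagNode tree, then serialize it to a W3C DOM
        HtmlCleaner cleaner = new HtmlCleaner();
        CleanerProperties props = cleaner.getProperties();
        props.setNamespacesAware(false);
        TagNode tagNode = cleaner.clean(html);
        Document doc = new DomSerializer(props).createDOM(tagNode);

        // Evaluate an XPath expression against the cleaned document
        XPath xpath = new DOMXPath("//h1");
        List<?> nodes = xpath.selectNodes(doc);
        for (Object node : nodes) {
            System.out.println(((Node) node).getTextContent());
        }
    }
}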
package com.atlantbh.nutch.filter.xpath;

import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Logger;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
import org.apache.nutch.util.Bytes;
import com.atlantbh.nutch.filter.xpath.config.XPathFilterConfiguration;
import com.atlantbh.nutch.filter.xpath.config.XPathIndexerProperties;
import com.atlantbh.nutch.filter.xpath.config.XPathIndexerPropertiesField;
/**
 * The second stage of the {@link XPathHtmlParserFilter}: the IndexingFilter.
 * It takes the prepared data from the page metadata and indexes
 * it into Solr.
 *
 * @author Emir Dizdarevic
 * @version 1.4
 * @since Apache Nutch 1.4
 */
public class XPathIndexingFilter implements IndexingFilter {

    // Constants
    private static final String CONFIG_FILE_PROPERTY = "parser.xmlhtml.file";
    private static final Logger log = Logger.getLogger(XPathIndexingFilter.class);

    // Configuration
    private Configuration configuration;
    private XPathFilterConfiguration xpathFilterConfiguration;

    private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
    static {
        FIELDS.add(WebPage.Field.METADATA);
    }

    public XPathIndexingFilter() {}

    private void initConfig() {

        // Initialize configuration
        xpathFilterConfiguration = XPathFilterConfiguration.getInstance(configuration);
    }

    @Override
    public Configuration getConf() {
        return configuration;
    }

    @Override
    public void setConf(Configuration configuration) {
        this.configuration = configuration;
        initConfig();
    }

    @Override
    public NutchDocument filter(NutchDocument doc, String url, WebPage page)
            throws IndexingException {

        List<XPathIndexerProperties> xPathIndexerPropertiesList = xpathFilterConfiguration.getXPathIndexerPropertiesList();
        for (XPathIndexerProperties xPathIndexerProperties : xPathIndexerPropertiesList) {
            if (FilterUtils.isMatch(xPathIndexerProperties.getPageUrlFilterRegex(), url)) {
                List<XPathIndexerPropertiesField> xPathIndexerPropertiesFieldList = xPathIndexerProperties.getXPathIndexerPropertiesFieldList();
                for (XPathIndexerPropertiesField xPathIndexerPropertiesField : xPathIndexerPropertiesFieldList) {

                    // Read back the value the parse filter stored under this field name
                    ByteBuffer buffer = page.getFromMetadata(new Utf8(xPathIndexerPropertiesField.getName()));
                    String stringValue = "";
                    if (buffer != null)
                        stringValue = Bytes.toString(buffer.array());
                    doc.add(xPathIndexerPropertiesField.getName(), stringValue);
                }
            }
        }
        return doc;
    }

    @Override
    public Collection<Field> getFields() {
        return FIELDS;
    }
}
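
The handoff between the two classes is a byte-level convention: the parse filter stores each extracted value in the page metadata under the configured field name (a Utf8 key mapped to the string's bytes wrapped in a ByteBuffer), and the indexing filter decodes it back. A minimal sketch of that round-trip, with a plain HashMap standing in for the Nutch WebPage metadata; note the gist uses the platform default charset via getBytes(), while UTF-8 is pinned here for determinism:

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

public class MetadataRoundTripSketch {
    public static void main(String[] args) {
        // Stand-in for the WebPage metadata map (Utf8 -> ByteBuffer in Nutch 2.x)
        Map<String, ByteBuffer> metadata = new HashMap<String, ByteBuffer>();

        // Parse-filter side: store the extracted value under the field name
        String fieldName = "pageTitle"; // hypothetical field from the XML config
        String extracted = "Some extracted title";
        metadata.put(fieldName, ByteBuffer.wrap(extracted.getBytes(StandardCharsets.UTF_8)));

        // Indexing-filter side: read the bytes back and decode them to a string
        ByteBuffer buffer = metadata.get(fieldName);
        String stringValue = "";
        if (buffer != null)
            stringValue = new String(buffer.array(), StandardCharsets.UTF_8);

        System.out.println(stringValue); // -> Some extracted title
    }
}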
@developer1test
Hi,

I have used these two files in my Nutch 2.1 setup, and after that the filter-xpath plugin built successfully.
But when I run this command to crawl:
bin/nutch crawl urls -depth 3 -topN 1

it gives me the following errors.
FetcherJob: threads: 10
FetcherJob: parsing: false
FetcherJob: resuming: false
FetcherJob : timelimit set for : -1
Using queue mode : byHost
Fetcher: threads: 10
QueueFeeder finished: total 1 records. Hit by time limit :0
fetching http://localhost.testcheck.com/
-finishing thread FetcherThread1, activeThreads=1
-finishing thread FetcherThread2, activeThreads=1
-finishing thread FetcherThread3, activeThreads=1
-finishing thread FetcherThread4, activeThreads=1
-finishing thread FetcherThread5, activeThreads=1
-finishing thread FetcherThread6, activeThreads=1
-finishing thread FetcherThread7, activeThreads=1
-finishing thread FetcherThread8, activeThreads=1
Fetcher: throughput threshold: -1
-finishing thread FetcherThread9, activeThreads=1
Fetcher: throughput threshold sequence: 5
-finishing thread FetcherThread0, activeThreads=0
0/0 spinwaiting/active, 1 pages, 0 errors, 0.2 0.2 pages/s, 1 1 kb/s, 0 URLs in 0 queues
-activeThreads=0
ParserJob: resuming: false
ParserJob: forced reparse: false
ParserJob: parsing all
Exception in thread "main" java.lang.NoClassDefFoundError: org/jaxen/JaxenException
at java.lang.Class.getDeclaredConstructors0(Native Method)
at java.lang.Class.privateGetDeclaredConstructors(Class.java:2413)
at java.lang.Class.getConstructor0(Class.java:2723)
at java.lang.Class.newInstance0(Class.java:345)
at java.lang.Class.newInstance(Class.java:327)
at org.apache.nutch.plugin.Extension.getExtensionInstance(Extension.java:160)
at org.apache.nutch.parse.ParseFilters.(ParseFilters.java:63)
at org.apache.nutch.parse.ParserJob.getFields(ParserJob.java:191)
at org.apache.nutch.parse.ParserJob.run(ParserJob.java:245)
at org.apache.nutch.crawl.Crawler.runTool(Crawler.java:68)
at org.apache.nutch.crawl.Crawler.run(Crawler.java:171)
at org.apache.nutch.crawl.Crawler.run(Crawler.java:250)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
at org.apache.nutch.crawl.Crawler.main(Crawler.java:257)
Caused by: java.lang.ClassNotFoundException: org.jaxen.JaxenException
at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:423)
at java.lang.ClassLoader.loadClass(ClassLoader.java:356)
... 14 more

Can you please help me?

Thanks,
Dev

@natesire

I am working on a similar problem. My next step was to import jaxen into the Hadoop source code and try recompiling Hadoop from source.

java.lang.Exception: java.lang.NoClassDefFoundError: org/jaxen/XPath
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:354)
Caused by: java.lang.NoClassDefFoundError: org/jaxen/XPath
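
Both stack traces above point to the same root cause: the jaxen jar is not on the plugin's runtime classpath, so recompiling Hadoop should not be necessary. Nutch plugins declare their third-party jars in the plugin's plugin.xml descriptor, and the jars must also be copied into the plugin's directory under plugins/. A hedged sketch of what the filter-xpath descriptor's runtime section would need; the exact jar file names and versions are assumptions:

<!-- src/plugin/filter-xpath/plugin.xml (runtime section only; jar names are assumptions) -->
<runtime>
   <library name="filter-xpath.jar">
      <export name="*"/>
   </library>
   <!-- Third-party jars must be listed here AND shipped next to the plugin jar -->
   <library name="jaxen-1.1.1.jar"/>
   <library name="htmlcleaner-2.2.jar"/>
</runtime>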
