ararog · October 23, 2012 19:20 · natesire · Jul 16, 2013
diff --git a/XPathHtmlParserFilter.java b/XPathHtmlParserFilter.java
 package com.atlantbh.nutch.filter.xpath;

 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;

 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;

 import org.apache.avro.util.Utf8;
 import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.log4j.Logger;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseFilter;
 import org.apache.nutch.parse.ParseStatusCodes;
 import org.apache.nutch.parse.ParseStatusUtils;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
 import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.EncodingDetector;
 import org.htmlcleaner.CleanerProperties;
 import org.htmlcleaner.DomSerializer;
 import org.htmlcleaner.HtmlCleaner;
 import org.htmlcleaner.TagNode;
 import org.jaxen.JaxenException;
 import org.jaxen.XPath;
 import org.jaxen.dom.DOMXPath;
 import org.w3c.dom.Document;
 import org.w3c.dom.DocumentFragment;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;

 import com.atlantbh.nutch.filter.xpath.config.XPathFilterConfiguration;
 import com.atlantbh.nutch.filter.xpath.config.XPathIndexerProperties;
 import com.atlantbh.nutch.filter.xpath.config.XPathIndexerPropertiesField;

 /**
  * An XML/HTML XPath parse filter that extracts data from the page
  * content, depending on the configured XPath expressions, and stores
  * it in the page metadata for the {@link XPathIndexingFilter} to
  * index into Solr.
  * <p>
  * HTML content ({@code text/html}, {@code application/xhtml+xml}) is
  * tidied with HtmlCleaner first; any other content type containing
  * {@code /xml} or {@code +xml} is parsed as XML without cleaning.
  * Every other content type is evaluated against an empty document.
  * <p>
  * Extracted values are written to the metadata as UTF-8 bytes.
  *
  * @author Emir Dizdarevic
  * @version 1.4
  * @since Apache Nutch 1.4
  */
 public class XPathHtmlParserFilter implements ParseFilter {

 	// Constants
 	private static final Logger log = Logger.getLogger(XPathHtmlParserFilter.class);
 	private static final List<String> htmlMimeTypes = Arrays.asList("text/html", "application/xhtml+xml");

 	// Metadata values are always encoded as UTF-8, independent of the platform default charset
 	private static final Charset UTF_8 = Charset.forName("UTF-8");

 	// Configuration
 	private Configuration configuration;
 	private XPathFilterConfiguration xpathFilterConfiguration;
 	private String defaultEncoding;

 	private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

 	static {
 		FIELDS.add(WebPage.Field.METADATA);
 	}

 	// Internal data
 	private HtmlCleaner cleaner;
 	private DomSerializer domSerializer;
 	private DocumentBuilder documentBuilder;

 	/**
 	 * Creates the filter and its HTML cleaner / XML parser.
 	 *
 	 * @throws IllegalStateException if no XML document builder can be created
 	 */
 	public XPathHtmlParserFilter() {
 		init();
 	}

 	/**
 	 * Builds the HtmlCleaner, its DOM serializer and the XML document builder.
 	 */
 	private void init() {

 		// Initialize HTMLCleaner
 		cleaner = new HtmlCleaner();
 		CleanerProperties props = cleaner.getProperties();
 		props.setAllowHtmlInsideAttributes(true);
 		props.setAllowMultiWordAttributes(true);
 		props.setRecognizeUnicodeChars(true);
 		props.setOmitComments(true);
 		props.setNamespacesAware(false);

 		// Initialize DomSerializer
 		domSerializer = new DomSerializer(props);

 		// Initialize xml parser. Crawled XML is untrusted input, so the parser
 		// must not resolve external entities or fetch external DTDs (XXE).
 		DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
 		disableFeature(documentBuilderFactory, "http://xml.org/sax/features/external-general-entities");
 		disableFeature(documentBuilderFactory, "http://xml.org/sax/features/external-parameter-entities");
 		disableFeature(documentBuilderFactory, "http://apache.org/xml/features/nonvalidating/load-external-dtd");
 		try {
 			documentBuilder = documentBuilderFactory.newDocumentBuilder();
 		} catch (ParserConfigurationException e) {
 			// Fail fast: continuing would only defer the failure to an NPE in filter()
 			throw new IllegalStateException("Unable to create the XML document builder", e);
 		}
 	}

 	/**
 	 * Switches a parser feature off, logging (instead of failing) when the
 	 * underlying parser implementation does not know the feature.
 	 */
 	private static void disableFeature(DocumentBuilderFactory factory, String feature) {
 		try {
 			factory.setFeature(feature, false);
 		} catch (ParserConfigurationException e) {
 			log.warn("XML parser does not support feature " + feature + ": " + e.getMessage());
 		}
 	}

 	private void initConfig() {

 		// Initialize configuration
 		xpathFilterConfiguration = XPathFilterConfiguration.getInstance(configuration);
 		defaultEncoding = configuration.get("parser.character.encoding.default", "UTF-8");
 	}

 	@Override
 	public Configuration getConf() {
 		return configuration;
 	}

 	@Override
 	public void setConf(Configuration configuration) {
 		this.configuration = configuration;
 		initConfig();
 	}

 	/**
 	 * Removes {@code node} if it has the given type (and name, when
 	 * {@code name} is not null); otherwise recurses into its children.
 	 */
 	private void removeAll(Node node, short nodeType, String name) {
 		if (node.getNodeType() == nodeType && (name == null || node.getNodeName().equals(name))) {
 			Node parent = node.getParentNode();
 			if (parent != null) {
 				parent.removeChild(node);
 			}
 		} else {
 			NodeList list = node.getChildNodes();
 			// The child list is live: walk it backwards so that a removal
 			// does not shift siblings that have not been visited yet.
 			for (int i = list.getLength() - 1; i >= 0; i--) {
 				removeAll(list.item(i), nodeType, name);
 			}
 		}
 	}

 	/**
 	 * Evaluates every configured XPath field against the page content and
 	 * stores the extracted text in the page metadata under the field name.
 	 *
 	 * @param url the page url (only used for logging; the url regex is matched against the base url of {@code page})
 	 * @param page the page whose content is read and whose metadata is written
 	 * @param parse the parse produced so far
 	 * @param metaTags not used by this filter
 	 * @param doc not used by this filter
 	 * @return {@code parse} unchanged, or an empty failed parse when a regex,
 	 *         the HTML/XML parsing or an XPath expression fails
 	 */
 	@Override
 	public Parse filter(String url, WebPage page, Parse parse,
 			HTMLMetaTags metaTags, DocumentFragment doc) {

 		// Nothing to extract from a page without content or content type
 		if (page.getContent() == null || page.getContentType() == null) {
 			return parse;
 		}

 		try {
 			Document cleanedXmlHtml = parseContent(page);

 			List<XPathIndexerProperties> xPathIndexerPropertiesList = xpathFilterConfiguration.getXPathIndexerPropertiesList();
 			for (XPathIndexerProperties xPathIndexerProperties : xPathIndexerPropertiesList) {

 				//****************************
 				// CORE XPATH EVALUATION
 				//****************************
 				if (pageToProcess(xPathIndexerProperties, cleanedXmlHtml, page.getBaseUrl().toString())) {
 					for (XPathIndexerPropertiesField field : xPathIndexerProperties.getXPathIndexerPropertiesFieldList()) {
 						extractField(page, cleanedXmlHtml, field);
 					}
 				}
 			}

 		} catch (IOException e) {
 			// Reachable despite the in-memory stream: unsupported charset names and
 			// undecodable byte sequences surface as IOException. The page is left
 			// as it is (best effort), but the problem is no longer hidden.
 			log.warn("Unable to read content of " + url + ": " + e.getMessage(), e);
 		} catch (PatternSyntaxException e) {
 			log.error("Error parsing urlRegex: " + e.getMessage(), e);
 			return failedParse(page);
 		} catch (ParserConfigurationException e) {
 			log.error("HTML Cleaning error: " + e.getMessage(), e);
 			return failedParse(page);
 		} catch (SAXException e) {
 			log.error("XML parsing error: " + e.getMessage(), e);
 			return failedParse(page);
 		} catch (JaxenException e) {
 			log.error("XPath error: " + e.getMessage(), e);
 			return failedParse(page);
 		}

 		return parse;
 	}

 	/**
 	 * Turns the raw page content into a DOM document: cleaned HTML, parsed
 	 * XML, or an empty document for any other content type.
 	 */
 	private Document parseContent(WebPage page) throws IOException, SAXException, ParserConfigurationException {
 		byte[] rawContent = toByteArray(page.getContent());
 		String contentType = page.getContentType().toString();

 		if (htmlMimeTypes.contains(contentType)) {

 			// Use the cleaner to "clean" the HTML and return it as a TagNode object
 			TagNode tagNode = cleaner.clean(openReader(rawContent, page));
 			return domSerializer.createDOM(tagNode);
 		}

 		if (contentType.contains("/xml") || contentType.contains("+xml")) {

 			// Parse as xml - don't clean
 			return documentBuilder.parse(new InputSource(new ByteArrayInputStream(rawContent)));
 		}

 		return documentBuilder.newDocument();
 	}

 	/**
 	 * Opens a reader over the raw content using the original character
 	 * encoding recorded in the page metadata, falling back to the configured
 	 * default encoding when none is recorded or the recorded one is unsupported.
 	 */
 	private Reader openReader(byte[] rawContent, WebPage page) throws UnsupportedEncodingException {
 		String encoding = defaultEncoding;
 		ByteBuffer buffer = page.getFromMetadata(new Utf8(Metadata.ORIGINAL_CHAR_ENCODING));
 		if (buffer != null) {
 			encoding = new String(toByteArray(buffer), UTF_8);
 		}

 		try {
 			return new InputStreamReader(new ByteArrayInputStream(rawContent), encoding);
 		} catch (UnsupportedEncodingException e) {
 			log.warn("Unsupported encoding '" + encoding + "', falling back to " + defaultEncoding);
 			return new InputStreamReader(new ByteArrayInputStream(rawContent), defaultEncoding);
 		}
 	}

 	/**
 	 * Evaluates one field's XPath and stores the result in the page metadata.
 	 * In concat mode all matches are joined into a single value (stored even
 	 * when empty); otherwise each non-empty match is stored in turn under the
 	 * same key, so the last one is the value that remains.
 	 */
 	private void extractField(WebPage page, Document cleanedXmlHtml, XPathIndexerPropertiesField field) throws JaxenException {

 		// Evaluate xpath
 		XPath xPath = new DOMXPath(field.getXPath());
 		List<?> nodeList = xPath.selectNodes(cleanedXmlHtml);

 		// Trim?
 		boolean trim = FilterUtils.getNullSafe(field.getTrimXPathData(), true);

 		if (FilterUtils.getNullSafe(field.isConcat(), false)) {
 			String concatDelimiter = FilterUtils.getNullSafe(field.getConcatDelimiter(), "");
 			putToMetadata(page, field.getName(), concatNodeValues(nodeList, trim, concatDelimiter));
 		} else {
 			for (Object node : nodeList) {
 				String value = filterValue(FilterUtils.extractTextContentFromRawNode(node), trim);
 				if (value != null) {
 					putToMetadata(page, field.getName(), value);
 				}
 			}
 		}
 	}

 	/**
 	 * Decides whether the fields of {@code xPathIndexerProperties} apply to
 	 * this page: the url must match the url regex and, when a content filter
 	 * XPath is configured, the text it selects must match the content regex.
 	 */
 	private boolean pageToProcess(XPathIndexerProperties xPathIndexerProperties, Document cleanedXmlHtml, String url) throws JaxenException {

 		// *************************************
 		// URL REGEX CONTENT PAGE FILTERING
 		// *************************************
 		if (!FilterUtils.isMatch(xPathIndexerProperties.getPageUrlFilterRegex(), url)) {
 			return false;
 		}

 		// *************************************
 		// XPATH CONTENT PAGE FILTERING
 		// *************************************
 		if (xPathIndexerProperties.getPageContentFilterXPath() == null) {
 			return true;
 		}

 		XPath xPathPageContentFilter = new DOMXPath(xPathIndexerProperties.getPageContentFilterXPath());
 		List<?> pageContentFilterNodeList = xPathPageContentFilter.selectNodes(cleanedXmlHtml);
 		boolean trim = FilterUtils.getNullSafe(xPathIndexerProperties.isTrimPageContentFilterXPathData(), true);

 		if (FilterUtils.getNullSafe(xPathIndexerProperties.isConcatPageContentFilterXPathData(), false)) {
 			String concatDelimiter = FilterUtils.getNullSafe(xPathIndexerProperties.getConcatPageContentFilterXPathDataDelimiter(), "");
 			String value = concatNodeValues(pageContentFilterNodeList, trim, concatDelimiter);
 			return FilterUtils.isMatch(xPathIndexerProperties.getPageContentFilterRegex(), value);
 		}

 		// Every non-empty match has to satisfy the content regex
 		for (Object node : pageContentFilterNodeList) {
 			String value = filterValue(FilterUtils.extractTextContentFromRawNode(node), trim);
 			if (value != null && !FilterUtils.isMatch(xPathIndexerProperties.getPageContentFilterRegex(), value)) {
 				return false;
 			}
 		}

 		return true;
 	}

 	/**
 	 * Joins the filtered text of all nodes with {@code delimiter}, skipping
 	 * nodes whose text is filtered out. Returns an empty string when nothing is left.
 	 */
 	private String concatNodeValues(List<?> nodes, boolean trim, String delimiter) {
 		StringBuilder joined = new StringBuilder();
 		for (Object node : nodes) {
 			String text = filterValue(FilterUtils.extractTextContentFromRawNode(node), trim);
 			if (text != null) {
 				if (joined.length() > 0) {
 					joined.append(delimiter);
 				}
 				joined.append(text);
 			}
 		}
 		return joined.toString();
 	}

 	/**
 	 * Returns the HTML-unescaped (and optionally trimmed) value, or
 	 * {@code null} when the value is null, empty or made only of space,
 	 * newline and tab characters.
 	 */
 	private String filterValue(String value, boolean trim) {
 		if (value == null || value.isEmpty() || FilterUtils.isMadeOf(value, " \n\t")) {
 			return null;
 		}
 		return StringEscapeUtils.unescapeHtml(trimValue(value, trim));
 	}

 	private String trimValue(String value, boolean trim) {
 		return trim ? value.trim() : value;
 	}

 	/** Stores {@code value} in the page metadata as UTF-8 bytes. */
 	private void putToMetadata(WebPage page, String name, String value) {
 		page.putToMetadata(new Utf8(name), ByteBuffer.wrap(value.getBytes(UTF_8)));
 	}

 	/**
 	 * Copies the readable bytes (position to limit) of the buffer without
 	 * moving its position; unlike {@code array()} this never exposes bytes
 	 * outside the buffer's window.
 	 */
 	private static byte[] toByteArray(ByteBuffer buffer) {
 		byte[] bytes = new byte[buffer.remaining()];
 		buffer.duplicate().get(bytes);
 		return bytes;
 	}

 	/** Builds the empty, failed parse returned when the page cannot be processed. */
 	private Parse failedParse(WebPage page) {
 		return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED, page.getBaseUrl().toString(), configuration);
 	}

 	@Override
 	public Collection<Field> getFields() {
 		return FIELDS;
 	}
 }
diff --git a/XPathIndexingFilter.java b/XPathIndexingFilter.java
 package com.atlantbh.nutch.filter.xpath;

 import java.nio.ByteBuffer;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.Collection;
 import java.util.Date;
 import java.util.HashSet;
 import java.util.List;

 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.log4j.Logger;
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.NutchDocument;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
 import org.apache.nutch.util.Bytes;

 import com.atlantbh.nutch.filter.xpath.config.FieldType;
 import com.atlantbh.nutch.filter.xpath.config.XPathFilterConfiguration;
 import com.atlantbh.nutch.filter.xpath.config.XPathIndexerProperties;
 import com.atlantbh.nutch.filter.xpath.config.XPathIndexerPropertiesField;

 /**
  * Second stage of {@link XPathHtmlParserFilter}: the IndexingFilter.
  * It takes the prepared data located in the page metadata and adds
  * it to the document that is indexed into Solr.
  *
  * @author Emir Dizdarevic
  * @version 1.4
  * @since Apache Nutch 1.4
  */
 public class XPathIndexingFilter implements IndexingFilter {

 	// Constants
 	private static final String CONFIG_FILE_PROPERTY = "parser.xmlhtml.file";
 	private static final Logger log = Logger.getLogger(XPathIndexingFilter.class);

 	// Configuration
 	private Configuration configuration;
 	private XPathFilterConfiguration xpathFilterConfiguration;

 	private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

 	static {
 		FIELDS.add(WebPage.Field.METADATA);
 	}

 	public XPathIndexingFilter() {}

 	private void initConfig() {

 		// Initialize configuration
 		xpathFilterConfiguration = XPathFilterConfiguration.getInstance(configuration);
 	}

 	@Override
 	public Configuration getConf() {
 		return configuration;
 	}

 	@Override
 	public void setConf(Configuration configuration) {
 		this.configuration = configuration;
 		initConfig();
 	}

 	/**
 	 * Adds one document field per configured XPath field whose properties
 	 * match the url. The value is the UTF-8 text stored in the page metadata
 	 * under the field name, or an empty string when nothing is stored.
 	 *
 	 * @param doc the document being indexed
 	 * @param url the page url, matched against each configured url regex
 	 * @param page the page whose metadata is read
 	 * @return {@code doc}, with the fields added
 	 */
 	@Override
 	public NutchDocument filter(NutchDocument doc, String url, WebPage page)
 			throws IndexingException {
 		List<XPathIndexerProperties> xPathIndexerPropertiesList = xpathFilterConfiguration.getXPathIndexerPropertiesList();
 		for (XPathIndexerProperties xPathIndexerProperties : xPathIndexerPropertiesList) {

 			if (!FilterUtils.isMatch(xPathIndexerProperties.getPageUrlFilterRegex(), url)) {
 				continue;
 			}

 			for (XPathIndexerPropertiesField field : xPathIndexerProperties.getXPathIndexerPropertiesFieldList()) {
 				ByteBuffer buffer = page.getFromMetadata(new Utf8(field.getName()));
 				String stringValue = (buffer == null) ? "" : decodeUtf8(buffer);
 				doc.add(field.getName(), stringValue);
 			}
 		}

 		return doc;
 	}

 	/**
 	 * Decodes the readable bytes (position to limit) of the buffer as UTF-8
 	 * without moving its position. Decoding {@code array()} instead would
 	 * also pick up bytes of the backing array that lie outside the buffer's window.
 	 */
 	private static String decodeUtf8(ByteBuffer buffer) {
 		byte[] bytes = new byte[buffer.remaining()];
 		buffer.duplicate().get(bytes);
 		return new String(bytes, java.nio.charset.Charset.forName("UTF-8"));
 	}

 	@Override
 	public Collection<Field> getFields() {
 		return FIELDS;
 	}
 }
	package com.atlantbh.nutch.filter.xpath;

	import java.io.ByteArrayInputStream;
	import java.io.IOException;
	import java.io.InputStreamReader;
	import java.io.Reader;
	import java.io.UnsupportedEncodingException;
	import java.nio.ByteBuffer;
	import java.nio.charset.Charset;
	import java.util.Arrays;
	import java.util.Collection;
	import java.util.HashSet;
	import java.util.List;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;
	import java.util.regex.PatternSyntaxException;

	import javax.xml.parsers.DocumentBuilder;
	import javax.xml.parsers.DocumentBuilderFactory;
	import javax.xml.parsers.ParserConfigurationException;

	import org.apache.avro.util.Utf8;
	import org.apache.commons.lang.StringEscapeUtils;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.log4j.Logger;
	import org.apache.nutch.metadata.Metadata;
	import org.apache.nutch.parse.HTMLMetaTags;
	import org.apache.nutch.parse.Parse;
	import org.apache.nutch.parse.ParseFilter;
	import org.apache.nutch.parse.ParseStatusCodes;
	import org.apache.nutch.parse.ParseStatusUtils;
	import org.apache.nutch.storage.WebPage;
	import org.apache.nutch.storage.WebPage.Field;
	import org.apache.nutch.util.Bytes;
	import org.apache.nutch.util.EncodingDetector;
	import org.htmlcleaner.CleanerProperties;
	import org.htmlcleaner.DomSerializer;
	import org.htmlcleaner.HtmlCleaner;
	import org.htmlcleaner.TagNode;
	import org.jaxen.JaxenException;
	import org.jaxen.XPath;
	import org.jaxen.dom.DOMXPath;
	import org.w3c.dom.Document;
	import org.w3c.dom.DocumentFragment;
	import org.w3c.dom.Node;
	import org.w3c.dom.NodeList;
	import org.xml.sax.InputSource;
	import org.xml.sax.SAXException;

	import com.atlantbh.nutch.filter.xpath.config.XPathFilterConfiguration;
	import com.atlantbh.nutch.filter.xpath.config.XPathIndexerProperties;
	import com.atlantbh.nutch.filter.xpath.config.XPathIndexerPropertiesField;

	/**
	* A Xml-Html xpath filter implementation that fetches data
	* from the content, depending on the supplied xpath,
	* and prepares it for the {@link XPathIndexingFilter} to
	* index it into solr.
	*
	* @author Emir Dizdarevic
	* @version 1.4
	* @since Apache Nutch 1.4
	*/
	public class XPathHtmlParserFilter implements ParseFilter {

	// Constants
	private static final Logger log = Logger.getLogger(XPathHtmlParserFilter.class);
	private static final List<String> htmlMimeTypes = Arrays.asList(new String[] {"text/html", "application/xhtml+xml"});

	// OLD WAY TO DETERMIN IF IT'S AN XML FORMAT
	//private static final List<String> xmlMimeTypes = Arrays.asList(new String[] {"text/xml", "application/xml"});

	// Configuration
	private Configuration configuration;
	private XPathFilterConfiguration xpathFilterConfiguration;
	private String defaultEncoding;

	private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

	static {
	FIELDS.add(WebPage.Field.METADATA);
	}

	// Internal data
	private HtmlCleaner cleaner;
	private DomSerializer domSerializer;
	private DocumentBuilder documentBuilder;

	public XPathHtmlParserFilter() {
	init();
	}

	private void init() {

	// Initialize HTMLCleaner
	cleaner = new HtmlCleaner();
	CleanerProperties props = cleaner.getProperties();
	props.setAllowHtmlInsideAttributes(true);
	props.setAllowMultiWordAttributes(true);
	props.setRecognizeUnicodeChars(true);
	props.setOmitComments(true);
	props.setNamespacesAware(false);

	// Initialize DomSerializer
	domSerializer = new DomSerializer(props);

	// Initialize xml parser
	try {
	DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
	documentBuilder = documentBuilderFactory.newDocumentBuilder();
	} catch (ParserConfigurationException e) {
	// THIS CAN NEVER HAPPEN
	}
	}

	private void initConfig() {

	// Initialize configuration
	xpathFilterConfiguration = XPathFilterConfiguration.getInstance(configuration);
	defaultEncoding = configuration.get("parser.character.encoding.default", "UTF-8");
	}

	@Override
	public Configuration getConf() {
	return configuration;
	}

	@Override
	public void setConf(Configuration configuration) {
	this.configuration = configuration;
	initConfig();
	}

	private void removeAll(Node node, short nodeType, String name) {
	if (node.getNodeType() == nodeType && (name == null \|\| node.getNodeName().equals(name))) {
	node.getParentNode().removeChild(node);
	}
	else {
	NodeList list = node.getChildNodes();
	for (int i = 0; i < list.getLength(); i++) {
	removeAll(list.item(i), nodeType, name);
	}
	}
	}


	@Override
	public Parse filter(String url, WebPage page, Parse parse,
	HTMLMetaTags metaTags, DocumentFragment doc) {

	byte[] rawContent = page.getContent().array();

	try {
	Document cleanedXmlHtml = documentBuilder.newDocument();
	if(htmlMimeTypes.contains(page.getContentType().toString())) {


	String encoding = defaultEncoding;
	ByteBuffer buffer = page.getFromMetadata(new Utf8(Metadata.ORIGINAL_CHAR_ENCODING));
	if(buffer != null)
	encoding = Bytes.toString(buffer.array());

	// Create reader so the input can be read in UTF-8
	Reader rawContentReader = new InputStreamReader(new ByteArrayInputStream(rawContent), encoding);

	// Use the cleaner to "clean" the HTML and return it as a TagNode object
	TagNode tagNode = cleaner.clean(rawContentReader);
	cleanedXmlHtml = domSerializer.createDOM(tagNode);
	} else if(page.getContentType().toString().contains(new StringBuilder("/xml")) \|\| page.getContentType().toString().contains(new StringBuilder("+xml"))) {

	// Parse as xml - don't clean
	cleanedXmlHtml = documentBuilder.parse(new InputSource(new ByteArrayInputStream(rawContent)));
	}

	// Once the HTML is cleaned, then you can run your XPATH expressions on the node,
	// which will then return an array of TagNode objects
	List<XPathIndexerProperties> xPathIndexerPropertiesList = xpathFilterConfiguration.getXPathIndexerPropertiesList();
	for(XPathIndexerProperties xPathIndexerProperties : xPathIndexerPropertiesList) {


	//****************************
	// CORE XPATH EVALUATION
	//****************************
	if(pageToProcess(xPathIndexerProperties, cleanedXmlHtml, page.getBaseUrl().toString())) {

	List<XPathIndexerPropertiesField> xPathIndexerPropertiesFieldList = xPathIndexerProperties.getXPathIndexerPropertiesFieldList();
	for(XPathIndexerPropertiesField xPathIndexerPropertiesField : xPathIndexerPropertiesFieldList) {

	// Evaluate xpath
	XPath xPath = new DOMXPath(xPathIndexerPropertiesField.getXPath());
	List nodeList = xPath.selectNodes(cleanedXmlHtml);

	// Trim?
	boolean trim = FilterUtils.getNullSafe(xPathIndexerPropertiesField.getTrimXPathData(), true);

	if(FilterUtils.getNullSafe(xPathIndexerPropertiesField.isConcat(), false)) {

	// Iterate trough all found nodes
	String value = new String();
	String concatDelimiter = FilterUtils.getNullSafe(xPathIndexerPropertiesField.getConcatDelimiter(), "");
	for (Object node : nodeList) {

	// Extract data
	String tempValue = FilterUtils.extractTextContentFromRawNode(node);
	tempValue = filterValue(tempValue, trim);

	// Concatenate tempValue to value
	if(tempValue != null) {
	if(value.isEmpty()) {
	value = tempValue;
	} else {
	value = value + concatDelimiter + tempValue;
	}
	}
	}

	// Add the extracted data to meta
	if(value != null) {
	page.putToMetadata(new Utf8(xPathIndexerPropertiesField.getName()), ByteBuffer.wrap(value.getBytes()));
	}

	} else {

	// Iterate trough all found nodes
	for (Object node : nodeList) {

	// Add the extracted data to meta
	String value = FilterUtils.extractTextContentFromRawNode(node);
	value = filterValue(value, trim);
	if(value != null) {
	page.putToMetadata(new Utf8(xPathIndexerPropertiesField.getName()), ByteBuffer.wrap(value.getBytes()));
	}
	}
	}

	}
	}
	}

	} catch (IOException e) {
	// This can never happen because it's an in memory stream
	} catch(PatternSyntaxException e) {
	System.err.println(e.getMessage());
	log.error("Error parsing urlRegex: " + e.getMessage());
	return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED, page.getBaseUrl().toString(), configuration);
	} catch (ParserConfigurationException e) {
	System.err.println(e.getMessage());
	log.error("HTML Cleaning error: " + e.getMessage());
	return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED, page.getBaseUrl().toString(), configuration);
	} catch (SAXException e) {
	System.err.println(e.getMessage());
	log.error("XML parsing error: " + e.getMessage());
	return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED, page.getBaseUrl().toString(), configuration);
	} catch (JaxenException e) {
	System.err.println(e.getMessage());
	log.error("XPath error: " + e.getMessage());
	return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED, page.getBaseUrl().toString(), configuration);
	}

	return parse;
	}


	private boolean pageToProcess(XPathIndexerProperties xPathIndexerProperties, Document cleanedXmlHtml, String url) throws JaxenException {

	boolean processPage = true;

	// *************************************
	// URL REGEX CONTENT PAGE FILTERING
	// *************************************
	processPage = processPage && FilterUtils.isMatch(xPathIndexerProperties.getPageUrlFilterRegex(), url);

	// Check return status
	if (!processPage) {
	return false;
	}

	// *************************************
	// XPATH CONTENT PAGE FILTERING
	// *************************************

	if (xPathIndexerProperties.getPageContentFilterXPath() != null) {
	XPath xPathPageContentFilter = new DOMXPath(xPathIndexerProperties.getPageContentFilterXPath());
	List pageContentFilterNodeList = xPathPageContentFilter.selectNodes(cleanedXmlHtml);
	boolean trim = FilterUtils.getNullSafe(xPathIndexerProperties.isTrimPageContentFilterXPathData(), true);

	if (FilterUtils.getNullSafe(xPathIndexerProperties.isConcatPageContentFilterXPathData(), false)) {

	// Iterate trough all found nodes
	String value = new String();
	String concatDelimiter = FilterUtils.getNullSafe(xPathIndexerProperties.getConcatPageContentFilterXPathDataDelimiter(), "");

	for (Object node : pageContentFilterNodeList) {

	// Extract data
	String tempValue = FilterUtils.extractTextContentFromRawNode(node);
	tempValue = filterValue(tempValue, trim);

	// Concatenate tempValue to value
	if(tempValue != null) {
	if (value.isEmpty()) {
	value = tempValue;
	} else {
	value = value + concatDelimiter + tempValue;
	}
	}
	}

	processPage = processPage && FilterUtils.isMatch(xPathIndexerProperties.getPageContentFilterRegex(), value);
	} else {
	for (Object node : pageContentFilterNodeList) {

	// Add the extracted data to meta
	String value = FilterUtils.extractTextContentFromRawNode(node);
	value = filterValue(value, trim);
	if(value != null) {
	processPage = processPage && FilterUtils.isMatch(xPathIndexerProperties.getPageContentFilterRegex(), value);
	}
	}
	}
	}

	return processPage;
	}

	private String filterValue(String value, boolean trim) {

	String returnValue = null;

	// Filter out empty strings and strings made of space, carriage return and tab characters
	if(!value.isEmpty() && !FilterUtils.isMadeOf(value, " \n\t")) {

	// Trim data?
	returnValue = trimValue(value, trim);
	}

	return returnValue == null ? null : StringEscapeUtils.unescapeHtml(returnValue);
	}

	private String trimValue(String value, boolean trim) {

	String returnValue;
	if (trim) {
	returnValue = value.trim();
	} else {
	returnValue = value;
	}

	return returnValue;
	}

	@Override
	public Collection<Field> getFields() {
	return FIELDS;
	}
	}
	package com.atlantbh.nutch.filter.xpath;

	import java.nio.ByteBuffer;
	import java.text.ParseException;
	import java.text.SimpleDateFormat;
	import java.util.Collection;
	import java.util.Date;
	import java.util.HashSet;
	import java.util.List;

	import org.apache.avro.util.Utf8;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.log4j.Logger;
	import org.apache.nutch.indexer.IndexingException;
	import org.apache.nutch.indexer.IndexingFilter;
	import org.apache.nutch.indexer.NutchDocument;
	import org.apache.nutch.storage.WebPage;
	import org.apache.nutch.storage.WebPage.Field;
	import org.apache.nutch.util.Bytes;

	import com.atlantbh.nutch.filter.xpath.config.FieldType;
	import com.atlantbh.nutch.filter.xpath.config.XPathFilterConfiguration;
	import com.atlantbh.nutch.filter.xpath.config.XPathIndexerProperties;
	import com.atlantbh.nutch.filter.xpath.config.XPathIndexerPropertiesField;

	/**
	 * Second stage of {@link XPathHtmlParserFilter}: the IndexingFilter.
	 * It takes the prepared data located in the page metadata and adds
	 * it to the document that is indexed into Solr.
	 *
	 * @author Emir Dizdarevic
	 * @version 1.4
	 * @since Apache Nutch 1.4
	 */
	public class XPathIndexingFilter implements IndexingFilter {

		// Constants
		private static final String CONFIG_FILE_PROPERTY = "parser.xmlhtml.file";
		private static final Logger log = Logger.getLogger(XPathIndexingFilter.class);

		// Configuration
		private Configuration configuration;
		private XPathFilterConfiguration xpathFilterConfiguration;

		private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

		static {
			FIELDS.add(WebPage.Field.METADATA);
		}

		public XPathIndexingFilter() {}

		private void initConfig() {

			// Initialize configuration
			xpathFilterConfiguration = XPathFilterConfiguration.getInstance(configuration);
		}

		@Override
		public Configuration getConf() {
			return configuration;
		}

		@Override
		public void setConf(Configuration configuration) {
			this.configuration = configuration;
			initConfig();
		}

		/**
		 * Adds one document field per configured XPath field whose properties
		 * match the url. The value is the UTF-8 text stored in the page metadata
		 * under the field name, or an empty string when nothing is stored.
		 *
		 * @param doc the document being indexed
		 * @param url the page url, matched against each configured url regex
		 * @param page the page whose metadata is read
		 * @return {@code doc}, with the fields added
		 */
		@Override
		public NutchDocument filter(NutchDocument doc, String url, WebPage page)
				throws IndexingException {
			List<XPathIndexerProperties> xPathIndexerPropertiesList = xpathFilterConfiguration.getXPathIndexerPropertiesList();
			for (XPathIndexerProperties xPathIndexerProperties : xPathIndexerPropertiesList) {

				if (!FilterUtils.isMatch(xPathIndexerProperties.getPageUrlFilterRegex(), url)) {
					continue;
				}

				for (XPathIndexerPropertiesField field : xPathIndexerProperties.getXPathIndexerPropertiesFieldList()) {
					ByteBuffer buffer = page.getFromMetadata(new Utf8(field.getName()));
					String stringValue = (buffer == null) ? "" : decodeUtf8(buffer);
					doc.add(field.getName(), stringValue);
				}
			}

			return doc;
		}

		/**
		 * Decodes the readable bytes (position to limit) of the buffer as UTF-8
		 * without moving its position. Decoding {@code array()} instead would
		 * also pick up bytes of the backing array that lie outside the buffer's window.
		 */
		private static String decodeUtf8(ByteBuffer buffer) {
			byte[] bytes = new byte[buffer.remaining()];
			buffer.duplicate().get(bytes);
			return new String(bytes, java.nio.charset.Charset.forName("UTF-8"));
		}

		@Override
		public Collection<Field> getFields() {
			return FIELDS;
		}
	}