cstrap · October 26, 2011 08:06
diff --git a/SearchResultSnippetTag.java b/SearchResultSnippetTag.java
 // @author cstrap PATCH: unescape the string and complete the first and last words of each snippet

 /**
 * This file Copyright (c) 2003-2011 Magnolia International
 * Ltd.  (http://www.magnolia-cms.com). All rights reserved.
 *
 *
 * This file is dual-licensed under both the Magnolia
 * Network Agreement and the GNU General Public License.
 * You may elect to use one or the other of these licenses.
 *
 * This file is distributed in the hope that it will be
 * useful, but AS-IS and WITHOUT ANY WARRANTY; without even the
 * implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE, TITLE, or NONINFRINGEMENT.
 * Redistribution, except as permitted by whichever of the GPL
 * or MNA you select, is prohibited.
 *
 * 1. For the GPL license (GPL), you can redistribute and/or
 * modify this file under the terms of the GNU General
 * Public License, Version 3, as published by the Free Software
 * Foundation.  You should have received a copy of the GNU
 * General Public License, Version 3 along with this program;
 * if not, write to the Free Software Foundation, Inc., 51
 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * 2. For the Magnolia Network Agreement (MNA), this file
 * and the accompanying materials are made available under the
 * terms of the MNA which accompanies this distribution, and
 * is available at http://www.magnolia-cms.com/mna.html
 *
 * Any modifications to this file must keep this entire header
 * intact.
 *
 */
 package info.magnolia.cms.taglibs.util;

 import info.magnolia.cms.core.Content;
 import info.magnolia.cms.core.ItemType;
 import info.magnolia.cms.core.NodeData;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Iterator;
 import java.util.regex.Pattern;

 import javax.jcr.PropertyType;
 import javax.servlet.jsp.JspException;
 import javax.servlet.jsp.JspWriter;
 import javax.servlet.jsp.tagext.TagSupport;

 import org.apache.commons.lang.ArrayUtils;
 import org.apache.commons.lang.CharUtils;
 import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.lang.exception.NestableRuntimeException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;


 /**
  * Outputs a set of snippets taken from any paragraph in the given page matching any of the search terms.
  * @jsp.tag name="searchResultSnippet" body-content="empty"
  * @jsp.tag-example <pre>
  * &lt;cmsu:simplesearch query="${param.search}" var="results" /&gt;
  * &lt;c:forEach items="${results}" var="page"&gt;
  *   &lt;cmsu:searchResultSnippet query="${param.search}" page="${page}" /&gt;
  * &lt;/c:forEach&gt;
  *</pre>
  * @author Fabrizio Giustina
  * @version $Revision: 41137 $ ($Author: gjoseph $)
  */
 @SuppressWarnings("serial")
 public class SearchResultSnippetTag extends TagSupport
 {

     /**
      * Matches anything that looks like an HTML tag; DOTALL lets a single tag span several lines.
      */
     private static final Pattern HTML_STRIP = Pattern.compile("<.*?>", Pattern.DOTALL);

     private static final Logger log = LoggerFactory.getLogger(SearchResultSnippetTag.class);

     /**
      * Page whose paragraphs are scanned for snippets.
      */
     private Content page;

     /**
      * Query, natural language.
      */
     private String query;

     /**
      * Number of chars to include in result.
      */
     private int chars = 100;

     /**
      * Maximum number of snippets to include in result.
      */
     private int maxSnippets = 3;

     /**
      * Search query.
      * @jsp.attribute required="true" rtexprvalue="true"
      */
     public void setQuery(String query)
     {
         this.query = query;
     }

     /**
      * Number of characters to include in search snippets. Default is 100.
      * @jsp.attribute required="false" rtexprvalue="true" type="int"
      */
     public void setChars(int chars)
     {
         this.chars = chars;
     }

     /**
      * Maximum number of snippets to print out.
      * @jsp.attribute required="false" rtexprvalue="true" type="int"
      */
     public void setMaxSnippets(int maxSnippets)
     {
         this.maxSnippets = maxSnippets;
     }

     /**
      * A Content node of type mgnl:content (a magnolia page), typically returned by the simpleSearch tag.
      * @jsp.attribute required="true" rtexprvalue="true" type="info.magnolia.cms.core.Content"
      */
     public void setPage(Content page)
     {
         this.page = page;
     }

     /**
      * Prints every snippet returned by {@link #getSnippets()} to the page output, one per line.
      * @return always <code>SKIP_BODY</code>
      * @see javax.servlet.jsp.tagext.TagSupport#doStartTag()
      */
     @SuppressWarnings("rawtypes")
     @Override
     public int doStartTag() throws JspException
     {

         JspWriter out = this.pageContext.getOut();
         try
         {
             Iterator iterator = getSnippets().iterator();
             while (iterator.hasNext())
             {
                 out.println(iterator.next());
             }
         }
         catch (IOException e)
         {
             // should never happen
             throw new NestableRuntimeException(e);
         }
         // the tag is declared with body-content="empty": SKIP_BODY is the valid "no body" answer for doStartTag,
         // EVAL_PAGE is only meaningful as a doEndTag return value
         return SKIP_BODY;
     }

     /**
      * Extract a collection of snippets from any paragraph in the given page. Binary properties are ignored. The
      * search stops as soon as the number of collected snippets reaches <code>maxSnippets</code>.
      * @return a collection of snippets, empty if there is no page or no search term; each element is a
      * StringBuffer holding an HTML fragment in which the text is HTML-escaped and the matched term is wrapped in
      * <code>&lt;strong&gt;</code>
      * @todo avoid overlapping snippets (use regexp instead of simple indexOfs)
      * @todo only extract snippets from user-configured properties
      * @todo abbreviate on whitespace and punctuation, detect start of sentences
      * @todo replace ampersand in regexp
      * @todo write junits
      */
     @SuppressWarnings({"rawtypes", "unchecked" })
     public Collection getSnippets()
     {

         log.debug("collecting snippets"); //$NON-NLS-1$

         Collection snippets = new ArrayList();
         String[] searchTerms = StringUtils.split(this.query);

         // StringUtils.split returns null for a null query: nothing to search for, or nothing to search in
         if (this.page == null || searchTerms == null || searchTerms.length == 0)
         {
             return snippets;
         }

         Iterator collectionIterator = this.page.getChildren(ItemType.CONTENTNODE).iterator();
         while (collectionIterator.hasNext())
         {
             Content paragraphCollection = (Content) collectionIterator.next();

             Iterator paragraphIterator = paragraphCollection.getChildren().iterator();
             while (paragraphIterator.hasNext())
             {
                 Content paragraph = (Content) paragraphIterator.next();

                 log.debug("Iterating on paragraph {}", paragraph); //$NON-NLS-1$

                 Iterator dataIterator = paragraph.getNodeDataCollection().iterator();
                 while (dataIterator.hasNext())
                 {
                     NodeData property = (NodeData) dataIterator.next();
                     if (property.getType() != PropertyType.BINARY && addSnippets(property, searchTerms, snippets))
                     {
                         log.debug("Maximum number of snippets ({}) reached, exiting", //$NON-NLS-1$
                             Integer.toString(this.maxSnippets));
                         return snippets;
                     }
                 }
             }
         }

         return snippets;
     }

     /**
      * Adds one snippet to the given collection for every search term found in the given property.
      * @param property non-binary property to look into
      * @param searchTerms terms of the query, already split on whitespace
      * @param snippets collection the new snippets are added to
      * @return <code>true</code> as soon as the collection holds at least <code>maxSnippets</code> elements
      */
     @SuppressWarnings({"rawtypes", "unchecked" })
     private boolean addSnippets(NodeData property, String[] searchTerms, Collection snippets)
     {
         String rawText = property.getString();

         log.debug("Iterating on property {}", property.getName()); //$NON-NLS-1$
         log.debug("Property value is {}", rawText); //$NON-NLS-1$

         // a quick and buggy way to avoid configuration properties, we should allow the user to
         // configure a list of nodeData to search for...
         if (rawText == null || rawText.length() < 20)
         {
             return false;
         }

         // lower-cased once per property instead of once per search term
         String rawLower = rawText.toLowerCase();

         // tag-free, unescaped version of the property value. It is computed lazily and only ONCE: stripping and
         // unescaping the same text again for every term would eat real content such as "1 < 2 and 3 > 2"
         String plainText = null;
         String plainLower = null;

         for (int j = 0; j < searchTerms.length; j++)
         {
             String searchTerm = StringUtils.lowerCase(searchTerms[j]);

             // exclude keywords and words shorter than 3 chars
             if (ArrayUtils.contains(SimpleSearchTag.KEYWORDS, searchTerm) || searchTerm.length() <= 2)
             {
                 continue;
             }

             log.debug("Looking for search term [{}] in [{}]", searchTerm, rawText); //$NON-NLS-1$

             if (plainText == null)
             {
                 // first check, avoid using heavy string replaceAll operations if the search term is not there
                 if (!StringUtils.contains(rawLower, searchTerm))
                 {
                     continue;
                 }
                 plainText = StringEscapeUtils.unescapeHtml(stripHtmlTags(rawText));
                 plainLower = plainText.toLowerCase();
             }

             // only get first matching keyword; the term may have been found inside a tag only
             int pos = plainLower.indexOf(searchTerm);
             if (pos < 0)
             {
                 continue;
             }

             StringBuffer snippet = buildSnippet(plainText, pos, searchTerm.length());

             log.debug("Search term found, adding snippet {}", snippet); //$NON-NLS-1$

             snippets.add(snippet);
             if (snippets.size() >= this.maxSnippets)
             {
                 return true;
             }
         }

         return false;
     }

     /**
      * Builds the snippet for a single match: about <code>chars</code> characters of text around the match, with the
      * first and last words completed and the match wrapped in <code>&lt;strong&gt;</code>.
      * @param text plain (tag-free, unescaped) text containing the match
      * @param pos index of the first character of the match in <code>text</code>
      * @param termLength length of the matched term
      * @return the snippet as an HTML fragment; "... " is added on each side where text has been left out
      */
     private StringBuffer buildSnippet(String text, int pos, int termLength)
     {
         int length = text.length();

         // clamped because lower-casing may change the length of some strings, so an index computed on the
         // lower-cased text is not guaranteed to fit the original one
         int matchStart = Math.min(pos, length);
         int matchEnd = Math.min(matchStart + termLength, length);

         // window of about "chars" characters starting chars/2 before the match; it never cuts the match itself
         int from = Math.min(Math.max(matchStart - this.chars / 2, 0), matchStart);
         int to = Math.max(Math.min(from + this.chars, length), matchEnd);

         // complete the first word: if the window starts inside a word, move back to the start of that word
         // (index 0 included)
         int start = from;
         if (start < length && !Character.isWhitespace(text.charAt(start)))
         {
             while (start > 0 && !Character.isWhitespace(text.charAt(start - 1)))
             {
                 start--;
             }
         }

         // complete the last word: move forward up to the next whitespace or comma
         int end = to;
         while (end < length && !Character.isWhitespace(text.charAt(end)) && ',' != text.charAt(end))
         {
             end++;
         }

         // the text has been unescaped, so each piece is escaped again before being mixed with markup: the snippet
         // is printed as is into the page and must not let content inject tags
         StringBuffer snippet = new StringBuffer();
         if (start > 0)
         {
             snippet.append("... "); //$NON-NLS-1$
         }
         snippet.append(StringEscapeUtils.escapeHtml(StringUtils.substring(text, start, matchStart)));
         snippet.append("<strong>"); //$NON-NLS-1$
         snippet.append(StringEscapeUtils.escapeHtml(StringUtils.substring(text, matchStart, matchEnd)));
         snippet.append("</strong>"); //$NON-NLS-1$
         snippet.append(StringEscapeUtils.escapeHtml(StringUtils.substring(text, matchEnd, end)));
         if (end < length)
         {
             snippet.append("... "); //$NON-NLS-1$
         }

         return snippet;
     }

     /**
      * Removes everything that looks like an HTML tag from the given text.
      * @param input text that may contain HTML tags
      * @return the input without the parts matched by the tag pattern
      */
     protected String stripHtmlTags(String input)
     {
         return HTML_STRIP.matcher(input).replaceAll("");
     }

     /**
      * Resets all the attributes to their default values.
      * @see javax.servlet.jsp.tagext.TagSupport#release()
      */
     @Override
     public void release()
     {
         this.query = null;
         this.page = null;
         this.chars = 100;
         this.maxSnippets = 3;
         super.release();
     }

 }
	// @author cstrap PATCH: unescape the string and complete the first and last words of each snippet

	/**
	* This file Copyright (c) 2003-2011 Magnolia International
	* Ltd. (http://www.magnolia-cms.com). All rights reserved.
	*
	*
	* This file is dual-licensed under both the Magnolia
	* Network Agreement and the GNU General Public License.
	* You may elect to use one or the other of these licenses.
	*
	* This file is distributed in the hope that it will be
	* useful, but AS-IS and WITHOUT ANY WARRANTY; without even the
	* implied warranty of MERCHANTABILITY or FITNESS FOR A
	* PARTICULAR PURPOSE, TITLE, or NONINFRINGEMENT.
	* Redistribution, except as permitted by whichever of the GPL
	* or MNA you select, is prohibited.
	*
	* 1. For the GPL license (GPL), you can redistribute and/or
	* modify this file under the terms of the GNU General
	* Public License, Version 3, as published by the Free Software
	* Foundation. You should have received a copy of the GNU
	* General Public License, Version 3 along with this program;
	* if not, write to the Free Software Foundation, Inc., 51
	* Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
	*
	* 2. For the Magnolia Network Agreement (MNA), this file
	* and the accompanying materials are made available under the
	* terms of the MNA which accompanies this distribution, and
	* is available at http://www.magnolia-cms.com/mna.html
	*
	* Any modifications to this file must keep this entire header
	* intact.
	*
	*/
	package info.magnolia.cms.taglibs.util;

	import info.magnolia.cms.core.Content;
	import info.magnolia.cms.core.ItemType;
	import info.magnolia.cms.core.NodeData;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Collection;
	import java.util.Iterator;
	import java.util.regex.Pattern;

	import javax.jcr.PropertyType;
	import javax.servlet.jsp.JspException;
	import javax.servlet.jsp.JspWriter;
	import javax.servlet.jsp.tagext.TagSupport;

	import org.apache.commons.lang.ArrayUtils;
	import org.apache.commons.lang.CharUtils;
	import org.apache.commons.lang.StringEscapeUtils;
	import org.apache.commons.lang.StringUtils;
	import org.apache.commons.lang.exception.NestableRuntimeException;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;


	/**
	 * Outputs a set of snippets taken from any paragraph in the given page matching any of the search terms.
	 * @jsp.tag name="searchResultSnippet" body-content="empty"
	 * @jsp.tag-example <pre>
	 * &lt;cmsu:simplesearch query="${param.search}" var="results" /&gt;
	 * &lt;c:forEach items="${results}" var="page"&gt;
	 *   &lt;cmsu:searchResultSnippet query="${param.search}" page="${page}" /&gt;
	 * &lt;/c:forEach&gt;
	 *</pre>
	 * @author Fabrizio Giustina
	 * @version $Revision: 41137 $ ($Author: gjoseph $)
	 */
	@SuppressWarnings("serial")
	public class SearchResultSnippetTag extends TagSupport
	{

	    /**
	     * Matches anything that looks like an HTML tag; DOTALL lets a single tag span several lines.
	     */
	    private static final Pattern HTML_STRIP = Pattern.compile("<.*?>", Pattern.DOTALL);

	    private static final Logger log = LoggerFactory.getLogger(SearchResultSnippetTag.class);

	    /**
	     * Page whose paragraphs are scanned for snippets.
	     */
	    private Content page;

	    /**
	     * Query, natural language.
	     */
	    private String query;

	    /**
	     * Number of chars to include in result.
	     */
	    private int chars = 100;

	    /**
	     * Maximum number of snippets to include in result.
	     */
	    private int maxSnippets = 3;

	    /**
	     * Search query.
	     * @jsp.attribute required="true" rtexprvalue="true"
	     */
	    public void setQuery(String query)
	    {
	        this.query = query;
	    }

	    /**
	     * Number of characters to include in search snippets. Default is 100.
	     * @jsp.attribute required="false" rtexprvalue="true" type="int"
	     */
	    public void setChars(int chars)
	    {
	        this.chars = chars;
	    }

	    /**
	     * Maximum number of snippets to print out.
	     * @jsp.attribute required="false" rtexprvalue="true" type="int"
	     */
	    public void setMaxSnippets(int maxSnippets)
	    {
	        this.maxSnippets = maxSnippets;
	    }

	    /**
	     * A Content node of type mgnl:content (a magnolia page), typically returned by the simpleSearch tag.
	     * @jsp.attribute required="true" rtexprvalue="true" type="info.magnolia.cms.core.Content"
	     */
	    public void setPage(Content page)
	    {
	        this.page = page;
	    }

	    /**
	     * Prints every snippet returned by {@link #getSnippets()} to the page output, one per line.
	     * @return always <code>SKIP_BODY</code>
	     * @see javax.servlet.jsp.tagext.TagSupport#doStartTag()
	     */
	    @SuppressWarnings("rawtypes")
	    @Override
	    public int doStartTag() throws JspException
	    {

	        JspWriter out = this.pageContext.getOut();
	        try
	        {
	            Iterator iterator = getSnippets().iterator();
	            while (iterator.hasNext())
	            {
	                out.println(iterator.next());
	            }
	        }
	        catch (IOException e)
	        {
	            // should never happen
	            throw new NestableRuntimeException(e);
	        }
	        // the tag is declared with body-content="empty": SKIP_BODY is the valid "no body" answer for doStartTag,
	        // EVAL_PAGE is only meaningful as a doEndTag return value
	        return SKIP_BODY;
	    }

	    /**
	     * Extract a collection of snippets from any paragraph in the given page. Binary properties are ignored. The
	     * search stops as soon as the number of collected snippets reaches <code>maxSnippets</code>.
	     * @return a collection of snippets, empty if there is no page or no search term; each element is a
	     * StringBuffer holding an HTML fragment in which the text is HTML-escaped and the matched term is wrapped in
	     * <code>&lt;strong&gt;</code>
	     * @todo avoid overlapping snippets (use regexp instead of simple indexOfs)
	     * @todo only extract snippets from user-configured properties
	     * @todo abbreviate on whitespace and punctuation, detect start of sentences
	     * @todo replace ampersand in regexp
	     * @todo write junits
	     */
	    @SuppressWarnings({"rawtypes", "unchecked" })
	    public Collection getSnippets()
	    {

	        log.debug("collecting snippets"); //$NON-NLS-1$

	        Collection snippets = new ArrayList();
	        String[] searchTerms = StringUtils.split(this.query);

	        // StringUtils.split returns null for a null query: nothing to search for, or nothing to search in
	        if (this.page == null || searchTerms == null || searchTerms.length == 0)
	        {
	            return snippets;
	        }

	        Iterator collectionIterator = this.page.getChildren(ItemType.CONTENTNODE).iterator();
	        while (collectionIterator.hasNext())
	        {
	            Content paragraphCollection = (Content) collectionIterator.next();

	            Iterator paragraphIterator = paragraphCollection.getChildren().iterator();
	            while (paragraphIterator.hasNext())
	            {
	                Content paragraph = (Content) paragraphIterator.next();

	                log.debug("Iterating on paragraph {}", paragraph); //$NON-NLS-1$

	                Iterator dataIterator = paragraph.getNodeDataCollection().iterator();
	                while (dataIterator.hasNext())
	                {
	                    NodeData property = (NodeData) dataIterator.next();
	                    if (property.getType() != PropertyType.BINARY && addSnippets(property, searchTerms, snippets))
	                    {
	                        log.debug("Maximum number of snippets ({}) reached, exiting", //$NON-NLS-1$
	                            Integer.toString(this.maxSnippets));
	                        return snippets;
	                    }
	                }
	            }
	        }

	        return snippets;
	    }

	    /**
	     * Adds one snippet to the given collection for every search term found in the given property.
	     * @param property non-binary property to look into
	     * @param searchTerms terms of the query, already split on whitespace
	     * @param snippets collection the new snippets are added to
	     * @return <code>true</code> as soon as the collection holds at least <code>maxSnippets</code> elements
	     */
	    @SuppressWarnings({"rawtypes", "unchecked" })
	    private boolean addSnippets(NodeData property, String[] searchTerms, Collection snippets)
	    {
	        String rawText = property.getString();

	        log.debug("Iterating on property {}", property.getName()); //$NON-NLS-1$
	        log.debug("Property value is {}", rawText); //$NON-NLS-1$

	        // a quick and buggy way to avoid configuration properties, we should allow the user to
	        // configure a list of nodeData to search for...
	        if (rawText == null || rawText.length() < 20)
	        {
	            return false;
	        }

	        // lower-cased once per property instead of once per search term
	        String rawLower = rawText.toLowerCase();

	        // tag-free, unescaped version of the property value. It is computed lazily and only ONCE: stripping and
	        // unescaping the same text again for every term would eat real content such as "1 < 2 and 3 > 2"
	        String plainText = null;
	        String plainLower = null;

	        for (int j = 0; j < searchTerms.length; j++)
	        {
	            String searchTerm = StringUtils.lowerCase(searchTerms[j]);

	            // exclude keywords and words shorter than 3 chars
	            if (ArrayUtils.contains(SimpleSearchTag.KEYWORDS, searchTerm) || searchTerm.length() <= 2)
	            {
	                continue;
	            }

	            log.debug("Looking for search term [{}] in [{}]", searchTerm, rawText); //$NON-NLS-1$

	            if (plainText == null)
	            {
	                // first check, avoid using heavy string replaceAll operations if the search term is not there
	                if (!StringUtils.contains(rawLower, searchTerm))
	                {
	                    continue;
	                }
	                plainText = StringEscapeUtils.unescapeHtml(stripHtmlTags(rawText));
	                plainLower = plainText.toLowerCase();
	            }

	            // only get first matching keyword; the term may have been found inside a tag only
	            int pos = plainLower.indexOf(searchTerm);
	            if (pos < 0)
	            {
	                continue;
	            }

	            StringBuffer snippet = buildSnippet(plainText, pos, searchTerm.length());

	            log.debug("Search term found, adding snippet {}", snippet); //$NON-NLS-1$

	            snippets.add(snippet);
	            if (snippets.size() >= this.maxSnippets)
	            {
	                return true;
	            }
	        }

	        return false;
	    }

	    /**
	     * Builds the snippet for a single match: about <code>chars</code> characters of text around the match, with the
	     * first and last words completed and the match wrapped in <code>&lt;strong&gt;</code>.
	     * @param text plain (tag-free, unescaped) text containing the match
	     * @param pos index of the first character of the match in <code>text</code>
	     * @param termLength length of the matched term
	     * @return the snippet as an HTML fragment; "... " is added on each side where text has been left out
	     */
	    private StringBuffer buildSnippet(String text, int pos, int termLength)
	    {
	        int length = text.length();

	        // clamped because lower-casing may change the length of some strings, so an index computed on the
	        // lower-cased text is not guaranteed to fit the original one
	        int matchStart = Math.min(pos, length);
	        int matchEnd = Math.min(matchStart + termLength, length);

	        // window of about "chars" characters starting chars/2 before the match; it never cuts the match itself
	        int from = Math.min(Math.max(matchStart - this.chars / 2, 0), matchStart);
	        int to = Math.max(Math.min(from + this.chars, length), matchEnd);

	        // complete the first word: if the window starts inside a word, move back to the start of that word
	        // (index 0 included)
	        int start = from;
	        if (start < length && !Character.isWhitespace(text.charAt(start)))
	        {
	            while (start > 0 && !Character.isWhitespace(text.charAt(start - 1)))
	            {
	                start--;
	            }
	        }

	        // complete the last word: move forward up to the next whitespace or comma
	        int end = to;
	        while (end < length && !Character.isWhitespace(text.charAt(end)) && ',' != text.charAt(end))
	        {
	            end++;
	        }

	        // the text has been unescaped, so each piece is escaped again before being mixed with markup: the snippet
	        // is printed as is into the page and must not let content inject tags
	        StringBuffer snippet = new StringBuffer();
	        if (start > 0)
	        {
	            snippet.append("... "); //$NON-NLS-1$
	        }
	        snippet.append(StringEscapeUtils.escapeHtml(StringUtils.substring(text, start, matchStart)));
	        snippet.append("<strong>"); //$NON-NLS-1$
	        snippet.append(StringEscapeUtils.escapeHtml(StringUtils.substring(text, matchStart, matchEnd)));
	        snippet.append("</strong>"); //$NON-NLS-1$
	        snippet.append(StringEscapeUtils.escapeHtml(StringUtils.substring(text, matchEnd, end)));
	        if (end < length)
	        {
	            snippet.append("... "); //$NON-NLS-1$
	        }

	        return snippet;
	    }

	    /**
	     * Removes everything that looks like an HTML tag from the given text.
	     * @param input text that may contain HTML tags
	     * @return the input without the parts matched by the tag pattern
	     */
	    protected String stripHtmlTags(String input)
	    {
	        return HTML_STRIP.matcher(input).replaceAll("");
	    }

	    /**
	     * Resets all the attributes to their default values.
	     * @see javax.servlet.jsp.tagext.TagSupport#release()
	     */
	    @Override
	    public void release()
	    {
	        this.query = null;
	        this.page = null;
	        this.chars = 100;
	        this.maxSnippets = 3;
	        super.release();
	    }

	}