Skip to content

Instantly share code, notes, and snippets.

@BillCacy
Last active October 19, 2015 16:04
Show Gist options
  • Save BillCacy/efda207fb7b5a9e97032 to your computer and use it in GitHub Desktop.
Save BillCacy/efda207fb7b5a9e97032 to your computer and use it in GitHub Desktop.
Computed index field for parsing page content and stripping out HTML, comments, scripts, etc.
using System;
using System.Linq;
using System.Net;
using System.Text;
using HtmlAgilityPack;
using Sitecore.ContentSearch;
using Sitecore.ContentSearch.ComputedFields;
using Sitecore.Data;
using Sitecore.Data.Items;
using Sitecore.Diagnostics;
using Sitecore.Links;
using Sitecore.Web;
namespace YOURNAMESPACE.ComputedFields
{
/// <summary>
/// Computed index field that requests the rendered page of an item over HTTP and
/// returns the text found in its body element, so that the page content can be
/// searched as plain text.
/// </summary>
public class PageContentField : IComputedIndexField
{
    // Only items whose template has this name are requested and parsed.
    private const string PageTemplateName = "PageBase";

    // Checkbox field that lets editors keep a page out of the index ("1" == checked).
    private const string ExcludeFromSearchFieldName = "Exclude Page From Search";

    public string FieldName { get; set; }

    public string ReturnType { get; set; }

    /// <summary>
    /// Builds the value to index for <paramref name="indexable"/>: the item id, a pipe
    /// character and the simplified text of the rendered page body.
    /// </summary>
    /// <param name="indexable">The item that is being indexed.</param>
    /// <returns>
    /// "{item id}|{page text}" for page items; <c>null</c> when the indexable is not an
    /// item, is not based on the page template, is flagged as excluded from search, or
    /// when requesting/parsing the page fails (the failure is logged).
    /// </returns>
    public object ComputeFieldValue(IIndexable indexable)
    {
        Assert.ArgumentNotNull(indexable, "indexable");

        // Declared outside the try block so the catch handlers can log it.
        string url = null;
        try
        {
            Item item = indexable as SitecoreIndexableItem;

            // Only parsing page items
            if (item == null || item.TemplateName != PageTemplateName)
            {
                return null;
            }

            if (IsExcludedFromSearch(item))
            {
                return null;
            }

            url = BuildPageUrl(item);

            // Http request the page
            string pageContent;
            using (var client = new WebClient())
            {
                // Decode as UTF-8 instead of relying on WebClient's default Encoding.
                client.Encoding = Encoding.UTF8;
                pageContent = client.DownloadString(url);
            }

            // Parse the page's html using HtmlAgilityPack
            var htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(pageContent);

            // Strip out all the html tags, so we can index just the text
            HtmlNode body = htmlDocument.DocumentNode.Descendants("body").FirstOrDefault();
            string content = body != null ? GetAllInnerTexts(body) : null;

            return item.ID.ToString() + "|" + content;
        }
        catch (WebException webExc)
        {
            Log.Warn(string.Format("Failed to html index {0} ({1}): {2}", indexable.Id, url, webExc.Message), webExc, this);
        }
        catch (Exception exc)
        {
            Log.Error(string.Format("An error occurred when indexing {0}: {1}", indexable.Id, exc.Message), exc, this);
        }

        return null;
    }

    /// <summary>
    /// Find all inner texts and return a simplified string.
    /// Text inside noscript, script, style, header, footer and nav elements, inside
    /// elements whose class attribute contains "modal", and inside html comments is
    /// left out at every nesting level.
    /// </summary>
    /// <param name="node">The node whose child nodes are scanned for text.</param>
    /// <returns>The collected text with every whitespace run collapsed to one space.</returns>
    public string GetAllInnerTexts(HtmlNode node)
    {
        Assert.ArgumentNotNull(node, "node");

        var textBuilder = new StringBuilder();
        foreach (HtmlNode child in node.ChildNodes)
        {
            AppendIndexableText(child, textBuilder);

            // Separate the text of adjacent nodes so their boundary words do not merge.
            textBuilder.Append(' ');
        }

        return RemoveWhitespace(textBuilder.ToString());
    }

    /// <summary>
    /// Tells whether the item's "exclude from search" checkbox is ticked. An item whose
    /// template does not define the field is treated as not excluded.
    /// </summary>
    private static bool IsExcludedFromSearch(Item item)
    {
        var excludeField = item.Fields[ExcludeFromSearchFieldName];
        return excludeField != null && excludeField.Value == "1";
    }

    /// <summary>
    /// Determines the absolute url to request for the item, with the item's database
    /// passed along in the sc_database query string parameter.
    /// </summary>
    private static string BuildPageUrl(Item item)
    {
        // The link is generated while the item's database is the context database.
        using (new DatabaseSwitcher(item.Database))
        {
            var urlOptions = new UrlOptions()
            {
                AlwaysIncludeServerUrl = true,
                LanguageEmbedding = LanguageEmbedding.Never
            };
            string itemUrl = LinkManager.GetItemUrl(item, urlOptions);

            return WebUtil.AddQueryString(itemUrl, "sc_database", Sitecore.Context.Database.Name);
        }
    }

    /// <summary>
    /// Appends the text of <paramref name="node"/> and of its descendants to
    /// <paramref name="builder"/>, skipping excluded elements and comments.
    /// </summary>
    private static void AppendIndexableText(HtmlNode node, StringBuilder builder)
    {
        switch (node.NodeType)
        {
            case HtmlNodeType.Text:
                builder.Append(node.InnerText);
                break;

            case HtmlNodeType.Element:
                if (IsExcludedElement(node))
                {
                    // Leave a gap where the element was so its neighbours stay separate words.
                    builder.Append(' ');
                    break;
                }

                foreach (HtmlNode child in node.ChildNodes)
                {
                    AppendIndexableText(child, builder);
                }

                break;

            default:
                // Comments (and any other node type) carry no text worth indexing.
                break;
        }
    }

    /// <summary>
    /// Tells whether an element and everything below it must be kept out of the index.
    /// </summary>
    private static bool IsExcludedElement(HtmlNode element)
    {
        switch (element.Name)
        {
            case "noscript":
            case "script":
            case "style":
            case "header":
            case "footer":
            case "nav":
                return true;
        }

        return element.GetAttributeValue("class", "").Contains("modal");
    }

    /// <summary>
    /// Storing whitespace for a field that is only to be used for searching in is not very useful.
    /// This method collapses every run of whitespace characters (spaces, tabs, line breaks)
    /// into a single space and drops leading and trailing whitespace.
    /// </summary>
    /// <param name="inputStr">The text to simplify; may be null or empty.</param>
    /// <returns>The simplified text; an empty string for null or empty input.</returns>
    private static string RemoveWhitespace(string inputStr)
    {
        if (string.IsNullOrEmpty(inputStr))
        {
            return string.Empty;
        }

        var collapsed = new StringBuilder(inputStr.Length);
        bool pendingSpace = false;

        foreach (char c in inputStr)
        {
            if (char.IsWhiteSpace(c))
            {
                // Remember the gap, but only once some text was written (no leading space).
                pendingSpace = collapsed.Length > 0;
                continue;
            }

            // The space is written lazily, so trailing whitespace never reaches the output.
            if (pendingSpace)
            {
                collapsed.Append(' ');
                pendingSpace = false;
            }

            collapsed.Append(c);
        }

        return collapsed.ToString();
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment