Last active
October 19, 2015 16:04
-
-
Save BillCacy/efda207fb7b5a9e97032 to your computer and use it in GitHub Desktop.
Computed index field for parsing page content and stripping out HTML, comments, scripts, etc.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using HtmlAgilityPack;
using Sitecore.ContentSearch;
using Sitecore.ContentSearch.ComputedFields;
using Sitecore.Data;
using Sitecore.Data.Items;
using Sitecore.Diagnostics;
using Sitecore.Links;
using Sitecore.Web;
namespace YOURNAMESPACE.ComputedFields
{
    /// <summary>
    /// Computed index field that requests the rendered page of an item over HTTP,
    /// strips the markup and returns the remaining text so it can be indexed.
    /// </summary>
    public class PageContentField : IComputedIndexField
    {
        /// <summary>Only items whose template has this name are processed.</summary>
        private const string PageTemplateName = "PageBase";

        /// <summary>Field that, when set to "1", keeps a page out of the index.</summary>
        private const string ExcludeFromSearchFieldName = "Exclude Page From Search";

        /// <summary>
        /// Names of the direct child nodes whose text is left out of the indexed content.
        /// </summary>
        private static readonly HashSet<string> ExcludedNodeNames = new HashSet<string>(StringComparer.Ordinal)
        {
            "noscript", "script", "header", "footer", "nav", "#comment"
        };

        /// <summary>
        /// Part of the <see cref="IComputedIndexField"/> contract; not read by this class.
        /// </summary>
        public string FieldName { get; set; }

        /// <summary>
        /// Part of the <see cref="IComputedIndexField"/> contract; not read by this class.
        /// </summary>
        public string ReturnType { get; set; }

        /// <summary>
        /// Downloads the rendered page for <paramref name="indexable"/> and returns its
        /// body text prefixed with the item id and a pipe ("{id}|{text}").
        /// </summary>
        /// <param name="indexable">The indexable being processed; must not be null.</param>
        /// <returns>
        /// The id-prefixed page text, or <c>null</c> when the indexable is not an item,
        /// is not based on the page template, is flagged as excluded from search, or
        /// when requesting/parsing the page fails (the failure is logged).
        /// </returns>
        public object ComputeFieldValue(IIndexable indexable)
        {
            Assert.ArgumentNotNull(indexable, "indexable");

            string url = null;
            try
            {
                Item item = indexable as SitecoreIndexableItem;

                // Only parsing page items
                if (item == null || item.TemplateName != PageTemplateName)
                {
                    return null;
                }

                // The field may be absent from the item; treat that as "not excluded"
                // instead of failing with a null dereference and skipping the page.
                var excludeField = item.Fields[ExcludeFromSearchFieldName];
                if (excludeField != null && excludeField.Value == "1")
                {
                    return null;
                }

                url = BuildPageUrl(item);

                // Http request the page
                using (var client = new WebClient())
                {
                    string pageContent = client.DownloadString(url);

                    // Parse the page's html using HtmlAgilityPack
                    var htmlDocument = new HtmlDocument();
                    htmlDocument.LoadHtml(pageContent);

                    // Strip out all the html tags, so we can index just the text
                    HtmlNode body = htmlDocument.DocumentNode.Descendants("body").FirstOrDefault();
                    string content = body != null ? GetAllInnerTexts(body) : null;

                    return item.ID.ToString() + "|" + content;
                }
            }
            catch (WebException webExc)
            {
                Log.Warn(string.Format("Failed to html index {0} ({1}): {2}", indexable.Id, url, webExc.Message), webExc, this);
            }
            catch (Exception exc)
            {
                Log.Error(string.Format("An error occurred when indexing {0}: {1}", indexable.Id, exc.Message), exc, this);
            }

            return null;
        }

        /// <summary>
        /// Collects the text of the direct children of <paramref name="node"/>, skipping
        /// script/noscript/header/footer/nav/comment nodes and any child whose class
        /// attribute contains "modal", and returns it as a single whitespace-normalized string.
        /// </summary>
        /// <param name="node">The node whose children are read; must not be null.</param>
        /// <returns>The collected text with whitespace runs collapsed and the ends trimmed.</returns>
        public string GetAllInnerTexts(HtmlNode node)
        {
            Assert.ArgumentNotNull(node, "node");

            // NOTE(review): only direct children are filtered here; excluded elements
            // nested deeper may still contribute text through their parent's InnerText,
            // depending on how the HtmlAgilityPack version in use computes InnerText.
            var textBuilder = new StringBuilder();
            foreach (var child in node.ChildNodes)
            {
                if (ExcludedNodeNames.Contains(child.Name))
                {
                    continue;
                }

                if (child.GetAttributeValue("class", "").Contains("modal"))
                {
                    continue;
                }

                // Decode entities (&amp;, &nbsp;, ...) so the index holds the text a
                // visitor actually reads rather than the raw markup form.
                string text = WebUtility.HtmlDecode(child.InnerText).Trim();
                if (text.Length == 0)
                {
                    continue;
                }

                // A trailing separator keeps the last word of this element from being
                // fused with the first word of the next one.
                textBuilder.Append(text).Append(' ');
            }

            return RemoveWhitespace(textBuilder.ToString()).Trim();
        }

        /// <summary>
        /// Builds the absolute url used to request the rendered page of <paramref name="item"/>,
        /// with the item's database passed along in the sc_database query string parameter.
        /// </summary>
        /// <param name="item">The page item to build the url for.</param>
        /// <returns>The url to request.</returns>
        private static string BuildPageUrl(Item item)
        {
            // Resolve the link with the item's own database as the context database.
            using (new DatabaseSwitcher(item.Database))
            {
                var options = new UrlOptions
                {
                    AlwaysIncludeServerUrl = true,
                    LanguageEmbedding = LanguageEmbedding.Never
                };

                return WebUtil.AddQueryString(
                    LinkManager.GetItemUrl(item, options),
                    "sc_database", item.Database.Name);
            }
        }

        /// <summary>
        /// Storing whitespace for a field that is only used for searching is not very useful.
        /// This method collapses every run of whitespace characters into a single space.
        /// </summary>
        /// <param name="inputStr">The text to normalize; may be null or empty.</param>
        /// <returns>The normalized text, or an empty string for null or empty input.</returns>
        private static string RemoveWhitespace(string inputStr)
        {
            if (string.IsNullOrEmpty(inputStr))
            {
                return string.Empty;
            }

            var builder = new StringBuilder(inputStr.Length);
            bool previousWasWhitespace = false;

            foreach (char c in inputStr)
            {
                if (char.IsWhiteSpace(c))
                {
                    // Emit one space for the whole run (covers tabs, line breaks
                    // and non-breaking spaces as well as plain spaces).
                    if (!previousWasWhitespace)
                    {
                        builder.Append(' ');
                    }

                    previousWasWhitespace = true;
                }
                else
                {
                    builder.Append(c);
                    previousWasWhitespace = false;
                }
            }

            return builder.ToString();
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment