BillCacy · October 19, 2015 16:04
diff --git a/PageContentField.cs b/PageContentField.cs
 using System;
 using System.Linq;
 using System.Net;
 using System.Text;
 using HtmlAgilityPack;
 using Sitecore.ContentSearch;
 using Sitecore.ContentSearch.ComputedFields;
 using Sitecore.Data;
 using Sitecore.Data.Items;
 using Sitecore.Diagnostics;
 using Sitecore.Links;
 using Sitecore.Web;

 namespace YOURNAMESPACE.ComputedFields
 {
   /// <summary>
   /// Computed index field that requests an item's rendered page over HTTP and returns the
   /// text of the page's body element with markup stripped, so the content can be searched.
   /// </summary>
   public class PageContentField : IComputedIndexField
   {
      // Only items whose template has this name are rendered and indexed.
      private const string PageTemplateName = "PageBase";

      // A raw value of "1" in this field keeps the page out of the index.
      private const string ExcludeFromSearchFieldName = "Exclude Page From Search";

      // Direct children of the parsed node with these names contribute no text.
      private static readonly string[] SkippedNodeNames =
      {
         "noscript", "script", "header", "footer", "nav", "#comment"
      };

      /// <summary>Gets or sets the index field name; not read by this class itself.</summary>
      public string FieldName { get; set; }

      /// <summary>Gets or sets the field's return type; not read by this class itself.</summary>
      public string ReturnType { get; set; }

      /// <summary>
      /// Builds the indexed value for a page item: the item ID, a pipe character and the page text.
      /// </summary>
      /// <param name="indexable">The indexable being processed; must not be null.</param>
      /// <returns>
      /// "{item ID}|{body text}" for page items. Null when the indexable is not a "PageBase"
      /// item, when the item is flagged as excluded from search, or when requesting or parsing
      /// the page fails (the failure is logged, not rethrown).
      /// </returns>
      public object ComputeFieldValue(IIndexable indexable)
      {
         Assert.ArgumentNotNull(indexable, "indexable");
         string url = null;
         try
         {
            Item item = indexable as SitecoreIndexableItem;

            // Only parsing page items
            if (item == null) return null;
            if (item.TemplateName != PageTemplateName) return null;

            // The field lookup can come back null when the item does not carry the field.
            // Treat that as "not excluded" instead of failing with a NullReferenceException,
            // which would be logged as an error and leave the page unindexed.
            var excludeField = item.Fields[ExcludeFromSearchFieldName];
            if (excludeField != null && excludeField.Value == "1") return null;

            // Determine the url to request. The link and the sc_database value are both
            // resolved while the context database is switched to the item's database.
            using (new DatabaseSwitcher(item.Database))
            {
               string itemUrl = LinkManager.GetItemUrl(item, new UrlOptions()
               {
                  AlwaysIncludeServerUrl = true,
                  LanguageEmbedding = LanguageEmbedding.Never
               });
               url = WebUtil.AddQueryString(itemUrl, "sc_database", Sitecore.Context.Database.Name);
            }

            // Http request the page
            string pageContent;
            using (var client = new WebClient())
            {
               // WebClient.Encoding defaults to Encoding.Default; decode as UTF-8 so non-ASCII
               // page text is not mangled. NOTE(review): assumes the site serves UTF-8 - confirm.
               client.Encoding = Encoding.UTF8;
               pageContent = client.DownloadString(url);
            }

            // Parse the page's html using HtmlAgilityPack
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(pageContent);

            // Strip out all the html tags, so we can index just the text.
            // A page without a body element yields "{item ID}|" with no text.
            HtmlNode body = htmlDocument.DocumentNode.Descendants("body").FirstOrDefault();
            string content = body != null ? GetAllInnerTexts(body) : null;
            return item.ID.ToString() + "|" + content;
         }
         catch (WebException webExc)
         {
            Log.Warn(string.Format("Failed to html index {0} ({1}): {2}", indexable.Id, url, webExc.Message), webExc, this);
         }
         catch (Exception exc)
         {
            Log.Error(string.Format("An error occurred when indexing {0}: {1}", indexable.Id, exc.Message), exc, this);
         }
         return null;
      }

      /// <summary>
      /// Joins the inner text of the node's direct children into one whitespace-normalized string.
      /// Children named noscript, script, header, footer, nav or #comment are skipped, as is any
      /// child whose class attribute contains "modal". The filter is applied to direct children
      /// only; each kept child contributes its whole InnerText.
      /// </summary>
      /// <param name="node">The node whose children are read (the body element when called from ComputeFieldValue).</param>
      /// <returns>The collected text with whitespace runs collapsed to single spaces.</returns>
      public string GetAllInnerTexts(HtmlNode node)
      {
         Assert.ArgumentNotNull(node, "node");
         var textBuilder = new StringBuilder();

         foreach (var child in node.ChildNodes)
         {
            if (SkippedNodeNames.Contains(child.Name)) continue;

            var cssClass = child.GetAttributeValue("class", "");
            if (cssClass.Contains("modal")) continue;

            // Separate siblings with a space. Appending trimmed texts back to back fused the
            // last word of one element with the first word of the next into a single token.
            textBuilder.Append(child.InnerText).Append(' ');
         }

         return RemoveWhitespace(textBuilder.ToString());
      }

      /// <summary>
      /// Storing whitespace for a field that is only used for searching is not useful.
      /// Collapses every run of whitespace (spaces, tabs, line breaks) into a single space
      /// and drops leading and trailing whitespace, in one pass over the input.
      /// </summary>
      /// <param name="inputStr">The text to normalize; null is treated like an empty string.</param>
      /// <returns>The normalized text, or an empty string for null or empty input.</returns>
      private string RemoveWhitespace(string inputStr)
      {
         if (string.IsNullOrEmpty(inputStr)) return string.Empty;

         var builder = new StringBuilder(inputStr.Length);
         bool pendingSpace = false;
         foreach (char c in inputStr)
         {
            if (char.IsWhiteSpace(c))
            {
               // Remember the gap but emit it only once more text follows; this drops
               // leading whitespace (nothing written yet) and trailing whitespace.
               pendingSpace = builder.Length > 0;
               continue;
            }

            if (pendingSpace)
            {
               builder.Append(' ');
               pendingSpace = false;
            }
            builder.Append(c);
         }
         return builder.ToString();
      }
   }
 }
	using System;
	using System.Linq;
	using System.Net;
	using System.Text;
	using HtmlAgilityPack;
	using Sitecore.ContentSearch;
	using Sitecore.ContentSearch.ComputedFields;
	using Sitecore.Data;
	using Sitecore.Data.Items;
	using Sitecore.Diagnostics;
	using Sitecore.Links;
	using Sitecore.Web;

	namespace YOURNAMESPACE.ComputedFields
	{
	/// <summary>
	/// Computed index field that requests an item's rendered page over HTTP and returns the
	/// text of the page's body element with markup stripped, so the content can be searched.
	/// </summary>
	public class PageContentField : IComputedIndexField
	{
	   // Only items whose template has this name are rendered and indexed.
	   private const string PageTemplateName = "PageBase";

	   // A raw value of "1" in this field keeps the page out of the index.
	   private const string ExcludeFromSearchFieldName = "Exclude Page From Search";

	   // Direct children of the parsed node with these names contribute no text.
	   private static readonly string[] SkippedNodeNames =
	   {
	      "noscript", "script", "header", "footer", "nav", "#comment"
	   };

	   /// <summary>Gets or sets the index field name; not read by this class itself.</summary>
	   public string FieldName { get; set; }

	   /// <summary>Gets or sets the field's return type; not read by this class itself.</summary>
	   public string ReturnType { get; set; }

	   /// <summary>
	   /// Builds the indexed value for a page item: the item ID, a pipe character and the page text.
	   /// </summary>
	   /// <param name="indexable">The indexable being processed; must not be null.</param>
	   /// <returns>
	   /// "{item ID}|{body text}" for page items. Null when the indexable is not a "PageBase"
	   /// item, when the item is flagged as excluded from search, or when requesting or parsing
	   /// the page fails (the failure is logged, not rethrown).
	   /// </returns>
	   public object ComputeFieldValue(IIndexable indexable)
	   {
	      Assert.ArgumentNotNull(indexable, "indexable");
	      string url = null;
	      try
	      {
	         Item item = indexable as SitecoreIndexableItem;

	         // Only parsing page items
	         if (item == null) return null;
	         if (item.TemplateName != PageTemplateName) return null;

	         // The field lookup can come back null when the item does not carry the field.
	         // Treat that as "not excluded" instead of failing with a NullReferenceException,
	         // which would be logged as an error and leave the page unindexed.
	         var excludeField = item.Fields[ExcludeFromSearchFieldName];
	         if (excludeField != null && excludeField.Value == "1") return null;

	         // Determine the url to request. The link and the sc_database value are both
	         // resolved while the context database is switched to the item's database.
	         using (new DatabaseSwitcher(item.Database))
	         {
	            string itemUrl = LinkManager.GetItemUrl(item, new UrlOptions()
	            {
	               AlwaysIncludeServerUrl = true,
	               LanguageEmbedding = LanguageEmbedding.Never
	            });
	            url = WebUtil.AddQueryString(itemUrl, "sc_database", Sitecore.Context.Database.Name);
	         }

	         // Http request the page
	         string pageContent;
	         using (var client = new WebClient())
	         {
	            // WebClient.Encoding defaults to Encoding.Default; decode as UTF-8 so non-ASCII
	            // page text is not mangled. NOTE(review): assumes the site serves UTF-8 - confirm.
	            client.Encoding = Encoding.UTF8;
	            pageContent = client.DownloadString(url);
	         }

	         // Parse the page's html using HtmlAgilityPack
	         HtmlDocument htmlDocument = new HtmlDocument();
	         htmlDocument.LoadHtml(pageContent);

	         // Strip out all the html tags, so we can index just the text.
	         // A page without a body element yields "{item ID}|" with no text.
	         HtmlNode body = htmlDocument.DocumentNode.Descendants("body").FirstOrDefault();
	         string content = body != null ? GetAllInnerTexts(body) : null;

	         // Plain "|" separator: the pasted "\|" is not a valid C# escape sequence.
	         return item.ID.ToString() + "|" + content;
	      }
	      catch (WebException webExc)
	      {
	         Log.Warn(string.Format("Failed to html index {0} ({1}): {2}", indexable.Id, url, webExc.Message), webExc, this);
	      }
	      catch (Exception exc)
	      {
	         Log.Error(string.Format("An error occurred when indexing {0}: {1}", indexable.Id, exc.Message), exc, this);
	      }
	      return null;
	   }

	   /// <summary>
	   /// Joins the inner text of the node's direct children into one whitespace-normalized string.
	   /// Children named noscript, script, header, footer, nav or #comment are skipped, as is any
	   /// child whose class attribute contains "modal". The filter is applied to direct children
	   /// only; each kept child contributes its whole InnerText.
	   /// </summary>
	   /// <param name="node">The node whose children are read (the body element when called from ComputeFieldValue).</param>
	   /// <returns>The collected text with whitespace runs collapsed to single spaces.</returns>
	   public string GetAllInnerTexts(HtmlNode node)
	   {
	      Assert.ArgumentNotNull(node, "node");
	      var textBuilder = new StringBuilder();

	      foreach (var child in node.ChildNodes)
	      {
	         if (SkippedNodeNames.Contains(child.Name)) continue;

	         var cssClass = child.GetAttributeValue("class", "");
	         if (cssClass.Contains("modal")) continue;

	         // Separate siblings with a space. Appending trimmed texts back to back fused the
	         // last word of one element with the first word of the next into a single token.
	         textBuilder.Append(child.InnerText).Append(' ');
	      }

	      return RemoveWhitespace(textBuilder.ToString());
	   }

	   /// <summary>
	   /// Storing whitespace for a field that is only used for searching is not useful.
	   /// Collapses every run of whitespace (spaces, tabs, line breaks) into a single space
	   /// and drops leading and trailing whitespace, in one pass over the input.
	   /// </summary>
	   /// <param name="inputStr">The text to normalize; null is treated like an empty string.</param>
	   /// <returns>The normalized text, or an empty string for null or empty input.</returns>
	   private string RemoveWhitespace(string inputStr)
	   {
	      if (string.IsNullOrEmpty(inputStr)) return string.Empty;

	      var builder = new StringBuilder(inputStr.Length);
	      bool pendingSpace = false;
	      foreach (char c in inputStr)
	      {
	         if (char.IsWhiteSpace(c))
	         {
	            // Remember the gap but emit it only once more text follows; this drops
	            // leading whitespace (nothing written yet) and trailing whitespace.
	            pendingSpace = builder.Length > 0;
	            continue;
	         }

	         if (pendingSpace)
	         {
	            builder.Append(' ');
	            pendingSpace = false;
	         }
	         builder.Append(c);
	      }
	      return builder.ToString();
	   }
	}
	}