ksasao · December 27, 2015 12:26
diff --git a/WordExtractor.cs b/WordExtractor.cs
 using System;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
 using System.Threading.Tasks;

 namespace Mpga.ML
 {
    /// <summary>
    /// Google 画像検索の検索結果文字列向け単語抽出クラス
    /// </summary>
    public class WordExtractor
    {
        /// <summary>
        /// 文字列に含まれるもっとも主要な語を返す
        /// </summary>
        /// <param name="data">単語の重複がある文字列</param>
        /// <returns>単語</returns>
        public string Extract(string data)
        {
            var sa = new SuffixArray(data);
            var words = sa.Words;

            int minCount = 3;  // 単語とみなす最小の出現数
            int minLength = 2; // 単語とみなす最小の文字列長

            // 単語をカウント
            Dictionary<string, int> wordCount = new Dictionary<string, int>();
            for (int i = 1; i < words.Length; i++)
            {
                if(words[i].Lcp > 0)
                {
                    string w = words[i].GetSubstring().Substring(0, words[i].Lcp).Trim();
                    if (wordCount.ContainsKey(w))
                    {
                        wordCount[w]++;
                    }
                    else if(!w.StartsWith("#") && !w.StartsWith("#") && !w.StartsWith("..") && !w.StartsWith("【") && !w.StartsWith("ID:")
                        && !w.EndsWith("」")
                        && !w.Contains("まとめ") && !w.Contains("壁紙") && !w.Contains("画像")
                        && !w.Contains("witter") && !w.Contains("join the"))  // for Google Search
                    {
                        wordCount.Add(w, 1);
                    }
                }
            }

            // しきい値以上の単語のみを抽出
            var list = ((from c in wordCount
                       where c.Key.Length >= minLength && c.Value >= minCount
                       orderby c.Value descending
                       select new { Key = c.Key, Value = c.Value })).ToArray();

            // 語が他の語の部分文字列になっている場合には、
            // 部分文字列の頻度が同じか、より小さい場合に、
            // 他の語の出現数にその語の出現数を加える
            // らき = 4, ☆すた = 3, らき☆すた = 3 なら、この処理の後には、
            // らき = 4, ☆すた = 3, らき☆すた = 6 となる。
            int bestIndex = 0;
            int bestCount = 0;
            for(int i=0; i < list.Length; i++)
            {
                int count = 0;
                for(int j=0; j < list.Length; j++)
                {
                    if (list[i].Key.IndexOf(list[j].Key) >= 0 && list[i].Value >= list[j].Value)
                    {
                        count += list[j].Value;
                    }
                }
                
                if(bestCount < count)
                {
                    bestCount = count;
                    bestIndex = i;
                }
            }

            // 最頻出の語を返す
            return list.Length > 0 ? list[bestIndex].Key : "";
        }
    }

    public class SuffixArray
    {
        /// <summary>
        /// 元の文字列
        /// </summary>
        public string Data { get; private set; }
        public Word[] Words { get; private set; }
        public SuffixArray(string data)
        {
            this.Data = data;

            // 初期化
            List<Word> words = new List<Word>();
            for(int i=0; i < data.Length; i++)
            {
                words.Add(new Word(data) { Position = i });
            }
            // 辞書順にソート
            var sorted = words.OrderByDescending(c => c.GetSubstring(), StringComparer.Ordinal);
            this.Words = sorted.ToArray();

            // 直前のwordと何文字一致しているかを求める
            for(int i=1; i< words.Count; i++)
            {
                int count = 0;
                string w1 = this.Words[i - 1].GetSubstring();
                string w2 = this.Words[i].GetSubstring();
                for (int j=0; j< Math.Min(w1.Length, w2.Length); j++)
                {
                    if (w1[j] == w2[j]) count++;
                    else break;
                }
                this.Words[i].Lcp = count;
            }
        }
    }
    public class Word
    {
        public int Position { get; set; }
        public int Lcp { get; set; }

        private readonly string _data;

        public Word(string data)
        {
            _data = data;
        }
        public string GetSubstring()
        {
            return _data.Substring(this.Position);
        }
        public override string ToString()
        {
            return string.Format("Position={0},Lcp={1},Str={2}", Position, Lcp, GetSubstring());
        }
    }
 }
	using System;
	using System.Collections.Generic;
	using System.Linq;
	using System.Text;
	using System.Threading.Tasks;

	namespace Mpga.ML
	{
	/// <summary>
	/// Google 画像検索の検索結果文字列向け単語抽出クラス
	/// </summary>
	public class WordExtractor
	{
	/// <summary>
	/// 文字列に含まれるもっとも主要な語を返す
	/// </summary>
	/// <param name="data">単語の重複がある文字列</param>
	/// <returns>単語</returns>
	public string Extract(string data)
	{
	var sa = new SuffixArray(data);
	var words = sa.Words;

	int minCount = 3; // 単語とみなす最小の出現数
	int minLength = 2; // 単語とみなす最小の文字列長

	// 単語をカウント
	Dictionary<string, int> wordCount = new Dictionary<string, int>();
	for (int i = 1; i < words.Length; i++)
	{
	if(words[i].Lcp > 0)
	{
	string w = words[i].GetSubstring().Substring(0, words[i].Lcp).Trim();
	if (wordCount.ContainsKey(w))
	{
	wordCount[w]++;
	}
	else if(!w.StartsWith("#") && !w.StartsWith("#") && !w.StartsWith("..") && !w.StartsWith("【") && !w.StartsWith("ID:")
	&& !w.EndsWith("」")
	&& !w.Contains("まとめ") && !w.Contains("壁紙") && !w.Contains("画像")
	&& !w.Contains("witter") && !w.Contains("join the")) // for Google Search
	{
	wordCount.Add(w, 1);
	}
	}
	}

	// しきい値以上の単語のみを抽出
	var list = ((from c in wordCount
	where c.Key.Length >= minLength && c.Value >= minCount
	orderby c.Value descending
	select new { Key = c.Key, Value = c.Value })).ToArray();

	// 語が他の語の部分文字列になっている場合には、
	// 部分文字列の頻度が同じか、より小さい場合に、
	// 他の語の出現数にその語の出現数を加える
	// らき = 4, ☆すた = 3, らき☆すた = 3 なら、この処理の後には、
	// らき = 4, ☆すた = 3, らき☆すた = 6 となる。
	int bestIndex = 0;
	int bestCount = 0;
	for(int i=0; i < list.Length; i++)
	{
	int count = 0;
	for(int j=0; j < list.Length; j++)
	{
	if (list[i].Key.IndexOf(list[j].Key) >= 0 && list[i].Value >= list[j].Value)
	{
	count += list[j].Value;
	}
	}

	if(bestCount < count)
	{
	bestCount = count;
	bestIndex = i;
	}
	}

	// 最頻出の語を返す
	return list.Length > 0 ? list[bestIndex].Key : "";
	}
	}

	public class SuffixArray
	{
	/// <summary>
	/// 元の文字列
	/// </summary>
	public string Data { get; private set; }
	public Word[] Words { get; private set; }
	public SuffixArray(string data)
	{
	this.Data = data;

	// 初期化
	List<Word> words = new List<Word>();
	for(int i=0; i < data.Length; i++)
	{
	words.Add(new Word(data) { Position = i });
	}
	// 辞書順にソート
	var sorted = words.OrderByDescending(c => c.GetSubstring(), StringComparer.Ordinal);
	this.Words = sorted.ToArray();

	// 直前のwordと何文字一致しているかを求める
	for(int i=1; i< words.Count; i++)
	{
	int count = 0;
	string w1 = this.Words[i - 1].GetSubstring();
	string w2 = this.Words[i].GetSubstring();
	for (int j=0; j< Math.Min(w1.Length, w2.Length); j++)
	{
	if (w1[j] == w2[j]) count++;
	else break;
	}
	this.Words[i].Lcp = count;
	}
	}
	}
	public class Word
	{
	public int Position { get; set; }
	public int Lcp { get; set; }

	private readonly string _data;

	public Word(string data)
	{
	_data = data;
	}
	public string GetSubstring()
	{
	return _data.Substring(this.Position);
	}
	public override string ToString()
	{
	return string.Format("Position={0},Lcp={1},Str={2}", Position, Lcp, GetSubstring());
	}
	}
	}