Last active
December 27, 2015 12:26
-
-
Save ksasao/3664fe9c59146c14051d to your computer and use it in GitHub Desktop.
Googleの画像検索結果の文字列から主要な語句を自動抽出するコード。License WTFPL。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Text; | |
using System.Threading.Tasks; | |
namespace Mpga.ML | |
{ | |
/// <summary> | |
/// Google 画像検索の検索結果文字列向け単語抽出クラス | |
/// </summary> | |
public class WordExtractor | |
{ | |
/// <summary> | |
/// 文字列に含まれるもっとも主要な語を返す | |
/// </summary> | |
/// <param name="data">単語の重複がある文字列</param> | |
/// <returns>単語</returns> | |
public string Extract(string data) | |
{ | |
var sa = new SuffixArray(data); | |
var words = sa.Words; | |
int minCount = 3; // 単語とみなす最小の出現数 | |
int minLength = 2; // 単語とみなす最小の文字列長 | |
// 単語をカウント | |
Dictionary<string, int> wordCount = new Dictionary<string, int>(); | |
for (int i = 1; i < words.Length; i++) | |
{ | |
if(words[i].Lcp > 0) | |
{ | |
string w = words[i].GetSubstring().Substring(0, words[i].Lcp).Trim(); | |
if (wordCount.ContainsKey(w)) | |
{ | |
wordCount[w]++; | |
} | |
else if(!w.StartsWith("#") && !w.StartsWith("#") && !w.StartsWith("..") && !w.StartsWith("【") && !w.StartsWith("ID:") | |
&& !w.EndsWith("」") | |
&& !w.Contains("まとめ") && !w.Contains("壁紙") && !w.Contains("画像") | |
&& !w.Contains("witter") && !w.Contains("join the")) // for Google Search | |
{ | |
wordCount.Add(w, 1); | |
} | |
} | |
} | |
// しきい値以上の単語のみを抽出 | |
var list = ((from c in wordCount | |
where c.Key.Length >= minLength && c.Value >= minCount | |
orderby c.Value descending | |
select new { Key = c.Key, Value = c.Value })).ToArray(); | |
// 語が他の語の部分文字列になっている場合には、 | |
// 部分文字列の頻度が同じか、より小さい場合に、 | |
// 他の語の出現数にその語の出現数を加える | |
// らき = 4, ☆すた = 3, らき☆すた = 3 なら、この処理の後には、 | |
// らき = 4, ☆すた = 3, らき☆すた = 6 となる。 | |
int bestIndex = 0; | |
int bestCount = 0; | |
for(int i=0; i < list.Length; i++) | |
{ | |
int count = 0; | |
for(int j=0; j < list.Length; j++) | |
{ | |
if (list[i].Key.IndexOf(list[j].Key) >= 0 && list[i].Value >= list[j].Value) | |
{ | |
count += list[j].Value; | |
} | |
} | |
if(bestCount < count) | |
{ | |
bestCount = count; | |
bestIndex = i; | |
} | |
} | |
// 最頻出の語を返す | |
return list.Length > 0 ? list[bestIndex].Key : ""; | |
} | |
} | |
public class SuffixArray | |
{ | |
/// <summary> | |
/// 元の文字列 | |
/// </summary> | |
public string Data { get; private set; } | |
public Word[] Words { get; private set; } | |
public SuffixArray(string data) | |
{ | |
this.Data = data; | |
// 初期化 | |
List<Word> words = new List<Word>(); | |
for(int i=0; i < data.Length; i++) | |
{ | |
words.Add(new Word(data) { Position = i }); | |
} | |
// 辞書順にソート | |
var sorted = words.OrderByDescending(c => c.GetSubstring(), StringComparer.Ordinal); | |
this.Words = sorted.ToArray(); | |
// 直前のwordと何文字一致しているかを求める | |
for(int i=1; i< words.Count; i++) | |
{ | |
int count = 0; | |
string w1 = this.Words[i - 1].GetSubstring(); | |
string w2 = this.Words[i].GetSubstring(); | |
for (int j=0; j< Math.Min(w1.Length, w2.Length); j++) | |
{ | |
if (w1[j] == w2[j]) count++; | |
else break; | |
} | |
this.Words[i].Lcp = count; | |
} | |
} | |
} | |
public class Word | |
{ | |
public int Position { get; set; } | |
public int Lcp { get; set; } | |
private readonly string _data; | |
public Word(string data) | |
{ | |
_data = data; | |
} | |
public string GetSubstring() | |
{ | |
return _data.Substring(this.Position); | |
} | |
public override string ToString() | |
{ | |
return string.Format("Position={0},Lcp={1},Str={2}", Position, Lcp, GetSubstring()); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment