Skip to content

Instantly share code, notes, and snippets.

@compustar
Last active August 29, 2015 14:24
Show Gist options
  • Select an option

  • Save compustar/e48f4224f8f8de4c6427 to your computer and use it in GitHub Desktop.

Select an option

Save compustar/e48f4224f8f8de4c6427 to your computer and use it in GitHub Desktop.
CJK NGram Analyzer
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Globalization;
namespace ngram {
/// <summary>
/// Collects n-gram statistics from text and reports the n-grams that occur
/// frequently, grouped by how often they were seen.
/// </summary>
/// <remarks>
/// Each input line is split into tokens wherever the Unicode category of the
/// characters changes (a switch between upper- and lowercase letters does not
/// split). Tokens whose first character is <see cref="UnicodeCategory.OtherLetter"/>
/// (the category CJK ideographs belong to) are expanded into every substring of
/// at least the minimum length; all other tokens are counted as whole words, and
/// only when the analyzer was created with <c>cjkOnly == false</c>.
/// Characters are examined one UTF-16 code unit at a time, so characters encoded
/// as surrogate pairs are not treated as letters.
/// </remarks>
public class NGramAnalyzer {
    private const int DefaultMinLength = 2;
    private const int DefaultMinFrequency = 2;

    private readonly bool cjkOnly;
    private readonly int minLength;
    private readonly int minFrequency;
    private readonly Dictionary<string, int> rawStats = new Dictionary<string, int>();

    /// <summary>
    /// Creates an analyzer that only counts n-grams of <c>OtherLetter</c> tokens.
    /// </summary>
    public NGramAnalyzer()
        : this(true) {
    }

    /// <summary>
    /// Creates an analyzer with the default minimum n-gram length (2) and
    /// minimum frequency (2).
    /// </summary>
    /// <param name="cjkOnly">
    /// When <c>true</c>, tokens that do not start with an <c>OtherLetter</c>
    /// character are ignored; when <c>false</c>, they are counted as whole words.
    /// </param>
    public NGramAnalyzer(bool cjkOnly)
        : this(cjkOnly, DefaultMinLength, DefaultMinFrequency) {
    }

    /// <summary>
    /// Creates an analyzer with explicit thresholds.
    /// </summary>
    /// <param name="cjkOnly">See <see cref="NGramAnalyzer(bool)"/>.</param>
    /// <param name="minLength">Shortest n-gram (in UTF-16 code units) to count; at least 1.</param>
    /// <param name="minFrequency">Lowest occurrence count reported by <see cref="Count"/>; at least 1.</param>
    /// <exception cref="ArgumentOutOfRangeException">A threshold is smaller than 1.</exception>
    public NGramAnalyzer(bool cjkOnly, int minLength, int minFrequency) {
        if (minLength < 1) {
            throw new ArgumentOutOfRangeException("minLength", "The minimum n-gram length must be at least 1.");
        }
        if (minFrequency < 1) {
            throw new ArgumentOutOfRangeException("minFrequency", "The minimum frequency must be at least 1.");
        }
        this.cjkOnly = cjkOnly;
        this.minLength = minLength;
        this.minFrequency = minFrequency;
    }

    /// <summary>
    /// Reads all lines from <paramref name="fs"/> and adds them to the statistics.
    /// </summary>
    /// <remarks>
    /// The stream is wrapped in a <see cref="StreamReader"/> that is disposed when
    /// reading finishes, which also closes <paramref name="fs"/>.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
    public void AddData(Stream fs) {
        if (fs == null) {
            throw new ArgumentNullException("fs");
        }
        using (var reader = new StreamReader(fs)) {
            AddData(reader);
        }
    }

    /// <summary>
    /// Reads all remaining lines from <paramref name="reader"/> and adds them to
    /// the statistics. The reader is left open.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    public void AddData(TextReader reader) {
        if (reader == null) {
            throw new ArgumentNullException("reader");
        }
        string line;
        while ((line = reader.ReadLine()) != null) {
            ProcessLine(line);
        }
    }

    /// <summary>
    /// Splits one line into tokens and feeds each token to <see cref="AddCandidate"/>.
    /// </summary>
    private void ProcessLine(string line) {
        var previous = UnicodeCategory.Control;
        var buffer = new StringBuilder();
        foreach (var c in line) {
            var current = CharUnicodeInfo.GetUnicodeCategory(c);
            // A category change ends the current token, except for a switch
            // between upper- and lowercase so that words like "Hello" stay whole.
            if (previous != current && !IsCaseSwitch(previous, current)) {
                if (buffer.Length > 0) {
                    AddCandidate(buffer.ToString());
                }
                buffer.Length = 0;
            }
            if (IsTokenCategory(current)) {
                buffer.Append(c);
            }
            previous = current;
        }
        // Flush whatever is left at the end of the line (AddCandidate ignores "").
        AddCandidate(buffer.ToString());
    }

    /// <summary>True when the two categories are an upper/lowercase pair in either order.</summary>
    private static bool IsCaseSwitch(UnicodeCategory previous, UnicodeCategory current) {
        return (previous == UnicodeCategory.LowercaseLetter && current == UnicodeCategory.UppercaseLetter)
            || (previous == UnicodeCategory.UppercaseLetter && current == UnicodeCategory.LowercaseLetter);
    }

    /// <summary>True for the letter-like categories whose characters are kept in tokens.</summary>
    private static bool IsTokenCategory(UnicodeCategory category) {
        switch (category) {
            case UnicodeCategory.LetterNumber:
            case UnicodeCategory.LowercaseLetter:
            case UnicodeCategory.ModifierLetter:
            case UnicodeCategory.OtherLetter:
            case UnicodeCategory.TitlecaseLetter:
            case UnicodeCategory.UppercaseLetter:
                return true;
            default:
                return false;
        }
    }

    /// <summary>
    /// Groups the entries of <paramref name="raw"/> by frequency and removes
    /// n-grams that are covered by a longer n-gram of the same frequency.
    /// </summary>
    /// <param name="raw">N-gram to occurrence-count map.</param>
    /// <returns>Frequency to n-gram list map, as produced by <see cref="Finalize"/>.</returns>
    public Dictionary<int, List<string>> Analyze(Dictionary<string, int> raw) {
        return this.Finalize(this.Count(raw));
    }

    /// <summary>
    /// Analyzes the statistics collected so far through <c>AddData</c> and
    /// <see cref="AddCandidate"/>.
    /// </summary>
    public Dictionary<int, List<string>> Analyze() {
        return Analyze(rawStats);
    }

    /// <summary>
    /// Drops every word that is a substring of a longer word with the same frequency.
    /// </summary>
    /// <param name="frequency">Frequency to word list map (see <see cref="Count"/>).</param>
    /// <returns>A new map, filled in descending frequency order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="frequency"/> is null.</exception>
    public Dictionary<int, List<string>> Finalize(Dictionary<int, List<string>> frequency) {
        if (frequency == null) {
            throw new ArgumentNullException("frequency");
        }
        var result = new Dictionary<int, List<string>>();
        foreach (var entry in frequency.OrderByDescending(o => o.Key)) {
            var words = entry.Value;
            var kept = new List<string>();
            foreach (var word in words) {
                // A word contained in a longer word of equal frequency adds no
                // information, so only the longer one is reported.
                bool covered = words.Any(other => other.Length > word.Length && other.Contains(word));
                if (!covered) {
                    kept.Add(word);
                }
            }
            result[entry.Key] = kept;
        }
        return result;
    }

    /// <summary>
    /// Groups n-grams by occurrence count, keeping those seen at least the
    /// configured minimum number of times.
    /// </summary>
    /// <param name="raw">N-gram to occurrence-count map.</param>
    /// <returns>Frequency to n-gram list map, filled in descending frequency order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="raw"/> is null.</exception>
    public Dictionary<int, List<string>> Count(Dictionary<string, int> raw) {
        if (raw == null) {
            // Same parameter name the original LINQ call reported.
            throw new ArgumentNullException("source");
        }
        var result = new Dictionary<int, List<string>>();
        // ">=" so that an n-gram seen exactly minFrequency times is reported.
        foreach (var entry in raw.Where(o => o.Value >= minFrequency).OrderByDescending(o => o.Value)) {
            List<string> bucket;
            if (!result.TryGetValue(entry.Value, out bucket)) {
                bucket = new List<string>();
                result[entry.Value] = bucket;
            }
            bucket.Add(entry.Key);
        }
        return result;
    }

    /// <summary>
    /// Adds one token to the statistics.
    /// </summary>
    /// <remarks>
    /// A token starting with an <c>OtherLetter</c> character contributes every
    /// substring whose length is at least the minimum n-gram length. Any other
    /// token is counted as a whole, and only when <c>cjkOnly</c> is <c>false</c>.
    /// A null or empty token is ignored.
    /// </remarks>
    public void AddCandidate(string candidate) {
        if (string.IsNullOrEmpty(candidate)) {
            return;
        }
        if (CharUnicodeInfo.GetUnicodeCategory(candidate[0]) == UnicodeCategory.OtherLetter) {
            // The last valid start index is inclusive; stopping one short would
            // lose the trailing n-gram of every token.
            int lastStart = candidate.Length - minLength;
            for (int start = 0; start <= lastStart; start++) {
                for (int length = minLength; start + length <= candidate.Length; length++) {
                    AddNgram(candidate.Substring(start, length));
                }
            }
        }
        else if (!cjkOnly) {
            AddNgram(candidate);
        }
    }

    /// <summary>Increments the occurrence count of <paramref name="ngram"/>.</summary>
    private void AddNgram(string ngram) {
        int seen;
        rawStats.TryGetValue(ngram, out seen); // seen stays 0 when the key is new
        rawStats[ngram] = seen + 1;
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment