Last active
March 24, 2020 09:13
-
-
Save mjs3339/f243f37bf38809bcd21d79054e353857 to your computer and use it in GitHub Desktop.
Chi Squared Byte Array Test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Concurrent; | |
using System.Collections.Generic; | |
using System.Drawing; | |
using System.Globalization; | |
using System.IO; | |
using System.Linq; | |
public static class ChiSquared | |
{ | |
/// <summary>
///     Expected relative frequency of each lowercase letter, indexed by alphabet position
///     (index 0 is 'a', index 25 is 'z').
///     Calculated from an English word dictionary containing over 466,000 words.
/// </summary>
private static readonly float[] _expectedPercentages =
{
    .0846f, // a
    .0189f, // b
    .0420f, // c
    .0353f, // d
    .1098f, // e
    .0125f, // f
    .0243f, // g
    .0274f, // h
    .0864f, // i
    .0018f, // j
    .0089f, // k
    .0574f, // l
    .0292f, // m
    .0715f, // n
    .0709f, // o
    .0310f, // p
    .0019f, // q
    .0704f, // r
    .0705f, // s
    .0647f, // t
    .0363f, // u
    .0099f, // v
    .0085f, // w
    .0028f, // x
    .0192f, // y
    .0041f  // z
};
/// <summary>
///     Reads the whole file into memory and reports whether its bytes pass the
///     chi-squared randomness test, which is used here as a stand-in for "compressed".
///     Not accurate 100% all of the time.
/// </summary>
/// <param name="path">Path of the file to read and test.</param>
/// <returns>The <c>isRandom</c> flag produced by <c>ChiSquaredTest</c> for the file's bytes.</returns>
public static bool IsFileCompressed(this string path)
{
    var (isRandom, _, _, _, _, _, _) = File.ReadAllBytes(path).ChiSquaredTest();
    return isRandom;
}
/// <summary>
///     Tests a buffer for randomness. The buffer is viewed as 32-bit words, each word is
///     assigned to one of (wordCount / 10) bins, and the chi-squared statistic of the bin
///     counts is compared against the band R +/- 2*sqrt(R).
///     Returned tuple:
///     isRandom      - true when the observed statistic lies inside [LowLimit, UpperLimit].
///     Quality       - chiSqValue / ExpectedChiSq; less than 1 or greater than 1 is off target.
///     Entropy       - 8 bit entropy of the buffer as a percentage of perfect disorder (100%).
///     ExpectedChiSq - the expected chi squared value R (number of bins minus one).
///     LowLimit      - (R - (2*sqrt(R)))
///     chiSqValue    - the observed chi squared value.
///     UpperLimit    - (R + (2*sqrt(R)))
/// </summary>
/// <param name="bArr">The byte array to test.</param>
/// <returns>
///     The tuple described above. A null buffer yields the default tuple (all zero / false).
///     A buffer whose entropy is below 95 yields (false, 0, entropy, 0, 0, 0, 0) without
///     running the chi-squared step.
/// </returns>
public static (bool isRandom, float Quality, float Entropy, int ExpectedChiSq, float LowLimit, float chiSqValue, float UpperLimit) ChiSquaredTest(this byte[] bArr)
{
    if (bArr == null)
    {
        return default;
    }

    var words = Ia(bArr);
    var entropy = Entropy(bArr);
    if (entropy < 95)
    {
        return (false, 0, entropy, 0, 0, 0, 0);
    }

    // NOTE(review): an entropy of 95% or more needs well over 190 bytes, so binCount is
    // expected to be at least 4 here and the modulo below cannot divide by zero.
    var n = words.Length;
    var binCount = n / 10;
    var r = binCount - 1;

    // Bins are numbered 1..binCount, so slot 0 stays unused. A plain array filled by a
    // sequential loop replaces the previous parallel ContainsKey/TryAdd/"+= 1" sequence on a
    // ConcurrentDictionary, which was not atomic and could drop increments between threads.
    var counts = new int[binCount + 1];
    foreach (var word in words)
    {
        // Widen to long first: Math.Abs(int.MinValue) throws OverflowException.
        var bin = (int) Math.Abs(Math.Abs((long) word) % binCount - binCount);
        counts[bin]++;
    }

    // Sum of squared bin counts; empty bins contribute zero.
    var sumOfSquares = 0.0;
    foreach (var count in counts)
    {
        sumOfSquares += (float) Math.Pow(count, 2);
    }

    var t = (float) sumOfSquares;
    var chiSq = Math.Abs(r * t / n - n);
    var spread = 2.0f * (float) Math.Sqrt(r);
    var lowLimit = r - spread;
    var upperLimit = r + spread;
    var isRandom = lowLimit <= chiSq && upperLimit >= chiSq;
    var quality = chiSq / r;
    return (isRandom, quality, entropy, r, lowLimit, chiSq, upperLimit);
}
/// <summary>
///     Reinterprets a byte array as an array of 32-bit integers by raw block copy.
///     When the byte count is not a multiple of four, the final integer is only partly
///     filled and its remaining bytes stay zero.
/// </summary>
/// <param name="ba">Source bytes.</param>
/// <returns>A new int array holding ceil(ba.Length / 4) elements.</returns>
private static int[] Ia(byte[] ba)
{
    var byteCount = ba.Length;
    // Round up so a trailing partial word still gets its own slot.
    var words = new int[(byteCount + 3) / 4];
    Buffer.BlockCopy(ba, 0, words, 0, byteCount);
    return words;
}
/// <summary>
///     Computes the Shannon entropy of the byte values in a buffer, expressed as a
///     percentage of the 8-bit maximum (100 means all 256 byte values are equally frequent).
/// </summary>
/// <param name="s">The buffer to measure.</param>
/// <returns>Entropy in the range 0..100; an empty buffer yields 0.</returns>
private static float Entropy(byte[] s)
{
    // Iterate the array directly. Deriving the loop bound from a float copy of the length
    // is wrong above 2^24 elements, where float rounding can move the bound past the end of
    // the array (IndexOutOfRangeException) or short of it (bytes silently ignored).
    var histogram = new int[256];
    foreach (var value in s)
    {
        histogram[value]++;
    }

    float len = s.Length;
    var result = 0f;
    foreach (var count in histogram)
    {
        var frequency = count / len;
        // Skips unused byte values, and every slot (0/0 = NaN) when the buffer is empty.
        if (frequency > 0)
        {
            result -= frequency * (float) Math.Log(frequency, 2);
        }
    }

    // Scale bits-per-byte (0..8) to a percentage.
    return result / 8f * 100f;
}
/// <summary>
///     Counts how many times a byte value occurs in a buffer.
/// </summary>
/// <param name="s">The buffer to scan.</param>
/// <param name="b">The byte value to count.</param>
/// <returns>The number of elements of <paramref name="s"/> equal to <paramref name="b"/>.</returns>
public static int ChiSquaredCount(this byte[] s, byte b)
{
    // Count matches directly; only one value is requested, so no histogram is needed.
    // Iterating the array also avoids a float-derived loop bound, which is inexact for
    // lengths above 2^24 and could overrun or cut short the scan.
    var count = 0;
    foreach (var value in s)
    {
        if (value == b)
        {
            count++;
        }
    }

    return count;
}
/// <summary>
///     Counts how many times a character occurs in a string.
/// </summary>
/// <param name="s">The text to scan.</param>
/// <param name="b">The character to count.</param>
/// <returns>The number of characters of <paramref name="s"/> equal to <paramref name="b"/>.</returns>
public static int ChiSquaredCount(this string s, char b)
{
    // Count matches directly. A 256-slot table indexed by char throws
    // IndexOutOfRangeException as soon as the text or the target is above U+00FF.
    var count = 0;
    foreach (var c in s)
    {
        if (c == b)
        {
            count++;
        }
    }

    return count;
}
/// <summary>
///     Computes the share of a string's characters that equal a given character.
///     Despite the name, the result is a fraction in the range 0..1, not a value in 0..100.
/// </summary>
/// <param name="s">The text to scan.</param>
/// <param name="b">The character whose share is wanted.</param>
/// <returns>
///     Occurrences of <paramref name="b"/> divided by the length of <paramref name="s"/>;
///     0 for an empty string.
/// </returns>
public static float ChiSquaredAsPercent(this string s, char b)
{
    // An empty string would otherwise evaluate 0 / 0 and return NaN.
    if (s.Length == 0)
    {
        return 0f;
    }

    // Count matches directly. A 256-slot table indexed by char throws
    // IndexOutOfRangeException as soon as the text or the target is above U+00FF.
    var count = 0;
    foreach (var c in s)
    {
        if (c == b)
        {
            count++;
        }
    }

    return count / (float) s.Length;
}
/// <summary>
///     Compute the letter frequencies within the English language.
///     Use a large English language text block for accurate testing.
///     Each of the letters 'a'..'z' is reported as its share of all letters in the text
///     (letters outside 'a'..'z' count toward the total but are not reported).
/// </summary>
/// <param name="s">String that contains the large English text</param>
/// <returns>
///     26 letter/fraction pairs sorted by ascending fraction. Every fraction is 0 when the
///     text contains no letters.
/// </returns>
public static KeyValuePair<char, float>[] ChiSquaredTextAsPercent(this string s)
{
    const string alphabet = "abcdefghijklmnopqrstuvwxyz";

    // Invariant lowercasing keeps 'I' -> 'i' under every culture (Turkish maps it to a
    // dotless i), so the a..z tallies do not depend on the machine's locale.
    var counts = new int[alphabet.Length];
    var totalLetters = 0;
    foreach (var c in s.ToLowerInvariant())
    {
        if (!c.IsLetter())
        {
            continue;
        }

        totalLetters++;

        // Only a..z are tallied individually. Indexing a 256-slot table by char would
        // throw for any letter above U+00FF.
        if (c >= 'a' && c <= 'z')
        {
            counts[c - 'a']++;
        }
    }

    // OrderBy is a stable sort, so letters with equal shares stay in alphabetical order.
    return alphabet
        .Select((letter, index) => new KeyValuePair<char, float>(
            letter,
            totalLetters == 0 ? 0f : counts[index] / (float) totalLetters))
        .OrderBy(pair => pair.Value)
        .ToArray();
}
/// <summary>
///     Scores how far the letter distribution of a text is from the expected English
///     letter frequencies. For each letter 'a'..'z' that occurs at least once, the term
///     (observed - expected)^2 / expected is accumulated; the sum is divided by the number
///     of letters in the text and multiplied by 100.
/// </summary>
/// <param name="s">The text to score.</param>
/// <returns>
///     The distance score; lower means closer to English text. The result is NaN when the
///     text contains no letters.
/// </returns>
public static float ChiSquaredTextTest(this string s)
{
    // Invariant lowercasing keeps 'I' -> 'i' under every culture (Turkish maps it to a
    // dotless i), so the score does not depend on the machine's locale.
    var letterCount = 0;
    var observed = new int[_expectedPercentages.Length];
    foreach (var c in s.ToLowerInvariant())
    {
        if (!c.IsLetter())
        {
            continue;
        }

        // Every letter counts toward the total; only a..z have an expected frequency.
        letterCount++;
        if (c >= 'a' && c <= 'z')
        {
            observed[c - 'a']++;
        }
    }

    var total = 0.0;
    for (var i = 0; i < observed.Length; i++)
    {
        // Letters that never occur contribute nothing to the score.
        if (observed[i] == 0)
        {
            continue;
        }

        var expected = _expectedPercentages[i] * letterCount;
        total += (float) Math.Pow(observed[i] - expected, 2) / expected;
    }

    // With zero letters this is 0 / 0 = NaN, which compares false against any threshold.
    return (float) total / letterCount * 100f;
}
/// <summary>
///     Reads a file as text and reports whether its letter distribution scores below 10 on
///     <c>ChiSquaredTextTest</c>.
///     The value of 10 as a combined chi-squared total distance percentage threshold is
///     subjective. Determined from about 40 test runs. Most non-text files have readings
///     in the 100's.
/// </summary>
/// <param name="path">Path to the file to test</param>
/// <returns>True when the file's score is below the threshold.</returns>
public static bool IsTextFile(this string path)
{
    var score = File.ReadAllText(path).ChiSquaredTextTest();
    return score < 10;
}
/// <summary>
///     Computes a chi-squared distance between the intensity histograms of two images.
///     Each pixel is reduced to the integer average of its R, G and B components, and the
///     distance is the sum over all 256 levels of (h1 - h2)^2 / h2, skipping levels that
///     are empty in the second image.
/// </summary>
/// <param name="p1">Path of the first image.</param>
/// <param name="p2">Path of the second image; its histogram is the denominator.</param>
/// <returns>The accumulated distance; 0 when both histograms are identical.</returns>
public static float ImageChiSqrdDp(string p1, string p2)
{
    var hist1 = BuildIntensityHistogram(p1);
    var hist2 = BuildIntensityHistogram(p2);

    var chiSqrd = 0.0f;
    for (var i = 0; i < hist2.Length; i++)
    {
        // Levels absent from the second image are skipped to avoid dividing by zero.
        if (hist2[i] != 0)
        {
            chiSqrd += (float) Math.Pow(hist1[i] - hist2[i], 2) / hist2[i];
        }
    }

    return chiSqrd;
}

/// <summary>
///     Loads an image and counts its pixels per intensity level, where a pixel's level is
///     the integer average of its R, G and B components.
/// </summary>
/// <param name="path">Path of the image to load.</param>
/// <returns>A 256-element array of pixel counts indexed by intensity level.</returns>
private static int[] BuildIntensityHistogram(string path)
{
    var histogram = new int[256];
    using (var image = new Bitmap(path))
    {
        for (var y = 0; y < image.Height; y++)
        {
            for (var x = 0; x < image.Width; x++)
            {
                var pixel = image.GetPixel(x, y);
                histogram[(pixel.R + pixel.G + pixel.B) / 3]++;
            }
        }
    }

    return histogram;
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment