Created
March 22, 2017 13:58
-
-
Save ship561/b64173c6d2c595e2a5476e99531e769d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns sandbox.cipher | |
(:require [clojure.string :as str])) | |
(def alphabet "abcdefghijklmnopqrstuvwxyz") | |
(defn rotate [n coll] | |
(concat (drop n coll) (take n coll))) | |
(defn caeser-cipher [offset] | |
(-> (zipmap alphabet (rotate offset alphabet)) | |
(assoc \space \space))) | |
(defn caeser-encode [offset s] | |
(let [encode (caeser-cipher offset)] | |
(apply str (map encode s)))) | |
(defn caeser-decode [offset s] | |
(let [decode (zipmap (vals (caeser-cipher offset)) | |
(keys (caeser-cipher offset)))] | |
(apply str (map decode s)))) | |
(->> (caeser-encode 1 "hello there") | |
(caeser-decode 1)) | |
;;; "hello there" | |
;;;-----vigenere cipher------------------------------------------------------------ | |
(defn letter->number | |
"Hash-map to convert letters to number" | |
[alpha] | |
(into {\space \space} (map vector alpha (range)))) | |
(defn number->letter | |
"Hash-map to convert numbers into letters" | |
[alpha] | |
(into {\space \space} (map vector (range) alpha))) | |
;;; build a function to use the vigenere cipher to encode a message with a keyword | |
;;; we do it in a single step, but it can be seen as letters -> | |
;;; numbers -> letter. The mod 26 is there to cycle the alphabet | |
;;; around again, whether you are encoding or decoding. | |
(defn vigenere-encode [key-word plain-text] | |
(let [original (letter->number alphabet) | |
encode (number->letter alphabet) | |
repeat-keyword (cycle (seq key-word))] | |
(apply str | |
(map (fn [k c] | |
(encode (mod (+ (original k) (original c)) 26))); encode by summing letters | |
repeat-keyword ; cycle keyword letters | |
plain-text)))) | |
(defn- vigenere-generic | |
"Generic function that can encode or decode the vignere | |
cipher. Encode (+), decode (-). The encoding works by 2 letters | |
together to create a new letter. This is carried by converting the | |
letters to numbers, summing, and then converting the number back | |
into a letter. Decoding is the reverse operation whereby the key is | |
subtracted from the cryto letter to get the original back." | |
[fun key-word s] | |
(let [to-number (letter->number alphabet) | |
to-letter (number->letter alphabet)] | |
(apply str | |
(map (fn [letter k] | |
(to-letter (mod (fun (to-number letter) (to-number k)) 26))) | |
s | |
(cycle (seq key-word)))))) | |
;;; we wrote a generic function, now we don't have to write so much | |
;;; repeated code. It will be obvious that the only difference between | |
;;; encoding and decoding is the + or - of the keyword from | |
;;; plaintext (encode) or cryptotext (decode). | |
(defn vigenere-decode [key-word s] (vigenere-generic - key-word s)) | |
(defn vigenere-encode [key-word s] (vigenere-generic + key-word s)) | |
(->> (vigenere-encode "key" "function") | |
(vigenere-decode "key")) | |
;;; "function" | |
(def encoded-message | |
(str/replace | |
(str "coliwicojqkwbjplvwffjvapvjuikicomitxrxnemtbefickmpeitf | |
oohesnojwxcgrddvkhitesnnafxjhhfhxxvxbmxvgpvjuikgcemifg | |
sbapagoqympzgvmhggzzclgksdjqgthbytkgubbshlcnnspxufwxnr | |
fbytkgusjtrbbhjxorqijqdxfexstmwtbsoxjjbmvhfjvyvmssnhvt | |
dqrrithnhgjtacnvfhcsxrnrhirwcgroxxjbbhvstxoimmumwolxnr | |
wsnqgfpfamvpotrrvascuicdrflioussjrfxodqwgiosjxgwmjwkgf | |
pfaathihqxkmghqsumiqxrvasgusqksbpitemjfmuaseclgfcsasyo | |
ojwpabvbmwqnuicxqucsasyyfpvqaucptwunfdneuxcgbstkcxbstk | |
cxostmvfusumzfwstxtpaxjxfbaicgrsjhktbuvekwsoflqfhinepz | |
smbrcfsmnrqksojqgestblgksgxvgossvstx") | |
#"(\s|\n)" "")) | |
(defn likely-repeats-length-n [n s] | |
(->> (partition-all n 1 s) ; all k-mers of length n | |
(map #(apply str %)) ; make them strings | |
frequencies | |
(sort-by second >) ; k-mer with most repeats | |
first)) | |
(defn find-repeats [s] | |
(filter #(> (second %) 1) (map #(likely-repeats-length-n % s) (range 3 (/ (count s) 2))))) | |
(defn factors-of [n] (filter #(zero? (rem n %)) (range 1 (inc (/ n 2))))) | |
(defn find-occurances [match] | |
(->> encoded-message | |
(partition-all (count match) 1) | |
(map-indexed (fn [i x] [i (= (apply str x) match)])) | |
(filter #(-> % second true? )) | |
(map first))) | |
(defn delta-occurances [occurances] | |
(loop [V occurances | |
delta []] | |
(if (>= (count V) 2) | |
(let [[a b] (take 2 V)] | |
(recur (rest V) | |
(conj delta (- b a)))) | |
delta))) | |
(defn partition-all-str | |
([n s] (partition-all-str n n s)) | |
([n step s] | |
(mapv (fn [i] | |
(if (< (+ i n) (count s)) | |
(subs s i (+ i n)) | |
(subs s i))) | |
(range 0 (count s) step)))) | |
(defn transpose [coll] (apply map vector coll)) | |
(defn string->column [n s] | |
(map (fn [i] (map #(.charAt s %) | |
(range i (count s) n))) | |
(range n))) | |
(defn string->column2 [n s] | |
(map #(->> (subs s %) ; starting frame | |
(take-nth n)) ; every n letters | |
(range n))) | |
(defn sum [coll] (reduce + coll)) | |
(defn mean [coll] (/ (sum coll) (count coll))) | |
(defn sqr [x] (* x x)) | |
(def letter-freq | |
{\a 0.0812, \b 0.0149, \c 0.0271, \d 0.0432, | |
\e 0.1202, \f 0.023, \g 0.0203, \h 0.0592, | |
\i 0.0731, \j 0.001, \k 0.0069, \l 0.0398, | |
\m 0.026099999999999998, \n 0.0695, \o 0.0768, | |
\p 0.0182, \q 0.0011, \r 0.0602, | |
\s 0.06280000000000001, \t 0.091, \u 0.0288, | |
\v 0.0111, \w 0.0209, \x 0.0017000000000000001, | |
\y 0.021099999999999997, \z 7.000000000000001E-4}) | |
(defn index-of-coincidence [c char-vector] | |
(let [obs-char-freqs (frequencies char-vector)] | |
(/ (sum | |
(map (fn [a] | |
(if-let [ni (obs-char-freqs a)] | |
(* ni (- ni 1)) | |
0)) | |
alphabet)) | |
(count char-vector) | |
(- (count char-vector) 1) | |
(/ c)))) | |
(def IC (partial index-of-coincidence 26)) | |
(defn chi-sq [char-vector] | |
(let [obs-char-freqs (frequencies char-vector) | |
n (count char-vector)] | |
(sum | |
(map (fn [a] | |
(let [obs (get obs-char-freqs a 0) ; observed number letter(i) | |
exp (* (letter-freq a) n)] ; explected number letter(i) | |
(/ (sqr (- obs exp)) | |
exp))) | |
alphabet)))) | |
;;; We know that the key is length 3-8. We narrow down the potential | |
;;; key size by calculating the distances between repeated words and | |
;;; then identifying common factors of these distances. The | |
;;; intersection of these factors are likely key sizes | |
(->> (find-repeats encoded-message) (map first) ; identify repeat words | |
(mapcat #(delta-occurances (find-occurances %))) ; distance between these repeats | |
(map (fn [x] (filter #(<= 3 % 8) (factors-of x))))) ; factors b/w 3 and 8 | |
;;; at this point we have narrowed down the key-size to 3,4,6. But we | |
;;; still have to figure out the key-size and the message. We could | |
;;; try to brute force it by trying out all letter combinations for 3, | |
;;; 4, and 6 length words but it is still impossible to read all of | |
;;; the messages. | |
;;; Instead of reading every message, we can decode the key by looking | |
;;; for decoded messages that resemble an english passage. By looking | |
;;; for letter frequency, we can automatically determine if the | |
;;; passage looks like it is written in english. We narrow down the | |
;;; key size by using the index-of-coincidence to calculate common | |
;;; di-letter frequencies. With the calculated key size, we use the | |
;;; the chi-square test to identify and retain any decoded messages | |
;;; that contain similar letter frequency to written english. | |
(defn crack-code [instr] | |
(let [key-size (->> (for [i (range 3 9)] | |
[i (->> (string->column2 i instr) ; make message into columns of length i | |
(map IC) ; mean IC over letter columns | |
mean)]) | |
(sort-by second >) | |
ffirst)] ; keep the key size with highest IC | |
(prn :key-size key-size) | |
(->> (string->column key-size instr) | |
(map (fn [column] ; decode each column | |
(let [column (apply str column)] | |
(->> (for [i (range 26)] ; try every letter | |
[i (chi-sq (caeser-decode i column))]) ; calculate similarity to english | |
(sort-by second) | |
ffirst)))) ; keep the most similar hit | |
(map (number->letter alphabet))))) ; calculate the key | |
(crack-code (str/lower-case encoded-message)) | |
;;; instead of the ->> for loop stuff, use reduce! This doesn't work | |
;;; right now b/c like a fool, i forgot what column is | |
(reduce (fn [[best-offset cur-chi :as current] i] | |
(let [col (apply str column) | |
new-chi (chi-sq (caeser-decode i col))] ; calculate similarity to english | |
(if (< cur-chi new-chi) ; keep the most similar hit (least significant) | |
current | |
[i new-chi]))) | |
[-1 Integer/MAX_VALUE] | |
(vec (range 26))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment