ship561 · March 22, 2017 13:58
diff --git a/cipher.clj b/cipher.clj
 (ns sandbox.cipher
  (:require [clojure.string :as str]))

 (def alphabet "abcdefghijklmnopqrstuvwxyz")

 (defn rotate [n coll]
  (concat (drop n coll) (take n coll)))

 (defn caeser-cipher [offset]
  (-> (zipmap alphabet (rotate offset alphabet))
      (assoc \space \space)))

 (defn caeser-encode [offset s]
  (let [encode (caeser-cipher offset)]
    (apply str (map encode s))))

 (defn caeser-decode [offset s]
  (let [decode (zipmap (vals (caeser-cipher offset))
                       (keys (caeser-cipher offset)))]
    (apply str (map decode s))))

 (->> (caeser-encode 1 "hello there")
     (caeser-decode 1))
 ;;; "hello there"
 ;;;-----vigenere cipher------------------------------------------------------------


 (defn letter->number
  "Hash-map to convert letters to number"

  [alpha]
  (into {\space \space} (map vector alpha (range))))

 (defn number->letter
  "Hash-map to convert numbers into letters"
  
  [alpha]
  (into {\space \space} (map vector (range) alpha)))

 ;;; build a function to use the vigenere cipher to encode a message with a keyword
 ;;; we do it in a single step, but it can be seen as letters ->
 ;;; numbers -> letter. The mod 26 is there to cycle the alphabet
 ;;; around again, whether you are encoding or decoding.
 (defn vigenere-encode [key-word plain-text]
  (let [original (letter->number alphabet)
        encode (number->letter alphabet)
        repeat-keyword (cycle (seq key-word))]
    (apply str
           (map (fn [k c]
                  (encode (mod (+ (original k) (original c)) 26))); encode by summing letters
                repeat-keyword                                    ; cycle keyword letters
                plain-text))))

 (defn- vigenere-generic
  "Generic function that can encode or decode the vignere
  cipher. Encode (+), decode (-). The encoding works by 2 letters
  together to create a new letter. This is carried by converting the
  letters to numbers, summing, and then converting the number back
  into a letter. Decoding is the reverse operation whereby the key is
  subtracted from the cryto letter to get the original back."
  [fun key-word s]
  (let [to-number (letter->number alphabet)
        to-letter (number->letter alphabet)]
    (apply str
           (map (fn [letter k]
                  (to-letter (mod (fun (to-number letter) (to-number k)) 26)))
                s
                (cycle (seq key-word))))))

 ;;; we wrote a generic function, now we don't have to write so much
 ;;; repeated code. It will be obvious that the only difference between
 ;;; encoding and decoding is the + or - of the keyword from
 ;;; plaintext (encode) or cryptotext (decode).
 (defn vigenere-decode [key-word s] (vigenere-generic - key-word s))
 (defn vigenere-encode [key-word s] (vigenere-generic + key-word s))

 (->> (vigenere-encode "key" "function")
     (vigenere-decode "key"))
 ;;; "function"

 (def encoded-message
  (str/replace
   (str "coliwicojqkwbjplvwffjvapvjuikicomitxrxnemtbefickmpeitf
      oohesnojwxcgrddvkhitesnnafxjhhfhxxvxbmxvgpvjuikgcemifg
      sbapagoqympzgvmhggzzclgksdjqgthbytkgubbshlcnnspxufwxnr
      fbytkgusjtrbbhjxorqijqdxfexstmwtbsoxjjbmvhfjvyvmssnhvt
      dqrrithnhgjtacnvfhcsxrnrhirwcgroxxjbbhvstxoimmumwolxnr
      wsnqgfpfamvpotrrvascuicdrflioussjrfxodqwgiosjxgwmjwkgf
      pfaathihqxkmghqsumiqxrvasgusqksbpitemjfmuaseclgfcsasyo
      ojwpabvbmwqnuicxqucsasyyfpvqaucptwunfdneuxcgbstkcxbstk
      cxostmvfusumzfwstxtpaxjxfbaicgrsjhktbuvekwsoflqfhinepz
      smbrcfsmnrqksojqgestblgksgxvgossvstx")
   #"(\s|\n)" ""))

 (defn likely-repeats-length-n [n s]
  (->> (partition-all n 1 s) ; all k-mers of length n
       (map #(apply str %))  ; make them strings
       frequencies           
       (sort-by second >)    ; k-mer with most repeats 
       first))

 (defn find-repeats [s]
  (filter #(> (second %) 1) (map #(likely-repeats-length-n % s) (range 3 (/ (count s) 2)))))

 (defn factors-of [n] (filter #(zero? (rem n %)) (range 1 (inc (/ n 2)))))

 (defn find-occurances [match]
  (->> encoded-message
       (partition-all (count match) 1)
       (map-indexed (fn [i x] [i (= (apply str x) match)]))
       (filter #(-> % second true? ))
       (map first)))

 (defn delta-occurances [occurances]
  (loop [V occurances
         delta []]
    (if (>= (count V) 2)
      (let [[a b] (take 2 V)]
        (recur (rest V)
               (conj delta (- b a))))
      delta)))

 (defn partition-all-str
  ([n s] (partition-all-str n n s))
  ([n step s]
   (mapv (fn [i]
           (if (< (+ i n) (count s))
             (subs s i (+ i n))
             (subs s i)))
         (range 0 (count s) step))))

 (defn transpose [coll] (apply map vector coll))


 (defn string->column [n s]
  (map (fn [i] (map #(.charAt s %)
                   (range i (count s) n)))
       (range n)))

 (defn string->column2 [n s]
  (map #(->> (subs s %)     ; starting frame
              (take-nth n))  ; every n letters
         (range n)))

 (defn sum [coll] (reduce + coll))
 (defn mean [coll] (/ (sum coll) (count coll)))
 (defn sqr [x] (* x x))

 (def letter-freq
  {\a 0.0812, \b 0.0149, \c 0.0271, \d 0.0432,
   \e 0.1202, \f 0.023, \g 0.0203, \h 0.0592,
   \i 0.0731, \j 0.001, \k 0.0069, \l 0.0398,
   \m 0.026099999999999998, \n 0.0695, \o 0.0768,
   \p 0.0182, \q 0.0011, \r 0.0602,
   \s 0.06280000000000001, \t 0.091, \u 0.0288,
   \v 0.0111, \w 0.0209, \x 0.0017000000000000001,
   \y 0.021099999999999997, \z 7.000000000000001E-4})


 (defn index-of-coincidence [c char-vector]
  (let [obs-char-freqs (frequencies char-vector)]
    (/ (sum
        (map (fn [a]
               (if-let [ni (obs-char-freqs a)]
                 (* ni (- ni 1))
                 0))
             alphabet))
       (count char-vector)
       (- (count char-vector) 1)
       (/ c))))

 (def IC (partial index-of-coincidence 26))

 (defn chi-sq [char-vector]
  (let [obs-char-freqs (frequencies char-vector)
        n (count char-vector)]
    (sum
     (map (fn [a]
            (let [obs (get obs-char-freqs a 0) ; observed number letter(i)
                  exp (* (letter-freq a) n)]   ; explected number letter(i)
              (/ (sqr (- obs exp))
                 exp)))
          alphabet))))



 ;;; We know that the key is length 3-8. We narrow down the potential
 ;;; key size by calculating the distances between repeated words and
 ;;; then identifying common factors of these distances. The
 ;;; intersection of these factors are likely key sizes
 (->> (find-repeats encoded-message) (map first)         ; identify repeat words
     (mapcat #(delta-occurances (find-occurances %)))   ; distance between these repeats
     (map (fn [x] (filter #(<= 3 % 8) (factors-of x))))) ; factors b/w 3 and 8


 ;;; at this point we have narrowed down the key-size to 3,4,6. But we
 ;;; still have to figure out the key-size and the message. We could
 ;;; try to brute force it by trying out all letter combinations for 3,
 ;;; 4, and 6 length words but it is still impossible to read all of
 ;;; the messages.

 ;;; Instead of reading every message, we can decode the key by looking
 ;;; for decoded messages that resemble an english passage. By looking
 ;;; for letter frequency, we can automatically determine if the
 ;;; passage looks like it is written in english. We narrow down the
 ;;; key size by using the index-of-coincidence to calculate common
 ;;; di-letter frequencies. With the calculated key size, we use the
 ;;; the chi-square test to identify and retain any decoded messages
 ;;; that contain similar letter frequency to written english.


 (defn crack-code [instr]
  (let [key-size (->> (for [i (range 3 9)]
                        [i (->> (string->column2 i instr)       ; make message into columns of length i
                                (map IC)                        ; mean IC over letter columns
                                mean)])
                      (sort-by second >)
                      ffirst)]                                  ; keep the key size with highest IC
    (prn :key-size key-size)
    (->> (string->column key-size instr)                        
         (map (fn [column]                                       ; decode each column
                (let [column (apply str column)]
                  (->> (for [i (range 26)]                      ; try every letter
                         [i (chi-sq (caeser-decode i column))]) ; calculate similarity to english
                       (sort-by second)
                       ffirst))))                               ; keep the most similar hit
         (map (number->letter alphabet)))))                      ; calculate the key 

 (crack-code (str/lower-case encoded-message))

 ;;; instead of the ->> for loop stuff, use reduce! This doesn't work
 ;;; right now b/c like a fool, i forgot what column is
 (reduce (fn [[best-offset cur-chi :as current] i]
          (let [col (apply str column)
                new-chi (chi-sq (caeser-decode i col))] ; calculate similarity to english
            (if (< cur-chi new-chi)                     ; keep the most similar hit (least significant)
              current
              [i new-chi])))
        [-1 Integer/MAX_VALUE]
        (vec (range 26)))
	(ns sandbox.cipher
	(:require [clojure.string :as str]))

	(def alphabet "abcdefghijklmnopqrstuvwxyz")

	(defn rotate [n coll]
	(concat (drop n coll) (take n coll)))

	(defn caeser-cipher [offset]
	(-> (zipmap alphabet (rotate offset alphabet))
	(assoc \space \space)))

	(defn caeser-encode [offset s]
	(let [encode (caeser-cipher offset)]
	(apply str (map encode s))))

	(defn caeser-decode [offset s]
	(let [decode (zipmap (vals (caeser-cipher offset))
	(keys (caeser-cipher offset)))]
	(apply str (map decode s))))

	(->> (caeser-encode 1 "hello there")
	(caeser-decode 1))
	;;; "hello there"
	;;;-----vigenere cipher------------------------------------------------------------


	(defn letter->number
	"Hash-map to convert letters to number"

	[alpha]
	(into {\space \space} (map vector alpha (range))))

	(defn number->letter
	"Hash-map to convert numbers into letters"

	[alpha]
	(into {\space \space} (map vector (range) alpha)))

	;;; build a function to use the vigenere cipher to encode a message with a keyword
	;;; we do it in a single step, but it can be seen as letters ->
	;;; numbers -> letter. The mod 26 is there to cycle the alphabet
	;;; around again, whether you are encoding or decoding.
	(defn vigenere-encode [key-word plain-text]
	(let [original (letter->number alphabet)
	encode (number->letter alphabet)
	repeat-keyword (cycle (seq key-word))]
	(apply str
	(map (fn [k c]
	(encode (mod (+ (original k) (original c)) 26))); encode by summing letters
	repeat-keyword ; cycle keyword letters
	plain-text))))

	(defn- vigenere-generic
	"Generic function that can encode or decode the vignere
	cipher. Encode (+), decode (-). The encoding works by 2 letters
	together to create a new letter. This is carried by converting the
	letters to numbers, summing, and then converting the number back
	into a letter. Decoding is the reverse operation whereby the key is
	subtracted from the cryto letter to get the original back."
	[fun key-word s]
	(let [to-number (letter->number alphabet)
	to-letter (number->letter alphabet)]
	(apply str
	(map (fn [letter k]
	(to-letter (mod (fun (to-number letter) (to-number k)) 26)))
	s
	(cycle (seq key-word))))))

	;;; we wrote a generic function, now we don't have to write so much
	;;; repeated code. It will be obvious that the only difference between
	;;; encoding and decoding is the + or - of the keyword from
	;;; plaintext (encode) or cryptotext (decode).
	(defn vigenere-decode [key-word s] (vigenere-generic - key-word s))
	(defn vigenere-encode [key-word s] (vigenere-generic + key-word s))

	(->> (vigenere-encode "key" "function")
	(vigenere-decode "key"))
	;;; "function"

	(def encoded-message
	(str/replace
	(str "coliwicojqkwbjplvwffjvapvjuikicomitxrxnemtbefickmpeitf
	oohesnojwxcgrddvkhitesnnafxjhhfhxxvxbmxvgpvjuikgcemifg
	sbapagoqympzgvmhggzzclgksdjqgthbytkgubbshlcnnspxufwxnr
	fbytkgusjtrbbhjxorqijqdxfexstmwtbsoxjjbmvhfjvyvmssnhvt
	dqrrithnhgjtacnvfhcsxrnrhirwcgroxxjbbhvstxoimmumwolxnr
	wsnqgfpfamvpotrrvascuicdrflioussjrfxodqwgiosjxgwmjwkgf
	pfaathihqxkmghqsumiqxrvasgusqksbpitemjfmuaseclgfcsasyo
	ojwpabvbmwqnuicxqucsasyyfpvqaucptwunfdneuxcgbstkcxbstk
	cxostmvfusumzfwstxtpaxjxfbaicgrsjhktbuvekwsoflqfhinepz
	smbrcfsmnrqksojqgestblgksgxvgossvstx")
	#"(\s\|\n)" ""))

	(defn likely-repeats-length-n [n s]
	(->> (partition-all n 1 s) ; all k-mers of length n
	(map #(apply str %)) ; make them strings
	frequencies
	(sort-by second >) ; k-mer with most repeats
	first))

	(defn find-repeats [s]
	(filter #(> (second %) 1) (map #(likely-repeats-length-n % s) (range 3 (/ (count s) 2)))))

	(defn factors-of [n] (filter #(zero? (rem n %)) (range 1 (inc (/ n 2)))))

	(defn find-occurances [match]
	(->> encoded-message
	(partition-all (count match) 1)
	(map-indexed (fn [i x] [i (= (apply str x) match)]))
	(filter #(-> % second true? ))
	(map first)))

	(defn delta-occurances [occurances]
	(loop [V occurances
	delta []]
	(if (>= (count V) 2)
	(let [[a b] (take 2 V)]
	(recur (rest V)
	(conj delta (- b a))))
	delta)))

	(defn partition-all-str
	([n s] (partition-all-str n n s))
	([n step s]
	(mapv (fn [i]
	(if (< (+ i n) (count s))
	(subs s i (+ i n))
	(subs s i)))
	(range 0 (count s) step))))

	(defn transpose [coll] (apply map vector coll))


	(defn string->column [n s]
	(map (fn [i] (map #(.charAt s %)
	(range i (count s) n)))
	(range n)))

	(defn string->column2 [n s]
	(map #(->> (subs s %) ; starting frame
	(take-nth n)) ; every n letters
	(range n)))

	(defn sum [coll] (reduce + coll))
	(defn mean [coll] (/ (sum coll) (count coll)))
	(defn sqr [x] (* x x))

	(def letter-freq
	{\a 0.0812, \b 0.0149, \c 0.0271, \d 0.0432,
	\e 0.1202, \f 0.023, \g 0.0203, \h 0.0592,
	\i 0.0731, \j 0.001, \k 0.0069, \l 0.0398,
	\m 0.026099999999999998, \n 0.0695, \o 0.0768,
	\p 0.0182, \q 0.0011, \r 0.0602,
	\s 0.06280000000000001, \t 0.091, \u 0.0288,
	\v 0.0111, \w 0.0209, \x 0.0017000000000000001,
	\y 0.021099999999999997, \z 7.000000000000001E-4})


	(defn index-of-coincidence [c char-vector]
	(let [obs-char-freqs (frequencies char-vector)]
	(/ (sum
	(map (fn [a]
	(if-let [ni (obs-char-freqs a)]
	(* ni (- ni 1))
	0))
	alphabet))
	(count char-vector)
	(- (count char-vector) 1)
	(/ c))))

	(def IC (partial index-of-coincidence 26))

	(defn chi-sq [char-vector]
	(let [obs-char-freqs (frequencies char-vector)
	n (count char-vector)]
	(sum
	(map (fn [a]
	(let [obs (get obs-char-freqs a 0) ; observed number letter(i)
	exp (* (letter-freq a) n)] ; explected number letter(i)
	(/ (sqr (- obs exp))
	exp)))
	alphabet))))



	;;; We know that the key is length 3-8. We narrow down the potential
	;;; key size by calculating the distances between repeated words and
	;;; then identifying common factors of these distances. The
	;;; intersection of these factors are likely key sizes
	(->> (find-repeats encoded-message) (map first) ; identify repeat words
	(mapcat #(delta-occurances (find-occurances %))) ; distance between these repeats
	(map (fn [x] (filter #(<= 3 % 8) (factors-of x))))) ; factors b/w 3 and 8


	;;; at this point we have narrowed down the key-size to 3,4,6. But we
	;;; still have to figure out the key-size and the message. We could
	;;; try to brute force it by trying out all letter combinations for 3,
	;;; 4, and 6 length words but it is still impossible to read all of
	;;; the messages.

	;;; Instead of reading every message, we can decode the key by looking
	;;; for decoded messages that resemble an english passage. By looking
	;;; for letter frequency, we can automatically determine if the
	;;; passage looks like it is written in english. We narrow down the
	;;; key size by using the index-of-coincidence to calculate common
	;;; di-letter frequencies. With the calculated key size, we use the
	;;; the chi-square test to identify and retain any decoded messages
	;;; that contain similar letter frequency to written english.


	(defn crack-code [instr]
	(let [key-size (->> (for [i (range 3 9)]
	[i (->> (string->column2 i instr) ; make message into columns of length i
	(map IC) ; mean IC over letter columns
	mean)])
	(sort-by second >)
	ffirst)] ; keep the key size with highest IC
	(prn :key-size key-size)
	(->> (string->column key-size instr)
	(map (fn [column] ; decode each column
	(let [column (apply str column)]
	(->> (for [i (range 26)] ; try every letter
	[i (chi-sq (caeser-decode i column))]) ; calculate similarity to english
	(sort-by second)
	ffirst)))) ; keep the most similar hit
	(map (number->letter alphabet))))) ; calculate the key

	(crack-code (str/lower-case encoded-message))

	;;; instead of the ->> for loop stuff, use reduce! This doesn't work
	;;; right now b/c like a fool, i forgot what column is
	(reduce (fn [[best-offset cur-chi :as current] i]
	(let [col (apply str column)
	new-chi (chi-sq (caeser-decode i col))] ; calculate similarity to english
	(if (< cur-chi new-chi) ; keep the most similar hit (least significant)
	current
	[i new-chi])))
	[-1 Integer/MAX_VALUE]
	(vec (range 26)))