lawrencejones · August 29, 2015 14:00
diff --git a/seq.coffee b/seq.coffee
 # Making comparisons between two DNA sequences can become a hard computational
 # problem, due to the inexact definition of organic similarity.
 #
 # When making comparisons between two sequences, a useful definition of
 # similarity is the largest subsequence present within both sequences, where a
 # subsequence is defined as such...
 #
 #         Given X = { X1, X2, ..., Xm }
 #         and   Z = { Z1, Z2, ..., Xk }...
 #
 # ...Z is a subsequence of X iff there exists a strictly increasing sequence...
 #
 #               S : < i1, i2, ..., ik >
 #
 # ...such that for all j ∈ [1..k], we have X[ij] == Z[j]. In other words, the
 # elements within sequence S can be found, in order, by iterating through both
 # X and Z.

 largest = (results) ->
  results.reduce (a,c) ->
    if a.length < c.length then c else a

 largestSubsequence = (X, Z) ->
  [hits, taps] = [0,0]
  cache = [0..X.length].map -> new Array Z.length
  sub = (i, k) ->

    ++taps
    forks = []

    # If cache is hit, return memozized
    if (hit = cache[i][k])? then ++hits; return hit

    # If base case, then return empty array
    if (i >= X.length) or (k >= Z.length) then return forks

    # If matched on this character, then add matching outcome to forks
    if X[i] == Z[k]
      tail = cache[i+1][k+1] = sub (i+1), (k+1)
      forks.push([X[i]].concat tail)

    # Recurse on a shuffle left and right
    forks.push\
    ( cache[i][k+1] = sub(i, k+1)
    , cache[i+1][k] = sub(i+1, k) )

    # Return the fork with the largest length
    return largest forks

  # Return algorithm run summary
  sequence: sub 0, 0
  hits: hits, taps: taps


 # Base sequence from which to build larger test cases
 S1 = 'GCAGCGATCGACAGCTGTGCTATCCCGGCGAGCCCGTGGCAGAGGACCTCGCTTGCGAAAGCAT'

 # Use given argument as length of sequences
 [size, split] = process.argv[2..].map (a) -> parseInt a, 10
 rS = [] # random sequence
 for i in [1..(size ?= 50)]
  rS.push S1[Math.floor(Math.random()*S1.length)]

 # Generate two sequences, split in middle or optionally at arg index
 [Z,X] = [rS.splice(split || size/2), rS]
  
 console.log """\t
 Running algorithm with a sequence X of size #{X.length}, and Z
 of size #{Z.length}...\n"""

 # Run algorithm
 res = largestSubsequence X, Z

 console.log """
 Algorithm run in #{res.taps} taps, with #{res.hits} cache hits.
 Cache hit percentage is therefore #{(100*res.hits/res.taps).toFixed 2}%
 Largest sequence found is of size #{res.sequence.length}!\n"""
	# Making comparisons between two DNA sequences can become a hard computational
	# problem, due to the inexact definition of organic similarity.
	#
	# When making comparisons between two sequences, a useful definition of
	# similarity is the largest subsequence present within both sequences, where a
	# subsequence is defined as such...
	#
	# Given X = { X1, X2, ..., Xm }
	# and Z = { Z1, Z2, ..., Xk }...
	#
	# ...Z is a subsequence of X iff there exists a strictly increasing sequence...
	#
	# S : < i1, i2, ..., ik >
	#
	# ...such that for all j ∈ [1..k], we have X[ij] == Z[j]. In other words, the
	# elements within sequence S can be found, in order, by iterating through both
	# X and Z.

	largest = (results) ->
	results.reduce (a,c) ->
	if a.length < c.length then c else a

	largestSubsequence = (X, Z) ->
	[hits, taps] = [0,0]
	cache = [0..X.length].map -> new Array Z.length
	sub = (i, k) ->

	++taps
	forks = []

	# If cache is hit, return memozized
	if (hit = cache[i][k])? then ++hits; return hit

	# If base case, then return empty array
	if (i >= X.length) or (k >= Z.length) then return forks

	# If matched on this character, then add matching outcome to forks
	if X[i] == Z[k]
	tail = cache[i+1][k+1] = sub (i+1), (k+1)
	forks.push([X[i]].concat tail)

	# Recurse on a shuffle left and right
	forks.push\
	( cache[i][k+1] = sub(i, k+1)
	, cache[i+1][k] = sub(i+1, k) )

	# Return the fork with the largest length
	return largest forks

	# Return algorithm run summary
	sequence: sub 0, 0
	hits: hits, taps: taps


	# Base sequence from which to build larger test cases
	S1 = 'GCAGCGATCGACAGCTGTGCTATCCCGGCGAGCCCGTGGCAGAGGACCTCGCTTGCGAAAGCAT'

	# Use given argument as length of sequences
	[size, split] = process.argv[2..].map (a) -> parseInt a, 10
	rS = [] # random sequence
	for i in [1..(size ?= 50)]
	rS.push S1[Math.floor(Math.random()*S1.length)]

	# Generate two sequences, split in middle or optionally at arg index
	[Z,X] = [rS.splice(split \|\| size/2), rS]

	console.log """\t
	Running algorithm with a sequence X of size #{X.length}, and Z
	of size #{Z.length}...\n"""

	# Run algorithm
	res = largestSubsequence X, Z

	console.log """
	Algorithm run in #{res.taps} taps, with #{res.hits} cache hits.
	Cache hit percentage is therefore #{(100*res.hits/res.taps).toFixed 2}%
	Largest sequence found is of size #{res.sequence.length}!\n"""