-
-
Save marceloqla/02740e5b709262b04c32df6a5bb4b90d to your computer and use it in GitHub Desktop.
Smith Waterman
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* smith_waterman.php | |
* | |
* The Smith–Waterman algorithm performs local sequence alignment to detect | |
* similarities in string. Smith-Waterman has been used for sequencing DNA, | |
* and for detecting plagiarism and collusion by comparing sequences of text. | |
* | |
* @example | |
* $str1 = 'Smith-Waterman and Systolic PE Array are well-known dynamic programming algorithms.'; | |
* $str2 = 'Smith-Waterman algorithm is a well-known algorithm for performing sequence alignment.'; | |
* $sw = new SmithWaterman($str1, $str2); | |
* $score = $sw->get_score(); // Percent score of similarity | |
* $html = $sw->get_html(); // String for viewing similarities (highlighted) | |
* echo number_format($score, 4) . ' ' . $html; | |
* | |
* | |
* Changes by marceloqla: | |
* | |
* Alignment Viz generated automatically in a html table. Check variables $aligned_seq1 and $aligned_seq2 | |
* if you want the sequences. | |
* | |
* Get Score function not included in this code - still working on a way to improve it | |
* | |
* Included identation in code for better clarity | |
* | |
* Code was adapted for biological sequences: Included a Blosum Matrix. | |
* If you want to use a different matrix contact me or modify the code yourself | |
* | |
* Traceback algorithm was modified for better clarity | |
* | |
* Code currently prints out the alignment score matrix (Lines 119-128) and the choices taken at each traceback step | |
* (Lines 144, 146, 152, 157) | |
* | |
*/ | |
private $str1; | |
private $str2; | |
private $html; | |
private $score; | |
private $blosum85; | |
public function __construct($str1, $str2) { | |
$this->str1 = $str1; | |
$this->str2 = $str2; | |
$this->blosum55 = Array( | |
"A"=> Array("A"=>5, "R"=>2, "N"=>2, "D"=>2, "C"=>1, "Q"=>1, "E"=>1, "G"=>0, "H"=>2, "I"=>2, "L"=>2, "K"=>1, "M"=>2, "F"=>3, "P"=>1, "S"=>1, "T"=>0, "W"=>3, "Y"=>3, "V"=>1, "B"=>2, "Z"=>1, "X"=>1, "*"=>6), | |
"R"=> Array("A"=>2, "R"=>6, "N"=>1, "D"=>2, "C"=>4, "Q"=>1, "E"=>1, "G"=>3, "H"=>0, "I"=>4, "L"=>3, "K"=>2, "M"=>2, "F"=>4, "P"=>2, "S"=>1, "T"=>2, "W"=>4, "Y"=>3, "V"=>3, "B"=>2, "Z"=>0, "X"=>2, "*"=>6), | |
"N"=> Array("A"=>2, "R"=>1, "N"=>7, "D"=>1, "C"=>4, "Q"=>0, "E"=>1, "G"=>1, "H"=>0, "I"=>4, "L"=>4, "K"=>0, "M"=>3, "F"=>4, "P"=>3, "S"=>0, "T"=>0, "W"=>5, "Y"=>3, "V"=>4, "B"=>4, "Z"=>1, "X"=>2, "*"=>6), | |
"D"=> Array("A"=>2, "R"=>2, "N"=>1, "D"=>7, "C"=>5, "Q"=>1, "E"=>1, "G"=>2, "H"=>2, "I"=>5, "L"=>5, "K"=>1, "M"=>4, "F"=>4, "P"=>2, "S"=>1, "T"=>2, "W"=>6, "Y"=>4, "V"=>4, "B"=>4, "Z"=>1, "X"=>2, "*"=>6), | |
"C"=> Array("A"=>1, "R"=>4, "N"=>4, "D"=>5, "C"=>9, "Q"=>4, "E"=>5, "G"=>4, "H"=>5, "I"=>2, "L"=>2, "K"=>4, "M"=>2, "F"=>3, "P"=>4, "S"=>2, "T"=>2, "W"=>4, "Y"=>3, "V"=>1, "B"=>4, "Z"=>5, "X"=>3, "*"=>6), | |
"Q"=> Array("A"=>1, "R"=>1, "N"=>0, "D"=>1, "C"=>4, "Q"=>6, "E"=>2, "G"=>3, "H"=>1, "I"=>4, "L"=>3, "K"=>1, "M"=>0, "F"=>4, "P"=>2, "S"=>1, "T"=>1, "W"=>3, "Y"=>2, "V"=>3, "B"=>1, "Z"=>4, "X"=>1, "*"=>6), | |
"E"=> Array("A"=>1, "R"=>1, "N"=>1, "D"=>1, "C"=>5, "Q"=>2, "E"=>6, "G"=>3, "H"=>1, "I"=>4, "L"=>4, "K"=>0, "M"=>3, "F"=>4, "P"=>2, "S"=>1, "T"=>1, "W"=>4, "Y"=>4, "V"=>3, "B"=>0, "Z"=>4, "X"=>1, "*"=>6), | |
"G"=> Array("A"=>0, "R"=>3, "N"=>1, "D"=>2, "C"=>4, "Q"=>3, "E"=>3, "G"=>6, "H"=>3, "I"=>5, "L"=>5, "K"=>2, "M"=>4, "F"=>4, "P"=>3, "S"=>1, "T"=>2, "W"=>4, "Y"=>5, "V"=>4, "B"=>1, "Z"=>3, "X"=>2, "*"=>6), | |
"H"=> Array("A"=>2, "R"=>0, "N"=>0, "D"=>2, "C"=>5, "Q"=>1, "E"=>1, "G"=>3, "H"=>8, "I"=>4, "L"=>3, "K"=>1, "M"=>3, "F"=>2, "P"=>3, "S"=>1, "T"=>2, "W"=>3, "Y"=>2, "V"=>4, "B"=>1, "Z"=>0, "X"=>2, "*"=>6), | |
"I"=> Array("A"=>2, "R"=>4, "N"=>4, "D"=>5, "C"=>2, "Q"=>4, "E"=>4, "G"=>5, "H"=>4, "I"=>5, "L"=>1, "K"=>3, "M"=>1, "F"=>1, "P"=>4, "S"=>3, "T"=>1, "W"=>3, "Y"=>2, "V"=>3, "B"=>5, "Z"=>4, "X"=>2, "*"=>6), | |
"L"=> Array("A"=>2, "R"=>3, "N"=>4, "D"=>5, "C"=>2, "Q"=>3, "E"=>4, "G"=>5, "H"=>3, "I"=>1, "L"=>4, "K"=>3, "M"=>2, "F"=>0, "P"=>4, "S"=>3, "T"=>2, "W"=>3, "Y"=>2, "V"=>0, "B"=>5, "Z"=>4, "X"=>2, "*"=>6), | |
"K"=> Array("A"=>1, "R"=>2, "N"=>0, "D"=>1, "C"=>4, "Q"=>1, "E"=>0, "G"=>2, "H"=>1, "I"=>3, "L"=>3, "K"=>6, "M"=>2, "F"=>4, "P"=>2, "S"=>1, "T"=>1, "W"=>5, "Y"=>3, "V"=>3, "B"=>1, "Z"=>1, "X"=>1, "*"=>6), | |
"M"=> Array("A"=>2, "R"=>2, "N"=>3, "D"=>4, "C"=>2, "Q"=>0, "E"=>3, "G"=>4, "H"=>3, "I"=>1, "L"=>2, "K"=>2, "M"=>7, "F"=>1, "P"=>3, "S"=>2, "T"=>1, "W"=>2, "Y"=>2, "V"=>0, "B"=>4, "Z"=>2, "X"=>1, "*"=>6), | |
"F"=> Array("A"=>3, "R"=>4, "N"=>4, "D"=>4, "C"=>3, "Q"=>4, "E"=>4, "G"=>4, "H"=>2, "I"=>1, "L"=>0, "K"=>4, "M"=>1, "F"=>7, "P"=>4, "S"=>3, "T"=>3, "W"=>0, "Y"=>3, "V"=>1, "B"=>4, "Z"=>4, "X"=>2, "*"=>6), | |
"P"=> Array("A"=>1, "R"=>2, "N"=>3, "D"=>2, "C"=>4, "Q"=>2, "E"=>2, "G"=>3, "H"=>3, "I"=>4, "L"=>4, "K"=>2, "M"=>3, "F"=>4, "P"=>8, "S"=>1, "T"=>2, "W"=>5, "Y"=>4, "V"=>3, "B"=>3, "Z"=>2, "X"=>2, "*"=>6), | |
"S"=> Array("A"=>1, "R"=>1, "N"=>0, "D"=>1, "C"=>2, "Q"=>1, "E"=>1, "G"=>1, "H"=>1, "I"=>3, "L"=>3, "K"=>1, "M"=>2, "F"=>3, "P"=>1, "S"=>5, "T"=>1, "W"=>4, "Y"=>2, "V"=>2, "B"=>0, "Z"=>1, "X"=>1, "*"=>6), | |
"T"=> Array("A"=>0, "R"=>2, "N"=>0, "D"=>2, "C"=>2, "Q"=>1, "E"=>1, "G"=>2, "H"=>2, "I"=>1, "L"=>2, "K"=>1, "M"=>1, "F"=>3, "P"=>2, "S"=>1, "T"=>5, "W"=>4, "Y"=>2, "V"=>0, "B"=>1, "Z"=>1, "X"=>1, "*"=>6), | |
"W"=> Array("A"=>3, "R"=>4, "N"=>5, "D"=>6, "C"=>4, "Q"=>3, "E"=>4, "G"=>4, "H"=>3, "I"=>3, "L"=>3, "K"=>5, "M"=>2, "F"=>0, "P"=>5, "S"=>4, "T"=>4, "W"=>11, "Y"=>2, "V"=>3, "B"=>5, "Z"=>4, "X"=>3, "*"=>6), | |
"Y"=> Array("A"=>3, "R"=>3, "N"=>3, "D"=>4, "C"=>3, "Q"=>2, "E"=>4, "G"=>5, "H"=>2, "I"=>2, "L"=>2, "K"=>3, "M"=>2, "F"=>3, "P"=>4, "S"=>2, "T"=>2, "W"=>2, "Y"=>7, "V"=>2, "B"=>4, "Z"=>3, "X"=>2, "*"=>6), | |
"V"=> Array("A"=>1, "R"=>3, "N"=>4, "D"=>4, "C"=>1, "Q"=>3, "E"=>3, "G"=>4, "H"=>4, "I"=>3, "L"=>0, "K"=>3, "M"=>0, "F"=>1, "P"=>3, "S"=>2, "T"=>0, "W"=>3, "Y"=>2, "V"=>5, "B"=>4, "Z"=>3, "X"=>1, "*"=>6), | |
"B"=> Array("A"=>2, "R"=>2, "N"=>4, "D"=>4, "C"=>4, "Q"=>1, "E"=>0, "G"=>1, "H"=>1, "I"=>5, "L"=>5, "K"=>1, "M"=>4, "F"=>4, "P"=>3, "S"=>0, "T"=>1, "W"=>5, "Y"=>4, "V"=>4, "B"=>4, "Z"=>0, "X"=>2, "*"=>6), | |
"Z"=> Array("A"=>1, "R"=>0, "N"=>1, "D"=>1, "C"=>5, "Q"=>4, "E"=>4, "G"=>3, "H"=>0, "I"=>4, "L"=>4, "K"=>1, "M"=>2, "F"=>4, "P"=>2, "S"=>1, "T"=>1, "W"=>4, "Y"=>3, "V"=>3, "B"=>0, "Z"=>4, "X"=>1, "*"=>6), | |
"X"=> Array("A"=>1, "R"=>2, "N"=>2, "D"=>2, "C"=>3, "Q"=>1, "E"=>1, "G"=>2, "H"=>2, "I"=>2, "L"=>2, "K"=>1, "M"=>1, "F"=>2, "P"=>2, "S"=>1, "T"=>1, "W"=>3, "Y"=>2, "V"=>1, "B"=>2, "Z"=>1, "X"=>2, "*"=>6), | |
"*"=> Array("A"=>6, "R"=>6, "N"=>6, "D"=>6, "C"=>6, "Q"=>6, "E"=>6, "G"=>6, "H"=>6, "I"=>6, "L"=>6, "K"=>6, "M"=>6, "F"=>6, "P"=>6, "S"=>6, "T"=>6, "W"=>6, "Y"=>6, "V"=>6, "B"=>6, "Z"=>6, "X"=>6, "*"=>1) | |
); | |
$this->smith_waterman(); | |
} | |
public function clean_token($token) { | |
$token = preg_replace('/J/', 'X', $token); | |
return $token; | |
} | |
public function smith_waterman() { | |
$this->score = 0.0; | |
//Weights - use blosum85 | |
$deletion = -10; | |
$insertion = -10; | |
// Lengths of strings | |
$X = $this->str1; | |
$Y = $this->str2; | |
$m = strlen($X); | |
$n = strlen($Y); | |
// echo "<br>Printing seq 1 and 2:<br> Length: " . $m . " " . $X . "<br> Length: " . $n . " " . $Y; | |
// Create and initialise matrix for dynamic programming | |
$H = array (); | |
// echo "<br><br>Printing initial matrix<br><br>"; | |
for ($i = 0; $i <= $m; $i++) { | |
$H[$i][0] = 0; | |
} | |
for ($j = 0; $j <= $m; $j++) { | |
$H[0][$j] = 0; | |
} | |
$max_i = 0; | |
$max_j = 0; | |
$max_H = 0; | |
//calculate alignment matrix | |
echo "<br>"; | |
for ($i = 1; $i <= $m; $i++){ | |
for ($j = 1; $j <= $n; $j++){ | |
$a = $H[$i -1][$j -1]; | |
$s1 = $this->clean_token($X[$i -1]); | |
$s2 = $this->clean_token($Y[$j -1]); | |
// echo "<br>Printing seq 1 and 2 - Char1:" . $s1 . " vs Char2: " . $s2 . " Score: "; | |
// Compute score of four possible situations (match, mismatch, deletion, insertion | |
$matrix_comp = $this->blosum55; | |
$a += $matrix_comp[$s1][$s2]; //match or mismatch | |
// echo $a . " vs gap: " . $deletion . " Chosen: "; | |
$b = $H[$i -1][$j] + $deletion; //deletion | |
$c = $H[$i][$j -1] + $insertion; //insertion | |
$H[$i][$j] = max(max($a, $b), $c); //best result calculated | |
// echo $H[$i][$j] . "<br>"; | |
if ($H[$i][$j] > $max_H) { | |
$max_H = $H[$i][$j]; | |
$max_i = $i; | |
$max_j = $j; | |
} | |
} | |
} | |
echo '<table style="font-size: 20px">'; | |
for ($i = 0; $i <= $m; $i++) { | |
echo "<tr>"; | |
for ($j = 0; $j <= $m; $j++) { | |
echo '<td style="padding: 5px"> ' . $H[$i][$j] . " </td>"; | |
// $H[$i][$j] = 0; | |
} | |
echo "</tr>"; | |
} | |
echo "</table>"; | |
// $maximum_possible_score = count($Y) * $match; | |
// $this->score = $max_H / $maximum_possible_score; | |
//Traceback to Recover alignments initiated: | |
$aligned_seq1 = array (); | |
$aligned_seq2 = array (); | |
$value = $H[$max_i][$max_j]; | |
$i = $max_i -1; | |
$j = $max_j -1; | |
while (($value != 0) && (($i != 0) && ($j != 0))) { | |
$s1 = $this->clean_token($X[$i]); | |
$s2 = $this->clean_token($Y[$j]); | |
$diagonal = $H[$i -1][$j -1]; | |
$up = $H[$i -1][$j]; | |
$left = $H[$i][$j -1]; | |
echo "Current: " . $H[$i][$j] . " Diag: " . $diagonal . " Up: " . $up . " Left: " . $left . "<br>"; | |
if (($diagonal >= $up) && ($diagonal >= $left)) { | |
echo "Chosen: Diag<br>"; | |
$aligned_seq1[] = $s1; | |
$aligned_seq2[] = $s2; | |
$i -= 1; | |
$j -= 1; | |
} elseif (($up > $diagonal) && ($up > $left)) { | |
echo "Chosen: Up<br>"; | |
$aligned_seq1[] = $s1; | |
$aligned_seq2[] = '-'; | |
$i -= 1; | |
} else { | |
echo "Chosen: Left<br>"; | |
$aligned_seq1[] = '-'; | |
$aligned_seq2[] = $s2; | |
$j -= 1; | |
} | |
} | |
$s1 = $this->clean_token($X[$i]); | |
$s2 = $this->clean_token($Y[$j]); | |
$aligned_seq1[] = $s1; | |
$aligned_seq2[] = $s2; | |
echo '<table><tr><td style="padding: 5px">'; | |
print_r(implode('</td><td style="padding: 5px">', array_reverse($aligned_seq1))); | |
echo "</td></tr><tr><td>"; | |
print_r(implode('</td><td style="padding: 5px">', array_reverse($aligned_seq2))); | |
echo "</td></tr></table>"; | |
} | |
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment