Skip to content

Instantly share code, notes, and snippets.

@marceloqla
Forked from petrovitch/smith_waterman.php
Last active February 1, 2018 17:34
Show Gist options
  • Save marceloqla/02740e5b709262b04c32df6a5bb4b90d to your computer and use it in GitHub Desktop.
Save marceloqla/02740e5b709262b04c32df6a5bb4b90d to your computer and use it in GitHub Desktop.
Smith Waterman
<?php
/**
* smith_waterman.php
*
* The Smith–Waterman algorithm performs local sequence alignment to detect
* similarities in string. Smith-Waterman has been used for sequencing DNA,
* and for detecting plagiarism and collusion by comparing sequences of text.
*
* @example
* $str1 = 'Smith-Waterman and Systolic PE Array are well-known dynamic programming algorithms.';
* $str2 = 'Smith-Waterman algorithm is a well-known algorithm for performing sequence alignment.';
* $sw = new SmithWaterman($str1, $str2);
* $score = $sw->get_score(); // Percent score of similarity
* $html = $sw->get_html(); // String for viewing similarities (highlighted)
* echo number_format($score, 4) . ' ' . $html;
*
*
* Changes by marceloqla:
*
* Alignment Viz generated automatically in a html table. Check variables $aligned_seq1 and $aligned_seq2
* if you want the sequences.
*
* Get Score function not included in this code - still working on a way to improve it
*
* Included identation in code for better clarity
*
* Code was adapted for biological sequences: Included a Blosum Matrix.
* If you want to use a different matrix contact me or modify the code yourself
*
* Traceback algorithm was modified for better clarity
*
* Code currently prints out the alignment score matrix (Lines 119-128) and the choices taken at each traceback step
* (Lines 144, 146, 152, 157)
*
*/
private $str1;
private $str2;
private $html;
private $score;
private $blosum85;
public function __construct($str1, $str2) {
$this->str1 = $str1;
$this->str2 = $str2;
$this->blosum55 = Array(
"A"=> Array("A"=>5, "R"=>2, "N"=>2, "D"=>2, "C"=>1, "Q"=>1, "E"=>1, "G"=>0, "H"=>2, "I"=>2, "L"=>2, "K"=>1, "M"=>2, "F"=>3, "P"=>1, "S"=>1, "T"=>0, "W"=>3, "Y"=>3, "V"=>1, "B"=>2, "Z"=>1, "X"=>1, "*"=>6),
"R"=> Array("A"=>2, "R"=>6, "N"=>1, "D"=>2, "C"=>4, "Q"=>1, "E"=>1, "G"=>3, "H"=>0, "I"=>4, "L"=>3, "K"=>2, "M"=>2, "F"=>4, "P"=>2, "S"=>1, "T"=>2, "W"=>4, "Y"=>3, "V"=>3, "B"=>2, "Z"=>0, "X"=>2, "*"=>6),
"N"=> Array("A"=>2, "R"=>1, "N"=>7, "D"=>1, "C"=>4, "Q"=>0, "E"=>1, "G"=>1, "H"=>0, "I"=>4, "L"=>4, "K"=>0, "M"=>3, "F"=>4, "P"=>3, "S"=>0, "T"=>0, "W"=>5, "Y"=>3, "V"=>4, "B"=>4, "Z"=>1, "X"=>2, "*"=>6),
"D"=> Array("A"=>2, "R"=>2, "N"=>1, "D"=>7, "C"=>5, "Q"=>1, "E"=>1, "G"=>2, "H"=>2, "I"=>5, "L"=>5, "K"=>1, "M"=>4, "F"=>4, "P"=>2, "S"=>1, "T"=>2, "W"=>6, "Y"=>4, "V"=>4, "B"=>4, "Z"=>1, "X"=>2, "*"=>6),
"C"=> Array("A"=>1, "R"=>4, "N"=>4, "D"=>5, "C"=>9, "Q"=>4, "E"=>5, "G"=>4, "H"=>5, "I"=>2, "L"=>2, "K"=>4, "M"=>2, "F"=>3, "P"=>4, "S"=>2, "T"=>2, "W"=>4, "Y"=>3, "V"=>1, "B"=>4, "Z"=>5, "X"=>3, "*"=>6),
"Q"=> Array("A"=>1, "R"=>1, "N"=>0, "D"=>1, "C"=>4, "Q"=>6, "E"=>2, "G"=>3, "H"=>1, "I"=>4, "L"=>3, "K"=>1, "M"=>0, "F"=>4, "P"=>2, "S"=>1, "T"=>1, "W"=>3, "Y"=>2, "V"=>3, "B"=>1, "Z"=>4, "X"=>1, "*"=>6),
"E"=> Array("A"=>1, "R"=>1, "N"=>1, "D"=>1, "C"=>5, "Q"=>2, "E"=>6, "G"=>3, "H"=>1, "I"=>4, "L"=>4, "K"=>0, "M"=>3, "F"=>4, "P"=>2, "S"=>1, "T"=>1, "W"=>4, "Y"=>4, "V"=>3, "B"=>0, "Z"=>4, "X"=>1, "*"=>6),
"G"=> Array("A"=>0, "R"=>3, "N"=>1, "D"=>2, "C"=>4, "Q"=>3, "E"=>3, "G"=>6, "H"=>3, "I"=>5, "L"=>5, "K"=>2, "M"=>4, "F"=>4, "P"=>3, "S"=>1, "T"=>2, "W"=>4, "Y"=>5, "V"=>4, "B"=>1, "Z"=>3, "X"=>2, "*"=>6),
"H"=> Array("A"=>2, "R"=>0, "N"=>0, "D"=>2, "C"=>5, "Q"=>1, "E"=>1, "G"=>3, "H"=>8, "I"=>4, "L"=>3, "K"=>1, "M"=>3, "F"=>2, "P"=>3, "S"=>1, "T"=>2, "W"=>3, "Y"=>2, "V"=>4, "B"=>1, "Z"=>0, "X"=>2, "*"=>6),
"I"=> Array("A"=>2, "R"=>4, "N"=>4, "D"=>5, "C"=>2, "Q"=>4, "E"=>4, "G"=>5, "H"=>4, "I"=>5, "L"=>1, "K"=>3, "M"=>1, "F"=>1, "P"=>4, "S"=>3, "T"=>1, "W"=>3, "Y"=>2, "V"=>3, "B"=>5, "Z"=>4, "X"=>2, "*"=>6),
"L"=> Array("A"=>2, "R"=>3, "N"=>4, "D"=>5, "C"=>2, "Q"=>3, "E"=>4, "G"=>5, "H"=>3, "I"=>1, "L"=>4, "K"=>3, "M"=>2, "F"=>0, "P"=>4, "S"=>3, "T"=>2, "W"=>3, "Y"=>2, "V"=>0, "B"=>5, "Z"=>4, "X"=>2, "*"=>6),
"K"=> Array("A"=>1, "R"=>2, "N"=>0, "D"=>1, "C"=>4, "Q"=>1, "E"=>0, "G"=>2, "H"=>1, "I"=>3, "L"=>3, "K"=>6, "M"=>2, "F"=>4, "P"=>2, "S"=>1, "T"=>1, "W"=>5, "Y"=>3, "V"=>3, "B"=>1, "Z"=>1, "X"=>1, "*"=>6),
"M"=> Array("A"=>2, "R"=>2, "N"=>3, "D"=>4, "C"=>2, "Q"=>0, "E"=>3, "G"=>4, "H"=>3, "I"=>1, "L"=>2, "K"=>2, "M"=>7, "F"=>1, "P"=>3, "S"=>2, "T"=>1, "W"=>2, "Y"=>2, "V"=>0, "B"=>4, "Z"=>2, "X"=>1, "*"=>6),
"F"=> Array("A"=>3, "R"=>4, "N"=>4, "D"=>4, "C"=>3, "Q"=>4, "E"=>4, "G"=>4, "H"=>2, "I"=>1, "L"=>0, "K"=>4, "M"=>1, "F"=>7, "P"=>4, "S"=>3, "T"=>3, "W"=>0, "Y"=>3, "V"=>1, "B"=>4, "Z"=>4, "X"=>2, "*"=>6),
"P"=> Array("A"=>1, "R"=>2, "N"=>3, "D"=>2, "C"=>4, "Q"=>2, "E"=>2, "G"=>3, "H"=>3, "I"=>4, "L"=>4, "K"=>2, "M"=>3, "F"=>4, "P"=>8, "S"=>1, "T"=>2, "W"=>5, "Y"=>4, "V"=>3, "B"=>3, "Z"=>2, "X"=>2, "*"=>6),
"S"=> Array("A"=>1, "R"=>1, "N"=>0, "D"=>1, "C"=>2, "Q"=>1, "E"=>1, "G"=>1, "H"=>1, "I"=>3, "L"=>3, "K"=>1, "M"=>2, "F"=>3, "P"=>1, "S"=>5, "T"=>1, "W"=>4, "Y"=>2, "V"=>2, "B"=>0, "Z"=>1, "X"=>1, "*"=>6),
"T"=> Array("A"=>0, "R"=>2, "N"=>0, "D"=>2, "C"=>2, "Q"=>1, "E"=>1, "G"=>2, "H"=>2, "I"=>1, "L"=>2, "K"=>1, "M"=>1, "F"=>3, "P"=>2, "S"=>1, "T"=>5, "W"=>4, "Y"=>2, "V"=>0, "B"=>1, "Z"=>1, "X"=>1, "*"=>6),
"W"=> Array("A"=>3, "R"=>4, "N"=>5, "D"=>6, "C"=>4, "Q"=>3, "E"=>4, "G"=>4, "H"=>3, "I"=>3, "L"=>3, "K"=>5, "M"=>2, "F"=>0, "P"=>5, "S"=>4, "T"=>4, "W"=>11, "Y"=>2, "V"=>3, "B"=>5, "Z"=>4, "X"=>3, "*"=>6),
"Y"=> Array("A"=>3, "R"=>3, "N"=>3, "D"=>4, "C"=>3, "Q"=>2, "E"=>4, "G"=>5, "H"=>2, "I"=>2, "L"=>2, "K"=>3, "M"=>2, "F"=>3, "P"=>4, "S"=>2, "T"=>2, "W"=>2, "Y"=>7, "V"=>2, "B"=>4, "Z"=>3, "X"=>2, "*"=>6),
"V"=> Array("A"=>1, "R"=>3, "N"=>4, "D"=>4, "C"=>1, "Q"=>3, "E"=>3, "G"=>4, "H"=>4, "I"=>3, "L"=>0, "K"=>3, "M"=>0, "F"=>1, "P"=>3, "S"=>2, "T"=>0, "W"=>3, "Y"=>2, "V"=>5, "B"=>4, "Z"=>3, "X"=>1, "*"=>6),
"B"=> Array("A"=>2, "R"=>2, "N"=>4, "D"=>4, "C"=>4, "Q"=>1, "E"=>0, "G"=>1, "H"=>1, "I"=>5, "L"=>5, "K"=>1, "M"=>4, "F"=>4, "P"=>3, "S"=>0, "T"=>1, "W"=>5, "Y"=>4, "V"=>4, "B"=>4, "Z"=>0, "X"=>2, "*"=>6),
"Z"=> Array("A"=>1, "R"=>0, "N"=>1, "D"=>1, "C"=>5, "Q"=>4, "E"=>4, "G"=>3, "H"=>0, "I"=>4, "L"=>4, "K"=>1, "M"=>2, "F"=>4, "P"=>2, "S"=>1, "T"=>1, "W"=>4, "Y"=>3, "V"=>3, "B"=>0, "Z"=>4, "X"=>1, "*"=>6),
"X"=> Array("A"=>1, "R"=>2, "N"=>2, "D"=>2, "C"=>3, "Q"=>1, "E"=>1, "G"=>2, "H"=>2, "I"=>2, "L"=>2, "K"=>1, "M"=>1, "F"=>2, "P"=>2, "S"=>1, "T"=>1, "W"=>3, "Y"=>2, "V"=>1, "B"=>2, "Z"=>1, "X"=>2, "*"=>6),
"*"=> Array("A"=>6, "R"=>6, "N"=>6, "D"=>6, "C"=>6, "Q"=>6, "E"=>6, "G"=>6, "H"=>6, "I"=>6, "L"=>6, "K"=>6, "M"=>6, "F"=>6, "P"=>6, "S"=>6, "T"=>6, "W"=>6, "Y"=>6, "V"=>6, "B"=>6, "Z"=>6, "X"=>6, "*"=>1)
);
$this->smith_waterman();
}
public function clean_token($token) {
$token = preg_replace('/J/', 'X', $token);
return $token;
}
public function smith_waterman() {
$this->score = 0.0;
//Weights - use blosum85
$deletion = -10;
$insertion = -10;
// Lengths of strings
$X = $this->str1;
$Y = $this->str2;
$m = strlen($X);
$n = strlen($Y);
// echo "<br>Printing seq 1 and 2:<br> Length: " . $m . " " . $X . "<br> Length: " . $n . " " . $Y;
// Create and initialise matrix for dynamic programming
$H = array ();
// echo "<br><br>Printing initial matrix<br><br>";
for ($i = 0; $i <= $m; $i++) {
$H[$i][0] = 0;
}
for ($j = 0; $j <= $m; $j++) {
$H[0][$j] = 0;
}
$max_i = 0;
$max_j = 0;
$max_H = 0;
//calculate alignment matrix
echo "<br>";
for ($i = 1; $i <= $m; $i++){
for ($j = 1; $j <= $n; $j++){
$a = $H[$i -1][$j -1];
$s1 = $this->clean_token($X[$i -1]);
$s2 = $this->clean_token($Y[$j -1]);
// echo "<br>Printing seq 1 and 2 - Char1:" . $s1 . " vs Char2: " . $s2 . " Score: ";
// Compute score of four possible situations (match, mismatch, deletion, insertion
$matrix_comp = $this->blosum55;
$a += $matrix_comp[$s1][$s2]; //match or mismatch
// echo $a . " vs gap: " . $deletion . " Chosen: ";
$b = $H[$i -1][$j] + $deletion; //deletion
$c = $H[$i][$j -1] + $insertion; //insertion
$H[$i][$j] = max(max($a, $b), $c); //best result calculated
// echo $H[$i][$j] . "<br>";
if ($H[$i][$j] > $max_H) {
$max_H = $H[$i][$j];
$max_i = $i;
$max_j = $j;
}
}
}
echo '<table style="font-size: 20px">';
for ($i = 0; $i <= $m; $i++) {
echo "<tr>";
for ($j = 0; $j <= $m; $j++) {
echo '<td style="padding: 5px"> ' . $H[$i][$j] . " </td>";
// $H[$i][$j] = 0;
}
echo "</tr>";
}
echo "</table>";
// $maximum_possible_score = count($Y) * $match;
// $this->score = $max_H / $maximum_possible_score;
//Traceback to Recover alignments initiated:
$aligned_seq1 = array ();
$aligned_seq2 = array ();
$value = $H[$max_i][$max_j];
$i = $max_i -1;
$j = $max_j -1;
while (($value != 0) && (($i != 0) && ($j != 0))) {
$s1 = $this->clean_token($X[$i]);
$s2 = $this->clean_token($Y[$j]);
$diagonal = $H[$i -1][$j -1];
$up = $H[$i -1][$j];
$left = $H[$i][$j -1];
echo "Current: " . $H[$i][$j] . " Diag: " . $diagonal . " Up: " . $up . " Left: " . $left . "<br>";
if (($diagonal >= $up) && ($diagonal >= $left)) {
echo "Chosen: Diag<br>";
$aligned_seq1[] = $s1;
$aligned_seq2[] = $s2;
$i -= 1;
$j -= 1;
} elseif (($up > $diagonal) && ($up > $left)) {
echo "Chosen: Up<br>";
$aligned_seq1[] = $s1;
$aligned_seq2[] = '-';
$i -= 1;
} else {
echo "Chosen: Left<br>";
$aligned_seq1[] = '-';
$aligned_seq2[] = $s2;
$j -= 1;
}
}
$s1 = $this->clean_token($X[$i]);
$s2 = $this->clean_token($Y[$j]);
$aligned_seq1[] = $s1;
$aligned_seq2[] = $s2;
echo '<table><tr><td style="padding: 5px">';
print_r(implode('</td><td style="padding: 5px">', array_reverse($aligned_seq1)));
echo "</td></tr><tr><td>";
print_r(implode('</td><td style="padding: 5px">', array_reverse($aligned_seq2)));
echo "</td></tr></table>";
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment