Created
April 7, 2019 10:11
-
-
Save KirillLykov/12ec47fecb238aeb6699389661ed0028 to your computer and use it in GitHub Desktop.
Count unique subsequences of DNA string, solution using DFA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Count number of unique subsequencies of DNA sequence | |
| #include <bits/stdc++.h> | |
| using namespace std; | |
| using lint = long long int; | |
| // only 4 letters | |
| int char2int(char c) { | |
| if (c == 'A') return 0; | |
| if (c == 'C') return 1; | |
| if (c == 'G') return 2; | |
| return 3; | |
| } | |
| // internal representation if DFA is array Nx4 | |
| const vector<vector<int>> buildDFA(const string& s) { | |
| const size_t n = s.size(); | |
| vector<vector<int>> dfa(n+1); | |
| vector<int> next(4, -1); | |
| for (int i = n; i >= 0; --i) { | |
| dfa[i] = next; | |
| if (i != 0) | |
| next[char2int(s[i-1])] = i; | |
| } | |
| return dfa; | |
| } | |
| lint countUniqueSubseq(const string& s) { | |
| const size_t n = s.size(); | |
| // 1. Build DFA by the input sequece | |
| // For that, for each position i in s | |
| // consider edges labeled with 'ACGT' to be | |
| // the index of the first occurence of the | |
| // corresponding index on the right. | |
| // Enumerate position in string starting from 1 in DFA. | |
| // For example, | |
| // 123 | |
| // aba => 0 -a-> 1 -a->3 | |
| // | b ^ | |
| // | V / | |
| // \-b-> 2 --a | |
| const vector<vector<int>>& dfa = buildDFA(s); | |
| // 2. Now we can compute for each vertex number of | |
| // pathes which end in it. Lets call this value weight | |
| // Vertices are numbred in topolocial order, which means | |
| // that we can traver dfa from left to right and carry forward | |
| // weight of current vertex | |
| vector<lint> weight(n+1,0); // weight[1] correspond to s[0] | |
| weight[0] = 1; | |
| for (int i = 0; i <= n; ++i) { | |
| for (int letter = 0; letter < 4; ++letter) { | |
| if (dfa[i][letter] != -1) | |
| weight[dfa[i][letter]] += weight[i]; | |
| } | |
| } | |
| // 3. Just accumulate all the weights | |
| lint res = 0; | |
| for (int i = 1; i <= n; ++i) | |
| res = res + weight[i]; | |
| return res; | |
| } | |
| int main() { | |
| ios_base::sync_with_stdio(0); cin.tie(NULL); | |
| string s; | |
| cin >> s; | |
| cout << countUniqueSubseq(s) << endl; | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment