Last active
August 29, 2015 14:09
-
-
Save andreasvc/ab3845503eec3da55585 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <sdsl/suffix_arrays.hpp> | |
#include <string> | |
#include <iostream> | |
#include <algorithm> | |
#include <iomanip> | |
#include <sdsl/vectors.hpp> | |
#include <fstream> | |
#include <sstream> | |
#include <unordered_map> | |
using namespace sdsl; | |
using namespace std; | |
int main(int argc, char** argv) { | |
if (argc < 2) { | |
cout << "Usage " << argv[0] << " text_file [max_locations]" << endl; | |
cout << " This program constructs a very compact FM-index" << endl; | |
cout << " which supports count, locate, and extract queries." << endl; | |
cout << " text_file Original text file." << endl; | |
cout << " max_locations Maximal number of location to report; 0 to disable [default 1000]." <<endl; | |
cout << "input: send queries to sdin; one query per line of space separated tokens" << endl; | |
cout << "output: lines of the form '[index1, index2, ...]'" << endl; | |
cout << "the output corresponds to the lines of input." << endl; | |
return 1; | |
} | |
size_t max_locations = 1000; | |
size_t sentcnt = 0; | |
int cnt = 2; | |
if (argc >= 3) | |
max_locations = atoi(argv[2]); | |
unordered_map<string, int> mapping; | |
vector<int> data; | |
vector<int> wordno2sentno; | |
string line; | |
// create mapping | |
// convert argv[1] to vector | |
// store sentence beginings as '0' token | |
ifstream is(argv[1]); | |
while (getline(is, line)) { | |
istringstream linestream(line); | |
string word; | |
data.push_back(2); // start of sentence | |
wordno2sentno.push_back(sentcnt); | |
while (linestream >> word) { | |
if (mapping.find(word) == mapping.end()) { | |
cnt++; | |
mapping[word] = cnt; | |
} | |
data.push_back(mapping[word]); | |
// simple but space inefficient sent. no. table: | |
wordno2sentno.push_back(sentcnt); | |
} | |
sentcnt++; | |
} | |
string index_suffix = ".fm9"; | |
string index_file = string(argv[1]) + index_suffix; | |
csa_wt<wt_int<rrr_vector<127> > > fm_index; | |
if (!load_from_file(fm_index, index_file)) { | |
ifstream in(argv[1]); | |
if (!in) { | |
cerr << "ERROR: File " << argv[1] << " does not exist. Exit." << endl; | |
return 1; | |
} | |
cerr << "No index " << index_file << " located. Building index now." << endl; | |
// copy to compressed vector (maybe store in there directly?) | |
int_vector<> data2(data.size()); | |
for (int i=0; i < data.size(); i++) | |
if (data[i] == 0) | |
cerr << "Zero at " << i << endl; | |
else | |
data2[i] = data[i]; | |
// create suffix tree from int_vector | |
construct_im(fm_index, data2); | |
// alternatively, write int vector to file | |
// string int_suffix = ".int"; | |
// string int_file = string(argv[1]) + int_suffix; | |
// store_to_file(data2, int_file); | |
// create suffix tree from file | |
// construct(fm_index, int_file, 0); | |
store_to_file(fm_index, index_file); | |
cerr << "Index construction complete, index requires "; | |
cerr << size_in_mega_bytes(fm_index) << " MiB." << endl; | |
} | |
cerr << "Input search terms and press Ctrl-D to exit." << endl; | |
while (getline(cin, line)) { | |
// apply mapping to query | |
vector<int> query; | |
istringstream linestream(line); | |
string word; | |
while (linestream >> word) { | |
auto search = mapping.find(word); | |
if (search == mapping.end()) | |
query.push_back(1); // unknown word, not part of mapping | |
else | |
query.push_back(search->second); | |
} | |
// perform query on suffix array | |
size_t m = query.size(); | |
size_t occs = sdsl::count(fm_index, query.begin(), query.end()); | |
cout << "["; | |
if (occs > 0) { | |
auto locations = locate(fm_index, query.begin(), query.begin() + m); | |
// optional: sort sentence numbers | |
// sort(locations.begin(), locations.end()); | |
if (max_locations > 0) | |
occs = min(occs, max_locations); | |
if (occs > 0) | |
cout << wordno2sentno[locations[0]]; | |
for (size_t i = 1; i < occs; ++i) | |
cout << ", " << wordno2sentno[locations[i]]; | |
} | |
cout << "]" << endl; | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
cd enit | |
zcat enit-kernels.table.m2.gz | python ../splitsubstr.py | |
cd ../ennl | |
zcat ennl-kernels.table.m2.gz | python ../splitsubstr.py | |
cd .. | |
./fm-index enit/train.tags.en-it.clean.tok.lc.en 0 < enit/source.txt | pv -l -t -r -b > enit/source.txt.idx & | |
./fm-index enit/train.tags.en-it.clean.tok.lc.it 0 < enit/target.txt | pv -l -t -r -b > enit/target.txt.idx & | |
./fm-index ennl/train.tags.nl-en.clean.tok.lc.en 0 < ennl/source.txt | pv -l -t -r -b > ennl/source.txt.idx & | |
./fm-index ennl/train.tags.nl-en.clean.tok.lc.nl 0 < ennl/target.txt | pv -l -t -r -b > ennl/target.txt.idx & | |
wait | |
gzip */*.txt */*.txt.idx |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# requires sidsl: | |
# git clone https://github.com/simongog/sdsl-lite.git | |
# cd sdsl-lite | |
# ./install.sh $HOME/.local | |
# uses pv to display progress (not essential) | |
# http://www.ivarch.com/programs/pv.shtml | |
all: fm-index indices | |
fm-index: | |
g++ fm-index.cpp -o fm-index -std=c++11 -DNDEBUG -O3 -msse4.2 \ | |
-lsdsl -ldivsufsort -ldivsufsort64 \ | |
-I${HOME}/.local/include -L${HOME}/.local/lib |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Read substring table from stdin and produce 2 files source.txt, and | |
target.txt, with a list of unique substrings.""" | |
import sys | |
target = set() | |
with open('source.txt', 'w') as src: | |
for line in sys.stdin: | |
if line.startswith('\t\t['): | |
target.add(line[3:line.index(']\t', 2)]) | |
elif line.startswith('\t['): | |
src.write(' '.join( | |
line[2:line.index(']', 2)].split(', ')) + '\n') | |
with open('target.txt', 'w') as tar: | |
tar.writelines(' '.join(a.split(', ')) + '\n' for a in target) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment