In order to pull search results from Wikipedia, Wiktionary, the Gutenberg book collection and various other sources, the following can be patched together:
nginx
as proxy to provide CORS,
kiwix-serve
as backend,
and XHR from a web app using JS is then possible; the results, however, need to be parsed either line-wise (easy) or with html2json
(complex) to get the formatted results:
nginx
config:
# Reverse proxy in front of kiwix-serve (listening on 127.0.0.1:8081) that
# strips the /zim prefix and adds the CORS headers a browser requires before
# it lets a web app on another origin read the response.
location /zim {
    # CORS preflight: answer directly with an empty 204 instead of
    # forwarding the OPTIONS request to the backend.
    if ($request_method = 'OPTIONS') {
        add_header 'Access-Control-Allow-Origin' '*';
        add_header 'Access-Control-Allow-Methods' 'GET, POST, OPTIONS';
        add_header 'Access-Control-Allow-Headers' 'Authorization';
        # NOTE(review): browsers reject credentialed requests when
        # Allow-Origin is the wildcard '*'; if credentials (e.g. the
        # auth_basic below) are really needed, a concrete origin must be
        # sent instead of '*'.
        add_header 'Access-Control-Allow-Credentials' 'true';
        add_header 'Content-Type' 'text/plain; charset=utf-8';
        add_header 'Content-Length' 0;
        return 204;
    }

    # The proxied response itself must carry Access-Control-Allow-Origin too;
    # the preflight headers above are not applied to it, and simple GET
    # requests never trigger a preflight at all. 'always' also covers error
    # responses so the web app can read their status.
    add_header 'Access-Control-Allow-Origin' '*' always;

    # Drop the /zim prefix before handing the request to kiwix-serve.
    rewrite ^/zim(.*) $1 break;
    #auth_basic "ZIM Service";
    #auth_basic_user_file /path/to/htpasswd;
    proxy_pass http://127.0.0.1:8081/;
    proxy_read_timeout 60;
}
# Add every *.zim file in the current directory to the library file library.xml.
kiwix-manage library.xml add *.zim
# Serve that library on 127.0.0.1:8081, the address the nginx proxy_pass forwards to.
kiwix-serve --port 8081 --address 127.0.0.1 --library library.xml
The RESTful endpoint then resides at http://somewhere/zim/search?pattern=test
The results can be parsed like this in JS, after receiving the response body in data:
/**
 * Parse the HTML of a kiwix-serve search page line by line.
 *
 * Assumes each hit is an `<li>` element whose link, title, and optional
 * `<cite>` snippet sit on separate lines, with the title on the line right
 * after the `<a href="...">` line. Parsing stops at the first line
 * containing `class="footer"`.
 *
 * @param {string} html - Raw response body of the search request.
 * @returns {Array<{source: string, link: string, text: (string|undefined), cite: (string|undefined)}>}
 *   One entry per list item that contained a link. `source` is the first run
 *   of lowercase letters in the link, capitalised, or '' if there is none.
 *   Anything that is not a string yields an empty array.
 */
function parseSearchResults(html) {
  const results = [];
  if (typeof html !== 'string') {
    return results;
  }

  let insideItem = false;
  let titleIsNext = false;
  let link;
  let text;
  let cite;

  for (const line of html.split(/\n/)) {
    if (/class="footer"/.test(line)) {
      break;
    }
    if (/<li>/.test(line)) {
      insideItem = true;
    }
    if (/<\/li>/.test(line)) {
      // Items without a link (or a stray closing tag) are skipped instead of
      // crashing or re-emitting the previous item's data.
      if (link !== undefined) {
        const name = link.match(/([a-z]+)/);
        const source = name ? name[1][0].toUpperCase() + name[1].slice(1) : '';
        results.push({ source: source, link: link, text: text, cite: cite });
      }
      // Reset the per-item state so nothing leaks into the next item.
      insideItem = false;
      titleIsNext = false;
      link = undefined;
      text = undefined;
      cite = undefined;
      continue;
    }
    if (!insideItem) {
      continue;
    }

    if (titleIsNext) {
      // The line following the anchor line is taken verbatim as the title.
      text = line;
      titleIsNext = false;
    }
    const anchor = line.match(/<a href="([^"]+)"/);
    if (anchor) {
      link = anchor[1];
      titleIsNext = true;
      continue;
    }
    const snippet = line.match(/<cite>(.+)<\/cite>/);
    if (snippet) {
      cite = snippet[1];
    }
  }
  return results;
}

// `data` holds the response body of the search request.
var r = parseSearchResults(data);
or using html2json
(see https://github.com/Jxck/html2json, which also requires htmlparser):
/**
 * Collect the search hits from the html2json node that lists them.
 *
 * Only the odd-indexed children are treated as result entries (presumably
 * the even ones are whitespace text nodes - confirm against the real tree).
 * Within an entry, child 1 is read as the link element and child 3 as the
 * cite element; only the cite's direct text children are joined, each
 * followed by a space.
 *
 * @param {object} listNode - html2json node whose `child` array holds the entries.
 * @returns {Array<{link: string, text: string, cite: string}>} Entries that
 *   lack a usable link element are skipped; a missing cite yields ''.
 */
function collectSearchResults(listNode) {
  const results = [];
  const entries = (listNode && listNode.child) || [];
  for (let i = 1; i < entries.length; i += 2) {
    const parts = (entries[i] && entries[i].child) || [];
    const anchor = parts[1];
    const citeNode = parts[3];
    if (!anchor || !anchor.attr || !anchor.child || !anchor.child[0]) {
      continue;
    }
    let cite = '';
    for (const piece of (citeNode && citeNode.child) || []) {
      if (piece.node === 'text') {
        cite += piece.text + ' ';
      }
    }
    results.push({ link: anchor.attr.href, text: anchor.child[0].text, cite: cite });
  }
  return results;
}

data = html2json(data);
// Walk down a fixed child-index path; presumably this leads to the result
// list of the kiwix-serve search page - verify against the actual markup.
for (const index of [0, 3, 7, 1]) {
  data = data && data.child && data.child[index];
}
if (!data || typeof data !== 'object') {
  // Path not present (different page layout or empty document): expose an
  // empty result set instead of failing on `data.results`.
  data = { child: [] };
}
data.results = collectSearchResults(data);
The html2json variant
looks simpler, but is actually more complex, as it requires probing deep into the HTML tree.
A list of library items, as listed at the root http://somewhere:8081/zim/, can be obtained by fetching that page
and parsing the HTML (data) like this:
/**
 * Parse the HTML of the kiwix-serve library page line by line.
 *
 * A book starts on a line that opens `<a href="..."><div class="book">`,
 * contributes `book__title`, `book__description`, and `book__info` divs
 * (one per line), and ends on a line containing `</div></a>`.
 *
 * @param {string} html - Raw response body of the library page.
 * @returns {Array<{id: string, url: string, title: (string|undefined), description: (string|undefined), info: (string|undefined)}>}
 *   One entry per closed book; `id` is the link without its leading slash.
 *   Anything that is not a string yields an empty array.
 */
function parseLibraryListing(html) {
  const books = [];
  if (typeof html !== 'string') {
    return books;
  }

  let url;
  let fields = {};

  for (const line of html.split(/\n/)) {
    const opening = line.match(/<a href="([^"]+)"><div class=["']book["']>/);
    if (opening) {
      // A new book starts: forget anything collected for a previous one.
      url = opening[1];
      fields = {};
      continue;
    }

    const field = line.match(/div class=['"]book__(title|description|info)['"][^>]*>([^<]+)<\/div>/);
    if (field) {
      fields[field[1]] = field[2];
      continue;
    }

    if (/<\/div><\/a>/.test(line)) {
      // Ignore a closing tag that was not preceded by a book link.
      if (url !== undefined) {
        books.push({
          id: url.replace(/^\//, ''),
          url: url,
          title: fields.title,
          description: fields.description,
          info: fields.info,
        });
      }
      url = undefined;
      fields = {};
    }
  }
  return books;
}

// `data` holds the response body of the library page.
zim_sets = parseLibraryListing(data);