gluon wikipedia
(function () {
  // setup
  setup()
  // descriptor
  return {name: 'wikipedia'}
})

function setup () {
}
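// Note (assumption): gluon appears to evaluate this file in a sandbox, calling the
// wrapper above once for the plugin descriptor and then go(msg) for each incoming
// event; the `bot` and `http` objects used below are globals supplied by that host.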
function go (msg) {
  if (msg.method == 'irc.privmsg') {
    var cmd_match = /^!wikipedia(\s+(.+)\s*$)?/.exec(msg.params.message)
    if (cmd_match) {
      var term = cmd_match[2]
      bot.say(bot.admin_channel, msg.params.nick + ' wikipedia lookup: ' + term)
      var plain = wikipedia_lookup(term)
      if (plain) {
        bot.say(msg.params.channel, plain)
      }
    } else {
      // trynlp(msg)
    }
  }
}
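// Example: a channel message "!wikipedia Ada Lovelace" is announced in the admin
// channel and, if a short description is found, the first sentence is said back to
// the originating channel. A bare "!wikipedia" still matches, leaving `term` undefined.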
function trynlp (msg) {
  if (msg.params.message.split(' ').length >= 3) {
    var sub = nlp(msg.params.message)
    // nlp() returns undefined when no noun is found, so guard before .length
    if (sub && sub.length > 2) {
      bot.say('#pdxbots', 'I heard a noun from ' + msg.params.channel + ': ' + sub)
      var plain = wikipedia_lookup(sub)
      if (plain) {
        bot.say('#pdxbots', plain)
      }
    }
  }
}
// Look up `word`: try the exact (titleized) page first, fall back to a full-text
// search on a missing title, and follow a capitalization redirect page if one comes back.
function wikipedia_lookup (word) {
  var data = wikipage(titleize(word))
  if (data) {
    var text
    if (data.error) {
      if (data.error.code == 'missingtitle') {
        bot.say(bot.admin_channel, 'wikipedia: title match failed for ' + word)
        var srch = wikisearch(word)
        if (srch) {
          bot.say(bot.admin_channel, 'wikipedia: search hits ' + srch.query.searchinfo.totalhits)
          if (srch.query.searchinfo.totalhits > 0) {
            word = srch.query.search[0].title
            bot.say(bot.admin_channel, 'wikipedia: search first ' + word)
            data = wikipage(word)
            if (data.parse) {
              text = data.parse.text['*'].replace(/\n/g, '').replace(/\r/g, '')
            } else {
              if (data.error) {
                bot.say(bot.admin_channel, 'wikipedia: wikipage() ' + data.error.code)
              }
            }
          }
        }
      }
    }
    if (data.parse) {
      text = data.parse.text['*'].replace(/\n/g, '').replace(/\r/g, '')
      var recapital = /ul class="redirectText"><li><a href="\/wiki\/([^"]+)"/.exec(text)
      if (recapital) {
        bot.say(bot.admin_channel, 'wikipedia: recapital ' + word + ' to ' + recapital[1])
        data = wikipage(recapital[1])
        text = null
        if (data.parse) {
          text = data.parse.text['*'].replace(/\n/g, '').replace(/\r/g, '')
        }
      }
    }
    if (text) {
      return simpleparse(decodeEntities(text)) // + ' [' + data.parse.title + ']'
    }
  } else {
    bot.say(bot.admin_channel, 'wikipedia: no json from the API for ' + word)
  }
}
function simpleparse (text) {
  var shortdesc_regex = /div class=".*?mw-parser-output".*?<div class=".*?shortdescription.*?>(.*?)<\/div>/
  var pom = shortdesc_regex.exec(text)
  if (pom) {
    var plain = pom[1].replace(/<[^>]*>/g, '')
    // keep the first words up to a period that follows a lowercase letter
    // (avoids cutting at abbreviations such as "R.")
    var sentence = /(^(\([^)]+\)|[^()]+?)[^A-Z]\.)/.exec(plain)
    var desc = plain
    if (sentence) {
      desc = sentence[1]
    }
    return desc
  } else {
    bot.say(bot.admin_channel, 'wikipedia: no match in ' + text.substr(0, 80))
  }
}
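// Illustrative input (assumed shape of the rendered page HTML): the parsed article
// usually opens with <div class="mw-parser-output"> ... <div class="shortdescription ...">
// English mathematician and writer</div>; simpleparse() extracts that text, strips
// tags, and trims it to its first sentence.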
function wikipage (word) {
  var url = 'https://en.wikipedia.org/w/api.php?action=parse&page=' + encodeURIComponent(word) + '&format=json'
  var html = http.get(url)
  if (html) {
    var data = JSON.parse(html)
    return data
  }
}
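// MediaWiki's action=parse returns JSON like {parse: {title: ..., text: {'*': '<html>...'}}}
// on success, or {error: {code: 'missingtitle', ...}} when the page does not exist,
// which is exactly what wikipedia_lookup() branches on above.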
function wikisearch (word) {
  var url = 'https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=' + encodeURIComponent(word) + '&format=json'
  var html = http.get(url)
  if (html) {
    var data = JSON.parse(html)
    return data
  }
}
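// list=search responses report hit counts in query.searchinfo.totalhits and the ranked
// results in query.search[]; the first result's title is what wikipedia_lookup() retries with.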
function titleize (word) {
  var parts = word.split(' ')
  parts = parts.map(function (part) {
    return part[0].toUpperCase() + part.substring(1)
  })
  return parts.join('_')
}
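// e.g. titleize('ada lovelace') === 'Ada_Lovelace', matching the underscore form used
// in Wikipedia page titles; already-capitalized words pass through unchanged.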
function nlp (words) {
  var url = 'http://nlp.stanford.edu:8080/parser/index.jsp?query=' + encodeURIComponent(words)
  var html = http.get(url).replace(/\n/g, '').replace(/\r/g, '')
  // var subj = /nsubj\([^,]+, ([^)]+)\)/.exec(html)
  // return subj[1].split('-')[0]
  var subj = /<pre id="parse" class="spacingFree">([^<]+)</.exec(html)
  if (subj) {
    var nn = /\(NN ([^)]+)\)/.exec(subj[1])
    if (nn) {
      return nn[1]
    }
  }
}
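// This scrapes the HTML of the public Stanford parser demo page and pulls the first
// singular noun (NN) out of its bracketed parse tree, so it is fragile: any change to
// that page's markup breaks the regex and nlp() returns undefined.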
function wikiparse (text) {
  // Stub: `sentence` is never assigned by the handlers below, so this always returns
  // undefined, and the function is not called anywhere else in this plugin.
  var sentence
  parser.parse(text, {
    startElement: function (sTagName, oAttrs) {
      bot.say(bot.admin_channel, 'parse start: ' + sTagName)
    },
    endElement: function (sTagName) {},
    characters: function (s) {},
    comment: function (s) {}
  })
  return sentence
}
// Minimal regexp-based, SAX-style HTML tag scanner used by wikiparse().
var parser = {
  contentHandler: null,
  // regexps
  startTagRe: /^<([^>\s\/]+)((\s+[^=>\s]+(\s*=\s*((\"[^"]*\")|(\'[^']*\')|[^>\s]+))?)*)\s*\/?\s*>/m,
  endTagRe: /^<\/([^>\s]+)[^>]*>/m,
  attrRe: /([^=\s]+)(\s*=\s*((\"([^"]*)\")|(\'([^']*)\')|[^>\s]+))?/gm,
  parse: function (s, oHandler) {
    if (oHandler) {
      this.contentHandler = oHandler
    }
    var res, lc, lm, rc, index
    var treatAsChars = false
    var oThis = this
    while (s.length > 0) {
      // Comment
      if (s.substring(0, 4) == '<!--') {
        index = s.indexOf('-->')
        if (index != -1) {
          this.contentHandler.comment(s.substring(4, index))
          s = s.substring(index + 3)
          treatAsChars = false
        } else {
          treatAsChars = true
        }
      }
      // end tag
      else if (s.substring(0, 2) == '</') {
        if (this.endTagRe.test(s)) {
          lc = RegExp.leftContext
          lm = RegExp.lastMatch
          rc = RegExp.rightContext
          lm.replace(this.endTagRe, function () {
            return oThis.parseEndTag.apply(oThis, arguments)
          })
          s = rc
          treatAsChars = false
        } else {
          treatAsChars = true
        }
      }
      // start tag
      else if (s.charAt(0) == '<') {
        if (this.startTagRe.test(s)) {
          lc = RegExp.leftContext
          lm = RegExp.lastMatch
          rc = RegExp.rightContext
          lm.replace(this.startTagRe, function () {
            return oThis.parseStartTag.apply(oThis, arguments)
          })
          s = rc
          treatAsChars = false
        } else {
          treatAsChars = true
        }
      }
      if (treatAsChars) {
        index = s.indexOf('<')
        if (index == -1) {
          this.contentHandler.characters(s)
          s = ''
        } else {
          this.contentHandler.characters(s.substring(0, index))
          s = s.substring(index)
        }
      }
      treatAsChars = true
    }
  },
  parseStartTag: function (sTag, sTagName, sRest) {
    var attrs = this.parseAttributes(sTagName, sRest)
    this.contentHandler.startElement(sTagName, attrs)
  },
  parseEndTag: function (sTag, sTagName) {
    this.contentHandler.endElement(sTagName)
  },
  parseAttributes: function (sTagName, s) {
    var oThis = this
    var attrs = []
    s.replace(this.attrRe, function (a0, a1, a2, a3, a4, a5, a6) {
      attrs.push(oThis.parseAttribute(sTagName, a0, a1, a2, a3, a4, a5, a6))
    })
    return attrs
  },
  parseAttribute: function (sTagName, sAttribute, sName) {
    var value = ''
    if (arguments[7]) {
      value = arguments[8]
    } else if (arguments[5]) {
      value = arguments[6]
    } else if (arguments[3]) {
      value = arguments[4]
    }
    var empty = !value && !arguments[3]
    return {name: sName, value: empty ? null : value}
  }
}
function decodeEntities (encodedString) {
  var translate_re = /&(nbsp|amp|quot|lt|gt);/g
  var translate = {
    'nbsp': ' ',
    'amp': '&',
    'quot': '"',
    'lt': '<',
    'gt': '>'
  }
  return encodedString.replace(translate_re, function (match, entity) {
    return translate[entity]
  }).replace(/&#(\d+);/gi, function (match, numStr) {
    var num = parseInt(numStr, 10)
    return String.fromCharCode(num)
  })
}
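// Rough end-to-end flow, assuming the bot host supplies the `http` and `bot` globals:
//   go({method: 'irc.privmsg',
//       params: {nick: 'don', channel: '#pdxbots', message: '!wikipedia Ada Lovelace'}})
// fetches the page through the MediaWiki parse API, cleans it up with decodeEntities()
// and simpleparse(), and says the first sentence of the short description back to the
// channel. (The nick and channel values here are made up for illustration.)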