Last active
February 4, 2016 18:23
-
-
Save zoharbabin/5026988 to your computer and use it in GitHub Desktop.
A phenny/jenni IRC Bot module for searching the Kaltura Knowledge Center and presenting the results from the first search results page in the IRC chat. Join us at #kaltura on irc.freenode.net
http://www.zoharbabin.com/kaltura-knowledge-center-search-irc-bot-module
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
kc.py - jenni's Kaltura Knowledge Center Search Module
author Zohar Babin <[email protected]>
http://www.zoharbabin.com/kaltura-knowledge-center-search-irc-bot-module
Copyright Kaltura Inc.
Licensed under AGPL v3.

More info:
Returns the list of search results from the first page in the Kaltura Knowledge Center (http://knowledge.kaltura.com)
 * jenni: https://github.com/myano/jenni/
"""
import re | |
import mechanize | |
import sys | |
import string | |
from bs4 import BeautifulSoup | |
import bitly_api | |
import urllib | |
def kc(jenni, input):
    """Search the Kaltura Knowledge Center and reply with the first results page.

    jenni -- the bot object; only its ``reply(text)`` method is used.
    input -- the triggering command match; ``input.group(2)`` holds the
             text typed after the command, or None when nothing was typed.

    Every result found on the first search page is sent back as one
    numbered reply of the form "<n>) <title> - <url>". Early exits (no
    search term, site unreachable, no results) return whatever
    ``jenni.reply`` returns; the normal path returns None.
    """
    # phenny keeps the command input in input.group(2); input.group(1) would
    # be the command itself (kc in our case). group(2) is None for a bare
    # ".kc", so guard before calling string methods on it.
    query = (input.group(2) or "").strip()
    if not query:
        return jenni.reply("Please tell me what to search for, e.g.: .kc [any kaltura search term]")

    # give the user some feedback while we go searching...
    jenni.reply("Searching the Kaltura Knowledge Center (knowledge.kaltura.com) for: " + query)
    jenni.reply("If I find any results for your search, I will return all the results from the first page.")

    html = _kc_fetch_search_page(urllib.quote(query.encode('utf-8')))
    if html is None:
        return jenni.reply("Sorry, I couldn't get a search results page from the Kaltura Knowledge Center.")

    results = _kc_extract_results(html)
    # covers both a missing search-results div and a div without usable links
    if not results:
        # echo what the user typed, not the URL-quoted form
        return jenni.reply("I couldn't find any results for your search of " + query)

    # We use bitly to shorten the Urls.. this is not a must, but makes the IRC messages cleaner
    # Get your bitly API token from: https://bitly.com/a/oauth_apps
    bitly = bitly_api.Connection(access_token="REPLACE_THIS_WITH_YOUR_BITLY_TOKEN")
    for counter, (title, url) in enumerate(results, 1):
        try:
            link = bitly.shorten(url)["url"]
        except bitly_api.BitlyError:
            # shortening is only cosmetic; fall back to the full url
            link = url
        jenni.reply(str(counter) + ") " + title + " - " + link)


def _kc_fetch_search_page(quoted_query):
    """Return the HTML of the search page for an already URL-quoted query, or None on failure."""
    # we use mechanize for fetching the search results page
    br = mechanize.Browser()
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follows refresh 0 but not hangs on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # User-Agent (this is cheating, ok?)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    try:
        br.open("http://knowledge.kaltura.com/search/" + quoted_query)
    except mechanize.URLError:
        return None
    # explicit check instead of assert: asserts are stripped under python -O
    if not br.viewing_html():
        return None
    return br.response().read()


def _kc_extract_results(html):
    """Return a list of (title, absolute url) pairs parsed from a search results page."""
    soup = BeautifulSoup(html, "lxml")  # lxml is the faster parser of all
    non_word = re.compile(r'\W+')  # used to cleanup any non textual chars from the result title
    results = []
    # every search result in the KC is under <article><h2>.. so we fetch all
    # h2 headers inside the search-results div
    for div in soup.findAll("div", class_="search-results"):
        for header in div.findAll("h2"):
            for anchor in header.findAll("a"):
                href = anchor.get("href")
                # every h2 will have 2 links, one that is used to hide/show the
                # description and one linking to the result page; ignore the
                # empty ones (and anchors with no href at all)
                if not href or href == "#":
                    continue
                # fall back to the link text when there is no title attribute
                raw_title = anchor.get("title") or anchor.get_text()
                title = non_word.sub(' ', raw_title.encode('ascii', 'ignore').strip())
                results.append((title, "http://knowledge.kaltura.com" + str(href)))
    return results
# help text shown for this module
kc.example = ".kc [any kaltura search term]"
# how important this module is relative to other modules (low/medium/high)
kc.priority = "high"
# the command name(s) that trigger this module
kc.commands = ["kc"]
# When run directly (rather than loaded by the bot) just show the module's
# description. The parenthesised form parses on both Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
if __name__ == '__main__':
    print(__doc__.strip())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment