# encoding: utf-8
#
# Copyright (c) 2009 by ole <[email protected]>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#
# Print titles for URLs, only in the active buffer
# (this script requires WeeChat 0.3.0 or newer)
#
# History:
# 2011-09-11, ole <[email protected]>
#     version 0.2-dev: add YouTube support
# 2011-09-09, ole <[email protected]>
#     version 0.1-dev: dev snapshot
#
import re
import weechat as w

from HTMLParser import HTMLParser

ENABLE_YOUTUBE = True
try:
    import json
except ImportError:
    # json joined the standard library in Python 2.6; older versions
    # need the third-party simplejson package instead.
    try:
        import simplejson as json
    except ImportError:
        # Without a JSON parser we can't read the YouTube category from
        # gdata.youtube.com, so disable the YouTube lookup entirely.
        ENABLE_YOUTUBE = False
SCRIPT_NAME = 'url_title'
SCRIPT_AUTHOR = "Ole Bergmann <[email protected]>"
SCRIPT_VERSION = "0.2"
SCRIPT_LICENSE = "GPL3"
SCRIPT_DESC = "Print titles for URLs posted in the active buffer"

# colors: we don't want to screw up people's themes
COLOR_RESET = w.color('reset')
COLOR_TITLE = w.color("*default")
COLOR_LINK = w.color('default')

# dict mapping each pending URL to the buffer it was posted in
URLS = {}

# RegEx pattern to match URLs
RE_URL = re.compile(r'(((http|ftp|https):\/\/|www\.)[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?)')
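# e.g. RE_URL.search('see http://example.com/page or www.example.com')
# matches the leftmost URL, 'http://example.com/page'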
# RegEx pattern to match the <title> tag of a page
RE_TITLE = re.compile(r'<title.*>(.*?)<\/title>', re.I | re.S)

# RegEx pattern to match popular file extensions that should NOT be checked
RE_FILES = re.compile(r'.*(png|jpg|bmp|gif|avi|mpg|flv|3gp|mp4|exe|msi|mp3|flac|tar\.gz|tar\.bz2)$', re.I)

# RegEx used to match YouTube links
RE_YOUTUBE = re.compile(r'http://(www\.)?youtube\.com/watch\?v=([a-zA-Z0-9\-_]{10,13})', re.I)
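# e.g. re.match(RE_YOUTUBE, 'http://www.youtube.com/watch?v=dQw4w9WgXcQ')
# captures the 11-character video id in group 2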
def youtube_title(page):
    '''
    Extract the video title and category from a YouTube GData JSON response
    '''
    # parse json
    entry = json.loads(page.decode('utf-8'))['entry']
    # get category and video title
    vid = {'title': entry['title']['$t'], 'category': entry['media$group']['media$category'][0]['label']}
    # make sure the category is not bold-formatted like the video title
    return '%s(%s)%s %s' % (COLOR_RESET, vid['category'], COLOR_TITLE, vid['title'])
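# For reference, youtube_title() expects the GData entry to look roughly
# like this (an illustrative sketch, inferred from the key accesses above):
#   {"entry": {"title": {"$t": "Video title"},
#              "media$group": {"media$category": [{"label": "Music"}]}}}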
def url_title(page):
    '''
    Attempt to retrieve the title from the given page content
    '''
    try:
        # search the page for a <title> tag
        match = RE_TITLE.search(page)
        if match:
            # collapse any whitespace we encounter (e.g. newlines) into single spaces
            title = ' '.join(match.groups(0)[0].decode('utf-8').split())
            # init html parser
            h = HTMLParser()
            # if we found a title, return it with HTML entities decoded
            if title:
                return h.unescape(title)
        return ''
    except AttributeError:
        return ''
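# e.g. url_title('<html><head><title>Example &amp;\n Co</title></head>')
# would return u'Example & Co' (whitespace collapsed, entities unescaped)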
def url_append(url, buffer=""):
    '''
    Really simple helper that records which buffer a URL was posted in
    '''
    global URLS
    URLS[url] = buffer
def url_process(url, command, rc, stdout, stderr):
    '''
    hook_process callback: receives the page content fetched in the
    background. I don't like to do this, but there is no other way to run
    a background process from within a WeeChat script, and it beats
    delaying the message we parsed the URL from. The URL comes back to us
    as the callback's data argument.
    '''
    global URLS
    # make sure title is set
    title = ""
    # get the buffer pointer from the URLS dict
    try:
        buffer = URLS[url]
    except KeyError:
        # probably already looked this one up, so just exit
        return w.WEECHAT_RC_OK
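    # Note: WeeChat may invoke a hook_process callback more than once for a
    # single command (e.g. partial output while the process is running),
    # which is why a missing key above means "already handled", not an error.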
    # unfortunately, we have to check for a YouTube link again here,
    # since hook_process doesn't let us pass more than one variable:
    if ENABLE_YOUTUBE:
        match_yt = re.match(RE_YOUTUBE, url)
        if match_yt:
            try:
                # read the gdata JSON from stdout, pass it to youtube_title
                title = youtube_title(stdout)
            except (ValueError, KeyError):
                # malformed or truncated JSON: fall back to the generic parser
                title = ""
    # not a YouTube link (or the YouTube lookup failed):
    if not title:
        # read from stdout, pass it to url_title
        title = url_title(stdout)
    if title:
        w.prnt(buffer,
               "+++\t%s%s %s%s- %s" % (
                   COLOR_TITLE,
                   title,
                   COLOR_RESET,
                   COLOR_LINK,
                   url,
               ))
    # drop the entry so a repeated callback for this URL exits early
    del URLS[url]
    return w.WEECHAT_RC_OK
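# The "\t" in the printed line splits WeeChat's prefix column from the
# message, so (depending on the theme) a successful lookup shows up as:
#   +++ | Page Title - http://example.com/page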
def message_parse(data, signal, signal_data):
    # the server (to check which buffer the message belongs to)
    server = signal.split(",")[0]
    splits = signal_data.split(":")
    # the channel (to check which buffer the message belongs to)
    try:
        channel = splits[1].split(" ")[-2]
    except IndexError:
        # only look up titles for messages posted in channels:
        return w.WEECHAT_RC_OK
    # the actual message (re-joined in case it contained colons)
    message = ':'.join(splits[2:])
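    # e.g. for signal "freenode,irc_in_privmsg" and signal_data
    # ":nick!user@host PRIVMSG #channel :hello http://example.com"
    # this yields server == "freenode", channel == "#channel" and
    # message == "hello http://example.com"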
    # get the buffer the message was posted in
    buffer = w.info_get("irc_buffer", "%s,%s" % (server, channel))
    # get the current buffer
    current_buffer = w.current_buffer()
    # we only check for URLs in the current buffer, so see if they match:
    if buffer == current_buffer:
        # search the message for any URLs
        match = RE_URL.search(message)
        if match:
            # Great! we found one!
            url = match.groups(0)[0]
            # We don't want to download files and pictures.
            if not RE_FILES.match(url):
                # assume the http protocol if the URL only matched on its www. portion
                if url.startswith("www."):
                    url = "http://" + url
                # record the URL so we know which buffer it belongs to
                url_append(url, buffer)
                # by default, read only the first 4096 bytes of a web page;
                # YouTube needs the whole response for the JSON to be valid
                readBytes = "4096"
                # default; changes if we match a YouTube link
                lookup_url = url
                # look for a YouTube link if a json module is available:
                if ENABLE_YOUTUBE:
                    match_yt = re.match(RE_YOUTUBE, url)
                    if match_yt:
                        lookup_url = "http://gdata.youtube.com/feeds/videos/%s?alt=json" % match_yt.groups(0)[1]
                        # make sure we read the ENTIRE feed
                        readBytes = ""
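                        # e.g. for v=dQw4w9WgXcQ the lookup becomes
                        # http://gdata.youtube.com/feeds/videos/dQw4w9WgXcQ?alt=json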
                # check for a python2 binary on systems where python3 is the default
                python2_bin = w.info_get("python2_bin", "") or "python"
                # fetch the page in a subprocess so we don't block WeeChat
                cmd = "%s -c \"from urllib2 import urlopen; print(urlopen('%s').read(%s))\"" % (python2_bin, lookup_url, readBytes)
                # give the process 15 seconds before killing it
                w.hook_process(cmd, 15 * 1000, "url_process", url)
    return w.WEECHAT_RC_OK
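# Interpolating the URL into a shell command looks risky, but appears safe
# here: RE_URL's character classes contain no quotes and none of the shell
# metacharacters ($, backtick, backslash) that survive double quoting.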
# only run if the script is not imported from another source:
if __name__ == "__main__":
    # attempt to register the script with weechat
    if w.register(SCRIPT_NAME, SCRIPT_AUTHOR, SCRIPT_VERSION, SCRIPT_LICENSE, SCRIPT_DESC, "", ""):
        # hook messages from buffers
        w.hook_signal("*,irc_in_privmsg", "message_parse", "")