Last active
August 31, 2021 23:05
-
-
Save gustavi/8759636 to your computer and use it in GitHub Desktop.
A simple IRC bot that displays the titles of URLs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
#-*- encoding: utf-8 -*-
# ircbot_url_title.py
# Simple bot which displays the titles of urls
#
# Copyright (c) 2010 Mick@el and Zopieux
# Copyright (c) 2014 gustavi
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import difflib
import logging
import re
import time
import unicodedata
from itertools import groupby
from urllib import parse

import requests
from bs4 import BeautifulSoup
from pypeul import *
# Bot information: identity of the bot and where it connects.
BOT_NAME = 'SimpleB00t'
BOT_CHAN = '##gustavi'
BOT_SERVER = 'irc.freenode.net'
BOT_PORT = 6667

# Main settings
settings = {
    # Similarity threshold used in SimpleUrlBot.show_url(): the title is only
    # displayed when the last path segment of the url and the slugified title
    # are LESS similar than this ratio (i.e. the url does not already spell
    # out the title).
    'ratio': 0.6,
    # Maximum number of urls remembered in `buffer_url`.
    'buffer_url_len': 25,
    # Titles shorter than this many characters are never displayed.
    'title_len_min': 18,
    # Titles longer than this many characters are cut and suffixed with '...'.
    'title_len_max': 136,
    # Largest announced Content-Length (in bytes, 2097152 = 2 MiB) for which
    # the page is still fetched.
    'content_length_max': 2097152,
}

# Most recently announced urls, oldest first; mutated in place by
# SimpleUrlBot.show_url().
buffer_url = []

# Domains whose titles must never be displayed.
exceptions_domain = [
    'docs.google.com',
    'translate.google.com',
    'paste.awesom.eu',
]
class SimpleUrlBot(IRC):
    """
    IRC bot that answers every url posted on a channel with the title of the
    page it points to.
    """

    # Pattern used to spot http(s) urls inside a channel message.
    _URL_RE = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    # Seconds to wait on the remote server before giving up on a url; without
    # a timeout a single unresponsive host would block the bot forever.
    _HTTP_TIMEOUT = 10

    def get_title(self, url):
        """
        Fetch `url` and build the text to announce for it.

        Returns a `(text, final_url)` tuple where `text` is
        '[<redirection path>] <title>' and `final_url` is the url reached
        after redirections. Returns `('', '')` when there is nothing to
        announce: the request failed, the body is larger than
        settings['content_length_max'], or the document has no usable
        <title>.
        """
        limit = settings['content_length_max']
        try:
            # NOTE(review): verify=False disables TLS certificate checks. It
            # is kept as in the original; confirm this is intentional.
            req = requests.get(url, stream=True, verify=False,
                               timeout=self._HTTP_TIMEOUT)
        except requests.RequestException:
            return '', ''

        try:
            # The header may be absent (None -> TypeError) or malformed
            # (ValueError); in both cases fall back to the bounded read below.
            try:
                announced = int(req.headers.get('content-length'))
            except (TypeError, ValueError):
                announced = None
            if announced is not None and announced > limit:
                return '', ''

            # Read the body in chunks so the size limit also holds when the
            # server did not announce (or lied about) the content length.
            body = bytearray()
            for chunk in req.iter_content(chunk_size=8192):
                body.extend(chunk)
                if len(body) > limit:
                    return '', ''
        except requests.RequestException:
            return '', ''
        finally:
            # stream=True keeps the connection open until it is released.
            req.close()

        soup = BeautifulSoup(bytes(body), 'html.parser')

        # Exclude binary content and pages without a plain-text title
        title_tag = soup.title
        if title_tag is None or title_tag.string is None:
            return '', ''
        title = title_tag.string.replace('\n', '')

        # Cut the title if too big
        if len(title) > settings['title_len_max']:
            title = title[:settings['title_len_max']] + '...'

        # Compute a compact redirection path: consecutive hops on the same
        # host are collapsed into 'host ×N'.
        hosts = (parse.urlsplit(r.url).netloc
                 for r in list(req.history) + [req])
        hops = ((host, sum(1 for _ in grp)) for host, grp in groupby(hosts))
        path = ' → '.join(
            '%s%s' % (host, ' ×%d' % count if count > 1 else '')
            for host, count in hops)
        return '[{}] {}'.format(path, title), req.url

    def slugify(self, value):
        """
        Reduce `value` to a lowercase ASCII slug.

        Characters without an ASCII form after NFKD normalisation are
        dropped, then every character that is not a word character,
        whitespace or a hyphen is removed. The result is stripped and
        lowercased, and each run of whitespace/hyphens becomes one hyphen.
        """
        value = unicodedata.normalize('NFKD', value)
        value = value.encode('ascii', 'ignore').decode('ascii')
        value = re.sub(r'[^\w\s-]', '', value).strip().lower()
        return re.sub(r'[-\s]+', '-', value)

    def show_url(self, url, title, final_url):
        """
        Tell whether `title` should be displayed for `url`.

        Returns True (and records `final_url` in 'buffer_url') only if:
        - length of title is at least 'title_len_min' (in settings)
        - host of `final_url` is not in the exceptions list
        - url does not already contain the title
        - url is not recent (is not in 'buffer_url')
        """
        # Check the min length
        if len(title) < settings['title_len_min']:
            return False

        # Check if not in exceptions list. The list holds bare host names, so
        # compare the host of the url, not the whole url.
        if parse.urlsplit(final_url).hostname in exceptions_domain:
            return False

        url_dif = parse.urlsplit(url).path.split('/')[-1]
        title_dif = self.slugify(title)

        # Check if url does not contain title
        similarity = difflib.SequenceMatcher(None, url_dif, title_dif).ratio()
        if similarity >= settings['ratio']:
            return False

        # Check if is recent
        if final_url in buffer_url:
            return False

        # Drop the oldest entries when the buffer is full
        while buffer_url and len(buffer_url) >= settings['buffer_url_len']:
            del buffer_url[0]
        buffer_url.append(final_url)
        return True

    def on_ready(self):
        """
        If you want your bot to join a channel when it connects, you should do
        that in the on_ready event handler.
        """
        self.join(BOT_CHAN)

    def on_channel_message(self, umask, target, msg):
        """
        Main event handler, called when someone speaks on a channel where the
        bot is.
        Just displays the title for each url found in the message.
        """
        for url in self._URL_RE.findall(msg):
            title, final_url = self.get_title(url)
            if title and self.show_url(url, title, final_url):
                self.message(target, title)

    def on_ctcp_version_request(self, umask, value):
        """
        There are event handlers for CTCP too.
        Here the bot replies with its name on a CTCP "version".
        """
        self.ctcp_reply(umask.nick,
                        'VERSION',
                        "{}, powered by pypeul and <3".format(BOT_NAME))

    def on_disconnected(self):
        """
        Reconnect after a disconnection, waiting 30 seconds longer after each
        failed attempt.
        """
        log = logging.getLogger(__name__)
        log.info('Disconnected. Trying to reconnect...')
        delay = 30
        while True:
            try:
                self.connect(BOT_SERVER, BOT_PORT)
                self.ident(BOT_NAME)
                self.run()
                break
            except Exception:
                # `Exception` rather than a bare except, so that Ctrl-C and
                # SystemExit can still stop the bot while it is retrying.
                log.exception('Attempt failed. Retrying in %ds...', delay)
                time.sleep(delay)
                delay += 30
if __name__ == '__main__':
    # Enable debug-level logging
    import logging
    logging.basicConfig(level=logging.DEBUG)

    # Instantiate our SimpleUrlBot class and let it run
    bot = SimpleUrlBot()
    bot.connect(BOT_SERVER, BOT_PORT)
    bot.ident(BOT_NAME)
    bot.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment