gustavi · August 31, 2021 23:05
diff --git a/ircbot_url_title.py b/ircbot_url_title.py
 #!/usr/bin/env python3
 #-*- encoding: utf-8 -*-

 # ircbot_url_title.py
 # Simple bot which displays the titles of URLs
 #
 # Copyright (c) 2010 Mick@el and Zopieux
 # Copyright (c) 2014 gustavi
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as
 # published by the Free Software Foundation, either version 3 of
 # the License, or (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

 import time
 from urllib import parse
 import re
 import unicodedata
 import difflib

 import requests
 from pypeul import *
 from bs4 import BeautifulSoup
 from itertools import groupby

 # Identity of the bot and IRC endpoint it connects to
 BOT_NAME = 'SimpleB00t'
 BOT_CHAN = '##gustavi'
 BOT_SERVER = 'irc.freenode.net'
 BOT_PORT = 6667

 # Tunables read by SimpleUrlBot
 settings = {
    # Similarity threshold used in SimpleUrlBot.show_url(): the title is
    # displayed only when the URL and the title are less similar than this
    'ratio': 0.6,
    # Maximum number of URLs remembered in 'buffer_url'
    'buffer_url_len': 25,
    # Titles shorter than this are never displayed
    'title_len_min': 18,
    # Titles longer than this are cut and suffixed with '...'
    'title_len_max': 136,
    # Largest accepted Content-Length header value (2 MiB)
    'content_length_max': 2 * 1024 * 1024,
 }

 # Host names whose titles are never displayed
 exceptions_domain = [
    'docs.google.com',
    'translate.google.com',
    'paste.awesom.eu',
 ]

 # Most recently announced URLs, oldest first
 buffer_url = []

 class SimpleUrlBot(IRC):
    """
    IRC bot which announces the title of the URLs posted on a channel.

    Configuration and state are read from the module-level 'settings',
    'buffer_url' and 'exceptions_domain' objects.
    """

    # Pattern used to spot http(s) URLs inside a channel message
    _URL_REGEX = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    # Seconds to wait for a remote server before giving up on a URL
    _REQUEST_TIMEOUT = 10

    def get_title(self, url):
        """
        Fetch `url` and build the text to display for it.

        Returns a (text, final_url) tuple where text is
        '[redirection path] title' and final_url is the URL reached once
        redirections have been followed.  Returns ('', '') when the request
        fails, when the body is bigger than 'content_length_max' (in
        settings) or when the document has no usable <title>.
        """

        limit = settings['content_length_max']

        try:
            # NOTE(review): verify=False disables TLS certificate checks;
            # kept as in the original, confirm it is still wanted.
            req = requests.get(url, stream=True, verify=False,
                               timeout=self._REQUEST_TIMEOUT)
        except requests.RequestException:
            # Unknown host, refused connection, timeout, invalid URL...
            return '', ''

        try:
            try:
                announced = int(req.headers.get('content-length'))
            except (TypeError, ValueError):
                # Header missing or not a number: rely on the check below
                announced = None
            if announced is not None and announced > limit:
                return '', ''

            # Read by chunks so that a huge body sent without (or with a
            # wrong) content-length is never fully loaded in memory
            body = bytearray()
            for chunk in req.iter_content(chunk_size=8192):
                body.extend(chunk)
                if len(body) > limit:
                    return '', ''
        except requests.RequestException:
            # Connection lost or undecodable body while downloading
            return '', ''
        finally:
            # With stream=True the connection is released only once closed
            req.close()

        # Name the parser explicitly: the result no longer depends on which
        # optional parsers are installed
        soup = BeautifulSoup(bytes(body), 'html.parser')

        # Exclude binary: soup.title (or its string) is None in that case
        try:
            raw_title = soup.title.string
            # CR and LF both end an IRC line, so neither may be sent
            title = raw_title.replace('\r', '').replace('\n', '').strip()
        except AttributeError:
            return '', ''
        if not title:
            return '', ''

        # Cut the title if too big
        if len(title) > settings['title_len_max']:
            title = title[:settings['title_len_max']] + '...'

        # Compute a compact redirection path: consecutive hops on the same
        # host are merged and displayed as 'host ×N'
        hosts = (parse.urlsplit(r.url).netloc for r in list(req.history) + [req])
        counted = ((host, sum(1 for _ in hops)) for host, hops in groupby(hosts))
        path = ' → '.join('%s%s' % (host, ' ×%d' % count if count > 1 else '')
                          for host, count in counted)
        return '[{}] {}'.format(path, title), req.url

    def slugify(self, value):
        """
        Turn `value` into a slug.

        The text is reduced to ASCII, every character which is not an
        alphanumeric, an underscore, a whitespace or a hyphen is removed,
        the result is stripped and lowercased, then each run of whitespaces
        and hyphens is replaced by a single hyphen.
        """

        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = re.sub(r'[^\w\s-]', '', value).strip().lower()
        return re.sub(r'[-\s]+', '-', value)

    def show_url(self, url, title, final_url):
        """
        Tell whether `title` has to be displayed.  Returns True (and stores
        `final_url` in 'buffer_url') only if:
            - length of title is at least 'title_len_min' (in settings)
            - host of url and host of final url are not in exceptions list
            - url does not contain title
            - final url is not recent (is not in 'buffer_url')
        """

        # Check the min length
        if len(title) < settings['title_len_min']:
            return False

        # Check if not in exceptions list.  The list holds host names, so
        # the host of each URL is compared, not the whole URL.
        for candidate in (url, final_url):
            if parse.urlsplit(candidate).hostname in exceptions_domain:
                return False

        # Check if url does not contain title
        url_dif = parse.urlsplit(url).path.split('/')[-1]
        title_dif = self.slugify(title)
        if difflib.SequenceMatcher(None, url_dif, title_dif).ratio() >= settings['ratio']:
            return False

        # Check if is recent
        if final_url in buffer_url:
            return False

        # Drop the oldest entries while the buffer is full
        while buffer_url and len(buffer_url) >= settings['buffer_url_len']:
            del buffer_url[0]
        buffer_url.append(final_url)
        return True

    def on_ready(self):
        """
        Event handler called once the bot is connected: joins BOT_CHAN.
        """

        self.join(BOT_CHAN)

    def on_channel_message(self, umask, target, msg):
        """
        Main event handler, called when someone speaks on a channel where the
        bot is.
        Sends to `target` the title of each url found in `msg`, when
        show_url() accepts it.
        """

        for url in self._URL_REGEX.findall(msg):
            title, final_url = self.get_title(url)
            # An empty title means the page could not be fetched or parsed
            if title and self.show_url(url, title, final_url):
                self.message(target, title)

    def on_ctcp_version_request(self, umask, value):
        """
        Event handler for the CTCP "version" request.
        The bot replies to the sender with a string built from BOT_NAME.
        """

        self.ctcp_reply(umask.nick,
                        'VERSION',
                        "{}, powered by pypeul and <3".format(BOT_NAME))

    def on_disconnected(self):
        """
        Event handler called when the connection is lost.
        Tries to reconnect until it works; the delay between two attempts
        starts at 30 seconds and grows by 30 seconds after each failure.
        """

        # NOTE(review): 'logger' is not defined in this file; presumably it
        # is provided by 'from pypeul import *' - confirm.
        logger.info('Disconnected. Trying to reconnect...')
        time_sleep = 30
        while True:
            try:
                self.connect(BOT_SERVER, BOT_PORT)
                self.ident(BOT_NAME)
                self.run()
                break
            except Exception:
                # 'except Exception' lets KeyboardInterrupt and SystemExit
                # stop the bot instead of being retried forever
                logger.error('Attempt failed. Retrying in {}s...'.format(time_sleep))
            # Wait for the announced delay, then lengthen the next one
            time.sleep(time_sleep)
            time_sleep += 30

 if __name__ == '__main__':
    # Script entry point: everything below runs only when the file is
    # executed directly, not when it is imported.

    # Enable debug-level logging
    import logging
    logging.basicConfig(level=logging.DEBUG)

    # Instantiate our SimpleUrlBot class, connect it to the server,
    # identify it under BOT_NAME and let it run
    bot = SimpleUrlBot()
    bot.connect(BOT_SERVER, BOT_PORT)
    bot.ident(BOT_NAME)
    bot.run()
	#!/usr/bin/env python3
	#-*- encoding: utf-8 -*-

	# ircbot_url_title.py
	# Simple bot which displays the titles of URLs
	#
	# Copyright (c) 2010 Mick@el and Zopieux
	# Copyright (c) 2014 gustavi
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU Lesser General Public License as
	# published by the Free Software Foundation, either version 3 of
	# the License, or (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU Lesser General Public License for more details.
	#
	# You should have received a copy of the GNU Lesser General Public License
	# along with this program. If not, see <http://www.gnu.org/licenses/>.

	import time
	from urllib import parse
	import re
	import unicodedata
	import difflib

	import requests
	from pypeul import *
	from bs4 import BeautifulSoup
	from itertools import groupby

	# Identity of the bot and IRC endpoint it connects to
	BOT_NAME = 'SimpleB00t'
	BOT_CHAN = '##gustavi'
	BOT_SERVER = 'irc.freenode.net'
	BOT_PORT = 6667

	# Tunables read by SimpleUrlBot
	settings = {
	    # Similarity threshold used in SimpleUrlBot.show_url(): the title is
	    # displayed only when the URL and the title are less similar than this
	    'ratio': 0.6,
	    # Maximum number of URLs remembered in 'buffer_url'
	    'buffer_url_len': 25,
	    # Titles shorter than this are never displayed
	    'title_len_min': 18,
	    # Titles longer than this are cut and suffixed with '...'
	    'title_len_max': 136,
	    # Largest accepted Content-Length header value (2 MiB)
	    'content_length_max': 2 * 1024 * 1024,
	}

	# Host names whose titles are never displayed
	exceptions_domain = [
	    'docs.google.com',
	    'translate.google.com',
	    'paste.awesom.eu',
	]

	# Most recently announced URLs, oldest first
	buffer_url = []

	class SimpleUrlBot(IRC):
	    """
	    IRC bot which announces the title of the URLs posted on a channel.

	    Configuration and state are read from the module-level 'settings',
	    'buffer_url' and 'exceptions_domain' objects.
	    """

	    # Pattern used to spot http(s) URLs inside a channel message.  The
	    # alternation bars must not be escaped, otherwise they only match
	    # literal '|' characters.
	    _URL_REGEX = re.compile(
	        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

	    # Seconds to wait for a remote server before giving up on a URL
	    _REQUEST_TIMEOUT = 10

	    def get_title(self, url):
	        """
	        Fetch `url` and build the text to display for it.

	        Returns a (text, final_url) tuple where text is
	        '[redirection path] title' and final_url is the URL reached once
	        redirections have been followed.  Returns ('', '') when the request
	        fails, when the body is bigger than 'content_length_max' (in
	        settings) or when the document has no usable <title>.
	        """

	        limit = settings['content_length_max']

	        try:
	            # NOTE(review): verify=False disables TLS certificate checks;
	            # kept as in the original, confirm it is still wanted.
	            req = requests.get(url, stream=True, verify=False,
	                               timeout=self._REQUEST_TIMEOUT)
	        except requests.RequestException:
	            # Unknown host, refused connection, timeout, invalid URL...
	            return '', ''

	        try:
	            try:
	                announced = int(req.headers.get('content-length'))
	            except (TypeError, ValueError):
	                # Header missing or not a number: rely on the check below
	                announced = None
	            if announced is not None and announced > limit:
	                return '', ''

	            # Read by chunks so that a huge body sent without (or with a
	            # wrong) content-length is never fully loaded in memory
	            body = bytearray()
	            for chunk in req.iter_content(chunk_size=8192):
	                body.extend(chunk)
	                if len(body) > limit:
	                    return '', ''
	        except requests.RequestException:
	            # Connection lost or undecodable body while downloading
	            return '', ''
	        finally:
	            # With stream=True the connection is released only once closed
	            req.close()

	        # Name the parser explicitly: the result no longer depends on which
	        # optional parsers are installed
	        soup = BeautifulSoup(bytes(body), 'html.parser')

	        # Exclude binary: soup.title (or its string) is None in that case
	        try:
	            raw_title = soup.title.string
	            # CR and LF both end an IRC line, so neither may be sent
	            title = raw_title.replace('\r', '').replace('\n', '').strip()
	        except AttributeError:
	            return '', ''
	        if not title:
	            return '', ''

	        # Cut the title if too big
	        if len(title) > settings['title_len_max']:
	            title = title[:settings['title_len_max']] + '...'

	        # Compute a compact redirection path: consecutive hops on the same
	        # host are merged and displayed as 'host ×N'
	        hosts = (parse.urlsplit(r.url).netloc for r in list(req.history) + [req])
	        counted = ((host, sum(1 for _ in hops)) for host, hops in groupby(hosts))
	        path = ' → '.join('%s%s' % (host, ' ×%d' % count if count > 1 else '')
	                          for host, count in counted)
	        return '[{}] {}'.format(path, title), req.url

	    def slugify(self, value):
	        """
	        Turn `value` into a slug.

	        The text is reduced to ASCII, every character which is not an
	        alphanumeric, an underscore, a whitespace or a hyphen is removed,
	        the result is stripped and lowercased, then each run of whitespaces
	        and hyphens is replaced by a single hyphen.
	        """

	        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
	        value = re.sub(r'[^\w\s-]', '', value).strip().lower()
	        return re.sub(r'[-\s]+', '-', value)

	    def show_url(self, url, title, final_url):
	        """
	        Tell whether `title` has to be displayed.  Returns True (and stores
	        `final_url` in 'buffer_url') only if:
	            - length of title is at least 'title_len_min' (in settings)
	            - host of url and host of final url are not in exceptions list
	            - url does not contain title
	            - final url is not recent (is not in 'buffer_url')
	        """

	        # Check the min length
	        if len(title) < settings['title_len_min']:
	            return False

	        # Check if not in exceptions list.  The list holds host names, so
	        # the host of each URL is compared, not the whole URL.
	        for candidate in (url, final_url):
	            if parse.urlsplit(candidate).hostname in exceptions_domain:
	                return False

	        # Check if url does not contain title
	        url_dif = parse.urlsplit(url).path.split('/')[-1]
	        title_dif = self.slugify(title)
	        if difflib.SequenceMatcher(None, url_dif, title_dif).ratio() >= settings['ratio']:
	            return False

	        # Check if is recent
	        if final_url in buffer_url:
	            return False

	        # Drop the oldest entries while the buffer is full
	        while buffer_url and len(buffer_url) >= settings['buffer_url_len']:
	            del buffer_url[0]
	        buffer_url.append(final_url)
	        return True

	    def on_ready(self):
	        """
	        Event handler called once the bot is connected: joins BOT_CHAN.
	        """

	        self.join(BOT_CHAN)

	    def on_channel_message(self, umask, target, msg):
	        """
	        Main event handler, called when someone speaks on a channel where the
	        bot is.
	        Sends to `target` the title of each url found in `msg`, when
	        show_url() accepts it.
	        """

	        for url in self._URL_REGEX.findall(msg):
	            title, final_url = self.get_title(url)
	            # An empty title means the page could not be fetched or parsed
	            if title and self.show_url(url, title, final_url):
	                self.message(target, title)

	    def on_ctcp_version_request(self, umask, value):
	        """
	        Event handler for the CTCP "version" request.
	        The bot replies to the sender with a string built from BOT_NAME.
	        """

	        self.ctcp_reply(umask.nick,
	                        'VERSION',
	                        "{}, powered by pypeul and <3".format(BOT_NAME))

	    def on_disconnected(self):
	        """
	        Event handler called when the connection is lost.
	        Tries to reconnect until it works; the delay between two attempts
	        starts at 30 seconds and grows by 30 seconds after each failure.
	        """

	        # NOTE(review): 'logger' is not defined in this file; presumably it
	        # is provided by 'from pypeul import *' - confirm.
	        logger.info('Disconnected. Trying to reconnect...')
	        time_sleep = 30
	        while True:
	            try:
	                self.connect(BOT_SERVER, BOT_PORT)
	                self.ident(BOT_NAME)
	                self.run()
	                break
	            except Exception:
	                # 'except Exception' lets KeyboardInterrupt and SystemExit
	                # stop the bot instead of being retried forever
	                logger.error('Attempt failed. Retrying in {}s...'.format(time_sleep))
	            # Wait for the announced delay, then lengthen the next one
	            time.sleep(time_sleep)
	            time_sleep += 30

	if __name__ == '__main__':
	    # Script entry point: everything below runs only when the file is
	    # executed directly, not when it is imported.

	    # Enable debug-level logging
	    import logging
	    logging.basicConfig(level=logging.DEBUG)

	    # Instantiate our SimpleUrlBot class, connect it to the server,
	    # identify it under BOT_NAME and let it run
	    bot = SimpleUrlBot()
	    bot.connect(BOT_SERVER, BOT_PORT)
	    bot.ident(BOT_NAME)
	    bot.run()