Created
November 26, 2020 07:14
-
-
Save lenormf/e2348635f343c1bcd40cf30258b408a9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
#
# link_plugin.py for link-bot
# by lenormf
#
import asyncio
from urllib.parse import urlparse

import irc3
import requests
from bs4 import BeautifulSoup
@irc3.plugin
class Title:
    """irc3 plugin that replies with the title of web pages linked on a channel.

    For every ``http``/``https`` URL found in a channel message, the page is
    fetched and its OpenGraph title (falling back to the HTML ``<title>``)
    is posted back to the channel, followed by the OpenGraph description
    when one is present.
    """

    # Other irc3 plugins this one depends on.
    requires = ["irc3.plugins.log"]

    USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36"

    # Seconds `requests` may wait on a server (applied to both the connect
    # and the read phase) before giving up; without it a silent server
    # would block the fetch forever.
    REQUEST_TIMEOUT = 10

    # Only words starting with one of these prefixes are treated as links.
    URL_PREFIXES = ("http://", "https://")

    def __init__(self, bot):
        """Store the bot handle and, in debug mode, enable HTTP wire logging.

        :param bot: the irc3 bot instance this plugin is attached to
        """
        self.bot = bot
        self.log = bot.log

        self.log.debug(self.bot.config)

        if self.bot.config["debug"]:
            # NOTE: official documented way of enabling debug on `requests`
            import logging
            from http.client import HTTPConnection

            HTTPConnection.debuglevel = 1

            logging.basicConfig()
            logging.getLogger().setLevel(logging.DEBUG)
            requests_log = logging.getLogger("urllib3")
            requests_log.setLevel(logging.DEBUG)
            requests_log.propagate = True

    @staticmethod
    def _collapse_whitespace(text):
        """Return `text` on a single line, with runs of whitespace squeezed."""
        return " ".join(text.split())

    @staticmethod
    def _get_opengraph_property(soup, name):
        """Return the content of the ``og:<name>`` meta tag, or None if absent."""
        opengraph = soup.find("meta", property="og:%s" % name)
        if opengraph and "content" in opengraph.attrs:
            return opengraph["content"]
        return None

    def _get_url_title(self, url):
        """Fetch `url` and return a one-line title for it.

        The call is blocking (it performs up to two HTTP requests).

        :param url: the candidate URL, as typed by the user
        :return: ``"<title>"`` or ``"<title> | <description>"``, or None when
                 the URL is unsupported, unreachable, not HTML, or has no
                 usable title
        """
        try:
            url_parsed = urlparse(url)
        except ValueError as e:
            self.log.error("Unable to parse URL: %s", e)
            return None
        self.log.debug("URL parsed: %s", url_parsed)

        url_scheme = url_parsed.scheme
        if not url_scheme:
            return None
        if url_scheme not in ("http", "https"):
            self.log.debug("Unsupported scheme: %s", url_scheme)
            return None
        if not url_parsed.netloc:
            # e.g. "http:" on its own: there is no host to contact
            return None

        headers = {
            "User-Agent": Title.USER_AGENT,
        }
        try:
            # A HEAD request first, so that non-HTML resources (images,
            # archives...) are never downloaded
            r = requests.head(url, headers=headers, allow_redirects=True,
                              timeout=self.REQUEST_TIMEOUT)
            content_type = r.headers.get("content-type", "")
            if "text/html" not in content_type.lower():
                return None

            r = requests.get(url, headers=headers,
                             timeout=self.REQUEST_TIMEOUT)
            html = r.text
        except requests.exceptions.RequestException as e:
            self.log.error("Unable to fetch URL: %s", e)
            return None

        try:
            self.log.debug("HTML [%s]", html)

            soup = BeautifulSoup(html, "lxml")

            og_title = self._get_opengraph_property(soup, "title")
            og_description = self._get_opengraph_property(soup, "description")

            if not og_title:
                title = soup.find("title")
                if not title:
                    return None
                og_title = title.text

            # Titles routinely span several lines or carry indentation,
            # the reply has to fit on a single IRC line
            og_title = self._collapse_whitespace(og_title)
            if not og_title:
                return None

            if og_description:
                og_description = self._collapse_whitespace(og_description)
            if og_description:
                return "%s | %s" % (og_title, og_description)
            return og_title
        except Exception as e:
            # NOTE: in theory no exceptions should be thrown here
            self.log.error("Unable to parse the HTML: %s", e)
            return None

    @irc3.event(irc3.rfc.PRIVMSG)
    async def show_title(self, mask, target, event, data):
        """Print the title of a webpage when a URL is posted on a channel"""
        if mask.nick == self.bot.nick or target not in self.bot.config["autojoins"]:
            return

        # Keep only the words that look like web links (once each, in order
        # of appearance), so that ordinary chatter costs nothing
        urls = list(dict.fromkeys(
            word for word in data.split()
            if word.lower().startswith(self.URL_PREFIXES)
        ))
        if not urls:
            return

        loop = asyncio.get_event_loop()

        titles = {}
        for url in urls:
            # `requests` is blocking: run the fetch in a worker thread so the
            # bot keeps processing IRC traffic while waiting on the server
            title_parsed = await loop.run_in_executor(
                None, self._get_url_title, url)
            if not title_parsed:
                self.log.debug("No title could be parsed for %s", url)
                continue

            titles[url] = title_parsed

        if len(titles) == 1:
            self.bot.privmsg(target, titles.popitem()[1])
        else:
            for url, title in titles.items():
                self.bot.privmsg(target, "%s: %s" % (url, title))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment