|
# Based on https://gist.github.com/vstoykov/6028987
#
# Usage:
#   python socialmediacard.py 'https://meduza.io/feature/2024/02/28/ya-sdelayu-vse-chtoby-zlo-otstupilo-a-prekrasnoe-buduschee-prishlo'
|
|
|
import html.parser |
|
import urllib.request |
|
|
|
class SeoParser(html.parser.HTMLParser):
    """Collect SEO-relevant data from an HTML document.

    Feed the decoded HTML text with :meth:`feed`; afterwards the instance
    exposes:

    * ``charset``   -- value of ``<meta charset=...>`` if one was seen in
      ``<head>``, otherwise the ``default_charset`` given to the constructor
    * ``title``     -- text of ``<title>`` (``None`` if absent)
    * ``meta_tags`` -- ``{name: content}`` for ``<meta name=...>`` in ``<head>``
    * ``og_tags``   -- ``{suffix: content}`` for ``<meta property="og:...">``
    * ``content``   -- markup rebuilt from ``CONTENT_TAGS`` elements found
      inside a ``div``/``section``/``article`` whose class contains
      "content" or "body", or whose id contains "content"
    """

    CONTENT_TAGS = ('p', 'h1', 'h2', 'h3', 'h4')
    ALLOWED_INLINE_TAGS = ('b', 'u', 'strong', 'em', 'br')
    # HTML void elements: they never have a matching end tag, so they must
    # not take part in the open/close depth counting of the content region.
    VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))
    # Default character mapping for get_seo_dict(): NBSP -> plain space.
    DEFAULT_TRANSLATION = {ord('\xa0'): ' '}

    def __init__(self, default_charset='utf-8'):
        """Create a parser; ``default_charset`` is reported when the page
        does not declare one via ``<meta charset>``."""
        super().__init__()
        self._last_tag = None
        self._in_head = False
        self._in_title = False
        self._in_body = False
        # Nesting depth of non-void tags opened since the content region began.
        self._content_stack = 0
        self._in_content = False
        self._in_content_tag = False
        self.charset = default_charset
        self.title = None
        self.meta_tags = {}
        self.og_tags = {}
        self.content = ''

    def handle_starttag(self, tag, attrs):
        """ Choose what to do when open a tag """
        attrs = dict(attrs)
        tag = tag.lower()
        if self._in_head:
            if tag == 'title':
                self._in_title = True
            elif tag == 'meta':
                if 'charset' in attrs:
                    # A bare ``<meta charset>`` yields None; keep the
                    # current charset instead of overwriting it with None.
                    if attrs['charset']:
                        self.charset = attrs['charset']
                else:
                    meta_property = attrs.get('property') or ''
                    meta_name = attrs.get('name') or ''
                    meta_content = attrs.get('content')

                    if meta_property.startswith('og:'):
                        # Store Open Graph tags without the "og:" prefix.
                        self.og_tags[meta_property[3:]] = meta_content
                    elif meta_name:
                        self.meta_tags[meta_name] = meta_content

        elif self._in_body:
            if self._in_content:
                if tag in self.CONTENT_TAGS:
                    self._in_content_tag = True
                    self.content += "<%s>" % tag

            elif tag in ['div', 'section', 'article']:
                # Heuristic: a container whose class/id mentions
                # "content" (or class mentions "body") opens the region.
                tag_class = (attrs.get('class') or '').lower()
                tag_id = (attrs.get('id') or '').lower()
                if 'content' in tag_class or 'body' in tag_class:
                    self._in_content = True
                elif 'content' in tag_id:
                    self._in_content = True

        elif tag == 'head':
            self._in_head = True

        elif tag == 'body':
            self._in_head = False
            self._in_body = True

        # Void elements get no end tag, so counting them would leave the
        # depth permanently too high and the content region would never close.
        if self._in_content and tag not in self.VOID_TAGS:
            self._content_stack += 1

        self._last_tag = tag

    def handle_data(self, data):
        """ Chose what to do with text nodes """
        data = data.strip()
        if not data:
            return

        if self._in_title:
            # Collapse newlines and runs of whitespace inside the title.
            self.title = ' '.join(data.split())

        if self._in_content_tag:
            # Add space before content in some circumstances
            # (self.content is never empty here: entering a content tag
            # always appends its "<tag>" first).
            if self.content[-1] not in '> ' and data[0] not in ',."':
                data = ' ' + data
            self.content += data

    def handle_endtag(self, tag):
        """ Choose what to do when close a tag """
        tag = tag.lower()
        if self._in_head:
            if self._in_title:
                self._in_title = False
            if tag == 'head':
                self._in_head = False
        elif self._in_body:
            if self._in_content:
                if tag in self.CONTENT_TAGS:
                    self._in_content_tag = False
                    self.content += "</%s>\n" % tag

                # Self-closing void tags (``<br/>``) do reach this handler;
                # skip them because their start tag was not counted either.
                if tag not in self.VOID_TAGS:
                    self._content_stack -= 1

            if tag == 'body':
                self._content_stack = 0
                self._in_body = False

            if self._content_stack == 0:
                self._in_content = False

    def get_seo_dict(self, translate=None):
        """Return the collected data as a plain dict.

        ``translate`` is a ``str.translate`` table applied to every string
        value; when omitted, non-breaking spaces are replaced by ordinary
        spaces. Values that are ``None`` (no ``<title>``, or a ``<meta>``
        without ``content``) are passed through unchanged.

        Returns a dict with the keys ``title``, ``content``, ``meta_tags``
        and ``og_tags``.
        """
        if translate is None:
            translate = self.DEFAULT_TRANSLATION

        def clean(value):
            return value.translate(translate) if isinstance(value, str) else value

        return dict(
            title=clean(self.title),
            content=clean(self.content),
            meta_tags={k: clean(v) for k, v in self.meta_tags.items()},
            og_tags={k: clean(v) for k, v in self.og_tags.items()},
        )
|
|
|
if __name__ == '__main__':
    # Command-line entry point: fetch the URL given as the only argument,
    # parse it and print the extracted SEO dictionary.
    import sys

    if len(sys.argv) != 2:
        sys.exit('usage: %s URL' % sys.argv[0])
    url = sys.argv[1]

    # Close the HTTP response deterministically once the body is read.
    with urllib.request.urlopen(url) as response:
        response_bytes = response.read()

    # First pass: decode as UTF-8 but tolerate invalid bytes, so a page in
    # another encoding can still be scanned for its <meta charset>
    # declaration instead of failing with UnicodeDecodeError.
    seo = SeoParser(default_charset='utf-8')
    seo.feed(response_bytes.decode('utf-8', errors='replace'))

    # Charset labels are case-insensitive ("UTF-8" == "utf-8").
    declared_charset = (seo.charset or 'utf-8').strip().lower()
    if declared_charset not in ('utf-8', 'utf8'):
        try:
            response_text_decoded = response_bytes.decode(declared_charset, errors='replace')
        except LookupError:
            # Unknown codec name declared by the page: keep the
            # result of the UTF-8 pass rather than crashing.
            pass
        else:
            # Second pass: re-parse the text decoded with the declared charset.
            seo = SeoParser(default_charset='utf-8')
            seo.feed(response_text_decoded)
    print(seo.get_seo_dict())