Last active
December 19, 2015 22:39
-
-
Save vstoykov/6028987 to your computer and use it in GitHub Desktop.
Simple parser to get all meta tags
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Fetch SEO data from a given URL
"""
from HTMLParser import HTMLParser | |
from urllib2 import build_opener | |
class SeoParser(HTMLParser):
    """
    Parser which will get all SEO meta tags information

    After :meth:`get_seo_tags` (or a manual ``feed``) the instance exposes:

    * ``title``     - text of the ``<title>`` element (``None`` if not seen)
    * ``meta_tags`` - ``{name: content}`` for ``<meta name=...>`` tags
    * ``og_tags``   - ``{property without "og:": content}`` for Open Graph tags
    * ``content``   - text of the ``CONTENT_TAGS`` elements found inside a
      ``div``/``section``/``article`` whose class contains ``content`` or
      ``body``, or whose id contains ``content``
    * ``charset``   - value of ``<meta charset=...>`` or the given default
    """

    CONTENT_TAGS = ('p', 'h1', 'h2', 'h3', 'h4')
    ALLOWED_INLINE_TAGS = ('b', 'u', 'strong', 'em', 'br')

    # Void elements have no end tag, so they must not take part in the
    # nesting counter, otherwise the counter can never get back to zero.
    _VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self, url, default_charset='utf-8'):
        """
        :param url: address of the page fetched by :meth:`get_seo_tags`
        :param default_charset: charset used until the page declares one
            with ``<meta charset=...>``
        """
        HTMLParser.__init__(self)
        self.url = url

        # Parser state
        self._last_tag = None
        self._in_head = False
        self._in_title = False
        self._in_body = False
        self._content_stack = 0  # nesting depth inside the content container
        self._in_content = False
        self._in_content_tag = False

        # Collected results
        self.charset = default_charset
        self.title = None
        self.meta_tags = {}
        self.og_tags = {}
        self.content = ''

    def handle_starttag(self, tag, attrs):
        """ Choose what to do when open a tag """
        attrs = dict(attrs)
        tag = tag.lower()

        if tag == 'body' and not self._in_body:
            # Checked first so that a document which omits ``</head>``
            # still leaves the head section when the body starts.
            self._in_head = False
            self._in_body = True
        elif self._in_head:
            if tag == 'title':
                self._in_title = True
            elif tag == 'meta':
                self._handle_meta(attrs)
        elif self._in_body:
            if self._in_content:
                if tag in self.CONTENT_TAGS:
                    self._in_content_tag = True
                    self.content += "<%s>" % tag
            elif tag in ('div', 'section', 'article'):
                tag_class = (attrs.get('class') or '').lower()
                tag_id = (attrs.get('id') or '').lower()
                if ('content' in tag_class or 'body' in tag_class
                        or 'content' in tag_id):
                    self._in_content = True
        elif tag == 'head':
            self._in_head = True

        # Count every opened element inside the content container (the
        # container itself included) so that the matching end tags can
        # tell when the container is closed.
        if self._in_content and tag not in self._VOID_TAGS:
            self._content_stack += 1
        self._last_tag = tag

    def _handle_meta(self, attrs):
        """ Store the information carried by one ``<meta>`` tag of the head """
        if 'charset' in attrs:
            self.charset = attrs['charset']
            return
        meta_property = attrs.get('property') or ''
        meta_name = attrs.get('name') or ''
        meta_content = attrs.get('content')
        if meta_property.startswith('og:'):
            # Key is the property name without the "og:" prefix
            self.og_tags[meta_property[3:]] = meta_content
        elif meta_name:
            self.meta_tags[meta_name] = meta_content

    def handle_data(self, data):
        """ Choose what to do with text nodes """
        data = data.strip()
        if not data:
            return
        if self._in_title:
            # Collapse new lines and repeated whitespace in the title
            self.title = ' '.join(data.split())
        if self._in_content_tag:
            # Add space before content in some circumstances
            if self.content[-1] not in '> ' and data[0] not in ',."':
                data = ' ' + data
            self.content += data

    def handle_endtag(self, tag):
        """ Choose what to do when close a tag """
        tag = tag.lower()
        if self._in_head:
            # Any end tag seen in the head terminates the title
            if self._in_title:
                self._in_title = False
            if tag == 'head':
                self._in_head = False
        elif self._in_body:
            if self._in_content:
                if tag in self.CONTENT_TAGS:
                    self._in_content_tag = False
                    self.content += "</%s>\n" % tag
                # Void elements were not counted when opened
                if tag not in self._VOID_TAGS:
                    self._content_stack -= 1
            if tag == 'body':
                self._content_stack = 0
                self._in_body = False
        if self._content_stack == 0:
            self._in_content = False

    def get_seo_tags(self):
        """
        Download ``self.url``, parse it and return the collected data.

        :returns: dict with the keys ``title``, ``meta``, ``og`` and
            ``content``
        """
        opener = build_opener()
        # Add custom header to identify the parser
        opener.addheaders = [('User-agent', 'Mozilla/5.0 SEO meta tags parser (https://gist.github.com/vstoykov/6028987)')]
        response = opener.open(self.url)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading fails
            response.close()
        self.feed(html)
        self._fix_encoding()
        return {
            'title': self.title,
            'meta': self.meta_tags,
            'og': self.og_tags,
            'content': self.content,
        }

    def unescape(self, s):
        """ Unescape entities, decoding ``s`` with the page charset first """
        # Make unescaping more exception safe
        super_unescape = HTMLParser.unescape
        try:
            return super_unescape(self, s.decode(self.charset))
        except UnicodeDecodeError:
            try:
                return super_unescape(self, s)
            except UnicodeDecodeError:
                return s

    def _to_text(self, value):
        """ Decode a byte string with the page charset, pass text through """
        if isinstance(value, bytes):
            # Best effort: a stray invalid byte should not abort the fetch
            return value.decode(self.charset, 'replace')
        return value

    def _fix_encoding(self):
        """ Convert every collected value to text, in place """
        for attr in ('title', 'content'):
            val = getattr(self, attr)
            if val:
                setattr(self, attr, self._to_text(val))
        for attr in ('meta_tags', 'og_tags'):
            items = getattr(self, attr)
            # Iterate over a snapshot because values are reassigned
            for key, val in list(items.items()):
                if not val:
                    continue
                # Decode before replacing so that a byte string is never
                # implicitly coerced through the ASCII codec.
                items[key] = self._to_text(val).replace(u'\xa0', u' ')
def get_seo_tags(url):
    """
    Fetch ``url`` and return its SEO data.

    Shortcut which builds a ``SeoParser`` for the address and returns
    the result of its ``get_seo_tags`` method.
    """
    return SeoParser(url).get_seo_tags()
def main():
    """ Main Function """
    seo_data = get_seo_tags('http://magicsolutions.bg/')
    # With a single argument this prints the same text whether ``print``
    # is parsed as a statement or called as a function.
    print(seo_data)
if __name__ == '__main__':
    # ``exit`` is injected by the ``site`` module for interactive use and
    # is missing under ``python -S``; ``sys.exit`` is always available.
    import sys

    # ``main`` returns nothing on success, which maps to exit status 0.
    sys.exit(main() or 0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment