Parse RSS feeds listed in an OPML XML file and collect the links that are recommended most often across those feeds.
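For context, the script expects an OPML export along these lines (a minimal sketch: only `outline` elements with an `xmlUrl` attribute are treated as subscriptions, and the feed shown is a hypothetical placeholder):

<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.0">
  <body>
    <outline text="Newsletters">
      <outline text="Example Newsletter" type="rss"
               xmlUrl="https://newsletter.example.com/rss" />
    </outline>
  </body>
</opml>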
import json  # only needed by the commented-out debug dumps below
import ssl
from typing import List
from urllib.parse import urlparse
from xml.etree import ElementTree as ET

import feedparser
import requests
from bs4 import BeautifulSoup, ResultSet

# Disable TLS certificate verification so feeds served with broken
# certificates can still be fetched; drop this if you don't need it.
ssl._create_default_https_context = ssl._create_unverified_context
class Feed:
    """One subscription from the OPML file: display name plus RSS URL."""
    text: str
    rssUrl: str

    def __init__(self, text: str, rssUrl: str):
        self.text = text
        self.rssUrl = rssUrl
# Read every <outline> element from the OPML file and keep the ones
# that carry an xmlUrl attribute, i.e. actual feed subscriptions.
tree = ET.parse('subscriptions_newsletter.xml')
root = tree.getroot()
outlines = root.findall('.//outline')

feeds: List[Feed] = []
ref_links = {}

for p in outlines:
    rssUrl = p.attrib.get('xmlUrl')
    if rssUrl is not None:
        # print("%s | %s" % (p.attrib['text'], rssUrl))
        feeds.append(Feed(p.attrib['text'], rssUrl))
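
# ref_links maps an outbound URL to the list of {'link', 'title'} records of
# the feed entries that referenced it. handleInnerLinks below fills it in,
# counting each href at most once per entry and each referring domain at
# most once per href, so one newsletter cannot inflate a URL's score.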
def handleInnerLinks(links: ResultSet, backlink):
    sets = {}  # hrefs already counted for this entry
    backdomain = urlparse(backlink['link']).netloc
    for link in links:
        href = link.attrs.get('href', '')
        if not href.startswith('http'):
            continue
        # Resolve tracking/redirect URLs (click trackers, mail links)
        # to their real destination with a HEAD request.
        if 'click' in href or 'mail' in href or 'link' in href:
            try:
                response = requests.head(href, timeout=10)
                if response.status_code in (301, 302, 307, 308):
                    href = response.headers.get('Location', href)
            except requests.RequestException:
                pass  # keep the original href if the lookup fails
        # Strip utm tracking parameters (only handles '?utm' as the first one).
        if '?utm' in href:
            href = href[:href.index('?utm')]
        domain = urlparse(href).netloc
        # Skip self-references and links back to the entry's own domain.
        if href == backlink['link'] or backdomain == domain:
            continue
        # Count each href at most once per entry.
        if href in sets:
            continue
        backlinks = ref_links.setdefault(href, [])
        # Count each referring domain at most once per href.
        domainExists = False
        for existslink in backlinks:
            existsdomain = urlparse(existslink['link']).netloc
            if existsdomain == backdomain:
                domainExists = True
                break
        if domainExists:
            continue
        backlinks.append(backlink)
        sets[href] = ''
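
# Crawl every subscribed feed: parse it with feedparser, pull the HTML body
# of each entry, and hand all <a> links to handleInnerLinks together with a
# backlink record identifying the referring entry.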
count = 0
for feed in feeds:
    print("%s | %s" % (feed.text, feed.rssUrl))
    try:
        contents = feedparser.parse(feed.rssUrl)
        # print(json.dumps(contents, indent=4, sort_keys=True))
        for entry in contents.get('entries'):
            link = entry.link
            title = entry.title
            # Some feeds only provide a summary instead of full content.
            html = entry.content[0].value if 'content' in entry else entry.get('summary', '')
            soup = BeautifulSoup(html, 'html.parser')
            handleInnerLinks(soup.find_all('a'), {'link': link, 'title': title})
    except Exception as e:
        print(e)  # a broken feed shouldn't stop the whole run
    count += 1
    # Uncomment to stop after the first feed while debugging:
    # if count == 1:
    #     break

# Rank every referenced URL by how many distinct feeds recommended it.
linkinfos = []
for reflink in ref_links:
    linkinfos.append({'href': reflink, 'back_links': ref_links[reflink], 'link_count': len(ref_links[reflink])})
linkinfos.sort(key=lambda x: x['link_count'], reverse=True)
# print(json.dumps(linkinfos, indent=4))
for linkinfo in linkinfos:
    print('%s | %i | %s' % (linkinfo['href'], linkinfo['link_count'], linkinfo['back_links']))
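
To try this sketch, install the three third-party dependencies first (package names as published on PyPI):

pip install feedparser requests beautifulsoup4

The output prints each URL, the number of distinct feeds that linked to it, and the referring entries, sorted so the most-recommended links come first.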