Parse RSS feeds listed in an OPML XML file and collect the links that are recommended most often across those feeds.
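For context, the script expects an OPML export along these lines (a minimal sketch: only `outline` elements with an `xmlUrl` attribute are treated as subscriptions, and the feed shown is a hypothetical placeholder):

<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.0">
  <body>
    <outline text="Newsletters">
      <outline text="Example Newsletter" type="rss"
               xmlUrl="https://newsletter.example.com/rss" />
    </outline>
  </body>
</opml>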
import json  # only needed by the commented-out debug dumps below
import ssl
from typing import List
from urllib.parse import urlparse
from xml.etree import ElementTree as ET

import feedparser
import requests
from bs4 import BeautifulSoup, ResultSet

# Disable TLS certificate verification so feeds served with broken
# certificates can still be fetched; drop this if you don't need it.
ssl._create_default_https_context = ssl._create_unverified_context
class Feed:
    """One subscription from the OPML file: display name plus RSS URL."""
    text: str
    rssUrl: str

    def __init__(self, text: str, rssUrl: str):
        self.text = text
        self.rssUrl = rssUrl
# Read every <outline> element from the OPML file and keep the ones
# that carry an xmlUrl attribute, i.e. actual feed subscriptions.
tree = ET.parse('subscriptions_newsletter.xml')
root = tree.getroot()
outlines = root.findall('.//outline')

feeds: List[Feed] = []
ref_links = {}

for p in outlines:
    rssUrl = p.attrib.get('xmlUrl')
    if rssUrl is not None:
        # print("%s | %s" % (p.attrib['text'], rssUrl))
        feeds.append(Feed(p.attrib['text'], rssUrl))
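
# ref_links maps an outbound URL to the list of {'link', 'title'} records of
# the feed entries that referenced it. handleInnerLinks below fills it in,
# counting each href at most once per entry and each referring domain at
# most once per href, so one newsletter cannot inflate a URL's score.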
def handleInnerLinks(links: ResultSet, backlink):
    sets = {}  # hrefs already counted for this entry
    backdomain = urlparse(backlink['link']).netloc
    for link in links:
        href = link.attrs.get('href', '')
        if not href.startswith('http'):
            continue
        # Resolve tracking/redirect URLs (click trackers, mail links)
        # to their real destination with a HEAD request.
        if 'click' in href or 'mail' in href or 'link' in href:
            try:
                response = requests.head(href, timeout=10)
                if response.status_code in (301, 302, 307, 308):
                    href = response.headers.get('Location', href)
            except requests.RequestException:
                pass  # keep the original href if the lookup fails
        # Strip utm tracking parameters (only handles '?utm' as the first one).
        if '?utm' in href:
            href = href[:href.index('?utm')]
        domain = urlparse(href).netloc
        # Skip self-references and links back to the entry's own domain.
        if href == backlink['link'] or backdomain == domain:
            continue
        # Count each href at most once per entry.
        if href in sets:
            continue
        backlinks = ref_links.setdefault(href, [])
        # Count each referring domain at most once per href.
        domainExists = False
        for existslink in backlinks:
            existsdomain = urlparse(existslink['link']).netloc
            if existsdomain == backdomain:
                domainExists = True
                break
        if domainExists:
            continue
        backlinks.append(backlink)
        sets[href] = ''
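
# Crawl every subscribed feed: parse it with feedparser, pull the HTML body
# of each entry, and hand all <a> links to handleInnerLinks together with a
# backlink record identifying the referring entry.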
count = 0
for feed in feeds:
    print("%s | %s" % (feed.text, feed.rssUrl))
    try:
        contents = feedparser.parse(feed.rssUrl)
        # print(json.dumps(contents, indent=4, sort_keys=True))
        for entry in contents.get('entries'):
            link = entry.link
            title = entry.title
            # Some feeds only provide a summary instead of full content.
            html = entry.content[0].value if 'content' in entry else entry.get('summary', '')
            soup = BeautifulSoup(html, 'html.parser')
            handleInnerLinks(soup.find_all('a'), {'link': link, 'title': title})
    except Exception as e:
        print(e)  # a broken feed shouldn't stop the whole run
    count += 1
    # Uncomment to stop after the first feed while debugging:
    # if count == 1:
    #     break

# Rank every referenced URL by how many distinct feeds recommended it.
linkinfos = []
for reflink in ref_links:
    linkinfos.append({'href': reflink, 'back_links': ref_links[reflink], 'link_count': len(ref_links[reflink])})
linkinfos.sort(key=lambda x: x['link_count'], reverse=True)
# print(json.dumps(linkinfos, indent=4))
for linkinfo in linkinfos:
    print('%s | %i | %s' % (linkinfo['href'], linkinfo['link_count'], linkinfo['back_links']))
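
To try this sketch, install the three third-party dependencies first (package names as published on PyPI):

pip install feedparser requests beautifulsoup4

The output prints each URL, the number of distinct feeds that linked to it, and the referring entries, sorted so the most-recommended links come first.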