Created
September 5, 2020 13:52
-
-
Save PandaWhoCodes/3db075219a82fbf412ddbe07a4c594ac to your computer and use it in GitHub Desktop.
Extract feed URLs from a given URL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
# MIME types that mark a ``<link rel="alternate">`` element as a syndication
# feed. ``text/xml`` and ``application/xml`` are included because some sites
# advertise their feeds with a generic XML type.
_FEED_MIME_TYPES = frozenset({
    "application/rss+xml",
    "application/atom+xml",
    "application/rdf+xml",
    "application/feed+json",
    "application/xml",
    "text/xml",
})


def detect_feeds_in_HTML(html):
    """Return the feed URLs advertised by an HTML page.

    The page is parsed with BeautifulSoup (``lxml`` parser) and every
    ``<link rel="alternate">`` element is inspected. A link is reported only
    when its ``type`` attribute names a known feed MIME type, so other
    alternates (translations, API endpoints, oEmbed links, ...) are skipped.

    Args:
        html: The HTML document, as text or bytes.

    Returns:
        A list of ``href`` values in document order. They are returned
        exactly as written in the page, so relative URLs stay relative.
        The list is empty when the page advertises no feed.
    """
    soup = BeautifulSoup(html, "lxml")
    result = []
    for feed_link in soup.find_all("link", rel="alternate"):
        # Drop MIME parameters (e.g. "; charset=utf-8") and normalise case
        # before comparing against the known feed types.
        mime_type = (feed_link.get("type") or "").split(";", 1)[0].strip().lower()
        if mime_type not in _FEED_MIME_TYPES:
            continue
        url = feed_link.get("href")
        # Skip links whose href is missing or empty.
        if url:
            result.append(url)
    return result
def get_html(url, timeout=10):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host. Pass ``None`` to wait without limit.

    Returns:
        The response body as bytes.
    """
    return requests.get(url, timeout=timeout).content
if __name__ == "__main__":
    # Demo run: download a sample page and print the feed links found in it.
    page_html = get_html("https://www.doraithodla.com/")
    print(detect_feeds_in_HTML(page_html))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment