Skip to content

Instantly share code, notes, and snippets.

@samundra
Created March 25, 2016 05:30
Show Gist options
  • Select an option

  • Save samundra/32e7a4e14fe319cf7603 to your computer and use it in GitHub Desktop.

Select an option

Save samundra/32e7a4e14fe319cf7603 to your computer and use it in GitHub Desktop.
Get the URLs from the sitemap with Python and Beautiful Soup
import sys
import urllib
from bs4 import BeautifulSoup
def get_sitemap_url(sitemap_url):
    """Yield the ``<loc>`` of every sitemap entry that changes daily.

    Downloads the sitemap at *sitemap_url*, parses it with Beautiful Soup
    and yields the text of the ``<loc>`` child of each ``<url>`` element
    whose ``<changefreq>`` child is ``daily``.

    This is a generator: nothing is fetched until iteration starts, and
    any error raised while fetching or parsing propagates unchanged to
    the code that iterates it.

    Args:
        sitemap_url: Address of the sitemap XML document to download.

    Yields:
        The whitespace-stripped text of each matching ``<loc>`` element.
    """
    # Imported here so the function works on both interpreter lines:
    # ``urllib.urlopen`` only exists on Python 2.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    response = urlopen(sitemap_url)
    try:
        xml = response.read()
    finally:
        # Release the connection before parsing/yielding so it is not held
        # open for as long as the caller keeps the generator alive.
        response.close()

    soup = BeautifulSoup(markup=xml, features="lxml")
    for entry in soup.find_all("url"):
        # Look the children up by name instead of walking backwards from
        # <changefreq>; the sibling order inside <url> is not guaranteed.
        changefreq = entry.find("changefreq", recursive=False)
        loc = entry.find("loc", recursive=False)
        if changefreq is None or loc is None:
            continue
        if changefreq.text.strip() == "daily":
            yield loc.text.strip()
if __name__ == "__main__":
    # Print every daily-changing URL of a sitemap, one per line. An optional
    # first command-line argument overrides the default sitemap address.
    default_sitemap = "http://www.classichome.com/sitemap.xml"
    sitemap = sys.argv[1] if len(sys.argv) > 1 else default_sitemap
    for url in get_sitemap_url(sitemap):
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment