Generates a sitemap of a site and can also list the links at any given depth (layer). A usage sketch follows the code.
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

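# Every url discovered so far, shared across all nodes so no page is queued twice.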
all_urls = set()

class url_tree():
    '''A tree of crawled urls: each node holds one url, its depth (page_no),
    and the child urls found on that page.'''

    def __init__(self, base_url, page_no=1, children=None):
        self.url = base_url
        self.page_no = page_no
        self.children = []
        if children is not None:
            for child in children:
                if child not in all_urls:
                    all_urls.add(child)
                    self.add_child(url_tree(child, self.page_no + 1))
        # to_level is a module-level setting (see __main__ below); crawling
        # stops once a node's depth goes past it.
        if to_level >= self.page_no:
            self.get_children()

    def add_child(self, node):
        '''Adds a child node under this parent node.'''
        assert isinstance(node, url_tree)
        self.children.append(node)

    def get_layer(self, no):
        '''Yields the urls at the given depth below the root url.'''
        if self.page_no != no:
            for child in self.children:
                # Recurse and re-yield, so callers always receive plain url strings.
                yield from child.get_layer(no)
        else:
            yield self.url

    def get_children(self):
        '''Fetches this node's page and adds every new link on it as a child.'''
        try:
            page = requests.get(self.url)
        except Exception as e:
            print(e)  # the url is malformed or unreachable
            return
        print('scraped -> ', self.url)
        soup = BeautifulSoup(page.content, 'lxml')
        for link in soup.find_all('a', href=True):
            llink = self.true_url(self.url, link['href'])
            if llink and llink not in all_urls:
                all_urls.add(llink)
                self.add_child(url_tree(llink, page_no=self.page_no + 1))

    def true_url(self, url, x, extensive_check=False):
        '''Returns an absolute url if x passes some basic validation, else False.'''
        try:
            result = urlparse(x)
            if all([result.scheme, result.netloc, result.path]):
                # Already a complete absolute url.
                return x
            elif any([result.scheme, result.netloc, result.path]):
                if extensive_check:
                    # Optionally confirm the resolved url actually responds.
                    response = requests.get(urljoin(url, x))
                    return urljoin(url, x) if response.status_code == 200 else False
                else:
                    return urljoin(url, x)
            else:
                return False
        except Exception:
            return False

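    # Note on true_url above: urljoin resolves relative hrefs against the current
    # page, e.g. urljoin('https://example.com/docs/', '../about') gives
    # 'https://example.com/about'.
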
    def all_urls(self, send=False):
        '''Prints every url collected so far; also returns the set when send is True.'''
        for url in all_urls:
            print(url)
        if send:
            return all_urls

    def sitemap(self, depth):
        '''Prints every parent -> child edge down to the given depth.'''
        if self.page_no <= depth:
            for child in self.children:
                print(self.url, '->', child)
            for child in self.children:
                child.sitemap(depth)
        return ""

    def __str__(self):
        return self.url

    def __repr__(self):
        return self.url

if __name__ == '__main__':
    url_ = "https://github.com/Cartman720/PySitemap"  # enter the base url here
    to_level = 1  # how many layers deep to scrape; the deepest reachable layer is to_level + 1, so this exposes layer-2 links
    tree = url_tree(url_)  # build the tree from the base url
    print(list(tree.get_layer(2)))  # the links at depth 2
    tree.sitemap(depth=2)  # print the sitemap down to depth 2
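For a quick sense of how the pieces fit together, here is a minimal sketch of driving the class by hand. The url and depth values are placeholders, and it assumes the snippet lives in the same module as the class so the to_level global is visible to url_tree.__init__ (requests, bs4, and lxml must be installed):

# Hand-driven example; https://example.com is a hypothetical starting point.
to_level = 2                            # fetch the root page and each page one layer below it
tree = url_tree("https://example.com")

for link in tree.get_layer(3):          # links found two hops from the root
    print(link)

tree.sitemap(depth=2)                   # print parent -> child edges down to depth 2
urls = tree.all_urls(send=True)         # print everything collected and keep the set

Because to_level is read inside __init__, it has to be set before the tree is constructed; the crawl happens eagerly as each node is created.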