from bs4 import BeautifulSoup, SoupStrainer
import urllib.request
import urllib.error
import colorama, re, queue, threading
from colorama import Fore
from urllib.parse import urlsplit, urlunsplit, urljoin
class check_link():
    def __init__(self, address):
        self.address = address

    def check(self, address):
        try:
            # send a browser-like User-Agent so sites that block unknown clients do not return 403
            req = urllib.request.Request(url=address, headers={"User-Agent": "Mozilla/5.0"})
            resp = urllib.request.urlopen(req)
            if resp.status in [400, 403, 404, 408, 409, 501, 502, 503]:
                print(Fore.RED + "{}-{}-->{}".format(resp.status, resp.reason, address))
            else:
                print(Fore.GREEN + "no problem in-->" + address)
        except urllib.error.HTTPError as e:
            # urlopen raises HTTPError for 4xx/5xx responses, so most broken links end up here
            print(Fore.RED + "{}-{}-->{}".format(e.code, e.reason, address))
        except Exception as e:
            print(Fore.YELLOW + "{}-{}".format(e, address))
def pattern_adjust(a):
    try:
        if re.match('^#', a):
            return 0                                    # in-page anchor, nothing to check
        r = urlsplit(a)
        if r.scheme == '' and (r.netloc != '' or r.path != ''):
            d = urlunsplit(r)
            if re.match('^//', d):                      # protocol-relative link, e.g. //example.com/page
                m = re.search(r'(?<=//)\S+', d)
                return "https://" + m.group(0)
            return urljoin(website, d)                  # relative path: resolve against the target site
        elif r.scheme == '' and r.netloc == '':
            return urljoin(website, a)
        else:
            return a                                    # already an absolute URL
    except Exception:
        pass
def extract_link(address):
    tags = {'a': 'href', 'img': 'src', 'script': 'src', 'link': 'href'}
    for key, value in tags.items():
        try:
            headers = {"User-Agent": "Mozilla/5.0"}
            res = urllib.request.urlopen(urllib.request.Request(url=address, headers=headers))
            response = res.read().decode('utf-8')  # needs improvement
            for link in BeautifulSoup(response, "html.parser", parse_only=SoupStrainer(key)):
                if link.has_attr(value) and address in link[value]:  # address in link[value] to keep testing the target site only
                    p = pattern_adjust(link[value])
                    if p != 0 and p is not None:
                        newcheck = check_link(p)
                        newcheck.check(p)
                        if p not in hyperlinks:
                            hyperlinks.add(p)
                            if website.split('.')[1] in p:  # needs improvement
                                if not p.endswith(('.png', '.jpeg', '.js', '.jpg')):  # do not crawl static assets
                                    q.put(p)
        except Exception as e:
            print(e, address)
def threader():
    while True:
        value = q.get()
        extract_link(value)
        q.task_done()
if __name__ == "__main__":
    colorama.init()
    q = queue.Queue()
    global hyperlinks, website
    hyperlinks = set()
    website = 'https://www.sozcu.com.tr/'  # target website
    for x in range(30):
        t = threading.Thread(target=threader)
        t.daemon = True                     # daemon threads exit together with the main thread
        t.start()
    q.put(website.strip())
    q.join()
@hackerdem what's the license for this file?
Nice!
Hey, wonderful script. However, it keeps crawling the other domains it links to, which is unwanted behavior (for example, if I include a link to Google on my website, it will scan Google as well). Could a fix be made to restrict the crawl to the same domain only?
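One way to keep the crawl on a single site is to compare the host part of each extracted link with the host of the start URL before queueing it. A minimal sketch, assuming the start URL is the website variable from the script; the same_domain helper is hypothetical, not part of the original code:

from urllib.parse import urlsplit

def same_domain(url, base):
    # True only when the link points at the same host as the start page
    return urlsplit(url).netloc == urlsplit(base).netloc

print(same_domain("https://www.sozcu.com.tr/ekonomi", "https://www.sozcu.com.tr/"))  # True
print(same_domain("https://www.google.com/", "https://www.sozcu.com.tr/"))           # False

Inside extract_link, the q.put(p) call could then be guarded with same_domain(p, website) so external links are still checked once but never crawled further.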
With this code, how can I print those links to a document or an Excel sheet?
Hey, please try to implement some additional code that checks every link's root URL against a base URL, so that if they are not the same the crawler does not follow it. Another question is about printing the results to another file; I think Python's csv library can be used for that.
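For the second point, here is a minimal sketch of writing the collected results with Python's csv module. The results list and the report.csv filename are made-up examples; in the script, a row would be appended wherever check() currently prints a status:

import csv

results = [
    ("https://www.sozcu.com.tr/", "200-OK"),
    ("https://www.sozcu.com.tr/missing-page", "404-Not Found"),
]

with open("report.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["url", "status"])  # header row
    writer.writerows(results)           # one row per checked link

The resulting CSV file opens directly in Excel.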
Hey, try to include a header with the urlopen request as well. Some websites return 403 Forbidden if the request's user agent is unknown. It should look something like:
url = "https://atomstalk.com"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1)'}
info = urllib.request.urlopen(urllib.request.Request(url=url, headers=headers))
How can I debug the code if it is not working for a given URL?
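A simple way to find out is to take the failing URL out of the threaded loop and request it on its own with the full traceback printed. A small sketch, with a placeholder URL:

import traceback
import urllib.request

url = "https://example.com/some-page"  # replace with the URL that fails
try:
    req = urllib.request.Request(url=url, headers={"User-Agent": "Mozilla/5.0"})
    resp = urllib.request.urlopen(req)
    print(resp.status, resp.reason)
except Exception:
    traceback.print_exc()  # shows exactly where and why the request failed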
Thanks. It works right out of the box!
Can someone explain each part?
Hi, thanks a lot for the script. I tried it on my site, gave it a single hyperlink, and it ran over all the links and gave me the output. There are many links; is there any way to know the base link, i.e. the page on which a broken link was found?
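One way to get that information is to push (link, parent page) pairs through the queue instead of bare links, so every broken link can be reported together with the page it was found on. A rough standalone sketch of the idea; get_links and is_broken stand in for the gist's extract_link and check_link and are assumptions, not part of the original script:

import queue

def crawl_with_parents(start, get_links, is_broken):
    # get_links(url) -> list of links found on that page (assumed helper)
    # is_broken(url) -> True if the link is dead (assumed helper)
    q = queue.Queue()
    q.put((start, None))  # each queue item is (link, page it was found on)
    seen = {start}
    while not q.empty():
        link, parent = q.get()
        if is_broken(link):
            print("broken:", link, "found on:", parent)
            continue
        for child in get_links(link):
            if child not in seen:
                seen.add(child)
                q.put((child, link))  # remember where the child link came from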