from bs4 import BeautifulSoup, SoupStrainer
import urllib.request
import colorama, re, queue, threading
from colorama import Fore
from urllib.parse import urlsplit, urlunsplit, urljoin

class check_link():
    def __init__(self, address):
        self.address = address

    def check(self, address):
        # Request the link and report its HTTP status; broken links are printed in red.
        try:
            req = urllib.request.Request(url=address)
            resp = urllib.request.urlopen(req)
            if resp.status in [400, 403, 404, 408, 409, 501, 502, 503]:
                print(Fore.RED + "{}-{}-->{}".format(resp.status, resp.reason, address))
            else:
                print(Fore.GREEN + "no problem in-->" + address)
        except Exception as e:
            # 4xx/5xx responses usually arrive here as HTTPError and are reported in yellow.
            print(Fore.YELLOW + "{}-{}".format(e, address))

def pattern_adjust(a, base):
    # Normalize relative and protocol-relative URLs against the page they were found on.
    try:
        if re.match('^#', a):  # skip in-page anchors
            return 0
        r = urlsplit(a)
        if r.scheme == '' and (r.netloc != '' or r.path != ''):
            d = urlunsplit(r)
            if re.match('^//', d):  # protocol-relative URL, e.g. //example.com/x
                m = re.search(r'(?<=//)\S+', d)
                d = m.group(0)
                return "https://" + d
            return urljoin(base, d)  # plain relative path
        elif r.scheme == '' and r.netloc == '':
            return urljoin(base, a)
        else:
            return a
    except Exception:
        pass

def extract_link(address):
    # Map of tags to the attribute that holds the URL we want to test.
    tags = {'a': 'href', 'img': 'src', 'script': 'src', 'link': 'href'}
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        res = urllib.request.urlopen(urllib.request.Request(url=address, headers=headers))
        response = res.read().decode('utf-8')  # needs improvement
    except Exception as e:
        print(e, address)
        return
    for key, value in tags.items():
        try:
            for link in BeautifulSoup(response, "html.parser", parse_only=SoupStrainer(key)):
                if link.has_attr(value) and address in link[value]:  # address in link[value] to keep testing the target site only
                    p = pattern_adjust(link[value], address)
                    if p != 0 and str(p) != 'None':
                        newcheck = check_link(p)
                        newcheck.check(p)
                        if p not in hyperlinks:
                            hyperlinks.add(p)
                            if website.split('.')[1] in p:  # needs improvement
                                if not p.endswith(('.png', '.jpeg', '.js', '.jpg')):
                                    q.put(p)  # crawl this page as well
        except Exception as e:
            print(e, address)

def threader():
    # Worker: take URLs off the queue and extract/check their links.
    while True:
        value = q.get()
        extract_link(value)
        q.task_done()

if __name__ == "__main__":
    colorama.init()
    q = queue.Queue()
    global hyperlinks, website
    hyperlinks = set()
    website = 'https://www.sozcu.com.tr/'  # Target website
    for x in range(30):
        t = threading.Thread(target=threader)
        t.daemon = True
        t.start()
    q.put(website.strip())
    q.join()
Hey, wonderful script. However, it keeps crawling the other domains that are linked from the target, which is unwanted behavior (for example, if I include a link to Google on my website, it will scan Google as well). Could a fix be made to restrict the crawl to the same domain only?
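A minimal sketch of such a same-domain filter, assuming the website global and the place in extract_link where links are put on the queue; the helper name same_domain is made up for illustration:

from urllib.parse import urlparse

def same_domain(link, base):
    # Keep only links whose host matches the target site's host.
    return urlparse(link).netloc == urlparse(base).netloc

# In extract_link(), guard the queue with the filter instead of the split('.') check:
#     if same_domain(p, website) and not p.endswith(('.png', '.jpeg', '.js', '.jpg')):
#         q.put(p)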
With this code, how can the collected links be printed to a document or an Excel sheet?
Hey, please try to add some code that compares every link's root URL against the base URL, so the crawler skips links that are not on the same domain. As for printing the results to another file, I think Python's csv library can be used for that.
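A short sketch of the CSV idea, assuming the hyperlinks set collected by the crawler and an output file name of broken_links.csv (the file name is an assumption); it would run after q.join() returns:

import csv

with open('broken_links.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['url'])           # header row
    for link in sorted(hyperlinks):    # every link the crawler visited
        writer.writerow([link])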
Hey, try to include a header with the urlopen request as well. Some websites return 403 Forbidden if the request comes from an unknown client. It should look something like:
url = "https://atomstalk.com"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1)'}
info = urllib.request.urlopen(urllib.request.Request(url=url, headers=headers))
How can I debug the code if it is not working for a given URL?
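One way to narrow it down, assuming the functions from the script above: skip the threads and queue and call the pieces directly on the failing URL, so any exception or unexpected status is printed right away (test_url is just a placeholder):

import queue

test_url = 'https://example.com/'   # placeholder for the URL that fails
q = queue.Queue()                   # extract_link expects these globals to exist
hyperlinks = set()
website = test_url

check_link(test_url).check(test_url)   # does the page itself respond?
extract_link(test_url)                 # are its links parsed and checked as expected?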
Thanks. It works right out of the box!
Can someone explain each part?
Nice!