Created
November 22, 2023 08:43
-
-
Save karimmuya/c837c611b0432957319d7b1e5d1d5af8 to your computer and use it in GitHub Desktop.
commoncrawl.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import json
import logging
import queue
import threading

import requests
class commonCrawlDataClass():
    """Enumerate the URLs that Common Crawl has recorded for one domain.

    The list of crawl indexes is downloaded once, every index id is placed
    on a queue, and a pool of daemon threads queries each index for
    ``<domain>/*``.  Each URL is printed to stdout the first time it is seen
    and is also collected, in first-seen order, in ``self.domains``.

    Args:
        domain: Domain name to look up, e.g. ``"test.com"``.
        timeout: Seconds to wait for each HTTP request before giving up.
        workers: Number of worker threads started by :meth:`start`.
    """

    # Endpoint that lists the available crawl indexes as a JSON array.
    INDEX_LIST_URL = "https://index.commoncrawl.org/collinfo.json"

    def __init__(self, domain, timeout=60, workers=10):
        # Parsed index list; filled in by getIndexes().
        self.jsonIndexData = ""
        self.domain = domain
        # Unique URLs in the order they were first seen.
        self.domains = []
        # Index ids still waiting to be queried.
        self.q = queue.Queue()
        self.timeout = timeout
        self.workers = workers
        # Set mirror of self.domains so the duplicate check is O(1).
        self._seen = set()
        # Makes "check, record, print" atomic across worker threads.
        self._lock = threading.Lock()
        self._log = logging.getLogger(__name__)

    def getIndexes(self):
        """Download the index list and queue the id of every index.

        Raises:
            requests.RequestException: The list could not be downloaded.
            ValueError: The response body was not valid JSON.
        """
        response = requests.get(self.INDEX_LIST_URL, timeout=self.timeout)
        response.raise_for_status()
        self.jsonIndexData = json.loads(response.text)
        for index in self.jsonIndexData:
            self.q.put(index['id'])

    def getIndexData(self, indexID):
        """Query one index for the domain and record the URLs it returns.

        Lookup failures are logged and otherwise ignored, so a single bad
        index never stops the remaining ones from being processed.

        Args:
            indexID: Id of the index to query, as queued by getIndexes().
        """
        try:
            # Let requests build the query string so the domain is escaped.
            response = requests.get(
                "https://index.commoncrawl.org/{}-index".format(indexID),
                params={"url": self.domain + "/*", "output": "json"},
                timeout=self.timeout,
            )
        except requests.RequestException as exc:
            self._log.warning("Index %s: request failed: %s", indexID, exc)
            return
        if not response.ok:
            # An error reply has no capture records to parse (presumably
            # this is also how "no captures for this domain" is reported).
            self._log.debug(
                "Index %s: HTTP %s, skipped", indexID, response.status_code
            )
            return
        self._recordUrls(response.text)

    def _recordUrls(self, body):
        """Parse a JSON-lines response body and record each new URL.

        Blank lines and lines that are not a JSON object with a string
        ``url`` field are skipped individually rather than aborting the
        whole response.

        Args:
            body: Response text containing one JSON document per line.
        """
        for line in body.splitlines():
            if not line.strip():
                continue
            try:
                url = json.loads(line)['url']
            except (ValueError, KeyError, TypeError):
                self._log.debug("Skipping unparseable record: %r", line)
                continue
            if not isinstance(url, str):
                continue
            with self._lock:
                if url in self._seen:
                    continue
                self._seen.add(url)
                self.domains.append(url)
                # Printed under the lock so lines from different threads
                # do not interleave.
                print(url)

    def worker(self):
        """Process queued index ids forever; intended for a daemon thread."""
        while True:
            indexID = self.q.get()
            try:
                self.getIndexData(indexID)
            except Exception:
                # Thread boundary: log and carry on so the pool keeps
                # draining the queue.
                self._log.exception("Index %s: unexpected failure", indexID)
            finally:
                # Always acknowledge the item, otherwise q.join() in
                # start() would block forever.
                self.q.task_done()

    def start(self):
        """Queue all indexes, start the workers, and wait until done."""
        self.getIndexes()
        for _ in range(self.workers):
            threading.Thread(target=self.worker, daemon=True).start()
        self.q.join()
def parseArgs(argv=None):
    """Parse the command-line options.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The parsed namespace, whose ``domain`` attribute is a non-empty
        string.  A missing or empty domain ends the program with a usage
        error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d", "--domain", required=True, help="Domain Name; EX: test.com"
    )
    args = parser.parse_args(argv)
    if not args.domain:
        parser.error("--domain must not be empty")
    return args


def main(argv=None):
    """Run the Common Crawl lookup for the domain given on the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = parseArgs(argv)
    cc = commonCrawlDataClass(args.domain)
    cc.start()


# Guarded so that importing this module does not parse argv or hit the network.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment