@aaronsdevera
Created October 31, 2020 22:49
Script to gather scans from urlscan.io and save screenshots for ML model training data: https://twitter.com/aaronsdevera/status/1322399067725426690
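# Crawls the urlscan.io live feed, labels each scan with its overall
# malicious verdict, downloads its screenshot into phishy/ or
# not-phishy/, and appends one row per scan to crawler.csv.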
import requests
import json
import csv
import os
import sys
def printLog(msg, kind):
    # kind 0 = info '[+]', kind 1 = warning '[!]', kind 2 = plain progress line
    msgType = '[+]'
    if kind == 1:
        msgType = '[!]'
    if kind == 2:
        msgType = '   '
    print(
        '{msgType} {msg}'.format(
            msgType=msgType,
            msg=msg
        )
    )
printLog('starting up the urlscan crawler...', 0)

apiKey = 'ENTER_API_KEY_HERE'
uuids = []

# urlscan.io API keys are 36-character UUIDs
if apiKey == '' or len(apiKey) != 36:
    printLog('you need an api key from urlscan!', 1)
    printLog('exiting...', 1)
    sys.exit(1)
printLog('making the proper directories', 2)
try:
    # one directory per class label for the screenshot training data
    os.makedirs('phishy')
    os.makedirs('not-phishy')
    printLog('done.', 0)
except FileExistsError:
    printLog('oh you have it already... cool.', 1)
printLog('generating proper http headers...', 2)

# Browser-like headers for the undocumented live-feed JSON endpoint
# behind https://urlscan.io/live/
liveResultsHeaders = {
    'authority': 'urlscan.io',
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://urlscan.io/live/',
    'accept-language': 'en-US,en;q=0.9',
    # NOTE: this session cookie and ETag were captured from a browser
    # session and will likely be stale; the endpoint may work without them
    'cookie': 'sid=s%3AKR6oK-rfPFYWztbTpyIAiUz2dmFST-qM.jS2Eu2lednTeT4ui%2FcPeFTQqGJnZk3WFuEI6hVPMdtY',
    'dnt': '1',
    'sec-gpc': '1',
    'if-none-match': 'W/"4d96-0kI56VAPnWGeH+Ww+5yF12X52bE"',
}
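# Authenticated headers for the documented result API
# (https://urlscan.io/api/v1/result/<uuid>)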
singleScanHeaders = {
    'Content-Type': 'application/json',
    'API-Key': apiKey,
}
printLog('done.',0)
printLog('instrumenting data file as crawler.csv ...',2)
csvFile = csv.writer(open('crawler.csv','a+'))
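# Main loop: poll the live feed, fetch the full result for each new scan,
# save its screenshot into the directory matching the verdict, and append
# a metadata row to crawler.csv. Runs until interrupted.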
while True:
    printLog('starting crawler loop...', 2)
    printLog('getting urlscan live feed...', 2)
    liveResultsResponse = requests.get(
        'https://urlscan.io/json/live/',
        headers=liveResultsHeaders
    )
    urlscanLiveResults = liveResultsResponse.json()['results']
    printLog('done.', 0)
    for scan in urlscanLiveResults:
        uuid = scan['_id']
        if uuid not in uuids:
            url = scan['task']['url']
            res = scan['result']
            ss = scan['screenshot']
            printLog('getting singular scan result...', 2)
            singleScanResponse = requests.get(
                'https://urlscan.io/api/v1/result/{uuid}'.format(
                    uuid=uuid
                ),
                headers=singleScanHeaders
            )
            singleScanResult = singleScanResponse.json()
            #statsMalicious = singleScanResult['stats']['malicious']
            verdictsMalicious = singleScanResult['verdicts']['overall']['malicious']
            printLog('done.', 0)
            printLog('result retrieved for {uuid}.'.format(
                uuid=uuid
            ), 0)
            printLog('url: {url}.'.format(
                url=url
            ), 0)
            printLog('malicious verdict: {verdict}.'.format(
                verdict=verdictsMalicious
            ), 0)
            printLog('saving screenshot to appropriate directory...', 2)
            targetDir = 'not-phishy'
            if verdictsMalicious:
                targetDir = 'phishy'
            # quote the URL so shell metacharacters in it cannot break the command
            os.system(
                'wget -q "{ss}" -P ./{dir}'.format(
                    ss=ss,
                    dir=targetDir
                )
            )
            printLog('done.', 0)
            scanRow = [
                uuid,
                verdictsMalicious,
                url,
                res,
                ss
            ]
            printLog('writing scan to CSV...', 2)
            csvFile.writerow(scanRow)
            uuids.append(uuid)
            printLog('done.', 0)
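After a crawl, the phishy/ and not-phishy/ directories hold the labeled screenshots, and crawler.csv holds one row per scan (uuid, verdict, url, result URL, screenshot URL). A minimal sketch of how that output could be loaded as (filepath, label) training pairs — assuming the screenshots download as .png files, which is the format urlscan.io serves:

import csv
import pathlib

# label 1 = phishy, 0 = not-phishy, matching the crawler's directory names
samples = []
for label, folder in [(1, 'phishy'), (0, 'not-phishy')]:
    for path in pathlib.Path(folder).glob('*.png'):
        samples.append((str(path), label))

# crawler.csv carries the metadata row written for each screenshot
with open('crawler.csv') as f:
    for row in csv.reader(f):
        if not row:
            continue  # skip any blank lines in the CSV
        uuid, malicious, url, result_url, screenshot_url = row
        print(uuid, malicious, url)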