Created
October 31, 2020 22:49
-
-
Save aaronsdevera/14a3148d03aac427e8c22d812e964b15 to your computer and use it in GitHub Desktop.
Script to gather scans from urlscan.io, and save screenshots for ML model training data: https://twitter.com/aaronsdevera/status/1322399067725426690
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import json | |
import csv | |
import os | |
def printLog(msg,kind): | |
msgType = '[+]' | |
if kind == 1: | |
msgType = '[!]' | |
if kind == 2: | |
msgType = ' ' | |
print( | |
'{msgType} {msg}'.format( | |
msgType=msgType, | |
msg=msg | |
) | |
) | |
printLog('starting up the urlscan crawler...',0) | |
apiKey = 'ENTER_API_KEY_HERE' | |
uuids = [] | |
if apiKey == '' or len(apiKey) != 36: | |
printLog('you need an api key from urlscan!',1) | |
printLog('exiting...',1) | |
exit | |
printLog('making the proper directories',2) | |
try: | |
os.makedirs('phishy') | |
os.makedirs('not-phishy') | |
printLog('done.',0) | |
except FileExistsError: | |
printLog('oh you have it already... cool.',1) | |
printLog('generating proper http headers...',2) | |
liveResultsHeaders = { | |
'authority': 'urlscan.io', | |
'accept': 'application/json, text/javascript, */*; q=0.01', | |
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36', | |
'x-requested-with': 'XMLHttpRequest', | |
'sec-fetch-site': 'same-origin', | |
'sec-fetch-mode': 'cors', | |
'sec-fetch-dest': 'empty', | |
'referer': 'https://urlscan.io/live/', | |
'accept-language': 'en-US,en;q=0.9', | |
'cookie': 'sid=s%3AKR6oK-rfPFYWztbTpyIAiUz2dmFST-qM.jS2Eu2lednTeT4ui%2FcPeFTQqGJnZk3WFuEI6hVPMdtY', | |
'dnt': '1', | |
'sec-gpc': '1', | |
'if-none-match': 'W/"4d96-0kI56VAPnWGeH+Ww+5yF12X52bE"', | |
} | |
singleScanHeaders = { | |
'Content-Type': 'application/json', | |
'API-Key': apiKey, | |
} | |
printLog('done.',0) | |
printLog('instrumenting data file as crawler.csv ...',2) | |
csvFile = csv.writer(open('crawler.csv','a+')) | |
while True: | |
printLog('starting crawler loop...',2) | |
printLog('getting urlscan live feed...',2) | |
liveResultsResponse = requests.get( | |
'https://urlscan.io/json/live/', | |
headers=liveResultsHeaders | |
) | |
urlscanLiveResults = liveResultsResponse.json()['results'] | |
printLog('done.',0) | |
for scan in urlscanLiveResults: | |
uuid = scan['_id'] | |
if uuid not in uuids: | |
url = scan['task']['url'] | |
res = scan['result'] | |
ss = scan['screenshot'] | |
printLog('getting singular scan result...',2) | |
singleScanResponse = requests.get( | |
'https://urlscan.io/api/v1/result/{uuid}'.format( | |
uuid=uuid | |
), | |
headers=singleScanHeaders | |
) | |
singleScanResult = singleScanResponse.json() | |
#statsMalicious = singleScanResult['stats']['malicious'] | |
verdictsMalcious = singleScanResult['verdicts']['overall']['malicious'] | |
printLog('done.',0) | |
printLog('result retrieved for {uuid}.'.format( | |
uuid=uuid | |
),0) | |
printLog('url: {url}.'.format( | |
url=url | |
),0) | |
printLog('malicious verdict: {verdict}.'.format( | |
verdict=verdictsMalcious | |
),0) | |
printLog('saving screenshot to appropriate directory...',2) | |
targetDir = 'not-phishy' | |
if verdictsMalcious == True: | |
targetDir = 'phishy' | |
os.system( | |
'wget -q {ss} -P ./{dir}'.format( | |
ss=ss, | |
dir=targetDir | |
) | |
) | |
printLog('done.',0) | |
scanRow = [ | |
uuid, | |
verdictsMalcious, | |
url, | |
res, | |
ss | |
] | |
printLog('writing scan to CSV...',2) | |
csvFile.writerow(scanRow) | |
uuids.append(uuid) | |
printLog('done.',0) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment