@aaronsdevera
Created October 31, 2020 22:49
Script to gather scans from urlscan.io and save screenshots for ML model training data: https://twitter.com/aaronsdevera/status/1322399067725426690
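# Crawls the urlscan.io live feed, labels each scan with its overall
# malicious verdict, downloads its screenshot into phishy/ or
# not-phishy/, and appends one row per scan to crawler.csv.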
import requests
import json
import csv
import os
import sys
def printLog(msg, kind):
    # kind 0 = info '[+]', kind 1 = warning '[!]', kind 2 = plain progress line
    msgType = '[+]'
    if kind == 1:
        msgType = '[!]'
    if kind == 2:
        msgType = '   '
    print(
        '{msgType} {msg}'.format(
            msgType=msgType,
            msg=msg
        )
    )
printLog('starting up the urlscan crawler...', 0)

apiKey = 'ENTER_API_KEY_HERE'
uuids = []

# urlscan.io API keys are 36-character UUIDs
if apiKey == '' or len(apiKey) != 36:
    printLog('you need an api key from urlscan!', 1)
    printLog('exiting...', 1)
    sys.exit(1)
printLog('making the proper directories', 2)
try:
    # one directory per class label for the screenshot training data
    os.makedirs('phishy')
    os.makedirs('not-phishy')
    printLog('done.', 0)
except FileExistsError:
    printLog('oh you have it already... cool.', 1)
printLog('generating proper http headers...', 2)

# Browser-like headers for the undocumented live-feed JSON endpoint
# behind https://urlscan.io/live/
liveResultsHeaders = {
    'authority': 'urlscan.io',
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://urlscan.io/live/',
    'accept-language': 'en-US,en;q=0.9',
    # NOTE: this session cookie and ETag were captured from a browser
    # session and will likely be stale; the endpoint may work without them
    'cookie': 'sid=s%3AKR6oK-rfPFYWztbTpyIAiUz2dmFST-qM.jS2Eu2lednTeT4ui%2FcPeFTQqGJnZk3WFuEI6hVPMdtY',
    'dnt': '1',
    'sec-gpc': '1',
    'if-none-match': 'W/"4d96-0kI56VAPnWGeH+Ww+5yF12X52bE"',
}
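# Authenticated headers for the documented result API
# (https://urlscan.io/api/v1/result/<uuid>)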
singleScanHeaders = {
    'Content-Type': 'application/json',
    'API-Key': apiKey,
}
printLog('done.',0)
printLog('instrumenting data file as crawler.csv ...',2)
csvFile = csv.writer(open('crawler.csv','a+'))
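# Main loop: poll the live feed, fetch the full result for each new scan,
# save its screenshot into the directory matching the verdict, and append
# a metadata row to crawler.csv. Runs until interrupted.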
while True:
    printLog('starting crawler loop...', 2)
    printLog('getting urlscan live feed...', 2)
    liveResultsResponse = requests.get(
        'https://urlscan.io/json/live/',
        headers=liveResultsHeaders
    )
    urlscanLiveResults = liveResultsResponse.json()['results']
    printLog('done.', 0)
    for scan in urlscanLiveResults:
        uuid = scan['_id']
        if uuid not in uuids:
            url = scan['task']['url']
            res = scan['result']
            ss = scan['screenshot']
            printLog('getting singular scan result...', 2)
            singleScanResponse = requests.get(
                'https://urlscan.io/api/v1/result/{uuid}'.format(
                    uuid=uuid
                ),
                headers=singleScanHeaders
            )
            singleScanResult = singleScanResponse.json()
            #statsMalicious = singleScanResult['stats']['malicious']
            verdictsMalicious = singleScanResult['verdicts']['overall']['malicious']
            printLog('done.', 0)
            printLog('result retrieved for {uuid}.'.format(
                uuid=uuid
            ), 0)
            printLog('url: {url}.'.format(
                url=url
            ), 0)
            printLog('malicious verdict: {verdict}.'.format(
                verdict=verdictsMalicious
            ), 0)
            printLog('saving screenshot to appropriate directory...', 2)
            targetDir = 'not-phishy'
            if verdictsMalicious:
                targetDir = 'phishy'
            # quote the URL so shell metacharacters in it cannot break the command
            os.system(
                'wget -q "{ss}" -P ./{dir}'.format(
                    ss=ss,
                    dir=targetDir
                )
            )
            printLog('done.', 0)
            scanRow = [
                uuid,
                verdictsMalicious,
                url,
                res,
                ss
            ]
            printLog('writing scan to CSV...', 2)
            csvFile.writerow(scanRow)
            uuids.append(uuid)
            printLog('done.', 0)
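After a crawl, the phishy/ and not-phishy/ directories hold the labeled screenshots, and crawler.csv holds one row per scan (uuid, verdict, url, result URL, screenshot URL). A minimal sketch of how that output could be loaded as (filepath, label) training pairs — assuming the screenshots download as .png files, which is the format urlscan.io serves:

import csv
import pathlib

# label 1 = phishy, 0 = not-phishy, matching the crawler's directory names
samples = []
for label, folder in [(1, 'phishy'), (0, 'not-phishy')]:
    for path in pathlib.Path(folder).glob('*.png'):
        samples.append((str(path), label))

# crawler.csv carries the metadata row written for each screenshot
with open('crawler.csv') as f:
    for row in csv.reader(f):
        if not row:
            continue  # skip any blank lines in the CSV
        uuid, malicious, url, result_url, screenshot_url = row
        print(uuid, malicious, url)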