Created
October 31, 2020 23:45
-
-
Save aaronsdevera/39dcda7025e76eee1ee01f4bc3986329 to your computer and use it in GitHub Desktop.
Given the UUID of a URLScan scan, this script saves the scan's screenshot as training data for an ML model: https://twitter.com/aaronsdevera/status/1322399067725426690
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import json
import os
import subprocess
import sys

import requests
def printLog(msg, kind=0):
    """Print ``msg`` to stdout behind a short status marker.

    Args:
        msg: Text to print after the marker.
        kind: Marker selector. ``1`` prints ``[!]``, ``2`` prints a blank
            marker, and any other value (including the default ``0``)
            prints ``[+]``.
    """
    if kind == 1:
        msgType = '[!]'
    elif kind == 2:
        msgType = ' '
    else:
        msgType = '[+]'
    print(
        '{msgType} {msg}'.format(
            msgType=msgType,
            msg=msg
        )
    )
printLog('starting up the urlscan crawler...',0)

# urlscan API key; replace the placeholder before running the script.
apiKey = 'ENTER_API_KEY_HERE'
# Collects the UUIDs processed during this run.
uuids = []

# Only a 36-character key is accepted; that also rejects the empty string
# and the unedited placeholder above.
if len(apiKey) != 36:
    printLog('you need an api key from urlscan!',1)
    printLog('exiting...',1)
    # A bare `exit` is just a name lookup and does not stop the script;
    # terminate explicitly with a non-zero status.
    sys.exit(1)
printLog('making the proper directories',2)
# Screenshots are sorted into one of the two label subdirectories.
_trainingDirs = ('training', 'training/phishy', 'training/not-phishy')
if all(os.path.isdir(_path) for _path in _trainingDirs):
    printLog('oh you have it already... cool.',1)
else:
    # Create each directory independently so that a pre-existing
    # `training` folder no longer prevents the label subdirectories
    # from being created.
    for _path in _trainingDirs:
        os.makedirs(_path, exist_ok=True)
    printLog('done.',0)
printLog('generating proper http headers...',2)
# Headers sent with the urlscan result request; the key travels in `API-Key`.
singleScanHeaders = {
    'Content-Type': 'application/json',
    'API-Key': apiKey,
}
printLog('done.',0)
printLog('instrumenting data file as crawler.csv ...',2)
# crawler.csv is opened only at the moment the row is appended (see the
# end of this section) so the handle is always closed.

# Fail with a usage hint instead of an IndexError when no UUID is given.
if len(sys.argv) < 2:
    printLog('usage: pass the urlscan result UUID as the first argument',1)
    printLog('exiting...',1)
    sys.exit(1)
uuid = sys.argv[1]

printLog('getting singular scan result...',2)
singleScanResponse = requests.get(
    'https://urlscan.io/api/v1/result/{uuid}'.format(
        uuid=uuid
    ),
    headers=singleScanHeaders,
    # Without a timeout a stalled connection would hang the script forever.
    timeout=30
)
# Surface HTTP errors (unknown UUID, bad key, rate limit) here rather than
# as a confusing KeyError on the fields read below.
singleScanResponse.raise_for_status()
singleScanResult = singleScanResponse.json()

# The verdict lookup is disabled and every scan is labelled malicious.
# NOTE(review): confirm this hard-coded label is still intended.
#verdictsMalcious = singleScanResult['verdicts']['overall']['malicious']
verdictsMalcious = True
url = singleScanResult['task']['url']
res = singleScanResult['task']['reportURL']
ss = singleScanResult['task']['screenshotURL']
printLog('done.',0)

printLog('result retrieved for {uuid}.'.format(
    uuid=uuid
),0)
printLog('url: {url}.'.format(
    url=url
),0)
printLog('malicious verdict: {verdict}.'.format(
    verdict=verdictsMalcious
),0)

printLog('saving screenshot to appropriate directory...',2)
targetDir = 'training/phishy' if verdictsMalcious else 'training/not-phishy'
# Run wget with an argument list (no shell) so a screenshot URL taken from
# the API response cannot inject shell commands; `--` stops a URL starting
# with `-` from being parsed as an option.
try:
    wgetStatus = subprocess.run(
        ['wget', '-q', '-P', './{dir}'.format(dir=targetDir), '--', ss]
    ).returncode
except OSError as err:
    # wget missing or not executable: report it, but still record the scan.
    printLog('could not run wget: {err}'.format(err=err),1)
else:
    if wgetStatus == 0:
        printLog('done.',0)
    else:
        printLog('wget exited with status {status}.'.format(
            status=wgetStatus
        ),1)

scanRow = [
    uuid,
    verdictsMalcious,
    url,
    res,
    ss
]
printLog('writing scan to CSV...',2)
# newline='' is what the csv module expects for files it writes; the
# `with` block guarantees the row is flushed and the file closed.
with open('crawler.csv', 'a', newline='') as csvHandle:
    csv.writer(csvHandle).writerow(scanRow)
uuids.append(uuid)
printLog('done.',0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment