Skip to content

Instantly share code, notes, and snippets.

@aaronsdevera
Created October 31, 2020 23:45
Show Gist options
  • Save aaronsdevera/39dcda7025e76eee1ee01f4bc3986329 to your computer and use it in GitHub Desktop.
Save aaronsdevera/39dcda7025e76eee1ee01f4bc3986329 to your computer and use it in GitHub Desktop.
Given the UUID of a URLScan upload, this script saves its screenshot as ML model training data: https://twitter.com/aaronsdevera/status/1322399067725426690
import csv
import json
import os
import subprocess
import sys

import requests
def printLog(msg, kind=0):
    """Print *msg* to stdout behind a short marker chosen by *kind*.

    Args:
        msg: Text to print after the marker.
        kind: 1 selects the '[!]' marker, 2 selects a single-space
            marker, and any other value (including the default 0)
            selects '[+]'.
    """
    if kind == 1:
        msgType = '[!]'
    elif kind == 2:
        msgType = ' '
    else:
        msgType = '[+]'
    print('{msgType} {msg}'.format(msgType=msgType, msg=msg))
# Script body: fetch one urlscan result by UUID (first command-line
# argument), download its screenshot into the training directory tree,
# and append a row describing the scan to crawler.csv.
printLog('starting up the urlscan crawler...',0)

apiKey = 'ENTER_API_KEY_HERE'
uuids = []

# The key must be non-empty and exactly 36 characters long.
if apiKey == '' or len(apiKey) != 36:
    printLog('you need an api key from urlscan!',1)
    printLog('exiting...',1)
    # A bare `exit` only names the builtin without calling it, so the
    # script used to carry on with an unusable key.
    sys.exit(1)

# Fail early with a readable message instead of an IndexError later on.
if len(sys.argv) < 2:
    printLog('usage: pass the urlscan result UUID as the first argument',1)
    printLog('exiting...',1)
    sys.exit(1)
uuid = sys.argv[1]

printLog('making the proper directories',2)
trainingDirs = ('training', 'training/phishy', 'training/not-phishy')
dirsAlreadyExist = all(os.path.isdir(path) for path in trainingDirs)
# Create each directory independently: a pre-existing 'training' must not
# prevent the two sub-directories from being created.
for trainingDir in trainingDirs:
    os.makedirs(trainingDir, exist_ok=True)
if dirsAlreadyExist:
    printLog('oh you have it already... cool.',1)
else:
    printLog('done.',0)

printLog('generating proper http headers...',2)
singleScanHeaders = {
    'Content-Type': 'application/json',
    'API-Key': apiKey,
}
printLog('done.',0)

printLog('instrumenting data file as crawler.csv ...',2)
# newline='' lets the csv module control line endings; the with-block
# guarantees the handle is flushed and closed, including on sys.exit().
with open('crawler.csv', 'a+', newline='') as csvHandle:
    csvFile = csv.writer(csvHandle)

    printLog('getting singular scan result...',2)
    singleScanResponse = requests.get(
        'https://urlscan.io/api/v1/result/{uuid}'.format(
            uuid=uuid
        ),
        headers=singleScanHeaders,
        timeout=30
    )
    # Stop on HTTP errors rather than failing later on a missing 'task' key.
    if not singleScanResponse.ok:
        printLog('urlscan returned HTTP {status} for {uuid}'.format(
            status=singleScanResponse.status_code,
            uuid=uuid
        ),1)
        printLog('exiting...',1)
        sys.exit(1)
    singleScanResult = singleScanResponse.json()

    # NOTE(review): the verdict is hard-coded to True, so every screenshot
    # is filed under training/phishy. The commented-out lookup below is
    # kept for whoever wants to use the verdict from the scan result.
    #verdictsMalcious = singleScanResult['verdicts']['overall']['malicious']
    verdictsMalcious = True
    url = singleScanResult['task']['url']
    res = singleScanResult['task']['reportURL']
    ss = singleScanResult['task']['screenshotURL']
    printLog('done.',0)

    printLog('result retrieved for {uuid}.'.format(
        uuid=uuid
    ),0)
    printLog('url: {url}.'.format(
        url=url
    ),0)
    printLog('malicious verdict: {verdict}.'.format(
        verdict=verdictsMalcious
    ),0)

    printLog('saving screenshot to appropriate directory...',2)
    targetDir = 'training/phishy' if verdictsMalcious else 'training/not-phishy'
    # The screenshot URL comes from the remote response, so pass it to wget
    # as a discrete argument (no shell); '--' ends option parsing.
    try:
        wgetStatus = subprocess.run(
            [
                'wget',
                '-q',
                '-P',
                './{dir}'.format(dir=targetDir),
                '--',
                ss,
            ],
            check=False
        ).returncode
    except OSError as err:
        # e.g. wget is not installed; keep going so the CSV row is still
        # recorded, as the os.system() version did.
        printLog('could not run wget: {err}'.format(err=err),1)
    else:
        if wgetStatus == 0:
            printLog('done.',0)
        else:
            printLog('wget exited with status {status}'.format(
                status=wgetStatus
            ),1)

    scanRow = [
        uuid,
        verdictsMalcious,
        url,
        res,
        ss
    ]
    printLog('writing scan to CSV...',2)
    csvFile.writerow(scanRow)
    uuids.append(uuid)
    printLog('done.',0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment