Created
July 28, 2021 03:27
-
-
Save brevityinmotion/25f20271e682a67e90022eef72438a16 to your computer and use it in GitHub Desktop.
Lambda function to parse and normalize scope data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ast | |
import re | |
import urllib.request, json | |
from urllib.parse import urlparse | |
def parseScopeIn(scopeIn):
    """Collect the target strings from a program's in-scope entries.

    Args:
        scopeIn: A list of dicts, one per scope entry. A single dict, or the
            string form of such a list (e.g. "[{'target': 'a.com'}]"), is
            also accepted.

    Returns:
        A list holding, for each entry in order, its 'target' value followed
        by its 'asset_identifier' value; keys that are missing or None are
        skipped. An empty list is returned when scopeIn is empty or None.

    Raises:
        ValueError, SyntaxError: If scopeIn is a string that is not a valid
            Python literal.
    """
    targetData = []
    if not scopeIn:
        return targetData

    entries = scopeIn
    if isinstance(entries, str):
        # Earlier versions parsed the repr of the input, so a stringified
        # list of entries is still accepted here.
        entries = ast.literal_eval(entries)
    if isinstance(entries, dict):
        entries = [entries]

    # Read the entries directly instead of round-tripping them through
    # str()/ast.literal_eval(), which fails on values whose repr is not a
    # Python literal (for example float('nan')).
    for item in entries:
        for key in ('target', 'asset_identifier'):
            value = item.get(key)
            if value is not None:
                targetData.append(value)
    return targetData
def parseScopeOut(scopeOut):
    """Collect the target strings from a program's out-of-scope entries.

    Args:
        scopeOut: A list of dicts, one per scope entry. A single dict, or the
            string form of such a list, is also accepted.

    Returns:
        A list holding, for each entry in order, its 'target' value followed
        by its 'asset_identifier' value; keys that are missing or None are
        skipped. When scopeOut is empty or None the list is ['icicles.io'].

    Raises:
        ValueError, SyntaxError: If scopeOut is a string that is not a valid
            Python literal.
    """
    targetData = []
    if not scopeOut:
        # NOTE(review): presumably a placeholder so the out-of-scope list is
        # never empty for downstream consumers -- confirm against callers.
        targetData.append('icicles.io')
        return targetData

    entries = scopeOut
    if isinstance(entries, str):
        # Earlier versions parsed the repr of the input, so a stringified
        # list of entries is still accepted here.
        entries = ast.literal_eval(entries)
    if isinstance(entries, dict):
        entries = [entries]

    # Read the entries directly instead of round-tripping them through
    # str()/ast.literal_eval(), which fails on values whose repr is not a
    # Python literal (for example float('nan')).
    for item in entries:
        for key in ('target', 'asset_identifier'):
            value = item.get(key)
            if value is not None:
                targetData.append(value)
    return targetData
def parseProgramUrl(programUrl):
    """Derive a program name from the last path segment of its URL.

    Args:
        programUrl: URL string of the program page.

    Returns:
        The text after the final '/' with every non-alphanumeric character
        removed.
    """
    # Drop trailing slashes first; otherwise 'https://host/name/' would
    # produce an empty program name.
    lastSegment = programUrl.rstrip('/').rsplit('/', 1)[-1]
    return ''.join(ch for ch in lastSegment if ch.isalnum())
def parseProgramName(programName):
    """Return programName with all non-alphanumeric characters removed."""
    keptChars = [ch for ch in programName if ch.isalnum()]
    return ''.join(keptChars)
def cleanScope(sub, test_str):
    """Report whether any element of sub occurs inside test_str.

    Returns 1 as soon as one element of sub is found in test_str, and 0 when
    none is found (including when sub is empty).
    """
    return 1 if any(candidate in test_str for candidate in sub) else 0
def cleanupScopeGithub(dfIn):
    """Extract the http(s) URLs that mention github.com.

    Args:
        dfIn: Iterable of scope strings.

    Returns:
        A list with, for each entry that contains an http:// or https:// URL,
        the first such URL (up to the next whitespace), kept only when the
        URL contains 'github.com'. Entries without a URL are skipped.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape
    # sequence. Compiled once instead of on every iteration.
    urlPattern = re.compile(r"(?P<url>https?://[^\s]+)")
    matches = []
    for entry in dfIn:
        found = urlPattern.search(entry)
        if found is None:
            continue
        url = found.group('url')
        if 'github.com' in url:
            matches.append(url)
    return matches
def cleanupScopeStrict(dfIn):
    """Extract the http(s) URLs that do not mention github.com.

    Args:
        dfIn: Iterable of scope strings.

    Returns:
        A list with, for each entry that contains an http:// or https:// URL,
        the first such URL (up to the next whitespace), kept only when the
        URL does not contain 'github.com'. Entries without a URL are skipped.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape
    # sequence. Compiled once instead of on every iteration.
    urlPattern = re.compile(r"(?P<url>https?://[^\s]+)")
    matches = []
    for entry in dfIn:
        found = urlPattern.search(entry)
        if found is None:
            continue
        url = found.group('url')
        if 'github.com' not in url:
            matches.append(url)
    return matches
def cleanupScopeWild(dfIn):
    """Extract wildcard targets (text starting at a '*') from scope strings.

    Args:
        dfIn: Iterable of scope strings.

    Returns:
        A list with, for each entry containing a '*', the text from the first
        '*' that is followed by at least one allowed character, up to the
        next whitespace, '|' or ','. Entries without such a match are
        skipped.
    """
    # Raw string avoids invalid escape sequences; inside the character class
    # '|' and ',' are literal terminators.
    wildPattern = re.compile(r"(?P<url>[*][^\s|\,]+)")
    matches = []
    for entry in dfIn:
        found = wildPattern.search(entry)
        if found is not None:
            matches.append(found.group('url'))
    return matches
def cleanupScopeIP(dfIn):
    """Extract dotted-quad IPv4 addresses, with optional CIDR suffix.

    Args:
        dfIn: Iterable of scope strings.

    Returns:
        A list with the first IPv4-looking token of each entry that has one,
        including a trailing '/NN' prefix length when present. Entries with
        no such token are skipped. Octet values are not range-checked, so
        '999.1.1.1' is also returned.
    """
    # Four groups of 1-3 digits, optionally followed by '/<1-2 digits>'.
    ipPattern = re.compile(
        r"(?P<url>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?:/\d{1,2}|))"
    )
    matches = []
    for entry in dfIn:
        found = ipPattern.search(entry)
        if found is not None:
            matches.append(found.group('url'))
    return matches
def cleanupScopeGeneral(dfIn):
    """Reduce scope strings to bare host/domain names.

    Each entry is classified by the first rule that applies:

    1. contains 'github.com' or a space  -> skipped
    2. contains 'http' and parses to a URL with a network location
       -> the network location (netloc) is kept
    3. contains an IPv4-looking token    -> skipped
    4. contains a '*' wildcard token     -> kept with every '*.' removed
    5. contains a '.' followed by text   -> kept unchanged
    6. anything else                     -> skipped

    Args:
        dfIn: Iterable of scope strings.

    Returns:
        A list of the kept values, in input order.
    """
    wildPattern = re.compile(r"(?P<url>[*][^\s|\,]+)")
    dotPattern = re.compile(r"(?P<url>[.][^\s]+)")
    ipPattern = re.compile(
        r"(?P<url>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?:/\d{1,2}|))"
    )
    matches = []
    for entry in dfIn:
        if 'github.com' in entry or ' ' in entry:
            continue
        if 'http' in entry:
            netloc = urlparse(entry).netloc
            if netloc:
                matches.append(netloc)
                continue
            # No netloc means 'http' was only part of a name (for example
            # 'httpbin.org'), not a URL scheme. Fall through to the domain
            # rules instead of recording an empty string.
        if ipPattern.search(entry) is not None:
            continue
        if wildPattern.search(entry) is not None:
            matches.append(entry.replace('*.', ''))
        elif dotPattern.search(entry) is not None:
            matches.append(entry)
    return matches
def extrapolateScope(programName, listscopein, listscopeout):
    """Run every cleanup filter over the in-scope and out-of-scope lists.

    Args:
        programName: Not used by this function.
        listscopein: In-scope strings passed to each cleanup function.
        listscopeout: Out-of-scope strings passed to each cleanup function.

    Returns:
        A 10-tuple: the strict-URL, GitHub, wildcard, general and IP results
        for listscopein, followed by the same five results for listscopeout.
    """
    cleaners = (
        cleanupScopeStrict,
        cleanupScopeGithub,
        cleanupScopeWild,
        cleanupScopeGeneral,
        cleanupScopeIP,
    )
    # Outer loop over the two lists keeps all in-scope results ahead of the
    # out-of-scope ones, in the same per-filter order.
    return tuple(
        cleaner(scopeList)
        for scopeList in (listscopein, listscopeout)
        for cleaner in cleaners
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment