Skip to content

Instantly share code, notes, and snippets.

@brevityinmotion
Created June 9, 2021 03:55
Show Gist options
  • Save brevityinmotion/255b5b11a1a880ebfab1ef00906d348c to your computer and use it in GitHub Desktop.
Save brevityinmotion/255b5b11a1a880ebfab1ef00906d348c to your computer and use it in GitHub Desktop.
Function to generate and upload a bash script that runs GoSpider and then processes the output
def generateScriptGoSpider(programName, inputBucketName):
    """Generate the GoSpider crawl script for a program and upload it.

    The rendered bash script:
      1. creates the refined output directories for the program,
      2. runs ``gospider`` against ``{programName}-urls-base.csv``,
      3. extracts base URLs (``-urls-min.txt``) and full URLs
         (``-urls-max.txt``) from the combined crawl output,
      4. writes one ``urls-simple-<file>.txt`` per crawl output file,
      5. invokes the program's ``sync-{programName}.sh`` script.

    The script is uploaded as ``run/{programName}/crawl-{programName}.sh``
    through ``brevitycore.upload_object``.

    Args:
        programName: Program identifier. It is interpolated verbatim into
            shell commands and paths, so the caller must pass a shell-safe
            value (no spaces or shell metacharacters).
        inputBucketName: Name of the destination bucket handed to
            ``brevitycore.upload_object``.

    Returns:
        Whatever ``brevitycore.upload_object`` returns for the upload.
    """
    # ``mkdir -p`` keeps the script re-runnable (directories may already
    # exist) and creates missing parent directories on a fresh host.
    # ``${{f##*/}}`` renders as the shell expansion ``${f##*/}`` (basename).
    fileContents = f"""#!/bin/bash
# Run custom goSpider script
export HOME=/root
export PATH=/root/go/bin:$PATH
mkdir -p $HOME/security/refined/{programName}
mkdir -p $HOME/security/refined/{programName}/crawl
gospider -S $HOME/security/inputs/{programName}/{programName}-urls-base.csv -o $HOME/security/raw/{programName}/crawl -u web -t 1 -c 5 -d 1 --js --sitemap --robots --other-source --include-subs --include-other-source
cd $HOME/security/raw/{programName}/
# Generate list of base urls
cat crawl/* | grep -Eo '(http|https)://[^/?:&"]+' | anew > $HOME/security/refined/{programName}/{programName}-urls-min.txt
sleep 20
# Generate list of full urls
cat crawl/* | grep -Eo '(http|https)://[^*]+' | anew > $HOME/security/refined/{programName}/{programName}-urls-max.txt
sleep 20
FILES=$HOME/security/raw/{programName}/crawl/*
for f in $FILES
do
cat $f | grep -Eo '(http|https)://[^/?:&"]+' | anew > $HOME/security/refined/{programName}/crawl/urls-simple-${{f##*/}}.txt
done
sleep 10
sh $HOME/security/run/{programName}/sync-{programName}.sh"""

    # Upload file to S3
    object_name = f'crawl-{programName}.sh'
    object_path = f'run/{programName}/{object_name}'

    # The context manager closes the buffer even if the upload raises.
    with io.BytesIO(fileContents.encode()) as objectBuffer:
        status = brevitycore.upload_object(objectBuffer, inputBucketName, object_path)
    return status
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment