Skip to content

Instantly share code, notes, and snippets.

@brevityinmotion
Created August 3, 2021 02:48
Show Gist options
  • Save brevityinmotion/7a72aa276a9642ae0e6ccbc83deb7d2b to your computer and use it in GitHub Desktop.
Save brevityinmotion/7a72aa276a9642ae0e6ccbc83deb7d2b to your computer and use it in GitHub Desktop.
Function that adds the `program`, `domain`, and `baseurl` fields to raw httpx JSON-lines output and merges the result into the program's existing httpx record.
def processHttpx(programName, refinedBucketPath, inputBucketPath, presentationBucketPath, operationName, programInputBucketPath):
    """Enrich raw httpx JSON-lines output and merge it into the program's httpx record.

    Reads ``<presentationBucketPath>httpx-json/<programName>-httpx-<operationName>.json``
    (one JSON object per line), tags every row with ``program``, derives
    ``domain`` and ``baseurl`` from the ``url`` column, and merges the rows into
    ``<presentationBucketPath>httpx/<programName>-httpx.json``. When a URL is
    already present in that file, the newly read row replaces the stored one.

    Files written:
      * ``<programInputBucketPath><programName>/<programName>-httpx.csv``
        (only when ``operationName == 'initial'``)
      * ``<programInputBucketPath><programName>/<programName>-urls-base.csv``
        (de-duplicated URLs, one per line)
      * ``<presentationBucketPath>httpx/<programName>-httpx.json``
        (merged records, JSON lines)
      * ``<inputBucketPath><programName>/<programName>-httpx.json``
        (merged URL list, one per line; only when ``operationName == 'initial'``)

    Args:
        programName: Program identifier; used in every file name and stored
            in the ``program`` column.
        refinedBucketPath: Not used by this function; kept so existing callers
            keep working.
        inputBucketPath: Path prefix for the URL list written on 'initial' runs.
        presentationBucketPath: Path prefix holding the raw ``httpx-json/``
            input and the merged ``httpx/`` output.
        operationName: Operation label that selects the input file; the value
            'initial' enables the extra exports.
        programInputBucketPath: Path prefix for the per-program CSV exports.

    Returns:
        The string ``'Success'``.

    Raises:
        KeyError: If the input has no ``url`` column.
        Errors raised by pandas while reading the input file or writing any
        output are propagated unchanged.
    """
    # Local import: only this function needs urlparse.
    from urllib.parse import urlparse

    def _parseUrlRoot(urlvalue):
        # Host (and port, if any) portion of the URL.
        return urlparse(urlvalue).netloc

    def _parseUrlBase(urlvalue):
        # scheme://host/path -- params, query string and fragment are dropped.
        parts = urlparse(urlvalue)
        return parts.scheme + '://' + parts.netloc + parts.path

    fileName = programName + '-httpx-' + operationName + '.json'
    presentationFilePath = presentationBucketPath + 'httpx-json/' + fileName
    df = pd.read_json(presentationFilePath, lines=True)
    df['program'] = programName

    programInputPrefix = programInputBucketPath + programName + '/' + programName

    if operationName == 'initial':
        # sep='\n' puts every field of every row on its own line.
        df.to_csv(programInputPrefix + '-httpx.csv', header=False, index=False, sep='\n')

    # NOTE(review): the base-URL export is assumed to run for every operation,
    # not only 'initial' -- confirm against the deployed version.
    dfUrls = df.drop_duplicates(subset=['url'])
    dfUrls['url'].to_csv(programInputPrefix + '-urls-base.csv', header=False, index=False, sep='\n')

    df['domain'] = df['url'].apply(_parseUrlRoot)
    df['baseurl'] = df['url'].apply(_parseUrlBase)

    fileOutputName = programName + '-httpx.json'
    outputPath = presentationBucketPath + 'httpx/' + fileOutputName

    # Merge with any existing output so earlier results are not overwritten.
    # Only the read is guarded: a missing or unparsable file means "no prior
    # output", while a failure in the merge itself must not be mistaken for
    # that (it would silently discard the stored records).
    try:
        dfInitialHttpx = pd.read_json(outputPath, lines=True)
    except (OSError, ValueError):
        print('No initial httpx output')
    else:
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        df = pd.concat([dfInitialHttpx, df])
        # keep='last' lets the freshly read row win over the stored one.
        df = df.drop_duplicates(subset=['url'], keep='last')

    df.to_json(outputPath, orient='records', lines=True)

    if operationName == 'initial':
        # NOTE(review): this writes a plain URL list under the '.json' output
        # name; '<programName>-httpx-crawl.csv' looks like the intended target.
        # Confirm before changing, since downstream steps may read this path.
        storePathUrl = inputBucketPath + programName + '/' + fileOutputName
        df['url'].to_csv(storePathUrl, header=False, index=False, sep='\n')

    return 'Success'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment