@nmilford
Last active June 6, 2018 18:05
Sample script to build a report from PerformLine API and Export
#!/usr/bin/env python
import performline.embedded.stdlib.clients.rest.exceptions
from performline.client import Client
from glob import iglob
import argparse
import json
import sys


class PlineExportReport(object):
    def __init__(self, token=None, path='.'):
        """
        The PerformLine Compliance API can provide most of the page data
        available in the UI. You can install the client via the public PyPI
        repository: `pip install -U performline`
        see: https://github.com/PerformLine/python-performline-client

        Initialize a connection to PerformLine's API.
        Load WHOIS data from file.
        Load page_ids from glob.
        """
        self.path = path
        self.report_dict = {}
        self.report_file = "{}/report.json".format(self.path)
        self.whois_json = "{}/whois_export.json".format(self.path)
        if token:
            self.performline = Client(token)
        else:
            print "No API token provided."
            sys.exit(-1)
        try:
            with open(self.whois_json, 'r') as f:
                self.whois_data = json.load(f)
        except (IOError, ValueError):
            print "Problem opening whois data at {}.".format(self.whois_json)
            sys.exit(-1)
        self.__get_page_ids_from_files()

    def __get_page_ids_from_files(self):
        """
        A page_id is the primary identifier for any page object. The source
        code export functionality in PerformLine will give you a zip file
        with the page source files named $page_id.html.
        Below we glob for anything in the path that ends in *.html, then strip
        the leading path and the trailing file extension to determine all of
        the page_ids in this export, returning them as a list of ints.
        """
        self.page_ids = []
        html_glob = iglob("{}/*.html".format(self.path))
        for page in html_glob:
            self.page_ids.append(int(page.split('/')[-1].split('.')[0]))
        return self.page_ids

    def __get_whois_data(self, page_id):
        """
        The whois data provided as part of the export is a series of domains
        and their related whois output, with an array of associated page_ids.
        Below we iterate through the domains, checking whether the given
        page_id is associated with each one, and return the matching whois
        data.
        """
        for domain in self.whois_data.keys():
            if page_id in self.whois_data[domain]['page_ids']:
                return self.whois_data[domain]['whois']
        return "No Whois"

    def __read_source_from_file(self, page_id):
        """
        Reads the source code into a string from disk.
        """
        try:
            with open("{}/{}.html".format(self.path, page_id), 'r') as f:
                return f.read()
        except IOError:
            print "Problem reading HTML source file {}.html".format(page_id)
            return "No Content"

    def build(self):
        """
        Build a dict of data from the API, the whois JSON and the source.
        Write it to a file.
        """
        for page_id in self.page_ids:
            print "Preparing report entry for {}".format(page_id)
            report_entry = {}
            try:
"""
# You can explore available attributes thus:
In [1]: from performline.client import Client
In [2]: c = Client('tokenca6e5897e27d1b43906469134b1c3eb0424')
In [3]: c.webpages(id=3480374)
Out[3]:
{
"LastScoredAt": "2018-06-05T10:46:53.693483-04:00",
"CompanyId": 501,
"Url": "http://www.guidetoonlineschools.com/online-schools?lvl=8",
"CampaignId": 4330,
"TrafficSourceId": 7670,
"BrandId": 458,
"Score": 10,
"Type": "web",
"Id": 3480374,
"CreatedAt": "2015-06-16T16:46:06.824781-04:00"
}
In [4]: c.webpages(id=3480374).LastScoredAt
Out[4]: u'2018-06-05T10:46:53.693483-04:00'
In [5]: c.webpages(id=3480374).Url
Out[5]: u'http://www.guidetoonlineschools.com/online-schools?lvl=8'
In [6]: c.webpages(id=3480374).Score
Out[6]: 10
"""
                page = self.performline.webpages(id=page_id)
                report_entry['score'] = page.score
                report_entry['url'] = page.url
            except performline.embedded.stdlib.clients.rest.exceptions.NotFound:
                print "ERROR: {} was not found.".format(page_id)
            report_entry['whois'] = self.__get_whois_data(page_id)
            report_entry['source'] = self.__read_source_from_file(page_id)
            self.report_dict[page_id] = report_entry
        try:
            print "Writing report to {}".format(self.report_file)
            with open(self.report_file, 'w') as f:
                json.dump(self.report_dict, f)
        except IOError:
            print "ERROR: Could not write report to file."


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Build a report from PerformLine API and Export.'
    )
    parser.add_argument('-t', '--token', action='store', dest='token',
                        required=True, help='PerformLine API Token')
    parser.add_argument('-p', '--path', action='store', dest='path',
                        required=True, help='Path of extracted export contents.')
    args = parser.parse_args()
    if len(sys.argv[1:]) == 0:
        parser.print_help()
    else:
        report = PlineExportReport(args.token, args.path)
        report.build()
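
# Example invocation (the script filename below is illustrative; the gist's
# actual filename is not shown):
#   python performline_report.py --token $PERFORMLINE_API_TOKEN --path ./export
#
# The resulting report.json holds one entry per page_id with 'score', 'url',
# 'whois' and 'source' fields, as assembled in build() above ('score' and
# 'url' are omitted for pages the API reports as NotFound).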