Sample script to build a report from PerformLine API and Export
#!/usr/bin/env python
import performline.embedded.stdlib.clients.rest.exceptions
from performline.client import Client
from glob import iglob
import argparse
import json
import sys


class PlineExportReport(object):
    def __init__(self, token=None, path='.'):
        """
        The PerformLine Compliance API can provide most of the page data
        available in the UI. You can install the client via the public PyPI
        repository: `pip install -U performline`
        see: https://github.com/PerformLine/python-performline-client

        Initialize a connection to PerformLine's API.
        Load WHOIS data from file.
        Load page_ids from glob.
        """
        self.path = path
        self.report_dict = {}
        self.report_file = "{}/report.json".format(self.path)
        self.whois_json = "{}/whois_export.json".format(self.path)

        if token:
            self.performline = Client(token)
        else:
            print "No API token provided."
            sys.exit(-1)

        try:
            with open(self.whois_json, 'r') as f:
                self.whois_data = json.load(f)
        except (IOError, ValueError):
            print "Problem opening up whois data at {}.".format(self.whois_json)
            sys.exit(-1)

        self.__get_page_ids_from_files()

    def __get_page_ids_from_files(self):
        """
        A page_id is the primary identifier for any page object. The source
        code export functionality in PerformLine will give you a zip file
        with the page source files in the format of $page_id.html.

        Below we just glob for anything in the path that ends in *.html and
        strip the leading path and the trailing file extension to determine
        all of the page_ids in this export, returning them in a list of ints.
        """
        self.page_ids = []
        html_glob = iglob("{}/*.html".format(self.path))
        for page in html_glob:
            self.page_ids.append(int(page.split('/')[-1].split('.')[0]))
        return self.page_ids

    def __get_whois_data(self, page_id):
        """
        The whois data provided as part of the export is a series of domains
        and their related whois output, each with an array of associated
        page_ids.

        Below we just iterate through the domains, checking whether the given
        page_id is associated with each one, and return the matching whois data.
        """
        for domain in self.whois_data.keys():
            if page_id in self.whois_data[domain]['page_ids']:
                return self.whois_data[domain]['whois']
        return "No Whois"

    def __read_source_from_file(self, page_id):
        """
        Reads the source code into a string from disk.
        """
        try:
            with open("{}/{}.html".format(self.path, page_id), 'r') as f:
                return f.read()
        except IOError:
            print "Problem reading HTML source file {}.html".format(page_id)
            return "No Content"

    def build(self):
        """
        Build a dict of data from the API, the whois JSON, and the page
        source, then write it to a file.
        """
        for page_id in self.page_ids:
            print "Preparing report entry for {}".format(page_id)
            report_entry = {}
            try:
                """
                # You can explore available attributes thus:
                In [1]: from performline.client import Client
                In [2]: c = Client('tokenca6e5897e27d1b43906469134b1c3eb0424')
                In [3]: c.webpages(id=3480374)
                Out[3]:
                {
                    "LastScoredAt": "2018-06-05T10:46:53.693483-04:00",
                    "CompanyId": 501,
                    "Url": "http://www.guidetoonlineschools.com/online-schools?lvl=8",
                    "CampaignId": 4330,
                    "TrafficSourceId": 7670,
                    "BrandId": 458,
                    "Score": 10,
                    "Type": "web",
                    "Id": 3480374,
                    "CreatedAt": "2015-06-16T16:46:06.824781-04:00"
                }
                In [4]: c.webpages(id=3480374).LastScoredAt
                Out[4]: u'2018-06-05T10:46:53.693483-04:00'
                In [5]: c.webpages(id=3480374).Url
                Out[5]: u'http://www.guidetoonlineschools.com/online-schools?lvl=8'
                In [6]: c.webpages(id=3480374).Score
                Out[6]: 10
                """
                page = self.performline.webpages(id=page_id)
                report_entry['score'] = page.score
                report_entry['url'] = page.url
            except performline.embedded.stdlib.clients.rest.exceptions.NotFound:
                print "ERROR: {} was not found.".format(page_id)

            report_entry['whois'] = self.__get_whois_data(page_id)
            report_entry['source'] = self.__read_source_from_file(page_id)
            self.report_dict[page_id] = report_entry

        try:
            print "Writing report to {}".format(self.report_file)
            with open(self.report_file, 'w') as f:
                json.dump(self.report_dict, f)
        except IOError:
            print "ERROR: Could not write report to file."

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Build a report from PerformLine API and Export.'
    )
    parser.add_argument('-t', '--token', action='store', dest='token',
                        required=True, help='PerformLine API Token')
    parser.add_argument('-p', '--path', action='store', dest='path',
                        required=True, help='Path of extracted export contents.')

    if len(sys.argv[1:]) == 0:
        parser.print_help()
    else:
        args = parser.parse_args()
        report = PlineExportReport(args.token, args.path)
        report.build()
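A quick usage sketch, assuming the gist is saved locally as report.py (the filename is arbitrary) and the source-code export has been extracted to ./export alongside its whois_export.json:

python report.py --token <YOUR_PERFORMLINE_API_TOKEN> --path ./export

This writes ./export/report.json containing the score, URL, WHOIS record, and HTML source for each page in the export.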