Created
November 24, 2015 21:28
-
-
Save seamustuohy/03947c292cf14ef1588e to your computer and use it in GitHub Desktop.
Simple script for open internet project M&E.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| #Original Code | |
| #https://github.com/winter-muted/moose/blob/399aefb8d7d62e0163045b39803a2d791ada04f9/scripts/github_traffic.py | |
| # For automated logging: | |
| # python github_traffic.py --log --user [USERNAME] --password [PASSWORD] | |
| import cookielib | |
| import sys, os, time, argparse, getpass, re, datetime, csv | |
| try: | |
| import mechanize | |
| except ImportError: | |
| print 'python-mechanize module not available.\n' | |
| sys.exit(1) | |
def webBrowser():
    """Create a mechanize browser positioned on the GitHub login form.

    The browser gets its own LWP cookie jar, a fixed set of handler
    options and a 'Chrome' User-agent header.  It then opens
    https://github.com/login and selects the form at index 1.

    Returns:
        The configured ``mechanize.Browser`` with the login form selected.
    """
    agent = mechanize.Browser()
    agent.set_cookiejar(cookielib.LWPCookieJar())

    # Handler switches, applied in the listed order.
    handler_flags = (
        ('set_handle_equiv', True),
        ('set_handle_gzip', False),
        ('set_handle_redirect', True),
        ('set_handle_referer', True),
        ('set_handle_robots', False),
    )
    for setter_name, enabled in handler_flags:
        getattr(agent, setter_name)(enabled)
    agent.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    agent.addheaders = [('User-agent', 'Chrome')]

    # Open the login page; the cookie jar above carries the session from here on.
    agent.open('https://github.com/login')
    # The form index is page-specific.  Per the original author's note, form 0
    # on github.com/login is the search box and form 1 is the login form.
    agent.select_form(nr=1)
    return agent
def authenticatePage(user, password):
    """Fill in and submit the GitHub login form.

    Args:
        user: value written to the form's 'login' field.
        password: value written to the form's 'password' field.

    Returns:
        The browser returned by ``webBrowser()`` after the form has been
        submitted.  The outcome of the submission is not inspected here.
    """
    session = webBrowser()
    for field, value in (('login', user), ('password', password)):
        session.form[field] = value
    session.submit()
    return session
| def readPage(browser, repo): | |
| stats = {} | |
| browser.addheaders = [('User-agent', 'Chrome'), ('Referer', 'https://github.com/' + repo + '/graphs/traffic'), ('X-Requested-With', 'XMLHttpRequest')] | |
| # GitHubs Traffic payload is in python dictionary format | |
| # grab the clones, and Visitors | |
| try: | |
| stats['Clones'] = eval(browser.open('https://github.com/' + repo + '/graphs/clone-activity-data').read()) | |
| stats['Visitors'] = eval(browser.open('https://github.com/' + repo + '/graphs/traffic-data').read()) | |
| except mechanize.HTTPError as e: | |
| print 'There was an error obtaining traffic for said site.' | |
| if str(e).find('406') != -1: | |
| print '\tError 406: You do not have permission to view statistics. Or you supplied incorrect credentials' | |
| sys.exit(1) | |
| if str(e).find('404') != -1: | |
| print '\tError 404: Page not found' | |
| sys.exit(1) | |
| return stats | |
def verifyArgs(args):
    """Validate parsed command-line arguments, prompting for a password if needed.

    Args:
        args: namespace with ``repo``, ``user`` and ``password`` attributes.

    Returns:
        The same namespace, with ``password`` filled in from an interactive
        prompt when it was missing.

    Exits with status 1 when ``repo`` is not of the form ``owner/name`` or
    when no user is given, and with status 0 when the password prompt is
    interrupted with Ctrl-C.
    """
    if args.repo is None or len(args.repo.split('/')) != 2:
        print('\nYou must specify a repository you are insterested in scrapeing:\n\t --repo foo/bar\n\nNote: GitHub is case-sensitive, so your arguments must be too')
        sys.exit(1)

    # Truthiness rather than "is ''": identity against a string literal is
    # implementation-dependent, and a bare --user (nargs='?') yields None.
    if not args.user:
        print('\nYou must specify a user to authenticate with')
        sys.exit(1)

    try:
        # Keep prompting until something non-empty is entered.
        while not args.password:
            args.password = getpass.getpass('Password for UserID ' + args.user + ' :',)
    except KeyboardInterrupt:
        print('')
        sys.exit(0)
    return args
def write_list(dev_data, writefile):
    """Write rows to a CSV file, replacing any existing content.

    Args:
        dev_data: iterable of rows, each an iterable of cell values.
        writefile: path of the file to (re)create.
    """
    with open(writefile, 'w+') as out:
        writer = csv.writer(out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerows(dev_data)
def get_csv(filename):
    """Read a CSV file and return its rows as a list of lists of strings.

    Args:
        filename: path of the CSV file to read.
    """
    with open(filename, 'r') as source:
        return list(csv.reader(source))
def writeFile(writefile, stats):
    """Merge new traffic rows into a CSV log and rewrite it sorted by date.

    Rows are keyed on their first two columns (date, repository).  A row in
    ``stats`` replaces a logged row with the same key, and duplicates within
    ``stats`` collapse to the last occurrence.

    Args:
        writefile: path of the CSV log; created when it does not exist.
        stats: list of rows whose first column is a ``%Y-%b-%d`` date and
            whose second column is the repository.  The list is not modified.

    Raises:
        ValueError: a row's first column is not a ``%Y-%b-%d`` date.
    """
    merged = {}
    if os.path.isfile(writefile):
        for row in get_csv(writefile):
            # Blank or truncated lines cannot form a (date, repo) key.
            if len(row) >= 2:
                merged[(row[0], row[1])] = row
    for row in stats:
        merged[(row[0], row[1])] = row

    # Sort a fresh list so the caller's data stays untouched.  The repository
    # is a secondary key so rows sharing a date come out in a stable order.
    ordered = sorted(
        merged.values(),
        key=lambda row: (datetime.datetime.strptime(row[0], "%Y-%b-%d"), row[1]),
    )
    write_list(ordered, writefile)
def parseArgs(args=None):
    """Build the command-line parser, parse ``args`` and validate the result.

    Args:
        args: list of argument strings, or None to let argparse read them
            from the command line.

    Returns:
        Whatever ``verifyArgs`` returns for the parsed namespace.
    """
    # Traffic Stats URL: https://github.com/idaholab/moose/graphs/clone-activity-data
    parser = argparse.ArgumentParser(description='Scrape GitHub for a webpage requiring authentication')
    parser.add_argument('--repo', '-r', nargs='?', help='Repository (example: foo/bar)')
    parser.add_argument('--log', '-l', help='Write stats to the default log location.', action='store_true')
    parser.add_argument('--write', '-w', nargs='?', help='Write to a file')

    # The --user default comes from $USER when it is set.  Without it the
    # default is empty and the help text does not mention a default.
    env_user = os.getenv('USER')
    if env_user is None:
        user_options = {'default': '', 'help': 'Authenticate using specified user'}
    else:
        user_options = {
            'default': env_user,
            'help': 'Authenticate using specified user. Defaults to: (' + env_user + ')',
        }
    parser.add_argument('--user', '-u', nargs='?', **user_options)

    parser.add_argument('--password', '-p', nargs='?', default='', help='Authenticate using specified password')
    return verifyArgs(parser.parse_args(args))
def trafficRows(counts, repo):
    """Turn one repository's traffic counts into CSV-ready rows.

    Args:
        counts: iterable of dicts with 'bucket' (seconds since the epoch),
            'total' and 'unique' entries.
        repo: repository name stored in the second column of every row.

    Returns:
        List of ``[date, repo, total, unique]`` rows of strings, with the date
        formatted as ``%Y-%b-%d`` in UTC.
    """
    return [
        [time.strftime("%Y-%b-%d", time.gmtime(point['bucket'])), repo, str(point['total']), str(point['unique'])]
        for point in counts
    ]


if __name__ == '__main__':
    args = parseArgs()
    # NOTE: args.repo is validated by parseArgs but not read below; the
    # repositories scraped are the fixed list that follows.
    OI_repos = ["OpenInternet/co-pilot",
                "OpenInternet/SAFETAG-website",
                "OpenInternet/System_Administrator_Guide_Text",
                "OpenInternet/MyWebsiteIsDown",
                "OpenInternet/System_Administrator_Guide_Templates",
                "OpenInternet/Documentation-Builder",
                "OpenInternet/SAFETAG",
                "OpenInternet/levelup",
                "OpenInternet/levelup_theme"]
    web_page = authenticatePage(args.user, args.password)

    clones = []
    visits = []
    for repo in OI_repos:
        print("pulling {0}".format(repo))
        payload = readPage(web_page, repo)
        clones.extend(trafficRows(payload['Clones']['counts'], repo))
        visits.extend(trafficRows(payload['Visitors']['counts'], repo))
    print("found clones {0}".format(clones))
    # Used to print the clones list a second time under this label.
    print("found visits {0}".format(visits))

    if args.write:
        print("writing to files")
        writeFile("{0}.clones.csv".format(args.write), clones)
        writeFile("{0}.visits.csv".format(args.write), visits)
    elif args.log:
        print("writing to default log files")
        writeFile("/tmp/github.clones.csv", clones)
        writeFile("/tmp/github.visits.csv", visits)
    else:
        # Print the collected rows; the old per-repo 'stats' dict was never
        # filled in, so this branch always showed empty lists.
        print('\nClones: (date, repo, total, unique)\n{0}'.format(clones))
        print('\nVisitors: (date, repo, total, unique)\n{0}'.format(visits))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment