Last active
December 20, 2015 21:09
-
-
Save cbare/6195859 to your computer and use it in GitHub Desktop.
A total one-off hack to crawl Synapse and compile stats on public entities for auditing of data governance compliance. A modification of an earlier gist: synapse_stats.py https://gist.github.com/cbare/4686667
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ## | |
| ## Collect statistics for auditing Synapse usage and governance | |
| ###################################################################### | |
import os
import re
import sys
from datetime import datetime as Datetime

import requests
import synapseclient
from synapseclient.utils import id_of
## Synapse ID of the root entity: the crawl treats any entity whose parentId
## equals ROOT as a project in its own right.
ROOT = 'syn4489'

syn = synapseclient.Synapse()

## Credentials are taken from the environment rather than being committed with
## the script.  When a variable is unset, None is passed through and
## synapseclient falls back to its own credential lookup.
## NOTE(review): the password that used to be hard-coded on this line has been
## published and should be rotated.
syn.login(os.environ.get('SYNAPSE_USERNAME'), os.environ.get('SYNAPSE_PASSWORD'))
## Display names that identify Sage accounts (compared against a user's
## displayName in is_sage_user).
sage_users = [
    "earthlingzephyr",
    "isjang",
    "xschildwachter",
    "bennett.k.ng",
    "bruce_hoff",
    "mikerkellen",
    "cbare",
    "Christopher Bare",
    "metteptrs",
    "Jay Hodgson",
    "matthew.furia",
    "laramangravite",
    "nicole.deflaux.guest",
    "Nicole Deflaux gmail",
]

## E-mail domains that identify Sage accounts.
sage_domains = [
    'sagebase.org',
    'jayhodgson.com',
]

## benefactor id -> ACL, filled lazily by get_acl
acl_cache = dict()

## parent id -> project id, filled lazily by the crawl loop
project_cache = dict()
## Maps the numeric nodeType code found in query results to a readable type
## name.  The position of each name in the list is its code (0 = 'Study').
node_types = dict(enumerate([
    'Study',
    'Data',
    'Project',
    'Preview',
    'Folder',
    'Analysis',
    'Step',
    'Code',
    'Link',
    'PhenotypeData',
    'GenotypeData',
    'ExpressionData',
    'RObject',
    'Summary',
    'GenomicData',
    'Page',
    'FileEntity',
]))
def get_all_users():
    """Fetch user records from the ``/user`` endpoint with a single request.

    The request asks for one page only (limit 10000, offset 0) and the
    ``results`` entry of the response is returned as-is.
    """
    response = syn.restGET('/user?limit=10000&offset=0')
    users = response['results']
    return users
def get_all_users_generator():
    """Yield user records from ``/user`` one by one, fetching 500 per request.

    Paging continues until the offset reaches the ``totalNumberOfResults``
    value reported by the most recent response.
    """
    page_size = 500
    position = 0
    while True:
        page = syn.restGET('/user?limit=%d&offset=%d' % (page_size, position))
        total = page['totalNumberOfResults']
        position += page_size
        for record in page['results']:
            yield record
        ## stop once the next offset is no longer below the reported total
        if not position < total:
            break
def strip_prefixes_from_keys(dictionary, prefix):
    """Return a copy of *dictionary* with ``prefix + '.'`` removed from keys.

    For example ``{'entity.id': 1}`` with prefix ``'entity'`` becomes
    ``{'id': 1}``.  Keys that do not start with the dotted prefix are copied
    unchanged.  The prefix is matched literally, so characters that are
    special in regular expressions have no special meaning here, and the
    remainder of the key is kept in full even if it contains a newline.

    :param dictionary: mapping whose keys are strings
    :param prefix:     prefix to remove, without the trailing dot
    :returns:          a new dict; the input is not modified
    """
    dotted_prefix = prefix + '.'
    new_dictionary = {}
    for key, value in dictionary.items():
        if key.startswith(dotted_prefix):
            ## entity.id -> id
            new_dictionary[key[len(dotted_prefix):]] = value
        else:
            new_dictionary[key] = value
    return new_dictionary
def get_user_entities(user, entity_type):
    """Return every entity of *entity_type* created by *user*.

    Runs a paged Synapse query (500 rows per page, first offset 1) filtered on
    ``createdByPrincipalId == user['ownerId']`` and collects the rows of all
    pages into one list.

    :param user:        user record; only ``ownerId`` is read
    :param entity_type: name used in the ``from`` clause of the query
    :returns:           list of dicts, one per entity
    """
    query_template = 'select id, parentId, name, nodeType, concreteType, modifiedOn, benefactorId from %s where createdByPrincipalId == %s limit %d offset %d'
    page_size = 500
    start = 1
    entities = []
    while True:
        page = syn.query(query_template % (entity_type, user['ownerId'], page_size, start,))
        ## result keys carry the entity type as a prefix (entity.id); drop it
        entities.extend([strip_prefixes_from_keys(row, entity_type) for row in page['results']])
        start += page_size
        if start > page['totalNumberOfResults']:
            break
    return entities
## 273948 = AUTHENTICATED_USERS
## 273949 = PUBLIC
def classify_by_access(acl):
    """Classify an ACL by the widest group that has READ access.

    Returns ``'PUBLIC'`` if principal 273949 has READ, otherwise
    ``'AUTHENTICATED_USERS'`` if principal 273948 has READ, otherwise
    ``'private'``.  The answer does not depend on the order of the entries in
    ``acl['resourceAccess']``: a PUBLIC grant wins even when it is listed
    after an AUTHENTICATED_USERS grant.

    :param acl: dict with a ``resourceAccess`` list; each entry has a
                ``principalId`` and an ``accessType`` collection
    """
    classification = 'private'
    for access in acl['resourceAccess']:
        if 'READ' not in access['accessType']:
            continue
        if access['principalId'] == 273949:
            ## nothing is more open than PUBLIC, so stop looking
            return 'PUBLIC'
        if access['principalId'] == 273948:
            ## remember it, but keep scanning in case PUBLIC appears later
            classification = 'AUTHENTICATED_USERS'
    return classification
def quote(string):
    """Wrap *string* in double quotes for the inventory file.

    The text is first passed through a latin-1 -> utf-8 round trip, which
    repairs UTF-8 that was mis-decoded as latin-1 (``'Ã©'`` becomes ``'é'``).
    Text that is already correct cannot survive that round trip (the encode
    or the decode step raises), so in that case the input is used unchanged
    instead of letting the error abort the whole crawl.

    Embedded double quotes are handled as before: the text is split on ``"``,
    every piece is quoted separately and the pieces are joined with ``\\"``.

    :param string: text to quote
    :returns:      the quoted text
    """
    try:
        text = string.encode('latin-1').decode('utf-8')
    except UnicodeError:
        ## not mojibake: covers both UnicodeEncodeError and UnicodeDecodeError
        text = string
    return '\\"'.join('"' + piece + '"' for piece in text.split('"'))
def get_acl(entity):
    """Return the ACL that applies to *entity*, cached per benefactor.

    The benefactor id is read from ``entity['benefactorId']`` when the query
    result already carries it; the benefactor is looked up through the client
    only when it is missing, so entities with a known benefactor do not cost
    an extra request.  ACLs are stored in ``acl_cache`` under the benefactor
    id and fetched at most once per benefactor.

    :param entity: dict-like entity record
    :returns:      the ACL as returned by ``syn._getACL``
    """
    benefactor = entity.get('benefactorId')
    if benefactor is None:
        benefactor = syn._getBenefactor(entity)['id']
    if benefactor not in acl_cache:
        acl_cache[benefactor] = syn._getACL(entity)
    return acl_cache[benefactor]
def get_access_requirements(entity):
    """Return the raw ``/entity/<id>/accessRequirement`` response for *entity*."""
    entity_id = id_of(entity)
    uri = '/entity/%s/accessRequirement' % entity_id
    return syn.restGET(uri)
def get_access_types(entity):
    """Return the short type names of the access requirements on *entity*.

    Each name is the last dot-separated component of a requirement's
    ``entityType``.  When the response reports zero results, or has no
    ``results`` entry at all, ``['No access requirements']`` is returned.
    """
    requirements = get_access_requirements(entity)
    none_found = ['No access requirements']

    reports_count = 'totalNumberOfResults' in requirements
    if reports_count and requirements['totalNumberOfResults'] == 0:
        return none_found
    if 'results' not in requirements:
        return none_found

    type_names = []
    for requirement in requirements['results']:
        type_names.append(requirement['entityType'].split('.')[-1])
    return type_names
def get_date(entity):
    """Format ``entity['modifiedOn']`` as ``YYYY-MM-DD``.

    The value is divided by 1000 before being handed to
    ``Datetime.fromtimestamp`` (presumably it is in milliseconds).  The
    conversion uses the local time zone.  Returns an empty string when the
    entity has no ``modifiedOn`` key.
    """
    if 'modifiedOn' not in entity:
        return ''
    seconds = entity['modifiedOn']/1000
    modified = Datetime.fromtimestamp(seconds)
    return modified.strftime("%Y-%m-%d")
## get principal IDs for the public and authenticated users group
## PUBLIC
## AUTHENTICATED_USERS
def get_user_group_headers(prefix):
    """Query ``/userGroupHeaders`` for groups whose name starts with *prefix*.

    Uses the module-level ``syn`` client (this is a plain function, so there
    is no ``self`` to call through) and hands the response back to the caller
    instead of dropping it.

    :param prefix: name prefix, inserted into the query string as given
    :returns:      the response of the REST call, unmodified
    """
    groups = syn.restGET('/userGroupHeaders?prefix=%s' % prefix)
    return groups
| def print_user_counts(user_entities, sage_users=False): | |
| for user, entity_count in user_entities: | |
| if sage_users==None or user['sage_user'] == sage_users: | |
| print user['displayName'], user['email'], 'sage' if user['sage_user'] else 'non-sage', entity_count | |
def get_entity_type(entity):
    """Return a short, readable type name for *entity*.

    * If ``concreteType`` is set, its first element is used with the literal
      package prefix ``org.sagebionetworks.repo.model.`` removed.
    * Otherwise, if ``nodeType`` is present it is looked up in ``node_types``.
      Code 0 ('Study') is a valid value and is not mistaken for "missing";
      a code that is not in the table yields ``'unknown'``.
    * Otherwise ``'unknown'`` is returned.

    :param entity: dict-like entity record
    """
    concrete_type = entity.get('concreteType', None)
    if concrete_type:
        ## plain string replacement: the dots must match only real dots
        return concrete_type[0].replace('org.sagebionetworks.repo.model.', '')

    node_type = entity.get('nodeType', None)
    if node_type is not None:
        return node_types.get(node_type, 'unknown')

    return 'unknown'
def is_sage_user(user):
    """Return True when *user* looks like a Sage Bionetworks account.

    A user counts as Sage when any of the following holds:

    * the display name is listed in ``sage_users``;
    * the ``company`` field is exactly ``'Sage Bionetworks'``;
    * the domain of the e-mail address is listed in ``sage_domains``
      (compared case-insensitively, since domain names are not
      case-sensitive).

    Missing or empty ``displayName`` / ``email`` fields are tolerated and
    simply do not match.

    :param user: dict-like user record
    """
    if user.get('displayName') in sage_users:
        return True
    if user.get('company', None) == 'Sage Bionetworks':
        return True
    email = user.get('email')
    if email:
        domain = email.split("@")[-1].lower()
        if domain in sage_domains:
            return True
    # profile = getUserProfile(user['ownerId'])
    # if 'email' in profile:
    #     domain = profile['email'].split("@")[-1]
    #     if domain in sage_domains: return True
    return False
def _user_sort_key(user):
    ## sage flag first (False sorts before True), then lastName or
    ## displayName, then firstName
    family_name = user.get('lastName', user.get('displayName', 'UNKNOWN'))
    return (user['sage_user'], family_name, user.get('firstName', ''))


all_users = get_all_users()
print('read %d users.' % len(all_users))

## mark Sage users so that their entities can be counted separately
for user in all_users:
    user['sage_user'] = is_sage_user(user)

sage_count = sum(1 for user in all_users if user['sage_user'])
other_count = sum(1 for user in all_users if not user['sage_user'])
print('%d sage users.' % sage_count)
print('%d other users.' % other_count)

all_users.sort(key=_user_sort_key)
print('sorted users')
## Open today's inventory file and write the header row.
d = Datetime.now()
f = open('synapse_public_inventory_%s.csv' % str(d.date()), 'w')

## One label per field of the data rows, in the same order.  The rows carry a
## project id between the parent id and the name, so 'Project' has to be
## listed here as well; without it every label from 'Name' onwards sits over
## the wrong column.
## NOTE(review): the 'Username' column is filled with lastName (falling back
## to displayName) by the row-writing code -- confirm the intended label.
column_labels = ['OwnerId', 'Username', 'First name', 'Email', 'Sage',
                 'SynapseId', 'Parent', 'Project', 'Name', 'Type', 'Date',
                 'Open', 'Access Requirements']
f.write(', '.join(column_labels))
f.write('\n')
| # remove names until we find the one that failed. | |
| # found = False | |
| # filtered_users = [] | |
| # for user in all_users: | |
| # if found: | |
| # filtered_users.append(user) | |
| # else: | |
| # if user['displayName'].startswith('Daniel Lombra'): | |
| # found = True | |
| # filtered_users.append(user) | |
| # all_users = filtered_users | |
## Crawl the entities of every non-Sage user and write one row per entity to
## the inventory file `f`.  The try/finally guarantees that the file is closed
## (and therefore flushed) even when the crawl aborts with an exception.
user_entities = []
try:
    for user in all_users:
        try:
            if user['sage_user']:
                continue

            entities = get_user_entities(user, 'entity')
            user_entities.append((user, len(entities)))

            print('~' * 80)
            print(user)
            print(entities)
            print('+' * 80)

            for entity in entities:
                ## access classification; best effort, '??' on failure
                try:
                    entity['is-open'] = classify_by_access(get_acl(entity))
                except Exception as ex:
                    print(ex)
                    entity['is-open'] = '??'
                    sys.stderr.write('couldn\'t get acl for entity %s\n' % (str(entity['id']),))

                ## owning project; best effort, '??' on failure
                try:
                    if entity['parentId'] == ROOT:
                        ## a direct child of the root is itself the project
                        entity['project'] = entity['id']
                    elif entity['parentId'] in project_cache:
                        entity['project'] = project_cache[entity['parentId']]
                    else:
                        ancestors = syn.restGET('/entity/%s/ancestors' % (str(entity['id']),))
                        id_list = ancestors['idList']
                        ## second ancestor when there is one, else the first
                        entity['project'] = id_list[1]['id'] if len(id_list) > 1 else id_list[0]['id']
                        ## siblings share the parent, hence the same project
                        project_cache[entity['parentId']] = entity['project']
                except Exception as ex:
                    print(ex)
                    entity['project'] = '??'
                    sys.stderr.write('couldn\'t get ancestors for entity %s\n' % (str(entity['id']),))

                line = u', '.join((
                    unicode(user['ownerId']),
                    quote(user.get('lastName', user.get('displayName', 'UNKNOWN'))),
                    quote(user.get('firstName', '')),
                    ## not every user record has an e-mail address
                    user.get('email') or '',
                    'sage' if user['sage_user'] else 'non-sage',
                    entity['id'],
                    entity['parentId'],
                    entity['project'],
                    quote(entity['name']),
                    get_entity_type(entity),
                    quote(get_date(entity)),
                    entity['is-open'],
                    '|'.join(get_access_types(entity)),
                )).encode('utf-8')
                print(line)
                f.write(line)
                f.write('\n')
        except Exception as ex:
            ## report which user broke the crawl, then abort
            sys.stderr.write('Exception processing user %s: %s\n' % (user, ex,))
            raise
finally:
    f.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment