Skip to content

Instantly share code, notes, and snippets.

@cbare
Last active December 20, 2015 21:09
Show Gist options
  • Select an option

  • Save cbare/6195859 to your computer and use it in GitHub Desktop.

Select an option

Save cbare/6195859 to your computer and use it in GitHub Desktop.
A total one-off hack to crawl Synapse and compile stats on public entities for auditing of data governance compliance. A modification of an earlier gist: synapse_stats.py https://gist.github.com/cbare/4686667
##
## Collect statistics for auditing Synapse usage and governance
######################################################################
import os
import re
import sys
from datetime import datetime as Datetime

import requests
import synapseclient
from synapseclient.utils import id_of
## Id that the crawl below compares against entity['parentId'] to recognise
## top-level entities.
ROOT = 'syn4489'

## Shared client used by every helper in this script.
syn = synapseclient.Synapse()

## Credentials are taken from the environment (SYNAPSE_USERNAME and
## SYNAPSE_PASSWORD) so that no account secret is stored in the source.
## NOTE(review): when a variable is unset, None is passed to login(); confirm
## that the installed synapseclient then falls back to its own stored
## credentials as expected.
syn.login(os.environ.get('SYNAPSE_USERNAME'), os.environ.get('SYNAPSE_PASSWORD'))
## Display names that is_sage_user() treats as Sage staff regardless of
## their company or e-mail fields.
sage_users = [
    "earthlingzephyr",
    "isjang",
    "xschildwachter",
    "bennett.k.ng",
    "bruce_hoff",
    "mikerkellen",
    "cbare",
    "Christopher Bare",
    "metteptrs",
    "Jay Hodgson",
    "matthew.furia",
    "laramangravite",
    "nicole.deflaux.guest",
    "Nicole Deflaux gmail",
]

## E-mail domains that is_sage_user() treats as Sage staff.
sage_domains = [
    'sagebase.org',
    'jayhodgson.com',
]

## Caches filled in while crawling:
##   acl_cache     -- benefactor id -> ACL fetched for that benefactor
##   project_cache -- parent id -> project id recorded for its children
acl_cache = dict()
project_cache = dict()
## Readable names for the numeric nodeType codes, used by get_entity_type().
## The position of a name in the tuple is its code.
node_types = dict(enumerate((
    'Study',           # 0
    'Data',            # 1
    'Project',         # 2
    'Preview',         # 3
    'Folder',          # 4
    'Analysis',        # 5
    'Step',            # 6
    'Code',            # 7
    'Link',            # 8
    'PhenotypeData',   # 9
    'GenotypeData',    # 10
    'ExpressionData',  # 11
    'RObject',         # 12
    'Summary',         # 13
    'GenomicData',     # 14
    'Page',            # 15
    'FileEntity',      # 16
)))
def get_all_users():
    """Return every user record from the ``/user`` service as a list.

    The records are collected through get_all_users_generator(), which pages
    through the service, instead of a single request limited to 10000 rows.
    That single request silently dropped every user beyond the limit.

    Returns:
        list: the user dicts, in the order the service returned them.
    """
    return list(get_all_users_generator())
def get_all_users_generator():
    """Yield user records from the ``/user`` service one page at a time.

    Pages of 500 records are requested starting at offset 0. After each page
    the reported 'totalNumberOfResults' decides whether another page is
    fetched.
    """
    page_size = 500
    position = 0
    while True:
        page = syn.restGET('/user?limit=%d&offset=%d' % (page_size, position))
        total = page['totalNumberOfResults']
        position += page_size
        for record in page['results']:
            yield record
        ## stop once the next offset is no longer below the reported total
        if not (position < total):
            break
def strip_prefixes_from_keys(dictionary, prefix):
    """Return a copy of *dictionary* with a leading ``<prefix>.`` cut off its keys.

    For example, with prefix ``'entity'`` the key ``'entity.id'`` becomes
    ``'id'``. Keys that do not start with ``<prefix>.`` are copied unchanged.

    The prefix is compared as literal text. It used to be spliced into a
    regular expression unescaped, so a prefix containing regex metacharacters
    matched the wrong keys, and a key containing a newline was truncated.

    Args:
        dictionary: mapping with string keys.
        prefix: the qualifier to remove, without the trailing dot.

    Returns:
        dict: a new dictionary; the input is not modified.
    """
    marker = prefix + '.'
    cut = len(marker)
    return dict(
        (key[cut:] if key.startswith(marker) else key, value)
        for key, value in dictionary.items()
    )
def get_user_entities(user, entity_type):
    """Collect the rows of *entity_type* created by *user*.

    Runs the query in pages of 500 rows, beginning at offset 1, and keeps
    requesting pages while the next offset does not exceed the reported
    'totalNumberOfResults'.

    Args:
        user: user dict; its 'ownerId' is used as createdByPrincipalId.
        entity_type: name placed in the query's ``from`` clause.

    Returns:
        list: one dict per row, with the ``<entity_type>.`` qualifier removed
        from the keys.
    """
    page_size = 500
    query_template = (
        'select id, parentId, name, nodeType, concreteType, modifiedOn, benefactorId '
        'from %s where createdByPrincipalId == %s limit %d offset %d'
    )
    collected = []
    start = 1
    while True:
        page = syn.query(query_template % (entity_type, user['ownerId'], page_size, start,))
        ## rows come back with qualified keys (e.g. entity.id); unqualify them
        for row in page['results']:
            collected.append(strip_prefixes_from_keys(row, entity_type))
        start += page_size
        if not (start <= page['totalNumberOfResults']):
            break
    return collected
## 273948 = AUTHENTICATED_USERS
## 273949 = PUBLIC
def classify_by_access(acl):
    """Classify an ACL by the widest group that has been granted READ.

    All entries of ``acl['resourceAccess']`` are examined before deciding, so
    the answer does not depend on their order. Previously an
    AUTHENTICATED_USERS entry listed ahead of a PUBLIC entry caused
    'AUTHENTICATED_USERS' to be reported for a publicly readable entity.

    Args:
        acl: dict with a 'resourceAccess' list; each item carries a
            'principalId' and an 'accessType' collection.

    Returns:
        str: 'PUBLIC', 'AUTHENTICATED_USERS' or 'private'.

    Raises:
        KeyError: if 'resourceAccess' (or a field of one of its items) is
            missing.
    """
    readers = set(
        access['principalId']
        for access in acl['resourceAccess']
        if 'READ' in access['accessType']
    )
    if 273949 in readers:
        return 'PUBLIC'
    if 273948 in readers:
        return 'AUTHENTICATED_USERS'
    return 'private'
def quote(string):
    """Wrap *string* in double quotes for use as one CSV field.

    Two defects of the earlier version are fixed:

    * The latin-1 -> UTF-8 round trip, which repairs text whose UTF-8 bytes
      were mis-decoded as latin-1, is now best-effort. Text that cannot make
      the round trip (already correct non-ASCII text, or characters outside
      latin-1) is kept as it is instead of raising.
    * An embedded double quote is escaped by doubling it, as CSV readers
      expect. It was previously emitted as ``"a"\\""b"``.

    Args:
        string: the text to quote.

    Returns:
        The quoted text.
    """
    try:
        string = string.encode('latin-1').decode('utf-8')
    except UnicodeError:
        ## not mis-decoded text after all; leave it untouched
        pass
    return '"' + string.replace('"', '""') + '"'
def get_acl(entity):
    """Return the ACL governing *entity*, cached per benefactor.

    The benefactor id is read from ``entity['benefactorId']`` when present,
    and only otherwise requested through ``syn._getBenefactor``. The lookup
    used to be written as the default argument of ``dict.get``, which is
    always evaluated, so a benefactor request was made for every entity.

    Args:
        entity: entity dict, optionally carrying 'benefactorId'.

    Returns:
        The value of ``syn._getACL`` for the first entity seen with this
        benefactor, stored in and served from ``acl_cache``.
    """
    benefactor = entity.get('benefactorId')
    if benefactor is None:
        benefactor = syn._getBenefactor(entity)['id']
    if benefactor not in acl_cache:
        acl_cache[benefactor] = syn._getACL(entity)
    return acl_cache[benefactor]
def get_access_requirements(entity):
    """Fetch ``/entity/<id>/accessRequirement`` for *entity*.

    The id is obtained with ``id_of(entity)``. The response of the GET
    request is returned as is.
    """
    uri = '/entity/%s/accessRequirement' % id_of(entity)
    return syn.restGET(uri)
def get_access_types(entity):
    """List the kinds of access requirement placed on *entity*.

    Returns:
        list: the last dotted component of each result's 'entityType', or
        ``['No access requirements']`` when the response reports zero
        results or carries no 'results' entry.
    """
    response = get_access_requirements(entity)
    reports_zero = response.get('totalNumberOfResults') == 0
    if reports_zero or 'results' not in response:
        return ['No access requirements']
    kinds = []
    for requirement in response['results']:
        kinds.append(requirement['entityType'].split('.')[-1])
    return kinds
def get_date(entity):
    """Format an entity's 'modifiedOn' value as ``YYYY-MM-DD``.

    The value is divided by 1000 before being handed to
    ``Datetime.fromtimestamp``, so the date is expressed in the local
    timezone of the machine running the script.

    Returns:
        str: the formatted date, or '' when the entity has no 'modifiedOn'.
    """
    if 'modifiedOn' not in entity:
        return ''
    seconds = entity['modifiedOn']/1000
    return Datetime.fromtimestamp(seconds).strftime("%Y-%m-%d")
## get principal IDs for the public and authenticated users groups
## PUBLIC
## AUTHENTICATED_USERS
def get_user_group_headers(prefix):
    """Return the response of ``GET /userGroupHeaders?prefix=<prefix>``.

    The request goes through the module-level ``syn`` client. The function
    previously referred to an undefined ``self``, which raised NameError on
    every call, and it discarded the response instead of returning it.

    Args:
        prefix: value placed in the ``prefix`` query parameter; it is
            inserted as given, without URL-encoding.
    """
    return syn.restGET('/userGroupHeaders?prefix=%s' % prefix)
def print_user_counts(user_entities, sage_users=False):
for user, entity_count in user_entities:
if sage_users==None or user['sage_user'] == sage_users:
print user['displayName'], user['email'], 'sage' if user['sage_user'] else 'non-sage', entity_count
def get_entity_type(entity):
    """Return a short type name for *entity*.

    The first element of 'concreteType' is preferred, with the
    ``org.sagebionetworks.repo.model.`` package prefix removed. Otherwise
    the numeric 'nodeType' is translated through ``node_types``.

    Fixes relative to the earlier version:

    * nodeType 0 ('Study') is honoured. A truthiness test used to treat 0 as
      "no node type" and report 'unknown'.
    * The package prefix is removed as literal text rather than as a regular
      expression whose dots matched any character.
    * A nodeType code missing from ``node_types`` yields 'unknown' instead of
      raising KeyError.

    Returns:
        str: the type name, or 'unknown' when neither field is usable.
    """
    concrete = entity.get('concreteType')
    if concrete:
        return concrete[0].replace('org.sagebionetworks.repo.model.', '')
    node_type = entity.get('nodeType')
    if node_type is not None:
        return node_types.get(node_type, 'unknown')
    return 'unknown'
def is_sage_user(user):
    """Tell whether *user* should be counted as Sage staff.

    A user qualifies when the display name is listed in ``sage_users``, the
    'company' field equals 'Sage Bionetworks', or the part of 'email' after
    the last "@" is listed in ``sage_domains``.

    Returns:
        bool
    """
    listed_by_name = user['displayName'] in sage_users
    if listed_by_name or user.get('company', None) == 'Sage Bionetworks':
        return True
    ## A disabled variant of this check also looked up the e-mail address in
    ## the user's profile; only the record's own 'email' field is used.
    return 'email' in user and user['email'].split("@")[-1] in sage_domains
## Load every user, flag the Sage staff among them and put the list in
## report order.
all_users = get_all_users()
print('read %d users.' % len(all_users))

## entities owned by Sage staff are counted separately, so mark them first
for user in all_users:
    user['sage_user'] = is_sage_user(user)

sage_total = sum(1 for person in all_users if person['sage_user'])
print('%d sage users.' % sage_total)
print('%d other users.' % (len(all_users) - sage_total))


def _user_sort_key(person):
    """Order by Sage flag, then lastName (or displayName), then firstName."""
    surname = person.get('lastName', person.get('displayName', 'UNKNOWN'))
    return [person['sage_user'], surname, person.get('firstName', '')]


all_users.sort(key=_user_sort_key)
print('sorted users')
## Open the report, named after today's date, and write its header row.
## The file object `f` stays open because the crawl that follows appends one
## row per entity to it.
d = Datetime.now()
f = open('synapse_public_inventory_%s.csv' % str(d.date()), 'w')

## One label per field of the rows written later. 'Project' was missing, so
## every label from 'Name' onwards sat one column to the left of its data.
column_labels = ['OwnerId', 'Username', 'First name', 'Email', 'Sage',
                 'SynapseId', 'Parent', 'Project', 'Name', 'Type', 'Date',
                 'Open', 'Access Requirements']
f.write(', '.join(column_labels))
f.write('\n')
# remove names until we find the one that failed.
# found = False
# filtered_users = []
# for user in all_users:
# if found:
# filtered_users.append(user)
# else:
# if user['displayName'].startswith('Daniel Lombra'):
# found = True
# filtered_users.append(user)
# all_users = filtered_users
## Crawl the entities created by each non-Sage user and append one CSV row
## per entity to the report file `f`.
##
## Changes relative to the earlier version:
##   * `f` is closed in a `finally` clause, so the report is closed even when
##     the crawl aborts on the re-raised exception below.
##   * a user record without an 'email' field produces an empty column
##     instead of a KeyError that aborted the whole crawl.
user_entities = []
try:
    for user in all_users:
        try:
            ## Sage-owned entities are not part of this inventory
            if user['sage_user']:
                continue

            entities = get_user_entities(user, 'entity')
            user_entities.append((user, len(entities)))

            print('~' * 80)
            print(user)
            print(entities)
            print('+' * 80)

            for entity in entities:
                ## openness: best effort, '??' when the ACL can't be fetched
                try:
                    entity['is-open'] = classify_by_access(get_acl(entity))
                except Exception as ex:
                    print(ex)
                    entity['is-open'] = '??'
                    sys.stderr.write('couldn\'t get acl for entity %s\n' % (str(entity['id']),))

                ## owning project: best effort, '??' when it can't be resolved
                try:
                    if entity['parentId'] == ROOT:
                        ## a child of ROOT is recorded as its own project
                        entity['project'] = entity['id']
                    elif entity['parentId'] in project_cache:
                        entity['project'] = project_cache[entity['parentId']]
                    else:
                        ancestors = syn.restGET('/entity/%s/ancestors' % (str(entity['id']),))
                        ## second ancestor when there is one, otherwise the first
                        ## (presumably the list starts at the root -- TODO confirm)
                        id_list = ancestors['idList']
                        entity['project'] = id_list[1]['id'] if len(id_list) > 1 else id_list[0]['id']
                        project_cache[entity['parentId']] = entity['project']
                except Exception as ex:
                    print(ex)
                    entity['project'] = '??'
                    sys.stderr.write('couldn\'t get ancestors for entity %s\n' % (str(entity['id']),))

                line = u', '.join((
                    unicode(user['ownerId']),
                    quote(user.get('lastName', user.get('displayName', 'UNKNOWN'))),
                    quote(user.get('firstName', '')),
                    user.get('email', ''),
                    'sage' if user['sage_user'] else 'non-sage',
                    entity['id'],
                    entity['parentId'],
                    entity['project'],
                    quote(entity['name']),
                    get_entity_type(entity),
                    quote(get_date(entity)),
                    entity['is-open'],
                    '|'.join(get_access_types(entity)),
                )).encode('utf-8')
                print(line)
                f.write(line)
                f.write('\n')
        except Exception as ex:
            ## report which user was being processed, then abort the crawl
            sys.stderr.write('Exception processing user %s: %s\n' % (user, ex,))
            raise
finally:
    f.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment