mmahbub · November 27, 2020 17:22
diff --git a/enron_attack b/enron_attack
 # Wrap the raw test set in a two-column frame: message text and its author label.
 df_test = pd.DataFrame(test_ds, columns=['text', 'author'])
 # Sorted, de-duplicated author labels; the signature table below is matched
 # to these labels purely by position in this sorted list.
 # NOTE(review): `df` is defined outside this excerpt -- confirm it is the
 # intended frame (rather than df_test).
 auth = sorted(set(df['author']))
 # Map each author label to the signatures that author may sign with.
 # Entry i of the list belongs to auth[i].
 auth_dict = {
     auth[position]: variants
     for position, variants in enumerate([
         ['ben', 'benjamin', 'rogers', 'benjamin rogers', 'ben rogers', 'br'],
         ['chris', 'dorland', 'chris dorland', 'cd'],
         ['drew', 'fossum', 'drew fossum', 'df'],
         ['jeffrey', 'shankman', 'jeffrey shankman', 'js'],
         ['kevin', 'presto', 'kevin presto', 'kp'],
         ['kim', 'kimberly', 'watson', 'kimberly watson', 'kim watson', 'kw'],
         ['lynn', 'blair', 'lynn blair', 'lb'],
         ['mark', 'haedicke', 'mark haedicke', 'mh'],
         ['mike', 'michelle', 'cash', 'michelle cash', 'mike cash', 'mc'],
         ['phillip', 'allen', 'phillip allen'],
     ])
 }

 def untargeted_signature_attack(auth_dict, text, target=''):
     """Strip or swap the first known signature found at the end of ``text``.

     Only the trailing window of ``text`` is searched; the window length is
     the length of the longest key in ``auth_dict``. Matching is done on the
     lower-cased window, so the signatures in ``auth_dict`` are expected to
     be lower-case.

     Args:
         auth_dict: Mapping of author label -> list of candidate signatures.
             Authors are tried in the mapping's iteration order.
         text: The message to perturb.
         target: Optional author label. When given, the matched signature is
             replaced with that author's first listed signature; when left
             as ``''`` the matched signature is simply removed.

     Returns:
         The perturbed text, or ``text`` unchanged if no signature occurs in
         the trailing window.

     Raises:
         KeyError: If ``target`` is non-empty and not a key of ``auth_dict``.
         ValueError: If ``auth_dict`` is empty.
     """
     # Compare by value; `is not ''` tests object identity, which is an
     # interpreter implementation detail for strings.
     replacement = auth_dict[target][0] if target != '' else ''
     window = max(len(name) for name in auth_dict)
     head, tail = text[:-window], text[-window:]
     tail_lower = tail.lower()
     # Iterate the mapping that was passed in rather than a module-level list,
     # so the function depends only on its arguments.
     for signatures in auth_dict.values():
         # Longest candidates first so a full name wins over its parts.
         for sig in sorted(signatures, key=len, reverse=True):
             start = tail_lower.find(sig)
             if start != -1:
                 # Recover the original-case spelling, then replace every
                 # occurrence of that exact spelling inside the window.
                 matched = tail[start:start + len(sig)]
                 return head + tail.replace(matched, replacement)
     return text
  
 # untargeted_signature_attack returns one string per row, so it can only fill
 # a single column; assigning its result to two columns at once fails.
 df_test['perturbed_text'] = df_test['text'].apply(lambda text: untargeted_signature_attack(auth_dict, text))
 # A row counts as modified when the attack changed its text. Removing a
 # non-empty signature always changes the text, so this matches "signature found".
 df_test['modified'] = df_test['perturbed_text'] != df_test['text']
 # The mean of a boolean column is the fraction of True values.
 print('Percentage of perturbation: {:.2f}%\n'.format(df_test['modified'].mean() * 100))
 for a in auth:
     author_rows = df_test[df_test['author'] == a]
     # Report 0 for an author with no test rows instead of dividing by zero.
     share = author_rows['modified'].mean() * 100 if len(author_rows) else 0.0
     print('Modified data percentage for ' + a + ' = {:.2f}%\n'.format(share))
	# Wrap the raw test set in a two-column frame: message text and its author label.
	df_test = pd.DataFrame(test_ds, columns=['text', 'author'])
	# Sorted, de-duplicated author labels; the signature table below is matched
	# to these labels purely by position in this sorted list.
	# NOTE(review): `df` is defined outside this excerpt -- confirm it is the
	# intended frame (rather than df_test).
	auth = sorted(set(df['author']))
	# Map each author label to the signatures that author may sign with.
	# Entry i of the list belongs to auth[i].
	auth_dict = {
	    auth[position]: variants
	    for position, variants in enumerate([
	        ['ben', 'benjamin', 'rogers', 'benjamin rogers', 'ben rogers', 'br'],
	        ['chris', 'dorland', 'chris dorland', 'cd'],
	        ['drew', 'fossum', 'drew fossum', 'df'],
	        ['jeffrey', 'shankman', 'jeffrey shankman', 'js'],
	        ['kevin', 'presto', 'kevin presto', 'kp'],
	        ['kim', 'kimberly', 'watson', 'kimberly watson', 'kim watson', 'kw'],
	        ['lynn', 'blair', 'lynn blair', 'lb'],
	        ['mark', 'haedicke', 'mark haedicke', 'mh'],
	        ['mike', 'michelle', 'cash', 'michelle cash', 'mike cash', 'mc'],
	        ['phillip', 'allen', 'phillip allen'],
	    ])
	}

	def untargeted_signature_attack(auth_dict, text, target=''):
	    """Strip or swap the first known signature found at the end of ``text``.

	    Only the trailing window of ``text`` is searched; the window length is
	    the length of the longest key in ``auth_dict``. Matching is done on the
	    lower-cased window, so the signatures in ``auth_dict`` are expected to
	    be lower-case.

	    Args:
	        auth_dict: Mapping of author label -> list of candidate signatures.
	            Authors are tried in the mapping's iteration order.
	        text: The message to perturb.
	        target: Optional author label. When given, the matched signature is
	            replaced with that author's first listed signature; when left
	            as ``''`` the matched signature is simply removed.

	    Returns:
	        The perturbed text, or ``text`` unchanged if no signature occurs in
	        the trailing window.

	    Raises:
	        KeyError: If ``target`` is non-empty and not a key of ``auth_dict``.
	        ValueError: If ``auth_dict`` is empty.
	    """
	    # Compare by value; `is not ''` tests object identity, which is an
	    # interpreter implementation detail for strings.
	    replacement = auth_dict[target][0] if target != '' else ''
	    window = max(len(name) for name in auth_dict)
	    head, tail = text[:-window], text[-window:]
	    tail_lower = tail.lower()
	    # Iterate the mapping that was passed in rather than a module-level list,
	    # so the function depends only on its arguments.
	    for signatures in auth_dict.values():
	        # Longest candidates first so a full name wins over its parts.
	        for sig in sorted(signatures, key=len, reverse=True):
	            start = tail_lower.find(sig)
	            if start != -1:
	                # Recover the original-case spelling, then replace every
	                # occurrence of that exact spelling inside the window.
	                matched = tail[start:start + len(sig)]
	                return head + tail.replace(matched, replacement)
	    return text

	# untargeted_signature_attack returns one string per row, so it can only fill
	# a single column; assigning its result to two columns at once fails.
	df_test['perturbed_text'] = df_test['text'].apply(lambda text: untargeted_signature_attack(auth_dict, text))
	# A row counts as modified when the attack changed its text. Removing a
	# non-empty signature always changes the text, so this matches "signature found".
	df_test['modified'] = df_test['perturbed_text'] != df_test['text']
	# The mean of a boolean column is the fraction of True values.
	print('Percentage of perturbation: {:.2f}%\n'.format(df_test['modified'].mean() * 100))
	for a in auth:
	    author_rows = df_test[df_test['author'] == a]
	    # Report 0 for an author with no test rows instead of dividing by zero.
	    share = author_rows['modified'].mean() * 100 if len(author_rows) else 0.0
	    print('Modified data percentage for ' + a + ' = {:.2f}%\n'.format(share))