Skip to content

Instantly share code, notes, and snippets.

@mmahbub
Last active November 27, 2020 17:22
Show Gist options
  • Save mmahbub/ce462fedf974e8a12ccc062e002af4b4 to your computer and use it in GitHub Desktop.
Save mmahbub/ce462fedf974e8a12ccc062e002af4b4 to your computer and use it in GitHub Desktop.
# Test split as a two-column frame: the message text and its labelled author.
df_test = pd.DataFrame(test_ds, columns=['text', 'author'])

# Distinct author labels, in sorted order.
# NOTE(review): the labels are read from `df`, not `df_test`; `df` is not
# defined in this snippet -- confirm it is the intended frame.
auth = sorted(set(df['author']))

# Map every author label to the signature strings that author may sign with.
# Alias lists are paired with labels purely by position in the sorted `auth`
# list, so the order below has to follow the sorted order of the labels.
auth_dict = {}
for _position, _aliases in enumerate((
    ['ben', 'benjamin', 'rogers', 'benjamin rogers','ben rogers','br'],
    ['chris', 'dorland','chris dorland','cd'],
    ['drew','fossum','drew fossum','df'],
    ['jeffrey','shankman','jeffrey shankman','js'],
    ['kevin','presto','kevin presto','kp'],
    ['kim','kimberly','watson','kimberly watson','kim watson','kw'],
    ['lynn','blair','lynn blair','lb'],
    ['mark','haedicke','mark haedicke','mh'],
    ['mike','michelle','cash','michelle cash','mike cash','mc'],
    ['phillip','allen','phillip allen'],
)):
    auth_dict[auth[_position]] = _aliases
def untargeted_signature_attack(auth_dict,text, target = ''):
    """Remove or replace an author signature found at the tail of ``text``.

    Only the last ``max_name_len`` characters of ``text`` are searched, where
    ``max_name_len`` is the length of the longest key of ``auth_dict``.
    Authors are tried in the dict's iteration order and, for each author,
    candidate signatures are tried longest first. The search is
    case-insensitive. On the first hit, every occurrence of the matched
    substring inside that tail is replaced and the search stops.

    Args:
        auth_dict: Mapping of author label -> list of lowercase signature
            strings that author may use.
        text: The message to perturb.
        target: Author label whose first listed signature is substituted for
            the match. Empty (the default) deletes the matched signature.

    Returns:
        The perturbed text, or ``text`` unchanged when no signature is found
        in the tail or ``auth_dict`` is empty.

    Raises:
        KeyError: If ``target`` is non-empty and not a key of ``auth_dict``.
    """
    # Truthiness instead of `is not ''`: identity comparison against a string
    # literal is implementation-dependent and warns on Python 3.8+.
    if target:
        target = auth_dict[target][0]

    # Nothing to look for; also keeps max() below from raising on no keys.
    if not auth_dict:
        return text

    max_name_len = max(len(name) for name in auth_dict)
    body = text[-max_name_len:]
    body_l = body.lower()

    # Walk the mapping that was passed in rather than a module-level author
    # list, so the function depends only on its arguments.
    for signatures in auth_dict.values():
        for sig in sorted(signatures, key=len, reverse=True):
            startidx = body_l.find(sig)
            if startidx == -1:
                continue
            # Replace using the original-case slice so the untouched part of
            # the tail keeps its capitalisation.
            matched = body[startidx:startidx + len(sig)]
            return text[:-max_name_len] + body.replace(matched, target)
    return text
# Run the untargeted attack on every test message. The attack function
# returns only the perturbed string, so it fills a single column; assigning
# that result to two columns at once fails.
df_test['perturbed_text'] = df_test['text'].apply(
    lambda text: untargeted_signature_attack(auth_dict, text))

# In untargeted mode a hit deletes a non-empty signature, so a row was
# modified exactly when its perturbed text differs from the original.
df_test['modified'] = df_test['perturbed_text'] != df_test['text']

# The mean of a boolean column is the fraction of True rows. It yields NaN
# (printed as "nan") instead of raising ZeroDivisionError when there are no
# rows to average over.
print('Percentage of perturbation: {:.2f}%\n'.format(df_test['modified'].mean() * 100))
for a in auth:
    author_modified = df_test.loc[df_test['author'] == a, 'modified']
    print('Modified data percentage for ' + a + ' = {:.2f}%\n'.format(author_modified.mean() * 100))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment