Skip to content

Instantly share code, notes, and snippets.

@adammcmaster
Last active July 19, 2017 10:19
Show Gist options
  • Save adammcmaster/12a4021ddacf3692867968faec36f93a to your computer and use it in GitHub Desktop.
Save adammcmaster/12a4021ddacf3692867968faec36f93a to your computer and use it in GitHub Desktop.
import csv
# Columns copied from each input row into the output, in output order.
INCLUDED_FIELDS = (
    'classification_id',
    'subject_ids',
    'user_name',
    'metadata',
    'annotations',
)

# Default file names used when the script is run directly.
INPUT_PATH = 'annotate-classifications.csv'
OUTPUT_PATH = 'duplicate-annotate-classifications.csv'


def group_classifications(rows, fields=INCLUDED_FIELDS):
    """Group classification rows by user name, then by subject id.

    Args:
        rows: Iterable of mappings (e.g. ``csv.DictReader`` rows). Each must
            contain ``'user_name'``, ``'subject_ids'`` and every name in
            *fields*.
        fields: Column names to keep from each row, in order.

    Returns:
        A dict ``{user_name: {subject_ids: [record, ...]}}`` where each
        record is a tuple of the row's values for *fields*. Records keep the
        order in which their rows were read.

    Raises:
        KeyError: If a row lacks one of the required columns.
    """
    user_subjects = {}
    for row in rows:
        record = tuple(row[field] for field in fields)
        by_subject = user_subjects.setdefault(row['user_name'], {})
        by_subject.setdefault(row['subject_ids'], []).append(record)
    return user_subjects


def iter_duplicates(user_subjects):
    """Yield every record whose (user, subject) pair occurs more than once.

    Args:
        user_subjects: Nested dict as returned by ``group_classifications``.

    Yields:
        The stored record tuples, group by group, in insertion order.
    """
    # Only the grouped records matter here, so iterate the values directly.
    for by_subject in user_subjects.values():
        for records in by_subject.values():
            if len(records) > 1:
                yield from records


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Copy duplicated classifications from *input_path* to *output_path*.

    A classification counts as duplicated when the same ``user_name`` has
    more than one row for the same ``subject_ids`` value. The output starts
    with an ``INCLUDED_FIELDS`` header row, followed by every such row.
    """
    # newline='' is what the csv module asks for: it keeps line breaks
    # embedded in quoted fields intact on read, and prevents '\r\r\n' row
    # endings (seen as blank rows) on platforms that translate '\n' on write.
    with open(input_path, newline='') as in_f:
        user_subjects = group_classifications(csv.DictReader(in_f))

    with open(output_path, 'w', newline='') as out_f:
        writer = csv.writer(out_f)
        writer.writerow(INCLUDED_FIELDS)
        writer.writerows(iter_duplicates(user_subjects))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment