Skip to content

Instantly share code, notes, and snippets.

@AlJohri
Created August 20, 2018 03:19
Show Gist options
  • Save AlJohri/9d1f5c03f195b54db70c0e03a2ae9a65 to your computer and use it in GitHub Desktop.
Save AlJohri/9d1f5c03f195b54db70c0e03a2ae9a65 to your computer and use it in GitHub Desktop.
import os
import csv
import tqdm
import hashlib
import itertools
def load(filepaths, line_limit=None):
    """Load rows from a list of CSV files into one flat list of dicts.

    Args:
        filepaths: sequence of CSV file paths to read (must support len()).
        line_limit: optional per-file cap on rows read; None reads all rows.

    Returns:
        A list of dicts (one per CSV row), in file order then row order.
    """
    data = []
    for i, filepath in enumerate(filepaths):
        print(f'[{i+1} of {len(filepaths)}] loading {filepath}')
        # newline='' is required by the csv module so that embedded
        # newlines inside quoted fields are parsed correctly.
        with open(filepath, newline='') as f:
            reader = csv.DictReader(f)
            # islice(reader, None) iterates the whole file, so line_limit
            # stays optional; tqdm shows progress (total row count unknown).
            for row in tqdm.tqdm(itertools.islice(reader, line_limit)):
                data.append(row)
    return data
def index(data, fields=('author', 'content', 'publish_date')):
    """Index rows by the SHA-1 hash of selected field values.

    Args:
        data: iterable of dict rows (each must contain every name in fields).
        fields: field names whose space-joined values form the hash input.
            A tuple default avoids the mutable-default-argument pitfall;
            callers may still pass a list.

    Returns:
        Dict mapping hex SHA-1 digest -> first row seen with that digest.
        Rows that hash to an already-seen digest are reported and dropped.
    """
    indexed_data = {}
    for row in data:
        string_to_hash = ' '.join(row[f] for f in fields)
        row_hash = hashlib.sha1(string_to_hash.encode('utf-8')).hexdigest()
        if row_hash in indexed_data:
            # Duplicate row (or a genuine collision): keep the first one.
            print(f'HASH COLLISION {row_hash}')
        else:
            indexed_data[row_hash] = row
    return indexed_data
# Compare an old snapshot of the russian-troll-tweets dataset against a
# new export and write any rows that disappeared to deleted_tweets.csv.

print('loading old files')
filepaths = sorted([f'russian-troll-tweets-master/{x}' for x in
                    os.listdir('russian-troll-tweets-master') if x.endswith('.csv')])
old_data = load(filepaths)
print('finished loading old files')

print('loading new file')
new_data = load(['IRAhandle_tweets_v5_sorted.csv'])
print('finished loading new file')

print('index old and new data by hash of author and content')
indexed_old_data = index(old_data)
indexed_new_data = index(new_data)
print("number of records in old data", len(indexed_old_data))
print("number of records in new data", len(indexed_new_data))

# Set difference over the hash keys identifies rows present in one
# snapshot but missing from the other.
old_data_ids = set(indexed_old_data.keys())  # fixed typo: was old_date_ids
new_data_ids = set(indexed_new_data.keys())
deleted_ids = old_data_ids - new_data_ids
new_ids = new_data_ids - old_data_ids
print("number of deleted ids", len(deleted_ids))
print("number of new ids", len(new_ids))

if deleted_ids:
    # newline='' prevents the csv module from emitting \r\r\n line
    # endings on Windows (see csv module docs).
    with open('deleted_tweets.csv', 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=list(old_data[0].keys()))
        writer.writeheader()
        writer.writerows(indexed_old_data[x] for x in deleted_ids)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment