Created
August 20, 2018 03:19
-
-
Save AlJohri/9d1f5c03f195b54db70c0e03a2ae9a65 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import csv | |
import tqdm | |
import hashlib | |
import itertools | |
def load(filepaths, line_limit=None):
    """Read one or more CSV files into a single list of row dicts.

    Parameters
    ----------
    filepaths : sequence of str
        Paths of the CSV files to read, concatenated in order.
    line_limit : int or None
        If given, read at most this many rows from each file (useful for
        quick test runs). None (the default) reads every row, because
        islice(reader, None) iterates the whole file.

    Returns
    -------
    list of dict
        One dict per CSV row, produced by csv.DictReader.
    """
    data = []
    for i, filepath in enumerate(filepaths):
        print(f'[{i+1} of {len(filepaths)}] loading {filepath}')
        # newline='' is the mode the csv module documents for reading, and
        # an explicit utf-8 avoids depending on the platform locale encoding.
        with open(filepath, newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            # tqdm wraps the row iterator to show per-file progress.
            for row in tqdm.tqdm(itertools.islice(reader, line_limit)):
                data.append(row)
    return data
def index(data, fields=('author', 'content', 'publish_date')):
    """Key each row by the SHA-1 hex digest of selected field values.

    Parameters
    ----------
    data : iterable of dict
        Rows as produced by load().
    fields : sequence of str
        Field names whose values, joined by a single space, identify a
        row. (Default is a tuple rather than a list to avoid the shared
        mutable-default-argument pitfall; accepts any sequence.)

    Returns
    -------
    dict
        Maps sha1 hex digest -> row. On a digest collision the FIRST row
        seen wins and a diagnostic line is printed.
    """
    indexed_data = {}
    for row in data:
        string_to_hash = ' '.join(row[f] for f in fields)
        # One-shot form replaces the manual hasher/update/hexdigest dance.
        row_hash = hashlib.sha1(string_to_hash.encode('utf-8')).hexdigest()
        if row_hash in indexed_data:
            print(f'HASH COLLISION {row_hash}')
            # print(f'attempting to add row: {row}')
            # print(f'existing row was: {indexed_data[row_hash]}')
        else:
            indexed_data[row_hash] = row
    return indexed_data
# --- Script body: diff an old snapshot of the dataset against a new file ---

print('loading old files')
# Sort so the load order (and therefore the log output) is deterministic
# regardless of the filesystem's directory-listing order.
filepaths = sorted([f'russian-troll-tweets-master/{x}' for x in
                    os.listdir('russian-troll-tweets-master') if x.endswith('.csv')])
old_data = load(filepaths)
print('finished loading old files')

print('loading new file')
new_data = load(['IRAhandle_tweets_v5_sorted.csv'])
print('finished loading new file')

print('index old and new data by hash of author and content')
indexed_old_data = index(old_data)
indexed_new_data = index(new_data)
print("number of records in old data", len(indexed_old_data))
print("number of records in new data", len(indexed_new_data))

# Set differences of the hash keys reveal which rows disappeared from /
# appeared in the new snapshot. (Renamed from the misspelled old_date_ids.)
old_data_ids = set(indexed_old_data.keys())
new_data_ids = set(indexed_new_data.keys())
deleted_ids = old_data_ids - new_data_ids
new_ids = new_data_ids - old_data_ids
print("number of deleted ids", len(deleted_ids))
print("number of new ids", len(new_ids))

if deleted_ids:
    # newline='' is required by the csv module for writing (otherwise
    # blank lines are interleaved on Windows); utf-8 matches the reader.
    with open('deleted_tweets.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(old_data[0].keys()))
        writer.writeheader()
        for row in (indexed_old_data[x] for x in deleted_ids):
            writer.writerow(row)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment