-
-
Save intentionally-left-nil/536d32b9388675d7c98b019d524983a5 to your computer and use it in GitHub Desktop.
#!/bin/python3 | |
# Largely copied from http://www.mathewinkson.com/2015/03/delete-old-tweets-selectively-using-python-and-tweepy | |
# However, Mathew's script cannot delete tweets older than something like a year (these tweets are not available from the twitter API) | |
# This script is a complement on first use, to delete old tweets. It uses your twitter archive to find tweets' ids to delete | |
# How to use it : | |
# - download and extract your twitter archive (tweet.js will contain all your tweets with dates and ids) | |
# - put this script in the extracted directory | |
# - complete the secrets to access twitter's API on your behalf and, possibly, modify days_to_keep | |
# - delete the few junk characters at the beginning of tweet.js, until the first '[' (it crashed my json parser) | |
# - review the script !!!! It has not been thoroughly tested, it may have some unexpected behaviors... | |
# - run this script | |
# - forget this script, you can now use Mathew's script for your future deletions | |
# | |
# License : Unlicense http://unlicense.org/ | |
import tweepy | |
import json | |
from datetime import datetime, timedelta, timezone | |
import os | |
from os import path | |
# Twitter API credentials: fill these in before running the script.
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

# Tweets created more than this many days ago are deleted.
days_to_keep = 365

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Timezone-aware cutoff so it can be compared with the aware datetimes
# produced by the "%z" directive below.
cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_to_keep)
print(cutoff_date)

# The script is expected to run from the root of the extracted archive.
data_dir = path.join(os.getcwd(), 'data', 'js', 'tweets')

tweets = []
for filename in os.listdir(data_dir):
    with open(path.join(data_dir, filename), 'r', encoding='UTF-8') as fp:
        js_file = fp.read()

    # Each archive file is JavaScript, not JSON: some junk precedes the
    # array of tweets. Parse from the first '[' instead of blindly dropping
    # the first line, so the exact shape of the prefix does not matter.
    array_start = js_file.find('[')
    if array_start == -1:
        print("skipping " + filename + ": no JSON array found")
        continue
    tweets.extend(json.loads(js_file[array_start:]))

for tweet in tweets:
    created = datetime.strptime(tweet['created_at'], "%Y-%m-%d %H:%M:%S %z")
    if created >= cutoff_date:
        continue  # recent enough, keep it
    try:
        api.destroy_status(tweet['id_str'])
    except Exception as exc:
        # Best effort: report the failure and carry on with the next tweet.
        # Catching Exception (not a bare except) keeps Ctrl-C working, so a
        # mass deletion can still be aborted.
        print("failed to delete " + tweet['id_str'])
        print(exc)
    else:
        print(tweet['created_at'] + " " + tweet['id_str'])
I'm running into an issue with converting the .js file into json, I think, and I'm getting errors regarding the json part of the code. I will keep looking into this issue, but wondered if you had any ideas on what the issue may be.
line 52, in <module>
contents = json.loads(js_file[js_file.find('\n') + 1:])
File "C:\Python3.7\lib\json\__init__.py", line 348, in loads
return _default_decoder.decode(s)
File "C:\Python3.7\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Python3.7\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 2 column 1 (char 1)
Hi,
Thanks for the awesome work.
I stumbled upon many small bugs (among other things related to datetime and json parsing), so I forked the script to a new gist that works.
If anyone is interested : https://gist.github.com/marrakchino/d6bd3438cd87ab419888edbc4dc1f0f5.
Best
This works for me:
https://gist.github.com/bluebossa63/e1dbdad9c0bc4fd625e072bfe304cf42
#!/bin/python3
# Largely copied from http://www.mathewinkson.com/2015/03/delete-old-tweets-selectively-using-python-and-tweepy
# However, Mathew's script cannot delete tweets older than something like a year (these tweets are not available from the twitter API)
# This script is a complement on first use, to delete old tweets. It uses your twitter archive to find tweets' ids to delete
# How to use it :
# - download and extract your twitter archive (tweet.js will contain all your tweets with dates and ids)
# - put this script in the extracted directory
# - complete the secrets to access twitter's API on your behalf and, possibly, modify days_to_keep
# - delete the few junk characters at the beginning of tweet.js, until the first '[' (it crashed my json parser)
# - review the script !!!! It has not been thoroughly tested, it may have some unexpected behaviors...
# - run this script
# - forget this script, you can now use Mathew's script for your future deletions
#
# License : Unlicense http://unlicense.org/
import tweepy
import json
from datetime import datetime, timedelta, timezone
from dateutil.parser import parse
import os
from os import path
# Twitter API credentials: fill these in before running the script.
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

# Tweets created more than this many days ago are deleted.
days_to_keep = 7

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Timezone-aware cutoff, compared below against the parsed creation dates.
cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_to_keep)
print(cutoff_date)

# The script is expected to run from the root of the extracted archive.
data_dir = path.join(os.getcwd(), 'data')

tweets = []
with open(path.join(data_dir, "tweet.js"), 'r', encoding='UTF-8') as fp:
    js_file = fp.read()

# tweet.js is JavaScript, not JSON: a few junk characters precede the array
# of tweets. Parse from the first '[' so the file works both untouched and
# after the prefix has been removed by hand.
array_start = js_file.find('[')
if array_start == -1:
    raise ValueError("tweet.js does not contain a JSON array of tweets")
tweets.extend(json.loads(js_file[array_start:]))

for tweet in tweets:
    # Use the object under the 'tweet' key when present; otherwise treat the
    # entry itself as the tweet instead of failing with a KeyError.
    t = tweet.get('tweet', tweet)
    created = parse(t['created_at'])
    if created >= cutoff_date:
        continue  # recent enough, keep it
    try:
        api.destroy_status(t['id_str'])
    # NOTE(review): tweepy >= 4 appears to have renamed TweepError to
    # TweepyException -- confirm against the installed tweepy version.
    except tweepy.TweepError as e:
        # Best effort: report the failure and carry on with the next tweet.
        print("failed to delete " + t['id_str'])
        print(e.api_code)
        print(e.reason)
    else:
        print(t['created_at'] + " " + t['id_str'])
What changes would be required to make the script delete everything before a cutoff based only on the current year? E.g., keep only the current and previous year, and delete everything older.