Last active
February 1, 2018 06:44
-
-
Save impshum/7ed755b84f980490b5ed2df15f363d77 to your computer and use it in GitHub Desktop.
Crap markov chain thing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Setup: pip3 install tweepy markovify text_cleaner
# Create the 5 text files named below (in1, in2, out1, out2, out3)
# Run: python3 run.py
import markovify | |
import tweepy | |
from text_cleaner import keep | |
from text_cleaner.processor.common import ASCII | |
from text_cleaner.processor.misc import URL, ESCAPED_WHITESPACE | |
import re | |
# --- Run-mode switches ------------------------------------------------------
# test_mode: when truthy, the __main__ guard skips the clean/fetch/sieve/merge
#            steps.
# post_mode: when truthy, silly() posts the generated sentence via `api`.
test_mode = 0
post_mode = 0

# --- Twitter credentials (placeholders; fill in before running) -------------
consumer_key = 'XXXX'
consumer_secret = 'XXXX'
access_key = 'XXXX-XXXX'
access_secret = 'XXXX'

# Shared tweepy client used by get_tweets() and silly().
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

# --- Working files ----------------------------------------------------------
# in*  : raw tweet dumps (get_tweets writes "<screen_name>.txt")
# out1/out2 : cleaned text produced by sieve()
# out3 : merged corpus read by silly()
in1, in2 = 'officialjaden.txt', 'realDonaldTrump.txt'
out1, out2 = 'officialjaden_out.txt', 'realDonaldTrump_out.txt'
out3 = 'markov.txt'
def get_tweets(twatter):
    """Download a user's timeline and dump the tweet texts to a file.

    Pages backwards through ``api.user_timeline`` 200 tweets at a time,
    using ``max_id`` to step past the oldest tweet seen so far, until a
    page comes back empty. The text of every tweet is then written, one
    per line, to ``<twatter>.txt`` in the current directory (overwriting
    any existing file).

    Args:
        twatter: Screen name of the account to fetch.

    An account with no tweets produces an empty file instead of raising
    ``IndexError``.
    """
    print('Getting tweets')
    alltweets = []
    request = {'screen_name': twatter, 'count': 200}
    while True:
        new_tweets = api.user_timeline(**request)
        if not new_tweets:
            break
        alltweets.extend(new_tweets)
        # Ask only for tweets strictly older than the last one received.
        request['max_id'] = new_tweets[-1].id - 1

    # Explicit UTF-8 so emoji / non-Latin text cannot trip the locale codec.
    with open(twatter + '.txt', 'w', encoding='utf-8') as f:
        f.writelines(tweet.text + '\n' for tweet in alltweets)
# Hashtags, @mentions, $cashtags, plus any stray '#', '@', '$', '"' or '|'.
_TWITTER_NOISE = re.compile(
    r'(\#[a-zA-Z0-9]+)|(\@[A-Za-z0-9]+)|\$(\w+)|([#@$"|])')

# Literal fragments rewritten after the regex pass, applied top to bottom.
# Longer fragments come before the shorter ones they contain; otherwise the
# short form is consumed first and the long form can never match (e.g. 'http'
# eating the front of 'https://' and leaving 's//' behind).
_REPLACEMENTS = (
    ('https://', ''),
    ('http://', ''),
    ('https:', ''),
    ('http:', ''),
    ('https', ''),
    ('http', ''),
    ('RT : . : ', ''),
    ('RT : - ', ''),
    ('RT _: ', ''),
    ('RT : ', ''),
    ('RT ', ''),
    (': : ', ''),
    (': ', ''),
    (':', ''),
    ('_', ''),
    ('Soo', ''),
    ('!!!', '!'),
)


def _strip_twitter_noise(text):
    """Drop tags/mentions, collapse whitespace, then apply _REPLACEMENTS."""
    text = ' '.join(_TWITTER_NOISE.sub(' ', text).split())
    for old, new in _REPLACEMENTS:
        text = text.replace(old, new)
    return text


def sieve(filein, fileout):
    """Clean a raw tweet dump and append the result to another file.

    Each line of ``filein`` is passed through ``keep(line, [ASCII])`` and
    ``URL.remove`` (text_cleaner; presumably ASCII-only filtering and URL
    stripping - confirm against that library), then stripped of hashtags,
    mentions, cashtags and retweet/URL leftovers. The cleaned lines are
    appended to ``fileout`` separated by single spaces, so the output has
    no line breaks.

    Args:
        filein: Path of the raw tweet file to read.
        fileout: Path of the file to append cleaned text to.
    """
    print('Processing tweets')
    # errors='replace' keeps a stray undecodable byte from aborting the run.
    with open(filein, 'r', encoding='utf-8', errors='replace') as src, \
            open(fileout, 'a', encoding='utf-8') as dst:
        for line in src:
            ascii_only = keep(
                line,
                [ASCII],
            )
            without_urls = URL.remove(ascii_only)
            dst.write(_strip_twitter_noise(without_urls) + ' ')
def merge(fileout1, fileout2, fileout3):
    """Interleave two text files line by line into a third file.

    Output line N is line N of ``fileout2``, a space, then line N of
    ``fileout1``, with trailing whitespace stripped from both. When one
    file is shorter (or empty) the remaining lines of the longer file are
    still written on their own instead of being dropped, and an empty
    half never contributes a stray leading/trailing space.

    Args:
        fileout1: Path of the file supplying the second half of each line.
        fileout2: Path of the file supplying the first half of each line.
        fileout3: Path of the file to (over)write with the merged lines.
    """
    from itertools import zip_longest

    # Read both inputs fully before opening the output for writing.
    with open(fileout1, encoding='utf-8') as xh, \
            open(fileout2, encoding='utf-8') as yh:
        xlines = xh.readlines()
        ylines = yh.readlines()

    with open(fileout3, 'w', encoding='utf-8') as zh:
        for lead, tail in zip_longest(ylines, xlines, fillvalue=''):
            parts = [part for part in (lead.rstrip(), tail.rstrip()) if part]
            zh.write(' '.join(parts) + '\n')
def silly():
    """Generate one Markov-chain sentence from the merged corpus.

    Reads the file named by ``out3``, builds a ``markovify.Text`` model
    with ``state_size=2`` and asks it for a sentence of at most 200
    characters. The sentence is printed, and also posted through
    ``api.update_status`` when ``post_mode`` is truthy.

    ``make_short_sentence`` returns ``None`` when it cannot build a
    suitable sentence; in that case nothing is posted and a short notice
    is printed instead.
    """
    with open(out3, 'r', encoding='utf-8') as f:
        text = f.read()
    text_model = markovify.Text(text, state_size=2)
    twonk = text_model.make_short_sentence(200)
    if twonk is None:
        # Never send an empty status to the API.
        print('Could not generate a sentence from ' + out3)
        return
    if post_mode:
        api.update_status(twonk)
    print(twonk)
def clean():
    """Create or empty the five working files (in1, in2, out1, out2, out3)."""
    for path in (in1, in2, out1, out2, out3):
        # Mode 'w' creates the file if missing and truncates it otherwise.
        with open(path, 'w') as handle:
            handle.write('')
if __name__ == '__main__':
    # NOTE(review): nesting reconstructed from a flattened listing - the
    # fetch/clean pipeline is assumed to sit under `if not test_mode`, with
    # silly() running in both modes. Confirm against the original file.
    if not test_mode:
        clean()
        for screen_name in ("officialjaden", "realDonaldTrump"):
            get_tweets(screen_name)
        for raw_path, cleaned_path in ((in1, out1), (in2, out2)):
            sieve(raw_path, cleaned_path)
        merge(out1, out2, out3)
    silly()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment