Useful information from twitter

A small script that searches Twitter for English tweets, extracts short sentences whose main verb phrase is in a simple past/present form (using pattern), drops the subject, grammar-corrects the result with GingerIt, censors profanity with better_profanity, and periodically saves everything to twitter2.json.
# Requires python3.6
from pattern.en import parse
from pattern.web import Twitter
from gingerit.gingerit import GingerIt
from better_profanity import profanity

import json
import string
import re
import time

parser = GingerIt()
def findUrls(text):
    """Return all URLs found in text."""
    regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    urls = re.findall(regex, text)
    return [x[0] for x in urls]
twitter = Twitter(throttle=10)

data = []
index = 0
chunkId = 0
nextTarget = 1


def stringIsAscii(s):
    return all(ord(c) < 128 for c in s)
def parseText(text):
    global nextTarget

    # parse() with relations=True and lemmata=True tags every word as
    # [word, POS, chunk, PNP, relation, lemma].
    tokens = parse(
        text,
        relations=True,
        lemmata=True
    ).split()

    for sentense in tokens:
        # Skip long sentences and ones whose last token is a stray
        # non-period character outside of any chunk.
        if len(sentense) > 32: continue
        if sentense[-1][2] == 'O' and sentense[-1][0] != '.': continue

        convertedSentense = []
        hadVerb = False
        bad = False

        for word in sentense:
            # Only accept sentences whose main verb phrase starts with a
            # VBD/VBN/VBP verb form.
            if not hadVerb and word[4] == 'VP-1':
                hadVerb = True

                if word[1] not in ('VBD', 'VBN', 'VBP'):
                    bad = True
                    break

            token = word[5]

            # Drop the subject; keep lemmas inside the main clause and the
            # original word form everywhere else.
            if 'SBJ-1' in word[4]:
                continue
            elif '-1' not in word[4]:
                token = word[0]

            convertedSentense.append(token)

        if not hadVerb or bad: continue

        # Re-join the tokens (no space before punctuation) and capitalize.
        sentenseString = ''.join([('' if c in string.punctuation else ' ') + c for c in convertedSentense]).strip().capitalize()

        if not stringIsAscii(sentenseString): continue
        if not sentenseString[-1] in string.punctuation: sentenseString = sentenseString + '.'

        print(text)
        print('->', sentenseString)

        # Grammar-correct with GingerIt, then censor profanity.
        correctedString = profanity.censor(parser.parse(sentenseString).get('result'))
        print('=>', correctedString)

        data.append(correctedString)

        # Flush the collected sentences to disk every 16 new entries.
        if len(data) >= nextTarget:
            nextTarget += 16

            with open('twitter2.json', 'w') as outfile:
                json.dump(data, outfile, indent=2)
while True:
    try:
        for tweet in twitter.search('do', start=index, count=100):
            if tweet.language != 'en': continue

            text = tweet.text

            # Skip retweets and tweets with entities, control characters or URLs.
            if text.startswith('RT'): continue
            if '&' in text or '@' in text or '#' in text or '\b' in text: continue
            if findUrls(text): continue

            parseText(text)
    except Exception:
        print('Sleeping...')
        time.sleep(10)
        continue

    index += 1
    time.sleep(0.1)
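For reference, the magic indices in parseText follow pattern's tagged-token layout. Below is a minimal sketch, assuming pattern 3.x is installed, that prints the token lists the script indexes into; the example sentence and exact tags are illustrative only.

# Sketch of the token layout parseText() relies on (assumes pattern 3.x).
# With relations=True and lemmata=True each token is a list of
# [word, POS tag, chunk tag, PNP tag, relation, lemma],
# i.e. word[1], word[2], word[4] and word[5] in the script above.
from pattern.en import parse

for sentence in parse('The cat sat on the mat.', relations=True, lemmata=True).split():
    for word in sentence:
        print(word)  # e.g. ['The', 'DT', 'B-NP', 'O', 'NP-SBJ-1', 'the'] (exact tags may vary)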