Created
January 9, 2012 09:10
-
-
Save rohitdholakia/1582105 to your computer and use it in GitHub Desktop.
Version 0.2 of Py script to preprocess tweets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| '''The aim is to take all the tweets of a user and store them in a table. Do this for all the users, and then see what can be done with the result. | |
| The goal is to gather enough information about each user to profile them better. | |
| ''' | |
def regexSub(line):
    """Remove retweet markers, @-mentions and URLs from one tweet line.

    The patterns are read from the module-level names ``regRT``, ``regAt``
    and ``regHttp``; every match is replaced with the empty string, in that
    order.

    Args:
        line: Raw tweet text (a string).

    Returns:
        The cleaned text with leading spaces removed.  Trailing characters,
        including any newline, are left untouched.
    """
    for pattern in (regRT, regAt, regHttp):
        line = re.sub(pattern, '', line)
    # Strip leading spaces only after *all* substitutions: stripping before
    # the URL pass left a stray leading space whenever the text began with
    # a URL.
    return line.lstrip(' ')
def userName(line):
    """Return ``line`` with its first 19 characters dropped.

    NOTE(review): 19 equals len('http://twitter.com/'), so this presumably
    strips a profile-URL prefix and leaves the bare user name -- confirm
    against the input format.  Lines shorter than 19 characters yield ''.
    """
    prefix_length = 19
    remainder = line[prefix_length:]
    return remainder
| import sys,os,itertools,re | |
# Patterns consumed by regexSub().  These are ordinary module-level names;
# a ``global`` statement at module scope has no effect, so none is used.

# Retweet marker.  The word boundaries keep "RT" inside longer words
# (e.g. "SMART") from being deleted.
regRT = re.compile(r'\bRT\b')
# http/https URL up to the next whitespace.  The previous pattern used
# unescaped dots, which could match a space and swallow the word that
# followed the URL.
regHttp = re.compile(r'https?://\S+')
# An "@" followed by any run of letters, digits and the listed symbols.
regAt = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')

# ``izip_longest`` is the Python 2 name, ``zip_longest`` the Python 3 name.
_zip_longest = getattr(itertools, 'izip_longest', None) or itertools.zip_longest

# Usage: script.py INPUT OUTPUT
# INPUT is read three lines at a time; the second line of each group is
# passed to userName() and the third to regexSub().  One
# "<user>\t<tweet>" line is written to OUTPUT per group.
with open(sys.argv[1], 'r') as data, open(sys.argv[2], 'w') as processed:
    # Passing the same file iterator three times yields consecutive
    # 3-line groups; a short final group is padded with None.
    for line1, line2, line3 in _zip_longest(*[data] * 3):
        if line2 is None or line3 is None:
            # Incomplete trailing record: report it and skip instead of
            # crashing on None[2:].
            print('Line2 is  %s' % (line2,))
            print('Line3 is %s' % (line3,))
            continue
        # Drop the first two characters of each line (presumably a
        # one-letter field tag plus a separator -- confirm against the
        # input format).
        line2 = line2[2:]
        line3 = line3[2:]
        try:
            tweet = regexSub(line3)
            user = userName(line2)
        except Exception:
            # Best effort: report the offending record and move on, rather
            # than writing an undefined or stale user/tweet pair.
            print('Line2 is  %s' % (line2,))
            print('Line3 is %s' % (line3,))
            continue
        processed.write(user.strip("\n") + "\t" + tweet)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment