rohitdholakia · January 9, 2012 09:10
diff --git a/preprocessing02.py b/preprocessing02.py
 '''The aim is to take all the tweets of a user and store them in a table. Do this for all the users, and then let's see what we can do with it.
   The goal is to gather enough information about each user to profile them better. So, let's get started.
 '''
 def regexSub(line):
     '''Return ``line`` with the RT, @-mention and http patterns removed.

     The module-level ``regRT``, ``regAt`` and ``regHttp`` patterns are applied
     in that order. Leading spaces are stripped after the first two
     substitutions and before the http substitution, so anything the http
     pass leaves at the start of the string is kept.
     '''
     without_rt = re.sub(regRT, '', line)
     without_mentions = re.sub(regAt, '', without_rt)
     left_trimmed = without_mentions.lstrip(' ')
     return re.sub(regHttp, '', left_trimmed)
 def userName(line):
     '''Return ``line`` with its first 19 characters dropped.

     NOTE(review): 19 is the length of "http://twitter.com/", so this
     presumably strips a profile-URL prefix and leaves the user name --
     confirm against the input format. Anything after the prefix, including
     a trailing newline, is returned unchanged.
     '''
     prefix_length = 19
     return line[prefix_length:]


 import sys,os,itertools,re
 # Patterns that regexSub() strips from each tweet. They stay module-level
 # names because regexSub() looks them up as globals.
 # Word boundaries keep the retweet marker from being cut out of longer words
 # (a bare 'RT' pattern also rewrote e.g. 'START' to 'STA').
 regRT = re.compile(r'\bRT\b')
 # The dots are escaped so they only match a literal '.'. Unescaped, the
 # optional tail matched the space after a URL and removed the following
 # word along with it.
 regHttp = re.compile(r'(http://)[a-zA-Z0-9]*\.[a-zA-Z0-9/]*(\.[a-zA-Z0-9]*)?')
 regAt = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')

 # izip_longest is the Python 2 name; Python 3 renamed it to zip_longest.
 zip_longest = getattr(itertools, 'izip_longest', None) or itertools.zip_longest

 # argv[1] is the input, read three lines per record; argv[2] is the output.
 # The with-block closes both files even if processing fails part-way.
 with open(sys.argv[1], 'r') as data, open(sys.argv[2], 'w') as processed:
     # Passing the same file iterator three times yields consecutive lines as
     # (line1, line2, line3); a short final group is padded with None.
     for line1, line2, line3 in zip_longest(*[data] * 3):
         if line2 is None or line3 is None:
             # Truncated final record: report it and skip it, rather than
             # slicing None or writing the previous record a second time.
             print('Line2 is  %s' % (line2,))
             print('Line3 is %s' % (line3,))
             continue

         # The first two characters of each line are dropped (assumed to be a
         # field tag plus separator -- confirm against the input format).
         # line1 is read to keep the grouping aligned but is not used.
         line2 = line2[2:]
         line3 = line3[2:]

         tweet = regexSub(line3)
         user = userName(line2)
         # One output line per record: user, a tab, then the cleaned tweet
         # (which keeps whatever line ending line3 had).
         processed.write(user.strip("\n") + "\t" + tweet)
	'''The aim is to take all the tweets of a user and store them in a table. Do this for all the users, and then let's see what we can do with it.
	The goal is to gather enough information about each user to profile them better. So, let's get started.
	'''
	def regexSub(line):
	    '''Return ``line`` with the RT, @-mention and http patterns removed.

	    The module-level ``regRT``, ``regAt`` and ``regHttp`` patterns are
	    applied in that order. Leading spaces are stripped after the first two
	    substitutions and before the http substitution.
	    '''
	    # The body is indented under the def; with the statements flush with
	    # the def line the function had no body and could not be parsed.
	    line = re.sub(regRT, '', line)
	    line = re.sub(regAt, '', line)
	    line = line.lstrip(' ')
	    line = re.sub(regHttp, '', line)
	    return line
	def userName(line):
	    '''Return ``line`` with its first 19 characters dropped.

	    NOTE(review): 19 is the length of "http://twitter.com/", so this
	    presumably strips a profile-URL prefix and leaves the user name --
	    confirm against the input format. Anything after the prefix, including
	    a trailing newline, is returned unchanged.
	    '''
	    prefix_length = 19
	    return line[prefix_length:]


	import sys,os,itertools,re
	# Patterns that regexSub() strips from each tweet. They stay module-level
	# names because regexSub() looks them up as globals.
	# Word boundaries keep the retweet marker from being cut out of longer
	# words (a bare 'RT' pattern also rewrote e.g. 'START' to 'STA').
	regRT = re.compile(r'\bRT\b')
	# The '*' quantifiers are restored so hosts and paths longer than one
	# character match. The dots are escaped so they only match a literal '.';
	# unescaped, the optional tail matched the space after a URL and removed
	# the following word along with it.
	regHttp = re.compile(r'(http://)[a-zA-Z0-9]*\.[a-zA-Z0-9/]*(\.[a-zA-Z0-9]*)?')
	# Quantifiers restored here too, so a whole @-mention is matched rather
	# than requiring exactly one alphanumeric plus one symbol after the '@'.
	regAt = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')

	# izip_longest is the Python 2 name; Python 3 renamed it to zip_longest.
	zip_longest = getattr(itertools, 'izip_longest', None) or itertools.zip_longest

	# argv[1] is the input, read three lines per record; argv[2] is the output.
	# The with-block closes both files even if processing fails part-way.
	with open(sys.argv[1], 'r') as data, open(sys.argv[2], 'w') as processed:
	    # Passing the same file iterator three times yields consecutive lines
	    # as (line1, line2, line3); a short final group is padded with None.
	    # ('[data]3' was not valid syntax; the intended form is *[data] * 3.)
	    for line1, line2, line3 in zip_longest(*[data] * 3):
	        if line2 is None or line3 is None:
	            # Truncated final record: report it and skip it, rather than
	            # slicing None or writing the previous record a second time.
	            print('Line2 is  %s' % (line2,))
	            print('Line3 is %s' % (line3,))
	            continue

	        # The first two characters of each line are dropped (assumed to be
	        # a field tag plus separator -- confirm against the input format).
	        # line1 is read to keep the grouping aligned but is not used.
	        line2 = line2[2:]
	        line3 = line3[2:]

	        tweet = regexSub(line3)
	        user = userName(line2)
	        # One output line per record: user, a tab, then the cleaned tweet
	        # (which keeps whatever line ending line3 had).
	        processed.write(user.strip("\n") + "\t" + tweet)
No results found