vallantin · October 18, 2018 14:37
diff --git a/002savariables.py b/002savariables.py
 def import_data_set(url):
     '''
     Load the tweets CSV and derive per-tweet feature columns.

     Only columns 0, 2 and 5 of the header-less file are read, as
     ``sentiment``, ``date`` and ``tweet``. The date text is split on
     single spaces and decomposed into:

     - weekday      (field 0, kept as text, e.g. 'Sat')
     - calendar_day (field 2, converted to an integer)
     - hour         (leading part of the ':'-separated field 3)
     - is_weekend   ('Y' when weekday is 'Sat' or 'Sun', else 'N')

     The month is not extracted. ``sentiment`` is recoded to 1 where
     the raw label equals 4 and to 0 otherwise, and three tweet
     features are added: ``tweet_size`` (character count), ``mention``
     (1 if the text contains '@') and ``link`` (1 if the text
     contains '://').

     ``sentiment`` and every derived column except ``tweet_size`` are
     converted to the pandas ``category`` dtype. A summary of the
     shape, the dtypes and the columns with missing values is printed.

     Parameters:
         url: anything ``pd.read_csv`` accepts as its first argument.

     Returns:
         The DataFrame holding the three source columns followed by
         the derived ones.
     '''

     df = pd.read_csv(
         url,
         encoding='latin-1',
         usecols=[0, 2, 5],
         header=None,
         names=['sentiment', 'date', 'tweet'],
     )

     # Column-wise string operations replace the per-row Python loops.
     date_fields = df['date'].str.split(' ')
     df['weekday'] = date_fields.str[0]
     df['calendar_day'] = date_fields.str[2].astype('int64')
     df['hour'] = date_fields.str[3].str.split(':').str[0].astype('int64')

     # Is weekend?
     weekend_mask = df['weekday'].isin(['Sat', 'Sun'])
     df['is_weekend'] = weekend_mask.map({True: 'Y', False: 'N'})

     # Only the raw label 4 maps to 1; every other label becomes 0.
     df['sentiment'] = (df['sentiment'] == 4).astype('int64')

     # Size, mentions and links of each tweet. regex=False keeps the
     # search a plain substring test, as str.find was.
     df['tweet_size'] = df['tweet'].str.len()
     df['mention'] = df['tweet'].str.contains('@', regex=False).astype('int64')
     df['link'] = df['tweet'].str.contains('://', regex=False).astype('int64')

     # Change data types
     categorical_columns = ('weekday', 'calendar_day', 'hour', 'is_weekend',
                            'mention', 'link', 'sentiment')
     for column in categorical_columns:
         df[column] = df[column].astype('category')

     print('Data imported | Rows:', df.shape[0], '| Columns:', df.shape[1])
     print('Types:\n', df.dtypes)
     print('Missing data:\n', df.isnull().any())
     return df

 # Load the training tweets and build the derived feature columns
 data = import_data_set(
     '../data/training.1600000.processed.noemoticon.csv'
 )
	def import_data_set(url):
	    '''
	    Import the tweets data set and decompose the date field.

	    Columns 0, 2 and 5 of the header-less CSV are read as
	    ``sentiment``, ``date`` and ``tweet``. From the space-separated
	    date text the function derives:

	    - Weekday      (``weekday``, first field)
	    - Calendar day (``calendar_day``, third field as an integer)
	    - Hour         (``hour``, first ':'-separated part of the fourth field)
	    - Is weekend?  (``is_weekend``, 'Y' for 'Sat'/'Sun', otherwise 'N')

	    No month column is produced. The function also recodes
	    ``sentiment`` (raw label 4 becomes 1, anything else 0), adds
	    ``tweet_size`` (character count), ``mention`` (1 if '@' occurs)
	    and ``link`` (1 if '://' occurs), sets the field types to
	    ``category`` for every derived column except ``tweet_size`` and
	    for ``sentiment``, and prints the shape, dtypes and missing data.

	    Parameters:
	        url: location of the CSV, passed straight to ``pd.read_csv``.

	    Returns:
	        The resulting DataFrame.
	    '''

	    df = pd.read_csv(url,
	                     encoding='latin-1',
	                     usecols=[0, 2, 5],
	                     header=None,
	                     names=['sentiment', 'date', 'tweet'])

	    # Decompose date field; each date is split once and reused.
	    date_fields = [date.split(' ') for date in df.date]
	    df['weekday'] = [fields[0] for fields in date_fields]
	    df['calendar_day'] = [int(fields[2]) for fields in date_fields]
	    # Decompose time: the hour is the part before the first ':'.
	    df['hour'] = [int(fields[3].split(':')[0]) for fields in date_fields]

	    # Is weekend?
	    df['is_weekend'] = ['Y' if day in ('Sat', 'Sun') else 'N'
	                        for day in df['weekday']]

	    # Since we only have 2 sentiment labels, map them to 0/1
	    # for easier reading.
	    df['sentiment'] = [1 if s == 4 else 0 for s in df['sentiment']]

	    # Capture size, mentions and links in each tweet
	    df['tweet_size'] = [len(tweet) for tweet in df['tweet']]
	    df['mention'] = [int('@' in tweet) for tweet in df['tweet']]
	    df['link'] = [int('://' in tweet) for tweet in df['tweet']]

	    # Change data types
	    for column in ('weekday', 'calendar_day', 'hour', 'is_weekend',
	                   'mention', 'link', 'sentiment'):
	        df[column] = df[column].astype('category')

	    # Plain '|' separators: the escaped '\|' form printed a stray backslash.
	    print('Data imported | Rows:', df.shape[0], '| Columns:', df.shape[1])
	    print('Types:\n', df.dtypes)
	    print('Missing data:\n', df.isnull().any())
	    return df

	# Load the training tweets and build the derived feature columns
	data = import_data_set(
	    '../data/training.1600000.processed.noemoticon.csv'
	)