Skip to content

Instantly share code, notes, and snippets.

@vallantin
Last active October 18, 2018 14:37
Show Gist options
  • Save vallantin/a6f3024c8a9a334565234cca8ae5a4b6 to your computer and use it in GitHub Desktop.
Save vallantin/a6f3024c8a9a334565234cca8ae5a4b6 to your computer and use it in GitHub Desktop.
def import_data_set(url):
    """Load the tweets CSV and derive date and text features from it.

    Only columns 0, 2 and 5 of the header-less file are read; they are
    named ``sentiment``, ``date`` and ``tweet``. The following columns are
    then added, in this order:

    - ``weekday``: first space-separated token of ``date``.
    - ``calendar_day``: third token of ``date``, as an integer.
    - ``hour``: the part of the fourth token before the first ``:``,
      as an integer.
    - ``is_weekend``: ``'Y'`` when ``weekday`` is ``'Sat'`` or ``'Sun'``,
      otherwise ``'N'``.
    - ``tweet_size``: number of characters in ``tweet``.
    - ``mention``: 1 when ``tweet`` contains ``'@'``, otherwise 0.
    - ``link``: 1 when ``tweet`` contains ``'://'``, otherwise 0.

    ``sentiment`` is rewritten in place: the value 4 becomes 1 and every
    other value becomes 0. All derived columns except ``tweet_size``, plus
    ``sentiment``, are converted to the ``category`` dtype.

    The month (second token of ``date``) is NOT extracted.

    A short report (row/column counts, dtypes, and which columns contain
    missing values) is printed before returning.

    NOTE(review): the parsing assumes every row has a date shaped like
    ``'Mon Apr 06 22:19:45 PDT 2009'`` and a text tweet; missing values
    are not handled here -- confirm against the data set.

    Args:
        url: Passed straight to ``pd.read_csv`` (path, URL or file-like
            object).

    Returns:
        The resulting ``pandas.DataFrame``.
    """
    df = pd.read_csv(
        url,
        encoding='latin-1',
        usecols=[0, 2, 5],
        header=None,
        names=['sentiment', 'date', 'tweet'],
    )

    # Decompose the date field. Each row becomes a list of tokens; the
    # .str[i] accessor then picks one token per row without a Python loop.
    date_tokens = df['date'].str.split(' ')
    df['weekday'] = date_tokens.str[0]
    df['calendar_day'] = date_tokens.str[2].astype('int64')
    # The fourth token is the time of day ('HH:MM:SS'); keep the hour only.
    df['hour'] = date_tokens.str[3].str.split(':').str[0].astype('int64')

    # Is weekend?
    df['is_weekend'] = (
        df['weekday'].isin(['Sat', 'Sun']).map({True: 'Y', False: 'N'})
    )

    # Only two sentiment labels are expected, so collapse them to a
    # binary flag: 4 -> 1, anything else -> 0.
    df['sentiment'] = (df['sentiment'] == 4).astype('int64')

    # Capture size, mentions and links in each tweet. regex=False makes
    # str.contains a plain substring test, like the `in` operator.
    tweets = df['tweet']
    df['tweet_size'] = tweets.str.len()
    df['mention'] = tweets.str.contains('@', regex=False).astype('int64')
    df['link'] = tweets.str.contains('://', regex=False).astype('int64')

    # Change data types: these columns are labels, not quantities.
    categorical_columns = [
        'weekday',
        'calendar_day',
        'hour',
        'is_weekend',
        'mention',
        'link',
        'sentiment',
    ]
    for column in categorical_columns:
        df[column] = df[column].astype('category')

    print('Data imported | Rows:', df.shape[0], '| Columns:', df.shape[1])
    print('Types:\n', df.dtypes)
    print('Missing data:\n', df.isnull().any())

    return df
# Read the training CSV through import_data_set and keep the resulting
# DataFrame in `data` for the preprocessing steps that follow.
data = import_data_set(
    url='../data/training.1600000.processed.noemoticon.csv',
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment