Skip to content

Instantly share code, notes, and snippets.

@audhiaprilliant
Created May 28, 2021 09:51
Show Gist options
  • Select an option

  • Save audhiaprilliant/a328fc6476beee8ce865a31629e35827 to your computer and use it in GitHub Desktop.

Select an option

Save audhiaprilliant/a328fc6476beee8ce865a31629e35827 to your computer and use it in GitHub Desktop.
# Import module for data manipulation
import pandas as pd
# Import module for linear algebra
import numpy as np
# Import module for directory
import os
import sys
# Import module for regular expressions
import re
# Import module for network analysis
import networkx as nx
# Import module for creating iterators for efficient looping
import itertools
# Import module for storing collections of data
import collections
# Import module for data viz
import matplotlib.pyplot as plt
# Function to extract mention from chat
def extractMention(x):
    """Return every ``@<digits>`` mention found in a chat message.

    Parameters
    ----------
    x : object
        The message content. Only ``str`` values are scanned.

    Returns
    -------
    list of str or object
        For a string, the list of matches of ``@`` followed by one or more
        digits (the ``@`` is kept in each match; the list is empty when
        nothing matches). Any non-string value (e.g. a missing cell) is
        returned unchanged.
    """
    if isinstance(x, str):
        return re.findall(r'(@\d+)', x)
    # Non-string input is passed through untouched so callers can still
    # recognise it as "no content".
    return x
# Import the data (semicolon-separated export of the chat)
df = pd.read_csv('data/WhatsApp_Chat - Final.csv', sep=';')
print('Dimension data: {} rows and {} columns'.format(len(df), len(df.columns)))
df.head()
# Check the data type
df.info()
# Check the missing value in the data
df.isna().sum()
# Filter the missing value on column of content
df[df['content'].isna()]
# Check the data type and scale measurement
df.select_dtypes(include=['object']).dtypes
# Replace 'You' with the phone number.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the df['noMobile'] selection is a chained in-place operation, which pandas
# warns about and which does not modify df when Copy-on-Write is enabled.
df['noMobile'] = df['noMobile'].replace('You', '193360307006')
# Show the unique phone number
df['noMobile'].unique()
# Extract the phone number by mentions
df['mention'] = df['content'].apply(extractMention)
# Keep only the rows whose content has at least one mention, and renumber
# them from 0. reset_index returns a new frame here instead of mutating the
# filtered selection in place.
dfMentioned = df[df['mention'].str.len() > 0].reset_index(drop=True)
dfMentioned.head()
# Save the source and target phone number based on mentions:
# one (source, target) pair per mention, where source is the sender's
# noMobile value and target is the mentioned '@<digits>' string.
source = []
target = []
# Walk the two columns in lockstep instead of looking rows up by position;
# this avoids rebuilding a row Series for every access and does not depend
# on dfMentioned having a 0..n-1 index.
for sender, mentions in zip(dfMentioned['noMobile'], dfMentioned['mention']):
    for mentioned in mentions:
        source.append(sender)
        target.append(mentioned)
# Create a dataframe
dfSA = pd.DataFrame({'source': source, 'target': target})
# Count how many times each (source, target) pair occurs
dfCombination = (
    dfSA.groupby(['source', 'target'])
    .size()
    .reset_index(name='count')
)
dfCombination.head()
# Adjacency-list view of the pair counts: graph[a][b] holds the count stored
# for the pair (a, b), and the same value is mirrored under graph[b][a].
graph = collections.defaultdict(dict)
for sender, mentioned, count in dfCombination.to_numpy():
    # NOTE(review): if both (a, b) and (b, a) appear as rows, the later row
    # overwrites the earlier count in both directions - confirm this is intended.
    graph[sender][mentioned] = count
    graph[mentioned][sender] = count
# 1. Determine the figure size
plt.figure(figsize=(6, 6))
# 2. Create the graph (one edge per distinct source/target pair)
g = nx.from_pandas_edgelist(dfCombination, source='source', target='target')
# 3. Create a layout for our nodes
layout = nx.spring_layout(g, iterations=50)
# 4. Draw the graph at the positions computed above. Without pos=layout,
#    nx.draw ignores `layout` and positions the nodes on its own.
nx.draw(g, pos=layout)
# Distinct values of the 'source' column, in order of first appearance;
# used later to pick which nodes to size and label
sources = list(pd.unique(dfCombination['source']))
# Distinct values of the 'target' column, used later as a node list
targets = list(pd.unique(dfCombination['target']))
# Number of edges attached to the node that replaced 'You'
g.degree('193360307006')
# 1. Larger canvas for the detailed drawing
plt.figure(figsize=(12, 12))
# 2. Build the graph from the distinct source/target pairs
g = nx.from_pandas_edgelist(dfCombination, source='source', target='target')
# 3. Compute node positions
layout = nx.spring_layout(g, iterations=50)
# 4. Draw the parts we want, in this order:
#    - sources in orange, sized by their number of connections
#    - every target as a small grey node
#    - targets with more than one connection in red
#    - edges thin and grey
#    - labels for sources ONLY
# Circle size for each source: its degree in the graph times 80
source_size = [80 * g.degree(sender) for sender in sources]
nx.draw_networkx_nodes(
    g,
    layout,
    nodelist=sources,
    node_size=source_size,  # one size per entry of `sources`
    node_color='orange',
)
# All targets, uniform size
nx.draw_networkx_nodes(
    g,
    layout,
    nodelist=targets,
    node_size=100,
    node_color='#cccccc',
)
# Targets connected to more than one node are drawn again in red
popular_target = [mentioned for mentioned in targets if g.degree(mentioned) > 1]
nx.draw_networkx_nodes(
    g,
    layout,
    nodelist=popular_target,
    node_size=100,
    node_color='red',
)
nx.draw_networkx_edges(g, layout, edge_color='#cccccc', width=1)
# Label each source with its own identifier; other nodes stay unlabeled
node_labels = {sender: sender for sender in sources}
nx.draw_networkx_labels(g, layout, labels=node_labels)
# 5. Turn off the axis because we don't want it
plt.axis('off')
plt.title('Group ABCDE')
# 6. Tell matplotlib to show it
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment