audhiaprilliant · May 28, 2021 09:51
diff --git a/social_network_data.py b/social_network_data.py
 # Import module for data manipulation
 import pandas as pd
 # Import module for linear algebra
 import numpy as np
 # Import module for directory
 import os
 import sys
 # Import module for regular expression
 import re
 # Import module for network analysis
 import networkx as nx
 # Import module for creating iterators for efficient looping
 import itertools
 # Import module for storing collections of data
 import collections
 # Import module for data viz
 import matplotlib.pyplot as plt

 # Function to extract mention from chat
 def extractMention(x):
     """Return the '@<digits>' mentions found in a chat message.

     Args:
         x: A cell value from the chat content column.

     Returns:
         For a string, the list of every match of '@' followed by one or
         more digits (each match keeps its leading '@'); the list is empty
         when nothing matches. Any non-string value is returned unchanged.
     """
     # Non-string cells (presumably missing values -- confirm against the
     # data) are handed back untouched
     if not isinstance(x, str):
         return x
     return re.findall(r'(@\d+)', x)

 # Load the exported WhatsApp chat; the file is semicolon-delimited
 df = pd.read_csv('data/WhatsApp_Chat - Final.csv', sep = ';')
 print('Dimension data: {} rows and {} columns'.format(len(df), len(df.columns)))
 # Preview the first rows (only displayed when run interactively)
 df.head()
 # Print the column dtypes and non-null counts
 df.info()
 # Count the missing values per column
 df.isna().sum()
 # Rows whose content is missing
 df[df['content'].isna()]
 # Dtypes of the object columns
 df.select_dtypes(include = ['object']).dtypes
 # Replace the 'You' label with the phone number.
 # The result is assigned back to the column: calling
 # replace(..., inplace = True) on df['noMobile'] is a chained in-place
 # update, which pandas warns about and which leaves df unchanged when
 # Copy-on-Write is active.
 df['noMobile'] = df['noMobile'].replace('You', '193360307006')
 # Show the unique phone numbers
 df['noMobile'].unique()

 # Collect the '@<digits>' mentions of every message into a new column
 df['mention'] = df['content'].apply(extractMention)
 # Keep only the messages that contain at least one mention, renumbering
 # the rows from zero
 hasMention = df['mention'].str.len() > 0
 dfMentioned = df[hasMention].reset_index(drop = True)
 dfMentioned.head()

 # Build one (source, target) pair per mention: the sender of the message
 # is the source and each mentioned number is a target.
 # Iterating the two columns in lockstep avoids materialising a row Series
 # through .loc for every single lookup.
 source = []
 target = []
 for sender, mentions in zip(dfMentioned['noMobile'], dfMentioned['mention']):
     for mentioned in mentions:
         source.append(sender)
         target.append(mentioned)
 # Create a dataframe
 dfSA = pd.DataFrame(
     {
         'source': source,
         'target': target,
     }
 )
 # Count how many times each (source, target) pair occurs
 dfCombination = dfSA.groupby(['source', 'target']).size().reset_index(name = 'count')
 dfCombination.head()
 # Graph representation as an adjacency list, stored in both directions.
 # NOTE(review): when both (a, b) and (b, a) occur in dfCombination, the
 # later row overwrites the earlier count instead of adding to it --
 # confirm whether the counts should be summed.
 graph = collections.defaultdict(dict)
 for src, dst, count in dfCombination.to_numpy():
     graph[src][dst] = count
     graph[dst][src] = count

 # 1. Determine the figure size
 plt.figure(figsize = (6, 6))
 # 2. Create the graph from the counted (source, target) pairs
 g = nx.from_pandas_edgelist(dfCombination, source = 'source', target = 'target')
 # 3. Create a layout for our nodes
 layout = nx.spring_layout(g, iterations = 50)
 # 4. Draw with the layout computed above; previously the layout was
 #    computed but never handed to the drawing call
 nx.draw(g, pos = layout)

 # Distinct values of the source column, reused by the detailed plot
 sources = list(pd.unique(dfCombination['source']))
 # Distinct values of the target column, reused by the detailed plot
 targets = list(pd.unique(dfCombination['target']))
 # Degree of the node for the number that replaced 'You'
 # (only displayed when run interactively)
 g.degree('193360307006')

 # 1. Open a larger figure for the detailed plot
 plt.figure(figsize = (12, 12))
 # 2. Build the graph from the counted (source, target) pairs
 g = nx.from_pandas_edgelist(dfCombination, source = 'source', target = 'target')
 # 3. Compute the node positions
 layout = nx.spring_layout(g, iterations = 50)
 # 4. Draw the parts we want, one layer at a time
 #    - Sources: orange, sized by their degree times 80
 #    - Targets: small and grey
 #    - Targets with more than one connection: red
 #    - Edges: thin and grey
 #    - Labels: sources only
 source_size = [80 * g.degree(name) for name in sources]
 nx.draw_networkx_nodes(
     g,
     layout,
     nodelist = sources,
     node_size = source_size,
     node_color = 'orange',
 )
 # Every target as a small grey node
 nx.draw_networkx_nodes(g, layout, nodelist = targets, node_color = '#cccccc', node_size = 100)
 # Targets with a degree above one are redrawn in red on top
 popular_target = [name for name in targets if g.degree(name) > 1]
 nx.draw_networkx_nodes(g, layout, nodelist = popular_target, node_color = 'red', node_size = 100)
 nx.draw_networkx_edges(g, layout, width = 1, edge_color = '#cccccc')
 # Label each source with its own name
 node_labels = {name: name for name in sources}
 nx.draw_networkx_labels(g, layout, labels = node_labels)
 # 5. Hide the axis
 plt.axis('off')
 plt.title('Group ABCDE')
 # 6. Show the figure
 plt.show()
	# Import module for data manipulation
	import pandas as pd
	# Import module for linear algebra
	import numpy as np
	# Import module for directory
	import os
	import sys
	# Import module for regular expression
	import re
	# Import module for network analysis
	import networkx as nx
	# Import module for creating iterators for efficient looping
	import itertools
	# Import module for storing collections of data
	import collections
	# Import module for data viz
	import matplotlib.pyplot as plt

	# Function to extract mention from chat
	def extractMention(x):
	    """Return the '@<digits>' mentions found in a chat message.

	    Args:
	        x: A cell value from the chat content column.

	    Returns:
	        For a string, the list of every match of '@' followed by one or
	        more digits (each match keeps its leading '@'); the list is empty
	        when nothing matches. Any non-string value is returned unchanged.
	    """
	    # Non-string cells (presumably missing values -- confirm against the
	    # data) are handed back untouched
	    if not isinstance(x, str):
	        return x
	    return re.findall(r'(@\d+)', x)

	# Load the exported WhatsApp chat; the file is semicolon-delimited
	df = pd.read_csv('data/WhatsApp_Chat - Final.csv', sep = ';')
	print('Dimension data: {} rows and {} columns'.format(len(df), len(df.columns)))
	# Preview the first rows (only displayed when run interactively)
	df.head()
	# Print the column dtypes and non-null counts
	df.info()
	# Count the missing values per column
	df.isna().sum()
	# Rows whose content is missing
	df[df['content'].isna()]
	# Dtypes of the object columns
	df.select_dtypes(include = ['object']).dtypes
	# Replace the 'You' label with the phone number.
	# The result is assigned back to the column: calling
	# replace(..., inplace = True) on df['noMobile'] is a chained in-place
	# update, which pandas warns about and which leaves df unchanged when
	# Copy-on-Write is active.
	df['noMobile'] = df['noMobile'].replace('You', '193360307006')
	# Show the unique phone numbers
	df['noMobile'].unique()

	# Collect the '@<digits>' mentions of every message into a new column
	df['mention'] = df['content'].apply(extractMention)
	# Keep only the messages that contain at least one mention, renumbering
	# the rows from zero
	hasMention = df['mention'].str.len() > 0
	dfMentioned = df[hasMention].reset_index(drop = True)
	dfMentioned.head()

	# Build one (source, target) pair per mention: the sender of the message
	# is the source and each mentioned number is a target.
	# Iterating the two columns in lockstep avoids materialising a row Series
	# through .loc for every single lookup.
	source = []
	target = []
	for sender, mentions in zip(dfMentioned['noMobile'], dfMentioned['mention']):
	    for mentioned in mentions:
	        source.append(sender)
	        target.append(mentioned)
	# Create a dataframe
	dfSA = pd.DataFrame(
	    {
	        'source': source,
	        'target': target,
	    }
	)
	# Count how many times each (source, target) pair occurs
	dfCombination = dfSA.groupby(['source', 'target']).size().reset_index(name = 'count')
	dfCombination.head()
	# Graph representation as an adjacency list, stored in both directions.
	# NOTE(review): when both (a, b) and (b, a) occur in dfCombination, the
	# later row overwrites the earlier count instead of adding to it --
	# confirm whether the counts should be summed.
	graph = collections.defaultdict(dict)
	for src, dst, count in dfCombination.to_numpy():
	    graph[src][dst] = count
	    graph[dst][src] = count

	# 1. Determine the figure size
	plt.figure(figsize = (6, 6))
	# 2. Create the graph from the counted (source, target) pairs
	g = nx.from_pandas_edgelist(dfCombination, source = 'source', target = 'target')
	# 3. Create a layout for our nodes
	layout = nx.spring_layout(g, iterations = 50)
	# 4. Draw with the layout computed above; previously the layout was
	#    computed but never handed to the drawing call
	nx.draw(g, pos = layout)

	# Distinct values of the source column, reused by the detailed plot
	sources = list(pd.unique(dfCombination['source']))
	# Distinct values of the target column, reused by the detailed plot
	targets = list(pd.unique(dfCombination['target']))
	# Degree of the node for the number that replaced 'You'
	# (only displayed when run interactively)
	g.degree('193360307006')

	# 1. Open a larger figure for the detailed plot
	plt.figure(figsize = (12, 12))
	# 2. Build the graph from the counted (source, target) pairs
	g = nx.from_pandas_edgelist(dfCombination, source = 'source', target = 'target')
	# 3. Compute the node positions
	layout = nx.spring_layout(g, iterations = 50)
	# 4. Draw the parts we want, one layer at a time
	#    - Sources: orange, sized by their degree times 80
	#    - Targets: small and grey
	#    - Targets with more than one connection: red
	#    - Edges: thin and grey
	#    - Labels: sources only
	source_size = [80 * g.degree(name) for name in sources]
	nx.draw_networkx_nodes(
	    g,
	    layout,
	    nodelist = sources,
	    node_size = source_size,
	    node_color = 'orange',
	)
	# Every target as a small grey node
	nx.draw_networkx_nodes(g, layout, nodelist = targets, node_color = '#cccccc', node_size = 100)
	# Targets with a degree above one are redrawn in red on top
	popular_target = [name for name in targets if g.degree(name) > 1]
	nx.draw_networkx_nodes(g, layout, nodelist = popular_target, node_color = 'red', node_size = 100)
	nx.draw_networkx_edges(g, layout, width = 1, edge_color = '#cccccc')
	# Label each source with its own name
	node_labels = {name: name for name in sources}
	nx.draw_networkx_labels(g, layout, labels = node_labels)
	# 5. Hide the axis
	plt.axis('off')
	plt.title('Group ABCDE')
	# 6. Show the figure
	plt.show()
No results found