Instantly share code, notes, and snippets.
Created
May 28, 2021 09:51
-
Star
0
(0)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
-
Save audhiaprilliant/a328fc6476beee8ce865a31629e35827 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Import module for data manipulation | |
| import pandas as pd | |
| # Import module for linear algebra | |
| import numpy as np | |
| # Import module for directory | |
| import os | |
| import sys | |
| # Import module for regular expression | |
| import re | |
| # Import module for network analysis | |
| import networkx as nx | |
| # Import module for creating iterators for efficient looping | |
| import itertools | |
| # Import module for storing collections of data | |
| import collections | |
| # Import module for data viz | |
| import matplotlib.pyplot as plt | |
| # Function to extract mention from chat | |
def extractMention(x):
    """Return every ``@<digits>`` mention found in a chat message.

    Parameters
    ----------
    x : object
        Message content. Only ``str`` values are searched.

    Returns
    -------
    list of str, or the input unchanged
        For a string: the mentions in order of appearance, each keeping its
        leading ``@`` (an empty list when there are none). Any non-string
        value is handed back untouched.
    """
    # Non-text values are passed through as-is.
    if not isinstance(x, str):
        return x
    mention_pattern = r'(@\d+)'
    return re.findall(mention_pattern, x)
# Import the data (semicolon-separated export of the chat)
df = pd.read_csv('data/WhatsApp_Chat - Final.csv', sep = ';')
print('Dimension data: {} rows and {} columns'.format(len(df), len(df.columns)))
df.head()
# Check the data type
df.info()
# Check the missing value in the data
df.isna().sum()
# Filter the missing value on column of content
df[df['content'].isna()]
# Check the data type and scale measurement
df.select_dtypes(include = ['object']).dtypes
# Replace You with the phone number.
# The result is assigned back instead of calling replace(..., inplace = True)
# on the selected column: that chained in-place form raises a warning on
# recent pandas and leaves df unchanged once Copy-on-Write is enabled.
df['noMobile'] = df['noMobile'].replace('You', '193360307006')
# Show the unique phone number
df['noMobile'].unique()
# Extract the phone number by mentions
df['mention'] = df['content'].apply(extractMention)
# Filter the data in which it has the mention within content.
# .str.len() is the number of mentions found in each row; rows where the
# comparison is not True (no mention, or no text content) are dropped.
# The index is rebuilt on the filtered result directly rather than in place.
dfMentioned = df[df['mention'].str.len() > 0].reset_index(drop = True)
dfMentioned.head()
# Save the source and target phone number based on mentions:
# one (sender, mentioned) pair per mention, so a message that mentions
# three numbers contributes three pairs.
source = []
target = []
# Walk the two columns side by side instead of indexing every row with
# dfMentioned.loc[i][...] several times per mention.
for sender, mentions in zip(dfMentioned['noMobile'], dfMentioned['mention']):
    for mentioned in mentions:
        source.append(sender)
        target.append(mentioned)
# Create a dataframe
dfSA = pd.DataFrame(
    {
        'source': source,
        'target': target
    }
)
# Count the unique possibilities of two columns:
# one row per (source, target) pair with the number of times it occurs
dfCombination = dfSA.groupby(['source','target']).size().reset_index().rename(columns = {0:'count'})
dfCombination.head()
# Graph representation to the adjacency list.
# graph[a][b] holds the count for the pair {a, b} and is mirrored in
# graph[b][a]. Counts are accumulated rather than assigned, so when both
# (a, b) and (b, a) rows exist neither direction's count is overwritten.
graph = collections.defaultdict(dict)
for sender, mentioned, count in dfCombination.to_numpy():
    graph[sender][mentioned] = graph[sender].get(mentioned, 0) + count
    # A self-mention has a single entry; do not add its count twice.
    if mentioned != sender:
        graph[mentioned][sender] = graph[mentioned].get(sender, 0) + count
# 1. Determine the figure size
plt.figure(figsize = (6, 6))
# 2. Create the graph (one edge per distinct source/target pair)
g = nx.from_pandas_edgelist(dfCombination, source = 'source', target = 'target')
# 3. Create a layout for our nodes
layout = nx.spring_layout(g, iterations = 50)
# Draw with the layout computed above; without pos the layout would be
# computed and then never used by the drawing.
nx.draw(g, pos = layout)
# Make a list of the source, we'll use it later
sources = list(dfCombination['source'].unique())
# Make a list of the target, we'll use it later
targets = list(dfCombination['target'].unique())
# How many connections does You have coming out of it?
g.degree('193360307006')
# 1. Determine the figure size
plt.figure(figsize = (12, 12))

# 2. Create the graph
g = nx.from_pandas_edgelist(
    dfCombination,
    source = 'source',
    target = 'target'
)

# 3. Create a layout for our nodes
layout = nx.spring_layout(g, iterations = 50)

# 4. Draw the parts we want, in this order:
#    - sources: orange, sized by their number of connections
#    - targets: small and grey
#    - targets with more than one connection: red
#    - edges: thin and grey
#    - labels: sources only

# Circle size of each source is its degree in the graph times 80
source_size = [80 * g.degree(node) for node in sources]
nx.draw_networkx_nodes(
    g,
    layout,
    nodelist = sources,
    node_size = source_size,  # one size per entry of sources
    node_color = 'orange'
)

# Draw every target
nx.draw_networkx_nodes(
    g,
    layout,
    nodelist = targets,
    node_color = '#cccccc',
    node_size = 100
)

# Draw the popular targets (degree above 1) on top, in red
popular_target = [node for node in targets if g.degree(node) > 1]
nx.draw_networkx_nodes(
    g,
    layout,
    nodelist = popular_target,
    node_color = 'red',
    node_size = 100
)

nx.draw_networkx_edges(g, layout, width = 1, edge_color = '#cccccc')

# Each source is labelled with its own identifier
node_labels = {node: node for node in sources}
nx.draw_networkx_labels(g, layout, labels = node_labels)

# 5. Turn off the axis because we don't want it
plt.axis('off')
plt.title('Group ABCDE')

# 6. Tell matplotlib to show it
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment