Created
May 11, 2015 07:37
-
-
Save byronyi/f2cba140972bba72fcaf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| from __future__ import division, print_function, unicode_literals | |
| import os | |
| from datetime import datetime | |
| from tempfile import mkdtemp | |
| import joblib | |
| import tempfile | |
| from joblib import Memory | |
| from joblib import Parallel, delayed | |
| from joblib.pool import has_shareable_memory | |
| import numpy as np | |
| import scipy.sparse as sp | |
| from matplotlib import pyplot as plt | |
| import pandas as pd | |
# Cache parsed traces on disk so that the several analyses in this script
# parse each trace file only once per run. The directory is a fresh
# temporary directory, so the cache is not shared between separate runs.
cachedir = tempfile.mkdtemp()
try:
    # Current joblib releases name the cache directory argument `location`.
    memory = Memory(location=cachedir, verbose=0)
except TypeError:
    # Older joblib releases only understand the original `cachedir` keyword.
    memory = Memory(cachedir=cachedir, verbose=0)
@memory.cache()
def get_data(path):
    """Load one whitespace-delimited trace file into a DataFrame.

    The 'Start' column becomes the index, and the address and port columns
    are converted to pandas categoricals.

    Returns a ``(frame, trace_name)`` tuple, where ``trace_name`` is the
    file's base name up to its first dot.
    """
    def _parse_start(text):
        # Only the leading HH:MM:SS part of the timestamp is parsed.
        return datetime.strptime(text[:len('00:00:00')], '%H:%M:%S')

    # NOTE(review): `date_parser` is passed without `parse_dates`; it may
    # never be invoked by pandas in that case -- confirm the intent.
    frame = pd.read_csv(path, delim_whitespace=True, index_col='Start',
                        date_parser=_parse_start)

    for column in ('SrcIPaddress', 'SrcP', 'DstIPaddress', 'DstP'):
        frame[column] = pd.Categorical(frame[column])

    trace_name = path.split(os.sep)[-1].split('.')[0]
    return frame, trace_name
def plot_traffic_distribution_over_last_octet(path):
    """Plot the number of trace rows per last octet of the source IP.

    The figure is written to '<trace name>-last_octet.eps' in the current
    working directory.
    """
    frame, trace_name = get_data(path)

    # Take the text after the final dot of each source address as an int.
    source_ips = frame['SrcIPaddress'].astype(str)
    octets = source_ips.map(lambda address: address.split('.')[-1]).astype(int)

    # Re-index the rows by that octet and count the rows in each group.
    rows_per_octet = frame.set_index(octets).groupby(level=0).size()

    plt.figure(figsize=(12, 4))
    rows_per_octet.plot()
    plt.xlabel(trace_name)
    output_name = '{}-last_octet.eps'.format(trace_name)
    plt.savefig(output_name, format='eps', dpi=1000)
def plot_all_trace_traffic_over_time(trace_files):
    """Plot the normalized traffic of every trace, one subplot per trace.

    For each trace the 'Octets' column is summed per index value (in order
    of first appearance) and divided by its maximum, then drawn in its own
    subplot. The figure is written to 'all-traf.eps'.

    Nothing is plotted or saved when ``trace_files`` is empty.
    """
    if not trace_files:
        # plt.subplots() rejects a grid with zero rows.
        return

    # squeeze=False always yields a 2-D array of axes, so a single trace
    # file is handled the same way as several.
    _, axes = plt.subplots(len(trace_files), sharey=True, squeeze=False,
                           figsize=(20, 2 * len(trace_files)))

    for ax, path in zip(axes[:, 0], trace_files):
        df, dname = get_data(path)
        traffic = df.groupby(level=0, sort=False)['Octets'].sum()
        ax.plot(traffic.values / traffic.max())
        ax.set_title('{} normalized traffic vs time'.format(dname))

    plt.savefig('all-traf.eps', format='eps', dpi=1000)
def get_avg_pkt_size(path):
    """Return the mean of the 'Octets' column of one trace.

    Returns a dict with the keys 'trace' (the trace name derived from the
    file name) and 'pkt_size' (the mean 'Octets' value over all rows).
    """
    # get_data() returns a (frame, name) pair; unpack it instead of treating
    # the whole tuple as the DataFrame.
    df, dname = get_data(path)
    return {
        'trace': dname,
        'pkt_size': df['Octets'].mean(),
    }
def get_port_table(path, ports):
    """Summarize how much of a trace's traffic involves the given ports.

    A row counts when its source port or its destination port is in
    ``ports``.

    Returns a dict with the keys 'trace' (trace name), 'traffic' (sum of
    'Octets' over the matching rows) and 'percentage' (that sum as a share
    of the trace's total 'Octets', formatted like '12.34%').
    """
    df, dname = get_data(path)
    wanted = list(ports)

    # Compare on plain boolean arrays so the two masks combine positionally.
    src_hit = df['SrcP'].astype(int).isin(wanted).values
    dst_hit = df['DstP'].astype(int).isin(wanted).values
    mask = src_hit | dst_hit

    port_traffic = df.loc[mask, 'Octets'].sum()
    total_traffic = df['Octets'].sum()
    # Report 0% rather than dividing by zero when the trace has no traffic.
    share = port_traffic / total_traffic if total_traffic else 0.0

    return {
        'trace': dname,
        'traffic': port_traffic,
        'percentage': "{:.2%}".format(share),
    }


# main() refers to this helper as get_app_table; keep that name working.
get_app_table = get_port_table
def plot_traffic_distribution_over_port(path):
    """Plot traffic per source port for the first 1000 port numbers.

    'Octets' is summed per 'SrcP' value and laid out in a dense array with
    one slot per possible port number (0-65535). The first 1000 slots are
    drawn on a logarithmic y axis and the figure is written to
    '<trace name>-port.eps'.
    """
    df, dname = get_data(path)

    per_port = df.groupby('SrcP')['Octets'].sum()
    # Take the port numbers from the group index as explicit integers so
    # they can be used directly as array positions.
    ports = np.asarray(per_port.index).astype(int)

    distribution = np.zeros(65536, dtype=float)
    distribution[ports] = per_port.values

    plt.figure(figsize=(16, 3))
    # +1 to prevent underflow under log scale
    plt.semilogy(distribution[:1000] + 1)
    plt.title(dname)
    plt.xlabel('Src Port Number')
    plt.ylabel('Traffic')
    plt.savefig('{}-port.eps'.format(dname), format='eps', dpi=1000)
def main():
    """Run every analysis over the trace files found under './data'.

    Writes the EPS figures produced by the plotting helpers, plus
    'mean_pkt_size.csv' and one '<app>-traffic.csv' per application group,
    into the current working directory.
    """
    # Find trace files
    trace_files = []
    for path, _, files in os.walk('./data'):
        for name in files:
            if 'trace' in name:
                trace_files.append(os.path.join(path, name))

    if not trace_files:
        # Every later step needs at least one trace to work on.
        print('No trace files found under ./data')
        return

    # Task 1
    for path in trace_files:
        if 'gi' in path:
            plot_traffic_distribution_over_last_octet(path)

    # Plot trace traffic as time series
    plot_all_trace_traffic_over_time(trace_files)

    # Average packet size of traces
    sizes = pd.DataFrame([get_avg_pkt_size(path) for path in trace_files])
    sizes.set_index('trace').to_csv('mean_pkt_size.csv')

    # Plot traffic distribution over all well known ports
    for path in trace_files:
        plot_traffic_distribution_over_port(path)

    # Traffic distribution over some applications
    app_ports = {
        'dns': [53],
        'dhcp': [546, 547],
        'mail': [25, 110, 465, 587, 995],
        'http': [80, 443],
        'db': [1433, 1434, 1521, 3306, 5432, 6379, 11211],
        'flash': [843],
        'rsync': [873],
        'ipv6nfs': [973],
    }
    for app, ports in app_ports.items():
        rows = [get_port_table(path, ports) for path in trace_files]
        table = pd.DataFrame(rows).set_index('trace')
        # Rename and select by column name: the column order pandas derives
        # from a list of dicts differs between versions, so assigning labels
        # by position could swap 'traffic' and 'percentage'.
        traffic_column = '{}-traffic'.format(app)
        table = table.rename(columns={'traffic': traffic_column})
        table = table[[traffic_column, 'percentage']]
        table.to_csv('{}-traffic.csv'.format(app))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment