Created
May 18, 2021 09:38
-
-
Save blu3r4y/dc2b2d305ff0cdd995ddcac70330af7e to your computer and use it in GitHub Desktop.
Features used by Dynatrace - SAL - LIT.AI.JKU in the NAD 2021 challenge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2021 | |
# Dynatrace Research | |
# SAL Silicon Austria Labs | |
# LIT Artificial Intelligence Lab | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
import pandas as pd | |
import feature_ip_port as ipp | |
import feature_ratios as rat | |
import feature_segments as seg | |
# Load the example flows; the `label` column is dropped before feature extraction.
df = pd.read_csv("example.csv", parse_dates=["time"]).drop("label", axis="columns")

# The per-octet split is a feature set of its own AND the second input of the
# segment features, so compute it once instead of once per use.
ip_parts = ipp.get_ip_parts(df, columns=["src", "dst"])

features = pd.concat([
    ipp.get_ip_area(df, columns=["src", "dst"], areas=["global", "link_local", "unspecified"]),
    ipp.get_ip_flow_type(df, src="src", dst="dst"),
    ipp.get_ip_binary_match(df, src="src", dst="dst"),
    ipp.get_ip_one_bits(df, columns=["src", "dst"]),
    ip_parts,
    ipp.get_port_area(df, columns=["spt", "dpt"]),
    ipp.get_port_match(df, spt="spt", dpt="dpt"),
    rat.get_cnt_ratios(df, unsafe=False, fill=0),
    rat.get_in_out_ratios(df, fill=0),
    rat.get_cnt_distances(df),
    # hand over a copy, so the frame concatenated above stays untouched
    # no matter what the callee does with its input
    seg.get_segment_features(df, ip_parts.copy(),
                             groups=["src", "dst_3", "dst_2", "dst_1"], lag=1, parallel=False),
], axis="columns")

# take a glimpse at the data
print(features.head().T)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
time | src | dst | spt | dpt | duration | out (bytes) | in (bytes) | proto | app | cnt_dst | cnt_src | cnt_serv_src | cnt_serv_dst | cnt_dst_slow | cnt_src_slow | cnt_serv_src_slow | cnt_serv_dst_slow | cnt_dst_conn | cnt_src_conn | cnt_serv_src_conn | cnt_serv_dst_conn | label | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2020-12-03 04:42:51 | 2887270751 | 2887273992 | 37103 | 53 | 0 | 516870 | 43627 | 17 | 5 | 1 | 3 | 2 | 0 | 2 | 22 | 229 | 0 | 1 | 5 | 2 | 0 | Probing-Nmap | |
2020-12-03 07:16:06 | 2887274048 | 167837957 | 53582 | 161 | 64 | 6559 | 0 | 17 | 35 | 5 | 1 | 131 | 1 | 21 | 2 | 3001 | 1 | 4 | 1 | 98 | 1 | Probing-Port sweep | |
2020-12-03 07:29:07 | 2887270690 | 134744072 | 4324 | 53 | 427 | 107636 | 0 | 17 | 5 | 1 | 2 | 7 | 0 | 32 | 8 | 1269 | 0 | 2 | 3 | 21 | 0 | Normal | |
2020-12-03 07:41:14 | 2887274048 | 167838199 | 53306 | 161 | 0 | 0 | 0 | 17 | 35 | 5 | 1 | 112 | 0 | 21 | 1 | 3037 | 0 | 4 | 1 | 70 | 0 | Normal | |
2020-12-03 07:44:00 | 2887270786 | 134744072 | 0 | 0 | 0 | 178651 | 0 | 1 | 11 | 1 | 2 | 5 | 5 | 5 | 8 | 746 | 746 | 1 | 3 | 11 | 11 | Normal | |
2020-12-03 07:53:57 | 2887270776 | 2887273992 | 60846 | 53 | 820 | 292791 | 0 | 17 | 5 | 1 | 13 | 0 | 0 | 52 | 61 | 167 | 0 | 1 | 8 | 1 | 1 | Normal | |
2020-12-03 09:12:28 | 2887270733 | 134744072 | 60235 | 53 | 0 | 433203 | 0 | 17 | 5 | 1 | 4 | 0 | 0 | 23 | 14 | 739 | 0 | 1 | 3 | 0 | 0 | Normal | |
2020-12-03 10:11:42 | 2887274003 | 2887273982 | 49980 | 161 | 0 | 0 | 47106 | 17 | 35 | 1 | 1 | 0 | 0 | 12 | 1 | 400 | 0 | 1 | 1 | 0 | 0 | Probing-Port sweep | |
2020-12-03 10:17:48 | 2887274003 | 167837961 | 35740 | 161 | 230 | 652054 | 0 | 17 | 35 | 4 | 1 | 3 | 0 | 12 | 2 | 400 | 0 | 5 | 1 | 4 | 0 | Probing-IP sweep | |
2020-12-03 11:42:37 | 2887270719 | 2887273992 | 51223 | 53 | 590 | 0 | 62349 | 17 | 5 | 6 | 10 | 36 | 1 | 89 | 52 | 87 | 1 | 3 | 7 | 28 | 1 | Normal | |
2020-12-03 12:00:48 | 2887270690 | 879905829 | 55718 | 443 | 975 | 535170 | 0 | 6 | 10 | 3 | 1 | 0 | 0 | 43 | 1 | 179 | 0 | 2 | 1 | 2 | 2 | Normal | |
2020-12-03 12:01:58 | 2887270803 | 2887274010 | 52860 | 15002 | 251 | 0 | 9322 | 6 | 24 | 1 | 9 | 0 | 0 | 23 | 42 | 47 | 0 | 1 | 5 | 0 | 0 | Normal | |
2020-12-03 12:04:24 | 2887271429 | 879896140 | 50007 | 3478 | 611 | 0 | 11193 | 17 | 24 | 2 | 1 | 0 | 0 | 39 | 1 | 3 | 0 | 1 | 1 | 0 | 0 | Probing-Nmap | |
2020-12-03 12:09:27 | 2887270791 | 2887271430 | 44421 | 3007 | 401 | 0 | 59202 | 6 | 24 | 4 | 1 | 2 | 219 | 34 | 1 | 6 | 249 | 4 | 1 | 2 | 11 | Normal | |
2020-12-03 12:11:00 | 2887270776 | 2887273992 | 53246 | 53 | 0 | 193804 | 0 | 17 | 5 | 2 | 8 | 13 | 0 | 48 | 60 | 100 | 0 | 2 | 4 | 0 | 0 | Normal | |
2020-12-03 12:20:31 | 2887270775 | 2887273543 | 0 | 0 | 0 | 0 | 9192 | 1 | 11 | 256 | 1 | 3256 | 8 | 256 | 1 | 3256 | 8 | 27 | 1 | 95 | 3 | Normal | |
2020-12-03 12:25:38 | 2887270721 | 2726122003 | 12793 | 443 | 130 | 0 | 0 | 6 | 10 | 3 | 1 | 6 | 0 | 284 | 2 | 2346 | 0 | 2 | 1 | 3 | 0 | Normal | |
2020-12-03 12:31:08 | 2887270786 | 134744072 | 0 | 0 | 690 | 201966 | 0 | 1 | 11 | 1 | 3 | 8 | 8 | 2 | 13 | 749 | 749 | 1 | 3 | 2 | 2 | Normal | |
2020-12-03 12:33:38 | 2887270973 | 2007043601 | 3203 | 443 | 244 | 0 | 50333 | 6 | 10 | 1 | 1 | 2 | 2 | 256 | 2 | 1507 | 2 | 1 | 1 | 2 | 2 | Normal | |
2020-12-03 12:59:31 | 2887270791 | 2887271305 | 43422 | 555 | 76 | 0 | 49478 | 6 | 24 | 7 | 1 | 26 | 340 | 258 | 1 | 29 | 825 | 7 | 1 | 2 | 10 | Normal | |
2020-12-03 13:01:33 | 2887270791 | 2887271304 | 43423 | 5061 | 0 | 0 | 0 | 6 | 24 | 7 | 1 | 19 | 360 | 258 | 1 | 20 | 996 | 7 | 1 | 0 | 7 | Normal | |
2020-12-03 13:16:58 | 2887270718 | 2887191178 | 49971 | 443 | 141 | 0 | 15505 | 6 | 10 | 1 | 1 | 1 | 1 | 77 | 2 | 669 | 1 | 1 | 1 | 1 | 1 | Normal | |
2020-12-03 13:19:31 | 2887271434 | 2887273992 | 32876 | 53 | 756 | 0 | 0 | 17 | 5 | 1 | 9 | 0 | 0 | 5 | 55 | 98 | 0 | 1 | 8 | 0 | 0 | Normal | |
2020-12-03 13:20:05 | 2887270791 | 2887274085 | 33938 | 9929 | 263 | 489289 | 0 | 6 | 24 | 23 | 1 | 3 | 185 | 23 | 1 | 7 | 398 | 18 | 1 | 1 | 6 | Probing-Port sweep | |
2020-12-03 13:22:41 | 2887270766 | 2887273992 | 52751 | 389 | 53 | 347788 | 0 | 6 | 24 | 2 | 10 | 0 | 0 | 86 | 44 | 2 | 0 | 2 | 8 | 1 | 1 | Normal | |
2020-12-03 13:24:21 | 2887274048 | 167838202 | 43313 | 161 | 227 | 0 | 100373 | 17 | 35 | 6 | 1 | 139 | 1 | 21 | 1 | 3014 | 1 | 3 | 1 | 65 | 1 | Probing-Port sweep | |
2020-12-03 13:27:20 | 2887270791 | 2887273518 | 38447 | 443 | 424 | 0 | 8667 | 6 | 10 | 18 | 1 | 20 | 1 | 160 | 1 | 743 | 1 | 4 | 1 | 3 | 0 | Normal | |
2020-12-03 13:34:01 | 2887270773 | 599449625 | 60020 | 443 | 40 | 85926 | 10580 | 6 | 10 | 2 | 1 | 11 | 2 | 32 | 2 | 94 | 2 | 2 | 1 | 11 | 2 | Normal | |
2020-12-03 13:36:29 | 2887270695 | 2887273992 | 56041 | 53 | 238 | 579606 | 0 | 17 | 5 | 3 | 9 | 9 | 1 | 31 | 49 | 297 | 1 | 1 | 5 | 9 | 1 | Normal | |
2020-12-03 13:45:54 | 2887270707 | 2887273992 | 56286 | 53 | 711 | 0 | 0 | 17 | 5 | 2 | 4 | 43 | 1 | 88 | 27 | 89 | 1 | 2 | 3 | 43 | 1 | Normal | |
2020-12-03 13:52:01 | 2887270800 | 2887273992 | 60715 | 53 | 0 | 0 | 0 | 17 | 5 | 2 | 11 | 0 | 0 | 42 | 49 | 135 | 0 | 1 | 8 | 0 | 0 | Normal | |
2020-12-03 13:53:22 | 2887270702 | 2887273992 | 55013 | 53 | 573 | 0 | 7914 | 17 | 5 | 2 | 6 | 1 | 1 | 16 | 50 | 82 | 1 | 2 | 3 | 1 | 1 | Normal | |
2020-12-03 13:56:48 | 2887270721 | 2887273992 | 51926 | 53 | 0 | 0 | 25147 | 17 | 5 | 2 | 6 | 9 | 1 | 290 | 44 | 1353 | 3 | 2 | 6 | 9 | 1 | Normal | |
2020-12-03 14:15:24 | 2887270690 | 134744072 | 60007 | 53 | 560 | 94236 | 0 | 17 | 5 | 1 | 3 | 5 | 2 | 42 | 12 | 1345 | 2 | 1 | 3 | 5 | 2 | DDOS-smurf | |
2020-12-03 14:21:56 | 2887270695 | 3232243202 | 54711 | 53 | 505 | 0 | 64957 | 17 | 5 | 1 | 1 | 2 | 2 | 31 | 1 | 276 | 2 | 1 | 1 | 2 | 2 | Normal | |
2020-12-03 14:38:05 | 2887270719 | 2887273992 | 57507 | 53 | 1155 | 0 | 0 | 17 | 5 | 1 | 5 | 1 | 0 | 196 | 42 | 375 | 0 | 1 | 5 | 1 | 0 | Normal | |
2020-12-03 15:08:45 | 2887271433 | 1755049120 | 54908 | 9980 | 0 | 348644 | 0 | 6 | 24 | 2 | 2 | 2 | 2 | 4 | 2 | 80 | 2 | 2 | 1 | 2 | 2 | Probing-Nmap | |
2020-12-03 15:17:08 | 2887274048 | 167837955 | 57681 | 161 | 255 | 424427 | 17477 | 17 | 35 | 7 | 1 | 167 | 1 | 21 | 2 | 3086 | 1 | 4 | 1 | 61 | 1 | DDOS-smurf | |
2020-12-03 15:25:58 | 2887270733 | 134744072 | 56760 | 53 | 0 | 0 | 1214 | 17 | 5 | 2 | 3 | 3 | 0 | 23 | 12 | 797 | 0 | 1 | 3 | 3 | 0 | DDOS-smurf | |
2020-12-03 15:41:42 | 2887270733 | 134744072 | 59899 | 53 | 179 | 33008 | 0 | 17 | 5 | 3 | 4 | 22 | 1 | 33 | 12 | 1011 | 1 | 1 | 3 | 13 | 1 | Normal | |
2020-12-03 15:45:21 | 2887274048 | 167838202 | 53466 | 161 | 738 | 0 | 0 | 17 | 35 | 7 | 1 | 195 | 1 | 21 | 1 | 2975 | 1 | 5 | 1 | 45 | 0 | Normal | |
2020-12-03 16:03:09 | 2887270733 | 134744072 | 58023 | 53 | 0 | 44753 | 24036 | 17 | 5 | 1 | 3 | 23 | 1 | 71 | 10 | 887 | 1 | 1 | 3 | 20 | 0 | Probing-Nmap | |
2020-12-03 16:03:50 | 2887270749 | 2472317572 | 49663 | 443 | 0 | 504157 | 0 | 6 | 10 | 1 | 1 | 1 | 1 | 26 | 6 | 131 | 1 | 1 | 1 | 1 | 1 | Probing-Nmap | |
2020-12-03 16:07:51 | 2887270713 | 2887273992 | 59468 | 53 | 444 | 514854 | 79828 | 17 | 5 | 1 | 7 | 99 | 0 | 87 | 46 | 258 | 0 | 1 | 5 | 50 | 0 | Normal | |
2020-12-03 16:09:02 | 2887271198 | 2887274010 | 4831 | 15002 | 0 | 211426 | 72762 | 6 | 24 | 1 | 2 | 0 | 0 | 2 | 39 | 39 | 0 | 1 | 2 | 0 | 0 | Probing-Nmap | |
2020-12-03 16:13:42 | 2887271433 | 792239461 | 38558 | 22000 | 0 | 0 | 0 | 6 | 24 | 2 | 1 | 2 | 2 | 11 | 2 | 8 | 2 | 2 | 1 | 2 | 2 | Normal | |
2020-12-03 16:16:29 | 2887270949 | 400236807 | 5012 | 443 | 64 | 223690 | 0 | 6 | 10 | 2 | 1 | 13 | 1 | 75 | 1 | 509 | 1 | 2 | 1 | 14 | 2 | Normal | |
2020-12-03 16:27:30 | 2887274002 | 134744072 | 52748 | 53 | 0 | 0 | 19953 | 17 | 5 | 1 | 3 | 18 | 0 | 1 | 11 | 309 | 0 | 1 | 2 | 14 | 0 | Probing-IP sweep | |
2020-12-03 16:30:17 | 2887270880 | 2887273992 | 55273 | 53 | 0 | 365406 | 52995 | 17 | 5 | 2 | 6 | 3 | 1 | 52 | 43 | 273 | 1 | 1 | 4 | 1 | 1 | Normal | |
2020-12-03 16:51:05 | 2887270721 | 2887273992 | 57952 | 389 | 429 | 0 | 24905 | 17 | 24 | 2 | 6 | 4 | 0 | 44 | 44 | 66 | 0 | 2 | 5 | 4 | 0 | Normal | |
2020-12-03 16:56:42 | 2887270766 | 679349478 | 54465 | 443 | 193 | 1184868 | 43560 | 6 | 10 | 1 | 1 | 0 | 0 | 12 | 2 | 21 | 0 | 1 | 1 | 0 | 0 | Probing-Nmap | |
2020-12-03 17:05:44 | 2887270702 | 2887274039 | 54152 | 8080 | 250 | 1238058 | 0 | 6 | 24 | 2 | 2 | 20 | 1 | 50 | 19 | 89 | 1 | 1 | 2 | 20 | 1 | Normal | |
2020-12-03 17:13:24 | 2887270690 | 3639551595 | 62882 | 53 | 234 | 1036442 | 29645 | 17 | 5 | 6 | 1 | 26 | 0 | 48 | 1 | 1286 | 0 | 5 | 1 | 23 | 0 | Normal | |
2020-12-03 17:21:38 | 2887270775 | 2887273525 | 54014 | 9091 | 0 | 0 | 0 | 6 | 24 | 9 | 1 | 7 | 3 | 11 | 1 | 7 | 3 | 7 | 1 | 2 | 1 | Normal | |
2020-12-03 17:30:09 | 2887270791 | 2887274002 | 33938 | 23502 | 36 | 756026 | 31962 | 6 | 24 | 27 | 2 | 0 | 358 | 27 | 33 | 0 | 1115 | 18 | 1 | 1 | 6 | Normal | |
2020-12-03 17:30:16 | 2887274048 | 167838202 | 44714 | 161 | 441 | 412807 | 19311 | 17 | 35 | 5 | 1 | 101 | 1 | 21 | 1 | 3099 | 1 | 3 | 1 | 63 | 1 | Normal | |
2020-12-03 17:51:12 | 2887270690 | 3627731182 | 58941 | 443 | 0 | 105819 | 28366 | 17 | 24 | 3 | 1 | 2 | 2 | 36 | 15 | 164 | 2 | 3 | 1 | 2 | 2 | Normal | |
2020-12-03 17:52:15 | 2887274048 | 167838201 | 50287 | 161 | 117 | 0 | 9899 | 17 | 35 | 6 | 1 | 103 | 0 | 21 | 1 | 3068 | 0 | 5 | 1 | 57 | 0 | Probing-IP sweep | |
2020-12-03 17:59:43 | 3105873370 | 1746059117 | 63891 | 3221 | 331 | 0 | 56057 | 6 | 24 | 5 | 1 | 0 | 185 | 64 | 1 | 5 | 298 | 4 | 1 | 0 | 11 | Normal | |
2020-12-03 18:03:19 | 2887271299 | 887498523 | 53588 | 443 | 64 | 0 | 34686 | 6 | 10 | 2 | 1 | 10 | 0 | 23 | 3 | 150 | 0 | 2 | 1 | 5 | 0 | Normal | |
2020-12-03 18:31:28 | 2887271429 | 677035416 | 6755 | 443 | 2011 | 0 | 20473 | 6 | 10 | 8 | 1 | 19 | 1 | 107 | 5 | 590 | 1 | 3 | 1 | 7 | 1 | Normal | |
2020-12-03 18:33:13 | 2887271434 | 2887273992 | 56338 | 53 | 447 | 17475 | 10263 | 17 | 5 | 1 | 7 | 4 | 0 | 4 | 44 | 98 | 0 | 1 | 7 | 4 | 0 | Normal | |
2020-12-03 18:37:14 | 2887270791 | 2887274035 | 33938 | 8222 | 0 | 225679 | 74265 | 6 | 24 | 128 | 1 | 9 | 374 | 128 | 3 | 34 | 1159 | 21 | 1 | 0 | 3 | Normal | |
2020-12-03 18:37:59 | 167838205 | 2887274082 | 1027 | 6343 | 338 | 0 | 0 | 17 | 24 | 1 | 1 | 5 | 5 | 1 | 4 | 1199 | 1199 | 1 | 1 | 9 | 9 | Normal | |
2020-12-03 18:50:47 | 2887270775 | 2887273527 | 26046 | 1886 | 0 | 0 | 75629 | 6 | 24 | 8 | 1 | 1722 | 223 | 388 | 2 | 27353 | 499 | 8 | 1 | 98 | 11 | Probing-Nmap | |
2020-12-03 18:51:27 | 3116346117 | 2887273528 | 34420 | 1864 | 184 | 1017498 | 0 | 6 | 24 | 7 | 1 | 7 | 0 | 32 | 3 | 8 | 0 | 7 | 1 | 2 | 0 | Normal | |
2020-12-03 18:54:06 | 2887270775 | 2887273475 | 56296 | 9594 | 0 | 1113505 | 0 | 6 | 24 | 1 | 1 | 26 | 2 | 3 | 2 | 75 | 2 | 1 | 1 | 10 | 2 | Probing-Nmap | |
2020-12-03 18:57:39 | 167837960 | 2887273992 | 6434 | 123 | 399 | 0 | 26336 | 17 | 24 | 1 | 9 | 1 | 1 | 1 | 44 | 39 | 1 | 1 | 9 | 1 | 1 | Normal | |
2020-12-03 19:01:35 | 2887270702 | 2887274010 | 56507 | 15002 | 75 | 8341 | 17981 | 6 | 24 | 1 | 10 | 1 | 1 | 31 | 37 | 41 | 1 | 1 | 7 | 1 | 1 | DDOS-smurf | |
2020-12-03 19:13:30 | 2887273729 | 2887274002 | 58822 | 53 | 126 | 414050 | 0 | 17 | 5 | 1 | 1 | 0 | 0 | 5 | 27 | 55 | 0 | 1 | 1 | 0 | 0 | Normal | |
2020-12-03 19:24:07 | 2887270765 | 311247748 | 0 | 0 | 827 | 0 | 764 | 1 | 11 | 249 | 1 | 340 | 1 | 256 | 1 | 347 | 1 | 77 | 1 | 77 | 0 | DDOS-smurf | |
2020-12-03 19:27:19 | 2887274048 | 167838205 | 59247 | 161 | 369 | 956821 | 0 | 17 | 35 | 7 | 2 | 122 | 1 | 21 | 2 | 3018 | 1 | 3 | 2 | 62 | 1 | Normal | |
2020-12-03 19:31:37 | 2887271434 | 599639059 | 60500 | 10001 | 41 | 74904 | 92254 | 6 | 24 | 6 | 1 | 160 | 0 | 9 | 1 | 160 | 0 | 6 | 1 | 93 | 0 | Normal | |
2020-12-03 19:40:45 | 2887270775 | 2887273547 | 26046 | 1886 | 0 | 360049 | 0 | 6 | 24 | 27 | 1 | 3496 | 152 | 163 | 2 | 7818 | 280 | 23 | 1 | 100 | 4 | Probing-Nmap | |
2020-12-03 20:03:33 | 2887270739 | 391226437 | 51482 | 443 | 273 | 0 | 0 | 6 | 10 | 6 | 1 | 15 | 0 | 91 | 1 | 538 | 0 | 6 | 1 | 11 | 1 | Normal | |
2020-12-03 20:22:51 | 2887189907 | 2887274189 | 0 | 0 | 230 | 0 | 114759 | 1 | 11 | 1 | 1 | 4 | 4 | 1 | 1 | 485 | 485 | 1 | 1 | 4 | 4 | Normal | |
2020-12-03 20:24:30 | 2887270721 | 879573039 | 7808 | 443 | 57 | 0 | 0 | 6 | 10 | 1 | 1 | 5 | 2 | 124 | 14 | 959 | 2 | 1 | 1 | 5 | 2 | Normal | |
2020-12-03 21:24:20 | 2887270765 | 387297544 | 62778 | 32588 | 0 | 110564 | 0 | 6 | 24 | 36 | 1 | 5059 | 189 | 41 | 1 | 10434 | 319 | 27 | 1 | 99 | 4 | Normal | |
2020-12-03 21:31:10 | 1123633412 | 1037569569 | 44031 | 1328 | 374 | 0 | 20714 | 6 | 24 | 22 | 1 | 15 | 32 | 22 | 1 | 15 | 32 | 17 | 1 | 15 | 3 | Probing-IP sweep | |
2020-12-03 21:49:32 | 2887270775 | 2887273475 | 35076 | 1104 | 0 | 109715 | 99420 | 6 | 24 | 1 | 1 | 24 | 0 | 3 | 2 | 80 | 0 | 1 | 1 | 8 | 0 | Normal | |
2020-12-03 22:06:54 | 2887270719 | 2887273992 | 11627 | 53 | 695 | 0 | 57472 | 17 | 5 | 3 | 8 | 22 | 1 | 128 | 53 | 293 | 1 | 1 | 5 | 15 | 1 | DDOS-smurf | |
2020-12-03 22:16:21 | 2887271426 | 2887274039 | 12046 | 8080 | 0 | 762925 | 29390 | 6 | 24 | 1 | 1 | 0 | 0 | 31 | 10 | 4 | 0 | 1 | 1 | 0 | 0 | Normal | |
2020-12-03 22:35:33 | 2887270775 | 2887273475 | 55182 | 9091 | 348 | 0 | 0 | 6 | 24 | 1 | 1 | 31 | 7 | 3 | 2 | 47 | 7 | 1 | 1 | 30 | 7 | Normal | |
2020-12-03 22:37:51 | 2887270814 | 2887273992 | 52280 | 53 | 731 | 248100 | 0 | 17 | 5 | 2 | 9 | 2 | 0 | 29 | 45 | 80 | 0 | 2 | 8 | 2 | 0 | Normal | |
2020-12-03 22:38:49 | 2887270695 | 3758096636 | 57430 | 5355 | 646 | 0 | 0 | 17 | 24 | 3 | 1 | 0 | 0 | 26 | 2 | 28 | 0 | 3 | 1 | 0 | 0 | Normal | |
2020-12-03 22:48:28 | 3105873214 | 2887271232 | 54264 | 5998 | 55 | 337558 | 0 | 6 | 24 | 2 | 1 | 1 | 208 | 2 | 1 | 1 | 208 | 2 | 1 | 1 | 53 | Normal | |
2020-12-03 22:56:20 | 1123633412 | 1037569542 | 44031 | 139 | 154 | 903713 | 11111 | 6 | 20 | 29 | 1 | 6 | 97 | 29 | 1 | 15 | 197 | 19 | 1 | 6 | 4 | Normal | |
2020-12-03 23:03:11 | 2887274048 | 167838198 | 57258 | 161 | 419 | 366 | 0 | 17 | 35 | 6 | 1 | 185 | 0 | 21 | 1 | 3073 | 0 | 4 | 1 | 81 | 0 | Normal | |
2020-12-03 23:12:59 | 2887270971 | 134744072 | 63161 | 53 | 364 | 0 | 25562 | 17 | 5 | 5 | 4 | 188 | 2 | 97 | 13 | 830 | 2 | 2 | 3 | 58 | 2 | Normal | |
2020-12-03 23:15:27 | 59465216 | 220415529 | 55046 | 443 | 332 | 0 | 22776 | 6 | 10 | 52 | 1 | 60 | 0 | 128 | 1 | 62 | 0 | 18 | 1 | 21 | 0 | Normal | |
2020-12-03 23:21:19 | 2887274048 | 167838200 | 49910 | 161 | 413 | 0 | 51248 | 17 | 35 | 6 | 1 | 181 | 1 | 21 | 1 | 3004 | 1 | 5 | 1 | 69 | 1 | Normal | |
2020-12-03 23:23:14 | 2887270711 | 2887273473 | 51280 | 5080 | 574 | 0 | 0 | 6 | 24 | 6 | 1 | 0 | 0 | 14 | 2 | 3 | 0 | 6 | 1 | 0 | 0 | Normal | |
2020-12-03 23:37:04 | 2887270773 | 2887191179 | 64916 | 443 | 2672 | 1360504 | 105474 | 6 | 10 | 1 | 1 | 1 | 1 | 25 | 8 | 165 | 1 | 1 | 1 | 1 | 1 | Probing-IP sweep | |
2020-12-04 00:02:07 | 2887270803 | 3423402026 | 56458 | 443 | 639 | 1799779 | 109907 | 6 | 10 | 4 | 1 | 9 | 0 | 57 | 1 | 473 | 0 | 4 | 1 | 9 | 0 | Normal | |
2020-12-04 00:37:16 | 2887270825 | 2887273992 | 49620 | 53 | 2 | 1423168 | 12959 | 17 | 5 | 6 | 4 | 4 | 1 | 31 | 30 | 46 | 1 | 1 | 1 | 1 | 1 | Normal | |
2020-12-04 01:03:03 | 2887270711 | 2887273482 | 46430 | 443 | 437 | 311589 | 0 | 6 | 10 | 58 | 1 | 140 | 2 | 86 | 1 | 597 | 2 | 23 | 1 | 49 | 2 | Normal | |
2020-12-04 01:10:57 | 2887274135 | 2887274010 | 64286 | 15002 | 632 | 816977 | 47380 | 6 | 24 | 1 | 6 | 1 | 1 | 2 | 17 | 41 | 1 | 1 | 7 | 1 | 1 | Normal | |
2020-12-04 01:49:47 | 2887270775 | 2887273726 | 26046 | 1886 | 233 | 0 | 560 | 6 | 24 | 253 | 1 | 6232 | 256 | 388 | 1 | 12496 | 484 | 24 | 1 | 100 | 5 | Probing-Port sweep | |
2020-12-04 01:52:03 | 2887270721 | 2887060099 | 1310 | 18001 | 0 | 0 | 0 | 6 | 24 | 3 | 1 | 1 | 1 | 63 | 1 | 29 | 1 | 2 | 1 | 1 | 1 | Normal | |
2020-12-04 02:50:45 | 2887270775 | 2887273475 | 47416 | 2968 | 210 | 366217 | 0 | 6 | 24 | 1 | 1 | 10 | 3 | 11 | 2 | 63 | 3 | 1 | 1 | 8 | 3 | Normal |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2021 | |
# Dynatrace Research | |
# SAL Silicon Austria Labs | |
# LIT Artificial Intelligence Lab | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
from typing import List | |
from ipaddress import IPv4Address | |
import numpy as np | |
import pandas as pd | |
def get_ip_area(df: pd.DataFrame, columns: List[str], areas: List[str]) -> pd.DataFrame:
    """
    Flag, for every address column, which ip address blocks an address falls into

    :param df: original cleaned data frame
    :param columns: the columns to process (most likely `["src", "dst"]`)
    :param areas: the areas to process, one or more of
        `multicast, private, global, unspecified, reserved, loopback, link_local`
    :return: a frame indexed like `df` with one boolean column
        `{column}_{area}` per requested column / area combination
    """
    flags = pd.DataFrame(index=df.index)
    for name in columns:
        # parse once per column, then query each `is_<area>` property on the parsed objects
        addresses = df[name].map(IPv4Address)
        for area in areas:
            attribute = f"is_{area}"
            membership = addresses.map(lambda address: getattr(address, attribute))
            flags[f"{name}_{area}"] = membership.astype(bool)
    return flags
# noinspection PyUnresolvedReferences | |
def get_ip_flow_type(df: pd.DataFrame, src: str = "src", dst: str = "dst") -> pd.DataFrame:
    """
    One-hot encode the direction of a flow, derived from whether
    its source and its destination are global addresses

    :param df: original cleaned data frame
    :param src: the column name of the source address
    :param dst: the column name of the destination address
    :return: a frame indexed like `df` with the boolean columns
        `is_inter, is_ingress, is_egress, is_intra`
    """
    from_global = df[src].apply(lambda address: IPv4Address(address).is_global)
    to_global = df[dst].apply(lambda address: IPv4Address(address).is_global)
    from_local = ~from_global
    to_local = ~to_global

    # every (source, destination) combination maps to exactly one flow type
    combinations = {
        "is_inter": (from_global, to_global),
        "is_ingress": (from_global, to_local),
        "is_egress": (from_local, to_global),
        "is_intra": (from_local, to_local),
    }

    flows = pd.DataFrame(index=df.index)
    for name, (source_side, destination_side) in combinations.items():
        flows[name] = np.logical_and(source_side, destination_side).astype(bool)

    # ensure one-hot encoding
    assert np.all(flows.sum(axis=1) == 1)
    return flows
def get_ip_binary_match(df: pd.DataFrame, src: str = "src", dst: str = "dst") -> pd.DataFrame:
    """
    Compute the number of bits that are equal in src and dst,
    from left to right, stopping at the first mismatch,
    which helps identifying how similar two addresses are

    :param df: original cleaned data frame
    :param src: the column name of the source address
    :param dst: the column name of the destination address
    :return: a frame indexed like `df` with a single `uint8` column `ip_match`,
        i.e. 32 minus the bit length of `src XOR dst`
    """
    ips = df[[src, dst]].to_numpy()

    # a bitwise XOR has its highest set bit exactly at the first mismatch,
    # so its bit length is the number of bits from that mismatch onwards
    mismatch = np.bitwise_xor(ips[:, 0], ips[:, 1])

    # switch to float *before* adding one: if the addresses are stored as
    # unsigned 32-bit integers, `0xFFFFFFFF + 1` would wrap around to zero
    # and the logarithm below would no longer yield the bit length
    significant_bits = np.ceil(np.log2(mismatch.astype("float64") + 1.0)).astype("uint8")

    result = pd.DataFrame(index=df.index)
    result["ip_match"] = np.uint8(32) - significant_bits
    return result
def get_ip_one_bits(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    Replace every number (preferably, ip address) by its number of set bits

    :param df: original cleaned data frame
    :param columns: the columns to process (most likely `["src", "dst"]`)
    :return: a `uint8` frame indexed like `df` with one `{column}_one_bits` column per input column
    """
    # count the bits of all requested columns in a single vectorized call
    bit_counts = _hamming_weight(df.loc[:, columns].to_numpy())
    labels = [f"{name}_one_bits" for name in columns]
    return pd.DataFrame(bit_counts, index=df.index, columns=labels).astype("uint8")
def get_ip_parts(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    Split an ip address into its four bytes, indexed from lowest to highest,
    e.g. `192.168.0.1` becomes `{"_0": 1, "_1": 0, "_2": 168, "_3": 192}`

    :param df: original cleaned data frame
    :param columns: the columns to process (most likely `["src", "dst"]`)
    :return: a `uint8` frame indexed like `df` with the columns
        `{column}_0` to `{column}_3` for every input column
    """
    parts = pd.DataFrame(index=df.index)
    for name in columns:
        numbers = df[name].to_numpy()
        for position in range(4):
            # shift the wanted byte down to the lowest position, then mask it out
            parts[f"{name}_{position}"] = (numbers >> (8 * position)) & 0xFF
    return parts.astype("uint8")
def get_port_area(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    One-hot encode port numbers by the range they fall into

    ```
    common:      0 to 1023
    registered:  1024 to 49151
    ephemeral:   49152 to 65535
    ```

    :param df: original cleaned data frame
    :param columns: the columns to process (most likely `["spt", "dpt"]`)
    :return: a frame indexed like `df` with the boolean columns `{column}_common`,
        `{column}_registered` and `{column}_ephemeral` for every input column
    """
    ranges = pd.DataFrame(index=df.index)
    for name in columns:
        ports = df[name]
        # the outer ranges are open-ended, only the middle one is bounded on both sides
        is_common = ports <= 1023
        is_registered = ports.between(1024, 49151)
        is_ephemeral = ports >= 49152
        ranges[f"{name}_common"] = is_common.astype(bool)
        ranges[f"{name}_registered"] = is_registered.astype(bool)
        ranges[f"{name}_ephemeral"] = is_ephemeral.astype(bool)
    return ranges
# noinspection PyUnresolvedReferences | |
def get_port_match(df: pd.DataFrame, spt: str = "spt", dpt: str = "dpt") -> pd.DataFrame:
    """
    Flag the rows whose source and destination port are identical

    :param df: original cleaned data frame
    :param spt: the column name of the source port
    :param dpt: the column name of the destination port
    :return: a frame indexed like `df` with a single boolean column `port_match`
    """
    same_port = df[spt].eq(df[dpt]).astype(bool)
    matches = pd.DataFrame(index=df.index)
    matches["port_match"] = same_port
    return matches
HAM_M1 = np.uint64(0x5555555555555555) | |
HAM_M2 = np.uint64(0x3333333333333333) | |
HAM_M4 = np.uint64(0x0f0f0f0f0f0f0f0f) | |
HAM_H01 = np.uint64(0x0101010101010101) | |
def _hamming_weight(x): | |
""" | |
efficient implementation of the hamming weight to | |
find the number of one bits of a number | |
(c) https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation | |
""" | |
x = x - ((x >> 1) & HAM_M1) | |
x = (x & HAM_M2) + ((x >> 2) & HAM_M2) | |
x = (x + (x >> 4)) & HAM_M4 | |
x = (x * HAM_H01) >> 56 | |
return x |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2021 | |
# Dynatrace Research | |
# SAL Silicon Austria Labs | |
# LIT Artificial Intelligence Lab | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
import numpy as np | |
import pandas as pd | |
def get_cnt_ratios(df: pd.DataFrame, unsafe: bool = False, fill: float = 0) -> pd.DataFrame:
    """
    Get various ratios between cnt values.

    This function expects the following column names to be present:
        cnt_src, cnt_src_slow, cnt_src_conn,
        cnt_dst, cnt_dst_slow, cnt_dst_conn,
    and, if `unsafe` is set, additionally:
        cnt_serv_src, cnt_serv_src_slow, cnt_serv_src_conn,
        cnt_serv_dst, cnt_serv_dst_slow, cnt_serv_dst_conn

    :param df: the original cleaned data frame
    :param unsafe: also compute ratios for columns where the divisor can be zero
    :param fill: fill invalid divisions with this number
    :return: a `float32` frame indexed like `df` with one column per ratio
    :raises AssertionError: if a divisor that is divided by directly is not
        strictly positive, or if a resulting ratio is not finite or not below 1e12
    """
    # every column that is divided by directly has to be strictly positive
    # (the original listed `cnt_dst_slow` twice and left out `cnt_dst_conn`
    # in the src / dst section)
    for divisor in ("cnt_src_slow", "cnt_src_conn", "cnt_dst", "cnt_dst_slow", "cnt_dst_conn"):
        assert np.all(df[divisor] > 0)

    res = pd.DataFrame(index=df.index)

    # ratios relative to slow and conn
    ##################################

    res["relative_cnt_src_to_slow"] = df["cnt_src"] / df["cnt_src_slow"]
    res["relative_cnt_src_to_conn"] = df["cnt_src"] / df["cnt_src_conn"]

    res["relative_cnt_dst_to_slow"] = df["cnt_dst"] / df["cnt_dst_slow"]
    res["relative_cnt_dst_to_conn"] = df["cnt_dst"] / df["cnt_dst_conn"]

    # the cnt_serv_* columns can be zero, hence they are only ever
    # divided by on request, and then with the zero-safe division
    if unsafe:
        res["relative_cnt_serv_src_to_slow"] = _finite_divide(df["cnt_serv_src"], df["cnt_serv_src_slow"], fill)
        res["relative_cnt_serv_src_to_conn"] = _finite_divide(df["cnt_serv_src"], df["cnt_serv_src_conn"], fill)

        res["relative_cnt_serv_dst_to_slow"] = _finite_divide(df["cnt_serv_dst"], df["cnt_serv_dst_slow"], fill)
        res["relative_cnt_serv_dst_to_conn"] = _finite_divide(df["cnt_serv_dst"], df["cnt_serv_dst_conn"], fill)

    # src / dst ratios
    ##################

    res["ratio_cnt_src_dst"] = df["cnt_src"] / df["cnt_dst"]
    res["ratio_cnt_src_dst_slow"] = df["cnt_src_slow"] / df["cnt_dst_slow"]
    res["ratio_cnt_src_dst_conn"] = df["cnt_src_conn"] / df["cnt_dst_conn"]

    if unsafe:
        res["ratio_cnt_serv_src_dst"] = _finite_divide(df["cnt_serv_src"], df["cnt_serv_dst"], fill)
        res["ratio_cnt_serv_src_dst_slow"] = _finite_divide(df["cnt_serv_src_slow"], df["cnt_serv_dst_slow"], fill)
        res["ratio_cnt_serv_src_dst_conn"] = _finite_divide(df["cnt_serv_src_conn"], df["cnt_serv_dst_conn"], fill)

    # conn to slow ratios
    #####################

    res["ratio_cnt_dst_conn_slow"] = df["cnt_dst_conn"] / df["cnt_dst_slow"]
    res["ratio_cnt_src_conn_slow"] = df["cnt_src_conn"] / df["cnt_src_slow"]

    if unsafe:
        res["ratio_cnt_serv_src_conn_slow"] = _finite_divide(df["cnt_serv_src_conn"], df["cnt_serv_src_slow"], fill)
        res["ratio_cnt_serv_dst_conn_slow"] = _finite_divide(df["cnt_serv_dst_conn"], df["cnt_serv_dst_slow"], fill)

    res = res.astype("float32")  # save some gpu memory

    # make sure that our values are finite and not too big
    assert np.all(np.isfinite(res)) and np.all(res < 1e+12)

    return res
def get_in_out_ratios(df: pd.DataFrame, fill: float = 0) -> pd.DataFrame:
    """
    Get the in and out traffic per duration, and the ratio between in and out traffic.

    This function expects the following column names to be present:
        in (bytes), out (bytes), duration

    :param df: the original cleaned data frame
    :param fill: fill invalid divisions with this number
    :return: a `float32` frame indexed like `df` with the columns
        `in_bytes_per_duration, out_bytes_per_duration, ratio_in_out_bytes`
    """
    # (result column, numerator column, denominator column)
    quotients = (
        ("in_bytes_per_duration", "in (bytes)", "duration"),
        ("out_bytes_per_duration", "out (bytes)", "duration"),
        ("ratio_in_out_bytes", "in (bytes)", "out (bytes)"),
    )

    ratios = pd.DataFrame(index=df.index)
    for target, numerator, denominator in quotients:
        ratios[target] = _finite_divide(df[numerator], df[denominator], fill)

    ratios = ratios.astype("float32")  # save some gpu memory

    # make sure that our values are finite and not too big
    assert np.all(np.isfinite(ratios)) and np.all(ratios < 1e+12)

    return ratios
def get_cnt_distances(df: pd.DataFrame) -> pd.DataFrame:
    """
    Get various distances between cnt values.

    This function expects the following column names to be present:
        cnt_src, cnt_src_slow, cnt_src_conn,
        cnt_dst, cnt_dst_slow, cnt_dst_conn,
        cnt_serv_src, cnt_serv_src_slow, cnt_serv_src_conn,
        cnt_serv_dst, cnt_serv_dst_slow, cnt_serv_dst_conn

    :param df: the original cleaned data frame
    :return: an `int64` frame indexed like `df` with one `diff_*` column per pair of cnt columns
    """
    # (result column, minuend column, subtrahend column)
    differences = (
        # src dst differences
        ("diff_cnt_src_dst", "cnt_src", "cnt_dst"),
        ("diff_cnt_src_dst_slow", "cnt_src_slow", "cnt_dst_slow"),
        ("diff_cnt_src_dst_conn", "cnt_src_conn", "cnt_dst_conn"),
        ("diff_cnt_serv_src_dst", "cnt_serv_src", "cnt_serv_dst"),
        ("diff_cnt_serv_src_dst_slow", "cnt_serv_src_slow", "cnt_serv_dst_slow"),
        ("diff_cnt_serv_src_dst_conn", "cnt_serv_src_conn", "cnt_serv_dst_conn"),
        # conn slow differences
        ("diff_dst_conn_slow", "cnt_dst_conn", "cnt_dst_slow"),
        ("diff_src_conn_slow", "cnt_src_conn", "cnt_src_slow"),
        ("diff_serv_src_conn_slow", "cnt_serv_src_conn", "cnt_serv_src_slow"),
        ("diff_serv_dst_conn_slow", "cnt_serv_dst_conn", "cnt_serv_dst_slow"),
    )

    # collect the columns that are actually used, keeping first-seen order
    needed = []
    for _, minuend, subtrahend in differences:
        for column in (minuend, subtrahend):
            if column not in needed:
                needed.append(column)

    # Widen to signed 64-bit so that differences of unsigned counts cannot
    # wrap around. Only the cnt columns are converted: casting the whole
    # frame would fail on unrelated non-numeric columns (e.g. a text label).
    counts = df[needed].astype("int64")

    res = pd.DataFrame(index=df.index)
    for target, minuend, subtrahend in differences:
        res[target] = counts[minuend] - counts[subtrahend]

    return res
def _finite_divide(a: np.ndarray, b: np.ndarray, fill: float = 0) -> np.ndarray: | |
"""Divides `a / b` but will fix `0 / 0` and `1 / 0` to `fill` (default: 0)""" | |
with np.errstate(divide="ignore", invalid="ignore"): | |
c = np.true_divide(a, b) | |
c[c == np.inf] = fill | |
c = np.nan_to_num(c, nan=fill) | |
return c |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2021 | |
# Dynatrace Research | |
# SAL Silicon Austria Labs | |
# LIT Artificial Intelligence Lab | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
from typing import List | |
from functools import partial | |
from multiprocessing import Pool, cpu_count | |
import numpy as np | |
import pandas as pd | |
def get_segment_features(df_clean: pd.DataFrame, df_ip: pd.DataFrame, groups: List[str], lag: int,
                         parallel: bool = True) -> pd.DataFrame:
    """
    Compute per-row segment metrics for every group of the data frame.

    The rows are grouped by the `groups` columns. Within each group, rows whose
    time stamps follow each other closely (see `lag`) are clustered into one segment.
    Every row then receives the length of its segment and the number of
    unique dst, spt, dpt, app values within that segment.

    Example: With `lag = 1`, `times = [1, 3, 4, 5, 7, 8, 15]` is split into the segments
    `[[1], [3, 4, 5], [7, 8], [15]]` with the segment lengths `[1, 3, 2, 1]`.
    With `lag = 2`, points farther apart are merged as well, which yields the segments
    `[[1, 3, 4, 5, 7, 8], [15]]`.

    :param df_clean: the original cleaned data frame
    :param df_ip: the "ip_parts" features data frame
    :param groups: a list of columns to group by
    :param lag: the largest time difference between two consecutive observations
        that still keeps them in the same segment
    :param parallel: whether to process groups in parallel or not
    :return: the segment metrics of all groups, ordered by the original index
    """
    merged = pd.concat([df_clean, df_ip], axis="columns")

    # integer-divide the int64 view of the time column by 10^9
    # (presumably nanosecond time stamps that become whole seconds - confirm
    # against the caller), then order all rows by time
    merged["time"] = merged["time"].astype(np.int64) // 10 ** 9
    merged = merged.sort_values("time")

    # one data frame per group; the group keys themselves are not needed
    frames = [frame for _, frame in merged.groupby(groups, as_index=False)]

    # every worker call uses the same lag
    worker = partial(_assign_segment_metrics, lag=lag)

    if parallel:
        with Pool(cpu_count()) as pool:
            per_group = pool.map(worker, frames)
    else:
        per_group = [worker(frame) for frame in frames]

    # stitch the groups back together in the order of the original index
    return pd.concat(per_group).sort_index()
def _assign_segment_metrics(group: pd.DataFrame, lag: int = 1): | |
assert lag >= 1 | |
assert len(group) > 0 | |
assert group["time"].is_monotonic_increasing | |
# append new columns with some default values | |
columns = ["segment_length", "nunique_dst", "nunique_spt", "nunique_dpt", "nunique_app"] | |
result = pd.DataFrame(1, dtype=int, index=group.index, columns=columns) | |
# get the time segment splits | |
time = group["time"].to_numpy() | |
splits = np.flatnonzero(np.diff(time) > lag) + 1 | |
splits = np.insert(splits, 0, 0) | |
dim, nsegments = len(time), len(splits) | |
# pre-cache column locations to make the subsequent loop faster | |
loc_segment_length = result.columns.get_loc("segment_length") | |
loc_nunique_dst = result.columns.get_loc("nunique_dst") | |
loc_nunique_spt = result.columns.get_loc("nunique_spt") | |
loc_nunique_dpt = result.columns.get_loc("nunique_dpt") | |
loc_nunique_app = result.columns.get_loc("nunique_app") | |
loc_dst = group.columns.get_loc("dst") | |
loc_spt = group.columns.get_loc("spt") | |
loc_dpt = group.columns.get_loc("dpt") | |
loc_app = group.columns.get_loc("app") | |
for i in range(nsegments): | |
lo = splits[i] | |
hi = splits[i + 1] if i + 1 < nsegments else dim | |
slc = slice(lo, hi - 1) # segment slice | |
# assign the segment length for the entire group | |
result.iloc[slc, loc_segment_length] = hi - lo | |
# compute number of unique spt, dpt, app fields for this segment | |
result.iloc[slc, loc_nunique_dst] = group.iloc[slc, loc_dst].nunique() | |
result.iloc[slc, loc_nunique_spt] = group.iloc[slc, loc_spt].nunique() | |
result.iloc[slc, loc_nunique_dpt] = group.iloc[slc, loc_dpt].nunique() | |
result.iloc[slc, loc_nunique_app] = group.iloc[slc, loc_app].nunique() | |
return result |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
numpy>=1.19 | |
pandas>=1.2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment