# Copyright 2021
# Dynatrace Research
# SAL Silicon Austria Labs
# LIT Artificial Intelligence Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd


def get_cnt_ratios(df: pd.DataFrame, unsafe: bool = False, fill: float = 0) -> pd.DataFrame:
    """
    Get various ratios between cnt values.

    This function expects the following column names to be present:
    cnt_src, cnt_src_slow, cnt_src_conn,
    cnt_dst, cnt_dst_slow, cnt_dst_conn,
    cnt_serv_src, cnt_serv_src_slow, cnt_serv_src_conn,
    cnt_serv_dst, cnt_serv_dst_slow, cnt_serv_dst_conn

    The columns cnt_src_slow, cnt_src_conn, cnt_dst, cnt_dst_slow and
    cnt_dst_conn are used as plain divisors and are asserted to be strictly
    positive. The cnt_serv_* columns may contain zeros, so ratios that divide
    by them are only produced when `unsafe` is set and go through
    `_finite_divide`.

    :param df: the original cleaned data frame
    :param unsafe: also compute ratios for columns where the divisor can be zero
    :param fill: fill invalid divisions with this number
    :return: a float32 data frame (same index as `df`) holding the ratio columns
    :raises AssertionError: if a mandatory divisor column is not strictly
        positive, or if a resulting value is not finite or not below 1e+12
    """
    # Every column used as a divisor in a plain `/` below has to be non-zero.
    for col in ("cnt_src_slow", "cnt_src_conn", "cnt_dst", "cnt_dst_slow", "cnt_dst_conn"):
        assert np.all(df[col] > 0), f"column {col!r} must be strictly positive"

    res = pd.DataFrame(index=df.index)

    # NOTE: the insertion order below defines the column order of the result
    # and is kept stable on purpose.

    # ratios relative to slow and conn
    ##################################
    res["relative_cnt_src_to_slow"] = df["cnt_src"] / df["cnt_src_slow"]
    res["relative_cnt_src_to_conn"] = df["cnt_src"] / df["cnt_src_conn"]

    res["relative_cnt_dst_to_slow"] = df["cnt_dst"] / df["cnt_dst_slow"]
    res["relative_cnt_dst_to_conn"] = df["cnt_dst"] / df["cnt_dst_conn"]

    # cnt_serv_src_slow / cnt_serv_src_conn can be zero
    if unsafe:
        res["relative_cnt_serv_src_to_slow"] = _finite_divide(df["cnt_serv_src"], df["cnt_serv_src_slow"], fill)
        res["relative_cnt_serv_src_to_conn"] = _finite_divide(df["cnt_serv_src"], df["cnt_serv_src_conn"], fill)

    # cnt_serv_dst_slow / cnt_serv_dst_conn can be zero
    if unsafe:
        res["relative_cnt_serv_dst_to_slow"] = _finite_divide(df["cnt_serv_dst"], df["cnt_serv_dst_slow"], fill)
        res["relative_cnt_serv_dst_to_conn"] = _finite_divide(df["cnt_serv_dst"], df["cnt_serv_dst_conn"], fill)

    # src / dst ratios
    ##################
    res["ratio_cnt_src_dst"] = df["cnt_src"] / df["cnt_dst"]
    res["ratio_cnt_src_dst_slow"] = df["cnt_src_slow"] / df["cnt_dst_slow"]
    res["ratio_cnt_src_dst_conn"] = df["cnt_src_conn"] / df["cnt_dst_conn"]

    # cnt_serv_dst / cnt_serv_dst_slow / cnt_serv_dst_conn can be zero
    if unsafe:
        res["ratio_cnt_serv_src_dst"] = _finite_divide(df["cnt_serv_src"], df["cnt_serv_dst"], fill)
        res["ratio_cnt_serv_src_dst_slow"] = _finite_divide(df["cnt_serv_src_slow"], df["cnt_serv_dst_slow"], fill)
        res["ratio_cnt_serv_src_dst_conn"] = _finite_divide(df["cnt_serv_src_conn"], df["cnt_serv_dst_conn"], fill)

    # conn to slow ratios
    #####################
    res["ratio_cnt_dst_conn_slow"] = df["cnt_dst_conn"] / df["cnt_dst_slow"]
    res["ratio_cnt_src_conn_slow"] = df["cnt_src_conn"] / df["cnt_src_slow"]

    # cnt_serv_src_slow / cnt_serv_dst_slow can be zero
    if unsafe:
        res["ratio_cnt_serv_src_conn_slow"] = _finite_divide(df["cnt_serv_src_conn"], df["cnt_serv_src_slow"], fill)
        res["ratio_cnt_serv_dst_conn_slow"] = _finite_divide(df["cnt_serv_dst_conn"], df["cnt_serv_dst_slow"], fill)

    return _to_checked_float32(res)


def get_in_out_ratios(df: pd.DataFrame, fill: float = 0) -> pd.DataFrame:
    """
    Get ratios between in and out traffic counts.

    This function expects the following column names to be present:
    in (bytes), out (bytes), duration

    :param df: the original cleaned data frame
    :param fill: fill invalid divisions with this number
    :return: a float32 data frame (same index as `df`) with the columns
        in_bytes_per_duration, out_bytes_per_duration and ratio_in_out_bytes
    :raises AssertionError: if a resulting value is not finite or not below 1e+12
    """
    res = pd.DataFrame(index=df.index)

    res["in_bytes_per_duration"] = _finite_divide(df["in (bytes)"], df["duration"], fill)
    res["out_bytes_per_duration"] = _finite_divide(df["out (bytes)"], df["duration"], fill)
    res["ratio_in_out_bytes"] = _finite_divide(df["in (bytes)"], df["out (bytes)"], fill)

    return _to_checked_float32(res)


def get_cnt_distances(df: pd.DataFrame) -> pd.DataFrame:
    """
    Get various distances between cnt values.

    This function expects the following column names to be present:
    cnt_src, cnt_src_slow, cnt_src_conn,
    cnt_dst, cnt_dst_slow, cnt_dst_conn,
    cnt_serv_src, cnt_serv_src_slow, cnt_serv_src_conn,
    cnt_serv_dst, cnt_serv_dst_slow, cnt_serv_dst_conn

    Any other column in `df` is ignored.

    :param df: the original cleaned data frame
    :return: an int64 data frame (same index as `df`) with the signed differences
    :raises KeyError: if one of the expected columns is missing
    """
    cnt_columns = [
        "cnt_src", "cnt_src_slow", "cnt_src_conn",
        "cnt_dst", "cnt_dst_slow", "cnt_dst_conn",
        "cnt_serv_src", "cnt_serv_src_slow", "cnt_serv_src_conn",
        "cnt_serv_dst", "cnt_serv_dst_slow", "cnt_serv_dst_conn",
    ]
    # Cast to a signed 64 bit type so that differences can become negative
    # without wrapping around. Only the columns that are actually used are
    # cast, so unrelated (e.g. non-numeric) columns cannot break the call.
    cnt = df[cnt_columns].astype("int64")

    res = pd.DataFrame(index=df.index)

    # src dst differences
    #####################
    res["diff_cnt_src_dst"] = cnt["cnt_src"] - cnt["cnt_dst"]
    res["diff_cnt_src_dst_slow"] = cnt["cnt_src_slow"] - cnt["cnt_dst_slow"]
    res["diff_cnt_src_dst_conn"] = cnt["cnt_src_conn"] - cnt["cnt_dst_conn"]

    res["diff_cnt_serv_src_dst"] = cnt["cnt_serv_src"] - cnt["cnt_serv_dst"]
    res["diff_cnt_serv_src_dst_slow"] = cnt["cnt_serv_src_slow"] - cnt["cnt_serv_dst_slow"]
    res["diff_cnt_serv_src_dst_conn"] = cnt["cnt_serv_src_conn"] - cnt["cnt_serv_dst_conn"]

    # conn slow differences
    #######################
    res["diff_dst_conn_slow"] = cnt["cnt_dst_conn"] - cnt["cnt_dst_slow"]
    res["diff_src_conn_slow"] = cnt["cnt_src_conn"] - cnt["cnt_src_slow"]

    res["diff_serv_src_conn_slow"] = cnt["cnt_serv_src_conn"] - cnt["cnt_serv_src_slow"]
    res["diff_serv_dst_conn_slow"] = cnt["cnt_serv_dst_conn"] - cnt["cnt_serv_dst_slow"]

    return res


def _to_checked_float32(res: pd.DataFrame) -> pd.DataFrame:
    """
    Cast `res` to float32 and assert that all values are finite and below 1e+12.

    :param res: data frame with numeric columns
    :return: `res` converted to float32
    :raises AssertionError: if a value is not finite or not below 1e+12
    """
    res = res.astype("float32")  # save some gpu memory

    # make sure that our values are finite and not too big
    values = res.to_numpy()
    assert np.isfinite(values).all() and (values < 1e+12).all(), \
        "ratio values must be finite and smaller than 1e+12"
    return res


def _finite_divide(a: np.ndarray, b: np.ndarray, fill: float = 0) -> np.ndarray:
    """
    Divides `a / b` but replaces every non-finite result by `fill` (default: 0).

    This covers `0 / 0` (nan), `1 / 0` (inf) and `-1 / 0` (-inf).

    :param a: the dividend(s)
    :param b: the divisor(s)
    :param fill: value used wherever the quotient is not finite
    :return: a new numpy array with the element-wise quotient
    """
    with np.errstate(divide="ignore", invalid="ignore"):
        # copy into a plain ndarray so neither input is modified
        quotient = np.array(np.true_divide(a, b), copy=True)
    quotient[~np.isfinite(quotient)] = fill
    return quotient