## RTime: 144s

## Computes the pairwise correlation of traffic for each of the local IPs,
## with the traffic vectors built in the same manner as in the query named
## "Local IPs".

library(gtStats)

## Bounds (GMT) of the time window under consideration.
lower <- as.POSIXct("2012-04-06 00:00:00", tz = "GMT")
upper <- as.POSIXct("2012-04-16 23:59:59", tz = "GMT")

## Keep only flows that start inside the window and originate from a fixed IP
## address registered to the UF, i.e. the 128.227.X.X range.
## NOTE(review): the filter is a gtStats expression, so `&&` is presumably
## translated by the library rather than evaluated by R -- confirm before
## "fixing" it to `&`.
data <- Load(NetFlow)[
  .(lower) <= StartTime && StartTime <= .(upper) &&
    "128.227.0.0" <= SrcAddr && SrcAddr < "128.228.0.0"
]

## Day offset of each flow relative to the date of the lower bound.
## NOTE(review): assumes StartTime$AsDays() counts days on the same scale as
## as.integer(as.Date(lower)) -- confirm.
data <- Generate(
  data,
  Day = StartTime$AsDays() - .(as.integer(as.Date(lower)))
)

## Aggregate to one row per (source IP, day, hour), summing absolute bytes.
data <- GroupBy(
  data,
  c(SrcAddr, Day, Hour = StartTime$Hour()),
  Bytes = Sum(abs(TotBytes))
)

## Group by source IP. `24 * Day + Hour` is the hourly index of a bucket within
## the window; NumIndices counts how many distinct hours have traffic.
## The window spans 11 calendar days (Apr 6 - Apr 16), i.e. 11 * 24 = 264
## hourly slots, which is presumably why the series length is 264.
data <- GroupBy(
  data,
  SrcAddr,
  NumIndices = CountDistinct(24 * Day + Hour),
  ByteSeries = LineChart(c(24 * Day + Hour, Bytes), 264)
)

## Drop every IP whose series has fewer than 100 populated hours. With only a
## single entry the correlation coefficient cannot be computed, and comparing
## two sparse vectors can produce false positives, hence the strict threshold.
data <- data[NumIndices >= 100]

## Compute the matrix of pairwise Pearson correlation coefficients. The result
## has one row per IP pair (X, Y) with its coefficient P.
## Stored as `correlations` rather than `matrix` to avoid masking the name of
## base R's matrix().
correlations <- BigMatrix(
  data,
  c(SrcAddr, ByteSeries),
  c(X, Y, P),
  diag = FALSE,
  block = 1024
)

## Keep only the 200 000 most correlated IP pairs (descending by P).
data <- OrderBy(correlations, dsc(P), limit = 200000)

result <- View(data)