Created
February 9, 2017 18:38
-
-
Save alinVD/ee3b4fb842539a5ac93c850ee48bfb78 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ## RTime: 144s | |
| ## This computes the pairwise correlation of traffic for each pair of local IPs, | |
| ## with the traffic vectors being computed in the same manner as in the query | |
| ## named Local IPs. | |
| library(gtStats) | |
| ## The inclusive time bounds (GMT) for the data that is being considered. | |
| lower <- as.POSIXct("2012-04-06 00:00:00", "GMT") | |
| upper <- as.POSIXct("2012-04-16 23:59:59", "GMT") | |
| ## The data is filtered to only include flows that start within the bounds and | |
| ## originate from an address registered to the UF, the 128.227.X.X range. | |
| data <- Load(NetFlow)[ .(lower) <= StartTime && StartTime <= .(upper) | |
| && "128.227.0.0" <= SrcAddr && SrcAddr < "128.228.0.0"] | |
| ## The number of days since the lower bound is computed for each flow. | |
| data <- Generate(data, Day = StartTime$AsDays() - .(as.integer(as.Date(lower)))) | |
| ## The data is aggregated at the hour level: bytes are summed per IP, Day and Hour. | |
| data <- GroupBy(data, c(SrcAddr, Day, Hour = StartTime$Hour()), | |
| Bytes = Sum(abs(TotBytes))) | |
| ## The hourly totals are grouped by IP: the distinct hour indices (24 * Day + Hour) are counted and the byte series is built (264 is presumably 11 days * 24 hours; confirm). | |
| data <- GroupBy(data, SrcAddr, | |
| NumIndices = CountDistinct(24 * Day + Hour), | |
| ByteSeries = LineChart(c(24 * Day + Hour, Bytes), 264)) | |
| ## If a vector has fewer than 100 entries, it is removed. With only a single | |
| ## entry, the correlation coefficient cannot be computed. Comparing two vectors | |
| ## with few entries can lead to false positives, hence the strict condition. | |
| data <- data[NumIndices >= 100] | |
| ## The matrix of pair-wise Pearson correlation coefficients is computed. | |
| matrix <- BigMatrix(data, c(SrcAddr, ByteSeries), c(X, Y, P), diag = FALSE, block=1024) | |
| ## Only the 200 000 most correlated IP pairs are kept. | |
| data <- OrderBy(matrix, dsc(P), limit = 200000) | |
| result <- View(data) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment