Created
February 9, 2017 18:39
-
-
Save alinVD/4764530fe30176fde8c18140fbde7316 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # RTime: 166s (2min 46s) | |
| ## These queries attaches the MAC Address to each entry in the NetFlow relation | |
| ## by matching the SrcAddr and each endpoint of the flow to a lease in the DHCP | |
| ## relation. If the two endpoints are matched to different lease, then there is | |
| ## an error, which is recorded in the first query. The second query takes all of | |
| ## the correct results and performs further aggregates on the MAC Address. | |
| ## The DHCP lease timestamps are only accurate to the sample, meaning leases | |
| ## can have equal endpoint. The current interval join only supports intervals | |
| ## closed on both endpoints and without overlap. Therefore, some DHCP leases are | |
| ## removed to fulfill this condition. | |
| library(gtStats) | |
| ## The bounds for the data that is being considered. | |
| lower <- as.POSIXct("2012-04-06 00:00:00", "GMT") | |
| upper <- as.POSIXct("2012-04-16 23:59:59", "GMT") | |
| ## The flows are sampled to only include the 10.X.X.X dynamic source addresses. | |
| ## There are 8 891 228 292 tuples after the filter. | |
| flow <- Load(NetFlow)[ .(lower) <= StartTime && StartTime <= .(upper) | |
| && "10.0.0.0" <= SrcAddr && SrcAddr < "11.0.0.0"] | |
| dhcp <- Load(DHCP)[.(lower) <= StartTime && StartTime <= .(upper)] | |
| ## The intervals that cover less than a second are removed, as the resolution is | |
| ## insufficient to properly analyze these. | |
| dhcp <- dhcp[StartTime != EndTime] | |
| ## Each interval in the DHCP table is interpreted as [StartTime, EndTime). The | |
| ## EndTime is shifted because this is equivalent to [StartTime, EndTime - 1]. | |
| dhcp <- Generate(dhcp, EndTime = EndTime - 1, .overwrite = TRUE) | |
| ## The MAC address is looked up for each flow. | |
| ## There are 215 131 329 matches and 31 080 distinct MAC addresses. | |
| join <- IntervalJoin(flow, c(StartTime, SrcAddr), | |
| dhcp, c(DHCPStart = StartTime, EndTime, UserIP), UserMac) | |
| ## Offset and Index are the number of seconds and hours, respectively, since | |
| ## the beginnining of the data. | |
| data <- Generate(join, Offset = DHCPStart$AsSeconds() - .(as.integer(lower))) | |
| data <- Generate(data, Index = Offset %/% 3600) | |
| ## The data is grouped on the MAC Address and the following is computed for each: | |
| charts <- GroupBy(data, UserMac, | |
| NumIndices = CountDistinct(Index), | |
| NumRecords = Count(), | |
| TotalVolume = Sum(TotBytes), | |
| Chart = LineChart(c(Index, TotBytes), 264)) | |
| ## The data is sorted so that the most complete vectors are at the top. | |
| sorted <- OrderBy(charts, dsc(NumIndices), dsc(NumRecords), asc(UserMac)) | |
| ## The columns are placed in the order we want them to appear. | |
| result <- View(sorted, UserMac, NumIndices, NumRecords, TotalVolume, Chart) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment