Last active
May 13, 2020 01:59
-
-
Save jrdnr/e286c17f47866b68c0ff51723979b5ea to your computer and use it in GitHub Desktop.
Test the speed of reading a file and building a collection in PowerShell
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Benchmark input: a plain-text word list with one entry per line.
# The candidate lists below come from the SecLists repository:
# https://github.com/danielmiessler/SecLists/blob/master/Passwords/Common-Credentials/10-million-password-list-top-100000.txt
# Leave exactly one $FilePath assignment uncommented to choose the file to benchmark.
#$FilePath = 'C:\git\SecLists\Passwords\Common-Credentials\10-million-password-list-top-500.txt'
$FilePath = 'C:\git\SecLists\Passwords\Common-Credentials\10-million-password-list-top-10000.txt'
#$FilePath = 'C:\git\SecLists\Passwords\Common-Credentials\10-million-password-list-top-100000.txt'
#$FilePath = 'C:\rockyou.txt'

# Fail fast: every benchmark below reads this file, so a wrong path would
# otherwise only show up as repeated Get-Content errors and an empty result table.
if (-not (Test-Path -LiteralPath $FilePath -PathType Leaf)) {
    throw "Benchmark input file not found: $FilePath"
}

# Minimum line length (in characters) a line must have to be kept by the benchmarks.
$LineLengthFilter = 1
function Get-FriendlyDuration {
    <#
    .SYNOPSIS
    Expresses an elapsed time in the largest unit whose total value is at least 1.

    .DESCRIPTION
    Accepts either a start/end pair or a ready-made TimeSpan and returns an object
    with two properties: Measure (the unit name) and Duration (the rounded value
    in that unit). Units are tried from Days down to Seconds; anything shorter
    than one second, including zero and sub-millisecond spans, is reported in
    Milliseconds.

    .PARAMETER StartTime
    Beginning of the interval (parameter set 'StartStop').

    .PARAMETER EndTime
    End of the interval (parameter set 'StartStop'). Defaults to the current time.

    .PARAMETER Duration
    A TimeSpan to format directly (parameter set 'Duration').

    .PARAMETER Round
    Number of decimal places passed to [math]::Round. Defaults to 2.

    .OUTPUTS
    psobject with the properties Measure and Duration. Nothing is returned when
    StartTime is later than EndTime; a non-terminating error is written instead.
    #>
    param (
        [Parameter(Position=0, Mandatory=$True, ParameterSetName='StartStop')]
        [datetime]$StartTime,
        [Parameter(Position=1, ParameterSetName='StartStop')]
        [datetime]$EndTime=(Get-Date),
        [Parameter(Position=0, Mandatory=$True, ParameterSetName='Duration')]
        [timespan]$Duration,
        [int]$Round=2
    )

    if ($PSCmdlet.ParameterSetName -eq 'StartStop') {
        if ($StartTime -gt $EndTime) {
            Write-Error "StartTime Must be Less than EndTime"
            # Stop here: continuing would compute a negative duration.
            return
        }
        $Duration = $EndTime - $StartTime
    }

    if ($Duration.TotalDays -ge 1) {
        $p = [ordered]@{
            Measure  = 'Days'
            Duration = [math]::Round($Duration.TotalDays, $Round)
        }
    }
    elseif ($Duration.TotalHours -ge 1) {
        $p = [ordered]@{
            Measure  = 'Hours'
            Duration = [math]::Round($Duration.TotalHours, $Round)
        }
    }
    elseif ($Duration.TotalMinutes -ge 1) {
        $p = [ordered]@{
            Measure  = 'Minutes'
            Duration = [math]::Round($Duration.TotalMinutes, $Round)
        }
    }
    elseif ($Duration.TotalSeconds -ge 1) {
        $p = [ordered]@{
            Measure  = 'Seconds'
            Duration = [math]::Round($Duration.TotalSeconds, $Round)
        }
    }
    else {
        # Fallback for everything below one second. Without a final branch a
        # span shorter than 1 ms would leave $p unassigned.
        $p = [ordered]@{
            Measure  = 'Milliseconds'
            Duration = [math]::Round($Duration.TotalMilliseconds, $Round)
        }
    }

    New-Object -TypeName psobject -Property $p
}
function Get-FltrdContent {
    # Reads $FilePath in batches of $ReadCount lines and writes to the output
    # stream every line whose length is at least $LineLengthFilter characters.
    # The lines are emitted directly rather than collected in a container.
    param (
        [string]$FilePath,
        [int]$ReadCount = 5000,
        [int]$LineLengthFilter = 0
    )

    $batches = Get-Content -Path $FilePath -ReadCount $ReadCount
    foreach ($batch in $batches) {
        foreach ($text in $batch) {
            # Skip lines that are too short; everything else is emitted as-is.
            if ($text.Length -lt $LineLengthFilter) { continue }
            $text
        }
    }
}
function Test-ListTypes {
    <#
    .SYNOPSIS
    Builds a collection of the requested type from the lines of a file.

    .DESCRIPTION
    Reads FilePath in batches of ReadCount lines and appends every line whose
    length is at least LineLengthFilter to a container of the requested Type,
    then writes the container's contents to the output stream.

    For Type 'Array' the file is first read once to count its lines. When the
    count exceeds MaxArrayLines the build is skipped with a warning, because
    '+=' on an array gets slower as the array grows. The other types do not
    use the count, so the extra read is not performed for them and does not
    add to their measured time.

    .PARAMETER FilePath
    Path of the text file to read.

    .PARAMETER ReadCount
    Number of lines Get-Content delivers per batch. Defaults to 5000.

    .PARAMETER LineLengthFilter
    Minimum length a line must have to be kept. Defaults to 0 (keep all).

    .PARAMETER Type
    Container to build: 'Array', 'ArrayList' or 'List'. Throws when omitted.

    .PARAMETER MaxArrayLines
    Largest line count for which the 'Array' build is attempted. Defaults to 33000.
    #>
    [CmdletBinding()]
    param (
        [string]$FilePath,
        [int]$ReadCount = 5000,
        [int]$LineLengthFilter = 0,
        [ValidateSet('Array','ArrayList','List')]
        [string]$Type,
        [int]$MaxArrayLines = 33000
    )
    begin {
        $fileLength = 0
        if ($Type -eq 'Array') {
            # Only the array build consults the line count, so only it pays for the extra read.
            Get-Content -Path $FilePath -ReadCount $ReadCount | ForEach-Object { $fileLength += $_.Count }
        }
        switch ($Type) {
            'Array'     { [array]$return = @() }
            'ArrayList' { $return = New-Object System.Collections.ArrayList }
            'List'      { $return = New-Object -TypeName 'System.Collections.Generic.List[System.Object]' }
            Default     { Throw '$Type not set' }
        }
    }
    process {
        if ($Type -eq 'Array') {
            if ($fileLength -gt $MaxArrayLines) {
                Write-Warning "File Length $fileLength lines, Skipping `"$type`" test because building an $type is too slow"
            } else {
                foreach ($chunk in (Get-Content -Path $FilePath -ReadCount $ReadCount)) {
                    foreach ($line in $chunk) {
                        if ($line.Length -ge $LineLengthFilter) {
                            # Deliberately uses '+=': this is the pattern being benchmarked.
                            $return += $line
                        }
                    }
                }
            }
        } else {
            foreach ($chunk in (Get-Content -Path $FilePath -ReadCount $ReadCount)) {
                foreach ($line in $chunk) {
                    if ($line.Length -ge $LineLengthFilter) {
                        # ArrayList.Add returns the new index; discard it so it is not emitted.
                        $null = $return.Add($line)
                    }
                }
            }
        }
        $return
    }
    end {}
}
function Test-HashTable {
    # Reads $FilePath in batches of $ReadCount lines and stores every line whose
    # length is at least $LineLengthFilter in a hashtable, keyed by a running
    # counter that starts at 1. Returns the hashtable itself.
    # $LineFilter is accepted but not used by this function.
    [CmdletBinding()]
    param (
        [string]$FilePath,
        [int]$ReadCount = 5000,
        [int]$LineLengthFilter = 0,
        [string[]]$LineFilter
    )

    $table = @{}
    $key = 0
    $batches = Get-Content -Path $FilePath -ReadCount $ReadCount
    foreach ($batch in $batches) {
        foreach ($text in $batch) {
            # Skip lines that are too short to keep.
            if ($text.Length -lt $LineLengthFilter) { continue }
            $key += 1
            $table.Add($key, $text)
        }
    }
    $table
}
# Benchmark driver: times each collection-building approach against $FilePath
# and prints one row per approach, fastest first.
$startATTime = Get-Date
Write-Output ""
Write-Output "Benchmarking Array Types vs Function, vs Hashtable"
Write-Output "=================================================="

# $ReadCount is not set in this script. NOTE(review): it looks like it is meant
# to hold the result objects (Duration / ReadCount properties) of the companion
# read-method benchmark when that has been run in the same session - confirm.
# Only entries with a numeric ReadCount are usable as a Get-Content chunk size;
# the 'Stream' and 'ReadLine' entries are skipped. The choice does not depend
# on the test being run, so it is made once, before the loop.
$bestReadCount = $null
if ($ReadCount) {
    $bestReadCount = $ReadCount |
        Where-Object { $null -ne ($_.ReadCount -as [int]) } |
        Sort-Object -Property Duration |
        Select-Object -First 1
}

$PercentComplete = 0
@('Function','Hashtable','List','ArrayList','Array') | ForEach-Object {
    Write-Progress -Activity "Benchmarking `"$_`"" -Status "$PercentComplete% of Benchmarking complete" -PercentComplete $PercentComplete
    $PercentComplete += 20   # five tests, 20% each
    $matchedLines = $null

    # Arguments shared by all three benchmark functions.
    $splat = @{
        FilePath         = $FilePath
        LineLengthFilter = $LineLengthFilter
    }
    if ($bestReadCount) {
        $splat.Add('ReadCount', [int]$bestReadCount.ReadCount)
    }

    # Comment out this section if working with more than 10000 lines
    if (('Array','ArrayList','List') -contains $_) {
        $splat.Add('Type', $_)
        $start = Get-Date
        $matchedLines = Test-ListTypes @splat
        $time = ((Get-Date) - $start).TotalSeconds
    }
    if ($_ -eq 'Function') {
        $start = Get-Date
        $matchedLines = Get-FltrdContent @splat
        $time = ((Get-Date) - $start).TotalSeconds
    }
    if ($_ -eq 'Hashtable') {
        $start = Get-Date
        $matchedLines = Test-HashTable @splat
        $time = ((Get-Date) - $start).TotalSeconds
    }

    # A test that produced nothing (for example a skipped Array build) gets no row.
    if ($matchedLines) {
        $ht = [ordered]@{
            'Object Test'       = $_
            'Duration(Seconds)' = $time
            MatchedLines        = $matchedLines.Count
        }
        New-Object -TypeName psobject -Property $ht
    }
} | Sort-Object 'Duration(Seconds)' | Format-Table

$d = Get-FriendlyDuration -StartTime $startATTime
Write-Output ""
Write-Output "Benchmarking File read Duration $($d.Duration) $($d.Measure)"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Benchmark input: a plain-text word list with one entry per line.
# The SecLists candidates below come from:
# https://github.com/danielmiessler/SecLists/blob/master/Passwords/Common-Credentials/10-million-password-list-top-100000.txt
# Leave exactly one $FilePath assignment uncommented to choose the file to benchmark.
#$FilePath = 'C:\git\SecLists\Passwords\Common-Credentials\10-million-password-list-top-500.txt'
#$FilePath = 'C:\git\SecLists\Passwords\Common-Credentials\10-million-password-list-top-10000.txt'
#$FilePath = 'C:\git\SecLists\Passwords\Common-Credentials\10-million-password-list-top-100000.txt'
$FilePath = 'C:\rockyou.txt'

# Fail fast: every read test below opens this file, so a wrong path would
# otherwise surface as one error per test and a table of meaningless timings.
if (-not (Test-Path -LiteralPath $FilePath -PathType Leaf)) {
    throw "Benchmark input file not found: $FilePath"
}
function Get-FriendlyDuration {
    <#
    .SYNOPSIS
    Expresses an elapsed time in the largest unit whose total value is at least 1.

    .DESCRIPTION
    Accepts either a start/end pair or a ready-made TimeSpan and returns an object
    with two properties: Measure (the unit name) and Duration (the rounded value
    in that unit). Units are tried from Days down to Seconds; anything shorter
    than one second, including zero and sub-millisecond spans, is reported in
    Milliseconds.

    .PARAMETER StartTime
    Beginning of the interval (parameter set 'StartStop').

    .PARAMETER EndTime
    End of the interval (parameter set 'StartStop'). Defaults to the current time.

    .PARAMETER Duration
    A TimeSpan to format directly (parameter set 'Duration').

    .PARAMETER Round
    Number of decimal places passed to [math]::Round. Defaults to 2.

    .OUTPUTS
    psobject with the properties Measure and Duration. Nothing is returned when
    StartTime is later than EndTime; a non-terminating error is written instead.
    #>
    param (
        [Parameter(Position=0,
            Mandatory=$True,
            ParameterSetName='StartStop')]
        [datetime]$StartTime,
        [Parameter(Position=1,
            ParameterSetName='StartStop')]
        [datetime]$EndTime=(Get-Date),
        [Parameter(Position=0,
            Mandatory=$True,
            ParameterSetName='Duration')]
        [timespan]$Duration,
        [int]$Round=2
    )

    if ($PSCmdlet.ParameterSetName -eq 'StartStop') {
        if ($StartTime -gt $EndTime) {
            Write-Error "StartTime Must be Less than EndTime"
            # Stop here: continuing would compute a negative duration.
            return
        }
        $Duration = $EndTime - $StartTime
    }

    # Start from the smallest unit so that spans below one second (including
    # those shorter than 1 ms) always get a unit, then promote to the largest
    # unit whose Total* value reaches 1.
    $measure = 'Milliseconds'
    foreach ($unit in 'Days', 'Hours', 'Minutes', 'Seconds') {
        if ($Duration."Total$unit" -ge 1) {
            $measure = $unit
            break
        }
    }

    $p = [ordered]@{
        Measure  = $measure
        Duration = [math]::Round($Duration."Total$measure", $Round)
    }
    New-Object -TypeName psobject -Property $p
}
# Many forums suggest using System.IO.StreamReader to read large files.
# My testing indicates that it is actually much faster to use Get-Content and specify a read count.
# Test which ReadCount is fastest for your file.
# Specifying a ReadCount of 1 is equivalent to not specifying a ReadCount; in my tests this was so slow
# that nothing below 1000 is included.
# -1 is unlimited
$StartRCTime = Get-Date
$readTests = @(1000,5000,10000,-1,'Stream','ReadLine')
$readIncrement = 100 / $readTests.Count
$ReadTestPercentComplete = 0

# Run each read test once and collect one result object per test in $ReadCount.
$ReadCount = $readTests | ForEach-Object {
    # Capture the pipeline item under a name so later uses do not rely on $_
    # around the switch statement below.
    $readTest = $_

    if ($readTest -is [int]) {
        $Activity = 'Read Test using Chunk Size {0}' -f $readTest
    } else {
        $Activity = 'Read Test using {0}' -f $readTest
    }
    Write-Progress -Activity $Activity -Status "$([math]::Round($ReadTestPercentComplete, 1))% of Benchmarking complete" -PercentComplete $ReadTestPercentComplete
    $ReadTestPercentComplete += $readIncrement

    # Reset the per-test measurements.
    $start = 0
    $time  = 0
    $count = 0   # lines read
    $i     = 0   # blocks delivered (equals the line count for the line-at-a-time readers)

    switch ($readTest) {
        'Stream' {
            $reader = New-Object -TypeName System.IO.StreamReader -ArgumentList $FilePath
            try {
                $start = Get-Date
                while ( !$reader.EndOfStream ) {
                    $null = $reader.ReadLine()
                    $count = $i += 1
                }
            }
            finally {
                # Release the file handle even when reading fails part-way.
                $reader.Dispose()
            }
            $time = ((Get-Date) - $start).TotalSeconds
        }
        'ReadLine' {
            $start = Get-Date
            foreach ($line in [System.IO.File]::ReadLines($FilePath)) {
                $count = $i += 1
            }
            $time = ((Get-Date) - $start).TotalSeconds
        }
        Default {
            # Numeric entries are Get-Content chunk sizes.
            $splat = @{Path = $FilePath; ReadCount = $readTest}
            $start = Get-Date
            foreach ($chunk in Get-Content @splat) {
                $i++
                foreach ($ln in $chunk) {
                    $count++
                }
            }
            $time = ((Get-Date) - $start).TotalSeconds
        }
    }

    $ht = [ordered]@{
        Duration  = $time
        ReadCount = $readTest
        Blocks    = $i
        Count     = $count
    }
    New-Object -TypeName psobject -Property $ht
    #[System.GC]::Collect()
}

Write-Output ""
$title1 = 'Benchmarking File read methods using: "{0}"' -f (Split-Path -Path $FilePath -Leaf)
Write-Output $title1
Write-Output ('=' * $title1.Length)
$ReadCount | Sort-Object Duration | Format-Table
$d = Get-FriendlyDuration -StartTime $StartRCTime
Write-Output "Benchmarking File read Duration $($d.Duration) $($d.Measure)"
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Granted, performance will vary from computer to computer; however, here is the output from running this against the 14,341,564-line RockYou.txt file (https://www.kaggle.com/wjburns/common-password-list-rockyoutxt):