Skip to content

Instantly share code, notes, and snippets.

@jrdnr
Last active May 13, 2020 01:59
Show Gist options
  • Select an option

  • Save jrdnr/e286c17f47866b68c0ff51723979b5ea to your computer and use it in GitHub Desktop.

Select an option

Save jrdnr/e286c17f47866b68c0ff51723979b5ea to your computer and use it in GitHub Desktop.
Test Speed to read a file and build a collection in powershell
# Using the following file from
# https://github.com/danielmiessler/SecLists/blob/master/Passwords/Common-Credentials/10-million-password-list-top-100000.txt
# Input file for the collection benchmarks: leave exactly one $FilePath assignment uncommented.
#$FilePath = 'C:\git\SecLists\Passwords\Common-Credentials\10-million-password-list-top-500.txt'
$FilePath = 'C:\git\SecLists\Passwords\Common-Credentials\10-million-password-list-top-10000.txt'
#$FilePath = 'C:\git\SecLists\Passwords\Common-Credentials\10-million-password-list-top-100000.txt'
#$FilePath = 'C:\rockyou.txt'
# Passed as -LineLengthFilter to every benchmarked function below; each of them keeps
# only lines whose Length is -ge this value, so 1 drops empty lines.
$LineLengthFilter = 1
function Get-FriendlyDuration {
    <#
    .SYNOPSIS
    Expresses an elapsed time in the largest unit whose value is at least 1.
    .DESCRIPTION
    Takes either a start/end pair or a ready-made timespan and returns the
    total duration in Days, Hours, Minutes, Seconds or Milliseconds, rounded
    to -Round decimal places. Durations shorter than one millisecond are
    reported in Milliseconds.
    .PARAMETER StartTime
    Start of the interval (parameter set 'StartStop').
    .PARAMETER EndTime
    End of the interval (parameter set 'StartStop'). Defaults to the current time.
    .PARAMETER Duration
    A timespan to describe (parameter set 'Duration').
    .PARAMETER Round
    Number of decimal places passed to [math]::Round. Defaults to 2.
    .OUTPUTS
    psobject with the properties Measure (unit name) and Duration (rounded value).
    Nothing is returned when StartTime is later than EndTime.
    #>
    param (
        [Parameter(Position=0, Mandatory=$True, ParameterSetName='StartStop')]
        [datetime]$StartTime,
        [Parameter(Position=1, ParameterSetName='StartStop')]
        [datetime]$EndTime=(Get-Date),
        [Parameter(Position=0, Mandatory=$True, ParameterSetName='Duration')]
        [timespan]$Duration,
        [int]$Round=2
    )
    if ($PSCmdlet.ParameterSetName -eq 'StartStop') {
        if ($StartTime -gt $EndTime) {
            Write-Error "StartTime Must be Less than EndTime"
            # An inverted range has no meaningful duration; stop instead of
            # emitting an object built from a negative timespan.
            return
        }
        $Duration = $EndTime - $StartTime
    }

    # Pick the unit by magnitude so that a negative -Duration still lands in
    # a sensible unit; the reported value keeps its sign.
    $magnitude = $Duration.Duration()
    if ($magnitude.TotalDays -ge 1) {
        $measure = 'Days'
    } elseif ($magnitude.TotalHours -ge 1) {
        $measure = 'Hours'
    } elseif ($magnitude.TotalMinutes -ge 1) {
        $measure = 'Minutes'
    } elseif ($magnitude.TotalSeconds -ge 1) {
        $measure = 'Seconds'
    } else {
        # Also covers sub-millisecond durations, which previously matched no
        # branch and left the result hashtable undefined.
        $measure = 'Milliseconds'
    }

    $p = [ordered]@{
        Measure  = $measure
        Duration = [math]::Round($Duration."Total$measure", $Round)
    }
    New-Object -TypeName psobject -Property $p
}
function Get-FltrdContent {
    # Streams the lines of $FilePath to the output, dropping every line whose
    # Length is below $LineLengthFilter. The file is read through Get-Content
    # in chunks of $ReadCount lines.
    param (
        [string]$FilePath,
        [int]$ReadCount = 5000,
        [int]$LineLengthFilter = 0
    )
    $chunks = Get-Content -Path $FilePath -ReadCount $ReadCount
    foreach ($block in $chunks) {
        foreach ($text in $block) {
            # Guard clause: skip lines that are too short.
            if ($text.Length -lt $LineLengthFilter) { continue }
            $text
        }
    }
}
function Test-ListTypes {
    <#
    .SYNOPSIS
    Reads a file and collects the lines that pass the length filter into the
    requested collection type.
    .PARAMETER FilePath
    File to read with Get-Content.
    .PARAMETER ReadCount
    Chunk size handed to Get-Content -ReadCount. Defaults to 5000.
    .PARAMETER LineLengthFilter
    Only lines whose Length is -ge this value are collected. Defaults to 0.
    .PARAMETER Type
    Collection to build: 'Array' (grown with +=), 'ArrayList' or 'List'
    (System.Collections.Generic.List[System.Object]).
    .OUTPUTS
    The collected lines. For 'Array' on files longer than 33000 lines a warning
    is written and nothing is collected.
    #>
    [CmdletBinding()]
    param (
        [string]$FilePath,
        [int]$ReadCount = 5000,
        [int]$LineLengthFilter = 0,
        [ValidateSet('Array','ArrayList','List')]
        [string]$Type
    )
    begin {
        $fileLength = 0
        switch ($Type) {
            'Array'     { [array]$return = @() }
            'ArrayList' { $return = New-Object System.Collections.ArrayList }
            'List'      { $return = New-Object -TypeName 'System.Collections.Generic.List[System.Object]' }
            Default     { Throw '$Type not set' }
        }
        # The line count only guards the slow 'Array' path, so the extra pass
        # over the file is skipped for the other types instead of being added
        # to their measured time.
        if ($Type -eq 'Array') {
            Get-Content -Path $FilePath -ReadCount $ReadCount | ForEach-Object { $fileLength += $_.Count }
        }
    }
    process {
        if ($type -eq 'Array'){
            if ($fileLength -gt 33000){
                Write-Warning "File Length $fileLength lines, Skipping `"$type`" test because building an $type is too slow"
            } else {
                foreach ($chunk in (Get-Content -Path $FilePath -ReadCount $ReadCount)) {
                    foreach ($line in $chunk){
                        if ($line.Length -ge $LineLengthFilter) {
                            # += creates a new array on every append; that cost is what this case measures.
                            $return += $line
                        }
                    }
                }
            }
        } else {
            foreach ($chunk in (Get-Content -Path $FilePath -ReadCount $ReadCount)) {
                foreach ($line in $chunk){
                    if ($line.Length -ge $LineLengthFilter) {
                        # ArrayList.Add returns the new index; discard it so it does not leak into the output.
                        $null = $return.Add($line)
                    }
                }
            }
        }
        $return
    }
    end {}
}
function Test-HashTable {
    # Reads $FilePath in chunks of $ReadCount lines and stores every line whose
    # Length is at least $LineLengthFilter in a hashtable keyed by a running
    # 1-based counter. The hashtable is returned as a single object.
    # $LineFilter is accepted but not used by this function.
    [CmdletBinding()]
    param (
        [string]$FilePath,
        [int]$ReadCount = 5000,
        [int]$LineLengthFilter = 0,
        [string[]]$LineFilter
    )
    $table = @{}
    $index = 0
    $chunks = Get-Content -Path $FilePath -ReadCount $ReadCount
    foreach ($block in $chunks) {
        foreach ($text in $block) {
            # Guard clause: skip lines that are too short.
            if ($text.Length -lt $LineLengthFilter) { continue }
            $index += 1
            $table.Add($index, $text)
        }
    }
    $table
}
# Benchmark driver: times each collection-building strategy against $FilePath
# and prints one row per strategy, fastest first.
$startATTime = Get-Date
Write-Output ""
Write-Output "Benchmarking Array Types vs Function, vs Hashtable"
Write-Output "=================================================="
$PercentComplete = 0
@('Function','Hashtable','List','ArrayList','Array') | ForEach-Object {
    Write-Progress -Activity "Benchmarking `"$_`"" -Status "$PercentComplete% of Benchmarking complete" -PercentComplete $PercentComplete
    $PercentComplete += 20
    $matchedLines = $null
    $splat = @{
        FilePath = $FilePath
        LineLengthFilter = $LineLengthFilter
    }
    # $ReadCount is not set in this script; when it is present in the session
    # (the read benchmark stores its result rows in a variable of that name),
    # reuse the fastest chunk size it found.
    if($ReadCount){
        # Only rows with an integer ReadCount are usable as a chunk size; the
        # 'Stream' and 'ReadLine' rows are filtered out before picking the
        # fastest, so the choice no longer fails when those two rank first.
        $fastestChunked = $ReadCount |
            Where-Object { $_.ReadCount -is [int] } |
            Sort-Object -Property Duration |
            Select-Object -First 1
        if ($fastestChunked) {
            $splat.Add('ReadCount', [int]$fastestChunked.ReadCount)
        }
    }
    # Comment out this section if working with more than 10000 lines
    # (the trailing "#>" on the closing brace presumably pairs with a
    # block-comment opener placed in front of the "if" to disable it).
    if(('Array','ArrayList','List') -contains $_){
        $splat.Add('Type', $_)
        $start = Get-Date
        $matchedLines = Test-ListTypes @splat
        $time = ((Get-Date) - $start).TotalSeconds
    }#>
    if ($_ -eq 'Function') {
        $start = Get-Date
        $matchedLines = Get-FltrdContent @splat
        $time = ((Get-Date) - $start).TotalSeconds
    }
    if ($_ -eq 'Hashtable'){
        $start = Get-Date
        $matchedLines = Test-HashTable @splat
        $time = ((Get-Date) - $start).TotalSeconds
    }
    # A test that produced nothing (for example the skipped 'Array' case)
    # gets no result row.
    if($matchedLines){
        $ht = [ordered]@{
            'Object Test' = $_
            'Duration(Seconds)' = $time
            MatchedLines = $matchedLines.Count
        }
        New-Object -TypeName psobject -Property $ht
    }
} | Sort-Object 'Duration(Seconds)' | Format-Table
$d = Get-FriendlyDuration -StartTime $startATTime
Write-Output ""
Write-Output "Benchmarking File read Duration $($d.Duration) $($d.Measure)"
# Using the following file from
# https://github.com/danielmiessler/SecLists/blob/master/Passwords/Common-Credentials/10-million-password-list-top-100000.txt
# Input file for the read benchmarks: leave exactly one $FilePath assignment uncommented.
#$FilePath = 'C:\git\SecLists\Passwords\Common-Credentials\10-million-password-list-top-500.txt'
#$FilePath = 'C:\git\SecLists\Passwords\Common-Credentials\10-million-password-list-top-10000.txt'
#$FilePath = 'C:\git\SecLists\Passwords\Common-Credentials\10-million-password-list-top-100000.txt'
$FilePath = 'C:\rockyou.txt'
function Get-FriendlyDuration {
    <#
    .SYNOPSIS
    Expresses an elapsed time in the largest unit whose value is at least 1.
    .DESCRIPTION
    Takes either a start/end pair or a ready-made timespan and returns the
    total duration in Days, Hours, Minutes, Seconds or Milliseconds, rounded
    to -Round decimal places. Durations shorter than one millisecond are
    reported in Milliseconds.
    .PARAMETER StartTime
    Start of the interval (parameter set 'StartStop').
    .PARAMETER EndTime
    End of the interval (parameter set 'StartStop'). Defaults to the current time.
    .PARAMETER Duration
    A timespan to describe (parameter set 'Duration').
    .PARAMETER Round
    Number of decimal places passed to [math]::Round. Defaults to 2.
    .OUTPUTS
    psobject with the properties Measure (unit name) and Duration (rounded value).
    Nothing is returned when StartTime is later than EndTime.
    #>
    param (
        [Parameter(Position=0,
            Mandatory=$True,
            ParameterSetName='StartStop')]
        [datetime]$StartTime,
        [Parameter(Position=1,
            ParameterSetName='StartStop')]
        [datetime]$EndTime=(Get-Date),
        [Parameter(Position=0,
            Mandatory=$True,
            ParameterSetName='Duration')]
        [timespan]$Duration,
        [int]$Round=2
    )
    if ($PSCmdlet.ParameterSetName -eq 'StartStop') {
        if ($StartTime -gt $EndTime) {
            Write-Error "StartTime Must be Less than EndTime"
            # An inverted range has no meaningful duration; stop instead of
            # emitting an object built from a negative timespan.
            return
        }
        $Duration = $EndTime - $StartTime
    }

    # Pick the unit by magnitude so that a negative -Duration still lands in
    # a sensible unit; the reported value keeps its sign.
    $magnitude = $Duration.Duration()
    if ($magnitude.TotalDays -ge 1) {
        $measure = 'Days'
    } elseif ($magnitude.TotalHours -ge 1) {
        $measure = 'Hours'
    } elseif ($magnitude.TotalMinutes -ge 1) {
        $measure = 'Minutes'
    } elseif ($magnitude.TotalSeconds -ge 1) {
        $measure = 'Seconds'
    } else {
        # Also covers sub-millisecond durations, which previously matched no
        # branch and left the result hashtable undefined.
        $measure = 'Milliseconds'
    }

    $p = [ordered]@{
        Measure  = $measure
        Duration = [math]::Round($Duration."Total$measure", $Round)
    }
    New-Object -TypeName psobject -Property $p
}
# Many forums suggest using System.IO.StreamReader to read large files.
# My testing indicates that it is actually much faster to use Get-Content and specify a ReadCount.
# Test which ReadCount will be faster for your file.
# Specifying a ReadCount of 1 is equivalent to not specifying a ReadCount; however, in my tests this was so slow
# that I am not including anything less than 1000.
# -1 is unlimited
$StartRCTime = Get-Date
$readTests = @(1000,5000,10000,-1,'Stream','ReadLine')
$readIncrement = 100 / $readTests.Count
$ReadTestPercentComplete = 0
# One result row per read method; integer entries are Get-Content chunk sizes.
$ReadCount = $readTests | ForEach-Object {
    if ($_.GetType().name -eq 'Int32'){
        $Activity = 'Read Test using Chunk Size {0}' -f $_
    } else {
        $Activity = 'Read Test using {0}' -f $_
    }
    Write-Progress -Activity $Activity -Status "$([math]::Round($ReadTestPercentComplete, 1))% of Benchmarking complete" -PercentComplete $ReadTestPercentComplete
    $ReadTestPercentComplete += $readIncrement
    # Reset the per-test counters: $i counts blocks read, $count counts lines.
    $start = $time = $count = $i = 0
    switch ($_) {
        'Stream' {
            $reader = New-Object -TypeName System.IO.StreamReader -ArgumentList $FilePath
            try {
                $start = Get-Date
                while ( !$reader.EndOfStream ) {
                    $null = $reader.ReadLine()
                    $count = $i += 1
                }
            }
            finally {
                # Release the file handle even when reading fails part-way.
                $reader.Close()
            }
            $time = ((Get-Date) - $start).TotalSeconds
        }
        'ReadLine' {
            $start = Get-Date
            foreach ($line in [System.IO.File]::ReadLines($FilePath)) {
                $count = $i += 1
            }
            $time = ((Get-Date) - $start).TotalSeconds
        }
        Default {
            # Integer entry: read with Get-Content using it as the chunk size.
            $splat = @{Path = $FilePath; ReadCount = $_}
            $start = Get-Date
            foreach ($chunk in Get-Content @splat){
                $i++
                foreach ($ln in $chunk){
                    $count++
                }
            }
            $time = ((Get-Date) - $start).TotalSeconds
        }
    }
    $ht = [ordered]@{
        Duration = $time
        ReadCount = $_
        Blocks = $i
        Count = $count
    }
    New-Object -TypeName psobject -Property $ht
    #[System.GC]::Collect()
}
Write-Output ""
$title1 = 'Benchmarking File read methods using: "{0}"' -f (Split-Path -Path $FilePath -Leaf)
Write-Output $title1
Write-Output ('=' * $title1.Length)
$ReadCount | Sort-Object Duration | Format-Table
$d = Get-FriendlyDuration -StartTime $StartRCTime
Write-Output "Benchmarking File read Duration $($d.Duration) $($d.Measure)"
@jrdnr
Copy link
Author

jrdnr commented Dec 30, 2019

Granted, performance will vary from computer to computer; however, here is the output from running this against the 14,341,564-line RockYou.txt file (https://www.kaggle.com/wjburns/common-password-list-rockyoutxt)

# The Get-Content results are close enough that their order switches from run to run
PS > .\ReadFile.ps1

Benchmarking File read methods using: "rockyou.txt"
===================================================

Duration | ReadCount | Blocks | Count
-------- | --------- | ------ | -----
16.1719593 | 5000 | 2869 | 14344391
16.7957086 | 1000 | 14345 | 14344391
17.3133878 | 10000 | 1435 | 14344391
17.6671757 | -1 | 1 | 14344391
18.3647044 | ReadLine | 14344391 | 14344391
25.9790412 | Stream | 14344391 | 14344391

PS > .\CollectionBuilder.ps1 
Benchmarking Array Types vs Function, vs Hashtable
==================================================
WARNING: File Length 14344391 lines, Skipping "Array" test because building an Array is too slow

Object Test | Duration(Seconds) | MatchedLines
----------- | ----------------- | ------------
Function | 5.0165559 | 14344390
Hashtable | 9.2718225 | 14344390
ArrayList | 11.3502448 | 14344390
List | 11.3731981 | 14344390

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment