From 4014b0e8a9bec9d41b1a11137523b363337f2c9a Mon Sep 17 00:00:00 2001 From: Julian Noble Date: Fri, 18 Aug 2023 09:51:46 +1000 Subject: [PATCH] pmix repo base fixes, speed up du on windows --- src/modules/punk-0.1.tm | 3 + src/modules/punk/du-999999.0a1.0.tm | 485 +++++++++++++++++++++++ src/modules/punk/du-buildversion.txt | 3 + src/modules/punk/mix-0.2.tm | 297 ++------------ src/modules/punk/repl-0.1.tm | 1 + src/modules/punk/repo-999999.0a1.0.tm | 161 +++++++- src/modules/punk/winpath-999999.0a1.0.tm | 6 + 7 files changed, 694 insertions(+), 262 deletions(-) create mode 100644 src/modules/punk/du-999999.0a1.0.tm create mode 100644 src/modules/punk/du-buildversion.txt diff --git a/src/modules/punk-0.1.tm b/src/modules/punk-0.1.tm index 08ea9014..a18284de 100644 --- a/src/modules/punk-0.1.tm +++ b/src/modules/punk-0.1.tm @@ -72,6 +72,7 @@ namespace eval ::repl { package require punk::config package require punk::winpath ;# for windows paths - but has functions that can be called on unix systems package require punk::repo +package require punk::du namespace eval punk { interp alias {} purelist {} lreplace x 0 0 ;#required by pipe system @@ -5980,6 +5981,8 @@ namespace eval punk { #While in most/normal cases the library will cd back to the remembered working directory after only a brief time - there seem to be many opportunities for issues #if the repl is used to launch/run a number of things in the one process proc d/ {args} { + set is_win [expr {"windows" eq $::tcl_platform(platform)}] + set ::punk::last_run_display [list] if {([llength $args]) && ([lindex $args 0] eq "")} { diff --git a/src/modules/punk/du-999999.0a1.0.tm b/src/modules/punk/du-999999.0a1.0.tm new file mode 100644 index 00000000..7be23bc0 --- /dev/null +++ b/src/modules/punk/du-999999.0a1.0.tm @@ -0,0 +1,485 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) 2023 +# +# @@ Meta Begin +# Application punk::du 999999.0a1.0 +# Meta platform tcl +# Meta license +# @@ Meta End + + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +##e.g package require frobz + +if {"windows" eq $::tcl_platform(platform)} { + if {[catch {package require twapi}]} { + puts stderr "Warning: punk::du - unable to load twapi. Disk operations may be much slower on windows without the twapi package" + } +} + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval punk::du { + + proc du_lit value { + if {![info exists ::punk::du_literal($value)]} { + set ::punk::du_literal($value) $value + } + return $::punk::du_literal($value) + } + proc _du_new_eachdir {dirtails depth parentfolderidx} { + set newlist {} + upvar folders folders + set parentpath [lindex $folders $parentfolderidx] + set newindex [llength $folders] + foreach dt $dirtails { + lappend folders $parentpath/$dt + lappend newlist [::list $depth $parentfolderidx $newindex $dt [expr {0}]] + incr newindex + } + return $newlist + } + proc du_listing {folderpath} { + #note platform differences between what is considered hidden make this tricky. + # on windows 'glob .*' will not return some hidden dot items but will return . .. and glob -types hidden .* will not return some dotted items + # glob -types hidden * on windows will not necessarily return all dot files/folders + # unix-like platforms seem to consider all dot files as hidden so processing is more straightforward + # we need to process * and .* in the same glob calls and remove duplicates + # if we do * and .* in separate iterations of this loop we lose the ability to filter duplicates easily + + #Note - glob operations seem to be faster than looped tests like 'file isdirectory' & 'file readlink' + #set parent [lindex $folders $folderidx] + set hdirs [glob -nocomplain -dir $folderpath -types {hidden d} -tail * .*] + #set hdirs {} + set dirs [glob -nocomplain -dir $folderpath -types d -tail * .*] + + #set hlinks [glob -nocomplain -dir [lindex $folders $folderidx] -types {hidden l} -tail * .*] + set hlinks {} + set links [glob -nocomplain -dir $folderpath -types l -tail * .*] ;#links may have dupes - we don't care. struct::set difference will remove + #set links [lsort -unique [concat $hlinks $links[unset links]]] + + set hfiles [glob -nocomplain -dir $folderpath -types {hidden f} -tail * .*] + #set hfiles {} + set files [glob -nocomplain -dir $folderpath -types f -tail * .*] + #set files {} + + #note struct::set difference produces unordered result + #struct::set difference removes duplicates + #remove links and . .. from directories, remove links from files + set dirs [struct::set difference [concat $hdirs $dirs[unset dirs]] [concat $links {. ..}]] + set files [struct::set difference [concat $hfiles $files[unset files]] $links] + + set filesizes [list]; #not available in listing-call - as opposed to twapi which can do it as it goes + return [list $dirs $files $filesizes] + } + + # get listing without using unix-tools (may not be installed on the windows system) + proc du_listing_twapi {folderpath} { + package require punk::winpath + set dirs [list] + set files [list] + set filesizes [list] + try { + if {[string length [file normalize $folderpath]] >= 250} { + set folderpath_shortname [punk::winpath::pwdshortname $folderpath] + set iterator [twapi::find_file_open $folderpath_shortname/* -detail basic] + } else { + set iterator [twapi::find_file_open $folderpath/* -detail basic] ;# -detail full only adds data to the altname field + } + } on error args { + if {[string match "*denied*" $args]} { + #output similar format as unixy du + puts stderr "[file dirname $folderpath] '[file tail $folderpath]':$args" + return [list {} {} {}] + } + #possibly an illegal windows filename - easily happens on a machine with WSL or with drive mapped to unix share + set errmsg "error reading folder: $folderpath (len:[string length [file normalize $folderpath]])\n" + append errmsg "error: $args" \n + append errmsg "errorcode: $::errorCode" \n + append errmsg {retrying with with windows altname} + puts stderr $errmsg + try { + #go to parent and re-fetch this folder with altnames + #use normalize to get full path - in case we are currently at "." + set parent [file dirname [file normalize $folderpath]] + set iterator [twapi::find_file_open $parent/* -detail full] ;# -detail full because we need altname! + while {[twapi::find_file_next $iterator iteminfo]} { + set nm [dict get $iteminfo name] + if {$nm eq [file tail $folderpath]} { + break + } + } + if {$nm ne [file tail $folderpath]} { + error "failed to re-fetch current directory info with altpath info" + } + set altname [dict get $iteminfo altname] + puts stderr "using altname $parent/$altname" + set iterator [twapi::find_file_open $parent/$altname/* -detail basic] + + + } on error args { + set errmsg "error reading folder: $parent or $folderpath\n" + append errmsg "error: $args" + append errmsg "aborting.." + error $errmsg + } + } + while {[twapi::find_file_next $iterator iteminfo]} { + set nm [dict get $iteminfo name] + if {$nm in {. ..}} { + continue + } + set attrinfo [twapi::decode_file_attributes [dict get $iteminfo attrs]] + if {"reparse_point" in $attrinfo} { + #we will treat as a zero sized file.. review - option -L for symlinks like BSD du? + #Note 'file readlink' can fail on windows - reporting 'invalid argument' - according to tcl docs, 'On systems that don't support symbolic links this option is undefined' + #The link may be viewable ok in windows explorer, and cmd.exe /c dir and unix tools such as ls + #if we need it without resorting to unix-tools that may not be installed: exec {*}[auto_execok dir] /A:L {c:\some\path} + #e.g (stripped of headers/footers and other lines) + #2022-10-02 04:07 AM priv [\\?\c:\repo\elixir\gameportal\apps\test\priv] + #Note we will have to parse beyond header fluff as /B strips the symlink info along with headers. + #du includes the size of the symlink + #but we can't get it with tcl's file size + #twapi doesn't seem to have anything to help read it either (?) + #the above was verified with a symlink that points to a non-existant folder.. mileage may vary for an actually valid link + # + #Note also - a shortcut created in explorer with drag and drop to an existant folder is a different animal to a symlink (file with .lnk extension) even though it looks the same in explorer window. + #This .lnk seems to be an ordinary file and we can treat as such here. + lappend files $nm + lappend filesizes 0 + } elseif {"directory" in $attrinfo} { + lappend dirs $nm + } else { + lappend files $nm + lappend filesizes [dict get $iteminfo size] + } + } + return [list $dirs $files $filesizes] + } + + + #Note that unix du seems to do depth-first - which makese sense when piping.. as output can be emitted as we go rather than requiring sort at end. + #breadth-first with sort can be quite fast though + proc du { args } { + package require struct::set + + + if 0 { + switch -exact [llength $args] { + 0 { + set dir . + set switch -k + } + 1 { + set dir $args + set switch -k + } + 2 { + set switch [lindex $args 0] + set dir [lindex $args 1] + } + default { + set msg "only one switch and one dir " + append msg "currently supported" + return -code error $msg + } + } + + set switch [string tolower $switch] + + set -b 1 + set -k 1024 + set -m [expr 1024*1024] + } + + + set opts $args + # flags in args are singletons (or longopts --something=somethingelse) or sometimes pairopts + # process any pairopts first + + set opt_depth -1 + if {[set posn [lsearch $opts -d]] >= 0} { + set opt_depth [lindex $opts $posn+1] + set opts [lreplace $opts $posn $posn+1] + } + foreach o $opts { + if {[string match --max-depth=* $o]} { + set opt_depth [lindex [split $o =] 1] + if {![string is integer -strict $opt_depth]} { + error "--max-depth=n n must be an integer" + } + } + } + #---- + + set lastarg [lindex $opts end] + if {[string length $lastarg] && (![string match -* $lastarg])} { + set dir $lastarg + set opts [lrange $opts 0 end-1] + } else { + set dir . + set opts $opts + } + foreach a $opts { + if {![string match -* $a]} { + error "unrecognized option '$a'" + } + } + + set -b 1 + set -k 1024 + set -m [expr 1024*1024] + set switch -k ;#default (same as unix) + set lc_opts [string tolower $opts] + + + + if {"-b" in $lc_opts} { + set switch -b + } elseif {"-k" in $lc_opts} { + set switch -k + } elseif {"-m" in $lc_opts} { + set switch -m + } + set opt_progress 0 + if {"--prog" in $lc_opts || "--progress" in $lc_opts} { + set opt_progress 1 + } + set opt_extra 0 + if {"--extra" in $lc_opts} { + set opt_extra 1 + } + + + + set result [list] + + set dir_depths_remaining [list] + + #if {[file normalize [file dirname $dir]] eq [file normalize $dir]} { + # error "du at root of filesystem not yet implemented.. sorry" + # + #} + + set is_windows [expr {$::tcl_platform(platform) eq "windows"}] + set has_twapi 0 ;#default assumption + if {$is_windows} { + if {![catch { + package require twapi + } + ]} { + set has_twapi 1 + } + } + set zero [expr {0}] + + # ## ### ### ### ### + # containerid and itemid + set folders [list] ;#we lookup string by index + lappend folders [file dirname $dir] + lappend folders $dir ;#itemindex 1 + # ## ### ### ### ### + if {![file isdirectory $dir]} { + lappend dir_depths_remaining [list $zero $zero [expr {1}] [du_lit [file tail $dir]] [file size $dir]] + #set ary($dir,bytes) [file size $dir] + set leveldircount 0 + } else { + lappend dir_depths_remaining [list $zero $zero [expr {1}] [du_lit [file tail $dir]] $zero] + set leveldircount 1 + } + set level [expr {0}] + set nextlevel [expr {1}] + #dir_depths list structure + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + #0 1 2 3 4 5 + #i_depth i_containerid i_itemid i_item i_size i_index + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + set i_depth [expr {0}] + set i_containerid [expr {1}] + set i_itemid [expr {2}] + set i_item [expr {3}] + set i_size [expr {4}] + set i_index [expr {5}] + + + set listlength [llength $dir_depths_remaining] + set diridx 0 + #this is a breadth-first algorithm + while {$leveldircount > 0} { + set leveldirs 0 + set levelfiles 0 + for {set i $diridx} {$i < $listlength} {incr i} { + #lassign [lindex $dir_depths_remaining $i] _d containeridx folderidx itm bytecount + set folderidx [lindex $dir_depths_remaining $i $i_itemid] + set folderpath [lindex $folders $folderidx] + #puts stderr ->$folderpath + #if {$i >= 20} { + #return + #} + + if {$is_windows && $has_twapi} { + lassign [du_listing_twapi $folderpath] dirs files filesizes; #twapi supports gathering file sizes during directory contents traversal + } else { + lassign [du_listing $folderpath] dirs files filesizes ;#filesizes is empty list if listing mechanism doesn't support it (as Tcl glob doesn't) + } + + + incr leveldirs [llength $dirs] + incr levelfiles [llength $files] + + #lappend dir_depths_remaining {*}[lmap d $dirs {::list $nextdepth [du_lit $cont/$itm] $d $zero}] + #folderidx is parent index for new dirs + lappend dir_depths_remaining {*}[_du_new_eachdir $dirs $nextlevel $folderidx] + + #we don't need to sort files (unless we add an option such as -a to du (?)) + set bytecount [expr {0}] + + if {[llength $files] && ![llength $filesizes]} { + #listing mechanism didn't supply corresponding sizes + foreach filename $files { + incr bytecount [file size "$folderpath/$filename"] + } + } else { + set bytecount [tcl::mathop::+ {*}$filesizes] + } + + + #we can safely assume initial count was zero + lset dir_depths_remaining $i $i_size $bytecount + #incr diridx + } + #puts stdout "level: $level dirs: $leveldirs" + if {$opt_extra} { + puts stdout "level: $level dircount: $leveldirs filecount: $levelfiles" + } + incr level ;#zero based + set nextlevel [expr {$level + 1}] + set leveldircount [expr {[llength $dir_depths_remaining] - $listlength }]; #current - previous - while loop terminates when zero + #puts "diridx: $diridx i: $i rem: [llength $dir_depths_remaining] listlenth:$listlength levldircount: $leveldircount" + set diridx $i + set listlength [llength $dir_depths_remaining] + } + #puts stdout ">>> loop done" + #flush stdout + #puts stdout $dir_depths_remaining + set dirs_as_encountered $dir_depths_remaining ;#index is in sync with 'folders' list + set dir_depths_longfirst $dirs_as_encountered + + #store the index before sorting + for {set i 0} {$i < [llength $dir_depths_remaining]} {incr i} { + lset dir_depths_longfirst $i $i_index $i + } + set dir_depths_longfirst [lsort -integer -index 0 -decreasing $dir_depths_longfirst[set dir_depths_longfirst {}]] + + #store main index in the reducing list + set dir_depths_remaining $dir_depths_longfirst + for {set i 0} {$i < [llength $dir_depths_remaining]} {incr i} { + #stored index at position 3 + lset dir_depths_remaining $i $i_index $i + } + + #index 3 + #dir_depths_remaining -> dir_depths_longfirst -> dirs_as_encountered + + #puts stdout "initial dir_depths_remaining: $dir_depths_remaining" + + + #summing performance is not terrible but significant on large tree - the real time is for large trees in the main loop above + #update - on really large trees the reverse is true especiallyl now that twapi fixed the original speed issues.. todo - rework/simplify below - review natsort + # + #TODO - reconsider sorting by depth.. lreverse dirs_as_encountered should work.. + if {[llength $dir_depths_longfirst] > 1} { + set i 0 + foreach dd $dir_depths_longfirst { + lassign $dd d parentidx folderidx item bytecount + #set nm $cont/$item + set nm [lindex $folders $folderidx] + set dnext [expr {$d +1}] + set nextdepthposns [lsearch -all -integer -index 0 $dir_depths_remaining $dnext] + set nextdepthposns [lsort -integer -decreasing $nextdepthposns[set nextdepthposns {}]];#remove later elements first + foreach posn $nextdepthposns { + set id [lindex $dir_depths_remaining $posn $i_itemid] + set ndirname [lindex $folders $id] + #set ndirname $cont/$item + #set item [lindex $dir_depths_remaining $posn $i_item] + #set ndirname [lindex $ndir 1] + if {[string match $nm/* $ndirname]} { + #puts stdout "dir $nm adding subdir size $ndirname" + #puts stdout "incr $nm from $ary($nm,bytes) plus $ary($ndirname,bytes)" + incr bytecount [lindex $dir_depths_remaining $posn $i_size] + set dir_depths_remaining [lreplace $dir_depths_remaining[set dir_depths_remaining {}] $posn $posn] + } + } + lset dir_depths_longfirst $i $i_size $bytecount + set p [lsearch -index $i_index -integer $dir_depths_remaining $i] + lset dir_depths_remaining $p $i_size $bytecount + #set ary($nm,bytes) $bytecount + incr i + } + } + #set dir_depths_longfirst [lsort -index 1 -decreasing $dir_depths_longfirst] + # + + set retval [list] + #copy across the bytecounts + for {set i 0} {$i < [llength $dir_depths_longfirst]} {incr i} { + set posn [lindex $dir_depths_longfirst $i $i_index] + set bytes [lindex $dir_depths_longfirst $i $i_size] + lset dirs_as_encountered $posn $i_size $bytes + } + foreach dirinfo [lreverse $dirs_as_encountered] { + set id [lindex $dirinfo $i_itemid] + set depth [lindex $dirinfo $i_depth] + if {($opt_depth >= 0) && $depth > $opt_depth} { + continue + } + set path [lindex $folders $id] + #set path $cont/$item + set item [lindex $dirinfo $i_item] + set bytes [lindex $dirinfo $i_size] + set size [expr {$bytes / [set $switch]}] + lappend retval [list $size $path] + } + # copyright 2002 by The LIGO Laboratory + return $retval + } + package require natsort + #interp alias {} du {} .=args>* punk::du |> .=>1 natsort::sort -cols 1 |> list_as_lines * punk::du::du |> .=>1 natsort::sort -cols 1 -outputformat words |> list_as_lines * punk::du::du |> .=>1 natsort::sort -cols 1 -outputformat csv -outputformatoptions {\r\t\t\t} |> list_as_lines