# -*- tcl -*- # Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt # # Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. # Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # (C) 2023 # # @@ Meta Begin # Application punk::du 0.1.0 # Meta platform tcl # Meta license BSD # @@ Meta End # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ ## Requirements ##e.g package require frobz package require punk::mix::base package require struct::set namespace eval punk::du { variable has_twapi 0 } if {"windows" eq $::tcl_platform(platform)} { if {![interp issafe]} { package require zzzload zzzload::pkg_require twapi } if {[catch {package require twapi}]} { puts stderr "Warning: punk::du - unable to load twapi. Disk operations may be much slower on windows without the twapi package" } else { set punk::du::has_twapi 1 } #package require punk::winpath } # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ namespace eval punk::du { proc dirlisting {folderpath args} { set defaults [dict create\ -glob *\ ] set opts [dict merge $defaults $args] # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set opt_glob [dict get $opts -glob] # -- --- --- --- --- --- --- --- --- --- --- --- --- --- if {[lib::pathcharacterlen $folderpath] == 0} { set folderpath [pwd] } elseif {[file pathtype $folderpath] ne "absolute"} { #file normalize relativelly slow - avoid in inner loops #set folderpath [file normalize $folderpath] } #run whichever of du_dirlisting_twapi, du_dirlisting_generic, du_dirlisting_unix has been activated set dirinfo [active::du_dirlisting $folderpath {*}$opts] } #Note that unix du seems to do depth-first - which makese sense when piping.. as output can be emitted as we go rather than requiring sort at end. #breadth-first with sort can be quite fast .. but memory usage can easily get out of control proc du { args } { variable has_twapi if 0 { switch -exact [llength $args] { 0 { set dir . set switch -k } 1 { set dir $args set switch -k } 2 { set switch [lindex $args 0] set dir [lindex $args 1] } default { set msg "only one switch and one dir " append msg "currently supported" return -code error $msg } } set switch [string tolower $switch] set -b 1 set -k 1024 set -m [expr 1024*1024] } set opts $args # flags in args are solos (or longopts --something=somethingelse) or sometimes pairopts # we don't currently support mashopts (ie -xy vs separate -x -y) #------------------------------------------------------- # process any pairopts first and remove the pair # (may also process some solo-opts) set opt_depth -1 if {[set posn [lsearch $opts -d]] >= 0} { set opt_depth [lindex $opts $posn+1] set opts [lreplace $opts $posn $posn+1] } foreach o $opts { if {[string match --max-depth=* $o]} { set opt_depth [lindex [split $o =] 1] if {![string is integer -strict $opt_depth]} { error "--max-depth=n n must be an integer" } } } #------------------------------------------------------- #only solos and longopts remain in the opts now set lastarg [lindex $opts end] if {[string length $lastarg] && (![string match -* $lastarg])} { set dir $lastarg set opts [lrange $opts 0 end-1] } else { set dir . set opts $opts } foreach a $opts { if {![string match -* $a]} { error "unrecognized option '$a'" } } set -b 1 set -k 1024 set -m [expr 1024*1024] set switch -k ;#default (same as unix) set lc_opts [string tolower $opts] if {"-b" in $lc_opts} { set switch -b } elseif {"-k" in $lc_opts} { set switch -k } elseif {"-m" in $lc_opts} { set switch -m } set opt_progress 0 if {"--prog" in $lc_opts || "--progress" in $lc_opts} { set opt_progress 1 } set opt_extra 0 if {"--extra" in $lc_opts} { set opt_extra 1 } set opt_vfs 0 #This configures whether to enter a vfsmount point #It will have no effect if cwd already with a vfs mount point - as then opt_vfs will be set to 1 automatically anyway. if {"--vfs" in $lc_opts} { set opt_vfs 1 } set result [list] set dir_depths_remaining [list] set is_windows [expr {$::tcl_platform(platform) eq "windows"}] set zero [expr {0}] # ## ### ### ### ### # containerid and itemid set folders [list] ;#we lookup string by index lappend folders [file dirname $dir] lappend folders $dir ;#itemindex 1 # ## ### ### ### ### if {![file isdirectory $dir]} { lappend dir_depths_remaining [list $zero $zero [expr {1}] [lib::du_lit [file tail $dir]] [file size $dir]] #set ary($dir,bytes) [file size $dir] set leveldircount 0 } else { lappend dir_depths_remaining [list $zero $zero [expr {1}] [lib::du_lit [file tail $dir]] $zero] set leveldircount 1 } set level [expr {0}] set nextlevel [expr {1}] #dir_depths list structure # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- #0 1 2 3 4 5 #i_depth i_containerid i_itemid i_item i_size i_index # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- set i_depth [expr {0}] set i_containerid [expr {1}] set i_itemid [expr {2}] set i_item [expr {3}] set i_size [expr {4}] set i_index [expr {5}] set listlength [llength $dir_depths_remaining] set diridx 0 #this is a breadth-first algorithm while {$leveldircount > 0} { set leveldirs 0 set levelfiles 0 for {set i $diridx} {$i < $listlength} {incr i} { #lassign [lindex $dir_depths_remaining $i] _d containeridx folderidx itm bytecount set folderidx [lindex $dir_depths_remaining $i $i_itemid] set folderpath [lindex $folders $folderidx] #puts stderr ->$folderpath #if {$i >= 20} { #return #} #twapi supports gathering file sizes during directory contents traversal #for dirlisting methods that return an empty list in filesizes whilst files has entries - we will need to populate it below #e.g tcl glob based dirlisting doesn't support gathering file sizes at the same time set in_vfs 0 if {[package provide vfs] ne ""} { foreach vfsmount [vfs::filesystem info] { if {[file pathtype $folderpath] ne "absolute"} { set testpath [file normalize $folderpath] } else { set testpath $folderpath } if {[punk::mix::base::lib::path_a_atorbelow_b $testpath $vfsmount]} { set in_vfs 1 #if already descended to or below a vfs mount point - set opt_vfs true set opt_vfs 1 break } } } if {$in_vfs} { set du_info [lib::du_dirlisting_tclvfs $folderpath] } else { #run the activated function (proc imported to active namespace and renamed) set du_info [active::du_dirlisting $folderpath] } set dirs [dict get $du_info dirs] set files [dict get $du_info files] set filesizes [dict get $du_info filesizes] set vfsmounts [dict get $du_info vfsmounts] #puts "---> vfsmounts $vfsmounts " if {$opt_vfs} { foreach vm $vfsmounts { #puts stderr "vm: $vm" #check if vfs is mounted over a file or a dir if {$vm in $files} { puts stderr "vfs mounted over file $vm" set mposn [lsearch $files $vm] set files [lreplace $files $mposn $mposn] if {[llength $filesizes]} { set filesizes [lreplace $filesizes $mposn $mposn] } } if {$vm ni $dirs} { puts stderr "treating $vm as dir" lappend dirs $vm } } } incr leveldirs [llength $dirs] incr levelfiles [llength $files] #lappend dir_depths_remaining {*}[lmap d $dirs {::list $nextdepth [lib::du_lit $cont/$itm] $d $zero}] #folderidx is parent index for new dirs lappend dir_depths_remaining {*}[lib::du_new_eachdir $dirs $nextlevel $folderidx] #we don't need to sort files (unless we add an option such as -a to du (?)) set bytecount [expr {0}] if {[llength $files] && ![llength $filesizes]} { #listing mechanism didn't supply corresponding sizes foreach filename $files { #incr bytecount [file size [file join $folderpath $filename] incr bytecount [file size $filename] } } else { set filesizes [lsearch -all -inline -not $filesizes[unset filesizes] na] ;#only legal non-number is na set bytecount [tcl::mathop::+ {*}$filesizes] } #we can safely assume initial count was zero lset dir_depths_remaining $i $i_size $bytecount #incr diridx } #puts stdout "level: $level dirs: $leveldirs" if {$opt_extra} { puts stdout "level: $level dircount: $leveldirs filecount: $levelfiles" } incr level ;#zero based set nextlevel [expr {$level + 1}] set leveldircount [expr {[llength $dir_depths_remaining] - $listlength }]; #current - previous - while loop terminates when zero #puts "diridx: $diridx i: $i rem: [llength $dir_depths_remaining] listlenth:$listlength levldircount: $leveldircount" set diridx $i set listlength [llength $dir_depths_remaining] } #puts stdout ">>> loop done" #flush stdout #puts stdout $dir_depths_remaining set dirs_as_encountered $dir_depths_remaining ;#index is in sync with 'folders' list set dir_depths_longfirst $dirs_as_encountered #store the index before sorting for {set i 0} {$i < [llength $dir_depths_remaining]} {incr i} { lset dir_depths_longfirst $i $i_index $i } set dir_depths_longfirst [lsort -integer -index 0 -decreasing $dir_depths_longfirst[set dir_depths_longfirst {}]] #store main index in the reducing list set dir_depths_remaining $dir_depths_longfirst for {set i 0} {$i < [llength $dir_depths_remaining]} {incr i} { #stored index at position 3 lset dir_depths_remaining $i $i_index $i } #index 3 #dir_depths_remaining -> dir_depths_longfirst -> dirs_as_encountered #puts stdout "initial dir_depths_remaining: $dir_depths_remaining" #summing performance is not terrible but significant on large tree - the real time is for large trees in the main loop above #update - on really large trees the reverse is true especiallyl now that twapi fixed the original speed issues.. todo - rework/simplify below - review natsort # #TODO - reconsider sorting by depth.. lreverse dirs_as_encountered should work.. if {[llength $dir_depths_longfirst] > 1} { set i 0 foreach dd $dir_depths_longfirst { lassign $dd d parentidx folderidx item bytecount #set nm $cont/$item set nm [lindex $folders $folderidx] set dnext [expr {$d +1}] set nextdepthposns [lsearch -all -integer -index 0 $dir_depths_remaining $dnext] set nextdepthposns [lsort -integer -decreasing $nextdepthposns[set nextdepthposns {}]];#remove later elements first foreach posn $nextdepthposns { set id [lindex $dir_depths_remaining $posn $i_itemid] set ndirname [lindex $folders $id] #set ndirname $cont/$item #set item [lindex $dir_depths_remaining $posn $i_item] #set ndirname [lindex $ndir 1] if {[string match $nm/* $ndirname]} { #puts stdout "dir $nm adding subdir size $ndirname" #puts stdout "incr $nm from $ary($nm,bytes) plus $ary($ndirname,bytes)" incr bytecount [lindex $dir_depths_remaining $posn $i_size] set dir_depths_remaining [lreplace $dir_depths_remaining[set dir_depths_remaining {}] $posn $posn] } } lset dir_depths_longfirst $i $i_size $bytecount set p [lsearch -index $i_index -integer $dir_depths_remaining $i] lset dir_depths_remaining $p $i_size $bytecount #set ary($nm,bytes) $bytecount incr i } } #set dir_depths_longfirst [lsort -index 1 -decreasing $dir_depths_longfirst] # set retval [list] #copy across the bytecounts for {set i 0} {$i < [llength $dir_depths_longfirst]} {incr i} { set posn [lindex $dir_depths_longfirst $i $i_index] set bytes [lindex $dir_depths_longfirst $i $i_size] lset dirs_as_encountered $posn $i_size $bytes } foreach dirinfo [lreverse $dirs_as_encountered] { set id [lindex $dirinfo $i_itemid] set depth [lindex $dirinfo $i_depth] if {($opt_depth >= 0) && $depth > $opt_depth} { continue } set path [lindex $folders $id] #set path $cont/$item set item [lindex $dirinfo $i_item] set bytes [lindex $dirinfo $i_size] set size [expr {$bytes / [set $switch]}] lappend retval [list $size $path] } # copyright 2002 by The LIGO Laboratory return $retval } namespace eval active { variable functions [list du_dirlisting ""] variable functions_known [dict create] #known functions from lib namespace dict set functions_known du_dirlisting [list du_dirlisting_twapi du_dirlisting_generic du_dirlisting_unix du_dirlisting_undecided] proc show_functions {} { variable functions variable functions_known set msg "" dict for {callname implementations} $functions_known { append msg "callname: $callname" \n foreach imp $implementations { if {[dict get $functions $callname] eq $imp} { append msg " $imp (active)" \n } else { append msg " $imp" \n } } } return $msg } proc set_active_function {callname implementation} { variable functions variable functions_known if {$callname ni [dict keys $functions_known]} { error "unknown function callname $callname" } if {$implementation ni [dict get $functions_known $callname]} { error "unknown implementation $implementation for callname $callname" } dict set functions $callname $implementation catch {rename ::punk::du::active::$callname ""} namespace eval ::punk::du::active [string map [list %imp% $implementation %call% $callname] { namespace import ::punk::du::lib::%imp% rename %imp% %call% }] return $implementation } proc get_active_function {callname} { variable functions variable functions_known if {$callname ni [dict keys $functions_known]} { error "unknown function callname $callname known functions: [dict keys $functions_known]" } return [dict get $functions $callname] } #where we import & the appropriate du_listing.. function for the platform } namespace eval lib { variable du_literal variable winfile_attributes [list 16 directory 32 archive 1024 reparse_point 18 [list directory hidden] 34 [list archive hidden] ] #caching this is faster than calling twapi api each time.. unknown if twapi is calculating from bitmask - or calling windows api #we could work out all flags and calculate from bitmask.. but it's not necessarily going to be faster than some simple caching mechanism like this proc decode_win_attributes {bitmask} { variable winfile_attributes if {[dict exists $winfile_attributes $bitmask]} { return [dict get $winfile_attributes $bitmask] } else { #list/dict shimmering? return [lindex [lappend winfile_attributes $bitmask [twapi::decode_file_attributes $bitmask]] end] } } proc attributes_twapi {path {detail basic}} { try { set iterator [twapi::find_file_open $path -detail $detail] ;# -detail full only adds data to the altname field if {[twapi::find_file_next $iterator iteminfo]} { set attrinfo [decode_win_attributes [dict get $iteminfo attrs]] set result [dict create -archive 0 -hidden 0 -longname $path -readonly 0 -shortname {} -system 0] if {"hidden" in $attrinfo} { dict set result -hidden 1 } if {"system" in $attrinfo} { dict set result -system 1 } if {"readonly" in $attrinfo} { dict set result -readonly 1 } dict set result -shortname [dict get $iteminfo altname] dict set result -rawflags $attrinfo set extras [list] #foreach prop {ctime atime mtime size} { # lappend extras $prop [dict get $iteminfo $prop] #} #dict set result -extras $extras dict set result -raw $iteminfo return $result } else { error "could not read attributes for $path" } } finally { catch {twapi::find_file_close $iterator} } } #todo - review 'errors' key. We have errors relating to containing folder and args vs per child-item errors - additional key needed? namespace export du_dirlisting_twapi du_dirlisting_generic du_dirlisting_unix du_dirlisting_undecided # get listing without using unix-tools (may not be installed on the windows system) # this dirlisting is customised for du - so only retrieves dirs,files,filesizes (minimum work needed to perform du function) # This also preserves path rep for elements in the dirs/folders keys etc - which can make a big difference in performance proc du_dirlisting_twapi {folderpath args} { set defaults [dict create\ -glob *\ -with_sizes 1\ -with_times 1\ ] set opts [dict merge $defaults $args] # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set opt_glob [dict get $opts -glob] # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set opt_with_sizes [dict get $opts -with_sizes] set ftypes [list f d l] if {"$opt_with_sizes" in {0 1}} { #don't use string is boolean - (f false vs f file!) #only accept 0|1 if {$opt_with_sizes} { set sized_types $ftypes } else { set sized_types [list] } } else { set sized_types $opt_with_sizes } if {[llength $sized_types]} { foreach st $sized_types { if {$st ni $ftypes} { error "du_dirlisting_twapi unrecognized element in -with_sizes '$st'" } } } # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set opt_with_times [dict get $opts -with_times] if {"$opt_with_times" in {0 1}} { if {$opt_with_times} { set timed_types $ftypes } else { set timed_types [list] } } else { set timed_types $opt_with_times } if {[llength $timed_types]} { foreach item $timed_types { if {$item ni $ftypes} { error "du_dirlisting_twapi unrecognised element in -with-times '$item'" } } } # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set errors [dict create] set altname "" ;#possible we have to use a different name e.g short windows name or dos-device path //?/ # return it so it can be stored and tried as an alternative for problem paths #puts stderr ">>> glob: $opt_glob" #REVIEW! windows api pattern matchttps://www.red-gate.com/simple-talk/blogs/the-unexpected-behaviour-of-directoryinfo-getfiles-with-three-letter-extensions/hing is .. weird. partly due to 8.3 filenames #https://www.red-gate.com/simple-talk/blogs/the-unexpected-behaviour-of-directoryinfo-getfiles-with-three-letter-extensions/ #we will certainly need to check the resulting listing with our supplied glob.. but maybe we will have to change the glob passed to find_file_open too. # using * all the time may be inefficient - so we might be able to avoid that in some cases. try { #glob of * will return dotfiles too on windows set iterator [twapi::find_file_open [file join $folderpath $opt_glob] -detail basic] ;# -detail full only adds data to the altname field } on error args { try { if {[string match "*denied*" $args]} { #output similar format as unixy du puts stderr "[file dirname $folderpath] '[file tail $folderpath]':$args" dict lappend errors $folderpath $::errorCode return [list dirs {} vfsmounts {} links {} files {} filesizes {} sizes {} times {} flaggedhidden {} flaggedsystem {} flaggedreadonly {} altname {} opts $opts errors $errors] } if {[string match "*TWAPI_WIN32 59*" $::errorCode]} { puts stderr "[file dirname $folderpath] '[file tail $folderpath]':$args (possibly blocked by permissions or share config e.g follow symlinks = no on samba)" puts stderr " (errorcode: $::errorCode)\n" dict lappend errors $folderpath $::errorCode return [list dirs {} vfsmounts {} links {} files {} filesizes {} sizes {} times {} flaggedhidden {} flaggedsystem {} flaggedreadonly {} altname {} opts $opts errors $errors] } #errorcode TWAPI_WIN32 2 {The system cannot find the file specified.} #This can be a perfectly normal failure to match the glob.. which means we shouldn't really warn or error #The find-all glob * won't get here because it returns . & .. #so we should return immediately only if the glob has globchars ? or * but isn't equal to just "*" ? (review) #Note that windows glob ? seems to return more than just single char results - it includes .. - which differs to tcl glob #also ???? seems to returns items 4 or less - not just items exactly 4 long (review - where is this documented?) if {$opt_glob ne "*" && [regexp {[?*]} $opt_glob]} { if {[string match "*TWAPI_WIN32 2 *" $::errorCode]} { #looks like an ordinary no results for chosen glob return [list dirs {} vfsmounts {} links {} files {} filesizes {} sizes {} times {} flaggedhidden {} flaggedsystem {} flaggedreadonly {} altname {} opts $opts errors $errors] } } if {[set plen [pathcharacterlen $folderpath]] >= 250} { set errmsg "error reading folder: $folderpath (len:$plen)\n" append errmsg "error: $args" \n append errmsg "errorcode: $::errorCode" \n # re-fetch this folder with altnames #file normalize - aside from being slow - will have problems with long paths - so this won't work. #this function should only accept absolute paths # # #Note: using -detail full only helps if the last segment of path has an altname.. #To properly shorten we need to have kept track of altname all the way from the root! #We can .. for now call Tcl's file attributes to get shortname of the whole path - it is *expensive* e.g 5ms for a long path on local ssd #### SLOW set fixedpath [dict get [file attributes $folderpath] -shortname] #### SLOW append errmsg "retrying with with windows altname '$fixedpath'" puts stderr $errmsg } else { set errmsg "error reading folder: $folderpath (len:$plen)\n" append errmsg "error: $args" \n append errmsg "errorcode: $::errorCode" \n set tmp_errors [list $::errorCode] #possibly an illegal windows filename - easily happens on a machine with WSL or with drive mapped to unix share #we can use //?/path dos device path - but not with tcl functions #unfortunately we can't call find_file_open directly on the problem name - we have to call the parent folder and iterate through again.. #this gets problematic as we go deeper unless we rewrite the .. but we can get at least one level further here set fixedtail "" set parent [file dirname $folderpath] set badtail [file tail $folderpath] set iterator [twapi::find_file_open [file join $parent *] -detail full] ;#retrieve with altnames while {[twapi::find_file_next $iterator iteminfo]} { set nm [dict get $iteminfo name] if {$nm eq $badtail} { set fixedtail [dict get $iteminfo altname] break } } if {![string length $fixedtail]} { dict lappend errors $folderpath {*}$tmp_errors puts stderr "[file dirname $folderpath] '[file tail $folderpath]':$args (Unable to retrieve altname to progress further with path - returning no contents for this folder)" return [list dirs {} vfsmounts {} links {} files {} filesizes {} sizes {} times {} flaggedhidden {} flaggedsystem {} flaggedreadonly {} altname {} opts $opts errors $errors] } #twapi as at 2023-08 doesn't seem to support //?/ dos device paths.. #Tcl can test only get as far as testing existence of illegal name by prefixing with //?/ - but can't glob inside it #we can call file attributes on it - but we get no shortname (but we could get shortname for parent that way) #so the illegalname_fix doesn't really work here #set fixedpath [punk::winpath::illegalname_fix $parent $fixedtail] #this has shortpath for the tail - but it's not the canonical-shortpath because we didn't call it on the $parent part REIEW. set fixedpath [file join $parent $fixedtail] append errmsg "retrying with with windows dos device path $fixedpath\n" puts stderr $errmsg } if {[catch { set iterator [twapi::find_file_open $fixedpath/* -detail basic] } errMsg]} { puts stderr "[file dirname $folderpath] '[file tail $folderpath]':$args (failed to read even with fixedpath:'$fixedpath')" puts stderr " (errorcode: $::errorCode)\n" puts stderr "$errMsg" dict lappend errors $folderpath $::errorCode return [list dirs {} vfsmounts {} links {} files {} filesizes {} sizes {} times {} flaggedhidden {} flaggedsystem {} flaggedreadonly {} altname {} opts $opts errors $errors] } } on error args { set errmsg "error reading folder: $folderpath\n" append errmsg "error: $args" \n append errmsg "errorInfo: $::errorInfo" \n puts stderr "$errmsg" puts stderr "FAILED to collect info for folder '$folderpath'" #append errmsg "aborting.." #error $errmsg return [list dirs {} vfsmounts {} links {} files {} filesizes {} sizes {} times {} flaggedhidden {} flaggedsystem {} flaggedreadonly {} altname {} opts $opts errors $errors] } } set dirs [list] set files [list] set filesizes [list] set allsizes [dict create] set alltimes [dict create] set links [list] set flaggedhidden [list] set flaggedsystem [list] set flaggedreadonly [list] while {[twapi::find_file_next $iterator iteminfo]} { set nm [dict get $iteminfo name] #recheck glob #review! if {![string match $opt_glob $nm]} { continue } set tail_altname [dict get $iteminfo altname] ;#altname of tail - not whole path set attrinfo [decode_win_attributes [dict get $iteminfo attrs]] #puts stderr "$iteminfo" #puts stderr "$nm -> [dict get $iteminfo attrs] -> $attrinfo" set ftype "" #attributes applicable to any classification set fullname [file_join_one $folderpath $nm] if {"hidden" in $attrinfo} { lappend flaggedhidden $fullname } if {"system" in $attrinfo} { lappend flaggedsystem $fullname } if {"readonly" in $attrinfo} { lappend flaggedreadonly $fullname } #main classification if {"reparse_point" in $attrinfo} { #this concept doesn't correspond 1-to-1 with unix links #https://learn.microsoft.com/en-us/windows/win32/fileio/reparse-points #review - and see which if any actually belong in the links key of our return #One thing it could be, is a 'mounted folder' https://learn.microsoft.com/en-us/windows/win32/fileio/determining-whether-a-directory-is-a-volume-mount-point # #we will treat as zero sized for du purposes.. review - option -L for symlinks like BSD du? #Note 'file readlink' can fail on windows - reporting 'invalid argument' - according to tcl docs, 'On systems that don't support symbolic links this option is undefined' #The link may be viewable ok in windows explorer, and cmd.exe /c dir and unix tools such as ls #if we need it without resorting to unix-tools that may not be installed: exec {*}[auto_execok dir] /A:L {c:\some\path} #e.g (stripped of headers/footers and other lines) #2022-10-02 04:07 AM priv [\\?\c:\repo\elixir\gameportal\apps\test\priv] #Note we will have to parse beyond header fluff as /B strips the symlink info along with headers. #du includes the size of the symlink #but we can't get it with tcl's file size #twapi doesn't seem to have anything to help read it either (?) #the above was verified with a symlink that points to a non-existant folder.. mileage may vary for an actually valid link # #Note also - a shortcut created in explorer with drag and drop to an existant folder is a different animal to a symlink (file with .lnk extension) even though it looks the same in explorer window. # #links are techically files too, whether they point to a file/dir or nothing. lappend links $fullname set ftype "l" } elseif {"directory" in $attrinfo} { if {$nm in {. ..}} { continue } lappend dirs $fullname set ftype "d" } else { #review - is anything that isn't a reparse_point or a directory, some sort of 'file' in this context? What about the 'device' attribute? Can that occur in a directory listing of some sort? lappend files $fullname if {"f" in $sized_types} { lappend filesizes [dict get $iteminfo size] } set ftype "f" } if {$ftype in $sized_types} { dict set allsizes $fullname [dict create bytes [dict get $iteminfo size]] } if {$ftype in $timed_types} { #convert time from windows (100ns units since jan 1, 1601) to Tcl time (seconds since Jan 1, 1970) #We lose some precision by not passing the boolean to the large_system_time_to_secs_since_1970 function which returns fractional seconds #but we need to maintain compatibility with other platforms and other tcl functions so if we want to return more precise times we will need another flag and/or result dict dict set alltimes $fullname [dict create\ c [twapi::large_system_time_to_secs_since_1970 [dict get $iteminfo ctime]]\ a [twapi::large_system_time_to_secs_since_1970 [dict get $iteminfo atime]]\ m [twapi::large_system_time_to_secs_since_1970 [dict get $iteminfo mtime]]\ ] } } twapi::find_file_close $iterator set vfsmounts [get_vfsmounts_in_folder $folderpath] set effective_opts $opts dict set effective_opts -with_times $timed_types dict set effective_opts -with_sizes $sized_types #also determine whether vfs. file system x is *much* faster than file attributes #whether or not there is a corresponding file/dir add any applicable mountpoints for the containing folder return [list dirs $dirs vfsmounts $vfsmounts links $links files $files filesizes $filesizes sizes $allsizes times $alltimes flaggedhidden $flaggedhidden flaggedsystem $flaggedsystem flaggedreadonly $flaggedreadonly altname $altname opts $effective_opts errors $errors] } proc get_vfsmounts_in_folder {folderpath} { set vfsmounts [list] if {![llength [package provide vfs]]} { return [list] } set fpath [punk::objclone $folderpath] set is_rel 0 if {[file pathtype $fpath] ne "absolute"} { set fpath [file normalize $fpath] set is_rel 1 } set known_vfs_mounts [vfs::filesystem info] foreach mount $known_vfs_mounts { if {[punk::mix::base::lib::path_a_above_b $fpath $mount]} { if {([llength [file split $mount]] - [llength [file split $fpath]]) == 1} { #the mount is in this folder if {$is_rel} { lappend vfsmounts [file join $folderpath [file tail $mount]] } else { lappend vfsmounts $mount } } } } return $vfsmounts } #work around the horrible tilde-expansion thing (not needed for tcl 9+) proc file_join_one {base newtail} { if {[string index $newtail 0] ne {~}} { return [file join $base $newtail] } return [file join $base ./$newtail] } #this is the cross-platform pure-tcl version - which calls glob multiple times to make sure it gets everythign it needs and can ignore everything it needs to. #These repeated calls to glob will be a killer for performance - especially on a network share or when walking a large directory structure proc du_dirlisting_generic {folderpath args} { set opts [dict create\ -glob *\ -with_sizes 0\ -with_times 0\ ] set errors [dict create] foreach {k v} $args { switch -- $k { -glob - -with_sizes - -with_times { dict set opts $k $v } default { error "du_dirlisting_generic unknown-option '$k'. Known-options: [dict keys $opts]" } } } # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set opt_glob [dict get $opts -glob] # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set opt_with_sizes [dict get $opts -with_sizes] set ftypes [list f d l] if {"$opt_with_sizes" in {0 1}} { #don't use string is boolean (false vs f problem where f indicates file) if {$opt_with_sizes} { set sized_types $ftypes } else { set sized_types [list] } } else { set sized_types $opt_with_sizes } if {[llength $sized_types]} { foreach st $sized_types { if {$st ni $ftypes} { error "du_dirlisting_generic unrecognized element in -with_sizes '$st'" } } } # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set opt_with_times [dict get $opts -with_times] if {"$opt_with_times" in {0 1}} { if {$opt_with_times} { set timed_types $ftypes } else { set timed_types [list] } } else { set timed_types $opt_with_times } if {[llength $timed_types]} { foreach item $timed_types { if {$item ni $ftypes} { error "du_dirlisting_generic unrecognised element in -with-times '$item'" } } } # -- --- --- --- --- --- --- --- --- --- --- --- --- --- # The repeated globs are a source of slowness for this function. #TODO - we could minimize the number of globs if we know we need to do a file stat and/or file attributes on each entry anyway #For the case where we don't need times,sizes or other metadata - it is faster to do multiple globs #This all makes this function complicated to gather the required data efficiently. #note platform differences between what is considered hidden make this tricky. # on windows 'glob .*' will not return some hidden dot items but will return . .. and glob -types hidden .* will not return some dotted items # glob -types hidden * on windows will not necessarily return all dot files/folders # unix-like platforms seem to consider all dot files as hidden so processing is more straightforward # we need to process * and .* in the same glob calls and remove duplicates # if we do * and .* in separate iterations of this loop we lose the ability to filter duplicates easily #note - with this design, we can't glob for all except dotfiles - this is for cross-platform consistency and efficiency (Review). #dotfiles aren't considered hidden on all platforms #some sort of antiglob is a possible enhancement and more consistent with the view that leading dot should be treated as any other filename character in this context. if {$opt_glob eq "*"} { #Note - glob operations seem to be faster than looped tests like 'file isdirectory' & 'file readlink' #set parent [lindex $folders $folderidx] set hdirs [glob -nocomplain -dir $folderpath -types {hidden d} * .*] #set hdirs {} set dirs [glob -nocomplain -dir $folderpath -types d * .*] set hlinks [glob -nocomplain -dir $folderpath -types {hidden l} * .*] #set hlinks {} set links [glob -nocomplain -dir $folderpath -types l * .*] ;#links may have dupes - we don't care. struct::set difference will remove (?) #set links [lsort -unique [concat $hlinks $links[unset links]]] set hfiles [glob -nocomplain -dir $folderpath -types {hidden f} * .*] #set hfiles {} set files [glob -nocomplain -dir $folderpath -types f * .*] #set files {} } else { set hdirs [glob -nocomplain -dir $folderpath -types {hidden d} $opt_glob] set dirs [glob -nocomplain -dir $folderpath -types d $opt_glob] set hlinks [glob -nocomplain -dir $folderpath -types {hidden l} $opt_glob] set links [glob -nocomplain -dir $folderpath -types l $opt_glob] ;#links may have dupes - we don't care. struct::set difference will remove (?) set hfiles [glob -nocomplain -dir $folderpath -types {hidden f} $opt_glob] set files [glob -nocomplain -dir $folderpath -types f $opt_glob] } #note struct::set difference produces unordered result #struct::set difference removes duplicates (but not always.. e.g if using tcl impl and 2nd element empty!) #relying on struct::set to remove dupes is somewhat risky. It is not well documented - and behaviour of dupes in inputs is underspecified as it appears to be intended for mathematical 'sets' #remove links and . .. from directories, remove links from files #struct::set will affect order: tcl vs critcl give different ordering! set files [struct::set difference [concat $hfiles $files[unset files]] $links] set dirs [struct::set difference [concat $hdirs $dirs[unset dirs]] [concat $links [list [file join $folderpath .] [file join $folderpath ..] ]]] #set links [lsort -unique [concat $links $hlinks]] #---- set mdata_lists [du_get_metadata_lists $sized_types $timed_types $files $dirs $links] if {"windows" eq $::tcl_platform(platform)} { set flaggedhidden [concat $hdirs $hfiles $hlinks] } else { #unix dotted files/folders are not 'flagged' as such - it's a convention - so the client can use the same convention to decide if something is hidden #this allows us to return less data - but more importantly - reserve flaggedhidden for systems where such a flag exists and avoid conflating the different concepts of what is hidden set flaggedhidden {} } set vfsmounts [get_vfsmounts_in_folder $folderpath] set effective_opts $opts dict set effective_opts -with_times $timed_types dict set effective_opts -with_sizes $sized_types return [list dirs $dirs vfsmounts $vfsmounts links $links files $files filesizes [dict get $mdata_lists fsizes] sizes [dict get $mdata_lists allsizes] times [dict get $mdata_lists alltimes] flaggedhidden {} flaggedsystem {} flaggedreadonly {} altname {} opts $effective_opts errors $errors] } proc du_dirlisting_tclvfs {folderpath args} { set defaults [dict create\ -glob *\ -with_sizes 0\ -with_times 0\ ] set opts [dict merge $defaults $args] # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set opt_glob [dict get $opts -glob] # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set opt_with_sizes [dict get $opts -with_sizes] set ftypes [list f d l] if {"$opt_with_sizes" in {0 1}} { #don't use string is boolean (false vs f problem where f indicates file) if {$opt_with_sizes} { set sized_types $ftypes } else { set sized_types [list] } } else { set sized_types $opt_with_sizes } if {[llength $sized_types]} { foreach st $sized_types { if {$st ni $ftypes} { error "du_dirlisting_generic unrecognized element in -with_sizes '$st'" } } } # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set opt_with_times [dict get $opts -with_times] if {"$opt_with_times" in {0 1}} { if {$opt_with_times} { set timed_types $ftypes } else { set timed_types [list] } } else { set timed_types $opt_with_times } if {[llength $timed_types]} { foreach item $timed_types { if {$item ni $ftypes} { error "du_dirlisting_generic unrecognised element in -with-times '$item'" } } } # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set errors [dict create] if {$opt_glob eq "*"} { set dirs [glob -nocomplain -dir $folderpath -types d * .*] ;# also returns links to dirs #review - how are links handled in vfs? presumably if the vfs is a full implementation they should work at least within the vfs? set links [glob -nocomplain -dir $folderpath -types l * .*] ;# links may have dupes - we don't care. struct::set difference will remove set files [glob -nocomplain -dir $folderpath -types f * .*] ;# also returns links to files } else { set dirs [glob -nocomplain -dir $folderpath -types d $opt_glob] set links [glob -nocomplain -dir $folderpath -types l $opt_glob] set files [glob -nocomplain -dir $folderpath -types f $opt_glob] } #remove any links from our dirs and files collections set dirs [struct::set difference $dirs[unset dirs] [concat $links [list [file join $folderpath .] [file join $folderpath ..] ]]] set files [struct::set difference $files[unset files] $links] #nested vfs mount.. REVIEW - does anything need special handling? set vfsmounts [get_vfsmounts_in_folder $folderpath] set mdata_lists [du_get_metadata_lists $sized_types $timed_types $files $dirs $links] set effective_opts $opts dict set effective_opts -with_times $timed_types dict set effective_opts -with_sizes $sized_types return [list dirs $dirs vfsmounts $vfsmounts links $links files $files filesizes [dict get $mdata_lists fsizes] sizes [dict get $mdata_lists allsizes] times [dict get $mdata_lists alltimes] flaggedhidden {} flaggedsystem {} flaggedreadonly {} altname {} opts $effective_opts errors $errors] } #we can halve the number of round trips on unix-like systems, where 'hidden' always corresponds to dotted files proc du_dirlisting_unix {folderpath args} { set defaults [dict create\ -glob *\ -with_sizes 0\ -with_times 0\ ] set errors [dict create] dict lappend errors $folderpath "metadata support incomplete - prefer du_dirlisting_generic" set opts [dict merge $defaults $args] # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set opt_glob [dict get $opts -glob] # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set opt_with_sizes [dict get $opts -with_sizes] set ftypes [list f d l] if {"$opt_with_sizes" in {0 1}} { #don't use string is boolean (false vs f problem where f indicates file) if {$opt_with_sizes} { set sized_types $ftypes } else { set sized_types [list] } } else { set sized_types $opt_with_sizes } if {[llength $sized_types]} { foreach st $sized_types { if {$st ni $ftypes} { error "du_dirlisting_generic unrecognized element in -with_sizes '$st'" } } } # -- --- --- --- --- --- --- --- --- --- --- --- --- --- set opt_with_times [dict get $opts -with_times] if {"$opt_with_times" in {0 1}} { if {$opt_with_times} { set timed_types $ftypes } else { set timed_types [list] } } else { set timed_types $opt_with_times } if {[llength $timed_types]} { foreach item $timed_types { if {$item ni $ftypes} { error "du_dirlisting_generic unrecognised element in -with-times '$item'" } } } #this is not consistent with standard way * works for unix folders - but it is consistent with behaviour of this facility on windows if {$opt_glob eq "*"} { set dirs [glob -nocomplain -dir $folderpath -types d * .*] ;# also returns links to dirs set links [glob -nocomplain -dir $folderpath -types l * .*] ;# links may have dupes - we don't care. struct::set difference will remove set files [glob -nocomplain -dir $folderpath -types f * .*] ;# also returns links to files } else { set dirs [glob -nocomplain -dir $folderpath -types d $opt_glob] set links [glob -nocomplain -dir $folderpath -types l $opt_glob] set files [glob -nocomplain -dir $folderpath -types f $opt_glob] } #remove any links from our dirs and files collections set dirs [struct::set difference $dirs[unset dirs] [concat $links [list [file join $folderpath .] [file join $folderpath ..] ]]] set files [struct::set difference $files[unset files] $links] set vfsmounts [get_vfsmounts_in_folder $folderpath] set mdata_lists [du_get_metadata_lists $sized_types $timed_types $files $dirs $links] set effective_opts $opts dict set effective_opts -with_times $timed_types dict set effective_opts -with_sizes $sized_types return [list dirs $dirs vfsmounts $vfsmounts links $links files $files filesizes [dict get $mdata_lists fsizes] sizes [dict get $mdata_lists allsizes] times [dict get $mdata_lists alltimes] flaggedhidden {} flaggedsystem {} flaggedreadonly {} altname {} opts $opts errors $errors] } #return fsizes,allsizes,alltimes metadata in same order as files,dirs,links lists - if specified in sized_types proc du_get_metadata_lists {sized_types timed_types files dirs links} { set meta_dict [dict create] set meta_types [concat $sized_types $timed_types] #known tcl stat keys 2023 - review set empty_stat_dict [dict create atime {} ctime {} dev {} gid {} ino {} mode {} mtime {} nlink {} size {} type {} uid {}] #make sure we call file stat only once per item set statkeys [list] if {[llength $meta_types]} { foreach ft {f d l} lvar {files dirs links} { if {"$ft" in $meta_types} { foreach path [set $lvar] { #caller may have read perm on the containing folder - but not on child item - so file stat could raise an error if {![catch {file stat $path arrstat} errM]} { dict set meta_dict $path [dict create shorttype $ft {*}[array get arrstat]] } else { dict lappend errors $path "file stat error: $errM" dict set meta_dict $path [dict create shorttype $ft {*}$empty_stat_dict] } } } } } set fsizes [list] set allsizes [dict create] set alltimes [dict create] #review birthtime field of stat? cross-platform differences ctime etc? dict for {path pathinfo} $meta_dict { set ft [dict get $pathinfo shorttype] if {$ft in $sized_types} { dict set allsizes $path [dict create bytes [dict get $pathinfo size]] if {$ft eq "f"} { #subst with na if empty? lappend fsizes [dict get $pathinfo size] } } if {$ft in $timed_types} { dict set alltimes $path [dict create c [dict get $pathinfo ctime] a [dict get $pathinfo atime] m [dict get $pathinfo mtime]] } } #todo - fix . The list lengths will presumably match but have empty values if failed to stat if {"f" in $sized_types} { if {[llength $fsizes] ne [llength $files]} { dict lappend errors $folderpath "failed to retrieve all file sizes" } } return [dict create fsizes $fsizes allsizes $allsizes alltimes $alltimes] } proc du_lit value { variable du_literal if {![info exists du_literal($value)]} { set du_literal($value) $value } return $du_literal($value) } #v1 proc du_new_eachdirtail {dirtails depth parentfolderidx} { set newlist {} upvar folders folders set parentpath [lindex $folders $parentfolderidx] set newindex [llength $folders] foreach dt $dirtails { lappend folders [file join $parentpath [du_lit $dt]]; #store as a 'path' rather than a string (see tcl::unsupported::representation) lappend newlist [::list $depth $parentfolderidx $newindex [du_lit $dt] [expr {0}]] incr newindex } return $newlist } proc du_new_eachdir {dirpaths depth parentfolderidx} { set newlist {} upvar folders folders set newindex [llength $folders] foreach dp $dirpaths { lappend folders $dp #puts stdout "--->$dp" lappend newlist [::list $depth $parentfolderidx $newindex [du_lit [file tail $dp]] [expr {0}]] incr newindex } return $newlist } #same implementation as punk::strlen #get length of path which has internal rep of path - maintaining path/list rep without shimmering to string representation. proc pathcharacterlen {pathrep} { append str2 $pathrep {} string length $str2 } #just an experiment proc pathcharacterlen1 {pathrep} { #This works - but is unnecessarily complex set l 0 set parts [file split $pathrep] if {[llength $parts] < 2} { return [string length [lindex $parts 0]] } foreach seg $parts { incr l [string length $seg] } return [expr {$l + [llength $parts] -2}] } #slower - doesn't work for short paths like c:/ proc pathcharacterlen2 {pathrep} { return [tcl::mathop::+ {*}[lmap v [set plist [file split $pathrep]] {[string length $v]}] [llength $plist] -2] } #Strip using lengths without examining path components #without normalization is much faster proc path_strip_alreadynormalized_prefixdepth {path prefix} { set tail [lrange [file split $path] [llength [file split $prefix]] end] if {[llength $tail]} { return [file join {*}$tail] } else { return "" } } proc du_dirlisting_undecided {folderpath args} { if {"windows" eq $::tcl_platform(platform)} { #jmn disable twapi #tailcall du_dirlisting_generic $folderpath {*}$args set loadstate [zzzload::pkg_require twapi] if {$loadstate ni [list loading failed]} { #either already loaded by zzload or ordinary package require package require twapi ;#should be fast once twapi dll loaded in zzzload thread set ::punk::du::has_twapi 1 punk::du::active::set_active_function du_dirlisting du_dirlisting_twapi tailcall du_dirlisting_twapi $folderpath {*}$args } else { if {$loadstate eq "failed"} { puts stderr "punk::du defaulting to du_dirlisting_generic because twapi load failed" punk::du::active::set_active_function du_dirlisting du_dirlisting_generic } tailcall du_dirlisting_generic $folderpath {*}$args } } else { punk::du::active::set_active_function du_dirlisting du_dirlisting_unix tailcall du_dirlisting_unix $folderpath {*}$args } } } package require natsort #interp alias {} du {} .=args>* punk::du |> .=>1 natsort::sort -cols 1 |> list_as_lines * punk::du::du |> .=>1 natsort::sort -cols 1 -outputformat words |> list_as_lines * punk::du::du |> .=>1 natsort::sort -cols 1 -outputformat csv -outputformatoptions {\r\t\t\t} |> list_as_lines --