punkshell/src/vendormodules/fauxlink-0.1.1.tm

# -*- tcl -*-
# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from <pkg>-buildversion.txt
#
# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem.
# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository.
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# (C) 2024
#
# @@ Meta Begin
# Application fauxlink 0.1.1
# Meta platform     tcl
# Meta license      MIT
# @@ Meta End


# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# doctools header
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[manpage_begin fauxlink_module_fauxlink 0 0.1.1]
#[copyright "2024"]
#[titledesc {faux link application shortcuts}]  [comment {-- Name section and table of contents description --}]
#[moddesc {.fauxlink .fxlnk}]             [comment {-- Description at end of page heading --}]
#[require fauxlink]
#[keywords symlink faux fake shortcut toml]
#[description]
#[para] A cross platform shortcut/symlink alternative.
#[para] Unapologetically ugly - but practical in certain circumstances.
#[para] A solution is required for application-driven filesystem links that survives cross platform moves as well as
#[para] archiving and packaging systems.
#[para] The target is specified in a minimally-encoded form in the filename itself - but still human readable.
#[para] format of name <nominalname>#<encodedtarget>.fauxlink
#[para] where <nominalname> can be empty - then the effective nominal name is the tail of the <encodedtarget>
#[para] The file extension must be .fauxlink or .fxlnk
#[para] The + symbol substitutes for forward-slashes.
#[para] Other chars can be encoded using url-like encoding - (but only up to %7E !)
#[para] We deliberately treat higher % sequences literally.
#[para] This means actual uri::urn encoded unicode sequences (e.g %E2%99%A5 [lb]heart[rb]) can remain literal for linking to urls.
#[para] e.g if an actual + or # is required in a filename or path segment they can be encoded as %2B & %23
#[para] e.g a link to a file file#A.txt in parent dir could be:
#[para] file%23A.txt#..+file%23A.txt.fauxlink
#[para] or equivalently (but obviously affecting sorting) #..+file%23A.txt.fauxlink
#[para] The <nominalname> can be unrelated to the actual target
#[para] e.g datafile.dat#..+file%23A.txt.fauxlink
#[para] This system has no filesystem support - and must be completely application driven.
#[para] This can be useful for example in application test packages which may be tarred or zipped and moved cross platform.
#[para] The target being fully specified in the name means the file doesn't have to be read for the target to be determined
#[para] Extensions to behaviour should be added in the file as text data in Toml format,
#[para] with custom data being under a single application-chosen table name
#[para] The toplevel Toml table [lb]fauxlink[rb] is reserved for core extensions to this system.
#[para] Aside from the 2 used for delimiting (+ #)
#[para] certain characters which might normally be allowed in filesystems are required to be encoded
#[para] e.g space and tab are required to be %20 %09
#[para] Others that require encoding are: * ? \ / | : ; " < >
#[para] The nul character in raw form, when detected, is always mapped away to the empty string - as very few filesystems support it.
#[para] Control characters and other punctuation is optional to encode.
#[para] Generally utf-8 should be used where possible and unicode characters can often be left unencoded on modern systems.
#[para] Where encoding of unicode is desired in the nominalname,encodedtarget,tag or comment portions it can be specified as %UXXXXXXXX
#[para] There must be between 1 and 8 X digits following the %U. Interpretation of chars following %U stops at the first non-hex character.
#[para] This means %Utest would not get any translation as there were no hex digits so it would come out as %Utest
#
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++

#https://learn.microsoft.com/en-us/troubleshoot/windows-client/networking/url-encoding-unc-paths-not-url-decoded
# ie "//server/c/Program files" works but "//server/c/Program%20Files" is now treated by windows as a literal path with %20 in it.
#Using fauxlink - a link would be:
# "my-program-files#++server+c+Program%20Files.fauxlink"
#If we needed the old-style literal %20 it would become
# "my-program-files#++server+c+Program%2520Files.fauxlink"
#
# The file:// scheme on windows supposedly *does* decode %xx (for use in a browser)
# e.g
# pfiles#file%3a++++localhost+c+Program%2520files
# The browser will work with literal spaces too though - so it could just as well be:
# pfiles#file%3a++++localhost+c+Program%20files
#windows may default to using explorer.exe instead of a browser for file:// urls though
#and explorer doesn't want the literal %20.  It probably depends what API the file:// url is to be passed to?
#in a .url shortcut either literal space or %20 will work ie %xx values are decoded


#*** !doctools
#[section Overview]
#[para] overview of fauxlink
#[subsection Concepts]
#[para] -


# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
##  Requirements
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++

#*** !doctools
#[subsection dependencies]
#[para] packages used by fauxlink
#[list_begin itemized]

package require Tcl 8.6-
#*** !doctools
#[item] [package {Tcl 8.6-}]

# #package require frobz
# #*** !doctools
# #[item] [package {frobz}]

#*** !doctools
#[list_end]

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++

#*** !doctools
#[section API]

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# oo::class namespace
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
namespace eval fauxlink::class {
    #*** !doctools
    #[subsection {Namespace fauxlink::class}]
    #[para] class definitions
    if {[info commands [namespace current]::interface_sample1] eq ""} {
        #*** !doctools
        #[list_begin enumerated]

        # oo::class create interface_sample1 {
        #     #*** !doctools
        #     #[enum] CLASS [class interface_sample1]
        #     #[list_begin definitions]

        #     method test {arg1} {
        #         #*** !doctools
        #         #[call class::interface_sample1    [method test] [arg arg1]]
        #         #[para] test method
        #         puts "test: $arg1"
        #     }

        #     #*** !doctools
        #     #[list_end] [comment {-- end definitions interface_sample1}]
        # }

        #*** !doctools
        #[list_end] [comment {--- end class enumeration ---}]
    }
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# Base namespace
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
namespace eval fauxlink {
    namespace export {[a-z]*};   # Convention: export all lowercase

    #todo - enforce utf-8

    #literal unicode chars supported by modern filesystems - leave as is - REVIEW


    variable encode_map
    variable decode_map
    #most filesystems don't allow NULL - map to empty string

    #Make sure % is not in encode_map
    set encode_map [dict create\
        \x00    ""\
        { }     %20\
        \t      %09\
        +       %2B\
        #       %23\
        *       %2A\
        ?       %3F\
        \\      %5C\
        /       %2F\
        |       %7C\
        :       %3A\
        {;}     %3B\
        {"}     %22\
        <       %3C\
        >       %3E\
    ]
    #above have some overlap with ctrl codes below.
    #no big deal as it's a dict

    #must_encode
    # + # * ? \ / | : ; " < > <sp> \t
    # also NUL to empty string

    # also ctrl chars 01 to 1F (1..31)
    for {set i 1} {$i < 32} {incr i} {
        set ch          [format %c $i]
        set enc         "%[format %02X $i]"
        set enc_lower   [string tolower $enc]
        dict set encode_map $ch         $enc
        dict set decode_map $enc        $ch
        dict set decode_map $enc_lower  $ch
    }

    variable must_encode
    set must_encode [dict keys $encode_map]


    #if they are in

    #decode map doesn't include
    #    %00     (nul)
    #    %2F     "/"
    #    %2f     "/"
    #    %7f     (del)
    #we exlude the forward slash because we already have + for that - and multiple ways to specify it obscure intention.
    #
    set decode_map [dict merge $decode_map [dict create\
        %09     \t\
        %20     { }\
        %21     "!"\
        %22     {"}\
        %23     "#"\
        %24     "$"\
        %25     "%"\
        %26     "&"\
        %27     "'"\
        %28     "("\
        %29     ")"\
        %2A     "*"\
        %2a     "*"\
        %2B     "+"\
        %2b     "+"\
        %2C     ","\
        %2c     ","\
        %2D     "-"\
        %2d     "-"\
        %2E     "."\
        %2e     "."\
        %3A     ":"\
        %3a     ":"\
        %3B     {;}\
        %3b     {;}\
        %3D     "="\
        %3C     "<"\
        %3c     "<"\
        %3d     "="\
        %3E     ">"\
        %3e     ">"\
        %3F     "?"\
        %3f     "?"\
        %40     "@"\
        %5B     "\["\
        %5b     "\["\
        %5C     "\\"\
        %5c     "\\"\
        %5D     "\]"\
        %5d     "\]"\
        %5E     "^"\
        %5e     "^"\
        %60     "`"\
        %7B     "{"\
        %7b     "{"\
        %7C     "|"\
        %7c     "|"\
        %7D     "}"\
        %7d     "}"\
        %7E     "~"\
        %7e     "~"\
    ]]
    #Don't go above 7f
    #if we want to specify p


    #*** !doctools
    #[subsection {Namespace fauxlink}]
    #[para] Core API functions for fauxlink
    #[list_begin definitions]
    proc Segment_mustencode_check {str} {
        variable decode_map
        variable encode_map ;#must_encode
        set idx 0
        set err ""
        foreach ch [split $str ""] {
            if {[dict exists $encode_map $ch]} {
                set enc [dict get $encode_map $ch]
                if {[dict exists $decode_map $enc]} {
                    append err "  char $idx should be encoded as $enc" \n
                } else {
                    append err "  no %xx encoding available. Use %UXX if really required" \n
                }
            }
            incr idx
        }
        return $err ;#empty string if ok
    }

    proc resolve {link} {
        variable decode_map
        variable encode_map
        variable must_encode
        set ftail [file tail $link]
        set extension_name [string range [file extension $ftail] 1 end]
        if {$extension_name ni [list fxlnk fauxlink]} {
            set is_fauxlink 0
            #we'll process anyway - but return the result wrapped
            #This should allow deliberate erroring for the calling dict user if the extension difference is inadvertent
            #(e.g blindly processing all files in a folder that is normally only .fauxlink files - but then something added that happens
            # to have # characters in it)
            #It also means if someone really wants to use the fauxlink semantics on a different file type
            # - they can - but just have to access the results differently and take that (minor) risk.
            #error "fauxlink::resolve refusing to process link $link - file extension must be .fxlnk or .fauxlink"
            set err_extra "\nnonstandard extension '$extension_name' for fauxlink. (expected .fxlnk or .fauxlink) Check that the call to fauxlink::resolve was deliberate"
        } else {
            set is_fauxlink 1
            set err_extra ""
        }
        set linkspec [file rootname $ftail]
        # - any # or + within the target path or name should have been uri encoded as %23 and %2b
        if {[tcl::string::first # $linkspec] < 0} {
            set err "fauxlink::resolve '$link'. Link must contain a # (usually at start if name matches target)"
            append err $err_extra
            error $err
        }
        #The 1st 2 parts of split on # are name and target file/dir
        #If there are only 3 parts the 3rd part is a comment and there are no 'tags'
        #if there are 4 parts - the 3rd part is a tagset where each tag begins with @
        #and each subsequent part is a comment. Empty comments are stripped from the comments list
        #A tagset can be empty - but if it's not empty it must contain at least one @ and must start with @
        #e.g name.txt#path#@tag1@tag2#test###.fauxlink
        #has a name, a target, 2 tags and one comment

        #check namespec already has required chars encoded
        set segments [split $linkspec #]
        lassign $segments namespec targetspec
        #puts stderr "-->namespec $namespec"
        set nametest [tcl::string::map $encode_map $namespec]
        #puts stderr "-->nametest $nametest"
        #nothing should be changed - if there are unencoded chars that must be encoded it is an error
        if {[tcl::string::length $nametest] ne [tcl::string::length $namespec]} {
            set err "fauxlink::resolve '$link' invalid chars in name part (section prior to first #)"
            append err [Segment_mustencode_check $namespec]
            append err $err_extra
            error $err
        }
        #see comments below regarding 2 rounds and ordering.
        set name        [decode_unicode_escapes $namespec]
        set name        [tcl::string::map $decode_map $name]
        #puts stderr "-->name: $name"

        set targetsegment [split $targetspec +]
        #check each + delimited part of targetspec already has required chars encoded
        set pp 0 ;#pathpart index
        set targetpath_parts [list]
        foreach pathpart $targetsegment {
            set targettest [tcl::string::map $encode_map $pathpart]
            if {[tcl::string::length $targettest] ne [tcl::string::length $pathpart]} {
                set err "fauxlink::resolve '$link' invalid chars in targetpath (section following first #)"
                append err [Segment_mustencode_check $pathpart]
                append err $err_extra
                error $err
            }
            #2 rounds of substitution is possibly asking for trouble..
            #We allow anything in the resultant segments anyway (as %UXXXX... allows all)
            #so it's not so much about what can be encoded,
            # - but it makes it harder to reason about for users
            # In particular - if we map %XX first it makes %25 -> % substitution tricky
            # if the user requires a literal %UXXX - they can't do %25UXXX
            # the double sub would make it %UXXX -> somechar anyway.
            #we do unicode first - as a 2nd round of %XX substitutions is unlikely to interfere.
            #There is still the opportunity to use things like %U00000025 followed by hex-chars
            # and get some minor surprises, but using %U on ascii is unlikely to be done accidentally - REVIEW
            set pathpart             [decode_unicode_escapes $pathpart]
            set pathpart             [tcl::string::map $decode_map $pathpart]
            lappend targetpath_parts $pathpart

            incr pp
        }
        set targetpath [join $targetpath_parts /]
        if {$name eq ""} {
            set name [lindex $targetpath_parts end]
        }
        #we do the same encoding checks on tags and comments to increase chances of portability
        set tags [list]
        set comments [list]
        switch -- [llength $segments] {
            2 {
                #no tags or comments
            }
            3 {
                #only 3 sections - last is comment - even if looks like tags
                #to make the 3rd part a tagset, an extra # would be needed
                set comments [list [lindex $segments 2]]
            }
            default {
                set tagset [lindex $segments 2]
                if {$tagset eq ""} {
                    #ok - no tags
                } else {
                    if {[string first @ $tagset] != 0} {
                        set err     "fauxlink::resolve '$link' invalid tagset in 3rd #-delimited segment"
                        append err  \n " - must begin with @"
                        append err $err_extra
                        error $err
                    } else {
                        set tagset [string range $tagset 1 end]
                        set rawtags [split $tagset @]
                        set tags [list]
                        foreach t $rawtags {
                            if {$t eq ""} {
                                lappend tags ""
                            } else {
                                set tagtest [tcl::string::map $encode_map $t]
                                if {[tcl::string::length $tagtest] ne [tcl::string::length $t]} {
                                    set err "fauxlink::resolve '$link' invalid chars in tag [llength $tags]"
                                    append err [Segment_mustencode_check $t]
                                    append err $err_extra
                                    error $err
                                }
                                lappend tags [tcl::string::map $decode_map [decode_unicode_escapes $t]]
                            }
                        }
                    }
                }
                set rawcomments [lrange $segments 3 end]
                #set comments  [lsearch -all -inline -not $comments ""]
                set comments [list]
                foreach c $rawcomments {
                    if {$c eq ""} {continue}
                    set commenttest [tcl::string::map $encode_map $c]
                    if {[tcl::string::length $commenttest] ne [tcl::string::length $c]} {
                        set err "fauxlink::resolve '$link' invalid chars in comment [llength $comments]"
                        append err [Segment_mustencode_check $c]
                        append err $err_extra
                        error $err
                    }
                    lappend comments [tcl::string::map $decode_map [decode_unicode_escapes $c]]
                }
            }
        }

        set data [dict create name $name targetpath $targetpath tags $tags comments $comments fauxlinkextension $extension_name]
        if {$is_fauxlink} {
            #standard .fxlnk or .fauxlink
            return $data
        } else {
            #custom extension - or called in error on wrong type of file but happened to parse.
            #see comments at top regarding is_fauxlink
            #make sure no keys in common at top level.
            return [dict create\
                linktype $extension_name\
                note "nonstandard extension returning nonstandard dict with result in data key"\
                data $data\
            ]
        }
    }
    variable map

    #default exclusion of / (%U2f and equivs)
    #this would allow obfuscation of intention - when we have + for that anyway
    proc decode_unicode_escapes {str {exclusions {/ \n \r \x00}}} {
        variable map
        set ucstart [string first %U $str 0]
        if {$ucstart < 0} {
            return $str
        }
        set max 8
        set map [list]
        set strend [expr {[string length $str]-1}]
        while {$ucstart >= 0} {
            set s $ucstart
            set i [expr {$s +2}] ;#skip the %U
            set hex ""
            while {[tcl::string::length $hex] < 8 && $i <= $strend} {
                set in [string index $str $i]
                if {[tcl::string::is xdigit -strict $in]} {
                    append hex $in
                } else {
                    break
                }
                incr i
            }
            if {$hex ne ""} {
                incr i -1
                lappend map $s $i $hex
            }
            set ucstart [tcl::string::first %U $str $i]
        }
        set out ""
        set lastidx -1
        set e 0
        foreach {s e hex} $map {
            append out [string range $str $lastidx+1 $s-1]
            set sub [format %c 0x$hex]
            if {$sub in $exclusions} {
                append out %U$hex ;#put it back
            } else {
                append out $sub
            }
            set lastidx $e
        }
        if {$e < [tcl::string::length $str]-1} {
            append out [string range $str $e+1 end]
        }
        return $out
    }
    proc link_as {name target} {

    }

    #proc sample1 {p1 args} {
    #    #*** !doctools
    #    #[call [fun sample1] [arg p1] [opt {?option value...?}]]
    #    #[para]Description of sample1
    #    return "ok"
    #}


    #*** !doctools
    #[list_end] [comment {--- end definitions namespace fauxlink ---}]
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++


# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# Secondary API namespace
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
namespace eval fauxlink::lib {
    namespace export {[a-z]*};   # Convention: export all lowercase
    namespace path [namespace parent]
    #*** !doctools
    #[subsection {Namespace fauxlink::lib}]
    #[para] Secondary functions that are part of the API
    #[list_begin definitions]

    #proc utility1 {p1 args} {
    #    #*** !doctools
    #    #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]]
    #    #[para]Description of utility1
    #    return 1
    #}


    #*** !doctools
    #[list_end] [comment {--- end definitions namespace fauxlink::lib ---}]
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++


# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section Internal]
namespace eval fauxlink::system {
    #*** !doctools
    #[subsection {Namespace fauxlink::system}]
    #[para] Internal functions that are not part of the API


}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
##  Ready
package provide fauxlink [namespace eval fauxlink {
    variable pkg fauxlink
    variable version
    set version 0.1.1
}]
return

#*** !doctools
#[manpage_end]