# -*- tcl -*-
# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from <pkg>-buildversion.txt
#
# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem.
# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository.
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# (C) 2023
#
# @@ Meta Begin
# Application punk::char 0.1.0
# Meta platform tcl
# Meta license <unspecified>
# @@ Meta End
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# doctools header
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[manpage_begin punkshell_module_punk::char 0 0.1.0]
#[copyright "2024"]
#[titledesc {character-set and unicode utilities}] [comment {-- Name section and table of contents description --}]
#[moddesc {character-set and unicode}] [comment {-- Description at end of page heading --}]
#[require punk::char]
#[keywords module encodings]
#[description]
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section Overview]
#[para] overview of punk::char
#[subsection Concepts]
#[para] -
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
## Requirements
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
## Requirements
##e.g package require frobz
#*** !doctools
#[subsection dependencies]
#[para] packages used by punk::char
#[list_begin itemized]
#[item] [package {Tcl 8.6}]
#
#*** !doctools
#[item] [package {overtype}]
#[para] -
#[item] [package {textblock}]
#[para] -
#[item] [package console]
#[para] -
package require Tcl 8.6-
#dependency on tcllib is not ideal for bootstrapping, as punk::char is core to many features (textutil::wcswidth is therefore included in bootsupport/include_modules.config) - review
package require textutil::wcswidth
#*** !doctools
#[list_end]
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section API]
#Note that ansi escapes can begin with \033\[ (\u001b\[) or the single character "Control Sequence Introducer" 0x9b
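#An illustrative check (the variable name 'str' is just an example) for either introducer form at the start of a string:
#    if {[regexp {^(?:\x1b\[|\x9b)} $str]} { ... starts with an ansi escape ... }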
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
namespace eval punk::char {
namespace export *
variable grapheme_widths [dict create]
# -- --------------------------------------------------------------------------
variable encmimens ;#namespace of mime package providing reversemapencoding and mapencoding functions
#tcllib mime requires tcl::chan::memchan,events,core and/or Trf
if {![catch {package require punk::encmime} errM]} {
set encmimens ::punk::encmime
} else {
package require mime
set encmimens ::mime
}
# -- --------------------------------------------------------------------------
variable invalid "???" ;# ideally this would be 0xFFFD - which should display as black diamond/rhombus with question mark. As at 2023 - this tends to display indistinguishably from other missing glyph boxes - so we use a longer sequence we can detect by length and to display obviously
variable invalid_display_char \u25ab; #todo - change to 0xFFFD once diamond glyph more common?
#just the 7-bit ascii. use [page ascii] for the 8-bit
proc ascii {} {return {
00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL
08 BS 09 HT 0a LF 0b VT 0c FF 0d CR 0e SO 0f SI
10 DLE 11 DC1 12 DC2 13 DC3 14 DC4 15 NAK 16 SYN 17 ETB
18 CAN 19 EM 1a SUB 1b ESC 1c FS 1d GS 1e RS 1f US
20 SP 21 ! 22 " 23 # 24 $ 25 % 26 & 27 '
28 ( 29 ) 2a * 2b + 2c , 2d - 2e . 2f /
30 0 31 1 32 2 33 3 34 4 35 5 36 6 37 7
38 8 39 9 3a : 3b ; 3c < 3d = 3e > 3f ?
40 @ 41 A 42 B 43 C 44 D 45 E 46 F 47 G
48 H 49 I 4a J 4b K 4c L 4d M 4e N 4f O
50 P 51 Q 52 R 53 S 54 T 55 U 56 V 57 W
58 X 59 Y 5a Z 5b [ 5c \ 5d ] 5e ^ 5f _
60 ` 61 a 62 b 63 c 64 d 65 e 66 f 67 g
68 h 69 i 6a j 6b k 6c l 6d m 6e n 6f o
70 p 71 q 72 r 73 s 74 t 75 u 76 v 77 w
78 x 79 y 7a z 7b { 7c | 7d } 7e ~ 7f DEL
}}
#G0 character set
proc ascii2 {} {
set dict [asciidict2]
set out ""
set i 1
append out " "
dict for {k v} $dict {
#single chars are wrapped with \033(0 and \033(B ie total length 7
if {[string length $v] == 7} {
set v " $v "
} elseif {[string length $v] == 2} {
set v "$v "
} elseif {[string length $v] == 0} {
set v " "
}
append out "$k $v "
if {$i > 0 && $i % 8 == 0} {
set out [string range $out 0 end-2]
append out \n " "
}
incr i
}
set out [string trimright $out " "]
return $out
}
proc symbol {} {
tailcall page symbol
}
proc dingbats {} {
set out ""
append out [page dingbats] \n
set unicode_dict [charset_dictget Dingbats]
append out " "
set i 1
dict for {k charinfo} $unicode_dict {
set char [dict get $charinfo char]
if {[string length $char] == 0} {
set displayv " "
} elseif {[string length $char] == 1} {
set displayv " $char "
} else {
set displayv $char
}
append out "$k $displayv "
if {$i > 0 && $i % 8 == 0} {
set out [string range $out 0 end-2]
append out \n " "
}
incr i
}
return $out
}
proc page_names {{search *}} {
set all_names [list]
set d [page_names_dict $search]
dict for {k v} $d {
if {$k ni $all_names} {
lappend all_names $k
}
foreach m $v {
if {$m ni $all_names} {
lappend all_names $m
}
}
}
return [lsort $all_names]
}
proc page_names_help {{namesearch *}} {
set d [page_names_dict $namesearch]
set out ""
dict for {k v} $d {
append out "$k $v" \n
}
return [linesort $out]
}
proc page_names_dict {{search *}} {
if {![regexp {[?*]} $search]} {
set search "*$search*"
}
set encnames [encoding names]
foreach enc $encnames {
dict set d $enc [list]
}
variable encmimens
set mimenames [array get ${encmimens}::reversemap]
dict for {mname encname} $mimenames {
if {$encname in $encnames} {
set enclist [dict get $d $encname]
if {$mname ni $enclist} {
dict lappend d $encname $mname
}
}
}
foreach enc [lsort $encnames] {
set mime_enc [${encmimens}::mapencoding $enc]
if {$mime_enc ne ""} {
set enclist [dict get $d $enc]
if {$mime_enc ni $enclist} {
dict lappend d $enc $mime_enc
}
}
}
set dresult [dict create]
if {$search ne "*"} {
dict for {k v} $d {
if {[string match -nocase $search $k] || ([lsearch -nocase $v $search]) >= 0} {
dict set dresult $k $v
}
}
} else {
set dresult $d
}
return $dresult
}
proc page8 {encname args} {
dict set args -cols 8
tailcall page $encname {*}$args
}
proc page16 {encname args} {
dict set args -cols 16
tailcall page $encname {*}$args
}
proc page {encname args} {
variable invalid
set encname [encname $encname]
set defaults [list\
-range {0 256}\
-cols 16\
]
set opts [dict merge $defaults $args]
# -- --- --- --- --- --- --- --- ---
set cols [dict get $opts -cols]
# -- --- --- --- --- --- --- --- ---
set d_bytedisplay [basedict_display]
#set d_ascii [pagedict_raw ascii]
set d_ascii [basedict]
set d_asciiposn [lreverse $d_ascii] ;#values should be unique except for "???". We are mainly interested in which ones have display-names e.g cr csi
#The results of this are best seen by comparing the ebcdic and ascii pages
set d_page [pagedict_raw $encname]
set out ""
set i 1
append out " "
dict for {k rawchar} $d_page {
set num [expr {"0x$k"}]
#see if ascii equivalent exists and has a name
if {$rawchar eq $invalid} {
set displayv "$invalid"
} else {
set bytedisplay ""
if {[dict exists $d_asciiposn $rawchar]} {
set asciiposn [dict get $d_asciiposn $rawchar]
set bytedisplay [dict get $d_bytedisplay $asciiposn]
}
if {$bytedisplay eq $invalid} {
#
set displayv " $rawchar "
} else {
set displaylen [string length $bytedisplay]
if {$displaylen == 2} {
set displayv "$bytedisplay "
} elseif {$displaylen == 3} {
set displayv $bytedisplay
} else {
if {[string length $rawchar] == 0} {
set displayv " "
} else {
#presumed 1
set displayv " $rawchar "
}
}
}
}
append out "$k $displayv "
if {$i > 0 && $i % $cols == 0} {
set out [string range $out 0 end-2]
append out \n " "
}
incr i
}
set out [string trimright $out " "]
return $out
}
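#Illustrative usage (available encodings vary by system):
#    puts [punk::char::page cp437]          ;#codepage 437 in the default 16-column layout
#    puts [punk::char::page ebcdic -cols 8] ;#8 columns per row instead of 16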
proc pagechar1 {page num} {
set encpage [encname $page]
encoding convertfrom $encpage [format %c $num]
}
proc pagechar {page num} {
set encpage [encname $page]
set ch [format %c $num]
if {[decodable $ch $encpage]} {
set outchar [encoding convertfrom $encpage $ch]
} else {
#here we will use the single-character invalid_display_char instead of our replacement string "???" - as the name pagechar implies always returning a single character. REVIEW.
set outchar $::punk::char::invalid_display_char
}
return $outchar
}
proc pagechar_info {page num} {
set ch [format %c $num]
set h [format %04x $num]
set encpage [encname $page]
if {[decodable $ch $encpage]} {
set outchar [encoding convertfrom $encpage $ch]
} else {
error "pagechar_info: $h not decodable from $encpage"
}
package require punk::console
puts -nonewline stdout \033\[s;flush stdout
lassign [punk::console::get_cursor_pos_list] _row1 col1
puts -nonewline stdout "$outchar";flush stdout
lassign [punk::console::get_cursor_pos_list] _row2 col2
puts -nonewline stdout "\033\[u";flush stdout
return "$col1 -> $col2"
}
proc pagebyte {page num} {
set encpage [encname $page]
set ch [format %c $num]
if {[decodable $ch $encpage]} {
#set outchar [encoding convertto $encpage [format %c $num]]
set outchar [format %c $num]
} else {
set outchar $::punk::char::invalid_display_char
}
return $outchar
}
proc all_pages {} {
set out ""
set mimenamesdict [page_names_dict]
foreach encname [encoding names] {
if {[dict exists $mimenamesdict $encname]} {
set alt "([dict get $mimenamesdict $encname])"
} else {
set alt ""
}
append out "$encname $alt" \n
append out [page $encname]
}
return $out
}
proc encname {encoding_name_or_alias} {
set encname $encoding_name_or_alias
if {[lsearch -nocase [page_names] $encname] < 0} {
error "Unknown encoding '$encname' - use 'punk::char::page_names' to see valid encoding names on this system"
}
variable encmimens
if {$encname ni [encoding names]} {
set encname [${encmimens}::reversemapencoding $encname]
}
return $encname
}
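#Illustrative examples (alias resolution depends on the mime/punk::encmime data available):
#    encname utf-8     ;#-> utf-8   (already a Tcl encoding name)
#    encname US-ASCII  ;#-> ascii   (resolved via the mime reverse map)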
proc pagedict_raw {encname} {
variable invalid ;# ="???"
set encname [encname $encname]
set d [dict create]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
#dict set d $k [encoding convertfrom $encname [format %c $i]]
set ch [format %c $i] ;
#jmn
if {[decodable $ch $encname]} {
#set encchar [encoding convertto $encname $ch]
#dict set d $k [encoding convertfrom $encchar]
dict set d $k [encoding convertfrom $encname $ch]
} else {
dict set d $k $invalid ;#use replacement so we can detect difference from actual "?"
}
}
return $d
}
proc asciidict {} {
variable invalid
set d [dict create]
set a128 [asciidict128]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
if {$i <= 127} {
dict set d $k [dict get $a128 $k]
} else {
#
dict set d $k $invalid
}
if {$i <=32} {
#no point emitting the lower control characters raw to screen - emit the short-names defined in the 'ascii' proc
dict set d $k [dict get $a128 $k]
} else {
if {$i == 0x9b} {
dict set d $k CSI ;#don't emit the ansi 'Control Sequence Introducer' - or it will be interpreted by the console and affect the layout.
} else {
dict set d $k [format %c $i]
}
}
}
return $d
}
proc basedict_display {} {
set d [dict create]
set a128 [asciidict128]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
if {$i <=32} {
#no point emitting the lower control characters raw to screen - emit the short-names defined in the 'ascii' proc
dict set d $k [dict get $a128 $k]
} else {
if {$i == 0x9b} {
dict set d $k CSI ;#don't emit the ansi 'Control Sequence Introducer' - or it will be interpreted by the console and affect the layout.
} elseif {$i == 0x9c} {
dict set d $k ST ;#0x9c is the C1 'String Terminator'
} else {
#dict set d $k [encoding convertfrom [encoding system] [format %c $i]]
#don't use encoding convertfrom - we want the value independent of the current encoding system.
dict set d $k [format %c $i]
}
}
}
return $d
}
proc basedict_encoding_system {} {
#result depends on 'encoding system' currently in effect
set d [dict create]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
dict set d $k [encoding convertfrom [encoding system] [format %c $i]]
}
return $d
}
proc basedict {} {
#this gives same result independent of current value of 'encoding system'
set d [dict create]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
dict set d $k [format %c $i]
}
return $d
}
proc pagedict {pagename args} {
variable charsets
set encname [encname $pagename]
set defaults [list\
-range {0 256}\
-charset ""\
]
set opts [dict merge $defaults $args]
# -- --- --- --- --- --- --- --- --- ---
set range [dict get $opts -range]
set charset [dict get $opts -charset]
# -- --- --- --- --- --- --- --- --- ---
if {$charset ne ""} {
if {$charset ni [charset_names]} {
error "unknown charset '$charset' - use 'charset_names' to get list"
}
set setinfo [dict get $charsets $charset]
set ranges [dict get $setinfo ranges]
set charset_dict [dict create]
foreach r $ranges {
set start [dict get $r start]
set end [dict get $r end]
#set charset_dict [dict merge $charset_dict [char_range_dict $start $end]]
break
}
} else {
set start [lindex $range 0]
set end [lindex $range 1]
}
set d [dict create]
for {set i $start} {$i <= $end} {incr i} {
set k [format %02x $i]
dict set d $k [encoding convertfrom $encname [format %c $i]]
}
return $d
}
#todo - benchmark performance - improve punk pipeline
proc asciidict128 {} {
regexp -all -inline {\S+} [concat {*}[linelist -line trimleft [ascii]]]
}
proc _asciidict128 {} {
.= ascii |> .=> linelist -line trimleft |> .=* concat |> {regexp -all -inline {\S+} $data}
}
proc asciidict2 {} {
set d [dict create]
dict for {k v} [basedict_display] {
if {[string length $v] == 1} {
set num [expr {"0x$k"}]
#dict set d $k "\033(0[subst \\u00$k]\033(B"
dict set d $k "\033(0[format %c $num]\033(B"
} else {
dict set d $k $v
}
}
return $d
}
#-- --- --- --- --- --- --- ---
# encoding convertfrom & encoding convertto can be somewhat confusing to think about. (Need to think in reverse.)
# e.g encoding convertto dingbats <somethingpretty> will output something that doesn't look dingbatty on screen.
#-- --- --- --- --- --- --- ---
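#An illustrative reminder of the direction, using utf-8 (always available):
#    encoding convertto utf-8 \u00e9     ;#unicode string -> encoded byte sequence (2 bytes for e-acute)
#    encoding convertfrom utf-8 \xc3\xa9 ;#encoded byte sequence -> unicode string e-acute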
#must use Tcl instead of tcl (at least for 8.6)
if {![package vsatisfies [package present Tcl] 8.7-]} {
proc encodable "s {enc [encoding system]}" {
set encname [encname $enc]
if {($encname eq "ascii")} {
#8.6 fails to round-trip convert 0x7f del character despite it being in the ascii range (review Why?? what else doesn't round-trip but should?)
#just strip it out of the string as we are only after a boolean answer and if s is only a single del char empty string will result in true
set s [string map [list [format %c 0x7f] ""] $s]
}
string eq $s [encoding convertfrom $encname [encoding convertto $encname $s]]
}
#note also that tcl8.6 has anomalies with how it handles some unassigned codepoints
# e.g unassigned codes in the middle of a codepage may appear to be encodable&decodable in a round trip whereas undefined codepoints at the end may get the replacement character defined in the tcl encodings dir (usually the 3f char: "?")
proc decodable "s {enc [encoding system]}" {
set encname [encname $enc]
#review
string eq $s [encoding convertto $encname [encoding convertfrom $encname $s]]
}
} else {
proc encodable "s {enc [encoding system]}" {
set encname [encname $enc]
string eq $s [encoding convertfrom $encname [encoding convertto $encname $s]]
}
proc decodable "s {enc [encoding system]}" {
set encname [encname $enc]
string eq $s [encoding convertto $encname [encoding convertfrom $encname $s]]
}
}
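#Illustrative usage:
#    encodable \u00e9 ascii     ;#0 - e-acute cannot round-trip through ascii
#    encodable abc ascii        ;#1
#    decodable \u00e9 iso8859-1 ;#1 - e-acute exists in iso8859-1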
#-- --- --- --- --- --- --- ---
proc test_japanese {{encoding jis0208}} {
#A very basic test of 2char encodings such as jis0208
set yatbun [encoding convertfrom jis0208 "F|K\\"] ;#2 characters decoded from the jis0208 byte-pairs F| and K\
lassign [split $yatbun ""] yat bun
puts "original yatbun ${yat} ${bun}"
set eyat [encoding convertto $encoding $yat]
set ebun [encoding convertto $encoding $bun]
puts "$encoding encoded: ${eyat} ${ebun}"
puts "reencoded: [encoding convertfrom $encoding $eyat] [encoding convertfrom $encoding $ebun]"
return $yatbun
}
proc test_grave {} {
set g [format %c 0x300]
puts stdout "Testing console display of grave accented a in between letters x and y - accent should combine over the top of the letter a."
puts stdout "Apparent width should theoretically be 1 console-column"
package require punk::console
puts stdout "# -- --- --- ---"
puts -nonewline "xa${g}z";set cursorposn [punk::console::get_cursor_pos]
puts stdout \n
puts stdout "cursor position immediately after outputing 4 bytes (expecting 3 glyphs): $cursorposn"
puts stdout "# -- --- --- ---"
puts -nonewline "xyz";set cursorposn [punk::console::get_cursor_pos]
puts stdout \n
puts stdout "cursor position immediately after outputing 3 bytes (xyz): $cursorposn"
}
proc test_farmer {} {
#an interesting article re grapheme clustering problems in terminals https://mitchellh.com/writing/grapheme-clusters-in-terminals
#(similar to the problem with grave accent rendering width that the test_grave proc is written for)
# -- --- --- --- ---
#These pasted glyphs can display in console even when the unicode versions don't (tcl 8.6 limited to 65533/FFFD ?)
upvar farmer1_paste test_farmer1
upvar farmer2_paste test_farmer2
set test_farmer1 🧑🌾 ;#contains zero-width joiner between
set test_farmer2 🧑🌾
puts "pasted farmer1 exporting as var farmer1_paste: $test_farmer1"
puts "pasted farmer2 exporting as var farmer2_paste: $test_farmer2"
# -- --- --- --- ---
set farmer1 "\U0001f9d1\U0000200d\U0001f33e"
set farmer2 "\U0001f9d1\U0001f33e"
puts stdout "farmer1 with zero-width joiner, codes: \\U0001f9d1\\U0000200d\\U0001f33e : $farmer1"
puts stdout "farmer2 with no joiner codes: \\U0001f9d1\\U001f33e : $farmer2"
package require punk::console
puts stdout "#2--5---9---C---"
puts -nonewline stdout \033\[2K\033\[1G ;#2K erase line 1G cursor at col1
puts -nonewline "${farmer1}";set cursorposn [punk::console::get_cursor_pos]
puts stdout \n
puts stdout "cursor position immediately after outputing farmer1 (expecting 1 glyph 2 wide) : $cursorposn"
puts stdout "#2--5---9---C---"
puts -nonewline "${farmer2}";set cursorposn [punk::console::get_cursor_pos]
puts stdout \n
puts stdout "cursor position immediately after outputing farmer2 (expecting 2 glyphs 4 wide in total): $cursorposn"
return [list $farmer1 $farmer2]
}
#G0 Sets Sequence G1 Sets Sequence Meaning
#ESC ( A ESC ) A United Kingdom Set
#ESC ( B ESC ) B ASCII Set
#ESC ( 0 ESC ) 0 Special Graphics
#ESC ( 1 ESC ) 1 Alternate Character ROM Standard Character Set
#ESC ( 2 ESC ) 2 Alternate Character ROM Special Graphic
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Unicode character sets - some hardcoded - some loadable from data files
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
variable charinfo [dict create]
variable charsets [dict create]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Aggregate character sets (ones that pick various ranges from underlying unicode ranges)
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
dict set charsets "WGL4" [list altname "Windows Glyph List 4" ranges [list\
{start 0 end 127 name "basic latin"}\
{start 128 end 255 name "latin-1 supplement"}\
{start 256 end 383 name "Latin Extended-A"}\
{start 402 end 402 name "subset Latin Extended-B"}\
{start 506 end 511 name "subset Latin Extended-B"}\
{start 710 end 711 name "subset Spacing Modifier Letters"}\
{start 713 end 713 name "subset Spacing Modifier Letters"}\
{start 728 end 733 name "subset Spacing Modifier Letters"}\
{start 900 end 906 name "subset Greek"}\
{start 908 end 908 name "subset Greek"}\
{start 910 end 974 name "subset Greek"}\
{start 1024 end 1119 name "subset Cyrillic"}\
{start 1168 end 1169 name "subset Cyrillic"}\
{start 7808 end 7813 name "subset Latin Extended Additional"}\
{start 7922 end 7923 name "subset Latin Extended Additional"}\
{start 8211 end 8213 name "subset General Punctuation"}\
{start 8215 end 8222 name "subset General Punctuation"}\
{start 8224 end 8226 name "subset General Punctuation"}\
{start 8230 end 8230 name "subset General Punctuation"}\
{start 8240 end 8240 name "subset General Punctuation"}\
{start 8242 end 8243 name "subset General Punctuation"}\
] description "Microsoft WGL4 Repertoire" settype "other"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
#The base page 0-256 8bit range - values don't have specific characters or descriptions - as they are codepage dependent
#we will fill this here for completeness - but with placeholders
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
dict set charsets "8bit" [list ranges [list {start 0 end 127 name ascii} {start 128 end 255 name 8bit-ascii}] description "8bit extended ascii range" settype "other"]
for {set i 0} {$i < 256} {incr i} {
dict set charinfo $i [list desc "codepage-dependent" short "byte_[format %02x $i]"]
}
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Unicode ranges
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
dict set charsets "greek" [list ranges [list {start 880 end 1023} {start 7936 end 8191}] description "Greek and Coptic" settype "other"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
dict set charsets "Block Elements" [list ranges [list {start 9600 end 9631}] description "Block Elements" settype "other"]
dict set charinfo 9600 [list desc "Upper Half Block" short "blocke_up_half"]
dict set charinfo 9601 [list desc "Lower One Eighth Block" short "blocke_lw_1_8th"]
dict set charinfo 9602 [list desc "Lower One Quarter Block" short "blocke_lw_1_qtr"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
dict set charsets "Dingbats" [list ranges [list {start 9984 end 10175 }] description "Unicode Dingbats" settype "tcl_supplemented"]
dict set charinfo 9984 [list desc "Black Safety Scissors" short "dingbats_black_safety_scissors"]
#...
dict set charinfo 10175 [list desc "Double Curly Loop" short "dingbats_double_curly_loop"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
#variation selectors 0xFe01 - 0xFE0F
dict set charsets "Variation Selectors" [list ranges [list {start 65024 end 65039}] description "Variation Selectors" note "combining character with previous char - variant glyph display" settype "tcl_supplemented"]
dict set charinfo 65024 [list desc "Variation Selector-1" short "VS1"]
dict set charinfo 65025 [list desc "Variation Selector-2" short "VS2"]
dict set charinfo 65026 [list desc "Variation Selector-3" short "VS3"]
dict set charinfo 65027 [list desc "Variation Selector-4" short "VS4"]
dict set charinfo 65027 [list desc "Variation Selector-5" short "VS5"]
dict set charinfo 65029 [list desc "Variation Selector-6" short "VS6"]
dict set charinfo 65030 [list desc "Variation Selector-7" short "VS7"]
dict set charinfo 65031 [list desc "Variation Selector-8" short "VS8"]
dict set charinfo 65032 [list desc "Variation Selector-9" short "VS9"]
dict set charinfo 65033 [list desc "Variation Selector-10" short "VS10"]
dict set charinfo 65034 [list desc "Variation Selector-11" short "VS11"]
dict set charinfo 65035 [list desc "Variation Selector-12" short "VS12"]
dict set charinfo 65036 [list desc "Variation Selector-13" short "VS13"]
dict set charinfo 65037 [list desc "Variation Selector-14" short "VS14"]
dict set charinfo 65038 [list desc "Variation Selector-15 text variation" short "VS15"] ;#still an image - just more suitable for text-presentation e.g word-processing doc
dict set charinfo 65039 [list desc "Variation Selector-16 emoji variation" short "VS16"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# emoticons https://www.unicode.org/charts/PDF/U1F600.pdf
dict set charsets "Emoticons" [list ranges [list {start 128512 end 128591}] description "Emoticons" settype "tcl_supplemented"]
dict set charinfo 128512 [list desc "Grinning Face" short "emoticon_gface"]
dict set charinfo 128513 [list desc "Grinning Face with Smiling Eyes" short "emoticon_gface_smile_eyes"]
dict set charinfo 128514 [list desc "Face with Tears of Joy" short "emoticon_face_tears_joy"]
#todo
dict set charinfo 128590 [list desc "Person with Pouting Face" short "emoticon_person_pout"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
dict set charsets "Box Drawing" [list ranges [list {start 9472 end 9599}] description "Box Drawing" settype "tcl_supplemented"]
dict set charinfo 9472 [list desc "Box Drawings Light Horizontal" short "boxd_lhz"]
dict set charinfo 9473 [list desc "Box Drawings Heavy Horizontal" short "boxd_hhz"]
dict set charinfo 9474 [list desc "Box Drawings Light Vertical" short "boxd_lv"]
dict set charinfo 9475 [list desc "Box Drawings Heavy Vertical" short "boxd_hv"]
dict set charinfo 9476 [list desc "Box Drawings Light Triple Dash Horizontal" short "boxd_ltdshhz"]
dict set charinfo 9477 [list desc "Box Drawings Heavy Triple Dash Horizontal" short "boxd_htdshhz"]
dict set charinfo 9478 [list desc "Box Drawings Light Triple Dash Vertical" short "boxd_ltdshv"]
dict set charinfo 9479 [list desc "Box Drawings Heavy Triple Dash Vertical" short "boxd_htdshv"]
dict set charinfo 9480 [list desc "Box Drawings Light Quadruple Dash Horizontal" short "boxd_lqdshhz"]
dict set charinfo 9481 [list desc "Box Drawings Heavy Quadruple Dash Horizontal" short "boxd_hqdshhz"]
dict set charinfo 9482 [list desc "Box Drawings Light Quadruple Dash Vertical" short "boxd_lqdshv"]
dict set charinfo 9483 [list desc "Box Drawings Heavy Quadruple Dash Vertical" short "boxd_hqdshv"]
dict set charinfo 9484 [list desc "Box Drawings Light Down and Right" short "boxd_ldr"]
dict set charinfo 9485 [list desc "Box Drawings Down Light and Right Heavy" short "boxd_dlrh"]
dict set charinfo 9486 [list desc "Box Drawings Down Heavy and Right Light" short "boxd_dhrl"]
dict set charinfo 9487 [list desc "Box Drawings Heavy Down and Right" short "boxd_hdr"]
dict set charinfo 9488 [list desc "Box Drawings Light Down and Left" short "boxd_ldl"]
dict set charinfo 9489 [list desc "Box Drawings Down Light and Left Heavy" short "boxd_dllh"]
dict set charinfo 9490 [list desc "Box Drawings Down Heavy and Left Light" short "boxd_dhll"]
dict set charinfo 9491 [list desc "Box Drawings Heavy Down and Left" short "boxd_hdl"]
dict set charinfo 9492 [list desc "Box Drawings Light Up and Right" short "boxd_lur"]
dict set charinfo 9493 [list desc "Box Drawings Up Light and Right Heavy" short "boxd_ulrh"]
dict set charinfo 9494 [list desc "Box Drawings Up Heavy and Right Light" short "boxd_uhrl"]
dict set charinfo 9495 [list desc "Box Drawings Heavy Up and Right" short "boxd_hur"]
dict set charinfo 9496 [list desc "Box Drawings Light Up and Left" short "boxd_lul"]
dict set charinfo 9497 [list desc "Box Drawings Up Light and Left Heavy" short "boxd_ullh"]
dict set charinfo 9498 [list desc "Box Drawings Up Heavy and Left Light" short "boxd_uhll"]
dict set charinfo 9499 [list desc "Box Drawings Heavy Up and Left" short "boxd_hul"]
dict set charinfo 9500 [list desc "Box Drawings Light Vertical and Right" short "boxd_lvr"]
dict set charinfo 9501 [list desc "Box Drawings Vertical Light and Right Heavy" short "boxd_vlrh"]
dict set charinfo 9502 [list desc "Box Drawings Up Heavy and Right Down Light" short "boxd_uhrdl"]
dict set charinfo 9503 [list desc "Box Drawings Down Heavy and Right Up Light" short "boxd_dhrul"]
dict set charinfo 9504 [list desc "Box Drawings Vertical Heavy and Right Light" short "boxd_vhrl"]
dict set charinfo 9505 [list desc "Box Drawings Down Light and Right Up Heavy" short "boxd_dlruh"]
dict set charinfo 9506 [list desc "Box Drawings Up Light and Right Down Heavy" short "boxd_ulrdh"]
dict set charinfo 9507 [list desc "Box Drawings Heavy Vertical and Right" short "boxd_hvr"]
dict set charinfo 9508 [list desc "Box Drawings Light Vertical and Left" short "boxd_lvl"]
dict set charinfo 9509 [list desc "Box Drawings Vertical Light and Left Heavy" short "boxd_vllh"]
dict set charinfo 9510 [list desc "Box Drawings Up Heavy and Left Down Light" short "boxd_uhldl"]
dict set charinfo 9511 [list desc "Box Drawings Down Heavy and Left Up Light" short "boxd_dhlul"]
dict set charinfo 9512 [list desc "Box Drawings Vertical Heavy and Left Light" short "boxd_vhll"]
dict set charinfo 9513 [list desc "Box Drawings Down Light and Left Up Heavy" short "boxd_dlluh"]
dict set charinfo 9514 [list desc "Box Drawings Up Light and Left Down Heavy" short "boxd_ulldh"]
dict set charinfo 9515 [list desc "Box Drawings Heavy Vertical and Left" short "boxd_hvl"]
dict set charinfo 9516 [list desc "Box Drawings Light Down and Horizontal" short "boxd_ldhz"]
dict set charinfo 9517 [list desc "Box Drawings Left Heavy and Right Down Light" short "boxd_lhrdl"]
dict set charinfo 9518 [list desc "Box Drawings Right Heavy and Left Down Light" short "boxd_rhldl"]
dict set charinfo 9519 [list desc "Box Drawings Down Light and Horizontal Heavy" short "boxd_dlhzh"]
dict set charinfo 9520 [list desc "Box Drawings Down Heavy and Horizontal Light" short "boxd_dhhzl"]
dict set charinfo 9521 [list desc "Box Drawings Right Light and Left Down Heavy" short "boxd_rlldh"]
dict set charinfo 9522 [list desc "Box Drawings Left Light and Right Down Heavy" short "boxd_llrdh"]
dict set charinfo 9523 [list desc "Box Drawings Heavy Down and Horizontal" short "boxd_hdhz"]
dict set charinfo 9524 [list desc "Box Drawings Light Up and Horizontal" short "boxd_luhz"]
dict set charinfo 9525 [list desc "Box Drawings Left Heavy and Right Up Light" short "boxd_lhrul"]
dict set charinfo 9526 [list desc "Box Drawings Right Heavy and Left Up Light" short "boxd_rhlul"]
dict set charinfo 9527 [list desc "Box Drawings Up Light and Horizontal Heavy" short "boxd_ulhzh"]
dict set charinfo 9528 [list desc "Box Drawings Up Heavy and Horizontal Light" short "boxd_uhhzl"]
dict set charinfo 9529 [list desc "Box Drawings Right Light and Left Up Heavy" short "boxd_rlluh"]
dict set charinfo 9530 [list desc "Box Drawings Left Light and Right Up Heavy" short "boxd_llruh"]
dict set charinfo 9531 [list desc "Box Drawings Heavy Up and Horizontal" short "boxd_huhz"]
dict set charinfo 9532 [list desc "Box Drawings Light Vertical and Horizontal" short "boxd_lvhz"]
dict set charinfo 9533 [list desc "Box Drawings Left Heavy and Right Vertical Light" short "boxd_lhrvl"]
dict set charinfo 9534 [list desc "Box Drawings Right Heavy and Left Vertical Light" short "boxd_rhlvl"]
dict set charinfo 9535 [list desc "Box Drawings Vertical Light and Horizontal Heavy" short "boxd_vlhzh"]
dict set charinfo 9536 [list desc "Box Drawings Up Heavy and Down Horizontal Light" short "boxd_uhdhzl"]
dict set charinfo 9537 [list desc "Box Drawings Down Heavy and Up Horizontal Light" short "boxd_dhuhzl"]
dict set charinfo 9538 [list desc "Box Drawings Vertical Heavy and Horizontal Light" short "boxd_vhhzl"]
dict set charinfo 9539 [list desc "Box Drawings Left Up Heavy and Right Down Light" short "boxd_luhrdl"]
dict set charinfo 9540 [list desc "Box Drawings Right Up Heavy and Left Down Light" short "boxd_ruhldl"]
dict set charinfo 9541 [list desc "Box Drawings Left Down Heavy and Right Up Light" short "boxd_ldhrul"]
dict set charinfo 9542 [list desc "Box Drawings Right Down Heavy and Left Up Light" short "boxd_rdhlul"]
dict set charinfo 9543 [list desc "Box Drawings Down Light and Up Horizontal Heavy" short "boxd_dluhzh"]
dict set charinfo 9544 [list desc "Box Drawings Up Light and Down Horizontal Heavy" short "boxd_uldhzh"]
dict set charinfo 9545 [list desc "Box Drawings Right Light and Left Vertical Heavy" short "boxd_rllvh"]
dict set charinfo 9546 [list desc "Box Drawings Left Light and Right Vertical Heavy" short "boxd_llrvh"]
dict set charinfo 9547 [list desc "Box Drawings Heavy Vertical and Horizontal" short "boxd_hvhz"]
dict set charinfo 9548 [list desc "Box Drawings Light Double Dash Horizontal" short "boxd_lddshhz"]
dict set charinfo 9549 [list desc "Box Drawings Heavy Double Dash Horizontal" short "boxd_hddshhz"]
dict set charinfo 9550 [list desc "Box Drawings Light Double Dash Vertical" short "boxd_lddshv"]
dict set charinfo 9551 [list desc "Box Drawings Heavy Double Dash Vertical" short "boxd_hddshv"]
dict set charinfo 9552 [list desc "Box Drawings Double Horizontal" short "boxd_dhz"]
dict set charinfo 9553 [list desc "Box Drawings Double Vertical" short "boxd_dv"]
dict set charinfo 9554 [list desc "Box Drawings Down Single and Right Double" short "boxd_dsrd"]
dict set charinfo 9555 [list desc "Box Drawings Down Double and Right Single" short "boxd_ddrs"]
dict set charinfo 9556 [list desc "Box Drawings Double Down and Right" short "boxd_ddr"]
dict set charinfo 9557 [list desc "Box Drawings Down Single and Left Double" short "boxd_dsld"]
dict set charinfo 9558 [list desc "Box Drawings Down Double and Left Single" short "boxd_ddls"]
dict set charinfo 9559 [list desc "Box Drawings Double Down and Left" short "boxd_ddl"]
dict set charinfo 9560 [list desc "Box Drawings Up Single and Right Double" short "boxd_usrd"]
dict set charinfo 9561 [list desc "Box Drawings Up Double and Right Single" short "boxd_udrs"]
dict set charinfo 9562 [list desc "Box Drawings Double Up and Right" short "boxd_dur"]
dict set charinfo 9563 [list desc "Box Drawings Up Single and Left Double" short "boxd_usld"]
dict set charinfo 9564 [list desc "Box Drawings Up Double and Left Single" short "boxd_udls"]
dict set charinfo 9565 [list desc "Box Drawings Double Up and Left" short "boxd_dul"]
dict set charinfo 9566 [list desc "Box Drawings Vertical Single and Right Double" short "boxd_vsrd"]
dict set charinfo 9567 [list desc "Box Drawings Vertical Double and Right Single" short "boxd_vdrs"]
dict set charinfo 9568 [list desc "Box Drawings Double Vertical and Right" short "boxd_dvr"]
dict set charinfo 9569 [list desc "Box Drawings Vertical Single and Left Double" short "boxd_vsld"]
dict set charinfo 9570 [list desc "Box Drawings Vertical Double and Left Single" short "boxd_vdls"]
dict set charinfo 9571 [list desc "Box Drawings Double Vertical and Left" short "boxd_dvl"]
dict set charinfo 9572 [list desc "Box Drawings Down Single and Horizontal Double" short "boxd_dshzd"]
dict set charinfo 9573 [list desc "Box Drawings Down Double and Horizontal Single" short "boxd_ddhzs"]
dict set charinfo 9574 [list desc "Box Drawings Double Down and Horizontal" short "boxd_ddhz"]
dict set charinfo 9575 [list desc "Box Drawings Up Single and Horizontal Double" short "boxd_ushzd"]
dict set charinfo 9576 [list desc "Box Drawings Up Double and Horizontal Single" short "boxd_udhzs"]
dict set charinfo 9577 [list desc "Box Drawings Double Up and Horizontal" short "boxd_duhz"]
dict set charinfo 9578 [list desc "Box Drawings Vertical Single and Horizontal Double" short "boxd_vshzd"]
dict set charinfo 9579 [list desc "Box Drawings Vertical Double and Horizontal Single" short "boxd_vdhzs"]
dict set charinfo 9580 [list desc "Box Drawings Double Vertical and Horizontal" short "boxd_dvhz"]
dict set charinfo 9581 [list desc "Box Drawings Light Arc Down and Right" short "boxd_ladr"]
dict set charinfo 9582 [list desc "Box Drawings Light Arc Down and Left" short "boxd_ladl"]
dict set charinfo 9583 [list desc "Box Drawings Light Arc Up and Left" short "boxd_laul"]
dict set charinfo 9584 [list desc "Box Drawings Light Arc Up and Right" short "boxd_laur"]
dict set charinfo 9585 [list desc "Box Drawings Light Diagonal Upper Right To Lower Left" short "boxd_ldgurll"]
dict set charinfo 9586 [list desc "Box Drawings Light Diagonal Upper Left To Lower Right" short "boxd_ldgullr"]
dict set charinfo 9587 [list desc "Box Drawings Light Diagonal Cross" short "boxd_ldc"]
dict set charinfo 9588 [list desc "Box Drawings Light Left" short "boxd_ll"]
dict set charinfo 9589 [list desc "Box Drawings Light Up" short "boxd_lu"]
dict set charinfo 9590 [list desc "Box Drawings Light Right" short "boxd_lr"]
dict set charinfo 9591 [list desc "Box Drawings Light Down" short "boxd_ld"]
dict set charinfo 9592 [list desc "Box Drawings Heavy Left" short "boxd_hl"]
dict set charinfo 9593 [list desc "Box Drawings Heavy Up" short "boxd_hu"]
dict set charinfo 9594 [list desc "Box Drawings Heavy Right" short "boxd_hr"]
dict set charinfo 9595 [list desc "Box Drawings Heavy Down" short "boxd_hd"]
dict set charinfo 9596 [list desc "Box Drawings Light Left and Heavy Right" short "boxd_llhr"]
dict set charinfo 9597 [list desc "Box Drawings Light Up and Heavy Down" short "boxd_luhd"]
dict set charinfo 9598 [list desc "Box Drawings Heavy Left and Light Right" short "boxd_hllr"]
dict set charinfo 9599 [list desc "Box Drawings Heavy Up and Light Down" short "boxd_huld"]
dict set charsets "Halfwidth and Fullwidth Forms" [list ranges [list {start 65280 end 65519}] description "Halfwidth and Fullwidth Forms (variants)" settype "tcl_supplemental"]
dict set charsets "ascii_fullwidth" [list ranges [list {start 65281 end 65374}] description "Ascii 21 to 7E fullwidth" parentblock "halfwidth_and_fullwidth_forms" settype "other"]
dict set charsets "Specials" [list ranges [list {start 65520 end 65535}] description "Specials" settype "tcl_supplemental"]
dict set charsets "noncharacters" [list ranges [list\
{start 64976 end 65007 note "BMP FDD0..FDEF"}\
{start 65534 end 65535 note "BMP FFFE,FFFF"}\
{start 131070 end 131071 note "plane1 1FFFE,1FFFF"}\
{start 196606 end 196607 note "plane2 2FFFE,2FFFF"}\
{start 262142 end 262143 note "plane3 3FFFE,3FFFF"}\
{start 327678 end 327679 note "plane4 4FFFE,4FFFF"}\
{start 393214 end 393215 note "plane5 5FFFE,5FFFF"}\
{start 458750 end 458751 note "plane6 6FFFE,6FFFF"}\
{start 524286 end 524287 note "plane7 7FFFE,7FFFF"}\
{start 589822 end 589823 note "plane8 8FFFE,8FFFF"}\
{start 655358 end 655359 note "plane9 9FFFE,9FFFF"}\
{start 720894 end 720895 note "plane10 AFFFE,AFFFF"}\
{start 786430 end 786431 note "plane11 BFFFE,BFFFF"}\
{start 851966 end 851967 note "plane12 CFFFE,CFFFF"}\
{start 917502 end 917503 note "plane13 DFFFE,DFFFF"}\
{start 983038 end 983039 note "plane14 EFFFE,EFFFF"}\
{start 1048574 end 1048575 note "plane15 FFFFE,FFFFF"}\
{start 1114110 end 1114111 note "plane16 10FFFE,10FFFF"}\
] description "non-characters" settype "tcl_supplemental"]
#build dicts keyed on short
variable charshort
proc _build_charshort {} {
variable charshort
set charshort [dict create]
variable charinfo
dict for {k v} $charinfo {
if {[dict exists $v short]} {
set sh [dict get $v short]
if {[dict exists $charshort $sh]} {
puts stderr "_build_charshort WARNING character data load duplicate shortcode '$sh'"
}
dict set charshort $sh [format %c $k]
}
}
return [dict size $charshort]
}
_build_charshort
variable charset_extents_startpoints ;#stores endpoints associated with each startpoint - but named after key which is startpoint.
variable charset_extents_endpoints ;#stores startpoints associated with each endpoint - but named after key which is endpoint.
variable charset_extents_rangenames ;# dict keyed on start,end pointing to list of 2-tuples {setname rangeindex_within_set}
#build 2 indexes for each range.(charsets are not just unicode blocks so can have multiple ranges)
#Note that a range could be as small as a single char (startpoint = endpoint) so there can be many ranges with same start and end if charsets use some of the same subsets.
#as each charset_extents_startpoints,charset_extents_endpoints entry is built - the associated range name and index is appended to the rangenames dict
#startpoints - key is startpoint of a range, value is list of endpoints one for each range starting at this startpoint-key
#endpoints - key is endpoint of a range, value is list of startpoints one for each range ending at this endpoint-key
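#Illustrative shape of the resulting structures (actual keys depend on which charsets are loaded):
#    charset_extents_startpoints -> {0 {127} 128 {255} 256 {383} ...}   ;#startpoint -> endpoints of ranges starting there
#    charset_extents_endpoints   -> {127 {0} 255 {128} 383 {256} ...}   ;#endpoint   -> startpoints of ranges ending there
#    charset_extents_rangenames  -> {0,127 {{WGL4 1} {8bit 1}} ...}     ;#start,end  -> list of {setname rangeindex}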
proc _build_charset_extents {} {
variable charsets
variable charset_extents_startpoints
variable charset_extents_endpoints
variable charset_extents_rangenames
set charset_extents_startpoints [dict create]
set charset_extents_endpoints [dict create]
set charset_extents_rangenames [dict create]
dict for {setname setinfo} $charsets {
set ranges [dict get $setinfo ranges]
if {[dict get $setinfo settype] eq "block"} {
#unicode block must have a single range
#we consider a char a member of the block even if unassigned/reserved (as per unicode documentation)
set start [dict get [lindex $ranges 0] start]
set end [dict get [lindex $ranges 0] end]
if {![dict exists $charset_extents_startpoints $start] || $end ni [dict get $charset_extents_startpoints $start]} {
#assert if end wasn't in startpoints list - then start won't be in endpoints list
dict lappend charset_extents_startpoints $start $end
dict lappend charset_extents_endpoints $end $start
}
dict lappend charset_extents_rangenames ${start},${end} [list $setname 1]
} else {
#multirange sets/scripts may have holes. A char is not a member if it's not explicitly in a defined range.
#They should be in order within a set - but we don't assume so
set r 1
foreach range $ranges {
set start [dict get $range start]
set end [dict get $range end]
if {![dict exists $charset_extents_startpoints $start] || $end ni [dict get $charset_extents_startpoints $start]} {
#assert if end wasn't in startpoints list - then start won't be in endpoints list
dict lappend charset_extents_startpoints $start $end
dict lappend charset_extents_endpoints $end $start
}
dict lappend charset_extents_rangenames ${start},${end} [list $setname $r]
incr r
}
}
}
#maintain in sorted order
#-stride is available in lsort even at tcl8.6 - but not in lsearch
set charset_extents_startpoints [lsort -stride 2 -integer $charset_extents_startpoints]
set charset_extents_endpoints [lsort -stride 2 -integer $charset_extents_endpoints]
#no need to sort charset_extents_rangenames - lookup only done using dict methods
return [dict size $charset_extents_startpoints]
}
_build_charset_extents ;#rebuilds for all charsets
#nerdfonts are within the Private use E000 - F8FF range
proc load_nerdfonts {} {
variable charsets
variable charinfo
package require fileutil
set ver [package provide punk::char]
if {$ver ne ""} {
set ifneeded [package ifneeded punk::char [package provide punk::char]]
#puts stderr "punk::char ifneeded script: $ifneeded"
lassign [split $ifneeded ";"] _ sourceinfo
set basedir [file dirname [lindex $sourceinfo end]]
} else {
#review - will only work at package load time
set scr [info script]
if {$scr eq ""} {
error "load_nerdfonts unable to determine package folder"
}
set basedir [file dirname [info script]]
}
set pkg_data_dir [file join $basedir char]
set fname [file join $pkg_data_dir nerd-fonts-glyph-list.txt]
if {[file exists $fname]} {
#puts stderr "load_nerdfonts loading $fname"
set data [fileutil::cat -translation binary $fname]
set short_seen [dict create]
set current_set_range [dict create]
set filesets_loading [list]
foreach ln [split $data \n] {
set ln [string trim $ln]
if {$ln eq ""} {continue}
set desc [lassign $ln hex rawsetname]
set hexnum 0x$hex
set dec [expr {$hexnum}]
set setname "nf_$rawsetname" ;#Ensure nerdfont set names are prefixed.
if {$setname ni $filesets_loading} {
if {[dict exists $charsets $setname]} {
#set exists - but not in our filesets_loading list - therefore this set has been previously loaded, so clear old data first
dict unset charsets $setname
}
set newrange [list start $dec end $dec]
dict set current_set_range $setname $newrange
dict set charsets $setname [list ranges [list $newrange] description "nerd fonts $rawsetname" settype "nerdfonts privateuse"]
lappend filesets_loading $setname
}
#expects ordered glyph list
set existing_range [dict get $current_set_range $setname]
set existing_end [dict get $existing_range end]
if {$dec - $existing_end == 1} {
#part of current range
dict set current_set_range $setname end $dec
#overwrite last ranges element
set rangelist [lrange [dict get $charsets $setname ranges] 0 end-1]
lappend rangelist [dict get $current_set_range $setname]
dict set charsets $setname ranges $rangelist
} else {
#new range for set
dict set current_set_range $setname start $dec
dict set current_set_range $setname end $dec
set rangelist [dict get $charsets $setname ranges]
lappend rangelist [dict get $current_set_range $setname]
dict set charsets $setname ranges $rangelist
}
if {![dict exists $charinfo $dec]} {
# -- ---
#review
set map [list beaufort bf gibbous gb crescent cr thunderstorm tstorm thermometer thermom]
lappend map {*}[list directory dir creativecommons ccom creative_commons ccom forwardslash fs]
lappend map {*}[list multimedia mm multiple multi outline outl language lang]
lappend map {*}[list odnoklassniki okru]
# -- ---
#consider other ways to unambiguously shorten names?
#normalize nf_fa & nf_fa 'o' element to 'outl' so outlines can be searched across sets more easily (o not necessarily at last position)
set normdesc [list]
foreach el $desc {
if {$el eq "o"} {
set el "outl"
}
lappend normdesc $el
}
set joined_desc [join $normdesc _]
#map after join so we can normalize some underscored elements e.g creativecommons & creative_commons
set mapped_desc [string map $map $joined_desc]
set s nf_${rawsetname}_$mapped_desc
if {![dict exists $short_seen $s]} {
dict set short_seen $s {}
} else {
#duplicate in the data file (e.g 2023 weather night alt rain mix)
set s ${s}_$hex
}
dict set charinfo $dec [list desc "$desc" short $s]
}
}
_build_charshort
_build_charset_extents
} else {
puts stderr "unable to find glyph file. Tried $fname"
}
}
proc package_base {} {
#assume punk::char is in .tm form and we can use the package provide statement to determine base location
#review
set pkgver [package present punk::char]
set pkginfo [package ifneeded punk::char $pkgver]
set tmfile [lindex $pkginfo end]
set pkg_base [file dirname $tmfile]
return $pkg_base
}
namespace eval internal {
proc unicode_folder {} {
set parent [file join [punk::char::package_base] char]
set candidates [glob -nocomplain -type d -dir $parent -tail unicode*]
set candidates [lsort -increasing $candidates] ;#review - dictionary sort - how are unicode versions ranked/compared??
if {![llength $candidates]} {
error "Failed to find unicode data folder in folder '$parent'"
}
set folder [file join $parent [lindex $candidates end]]
return $folder
}
proc dict_getdef {dictValue args} {
if {[llength $args] < 2} {
error {wrong # args: should be "dict_getdef dictValue ?key ...? key default"}
}
set keys [lrange $args 0 end-1]
if {[dict exists $dictValue {*}$keys]} {
return [dict get $dictValue {*}$keys]
} else {
return [lindex $args end]
}
}
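#Illustrative usage ('setinfo' is a hypothetical dict variable):
#    dict_getdef $setinfo description ""  ;#value at key 'description', or "" if the key is absent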
}
#charsets structure
#dict set charsets "halfwidth_and_fullwidth_forms" [list ranges [list {start 65280 end 65519}] description "Halfwidth and Fullwidth Forms (variants) settype block"]
#unicode Blocks.txt
#load the defined blocks into 'charsets' and mark as type 'block'. Unicode blocks have only one range - and don't overlap.
#We don't treat unassigned/reserved codes within a block specially at this stage - ie we will not chop a block into subranges on that basis.
#unassigned code points should get certain default properties (e.g bidirectionality ) according to their block - so it makes sense to treat them as belonging to the block.
#They also get the general property of Cn (Other,not assigned or Other,reserved) and a "Basic Type" of Noncharacter or Reserved
proc load_unicode_blocks {} {
#sample data line
#0000..007F; Basic Latin
variable charsets
set file [file join [internal::unicode_folder] Blocks.txt]
if {![file exists $file]} {
error "Unicode Blocks.txt file not found at path '$file'"
}
puts "ok.. loading"
set fd [open $file r]
fconfigure $fd -translation binary
set data [read $fd]
close $fd
set block_count 0
foreach ln [split $data \n] {
set ln [string trim $ln]
if {[string match #* $ln]} {
continue
}
if {[set pcolon [string first ";" $ln]] > 0} {
set lhs [string trim [string range $ln 0 $pcolon-1]]
set name [string trim [string range $ln $pcolon+1 end]]
set lhsparts [split $lhs .]
set start [lindex $lhsparts 0]
set end [lindex $lhsparts end]
#puts "$start -> $end '$name'"
set decimal_start [expr {"0x$start"}]
set decimal_end [expr {"0x$end"}]
dict set charsets $name [list ranges [list [list start $decimal_start end $decimal_end note "unicode block $lhs"]] description "" settype block]
incr block_count
}
}
_build_charset_extents
return $block_count
}
#unicode scripts
#unicode UnicodeData.txt
#https://www.unicode.org/reports/tr44/#Property_Values
#unicode EastAsianWidth.txt
#classify width of character - which is contextual in some cases
#####
#Review - this is initial naive assumption that should get us mostly what we want for layout purposes in a utf-8-centric world.
#We will just load the values and treat H,N,Na as 1-wide and A,F,W as 2-wide for functions such as char::string_width on the basis that those using legacy sets can query the property and make their own determinations in those contexts.
####
# -- ---
#A = Ambiguous - All characters that can be sometimes wide and sometimes narrow. (wide in east asian legacy sets, narrow in non-east asian usage) (private use chars considered ambiguous)
#F = East Asian Full-width
#H = East Asian Half-width
#N = Not East Asian (Neutral) - all other characters. (do not occur in legacy East Asian character sets) - treated like Na
#Na = East Asian Narrow - all other characters that are always narrow and have explicit full-width counterparts (e.g includes all of ASCII)
#W = East Asian Wide - all other characters that are always wide (Occur only in the context of East Asian Typography)
# -- ---
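#A minimal sketch of the width mapping described above (variable names are illustrative):
#    switch -- $eaw {
#        A - F - W  {set colwidth 2}
#        H - N - Na {set colwidth 1}
#    }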
proc charshort {shortname} {
variable charshort
return [dict get $charshort $shortname]
}
proc box_drawing {args} {
return [charset "Box Drawing" {*}$args]
}
proc box_drawing_dict {} {
return [charset_dict "Box Drawing"]
}
proc char_info_hex {hex args} {
set hex [string map [list _ ""] $hex]
if {[string is xdigit -strict $hex]} {
#has no leading 0x
set dec [expr {"0x$hex"}]
} else {
set dec [expr {$hex}]
}
return [char_info_dec $dec {*}$args]
}
proc char_info {char args} {
#Note - on some versions of Tcl e.g 8.6 - the user could supply something like \U1f600 (smiley icon) but we receive fffd (replacement special)
#there is no way to detect what the user intended ie we can't distinguish if they actually typed \UFFFD
#we can test if such mapping happens in general - and warn if codepoint is FFFD in the result dict
set returninfo [dict create]
if {[string equal \UFFFD $char] && [string equal \U1F600 \UFFFD]} {
dict set returninfo WARNING "this Tcl build maps multiple codepoints to FFFD"
}
lassign [scan $char %c%s] dec_char remainder
if {[string length $remainder]} {
error "char_info requires a single character"
}
return [dict merge $returninfo [char_info_dec $dec_char {*}$args]]
}
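#Illustrative usage:
#    char_info A                                    ;#dict with default fields: dec hex desc short memberof
#    char_info A -fields {dec hex}                  ;#only the requested fields
#    char_info A -fields {all} -except {testwidth}  ;#all fields except the slow console width test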
proc char_info_dec {dec args} {
set dec_char [expr {$dec}]
set defaults [dict create\
-fields {default}\
-except {}\
]
set known_opts [dict keys $defaults]
#testwidth is so named because it performs an actual test on the console using ansi escapes - and the name gives a hint that it is a little slow
set known_fields [list all default dec hex desc short testwidth char memberof] ;#maint fields from charinfo 'desc' 'short'
#todo - unicode properties
# tclwhitespace (different to unicode concept of whitespace. review )
foreach {k v} $args {
if {![dict exists $defaults $k]} {
error "char_info unrecognised option '$k'. Known options:'$known_opts' known_fields: $known_fields usage: char_info <char> ?-fields {<fieldnames>}? ?-except {<fieldnames>}?"
}
}
set opts [dict merge $defaults $args]
# -- --- --- --- --- --- --- --- --- --- --- ---
set opt_fields [dict get $opts -fields]
set opt_except [dict get $opts -except]
# -- --- --- --- --- --- --- --- --- --- --- ---
set initial_fields [list]
if {"default" in $opt_fields} {
set initial_fields $known_fields
if {"testwidth" ni $opt_fields} {
if {"testwidth" ni $opt_except} {
lappend opt_except testwidth
}
}
if {"char" ni $opt_fields} {
if {"char" ni $opt_except} {
lappend opt_except char
}
}
} elseif {"all" in $opt_fields} {
set initial_fields $known_fields
} else {
foreach f $opt_fields {
if {$f in $known_fields} {
lappend initial_fields $f
} else {
error "char_info unknown field name: '$f' known fields: '$known_fields'"
}
}
}
foreach e $opt_except {
if {$e ni $known_fields} {
error "char_info unknown field name $e in -except. known fields: '$known_fields'"
}
}
set fields [list]
foreach f $initial_fields {
if {$f ne "all" && $f ni $opt_except} {
lappend fields $f
}
}
if {![llength $fields]} {
return
}
variable charinfo
variable charsets
set hex_char [format %04x $dec_char]
set returninfo [dict create]
foreach f $fields {
switch -- $f {
dec {
dict set returninfo dec $dec_char
}
hex {
dict set returninfo hex $hex_char
}
desc {
if {[dict exists $charinfo $dec_char desc]} {
dict set returninfo desc [dict get $charinfo $dec_char desc]
} else {
dict set returninfo desc ""
}
}
short {
if {[dict exists $charinfo $dec_char short]} {
dict set returninfo short [dict get $charinfo $dec_char short]
} else {
dict set returninfo short ""
}
}
testwidth {
#todo - expectedwidth - lookup the printing width it is *supposed* to have from unicode tables
#testwidth is one of the main ones likely to be worthwhile excluding as querying the console via ansi takes time
set existing_testwidth ""
if {[dict exists $charinfo $dec_char testwidth]} {
set existing_testwidth [dict get $charinfo $dec_char testwidth]
}
if {$existing_testwidth eq ""} {
#no cached data - do expensive cursor-position test (Note this might not be 100% reliable - terminals lie e.g if ansi escape sequence has set to wide printing.)
set char [format %c $dec_char]
set chwidth [char_info_testwidth $char]
dict set returninfo testwidth $chwidth
#cache it. todo - -verify flag to force recalc in case font/terminal changed in some way?
dict set charinfo $dec_char testwidth $chwidth
} else {
dict set returninfo testwidth $existing_testwidth
}
}
char {
set char [format %c $dec_char]
dict set returninfo char $char
}
memberof {
#memberof takes in the order of a few hundred microseconds if a simple scan of all ranges is taken - possibly worthwhile caching/optimising
#note that memberof is not just unicode blocks - but scripts and other non-contiguous sets consisting of multiple ranges - some of which may include ranges of a single character. (e.g WGL4)
#This means there probably isn't a really efficient means of calculating membership other than scanning all the defined ranges.
#We could potentially populate it using a side-thread - but it seems reasonable to just cache result after first use here.
#some efficiency could also be gained by pre-calculating the extents for each charset which isn't a simple unicode block. (and perhaps sorting by max char)
set memberof [list]
dict for {setname setinfo} $charsets {
foreach r [dict get $setinfo ranges] {
set s [dict get $r start]
set e [dict get $r end]
if {$dec_char >= $s && $dec_char <= $e} {
lappend memberof $setname
break
}
}
}
dict set returninfo memberof $memberof
}
}
}
return $returninfo
}
proc _char_info_dec_memberof_scan {dec} {
variable charsets
set memberof [list]
dict for {setname setinfo} $charsets {
foreach r [dict get $setinfo ranges] {
set s [dict get $r start]
set e [dict get $r end]
if {$dec >= $s && $dec <= $e} {
lappend memberof $setname
break
}
}
}
return $memberof
}
proc range_split_info {dec} {
variable charset_extents_startpoints
variable charset_extents_endpoints
set skeys [dict keys $charset_extents_startpoints]
set ekeys [dict keys $charset_extents_endpoints]
set splen [dict size $charset_extents_startpoints]
set eplen [dict size $charset_extents_endpoints]
set s [lsearch -bisect -integer $skeys $dec]
set s_at_or_below [lrange $skeys 0 $s]
set e_of_s [list]
foreach sk $s_at_or_below {
lappend e_of_s {*}[dict get $charset_extents_startpoints $sk]
}
set e_of_s [lsort -integer $e_of_s]
set splitposn [lsearch -bisect -integer $e_of_s $dec]
if {[lindex $e_of_s $splitposn] < $dec} {incr splitposn}
#set lhs_endpoints_to_check [expr {[llength $e_of_s] - $splitposn}]
set reduced_endpoints [lrange $e_of_s $splitposn end]
set sps [list]
foreach ep $reduced_endpoints {
lappend sps {*}[dict get $charset_extents_endpoints $ep]
}
set e [lsearch -bisect -integer $ekeys $dec]
if {$e >= 0} {
set e_at_or_above [lrange $ekeys $e end]
set s_of_e [list]
foreach ek $e_at_or_above {
lappend s_of_e {*}[dict get $charset_extents_endpoints $ek]
}
set startpoints_of_above [llength $s_of_e]
set splitposn [lsearch -bisect -integer $s_of_e $dec]
set reduced_startpoints [lrange $s_of_e 0 $splitposn]
set eps [list]
foreach sp $reduced_startpoints {
lappend eps {*}[dict get $charset_extents_startpoints $sp]
}
} else {
set s_of_e [list]
set reduced_startpoints [list]
set eps [list]
}
return [dict create startpoints $splen endpoints $eplen midpoint [expr {floor($eplen/2)}] posn $e lhs_endpoints_to_check "[llength $reduced_endpoints]/[llength $e_of_s]=[llength $sps]ranges" rhs_startpoints_to_check "[llength $reduced_startpoints]/[llength $s_of_e]=[llength $eps]"]
}
    #With just a few extra sets loaded (such as wgl4) in addition to the unicode blocks - this gives e.g 5x or better performance than the simple scan above (though it can be up to twice as slow under tcl 8.6)
    #performance is biased towards lower-numbered characters (which is not too bad in the context of unicode)
    #todo - could be tuned to perform better at the high end by assuming a fairly even distribution of ranges - searching startpoint ranges first for items left of the middle, endpoint ranges first for items right of the middle
    #review with scripts loaded and more defined ranges..
    #This algorithm supports arbitrary overlapping ranges and ranges with the same start & end points
    #Should be much better than O(n) for n sets - except for the pathological case of membership in all or nearly all intervals
    #review - compare with 'interval tree' algorithms.
    #(an illustrative sketch of the same narrowing technique on toy data follows the proc below)
proc char_info_dec_memberof {dec} {
variable charset_extents_startpoints
variable charset_extents_endpoints
variable charset_extents_rangenames
if {[package vcompare [info tclversion] 8.7a5] >= 0} {
#algorithm should theoretically be a little better with -stride
set last_smaller_or_equal_startposn [lsearch -stride 2 -bisect -integer $charset_extents_startpoints $dec]
set sets_starting_below [lrange $charset_extents_startpoints 0 $last_smaller_or_equal_startposn+1] ;#+1 to include 2nd element of stridden pair
set endpoints_of_starting_below [lsort -integer [concat {*}[dict values $sets_starting_below]]]
} else {
#no -stride available
set startkeys [dict keys $charset_extents_startpoints]
set last_smaller_or_equal_startkeyposn [lsearch -bisect -integer $startkeys $dec] ;#assert will always return one of the keys if number >=0 supplied (last key if > all)
#set startkey_found [lindex $startkeys $last_smaller_or_equal_startkeyposn]
set start_below_keys [lrange $startkeys 0 $last_smaller_or_equal_startkeyposn] ;#These are the keys of sets which start at or below dec
#puts "start_below_keys: '$start_below_keys'"
set endpoints_of_starting_below [list]
foreach belowkey $start_below_keys {
lappend endpoints_of_starting_below {*}[dict get $charset_extents_startpoints $belowkey]
}
set endpoints_of_starting_below [lsort -integer $endpoints_of_starting_below[unset endpoints_of_starting_below]]
}
set splitposn [lsearch -bisect -integer $endpoints_of_starting_below $dec] ;#splitposn = last smaller or equal endposn
if {[lindex $endpoints_of_starting_below $splitposn] < $dec} { incr splitposn}
set reduced_opposite_limit [lrange $endpoints_of_starting_below $splitposn end]
################
        #note each endpoint points to multiple startpoints which may still include some that are not in range. (e.g range y can share an endpoint with range x which starts in-range - but y starts above the character)
# x1 x2
# y1 y2
# c
################
#we have reduced our set of endpoints sufficiently (to those at or above dec) to run through and test each startpoint
set ranges [list]
foreach ep $reduced_opposite_limit {
foreach s [dict get $charset_extents_endpoints $ep] {
if {$s <= $dec} {
lappend ranges [dict get $charset_extents_rangenames $s,$ep]
}
}
}
return $ranges
}
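    # -- --- --- --- ---
    #Illustrative sketch only (not used by the package): demonstrates the same two-sided
    #'lsearch -bisect' narrowing used by char_info_dec_memberof, but against a tiny hardcoded
    #set of ranges instead of the module's charset_extents_* variables.
    #The proc name demo_interval_membership and its toy range names are hypothetical.
    proc demo_interval_membership {dec} {
        #toy data: name {start end} - ranges may overlap
        set ranges {alpha {0 9} bravo {5 20} charlie {15 30}}
        set start2ends [dict create] ;#startpoint -> list of endpoints
        set end2starts [dict create] ;#endpoint   -> list of startpoints
        set names      [dict create] ;#"start,end" -> range name
        foreach {name se} $ranges {
            lassign $se s e
            dict lappend start2ends $s $e
            dict lappend end2starts $e $s
            dict set names $s,$e $name
        }
        #1) bisect the sorted startpoints - keep only ranges starting at or below dec
        set startkeys [lsort -integer [dict keys $start2ends]]
        set i [lsearch -bisect -integer $startkeys $dec]
        if {$i < 0} {return {}} ;#dec is below every startpoint
        set candidate_ends [list]
        foreach s [lrange $startkeys 0 $i] {
            lappend candidate_ends {*}[dict get $start2ends $s]
        }
        #2) bisect those endpoints - keep only ranges that also end at or above dec
        set candidate_ends [lsort -integer -unique $candidate_ends]
        set j [lsearch -bisect -integer $candidate_ends $dec]
        if {$j == -1 || [lindex $candidate_ends $j] < $dec} {incr j}
        set result [list]
        foreach e [lrange $candidate_ends $j end] {
            #an endpoint may be shared with a range that starts above dec - filter those out
            foreach s [dict get $end2starts $e] {
                if {$s <= $dec} {lappend result [dict get $names $s,$e]}
            }
        }
        return $result ;#e.g demo_interval_membership 15 -> bravo charlie
    }
    # -- --- --- --- ---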
#with glob searching of description and short
proc char_range_dict {start end args} {
if {![string is integer -strict $start] || ![string is integer -strict $end]} {
error "char_range_dict error start and end must be integers"
}
set and_globs [list]
if {![llength $args]} {
set args [list *]
}
foreach glob $args {
if {![regexp {[*?]} $glob]} {
lappend and_globs "*$glob*"
} else {
lappend and_globs $glob
}
}
variable charinfo
set cdict [dict create]
set start [expr {$start}] ;#force string rep to decimal - otherwise first use of i as string could be hex or other rep whilst other i values will be decimal string rep due to incr
for {set i $start} {$i <= $end} {incr i} {
set hx [format %04x $i]
set ch [format %c $i]
if {[dict exists $charinfo $i desc]} {
set d [dict get $charinfo $i desc]
} else {
set d ""
}
if {[dict exists $charinfo $i short]} {
set s [dict get $charinfo $i short]
} else {
set s ""
}
set matchcount 0
foreach glob $and_globs {
if {[string match -nocase $glob $s] || [string match -nocase $glob $d]} {
incr matchcount
}
}
if {$matchcount == [llength $and_globs]} {
if {[dict exists $charinfo $i]} {
dict set cdict $hx [dict merge [dict create dec $i hex $hx char $ch] [dict get $charinfo $i]]
} else {
dict set cdict $hx [list dec $i hex $hx char $ch desc $d short $s]
}
}
}
return $cdict
}
#with glob searches of desc and short
proc char_range {start end args} {
package require overtype
if {![string is integer -strict $start] || ![string is integer -strict $end]} {
error "char_range error start and end must be integers"
}
set charset_dict [char_range_dict $start $end {*}$args]
set out ""
set col3 [string repeat " " 12]
dict for {k inf} $charset_dict {
set s [internal::dict_getdef $inf short ""]
set d [internal::dict_getdef $inf desc ""]
set s_col [overtype::left $col3 $s]
append out "$k [dict get $inf dec] [dict get $inf char] $s_col $d" \n
}
return $out
}
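    #Example usage (sketch - the desc/short text shown depends on what has been loaded into charinfo):
    # char_range 0x2500 0x250F ;#one line per char: hex, dec, the char itself, a short-name column, description
    # char_range 0x2500 0x257F horizontal ;#non-glob args are wrapped as *horizontal* and ANDed against desc/short
    # char_range_dict 0x2500 0x250F ;#the same data as a dict keyed by 4-digit hex (values include dec hex char and any desc/short)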
#non-overlapping unicode blocks
proc char_blocks {{name_or_glob *}} {
variable charsets
#todo - more efficient datastructures?
if {![regexp {[?*]} $name_or_glob]} {
#no glob - just retrieve it
if {[dict exists $charsets $name_or_glob]} {
if {[dict get $charsets $name_or_glob settype] eq "block"} {
return [dict create $name_or_glob [dict get $charsets $name_or_glob]]
}
}
#no exact match - try case insensitive..
set name [lsearch -inline -nocase [dict keys $charsets] $name_or_glob]
if {$name ne ""} {
if {[dict get $charsets $name settype] eq "block"} {
return [dict create $name [dict get $charsets $name]]
}
}
} else {
#build a subset
set charsets_block [dict create]
dict for {k v} $charsets {
if {[string match -nocase $name_or_glob $k]} {
if {[dict get $v settype] eq "block"} {
dict set charsets_block $k $v
}
}
}
return $charsets_block
}
}
proc charset_names {{name_or_glob *}} {
variable charsets
if {![regexp {[?*]} $name_or_glob]} {
#no glob - just retrieve it
if {[dict exists $charsets $name_or_glob]} {
return [list $name_or_glob]
}
#no exact match - try case insensitive..
set name [lsearch -inline -nocase [dict keys $charsets] $name_or_glob]
if {$name ne ""} {
return [list $name]
}
} else {
if {$name_or_glob eq "*"} {
return [lsort [dict keys $charsets]]
}
return [lsort [lsearch -all -inline -nocase [dict keys $charsets] $name_or_glob]]
}
}
#deprecated
#major named sets such as unicode blocks, scripts, and other sets such as microsoft WGL4
    #case insensitive search - possibly with *basic* globs (? and * only - not bracketed character classes)
proc charset_names2 {{namesearch *}} {
variable charsets
#dictionary sorting of the keys is slow! - we should obviously store it in sorted order instead of sorting entire list on retrieval - or just sort results
#set sortedkeys [lsort -increasing -dictionary [dict keys $charsets]] ;#NOTE must use -dictionary to use -sorted flag below
set sortedkeys [lsort -increasing [dict keys $charsets]]
if {$namesearch eq "*"} {
return $sortedkeys
}
if {[regexp {[?*]} $namesearch]} {
#name glob search
return [lsearch -all -inline -nocase $sortedkeys $namesearch] ;#no point using -sorted flag when -all is used
} else {
#return [lsearch -sorted -inline -nocase $sortedkeys $namesearch] ;#no globs - bails out earlier if -sorted?
return [lsearch -inline -nocase $sortedkeys $namesearch] ;#no globs
}
}
proc charsets {{namesearch *}} {
package require textblock
variable charsets
set charset_names [charset_names $namesearch]
set settype_list [list]
foreach setname $charset_names {
lappend settype_list [dict get $charsets $setname settype]
}
set charset_names [linsert $charset_names 0 "Set Name"]
set settype_list [linsert $settype_list 0 "Set Type"]
return [textblock::join [list_as_lines -- $charset_names] " " [list_as_lines $settype_list]]
}
proc charset_defget {exactname} {
variable charsets
return [dict get $charsets $exactname]
}
proc charset_defs {charsetname} {
variable charsets
set matches [charset_names $charsetname]
set def_list [list]
foreach setname $matches {
lappend def_list [dict create $setname [dict get $charsets $setname]]
}
return [join $def_list \n]
}
proc charset_dictget {exactname} {
variable charsets
set setinfo [dict get $charsets $exactname]
set ranges [dict get $setinfo ranges]
set charset_dict [dict create]
foreach r $ranges {
set start [dict get $r start]
set end [dict get $r end]
set charset_dict [dict merge $charset_dict [char_range_dict $start $end]]
}
return $charset_dict
}
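    #Example usage (sketch - 'Box Drawing' is an assumed exact set name; confirm with charset_names):
    # charset_dictget "Box Drawing" ;#dict keyed by 4-digit hex covering all ranges of the set (built via char_range_dict)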
proc charset_dicts {searchname} {
variable charsets
set matches [charset_names $searchname]
if {![llength $matches]} {
error "No charset found matching name '$searchname' - use 'charset_names' to get list"
}
set dict_list [list]
foreach m $matches {
lappend dict_list [dict create $m [charset_dictget $m]]
}
#return $dict_list
return [join $dict_list \n]
}
proc charset_page {namesearch args} {
_charset_page_search $namesearch $args ;#pass args to descsearch argument
}
proc _charset_page_search {namesearch search_this_and_that args} {
variable charsets
variable charinfo
set matched_names [charset_names $namesearch]
if {![llength $matched_names]} {
error "charset_page no charset matched pattern '$namesearch' - use 'charset_names' to get list"
}
set defaults [dict create\
-ansi 0\
-lined 1\
]
set opts [dict merge $defaults $args]
# -- --- --- ---
set opt_ansi [dict get $opts -ansi]
set opt_lined [dict get $opts -lined]
# -- --- --- ---
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
if {$opt_ansi} {
set a1 [a BLACK white bold]
set a2 [a]
} else {
set a1 ""
set a2 ""
}
set cols 16
set prefix " "
append out $prefix
foreach charsetname $matched_names {
if {[llength $search_this_and_that]} {
set setinfo [dict get $charsets $charsetname]
set ranges [dict get $setinfo ranges]
set charset_dict [dict create]
foreach r $ranges {
set start [dict get $r start]
set end [dict get $r end]
set charset_dict [dict merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]]
}
} else {
set charset_dict [charset_dictget $charsetname]
}
if {![dict size $charset_dict]} {
continue
}
set i 1
append out \n $prefix $charsetname
append out \n
set marker_line $prefix
set line $prefix
dict for {hex inf} $charset_dict {
set ch [dict get $inf char]
set twidth ""
set dec [expr {"0x$hex"}]
if {[dict exists $charinfo $dec testwidth]} {
set twidth [dict get $charinfo $dec testwidth]
}
if {$twidth eq ""} {
#set width [ansifreestring_width $ch] ;#based on unicode props
set width [grapheme_width_cached $ch]
} else {
set width $twidth
}
if {$width == 0} {
set marker " "
if {[regexp $re_diacritics $ch]} {
                        #attempt to combine with spaces to get a 3-wide display value with the diacritic showing on the left space
#todo - dualchar diacritics?
set displayv " $ch "
} else {
set displayv " "
}
} elseif {$width == 1} {
set marker "_ "
set displayv "${a1}$ch${a2} "
} else {
#presumed 2
set marker "__ "
set displayv "${a1}$ch${a2} "
}
set hexlen [string length $hex]
append marker_line "[string repeat " " $hexlen] $marker"
append line "$hex $displayv"
if {$i == [dict size $charset_dict] || $i % $cols == 0} {
if {$opt_lined} {
append out $marker_line \n
}
append out $line \n
set marker_line $prefix
set line $prefix
#set out [string range $out 0 end-2]
#append out \n " "
}
incr i
}
}
set out [string trimright $out " "]
return $out
}
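    #Example usage (sketch - set names here are assumptions; confirm with charset_names):
    # charset_page "Box Drawing" ;#grid of hex codes and glyphs, 16 per row, with _/__ width-marker lines above each row
    # charset_page arrow* right ;#name glob plus ANDed desc/short search terms (passed through to char_range_dict)
    # _charset_page_search arrow* {} -ansi 1 -lined 0 ;#internal form - the -ansi/-lined options are only reachable here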
#allows search on both name and an anded list of globs to be applied to description & short
proc charset {namesearch args} {
package require overtype
variable charsets
set matched_names [charset_names $namesearch]
if {![llength $matched_names]} {
error "No charset matched pattern '$namesearch' - use 'charset_names' to get list"
}
set search_this_and_that $args
set out ""
foreach charsetname $matched_names {
if {[llength $search_this_and_that]} {
set setinfo [dict get $charsets $charsetname]
set ranges [dict get $setinfo ranges]
set charset_dict [dict create]
foreach r $ranges {
set start [dict get $r start]
set end [dict get $r end]
set charset_dict [dict merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]]
}
} else {
set charset_dict [charset_dictget $charsetname]
}
set col_items_short [list]
set col_items_desc [list]
dict for {k inf} $charset_dict {
lappend col_items_desc [internal::dict_getdef $inf desc ""]
lappend col_items_short [internal::dict_getdef $inf short ""]
}
if {[llength $col_items_desc]} {
set widest3 [tcl::mathfunc::max {*}[lmap v $col_items_short {string length $v}]]
if {$widest3 == 0} {
set col3 " "
} else {
set col3 [string repeat " " $widest3]
}
dict for {k inf} $charset_dict {
set s [internal::dict_getdef $inf short ""]
set d [internal::dict_getdef $inf desc ""]
set s_col [overtype::left $col3 $s]
append out "$k [dict get $inf char] $s_col $d" \n
}
}
}
return $out
}
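    #Example usage (sketch - set names here are assumptions; confirm with charset_names):
    # charset "Box Drawing" ;#one line per char: hex, the char itself, a short-name column, description
    # charset latin* macron ;#name glob, with remaining args wrapped as *macron* and ANDed against desc/short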
#use console cursor movements to test and cache the column-width of each char in the set of characters returned by the search criteria
proc charset_calibrate {namesearch args} {
variable charsets
variable charinfo
set matched_names [charset_names $namesearch]
if {![llength $matched_names]} {
error "No charset matched pattern '$namesearch' - use 'charset_names' to get list"
}
set search_this_and_that $args
set charcount 0
set width_results [dict create]
puts stdout "calibrating using terminal cursor movements.."
foreach charsetname $matched_names {
if {[llength $search_this_and_that]} {
set setinfo [dict get $charsets $charsetname]
set ranges [dict get $setinfo ranges]
set charset_dict [dict create]
foreach r $ranges {
set start [dict get $r start]
set end [dict get $r end]
set charset_dict [dict merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]]
}
} else {
set charset_dict [charset_dictget $charsetname]
}
if {![dict size $charset_dict]} {
continue
}
dict for {hex inf} $charset_dict {
set ch [format %c 0x$hex]
set twidth ""
set dec [expr {"0x$hex"}]
if {[dict exists $charinfo $dec testwidth]} {
set twidth [dict get $charinfo $dec testwidth]
}
if {$twidth eq ""} {
#puts -nonewline stdout "." ;#this
set width [char_info_testwidth $ch] ;#based on console test rather than unicode props
dict set charinfo $dec testwidth $width
} else {
set width $twidth
}
dict incr width_results $width
incr charcount
}
}
puts stdout "\ncalibration done - results cached in charinfo dictionary"
return [dict create charcount $charcount widths $width_results]
}
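    #Example usage (sketch - requires an interactive terminal that answers cursor-position queries):
    # charset_calibrate "Box Drawing" ;#returns a dict of the form: charcount <n> widths {1 <count> 2 <count> ...}
    # ;#each tested width is cached under the char's testwidth key in the charinfo dict for later lookups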
#maint warning - also in overtype!
#intended for single grapheme - but will work for multiple
#cannot contain ansi or newlines
#(a cache of ansifreestring_width calls - as these are quite regex heavy)
proc grapheme_width_cached {ch} {
variable grapheme_widths
if {[dict exists $grapheme_widths $ch]} {
return [dict get $grapheme_widths $ch]
}
set width [punk::char::ansifreestring_width $ch] ;#review - can we provide faster version if we know it's a single grapheme rather than a string? (grapheme is still a string as it may have combiners/diacritics)
dict set grapheme_widths $ch $width
return $width
}
#no char_width - use grapheme_width terminology to be clearer
proc grapheme_width {char} {
error "grapheme_width unimplemented - use ansifreestring_width"
}
#return N Na W etc from unicode data
#review
proc char_uc_width_prop {char} {
error "char_uc_width unimplemented try textutil::wcswidth_type"
}
#todo - provide a grapheme_width function that is optimised for speed
proc string_width {text} {
        #burns approx 2uS (2024) checking for ansi codes (not just SGR)
if {[punk::ansi::ta::detect $text]} {
puts stderr "string_width detected ANSI!"
}
if {[string first \n $text] >= 0} {
error "string_width accepts only a single line"
}
tailcall ansifreestring_width $text
}
    #faster than textutil::wcswidth (at least for strings up to a few K in length)
proc wcswidth {string} {
set codes [scan $string [string repeat %c [string length $string]]]
set width 0
foreach c $codes {
set w [textutil::wcswidth_char $c]
if {$w < 0} {
return -1
} else {
incr width $w
}
}
return $width
}
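    #Example (sketch):
    # wcswidth abc ;#3 - ascii chars count as single width
    # wcswidth \u4e2d\u6587 ;#expected 4 - CJK ideographs report as double width via textutil::wcswidth_char
    # ;#a return of -1 means at least one char reported a negative width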
proc wcswidth2 {string} {
set codes [scan $string [string repeat %c [string length $string]]]
set widths [lmap c $codes {textutil::wcswidth_char $c}]
if {-1 in $widths} {
return -1
}
return [tcl::mathop::+ {*}$widths]
}
#prerequisites - no ansi escapes - no newlines
#review - what about \r \t \b ?
#NO processing of \b - already handled in ansi::printing_length which then calls this
#this version breaks string into sequences of ascii vs unicode
proc ansifreestring_width {text} {
#caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines
        #we can handle c0 control characters after or while processing ansi escapes.
        #we need to map remaining control characters to zero-width (under the assumption we're not using a font/codepage that displays them - review!)
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#if {[string first \033 $text] >= 0} {
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::stripansi first"
#}
#todo - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc
#as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence.
#- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block}
#- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block}
#- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block}
#- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block}
#- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block}
#
# initial simplistic approach is just to strip these ... todo REVIEW
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#if {[regexp $re_leading_diacritic $text]} {
# set text " $text"
#}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set text [regsub -all $re_diacritics $text ""]
#review
        #if we strip out ZWJ \u200d (zero width joiner) - then we don't give a grapheme combining test the chance to merge properly e.g combining emojis
        #as at 2024 - textutil::wcswidth doesn't seem to be aware of these - and counts the joiner as one wide
        #ZWNJ \u200c should also be zero width - but as a 'non' joiner it should add zero to the length
        #ZWSP \u200b zero width space
        #we should only map control characters to nothing after processing ones with length effects, such as \b (\x08) or DEL (\x7f)
#todo - document that these shouldn't be present in input rather than explicitly checking here
set re_ascii_c0 {[\U0000-\U001F]}
set text [regsub -all $re_ascii_c0 $text ""]
#short-circuit basic cases
#support tcl pre 2023-11 - see regexp bug below
#if {![regexp {[\uFF-\U10FFFF]} $text]} {
# return [string length $text]
#}
if {![regexp "\[\uFF-\U10FFFF\]" $text]} {
return [string length $text]
}
#split just to get the standalone character widths - and then scan for other combiners (?) - or scan for clusters first?
#review
#set can_regex_high_unicode [string match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525]
#tcl pre 2023-11 - braced high unicode regexes don't work
#fixed in bug-4ed788c618 2023-11
#set uc_chars [regexp -all -inline "\[\u0100-\U10FFFF\]" $text] ;#e.g return list of chars in range only
#maintain unicode as sequences - todo - scan for grapheme clusters
#set uc_sequences [punk::ansi::ta::_perlish_split "(?:\[\u0100-\U10FFFF\])+" $text]
set uc_sequences [punk::ansi::ta::_perlish_split "(?:\[\u0000-\u00FF\])+" $text]
set len 0
foreach {uc ascii} $uc_sequences {
#puts "-ascii $ascii"
#puts "-uc $uc"
incr len [string length $ascii]
#textutil::wcswidth uses unicode data
            #fall back to textutil::wcswidth (which doesn't for example handle diacritics/combiners so we can't use it until these and other things such as \u200b are already stripped/accounted for)
#todo - find something that understands grapheme clusters - needed also for grapheme_split
#use punk::char::wcswidth - faster than the string split in textutil::wcswidth but still uses textutil::wcswidth_char
incr len [wcswidth $uc]
}
        #todo - work out what to do with anomalies like the grave combiner which prints at different lengths on different terminals (fonts?) and for which the cursor-query escape sequence lies.
return $len
}
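    #Example (sketch - input must already be ansi-stripped and newline free):
    # ansifreestring_width "abc" ;#3
    # ansifreestring_width "abc\u0301" ;#3 - the combining acute (U+0301) is stripped by the diacritics regex
    # ansifreestring_width "ab\u4e2d" ;#expected 4 - 2 ascii chars plus 2 for the CJK char via wcswidth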
#kept as a fallback for review/test if textutil::wcswidth doesn't do what we require on all terminals.
#this version looks at console testwidths if they've been cached.
#It is relatively fast - but tests unicode widths char by char - so won't be useful going forward for grapheme clusters.
proc ansifreestring_width2 {text} {
#caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines
#we can c0 control characters after or while processing ansi escapes.
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#if {[string first \033 $text] >= 0} {
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::stripansi first"
#}
#todo - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc
#as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence.
#- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block}
#- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block}
#- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block}
#- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block}
#- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block}
#
# initial simplistic approach is just to strip these ... todo REVIEW
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#if {[regexp $re_leading_diacritic $text]} {
# set text " $text"
#}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set text [regsub -all $re_diacritics $text ""]
#review
        #if we strip out ZWJ \u200d (zero width joiner) - then we don't give a grapheme combining test the chance to merge properly e.g combining emojis
        #as at 2024 - textutil::wcswidth doesn't seem to be aware of these - and counts the joiner as one wide
        #ZWNJ \u200c should also be zero width - but as a 'non' joiner it should add zero to the length
        #ZWSP \u200b zero width space
        #we should only map control characters to nothing after processing ones with length effects, such as \b (\x08) or DEL (\x7f)
#todo - document that these shouldn't be present in input rather than explicitly checking here
set re_ascii_c0 {[\U0000-\U001F]}
set text [regsub -all $re_ascii_c0 $text ""]
#short-circuit basic cases
#support tcl pre 2023-11 - see regexp bug below
#if {![regexp {[\uFF-\U10FFFF]} $text]} {
# return [string length $text]
#}
if {![regexp "\[\uFF-\U10FFFF\]" $text]} {
return [string length $text]
}
#review - wcswidth should detect these
set re_ascii_fullwidth {[\uFF01-\uFF5e]}
set doublewidth_char_count 0
set zerowidth_char_count 0
#split just to get the standalone character widths - and then scan for other combiners (?)
#review
#set can_regex_high_unicode [string match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525]
#tcl pre 2023-11 - braced high unicode regexes don't work
#fixed in bug-4ed788c618 2023-11
#set uc_sequences [regexp -all -inline -indices {[\u0100-\U10FFFF]} $text]
#set uc_sequences [regexp -all -inline -indices "\[\u0100-\U10FFFF\]" $text] ;#e.g for string: \U0001f9d1abc\U001f525ab returns {0 0} {4 4}
set uc_chars [regexp -all -inline "\[\u0100-\U10FFFF\]" $text] ;#e.g return list of chars in range only
foreach c $uc_chars {
if {[regexp $re_ascii_fullwidth $c]} {
incr doublewidth_char_count
} else {
#review
# a)- terminals lie - so we could have a bad cached testwidth
                # b)- textutil::wcswidth_char seems to be east-asian-width based - and not a reliable indicator of 'character cells taken by the character when printed to the terminal' despite this statement in the tcllib docs.
#(character width is a complex context-dependent topic)
# c) by checking for a cached console testwidth first - we make this function less deterministic/repeatable depending on whether console tests have been run.
# d) Upstream caching of grapheme_width may also lock in a result from whatever method was first employed here
#Despite all this - the mechanism is hoped to give best effort consistency for terminals
#further work needed for combining emojis etc - which can't be done in a per character loop
#todo - see if wcswidth does any of this. It is very slow for strings that include mixed ascii/unicode - so perhaps we can use a perlish_split
# to process sequences of unicode.
#- and the user has the option to test character sets first if terminal position reporting gives better results
if {[char_info_is_testwidth_cached $c]} {
set width [char_info_testwidth_cached $c]
} else {
#textutil::wcswidth uses unicode data
                    #fall back to textutil::wcswidth (which doesn't for example handle diacritics/combiners so we can't use it until these and other things such as \u200b are already stripped/accounted for)
set width [textutil::wcswidth_char [scan $c %c]]
}
if {$width == 0} {
incr zerowidth_char_count
} elseif {$width == 2} {
incr doublewidth_char_count
}
}
}
        #todo - work out what to do with anomalies like the grave combiner which prints at different lengths on different terminals (fonts?) and for which the cursor-query escape sequence lies.
return [expr {[string length $text] + $doublewidth_char_count - $zerowidth_char_count}]
}
    #slow - textutil::wcswidth is slow with mixed ascii/unicode
proc ansifreestring_width3 {text} {
#caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines
        #we can handle c0 control characters after or while processing ansi escapes.
        #we need to map remaining control characters to zero-width (under the assumption we're not using a font/codepage that displays them - review!)
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#if {[string first \033 $text] >= 0} {
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::stripansi first"
#}
        #review - work out what to do with anomalies like the grave combiner which prints at different lengths on different terminals (fonts?) and for which the cursor-query escape sequence lies.
# - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc
#as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence.
#- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block}
#- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block}
#- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block}
#- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block}
#- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block}
#
# initial simplistic approach is just to strip these ... todo REVIEW
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#if {[regexp $re_leading_diacritic $text]} {
# set text " $text"
#}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set text [regsub -all $re_diacritics $text ""]
        #we should only map control characters to nothing after processing ones with length effects, such as \b (\x08) or DEL (\x7f)
#todo - document that these shouldn't be present in input rather than explicitly checking here
set re_ascii_c0 {[\U0000-\U001F]}
set text [regsub -all $re_ascii_c0 $text ""]
#short-circuit basic cases
#support tcl pre 2023-11 - see regexp bug below
#if {![regexp {[\uFF-\U10FFFF]} $text]} {
# return [string length $text]
#}
if {![regexp "\[\uFF-\U10FFFF\]" $text]} {
return [string length $text]
}
#slow when ascii mixed with unicode (but why?)
return [punk::wcswidth $text]
}
#This shouldn't be called on text containing ansi codes!
proc strip_nonprinting_ascii {str} {
#review - some single-byte 'control' chars have visual representations e.g ETX as heart depending on font/codepage
#It is currently used for screen display width calculations
#equivalent for various unicode combining chars etc?
set map [list\
\x00 ""\
\x07 ""\
\x7f ""\
]
return [string map $map $str]
}
#split into plaintext and runs of combiners (combining diacritical marks - not ZWJ or ZWJNJ)
proc combiner_split {text} {
        #split into an odd-length list (or empty list) in a similar way to punk::ansi::ta::_perlish_split
#
#set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set graphemes [list]
if {[string length $text] == 0} {
return {}
}
set list [list]
set start 0
set strlen [string length $text]
#make sure our regexes aren't non-greedy - or we may not have exit condition for loop
#review
while {$start < $strlen && [regexp -start $start -indices -- {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} $text match]} {
lassign $match matchStart matchEnd
#puts "->start $start ->match $matchStart $matchEnd"
lappend list [string range $text $start $matchStart-1] [string range $text $matchStart $matchEnd]
set start [expr {$matchEnd+1}]
#if {$start >= [string length $text]} {
# break
#}
}
        lappend list [string range $text $start end]
        return $list
    }
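    #Example (sketch): combiner_split returns an odd-length list (or empty for empty input)
    #alternating plaintext and combiner runs, ending with a (possibly empty) plaintext element:
    # combiner_split ab\u0301c ;#-> 3 elements: {ab} {\u0301} {c}
    # combiner_split abc ;#-> 1 element: {abc}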
#ZWJ ZWNJ ?
#1st shot - basic diacritics
#todo - become aware of unicode grapheme cluster boundaries
    #This is difficult in Tcl without unicode property-based character classes in the regex engine
#review - this needs to be performant - it is used a lot by punk terminal/ansi features
#todo - trie data structures for unicode?
#for now we can at least combine diacritics
#should also handle the ZWJ (and the variation selectors? eg \uFE0F) character which should account for emoji clusters
#Note - emoji cluster could be for example 11 code points/41 bytes (family emoji with skin tone modifiers for each member, 3 ZWJs)
    #This still leaves whole classes of clusters (e.g Korean Hangul jamo sequences) unhandled :/
proc grapheme_split {text} {
set graphemes [list]
set csplits [combiner_split $text]
foreach {pt combiners} [lrange $csplits 0 end-1] {
set clist [split $pt ""]
lappend graphemes {*}[lrange $clist 0 end-1]
lappend graphemes [string cat [lindex $clist end] $combiners]
}
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
if {[lindex $csplits end] ne ""} {
lappend graphemes {*}[split [lindex $csplits end] ""]
}
return $graphemes
}
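    #Example (sketch):
    # grapheme_split ab\u0301c ;#-> 3 graphemes: {a} {b\u0301} {c} - the combiner stays attached to b
    # ;#compare: string length returns 4 for the same string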
proc grapheme_split_dec {text} {
set graphemes [list]
set csplits [combiner_split $text]
foreach {pt combiners} [lrange $csplits 0 end-1] {
set pt_decs [scan $pt [string repeat %c [string length $pt]]]
set combiner_decs [scan $combiners [string repeat %c [string length $combiners]]]
lset pt_decs end [concat [lindex $pt_decs end] $combiner_decs]
lappend graphemes {*}$pt_decs
}
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
if {[lindex $csplits end] ne ""} {
lappend graphemes {*}[scan [lindex $csplits end] [string repeat %c [string length [lindex $csplits end]]]]
}
return $graphemes
}
proc grapheme_split_dec2 {text} {
set graphemes [list]
set csplits [combiner_split $text]
foreach {pt combiners} $csplits {
set pt_decs [scan $pt [string repeat %c [string length $pt]]]
if {$combiners ne ""} {
set combiner_decs [scan $combiners [string repeat %c [string length $combiners]]]
lset pt_decs end [concat [lindex $pt_decs end] $combiner_decs]
}
lappend graphemes {*}$pt_decs
}
return $graphemes
}
proc grapheme_split2 {text} {
set graphemes [list]
set csplits [combiner_split $text]
foreach {pt combiners} [lrange $csplits 0 end-1] {
set clist [split $pt ""]
lappend graphemes {*}[lrange $clist 0 end-1] [string cat [lindex $clist end] $combiners]
}
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
if {[lindex $csplits end] ne ""} {
lappend graphemes {*}[split [lindex $csplits end] ""]
}
return $graphemes
}
# -- --- --- --- ---
#will accept a single char or a string - test using console cursor position reporting
#unreliable
proc char_info_testwidth {ch {emit 0}} {
package require punk::console
#uses cursor movement and relies on console to report position.. which it may do properly for single chars - but may misreport for combinations that combine into a single glyph
tailcall punk::console::test_char_width $ch $emit
}
proc char_info_testwidth_cached {char} {
variable charinfo
set dec [scan $char %c]
set twidth ""
if {[dict exists $charinfo $dec testwidth]} {
set twidth [dict get $charinfo $dec testwidth]
}
if {$twidth eq ""} {
set width [char_info_testwidth $char]
dict set charinfo $dec testwidth $width
return $width
} else {
return $twidth
}
}
proc char_info_is_testwidth_cached {char} {
variable charinfo
return [dict exists $charinfo [scan $char %c] testwidth]
}
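    #Example (sketch - the uncached case requires an interactive terminal):
    # char_info_is_testwidth_cached \u4e2d ;#0 or 1 depending on whether a console width test has been cached for this char
    # char_info_testwidth_cached \u4e2d ;#runs punk::console::test_char_width if not already cached, then stores it in charinfo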
# -- --- --- --- ---
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
## Ready
package provide punk::char [namespace eval punk::char {
variable version
set version 0.1.0
}]
return
#*** !doctools
#[manpage_end]