You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

2676 lines
134 KiB

# -*- tcl -*-
# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from <pkg>-buildversion.txt
#
# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem.
# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository.
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# (C) 2023
#
# @@ Meta Begin
# Application punk::char 0.1.0
# Meta platform tcl
# Meta license <unspecified>
# @@ Meta End
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# doctools header
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[manpage_begin punkshell_module_punk::char 0 0.1.0]
#[copyright "2024"]
#[titledesc {character-set and unicode utilities}] [comment {-- Name section and table of contents description --}]
#[moddesc {character-set nad unicode}] [comment {-- Description at end of page heading --}]
#[require punk::char]
#[keywords module encodings]
#[description]
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section Overview]
#[para] overview of punk::char
#[subsection Concepts]
#[para] -
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
## Requirements
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
## Requirements
##e.g package require frobz
#*** !doctools
#[subsection dependencies]
#[para] packages used by punk::char
#[list_begin itemized]
#[item] [package {Tcl 8.6}]
#
#*** !doctools
#[item] [package {overtype}]
#[para] -
#[item] [package {textblock}]
#[para] -
#[item] [package console]
#[para] -
package require Tcl 8.6-
#dependency on tcllib not ideal for bootstrapping as punk::char is core to many features.. (textutil::wcswidth is therefore included in bootsupport/include_modules.config) review
package require textutil
package require textutil::wcswidth
#*** !doctools
#[list_end]
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section API]
#Note that ansi escapes can begin with \033\[ (\u001b\[) or the single character "Control Sequence Introducer" 0x9b
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
tcl::namespace::eval punk::char {
tcl::namespace::export *
variable grapheme_widths [tcl::dict::create]
# -- --------------------------------------------------------------------------
variable encmimens ;#namespace of mime package providing reversemapencoding and mapencoding functions
#tcllib mime requires tcl::chan::memchan,events,core and/or Trf
if {![catch {package require punk::encmime} errM]} {
set encmimens ::punk::encmime
} else {
package require mime
set encmimens ::mime
}
# -- --------------------------------------------------------------------------
variable invalid "???" ;# ideally this would be 0xFFFD - which should display as black diamond/rhombus with question mark. As at 2023 - this tends to display indistinguishably from other missing glyph boxes - so we use a longer sequence we can detect by length and to display obviously
variable invalid_display_char \u25ab; #todo - change to 0xFFFD once diamond glyph more common?
#just the 7-bit ascii. use [page ascii] for the 8-bit layout
proc ascii {} {return {
00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL
08 BS 09 HT 0a LF 0b VT 0c FF 0d CR 0e SO 0f SI
10 DLE 11 DC1 12 DC2 13 DC3 14 DC4 15 NAK 16 SYN 17 ETB
18 CAN 19 EM 1a SUB 1b ESC 1c FS 1d GS 1e RS 1f US
20 SP 21 ! 22 " 23 # 24 $ 25 % 26 & 27 '
28 ( 29 ) 2a * 2b + 2c , 2d - 2e . 2f /
30 0 31 1 32 2 33 3 34 4 35 5 36 6 37 7
38 8 39 9 3a : 3b ; 3c < 3d = 3e > 3f ?
40 @ 41 A 42 B 43 C 44 D 45 E 46 F 47 G
48 H 49 I 4a J 4b K 4c L 4d M 4e N 4f O
50 P 51 Q 52 R 53 S 54 T 55 U 56 V 57 W
58 X 59 Y 5a Z 5b [ 5c \ 5d ] 5e ^ 5f _
60 ` 61 a 62 b 63 c 64 d 65 e 66 f 67 g
68 h 69 i 6a j 6b k 6c l 6d m 6e n 6f o
70 p 71 q 72 r 73 s 74 t 75 u 76 v 77 w
78 x 79 y 7a z 7b { 7c | 7d } 7e ~ 7f DEL
}}
#G0 character set
proc ascii2 {} {
set dict [asciidict2]
set out ""
set i 1
append out " "
tcl::dict::for {k v} $dict {
#single chars are wrapped with \033(0 and \033(B ie total length 7
if {[tcl::string::length $v] == 7} {
set v " $v "
} elseif {[tcl::string::length $v] == 2} {
set v "$v "
} elseif {[tcl::string::length $v] == 0} {
set v " "
}
append out "$k $v "
if {$i > 0 && $i % 8 == 0} {
set out [tcl::string::range $out 0 end-2]
append out \n " "
}
incr i
}
set out [tcl::string::trimright $out " "]
return $out
}
proc symbol {} {
tailcall page symbol
}
proc dingbats {} {
set out ""
append out [page dingbats] \n
set unicode_dict [charset_dictget Dingbats]
append out " "
set i 1
tcl::dict::for {k charinfo} $unicode_dict {
set char [tcl::dict::get $charinfo char]
if {[tcl::string::length $char] == 0} {
set displayv " "
} elseif {[tcl::string::length $char] == 1} {
set displayv " $char "
} else {
set displayv $char
}
append out "$k $displayv "
if {$i > 0 && $i % 8 == 0} {
set out [tcl::string::range $out 0 end-2]
append out \n " "
}
incr i
}
return $out
}
proc page_names {{search *}} {
set all_names [list]
set d [page_names_dict $search]
tcl::dict::for {k v} $d {
if {$k ni $all_names} {
lappend all_names $k
}
foreach m $v {
if {$m ni $all_names} {
lappend all_names $m
}
}
}
return [lsort $all_names]
}
proc page_names_help {{namesearch *}} {
set d [page_names_dict $namesearch]
set out ""
tcl::dict::for {k v} $d {
append out "$k $v" \n
}
return [linesort $out]
}
proc page_names_dict {{search *}} {
if {![regexp {[?*]} $search]} {
set search "*$search*"
}
set encnames [encoding names]
foreach enc $encnames {
tcl::dict::set d $enc [list]
}
variable encmimens
set mimenames [array get ${encmimens}::reversemap]
tcl::dict::for {mname encname} $mimenames {
if {$encname in $encnames} {
set enclist [tcl::dict::get $d $encname]
if {$mname ni $enclist} {
tcl::dict::lappend d $encname $mname
}
}
}
foreach enc [lsort $encnames] {
set mime_enc [${encmimens}::mapencoding $enc]
if {$mime_enc ne ""} {
set enclist [tcl::dict::get $d $enc]
if {$mime_enc ni $enclist} {
tcl::dict::lappend d $enc $mime_enc
}
}
}
set dresult [tcl::dict::create]
if {$search ne "*"} {
tcl::dict::for {k v} $d {
if {[tcl::string::match -nocase $search $k] || ([lsearch -nocase $v $search]) >= 0} {
tcl::dict::set dresult $k $v
}
}
} else {
set dresult $d
}
return $dresult
}
proc page8 {encname args} {
tcl::dict::set args -cols 8
tailcall page $encname {*}$args
}
proc page16 {encname args} {
tcl::dict::set args -cols 16
tailcall page $encname {*}$args
}
#This will not display for example, c0 glyphs for cp437
# we could use the punk::ansi::cp437_map dict - but while that might be what some expect to see - it would probably be too much magic for this function - which is intended to align more with what Tcl's encoding convertfrom/to actually does.
# for nonprinting members of the page, 2 and 3 letter codes are used rather than unicode visualisation replacements or even unicode equivalent replacements known to be appropriate for the page
proc page {encname args} {
variable invalid
set encname [encname $encname]
set defaults [list\
-range {0 256}\
-cols 16\
]
set opts [tcl::dict::merge $defaults $args]
# -- --- --- --- --- --- --- --- ---
set cols [tcl::dict::get $opts -cols]
# -- --- --- --- --- --- --- --- ---
set d_bytedisplay [basedict_display]
#set d_ascii [pagedict_raw ascii]
set d_ascii [basedict]
set d_asciiposn [lreverse $d_ascii] ;#values should be unique except for "???". We are mainly interested in which ones have display-names e.g cr csi
#The results of this are best seen by comparing the ebcdic and ascii pages
set d_page [pagedict_raw $encname]
set out ""
set i 1
append out " "
tcl::dict::for {k rawchar} $d_page {
set num [expr {"0x$k"}]
#see if ascii equivalent exists and has a name
if {$rawchar eq $invalid} {
set displayv "$invalid"
} else {
set bytedisplay ""
if {[tcl::dict::exists $d_asciiposn $rawchar]} {
set asciiposn [tcl::dict::get $d_asciiposn $rawchar]
set bytedisplay [tcl::dict::get $d_bytedisplay $asciiposn]
}
if {$bytedisplay eq $invalid} {
#
set displayv " $rawchar "
} else {
set displaylen [tcl::string::length $bytedisplay]
if {$displaylen == 2} {
set displayv "$bytedisplay "
} elseif {$displaylen == 3} {
set displayv $bytedisplay
} else {
if {[tcl::string::length $rawchar] == 0} {
set displayv " "
} else {
#presumed 1
set displayv " $rawchar "
}
}
}
}
append out "$k $displayv "
if {$i > 0 && $i % $cols == 0} {
set out [tcl::string::range $out 0 end-2]
append out \n " "
}
incr i
}
set out [tcl::string::trimright $out " "]
return $out
}
proc pagechar1 {page num} {
set encpage [encname $page]
encoding convertfrom $encpage [format %c $num]
}
proc pagechar {page num} {
set encpage [encname $page]
set ch [format %c $num]
if {[decodable $ch $encpage]} {
set outchar [encoding convertfrom $encpage $ch]
} else {
#here we will use \0xFFFD instead of our replacement string ??? - as the name pagechar implies always returning a single character. REVIEW.
set outchar $::punk::char::invalid_display_char
}
return $outchar
}
proc pagechar_info {page num} {
set ch [format %c $num]
set h [format %04x $num]
set encpage [encname $page]
if {[decodable $ch $encpage]} {
set outchar [encoding convertfrom $encpage $ch]
} else {
error "pagechar_info: $h not decodable from $encpage"
}
package require punk::console
puts -nonewline stdout \033\[s;flush stdout
lassign [punk::console::get_cursor_pos_list] _row1 col1
puts -nonewline stdout "$outchar";flush stdout
lassign [punk::console::get_cursor_pos_list] _row2 col2
puts -nonewline stdout "\033\[u";flush stdout
return "$col1 -> $col2"
}
proc pagebyte {page num} {
set encpage [encname $page]
set ch [format %c $num]
if {[decodable $ch $encpage]} {
#set outchar [encoding convertto $encpage [format %c $num]]
set outchar [format %c $num]
} else {
set outchar $::punk::char::invalid_display_char
}
return $outchar
}
proc all_pages {} {
set out ""
set mimenamesdict [page_names_dict]
foreach encname [encoding names] {
if {[tcl::dict::exists $mimenamesdict $encname]} {
set alt "([tcl::dict::get $mimenamesdict $encname])"
} else {
set alt ""
}
append out "$encname $alt" \n
append out [page $encname]
}
return $out
}
proc encname {encoding_name_or_alias} {
set encname $encoding_name_or_alias
if {[lsearch -nocase [page_names] $encname] < 0} {
error "Unknown encoding '$encname' - use 'punk::char::page_names' to see valid encoding names on this system"
}
variable encmimens
if {$encname ni [encoding names]} {
set encname [${encmimens}::reversemapencoding $encname]
}
return $encname
}
proc pagedict_raw {encname} {
variable invalid ;# ="???"
set encname [encname $encname]
set d [tcl::dict::create]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
#tcl::dict::set d $k [encoding convertfrom $encname [format %c $i]]
set ch [format %c $i] ;
#jmn
if {[decodable $ch $encname]} {
#set encchar [encoding convertto $encname $ch]
#tcl::dict::set d $k [encoding convertfrom $encchar]
tcl::dict::set d $k [encoding convertfrom $encname $ch]
} else {
tcl::dict::set d $k $invalid ;#use replacement so we can detect difference from actual "?"
}
}
return $d
}
proc asciidict {} {
variable invalid
set d [tcl::dict::create]
set a128 [asciidict128]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
if {$i <= 127} {
tcl::dict::set d $k [tcl::dict::get $a128 $k]
} else {
#
tcl::dict::set d $k $invalid
}
if {$i <=32} {
#no point emitting the lower control characters raw to screen - emit the short-names defined in the 'ascii' proc
tcl::dict::set d $k [tcl::dict::get $a128 $k]
} else {
if {$i == 0x9b} {
tcl::dict::set d $k CSI ;#don't emit the ansi 'Control Sequence Introducer' - or it will be interpreted by the console and affect the layout.
} else {
tcl::dict::set d $k [format %c $i]
}
}
}
return $d
}
proc basedict_display {} {
set d [tcl::dict::create]
set a128 [asciidict128]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
if {$i <=32} {
#no point emitting the lower control characters raw to screen - emit the short-names defined in the 'ascii' proc
tcl::dict::set d $k [tcl::dict::get $a128 $k]
} else {
if {$i == 0x9b} {
tcl::dict::set d $k CSI ;#don't emit the ansi 'Control Sequence Introducer' - or it will be interpreted by the console and affect the layout.
} elseif {$i == 0x9c} {
tcl::dict::set d $k OSC
} else {
#tcl::dict::set d $k [encoding convertfrom [encoding system] [format %c $i]]
#don't use encoding convertfrom - we want the value independent of the current encoding system.
tcl::dict::set d $k [format %c $i]
}
}
}
return $d
}
proc basedict_encoding_system {} {
#result depends on 'encoding system' currently in effect
set d [tcl::dict::create]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
tcl::dict::set d $k [encoding convertfrom [encoding system] [format %c $i]]
}
return $d
}
proc basedict {} {
#this gives same result independent of current value of 'encoding system'
set d [tcl::dict::create]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
tcl::dict::set d $k [format %c $i]
}
return $d
}
proc pagedict {pagename args} {
variable charsets
set encname [encname $pagename]
set defaults [list\
-range {0 255}\
-charset ""\
]
set opts [tcl::dict::merge $defaults $args]
# -- --- --- --- --- --- --- --- --- ---
set range [tcl::dict::get $opts -range]
set charset [tcl::dict::get $opts -charset]
# -- --- --- --- --- --- --- --- --- ---
if {$charset ne ""} {
if {$charset ni [charset_names]} {
error "unknown charset '$charset' - use 'charset_names' to get list"
}
set setinfo [tcl::dict::get $charsets $charset]
set ranges [tcl::dict::get $setinfo ranges]
set charset_dict [tcl::dict::create]
foreach r $ranges {
set start [tcl::dict::get $r start]
set end [tcl::dict::get $r end]
#set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end]]
break
}
} else {
set start [lindex $range 0]
set end [lindex $range 1]
}
set d [tcl::dict::create]
for {set i $start} {$i <= $end} {incr i} {
set k [format %02x $i]
tcl::dict::set d $k [encoding convertfrom $encname [format %c $i]]
}
return $d
}
#todo - benchmark peformance - improve punk pipeline
proc asciidict128 {} {
regexp -all -inline {\S+} [concat {*}[linelist -line trimleft [ascii]]]
}
proc _asciidict128 {} {
.= ascii |> .=> linelist -line trimleft |> .=* concat |> {regexp -all -inline {\S+} $data}
}
#review - use terminal to display actual supported DEC specials vs using dict at: punk::ansi::map_special_graphics which maps to known unicode equivalents
proc asciidict2 {} {
set d [tcl::dict::create]
tcl::dict::for {k v} [basedict_display] {
if {[tcl::string::length $v] == 1} {
set num [expr {"0x$k"}]
#tcl::dict::set d $k "\033(0[subst \\u00$k]\033(B"
tcl::dict::set d $k "\033(0[format %c $num]\033(B"
} else {
tcl::dict::set d $k $v
}
}
return $d
}
#-- --- --- --- --- --- --- ---
# encoding convertfrom & encoding convertto can be somewhat confusing to think about. (Need to think in reverse.)
# e.g encoding convertto dingbats <somethingpretty> will output something that doesn't look dingbatty on screen.
#-- --- --- --- --- --- --- ---
#must use Tcl instead of tcl (at least for 8.6)
if {![package vsatisfies [package present Tcl] 8.7-]} {
proc encodable "s {enc [encoding system]}" {
set encname [encname $enc]
if {($encname eq "ascii")} {
#8.6 fails to round-trip convert 0x7f del character despite it being in the ascii range (review Why?? what else doesn't round-trip but should?)
#just strip it out of the string as we are only after a boolean answer and if s is only a single del char empty string will result in true
set s [tcl::string::map [list [format %c 0x7f] ""] $s]
}
string eq $s [encoding convertfrom $encname [encoding convertto $encname $s]]
}
#note also that tcl8.6 has anomalies with how it handles some unassigned codepoints
# e.g unassigned codes in the middle of a codepage may appear to be encodable&decodable in a round trip whereas undefined codepoints at the end may get the replacement character defined in the tcl encodings dir (usually the 3f char: "?")
proc decodable "s {enc [encoding system]}" {
set encname [encname $enc]
#review
string eq $s [encoding convertto $encname [encoding convertfrom $encname $s]]
}
} else {
#review - use -profile?
proc encodable "s {enc [encoding system]}" {
set encname [encname $enc]
if {![catch {
string eq $s [encoding convertfrom $encname [encoding convertto $encname $s]]
} result]} {
return $result
} else {
return 0
}
}
proc decodable "s {enc [encoding system]}" {
set encname [encname $enc]
if {![catch {
string eq $s [encoding convertto $encname [encoding convertfrom $encname $s]]
} result]} {
return $result
} else {
return 0
}
}
}
#-- --- --- --- --- --- --- ---
proc test_japanese {{encoding jis0208}} {
#A very basic test of 2char encodings such as jis0208
set yatbun ;# encoding convertfrom jis0208 F|K\\
lassign [split $yatbun] yat bun
puts "original yatbun ${yat} ${bun}"
set eyat [encoding convertto $encoding $yat]
set ebun [encoding convertto $encoding $bun]
puts "$encoding encoded: ${eyat} ${ebun}"
puts "reencoded: [encoding convertfrom $encoding $eyat] [encoding convertfrom $encoding $ebun]"
return $yatbun
}
proc test_grave {} {
set g [format %c 0x300]
puts stdout "Testing console display of grave accented a in between letters x and y - accent should combine over the top of the letter a."
puts stdout "Apparent width should theoretically be 1 console-column"
package require punk::console
puts stdout "# -- --- --- ---"
puts -nonewline "xa${g}z";set cursorposn [punk::console::get_cursor_pos]
puts stdout \n
puts stdout "cursor position immediately after outputing 4 bytes (expecting 3 glyphs): $cursorposn"
puts stdout "# -- --- --- ---"
puts -nonewline "xyz";set cursorposn [punk::console::get_cursor_pos]
puts stdout \n
puts stdout "cursor position immediately after outputing 3 bytes (xyz): $cursorposn"
}
proc test_zalgo {} {
#from: https://github.com/jameslanska/unicode-display-width/blob/5e28d94c75e8c421a87199363b85c90dc37125b8/docs/unicode_background.md
#see: https://lingojam.com/ZalgoText
puts stdout "44 chars long - 9 graphemes - 9 columns wide"
set str "̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍̓̓ͅ"
}
proc test_zalgo2 {} {
# ------------------------
set str "Z̸̢͉̣͔̭̪͙̖̳̘͈͍̤̩̟͈͈͕̯̅̏̆̓̌́́͌̿̕ͅą̷̦̤̫̩͕̥̐̎̓́́̂̀͆́̔̄̈́̏̌́̆͜͜͝͠l̴̩̙̺͚̟͇͖͔͕̹̟͌̈́̄̇́̉̋̕̚͜͠͠g̸͇̩͔̺̝̗̥̖̙̑̈́̈́̿̾̌͌͊̈̀͊̑̇̑͌̍̅̌͊͜͠͝ǫ̴̢̧͎͔̞̤̻̱̮̬͕̗̭͍̣̠̳̆̀̋̓͒̾̄͜͝͠͝T̴̛͉̬̋́̈́̈́̓͗̍̏̔̈̋̀͐̓̎̅̚̕͠͝͝ê̵̖̖̈͊̔̓̔̐̓̃͊̋͋̎͂͋̕x̴̡͎͖̼͎̦͎̲̪̘̺̯̲̬̮̥̼̰͌̃͜ͅt̶̛̘͎̰̔͐͌́̈́͊͗͌̅͂̐̆͌̀͂͘"
# ------------------------
}
proc test_zalgo3 {} {
# ------------------------
set str "Ẕ̸̢̼̺̜̰̣̣̖̭̜͓͖̺̥̼̠͙͙̟̥̟̠̤̫͍̠̤̮͚̝̜̙͈̦̙̩̹̙̜̩̦͔͕̈̃̅̇̈́͂̇̽̈́́́́̎͊̂̑̋͆̔̾͋̚͜ͅã̵̛̪̽̎̃͑̒̇͋͂̽̃̍͊̀̈̊̊̔̊̆̈́͗͑͗̽̄̋͗̄͌̑͊͝͝͠ͅl̵̢͇̖͉͖̝̹̜̞͓͎͍͈̞̱̙͙̦̪͔̱̮͈͉̼͎̭̝̯͇͚̺̟̱͙̳̰̙͚͖̝̫͙̎̅̃͆̈́̋̌̔̋̋͗̈́̔̐͆̈́̓̾̄̀́̏́͒̆̌͒̈́̈́̾̏̀͜͝g̸̖͂͋̊͗̈́̓̆̈̋̒͐̕o̶̧̢͓̲̗̠̘͕̦̤̹̗͉̰͔̯͓̭̹̻͔͇̯̜̙̍̂̃̃̀̓͌̒͊̊̋̒̿̿̌͐̄͗̾̕͝T̶̛̳̜̰͍̹̻̫̠̱̼̼̙̆̑̾̾͛́́̿͋͌̎̀̀̽̆͌̽̂̈̇̅̇̃́̿͗̾͒̎͊̑̾͝͠ȩ̸̢̨̛̛̛̭͔͎͇̫͎͈̲̻̙͕͎̣͈̩̺̗͖͙͇͌͐̒̎͐̓́̉́͐̓̐͌̅̊͋͑̈́͗͑̏̕͜͜͝ͅx̸̧̧̛̖̣̥̘͎͎̳̭̙̦̝͖̝̮̱̹̯̺̙̩̯̮͖̻̰͓̰̩͇̥̑͌̊̐̉̏͆̓̑̎̊̓͒̂̄̆͆̀̊̄̈̽͐͛̏͊̓̌͑́̎̈́̍̈́̊͗̉̋͆̿̊͘͜͜͝͝ͅͅͅt̵̡̨̜̘͎̦͚̠̗̺͉̼̠̲̠͙̺̹̗̲̏̈́̂̚͜͜͝ͅ"
# ------------------------
}
proc test_farmer {} {
#an interesting article re grapheme clustering problems in terminals https://mitchellh.com/writing/grapheme-clusters-in-terminals
#(similar to the problem with grave accent rendering width that the test_grave proc is written for)
# -- --- --- --- ---
#These pasted glyphs can display in console even when the unicode versions don't (tcl 8.6 limited to 65533/FFFD ?)
upvar farmer1_paste test_farmer1
upvar farmer2_paste test_farmer2
set test_farmer1 🧑🌾 ;#contains zero-width joiner between
set test_farmer2 🧑🌾
puts "pasted farmer1 exporting as var farmer1_paste: $test_farmer1"
puts "pasted farmer2 exporting as var farmer2_paste: $test_farmer2"
# -- --- --- --- ---
set farmer1 "\U0001f9d1\U0000200d\U0001f33e"
set farmer2 "\U0001f9d1\U0001f33e"
puts stdout "farmer1 with zero-width joiner, codes: \\U0001f9d1\\U0000200d\\U0001f33e : $farmer1"
puts stdout "farmer2 with no joiner codes: \\U0001f9d1\\U001f33e : $farmer2"
package require punk::console
puts stdout \n
puts stdout "#2--5---9---C---"
puts -nonewline stdout \033\[2K\033\[1G ;#2K erase line 1G cursor at col1
puts -nonewline "${farmer1}";set cursorposn [punk::console::get_cursor_pos_list]
puts stdout "\ncursor position immediately after outputing farmer1 (expecting 1 glyph 2 wide) : cursor at col [lindex $cursorposn 1]"
if {[lindex $cursorposn 1] eq "3"} {
puts stdout "[a+ green]OK[a]"
} else {
puts stdout "[a+ red]ERR - expected cursor position to be 3 after emitting farmer1[a]"
}
puts stdout "----------------"
puts stdout "#2--5---9---C---"
puts -nonewline "${farmer2}";set cursorposn [punk::console::get_cursor_pos_list]
puts stdout "\ncursor position immediately after outputing farmer2 (expecting 2 glyphs 4 wide in total): cursor at col [lindex $cursorposn 1]"
if {[lindex $cursorposn 1] eq "5"} {
puts stdout "[a+ green]OK[a]"
} else {
puts stdout "[a+ red]ERR - expected cursor position to be 5 after emitting farmer2[a]"
}
puts stdout "----------------"
puts "returning farmer1 - should be single glyph"
return $farmer1
}
#G0 Sets Sequence G1 Sets Sequence Meaning
#ESC ( A ESC ) A United Kingdom Set
#ESC ( B ESC ) B ASCII Set
#ESC ( 0 ESC ) 0 Special Graphics
#ESC ( 1 ESC ) 1 Alternate Character ROM Standard Character Set
#ESC ( 2 ESC ) 2 Alternate Character ROM Special Graphic
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Unicode character sets - some hardcoded - some loadable from data files
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
variable charinfo [tcl::dict::create]
variable charsets [tcl::dict::create]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Aggregate character sets (ones that pick various ranges from underlying unicode ranges)
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
tcl::dict::set charsets "WGL4" [list altname "Windows Glyph List 4" ranges [list\
{start 0 end 127 name "basic latin"}\
{start 128 end 255 name "latin-1 supplement"}\
{start 256 end 383 name "Latin Extended-A"}\
{start 402 end 402 name "subset Latin Extended-B"}\
{start 506 end 511 name "subset Latin Extended-B"}\
{start 710 end 711 name "subset Spacing Modifier Letters"}\
{start 713 end 713 name "subset Spacing Modifier Letters"}\
{start 728 end 733 name "subset Spacing Modifier Letters"}\
{start 900 end 906 name "subset Greek"}\
{start 908 end 908 name "subset Greek"}\
{start 910 end 974 name "subset Greek"}\
{start 1024 end 1119 name "subset Cyrillic"}\
{start 1168 end 1169 name "subset Cyrillic"}\
{start 7808 end 7813 name "subset Latin Extended Additional"}\
{start 7922 end 7923 name "subset Latin Extended Additional"}\
{start 8211 end 8213 name "subset General Punctuation"}\
{start 8215 end 8222 name "subset General Punctuation"}\
{start 8224 end 8226 name "subset General Punctuation"}\
{start 8230 end 8230 name "subset General Punctuation"}\
{start 8240 end 8240 name "subset General Punctuation"}\
{start 8242 end 8243 name "subset General Punctuation"}\
] description "Microsoft WGL4 Repertoire" settype "other"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
#The base page 0-256 8bit range - values don't have specific characters or descriptions - as they are codepage dependent
#we will fill this here for completeness - but with placeholders
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
tcl::dict::set charsets "8bit" [list ranges [list {start 0 end 127 name ascii} {start 128 end 255 name 8bit-ascii}] description "8bit extended ascii range" settype "other"]
for {set i 0} {$i < 256} {incr i} {
tcl::dict::set charinfo $i [list desc "codepage-dependent" short "byte_[format %02x $i]"]
}
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Unicode ranges
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
tcl::dict::set charsets "greek" [list ranges [list {start 880 end 1023} {start 7936 end 8191}] description "Greek and Coptic" settype "other"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
tcl::dict::set charsets "Block Elements" [list ranges [list {start 9600 end 9631}] description "Block Elements" settype "other"]
tcl::dict::set charinfo 9600 [list desc "Upper Half Block" short "blocke_up_half"]
tcl::dict::set charinfo 9601 [list desc "Lower One Eighth Block" short "blocke_lw_1_8th"]
tcl::dict::set charinfo 9602 [list desc "Lower One Quarter Block" short "blocke_lw_1_qtr"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
tcl::dict::set charsets "Dingbats" [list ranges [list {start 9984 end 10175 }] description "Unicode Dingbats" settype "tcl_supplemented"]
tcl::dict::set charinfo 9984 [list desc "Black Safety Scissors" short "dingbats_black_safety_scissors"]
#...
tcl::dict::set charinfo 10175 [list desc "Double Curly Loop" short "dingbats_double_curly_loop"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
#variation selectors 0xFe01 - 0xFE0F
tcl::dict::set charsets "Variation Selectors" [list ranges [list {start 65024 end 65039}] description "Variation Selectors" note "combining character with previous char - variant glyph display" settype "tcl_supplemented"]
tcl::dict::set charinfo 65024 [list desc "Variation Selector-1" short "VS1"]
tcl::dict::set charinfo 65025 [list desc "Variation Selector-2" short "VS2"]
tcl::dict::set charinfo 65026 [list desc "Variation Selector-3" short "VS3"]
tcl::dict::set charinfo 65027 [list desc "Variation Selector-4" short "VS4"]
tcl::dict::set charinfo 65027 [list desc "Variation Selector-5" short "VS5"]
tcl::dict::set charinfo 65029 [list desc "Variation Selector-6" short "VS6"]
tcl::dict::set charinfo 65030 [list desc "Variation Selector-7" short "VS7"]
tcl::dict::set charinfo 65031 [list desc "Variation Selector-8" short "VS8"]
tcl::dict::set charinfo 65032 [list desc "Variation Selector-9" short "VS9"]
tcl::dict::set charinfo 65033 [list desc "Variation Selector-10" short "VS10"]
tcl::dict::set charinfo 65034 [list desc "Variation Selector-11" short "VS11"]
tcl::dict::set charinfo 65035 [list desc "Variation Selector-12" short "VS12"]
tcl::dict::set charinfo 65036 [list desc "Variation Selector-13" short "VS13"]
tcl::dict::set charinfo 65037 [list desc "Variation Selector-14" short "VS14"]
tcl::dict::set charinfo 65038 [list desc "Variation Selector-15 text variation" short "VS15"] ;#still an image - just more suitable for text-presentation e.g word-processing doc
tcl::dict::set charinfo 65039 [list desc "Variation Selector-16 emoji variation" short "VS16"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# emoticons https://www.unicode.org/charts/PDF/U1F600.pdf
tcl::dict::set charsets "Emoticons" [list ranges [list {start 128512 end 128591}] description "Emoticons" settype "tcl_supplemented"]
tcl::dict::set charinfo 128512 [list desc "Grinning Face" short "emoticon_gface"]
tcl::dict::set charinfo 128513 [list desc "Grinning Face with Smiling Eyes" short "emoticon_gface_smile_eyes"]
tcl::dict::set charinfo 128514 [list desc "Face with Tears of Joy" short "emoticon_face_tears_joy"]
#todo
tcl::dict::set charinfo 128590 [list desc "Person with Pouting Face" short "emoticon_person_pout"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
tcl::dict::set charsets "Box Drawing" [list ranges [list {start 9472 end 9599}] description "Box Drawing" settype "tcl_supplemented"]
tcl::dict::set charinfo 9472 [list desc "Box Drawings Light Horizontal" short "boxd_lhz"]
tcl::dict::set charinfo 9473 [list desc "Box Drawings Heavy Horizontal" short "boxd_hhz"]
tcl::dict::set charinfo 9474 [list desc "Box Drawings Light Vertical" short "boxd_lv"]
tcl::dict::set charinfo 9475 [list desc "Box Drawings Heavy Vertical" short "boxd_hv"]
tcl::dict::set charinfo 9476 [list desc "Box Drawings Light Triple Dash Horizontal" short "boxd_ltdshhz"]
tcl::dict::set charinfo 9477 [list desc "Box Drawings Heavy Triple Dash Horizontal" short "boxd_htdshhz"]
tcl::dict::set charinfo 9478 [list desc "Box Drawings Light Triple Dash Vertical" short "boxd_ltdshv"]
tcl::dict::set charinfo 9479 [list desc "Box Drawings Heavy Triple Dash Vertical" short "boxd_htdshv"]
tcl::dict::set charinfo 9480 [list desc "Box Drawings Light Quadruple Dash Horizontal" short "boxd_lqdshhz"]
tcl::dict::set charinfo 9481 [list desc "Box Drawings Heavy Quadruple Dash Horizontal" short "boxd_hqdshhz"]
tcl::dict::set charinfo 9482 [list desc "Box Drawings Light Quadruple Dash Vertical" short "boxd_lqdshv"]
tcl::dict::set charinfo 9483 [list desc "Box Drawings Heavy Quadruple Dash Vertical" short "boxd_hqdshv"]
tcl::dict::set charinfo 9484 [list desc "Box Drawings Light Down and Right" short "boxd_ldr"]
tcl::dict::set charinfo 9485 [list desc "Box Drawings Down Light and Right Heavy" short "boxd_dlrh"]
tcl::dict::set charinfo 9486 [list desc "Box Drawings Down Heavy and Right Light" short "boxd_dhrl"]
tcl::dict::set charinfo 9487 [list desc "Box Drawings Heavy Down and Right" short "boxd_hdr"]
tcl::dict::set charinfo 9488 [list desc "Box Drawings Light Down and Left" short "boxd_ldl"]
tcl::dict::set charinfo 9489 [list desc "Box Drawings Down Light and Left Heavy" short "boxd_dllh"]
tcl::dict::set charinfo 9490 [list desc "Box Drawings Down Heavy and Left Light" short "boxd_dhll"]
tcl::dict::set charinfo 9491 [list desc "Box Drawings Heavy Down and Left" short "boxd_hdl"]
tcl::dict::set charinfo 9492 [list desc "Box Drawings Light Up and Right" short "boxd_lur"]
tcl::dict::set charinfo 9493 [list desc "Box Drawings Up Light and Right Heavy" short "boxd_ulrh"]
tcl::dict::set charinfo 9494 [list desc "Box Drawings Up Heavy and Right Light" short "boxd_uhrl"]
tcl::dict::set charinfo 9495 [list desc "Box Drawings Heavy Up and Right" short "boxd_hur"]
tcl::dict::set charinfo 9496 [list desc "Box Drawings Light Up and Left" short "boxd_lul"]
tcl::dict::set charinfo 9497 [list desc "Box Drawings Up Light and Left Heavy" short "boxd_ullh"]
tcl::dict::set charinfo 9498 [list desc "Box Drawings Up Heavy and Left Light" short "boxd_uhll"]
tcl::dict::set charinfo 9499 [list desc "Box Drawings Heavy Up and Left" short "boxd_hul"]
tcl::dict::set charinfo 9500 [list desc "Box Drawings Light Vertical and Right" short "boxd_lvr"]
tcl::dict::set charinfo 9501 [list desc "Box Drawings Vertical Light and Right Heavy" short "boxd_vlrh"]
tcl::dict::set charinfo 9502 [list desc "Box Drawings Up Heavy and Right Down Light" short "boxd_uhrdl"]
tcl::dict::set charinfo 9503 [list desc "Box Drawings Down Heavy and Right Up Light" short "boxd_dhrul"]
tcl::dict::set charinfo 9504 [list desc "Box Drawings Vertical Heavy and Right Light" short "boxd_vhrl"]
tcl::dict::set charinfo 9505 [list desc "Box Drawings Down Light and Right Up Heavy" short "boxd_dlruh"]
tcl::dict::set charinfo 9506 [list desc "Box Drawings Up Light and Right Down Heavy" short "boxd_ulrdh"]
tcl::dict::set charinfo 9507 [list desc "Box Drawings Heavy Vertical and Right" short "boxd_hvr"]
tcl::dict::set charinfo 9508 [list desc "Box Drawings Light Vertical and Left" short "boxd_lvl"]
tcl::dict::set charinfo 9509 [list desc "Box Drawings Vertical Light and Left Heavy" short "boxd_vllh"]
tcl::dict::set charinfo 9510 [list desc "Box Drawings Up Heavy and Let Down Light" short "boxd_uhldl"]
tcl::dict::set charinfo 9511 [list desc "Box Drawings Down Heavy and Left Up Light" short "boxd_dhlul"]
tcl::dict::set charinfo 9512 [list desc "Box Drawings Vertical Heavy and Left Light" short "boxd_vhll"]
tcl::dict::set charinfo 9513 [list desc "Box Drawings Down Light and left Up Heavy" short "boxd_dlluh"]
tcl::dict::set charinfo 9514 [list desc "Box Drawings Up Light and Left Down Heavy" short "boxd_ulldh"]
tcl::dict::set charinfo 9515 [list desc "Box Drawings Heavy Vertical and Left" short "boxd_hvl"]
tcl::dict::set charinfo 9516 [list desc "Box Drawings Light Down and Horizontal" short "boxd_ldhz"]
tcl::dict::set charinfo 9517 [list desc "Box Drawings Left Heavy and Right Down Light" short "boxd_lhrdl"]
tcl::dict::set charinfo 9518 [list desc "Box Drawings Right Heavy and Left Down Light" short "boxd_rhldl"]
tcl::dict::set charinfo 9519 [list desc "Box Drawings Down Light and Horizontal Heavy" short "boxd_dlhzh"]
tcl::dict::set charinfo 9520 [list desc "Box Drawings Down Heavy and Horizontal Light" short "boxd_dhhzl"]
tcl::dict::set charinfo 9521 [list desc "Box Drawings Right Light and Left Down Heavy" short "boxd_rlldh"]
tcl::dict::set charinfo 9522 [list desc "Box Drawings Left Light and Right Down Heavy" short "boxd_llrdh"]
tcl::dict::set charinfo 9523 [list desc "Box Drawings Heavy Down and Horizontal" short "boxd_hdhz"]
tcl::dict::set charinfo 9524 [list desc "Box Drawings Light Up and Horizontal" short "boxd_luhz"]
tcl::dict::set charinfo 9525 [list desc "Box Drawings Left Heavy and Right Up Light" short "boxd_lhrul"]
tcl::dict::set charinfo 9526 [list desc "Box Drawings Right Heavy and Left Up Light" short "boxd_rhlul"]
tcl::dict::set charinfo 9527 [list desc "Box Drawings Up Light and Horizontal Heavy" short "boxd_ulhzh"]
tcl::dict::set charinfo 9528 [list desc "Box Drawings Up Heavy and Horizontal Light" short "boxd_uhhzl"]
tcl::dict::set charinfo 9529 [list desc "Box Drawings Right Light and Left Up Heavy" short "boxd_rlluh"]
tcl::dict::set charinfo 9530 [list desc "Box Drawings Left Light and Right Up Heavy" short "boxd_llruh"]
tcl::dict::set charinfo 9531 [list desc "Box Drawings Heavy Up and Horizontal" short "boxd_huhz"]
tcl::dict::set charinfo 9532 [list desc "Box Drawings Light Vertical and Horizontal" short "boxd_lvhz"]
tcl::dict::set charinfo 9533 [list desc "Box Drawings Left Heavy and Right Vertical Light" short "boxd_lhrvl"]
tcl::dict::set charinfo 9534 [list desc "Box Drawings Right Heavy and Left Vertical Light" short "boxd_rhlvl"]
tcl::dict::set charinfo 9535 [list desc "Box Drawings Vertical Light and Horizontal Heavy" short "boxd_vlhzh"]
tcl::dict::set charinfo 9536 [list desc "Box Drawings Up Heavy and Down Horizontal Light" short "boxd_uhdhzl"]
tcl::dict::set charinfo 9537 [list desc "Box Drawings Down Heavy and Up Horizontal Light" short "boxd_dhuhzl"]
tcl::dict::set charinfo 9538 [list desc "Box Drawings Vertical Heavy and Horizontal Light" short "boxd_vhhzl"]
tcl::dict::set charinfo 9539 [list desc "Box Drawings Left Up Heavy and Right Down Light" short "boxd_luhrdl"]
tcl::dict::set charinfo 9540 [list desc "Box Drawings Right Up Heavy and Left Down Light" short "boxd_ruhldl"]
tcl::dict::set charinfo 9541 [list desc "Box Drawings Left Down Heavy and Right Up Light" short "boxd_ldhrul"]
tcl::dict::set charinfo 9542 [list desc "Box Drawings Right Down Heavy and Left Up Light" short "boxd_rdhlul"]
tcl::dict::set charinfo 9543 [list desc "Box Drawings Down Light and Up Horizontal Heavy" short "boxd_dluhzh"]
tcl::dict::set charinfo 9544 [list desc "Box Drawings Up Light and Down Horizontal Heavy" short "boxd_dldhzh"]
tcl::dict::set charinfo 9545 [list desc "Box Drawings Right Light and Left Vertical Heavy" short "boxd_rllvh"]
tcl::dict::set charinfo 9546 [list desc "Box Drawings Left Light and Right Vertical Heavy" short "boxd_llrvh"]
tcl::dict::set charinfo 9547 [list desc "Box Drawings Heavy Vertical and Horizontal" short "boxd_hvhz"]
tcl::dict::set charinfo 9548 [list desc "Box Drawings Light Double Dash Horizontal" short "boxd_lddshhz"]
tcl::dict::set charinfo 9549 [list desc "Box Drawings Heavy Double Dash Horizontal" short "boxd_hddshhz"]
tcl::dict::set charinfo 9550 [list desc "Box Drawings Light Double Dash Vertical" short "boxd_lddshv"]
tcl::dict::set charinfo 9551 [list desc "Box Drawings Heavy Double Dash Vertical" short "boxd_hddshv"]
tcl::dict::set charinfo 9552 [list desc "Box Drawings Double Horizontal" short "boxd_dhz"]
tcl::dict::set charinfo 9553 [list desc "Box Drawings Double Vertical" short "boxd_dv"]
tcl::dict::set charinfo 9554 [list desc "Box Drawings Down Single and Right Double" short "boxd_dsrd"]
tcl::dict::set charinfo 9555 [list desc "Box Drawings Down Double and Right Single" short "boxd_ddrs"]
tcl::dict::set charinfo 9556 [list desc "Box Drawings Double Down and Right" short "boxd_ddr"]
tcl::dict::set charinfo 9557 [list desc "Box Drawings Down Single and Left Double" short "boxd_dsld"]
tcl::dict::set charinfo 9558 [list desc "Box Drawings Down Double and Left Single" short "boxd_ddls"]
tcl::dict::set charinfo 9559 [list desc "Box Drawings Double Down and Left" short "boxd_ddl"]
tcl::dict::set charinfo 9560 [list desc "Box Drawings Up Single and Right Double" short "boxd_usrd"]
tcl::dict::set charinfo 9561 [list desc "Box Drawings Up Double and Right Single" short "boxd_udrs"]
tcl::dict::set charinfo 9562 [list desc "Box Drawings Double Up and Right" short "boxd_dur"]
tcl::dict::set charinfo 9563 [list desc "Box Drawings Up Single and Left Double" short "boxd_usld"]
tcl::dict::set charinfo 9564 [list desc "Box Drawings Up Double and Left Single" short "boxd_udls"]
tcl::dict::set charinfo 9565 [list desc "Box Drawings Double Up and Left" short "boxd_dul"]
tcl::dict::set charinfo 9566 [list desc "Box Drawings Vertical Single and Right Double" short "boxd_vsrd"]
tcl::dict::set charinfo 9567 [list desc "Box Drawings Vertical Double and Right Single" short "boxd_vdrs"]
tcl::dict::set charinfo 9568 [list desc "Box Drawings Double Vertical and Right" short "boxd_dvr"]
tcl::dict::set charinfo 9569 [list desc "Box Drawings Vertical Single and Left Double" short "boxd_vsld"]
tcl::dict::set charinfo 9570 [list desc "Box Drawings Vertical Double and Left Single" short "boxd_vdls"]
tcl::dict::set charinfo 9571 [list desc "Box Drawings Double Vertical and Left" short "boxd_dvl"]
tcl::dict::set charinfo 9572 [list desc "Box Drawings Down Single and Horizontal Double" short "boxd_dshzd"]
tcl::dict::set charinfo 9573 [list desc "Box Drawings Down Double and Horizontal Single" short "boxd_ddhzs"]
tcl::dict::set charinfo 9574 [list desc "Box Drawings Double Down and Horizontal" short "boxd_ddhz"]
tcl::dict::set charinfo 9575 [list desc "Box Drawings Up Single and Horizontal Double" short "boxd_ushzd"]
tcl::dict::set charinfo 9576 [list desc "Box Drawings Up Double and Horizontal Single" short "boxd_udhzs"]
tcl::dict::set charinfo 9577 [list desc "Box Drawings Double Up and Horizontal" short "boxd_duhz"]
tcl::dict::set charinfo 9578 [list desc "Box Drawings Vertical Single and Horizontal Double" short "boxd_vshzd"]
tcl::dict::set charinfo 9579 [list desc "Box Drawings Vertical Double and Horizontal Single" short "boxd_vdhzs"]
tcl::dict::set charinfo 9580 [list desc "Box Drawings Double Vertical and Horizontal" short "boxd_dvhz"]
tcl::dict::set charinfo 9581 [list desc "Box Drawings Light Arc Down and Right" short "boxd_ladr"]
tcl::dict::set charinfo 9582 [list desc "Box Drawings Light Arc Down and Left" short "boxd_ladl"]
tcl::dict::set charinfo 9583 [list desc "Box Drawings Light Arc Up and Left" short "boxd_laul"]
tcl::dict::set charinfo 9584 [list desc "Box Drawings Light Arc Up and Right" short "boxd_laur"]
tcl::dict::set charinfo 9585 [list desc "Box Drawings Light Diagonal Upper Right To Lower Left" short "boxd_ldgurll"]
tcl::dict::set charinfo 9586 [list desc "Box Drawings Light Diagonal Upper Left To Lower Right" short "boxd_ldgullr"]
tcl::dict::set charinfo 9587 [list desc "Box Drawings Light Diagonal Cross" short "boxd_ldc"]
tcl::dict::set charinfo 9588 [list desc "Box Drawings Light Left" short "boxd_ll"]
tcl::dict::set charinfo 9589 [list desc "Box Drawings Light Up" short "boxd_lu"]
tcl::dict::set charinfo 9590 [list desc "Box Drawings Light Right" short "boxd_lr"]
tcl::dict::set charinfo 9591 [list desc "Box Drawings Light Down" short "boxd_ld"]
tcl::dict::set charinfo 9592 [list desc "Box Drawings Heavy Left" short "boxd_hl"]
tcl::dict::set charinfo 9593 [list desc "Box Drawings Heavy Up" short "boxd_hu"]
tcl::dict::set charinfo 9594 [list desc "Box Drawings Heavy Right" short "boxd_hr"]
tcl::dict::set charinfo 9595 [list desc "Box Drawings Heavy Down" short "boxd_hd"]
tcl::dict::set charinfo 9596 [list desc "Box Drawings Light Left and Heavy Right" short "boxd_llhr"]
tcl::dict::set charinfo 9597 [list desc "Box Drawings Light Up and Heavy Down" short "boxd_luhd"]
tcl::dict::set charinfo 9598 [list desc "Box Drawings Heavy Left and Light Right" short "boxd_hllr"]
tcl::dict::set charinfo 9599 [list desc "Box Drawings Heavy Up and Light Down" short "boxd_huld"]
tcl::dict::set charsets "Halfwidth and Fullwidth Forms" [list ranges [list {start 65280 end 65519}] description "Halfwidth and Fullwidth Forms (variants)" settype "tcl_supplemental"]
tcl::dict::set charsets "ascii_fullwidth" [list ranges [list {start 65281 end 65374}] description "Ascii 21 to 7E fullwidth" parentblock "halfwidth_and_fullwidth_forms" settype "other"]
tcl::dict::set charsets "Specials" [list ranges [list {start 65520 end 65535}] description "Specials" settype "tcl_supplemental"]
tcl::dict::set charsets "noncharacters" [list ranges [list\
{start 64976 end 65007 note "BMP FDD0..FDEF"}\
{start 65534 end 65535 note "BMP FFFE,FFFF"}\
{start 131070 end 131071 note "plane1 1FFFE,1FFFF"}\
{start 196606 end 196607 note "plane2 2FFFE,2FFFF"}\
{start 262142 end 262143 note "plane3 3FFFE,3FFFF"}\
{start 327678 end 327679 note "plane4 4FFFE,4FFFF"}\
{start 393214 end 393215 note "plane5 5FFFE,5FFFF"}\
{start 458750 end 458751 note "plane6 6FFFE,6FFFF"}\
{start 524286 end 524287 note "plane7 7FFFE,7FFFF"}\
{start 589822 end 589823 note "plane8 8FFFE,8FFFF"}\
{start 655358 end 655359 note "plane9 9FFFE,9FFFF"}\
{start 720894 end 720895 note "plane10 AFFFE,AFFFF"}\
{start 786430 end 786431 note "plane11 BFFFE,BFFFF"}\
{start 851966 end 851967 note "plane12 CFFFE,CFFFF"}\
{start 917502 end 917503 note "plane13 DFFFE,DFFFF"}\
{start 983038 end 983039 note "plane14 EFFFE,EFFFF"}\
{start 1048574 end 1048575 note "plane15 FFFFE,FFFFF"}\
{start 1114110 end 1114111 note "plane16 10FFFE,10FFFF"}\
] description "non-characters" settype "tcl_supplemental"]
#build dicts keyed on short
variable charshort
proc _build_charshort {} {
variable charshort
set charshort [tcl::dict::create]
variable charinfo
tcl::dict::for {k v} $charinfo {
if {[tcl::dict::exists $v short]} {
set sh [tcl::dict::get $v short]
if {[tcl::dict::exists $charshort $sh]} {
puts stderr "_build_charshort WARNING character data load duplicate shortcode '$sh'"
}
tcl::dict::set charshort $sh [format %c $k]
}
}
return [tcl::dict::size $charshort]
}
_build_charshort
variable charset_extents_startpoints ;#stores endpoints associated with each startpoint - but named after key which is startpoint.
variable charset_extents_endpoints ;#stores startpoints assoicated with each endpoint - but named after key which is endpoint.
variable charset_extents_rangenames ;# dict keyed on start,end pointing to list of 2-tuples {setname rangeindex_within_set}
#build 2 indexes for each range.(charsets are not just unicode blocks so can have multiple ranges)
#Note that a range could be as small as a single char (startpoint = endpoint) so there can be many ranges with same start and end if charsets use some of the same subsets.
#as each charset_extents_startpoins,charset_extents_endpoints is built - the associated range name and index is appended to the rangenames dict
#startpoints - key is startpoint of a range, value is list of endpoints one for each range starting at this startpoint-key
#endpoints - key is endpoint of a range, value is list of startpoints one for each range ending at this endpoint-key
proc _build_charset_extents {} {
variable charsets
variable charset_extents_startpoints
variable charset_extents_endpoints
variable charset_extents_rangenames
set charset_extents_startpoints [tcl::dict::create]
set charset_extents_endpoints [tcl::dict::create]
set charset_extents_rangenames [tcl::dict::create]
tcl::dict::for {setname setinfo} $charsets {
set ranges [tcl::dict::get $setinfo ranges]
if {[tcl::dict::get $setinfo settype] eq "block"} {
#unicode block must have a single range
#we consider a char a member of the block even if unassigned/reserved (as per unicode documentation)
set start [tcl::dict::get [lindex $ranges 0] start]
set end [tcl::dict::get [lindex $ranges 0] end]
if {![tcl::dict::exists $charset_extents_startpoints $start] || $end ni [tcl::dict::get $charset_extents_startpoints $start]} {
#assertion if end wasn't in startpoits list - then start won't be in endpoints list
tcl::dict::lappend charset_extents_startpoints $start $end
tcl::dict::lappend charset_extents_endpoints $end $start
}
tcl::dict::lappend charset_extents_rangenames ${start},${end} [list $setname 1]
} else {
#multirange sets/scripts. have holes. Char not a member if it's not explicitly in a defined range.
#They should be in order within a set - but we don't assume so
set r 1
foreach range $ranges {
set start [tcl::dict::get $range start]
set end [tcl::dict::get $range end]
if {![tcl::dict::exists $charset_extents_startpoints $start] || $end ni [tcl::dict::get $charset_extents_startpoints $start]} {
#assertion if end wasn't in startpoits list - then start won't be in endpoints list
tcl::dict::lappend charset_extents_startpoints $start $end
tcl::dict::lappend charset_extents_endpoints $end $start
}
tcl::dict::lappend charset_extents_rangenames ${start},${end} [list $setname $r]
incr r
}
}
}
#maintain in sorted order
#-stride is available in lsort even at tcl8.6 - but not in lsearch
set charset_extents_startpoints [lsort -stride 2 -integer $charset_extents_startpoints]
set charset_extents_endpoints [lsort -stride 2 -integer $charset_extents_endpoints]
#no need to sort charset_extents_rangenames - lookup only done using dict methods
return [tcl::dict::size $charset_extents_startpoints]
}
_build_charset_extents ;#rebuilds for all charsets
#nerdfonts are within the Private use E000 - F8FF range
proc load_nerdfonts {} {
variable charsets
variable charinfo
package require fileutil
set ver [package provide punk::char]
if {$ver ne ""} {
set ifneeded [package ifneeded punk::char [package provide punk::char]]
#puts stderr "punk::char ifneeded script: $ifneeded"
lassign [split $ifneeded ";"] _ sourceinfo
set basedir [file dirname [lindex $sourceinfo end]]
} else {
#review - will only work at package load time
set scr [info script]
if {$scr eq ""} {
error "load_nerdfonts unable to determine package folder"
}
set basedir [file dirname [info script]]
}
set pkg_data_dir [file join $basedir char]
set fname [file join $pkg_data_dir nerd-fonts-glyph-list.txt]
if {[file exists $fname]} {
#puts stderr "load_nerdfonts loading $fname"
set data [fileutil::cat -translation binary $fname]
set short_seen [tcl::dict::create]
set current_set_range [tcl::dict::create]
set filesets_loading [list]
foreach ln [split $data \n] {
set ln [tcl::string::trim $ln]
if {$ln eq ""} {continue}
set desc [lassign $ln hex rawsetname]
set hexnum 0x$hex
set dec [expr $hexnum]
set setname "nf_$rawsetname" ;#Ensure nerdfont set names are prefixed.
if {$setname ni $filesets_loading} {
if {![tcl::dict::exists $charsets $setname]} {
#set exists - but not in our filesets_loading list - therefore this set has been previously loaded, so clear old data first
dict unset charset $setname
}
set newrange [list start $dec end $dec]
tcl::dict::set current_set_range $setname $newrange
tcl::dict::set charsets $setname [list ranges [list $newrange] description "nerd fonts $rawsetname" settype "nerdfonts privateuse"]
lappend filesets_loading $setname
}
#expects ordered glyph list
set existing_range [tcl::dict::get $current_set_range $setname]
set existing_end [tcl::dict::get $existing_range end]
if {$dec - $existing_end == 1} {
#part of current range
tcl::dict::set current_set_range $setname end $dec
#overwrite last ranges element
set rangelist [lrange [tcl::dict::get $charsets $setname ranges] 0 end-1]
lappend rangelist [tcl::dict::get $current_set_range $setname]
tcl::dict::set charsets $setname ranges $rangelist
} else {
#new range for set
tcl::dict::set current_set_range $setname start $dec
tcl::dict::set current_set_range $setname end $dec
set rangelist [tcl::dict::get $charsets $setname ranges]
lappend rangelist [tcl::dict::get $current_set_range $setname]
tcl::dict::set charsets $setname ranges $rangelist
}
if {![tcl::dict::exists $charinfo $dec]} {
# -- ---
#review
set map [list beaufort bf gibbous gb crescent cr thunderstorm tstorm thermometer thermom]
lappend map {*}[list directory dir creativecommons ccom creative_commons ccom forwardslash fs]
lappend map {*}[list multimedia mm multiple multi outline outl language lang]
lappend map {*}[list odnoklassniki okru]
# -- ---
#consider other ways to unambiguously shorten names?
#normalize nf_fa & nf_fa 'o' element to 'outl' so outlines can be searched across sets more easily (o not necessarily at last position)
set normdesc [list]
foreach el $desc {
if {$el eq "o"} {
set el "outl"
}
lappend normdesc $el
}
set joined_desc [join $normdesc _]
#map after join so we can normalize some underscored elements e.g creativecommons & creative_commons
set mapped_desc [tcl::string::map $map $joined_desc]
set s nf_${rawsetname}_$mapped_desc
if {![tcl::dict::exists $short_seen $s]} {
tcl::dict::set short_seen $s {}
} else {
#duplicate in the data file (e.g 2023 weather night alt rain mix)
set s ${s}_$hex
}
tcl::dict::set charinfo $dec [list desc "$desc" short $s]
}
}
_build_charshort
_build_charset_extents
} else {
puts stderr "unable to find glyph file. Tried $fname"
}
}
proc package_base {} {
#assume punk::char is in .tm form and we can use the package provide statement to determine base location
#review
set pkgver [package present punk::char]
set pkginfo [package ifneeded punk::char $pkgver]
set tmfile [lindex $pkginfo end]
set pkg_base [file dirname $tmfile]
return $pkg_base
}
tcl::namespace::eval internal {
proc unicode_folder {} {
set parent [file join [punk::char::package_base] char]
set candidates [glob -nocomplain -type d -dir $parent -tail unicode*]
set candidates [lsort -increasing $candidates] ;#review - dictionary sort - how are unicode versions ranked/compared??
if {![llength $candidates]} {
error "Failed to find unicode data folder in folder '$parent'"
}
set folder [file join $parent [lindex $candidates end]]
return $folder
}
proc dict_getdef {dictValue args} {
if {[llength $args] < 2} {
error {wrong # args: should be "dict_getdef dictValue ?key ...? key default"}
}
set keys [lrange $args 0 end-1]
if {[tcl::dict::exists $dictValue {*}$keys]} {
return [tcl::dict::get $dictValue {*}$keys]
} else {
return [lindex $args end]
}
}
}
#charsets structure
#tcl::dict::set charsets "halfwidth_and_fullwidth_forms" [list ranges [list {start 65280 end 65519}] description "Halfwidth and Fullwidth Forms (variants) settype block"]
#unicode Blocks.txt
#load the defined blocks into 'charsets' and mark as type 'block'. Unicode blocks have only one range - and don't overlap.
#We don't treat unassigned/reserved codes within a block specially at this stage - ie we will not chop a block into subranges on that basis.
#unassigned code points should get certain default properties (e.g bidirectionality ) according to their block - so it makes sense to treat them as belonging to the block.
#They also get the general property of Cn (Other,not assigned or Other,reserved) and a "Basic Type" of Noncharacter or Reserved
proc load_unicode_blocks {} {
#sample data line
#0000..007F; Basic Latin
variable charsets
set file [file join [internal::unicode_folder] Blocks.txt]
if {![file exists $file]} {
error "Unicode Blocks.txt file not found at path '$file'"
}
puts "ok.. loading"
set fd [open $file r]
chan configure $fd -translation binary
set data [read $fd]
close $fd
set block_count 0
foreach ln [split $data \n] {
set ln [tcl::string::trim $ln]
if {[tcl::string::match #* $ln]} {
continue
}
if {[set pcolon [tcl::string::first ";" $ln]] > 0} {
set lhs [tcl::string::trim [tcl::string::range $ln 0 $pcolon-1]]
set name [tcl::string::trim [tcl::string::range $ln $pcolon+1 end]]
set lhsparts [split $lhs .]
set start [lindex $lhsparts 0]
set end [lindex $lhsparts end]
#puts "$start -> $end '$name'"
set decimal_start [expr {"0x$start"}]
set decimal_end [expr {"0x$end"}]
tcl::dict::set charsets $name [list ranges [list [list start $decimal_start end $decimal_end note "unicode block $lhs"]] description "" settype block]
incr block_count
}
}
_build_charset_extents
return $block_count
}
#unicode scripts
#unicode UnicodeData.txt
#https://www.unicode.org/reports/tr44/#Property_Values
#unicode EastAsianWidth.txt
#classify width of character - which is contextual in some cases
#####
#Review - this is initial naive assumption that should get us mostly what we want for layout purposes in a utf-8-centric world.
#We will just load the values and treat H,N,Na as 1-wide and A,F,W as 2-wide for functions such as char::string_width on the basis that those using legacy sets can query the property and make their own determinations in those contexts.
####
# -- ---
#A = Ambiguous - All characters that can be sometimes wide and sometimes narrow. (wide in east asian legacy sets, narrow in non-east asian usage) (private use chars considered ambiguous)
#F = East Asian Full-width
#H = East Asian Half-width
#N = Not east Asian (Neutral) - all other characters. (do not occur in legacy East Asian character sets) - treated like Na
#Na = East Asian Narrow - all other characters that are always narrow and have explicit full-width counterparts (e.g includes all of ASCII)
#W = East Asian Wide - all other characters that are always wide (Occur only in the context of Eas Asian Typography)
# -- ---
proc charshort {shortname} {
variable charshort
return [tcl::dict::get $charshort $shortname]
}
proc box_drawing {args} {
return [charset "Box Drawing" {*}$args]
}
proc box_drawing_dict {} {
return [charset_dict "Box Drawing"]
}
proc char_hex {char} {
return [format %08x [scan $char %c]]
}
proc char_info_hex {hex args} {
set hex [tcl::string::map [list _ ""] $hex]
if {[tcl::string::is xdigit -strict $hex]} {
#has no leading 0x
set dec [expr {"0x$hex"}]
} else {
set dec [expr {$hex}]
}
return [char_info_dec $dec {*}$args]
}
proc char_info {char args} {
#Note - on some versions of Tcl -e.g 8.6 use could supply something like \U1f600 (smiley icon) but we receive fffd (replacement special)
#there is no way to detect what the user intended ie we can't distinguish if they actually typed \UFFFD
#we can test if such mapping happens in general - and warn if codepoint is FFFD in the result dict
set returninfo [tcl::dict::create]
if {[tcl::string::equal \UFFFD $char] && [tcl::string::equal \U1F600 \UFFFD]} {
tcl::dict::set returninfo WARNING "this tcl maps multiple to FFFD"
}
lassign [scan $char %c%s] dec_char remainder
if {[tcl::string::length $remainder]} {
error "char_info requires a single character"
}
set result [tcl::dict::merge $returninfo [char_info_dec $dec_char {*}$args]]
}
proc char_info_dec {dec args} {
set dec_char [expr {$dec}]
set opts [tcl::dict::create\
-fields {default}\
-except {}\
]
#testwidth is so named because it peforms an actual test on the console using ansi escapes - and the name gives a hint that it is a little slow
set known_fields [list all default dec hex desc short testwidth char memberof] ;#maint fields from charinfo 'desc' 'short'
#todo - unicode properties
# tclwhitespace (different to unicode concept of whitespace. review )
foreach {k v} $args {
switch -- $k {
-fields - -except {
tcl::dict::set opts $k $v
}
default {
error "char_info unrecognised option '$k'. Known options:'[tcl::dict::keys $opts]' known_fields: $known_fields usage: char_info <char> ?-fields {<fieldnames>}? ?-except {<fieldnames>}?"
}
}
}
# -- --- --- --- --- --- --- --- --- --- --- ---
set opt_fields [tcl::dict::get $opts -fields]
set opt_except [tcl::dict::get $opts -except]
# -- --- --- --- --- --- --- --- --- --- --- ---
set initial_fields [list]
if {"default" in $opt_fields} {
set initial_fields $known_fields
if {"testwidth" ni $opt_fields} {
if {"testwidth" ni $opt_except} {
lappend opt_except testwidth
}
}
if {"char" ni $opt_fields} {
if {"char" ni $opt_except} {
lappend opt_except char
}
}
} elseif {"all" in $opt_fields} {
set initial_fields $known_fields
} else {
foreach f $opt_fields {
if {$f in $known_fields} {
lappend initial_fields $f
} else {
error "char_info unknown field name: '$f' known fields: '$known_fields'"
}
}
}
foreach e $opt_except {
if {$e ni $known_fields} {
error "char_info unknown field name $e in -except. known fields: '$known_fields'"
}
}
set fields [list]
foreach f $initial_fields {
if {$f ne "all" && $f ni $opt_except} {
lappend fields $f
}
}
if {![llength $fields]} {
return
}
variable charinfo
variable charsets
set hex_char [format %04x $dec_char]
set returninfo [tcl::dict::create]
foreach f $fields {
switch -- $f {
dec {
tcl::dict::set returninfo dec $dec_char
}
hex {
tcl::dict::set returninfo hex $hex_char
}
desc {
if {[tcl::dict::exists $charinfo $dec_char desc]} {
tcl::dict::set returninfo desc [tcl::dict::get $charinfo $dec_char desc]
} else {
tcl::dict::set returninfo desc ""
}
}
short {
if {[tcl::dict::exists $charinfo $dec_char short]} {
tcl::dict::set returninfo desc [tcl::dict::get $charinfo $dec_char short]
} else {
tcl::dict::set returninfo short ""
}
}
testwidth {
#todo - expectedwidth - lookup the printing width it is *supposed* to have from unicode tables
#testwidth is one of the main ones likely to be worthwhile excluding as querying the console via ansi takes time
set existing_testwidth ""
if {[tcl::dict::exists $charinfo $dec_char testwidth]} {
set existing_testwidth [tcl::dict::get $charinfo $dec_char testwidth]
}
if {$existing_testwidth eq ""} {
#no cached data - do expensive cursor-position test (Note this might not be 100% reliable - terminals lie e.g if ansi escape sequence has set to wide printing.)
set char [format %c $dec_char]
set chwidth [char_info_testwidth $char]
tcl::dict::set returninfo testwidth $chwidth
#cache it. todo - -verify flag to force recalc in case font/terminal changed in some way?
tcl::dict::set charinfo $dec_char testwidth $chwidth
} else {
tcl::dict::set returninfo testwidth $existing_testwidth
}
}
char {
set char [format %c $dec_char]
tcl::dict::set returninfo char $char
}
memberof {
#memberof takes in the order of a few hundred microseconds if a simple scan of all ranges is taken - possibly worthwhile caching/optimising
#note that memberof is not just unicode blocks - but scripts and other non-contiguous sets consisting of multiple ranges - some of which may include ranges of a single character. (e.g WGL4)
#This means there probably isn't a really efficient means of calculating membership other than scanning all the defined ranges.
#We could potentially populate it using a side-thread - but it seems reasonable to just cache result after first use here.
#some efficiency could also be gained by pre-calculating the extents for each charset which isn't a simple unicode block. (and perhaps sorting by max char)
set memberof [list]
tcl::dict::for {setname setinfo} $charsets {
foreach r [tcl::dict::get $setinfo ranges] {
set s [tcl::dict::get $r start]
set e [tcl::dict::get $r end]
if {$dec_char >= $s && $dec_char <= $e} {
lappend memberof $setname
break
}
}
}
tcl::dict::set returninfo memberof $memberof
}
}
}
return $returninfo
}
proc _char_info_dec_memberof_scan {dec} {
variable charsets
set memberof [list]
tcl::dict::for {setname setinfo} $charsets {
foreach r [tcl::dict::get $setinfo ranges] {
set s [tcl::dict::get $r start]
set e [tcl::dict::get $r end]
if {$dec >= $s && $dec <= $e} {
lappend memberof $setname
break
}
}
}
return $memberof
}
proc range_split_info {dec} {
variable charset_extents_startpoints
variable charset_extents_endpoints
set skeys [tcl::dict::keys $charset_extents_startpoints]
set ekeys [tcl::dict::keys $charset_extents_endpoints]
set splen [tcl::dict::size $charset_extents_startpoints]
set eplen [tcl::dict::size $charset_extents_endpoints]
set s [lsearch -bisect -integer $skeys $dec]
set s_at_or_below [lrange $skeys 0 $s]
set e_of_s [list]
foreach sk $s_at_or_below {
lappend e_of_s {*}[tcl::dict::get $charset_extents_startpoints $sk]
}
set e_of_s [lsort -integer $e_of_s]
set splitposn [lsearch -bisect -integer $e_of_s $dec]
if {[lindex $e_of_s $splitposn] < $dec} {incr splitposn}
#set lhs_endpoints_to_check [expr {[llength $e_of_s] - $splitposn}]
set reduced_endpoints [lrange $e_of_s $splitposn end]
set sps [list]
foreach ep $reduced_endpoints {
lappend sps {*}[tcl::dict::get $charset_extents_endpoints $ep]
}
set e [lsearch -bisect -integer $ekeys $dec]
if {$e >= 0} {
set e_at_or_above [lrange $ekeys $e end]
set s_of_e [list]
foreach ek $e_at_or_above {
lappend s_of_e {*}[tcl::dict::get $charset_extents_endpoints $ek]
}
set startpoints_of_above [llength $s_of_e]
set splitposn [lsearch -bisect -integer $s_of_e $dec]
set reduced_startpoints [lrange $s_of_e 0 $splitposn]
set eps [list]
foreach sp $reduced_startpoints {
lappend eps {*}[tcl::dict::get $charset_extents_startpoints $sp]
}
} else {
set s_of_e [list]
set reduced_startpoints [list]
set eps [list]
}
return [tcl::dict::create startpoints $splen endpoints $eplen midpoint [expr {floor($eplen/2)}] posn $e lhs_endpoints_to_check "[llength $reduced_endpoints]/[llength $e_of_s]=[llength $sps]ranges" rhs_startpoints_to_check "[llength $reduced_startpoints]/[llength $s_of_e]=[llength $eps]"]
}
#for just a few extra sets such as wgl4 and unicode blocks loaded - this gives e.g 5x + better performance than the simple search above, and up to twice as slow for tcl 8.6
#performance biased towards lower numbered characters (which is not too bad in the context of unicode)
#todo - could be tuned to perform better at end by assuming a fairly even distribution of ranges - and so searching startpoint ranges first for items left of middle, endpoint ranges first for items right of middle
#review with scripts loaded and more defined ranges..
#This algorithm supports arbitrary overlapping ranges and ranges with same start & endpoints
#Should be much better than O(n) for n sets except for pathological case of member of all or nearly-all intervals ?
#review - compare with 'interval tree' algorithms.
proc char_info_dec_memberof {dec} {
variable charset_extents_startpoints
variable charset_extents_endpoints
variable charset_extents_rangenames
if {[package vcompare [info tclversion] 8.7a5] >= 0} {
#algorithm should theoretically be a little better with -stride
set last_smaller_or_equal_startposn [lsearch -stride 2 -bisect -integer $charset_extents_startpoints $dec]
set sets_starting_below [lrange $charset_extents_startpoints 0 $last_smaller_or_equal_startposn+1] ;#+1 to include 2nd element of stridden pair
set endpoints_of_starting_below [lsort -integer [concat {*}[tcl::dict::values $sets_starting_below]]]
} else {
#no -stride available
set startkeys [tcl::dict::keys $charset_extents_startpoints]
set last_smaller_or_equal_startkeyposn [lsearch -bisect -integer $startkeys $dec] ;#assert will always return one of the keys if number >=0 supplied (last key if > all)
#set startkey_found [lindex $startkeys $last_smaller_or_equal_startkeyposn]
set start_below_keys [lrange $startkeys 0 $last_smaller_or_equal_startkeyposn] ;#These are the keys of sets which start at or below dec
#puts "start_below_keys: '$start_below_keys'"
set endpoints_of_starting_below [list]
foreach belowkey $start_below_keys {
lappend endpoints_of_starting_below {*}[tcl::dict::get $charset_extents_startpoints $belowkey]
}
set endpoints_of_starting_below [lsort -integer $endpoints_of_starting_below[unset endpoints_of_starting_below]]
}
set splitposn [lsearch -bisect -integer $endpoints_of_starting_below $dec] ;#splitposn = last smaller or equal endposn
if {[lindex $endpoints_of_starting_below $splitposn] < $dec} { incr splitposn}
set reduced_opposite_limit [lrange $endpoints_of_starting_below $splitposn end]
################
#note each endpoint points to multiple startpoints which may still include some that are not in range. (e.g range y can share endpoint with x that starts in-range - but y starts above character )
# x1 x2
# y1 y2
# c
################
#we have reduced our set of endpoints sufficiently (to those at or above dec) to run through and test each startpoint
set ranges [list]
foreach ep $reduced_opposite_limit {
foreach s [tcl::dict::get $charset_extents_endpoints $ep] {
if {$s <= $dec} {
lappend ranges [tcl::dict::get $charset_extents_rangenames $s,$ep]
}
}
}
return $ranges
}
#with glob searching of description and short
proc char_range_dict {start end args} {
if {![tcl::string::is integer -strict $start] || ![tcl::string::is integer -strict $end]} {
error "char_range_dict error start and end must be integers"
}
set and_globs [list]
if {![llength $args]} {
set args [list *]
}
foreach glob $args {
if {![regexp {[*?]} $glob]} {
lappend and_globs "*$glob*"
} else {
lappend and_globs $glob
}
}
variable charinfo
set cdict [tcl::dict::create]
set start [expr {$start}] ;#force string rep to decimal - otherwise first use of i as string could be hex or other rep whilst other i values will be decimal string rep due to incr
for {set i $start} {$i <= $end} {incr i} {
set hx [format %04x $i]
set ch [format %c $i]
if {[tcl::dict::exists $charinfo $i desc]} {
set d [tcl::dict::get $charinfo $i desc]
} else {
set d ""
}
if {[tcl::dict::exists $charinfo $i short]} {
set s [tcl::dict::get $charinfo $i short]
} else {
set s ""
}
set matchcount 0
foreach glob $and_globs {
if {[tcl::string::match -nocase $glob $s] || [tcl::string::match -nocase $glob $d]} {
incr matchcount
}
}
if {$matchcount == [llength $and_globs]} {
if {[tcl::dict::exists $charinfo $i]} {
tcl::dict::set cdict $hx [tcl::dict::merge [tcl::dict::create dec $i hex $hx char $ch] [tcl::dict::get $charinfo $i]]
} else {
tcl::dict::set cdict $hx [list dec $i hex $hx char $ch desc $d short $s]
}
}
}
return $cdict
}
#with glob searches of desc and short
proc char_range {start end args} {
package require overtype
if {![tcl::string::is integer -strict $start] || ![tcl::string::is integer -strict $end]} {
error "char_range error start and end must be integers"
}
set charset_dict [char_range_dict $start $end {*}$args]
set out ""
set col3 [tcl::string::repeat " " 12]
tcl::dict::for {k inf} $charset_dict {
set s [internal::dict_getdef $inf short ""]
set d [internal::dict_getdef $inf desc ""]
set s_col [overtype::left $col3 $s]
append out "$k [tcl::dict::get $inf dec] [tcl::dict::get $inf char] $s_col $d" \n
}
return $out
}
#non-overlapping unicode blocks
proc char_blocks {{name_or_glob *}} {
variable charsets
#todo - more efficient datastructures?
if {![regexp {[?*]} $name_or_glob]} {
#no glob - just retrieve it
if {[tcl::dict::exists $charsets $name_or_glob]} {
if {[tcl::dict::get $charsets $name_or_glob settype] eq "block"} {
return [tcl::dict::create $name_or_glob [tcl::dict::get $charsets $name_or_glob]]
}
}
#no exact match - try case insensitive..
set name [lsearch -inline -nocase [tcl::dict::keys $charsets] $name_or_glob]
if {$name ne ""} {
if {[tcl::dict::get $charsets $name settype] eq "block"} {
return [tcl::dict::create $name [tcl::dict::get $charsets $name]]
}
}
} else {
#build a subset
set charsets_block [tcl::dict::create]
tcl::dict::for {k v} $charsets {
if {[tcl::string::match -nocase $name_or_glob $k]} {
if {[tcl::dict::get $v settype] eq "block"} {
tcl::dict::set charsets_block $k $v
}
}
}
return $charsets_block
}
}
proc charset_names {{name_or_glob *}} {
variable charsets
if {![regexp {[?*]} $name_or_glob]} {
#no glob - just retrieve it
if {[tcl::dict::exists $charsets $name_or_glob]} {
return [list $name_or_glob]
}
#no exact match - try case insensitive..
set name [lsearch -inline -nocase [tcl::dict::keys $charsets] $name_or_glob]
if {$name ne ""} {
return [list $name]
}
} else {
if {$name_or_glob eq "*"} {
return [lsort [tcl::dict::keys $charsets]]
}
#tcl::dict::keys $dict <pattern> doesn't have option for case insensitive searches
return [lsort [lsearch -all -inline -nocase [tcl::dict::keys $charsets] $name_or_glob]]
}
}
#deprecated
#major named sets such as unicode blocks, scripts, and other sets such as microsoft WGL4
#case insensitive search - possibly with *basic* globs (? and * only - not lb rb)
proc charset_names2 {{namesearch *}} {
variable charsets
#dictionary sorting of the keys is slow! - we should obviously store it in sorted order instead of sorting entire list on retrieval - or just sort results
#set sortedkeys [lsort -increasing -dictionary [tcl::dict::keys $charsets]] ;#NOTE must use -dictionary to use -sorted flag below
set sortedkeys [lsort -increasing [tcl::dict::keys $charsets]]
if {$namesearch eq "*"} {
return $sortedkeys
}
if {[regexp {[?*]} $namesearch]} {
#name glob search
return [lsearch -all -inline -nocase $sortedkeys $namesearch] ;#no point using -sorted flag when -all is used
} else {
#return [lsearch -sorted -inline -nocase $sortedkeys $namesearch] ;#no globs - bails out earlier if -sorted?
return [lsearch -inline -nocase $sortedkeys $namesearch] ;#no globs
}
}
proc charsets {{namesearch *}} {
package require textblock
variable charsets
set charset_names [charset_names $namesearch]
set settype_list [list]
foreach setname $charset_names {
lappend settype_list [tcl::dict::get $charsets $setname settype]
}
set charset_names [linsert $charset_names 0 "Set Name"]
set settype_list [linsert $settype_list 0 "Set Type"]
return [textblock::join -- [list_as_lines -- $charset_names] " " [list_as_lines $settype_list]]
}
proc charset_defget {exactname} {
variable charsets
return [tcl::dict::get $charsets $exactname]
}
proc charset_defs {charsetname} {
variable charsets
set matches [charset_names $charsetname]
set def_list [list]
foreach setname $matches {
lappend def_list [tcl::dict::create $setname [tcl::dict::get $charsets $setname]]
}
return [join $def_list \n]
}
proc charset_dictget {exactname} {
variable charsets
set setinfo [tcl::dict::get $charsets $exactname]
set ranges [tcl::dict::get $setinfo ranges]
set charset_dict [tcl::dict::create]
foreach r $ranges {
set start [tcl::dict::get $r start]
set end [tcl::dict::get $r end]
set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end]]
}
return $charset_dict
}
proc charset_dicts {searchname} {
variable charsets
set matches [charset_names $searchname]
if {![llength $matches]} {
error "No charset found matching name '$searchname' - use 'charset_names' to get list"
}
set dict_list [list]
foreach m $matches {
lappend dict_list [tcl::dict::create $m [charset_dictget $m]]
}
#return $dict_list
return [join $dict_list \n]
}
proc charset_page {namesearch args} {
_charset_page_search $namesearch $args ;#pass args to descsearch argument
}
proc _charset_page_search {namesearch search_this_and_that args} {
variable charsets
variable charinfo
set matched_names [charset_names $namesearch]
if {![llength $matched_names]} {
error "charset_page no charset matched pattern '$namesearch' - use 'charset_names' to get list"
}
set defaults [tcl::dict::create\
-ansi 0\
-lined 1\
]
set opts [tcl::dict::merge $defaults $args]
# -- --- --- ---
set opt_ansi [tcl::dict::get $opts -ansi]
set opt_lined [tcl::dict::get $opts -lined]
# -- --- --- ---
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
if {$opt_ansi} {
set a1 [a BLACK white bold]
set a2 [a]
} else {
set a1 ""
set a2 ""
}
set cols 16
set prefix " "
append out $prefix
foreach charsetname $matched_names {
if {[llength $search_this_and_that]} {
set setinfo [tcl::dict::get $charsets $charsetname]
set ranges [tcl::dict::get $setinfo ranges]
set charset_dict [tcl::dict::create]
foreach r $ranges {
set start [tcl::dict::get $r start]
set end [tcl::dict::get $r end]
set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]]
}
} else {
set charset_dict [charset_dictget $charsetname]
}
if {![tcl::dict::size $charset_dict]} {
continue
}
set i 1
append out \n $prefix $charsetname
append out \n
set marker_line $prefix
set line $prefix
tcl::dict::for {hex inf} $charset_dict {
set ch [tcl::dict::get $inf char]
set twidth ""
set dec [expr {"0x$hex"}]
if {[tcl::dict::exists $charinfo $dec testwidth]} {
set twidth [tcl::dict::get $charinfo $dec testwidth]
}
if {$twidth eq ""} {
#set width [ansifreestring_width $ch] ;#based on unicode props
set width [grapheme_width_cached $ch]
} else {
set width $twidth
}
if {$width == 0} {
set marker " "
if {[regexp $re_diacritics $ch]} {
#attempt to combine with space to get 3-wide displayv with diacritic showing at left space
#todo - dualchar diacritics?
set displayv " $ch "
} else {
set displayv " "
}
} elseif {$width == 1} {
set marker "_ "
set displayv "${a1}$ch${a2} "
} else {
#presumed 2
set marker "__ "
set displayv "${a1}$ch${a2} "
}
set hexlen [tcl::string::length $hex]
append marker_line "[tcl::string::repeat " " $hexlen] $marker"
append line "$hex $displayv"
if {$i == [tcl::dict::size $charset_dict] || $i % $cols == 0} {
if {$opt_lined} {
append out $marker_line \n
}
append out $line \n
set marker_line $prefix
set line $prefix
#set out [tcl::string::range $out 0 end-2]
#append out \n " "
}
incr i
}
}
set out [tcl::string::trimright $out " "]
return $out
}
#allows search on both name and an anded list of globs to be applied to description & short
proc charset {namesearch args} {
package require overtype
variable charsets
set matched_names [charset_names $namesearch]
if {![llength $matched_names]} {
error "No charset matched pattern '$namesearch' - use 'charset_names' to get list"
}
set search_this_and_that $args
set out ""
foreach charsetname $matched_names {
if {[llength $search_this_and_that]} {
set setinfo [tcl::dict::get $charsets $charsetname]
set ranges [tcl::dict::get $setinfo ranges]
set charset_dict [tcl::dict::create]
foreach r $ranges {
set start [tcl::dict::get $r start]
set end [tcl::dict::get $r end]
set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]]
}
} else {
set charset_dict [charset_dictget $charsetname]
}
set col_items_short [list]
set col_items_desc [list]
tcl::dict::for {k inf} $charset_dict {
lappend col_items_desc [internal::dict_getdef $inf desc ""]
lappend col_items_short [internal::dict_getdef $inf short ""]
}
if {[llength $col_items_desc]} {
set widest3 [tcl::mathfunc::max {*}[lmap v $col_items_short {tcl::string::length $v}]]
if {$widest3 == 0} {
set col3 " "
} else {
set col3 [tcl::string::repeat " " $widest3]
}
tcl::dict::for {k inf} $charset_dict {
set s [internal::dict_getdef $inf short ""]
set d [internal::dict_getdef $inf desc ""]
set s_col [overtype::left $col3 $s]
append out "$k [tcl::dict::get $inf char] $s_col $d" \n
}
}
}
return $out
}
#use console cursor movements to test and cache the column-width of each char in the set of characters returned by the search criteria
proc charset_calibrate {namesearch args} {
variable charsets
variable charinfo
set matched_names [charset_names $namesearch]
if {![llength $matched_names]} {
error "No charset matched pattern '$namesearch' - use 'charset_names' to get list"
}
set search_this_and_that $args
set charcount 0
set width_results [tcl::dict::create]
puts stdout "calibrating using terminal cursor movements.."
foreach charsetname $matched_names {
if {[llength $search_this_and_that]} {
set setinfo [tcl::dict::get $charsets $charsetname]
set ranges [tcl::dict::get $setinfo ranges]
set charset_dict [tcl::dict::create]
foreach r $ranges {
set start [tcl::dict::get $r start]
set end [tcl::dict::get $r end]
set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]]
}
} else {
set charset_dict [charset_dictget $charsetname]
}
if {![tcl::dict::size $charset_dict]} {
continue
}
tcl::dict::for {hex inf} $charset_dict {
set ch [format %c 0x$hex]
set twidth ""
set dec [expr {"0x$hex"}]
if {[tcl::dict::exists $charinfo $dec testwidth]} {
set twidth [tcl::dict::get $charinfo $dec testwidth]
}
if {$twidth eq ""} {
#puts -nonewline stdout "." ;#this
set width [char_info_testwidth $ch] ;#based on console test rather than unicode props
tcl::dict::set charinfo $dec testwidth $width
} else {
set width $twidth
}
tcl::dict::incr width_results $width
incr charcount
}
}
puts stdout "\ncalibration done - results cached in charinfo dictionary"
return [tcl::dict::create charcount $charcount widths $width_results]
}
#maint warning - also in overtype!
#intended for single grapheme - but will work for multiple
#cannot contain ansi or newlines
#(a cache of ansifreestring_width calls - as these are quite regex heavy)
#review - effective memory leak on longrunning programs if never cleared
#tradeoff in fragmenting cache and reducing efficiency vs ability to clear in a scoped manner
proc grapheme_width_cached {ch {key ""}} {
variable grapheme_widths
#if key eq "*" - we won't be able to clear that cache individually. Perhaps that's ok
if {[tcl::dict::exists $grapheme_widths $key $ch]} {
return [tcl::dict::get $grapheme_widths $key $ch]
}
set width [punk::char::ansifreestring_width $ch] ;#review - can we provide faster version if we know it's a single grapheme rather than a string? (grapheme is still a string as it may have combiners/diacritics)
tcl::dict::set grapheme_widths $key $ch $width
return $width
}
proc grapheme_width_cache_clear {key} {
variable grapheme_widths
if {$key eq "*} {
set grapheme_widths [tcl::dict::create]
} else {
tcl::dict::unset grapheme_widths $key
}
return
}
#no char_width - use grapheme_width terminology to be clearer
proc grapheme_width {char} {
error "grapheme_width unimplemented - use ansifreestring_width"
}
#return N Na W etc from unicode data
#review
proc char_uc_width_prop {char} {
error "char_uc_width unimplemented try textutil::wcswidth_type"
}
#todo - provide a grapheme_width function that is optimised for speed
proc string_width {text} {
#burn approx 2uS (2024) checking for ansi codes - not just SGR
if {[punk::ansi::ta::detect $text]} {
puts stderr "string_width detected ANSI!"
}
if {[tcl::string::last \n $text] >= 0} {
error "string_width accepts only a single line"
}
#tailcall ansifreestring_width $text
ansifreestring_width $text
}
#todo - consider disallowing/erroring out when \r \n in string?
# - tab/vtab?
# - compare with wcswidth returning -1 for entire string containing such in python,perl
proc wcswidth {string} {
#faster than textutil::wcswidth (at least for string up to a few K in length)
#..but - 'scan' is horrible for 400K+ (Tcl evaluation stack has to be reallocated/copied?)
#Tcl initial evaluation stack size is 2000 (? review)
#REVIEW - when we cater for grapheme clusters - we can't just split the string at arbitrary points like this!!
set chunksize 2000
set chunks_required [expr {int(ceil([tcl::string::length $string] / double($chunksize)))}]
set width 0
set startidx 0
set endidx [expr {$startidx + $chunksize -1}]
for {set i 0} {$i < $chunks_required} {incr i} {
set chunk [tcl::string::range $string $startidx $endidx]
set codes [scan $chunk [tcl::string::repeat %c [tcl::string::length $chunk]]]
foreach c $codes {
if {$c <= 255} {
if {$c == 9 || ($c >= 31 && $c != 127)} {
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcwidth
incr width
}
} elseif {$c < 917504 || $c > 917631} {
#TODO - various other joiners and non-printing chars
set w [textutil::wcswidth_char $c]
if {$w < 0} {
return -1
} else {
incr width $w
}
}
}
incr startidx $chunksize
incr endidx $chunksize
}
return $width
}
# ------------------------------------------------------------------------------------------------------
#test
# ------------------------------------------------------------------------------------------------------
proc grapheme_split_tk {string} {
if {![regexp "\[\uFF-\U10FFFF\]" $string]} {
#only ascii (7 or 8 bit) - no joiners or unicode
return [split $string {}]
}
package require tk
set i 0
set graphemes [list]
while {$i < [tcl::string::length $string]} {
set aftercluster [tk::endOfCluster $string $i]
lappend graphemes [string range $string $i $aftercluster-1]
set i $aftercluster
}
return $graphemes
}
proc wcswidth_clustered {string} {
package require tk
set width 0
set i 0
if {![regexp "\[\uFF-\U10FFFF\]" $string]} {
return [punk::char::wcswidth_unclustered $string] ;#still use our wcswidth to account for non-printable ascii
}
while {$i < [tcl::string::length $string]} {
set aftercluster [tk::endOfCluster $string $i]
set g [string range $string $i $aftercluster-1]
if {$aftercluster > ($i + 1)} {
#review - proper way to determine screen width (columns occupied) of a cluster??
#according to this:
#https://lib.rs/crates/unicode-display-width
#for each grapheme - if any of the code points in the cluster have an east asian width of 2,
#The entire grapheme width is 2 regardless of how many code points constitute the grapheme
set gw 1
foreach ch [split $g ""] {
if {[punk::char::wcswidth_single $ch] == 2} {
set gw 2
break
}
}
incr width $gw
#if {[string first \u200d $g] >=0} {
# incr width 2
#} else {
# #other joiners???
# incr width [wcswidth_unclustered $g]
#}
} else {
incr width [wcswidth_unclustered $g]
}
set i $aftercluster
}
return $width
}
proc wcswidth_single {char} {
scan $char %c dec
if {$dec <= 255} {
if {$dec == 9} {
#tab always represented by at least one char in terminal etc.
#caller will need to process tabs themselves to determine extra width applicable to their circumstance.
return 1
}
if {($dec < 31 || $dec == 127)} {
return 0
}
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcwidth
return 1
} elseif {$dec < 917504 || $dec > 917631} {
#TODO - various other joiners and non-printing chars
return [textutil::wcswidth_char $dec] ;#note textutil::wcswidth_char takes a decimal codepoint!
#may return -1 - REVIEW
}
return 0
}
proc wcswidth_unclustered1 {string} {
set width 0
foreach c [split $string {}] {
scan $c %c dec
if {$dec <= 255} {
if {$dec == 9 || ($dec >= 31 && $dec != 127)} {
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcwidth
incr width
}
} elseif {$dec < 917504 || $dec > 917631} {
#TODO - various other joiners and non-printing chars
set w [textutil::wcswidth_char $dec] ;#takes decimal codepoint
if {$w < 0} {
return -1
} else {
incr width $w
}
}
}
return $width
}
#todo - consider disallowing/erroring out when \r \n in string?
# - tab/vtab?
# - compare with wcswidth returning -1 for entire string containing such in python,perl
proc wcswidth_unclustered {string} {
#faster than textutil::wcswidth (at least for string up to a few K in length)
#..but - 'scan' is horrible for 400K+ (Tcl evaluation stack has to be reallocated/copied?)
#Tcl initial evaluation stack size is 2000 (? review)
#we can only split the string at arbitrary points like this because we are specifically dealing with input that has no clusters!.
set chunksize 2000
set chunks_required [expr {int(ceil([tcl::string::length $string] / double($chunksize)))}]
set width 0
set startidx 0
set endidx [expr {$startidx + $chunksize -1}]
for {set i 0} {$i < $chunks_required} {incr i} {
set chunk [tcl::string::range $string $startidx $endidx]
set codes [scan $chunk [tcl::string::repeat %c [tcl::string::length $chunk]]]
foreach dec $codes {
if {$dec <= 255} {
if {($dec ==9 || ($dec >= 31 && $dec != 127))} {
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcswidth
incr width
}
} elseif {$dec < 917504 || $dec > 917631} {
#TODO - various other joiners and non-printing chars
set w [textutil::wcswidth_char $dec]
if {$w < 0} {
return -1
} else {
incr width $w
}
}
}
incr startidx $chunksize
incr endidx $chunksize
}
return $width
}
# ------------------------------------------------------------------------------------------------------
proc wcswidth0 {string} {
#faster than textutil::wcswidth (at least for string up to a few K in length)
#..but - 'scan' is horrible for 400K+
#TODO
set codes [scan $string [tcl::string::repeat %c [tcl::string::length $string]]]
set width 0
foreach dec $codes {
#unicode Tags block zero width
if {$dec < 917504 || $dec > 917631} {
if {$dec <= 255} {
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcwidth
if {!($dec < 31 || $dec == 127)} {
incr width
}
} else {
#TODO - various other joiners and non-printing chars
set w [textutil::wcswidth_char $dec] ;#takes decimal codepoint
if {$w < 0} {
return -1
} else {
incr width $w
}
}
}
}
return $width
}
proc wcswidth2 {string} {
set codes [scan $string [tcl::string::repeat %c [tcl::string::length $string]]]
set widths [lmap dec $codes {textutil::wcswidth_char $dec}]
if {-1 in $widths} {
return -1
}
return [tcl::mathop::+ {*}$widths]
}
#prerequisites - no ansi escapes - no newlines - utf8 encoding assumed
#review - what about \r \t \b ?
#NO processing of \b - already handled in ansi::printing_length which then calls this
#this version breaks string into sequences of ascii vs unicode
proc ansifreestring_width {text} {
#caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines
#we can c0 control characters after or while processing ansi escapes.
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#if {[tcl::string::first \033 $text] >= 0} {
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
#}
#todo - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc
#as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence.
#- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block}
#- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block}
#- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block}
#- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block}
#- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block}
#
# initial simplistic approach is just to strip these ... todo REVIEW
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#if {[regexp $re_leading_diacritic $text]} {
# set text " $text"
#}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set text [regsub -all $re_diacritics $text ""]
# -- --- --- --- --- --- ---
#review
#if we strip out ZWJ \u200d (zero width combiner) - then we don't give the chance for a grapheme combining test to merge properly e.g combining emojis
#as at 2024 - textutil::wcswidth just uses the unicode east-asian width property data and doesn't seem to handle these specially - it counts this joiner and others as one wide (also BOM \uFFEF)
#TODO - once we have proper grapheme cluster splitting - work out which of these characters should be left in and/or when exactly their length-effects apply
#
#for now - strip them out
#ZWNJ \u200c should also be zero length - but as a 'non' joiner - it should add zero to the length
#ZWSP \u200b zero width space
#\uFFEFBOM/ ZWNBSP and others that should be zero width
#todo - work out proper way to mark/group zero width.
#set text [tcl::string::map [list \u200b "" \u200c "" \u200d "" \uFFEF ""] $text]
set text [tcl::string::map [list \u200b "" \u200c "" \u200d ""] $text]
#\uFFEF tends to print as 1 length replacement char - REVIEW
#\uFFFF varies between terminals - some print replacement char (width 1) some print nothing (width 0)
# -- --- --- --- --- --- ---
#we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f
#todo - document that these shouldn't be present in input rather than explicitly checking here
#c0 controls + del (127 7f) - tab
#set re_ascii_c0 {[\U0000-\U001F]}
set re_ascii_c0 {[\u0000-\u0008\u000A-\u001F\u007F]}
set text [regsub -all $re_ascii_c0 $text ""]
#c1 controls - first section of the Latin-1 Supplement block - all non-printable from a utf-8 perspective
#some or all of these may have a visual representation in other encodings e.g cp855 seems to have 1 width for them all
#we are presuming that the text supplied has been converted from any other encoding to utf8 - so we don't need to handle that here
#they should also be unlikely to be present in an ansi-free string (as is a precondition for use of this function)
set text [regsub -all {[\u0080-\u009f]+} $text ""]
#short-circuit basic cases
#support tcl pre 2023-11 - see regexp bug below
#if {![regexp {[\uFF-\U10FFFF]} $text]} {
# return [tcl::string::length $text]
#}
if {![regexp "\[\uFF-\U10FFFF\]" $text]} {
return [tcl::string::length $text]
#punk::char::wcswidth has to split and examine dec value of each code
#By stripping controls + 7F (leaving tab) we've already eliminated the non-printable ascii - REVIEW
#return [punk::char::wcswidth $text] ;#still use our wcswidth to account for non-printable ascii
}
#split just to get the standalone character widths - and then scan for other combiners (?) - or scan for clusters first?
#review
#set can_regex_high_unicode [tcl::string::match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525]
#tcl pre 2023-11 - braced high unicode regexes don't work
#fixed in bug-4ed788c618 2023-11
#set uc_chars [regexp -all -inline "\[\u0100-\U10FFFF\]" $text] ;#e.g return list of chars in range only
#maintain unicode as sequences - todo - scan for grapheme clusters
#set uc_sequences [punk::ansi::ta::_perlish_split "(?:\[\u0100-\U10FFFF\])+" $text]
set uc_sequences [punk::ansi::ta::_perlish_split "(?:\[\u0000-\u00FF\])+" $text]
set len 0
foreach {uc ascii} $uc_sequences {
#puts "-ascii $ascii"
#puts "-uc $uc"
incr len [tcl::string::length $ascii]
#textutil::wcswidth uses unicode data
#fall back to textutil::wcswidth (which doesn't for example handle diactricts/combiners so we can't use until these and other things such as \u200b and diacritics are already stripped/accounted for)
#todo - find something that understands grapheme clusters - needed also for grapheme_split
#use punk::char::wcswidth - faster than the string split in textutil::wcswidth but still uses textutil::wcswidth_char
incr len [punk::char::wcswidth $uc]
}
#todo - work out what to do with anomalies like grave combiner which print at different lengths on different terminals (fonts?) and for which cursor-query escape sequence lies.
return $len
}
#kept as a fallback for review/test if textutil::wcswidth doesn't do what we require on all terminals.
#this version looks at console testwidths if they've been cached.
#It is relatively fast - but tests unicode widths char by char - so won't be useful going forward for grapheme clusters.
proc ansifreestring_width2 {text} {
#caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines
#we can c0 control characters after or while processing ansi escapes.
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#if {[tcl::string::first \033 $text] >= 0} {
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
#}
#todo - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc
#as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence.
#- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block}
#- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block}
#- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block}
#- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block}
#- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block}
#
# initial simplistic approach is just to strip these ... todo REVIEW
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#if {[regexp $re_leading_diacritic $text]} {
# set text " $text"
#}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set text [regsub -all $re_diacritics $text ""]
#review
#if we strip out ZWJ \u0200d (zero width combiner) - then we don't give the chance for a grapheme combining test to merge properly e.g combining emojis
#as at 2024 - textutil::wcswidth doesn't seem to be aware of these - and counts the joiner as one wide
#ZWNJ \u0200c should also be zero length - but as a 'non' joiner - it should add zero to the length
#ZWSP \u0200b zero width space
#we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f
#todo - document that these shouldn't be present in input rather than explicitly checking here
set re_ascii_c0 {[\U0000-\U001F]}
set text [regsub -all $re_ascii_c0 $text ""]
#short-circuit basic cases
#support tcl pre 2023-11 - see regexp bug below
#if {![regexp {[\uFF-\U10FFFF]} $text]} {
# return [tcl::string::length $text]
#}
if {![regexp "\[\uFF-\U10FFFF\]" $text]} {
return [tcl::string::length $text]
}
#review - wcswidth should detect these
set re_ascii_fullwidth {[\uFF01-\uFF5e]}
set doublewidth_char_count 0
set zerowidth_char_count 0
#split just to get the standalone character widths - and then scan for other combiners (?)
#review
#set can_regex_high_unicode [tcl::string::match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525]
#tcl pre 2023-11 - braced high unicode regexes don't work
#fixed in bug-4ed788c618 2023-11
#set uc_sequences [regexp -all -inline -indices {[\u0100-\U10FFFF]} $text]
#set uc_sequences [regexp -all -inline -indices "\[\u0100-\U10FFFF\]" $text] ;#e.g for string: \U0001f9d1abc\U001f525ab returns {0 0} {4 4}
set uc_chars [regexp -all -inline "\[\u0100-\U10FFFF\]" $text] ;#e.g return list of chars in range only
foreach c $uc_chars {
if {[regexp $re_ascii_fullwidth $c]} {
incr doublewidth_char_count
} else {
#review
# a)- terminals lie - so we could have a bad cached testwidth
# b)- textutil::wcswidth_char seems to be east-asian-width based - and not a reliable indicator of 'character cells taken by the character when printed to the terminal' despite this statement in tclllib docs.
#(character width is a complex context-dependent topic)
# c) by checking for a cached console testwidth first - we make this function less deterministic/repeatable depending on whether console tests have been run.
# d) Upstream caching of grapheme_width may also lock in a result from whatever method was first employed here
#Despite all this - the mechanism is hoped to give best effort consistency for terminals
#further work needed for combining emojis etc - which can't be done in a per character loop
#todo - see if wcswidth does any of this. It is very slow for strings that include mixed ascii/unicode - so perhaps we can use a perlish_split
# to process sequences of unicode.
#- and the user has the option to test character sets first if terminal position reporting gives better results
if {[char_info_is_testwidth_cached $c]} {
set width [char_info_testwidth_cached $c]
} else {
#textutil::wcswidth uses unicode data
#fall back to textutil::wcswidth (which doesn't for example handle diactricts/combiners so we can't use until these and other things such as \u200b and diacritics are already stripped/accounted for)
set width [textutil::wcswidth_char [scan $c %c]]
}
if {$width == 0} {
incr zerowidth_char_count
} elseif {$width == 2} {
incr doublewidth_char_count
}
}
}
#todo - work out what to do with anomalies like grave combiner which print at different lengths on different terminals (fonts?) and for which cursor-query escape sequence lies.
return [expr {[tcl::string::length $text] + $doublewidth_char_count - $zerowidth_char_count}]
}
#slow - textutil::wcswidth is slow with mixed ascii uc
proc ansifreestring_width3 {text} {
#caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines
#we can c0 control characters after or while processing ansi escapes.
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#if {[tcl::string::first \033 $text] >= 0} {
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
#}
#review - work out what to do with anomalies like grave combiner which print at different lengths on different terminals (fonts?) and for which cursor-query escape sequence lies.
# - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc
#as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence.
#- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block}
#- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block}
#- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block}
#- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block}
#- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block}
#
# initial simplistic approach is just to strip these ... todo REVIEW
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#if {[regexp $re_leading_diacritic $text]} {
# set text " $text"
#}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set text [regsub -all $re_diacritics $text ""]
#we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f
#todo - document that these shouldn't be present in input rather than explicitly checking here
set re_ascii_c0 {[\U0000-\U001F]}
set text [regsub -all $re_ascii_c0 $text ""]
#short-circuit basic cases
#support tcl pre 2023-11 - see regexp bug below
#if {![regexp {[\uFF-\U10FFFF]} $text]} {
# return [tcl::string::length $text]
#}
if {![regexp "\[\uFF-\U10FFFF\]" $text]} {
return [tcl::string::length $text]
}
#slow when ascii mixed with unicode (but why?)
return [punk::char::wcswidth $text]
}
#This shouldn't be called on text containing ansi codes!
proc strip_nonprinting_ascii {str} {
#review - some single-byte 'control' chars have visual representations e.g ETX as heart depending on font/codepage
#It is currently used for screen display width calculations
#equivalent for various unicode combining chars etc?
set map [list\
\x00 ""\
\x07 ""\
\x7f ""\
]
return [tcl::string::map $map $str]
}
#todo - lookup from unicode tables
variable flags [dict create\
AU \U1F1E6\U1F1FA\
US \U1F1FA\U1F1F8\
ZW \U1F1FF\U1F1FC
]
variable rflags
dict for {k v} $flags {
dict set rflags $v $k
}
proc flag_from_ascii {code} {
variable flags
if {[regexp {^[A-Z]{2}$} $code]} {
if {[dict exists $flags $code]} {
return [dict get $flags $code]
} else {
error "unsupported flags code: $code"
}
} else {
#try as subregion
#e.g gbeng,gbwls,gbsct
return \U1f3f4[tag_from_ascii $code]\Ue007f
}
}
proc flag_to_ascii {charsequence} {
variable rflags
if {[dict exists $rflags $charsequence]} {
return [dict get $rflags $charsequence]
}
if {[string index $charsequence 0] eq "\U1F3F4" && [string index $charsequence end] eq "\UE007F"} {
#subdivision flag
set tag [string range $charsequence 1 end-1]
return [tag_to_ascii $tag]
}
error "unknown flag $charsequence"
}
proc tag_to_ascii {t} {
set fmt [string repeat %c [string length $t]]
set declist [scan $t $fmt]
#unicode Tags block - e0000 to e007f
set declist [lmap dec $declist {
if {$dec < 917504 || $dec > 917631} {
error "char [ansistring VIEW -lf 1 -cr 1 -vt 1 [format %c $dec]] has decimal value $dec. Not in unicode Tags block range 917504-917631 (e0000-e007f)"
}
incr dec -917504
}]
return [format $fmt {*}$declist]
}
proc tag_from_ascii {a} {
set fmt [string repeat %c [string length $a]]
set declist [scan $a $fmt]
set declist [lmap dec $declist {
if {$dec > 127} {
error "char [ansistring VIEW -lf 1 -cr 1 -vt 1 [format %c $dec]] has decimal value $dec. Not in ascii range 0-127"
}
incr dec 917504
}]
return [format $fmt {*}$declist]
}
#split into plaintext and runs of combiners (combining diacritical marks - not ZWJ or ZWJNJ)
proc combiner_split {text} {
#split into odd numbered list (or zero) in a similar way to punk::ansi::ta::_perlish_split
#
#set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set graphemes [list]
if {[tcl::string::length $text] == 0} {
return {}
}
set list [list]
set start 0
set strlen [tcl::string::length $text]
#make sure our regexes aren't non-greedy - or we may not have exit condition for loop
#review
while {$start < $strlen && [regexp -start $start -indices -- {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} $text match]} {
lassign $match matchStart matchEnd
#puts "->start $start ->match $matchStart $matchEnd"
lappend list [tcl::string::range $text $start $matchStart-1] [tcl::string::range $text $matchStart $matchEnd]
set start [expr {$matchEnd+1}]
}
lappend list [tcl::string::range $text $start end]
}
#ZWJ ZWNJ ?
#SWSP ?
#1st shot - basic diacritics
#todo - become aware of unicode grapheme cluster boundaries
#This is difficult in Tcl without unicode property based Character Classes in the regex engine
#review - this needs to be performant - it is used a lot by punk terminal/ansi features
#todo - trie data structures for unicode?
#for now we can at least combine diacritics
#should also handle the ZWJ (and the variation selectors? eg \uFE0F) character which should account for emoji clusters
#Note - emoji cluster could be for example 11 code points/41 bytes (family emoji with skin tone modifiers for each member, 3 ZWJs)
#This still leaves a whole class of clusters.. korean etc unhandled :/
#todo - tk::startOfCluster / tk::endOfCluster - try to get it brought into Tcl
#https://core.tcl-lang.org/tcl/tktview/a4c7eeaf63
proc grapheme_split {text} {
set graphemes [list]
set csplits [combiner_split $text]
foreach {pt combiners} [lrange $csplits 0 end-1] {
set clist [split $pt ""]
lappend graphemes {*}[lrange $clist 0 end-1]
lappend graphemes [tcl::string::cat [lindex $clist end] $combiners]
}
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
if {[lindex $csplits end] ne ""} {
lappend graphemes {*}[split [lindex $csplits end] ""]
}
return $graphemes
}
proc grapheme_split_dec {text} {
set graphemes [list]
set csplits [combiner_split $text]
foreach {pt combiners} [lrange $csplits 0 end-1] {
set pt_decs [scan $pt [tcl::string::repeat %c [tcl::string::length $pt]]] ;#warning scan %c... slow for v large strings (e.g 400k+)
set combiner_decs [scan $combiners [tcl::string::repeat %c [tcl::string::length $combiners]]]
lset pt_decs end [concat [lindex $pt_decs end] $combiner_decs]
lappend graphemes {*}$pt_decs
}
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
if {[lindex $csplits end] ne ""} {
lappend graphemes {*}[scan [lindex $csplits end] [tcl::string::repeat %c [tcl::string::length [lindex $csplits end]]]]
}
return $graphemes
}
proc grapheme_split_dec2 {text} {
set graphemes [list]
set csplits [combiner_split $text]
foreach {pt combiners} $csplits {
set pt_decs [scan $pt [tcl::string::repeat %c [tcl::string::length $pt]]]
if {$combiners ne ""} {
set combiner_decs [scan $combiners [tcl::string::repeat %c [tcl::string::length $combiners]]]
lset pt_decs end [concat [lindex $pt_decs end] $combiner_decs]
}
lappend graphemes {*}$pt_decs
}
return $graphemes
}
proc grapheme_split2 {text} {
set graphemes [list]
set csplits [combiner_split $text]
foreach {pt combiners} [lrange $csplits 0 end-1] {
set clist [split $pt ""]
lappend graphemes {*}[lrange $clist 0 end-1] [tcl::string::cat [lindex $clist end] $combiners]
}
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
if {[lindex $csplits end] ne ""} {
lappend graphemes {*}[split [lindex $csplits end] ""]
}
return $graphemes
}
# -- --- --- --- ---
#will accept a single char or a string - test using console cursor position reporting
#unreliable
proc char_info_testwidth {ch {emit 0}} {
package require punk::console
#uses cursor movement and relies on console to report position.. which it may do properly for single chars - but may misreport for combinations that combine into a single glyph
tailcall punk::console::test_char_width $ch $emit
}
proc char_info_testwidth_cached {char} {
variable charinfo
set dec [scan $char %c]
set twidth ""
if {[tcl::dict::exists $charinfo $dec testwidth]} {
set twidth [tcl::dict::get $charinfo $dec testwidth]
}
if {$twidth eq ""} {
set width [char_info_testwidth $char]
tcl::dict::set charinfo $dec testwidth $width
return $width
} else {
return $twidth
}
}
proc char_info_is_testwidth_cached {char} {
variable charinfo
return [tcl::dict::exists $charinfo [scan $char %c] testwidth]
}
# -- --- --- --- ---
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
## Ready
package provide punk::char [tcl::namespace::eval punk::char {
variable version
set version 0.1.0
}]
return
#*** !doctools
#[manpage_end]