@ -19,7 +19,7 @@
#[manpage_begin punkshell_module_punk::char 0 999999.0a1.0]
#[manpage_begin punkshell_module_punk::char 0 999999.0a1.0]
#[copyright "2024"]
#[copyright "2024"]
#[titledesc {character-set and unicode utilities}] [comment {-- Name section and table of contents description --}]
#[titledesc {character-set and unicode utilities}] [comment {-- Name section and table of contents description --}]
#[moddesc {character-set nad unicode}] [comment {-- Description at end of page heading --}]
#[moddesc {character-set nad unicode}] [comment {-- Description at end of page heading --}]
#[require punk::char]
#[require punk::char]
#[keywords module encodings]
#[keywords module encodings]
#[description]
#[description]
@ -49,11 +49,11 @@
#*** !doctools
#*** !doctools
#[item] [package {overtype}]
#[item] [package {overtype}]
#[para] -
#[para] -
#[item] [package {textblock}]
#[item] [package {textblock}]
#[para] -
#[para] -
#[item] [package console]
#[item] [package console]
#[para] -
#[para] -
package require Tcl 8.6-
package require Tcl 8.6-
#dependency on tcllib not ideal for bootstrapping as punk::char is core to many features.. (textutil::wcswidth is therefore included in bootsupport/include_modules.config) review
#dependency on tcllib not ideal for bootstrapping as punk::char is core to many features.. (textutil::wcswidth is therefore included in bootsupport/include_modules.config) review
@ -92,20 +92,20 @@ tcl::namespace::eval punk::char {
#just the 7-bit ascii. use [page ascii] for the 8-bit layout
#just the 7-bit ascii. use [page ascii] for the 8-bit layout
proc ascii {} {return {
proc ascii {} {return {
00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL
00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL
08 BS 09 HT 0a LF 0b VT 0c FF 0d CR 0e SO 0f SI
08 BS 09 HT 0a LF 0b VT 0c FF 0d CR 0e SO 0f SI
10 DLE 11 DC1 12 DC2 13 DC3 14 DC4 15 NAK 16 SYN 17 ETB
10 DLE 11 DC1 12 DC2 13 DC3 14 DC4 15 NAK 16 SYN 17 ETB
18 CAN 19 EM 1a SUB 1b ESC 1c FS 1d GS 1e RS 1f US
18 CAN 19 EM 1a SUB 1b ESC 1c FS 1d GS 1e RS 1f US
20 SP 21 ! 22 " 23 # 24 $ 25 % 26 & 27 '
20 SP 21 ! 22 " 23 # 24 $ 25 % 26 & 27 '
28 ( 29 ) 2a * 2b + 2c , 2d - 2e . 2f /
28 ( 29 ) 2a * 2b + 2c , 2d - 2e . 2f /
30 0 31 1 32 2 33 3 34 4 35 5 36 6 37 7
30 0 31 1 32 2 33 3 34 4 35 5 36 6 37 7
38 8 39 9 3a : 3b ; 3c < 3d = 3e > 3f ?
38 8 39 9 3a : 3b ; 3c < 3d = 3e > 3f ?
40 @ 41 A 42 B 43 C 44 D 45 E 46 F 47 G
40 @ 41 A 42 B 43 C 44 D 45 E 46 F 47 G
48 H 49 I 4a J 4b K 4c L 4d M 4e N 4f O
48 H 49 I 4a J 4b K 4c L 4d M 4e N 4f O
50 P 51 Q 52 R 53 S 54 T 55 U 56 V 57 W
50 P 51 Q 52 R 53 S 54 T 55 U 56 V 57 W
58 X 59 Y 5a Z 5b [ 5c \ 5d ] 5e ^ 5f _
58 X 59 Y 5a Z 5b [ 5c \ 5d ] 5e ^ 5f _
60 ` 61 a 62 b 63 c 64 d 65 e 66 f 67 g
60 ` 61 a 62 b 63 c 64 d 65 e 66 f 67 g
68 h 69 i 6a j 6b k 6c l 6d m 6e n 6f o
68 h 69 i 6a j 6b k 6c l 6d m 6e n 6f o
70 p 71 q 72 r 73 s 74 t 75 u 76 v 77 w
70 p 71 q 72 r 73 s 74 t 75 u 76 v 77 w
78 x 79 y 7a z 7b { 7c | 7d } 7e ~ 7f DEL
78 x 79 y 7a z 7b { 7c | 7d } 7e ~ 7f DEL
}}
}}
@ -124,7 +124,7 @@ tcl::namespace::eval punk::char {
} elseif {[tcl::string::length $v] == 0} {
} elseif {[tcl::string::length $v] == 0} {
set v " "
set v " "
}
}
append out "$k $v "
append out "$k $v "
if {$i > 0 && $i % 8 == 0} {
if {$i > 0 && $i % 8 == 0} {
set out [tcl::string::range $out 0 end-2]
set out [tcl::string::range $out 0 end-2]
append out \n " "
append out \n " "
@ -143,7 +143,7 @@ tcl::namespace::eval punk::char {
set out ""
set out ""
append out [page dingbats] \n
append out [page dingbats] \n
set unicode_dict [charset_dictget Dingbats]
set unicode_dict [charset_dictget Dingbats]
append out " "
append out " "
set i 1
set i 1
tcl::dict::for {k charinfo} $unicode_dict {
tcl::dict::for {k charinfo} $unicode_dict {
@ -155,7 +155,7 @@ tcl::namespace::eval punk::char {
} else {
} else {
set displayv $char
set displayv $char
}
}
append out "$k $displayv "
append out "$k $displayv "
if {$i > 0 && $i % 8 == 0} {
if {$i > 0 && $i % 8 == 0} {
set out [tcl::string::range $out 0 end-2]
set out [tcl::string::range $out 0 end-2]
append out \n " "
append out \n " "
@ -205,7 +205,7 @@ tcl::namespace::eval punk::char {
tcl::dict::lappend d $encname $mname
tcl::dict::lappend d $encname $mname
}
}
}
}
}
}
foreach enc [lsort $encnames] {
foreach enc [lsort $encnames] {
set mime_enc [${encmimens}::mapencoding $enc]
set mime_enc [${encmimens}::mapencoding $enc]
if {$mime_enc ne ""} {
if {$mime_enc ne ""} {
@ -236,7 +236,7 @@ tcl::namespace::eval punk::char {
tailcall page $encname {*}$args
tailcall page $encname {*}$args
}
}
#This will not display for example, c0 glyphs for cp437
#This will not display for example, c0 glyphs for cp437
# we could use the punk::ansi::cp437_map dict - but while that might be what some expect to see - it would probably be too much magic for this function - which is intended to align more with what Tcl's encoding convertfrom/to actually does.
# we could use the punk::ansi::cp437_map dict - but while that might be what some expect to see - it would probably be too much magic for this function - which is intended to align more with what Tcl's encoding convertfrom/to actually does.
# for nonprinting members of the page, 2 and 3 letter codes are used rather than unicode visualisation replacements or even unicode equivalent replacements known to be appropriate for the page
# for nonprinting members of the page, 2 and 3 letter codes are used rather than unicode visualisation replacements or even unicode equivalent replacements known to be appropriate for the page
proc page {encname args} {
proc page {encname args} {
@ -255,7 +255,7 @@ tcl::namespace::eval punk::char {
#set d_ascii [pagedict_raw ascii]
#set d_ascii [pagedict_raw ascii]
set d_ascii [basedict]
set d_ascii [basedict]
set d_asciiposn [lreverse $d_ascii] ;#values should be unique except for "???". We are mainly interested in which ones have display-names e.g cr csi
set d_asciiposn [lreverse $d_ascii] ;#values should be unique except for "???". We are mainly interested in which ones have display-names e.g cr csi
#The results of this are best seen by comparing the ebcdic and ascii pages
#The results of this are best seen by comparing the ebcdic and ascii pages
set d_page [pagedict_raw $encname]
set d_page [pagedict_raw $encname]
@ -285,7 +285,7 @@ tcl::namespace::eval punk::char {
set displayv $bytedisplay
set displayv $bytedisplay
} else {
} else {
if {[tcl::string::length $rawchar] == 0} {
if {[tcl::string::length $rawchar] == 0} {
set displayv " "
set displayv " "
} else {
} else {
#presumed 1
#presumed 1
set displayv " $rawchar "
set displayv " $rawchar "
@ -294,7 +294,7 @@ tcl::namespace::eval punk::char {
}
}
}
}
append out "$k $displayv "
append out "$k $displayv "
if {$i > 0 && $i % $cols == 0} {
if {$i > 0 && $i % $cols == 0} {
set out [tcl::string::range $out 0 end-2]
set out [tcl::string::range $out 0 end-2]
append out \n " "
append out \n " "
@ -318,7 +318,7 @@ tcl::namespace::eval punk::char {
set outchar [encoding convertfrom $encpage $ch]
set outchar [encoding convertfrom $encpage $ch]
} else {
} else {
#here we will use \0xFFFD instead of our replacement string ??? - as the name pagechar implies always returning a single character. REVIEW.
#here we will use \0xFFFD instead of our replacement string ??? - as the name pagechar implies always returning a single character. REVIEW.
set outchar $::punk::char::invalid_display_char
set outchar $::punk::char::invalid_display_char
}
}
return $outchar
return $outchar
}
}
@ -348,7 +348,7 @@ tcl::namespace::eval punk::char {
#set outchar [encoding convertto $encpage [format %c $num]]
#set outchar [encoding convertto $encpage [format %c $num]]
set outchar [format %c $num]
set outchar [format %c $num]
} else {
} else {
set outchar $::punk::char::invalid_display_char
set outchar $::punk::char::invalid_display_char
}
}
return $outchar
return $outchar
}
}
@ -381,7 +381,7 @@ tcl::namespace::eval punk::char {
}
}
proc pagedict_raw {encname} {
proc pagedict_raw {encname} {
variable invalid ;# ="???"
variable invalid ;# ="???"
set encname [encname $encname]
set encname [encname $encname]
set d [tcl::dict::create]
set d [tcl::dict::create]
for {set i 0} {$i < 256} {incr i} {
for {set i 0} {$i < 256} {incr i} {
@ -409,7 +409,7 @@ tcl::namespace::eval punk::char {
tcl::dict::set d $k [tcl::dict::get $a128 $k]
tcl::dict::set d $k [tcl::dict::get $a128 $k]
} else {
} else {
#
#
tcl::dict::set d $k $invalid
tcl::dict::set d $k $invalid
}
}
if {$i <=32} {
if {$i <=32} {
@ -457,7 +457,7 @@ tcl::namespace::eval punk::char {
}
}
return $d
return $d
}
}
proc basedict {} {
proc basedict {} {
#this gives same result independent of current value of 'encoding system'
#this gives same result independent of current value of 'encoding system'
set d [tcl::dict::create]
set d [tcl::dict::create]
@ -529,10 +529,10 @@ tcl::namespace::eval punk::char {
return $d
return $d
}
}
#-- --- --- --- --- --- --- ---
#-- --- --- --- --- --- --- ---
# encoding convertfrom & encoding convertto can be somewhat confusing to think about. (Need to think in reverse.)
# encoding convertfrom & encoding convertto can be somewhat confusing to think about. (Need to think in reverse.)
# e.g encoding convertto dingbats <somethingpretty> will output something that doesn't look dingbatty on screen.
# e.g encoding convertto dingbats <somethingpretty> will output something that doesn't look dingbatty on screen.
#-- --- --- --- --- --- --- ---
#-- --- --- --- --- --- --- ---
#must use Tcl instead of tcl (at least for 8.6)
#must use Tcl instead of tcl (at least for 8.6)
if {![package vsatisfies [package present Tcl] 8.7-]} {
if {![package vsatisfies [package present Tcl] 8.7-]} {
proc encodable "s {enc [encoding system]}" {
proc encodable "s {enc [encoding system]}" {
@ -574,7 +574,7 @@ tcl::namespace::eval punk::char {
}
}
}
}
}
}
#-- --- --- --- --- --- --- ---
#-- --- --- --- --- --- --- ---
proc test_japanese {{encoding jis0208}} {
proc test_japanese {{encoding jis0208}} {
#A very basic test of 2char encodings such as jis0208
#A very basic test of 2char encodings such as jis0208
set yatbun 日本 ;# encoding convertfrom jis0208 F|K\\
set yatbun 日本 ;# encoding convertfrom jis0208 F|K\\
@ -584,7 +584,7 @@ tcl::namespace::eval punk::char {
set ebun [encoding convertto $encoding $bun]
set ebun [encoding convertto $encoding $bun]
puts "$encoding encoded: ${eyat} ${ebun}"
puts "$encoding encoded: ${eyat} ${ebun}"
puts "reencoded: [encoding convertfrom $encoding $eyat] [encoding convertfrom $encoding $ebun]"
puts "reencoded: [encoding convertfrom $encoding $eyat] [encoding convertfrom $encoding $ebun]"
return $yatbun
return $yatbun
}
}
proc test_grave {} {
proc test_grave {} {
set g [format %c 0x300]
set g [format %c 0x300]
@ -692,7 +692,7 @@ tcl::namespace::eval punk::char {
#ESC ( B ESC ) B ASCII Set
#ESC ( B ESC ) B ASCII Set
#ESC ( 0 ESC ) 0 Special Graphics
#ESC ( 0 ESC ) 0 Special Graphics
#ESC ( 1 ESC ) 1 Alternate Character ROM Standard Character Set
#ESC ( 1 ESC ) 1 Alternate Character ROM Standard Character Set
#ESC ( 2 ESC ) 2 Alternate Character ROM Special Graphic
#ESC ( 2 ESC ) 2 Alternate Character ROM Special Graphic
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Unicode character sets - some hardcoded - some loadable from data files
# Unicode character sets - some hardcoded - some loadable from data files
@ -700,7 +700,7 @@ tcl::namespace::eval punk::char {
variable charinfo [tcl::dict::create]
variable charinfo [tcl::dict::create]
variable charsets [tcl::dict::create]
variable charsets [tcl::dict::create]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Aggregate character sets (ones that pick various ranges from underlying unicode ranges)
# Aggregate character sets (ones that pick various ranges from underlying unicode ranges)
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
@ -741,7 +741,7 @@ tcl::namespace::eval punk::char {
}
}
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Unicode ranges
# Unicode ranges
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
tcl::dict::set charsets "greek" [list ranges [list {start 880 end 1023} {start 7936 end 8191}] description "Greek and Coptic" settype "other"]
tcl::dict::set charsets "greek" [list ranges [list {start 880 end 1023} {start 7936 end 8191}] description "Greek and Coptic" settype "other"]
@ -975,9 +975,9 @@ tcl::namespace::eval punk::char {
variable charset_extents_rangenames ;# dict keyed on start,end pointing to list of 2-tuples {setname rangeindex_within_set}
variable charset_extents_rangenames ;# dict keyed on start,end pointing to list of 2-tuples {setname rangeindex_within_set}
#build 2 indexes for each range.(charsets are not just unicode blocks so can have multiple ranges)
#build 2 indexes for each range.(charsets are not just unicode blocks so can have multiple ranges)
#Note that a range could be as small as a single char (startpoint = endpoint) so there can be many ranges with same start and end if charsets use some of the same subsets.
#Note that a range could be as small as a single char (startpoint = endpoint) so there can be many ranges with same start and end if charsets use some of the same subsets.
#as each charset_extents_startpoins,charset_extents_endpoints is built - the associated range name and index is appended to the rangenames dict
#as each charset_extents_startpoins,charset_extents_endpoints is built - the associated range name and index is appended to the rangenames dict
#startpoints - key is startpoint of a range, value is list of endpoints one for each range starting at this startpoint-key
#startpoints - key is startpoint of a range, value is list of endpoints one for each range starting at this startpoint-key
#endpoints - key is endpoint of a range, value is list of startpoints one for each range ending at this endpoint-key
#endpoints - key is endpoint of a range, value is list of startpoints one for each range ending at this endpoint-key
proc _build_charset_extents {} {
proc _build_charset_extents {} {
variable charsets
variable charsets
variable charset_extents_startpoints
variable charset_extents_startpoints
@ -995,7 +995,7 @@ tcl::namespace::eval punk::char {
set end [tcl::dict::get [lindex $ranges 0] end]
set end [tcl::dict::get [lindex $ranges 0] end]
if {![tcl::dict::exists $charset_extents_startpoints $start] || $end ni [tcl::dict::get $charset_extents_startpoints $start]} {
if {![tcl::dict::exists $charset_extents_startpoints $start] || $end ni [tcl::dict::get $charset_extents_startpoints $start]} {
#assertion if end wasn't in startpoits list - then start won't be in endpoints list
#assertion if end wasn't in startpoits list - then start won't be in endpoints list
tcl::dict::lappend charset_extents_startpoints $start $end
tcl::dict::lappend charset_extents_startpoints $start $end
tcl::dict::lappend charset_extents_endpoints $end $start
tcl::dict::lappend charset_extents_endpoints $end $start
}
}
tcl::dict::lappend charset_extents_rangenames ${start},${end} [list $setname 1]
tcl::dict::lappend charset_extents_rangenames ${start},${end} [list $setname 1]
@ -1023,14 +1023,14 @@ tcl::namespace::eval punk::char {
#no need to sort charset_extents_rangenames - lookup only done using dict methods
#no need to sort charset_extents_rangenames - lookup only done using dict methods
return [tcl::dict::size $charset_extents_startpoints]
return [tcl::dict::size $charset_extents_startpoints]
}
}
_build_charset_extents ;#rebuilds for all charsets
_build_charset_extents ;#rebuilds for all charsets
#nerdfonts are within the Private use E000 - F8FF range
#nerdfonts are within the Private use E000 - F8FF range
proc load_nerdfonts {} {
proc load_nerdfonts {} {
variable charsets
variable charsets
variable charinfo
variable charinfo
package require fileutil
package require fileutil
set ver [package provide punk::char]
set ver [package provide punk::char]
if {$ver ne ""} {
if {$ver ne ""} {
set ifneeded [package ifneeded punk::char [package provide punk::char]]
set ifneeded [package ifneeded punk::char [package provide punk::char]]
#puts stderr "punk::char ifneeded script: $ifneeded"
#puts stderr "punk::char ifneeded script: $ifneeded"
@ -1038,7 +1038,7 @@ tcl::namespace::eval punk::char {
set basedir [file dirname [lindex $sourceinfo end]]
set basedir [file dirname [lindex $sourceinfo end]]
} else {
} else {
#review - will only work at package load time
#review - will only work at package load time
set scr [info script]
set scr [info script]
if {$scr eq ""} {
if {$scr eq ""} {
error "load_nerdfonts unable to determine package folder"
error "load_nerdfonts unable to determine package folder"
}
}
@ -1048,7 +1048,7 @@ tcl::namespace::eval punk::char {
set fname [file join $pkg_data_dir nerd-fonts-glyph-list.txt]
set fname [file join $pkg_data_dir nerd-fonts-glyph-list.txt]
if {[file exists $fname]} {
if {[file exists $fname]} {
#puts stderr "load_nerdfonts loading $fname"
#puts stderr "load_nerdfonts loading $fname"
set data [fileutil::cat -translation binary $fname]
set data [fileutil::cat -translation binary $fname]
set short_seen [tcl::dict::create]
set short_seen [tcl::dict::create]
set current_set_range [tcl::dict::create]
set current_set_range [tcl::dict::create]
set filesets_loading [list]
set filesets_loading [list]
@ -1066,7 +1066,7 @@ tcl::namespace::eval punk::char {
dict unset charset $setname
dict unset charset $setname
}
}
set newrange [list start $dec end $dec]
set newrange [list start $dec end $dec]
tcl::dict::set current_set_range $setname $newrange
tcl::dict::set current_set_range $setname $newrange
tcl::dict::set charsets $setname [list ranges [list $newrange] description "nerd fonts $rawsetname" settype "nerdfonts privateuse"]
tcl::dict::set charsets $setname [list ranges [list $newrange] description "nerd fonts $rawsetname" settype "nerdfonts privateuse"]
lappend filesets_loading $setname
lappend filesets_loading $setname
@ -1080,13 +1080,13 @@ tcl::namespace::eval punk::char {
#overwrite last ranges element
#overwrite last ranges element
set rangelist [lrange [tcl::dict::get $charsets $setname ranges] 0 end-1]
set rangelist [lrange [tcl::dict::get $charsets $setname ranges] 0 end-1]
lappend rangelist [tcl::dict::get $current_set_range $setname]
lappend rangelist [tcl::dict::get $current_set_range $setname]
tcl::dict::set charsets $setname ranges $rangelist
tcl::dict::set charsets $setname ranges $rangelist
} else {
} else {
#new range for set
#new range for set
tcl::dict::set current_set_range $setname start $dec
tcl::dict::set current_set_range $setname start $dec
tcl::dict::set current_set_range $setname end $dec
tcl::dict::set current_set_range $setname end $dec
set rangelist [tcl::dict::get $charsets $setname ranges]
set rangelist [tcl::dict::get $charsets $setname ranges]
lappend rangelist [tcl::dict::get $current_set_range $setname]
lappend rangelist [tcl::dict::get $current_set_range $setname]
tcl::dict::set charsets $setname ranges $rangelist
tcl::dict::set charsets $setname ranges $rangelist
}
}
@ -1130,7 +1130,7 @@ tcl::namespace::eval punk::char {
proc package_base {} {
proc package_base {} {
#assume punk::char is in .tm form and we can use the package provide statement to determine base location
#assume punk::char is in .tm form and we can use the package provide statement to determine base location
#review
#review
set pkgver [package present punk::char]
set pkgver [package present punk::char]
set pkginfo [package ifneeded punk::char $pkgver]
set pkginfo [package ifneeded punk::char $pkgver]
set tmfile [lindex $pkginfo end]
set tmfile [lindex $pkginfo end]
@ -1156,7 +1156,7 @@ tcl::namespace::eval punk::char {
if {[tcl::dict::exists $dictValue {*}$keys]} {
if {[tcl::dict::exists $dictValue {*}$keys]} {
return [tcl::dict::get $dictValue {*}$keys]
return [tcl::dict::get $dictValue {*}$keys]
} else {
} else {
return [lindex $args end]
return [lindex $args end]
}
}
}
}
}
}
@ -1193,7 +1193,7 @@ tcl::namespace::eval punk::char {
if {[set pcolon [tcl::string::first ";" $ln]] > 0} {
if {[set pcolon [tcl::string::first ";" $ln]] > 0} {
set lhs [tcl::string::trim [tcl::string::range $ln 0 $pcolon-1]]
set lhs [tcl::string::trim [tcl::string::range $ln 0 $pcolon-1]]
set name [tcl::string::trim [tcl::string::range $ln $pcolon+1 end]]
set name [tcl::string::trim [tcl::string::range $ln $pcolon+1 end]]
set lhsparts [split $lhs .]
set lhsparts [split $lhs .]
set start [lindex $lhsparts 0]
set start [lindex $lhsparts 0]
set end [lindex $lhsparts end]
set end [lindex $lhsparts end]
#puts "$start -> $end '$name'"
#puts "$start -> $end '$name'"
@ -1207,9 +1207,9 @@ tcl::namespace::eval punk::char {
return $block_count
return $block_count
}
}
#unicode scripts
#unicode scripts
#unicode UnicodeData.txt
#unicode UnicodeData.txt
@ -1225,8 +1225,8 @@ tcl::namespace::eval punk::char {
# -- ---
# -- ---
#A = Ambiguous - All characters that can be sometimes wide and sometimes narrow. (wide in east asian legacy sets, narrow in non-east asian usage) (private use chars considered ambiguous)
#A = Ambiguous - All characters that can be sometimes wide and sometimes narrow. (wide in east asian legacy sets, narrow in non-east asian usage) (private use chars considered ambiguous)
#F = East Asian Full-width
#F = East Asian Full-width
#H = East Asian Half-width
#H = East Asian Half-width
#N = Not east Asian (Neutral) - all other characters. (do not occur in legacy East Asian character sets) - treated like Na
#N = Not east Asian (Neutral) - all other characters. (do not occur in legacy East Asian character sets) - treated like Na
#Na = East Asian Narrow - all other characters that are always narrow and have explicit full-width counterparts (e.g includes all of ASCII)
#Na = East Asian Narrow - all other characters that are always narrow and have explicit full-width counterparts (e.g includes all of ASCII)
#W = East Asian Wide - all other characters that are always wide (Occur only in the context of Eas Asian Typography)
#W = East Asian Wide - all other characters that are always wide (Occur only in the context of Eas Asian Typography)
# -- ---
# -- ---
@ -1304,7 +1304,7 @@ tcl::namespace::eval punk::char {
set initial_fields $known_fields
set initial_fields $known_fields
if {"testwidth" ni $opt_fields} {
if {"testwidth" ni $opt_fields} {
if {"testwidth" ni $opt_except} {
if {"testwidth" ni $opt_except} {
lappend opt_except testwidth
lappend opt_except testwidth
}
}
}
}
if {"char" ni $opt_fields} {
if {"char" ni $opt_fields} {
@ -1369,13 +1369,13 @@ tcl::namespace::eval punk::char {
#testwidth is one of the main ones likely to be worthwhile excluding as querying the console via ansi takes time
#testwidth is one of the main ones likely to be worthwhile excluding as querying the console via ansi takes time
set existing_testwidth ""
set existing_testwidth ""
if {[tcl::dict::exists $charinfo $dec_char testwidth]} {
if {[tcl::dict::exists $charinfo $dec_char testwidth]} {
set existing_testwidth [tcl::dict::get $charinfo $dec_char testwidth]
set existing_testwidth [tcl::dict::get $charinfo $dec_char testwidth]
}
}
if {$existing_testwidth eq ""} {
if {$existing_testwidth eq ""} {
#no cached data - do expensive cursor-position test (Note this might not be 100% reliable - terminals lie e.g if ansi escape sequence has set to wide printing.)
#no cached data - do expensive cursor-position test (Note this might not be 100% reliable - terminals lie e.g if ansi escape sequence has set to wide printing.)
set char [format %c $dec_char]
set char [format %c $dec_char]
set chwidth [char_info_testwidth $char]
set chwidth [char_info_testwidth $char]
tcl::dict::set returninfo testwidth $chwidth
tcl::dict::set returninfo testwidth $chwidth
#cache it. todo - -verify flag to force recalc in case font/terminal changed in some way?
#cache it. todo - -verify flag to force recalc in case font/terminal changed in some way?
tcl::dict::set charinfo $dec_char testwidth $chwidth
tcl::dict::set charinfo $dec_char testwidth $chwidth
@ -1387,10 +1387,10 @@ tcl::namespace::eval punk::char {
set char [format %c $dec_char]
set char [format %c $dec_char]
tcl::dict::set returninfo char $char
tcl::dict::set returninfo char $char
}
}
memberof {
memberof {
#memberof takes in the order of a few hundred microseconds if a simple scan of all ranges is taken - possibly worthwhile caching/optimising
#memberof takes in the order of a few hundred microseconds if a simple scan of all ranges is taken - possibly worthwhile caching/optimising
#note that memberof is not just unicode blocks - but scripts and other non-contiguous sets consisting of multiple ranges - some of which may include ranges of a single character. (e.g WGL4)
#note that memberof is not just unicode blocks - but scripts and other non-contiguous sets consisting of multiple ranges - some of which may include ranges of a single character. (e.g WGL4)
#This means there probably isn't a really efficient means of calculating membership other than scanning all the defined ranges.
#This means there probably isn't a really efficient means of calculating membership other than scanning all the defined ranges.
#We could potentially populate it using a side-thread - but it seems reasonable to just cache result after first use here.
#We could potentially populate it using a side-thread - but it seems reasonable to just cache result after first use here.
#some efficiency could also be gained by pre-calculating the extents for each charset which isn't a simple unicode block. (and perhaps sorting by max char)
#some efficiency could also be gained by pre-calculating the extents for each charset which isn't a simple unicode block. (and perhaps sorting by max char)
set memberof [list]
set memberof [list]
@ -1435,7 +1435,7 @@ tcl::namespace::eval punk::char {
set splen [tcl::dict::size $charset_extents_startpoints]
set splen [tcl::dict::size $charset_extents_startpoints]
set eplen [tcl::dict::size $charset_extents_endpoints]
set eplen [tcl::dict::size $charset_extents_endpoints]
set s [lsearch -bisect -integer $skeys $dec]
set s [lsearch -bisect -integer $skeys $dec]
set s_at_or_below [lrange $skeys 0 $s]
set s_at_or_below [lrange $skeys 0 $s]
set e_of_s [list]
set e_of_s [list]
foreach sk $s_at_or_below {
foreach sk $s_at_or_below {
lappend e_of_s {*}[tcl::dict::get $charset_extents_startpoints $sk]
lappend e_of_s {*}[tcl::dict::get $charset_extents_startpoints $sk]
@ -1472,16 +1472,16 @@ tcl::namespace::eval punk::char {
set eps [list]
set eps [list]
}
}
return [tcl::dict::create startpoints $splen endpoints $eplen midpoint [expr {floor($eplen/2)}] posn $e lhs_endpoints_to_check "[llength $reduced_endpoints]/[llength $e_of_s]=[llength $sps]ranges" rhs_startpoints_to_check "[llength $reduced_startpoints]/[llength $s_of_e]=[llength $eps]"]
return [tcl::dict::create startpoints $splen endpoints $eplen midpoint [expr {floor($eplen/2)}] posn $e lhs_endpoints_to_check "[llength $reduced_endpoints]/[llength $e_of_s]=[llength $sps]ranges" rhs_startpoints_to_check "[llength $reduced_startpoints]/[llength $s_of_e]=[llength $eps]"]
}
}
#for just a few extra sets such as wgl4 and unicode blocks loaded - this gives e.g 5x + better performance than the simple search above, and up to twice as slow for tcl 8.6
#for just a few extra sets such as wgl4 and unicode blocks loaded - this gives e.g 5x + better performance than the simple search above, and up to twice as slow for tcl 8.6
#performance biased towards lower numbered characters (which is not too bad in the context of unicode)
#performance biased towards lower numbered characters (which is not too bad in the context of unicode)
#todo - could be tuned to perform better at end by assuming a fairly even distribution of ranges - and so searching startpoint ranges first for items left of middle, endpoint ranges first for items right of middle
#todo - could be tuned to perform better at end by assuming a fairly even distribution of ranges - and so searching startpoint ranges first for items left of middle, endpoint ranges first for items right of middle
#review with scripts loaded and more defined ranges..
#review with scripts loaded and more defined ranges..
#This algorithm supports arbitrary overlapping ranges and ranges with same start & endpoints
#This algorithm supports arbitrary overlapping ranges and ranges with same start & endpoints
#Should be much better than O(n) for n sets except for pathological case of member of all or nearly-all intervals ?
#Should be much better than O(n) for n sets except for pathological case of member of all or nearly-all intervals ?
#review - compare with 'interval tree' algorithms.
#review - compare with 'interval tree' algorithms.
proc char_info_dec_memberof {dec} {
proc char_info_dec_memberof {dec} {
variable charset_extents_startpoints
variable charset_extents_startpoints
variable charset_extents_endpoints
variable charset_extents_endpoints
@ -1563,7 +1563,7 @@ tcl::namespace::eval punk::char {
set matchcount 0
set matchcount 0
foreach glob $and_globs {
foreach glob $and_globs {
if {[tcl::string::match -nocase $glob $s] || [tcl::string::match -nocase $glob $d]} {
if {[tcl::string::match -nocase $glob $s] || [tcl::string::match -nocase $glob $d]} {
incr matchcount
incr matchcount
}
}
}
}
if {$matchcount == [llength $and_globs]} {
if {$matchcount == [llength $and_globs]} {
@ -1595,12 +1595,12 @@ tcl::namespace::eval punk::char {
}
}
#non-overlapping unicode blocks
#non-overlapping unicode blocks
proc char_blocks {{name_or_glob *}} {
proc char_blocks {{name_or_glob *}} {
variable charsets
variable charsets
#todo - more efficient datastructures?
#todo - more efficient datastructures?
if {![regexp {[?*]} $name_or_glob]} {
if {![regexp {[?*]} $name_or_glob]} {
#no glob - just retrieve it
#no glob - just retrieve it
if {[tcl::dict::exists $charsets $name_or_glob]} {
if {[tcl::dict::exists $charsets $name_or_glob]} {
if {[tcl::dict::get $charsets $name_or_glob settype] eq "block"} {
if {[tcl::dict::get $charsets $name_or_glob settype] eq "block"} {
return [tcl::dict::create $name_or_glob [tcl::dict::get $charsets $name_or_glob]]
return [tcl::dict::create $name_or_glob [tcl::dict::get $charsets $name_or_glob]]
@ -1630,7 +1630,7 @@ tcl::namespace::eval punk::char {
proc charset_names {{name_or_glob *}} {
proc charset_names {{name_or_glob *}} {
variable charsets
variable charsets
if {![regexp {[?*]} $name_or_glob]} {
if {![regexp {[?*]} $name_or_glob]} {
#no glob - just retrieve it
#no glob - just retrieve it
if {[tcl::dict::exists $charsets $name_or_glob]} {
if {[tcl::dict::exists $charsets $name_or_glob]} {
return [list $name_or_glob]
return [list $name_or_glob]
}
}
@ -1641,7 +1641,7 @@ tcl::namespace::eval punk::char {
}
}
} else {
} else {
if {$name_or_glob eq "*"} {
if {$name_or_glob eq "*"} {
return [lsort [tcl::dict::keys $charsets]]
return [lsort [tcl::dict::keys $charsets]]
}
}
#tcl::dict::keys $dict <pattern> doesn't have option for case insensitive searches
#tcl::dict::keys $dict <pattern> doesn't have option for case insensitive searches
return [lsort [lsearch -all -inline -nocase [tcl::dict::keys $charsets] $name_or_glob]]
return [lsort [lsearch -all -inline -nocase [tcl::dict::keys $charsets] $name_or_glob]]
@ -1655,7 +1655,7 @@ tcl::namespace::eval punk::char {
variable charsets
variable charsets
#dictionary sorting of the keys is slow! - we should obviously store it in sorted order instead of sorting entire list on retrieval - or just sort results
#dictionary sorting of the keys is slow! - we should obviously store it in sorted order instead of sorting entire list on retrieval - or just sort results
#set sortedkeys [lsort -increasing -dictionary [tcl::dict::keys $charsets]] ;#NOTE must use -dictionary to use -sorted flag below
#set sortedkeys [lsort -increasing -dictionary [tcl::dict::keys $charsets]] ;#NOTE must use -dictionary to use -sorted flag below
set sortedkeys [lsort -increasing [tcl::dict::keys $charsets]]
set sortedkeys [lsort -increasing [tcl::dict::keys $charsets]]
if {$namesearch eq "*"} {
if {$namesearch eq "*"} {
return $sortedkeys
return $sortedkeys
}
}
@ -1664,7 +1664,7 @@ tcl::namespace::eval punk::char {
return [lsearch -all -inline -nocase $sortedkeys $namesearch] ;#no point using -sorted flag when -all is used
return [lsearch -all -inline -nocase $sortedkeys $namesearch] ;#no point using -sorted flag when -all is used
} else {
} else {
#return [lsearch -sorted -inline -nocase $sortedkeys $namesearch] ;#no globs - bails out earlier if -sorted?
#return [lsearch -sorted -inline -nocase $sortedkeys $namesearch] ;#no globs - bails out earlier if -sorted?
return [lsearch -inline -nocase $sortedkeys $namesearch] ;#no globs
return [lsearch -inline -nocase $sortedkeys $namesearch] ;#no globs
}
}
}
}
proc charsets {{namesearch *}} {
proc charsets {{namesearch *}} {
@ -1738,7 +1738,7 @@ tcl::namespace::eval punk::char {
set opt_ansi [tcl::dict::get $opts -ansi]
set opt_ansi [tcl::dict::get $opts -ansi]
set opt_lined [tcl::dict::get $opts -lined]
set opt_lined [tcl::dict::get $opts -lined]
# -- --- --- ---
# -- --- --- ---
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
if {$opt_ansi} {
if {$opt_ansi} {
set a1 [a BLACK white bold]
set a1 [a BLACK white bold]
@ -1768,7 +1768,7 @@ tcl::namespace::eval punk::char {
}
}
set i 1
set i 1
append out \n $prefix $charsetname
append out \n $prefix $charsetname
append out \n
append out \n
set marker_line $prefix
set marker_line $prefix
set line $prefix
set line $prefix
@ -1847,7 +1847,7 @@ tcl::namespace::eval punk::char {
} else {
} else {
set charset_dict [charset_dictget $charsetname]
set charset_dict [charset_dictget $charsetname]
}
}
set col_items_short [list]
set col_items_short [list]
set col_items_desc [list]
set col_items_desc [list]
tcl::dict::for {k inf} $charset_dict {
tcl::dict::for {k inf} $charset_dict {
@ -1873,7 +1873,7 @@ tcl::namespace::eval punk::char {
return $out
return $out
}
}
#use console cursor movements to test and cache the column-width of each char in the set of characters returned by the search criteria
#use console cursor movements to test and cache the column-width of each char in the set of characters returned by the search criteria
proc charset_calibrate {namesearch args} {
proc charset_calibrate {namesearch args} {
variable charsets
variable charsets
variable charinfo
variable charinfo
@ -1909,7 +1909,7 @@ tcl::namespace::eval punk::char {
set twidth [tcl::dict::get $charinfo $dec testwidth]
set twidth [tcl::dict::get $charinfo $dec testwidth]
}
}
if {$twidth eq ""} {
if {$twidth eq ""} {
#puts -nonewline stdout "." ;#this
#puts -nonewline stdout "." ;#this
set width [char_info_testwidth $ch] ;#based on console test rather than unicode props
set width [char_info_testwidth $ch] ;#based on console test rather than unicode props
tcl::dict::set charinfo $dec testwidth $width
tcl::dict::set charinfo $dec testwidth $width
} else {
} else {
@ -1925,10 +1925,10 @@ tcl::namespace::eval punk::char {
#maint warning - also in overtype!
#maint warning - also in overtype!
#intended for single grapheme - but will work for multiple
#intended for single grapheme - but will work for multiple
#cannot contain ansi or newlines
#cannot contain ansi or newlines
#(a cache of ansifreestring_width calls - as these are quite regex heavy)
#(a cache of ansifreestring_width calls - as these are quite regex heavy)
#review - effective memory leak on longrunning programs if never cleared
#review - effective memory leak on longrunning programs if never cleared
#tradeoff in fragmenting cache and reducing efficiency vs ability to clear in a scoped manner
#tradeoff in fragmenting cache and reducing efficiency vs ability to clear in a scoped manner
proc grapheme_width_cached {ch {key ""}} {
proc grapheme_width_cached {ch {key ""}} {
variable grapheme_widths
variable grapheme_widths
#if key eq "*" - we won't be able to clear that cache individually. Perhaps that's ok
#if key eq "*" - we won't be able to clear that cache individually. Perhaps that's ok
@ -1975,7 +1975,7 @@ tcl::namespace::eval punk::char {
# - compare with wcswidth returning -1 for entire string containing such in python,perl
# - compare with wcswidth returning -1 for entire string containing such in python,perl
proc wcswidth {string} {
proc wcswidth {string} {
#faster than textutil::wcswidth (at least for string up to a few K in length)
#faster than textutil::wcswidth (at least for string up to a few K in length)
#..but - 'scan' is horrible for 400K+ (Tcl evaluation stack has to be reallocated/copied?)
#..but - 'scan' is horrible for 400K+ (Tcl evaluation stack has to be reallocated/copied?)
#Tcl initial evaluation stack size is 2000 (? review)
#Tcl initial evaluation stack size is 2000 (? review)
#REVIEW - when we cater for grapheme clusters - we can't just split the string at arbitrary points like this!!
#REVIEW - when we cater for grapheme clusters - we can't just split the string at arbitrary points like this!!
set chunksize 2000
set chunksize 2000
@ -1984,14 +1984,14 @@ tcl::namespace::eval punk::char {
set startidx 0
set startidx 0
set endidx [expr {$startidx + $chunksize -1}]
set endidx [expr {$startidx + $chunksize -1}]
for {set i 0} {$i < $chunks_required} {incr i} {
for {set i 0} {$i < $chunks_required} {incr i} {
set chunk [tcl::string::range $string $startidx $endidx]
set chunk [tcl::string::range $string $startidx $endidx]
set codes [scan $chunk [tcl::string::repeat %c [tcl::string::length $chunk]]]
set codes [scan $chunk [tcl::string::repeat %c [tcl::string::length $chunk]]]
foreach c $codes {
foreach c $codes {
if {$c <= 255 && !($c < 31 || $c == 127)} {
if {$c <= 255 && !($c < 31 || $c == 127)} {
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcwidth
#todo - compare with python or other lang wcwidth
incr width
incr width
} elseif {$c < 917504 || $c > 917631} {
} elseif {$c < 917504 || $c > 917631} {
#TODO - various other joiners and non-printing chars
#TODO - various other joiners and non-printing chars
set w [textutil::wcswidth_char $c]
set w [textutil::wcswidth_char $c]
@ -2023,7 +2023,7 @@ tcl::namespace::eval punk::char {
set graphemes [list]
set graphemes [list]
while {$i < [tcl::string::length $string]} {
while {$i < [tcl::string::length $string]} {
set aftercluster [tk::endOfCluster $string $i]
set aftercluster [tk::endOfCluster $string $i]
lappend graphemes [string range $string $i $aftercluster-1]
lappend graphemes [string range $string $i $aftercluster-1]
set i $aftercluster
set i $aftercluster
}
}
return $graphemes
return $graphemes
@ -2052,15 +2052,15 @@ tcl::namespace::eval punk::char {
}
}
}
}
incr width $gw
incr width $gw
#if {[string first \u200d $g] >=0} {
#if {[string first \u200d $g] >=0} {
# incr width 2
# incr width 2
#} else {
#} else {
# #other joiners???
# #other joiners???
# incr width [wcswidth_unclustered $g]
# incr width [wcswidth_unclustered $g]
#}
#}
} else {
} else {
incr width [wcswidth_unclustered $g]
incr width [wcswidth_unclustered $g]
}
}
set i $aftercluster
set i $aftercluster
}
}
@ -2071,8 +2071,8 @@ tcl::namespace::eval punk::char {
scan $char %c dec
scan $char %c dec
if {$dec <= 255 && !($dec < 31 || $dec == 127)} {
if {$dec <= 255 && !($dec < 31 || $dec == 127)} {
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcwidth
#todo - compare with python or other lang wcwidth
return 1
return 1
} elseif {$dec < 917504 || $dec > 917631} {
} elseif {$dec < 917504 || $dec > 917631} {
#TODO - various other joiners and non-printing chars
#TODO - various other joiners and non-printing chars
return [textutil::wcswidth_char $dec] ;#note textutil::wcswidth_char takes a decimal codepoint!
return [textutil::wcswidth_char $dec] ;#note textutil::wcswidth_char takes a decimal codepoint!
@ -2086,8 +2086,8 @@ tcl::namespace::eval punk::char {
scan $c %c dec
scan $c %c dec
if {$dec <= 255 && !($dec < 31 || $dec == 127)} {
if {$dec <= 255 && !($dec < 31 || $dec == 127)} {
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcwidth
#todo - compare with python or other lang wcwidth
incr width
incr width
} elseif {$dec < 917504 || $dec > 917631} {
} elseif {$dec < 917504 || $dec > 917631} {
#TODO - various other joiners and non-printing chars
#TODO - various other joiners and non-printing chars
set w [textutil::wcswidth_char $dec] ;#takes decimal codepoint
set w [textutil::wcswidth_char $dec] ;#takes decimal codepoint
@ -2105,7 +2105,7 @@ tcl::namespace::eval punk::char {
# - compare with wcswidth returning -1 for entire string containing such in python,perl
# - compare with wcswidth returning -1 for entire string containing such in python,perl
proc wcswidth_unclustered {string} {
proc wcswidth_unclustered {string} {
#faster than textutil::wcswidth (at least for string up to a few K in length)
#faster than textutil::wcswidth (at least for string up to a few K in length)
#..but - 'scan' is horrible for 400K+ (Tcl evaluation stack has to be reallocated/copied?)
#..but - 'scan' is horrible for 400K+ (Tcl evaluation stack has to be reallocated/copied?)
#Tcl initial evaluation stack size is 2000 (? review)
#Tcl initial evaluation stack size is 2000 (? review)
#we can only split the string at arbitrary points like this because we are specifically dealing with input that has no clusters!.
#we can only split the string at arbitrary points like this because we are specifically dealing with input that has no clusters!.
set chunksize 2000
set chunksize 2000
@ -2114,14 +2114,14 @@ tcl::namespace::eval punk::char {
set startidx 0
set startidx 0
set endidx [expr {$startidx + $chunksize -1}]
set endidx [expr {$startidx + $chunksize -1}]
for {set i 0} {$i < $chunks_required} {incr i} {
for {set i 0} {$i < $chunks_required} {incr i} {
set chunk [tcl::string::range $string $startidx $endidx]
set chunk [tcl::string::range $string $startidx $endidx]
set codes [scan $chunk [tcl::string::repeat %c [tcl::string::length $chunk]]]
set codes [scan $chunk [tcl::string::repeat %c [tcl::string::length $chunk]]]
foreach dec $codes {
foreach dec $codes {
if {$dec <= 255 && !($dec < 31 || $dec == 127)} {
if {$dec <= 255 && !($dec < 31 || $dec == 127)} {
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcwidth
#todo - compare with python or other lang wcwidth
incr width
incr width
} elseif {$dec < 917504 || $dec > 917631} {
} elseif {$dec < 917504 || $dec > 917631} {
#TODO - various other joiners and non-printing chars
#TODO - various other joiners and non-printing chars
set w [textutil::wcswidth_char $dec]
set w [textutil::wcswidth_char $dec]
@ -2141,8 +2141,8 @@ tcl::namespace::eval punk::char {
proc wcswidth0 {string} {
proc wcswidth0 {string} {
#faster than textutil::wcswidth (at least for string up to a few K in length)
#faster than textutil::wcswidth (at least for string up to a few K in length)
#..but - 'scan' is horrible for 400K+
#..but - 'scan' is horrible for 400K+
#TODO
#TODO
set codes [scan $string [tcl::string::repeat %c [tcl::string::length $string]]]
set codes [scan $string [tcl::string::repeat %c [tcl::string::length $string]]]
set width 0
set width 0
foreach dec $codes {
foreach dec $codes {
@ -2150,9 +2150,9 @@ tcl::namespace::eval punk::char {
if {$dec < 917504 || $dec > 917631} {
if {$dec < 917504 || $dec > 917631} {
if {$dec <= 255} {
if {$dec <= 255} {
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcwidth
#todo - compare with python or other lang wcwidth
if {!($dec < 31 || $dec == 127)} {
if {!($dec < 31 || $dec == 127)} {
incr width
incr width
}
}
} else {
} else {
#TODO - various other joiners and non-printing chars
#TODO - various other joiners and non-printing chars
@ -2179,11 +2179,11 @@ tcl::namespace::eval punk::char {
#prerequisites - no ansi escapes - no newlines - utf8 encoding assumed
#prerequisites - no ansi escapes - no newlines - utf8 encoding assumed
#review - what about \r \t \b ?
#review - what about \r \t \b ?
#NO processing of \b - already handled in ansi::printing_length which then calls this
#NO processing of \b - already handled in ansi::printing_length which then calls this
#this version breaks string into sequences of ascii vs unicode
#this version breaks string into sequences of ascii vs unicode
proc ansifreestring_width {text} {
proc ansifreestring_width {text} {
#caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines
#caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines
#we can c0 control characters after or while processing ansi escapes.
#we can c0 control characters after or while processing ansi escapes.
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#if {[tcl::string::first \033 $text] >= 0} {
#if {[tcl::string::first \033 $text] >= 0} {
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
@ -2204,11 +2204,11 @@ tcl::namespace::eval punk::char {
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#if {[regexp $re_leading_diacritic $text]} {
#if {[regexp $re_leading_diacritic $text]} {
# set text " $text"
# set text " $text"
#}
#}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set text [regsub -all $re_diacritics $text ""]
set text [regsub -all $re_diacritics $text ""]
# -- --- --- --- --- --- ---
# -- --- --- --- --- --- ---
@ -2221,10 +2221,10 @@ tcl::namespace::eval punk::char {
#for now - strip them out
#for now - strip them out
#ZWNJ \u200c should also be zero length - but as a 'non' joiner - it should add zero to the length
#ZWNJ \u200c should also be zero length - but as a 'non' joiner - it should add zero to the length
#ZWSP \u200b zero width space
#ZWSP \u200b zero width space
#\uFFEFBOM/ ZWNBSP and others that should be zero width
#\uFFEFBOM/ ZWNBSP and others that should be zero width
#todo - work out proper way to mark/group zero width.
#todo - work out proper way to mark/group zero width.
#set text [tcl::string::map [list \u200b "" \u200c "" \u200d "" \uFFEF ""] $text]
#set text [tcl::string::map [list \u200b "" \u200c "" \u200d "" \uFFEF ""] $text]
set text [tcl::string::map [list \u200b "" \u200c "" \u200d ""] $text]
set text [tcl::string::map [list \u200b "" \u200c "" \u200d ""] $text]
@ -2241,7 +2241,7 @@ tcl::namespace::eval punk::char {
#c1 controls - first section of the Latin-1 Supplement block - all non-printable from a utf-8 perspective
#c1 controls - first section of the Latin-1 Supplement block - all non-printable from a utf-8 perspective
#some or all of these may have a visual representation in other encodings e.g cp855 seems to have 1 width for them all
#some or all of these may have a visual representation in other encodings e.g cp855 seems to have 1 width for them all
#we are presuming that the text supplied has been converted from any other encoding to utf8 - so we don't need to handle that here
#we are presuming that the text supplied has been converted from any other encoding to utf8 - so we don't need to handle that here
#they should also be unlikely to be present in an ansi-free string (as is a precondition for use of this function)
#they should also be unlikely to be present in an ansi-free string (as is a precondition for use of this function)
set text [regsub -all {[\u0080-\u009f]+} $text ""]
set text [regsub -all {[\u0080-\u009f]+} $text ""]
@ -2262,7 +2262,7 @@ tcl::namespace::eval punk::char {
#set can_regex_high_unicode [tcl::string::match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525]
#set can_regex_high_unicode [tcl::string::match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525]
#tcl pre 2023-11 - braced high unicode regexes don't work
#tcl pre 2023-11 - braced high unicode regexes don't work
#fixed in bug-4ed788c618 2023-11
#fixed in bug-4ed788c618 2023-11
#set uc_chars [regexp -all -inline "\[\u0100-\U10FFFF\]" $text] ;#e.g return list of chars in range only
#set uc_chars [regexp -all -inline "\[\u0100-\U10FFFF\]" $text] ;#e.g return list of chars in range only
#maintain unicode as sequences - todo - scan for grapheme clusters
#maintain unicode as sequences - todo - scan for grapheme clusters
#set uc_sequences [punk::ansi::ta::_perlish_split "(?:\[\u0100-\U10FFFF\])+" $text]
#set uc_sequences [punk::ansi::ta::_perlish_split "(?:\[\u0100-\U10FFFF\])+" $text]
@ -2291,7 +2291,7 @@ tcl::namespace::eval punk::char {
#we can c0 control characters after or while processing ansi escapes.
#we can c0 control characters after or while processing ansi escapes.
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#if {[tcl::string::first \033 $text] >= 0} {
#if {[tcl::string::first \033 $text] >= 0} {
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
@ -2312,11 +2312,11 @@ tcl::namespace::eval punk::char {
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#if {[regexp $re_leading_diacritic $text]} {
#if {[regexp $re_leading_diacritic $text]} {
# set text " $text"
# set text " $text"
#}
#}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set text [regsub -all $re_diacritics $text ""]
set text [regsub -all $re_diacritics $text ""]
#review
#review
@ -2325,7 +2325,7 @@ tcl::namespace::eval punk::char {
#ZWNJ \u0200c should also be zero length - but as a 'non' joiner - it should add zero to the length
#ZWNJ \u0200c should also be zero length - but as a 'non' joiner - it should add zero to the length
#ZWSP \u0200b zero width space
#ZWSP \u0200b zero width space
#we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f
#we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f
@ -2343,10 +2343,10 @@ tcl::namespace::eval punk::char {
}
}
#review - wcswidth should detect these
#review - wcswidth should detect these
set re_ascii_fullwidth {[\uFF01-\uFF5e]}
set re_ascii_fullwidth {[\uFF01-\uFF5e]}
set doublewidth_char_count 0
set doublewidth_char_count 0
set zerowidth_char_count 0
set zerowidth_char_count 0
#split just to get the standalone character widths - and then scan for other combiners (?)
#split just to get the standalone character widths - and then scan for other combiners (?)
#review
#review
#set can_regex_high_unicode [tcl::string::match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525]
#set can_regex_high_unicode [tcl::string::match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525]
@ -2354,7 +2354,7 @@ tcl::namespace::eval punk::char {
#fixed in bug-4ed788c618 2023-11
#fixed in bug-4ed788c618 2023-11
#set uc_sequences [regexp -all -inline -indices {[\u0100-\U10FFFF]} $text]
#set uc_sequences [regexp -all -inline -indices {[\u0100-\U10FFFF]} $text]
#set uc_sequences [regexp -all -inline -indices "\[\u0100-\U10FFFF\]" $text] ;#e.g for string: \U0001f9d1abc\U001f525ab returns {0 0} {4 4}
#set uc_sequences [regexp -all -inline -indices "\[\u0100-\U10FFFF\]" $text] ;#e.g for string: \U0001f9d1abc\U001f525ab returns {0 0} {4 4}
set uc_chars [regexp -all -inline "\[\u0100-\U10FFFF\]" $text] ;#e.g return list of chars in range only
set uc_chars [regexp -all -inline "\[\u0100-\U10FFFF\]" $text] ;#e.g return list of chars in range only
foreach c $uc_chars {
foreach c $uc_chars {
if {[regexp $re_ascii_fullwidth $c]} {
if {[regexp $re_ascii_fullwidth $c]} {
incr doublewidth_char_count
incr doublewidth_char_count
@ -2364,12 +2364,12 @@ tcl::namespace::eval punk::char {
# b)- textutil::wcswidth_char seems to be east-asian-width based - and not a reliable indicator of 'character cells taken by the character when printed to the terminal' despite this statement in tclllib docs.
# b)- textutil::wcswidth_char seems to be east-asian-width based - and not a reliable indicator of 'character cells taken by the character when printed to the terminal' despite this statement in tclllib docs.
#(character width is a complex context-dependent topic)
#(character width is a complex context-dependent topic)
# c) by checking for a cached console testwidth first - we make this function less deterministic/repeatable depending on whether console tests have been run.
# c) by checking for a cached console testwidth first - we make this function less deterministic/repeatable depending on whether console tests have been run.
# d) Upstream caching of grapheme_width may also lock in a result from whatever method was first employed here
# d) Upstream caching of grapheme_width may also lock in a result from whatever method was first employed here
#Despite all this - the mechanism is hoped to give best effort consistency for terminals
#Despite all this - the mechanism is hoped to give best effort consistency for terminals
#further work needed for combining emojis etc - which can't be done in a per character loop
#further work needed for combining emojis etc - which can't be done in a per character loop
#todo - see if wcswidth does any of this. It is very slow for strings that include mixed ascii/unicode - so perhaps we can use a perlish_split
#todo - see if wcswidth does any of this. It is very slow for strings that include mixed ascii/unicode - so perhaps we can use a perlish_split
# to process sequences of unicode.
# to process sequences of unicode.
#- and the user has the option to test character sets first if terminal position reporting gives better results
#- and the user has the option to test character sets first if terminal position reporting gives better results
if {[char_info_is_testwidth_cached $c]} {
if {[char_info_is_testwidth_cached $c]} {
set width [char_info_testwidth_cached $c]
set width [char_info_testwidth_cached $c]
} else {
} else {
@ -2378,7 +2378,7 @@ tcl::namespace::eval punk::char {
set width [textutil::wcswidth_char [scan $c %c]]
set width [textutil::wcswidth_char [scan $c %c]]
}
}
if {$width == 0} {
if {$width == 0} {
incr zerowidth_char_count
incr zerowidth_char_count
} elseif {$width == 2} {
} elseif {$width == 2} {
incr doublewidth_char_count
incr doublewidth_char_count
}
}
@ -2395,7 +2395,7 @@ tcl::namespace::eval punk::char {
#we can c0 control characters after or while processing ansi escapes.
#we can c0 control characters after or while processing ansi escapes.
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#if {[tcl::string::first \033 $text] >= 0} {
#if {[tcl::string::first \033 $text] >= 0} {
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
@ -2416,11 +2416,11 @@ tcl::namespace::eval punk::char {
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#if {[regexp $re_leading_diacritic $text]} {
#if {[regexp $re_leading_diacritic $text]} {
# set text " $text"
# set text " $text"
#}
#}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set text [regsub -all $re_diacritics $text ""]
set text [regsub -all $re_diacritics $text ""]
#we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f
#we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f
@ -2437,7 +2437,7 @@ tcl::namespace::eval punk::char {
return [tcl::string::length $text]
return [tcl::string::length $text]
}
}
#slow when ascii mixed with unicode (but why?)
#slow when ascii mixed with unicode (but why?)
return [punk::char::wcswidth $text]
return [punk::char::wcswidth $text]
}
}
#This shouldn't be called on text containing ansi codes!
#This shouldn't be called on text containing ansi codes!
@ -2516,11 +2516,11 @@ tcl::namespace::eval punk::char {
return [format $fmt {*}$declist]
return [format $fmt {*}$declist]
}
}
#split into plaintext and runs of combiners (combining diacritical marks - not ZWJ or ZWJNJ)
#split into plaintext and runs of combiners (combining diacritical marks - not ZWJ or ZWJNJ)
proc combiner_split {text} {
proc combiner_split {text} {
#split into odd numbered list (or zero) in a similar way to punk::ansi::ta::_perlish_split
#split into odd numbered list (or zero) in a similar way to punk::ansi::ta::_perlish_split
#
#
#set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
#set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set graphemes [list]
set graphemes [list]
if {[tcl::string::length $text] == 0} {
if {[tcl::string::length $text] == 0} {
return {}
return {}
@ -2528,12 +2528,12 @@ tcl::namespace::eval punk::char {
set list [list]
set list [list]
set start 0
set start 0
set strlen [tcl::string::length $text]
set strlen [tcl::string::length $text]
#make sure our regexes aren't non-greedy - or we may not have exit condition for loop
#make sure our regexes aren't non-greedy - or we may not have exit condition for loop
#review
#review
while {$start < $strlen && [regexp -start $start -indices -- {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} $text match]} {
while {$start < $strlen && [regexp -start $start -indices -- {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} $text match]} {
lassign $match matchStart matchEnd
lassign $match matchStart matchEnd
#puts "->start $start ->match $matchStart $matchEnd"
#puts "->start $start ->match $matchStart $matchEnd"
lappend list [tcl::string::range $text $start $matchStart-1] [tcl::string::range $text $matchStart $matchEnd]
lappend list [tcl::string::range $text $start $matchStart-1] [tcl::string::range $text $matchStart $matchEnd]
set start [expr {$matchEnd+1}]
set start [expr {$matchEnd+1}]
}
}
lappend list [tcl::string::range $text $start end]
lappend list [tcl::string::range $text $start end]
@ -2547,7 +2547,7 @@ tcl::namespace::eval punk::char {
#This is difficult in Tcl without unicode property based Character Classes in the regex engine
#This is difficult in Tcl without unicode property based Character Classes in the regex engine
#review - this needs to be performant - it is used a lot by punk terminal/ansi features
#review - this needs to be performant - it is used a lot by punk terminal/ansi features
#todo - trie data structures for unicode?
#todo - trie data structures for unicode?
#for now we can at least combine diacritics
#for now we can at least combine diacritics
#should also handle the ZWJ (and the variation selectors? eg \uFE0F) character which should account for emoji clusters
#should also handle the ZWJ (and the variation selectors? eg \uFE0F) character which should account for emoji clusters
#Note - emoji cluster could be for example 11 code points/41 bytes (family emoji with skin tone modifiers for each member, 3 ZWJs)
#Note - emoji cluster could be for example 11 code points/41 bytes (family emoji with skin tone modifiers for each member, 3 ZWJs)
#This still leaves a whole class of clusters.. korean etc unhandled :/
#This still leaves a whole class of clusters.. korean etc unhandled :/
@ -2560,7 +2560,7 @@ tcl::namespace::eval punk::char {
set clist [split $pt ""]
set clist [split $pt ""]
lappend graphemes {*}[lrange $clist 0 end-1]
lappend graphemes {*}[lrange $clist 0 end-1]
lappend graphemes [tcl::string::cat [lindex $clist end] $combiners]
lappend graphemes [tcl::string::cat [lindex $clist end] $combiners]
}
}
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
if {[lindex $csplits end] ne ""} {
if {[lindex $csplits end] ne ""} {
lappend graphemes {*}[split [lindex $csplits end] ""]
lappend graphemes {*}[split [lindex $csplits end] ""]
@ -2575,7 +2575,7 @@ tcl::namespace::eval punk::char {
set combiner_decs [scan $combiners [tcl::string::repeat %c [tcl::string::length $combiners]]]
set combiner_decs [scan $combiners [tcl::string::repeat %c [tcl::string::length $combiners]]]
lset pt_decs end [concat [lindex $pt_decs end] $combiner_decs]
lset pt_decs end [concat [lindex $pt_decs end] $combiner_decs]
lappend graphemes {*}$pt_decs
lappend graphemes {*}$pt_decs
}
}
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
if {[lindex $csplits end] ne ""} {
if {[lindex $csplits end] ne ""} {
lappend graphemes {*}[scan [lindex $csplits end] [tcl::string::repeat %c [tcl::string::length [lindex $csplits end]]]]
lappend graphemes {*}[scan [lindex $csplits end] [tcl::string::repeat %c [tcl::string::length [lindex $csplits end]]]]
@ -2592,7 +2592,7 @@ tcl::namespace::eval punk::char {
lset pt_decs end [concat [lindex $pt_decs end] $combiner_decs]
lset pt_decs end [concat [lindex $pt_decs end] $combiner_decs]
}
}
lappend graphemes {*}$pt_decs
lappend graphemes {*}$pt_decs
}
}
return $graphemes
return $graphemes
}
}
proc grapheme_split2 {text} {
proc grapheme_split2 {text} {
@ -2601,7 +2601,7 @@ tcl::namespace::eval punk::char {
foreach {pt combiners} [lrange $csplits 0 end-1] {
foreach {pt combiners} [lrange $csplits 0 end-1] {
set clist [split $pt ""]
set clist [split $pt ""]
lappend graphemes {*}[lrange $clist 0 end-1] [tcl::string::cat [lindex $clist end] $combiners]
lappend graphemes {*}[lrange $clist 0 end-1] [tcl::string::cat [lindex $clist end] $combiners]
}
}
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
if {[lindex $csplits end] ne ""} {
if {[lindex $csplits end] ne ""} {
lappend graphemes {*}[split [lindex $csplits end] ""]
lappend graphemes {*}[split [lindex $csplits end] ""]
@ -2645,10 +2645,10 @@ tcl::namespace::eval punk::char {
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
## Ready
## Ready
package provide punk::char [tcl::namespace::eval punk::char {
package provide punk::char [tcl::namespace::eval punk::char {
variable version
variable version
set version 999999.0a1.0
set version 999999.0a1.0
}]
}]
return
return