|
|
# -*- tcl -*- |
|
|
# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from <pkg>-buildversion.txt |
|
|
# |
|
|
# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. |
|
|
# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. |
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
# (C) 2023 |
|
|
# |
|
|
# @@ Meta Begin |
|
|
# Application punk::char 0.1.0 |
|
|
# Meta platform tcl |
|
|
# Meta license <unspecified> |
|
|
# @@ Meta End |
|
|
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
# doctools header |
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
#*** !doctools |
|
|
#[manpage_begin punkshell_module_punk::char 0 0.1.0] |
|
|
#[copyright "2024"] |
|
|
#[titledesc {character-set and unicode utilities}] [comment {-- Name section and table of contents description --}] |
|
|
#[moddesc {character-set nad unicode}] [comment {-- Description at end of page heading --}] |
|
|
#[require punk::char] |
|
|
#[keywords module encodings] |
|
|
#[description] |
|
|
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
|
|
|
#*** !doctools |
|
|
#[section Overview] |
|
|
#[para] overview of punk::char |
|
|
#[subsection Concepts] |
|
|
#[para] - |
|
|
|
|
|
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
## Requirements |
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
## Requirements |
|
|
##e.g package require frobz |
|
|
|
|
|
#*** !doctools |
|
|
#[subsection dependencies] |
|
|
#[para] packages used by punk::char |
|
|
#[list_begin itemized] |
|
|
#[item] [package {Tcl 8.6}] |
|
|
|
|
|
# |
|
|
|
|
|
#*** !doctools |
|
|
#[item] [package {overtype}] |
|
|
#[para] - |
|
|
#[item] [package {textblock}] |
|
|
#[para] - |
|
|
#[item] [package console] |
|
|
#[para] - |
|
|
|
|
|
package require Tcl 8.6- |
|
|
#dependency on tcllib not ideal for bootstrapping as punk::char is core to many features.. (textutil::wcswidth is therefore included in bootsupport/include_modules.config) review |
|
|
package require textutil |
|
|
package require textutil::wcswidth |
|
|
|
|
|
#*** !doctools |
|
|
#[list_end] |
|
|
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
|
|
|
#*** !doctools |
|
|
#[section API] |
|
|
|
|
|
#Note that ansi escapes can begin with \033\[ (\u001b\[) or the single character "Control Sequence Introducer" 0x9b |
|
|
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
tcl::namespace::eval punk::char { |
|
|
tcl::namespace::export * |
|
|
|
|
|
variable grapheme_widths [tcl::dict::create] |
|
|
# -- -------------------------------------------------------------------------- |
|
|
variable encmimens ;#namespace of mime package providing reversemapencoding and mapencoding functions |
|
|
#tcllib mime requires tcl::chan::memchan,events,core and/or Trf |
|
|
if {![catch {package require punk::encmime} errM]} { |
|
|
set encmimens ::punk::encmime |
|
|
} else { |
|
|
package require mime |
|
|
set encmimens ::mime |
|
|
} |
|
|
# -- -------------------------------------------------------------------------- |
|
|
|
|
|
variable invalid "???" ;# ideally this would be 0xFFFD - which should display as black diamond/rhombus with question mark. As at 2023 - this tends to display indistinguishably from other missing glyph boxes - so we use a longer sequence we can detect by length and to display obviously |
|
|
variable invalid_display_char \u25ab; #todo - change to 0xFFFD once diamond glyph more common? |
|
|
|
|
|
#just the 7-bit ascii. use [page ascii] for the 8-bit layout |
|
|
proc ascii {} {return { |
|
|
00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL |
|
|
08 BS 09 HT 0a LF 0b VT 0c FF 0d CR 0e SO 0f SI |
|
|
10 DLE 11 DC1 12 DC2 13 DC3 14 DC4 15 NAK 16 SYN 17 ETB |
|
|
18 CAN 19 EM 1a SUB 1b ESC 1c FS 1d GS 1e RS 1f US |
|
|
20 SP 21 ! 22 " 23 # 24 $ 25 % 26 & 27 ' |
|
|
28 ( 29 ) 2a * 2b + 2c , 2d - 2e . 2f / |
|
|
30 0 31 1 32 2 33 3 34 4 35 5 36 6 37 7 |
|
|
38 8 39 9 3a : 3b ; 3c < 3d = 3e > 3f ? |
|
|
40 @ 41 A 42 B 43 C 44 D 45 E 46 F 47 G |
|
|
48 H 49 I 4a J 4b K 4c L 4d M 4e N 4f O |
|
|
50 P 51 Q 52 R 53 S 54 T 55 U 56 V 57 W |
|
|
58 X 59 Y 5a Z 5b [ 5c \ 5d ] 5e ^ 5f _ |
|
|
60 ` 61 a 62 b 63 c 64 d 65 e 66 f 67 g |
|
|
68 h 69 i 6a j 6b k 6c l 6d m 6e n 6f o |
|
|
70 p 71 q 72 r 73 s 74 t 75 u 76 v 77 w |
|
|
78 x 79 y 7a z 7b { 7c | 7d } 7e ~ 7f DEL |
|
|
}} |
|
|
|
|
|
#G0 character set |
|
|
proc ascii2 {} { |
|
|
set dict [asciidict2] |
|
|
set out "" |
|
|
set i 1 |
|
|
append out " " |
|
|
tcl::dict::for {k v} $dict { |
|
|
#single chars are wrapped with \033(0 and \033(B ie total length 7 |
|
|
if {[tcl::string::length $v] == 7} { |
|
|
set v " $v " |
|
|
} elseif {[tcl::string::length $v] == 2} { |
|
|
set v "$v " |
|
|
} elseif {[tcl::string::length $v] == 0} { |
|
|
set v " " |
|
|
} |
|
|
append out "$k $v " |
|
|
if {$i > 0 && $i % 8 == 0} { |
|
|
set out [tcl::string::range $out 0 end-2] |
|
|
append out \n " " |
|
|
} |
|
|
incr i |
|
|
} |
|
|
set out [tcl::string::trimright $out " "] |
|
|
return $out |
|
|
} |
|
|
|
|
|
|
|
|
proc symbol {} { |
|
|
tailcall page symbol |
|
|
} |
|
|
proc dingbats {} { |
|
|
set out "" |
|
|
append out [page dingbats] \n |
|
|
set unicode_dict [charset_dictget Dingbats] |
|
|
|
|
|
append out " " |
|
|
set i 1 |
|
|
tcl::dict::for {k charinfo} $unicode_dict { |
|
|
set char [tcl::dict::get $charinfo char] |
|
|
if {[tcl::string::length $char] == 0} { |
|
|
set displayv " " |
|
|
} elseif {[tcl::string::length $char] == 1} { |
|
|
set displayv " $char " |
|
|
} else { |
|
|
set displayv $char |
|
|
} |
|
|
append out "$k $displayv " |
|
|
if {$i > 0 && $i % 8 == 0} { |
|
|
set out [tcl::string::range $out 0 end-2] |
|
|
append out \n " " |
|
|
} |
|
|
incr i |
|
|
} |
|
|
return $out |
|
|
} |
|
|
proc page_names {{search *}} { |
|
|
set all_names [list] |
|
|
set d [page_names_dict $search] |
|
|
tcl::dict::for {k v} $d { |
|
|
if {$k ni $all_names} { |
|
|
lappend all_names $k |
|
|
} |
|
|
foreach m $v { |
|
|
if {$m ni $all_names} { |
|
|
lappend all_names $m |
|
|
} |
|
|
} |
|
|
} |
|
|
return [lsort $all_names] |
|
|
} |
|
|
proc page_names_help {{namesearch *}} { |
|
|
set d [page_names_dict $namesearch] |
|
|
|
|
|
set out "" |
|
|
tcl::dict::for {k v} $d { |
|
|
append out "$k $v" \n |
|
|
} |
|
|
return [linesort $out] |
|
|
} |
|
|
proc page_names_dict {{search *}} { |
|
|
if {![regexp {[?*]} $search]} { |
|
|
set search "*$search*" |
|
|
} |
|
|
set encnames [encoding names] |
|
|
foreach enc $encnames { |
|
|
tcl::dict::set d $enc [list] |
|
|
} |
|
|
variable encmimens |
|
|
set mimenames [array get ${encmimens}::reversemap] |
|
|
tcl::dict::for {mname encname} $mimenames { |
|
|
if {$encname in $encnames} { |
|
|
set enclist [tcl::dict::get $d $encname] |
|
|
if {$mname ni $enclist} { |
|
|
tcl::dict::lappend d $encname $mname |
|
|
} |
|
|
} |
|
|
} |
|
|
foreach enc [lsort $encnames] { |
|
|
set mime_enc [${encmimens}::mapencoding $enc] |
|
|
if {$mime_enc ne ""} { |
|
|
set enclist [tcl::dict::get $d $enc] |
|
|
if {$mime_enc ni $enclist} { |
|
|
tcl::dict::lappend d $enc $mime_enc |
|
|
} |
|
|
} |
|
|
} |
|
|
set dresult [tcl::dict::create] |
|
|
if {$search ne "*"} { |
|
|
tcl::dict::for {k v} $d { |
|
|
if {[tcl::string::match -nocase $search $k] || ([lsearch -nocase $v $search]) >= 0} { |
|
|
tcl::dict::set dresult $k $v |
|
|
} |
|
|
} |
|
|
} else { |
|
|
set dresult $d |
|
|
} |
|
|
return $dresult |
|
|
} |
|
|
proc page8 {encname args} { |
|
|
tcl::dict::set args -cols 8 |
|
|
tailcall page $encname {*}$args |
|
|
} |
|
|
proc page16 {encname args} { |
|
|
tcl::dict::set args -cols 16 |
|
|
tailcall page $encname {*}$args |
|
|
} |
|
|
|
|
|
#This will not display for example, c0 glyphs for cp437 |
|
|
# we could use the punk::ansi::cp437_map dict - but while that might be what some expect to see - it would probably be too much magic for this function - which is intended to align more with what Tcl's encoding convertfrom/to actually does. |
|
|
# for nonprinting members of the page, 2 and 3 letter codes are used rather than unicode visualisation replacements or even unicode equivalent replacements known to be appropriate for the page |
|
|
proc page {encname args} { |
|
|
variable invalid |
|
|
set encname [encname $encname] |
|
|
set defaults [list\ |
|
|
-range {0 256}\ |
|
|
-cols 16\ |
|
|
] |
|
|
set opts [tcl::dict::merge $defaults $args] |
|
|
# -- --- --- --- --- --- --- --- --- |
|
|
set cols [tcl::dict::get $opts -cols] |
|
|
# -- --- --- --- --- --- --- --- --- |
|
|
|
|
|
set d_bytedisplay [basedict_display] |
|
|
|
|
|
#set d_ascii [pagedict_raw ascii] |
|
|
set d_ascii [basedict] |
|
|
set d_asciiposn [lreverse $d_ascii] ;#values should be unique except for "???". We are mainly interested in which ones have display-names e.g cr csi |
|
|
#The results of this are best seen by comparing the ebcdic and ascii pages |
|
|
|
|
|
set d_page [pagedict_raw $encname] |
|
|
|
|
|
set out "" |
|
|
set i 1 |
|
|
append out " " |
|
|
tcl::dict::for {k rawchar} $d_page { |
|
|
set num [expr {"0x$k"}] |
|
|
#see if ascii equivalent exists and has a name |
|
|
if {$rawchar eq $invalid} { |
|
|
set displayv "$invalid" |
|
|
} else { |
|
|
set bytedisplay "" |
|
|
if {[tcl::dict::exists $d_asciiposn $rawchar]} { |
|
|
set asciiposn [tcl::dict::get $d_asciiposn $rawchar] |
|
|
set bytedisplay [tcl::dict::get $d_bytedisplay $asciiposn] |
|
|
} |
|
|
if {$bytedisplay eq $invalid} { |
|
|
# |
|
|
set displayv " $rawchar " |
|
|
} else { |
|
|
set displaylen [tcl::string::length $bytedisplay] |
|
|
if {$displaylen == 2} { |
|
|
set displayv "$bytedisplay " |
|
|
} elseif {$displaylen == 3} { |
|
|
set displayv $bytedisplay |
|
|
} else { |
|
|
if {[tcl::string::length $rawchar] == 0} { |
|
|
set displayv " " |
|
|
} else { |
|
|
#presumed 1 |
|
|
set displayv " $rawchar " |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
append out "$k $displayv " |
|
|
if {$i > 0 && $i % $cols == 0} { |
|
|
set out [tcl::string::range $out 0 end-2] |
|
|
append out \n " " |
|
|
} |
|
|
incr i |
|
|
} |
|
|
set out [tcl::string::trimright $out " "] |
|
|
return $out |
|
|
} |
|
|
|
|
|
proc pagechar1 {page num} { |
|
|
set encpage [encname $page] |
|
|
encoding convertfrom $encpage [format %c $num] |
|
|
} |
|
|
|
|
|
proc pagechar {page num} { |
|
|
set encpage [encname $page] |
|
|
|
|
|
set ch [format %c $num] |
|
|
if {[decodable $ch $encpage]} { |
|
|
set outchar [encoding convertfrom $encpage $ch] |
|
|
} else { |
|
|
#here we will use \0xFFFD instead of our replacement string ??? - as the name pagechar implies always returning a single character. REVIEW. |
|
|
set outchar $::punk::char::invalid_display_char |
|
|
} |
|
|
return $outchar |
|
|
} |
|
|
proc pagechar_info {page num} { |
|
|
set ch [format %c $num] |
|
|
set h [format %04x $num] |
|
|
set encpage [encname $page] |
|
|
if {[decodable $ch $encpage]} { |
|
|
set outchar [encoding convertfrom $encpage $ch] |
|
|
} else { |
|
|
error "pagechar_info: $h not decodable from $encpage" |
|
|
} |
|
|
package require punk::console |
|
|
puts -nonewline stdout \033\[s;flush stdout |
|
|
lassign [punk::console::get_cursor_pos_list] _row1 col1 |
|
|
puts -nonewline stdout "$outchar";flush stdout |
|
|
lassign [punk::console::get_cursor_pos_list] _row2 col2 |
|
|
puts -nonewline stdout "\033\[u";flush stdout |
|
|
return "$col1 -> $col2" |
|
|
} |
|
|
|
|
|
proc pagebyte {page num} { |
|
|
set encpage [encname $page] |
|
|
|
|
|
set ch [format %c $num] |
|
|
if {[decodable $ch $encpage]} { |
|
|
#set outchar [encoding convertto $encpage [format %c $num]] |
|
|
set outchar [format %c $num] |
|
|
} else { |
|
|
set outchar $::punk::char::invalid_display_char |
|
|
} |
|
|
return $outchar |
|
|
} |
|
|
|
|
|
proc all_pages {} { |
|
|
set out "" |
|
|
set mimenamesdict [page_names_dict] |
|
|
foreach encname [encoding names] { |
|
|
if {[tcl::dict::exists $mimenamesdict $encname]} { |
|
|
set alt "([tcl::dict::get $mimenamesdict $encname])" |
|
|
} else { |
|
|
set alt "" |
|
|
} |
|
|
append out "$encname $alt" \n |
|
|
append out [page $encname] |
|
|
} |
|
|
return $out |
|
|
} |
|
|
|
|
|
proc encname {encoding_name_or_alias} { |
|
|
set encname $encoding_name_or_alias |
|
|
if {[lsearch -nocase [page_names] $encname] < 0} { |
|
|
error "Unknown encoding '$encname' - use 'punk::char::page_names' to see valid encoding names on this system" |
|
|
} |
|
|
variable encmimens |
|
|
if {$encname ni [encoding names]} { |
|
|
set encname [${encmimens}::reversemapencoding $encname] |
|
|
} |
|
|
return $encname |
|
|
} |
|
|
|
|
|
proc pagedict_raw {encname} { |
|
|
variable invalid ;# ="???" |
|
|
set encname [encname $encname] |
|
|
set d [tcl::dict::create] |
|
|
for {set i 0} {$i < 256} {incr i} { |
|
|
set k [format %02x $i] |
|
|
#tcl::dict::set d $k [encoding convertfrom $encname [format %c $i]] |
|
|
set ch [format %c $i] ; |
|
|
#jmn |
|
|
if {[decodable $ch $encname]} { |
|
|
#set encchar [encoding convertto $encname $ch] |
|
|
#tcl::dict::set d $k [encoding convertfrom $encchar] |
|
|
tcl::dict::set d $k [encoding convertfrom $encname $ch] |
|
|
} else { |
|
|
tcl::dict::set d $k $invalid ;#use replacement so we can detect difference from actual "?" |
|
|
} |
|
|
} |
|
|
return $d |
|
|
} |
|
|
proc asciidict {} { |
|
|
variable invalid |
|
|
set d [tcl::dict::create] |
|
|
set a128 [asciidict128] |
|
|
for {set i 0} {$i < 256} {incr i} { |
|
|
set k [format %02x $i] |
|
|
if {$i <= 127} { |
|
|
tcl::dict::set d $k [tcl::dict::get $a128 $k] |
|
|
} else { |
|
|
# |
|
|
tcl::dict::set d $k $invalid |
|
|
} |
|
|
|
|
|
if {$i <=32} { |
|
|
#no point emitting the lower control characters raw to screen - emit the short-names defined in the 'ascii' proc |
|
|
tcl::dict::set d $k [tcl::dict::get $a128 $k] |
|
|
} else { |
|
|
if {$i == 0x9b} { |
|
|
tcl::dict::set d $k CSI ;#don't emit the ansi 'Control Sequence Introducer' - or it will be interpreted by the console and affect the layout. |
|
|
} else { |
|
|
tcl::dict::set d $k [format %c $i] |
|
|
} |
|
|
} |
|
|
} |
|
|
return $d |
|
|
} |
|
|
|
|
|
proc basedict_display {} { |
|
|
set d [tcl::dict::create] |
|
|
set a128 [asciidict128] |
|
|
for {set i 0} {$i < 256} {incr i} { |
|
|
set k [format %02x $i] |
|
|
if {$i <=32} { |
|
|
#no point emitting the lower control characters raw to screen - emit the short-names defined in the 'ascii' proc |
|
|
tcl::dict::set d $k [tcl::dict::get $a128 $k] |
|
|
} else { |
|
|
if {$i == 0x9b} { |
|
|
tcl::dict::set d $k CSI ;#don't emit the ansi 'Control Sequence Introducer' - or it will be interpreted by the console and affect the layout. |
|
|
} elseif {$i == 0x9c} { |
|
|
tcl::dict::set d $k OSC |
|
|
} else { |
|
|
#tcl::dict::set d $k [encoding convertfrom [encoding system] [format %c $i]] |
|
|
#don't use encoding convertfrom - we want the value independent of the current encoding system. |
|
|
tcl::dict::set d $k [format %c $i] |
|
|
} |
|
|
} |
|
|
} |
|
|
return $d |
|
|
} |
|
|
proc basedict_encoding_system {} { |
|
|
#result depends on 'encoding system' currently in effect |
|
|
set d [tcl::dict::create] |
|
|
for {set i 0} {$i < 256} {incr i} { |
|
|
set k [format %02x $i] |
|
|
tcl::dict::set d $k [encoding convertfrom [encoding system] [format %c $i]] |
|
|
} |
|
|
return $d |
|
|
} |
|
|
|
|
|
proc basedict {} { |
|
|
#this gives same result independent of current value of 'encoding system' |
|
|
set d [tcl::dict::create] |
|
|
for {set i 0} {$i < 256} {incr i} { |
|
|
set k [format %02x $i] |
|
|
tcl::dict::set d $k [format %c $i] |
|
|
} |
|
|
return $d |
|
|
} |
|
|
proc pagedict {pagename args} { |
|
|
variable charsets |
|
|
set encname [encname $pagename] |
|
|
set defaults [list\ |
|
|
-range {0 255}\ |
|
|
-charset ""\ |
|
|
] |
|
|
set opts [tcl::dict::merge $defaults $args] |
|
|
# -- --- --- --- --- --- --- --- --- --- |
|
|
set range [tcl::dict::get $opts -range] |
|
|
set charset [tcl::dict::get $opts -charset] |
|
|
# -- --- --- --- --- --- --- --- --- --- |
|
|
if {$charset ne ""} { |
|
|
if {$charset ni [charset_names]} { |
|
|
error "unknown charset '$charset' - use 'charset_names' to get list" |
|
|
} |
|
|
set setinfo [tcl::dict::get $charsets $charset] |
|
|
set ranges [tcl::dict::get $setinfo ranges] |
|
|
set charset_dict [tcl::dict::create] |
|
|
foreach r $ranges { |
|
|
set start [tcl::dict::get $r start] |
|
|
set end [tcl::dict::get $r end] |
|
|
#set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end]] |
|
|
break |
|
|
} |
|
|
|
|
|
} else { |
|
|
set start [lindex $range 0] |
|
|
set end [lindex $range 1] |
|
|
} |
|
|
|
|
|
set d [tcl::dict::create] |
|
|
for {set i $start} {$i <= $end} {incr i} { |
|
|
set k [format %02x $i] |
|
|
tcl::dict::set d $k [encoding convertfrom $encname [format %c $i]] |
|
|
} |
|
|
return $d |
|
|
} |
|
|
|
|
|
#todo - benchmark peformance - improve punk pipeline |
|
|
proc asciidict128 {} { |
|
|
regexp -all -inline {\S+} [concat {*}[linelist -line trimleft [ascii]]] |
|
|
} |
|
|
proc _asciidict128 {} { |
|
|
.= ascii |> .=> linelist -line trimleft |> .=* concat |> {regexp -all -inline {\S+} $data} |
|
|
} |
|
|
|
|
|
#review - use terminal to display actual supported DEC specials vs using dict at: punk::ansi::map_special_graphics which maps to known unicode equivalents |
|
|
proc asciidict2 {} { |
|
|
set d [tcl::dict::create] |
|
|
tcl::dict::for {k v} [basedict_display] { |
|
|
if {[tcl::string::length $v] == 1} { |
|
|
set num [expr {"0x$k"}] |
|
|
#tcl::dict::set d $k "\033(0[subst \\u00$k]\033(B" |
|
|
tcl::dict::set d $k "\033(0[format %c $num]\033(B" |
|
|
} else { |
|
|
tcl::dict::set d $k $v |
|
|
} |
|
|
} |
|
|
return $d |
|
|
} |
|
|
|
|
|
#-- --- --- --- --- --- --- --- |
|
|
# encoding convertfrom & encoding convertto can be somewhat confusing to think about. (Need to think in reverse.) |
|
|
# e.g encoding convertto dingbats <somethingpretty> will output something that doesn't look dingbatty on screen. |
|
|
#-- --- --- --- --- --- --- --- |
|
|
#must use Tcl instead of tcl (at least for 8.6) |
|
|
if {![package vsatisfies [package present Tcl] 8.7-]} { |
|
|
proc encodable "s {enc [encoding system]}" { |
|
|
set encname [encname $enc] |
|
|
if {($encname eq "ascii")} { |
|
|
#8.6 fails to round-trip convert 0x7f del character despite it being in the ascii range (review Why?? what else doesn't round-trip but should?) |
|
|
#just strip it out of the string as we are only after a boolean answer and if s is only a single del char empty string will result in true |
|
|
set s [tcl::string::map [list [format %c 0x7f] ""] $s] |
|
|
} |
|
|
string eq $s [encoding convertfrom $encname [encoding convertto $encname $s]] |
|
|
} |
|
|
#note also that tcl8.6 has anomalies with how it handles some unassigned codepoints |
|
|
# e.g unassigned codes in the middle of a codepage may appear to be encodable&decodable in a round trip whereas undefined codepoints at the end may get the replacement character defined in the tcl encodings dir (usually the 3f char: "?") |
|
|
proc decodable "s {enc [encoding system]}" { |
|
|
set encname [encname $enc] |
|
|
#review |
|
|
string eq $s [encoding convertto $encname [encoding convertfrom $encname $s]] |
|
|
} |
|
|
} else { |
|
|
#review - use -profile? |
|
|
proc encodable "s {enc [encoding system]}" { |
|
|
set encname [encname $enc] |
|
|
if {![catch { |
|
|
string eq $s [encoding convertfrom $encname [encoding convertto $encname $s]] |
|
|
} result]} { |
|
|
return $result |
|
|
} else { |
|
|
return 0 |
|
|
} |
|
|
} |
|
|
proc decodable "s {enc [encoding system]}" { |
|
|
set encname [encname $enc] |
|
|
if {![catch { |
|
|
string eq $s [encoding convertto $encname [encoding convertfrom $encname $s]] |
|
|
} result]} { |
|
|
return $result |
|
|
} else { |
|
|
return 0 |
|
|
} |
|
|
} |
|
|
} |
|
|
#-- --- --- --- --- --- --- --- |
|
|
proc test_japanese {{encoding jis0208}} { |
|
|
#A very basic test of 2char encodings such as jis0208 |
|
|
set yatbun 日本 ;# encoding convertfrom jis0208 F|K\\ |
|
|
lassign [split $yatbun] yat bun |
|
|
puts "original yatbun ${yat} ${bun}" |
|
|
set eyat [encoding convertto $encoding $yat] |
|
|
set ebun [encoding convertto $encoding $bun] |
|
|
puts "$encoding encoded: ${eyat} ${ebun}" |
|
|
puts "reencoded: [encoding convertfrom $encoding $eyat] [encoding convertfrom $encoding $ebun]" |
|
|
return $yatbun |
|
|
} |
|
|
proc test_grave {} { |
|
|
set g [format %c 0x300] |
|
|
puts stdout "Testing console display of grave accented a in between letters x and y - accent should combine over the top of the letter a." |
|
|
puts stdout "Apparent width should theoretically be 1 console-column" |
|
|
package require punk::console |
|
|
puts stdout "# -- --- --- ---" |
|
|
puts -nonewline "xa${g}z";set cursorposn [punk::console::get_cursor_pos] |
|
|
puts stdout \n |
|
|
puts stdout "cursor position immediately after outputing 4 bytes (expecting 3 glyphs): $cursorposn" |
|
|
puts stdout "# -- --- --- ---" |
|
|
puts -nonewline "xyz";set cursorposn [punk::console::get_cursor_pos] |
|
|
puts stdout \n |
|
|
puts stdout "cursor position immediately after outputing 3 bytes (xyz): $cursorposn" |
|
|
} |
|
|
proc test_zalgo {} { |
|
|
#from: https://github.com/jameslanska/unicode-display-width/blob/5e28d94c75e8c421a87199363b85c90dc37125b8/docs/unicode_background.md |
|
|
#see: https://lingojam.com/ZalgoText |
|
|
puts stdout "44 chars long - 9 graphemes - 9 columns wide" |
|
|
|
|
|
set str "Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ" |
|
|
|
|
|
} |
|
|
|
|
|
proc test_zalgo2 {} { |
|
|
|
|
|
# ------------------------ |
|
|
|
|
|
|
|
|
|
|
|
set str "Z̸̢͉̣͔̭̪͙̖̳̘͈͍̤̩̟͈͈͕̯̅̏̆̓̌́́͌̿̕ͅą̷̦̤̫̩͕̥̐̎̓́́̂̀͆́̔̄̈́̏̌́̆͜͜͝͠l̴̩̙̺͚̟͇͖͔͕̹̟͌̈́̄̇́̉̋̕̚͜͠͠g̸͇̩͔̺̝̗̥̖̙̑̈́̈́̿̾̌͌͊̈̀͊̑̇̑͌̍̅̌͊͜͠͝ǫ̴̢̧͎͔̞̤̻̱̮̬͕̗̭͍̣̠̳̆̀̋̓͒̾̄͜͝͠͝T̴̛͉̬̋́̈́̈́̓͗̍̏̔̈̋̀͐̓̎̅̚̕͠͝͝ê̵̖̖̈͊̔̓̔̐̓̃͊̋͋̎͂͋̕x̴̡͎͖̼͎̦͎̲̪̘̺̯̲̬̮̥̼̰͌̃͜ͅt̶̛̘͎̰̔͐͌́̈́͊͗͌̅͂̐̆͌̀͂͘" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ------------------------ |
|
|
} |
|
|
|
|
|
proc test_zalgo3 {} { |
|
|
|
|
|
# ------------------------ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
set str "Ẕ̸̢̼̺̜̰̣̣̖̭̜͓͖̺̥̼̠͙͙̟̥̟̠̤̫͍̠̤̮͚̝̜̙͈̦̙̩̹̙̜̩̦͔͕̈̃̅̇̈́͂̇̽̈́́́́̎͊̂̑̋͆̔̾͋̚͜ͅã̵̛̪̽̎̃͑̒̇͋͂̽̃̍͊̀̈̊̊̔̊̆̈́͗͑͗̽̄̋͗̄͌̑͊͝͝͠ͅl̵̢͇̖͉͖̝̹̜̞͓͎͍͈̞̱̙͙̦̪͔̱̮͈͉̼͎̭̝̯͇͚̺̟̱͙̳̰̙͚͖̝̫͙̎̅̃͆̈́̋̌̔̋̋͗̈́̔̐͆̈́̓̾̄̀́̏́͒̆̌͒̈́̈́̾̏̀͜͝g̸̖͂͋̊͗̈́̓̆̈̋̒͐̕o̶̧̢͓̲̗̠̘͕̦̤̹̗͉̰͔̯͓̭̹̻͔͇̯̜̙̍̂̃̃̀̓͌̒͊̊̋̒̿̿̌͐̄͗̾̕͝T̶̛̳̜̰͍̹̻̫̠̱̼̼̙̆̑̾̾͛́́̿͋͌̎̀̀̽̆͌̽̂̈̇̅̇̃́̿͗̾͒̎͊̑̾͝͠ȩ̸̢̨̛̛̛̭͔͎͇̫͎͈̲̻̙͕͎̣͈̩̺̗͖͙͇͌͐̒̎͐̓́̉́͐̓̐͌̅̊͋͑̈́͗͑̏̕͜͜͝ͅx̸̧̧̛̖̣̥̘͎͎̳̭̙̦̝͖̝̮̱̹̯̺̙̩̯̮͖̻̰͓̰̩͇̥̑͌̊̐̉̏͆̓̑̎̊̓͒̂̄̆͆̀̊̄̈̽͐͛̏͊̓̌͑́̎̈́̍̈́̊͗̉̋͆̿̊͘͜͜͝͝ͅͅͅt̵̡̨̜̘͎̦͚̠̗̺͉̼̠̲̠͙̺̹̗̲̏̈́̂̚͜͜͝ͅ" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ------------------------ |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
proc test_farmer {} { |
|
|
#an interesting article re grapheme clustering problems in terminals https://mitchellh.com/writing/grapheme-clusters-in-terminals |
|
|
#(similar to the problem with grave accent rendering width that the test_grave proc is written for) |
|
|
# -- --- --- --- --- |
|
|
#These pasted glyphs can display in console even when the unicode versions don't (tcl 8.6 limited to 65533/FFFD ?) |
|
|
upvar farmer1_paste test_farmer1 |
|
|
upvar farmer2_paste test_farmer2 |
|
|
set test_farmer1 🧑🌾 ;#contains zero-width joiner between |
|
|
set test_farmer2 🧑🌾 |
|
|
puts "pasted farmer1 exporting as var farmer1_paste: $test_farmer1" |
|
|
puts "pasted farmer2 exporting as var farmer2_paste: $test_farmer2" |
|
|
# -- --- --- --- --- |
|
|
|
|
|
|
|
|
set farmer1 "\U0001f9d1\U0000200d\U0001f33e" |
|
|
set farmer2 "\U0001f9d1\U0001f33e" |
|
|
puts stdout "farmer1 with zero-width joiner, codes: \\U0001f9d1\\U0000200d\\U0001f33e : $farmer1" |
|
|
puts stdout "farmer2 with no joiner codes: \\U0001f9d1\\U001f33e : $farmer2" |
|
|
|
|
|
package require punk::console |
|
|
puts stdout \n |
|
|
puts stdout "#2--5---9---C---" |
|
|
puts -nonewline stdout \033\[2K\033\[1G ;#2K erase line 1G cursor at col1 |
|
|
puts -nonewline "${farmer1}";set cursorposn [punk::console::get_cursor_pos_list] |
|
|
puts stdout "\ncursor position immediately after outputing farmer1 (expecting 1 glyph 2 wide) : cursor at col [lindex $cursorposn 1]" |
|
|
if {[lindex $cursorposn 1] eq "3"} { |
|
|
puts stdout "[a+ green]OK[a]" |
|
|
} else { |
|
|
puts stdout "[a+ red]ERR - expected cursor position to be 3 after emitting farmer1[a]" |
|
|
} |
|
|
puts stdout "----------------" |
|
|
puts stdout "#2--5---9---C---" |
|
|
puts -nonewline "${farmer2}";set cursorposn [punk::console::get_cursor_pos_list] |
|
|
puts stdout "\ncursor position immediately after outputing farmer2 (expecting 2 glyphs 4 wide in total): cursor at col [lindex $cursorposn 1]" |
|
|
if {[lindex $cursorposn 1] eq "5"} { |
|
|
puts stdout "[a+ green]OK[a]" |
|
|
} else { |
|
|
puts stdout "[a+ red]ERR - expected cursor position to be 5 after emitting farmer2[a]" |
|
|
} |
|
|
puts stdout "----------------" |
|
|
|
|
|
puts "returning farmer1 - should be single glyph" |
|
|
return $farmer1 |
|
|
} |
|
|
|
|
|
#G0 Sets Sequence G1 Sets Sequence Meaning |
|
|
#ESC ( A ESC ) A United Kingdom Set |
|
|
#ESC ( B ESC ) B ASCII Set |
|
|
#ESC ( 0 ESC ) 0 Special Graphics |
|
|
#ESC ( 1 ESC ) 1 Alternate Character ROM Standard Character Set |
|
|
#ESC ( 2 ESC ) 2 Alternate Character ROM Special Graphic |
|
|
|
|
|
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
# Unicode character sets - some hardcoded - some loadable from data files |
|
|
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
variable charinfo [tcl::dict::create] |
|
|
variable charsets [tcl::dict::create] |
|
|
|
|
|
|
|
|
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
# Aggregate character sets (ones that pick various ranges from underlying unicode ranges) |
|
|
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
tcl::dict::set charsets "WGL4" [list altname "Windows Glyph List 4" ranges [list\ |
|
|
{start 0 end 127 name "basic latin"}\ |
|
|
{start 128 end 255 name "latin-1 supplement"}\ |
|
|
{start 256 end 383 name "Latin Extended-A"}\ |
|
|
{start 402 end 402 name "subset Latin Extended-B"}\ |
|
|
{start 506 end 511 name "subset Latin Extended-B"}\ |
|
|
{start 710 end 711 name "subset Spacing Modifier Letters"}\ |
|
|
{start 713 end 713 name "subset Spacing Modifier Letters"}\ |
|
|
{start 728 end 733 name "subset Spacing Modifier Letters"}\ |
|
|
{start 900 end 906 name "subset Greek"}\ |
|
|
{start 908 end 908 name "subset Greek"}\ |
|
|
{start 910 end 974 name "subset Greek"}\ |
|
|
{start 1024 end 1119 name "subset Cyrillic"}\ |
|
|
{start 1168 end 1169 name "subset Cyrillic"}\ |
|
|
{start 7808 end 7813 name "subset Latin Extended Additional"}\ |
|
|
{start 7922 end 7923 name "subset Latin Extended Additional"}\ |
|
|
{start 8211 end 8213 name "subset General Punctuation"}\ |
|
|
{start 8215 end 8222 name "subset General Punctuation"}\ |
|
|
{start 8224 end 8226 name "subset General Punctuation"}\ |
|
|
{start 8230 end 8230 name "subset General Punctuation"}\ |
|
|
{start 8240 end 8240 name "subset General Punctuation"}\ |
|
|
{start 8242 end 8243 name "subset General Punctuation"}\ |
|
|
] description "Microsoft WGL4 Repertoire" settype "other"] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
#The base page 0-256 8bit range - values don't have specific characters or descriptions - as they are codepage dependent |
|
|
#we will fill this here for completeness - but with placeholders |
|
|
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
tcl::dict::set charsets "8bit" [list ranges [list {start 0 end 127 name ascii} {start 128 end 255 name 8bit-ascii}] description "8bit extended ascii range" settype "other"] |
|
|
for {set i 0} {$i < 256} {incr i} { |
|
|
tcl::dict::set charinfo $i [list desc "codepage-dependent" short "byte_[format %02x $i]"] |
|
|
} |
|
|
|
|
|
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
# Unicode ranges |
|
|
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
tcl::dict::set charsets "greek" [list ranges [list {start 880 end 1023} {start 7936 end 8191}] description "Greek and Coptic" settype "other"] |
|
|
|
|
|
|
|
|
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
tcl::dict::set charsets "Block Elements" [list ranges [list {start 9600 end 9631}] description "Block Elements" settype "other"] |
|
|
tcl::dict::set charinfo 9600 [list desc "Upper Half Block" short "blocke_up_half"] |
|
|
tcl::dict::set charinfo 9601 [list desc "Lower One Eighth Block" short "blocke_lw_1_8th"] |
|
|
tcl::dict::set charinfo 9602 [list desc "Lower One Quarter Block" short "blocke_lw_1_qtr"] |
|
|
|
|
|
|
|
|
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
tcl::dict::set charsets "Dingbats" [list ranges [list {start 9984 end 10175 }] description "Unicode Dingbats" settype "tcl_supplemented"] |
|
|
tcl::dict::set charinfo 9984 [list desc "Black Safety Scissors" short "dingbats_black_safety_scissors"] |
|
|
#... |
|
|
tcl::dict::set charinfo 10175 [list desc "Double Curly Loop" short "dingbats_double_curly_loop"] |
|
|
|
|
|
|
|
|
|
|
|
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
#variation selectors 0xFe01 - 0xFE0F |
|
|
tcl::dict::set charsets "Variation Selectors" [list ranges [list {start 65024 end 65039}] description "Variation Selectors" note "combining character with previous char - variant glyph display" settype "tcl_supplemented"] |
|
|
tcl::dict::set charinfo 65024 [list desc "Variation Selector-1" short "VS1"] |
|
|
tcl::dict::set charinfo 65025 [list desc "Variation Selector-2" short "VS2"] |
|
|
tcl::dict::set charinfo 65026 [list desc "Variation Selector-3" short "VS3"] |
|
|
tcl::dict::set charinfo 65027 [list desc "Variation Selector-4" short "VS4"] |
|
|
tcl::dict::set charinfo 65027 [list desc "Variation Selector-5" short "VS5"] |
|
|
tcl::dict::set charinfo 65029 [list desc "Variation Selector-6" short "VS6"] |
|
|
tcl::dict::set charinfo 65030 [list desc "Variation Selector-7" short "VS7"] |
|
|
tcl::dict::set charinfo 65031 [list desc "Variation Selector-8" short "VS8"] |
|
|
tcl::dict::set charinfo 65032 [list desc "Variation Selector-9" short "VS9"] |
|
|
tcl::dict::set charinfo 65033 [list desc "Variation Selector-10" short "VS10"] |
|
|
tcl::dict::set charinfo 65034 [list desc "Variation Selector-11" short "VS11"] |
|
|
tcl::dict::set charinfo 65035 [list desc "Variation Selector-12" short "VS12"] |
|
|
tcl::dict::set charinfo 65036 [list desc "Variation Selector-13" short "VS13"] |
|
|
tcl::dict::set charinfo 65037 [list desc "Variation Selector-14" short "VS14"] |
|
|
tcl::dict::set charinfo 65038 [list desc "Variation Selector-15 text variation" short "VS15"] ;#still an image - just more suitable for text-presentation e.g word-processing doc |
|
|
tcl::dict::set charinfo 65039 [list desc "Variation Selector-16 emoji variation" short "VS16"] |
|
|
|
|
|
|
|
|
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
# emoticons https://www.unicode.org/charts/PDF/U1F600.pdf |
|
|
tcl::dict::set charsets "Emoticons" [list ranges [list {start 128512 end 128591}] description "Emoticons" settype "tcl_supplemented"] |
|
|
tcl::dict::set charinfo 128512 [list desc "Grinning Face" short "emoticon_gface"] |
|
|
tcl::dict::set charinfo 128513 [list desc "Grinning Face with Smiling Eyes" short "emoticon_gface_smile_eyes"] |
|
|
tcl::dict::set charinfo 128514 [list desc "Face with Tears of Joy" short "emoticon_face_tears_joy"] |
|
|
|
|
|
#todo |
|
|
tcl::dict::set charinfo 128590 [list desc "Person with Pouting Face" short "emoticon_person_pout"] |
|
|
|
|
|
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
tcl::dict::set charsets "Box Drawing" [list ranges [list {start 9472 end 9599}] description "Box Drawing" settype "tcl_supplemented"] |
|
|
tcl::dict::set charinfo 9472 [list desc "Box Drawings Light Horizontal" short "boxd_lhz"] |
|
|
tcl::dict::set charinfo 9473 [list desc "Box Drawings Heavy Horizontal" short "boxd_hhz"] |
|
|
tcl::dict::set charinfo 9474 [list desc "Box Drawings Light Vertical" short "boxd_lv"] |
|
|
tcl::dict::set charinfo 9475 [list desc "Box Drawings Heavy Vertical" short "boxd_hv"] |
|
|
tcl::dict::set charinfo 9476 [list desc "Box Drawings Light Triple Dash Horizontal" short "boxd_ltdshhz"] |
|
|
tcl::dict::set charinfo 9477 [list desc "Box Drawings Heavy Triple Dash Horizontal" short "boxd_htdshhz"] |
|
|
tcl::dict::set charinfo 9478 [list desc "Box Drawings Light Triple Dash Vertical" short "boxd_ltdshv"] |
|
|
tcl::dict::set charinfo 9479 [list desc "Box Drawings Heavy Triple Dash Vertical" short "boxd_htdshv"] |
|
|
tcl::dict::set charinfo 9480 [list desc "Box Drawings Light Quadruple Dash Horizontal" short "boxd_lqdshhz"] |
|
|
tcl::dict::set charinfo 9481 [list desc "Box Drawings Heavy Quadruple Dash Horizontal" short "boxd_hqdshhz"] |
|
|
tcl::dict::set charinfo 9482 [list desc "Box Drawings Light Quadruple Dash Vertical" short "boxd_lqdshv"] |
|
|
tcl::dict::set charinfo 9483 [list desc "Box Drawings Heavy Quadruple Dash Vertical" short "boxd_hqdshv"] |
|
|
tcl::dict::set charinfo 9484 [list desc "Box Drawings Light Down and Right" short "boxd_ldr"] |
|
|
tcl::dict::set charinfo 9485 [list desc "Box Drawings Down Light and Right Heavy" short "boxd_dlrh"] |
|
|
tcl::dict::set charinfo 9486 [list desc "Box Drawings Down Heavy and Right Light" short "boxd_dhrl"] |
|
|
tcl::dict::set charinfo 9487 [list desc "Box Drawings Heavy Down and Right" short "boxd_hdr"] |
|
|
tcl::dict::set charinfo 9488 [list desc "Box Drawings Light Down and Left" short "boxd_ldl"] |
|
|
tcl::dict::set charinfo 9489 [list desc "Box Drawings Down Light and Left Heavy" short "boxd_dllh"] |
|
|
tcl::dict::set charinfo 9490 [list desc "Box Drawings Down Heavy and Left Light" short "boxd_dhll"] |
|
|
tcl::dict::set charinfo 9491 [list desc "Box Drawings Heavy Down and Left" short "boxd_hdl"] |
|
|
tcl::dict::set charinfo 9492 [list desc "Box Drawings Light Up and Right" short "boxd_lur"] |
|
|
tcl::dict::set charinfo 9493 [list desc "Box Drawings Up Light and Right Heavy" short "boxd_ulrh"] |
|
|
tcl::dict::set charinfo 9494 [list desc "Box Drawings Up Heavy and Right Light" short "boxd_uhrl"] |
|
|
tcl::dict::set charinfo 9495 [list desc "Box Drawings Heavy Up and Right" short "boxd_hur"] |
|
|
tcl::dict::set charinfo 9496 [list desc "Box Drawings Light Up and Left" short "boxd_lul"] |
|
|
tcl::dict::set charinfo 9497 [list desc "Box Drawings Up Light and Left Heavy" short "boxd_ullh"] |
|
|
tcl::dict::set charinfo 9498 [list desc "Box Drawings Up Heavy and Left Light" short "boxd_uhll"] |
|
|
tcl::dict::set charinfo 9499 [list desc "Box Drawings Heavy Up and Left" short "boxd_hul"] |
|
|
tcl::dict::set charinfo 9500 [list desc "Box Drawings Light Vertical and Right" short "boxd_lvr"] |
|
|
tcl::dict::set charinfo 9501 [list desc "Box Drawings Vertical Light and Right Heavy" short "boxd_vlrh"] |
|
|
tcl::dict::set charinfo 9502 [list desc "Box Drawings Up Heavy and Right Down Light" short "boxd_uhrdl"] |
|
|
tcl::dict::set charinfo 9503 [list desc "Box Drawings Down Heavy and Right Up Light" short "boxd_dhrul"] |
|
|
tcl::dict::set charinfo 9504 [list desc "Box Drawings Vertical Heavy and Right Light" short "boxd_vhrl"] |
|
|
tcl::dict::set charinfo 9505 [list desc "Box Drawings Down Light and Right Up Heavy" short "boxd_dlruh"] |
|
|
tcl::dict::set charinfo 9506 [list desc "Box Drawings Up Light and Right Down Heavy" short "boxd_ulrdh"] |
|
|
tcl::dict::set charinfo 9507 [list desc "Box Drawings Heavy Vertical and Right" short "boxd_hvr"] |
|
|
tcl::dict::set charinfo 9508 [list desc "Box Drawings Light Vertical and Left" short "boxd_lvl"] |
|
|
tcl::dict::set charinfo 9509 [list desc "Box Drawings Vertical Light and Left Heavy" short "boxd_vllh"] |
|
|
tcl::dict::set charinfo 9510 [list desc "Box Drawings Up Heavy and Let Down Light" short "boxd_uhldl"] |
|
|
tcl::dict::set charinfo 9511 [list desc "Box Drawings Down Heavy and Left Up Light" short "boxd_dhlul"] |
|
|
tcl::dict::set charinfo 9512 [list desc "Box Drawings Vertical Heavy and Left Light" short "boxd_vhll"] |
|
|
tcl::dict::set charinfo 9513 [list desc "Box Drawings Down Light and left Up Heavy" short "boxd_dlluh"] |
|
|
tcl::dict::set charinfo 9514 [list desc "Box Drawings Up Light and Left Down Heavy" short "boxd_ulldh"] |
|
|
tcl::dict::set charinfo 9515 [list desc "Box Drawings Heavy Vertical and Left" short "boxd_hvl"] |
|
|
tcl::dict::set charinfo 9516 [list desc "Box Drawings Light Down and Horizontal" short "boxd_ldhz"] |
|
|
tcl::dict::set charinfo 9517 [list desc "Box Drawings Left Heavy and Right Down Light" short "boxd_lhrdl"] |
|
|
tcl::dict::set charinfo 9518 [list desc "Box Drawings Right Heavy and Left Down Light" short "boxd_rhldl"] |
|
|
tcl::dict::set charinfo 9519 [list desc "Box Drawings Down Light and Horizontal Heavy" short "boxd_dlhzh"] |
|
|
tcl::dict::set charinfo 9520 [list desc "Box Drawings Down Heavy and Horizontal Light" short "boxd_dhhzl"] |
|
|
tcl::dict::set charinfo 9521 [list desc "Box Drawings Right Light and Left Down Heavy" short "boxd_rlldh"] |
|
|
tcl::dict::set charinfo 9522 [list desc "Box Drawings Left Light and Right Down Heavy" short "boxd_llrdh"] |
|
|
tcl::dict::set charinfo 9523 [list desc "Box Drawings Heavy Down and Horizontal" short "boxd_hdhz"] |
|
|
tcl::dict::set charinfo 9524 [list desc "Box Drawings Light Up and Horizontal" short "boxd_luhz"] |
|
|
tcl::dict::set charinfo 9525 [list desc "Box Drawings Left Heavy and Right Up Light" short "boxd_lhrul"] |
|
|
tcl::dict::set charinfo 9526 [list desc "Box Drawings Right Heavy and Left Up Light" short "boxd_rhlul"] |
|
|
tcl::dict::set charinfo 9527 [list desc "Box Drawings Up Light and Horizontal Heavy" short "boxd_ulhzh"] |
|
|
tcl::dict::set charinfo 9528 [list desc "Box Drawings Up Heavy and Horizontal Light" short "boxd_uhhzl"] |
|
|
tcl::dict::set charinfo 9529 [list desc "Box Drawings Right Light and Left Up Heavy" short "boxd_rlluh"] |
|
|
tcl::dict::set charinfo 9530 [list desc "Box Drawings Left Light and Right Up Heavy" short "boxd_llruh"] |
|
|
tcl::dict::set charinfo 9531 [list desc "Box Drawings Heavy Up and Horizontal" short "boxd_huhz"] |
|
|
tcl::dict::set charinfo 9532 [list desc "Box Drawings Light Vertical and Horizontal" short "boxd_lvhz"] |
|
|
tcl::dict::set charinfo 9533 [list desc "Box Drawings Left Heavy and Right Vertical Light" short "boxd_lhrvl"] |
|
|
tcl::dict::set charinfo 9534 [list desc "Box Drawings Right Heavy and Left Vertical Light" short "boxd_rhlvl"] |
|
|
tcl::dict::set charinfo 9535 [list desc "Box Drawings Vertical Light and Horizontal Heavy" short "boxd_vlhzh"] |
|
|
tcl::dict::set charinfo 9536 [list desc "Box Drawings Up Heavy and Down Horizontal Light" short "boxd_uhdhzl"] |
|
|
tcl::dict::set charinfo 9537 [list desc "Box Drawings Down Heavy and Up Horizontal Light" short "boxd_dhuhzl"] |
|
|
tcl::dict::set charinfo 9538 [list desc "Box Drawings Vertical Heavy and Horizontal Light" short "boxd_vhhzl"] |
|
|
tcl::dict::set charinfo 9539 [list desc "Box Drawings Left Up Heavy and Right Down Light" short "boxd_luhrdl"] |
|
|
tcl::dict::set charinfo 9540 [list desc "Box Drawings Right Up Heavy and Left Down Light" short "boxd_ruhldl"] |
|
|
tcl::dict::set charinfo 9541 [list desc "Box Drawings Left Down Heavy and Right Up Light" short "boxd_ldhrul"] |
|
|
tcl::dict::set charinfo 9542 [list desc "Box Drawings Right Down Heavy and Left Up Light" short "boxd_rdhlul"] |
|
|
tcl::dict::set charinfo 9543 [list desc "Box Drawings Down Light and Up Horizontal Heavy" short "boxd_dluhzh"] |
|
|
tcl::dict::set charinfo 9544 [list desc "Box Drawings Up Light and Down Horizontal Heavy" short "boxd_dldhzh"] |
|
|
tcl::dict::set charinfo 9545 [list desc "Box Drawings Right Light and Left Vertical Heavy" short "boxd_rllvh"] |
|
|
tcl::dict::set charinfo 9546 [list desc "Box Drawings Left Light and Right Vertical Heavy" short "boxd_llrvh"] |
|
|
tcl::dict::set charinfo 9547 [list desc "Box Drawings Heavy Vertical and Horizontal" short "boxd_hvhz"] |
|
|
tcl::dict::set charinfo 9548 [list desc "Box Drawings Light Double Dash Horizontal" short "boxd_lddshhz"] |
|
|
tcl::dict::set charinfo 9549 [list desc "Box Drawings Heavy Double Dash Horizontal" short "boxd_hddshhz"] |
|
|
tcl::dict::set charinfo 9550 [list desc "Box Drawings Light Double Dash Vertical" short "boxd_lddshv"] |
|
|
tcl::dict::set charinfo 9551 [list desc "Box Drawings Heavy Double Dash Vertical" short "boxd_hddshv"] |
|
|
tcl::dict::set charinfo 9552 [list desc "Box Drawings Double Horizontal" short "boxd_dhz"] |
|
|
tcl::dict::set charinfo 9553 [list desc "Box Drawings Double Vertical" short "boxd_dv"] |
|
|
tcl::dict::set charinfo 9554 [list desc "Box Drawings Down Single and Right Double" short "boxd_dsrd"] |
|
|
tcl::dict::set charinfo 9555 [list desc "Box Drawings Down Double and Right Single" short "boxd_ddrs"] |
|
|
tcl::dict::set charinfo 9556 [list desc "Box Drawings Double Down and Right" short "boxd_ddr"] |
|
|
tcl::dict::set charinfo 9557 [list desc "Box Drawings Down Single and Left Double" short "boxd_dsld"] |
|
|
tcl::dict::set charinfo 9558 [list desc "Box Drawings Down Double and Left Single" short "boxd_ddls"] |
|
|
tcl::dict::set charinfo 9559 [list desc "Box Drawings Double Down and Left" short "boxd_ddl"] |
|
|
tcl::dict::set charinfo 9560 [list desc "Box Drawings Up Single and Right Double" short "boxd_usrd"] |
|
|
tcl::dict::set charinfo 9561 [list desc "Box Drawings Up Double and Right Single" short "boxd_udrs"] |
|
|
tcl::dict::set charinfo 9562 [list desc "Box Drawings Double Up and Right" short "boxd_dur"] |
|
|
tcl::dict::set charinfo 9563 [list desc "Box Drawings Up Single and Left Double" short "boxd_usld"] |
|
|
tcl::dict::set charinfo 9564 [list desc "Box Drawings Up Double and Left Single" short "boxd_udls"] |
|
|
tcl::dict::set charinfo 9565 [list desc "Box Drawings Double Up and Left" short "boxd_dul"] |
|
|
tcl::dict::set charinfo 9566 [list desc "Box Drawings Vertical Single and Right Double" short "boxd_vsrd"] |
|
|
tcl::dict::set charinfo 9567 [list desc "Box Drawings Vertical Double and Right Single" short "boxd_vdrs"] |
|
|
tcl::dict::set charinfo 9568 [list desc "Box Drawings Double Vertical and Right" short "boxd_dvr"] |
|
|
tcl::dict::set charinfo 9569 [list desc "Box Drawings Vertical Single and Left Double" short "boxd_vsld"] |
|
|
tcl::dict::set charinfo 9570 [list desc "Box Drawings Vertical Double and Left Single" short "boxd_vdls"] |
|
|
tcl::dict::set charinfo 9571 [list desc "Box Drawings Double Vertical and Left" short "boxd_dvl"] |
|
|
tcl::dict::set charinfo 9572 [list desc "Box Drawings Down Single and Horizontal Double" short "boxd_dshzd"] |
|
|
tcl::dict::set charinfo 9573 [list desc "Box Drawings Down Double and Horizontal Single" short "boxd_ddhzs"] |
|
|
tcl::dict::set charinfo 9574 [list desc "Box Drawings Double Down and Horizontal" short "boxd_ddhz"] |
|
|
tcl::dict::set charinfo 9575 [list desc "Box Drawings Up Single and Horizontal Double" short "boxd_ushzd"] |
|
|
tcl::dict::set charinfo 9576 [list desc "Box Drawings Up Double and Horizontal Single" short "boxd_udhzs"] |
|
|
tcl::dict::set charinfo 9577 [list desc "Box Drawings Double Up and Horizontal" short "boxd_duhz"] |
|
|
tcl::dict::set charinfo 9578 [list desc "Box Drawings Vertical Single and Horizontal Double" short "boxd_vshzd"] |
|
|
tcl::dict::set charinfo 9579 [list desc "Box Drawings Vertical Double and Horizontal Single" short "boxd_vdhzs"] |
|
|
tcl::dict::set charinfo 9580 [list desc "Box Drawings Double Vertical and Horizontal" short "boxd_dvhz"] |
|
|
tcl::dict::set charinfo 9581 [list desc "Box Drawings Light Arc Down and Right" short "boxd_ladr"] |
|
|
tcl::dict::set charinfo 9582 [list desc "Box Drawings Light Arc Down and Left" short "boxd_ladl"] |
|
|
tcl::dict::set charinfo 9583 [list desc "Box Drawings Light Arc Up and Left" short "boxd_laul"] |
|
|
tcl::dict::set charinfo 9584 [list desc "Box Drawings Light Arc Up and Right" short "boxd_laur"] |
|
|
tcl::dict::set charinfo 9585 [list desc "Box Drawings Light Diagonal Upper Right To Lower Left" short "boxd_ldgurll"] |
|
|
tcl::dict::set charinfo 9586 [list desc "Box Drawings Light Diagonal Upper Left To Lower Right" short "boxd_ldgullr"] |
|
|
tcl::dict::set charinfo 9587 [list desc "Box Drawings Light Diagonal Cross" short "boxd_ldc"] |
|
|
tcl::dict::set charinfo 9588 [list desc "Box Drawings Light Left" short "boxd_ll"] |
|
|
tcl::dict::set charinfo 9589 [list desc "Box Drawings Light Up" short "boxd_lu"] |
|
|
tcl::dict::set charinfo 9590 [list desc "Box Drawings Light Right" short "boxd_lr"] |
|
|
tcl::dict::set charinfo 9591 [list desc "Box Drawings Light Down" short "boxd_ld"] |
|
|
tcl::dict::set charinfo 9592 [list desc "Box Drawings Heavy Left" short "boxd_hl"] |
|
|
tcl::dict::set charinfo 9593 [list desc "Box Drawings Heavy Up" short "boxd_hu"] |
|
|
tcl::dict::set charinfo 9594 [list desc "Box Drawings Heavy Right" short "boxd_hr"] |
|
|
tcl::dict::set charinfo 9595 [list desc "Box Drawings Heavy Down" short "boxd_hd"] |
|
|
tcl::dict::set charinfo 9596 [list desc "Box Drawings Light Left and Heavy Right" short "boxd_llhr"] |
|
|
tcl::dict::set charinfo 9597 [list desc "Box Drawings Light Up and Heavy Down" short "boxd_luhd"] |
|
|
tcl::dict::set charinfo 9598 [list desc "Box Drawings Heavy Left and Light Right" short "boxd_hllr"] |
|
|
tcl::dict::set charinfo 9599 [list desc "Box Drawings Heavy Up and Light Down" short "boxd_huld"] |
|
|
|
|
|
|
|
|
tcl::dict::set charsets "Halfwidth and Fullwidth Forms" [list ranges [list {start 65280 end 65519}] description "Halfwidth and Fullwidth Forms (variants)" settype "tcl_supplemental"] |
|
|
tcl::dict::set charsets "ascii_fullwidth" [list ranges [list {start 65281 end 65374}] description "Ascii 21 to 7E fullwidth" parentblock "halfwidth_and_fullwidth_forms" settype "other"] |
|
|
|
|
|
tcl::dict::set charsets "Specials" [list ranges [list {start 65520 end 65535}] description "Specials" settype "tcl_supplemental"] |
|
|
|
|
|
tcl::dict::set charsets "noncharacters" [list ranges [list\ |
|
|
{start 64976 end 65007 note "BMP FDD0..FDEF"}\ |
|
|
{start 65534 end 65535 note "BMP FFFE,FFFF"}\ |
|
|
{start 131070 end 131071 note "plane1 1FFFE,1FFFF"}\ |
|
|
{start 196606 end 196607 note "plane2 2FFFE,2FFFF"}\ |
|
|
{start 262142 end 262143 note "plane3 3FFFE,3FFFF"}\ |
|
|
{start 327678 end 327679 note "plane4 4FFFE,4FFFF"}\ |
|
|
{start 393214 end 393215 note "plane5 5FFFE,5FFFF"}\ |
|
|
{start 458750 end 458751 note "plane6 6FFFE,6FFFF"}\ |
|
|
{start 524286 end 524287 note "plane7 7FFFE,7FFFF"}\ |
|
|
{start 589822 end 589823 note "plane8 8FFFE,8FFFF"}\ |
|
|
{start 655358 end 655359 note "plane9 9FFFE,9FFFF"}\ |
|
|
{start 720894 end 720895 note "plane10 AFFFE,AFFFF"}\ |
|
|
{start 786430 end 786431 note "plane11 BFFFE,BFFFF"}\ |
|
|
{start 851966 end 851967 note "plane12 CFFFE,CFFFF"}\ |
|
|
{start 917502 end 917503 note "plane13 DFFFE,DFFFF"}\ |
|
|
{start 983038 end 983039 note "plane14 EFFFE,EFFFF"}\ |
|
|
{start 1048574 end 1048575 note "plane15 FFFFE,FFFFF"}\ |
|
|
{start 1114110 end 1114111 note "plane16 10FFFE,10FFFF"}\ |
|
|
] description "non-characters" settype "tcl_supplemental"] |
|
|
|
|
|
#build dicts keyed on short |
|
|
variable charshort |
|
|
proc _build_charshort {} { |
|
|
variable charshort |
|
|
set charshort [tcl::dict::create] |
|
|
variable charinfo |
|
|
tcl::dict::for {k v} $charinfo { |
|
|
if {[tcl::dict::exists $v short]} { |
|
|
set sh [tcl::dict::get $v short] |
|
|
if {[tcl::dict::exists $charshort $sh]} { |
|
|
puts stderr "_build_charshort WARNING character data load duplicate shortcode '$sh'" |
|
|
} |
|
|
tcl::dict::set charshort $sh [format %c $k] |
|
|
} |
|
|
} |
|
|
return [tcl::dict::size $charshort] |
|
|
} |
|
|
_build_charshort |
|
|
|
|
|
variable charset_extents_startpoints ;#stores endpoints associated with each startpoint - but named after key which is startpoint. |
|
|
variable charset_extents_endpoints ;#stores startpoints assoicated with each endpoint - but named after key which is endpoint. |
|
|
variable charset_extents_rangenames ;# dict keyed on start,end pointing to list of 2-tuples {setname rangeindex_within_set} |
|
|
#build 2 indexes for each range.(charsets are not just unicode blocks so can have multiple ranges) |
|
|
#Note that a range could be as small as a single char (startpoint = endpoint) so there can be many ranges with same start and end if charsets use some of the same subsets. |
|
|
#as each charset_extents_startpoins,charset_extents_endpoints is built - the associated range name and index is appended to the rangenames dict |
|
|
#startpoints - key is startpoint of a range, value is list of endpoints one for each range starting at this startpoint-key |
|
|
#endpoints - key is endpoint of a range, value is list of startpoints one for each range ending at this endpoint-key |
|
|
proc _build_charset_extents {} { |
|
|
variable charsets |
|
|
variable charset_extents_startpoints |
|
|
variable charset_extents_endpoints |
|
|
variable charset_extents_rangenames |
|
|
set charset_extents_startpoints [tcl::dict::create] |
|
|
set charset_extents_endpoints [tcl::dict::create] |
|
|
set charset_extents_rangenames [tcl::dict::create] |
|
|
tcl::dict::for {setname setinfo} $charsets { |
|
|
set ranges [tcl::dict::get $setinfo ranges] |
|
|
if {[tcl::dict::get $setinfo settype] eq "block"} { |
|
|
#unicode block must have a single range |
|
|
#we consider a char a member of the block even if unassigned/reserved (as per unicode documentation) |
|
|
set start [tcl::dict::get [lindex $ranges 0] start] |
|
|
set end [tcl::dict::get [lindex $ranges 0] end] |
|
|
if {![tcl::dict::exists $charset_extents_startpoints $start] || $end ni [tcl::dict::get $charset_extents_startpoints $start]} { |
|
|
#assertion if end wasn't in startpoits list - then start won't be in endpoints list |
|
|
tcl::dict::lappend charset_extents_startpoints $start $end |
|
|
tcl::dict::lappend charset_extents_endpoints $end $start |
|
|
} |
|
|
tcl::dict::lappend charset_extents_rangenames ${start},${end} [list $setname 1] |
|
|
} else { |
|
|
#multirange sets/scripts. have holes. Char not a member if it's not explicitly in a defined range. |
|
|
#They should be in order within a set - but we don't assume so |
|
|
set r 1 |
|
|
foreach range $ranges { |
|
|
set start [tcl::dict::get $range start] |
|
|
set end [tcl::dict::get $range end] |
|
|
if {![tcl::dict::exists $charset_extents_startpoints $start] || $end ni [tcl::dict::get $charset_extents_startpoints $start]} { |
|
|
#assertion if end wasn't in startpoits list - then start won't be in endpoints list |
|
|
tcl::dict::lappend charset_extents_startpoints $start $end |
|
|
tcl::dict::lappend charset_extents_endpoints $end $start |
|
|
} |
|
|
tcl::dict::lappend charset_extents_rangenames ${start},${end} [list $setname $r] |
|
|
incr r |
|
|
} |
|
|
} |
|
|
} |
|
|
#maintain in sorted order |
|
|
#-stride is available in lsort even at tcl8.6 - but not in lsearch |
|
|
set charset_extents_startpoints [lsort -stride 2 -integer $charset_extents_startpoints] |
|
|
set charset_extents_endpoints [lsort -stride 2 -integer $charset_extents_endpoints] |
|
|
#no need to sort charset_extents_rangenames - lookup only done using dict methods |
|
|
return [tcl::dict::size $charset_extents_startpoints] |
|
|
} |
|
|
_build_charset_extents ;#rebuilds for all charsets |
|
|
|
|
|
#nerdfonts are within the Private use E000 - F8FF range |
|
|
proc load_nerdfonts {} { |
|
|
variable charsets |
|
|
variable charinfo |
|
|
package require fileutil |
|
|
set ver [package provide punk::char] |
|
|
if {$ver ne ""} { |
|
|
set ifneeded [package ifneeded punk::char [package provide punk::char]] |
|
|
#puts stderr "punk::char ifneeded script: $ifneeded" |
|
|
lassign [split $ifneeded ";"] _ sourceinfo |
|
|
set basedir [file dirname [lindex $sourceinfo end]] |
|
|
} else { |
|
|
#review - will only work at package load time |
|
|
set scr [info script] |
|
|
if {$scr eq ""} { |
|
|
error "load_nerdfonts unable to determine package folder" |
|
|
} |
|
|
set basedir [file dirname [info script]] |
|
|
} |
|
|
set pkg_data_dir [file join $basedir char] |
|
|
set fname [file join $pkg_data_dir nerd-fonts-glyph-list.txt] |
|
|
if {[file exists $fname]} { |
|
|
#puts stderr "load_nerdfonts loading $fname" |
|
|
set data [fileutil::cat -translation binary $fname] |
|
|
set short_seen [tcl::dict::create] |
|
|
set current_set_range [tcl::dict::create] |
|
|
set filesets_loading [list] |
|
|
foreach ln [split $data \n] { |
|
|
set ln [tcl::string::trim $ln] |
|
|
if {$ln eq ""} {continue} |
|
|
set desc [lassign $ln hex rawsetname] |
|
|
set hexnum 0x$hex |
|
|
set dec [expr $hexnum] |
|
|
set setname "nf_$rawsetname" ;#Ensure nerdfont set names are prefixed. |
|
|
|
|
|
if {$setname ni $filesets_loading} { |
|
|
if {![tcl::dict::exists $charsets $setname]} { |
|
|
#set exists - but not in our filesets_loading list - therefore this set has been previously loaded, so clear old data first |
|
|
dict unset charset $setname |
|
|
} |
|
|
set newrange [list start $dec end $dec] |
|
|
tcl::dict::set current_set_range $setname $newrange |
|
|
tcl::dict::set charsets $setname [list ranges [list $newrange] description "nerd fonts $rawsetname" settype "nerdfonts privateuse"] |
|
|
|
|
|
lappend filesets_loading $setname |
|
|
} |
|
|
#expects ordered glyph list |
|
|
set existing_range [tcl::dict::get $current_set_range $setname] |
|
|
set existing_end [tcl::dict::get $existing_range end] |
|
|
if {$dec - $existing_end == 1} { |
|
|
#part of current range |
|
|
tcl::dict::set current_set_range $setname end $dec |
|
|
#overwrite last ranges element |
|
|
set rangelist [lrange [tcl::dict::get $charsets $setname ranges] 0 end-1] |
|
|
lappend rangelist [tcl::dict::get $current_set_range $setname] |
|
|
tcl::dict::set charsets $setname ranges $rangelist |
|
|
} else { |
|
|
#new range for set |
|
|
tcl::dict::set current_set_range $setname start $dec |
|
|
tcl::dict::set current_set_range $setname end $dec |
|
|
set rangelist [tcl::dict::get $charsets $setname ranges] |
|
|
lappend rangelist [tcl::dict::get $current_set_range $setname] |
|
|
tcl::dict::set charsets $setname ranges $rangelist |
|
|
} |
|
|
|
|
|
if {![tcl::dict::exists $charinfo $dec]} { |
|
|
# -- --- |
|
|
#review |
|
|
set map [list beaufort bf gibbous gb crescent cr thunderstorm tstorm thermometer thermom] |
|
|
lappend map {*}[list directory dir creativecommons ccom creative_commons ccom forwardslash fs] |
|
|
lappend map {*}[list multimedia mm multiple multi outline outl language lang] |
|
|
lappend map {*}[list odnoklassniki okru] |
|
|
# -- --- |
|
|
#consider other ways to unambiguously shorten names? |
|
|
#normalize nf_fa & nf_fa 'o' element to 'outl' so outlines can be searched across sets more easily (o not necessarily at last position) |
|
|
set normdesc [list] |
|
|
foreach el $desc { |
|
|
if {$el eq "o"} { |
|
|
set el "outl" |
|
|
} |
|
|
lappend normdesc $el |
|
|
} |
|
|
set joined_desc [join $normdesc _] |
|
|
#map after join so we can normalize some underscored elements e.g creativecommons & creative_commons |
|
|
set mapped_desc [tcl::string::map $map $joined_desc] |
|
|
set s nf_${rawsetname}_$mapped_desc |
|
|
|
|
|
if {![tcl::dict::exists $short_seen $s]} { |
|
|
tcl::dict::set short_seen $s {} |
|
|
} else { |
|
|
#duplicate in the data file (e.g 2023 weather night alt rain mix) |
|
|
set s ${s}_$hex |
|
|
} |
|
|
tcl::dict::set charinfo $dec [list desc "$desc" short $s] |
|
|
} |
|
|
} |
|
|
_build_charshort |
|
|
_build_charset_extents |
|
|
} else { |
|
|
puts stderr "unable to find glyph file. Tried $fname" |
|
|
} |
|
|
} |
|
|
|
|
|
proc package_base {} { |
|
|
#assume punk::char is in .tm form and we can use the package provide statement to determine base location |
|
|
#review |
|
|
set pkgver [package present punk::char] |
|
|
set pkginfo [package ifneeded punk::char $pkgver] |
|
|
set tmfile [lindex $pkginfo end] |
|
|
set pkg_base [file dirname $tmfile] |
|
|
return $pkg_base |
|
|
} |
|
|
tcl::namespace::eval internal { |
|
|
proc unicode_folder {} { |
|
|
set parent [file join [punk::char::package_base] char] |
|
|
set candidates [glob -nocomplain -type d -dir $parent -tail unicode*] |
|
|
set candidates [lsort -increasing $candidates] ;#review - dictionary sort - how are unicode versions ranked/compared?? |
|
|
if {![llength $candidates]} { |
|
|
error "Failed to find unicode data folder in folder '$parent'" |
|
|
} |
|
|
set folder [file join $parent [lindex $candidates end]] |
|
|
return $folder |
|
|
} |
|
|
proc dict_getdef {dictValue args} { |
|
|
if {[llength $args] < 2} { |
|
|
error {wrong # args: should be "dict_getdef dictValue ?key ...? key default"} |
|
|
} |
|
|
set keys [lrange $args 0 end-1] |
|
|
if {[tcl::dict::exists $dictValue {*}$keys]} { |
|
|
return [tcl::dict::get $dictValue {*}$keys] |
|
|
} else { |
|
|
return [lindex $args end] |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
#charsets structure |
|
|
#tcl::dict::set charsets "halfwidth_and_fullwidth_forms" [list ranges [list {start 65280 end 65519}] description "Halfwidth and Fullwidth Forms (variants) settype block"] |
|
|
|
|
|
#unicode Blocks.txt |
|
|
#load the defined blocks into 'charsets' and mark as type 'block'. Unicode blocks have only one range - and don't overlap. |
|
|
#We don't treat unassigned/reserved codes within a block specially at this stage - ie we will not chop a block into subranges on that basis. |
|
|
#unassigned code points should get certain default properties (e.g bidirectionality ) according to their block - so it makes sense to treat them as belonging to the block. |
|
|
#They also get the general property of Cn (Other,not assigned or Other,reserved) and a "Basic Type" of Noncharacter or Reserved |
|
|
proc load_unicode_blocks {} { |
|
|
#sample data line |
|
|
#0000..007F; Basic Latin |
|
|
variable charsets |
|
|
set file [file join [internal::unicode_folder] Blocks.txt] |
|
|
if {![file exists $file]} { |
|
|
error "Unicode Blocks.txt file not found at path '$file'" |
|
|
} |
|
|
puts "ok.. loading" |
|
|
set fd [open $file r] |
|
|
chan configure $fd -translation binary |
|
|
set data [read $fd] |
|
|
close $fd |
|
|
set block_count 0 |
|
|
foreach ln [split $data \n] { |
|
|
set ln [tcl::string::trim $ln] |
|
|
if {[tcl::string::match #* $ln]} { |
|
|
continue |
|
|
} |
|
|
if {[set pcolon [tcl::string::first ";" $ln]] > 0} { |
|
|
set lhs [tcl::string::trim [tcl::string::range $ln 0 $pcolon-1]] |
|
|
set name [tcl::string::trim [tcl::string::range $ln $pcolon+1 end]] |
|
|
set lhsparts [split $lhs .] |
|
|
set start [lindex $lhsparts 0] |
|
|
set end [lindex $lhsparts end] |
|
|
#puts "$start -> $end '$name'" |
|
|
set decimal_start [expr {"0x$start"}] |
|
|
set decimal_end [expr {"0x$end"}] |
|
|
tcl::dict::set charsets $name [list ranges [list [list start $decimal_start end $decimal_end note "unicode block $lhs"]] description "" settype block] |
|
|
incr block_count |
|
|
} |
|
|
} |
|
|
_build_charset_extents |
|
|
return $block_count |
|
|
} |
|
|
|
|
|
#unicode scripts |
|
|
|
|
|
#unicode UnicodeData.txt |
|
|
|
|
|
|
|
|
|
|
|
#https://www.unicode.org/reports/tr44/#Property_Values |
|
|
|
|
|
|
|
|
#unicode EastAsianWidth.txt |
|
|
#classify width of character - which is contextual in some cases |
|
|
##### |
|
|
#Review - this is initial naive assumption that should get us mostly what we want for layout purposes in a utf-8-centric world. |
|
|
#We will just load the values and treat H,N,Na as 1-wide and A,F,W as 2-wide for functions such as char::string_width on the basis that those using legacy sets can query the property and make their own determinations in those contexts. |
|
|
#### |
|
|
# -- --- |
|
|
#A = Ambiguous - All characters that can be sometimes wide and sometimes narrow. (wide in east asian legacy sets, narrow in non-east asian usage) (private use chars considered ambiguous) |
|
|
#F = East Asian Full-width |
|
|
#H = East Asian Half-width |
|
|
#N = Not east Asian (Neutral) - all other characters. (do not occur in legacy East Asian character sets) - treated like Na |
|
|
#Na = East Asian Narrow - all other characters that are always narrow and have explicit full-width counterparts (e.g includes all of ASCII) |
|
|
#W = East Asian Wide - all other characters that are always wide (Occur only in the context of Eas Asian Typography) |
|
|
# -- --- |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
proc charshort {shortname} { |
|
|
variable charshort |
|
|
return [tcl::dict::get $charshort $shortname] |
|
|
} |
|
|
|
|
|
proc box_drawing {args} { |
|
|
return [charset "Box Drawing" {*}$args] |
|
|
} |
|
|
proc box_drawing_dict {} { |
|
|
return [charset_dict "Box Drawing"] |
|
|
} |
|
|
|
|
|
proc char_hex {char} { |
|
|
return [format %08x [scan $char %c]] |
|
|
} |
|
|
|
|
|
proc char_info_hex {hex args} { |
|
|
set hex [tcl::string::map [list _ ""] $hex] |
|
|
if {[tcl::string::is xdigit -strict $hex]} { |
|
|
#has no leading 0x |
|
|
set dec [expr {"0x$hex"}] |
|
|
} else { |
|
|
set dec [expr {$hex}] |
|
|
} |
|
|
return [char_info_dec $dec {*}$args] |
|
|
} |
|
|
proc char_info {char args} { |
|
|
#Note - on some versions of Tcl -e.g 8.6 use could supply something like \U1f600 (smiley icon) but we receive fffd (replacement special) |
|
|
#there is no way to detect what the user intended ie we can't distinguish if they actually typed \UFFFD |
|
|
#we can test if such mapping happens in general - and warn if codepoint is FFFD in the result dict |
|
|
set returninfo [tcl::dict::create] |
|
|
if {[tcl::string::equal \UFFFD $char] && [tcl::string::equal \U1F600 \UFFFD]} { |
|
|
tcl::dict::set returninfo WARNING "this tcl maps multiple to FFFD" |
|
|
} |
|
|
lassign [scan $char %c%s] dec_char remainder |
|
|
if {[tcl::string::length $remainder]} { |
|
|
error "char_info requires a single character" |
|
|
} |
|
|
set result [tcl::dict::merge $returninfo [char_info_dec $dec_char {*}$args]] |
|
|
} |
|
|
proc char_info_dec {dec args} { |
|
|
set dec_char [expr {$dec}] |
|
|
set opts [tcl::dict::create\ |
|
|
-fields {default}\ |
|
|
-except {}\ |
|
|
] |
|
|
#testwidth is so named because it peforms an actual test on the console using ansi escapes - and the name gives a hint that it is a little slow |
|
|
set known_fields [list all default dec hex desc short testwidth char memberof] ;#maint fields from charinfo 'desc' 'short' |
|
|
#todo - unicode properties |
|
|
# tclwhitespace (different to unicode concept of whitespace. review ) |
|
|
|
|
|
foreach {k v} $args { |
|
|
switch -- $k { |
|
|
-fields - -except { |
|
|
tcl::dict::set opts $k $v |
|
|
} |
|
|
default { |
|
|
error "char_info unrecognised option '$k'. Known options:'[tcl::dict::keys $opts]' known_fields: $known_fields usage: char_info <char> ?-fields {<fieldnames>}? ?-except {<fieldnames>}?" |
|
|
} |
|
|
} |
|
|
} |
|
|
# -- --- --- --- --- --- --- --- --- --- --- --- |
|
|
set opt_fields [tcl::dict::get $opts -fields] |
|
|
set opt_except [tcl::dict::get $opts -except] |
|
|
# -- --- --- --- --- --- --- --- --- --- --- --- |
|
|
set initial_fields [list] |
|
|
if {"default" in $opt_fields} { |
|
|
set initial_fields $known_fields |
|
|
if {"testwidth" ni $opt_fields} { |
|
|
if {"testwidth" ni $opt_except} { |
|
|
lappend opt_except testwidth |
|
|
} |
|
|
} |
|
|
if {"char" ni $opt_fields} { |
|
|
if {"char" ni $opt_except} { |
|
|
lappend opt_except char |
|
|
} |
|
|
} |
|
|
} elseif {"all" in $opt_fields} { |
|
|
set initial_fields $known_fields |
|
|
} else { |
|
|
foreach f $opt_fields { |
|
|
if {$f in $known_fields} { |
|
|
lappend initial_fields $f |
|
|
} else { |
|
|
error "char_info unknown field name: '$f' known fields: '$known_fields'" |
|
|
} |
|
|
} |
|
|
} |
|
|
foreach e $opt_except { |
|
|
if {$e ni $known_fields} { |
|
|
error "char_info unknown field name $e in -except. known fields: '$known_fields'" |
|
|
} |
|
|
} |
|
|
set fields [list] |
|
|
foreach f $initial_fields { |
|
|
if {$f ne "all" && $f ni $opt_except} { |
|
|
lappend fields $f |
|
|
} |
|
|
} |
|
|
if {![llength $fields]} { |
|
|
return |
|
|
} |
|
|
|
|
|
variable charinfo |
|
|
variable charsets |
|
|
set hex_char [format %04x $dec_char] |
|
|
set returninfo [tcl::dict::create] |
|
|
foreach f $fields { |
|
|
switch -- $f { |
|
|
dec { |
|
|
tcl::dict::set returninfo dec $dec_char |
|
|
} |
|
|
hex { |
|
|
tcl::dict::set returninfo hex $hex_char |
|
|
} |
|
|
desc { |
|
|
if {[tcl::dict::exists $charinfo $dec_char desc]} { |
|
|
tcl::dict::set returninfo desc [tcl::dict::get $charinfo $dec_char desc] |
|
|
} else { |
|
|
tcl::dict::set returninfo desc "" |
|
|
} |
|
|
} |
|
|
short { |
|
|
if {[tcl::dict::exists $charinfo $dec_char short]} { |
|
|
tcl::dict::set returninfo desc [tcl::dict::get $charinfo $dec_char short] |
|
|
} else { |
|
|
tcl::dict::set returninfo short "" |
|
|
} |
|
|
} |
|
|
testwidth { |
|
|
#todo - expectedwidth - lookup the printing width it is *supposed* to have from unicode tables |
|
|
#testwidth is one of the main ones likely to be worthwhile excluding as querying the console via ansi takes time |
|
|
set existing_testwidth "" |
|
|
if {[tcl::dict::exists $charinfo $dec_char testwidth]} { |
|
|
set existing_testwidth [tcl::dict::get $charinfo $dec_char testwidth] |
|
|
} |
|
|
if {$existing_testwidth eq ""} { |
|
|
#no cached data - do expensive cursor-position test (Note this might not be 100% reliable - terminals lie e.g if ansi escape sequence has set to wide printing.) |
|
|
set char [format %c $dec_char] |
|
|
set chwidth [char_info_testwidth $char] |
|
|
|
|
|
tcl::dict::set returninfo testwidth $chwidth |
|
|
#cache it. todo - -verify flag to force recalc in case font/terminal changed in some way? |
|
|
tcl::dict::set charinfo $dec_char testwidth $chwidth |
|
|
} else { |
|
|
tcl::dict::set returninfo testwidth $existing_testwidth |
|
|
} |
|
|
} |
|
|
char { |
|
|
set char [format %c $dec_char] |
|
|
tcl::dict::set returninfo char $char |
|
|
} |
|
|
memberof { |
|
|
#memberof takes in the order of a few hundred microseconds if a simple scan of all ranges is taken - possibly worthwhile caching/optimising |
|
|
#note that memberof is not just unicode blocks - but scripts and other non-contiguous sets consisting of multiple ranges - some of which may include ranges of a single character. (e.g WGL4) |
|
|
#This means there probably isn't a really efficient means of calculating membership other than scanning all the defined ranges. |
|
|
#We could potentially populate it using a side-thread - but it seems reasonable to just cache result after first use here. |
|
|
#some efficiency could also be gained by pre-calculating the extents for each charset which isn't a simple unicode block. (and perhaps sorting by max char) |
|
|
set memberof [list] |
|
|
tcl::dict::for {setname setinfo} $charsets { |
|
|
foreach r [tcl::dict::get $setinfo ranges] { |
|
|
set s [tcl::dict::get $r start] |
|
|
set e [tcl::dict::get $r end] |
|
|
if {$dec_char >= $s && $dec_char <= $e} { |
|
|
lappend memberof $setname |
|
|
break |
|
|
} |
|
|
} |
|
|
} |
|
|
tcl::dict::set returninfo memberof $memberof |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
return $returninfo |
|
|
} |
|
|
|
|
|
proc _char_info_dec_memberof_scan {dec} { |
|
|
variable charsets |
|
|
set memberof [list] |
|
|
tcl::dict::for {setname setinfo} $charsets { |
|
|
foreach r [tcl::dict::get $setinfo ranges] { |
|
|
set s [tcl::dict::get $r start] |
|
|
set e [tcl::dict::get $r end] |
|
|
if {$dec >= $s && $dec <= $e} { |
|
|
lappend memberof $setname |
|
|
break |
|
|
} |
|
|
} |
|
|
} |
|
|
return $memberof |
|
|
} |
|
|
proc range_split_info {dec} { |
|
|
variable charset_extents_startpoints |
|
|
variable charset_extents_endpoints |
|
|
set skeys [tcl::dict::keys $charset_extents_startpoints] |
|
|
set ekeys [tcl::dict::keys $charset_extents_endpoints] |
|
|
set splen [tcl::dict::size $charset_extents_startpoints] |
|
|
set eplen [tcl::dict::size $charset_extents_endpoints] |
|
|
set s [lsearch -bisect -integer $skeys $dec] |
|
|
set s_at_or_below [lrange $skeys 0 $s] |
|
|
set e_of_s [list] |
|
|
foreach sk $s_at_or_below { |
|
|
lappend e_of_s {*}[tcl::dict::get $charset_extents_startpoints $sk] |
|
|
} |
|
|
set e_of_s [lsort -integer $e_of_s] |
|
|
set splitposn [lsearch -bisect -integer $e_of_s $dec] |
|
|
if {[lindex $e_of_s $splitposn] < $dec} {incr splitposn} |
|
|
#set lhs_endpoints_to_check [expr {[llength $e_of_s] - $splitposn}] |
|
|
set reduced_endpoints [lrange $e_of_s $splitposn end] |
|
|
set sps [list] |
|
|
foreach ep $reduced_endpoints { |
|
|
lappend sps {*}[tcl::dict::get $charset_extents_endpoints $ep] |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
set e [lsearch -bisect -integer $ekeys $dec] |
|
|
if {$e >= 0} { |
|
|
set e_at_or_above [lrange $ekeys $e end] |
|
|
set s_of_e [list] |
|
|
foreach ek $e_at_or_above { |
|
|
lappend s_of_e {*}[tcl::dict::get $charset_extents_endpoints $ek] |
|
|
} |
|
|
set startpoints_of_above [llength $s_of_e] |
|
|
set splitposn [lsearch -bisect -integer $s_of_e $dec] |
|
|
set reduced_startpoints [lrange $s_of_e 0 $splitposn] |
|
|
set eps [list] |
|
|
foreach sp $reduced_startpoints { |
|
|
lappend eps {*}[tcl::dict::get $charset_extents_startpoints $sp] |
|
|
} |
|
|
} else { |
|
|
set s_of_e [list] |
|
|
set reduced_startpoints [list] |
|
|
set eps [list] |
|
|
|
|
|
} |
|
|
|
|
|
return [tcl::dict::create startpoints $splen endpoints $eplen midpoint [expr {floor($eplen/2)}] posn $e lhs_endpoints_to_check "[llength $reduced_endpoints]/[llength $e_of_s]=[llength $sps]ranges" rhs_startpoints_to_check "[llength $reduced_startpoints]/[llength $s_of_e]=[llength $eps]"] |
|
|
} |
|
|
#for just a few extra sets such as wgl4 and unicode blocks loaded - this gives e.g 5x + better performance than the simple search above, and up to twice as slow for tcl 8.6 |
|
|
#performance biased towards lower numbered characters (which is not too bad in the context of unicode) |
|
|
#todo - could be tuned to perform better at end by assuming a fairly even distribution of ranges - and so searching startpoint ranges first for items left of middle, endpoint ranges first for items right of middle |
|
|
#review with scripts loaded and more defined ranges.. |
|
|
#This algorithm supports arbitrary overlapping ranges and ranges with same start & endpoints |
|
|
#Should be much better than O(n) for n sets except for pathological case of member of all or nearly-all intervals ? |
|
|
#review - compare with 'interval tree' algorithms. |
|
|
proc char_info_dec_memberof {dec} { |
|
|
variable charset_extents_startpoints |
|
|
variable charset_extents_endpoints |
|
|
variable charset_extents_rangenames |
|
|
if {[package vcompare [info tclversion] 8.7a5] >= 0} { |
|
|
#algorithm should theoretically be a little better with -stride |
|
|
set last_smaller_or_equal_startposn [lsearch -stride 2 -bisect -integer $charset_extents_startpoints $dec] |
|
|
set sets_starting_below [lrange $charset_extents_startpoints 0 $last_smaller_or_equal_startposn+1] ;#+1 to include 2nd element of stridden pair |
|
|
set endpoints_of_starting_below [lsort -integer [concat {*}[tcl::dict::values $sets_starting_below]]] |
|
|
} else { |
|
|
#no -stride available |
|
|
set startkeys [tcl::dict::keys $charset_extents_startpoints] |
|
|
set last_smaller_or_equal_startkeyposn [lsearch -bisect -integer $startkeys $dec] ;#assert will always return one of the keys if number >=0 supplied (last key if > all) |
|
|
#set startkey_found [lindex $startkeys $last_smaller_or_equal_startkeyposn] |
|
|
set start_below_keys [lrange $startkeys 0 $last_smaller_or_equal_startkeyposn] ;#These are the keys of sets which start at or below dec |
|
|
#puts "start_below_keys: '$start_below_keys'" |
|
|
set endpoints_of_starting_below [list] |
|
|
foreach belowkey $start_below_keys { |
|
|
lappend endpoints_of_starting_below {*}[tcl::dict::get $charset_extents_startpoints $belowkey] |
|
|
} |
|
|
set endpoints_of_starting_below [lsort -integer $endpoints_of_starting_below[unset endpoints_of_starting_below]] |
|
|
} |
|
|
|
|
|
set splitposn [lsearch -bisect -integer $endpoints_of_starting_below $dec] ;#splitposn = last smaller or equal endposn |
|
|
if {[lindex $endpoints_of_starting_below $splitposn] < $dec} { incr splitposn} |
|
|
set reduced_opposite_limit [lrange $endpoints_of_starting_below $splitposn end] |
|
|
################ |
|
|
#note each endpoint points to multiple startpoints which may still include some that are not in range. (e.g range y can share endpoint with x that starts in-range - but y starts above character ) |
|
|
# x1 x2 |
|
|
# y1 y2 |
|
|
# c |
|
|
################ |
|
|
#we have reduced our set of endpoints sufficiently (to those at or above dec) to run through and test each startpoint |
|
|
set ranges [list] |
|
|
foreach ep $reduced_opposite_limit { |
|
|
foreach s [tcl::dict::get $charset_extents_endpoints $ep] { |
|
|
if {$s <= $dec} { |
|
|
lappend ranges [tcl::dict::get $charset_extents_rangenames $s,$ep] |
|
|
} |
|
|
} |
|
|
} |
|
|
return $ranges |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
#with glob searching of description and short |
|
|
proc char_range_dict {start end args} { |
|
|
if {![tcl::string::is integer -strict $start] || ![tcl::string::is integer -strict $end]} { |
|
|
error "char_range_dict error start and end must be integers" |
|
|
} |
|
|
set and_globs [list] |
|
|
if {![llength $args]} { |
|
|
set args [list *] |
|
|
} |
|
|
foreach glob $args { |
|
|
if {![regexp {[*?]} $glob]} { |
|
|
lappend and_globs "*$glob*" |
|
|
} else { |
|
|
lappend and_globs $glob |
|
|
} |
|
|
} |
|
|
variable charinfo |
|
|
set cdict [tcl::dict::create] |
|
|
set start [expr {$start}] ;#force string rep to decimal - otherwise first use of i as string could be hex or other rep whilst other i values will be decimal string rep due to incr |
|
|
for {set i $start} {$i <= $end} {incr i} { |
|
|
set hx [format %04x $i] |
|
|
set ch [format %c $i] |
|
|
if {[tcl::dict::exists $charinfo $i desc]} { |
|
|
set d [tcl::dict::get $charinfo $i desc] |
|
|
} else { |
|
|
set d "" |
|
|
} |
|
|
if {[tcl::dict::exists $charinfo $i short]} { |
|
|
set s [tcl::dict::get $charinfo $i short] |
|
|
} else { |
|
|
set s "" |
|
|
} |
|
|
set matchcount 0 |
|
|
foreach glob $and_globs { |
|
|
if {[tcl::string::match -nocase $glob $s] || [tcl::string::match -nocase $glob $d]} { |
|
|
incr matchcount |
|
|
} |
|
|
} |
|
|
if {$matchcount == [llength $and_globs]} { |
|
|
if {[tcl::dict::exists $charinfo $i]} { |
|
|
tcl::dict::set cdict $hx [tcl::dict::merge [tcl::dict::create dec $i hex $hx char $ch] [tcl::dict::get $charinfo $i]] |
|
|
} else { |
|
|
tcl::dict::set cdict $hx [list dec $i hex $hx char $ch desc $d short $s] |
|
|
} |
|
|
} |
|
|
} |
|
|
return $cdict |
|
|
} |
|
|
#with glob searches of desc and short |
|
|
proc char_range {start end args} { |
|
|
package require overtype |
|
|
if {![tcl::string::is integer -strict $start] || ![tcl::string::is integer -strict $end]} { |
|
|
error "char_range error start and end must be integers" |
|
|
} |
|
|
set charset_dict [char_range_dict $start $end {*}$args] |
|
|
set out "" |
|
|
set col3 [tcl::string::repeat " " 12] |
|
|
tcl::dict::for {k inf} $charset_dict { |
|
|
set s [internal::dict_getdef $inf short ""] |
|
|
set d [internal::dict_getdef $inf desc ""] |
|
|
set s_col [overtype::left $col3 $s] |
|
|
append out "$k [tcl::dict::get $inf dec] [tcl::dict::get $inf char] $s_col $d" \n |
|
|
} |
|
|
return $out |
|
|
} |
|
|
|
|
|
|
|
|
#non-overlapping unicode blocks |
|
|
proc char_blocks {{name_or_glob *}} { |
|
|
variable charsets |
|
|
#todo - more efficient datastructures? |
|
|
if {![regexp {[?*]} $name_or_glob]} { |
|
|
#no glob - just retrieve it |
|
|
if {[tcl::dict::exists $charsets $name_or_glob]} { |
|
|
if {[tcl::dict::get $charsets $name_or_glob settype] eq "block"} { |
|
|
return [tcl::dict::create $name_or_glob [tcl::dict::get $charsets $name_or_glob]] |
|
|
} |
|
|
} |
|
|
#no exact match - try case insensitive.. |
|
|
set name [lsearch -inline -nocase [tcl::dict::keys $charsets] $name_or_glob] |
|
|
if {$name ne ""} { |
|
|
if {[tcl::dict::get $charsets $name settype] eq "block"} { |
|
|
return [tcl::dict::create $name [tcl::dict::get $charsets $name]] |
|
|
} |
|
|
} |
|
|
|
|
|
} else { |
|
|
#build a subset |
|
|
set charsets_block [tcl::dict::create] |
|
|
tcl::dict::for {k v} $charsets { |
|
|
if {[tcl::string::match -nocase $name_or_glob $k]} { |
|
|
if {[tcl::dict::get $v settype] eq "block"} { |
|
|
tcl::dict::set charsets_block $k $v |
|
|
} |
|
|
} |
|
|
} |
|
|
return $charsets_block |
|
|
} |
|
|
} |
|
|
proc charset_names {{name_or_glob *}} { |
|
|
variable charsets |
|
|
if {![regexp {[?*]} $name_or_glob]} { |
|
|
#no glob - just retrieve it |
|
|
if {[tcl::dict::exists $charsets $name_or_glob]} { |
|
|
return [list $name_or_glob] |
|
|
} |
|
|
#no exact match - try case insensitive.. |
|
|
set name [lsearch -inline -nocase [tcl::dict::keys $charsets] $name_or_glob] |
|
|
if {$name ne ""} { |
|
|
return [list $name] |
|
|
} |
|
|
} else { |
|
|
if {$name_or_glob eq "*"} { |
|
|
return [lsort [tcl::dict::keys $charsets]] |
|
|
} |
|
|
#tcl::dict::keys $dict <pattern> doesn't have option for case insensitive searches |
|
|
return [lsort [lsearch -all -inline -nocase [tcl::dict::keys $charsets] $name_or_glob]] |
|
|
} |
|
|
} |
|
|
|
|
|
#deprecated |
|
|
#major named sets such as unicode blocks, scripts, and other sets such as microsoft WGL4 |
|
|
#case insensitive search - possibly with *basic* globs (? and * only - not lb rb) |
|
|
proc charset_names2 {{namesearch *}} { |
|
|
variable charsets |
|
|
#dictionary sorting of the keys is slow! - we should obviously store it in sorted order instead of sorting entire list on retrieval - or just sort results |
|
|
#set sortedkeys [lsort -increasing -dictionary [tcl::dict::keys $charsets]] ;#NOTE must use -dictionary to use -sorted flag below |
|
|
set sortedkeys [lsort -increasing [tcl::dict::keys $charsets]] |
|
|
if {$namesearch eq "*"} { |
|
|
return $sortedkeys |
|
|
} |
|
|
if {[regexp {[?*]} $namesearch]} { |
|
|
#name glob search |
|
|
return [lsearch -all -inline -nocase $sortedkeys $namesearch] ;#no point using -sorted flag when -all is used |
|
|
} else { |
|
|
#return [lsearch -sorted -inline -nocase $sortedkeys $namesearch] ;#no globs - bails out earlier if -sorted? |
|
|
return [lsearch -inline -nocase $sortedkeys $namesearch] ;#no globs |
|
|
} |
|
|
} |
|
|
proc charsets {{namesearch *}} { |
|
|
package require textblock |
|
|
variable charsets |
|
|
set charset_names [charset_names $namesearch] |
|
|
set settype_list [list] |
|
|
foreach setname $charset_names { |
|
|
lappend settype_list [tcl::dict::get $charsets $setname settype] |
|
|
} |
|
|
|
|
|
set charset_names [linsert $charset_names 0 "Set Name"] |
|
|
set settype_list [linsert $settype_list 0 "Set Type"] |
|
|
|
|
|
return [textblock::join -- [list_as_lines -- $charset_names] " " [list_as_lines $settype_list]] |
|
|
} |
|
|
proc charset_defget {exactname} { |
|
|
variable charsets |
|
|
return [tcl::dict::get $charsets $exactname] |
|
|
} |
|
|
proc charset_defs {charsetname} { |
|
|
variable charsets |
|
|
set matches [charset_names $charsetname] |
|
|
set def_list [list] |
|
|
foreach setname $matches { |
|
|
lappend def_list [tcl::dict::create $setname [tcl::dict::get $charsets $setname]] |
|
|
} |
|
|
return [join $def_list \n] |
|
|
} |
|
|
proc charset_dictget {exactname} { |
|
|
variable charsets |
|
|
set setinfo [tcl::dict::get $charsets $exactname] |
|
|
set ranges [tcl::dict::get $setinfo ranges] |
|
|
set charset_dict [tcl::dict::create] |
|
|
foreach r $ranges { |
|
|
set start [tcl::dict::get $r start] |
|
|
set end [tcl::dict::get $r end] |
|
|
set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end]] |
|
|
} |
|
|
return $charset_dict |
|
|
} |
|
|
proc charset_dicts {searchname} { |
|
|
variable charsets |
|
|
set matches [charset_names $searchname] |
|
|
if {![llength $matches]} { |
|
|
error "No charset found matching name '$searchname' - use 'charset_names' to get list" |
|
|
} |
|
|
set dict_list [list] |
|
|
foreach m $matches { |
|
|
lappend dict_list [tcl::dict::create $m [charset_dictget $m]] |
|
|
} |
|
|
#return $dict_list |
|
|
return [join $dict_list \n] |
|
|
} |
|
|
proc charset_page {namesearch args} { |
|
|
_charset_page_search $namesearch $args ;#pass args to descsearch argument |
|
|
} |
|
|
proc _charset_page_search {namesearch search_this_and_that args} { |
|
|
variable charsets |
|
|
variable charinfo |
|
|
set matched_names [charset_names $namesearch] |
|
|
if {![llength $matched_names]} { |
|
|
error "charset_page no charset matched pattern '$namesearch' - use 'charset_names' to get list" |
|
|
} |
|
|
set defaults [tcl::dict::create\ |
|
|
-ansi 0\ |
|
|
-lined 1\ |
|
|
] |
|
|
set opts [tcl::dict::merge $defaults $args] |
|
|
# -- --- --- --- |
|
|
set opt_ansi [tcl::dict::get $opts -ansi] |
|
|
set opt_lined [tcl::dict::get $opts -lined] |
|
|
# -- --- --- --- |
|
|
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} |
|
|
|
|
|
if {$opt_ansi} { |
|
|
set a1 [a BLACK white bold] |
|
|
set a2 [a] |
|
|
} else { |
|
|
set a1 "" |
|
|
set a2 "" |
|
|
} |
|
|
set cols 16 |
|
|
set prefix " " |
|
|
append out $prefix |
|
|
foreach charsetname $matched_names { |
|
|
if {[llength $search_this_and_that]} { |
|
|
set setinfo [tcl::dict::get $charsets $charsetname] |
|
|
set ranges [tcl::dict::get $setinfo ranges] |
|
|
set charset_dict [tcl::dict::create] |
|
|
foreach r $ranges { |
|
|
set start [tcl::dict::get $r start] |
|
|
set end [tcl::dict::get $r end] |
|
|
set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]] |
|
|
} |
|
|
} else { |
|
|
set charset_dict [charset_dictget $charsetname] |
|
|
} |
|
|
if {![tcl::dict::size $charset_dict]} { |
|
|
continue |
|
|
} |
|
|
set i 1 |
|
|
append out \n $prefix $charsetname |
|
|
append out \n |
|
|
|
|
|
set marker_line $prefix |
|
|
set line $prefix |
|
|
tcl::dict::for {hex inf} $charset_dict { |
|
|
set ch [tcl::dict::get $inf char] |
|
|
set twidth "" |
|
|
set dec [expr {"0x$hex"}] |
|
|
if {[tcl::dict::exists $charinfo $dec testwidth]} { |
|
|
set twidth [tcl::dict::get $charinfo $dec testwidth] |
|
|
} |
|
|
if {$twidth eq ""} { |
|
|
#set width [ansifreestring_width $ch] ;#based on unicode props |
|
|
set width [grapheme_width_cached $ch] |
|
|
} else { |
|
|
set width $twidth |
|
|
} |
|
|
if {$width == 0} { |
|
|
set marker " " |
|
|
if {[regexp $re_diacritics $ch]} { |
|
|
#attempt to combine with space to get 3-wide displayv with diacritic showing at left space |
|
|
#todo - dualchar diacritics? |
|
|
set displayv " $ch " |
|
|
} else { |
|
|
set displayv " " |
|
|
} |
|
|
} elseif {$width == 1} { |
|
|
set marker "_ " |
|
|
set displayv "${a1}$ch${a2} " |
|
|
} else { |
|
|
#presumed 2 |
|
|
set marker "__ " |
|
|
set displayv "${a1}$ch${a2} " |
|
|
} |
|
|
set hexlen [tcl::string::length $hex] |
|
|
append marker_line "[tcl::string::repeat " " $hexlen] $marker" |
|
|
append line "$hex $displayv" |
|
|
if {$i == [tcl::dict::size $charset_dict] || $i % $cols == 0} { |
|
|
if {$opt_lined} { |
|
|
append out $marker_line \n |
|
|
} |
|
|
append out $line \n |
|
|
set marker_line $prefix |
|
|
set line $prefix |
|
|
#set out [tcl::string::range $out 0 end-2] |
|
|
#append out \n " " |
|
|
} |
|
|
incr i |
|
|
} |
|
|
} |
|
|
set out [tcl::string::trimright $out " "] |
|
|
return $out |
|
|
} |
|
|
|
|
|
#allows search on both name and an anded list of globs to be applied to description & short |
|
|
proc charset {namesearch args} { |
|
|
package require overtype |
|
|
variable charsets |
|
|
set matched_names [charset_names $namesearch] |
|
|
if {![llength $matched_names]} { |
|
|
error "No charset matched pattern '$namesearch' - use 'charset_names' to get list" |
|
|
} |
|
|
set search_this_and_that $args |
|
|
|
|
|
set out "" |
|
|
|
|
|
foreach charsetname $matched_names { |
|
|
if {[llength $search_this_and_that]} { |
|
|
set setinfo [tcl::dict::get $charsets $charsetname] |
|
|
set ranges [tcl::dict::get $setinfo ranges] |
|
|
set charset_dict [tcl::dict::create] |
|
|
foreach r $ranges { |
|
|
set start [tcl::dict::get $r start] |
|
|
set end [tcl::dict::get $r end] |
|
|
set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]] |
|
|
} |
|
|
} else { |
|
|
set charset_dict [charset_dictget $charsetname] |
|
|
} |
|
|
|
|
|
set col_items_short [list] |
|
|
set col_items_desc [list] |
|
|
tcl::dict::for {k inf} $charset_dict { |
|
|
lappend col_items_desc [internal::dict_getdef $inf desc ""] |
|
|
lappend col_items_short [internal::dict_getdef $inf short ""] |
|
|
} |
|
|
if {[llength $col_items_desc]} { |
|
|
set widest3 [tcl::mathfunc::max {*}[lmap v $col_items_short {tcl::string::length $v}]] |
|
|
if {$widest3 == 0} { |
|
|
set col3 " " |
|
|
} else { |
|
|
set col3 [tcl::string::repeat " " $widest3] |
|
|
} |
|
|
tcl::dict::for {k inf} $charset_dict { |
|
|
set s [internal::dict_getdef $inf short ""] |
|
|
set d [internal::dict_getdef $inf desc ""] |
|
|
set s_col [overtype::left $col3 $s] |
|
|
append out "$k [tcl::dict::get $inf char] $s_col $d" \n |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
return $out |
|
|
} |
|
|
|
|
|
#use console cursor movements to test and cache the column-width of each char in the set of characters returned by the search criteria |
|
|
proc charset_calibrate {namesearch args} { |
|
|
variable charsets |
|
|
variable charinfo |
|
|
set matched_names [charset_names $namesearch] |
|
|
if {![llength $matched_names]} { |
|
|
error "No charset matched pattern '$namesearch' - use 'charset_names' to get list" |
|
|
} |
|
|
set search_this_and_that $args |
|
|
set charcount 0 |
|
|
set width_results [tcl::dict::create] |
|
|
puts stdout "calibrating using terminal cursor movements.." |
|
|
foreach charsetname $matched_names { |
|
|
if {[llength $search_this_and_that]} { |
|
|
set setinfo [tcl::dict::get $charsets $charsetname] |
|
|
set ranges [tcl::dict::get $setinfo ranges] |
|
|
set charset_dict [tcl::dict::create] |
|
|
foreach r $ranges { |
|
|
set start [tcl::dict::get $r start] |
|
|
set end [tcl::dict::get $r end] |
|
|
set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]] |
|
|
} |
|
|
} else { |
|
|
set charset_dict [charset_dictget $charsetname] |
|
|
} |
|
|
if {![tcl::dict::size $charset_dict]} { |
|
|
continue |
|
|
} |
|
|
tcl::dict::for {hex inf} $charset_dict { |
|
|
set ch [format %c 0x$hex] |
|
|
set twidth "" |
|
|
set dec [expr {"0x$hex"}] |
|
|
if {[tcl::dict::exists $charinfo $dec testwidth]} { |
|
|
set twidth [tcl::dict::get $charinfo $dec testwidth] |
|
|
} |
|
|
if {$twidth eq ""} { |
|
|
#puts -nonewline stdout "." ;#this |
|
|
set width [char_info_testwidth $ch] ;#based on console test rather than unicode props |
|
|
tcl::dict::set charinfo $dec testwidth $width |
|
|
} else { |
|
|
set width $twidth |
|
|
} |
|
|
tcl::dict::incr width_results $width |
|
|
incr charcount |
|
|
} |
|
|
} |
|
|
puts stdout "\ncalibration done - results cached in charinfo dictionary" |
|
|
return [tcl::dict::create charcount $charcount widths $width_results] |
|
|
} |
|
|
|
|
|
#maint warning - also in overtype! |
|
|
#intended for single grapheme - but will work for multiple |
|
|
#cannot contain ansi or newlines |
|
|
#(a cache of ansifreestring_width calls - as these are quite regex heavy) |
|
|
#review - effective memory leak on longrunning programs if never cleared |
|
|
#tradeoff in fragmenting cache and reducing efficiency vs ability to clear in a scoped manner |
|
|
proc grapheme_width_cached {ch {key ""}} { |
|
|
variable grapheme_widths |
|
|
#if key eq "*" - we won't be able to clear that cache individually. Perhaps that's ok |
|
|
if {[tcl::dict::exists $grapheme_widths $key $ch]} { |
|
|
return [tcl::dict::get $grapheme_widths $key $ch] |
|
|
} |
|
|
set width [punk::char::ansifreestring_width $ch] ;#review - can we provide faster version if we know it's a single grapheme rather than a string? (grapheme is still a string as it may have combiners/diacritics) |
|
|
tcl::dict::set grapheme_widths $key $ch $width |
|
|
return $width |
|
|
} |
|
|
proc grapheme_width_cache_clear {key} { |
|
|
variable grapheme_widths |
|
|
if {$key eq "*} { |
|
|
set grapheme_widths [tcl::dict::create] |
|
|
} else { |
|
|
tcl::dict::unset grapheme_widths $key |
|
|
} |
|
|
return |
|
|
} |
|
|
#no char_width - use grapheme_width terminology to be clearer |
|
|
proc grapheme_width {char} { |
|
|
error "grapheme_width unimplemented - use ansifreestring_width" |
|
|
} |
|
|
|
|
|
#return N Na W etc from unicode data |
|
|
#review |
|
|
proc char_uc_width_prop {char} { |
|
|
error "char_uc_width unimplemented try textutil::wcswidth_type" |
|
|
} |
|
|
#todo - provide a grapheme_width function that is optimised for speed |
|
|
proc string_width {text} { |
|
|
#burn approx 2uS (2024) checking for ansi codes - not just SGR |
|
|
if {[punk::ansi::ta::detect $text]} { |
|
|
puts stderr "string_width detected ANSI!" |
|
|
} |
|
|
if {[tcl::string::last \n $text] >= 0} { |
|
|
error "string_width accepts only a single line" |
|
|
} |
|
|
#tailcall ansifreestring_width $text |
|
|
ansifreestring_width $text |
|
|
} |
|
|
|
|
|
#todo - consider disallowing/erroring out when \r \n in string? |
|
|
# - tab/vtab? |
|
|
# - compare with wcswidth returning -1 for entire string containing such in python,perl |
|
|
proc wcswidth {string} { |
|
|
#faster than textutil::wcswidth (at least for string up to a few K in length) |
|
|
#..but - 'scan' is horrible for 400K+ (Tcl evaluation stack has to be reallocated/copied?) |
|
|
#Tcl initial evaluation stack size is 2000 (? review) |
|
|
#REVIEW - when we cater for grapheme clusters - we can't just split the string at arbitrary points like this!! |
|
|
set chunksize 2000 |
|
|
set chunks_required [expr {int(ceil([tcl::string::length $string] / double($chunksize)))}] |
|
|
set width 0 |
|
|
set startidx 0 |
|
|
set endidx [expr {$startidx + $chunksize -1}] |
|
|
for {set i 0} {$i < $chunks_required} {incr i} { |
|
|
set chunk [tcl::string::range $string $startidx $endidx] |
|
|
|
|
|
set codes [scan $chunk [tcl::string::repeat %c [tcl::string::length $chunk]]] |
|
|
foreach c $codes { |
|
|
if {$c <= 255} { |
|
|
if {$c == 9 || ($c >= 31 && $c != 127)} { |
|
|
#review - non-printing ascii? why does textutil::wcswidth report 1 ?? |
|
|
#todo - compare with python or other lang wcwidth |
|
|
incr width |
|
|
} |
|
|
} elseif {$c < 917504 || $c > 917631} { |
|
|
#TODO - various other joiners and non-printing chars |
|
|
set w [textutil::wcswidth_char $c] |
|
|
if {$w < 0} { |
|
|
return -1 |
|
|
} else { |
|
|
incr width $w |
|
|
} |
|
|
} |
|
|
} |
|
|
incr startidx $chunksize |
|
|
incr endidx $chunksize |
|
|
} |
|
|
return $width |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------------------------------------------ |
|
|
#test |
|
|
# ------------------------------------------------------------------------------------------------------ |
|
|
proc grapheme_split_tk {string} { |
|
|
if {![regexp "\[\uFF-\U10FFFF\]" $string]} { |
|
|
#only ascii (7 or 8 bit) - no joiners or unicode |
|
|
return [split $string {}] |
|
|
} |
|
|
package require tk |
|
|
set i 0 |
|
|
set graphemes [list] |
|
|
while {$i < [tcl::string::length $string]} { |
|
|
set aftercluster [tk::endOfCluster $string $i] |
|
|
lappend graphemes [string range $string $i $aftercluster-1] |
|
|
set i $aftercluster |
|
|
} |
|
|
return $graphemes |
|
|
} |
|
|
proc wcswidth_clustered {string} { |
|
|
package require tk |
|
|
set width 0 |
|
|
set i 0 |
|
|
if {![regexp "\[\uFF-\U10FFFF\]" $string]} { |
|
|
return [punk::char::wcswidth_unclustered $string] ;#still use our wcswidth to account for non-printable ascii |
|
|
} |
|
|
while {$i < [tcl::string::length $string]} { |
|
|
set aftercluster [tk::endOfCluster $string $i] |
|
|
set g [string range $string $i $aftercluster-1] |
|
|
if {$aftercluster > ($i + 1)} { |
|
|
#review - proper way to determine screen width (columns occupied) of a cluster?? |
|
|
#according to this: |
|
|
#https://lib.rs/crates/unicode-display-width |
|
|
#for each grapheme - if any of the code points in the cluster have an east asian width of 2, |
|
|
#The entire grapheme width is 2 regardless of how many code points constitute the grapheme |
|
|
set gw 1 |
|
|
foreach ch [split $g ""] { |
|
|
if {[punk::char::wcswidth_single $ch] == 2} { |
|
|
set gw 2 |
|
|
break |
|
|
} |
|
|
} |
|
|
incr width $gw |
|
|
|
|
|
#if {[string first \u200d $g] >=0} { |
|
|
# incr width 2 |
|
|
#} else { |
|
|
# #other joiners??? |
|
|
# incr width [wcswidth_unclustered $g] |
|
|
#} |
|
|
} else { |
|
|
incr width [wcswidth_unclustered $g] |
|
|
} |
|
|
set i $aftercluster |
|
|
} |
|
|
|
|
|
return $width |
|
|
} |
|
|
proc wcswidth_single {char} { |
|
|
scan $char %c dec |
|
|
if {$dec <= 255} { |
|
|
if {$dec == 9} { |
|
|
#tab always represented by at least one char in terminal etc. |
|
|
#caller will need to process tabs themselves to determine extra width applicable to their circumstance. |
|
|
return 1 |
|
|
} |
|
|
if {($dec < 31 || $dec == 127)} { |
|
|
return 0 |
|
|
} |
|
|
#review - non-printing ascii? why does textutil::wcswidth report 1 ?? |
|
|
#todo - compare with python or other lang wcwidth |
|
|
return 1 |
|
|
} elseif {$dec < 917504 || $dec > 917631} { |
|
|
#TODO - various other joiners and non-printing chars |
|
|
return [textutil::wcswidth_char $dec] ;#note textutil::wcswidth_char takes a decimal codepoint! |
|
|
#may return -1 - REVIEW |
|
|
} |
|
|
return 0 |
|
|
} |
|
|
proc wcswidth_unclustered1 {string} { |
|
|
set width 0 |
|
|
foreach c [split $string {}] { |
|
|
scan $c %c dec |
|
|
if {$dec <= 255} { |
|
|
if {$dec == 9 || ($dec >= 31 && $dec != 127)} { |
|
|
#review - non-printing ascii? why does textutil::wcswidth report 1 ?? |
|
|
#todo - compare with python or other lang wcwidth |
|
|
incr width |
|
|
} |
|
|
} elseif {$dec < 917504 || $dec > 917631} { |
|
|
#TODO - various other joiners and non-printing chars |
|
|
set w [textutil::wcswidth_char $dec] ;#takes decimal codepoint |
|
|
if {$w < 0} { |
|
|
return -1 |
|
|
} else { |
|
|
incr width $w |
|
|
} |
|
|
} |
|
|
} |
|
|
return $width |
|
|
} |
|
|
#todo - consider disallowing/erroring out when \r \n in string? |
|
|
# - tab/vtab? |
|
|
# - compare with wcswidth returning -1 for entire string containing such in python,perl |
|
|
proc wcswidth_unclustered {string} { |
|
|
#faster than textutil::wcswidth (at least for string up to a few K in length) |
|
|
#..but - 'scan' is horrible for 400K+ (Tcl evaluation stack has to be reallocated/copied?) |
|
|
#Tcl initial evaluation stack size is 2000 (? review) |
|
|
#we can only split the string at arbitrary points like this because we are specifically dealing with input that has no clusters!. |
|
|
set chunksize 2000 |
|
|
set chunks_required [expr {int(ceil([tcl::string::length $string] / double($chunksize)))}] |
|
|
set width 0 |
|
|
set startidx 0 |
|
|
set endidx [expr {$startidx + $chunksize -1}] |
|
|
for {set i 0} {$i < $chunks_required} {incr i} { |
|
|
set chunk [tcl::string::range $string $startidx $endidx] |
|
|
|
|
|
set codes [scan $chunk [tcl::string::repeat %c [tcl::string::length $chunk]]] |
|
|
foreach dec $codes { |
|
|
if {$dec <= 255} { |
|
|
if {($dec ==9 || ($dec >= 31 && $dec != 127))} { |
|
|
#review - non-printing ascii? why does textutil::wcswidth report 1 ?? |
|
|
#todo - compare with python or other lang wcswidth |
|
|
incr width |
|
|
} |
|
|
} elseif {$dec < 917504 || $dec > 917631} { |
|
|
#TODO - various other joiners and non-printing chars |
|
|
set w [textutil::wcswidth_char $dec] |
|
|
if {$w < 0} { |
|
|
return -1 |
|
|
} else { |
|
|
incr width $w |
|
|
} |
|
|
} |
|
|
} |
|
|
incr startidx $chunksize |
|
|
incr endidx $chunksize |
|
|
} |
|
|
return $width |
|
|
} |
|
|
# ------------------------------------------------------------------------------------------------------ |
|
|
|
|
|
proc wcswidth0 {string} { |
|
|
#faster than textutil::wcswidth (at least for string up to a few K in length) |
|
|
#..but - 'scan' is horrible for 400K+ |
|
|
#TODO |
|
|
set codes [scan $string [tcl::string::repeat %c [tcl::string::length $string]]] |
|
|
set width 0 |
|
|
foreach dec $codes { |
|
|
#unicode Tags block zero width |
|
|
if {$dec < 917504 || $dec > 917631} { |
|
|
if {$dec <= 255} { |
|
|
#review - non-printing ascii? why does textutil::wcswidth report 1 ?? |
|
|
#todo - compare with python or other lang wcwidth |
|
|
if {!($dec < 31 || $dec == 127)} { |
|
|
incr width |
|
|
} |
|
|
} else { |
|
|
#TODO - various other joiners and non-printing chars |
|
|
set w [textutil::wcswidth_char $dec] ;#takes decimal codepoint |
|
|
if {$w < 0} { |
|
|
return -1 |
|
|
} else { |
|
|
incr width $w |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
return $width |
|
|
} |
|
|
proc wcswidth2 {string} { |
|
|
set codes [scan $string [tcl::string::repeat %c [tcl::string::length $string]]] |
|
|
set widths [lmap dec $codes {textutil::wcswidth_char $dec}] |
|
|
if {-1 in $widths} { |
|
|
return -1 |
|
|
} |
|
|
return [tcl::mathop::+ {*}$widths] |
|
|
} |
|
|
|
|
|
#prerequisites - no ansi escapes - no newlines - utf8 encoding assumed |
|
|
#review - what about \r \t \b ? |
|
|
#NO processing of \b - already handled in ansi::printing_length which then calls this |
|
|
#this version breaks string into sequences of ascii vs unicode |
|
|
proc ansifreestring_width {text} { |
|
|
#caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines |
|
|
#we can c0 control characters after or while processing ansi escapes. |
|
|
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!) |
|
|
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error |
|
|
#if {[tcl::string::first \033 $text] >= 0} { |
|
|
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first" |
|
|
#} |
|
|
|
|
|
|
|
|
#todo - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc |
|
|
#as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence. |
|
|
|
|
|
|
|
|
#- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block} |
|
|
#- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block} |
|
|
#- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block} |
|
|
#- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block} |
|
|
#- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block} |
|
|
# |
|
|
# initial simplistic approach is just to strip these ... todo REVIEW |
|
|
|
|
|
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway |
|
|
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then) |
|
|
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)} |
|
|
#if {[regexp $re_leading_diacritic $text]} { |
|
|
# set text " $text" |
|
|
#} |
|
|
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} |
|
|
set text [regsub -all $re_diacritics $text ""] |
|
|
|
|
|
# -- --- --- --- --- --- --- |
|
|
#review |
|
|
#if we strip out ZWJ \u200d (zero width combiner) - then we don't give the chance for a grapheme combining test to merge properly e.g combining emojis |
|
|
#as at 2024 - textutil::wcswidth just uses the unicode east-asian width property data and doesn't seem to handle these specially - it counts this joiner and others as one wide (also BOM \uFFEF) |
|
|
|
|
|
#TODO - once we have proper grapheme cluster splitting - work out which of these characters should be left in and/or when exactly their length-effects apply |
|
|
# |
|
|
#for now - strip them out |
|
|
|
|
|
#ZWNJ \u200c should also be zero length - but as a 'non' joiner - it should add zero to the length |
|
|
#ZWSP \u200b zero width space |
|
|
|
|
|
#\uFFEFBOM/ ZWNBSP and others that should be zero width |
|
|
#todo - work out proper way to mark/group zero width. |
|
|
|
|
|
#set text [tcl::string::map [list \u200b "" \u200c "" \u200d "" \uFFEF ""] $text] |
|
|
set text [tcl::string::map [list \u200b "" \u200c "" \u200d ""] $text] |
|
|
#\uFFEF tends to print as 1 length replacement char - REVIEW |
|
|
#\uFFFF varies between terminals - some print replacement char (width 1) some print nothing (width 0) |
|
|
# -- --- --- --- --- --- --- |
|
|
|
|
|
#we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f |
|
|
#todo - document that these shouldn't be present in input rather than explicitly checking here |
|
|
|
|
|
#c0 controls + del (127 7f) - tab |
|
|
#set re_ascii_c0 {[\U0000-\U001F]} |
|
|
set re_ascii_c0 {[\u0000-\u0008\u000A-\u001F\u007F]} |
|
|
set text [regsub -all $re_ascii_c0 $text ""] |
|
|
|
|
|
#c1 controls - first section of the Latin-1 Supplement block - all non-printable from a utf-8 perspective |
|
|
#some or all of these may have a visual representation in other encodings e.g cp855 seems to have 1 width for them all |
|
|
#we are presuming that the text supplied has been converted from any other encoding to utf8 - so we don't need to handle that here |
|
|
#they should also be unlikely to be present in an ansi-free string (as is a precondition for use of this function) |
|
|
set text [regsub -all {[\u0080-\u009f]+} $text ""] |
|
|
|
|
|
|
|
|
#short-circuit basic cases |
|
|
#support tcl pre 2023-11 - see regexp bug below |
|
|
#if {![regexp {[\uFF-\U10FFFF]} $text]} { |
|
|
# return [tcl::string::length $text] |
|
|
#} |
|
|
if {![regexp "\[\uFF-\U10FFFF\]" $text]} { |
|
|
return [tcl::string::length $text] |
|
|
#punk::char::wcswidth has to split and examine dec value of each code |
|
|
#By stripping controls + 7F (leaving tab) we've already eliminated the non-printable ascii - REVIEW |
|
|
#return [punk::char::wcswidth $text] ;#still use our wcswidth to account for non-printable ascii |
|
|
} |
|
|
|
|
|
#split just to get the standalone character widths - and then scan for other combiners (?) - or scan for clusters first? |
|
|
|
|
|
#review |
|
|
#set can_regex_high_unicode [tcl::string::match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525] |
|
|
#tcl pre 2023-11 - braced high unicode regexes don't work |
|
|
#fixed in bug-4ed788c618 2023-11 |
|
|
#set uc_chars [regexp -all -inline "\[\u0100-\U10FFFF\]" $text] ;#e.g return list of chars in range only |
|
|
|
|
|
#maintain unicode as sequences - todo - scan for grapheme clusters |
|
|
#set uc_sequences [punk::ansi::ta::_perlish_split "(?:\[\u0100-\U10FFFF\])+" $text] |
|
|
set uc_sequences [punk::ansi::ta::_perlish_split "(?:\[\u0000-\u00FF\])+" $text] |
|
|
set len 0 |
|
|
foreach {uc ascii} $uc_sequences { |
|
|
#puts "-ascii $ascii" |
|
|
#puts "-uc $uc" |
|
|
incr len [tcl::string::length $ascii] |
|
|
#textutil::wcswidth uses unicode data |
|
|
#fall back to textutil::wcswidth (which doesn't for example handle diactricts/combiners so we can't use until these and other things such as \u200b and diacritics are already stripped/accounted for) |
|
|
#todo - find something that understands grapheme clusters - needed also for grapheme_split |
|
|
#use punk::char::wcswidth - faster than the string split in textutil::wcswidth but still uses textutil::wcswidth_char |
|
|
incr len [punk::char::wcswidth $uc] |
|
|
} |
|
|
#todo - work out what to do with anomalies like grave combiner which print at different lengths on different terminals (fonts?) and for which cursor-query escape sequence lies. |
|
|
return $len |
|
|
} |
|
|
|
|
|
#kept as a fallback for review/test if textutil::wcswidth doesn't do what we require on all terminals. |
|
|
#this version looks at console testwidths if they've been cached. |
|
|
#It is relatively fast - but tests unicode widths char by char - so won't be useful going forward for grapheme clusters. |
|
|
proc ansifreestring_width2 {text} { |
|
|
#caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines |
|
|
|
|
|
|
|
|
|
|
|
#we can c0 control characters after or while processing ansi escapes. |
|
|
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!) |
|
|
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error |
|
|
#if {[tcl::string::first \033 $text] >= 0} { |
|
|
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first" |
|
|
#} |
|
|
|
|
|
|
|
|
#todo - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc |
|
|
#as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence. |
|
|
|
|
|
|
|
|
#- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block} |
|
|
#- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block} |
|
|
#- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block} |
|
|
#- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block} |
|
|
#- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block} |
|
|
# |
|
|
# initial simplistic approach is just to strip these ... todo REVIEW |
|
|
|
|
|
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway |
|
|
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then) |
|
|
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)} |
|
|
#if {[regexp $re_leading_diacritic $text]} { |
|
|
# set text " $text" |
|
|
#} |
|
|
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} |
|
|
set text [regsub -all $re_diacritics $text ""] |
|
|
|
|
|
#review |
|
|
#if we strip out ZWJ \u0200d (zero width combiner) - then we don't give the chance for a grapheme combining test to merge properly e.g combining emojis |
|
|
#as at 2024 - textutil::wcswidth doesn't seem to be aware of these - and counts the joiner as one wide |
|
|
|
|
|
#ZWNJ \u0200c should also be zero length - but as a 'non' joiner - it should add zero to the length |
|
|
|
|
|
#ZWSP \u0200b zero width space |
|
|
|
|
|
|
|
|
#we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f |
|
|
#todo - document that these shouldn't be present in input rather than explicitly checking here |
|
|
set re_ascii_c0 {[\U0000-\U001F]} |
|
|
set text [regsub -all $re_ascii_c0 $text ""] |
|
|
|
|
|
#short-circuit basic cases |
|
|
#support tcl pre 2023-11 - see regexp bug below |
|
|
#if {![regexp {[\uFF-\U10FFFF]} $text]} { |
|
|
# return [tcl::string::length $text] |
|
|
#} |
|
|
if {![regexp "\[\uFF-\U10FFFF\]" $text]} { |
|
|
return [tcl::string::length $text] |
|
|
} |
|
|
|
|
|
#review - wcswidth should detect these |
|
|
set re_ascii_fullwidth {[\uFF01-\uFF5e]} |
|
|
|
|
|
set doublewidth_char_count 0 |
|
|
set zerowidth_char_count 0 |
|
|
#split just to get the standalone character widths - and then scan for other combiners (?) |
|
|
#review |
|
|
#set can_regex_high_unicode [tcl::string::match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525] |
|
|
#tcl pre 2023-11 - braced high unicode regexes don't work |
|
|
#fixed in bug-4ed788c618 2023-11 |
|
|
#set uc_sequences [regexp -all -inline -indices {[\u0100-\U10FFFF]} $text] |
|
|
#set uc_sequences [regexp -all -inline -indices "\[\u0100-\U10FFFF\]" $text] ;#e.g for string: \U0001f9d1abc\U001f525ab returns {0 0} {4 4} |
|
|
set uc_chars [regexp -all -inline "\[\u0100-\U10FFFF\]" $text] ;#e.g return list of chars in range only |
|
|
foreach c $uc_chars { |
|
|
if {[regexp $re_ascii_fullwidth $c]} { |
|
|
incr doublewidth_char_count |
|
|
} else { |
|
|
#review |
|
|
# a)- terminals lie - so we could have a bad cached testwidth |
|
|
# b)- textutil::wcswidth_char seems to be east-asian-width based - and not a reliable indicator of 'character cells taken by the character when printed to the terminal' despite this statement in tclllib docs. |
|
|
#(character width is a complex context-dependent topic) |
|
|
# c) by checking for a cached console testwidth first - we make this function less deterministic/repeatable depending on whether console tests have been run. |
|
|
# d) Upstream caching of grapheme_width may also lock in a result from whatever method was first employed here |
|
|
#Despite all this - the mechanism is hoped to give best effort consistency for terminals |
|
|
#further work needed for combining emojis etc - which can't be done in a per character loop |
|
|
#todo - see if wcswidth does any of this. It is very slow for strings that include mixed ascii/unicode - so perhaps we can use a perlish_split |
|
|
# to process sequences of unicode. |
|
|
#- and the user has the option to test character sets first if terminal position reporting gives better results |
|
|
if {[char_info_is_testwidth_cached $c]} { |
|
|
set width [char_info_testwidth_cached $c] |
|
|
} else { |
|
|
#textutil::wcswidth uses unicode data |
|
|
#fall back to textutil::wcswidth (which doesn't for example handle diactricts/combiners so we can't use until these and other things such as \u200b and diacritics are already stripped/accounted for) |
|
|
set width [textutil::wcswidth_char [scan $c %c]] |
|
|
} |
|
|
if {$width == 0} { |
|
|
incr zerowidth_char_count |
|
|
} elseif {$width == 2} { |
|
|
incr doublewidth_char_count |
|
|
} |
|
|
} |
|
|
} |
|
|
#todo - work out what to do with anomalies like grave combiner which print at different lengths on different terminals (fonts?) and for which cursor-query escape sequence lies. |
|
|
return [expr {[tcl::string::length $text] + $doublewidth_char_count - $zerowidth_char_count}] |
|
|
} |
|
|
|
|
|
#slow - textutil::wcswidth is slow with mixed ascii uc |
|
|
proc ansifreestring_width3 {text} { |
|
|
#caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines |
|
|
|
|
|
|
|
|
|
|
|
#we can c0 control characters after or while processing ansi escapes. |
|
|
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!) |
|
|
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error |
|
|
#if {[tcl::string::first \033 $text] >= 0} { |
|
|
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first" |
|
|
#} |
|
|
|
|
|
|
|
|
#review - work out what to do with anomalies like grave combiner which print at different lengths on different terminals (fonts?) and for which cursor-query escape sequence lies. |
|
|
# - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc |
|
|
#as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence. |
|
|
|
|
|
#- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block} |
|
|
#- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block} |
|
|
#- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block} |
|
|
#- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block} |
|
|
#- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block} |
|
|
# |
|
|
# initial simplistic approach is just to strip these ... todo REVIEW |
|
|
|
|
|
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway |
|
|
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then) |
|
|
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)} |
|
|
#if {[regexp $re_leading_diacritic $text]} { |
|
|
# set text " $text" |
|
|
#} |
|
|
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} |
|
|
set text [regsub -all $re_diacritics $text ""] |
|
|
|
|
|
#we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f |
|
|
#todo - document that these shouldn't be present in input rather than explicitly checking here |
|
|
set re_ascii_c0 {[\U0000-\U001F]} |
|
|
set text [regsub -all $re_ascii_c0 $text ""] |
|
|
|
|
|
#short-circuit basic cases |
|
|
#support tcl pre 2023-11 - see regexp bug below |
|
|
#if {![regexp {[\uFF-\U10FFFF]} $text]} { |
|
|
# return [tcl::string::length $text] |
|
|
#} |
|
|
if {![regexp "\[\uFF-\U10FFFF\]" $text]} { |
|
|
return [tcl::string::length $text] |
|
|
} |
|
|
|
|
|
#slow when ascii mixed with unicode (but why?) |
|
|
return [punk::char::wcswidth $text] |
|
|
} |
|
|
#This shouldn't be called on text containing ansi codes! |
|
|
proc strip_nonprinting_ascii {str} { |
|
|
#review - some single-byte 'control' chars have visual representations e.g ETX as heart depending on font/codepage |
|
|
#It is currently used for screen display width calculations |
|
|
#equivalent for various unicode combining chars etc? |
|
|
set map [list\ |
|
|
\x00 ""\ |
|
|
\x07 ""\ |
|
|
\x7f ""\ |
|
|
] |
|
|
return [tcl::string::map $map $str] |
|
|
} |
|
|
|
|
|
#todo - lookup from unicode tables |
|
|
variable flags [dict create\ |
|
|
AU \U1F1E6\U1F1FA\ |
|
|
US \U1F1FA\U1F1F8\ |
|
|
ZW \U1F1FF\U1F1FC |
|
|
] |
|
|
variable rflags |
|
|
dict for {k v} $flags { |
|
|
dict set rflags $v $k |
|
|
} |
|
|
|
|
|
|
|
|
proc flag_from_ascii {code} { |
|
|
variable flags |
|
|
if {[regexp {^[A-Z]{2}$} $code]} { |
|
|
if {[dict exists $flags $code]} { |
|
|
return [dict get $flags $code] |
|
|
} else { |
|
|
error "unsupported flags code: $code" |
|
|
} |
|
|
} else { |
|
|
#try as subregion |
|
|
#e.g gbeng,gbwls,gbsct |
|
|
return \U1f3f4[tag_from_ascii $code]\Ue007f |
|
|
} |
|
|
} |
|
|
proc flag_to_ascii {charsequence} { |
|
|
variable rflags |
|
|
if {[dict exists $rflags $charsequence]} { |
|
|
return [dict get $rflags $charsequence] |
|
|
} |
|
|
if {[string index $charsequence 0] eq "\U1F3F4" && [string index $charsequence end] eq "\UE007F"} { |
|
|
#subdivision flag |
|
|
set tag [string range $charsequence 1 end-1] |
|
|
return [tag_to_ascii $tag] |
|
|
} |
|
|
error "unknown flag $charsequence" |
|
|
} |
|
|
|
|
|
proc tag_to_ascii {t} { |
|
|
set fmt [string repeat %c [string length $t]] |
|
|
set declist [scan $t $fmt] |
|
|
#unicode Tags block - e0000 to e007f |
|
|
set declist [lmap dec $declist { |
|
|
if {$dec < 917504 || $dec > 917631} { |
|
|
error "char [ansistring VIEW -lf 1 -cr 1 -vt 1 [format %c $dec]] has decimal value $dec. Not in unicode Tags block range 917504-917631 (e0000-e007f)" |
|
|
} |
|
|
incr dec -917504 |
|
|
}] |
|
|
return [format $fmt {*}$declist] |
|
|
} |
|
|
proc tag_from_ascii {a} { |
|
|
set fmt [string repeat %c [string length $a]] |
|
|
set declist [scan $a $fmt] |
|
|
set declist [lmap dec $declist { |
|
|
if {$dec > 127} { |
|
|
error "char [ansistring VIEW -lf 1 -cr 1 -vt 1 [format %c $dec]] has decimal value $dec. Not in ascii range 0-127" |
|
|
} |
|
|
incr dec 917504 |
|
|
}] |
|
|
return [format $fmt {*}$declist] |
|
|
} |
|
|
|
|
|
#split into plaintext and runs of combiners (combining diacritical marks - not ZWJ or ZWJNJ) |
|
|
proc combiner_split {text} { |
|
|
#split into odd numbered list (or zero) in a similar way to punk::ansi::ta::_perlish_split |
|
|
# |
|
|
#set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} |
|
|
set graphemes [list] |
|
|
if {[tcl::string::length $text] == 0} { |
|
|
return {} |
|
|
} |
|
|
set list [list] |
|
|
set start 0 |
|
|
set strlen [tcl::string::length $text] |
|
|
#make sure our regexes aren't non-greedy - or we may not have exit condition for loop |
|
|
#review |
|
|
while {$start < $strlen && [regexp -start $start -indices -- {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} $text match]} { |
|
|
lassign $match matchStart matchEnd |
|
|
#puts "->start $start ->match $matchStart $matchEnd" |
|
|
lappend list [tcl::string::range $text $start $matchStart-1] [tcl::string::range $text $matchStart $matchEnd] |
|
|
set start [expr {$matchEnd+1}] |
|
|
} |
|
|
lappend list [tcl::string::range $text $start end] |
|
|
} |
|
|
|
|
|
#ZWJ ZWNJ ? |
|
|
#SWSP ? |
|
|
|
|
|
#1st shot - basic diacritics |
|
|
#todo - become aware of unicode grapheme cluster boundaries |
|
|
#This is difficult in Tcl without unicode property based Character Classes in the regex engine |
|
|
#review - this needs to be performant - it is used a lot by punk terminal/ansi features |
|
|
#todo - trie data structures for unicode? |
|
|
#for now we can at least combine diacritics |
|
|
#should also handle the ZWJ (and the variation selectors? eg \uFE0F) character which should account for emoji clusters |
|
|
#Note - emoji cluster could be for example 11 code points/41 bytes (family emoji with skin tone modifiers for each member, 3 ZWJs) |
|
|
#This still leaves a whole class of clusters.. korean etc unhandled :/ |
|
|
#todo - tk::startOfCluster / tk::endOfCluster - try to get it brought into Tcl |
|
|
#https://core.tcl-lang.org/tcl/tktview/a4c7eeaf63 |
|
|
proc grapheme_split {text} { |
|
|
set graphemes [list] |
|
|
set csplits [combiner_split $text] |
|
|
foreach {pt combiners} [lrange $csplits 0 end-1] { |
|
|
set clist [split $pt ""] |
|
|
lappend graphemes {*}[lrange $clist 0 end-1] |
|
|
lappend graphemes [tcl::string::cat [lindex $clist end] $combiners] |
|
|
} |
|
|
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme |
|
|
if {[lindex $csplits end] ne ""} { |
|
|
lappend graphemes {*}[split [lindex $csplits end] ""] |
|
|
} |
|
|
return $graphemes |
|
|
} |
|
|
proc grapheme_split_dec {text} { |
|
|
set graphemes [list] |
|
|
set csplits [combiner_split $text] |
|
|
foreach {pt combiners} [lrange $csplits 0 end-1] { |
|
|
set pt_decs [scan $pt [tcl::string::repeat %c [tcl::string::length $pt]]] ;#warning scan %c... slow for v large strings (e.g 400k+) |
|
|
set combiner_decs [scan $combiners [tcl::string::repeat %c [tcl::string::length $combiners]]] |
|
|
lset pt_decs end [concat [lindex $pt_decs end] $combiner_decs] |
|
|
lappend graphemes {*}$pt_decs |
|
|
} |
|
|
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme |
|
|
if {[lindex $csplits end] ne ""} { |
|
|
lappend graphemes {*}[scan [lindex $csplits end] [tcl::string::repeat %c [tcl::string::length [lindex $csplits end]]]] |
|
|
} |
|
|
return $graphemes |
|
|
} |
|
|
proc grapheme_split_dec2 {text} { |
|
|
set graphemes [list] |
|
|
set csplits [combiner_split $text] |
|
|
foreach {pt combiners} $csplits { |
|
|
set pt_decs [scan $pt [tcl::string::repeat %c [tcl::string::length $pt]]] |
|
|
if {$combiners ne ""} { |
|
|
set combiner_decs [scan $combiners [tcl::string::repeat %c [tcl::string::length $combiners]]] |
|
|
lset pt_decs end [concat [lindex $pt_decs end] $combiner_decs] |
|
|
} |
|
|
lappend graphemes {*}$pt_decs |
|
|
} |
|
|
return $graphemes |
|
|
} |
|
|
proc grapheme_split2 {text} { |
|
|
set graphemes [list] |
|
|
set csplits [combiner_split $text] |
|
|
foreach {pt combiners} [lrange $csplits 0 end-1] { |
|
|
set clist [split $pt ""] |
|
|
lappend graphemes {*}[lrange $clist 0 end-1] [tcl::string::cat [lindex $clist end] $combiners] |
|
|
} |
|
|
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme |
|
|
if {[lindex $csplits end] ne ""} { |
|
|
lappend graphemes {*}[split [lindex $csplits end] ""] |
|
|
} |
|
|
return $graphemes |
|
|
} |
|
|
|
|
|
# -- --- --- --- --- |
|
|
#will accept a single char or a string - test using console cursor position reporting |
|
|
#unreliable |
|
|
proc char_info_testwidth {ch {emit 0}} { |
|
|
package require punk::console |
|
|
#uses cursor movement and relies on console to report position.. which it may do properly for single chars - but may misreport for combinations that combine into a single glyph |
|
|
tailcall punk::console::test_char_width $ch $emit |
|
|
} |
|
|
proc char_info_testwidth_cached {char} { |
|
|
variable charinfo |
|
|
set dec [scan $char %c] |
|
|
set twidth "" |
|
|
if {[tcl::dict::exists $charinfo $dec testwidth]} { |
|
|
set twidth [tcl::dict::get $charinfo $dec testwidth] |
|
|
} |
|
|
if {$twidth eq ""} { |
|
|
set width [char_info_testwidth $char] |
|
|
tcl::dict::set charinfo $dec testwidth $width |
|
|
return $width |
|
|
} else { |
|
|
return $twidth |
|
|
} |
|
|
} |
|
|
proc char_info_is_testwidth_cached {char} { |
|
|
variable charinfo |
|
|
return [tcl::dict::exists $charinfo [scan $char %c] testwidth] |
|
|
} |
|
|
# -- --- --- --- --- |
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
## Ready |
|
|
package provide punk::char [tcl::namespace::eval punk::char { |
|
|
variable version |
|
|
set version 0.1.0 |
|
|
}] |
|
|
return |
|
|
|
|
|
#*** !doctools |
|
|
#[manpage_end] |
|
|
|
|
|
|
|
|
|