From 604d363d92350d14ce700e1c00cc119693293da3 Mon Sep 17 00:00:00 2001 From: Julian Noble Date: Thu, 1 Feb 2024 07:11:54 +1100 Subject: [PATCH] punk::ansi fixes and improvements, bootsupport modules --- src/bootsupport/include_modules.config | 12 +- src/bootsupport/modules/punk/ansi-0.1.0.tm | 1028 +++++++++ src/bootsupport/modules/punk/args-0.1.0.tm | 625 ++++++ src/bootsupport/modules/punk/char-0.1.0.tm | 1921 +++++++++++++++++ src/bootsupport/modules/punk/console-0.1.0.tm | 916 ++++++++ .../modules/punk/fileline-0.1.0.tm | 1710 +++++++++++++++ src/bootsupport/modules/punk/lib-0.1.0.tm | 619 ++++++ .../man/files/punk/_module_ansi-0.1.0.tm.n | 36 + .../doc/files/punk/_module_ansi-0.1.0.tm.md | 57 + .../doc/files/punk/_module_ansi-0.1.0.tm.html | 18 + src/modules/punk/ansi-999999.0a1.0.tm | 280 ++- src/modules/shellfilter-0.1.8.tm | 65 - 12 files changed, 7200 insertions(+), 87 deletions(-) create mode 100644 src/bootsupport/modules/punk/ansi-0.1.0.tm create mode 100644 src/bootsupport/modules/punk/args-0.1.0.tm create mode 100644 src/bootsupport/modules/punk/char-0.1.0.tm create mode 100644 src/bootsupport/modules/punk/console-0.1.0.tm create mode 100644 src/bootsupport/modules/punk/fileline-0.1.0.tm create mode 100644 src/bootsupport/modules/punk/lib-0.1.0.tm diff --git a/src/bootsupport/include_modules.config b/src/bootsupport/include_modules.config index aa6c50f..4c31e88 100644 --- a/src/bootsupport/include_modules.config +++ b/src/bootsupport/include_modules.config @@ -7,14 +7,19 @@ set bootsupport_modules [list\ src/vendormodules oolib\ src/vendormodules http\ modules punkcheck\ - modules punk::ns\ - modules punk::path\ + modules punk::ansi\ + modules punk::args\ modules punk::cap\ modules punk::cap::handlers::caphandler\ modules punk::cap::handlers::scriptlibs\ modules punk::cap::handlers::templates\ + modules punk::char\ + modules punk::console\ modules punk::du\ + modules punk::encmime\ + modules punk::fileline\ modules punk::docgen\ + modules punk::lib\ modules punk::mix\ modules punk::mix::base\ modules punk::mix::cli\ @@ -29,9 +34,10 @@ set bootsupport_modules [list\ modules punk::mix::commandset::project\ modules punk::mix::commandset::repo\ modules punk::mix::commandset::scriptwrap\ + modules punk::ns\ modules punk::overlay\ + modules punk::path\ modules punk::repo\ - modules punk::encmime\ modules punk::tdl\ modules punk::winpath\ ] diff --git a/src/bootsupport/modules/punk/ansi-0.1.0.tm b/src/bootsupport/modules/punk/ansi-0.1.0.tm new file mode 100644 index 0000000..d465796 --- /dev/null +++ b/src/bootsupport/modules/punk/ansi-0.1.0.tm @@ -0,0 +1,1028 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. 
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) 2023 +# +# @@ Meta Begin +# Application punk::ansi 0.1.0 +# Meta platform tcl +# Meta license +# @@ Meta End + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# doctools header +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[manpage_begin punkshell_module_punk::ansi 0 0.1.0] +#[copyright "2023"] +#[titledesc {Ansi string functions}] [comment {-- Name section and table of contents description --}] +#[moddesc {punk Ansi library}] [comment {-- Description at end of page heading --}] +#[require punk::ansi] +#[keywords module ansi terminal console string] +#[description] +#[para]Ansi based terminal control string functions +#[para]See [package punk::ansi::console] for related functions for controlling a console + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section Overview] +#[para] overview of punk::ansi +#[para]punk::ansi functions return their values - no implicit emission to console/stdout +#[subsection Concepts] +#[para]Ansi codes can be used to control most terminals on most platforms in an 'almost' standard manner +#[para]There are many differences in terminal implementations - but most should support a core set of features +#[para]punk::ansi does not contain any code for direct terminal manipulation via the local system APIs. +#[para]Sticking to ansi codes where possible may be better for cross-platform and remote operation where such APIs are unlikely to be useable. + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[subsection dependencies] +#[para] packages used by punk::ansi +#[list_begin itemized] + +package require Tcl 8.6 +#*** !doctools +#[item] [package {Tcl 8.6}] + +# #package require frobz +# #*** !doctools +# #[item] [package {frobz}] + +#*** !doctools +#[list_end] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section API] + + + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval punk::ansi { + #*** !doctools + #[subsection {Namespace punk::ansi}] + #[para] Core API functions for punk::ansi + #[list_begin definitions] + + + #see also ansicolor page on wiki https://wiki.tcl-lang.org/page/ANSI+color+control + + variable test "blah\033\[1;33mETC\033\[0;mOK" + + + #Note that a? is actually a pattern. We can't explicitly match it without also matcing a+ ab etc. Presumably this won't matter here. + namespace export\ + {a?} {a+} a \ + convert*\ + clear*\ + cursor_*\ + detect*\ + get_*\ + move*\ + reset*\ + strip*\ + test_decaln\ + titleset\ + + + variable escape_terminals + #single "final byte" in the range 0x40–0x7E (ASCII @A–Z[\]^_`a–z{|}~). + dict set escape_terminals CSI [list @ \\ ^ _ ` | ~ a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z "\{" "\}"] + #dict set escape_terminals CSI [list J K m n A B C D E F G s u] ;#basic + dict set escape_terminals OSC [list \007 \033\\ \u009c] ;#note mix of 1 and 2-byte terminals + dict set escape_terminals DCS [list \007 \033\\ \u009c] + dict set escape_terminals MISC [list \007 \033\\ \u009c] + #NOTE - we are assuming an OSC or DCS started with one type of sequence (7 or 8bit) can be terminated by either 7 or 8 bit ST (or BEL e.g wezterm ) + #This using a different type of ST to that of the opening sequence is presumably unlikely in the wild - but who knows? + + #self-contained 2 byte ansi escape sequences - review more? 
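+    #For illustration - two-byte escapes are complete as-is, whereas CSI sequences run until a final byte such as 'm' (see escape_terminals above):
+    #  puts -nonewline "\x1b7"       ;#DECSC - save cursor position - complete two-byte sequence, no parameters or terminator
+    #  puts -nonewline "\x1b\[31m"   ;#CSI SGR - parameters continue until the final byte 'm'
+    #  puts -nonewline "\x1b8"       ;#DECRC - restore cursor position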
+ variable ansi_2byte_codes_dict + set ansi_2byte_codes_dict [dict create\ + "reset_terminal" "\u001bc"\ + "save_cursor_posn" "\u001b7"\ + "restore_cursor_posn" "\u001b8"\ + "cursor_up_one" "\u001bM"\ + "NEL - Next Line" "\u001bE"\ + "IND - Down one line" "\u001bD"\ + "HTS - Set Tab Stop" "\u001bH"\ + "DECPAM app keypad" "\x1b="\ + "DECPNM norm keypad" "\x1b>"\ + ] + + + #debatable whether strip should reveal the somethinghidden - some terminals don't hide it anyway. + # "PM - Privacy Message" "\u001b^somethinghidden\033\\"\ + #The intent is that it's not rendered to the terminal - so on balance it seems best to strip it out. + #todo - review - printing_length calculations affected by whether terminal honours PMs or not. detect and accomodate. + #candidate for zig/c implementation? + proc stripansi {text} { + #*** !doctools + #[call [fun stripansi] [arg text] ] + #[para]Return a string with ansi codes stripped out + + #todo - character set selection - SS2 SS3 - how are they terminated? REVIEW + + variable escape_terminals ;#dict + + set text [convert_g0 $text] + + + #we should just map away the 2-byte sequences too + #standalone 3 byte VT100 sequences - some of these work in wezterm + #\x1b#3 double-height letters top half + #\x1b#4 double-height letters bottom half + #\x1b#5 single-width line + #\x1b#6 double-width line + #\x1b#8 dec test fill screen + + set clean_map_2b [list \x1bc "" \x1b7 "" \x1b8 "" \x1bM "" \x1bE "" \x1bD "" \x1bH "" \x1b= "" \x1b> ""] + set clean_map_3b [list \x1b#3 "" \x1b#4 "" \x1b#5 "" \x1b#6 "" \x1b#8 ""] + set text [string map [concat $clean_map_2b $clean_map_3b] $text] + + #we process char by char - line-endings whether \r\n or \n should be processed as per any other character. + #line endings can theoretically occur within an ansi escape sequence payload (review e.g title?) + + set inputlist [split $text ""] + set outputlist [list] + + set in_escapesequence 0 + #assumption - undertext already 'rendered' - ie no backspaces or carriagereturns or other cursor movement controls + set i 0 + foreach u $inputlist { + set v [lindex $inputlist $i+1] + set uv ${u}${v} + if {$in_escapesequence eq "2b"} { + #2nd byte - done. + set in_escapesequence 0 + } elseif {$in_escapesequence != 0} { + set endseq [dict get $escape_terminals $in_escapesequence] + if {$u in $endseq} { + set in_escapesequence 0 + } elseif {$uv in $endseq} { + set in_escapseequence 2b ;#flag next byte as last in sequence + } + } else { + #handle both 7-bit and 8-bit CSI and OSC + if {[regexp {^(?:\033\[|\u009b)} $uv]} { + set in_escapesequence CSI + } elseif {[regexp {^(?:\033\]|\u009d)} $uv]} { + set in_escapesequence OSC + } elseif {[regexp {^(?:\033P|\u0090)} $uv]} { + set in_escapesequence DCS + } elseif {[regexp {^(?:\033X|\u0098|\033^|\u009E|\033_|\u009F)} $uv]} { + #SOS,PM,APC - all terminated with ST + set in_escapesequence MISC + } else { + lappend outputlist $u + } + } + incr i + } + return [join $outputlist ""] + } + + #review - what happens when no terminator? + #todo - map other chars to unicode equivs + proc convert_g0 {text} { + #using not \033 inside to stop greediness - review how does it compare to ".*?" 
+ set re {\033\(0[^\033]*\033\(B} + set re2 {\033\(0(.*)\033\(B} ;#capturing + set parts [::punk::ansi::ta::_perlish_split $re $text] + set out "" + foreach {pt g} $parts { + append out $pt + if {$g ne ""} { + #puts --$g-- + #box sample + #lqk + #x x + #mqj + #m = boxd_lur + #set map [list l \u250f k \u2513] ;#heavy + set map [list l \u250c q \u2500 k \u2510 x \u2502 m \u2514 j \u2518] ;#light + + regexp $re2 $g _match contents + append out [string map $map $contents] + } + } + return $out + } + + #todo - convert esc(0 graphics sequences to single char unicode equivalents e.g box drawing set + # esc) ?? + proc stripansi_gx {text} { + #e.g "\033(0" - select VT100 graphics for character set G0 + #e.g "\033(B" - reset + #e.g "\033)0" - select VT100 graphics for character set G1 + #e.g "\033)X" - where X is any char other than 0 to reset ?? + return [convert_g0 $text] + } + + + #CSI m = SGR (Select Graphic Rendition) + variable SGR_setting_map { + bold 1 dim 2 blink 5 fastblink 6 noblink 25 hide 8 normal 22 + underline 4 doubleunderline 21 nounderline 24 strike 9 nostrike 29 italic 3 noitalic 23 + reverse 7 noreverse 27 defaultfg 39 defaultbg 49 + overline 53 nooverline 55 frame 51 framecircle 52 noframe 54 + } + variable SGR_colour_map { + black 30 red 31 green 32 yellow 33 blue 34 purple 35 cyan 36 white 37 + Black 40 Red 41 Green 42 Yellow 43 Blue 44 Purple 45 Cyan 46 White 47 + BLACK 100 RED 101 GREEN 102 YELLOW 103 BLUE 104 PURPLE 105 CYAN 106 WHITE 107 + } + variable SGR_map + set SGR_map [dict merge $SGR_colour_map $SGR_setting_map] + + + proc colourmap1 {{bgname White}} { + package require textblock + + set bg [textblock::block 3 33 "[a+ $bgname] [a]"] + set colormap "" + for {set i 0} {$i <= 7} {incr i} { + append colormap "_[a+ white bold 48\;5\;$i] $i [a]" + } + set map1 [overtype::left -transparent _ $bg "\n$colormap"] + return $map1 + } + proc colourmap2 {{bgname White}} { + package require textblock + set bg [textblock::block 3 39 "[a+ $bgname] [a]"] + set colormap "" + for {set i 8} {$i <= 15} {incr i} { + append colormap "_[a+ black normal 48\;5\;$i] $i [a]" ;#black normal is blacker than black bold - which often displays as a grey + } + set map2 [overtype::left -transparent _ $bg "\n$colormap"] + return $map2 + } + proc a? {args} { + #*** !doctools + #[call [fun a?] [opt {ansicode...}]] + #[para]Return an ansi string representing a table of codes and a panel showing the colours + variable SGR_setting_map + variable SGR_colour_map + + if {![llength $args]} { + set out "" + append out $SGR_setting_map \n + append out $SGR_colour_map \n + + try { + package require overtype ;# circular dependency - many components require overtype. Here we only need it for nice layout in the a? 
query proc - so we'll do a soft-dependency by only loading when needed and also wrapping in a try + set bgname "White" + set map1 [colourmap1 $bgname] + set map1 [overtype::centre -transparent 1 $map1 "[a black $bgname]Standard colours[a]"] + set map2 [colourmap2 $bgname] + set map2 [overtype::centre -transparent 1 $map2 "[a black $bgname]High-intensity colours[a]"] + append out [textblock::join $map1 " " $map2] \n + #append out $map1[a] \n + #append out $map2[a] \n + + + + } on error {result options} { + puts stderr "Failed to draw colormap" + puts stderr "$result" + } finally { + return $out + } + } else { + set result [list] + set rmap [lreverse $map] + foreach i $args { + if {[string is integer -strict $i]} { + if {[dict exists $rmap $i]} { + lappend result $i [dict get $rmap $i] + } + } else { + if {[dict exists $map $i]} { + lappend result $i [dict get $map $i] + } + } + } + return $result + } + } + proc a+ {args} { + #*** !doctools + #[call [fun a+] [opt {ansicode...}]] + #[para]Returns the ansi code to apply those from the supplied list - without any reset being performed first + #[para] e.g to set foreground red and bold + #[para]punk::ansi::a red bold + #[para]to set background red + #[para]punk::ansi::a Red + #[para]see [cmd punk::ansi::a?] to display a list of codes + + #don't disable ansi here. + #we want this to be available to call even if ansi is off + variable SGR_map + set t [list] + foreach i $args { + if {[string is integer -strict $i]} { + lappend t $i + } elseif {[string first ";" $i] >=0} { + #literal with params + lappend t $i + } else { + if {[dict exists $SGR_map $i]} { + lappend t [dict get $SGR_map $i] + } else { + #accept examples for foreground + # 256f-# or 256fg-# or 256f# + # rgbf--- or rgbfg--- or rgbf-- + if {[string match -nocase "256f*" $i]} { + set cc [string trim [string range $i 4 end] -gG] + lappend t "38;5;$cc" + } elseif {[string match -nocase 256b* $i]} { + set cc [string trim [string range $i 4 end] -gG] + lappend t "48;5;$cc" + } elseif {[string match -nocase rgbf* $i]} { + set rgb [string trim [string range $i 4 end] -gG] + lassign [split $rgb -] r g b + lappend t "38;2;$r;$g;$b" + } elseif {[string match -nocase rgbb* $i]} { + set rgb [string trim [string range $i 4 end] -gG] + lassign [split $rgb -] r g b + lappend t "48;2;$r;$g;$b" + } + } + } + } + # \033 - octal. equivalently \x1b in hex which is more common in documentation + if {![llength $t]} { + return "" ;# a+ nonexistent should return nothing rather than a reset ( \033\[\;m is a reset even without explicit zero(s)) + } + return "\x1b\[[join $t {;}]m" + } + proc a {args} { + #*** !doctools + #[call [fun a] [opt {ansicode...}]] + #[para]Returns the ansi code to reset any current settings and apply those from the supplied list + #[para] by calling punk::ansi::a with no arguments - the result is a reset to plain text + #[para] e.g to set foreground red and bold + #[para]punk::ansi::a red bold + #[para]to set background red + #[para]punk::ansi::a Red + #[para]see [cmd punk::ansi::a?] to display a list of codes + + + #don't disable ansi here. 
+ #we want this to be available to call even if ansi is off + variable SGR_map + set t [list] + foreach i $args { + if {[string is integer -strict $i]} { + lappend t $i + } elseif {[string first ";" $i] >=0} { + #literal with params + lappend t $i + } else { + if {[dict exists $SGR_map $i]} { + lappend t [dict get $SGR_map $i] + } else { + #accept examples for foreground + # 256f-# or 256fg-# or 256f# + # rgbf--- or rgbfg--- or rgbf-- + if {[string match -nocase "256f*" $i]} { + set cc [string trim [string range $i 4 end] -gG] + lappend t "38;5;$cc" + } elseif {[string match -nocase 256b* $i]} { + set cc [string trim [string range $i 4 end] -gG] + lappend t "48;5;$cc" + } elseif {[string match -nocase rgbf* $i]} { + set rgb [string trim [string range $i 4 end] -gG] + lassign [split $rgb -] r g b + lappend t "38;2;$r;$g;$b" + } elseif {[string match -nocase rgbb* $i]} { + set rgb [string trim [string range $i 4 end] -gG] + lassign [split $rgb -] r g b + lappend t "48;2;$r;$g;$b" + } + } + } + } + # \033 - octal. equivalently \x1b in hex which is more common in documentation + # empty list [a=] should do reset - same for [a= nonexistant] + # explicit reset at beginning of parameter list for a= (as opposed to a+) + set t [linsert $t 0 0] + return "\x1b\[[join $t {;}]m" + } + + + + + proc get_code_name {code} { + #*** !doctools + #[call [fun get_code_name] [arg code]] + #[para]for example + #[para] get_code_name red will return 31 + #[para] get_code_name 31 will return red + variable SGR_map + set res [list] + foreach i [split $code ";"] { + set ix [lsearch -exact $SGR_map $i] + if {[string is digit -strict $code]} { + if {$ix>-1} {lappend res [lindex $SGR_map [incr ix -1]]} + } else { + #reverse lookup code from name + if {$ix>-1} {lappend res [lindex $SGR_map [incr ix]]} + } + } + set res + } + proc reset {} { + #*** !doctools + #[call [fun reset]] + #[para]reset console + return "\x1bc" + } + proc reset_soft {} { + #*** !doctools + #[call [fun reset_soft]] + return \x1b\[!p + } + proc reset_colour {} { + #*** !doctools + #[call [fun reset_colour]] + #[para]reset colour only + return "\x1b\[0m" + } + + # -- --- --- --- --- + proc clear {} { + #*** !doctools + #[call [fun clear]] + return "\033\[2J" + } + proc clear_above {} { + #*** !doctools + #[call [fun clear_above]] + return \033\[1J + } + proc clear_below {} { + #*** !doctools + #[call [fun clear_below]] + return \033\[0J + } + + proc clear_all {} { + # - doesn't work?? 
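+        #CSI 3 J is the xterm 'erase saved lines' extension - it targets the scrollback buffer rather than the visible screen,
+        #and support varies by terminal, which may be why it appears to do nothing. Some terminals expect it combined with 2J e.g:
+        #  puts -nonewline "[punk::ansi::clear][punk::ansi::clear_all]"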
+ return \033\[3J + } + #see also erase_ functions + # -- --- --- --- --- + + proc cursor_on {} { + #*** !doctools + #[call [fun cursor_on]] + return "\033\[?25h" + } + proc cursor_off {} { + #*** !doctools + #[call [fun cursor_off]] + return "\033\[?25l" + } + + # -- --- --- --- --- + proc move {row col} { + #*** !doctools + #[call [fun move] [arg row] [arg col]] + #[para]Return an ansi sequence to move to row,col + #[para]aka cursor home + return \033\[${row}\;${col}H + } + proc move_emit {row col data args} { + #*** !doctools + #[call [fun move_emit] [arg row] [arg col] [arg data] [opt {row col data...}]] + #[para]Return an ansi string representing a move to row col with data appended + #[para]row col data can be repeated any number of times to return a string representing the output of the data elements at all those points + #[para]Compare to punk::console::move_emit which calls this function - but writes it to stdout + #[para]punk::console::move_emit_return will also return the cursor to the original position + #[para]There is no punk::ansi::move_emit_return because in a standard console there is no ansi string which can represent a jump back to starting position. + #[para]There is an ansi code to write the current cursor position to stdin (which will generally display on the console) - this is not quite the same thing. + #[para]punk::console::move_emit_return does it by emitting that code and starting a loop to read stdin + #[para]punk::ansi could implement a move_emit_return using the punk::console mechanism - but the resulting string would capture the cursor position at the time the string is built - which is not necessarily when the string is used. + #[para]The following example shows how to do this manually, emitting the string blah at screen position 10,10 and emitting DONE back at the line we started: + #[para][example {punk::ansi::move_emit 10 10 blah {*}[punk::console::get_cursor_pos_list] DONE}] + #[para]A string created by any move_emit_return for punk::ansi would not behave in an intuitive manner compared to other punk::ansi move functions - so is deliberately omitted. + + set out "" + append out \033\[${row}\;${col}H$data + foreach {row col data} $args { + append out \033\[${row}\;${col}H$data + } + return $out + } + proc move_forward {{n 1}} { + #*** !doctools + #[call [fun move_forward] [arg n]] + return \033\[${n}C + } + proc move_back {{n 1}} { + #*** !doctools + #[call [fun move_back] [arg n]] + return \033\[${n}D + } + proc move_up {{n 1}} { + #*** !doctools + #[call [fun move_up] [arg n]] + return \033\[${n}A + } + proc move_down {{n 1}} { + #*** !doctools + #[call [fun move_down] [arg n]] + return \033\[${n}B + } + # -- --- --- --- --- + + + # -- --- --- --- --- + proc erase_line {} { + #*** !doctools + #[call [fun erase_line]] + return \033\[2K + } + proc erase_sol {} { + #*** !doctools + #[call [fun erase_sol]] + #[para]Erase to start of line, leaving cursor position alone. + return \033\[1K + } + proc erase_eol {} { + #*** !doctools + #[call [fun erase_eol]] + return \033\[K + } + #see also clear_above clear_below + # -- --- --- --- --- + + proc cursor_pos {} { + #*** !doctools + #[call [fun cursor_pos]] + #[para]cursor_pos unlikely to be useful on it's own like this as when written to the terminal, this sequence causes the terminal to emit the row;col sequence to stdin + #[para]The output on screen will look something like ^[lb][lb]47;3R + #[para]Use punk::console::get_cursor_pos or punk::console::get_cursor_pos_list instead. 
+ #[para]These functions will emit the code - but read it in from stdin so that it doesn't display, and then return the row and column as a colon-delimited string or list respectively. + #[para]The punk::ansi::cursor_pos function is used by punk::console::get_cursor_pos and punk::console::get_cursor_pos_list + return \033\[6n + } + + + #alternative to string terminator is \007 - + proc titleset {windowtitle} { + #*** !doctools + #[call [fun titleset] [arg windowtitles]] + #[para]Returns the code to set the title of the terminal window to windowtitle + #[para]This may not work on terminals which have multiple panes/windows + return "\033\]2;$windowtitle\033\\" ;#works for xterm and most derivatives + } + #titleget - https://invisible-island.net/xterm/xterm.faq.html#how2_title + #no cross-platform ansi-only mechanism ? + + proc test_decaln {} { + #Screen Alignment Test + #Reset margins, move cursor to the top left, and fill the screen with 'E' + #(doesn't work on many terminals - seems to work in FreeBSD 13.2 and wezterm on windows) + return \x1b#8 + } + + #length of text for printing characters only + #review - unicode and other non-printing chars and combining sequences? + #certain unicode chars are full-width (single char 2 columns wide) e.g see "Halfwdith and fullwidth forms" and ascii_fuillwidth blocks in punk::char::charset_names + #review - is there an existing library or better method? print to a terminal and query cursor position? + #Note this length calculation is only suitable for lines being appended to other strings if the line is pre-processed to account for backspace and carriage returns first + #If the raw line is appended to another string without such processing - the backspaces & carriage returns can affect data prior to the start of the string. + proc printing_length {line} { + if {[string first \n $line] >= 0} { + error "line_print_length must not contain newline characters" + } + + #review - + set line [punk::ansi::stripansi $line] + + set line [punk::char::strip_nonprinting_ascii $line] ;#only strip nonprinting after stripansi - some like BEL are part of ansi + #backspace 0x08 only erases* printing characters anyway - so presumably order of processing doesn't matter + #(* more correctly - moves cursor back) + #backspace will not move beyond a preceding newline - but we have disallowed newlines for this function already + #leading backspaces will eat into any string (even prompt in non-standard tclsh shell) that is prepended to the line + # - but for the purposes of overtype we wouldn't want that - so we strip it here in the length calculation and should strip leading backspaces in the actual data concatenation operations too. 
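+        #e.g (illustrative - assuming \b and \r survive punk::char::strip_nonprinting_ascii, since they are handled explicitly below):
+        #  printing_length "abc\bX"   ;#expected 3 - the X overprints the c, rendering as "abX"
+        #  printing_length "abc\rXY"  ;#expected 3 - XY overprints ab, the c remains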
+ #curiously - a backspace sequence at the end of a string also doesn't reduce the printing width - so we can also strip from RHS + + #Note that backspace following a \t will only shorten the string by one (ie it doesn't move back the whole tab width like it does interactively in the terminal) + #for this reason - it would seem best to normalize the tabs to spaces prior to performing the backspace calculation - otherwise we won't account for the 'short' tabs it effectivley produces + #normalize tabs to an appropriate* width + #*todo - handle terminal/context where tabwidth != the default 8 spaces + set line [textutil::tabify::untabify2 $line] + + set bs [format %c 0x08] + #set line [string map [list "\r${bs}" "\r"] $line] ;#backsp following a \r will have no effect + set line [string trim $line $bs] + set n 0 + + set chars [split $line ""] + #build an output + set idx 0 + set outchars [list] + set outsizes [list] + foreach c $chars { + if {$c eq $bs} { + if {$idx > 0} { + incr idx -1 + } + } elseif {$c eq "\r"} { + set idx 0 + } else { + punk::ansi::internal::printing_length_addchar $idx $c + incr idx + } + } + set line2 [join $outchars ""] + return [punk::char::string_width $line2] + } + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace punk::ansi ---}] +} + + +namespace eval punk::ansi { + + + # -- --- --- --- --- --- + #XTGETTCAP + # xterm responds with + # DCS 1 + r Pt ST for valid requests, adding to Pt an = , and + # the value of the corresponding string that xterm would send, + # or + # DCS 0 + r ST for invalid requests. + # The strings are encoded in hexadecimal (2 digits per + # character). If more than one name is given, xterm replies + # with each name/value pair in the same response. An invalid + # name (one not found in xterm's tables) ends processing of the + # list of names. + proc xtgetcap {keylist} { + #ESC P = 0x90 = DCS = Device Control String + set hexkeys [list] + foreach k $keylist { + lappend hexkeys [util::str2hex $k] + } + set payload [join $hexkeys ";"] + return "\x1bP+q$payload\x1b\\" + } + proc xtgetcap2 {keylist} { + #ESC P = 0x90 = DCS = Device Control String + set hexkeys [list] + foreach k $keylist { + lappend hexkeys [util::str2hex $k] + } + set payload [join $hexkeys ";"] + return "\u0090+q$payload\u009c" + } + namespace eval codetype { + proc is_sgr {code} { + #SGR (Select Graphic Rendition) - codes ending in 'm' - e.g colour/underline + #we will accept and pass through the less common colon separator (ITU Open Document Architecture) + #Terminals should generally ignore it if they don't use it + regexp {\033\[[0-9;:]*m$} $code + } + proc is_cursor_move_in_line {code} { + #review - what about CSI n : m H where row n happens to be current line? + regexp {\033\[[0-9]*(:?C|D|G)$} + } + #pure SGR reset + proc is_sgr_reset {code} { + #todo 8-bit csi + regexp {\033\[0*m$} $code + } + #whether this code has 0 (or equivalently empty) parameter (but may set others) + #if an SGR code as a reset in it - we don't need to carry forward any previous SGR codes + #it generally only makes sense for the reset to be the first entry - otherwise the code has ineffective portions + #However - detecting zero or empty parameter in other positions requires knowing all other codes that may allow zero or empty params. + #We will only look at initial parameter as this is the well-formed normal case. 
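+        #e.g (illustrative) expected results:
+        #  has_sgr_leadingreset "\x1b\[0;31m"  ;# 1 - reset leads, red is set afterwards
+        #  has_sgr_leadingreset "\x1b\[m"      ;# 1 - empty parameter list acts as a reset
+        #  has_sgr_leadingreset "\x1b\[31;0m"  ;# 0 - reset present but not in leading position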
+ #Review - consider normalizing sgr codes to remove other redundancies such as setting fg or bg color twice in same code + proc has_sgr_leadingreset {code} { + set params "" + regexp {\033\[(.*)m} $code _match params + set plist [split $params ";"] + if {[string trim [lindex $plist 0] 0] eq ""} { + #e.g \033\[m \033\[0\;...m \033\[0000...m + return 1 + } else { + return 0 + } + } + } + namespace eval sequence_type { + proc is_Fe {code} { + # C1 control codes + if {[regexp {^\033\[[\u0040-\u005F]}]} { + #7bit - typical case + return 1 + } + #8bit + #review - all C1 escapes ? 0x80-0x90F + #This is possibly problematic as it is affected by encoding. + #According to https://en.wikipedia.org/wiki/ANSI_escape_code#8-bit + #"However, in character encodings used on modern devices such as UTF-8 or CP-1252, those codes are often used for other purposes, so only the 2-byte sequence is typically used." + return 0 + } + proc is_Fs {code} { + puts stderr "is_Fs unimplemented" + } + } + # -- --- --- --- --- --- --- --- --- --- --- + #todo - implement colour resets like the perl module: + #https://metacpan.org/pod/Text::ANSI::Util + #(saves up all ansi color codes since previous color reset and replays the saved codes after our highlighting is done) +} + + +namespace eval punk::ansi::ta { + #*** !doctools + #[subsection {Namespace punk::ansi::ta}] + #[para] text ansi functions + #[para] based on but not identical to the Perl Text Ansi module: + #[para] https://github.com/perlancar/perl-Text-ANSI-Util/blob/master/lib/Text/ANSI/BaseUtil.pm + #[list_begin definitions] + namespace path ::punk::ansi + + #handle both 7-bit and 8-bit csi + #review - does codepage affect this? e.g ebcdic has 8bit csi in different position + + #CSI + #variable re_csi_open {(?:\033\[|\u009b)[0-9;]+} ;#too specific - doesn't detect \033\[m + variable re_csi_open {(?:\033\[|\u009b)} + + #colour and style + variable re_csi_colour {(?:\033\[|\u009b)[0-9;]*m} ;#e.g \033\[31m \033\[m \033\[0m \033\[m0000m + #single "final byte" in the range 0x40–0x7E (ASCII @A–Z[\]^_`a–z{|}~). + variable re_csi_code {(?:\033\[|\u009b)[0-9;]*[a-zA-Z\\@^_|~`]} + + #OSC - termnate with BEL (\a \007) or ST (string terminator \033\\) + # 8-byte string terminator is \x9c (\u009c) + + #test - non-greedy + variable re_esc_osc1 {(?:\033\]).*?\007} + variable re_esc_osc2 {(?:\033\]).*?\033\\} + variable re_esc_osc3 {(?:\u009d).*?\u009c} + + variable re_osc_open {(?:\033\]|\u009d).*} + + variable re_ansi_detect "${re_csi_open}|${re_esc_osc1}|${re_esc_osc2}" + + #detect any ansi escapes + #review - only detect 'complete' codes - or just use the opening escapes for performance? 
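+    #e.g (illustrative):
+    #  punk::ansi::ta::detect "plain text"           ;# 0
+    #  punk::ansi::ta::detect "\x1b\[31mred\x1b\[m"  ;# 1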
+ proc detect {text} { + #*** !doctools + #[call [fun detect] [arg text]] + #[para]Return a boolean indicating whether Ansi codes were detected in text + #[para] + + variable re_ansi_detect + #variable re_csi_open + #variable re_esc_osc1 + #variable re_esc_osc2 + #todo - other escape sequences + #expr {[regexp $re_csi_open $text] || [regexp $re_esc_osc1 $text] || [regexp $re_esc_osc2 $text]} + expr {[regexp $re_ansi_detect $text]} + } + #not in perl ta + proc detect_csi {text} { + #*** !doctools + #[call [fun detect_csi] [arg text]] + #[para]Return a boolean indicating whether an Ansi Control Sequence Introducer (CSI) was detected in text + #[para]The csi is often represented in code as \x1b or \033 followed by a left bracket [lb] + #[para]The initial byte or escape is commonly referenced as ESC in Ansi documentation + #[para]There is also a multi-byte escape sequence \u009b + #[para]This is less commonly used but is also detected here + #[para](This function is not in perl ta) + variable re_csi_open + expr {[regexp $re_csi_open $text]} + } + proc detect_sgr {text} { + #*** !doctools + #[call [fun detect_sgr] [arg text]] + #[para]Return a boolean indicating whether an ansi Select Graphics Rendition code was detected. + #[para]This is the set of CSI sequences ending in 'm' + #[para]This is most commonly an Ansi colour code - but also things such as underline and italics + #[para]An SGR with empty or a single zero argument is a reset of the SGR features - this is also detected. + #[para](This function is not in perl ta) + variable re_csi_colour + expr {[regexp $re_csi_colour $text]} + } + proc strip {text} { + #*** !doctools + #[call [fun strip] [arg text]] + #[para]Return text stripped of Ansi codes + #[para]This is a tailcall to punk::ansi::stripansi + tailcall stripansi $text + } + proc length {text} { + #*** !doctools + #[call [fun length] [arg text]] + #[para]Return the character length after stripping ansi codes - not the printing length + string length [stripansi $text] + } + #todo - handle newlines + #not in perl ta + #proc printing_length {text} { + # + #} + + proc trunc {text width args} { + + } + + #not in perl ta + #returns just the plaintext portions in a list + proc split_at_codes {text} { + variable re_esc_osc1 + variable re_esc_osc2 + variable re_csi_code + punk::ansi::internal::splitx $text "${re_csi_code}|${re_esc_osc1}|${re_esc_osc2}" + } + + # -- --- --- --- --- --- + #Split $text to a list containing alternating ANSI color codes and text. + #ANSI color codes are always on the second element, fourth, and so on. 
+ #(ie plaintext on odd list-indices ansi on even indices) + # Example: + #ta_split_codes "" # => "" + #ta_split_codes "a" # => "a" + #ta_split_codes "a\e[31m" # => {"a" "\e[31m"} + #ta_split_codes "\e[31ma" # => {"" "\e[31m" "a"} + #ta_split_codes "\e[31ma\e[0m" # => {"" "\e[31m" "a" "\e[0m"} + #ta_split_codes "\e[31ma\e[0mb" # => {"" "\e[31m" "a" "\e[0m", "b"} + #ta_split_codes "\e[31m\e[0mb" # => {"" "\e[31m\e[0m" "b"} + # + proc split_codes {text} { + variable re_esc_osc1 + variable re_esc_osc2 + variable re_csi_code + set re "(?:${re_csi_code}|${re_esc_osc1}|${re_esc_osc2})+" + return [_perlish_split $re $text] + } + #like split_codes - but each ansi-escape is split out separately (with empty string of plaintext between codes so odd/even plain ansi still holds) + proc split_codes_single {text} { + variable re_esc_osc1 + variable re_esc_osc2 + variable re_csi_code + set re "${re_csi_code}|${re_esc_osc1}|${re_esc_osc2}" + return [_perlish_split $re $text] + } + + #review - tcl greedy expressions may match multiple in one element + proc _perlish_split {re text} { + if {[string length $text] == 0} { + return {} + } + set list [list] + set start 0 + while {[regexp -start $start -indices -- $re $text match]} { + lassign $match matchStart matchEnd + lappend list [string range $text $start $matchStart-1] [string range $text $matchStart $matchEnd] + set start [expr {$matchEnd+1}] + } + lappend list [string range $text $start end] + return $list + } + proc _ws_split {text} { + regexp -all -inline {(?:\S+)|(?:\s+)} $text + } + # -- --- --- --- --- --- + + #*** !doctools + #[list_end] [comment {--- end definitions namespace punk::ansi::ta ---}] +} +# -- --- --- --- --- --- --- --- --- --- --- + +namespace eval punk::ansi::ansistring { + #*** !doctools + #[subsection {Namespace punk::ansi::ansistring}] + #[para]punk::ansi::string ensemble + #[list_begin definitions] + namespace path [list ::punk::ansi ::punk::ansi::ta] + namespace ensemble create + namespace export length + + proc length {string} { + string length [ansistrip $string] + } + proc trimleft {string args} { + + } + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace punk::ansi::ta ---}] +} + +namespace eval punk::ansi::internal { + proc splitn {str {len 1}} { + #from textutil::split::splitn + if {$len <= 0} { + return -code error "len must be > 0" + } + if {$len == 1} { + return [split $str {}] + } + set result [list] + set max [string length $str] + set i 0 + set j [expr {$len -1}] + while {$i < $max} { + lappend result [string range $str $i $j] + incr i $len + incr j $len + } + return $result + } + proc splitx {str {regexp {[\t \r\n]+}}} { + #from textutil::split::splitx + # Bugfix 476988 + if {[string length $str] == 0} { + return {} + } + if {[string length $regexp] == 0} { + return [::split $str ""] + } + if {[regexp $regexp {}]} { + return -code error \ + "splitting on regexp \"$regexp\" would cause infinite loop" + } + set list {} + set start 0 + while {[regexp -start $start -indices -- $regexp $str match submatch]} { + foreach {subStart subEnd} $submatch break + foreach {matchStart matchEnd} $match break + incr matchStart -1 + incr matchEnd + lappend list [string range $str $start $matchStart] + if {$subStart >= $start} { + lappend list [string range $str $subStart $subEnd] + } + set start $matchEnd + } + lappend list [string range $str $start end] + return $list + } + + proc printing_length_addchar {i c} { + upvar outchars outc + upvar outsizes outs + set nxt [llength $outc] + if {$i < $nxt} { + lset outc 
$i $c + } else { + lappend outc $c + } + } + + #string to 2digit hex - e.g used by XTGETTCAP + proc str2hex {input} { + set 2hex "" + foreach ch [split $input ""] { + append 2hex [format %02X [scan $ch %c]] + } + return $2hex + } + proc hex2str {2digithexchars} { + set 2digithexchars [string map [list _ ""] $2digithexchars] ;#compatibility with tcl tip 551 (compatibility in the sense that users might expect to be able to use underscores and it's nice to support the syntax here too - not that it's required) + if {$2digithexchars eq ""} { + return "" + } + if {[string length $2digithexchars] % 2 != 0} { + error "hex2str requires an even number of hex digits (2 per character)" + } + set 2str "" + foreach pair [splitn $2digithexchars 2] { + append 2str [format %c 0x$pair] + } + return $2str + } +} + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Ready +package provide punk::ansi [namespace eval punk::ansi { + variable version + set version 0.1.0 +}] +return + + +#*** !doctools +#[manpage_end] + diff --git a/src/bootsupport/modules/punk/args-0.1.0.tm b/src/bootsupport/modules/punk/args-0.1.0.tm new file mode 100644 index 0000000..f7eb125 --- /dev/null +++ b/src/bootsupport/modules/punk/args-0.1.0.tm @@ -0,0 +1,625 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use punkshell 'pmix make' or bin/punkmake to update from -buildversion.txt +# module template: shellspy/src/decktemplates/vendor/punk/modules/template_module-0.0.2.tm +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) 2024 +# +# @@ Meta Begin +# Application punk::args 0.1.0 +# Meta platform tcl +# Meta license +# @@ Meta End + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# doctools header +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[manpage_begin punkshell_module_punk::args 0 0.1.0] +#[copyright "2024"] +#[titledesc {args parsing}] [comment {-- Name section and table of contents description --}] +#[moddesc {args to option-value dict and values dict}] [comment {-- Description at end of page heading --}] +#[require punk::args] +#[keywords module proc args arguments parse] +#[description] +#[para]Utilities for parsing proc args + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section Overview] +#[para] overview of punk::args +#[subsection Concepts] +#[para]There are 2 main conventions for parsing a proc args list +#[list_begin enumerated] +#[enum] +#[para]leading option-value pairs followed by a list of values (Tk style) +#[enum] +#[para]leading list of values followed by option-value pairs (Tcl style) +#[list_end] +#[para]punk::args is focused on the 1st convention (Tk style): parsing of args in leading option-value pair style - even for non-Tk usage. +#[para]The proc can still contain some leading required values e.g [example "proc dostuff {arg1 arg2 args} {...}}"] +#[para]but having the core values elements at the end of args is more generally useful - especially in cases where the number of trailing values is unknown and/or the proc is to be called in a functional 'pipeline' style. 
+#[para]The basic principle is that a call to punk::args::opts_vals is made near the beginning of the proc e.g +#[example_begin] +# proc dofilestuff {args} { +# lassign [lb]dict values [lb]punk::args { +# -directory -default "" +# -translation -default binary +# } $args[rb][rb] opts values +# +# puts "translation is [lb]dict get $opts -translation[rb]" +# foreach f [lb]dict values $values[rb] { +# puts "doing stuff with file: $f" +# } +# } +#[example_end] + +#*** !doctools +#[subsection Notes] +#[para]There are alternative args parsing packages such as: +#[list_begin enumerated] +#[enum]argp +#[enum]The tcllib set of TEPAM modules +#[para]TEPAM requires an alternative procedure declaration syntax instead of proc - but has support for Tk and documentation generation. +#[list_end] +#[para]punk::args was designed initially without specific reference to TEPAM - and to handle some edge cases in specific projects where TEPAM wasn't suitable. +#[para]In subsequent revisions of punk::args - some features were made to operate in a way that is similar to TEPAM - to avoid gratuitous differences where possible, but of course there are differences +#[para]and those used TEPAM or mixing TEPAM and punk::args should take care to assess the differences. +#[para]TEPAM is a mature solution and is widely available as it is included in tcllib. +#[para]Serious consideration should be given to using TEPAM if suitable for your project. + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[subsection dependencies] +#[para] packages used by punk::args +#[list_begin itemized] + +package require Tcl 8.6 +#*** !doctools +#[item] [package {Tcl 8.6}] + +# #package require frobz +# #*** !doctools +# #[item] [package {frobz}] + +#*** !doctools +#[list_end] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section API] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# oo::class namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval punk::args::class { + #*** !doctools + #[subsection {Namespace punk::args::class}] + #[para] class definitions + if {[info commands [namespace current]::interface_sample1] eq ""} { + #*** !doctools + #[list_begin enumerated] + + # oo::class create interface_sample1 { + # #*** !doctools + # #[enum] CLASS [class interface_sample1] + # #[list_begin definitions] + + # method test {arg1} { + # #*** !doctools + # #[call class::interface_sample1 [method test] [arg arg1]] + # #[para] test method + # puts "test: $arg1" + # } + + # #*** !doctools + # #[list_end] [comment {-- end definitions interface_sample1}] + # } + + #*** !doctools + #[list_end] [comment {--- end class enumeration ---}] + } +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# Base namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval punk::args { + namespace export * + #variable xyz + + #*** !doctools + #[subsection {Namespace punk::args}] + #[para] Core API functions for punk::args + #[list_begin definitions] + + + #generally we expect values to contain leading dashes only if -- specified. Otherwise no reliable way determine difference between bad flags and values + #If no eopts (--) specified we stop looking for opts at the first nondash encountered in a position we'd expect a dash - so without eopt, values could contain dashes - but not in first position after flags. 
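+    #e.g (illustrative) - a trailing value beginning with a dash can be passed by ending option parsing with '--':
+    #  proc demo {args} {
+    #      lassign [dict values [punk::args::opts_values {
+    #          -translation -default binary
+    #      } $args]] opts values
+    #      return [list $opts $values]
+    #  }
+    #  demo -translation lf -- -dashedvalue othervalue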
+ #only supports -flag val pairs, not solo options + #If an option is supplied multiple times - only the last value is used. + proc opts_values {optionspecs rawargs args} { + #*** !doctools + #[call [fun opts_values] [arg optionspecs] [arg rawargs] [opt {option value...}]] + #[para]Parse rawargs as a sequence of zero or more option-value pairs followed by zero or more values + #[para]Returns a dict of the form: opts values + #[para]ARGUMENTS: + #[list_begin arguments] + #[arg_def multiline-string optionspecs] + #[para] This a block of text with records delimited by newlines (lf or crlf) - but with multiline values allowed if properly quoted/braced + #[para]'info complete' is used to determine if a record spans multiple lines due to multiline values + #[para]Each optionspec line must be of the form: + #[para]-optionname -key val -key2 val2... + #[para]where the valid keys for each option specification are: -default -type -range -choices -optional + #[arg_def list rawargs] + #[para] This is a list of the arguments to parse. Usually it will be the \$args value from the containing proc + #[list_end] + #[para] + + #consider line-processing example below for we need info complete to determine record boundaries + #punk::args::opt_values { + # -opt1 -default {} + # -opt2 -default { + # etc + # } -multiple 1 + #} $args + + set optionspecs [string map [list \r\n \n] $optionspecs] + set known_argspecs [list -default -type -range -choices -nocase -optional -multiple -validate_without_ansi -allow_ansi -strip_ansi] + set optspec_defaults [dict create\ + -optional 1\ + -allow_ansi 1\ + -validate_without_ansi 0\ + -strip_ansi 0\ + -nocase 0\ + ] + set required_opts [list] + set required_vals [list] + set arg_info [dict create] + set defaults_dict_opts [dict create] + set defaults_dict_values [dict create] + #first process dashed and non-dashed argspecs without regard to whether non-dashed are at the beginning or end + set value_names [list] + + set records [list] + set linebuild "" + foreach rawline [split $optionspecs \n] { + set recordsofar [string cat $linebuild $rawline] + if {![info complete $recordsofar]} { + append linebuild [string trimleft $rawline] \n + } else { + lappend records [string cat $linebuild $rawline] + set linebuild "" + } + } + + foreach ln $records { + set trimln [string trim $ln] + if {$trimln eq ""} { + continue + } + if {[string index $trimln 0] eq "#"} { + continue + } + set argname [lindex $trimln 0] + set argspecs [lrange $trimln 1 end] + if {[string match -* $argname]} { + dict set argspecs -ARGTYPE option + set is_opt 1 + } else { + dict set argspecs -ARGTYPE value + lappend value_names $argname + set is_opt 0 + } + if {[llength $argspecs] %2 != 0} { + error "punk::args::opts_values - bad optionspecs line for argument '$argname' Remaining items on line must be in paired option-value format - received '$argspecs'" + } + dict for {spec specval} $argspecs { + if {$spec ni [concat $known_argspecs -ARGTYPE]} { + error "punk::args::opts_values - unrecognised key '$spec' in specifications for argument '$argname' Known option specification keys: $known_argspecs" + } + } + set argspecs [dict merge $optspec_defaults $argspecs] + dict set arg_info $argname $argspecs + if {![dict get $argspecs -optional]} { + if {$is_opt} { + lappend required_opts $argname + } else { + lappend required_vals $argname + } + } + if {[dict exists $arg_info $argname -default]} { + if {$is_opt} { + dict set defaults_dict_opts $argname [dict get $arg_info $argname -default] + } else { + dict set 
defaults_dict_values $argname [dict get $arg_info $argname -default] + } + } + } + + #puts "--> [info frame -2] <--" + set cmdinfo [dict get [info frame -2] cmd] + #we can't treat cmdinfo as a list - it may be something like {command {*}$args} in which case lindex $cmdinfo 0 won't work + #hopefully first word is a plain proc name if this function was called in the normal manner - directly from a proc + #we will break at first space and assume the lhs of that will give enough info to be reasonable - (alternatively we could use entire cmdinfo - but it might be big and ugly) + set caller [regexp -inline {\S+} $cmdinfo] + + #if called from commandline or some other contexts such as outside of a proc in a namespace - caller may just be "namespace" + if {$caller eq "namespace"} { + set caller "punk::args::opts_values called from namespace" + } + + # ------------------------------ + if {$caller ne "punk::args::opts_values"} { + #check our own args + lassign [punk::args::opts_values "-anyopts -default 0\n -minvalues -default 0\n -maxvalues -default -1" $args] _o ownopts _v ownvalues + if {[llength $ownvalues] > 0} { + error "punk::args::opts_values expected: a multiline text block of option-specifications, a list of args and at most three option pairs -minvalues , -maxvalues , -anyopts true|false - got extra arguments: '$ownvalues'" + } + set opt_minvalues [dict get $ownopts -minvalues] + set opt_maxvalues [dict get $ownopts -maxvalues] + set opt_anyopts [dict get $ownopts -anyopts] + } else { + #don't check our own args if we called ourself + set opt_minvalues 0 + set opt_maxvalues 0 + set opt_anyopts 0 + } + # ------------------------------ + + if {[set eopts [lsearch $rawargs "--"]] >= 0} { + set values [lrange $rawargs $eopts+1 end] + set arglist [lrange $rawargs 0 $eopts-1] + } else { + if {[lsearch $rawargs -*] >= 0} { + #to support option values with leading dash e.g -offset -1 , we can't just take the last flagindex + set i 0 + foreach {k v} $rawargs { + if {![string match -* $k]} { + break + } + if {$i+1 >= [llength $rawargs]} { + #no value for last flag + error "bad options for $caller. No value supplied for last option $k" + } + incr i 2 + } + set arglist [lrange $rawargs 0 $i-1] + set values [lrange $rawargs $i end] + } else { + set values $rawargs ;#no -flags detected + set arglist [list] + } + } + #confirm any valnames before last don't have -multiple key + foreach valname [lrange $value_names 0 end-1] { + if {[dict exists $arg_info $valname -multiple ]} { + error "bad key -multiple on argument spec for '$valname'. Only the last value argument specification can be marked -multiple" + } + } + set values_dict [dict create] + set validx 0 + set in_multiple "" + foreach valname $value_names val $values { + if {$validx+1 > [llength $values]} { + break + } + if {$valname ne ""} { + if {[dict exists $arg_info $valname -multiple] && [dict get $arg_info $valname -multiple]} { + dict lappend values_dict $valname $val + set in_multiple $valname + } else { + dict set values_dict $valname $val + } + } else { + if {$in_multiple ne ""} { + dict lappend values_dict $in_multiple $val + } else { + dict set values_dict $validx $val + } + } + incr validx + } + + if {$opt_maxvalues == -1} { + #only check min + if {[llength $values] < $opt_minvalues} { + error "bad number of trailing values for $caller. Got [llength $values] values. 
Expected at least $opt_minvalues" + } + } else { + if {[llength $values] < $opt_minvalues || [llength $values] > $opt_maxvalues} { + if {$opt_minvalues == $opt_maxvalues} { + error "bad number of trailing values for $caller. Got [llength $values] values. Expected exactly $opt_minvalues" + } else { + error "bad number of trailing values for $caller. Got [llength $values] values. Expected between $opt_minvalues and $opt_maxvalues inclusive" + } + } + } + #opts explicitly marked as -optional 0 must be present - regardless of -anyopts (which allows us to ignore additional opts to pass on to next call) + #however - if -anyopts is true, there is a risk we will treat a shortened option name as matching our default - when it was intended for the next call + #We will always require exact matches for all required opts to avoid this risk, even though an ultimately-called function may not require the full-length option-name REVIEW + #The aim is to allow a wrapper function to specify a default value for an option (and/or other changes/restrictions to a particular option or two) - without having to re-specify all the options for the underlying function. + #without full respecification in the wrapper - we can't know that a supplied prefix is unambiguous at the next level + #For this reason we need to pass on full-length opts for any defined options in the wrapper even if anyopts is true + foreach r $required_opts { + if {$r ni [dict keys $arglist]} { + error "Required option missing for $caller. '$r' is marked with -optional false - so must be present in its full-length form" + } + } + foreach r $required_vals { + if {$r ni [dict keys $values_dict]} { + error "Required value missing for $caller. '$r' is marked with -optional false - so must be present" + } + } + if {!$opt_anyopts} { + set checked_args [dict create] + for {set i 0} {$i < [llength $arglist]} {incr i} { + #allow this to error out with message indicating expected flags + set val [lindex $arglist $i+1] + set fullopt [tcl::prefix match -message "options for $caller. 
Unexpected option" [dict keys $arg_info] [lindex $arglist $i]] + if {[dict exists $arg_info $fullopt -multiple] && [dict get $arg_info $fullopt -multiple]} { + dict lappend checked_args $fullopt $val + } else { + dict set checked_args $fullopt $val + } + incr i ;#skip val + } + } else { + #still need to use tcl::prefix match to normalize - but don't raise an error + set checked_args [dict create] + dict for {k v} $arglist { + if {![catch {tcl::prefix::match [dict keys $arg_info] $k} fullopt]} { + if {[dict exists $arg_info $fullopt -multiple] && [dict get $arg_info $fullopt -multiple]} { + dict lappend checked_args $fullopt $v + } else { + dict set checked_args $fullopt $v + } + } else { + #opt was unspecified + dict set checked_args $k $v + } + } + } + set opts [dict merge $defaults_dict_opts $checked_args] + #assert - checked_args keys are full-length option names if -anyopts was false or if the supplied option as a shortname matched one of our defined options + + set values [dict merge $defaults_dict_values $values_dict] + + #todo - allow defaults outside of choices/ranges + + #check types,ranges,choices + set opts_and_values [concat $opts $values] + set combined_defaults [concat $defaults_dict_values $defaults_dict_opts] ;#can be no key collisions - we don't allow a value key beginning with dash - opt names must begin with dash + dict for {o v} $opts_and_values { + if {[dict exists $arg_info $o -multiple] && [dict get $arg_info $o -multiple]} { + set vlist $v + } else { + set vlist [list $v] + } + + if {[dict exists $arg_info $o -validate_without_ansi] && [dict get $arg_info $o -validate_without_ansi]} { + set validate_without_ansi 1 + package require punk::ansi + } else { + set validate_without_ansi 0 + } + if {[dict exists $arg_info $o -allow_ansi] && [dict get $arg_info $o -allow_ansi]} { + set allow_ansi 1 + } else { + package require punk::ansi + set allow_ansi 0 + } + + foreach e $vlist { + if {!$allow_ansi} { + if {[punk::ansi::ta::detect $e]} { + error "Option $o for $caller contains ansi - but -allow_ansi is false. Received: '$e'" + } + } + } + + set vlist_check [list] + foreach e $vlist { + if {$validate_without_ansi} { + lappend vlist_check [punk::ansi::stripansi $e] + } else { + lappend vlist_check $e + } + } + + set is_default 0 + foreach e $vlist e_check $vlist_check { + if {[dict exists $combined_defaults $o] && ($e_check eq [dict get $combined_defaults $o])} { + incr is_default + } + } + if {$is_default eq [llength $vlist]} { + set is_default true + } + #we want defaults to pass through - even if they don't pass the checks that would be required for a specified value + #If the caller manually specified a value that happens to match the default - we don't detect that as any different from an unspecified value - Review. + if {!$is_default} { + if {[dict exists $arg_info $o -type]} { + set type [dict get $arg_info $o -type] + if {[string tolower $type] in {int integer double}} { + if {[string tolower $type] in {int integer}} { + foreach e $vlist e_check $vlist_check { + if {![string is integer -strict $e_check]} { + error "Option $o for $caller requires type 'integer'. Received: '$e'" + } + } + } elseif {[string tolower $type] in {double}} { + foreach e $vlist e_check $vlist_check { + if {![string is double -strict $e_check]} { + error "Option $o for $caller requires type 'double'. Received: '$e'" + } + } + } + + #todo - small-value double comparisons with error-margin? 
review + if {[dict exists $arg_info $o -range]} { + lassign [dict get $arg_info $o -range] low high + foreach e $vlist e_check $vlist_check { + if {$e_check < $low || $e_check > $high} { + error "Option $o for $caller must be between $low and $high. Received: '$e'" + } + } + } + } elseif {[string tolower $type] in {bool boolean}} { + foreach e $vlist e_check $vlist_check { + if {![string is boolean -strict $e_check]} { + error "Option $o for $caller requires type 'boolean'. Received: '$e'" + } + } + } elseif {[string tolower $type] in {alnum alpha ascii control digit graph lower print punct space upper wordchar xdigit}} { + foreach e $vlist e_check $vlist_check { + if {![string is [string tolower $type] $e_check]} { + error "Option $o for $caller requires type '[string tolower $type]'. Received: '$e'" + } + } + } elseif {[string tolower $type] in {file directory existingfile existingdirectory}} { + foreach e $vlist e_check $vlist_check { + if {!([string length $e_check]>0 && ![regexp {[\"*?<>\;]} $e_check])} { + error "Option $o for $caller requires type '[string tolower $type]'. Received: '$e' which doesn't look like it could be a file or directory" + } + } + if {[string tolower $type] in {existingfile}} { + foreach e $vlist e_check $vlist_check { + if {![file exists $e_check]} { + error "Option $o for $caller requires type '[string tolower $type]'. Received: '$e' which is not an existing file" + } + } + } elseif {[string tolower $type] in {existingdirectory}} { + foreach e $vlist e_check $vlist_check { + if {![file isdirectory $e_check]} { + error "Option $o for $caller requires type '[string tolower $type]'. Received: '$e' which is not an existing directory" + } + } + } + } elseif {[string tolower $type] in {char character}} { + foreach e $vlist e_check $vlist_check { + if {[string length != 1]} { + error "Option $o for $caller requires type '[string tolower $type]'. Received: '$e' which is not a single character" + } + } + } + } + if {[dict exists $arg_info $o -choices]} { + set choices [dict get $arg_info $o -choices] + set nocase [dict get $arg_info $o -nocase] + foreach e $vlist e_check $vlist_check { + if {$nocase} { + set casemsg "(case insensitive)" + set choices_test [string tolower $choices] + set v_test [string tolower $e_check] + } else { + set casemsg "(case sensitive)" + set v_test $e_check + set choices_test $choices + } + if {$v_test ni $choices_test} { + error "Option $o for $caller must be one of the listed values $choices $casemsg. Received: '$e'" + } + } + } + } + if {[dict exists $arg_info $o -strip_ansi] && [dict get $arg_info $o -strip_ansi]} { + set stripped_list [list] + foreach e $vlist { + lappend stripped_list [punk::ansi::stripansi $e] + } + if {[dict exists $arg_info $o -multiple] && [dict get $arg_info $o -multiple]} { + if {[dict get $arg_info $o -ARGTYPE] eq "option"} { + dict set opts $o $stripped_list + } else { + dict set values $o $stripped_list + } + } else { + if {[dict get $arg_info $o -ARGTYPE] eq "option"} { + dict set opts $o [lindex $stripped_list 0] + } else { + dict set values [lindex $stripped_list 0] + } + } + } + } + + #maintain order of opts $opts values $values as caller may use lassign. 
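+        #e.g (illustrative) the caller can unpack the result with:
+        #  lassign [dict values [punk::args::opts_values $optionspecs $args]] opts values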
+ return [dict create opts $opts values $values] + } + + #proc sample1 {p1 args} { + # #*** !doctools + # #[call [fun sample1] [arg p1] [opt {?option value...?}]] + # #[para]Description of sample1 + # return "ok" + #} + + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace punk::args ---}] +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# Secondary API namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval punk::args::lib { + namespace export * + namespace path [namespace parent] + #*** !doctools + #[subsection {Namespace punk::args::lib}] + #[para] Secondary functions that are part of the API + #[list_begin definitions] + + #proc utility1 {p1 args} { + # #*** !doctools + # #[call lib::[fun utility1] [arg p1] [opt {option value...}]] + # #[para]Description of utility1 + # return 1 + #} + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace punk::args::lib ---}] +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[section Internal] +namespace eval punk::args::system { + #*** !doctools + #[subsection {Namespace punk::args::system}] + #[para] Internal functions that are not part of the API + + + +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Ready +package provide punk::args [namespace eval punk::args { + variable pkg punk::args + variable version + set version 0.1.0 +}] +return + +#*** !doctools +#[manpage_end] + diff --git a/src/bootsupport/modules/punk/char-0.1.0.tm b/src/bootsupport/modules/punk/char-0.1.0.tm new file mode 100644 index 0000000..1980480 --- /dev/null +++ b/src/bootsupport/modules/punk/char-0.1.0.tm @@ -0,0 +1,1921 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. 
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) 2023 +# +# @@ Meta Begin +# Application punk::char 0.1.0 +# Meta platform tcl +# Meta license +# @@ Meta End + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# doctools header +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[manpage_begin punkshell_module_punk::char 0 0.1.0] +#[copyright "2024"] +#[titledesc {character-set and unicode utilities}] [comment {-- Name section and table of contents description --}] +#[moddesc {character-set nad unicode}] [comment {-- Description at end of page heading --}] +#[require punk::char] +#[keywords module encodings] +#[description] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section Overview] +#[para] overview of punk::char +#[subsection Concepts] +#[para] - + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +##e.g package require frobz + +#*** !doctools +#[subsection dependencies] +#[para] packages used by punk::char +#[list_begin itemized] +#[item] [package {Tcl 8.6}] + +# + +#*** !doctools +#[item] [package {overtype}] +#[para] - +#[item] [package {textblock}] +#[para] - +#[item] [package console] +#[para] - + +package require Tcl 8.6 + +#*** !doctools +#[list_end] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section API] + +#Note that ansi escapes can begin with \033\[ (\u001b\[) or the single character "Control Sequence Introducer" 0x9b + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval punk::char { + namespace export * + + # -- -------------------------------------------------------------------------- + variable encmimens ;#namespace of mime package providing reversemapencoding and mapencoding functions + #tcllib mime requires tcl::chan::memchan,events,core and/or Trf + if {![catch {package require punk::encmime} errM]} { + set encmimens ::punk::encmime + } else { + package require mime + set encmimens ::mime + } + # -- -------------------------------------------------------------------------- + + variable invalid "???" ;# ideally this would be 0xFFFD - which should display as black diamond/rhombus with question mark. As at 2023 - this tends to display indistinguishably from other missing glyph boxes - so we use a longer sequence we can detect by length and to display obviously + variable invalid_display_char \u25ab; #todo - change to 0xFFFD once diamond glyph more common? + + #just the 7-bit ascii. use [page ascii] for the 8-bit + proc ascii {} {return { + 00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL + 08 BS 09 HT 0a LF 0b VT 0c FF 0d CR 0e SO 0f SI + 10 DLE 11 DC1 12 DC2 13 DC3 14 DC4 15 NAK 16 SYN 17 ETB + 18 CAN 19 EM 1a SUB 1b ESC 1c FS 1d GS 1e RS 1f US + 20 SP 21 ! 22 " 23 # 24 $ 25 % 26 & 27 ' + 28 ( 29 ) 2a * 2b + 2c , 2d - 2e . 2f / + 30 0 31 1 32 2 33 3 34 4 35 5 36 6 37 7 + 38 8 39 9 3a : 3b ; 3c < 3d = 3e > 3f ? 
+ 40 @ 41 A 42 B 43 C 44 D 45 E 46 F 47 G + 48 H 49 I 4a J 4b K 4c L 4d M 4e N 4f O + 50 P 51 Q 52 R 53 S 54 T 55 U 56 V 57 W + 58 X 59 Y 5a Z 5b [ 5c \ 5d ] 5e ^ 5f _ + 60 ` 61 a 62 b 63 c 64 d 65 e 66 f 67 g + 68 h 69 i 6a j 6b k 6c l 6d m 6e n 6f o + 70 p 71 q 72 r 73 s 74 t 75 u 76 v 77 w + 78 x 79 y 7a z 7b { 7c | 7d } 7e ~ 7f DEL + }} + + #G0 character set + proc ascii2 {} { + set dict [asciidict2] + set out "" + set i 1 + append out " " + dict for {k v} $dict { + #single chars are wrapped with \033(0 and \033(B ie total length 7 + if {[string length $v] == 7} { + set v " $v " + } elseif {[string length $v] == 2} { + set v "$v " + } elseif {[string length $v] == 0} { + set v " " + } + append out "$k $v " + if {$i > 0 && $i % 8 == 0} { + set out [string range $out 0 end-2] + append out \n " " + } + incr i + } + set out [string trimright $out " "] + return $out + } + + + proc symbol {} { + tailcall page symbol + } + proc dingbats {} { + set out "" + append out [page dingbats] \n + set unicode_dict [charset_dictget Dingbats] + + append out " " + set i 1 + dict for {k charinfo} $unicode_dict { + set char [dict get $charinfo char] + if {[string length $char] == 0} { + set displayv " " + } elseif {[string length $char] == 1} { + set displayv " $char " + } else { + set displayv $char + } + append out "$k $displayv " + if {$i > 0 && $i % 8 == 0} { + set out [string range $out 0 end-2] + append out \n " " + } + incr i + } + return $out + } + proc page_names {{search *}} { + set all_names [list] + set d [page_names_dict $search] + dict for {k v} $d { + if {$k ni $all_names} { + lappend all_names $k + } + foreach m $v { + if {$m ni $all_names} { + lappend all_names $m + } + } + } + return [lsort $all_names] + } + proc page_names_help {{namesearch *}} { + set d [page_names_dict $namesearch] + + set out "" + dict for {k v} $d { + append out "$k $v" \n + } + return [linesort $out] + } + proc page_names_dict {{search *}} { + if {![regexp {[?*]} $search]} { + set search "*$search*" + } + set encnames [encoding names] + foreach enc $encnames { + dict set d $enc [list] + } + variable encmimens + set mimenames [array get ${encmimens}::reversemap] + dict for {mname encname} $mimenames { + if {$encname in $encnames} { + set enclist [dict get $d $encname] + if {$mname ni $enclist} { + dict lappend d $encname $mname + } + } + } + foreach enc [lsort $encnames] { + set mime_enc [${encmimens}::mapencoding $enc] + if {$mime_enc ne ""} { + set enclist [dict get $d $enc] + if {$mime_enc ni $enclist} { + dict lappend d $enc $mime_enc + } + } + } + set dresult [dict create] + if {$search ne "*"} { + dict for {k v} $d { + if {[string match -nocase $search $k] || ([lsearch -nocase $v $search]) >= 0} { + dict set dresult $k $v + } + } + } else { + set dresult $d + } + return $dresult + } + proc page8 {encname args} { + dict set args -cols 8 + tailcall page $encname {*}$args + } + proc page16 {encname args} { + dict set args -cols 16 + tailcall page $encname {*}$args + } + proc page {encname args} { + variable invalid + set encname [encname $encname] + set defaults [list\ + -range {0 256}\ + -cols 16\ + ] + set opts [dict merge $defaults $args] + # -- --- --- --- --- --- --- --- --- + set cols [dict get $opts -cols] + # -- --- --- --- --- --- --- --- --- + + set d_bytedisplay [basedict_display] + + #set d_ascii [pagedict_raw ascii] + set d_ascii [basedict] + set d_asciiposn [lreverse $d_ascii] ;#values should be unique except for "???". 
We are mainly interested in which ones have display-names e.g cr csi + #The results of this are best seen by comparing the ebcdic and ascii pages + + set d_page [pagedict_raw $encname] + + set out "" + set i 1 + append out " " + dict for {k rawchar} $d_page { + set num [expr {"0x$k"}] + #see if ascii equivalent exists and has a name + if {$rawchar eq $invalid} { + set displayv "$invalid" + } else { + set bytedisplay "" + if {[dict exists $d_asciiposn $rawchar]} { + set asciiposn [dict get $d_asciiposn $rawchar] + set bytedisplay [dict get $d_bytedisplay $asciiposn] + } + if {$bytedisplay eq $invalid} { + # + set displayv " $rawchar " + } else { + set displaylen [string length $bytedisplay] + if {$displaylen == 2} { + set displayv "$bytedisplay " + } elseif {$displaylen == 3} { + set displayv $bytedisplay + } else { + if {[string length $rawchar] == 0} { + set displayv " " + } else { + #presumed 1 + set displayv " $rawchar " + } + } + } + } + + append out "$k $displayv " + if {$i > 0 && $i % $cols == 0} { + set out [string range $out 0 end-2] + append out \n " " + } + incr i + } + set out [string trimright $out " "] + return $out + } + + proc pagechar1 {page num} { + set encpage [encname $page] + encoding convertfrom $encpage [format %c $num] + } + + proc pagechar {page num} { + set encpage [encname $page] + + set ch [format %c $num] + if {[decodable $ch $encpage]} { + set outchar [encoding convertfrom $encpage $ch] + } else { + #here we will use \0xFFFD instead of our replacement string ??? - as the name pagechar implies always returning a single character. REVIEW. + set outchar $::punk::char::invalid_display_char + } + return $outchar + } + proc pagechar_info {page num} { + set ch [format %c $num] + set h [format %04x $num] + set encpage [encname $page] + if {[decodable $ch $encpage]} { + set outchar [encoding convertfrom $encpage $ch] + } else { + error "pagechar_info: $h not decodable from $encpage" + } + package require punk::console + puts -nonewline stdout \033\[s;flush stdout + lassign [punk::console::get_cursor_pos_list] _row1 col1 + puts -nonewline stdout "$outchar";flush stdout + lassign [punk::console::get_cursor_pos_list] _row2 col2 + puts -nonewline stdout "\033\[u";flush stdout + return "$col1 -> $col2" + } + + proc pagebyte {page num} { + set encpage [encname $page] + + set ch [format %c $num] + if {[decodable $ch $encpage]} { + #set outchar [encoding convertto $encpage [format %c $num]] + set outchar [format %c $num] + } else { + set outchar $::punk::char::invalid_display_char + } + return $outchar + } + + proc all_pages {} { + set out "" + set mimenamesdict [page_names_dict] + foreach encname [encoding names] { + if {[dict exists $mimenamesdict $encname]} { + set alt "([dict get $mimenamesdict $encname])" + } else { + set alt "" + } + append out "$encname $alt" \n + append out [page $encname] + } + return $out + } + + proc encname {encoding_name_or_alias} { + set encname $encoding_name_or_alias + if {[lsearch -nocase [page_names] $encname] < 0} { + error "Unknown encoding '$encname' - use 'punk::char::page_names' to see valid encoding names on this system" + } + variable encmimens + if {$encname ni [encoding names]} { + set encname [${encmimens}::reversemapencoding $encname] + } + return $encname + } + + proc pagedict_raw {encname} { + variable invalid ;# ="???" 
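+        #pagedict_raw returns a 256-entry dict keyed by two-digit hex byte value (00..ff).
+        #Bytes that don't survive a decode round-trip for the given encoding map to the 'invalid' placeholder above.
+        #e.g (illustrative): dict get [pagedict_raw cp437] 41   ;# -> A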
+ set encname [encname $encname] + set d [dict create] + for {set i 0} {$i < 256} {incr i} { + set k [format %02x $i] + #dict set d $k [encoding convertfrom $encname [format %c $i]] + set ch [format %c $i] ; + #jmn + if {[decodable $ch $encname]} { + #set encchar [encoding convertto $encname $ch] + #dict set d $k [encoding convertfrom $encchar] + dict set d $k [encoding convertfrom $encname $ch] + } else { + dict set d $k $invalid ;#use replacement so we can detect difference from actual "?" + } + } + return $d + } + proc asciidict {} { + variable invalid + set d [dict create] + set a128 [asciidict128] + for {set i 0} {$i < 256} {incr i} { + set k [format %02x $i] + if {$i <= 127} { + dict set d $k [dict get $a128 $k] + } else { + # + dict set d $k $invalid + } + + if {$i <=32} { + #no point emitting the lower control characters raw to screen - emit the short-names defined in the 'ascii' proc + dict set d $k [dict get $a128 $k] + } else { + if {$i == 0x9b} { + dict set d $k CSI ;#don't emit the ansi 'Control Sequence Introducer' - or it will be interpreted by the console and affect the layout. + } else { + dict set d $k [format %c $i] + } + } + } + return $d + } + + proc basedict_display {} { + set d [dict create] + set a128 [asciidict128] + for {set i 0} {$i < 256} {incr i} { + set k [format %02x $i] + if {$i <=32} { + #no point emitting the lower control characters raw to screen - emit the short-names defined in the 'ascii' proc + dict set d $k [dict get $a128 $k] + } else { + if {$i == 0x9b} { + dict set d $k CSI ;#don't emit the ansi 'Control Sequence Introducer' - or it will be interpreted by the console and affect the layout. + } elseif {$i == 0x9c} { + dict set d $k OSC + } else { + #dict set d $k [encoding convertfrom [encoding system] [format %c $i]] + #don't use encoding convertfrom - we want the value independent of the current encoding system. 
+ dict set d $k [format %c $i] + } + } + } + return $d + } + proc basedict_encoding_system {} { + #result depends on 'encoding system' currently in effect + set d [dict create] + for {set i 0} {$i < 256} {incr i} { + set k [format %02x $i] + dict set d $k [encoding convertfrom [encoding system] [format %c $i]] + } + return $d + } + + proc basedict {} { + #this gives same result independent of current value of 'encoding system' + set d [dict create] + for {set i 0} {$i < 256} {incr i} { + set k [format %02x $i] + dict set d $k [format %c $i] + } + return $d + } + proc pagedict {pagename args} { + variable charsets + set encname [encname $pagename] + set defaults [list\ + -range {0 256}\ + -charset ""\ + ] + set opts [dict merge $defaults $args] + # -- --- --- --- --- --- --- --- --- --- + set range [dict get $opts -range] + set charset [dict get $opts -charset] + # -- --- --- --- --- --- --- --- --- --- + if {$charset ne ""} { + if {$charset ni [charset_names]} { + error "unknown charset '$charset' - use 'charset_names' to get list" + } + set setinfo [dict get $charsets $charset] + set ranges [dict get $setinfo ranges] + set charset_dict [dict create] + foreach r $ranges { + set start [dict get $r start] + set end [dict get $r end] + #set charset_dict [dict merge $charset_dict [char_range_dict $start $end]] + break + } + + } else { + set start [lindex $range 0] + set end [lindex $range 1] + } + + set d [dict create] + for {set i $start} {$i <= $end} {incr i} { + set k [format %02x $i] + dict set d $k [encoding convertfrom $encname [format %c $i]] + } + return $d + } + + #todo - benchmark peformance - improve punk pipeline + proc asciidict128 {} { + regexp -all -inline {\S+} [concat {*}[linelist -line trimleft [ascii]]] + } + proc _asciidict128 {} { + .= ascii |> .=> linelist -line trimleft |> .=* concat |> {regexp -all -inline {\S+} $data} + } + + proc asciidict2 {} { + set d [dict create] + dict for {k v} [basedict_display] { + if {[string length $v] == 1} { + set num [expr {"0x$k"}] + #dict set d $k "\033(0[subst \\u00$k]\033(B" + dict set d $k "\033(0[format %c $num]\033(B" + } else { + dict set d $k $v + } + } + return $d + } + + #-- --- --- --- --- --- --- --- + # encoding convertfrom & encoding convertto can be somewhat confusing to think about. (Need to think in reverse.) + # e.g encoding convertto dingbats will output something that doesn't look dingbatty on screen. + #-- --- --- --- --- --- --- --- + #must use Tcl instead of tcl (at least for 8.6) + if {![package vsatisfies [package present Tcl] 8.7]} { + proc encodable "s {enc [encoding system]}" { + set encname [encname $enc] + if {($encname eq "ascii")} { + #8.6 fails to round-trip convert 0x7f del character despite it being in the ascii range (review Why?? what else doesn't round-trip but should?) 
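+            #Illustrative round-trip results for the check below (assuming a standard 'ascii' encoding table):
+            #  encodable "abc" ascii     ;# -> 1
+            #  encodable "\u00e9" ascii  ;# -> 0  (no 7-bit ascii representation, so the round trip alters it)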
+ #just strip it out of the string as we are only after a boolean answer and if s is only a single del char empty string will result in true + set s [string map [list [format %c 0x7f] ""] $s] + } + string eq $s [encoding convertfrom $encname [encoding convertto $encname $s]] + } + #note also that tcl8.6 has anomalies with how it handles some unassigned codepoints + # e.g unassigned codes in the middle of a codepage may appear to be encodable&decodable in a round trip whereas undefined codepoints at the end may get the replacement character defined in the tcl encodings dir (usually the 3f char: "?") + proc decodable "s {enc [encoding system]}" { + set encname [encname $enc] + #review + string eq $s [encoding convertto $encname [encoding convertfrom $encname $s]] + } + } else { + proc encodable "s {enc [encoding system]}" { + set encname [encname $enc] + string eq $s [encoding convertfrom $encname [encoding convertto $encname $s]] + } + proc decodable "s {enc [encoding system]}" { + set encname [encname $enc] + string eq $s [encoding convertto $encname [encoding convertfrom $encname $s]] + } + } + #-- --- --- --- --- --- --- --- + proc test_japanese {{encoding jis0208}} { + #A very basic test of 2char encodings such as jis0208 + set yatbun 日本 ;# encoding convertfrom jis0208 F|K\\ + lassign [split $yatbun] yat bun + puts "original yatbun ${yat} ${bun}" + set eyat [encoding convertto $encoding $yat] + set ebun [encoding convertto $encoding $bun] + puts "$encoding encoded: ${eyat} ${ebun}" + puts "reencoded: [encoding convertfrom $encoding $eyat] [encoding convertfrom $encoding $ebun]" + return $yatbun + } + proc test_grave {} { + set g [format %c 0x300] + puts stdout "Testing console display of grave accented a in between letters x and y - accent should combine over the top of the letter a." + puts stdout "Apparent width should theoretically be 1 console-column" + package require punk::console + puts stdout "# -- --- --- ---" + puts -nonewline "xa${g}z";set cursorposn [punk::console::get_cursor_pos] + puts stdout \n + puts stdout "cursor position immediately after outputing 4 bytes (expecting 3 glyphs): $cursorposn" + puts stdout "# -- --- --- ---" + puts -nonewline "xyz";set cursorposn [punk::console::get_cursor_pos] + puts stdout \n + puts stdout "cursor position immediately after outputing 3 bytes (xyz): $cursorposn" + } + proc test_farmer {} { + #an interesting article re grapheme clustering problems in terminals https://mitchellh.com/writing/grapheme-clusters-in-terminals + #(similar to the problem with grave accent rendering width that the test_grave proc is written for) + # -- --- --- --- --- + #These pasted glyphs can display in console even when the unicode versions don't (tcl 8.6 limited to 65533/FFFD ?) 
+ upvar farmer1_paste test_farmer1 + upvar farmer2_paste test_farmer2 + set test_farmer1 🧑‍🌾 ;#contains zero-width joiner between + set test_farmer2 🧑🌾 + puts "pasted farmer1 exporting as var farmer1_paste: $test_farmer1" + puts "pasted farmer2 exporting as var farmer2_paste: $test_farmer2" + # -- --- --- --- --- + + + set farmer1 "\U0001f9d1\U0000200d\U0001f33e" + set farmer2 "\U0001f9d1\U0001f33e" + puts stdout "farmer1 with zero-width joiner, codes: \\U0001f9d1\\U0000200d\\U0001f33e : $farmer1" + puts stdout "farmer2 with no joiner codes: \\U0001f9d1\\U001f33e : $farmer2" + + package require punk::console + puts stdout "#2--5---9---C---" + puts -nonewline stdout \033\[2K\033\[1G ;#2K erase line 1G cursor at col1 + puts -nonewline "${farmer1}";set cursorposn [punk::console::get_cursor_pos] + puts stdout \n + puts stdout "cursor position immediately after outputing farmer1 (expecting 1 glyph 2 wide) : $cursorposn" + puts stdout "#2--5---9---C---" + puts -nonewline "${farmer2}";set cursorposn [punk::console::get_cursor_pos] + puts stdout \n + puts stdout "cursor position immediately after outputing farmer2 (expecting 2 glyphs 4 wide in total): $cursorposn" + + return [list $farmer1 $farmer2] + } + + #G0 Sets Sequence G1 Sets Sequence Meaning + #ESC ( A ESC ) A United Kingdom Set + #ESC ( B ESC ) B ASCII Set + #ESC ( 0 ESC ) 0 Special Graphics + #ESC ( 1 ESC ) 1 Alternate Character ROM Standard Character Set + #ESC ( 2 ESC ) 2 Alternate Character ROM Special Graphic + + # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + # Unicode character sets - some hardcoded - some loadable from data files + # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + variable charinfo [dict create] + variable charsets [dict create] + + + # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + # Aggregate character sets (ones that pick various ranges from underlying unicode ranges) + # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + dict set charsets "WGL4" [list altname "Windows Glyph List 4" ranges [list\ + {start 0 end 127 name "basic latin"}\ + {start 128 end 255 name "latin-1 supplement"}\ + {start 256 end 383 name "Latin Extended-A"}\ + {start 402 end 402 name "subset Latin Extended-B"}\ + {start 506 end 511 name "subset Latin Extended-B"}\ + {start 710 end 711 name "subset Spacing Modifier Letters"}\ + {start 713 end 713 name "subset Spacing Modifier Letters"}\ + {start 728 end 733 name "subset Spacing Modifier Letters"}\ + {start 900 end 906 name "subset Greek"}\ + {start 908 end 908 name "subset Greek"}\ + {start 910 end 974 name "subset Greek"}\ + {start 1024 end 1119 name "subset Cyrillic"}\ + {start 1168 end 1169 name "subset Cyrillic"}\ + {start 7808 end 7813 name "subset Latin Extended Additional"}\ + {start 7922 end 7923 name "subset Latin Extended Additional"}\ + {start 8211 end 8213 name "subset General Punctuation"}\ + {start 8215 end 8222 name "subset General Punctuation"}\ + {start 8224 end 8226 name "subset General Punctuation"}\ + {start 8230 end 8230 name "subset General Punctuation"}\ + {start 8240 end 8240 name "subset General Punctuation"}\ + {start 8242 end 8243 name "subset General Punctuation"}\ + ] description "Microsoft WGL4 Repertoire" settype "other"] + + + + + # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + #The base page 0-256 8bit range - values don't have specific characters or descriptions - as they are codepage dependent + 
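+    #Illustrative queries of the aggregate set defined above:
+    #  dict get $charsets WGL4 altname            ;# -> "Windows Glyph List 4"
+    #  lindex [dict get $charsets WGL4 ranges] 0  ;# -> start 0 end 127 name {basic latin}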
#we will fill this here for completeness - but with placeholders + # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + dict set charsets "8bit" [list ranges [list {start 0 end 127 name ascii} {start 128 end 255 name 8bit-ascii}] description "8bit extended ascii range" settype "other"] + for {set i 0} {$i < 256} {incr i} { + dict set charinfo $i [list desc "codepage-dependent" short "byte_[format %02x $i]"] + } + + # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + # Unicode ranges + # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + dict set charsets "greek" [list ranges [list {start 880 end 1023} {start 7936 end 8191}] description "Greek and Coptic" settype "other"] + + + # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + dict set charsets "Block Elements" [list ranges [list {start 9600 end 9631}] description "Block Elements" settype "other"] + dict set charinfo 9600 [list desc "Upper Half Block" short "blocke_up_half"] + dict set charinfo 9601 [list desc "Lower One Eighth Block" short "blocke_lw_1_8th"] + dict set charinfo 9602 [list desc "Lower One Quarter Block" short "blocke_lw_1_qtr"] + + + # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + dict set charsets "Dingbats" [list ranges [list {start 9984 end 10175 }] description "Unicode Dingbats" settype "tcl_supplemented"] + dict set charinfo 9984 [list desc "Black Safety Scissors" short "dingbats_black_safety_scissors"] + #... + dict set charinfo 10175 [list desc "Double Curly Loop" short "dingbats_double_curly_loop"] + + + + # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + #variation selectors 0xFe01 - 0xFE0F + dict set charsets "Variation Selectors" [list ranges [list {start 65024 end 65039}] description "Variation Selectors" note "combining character with previous char - variant glyph display" settype "tcl_supplemented"] + dict set charinfo 65024 [list desc "Variation Selector-1" short "VS1"] + dict set charinfo 65025 [list desc "Variation Selector-2" short "VS2"] + dict set charinfo 65026 [list desc "Variation Selector-3" short "VS3"] + dict set charinfo 65027 [list desc "Variation Selector-4" short "VS4"] + dict set charinfo 65027 [list desc "Variation Selector-5" short "VS5"] + dict set charinfo 65029 [list desc "Variation Selector-6" short "VS6"] + dict set charinfo 65030 [list desc "Variation Selector-7" short "VS7"] + dict set charinfo 65031 [list desc "Variation Selector-8" short "VS8"] + dict set charinfo 65032 [list desc "Variation Selector-9" short "VS9"] + dict set charinfo 65033 [list desc "Variation Selector-10" short "VS10"] + dict set charinfo 65034 [list desc "Variation Selector-11" short "VS11"] + dict set charinfo 65035 [list desc "Variation Selector-12" short "VS12"] + dict set charinfo 65036 [list desc "Variation Selector-13" short "VS13"] + dict set charinfo 65037 [list desc "Variation Selector-14" short "VS14"] + dict set charinfo 65038 [list desc "Variation Selector-15 text variation" short "VS15"] ;#still an image - just more suitable for text-presentation e.g word-processing doc + dict set charinfo 65039 [list desc "Variation Selector-16 emoji variation" short "VS16"] + + + # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + # emoticons https://www.unicode.org/charts/PDF/U1F600.pdf + dict set charsets "Emoticons" 
[list ranges [list {start 128512 end 128591}] description "Emoticons" settype "tcl_supplemented"] + dict set charinfo 128512 [list desc "Grinning Face" short "emoticon_gface"] + dict set charinfo 128513 [list desc "Grinning Face with Smiling Eyes" short "emoticon_gface_smile_eyes"] + dict set charinfo 128514 [list desc "Face with Tears of Joy" short "emoticon_face_tears_joy"] + + #todo + dict set charinfo 128590 [list desc "Person with Pouting Face" short "emoticon_person_pout"] + + # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + dict set charsets "Box Drawing" [list ranges [list {start 9472 end 9599}] description "Box Drawing" settype "tcl_supplemented"] + dict set charinfo 9472 [list desc "Box Drawings Light Horizontal" short "boxd_lhz"] + dict set charinfo 9473 [list desc "Box Drawings Heavy Horizontal" short "boxd_hhz"] + dict set charinfo 9474 [list desc "Box Drawings Light Vertical" short "boxd_lv"] + dict set charinfo 9475 [list desc "Box Drawings Heavy Vertical" short "boxd_hv"] + dict set charinfo 9476 [list desc "Box Drawings Light Triple Dash Horizontal" short "boxd_ltdshhz"] + dict set charinfo 9477 [list desc "Box Drawings Heavy Triple Dash Horizontal" short "boxd_htdshhz"] + dict set charinfo 9478 [list desc "Box Drawings Light Triple Dash Vertical" short "boxd_ltdshv"] + dict set charinfo 9479 [list desc "Box Drawings Heavy Triple Dash Vertical" short "boxd_htdshv"] + dict set charinfo 9480 [list desc "Box Drawings Light Quadruple Dash Horizontal" short "boxd_lqdshhz"] + dict set charinfo 9481 [list desc "Box Drawings Heavy Quadruple Dash Horizontal" short "boxd_hqdshhz"] + dict set charinfo 9482 [list desc "Box Drawings Light Quadruple Dash Vertical" short "boxd_lqdshv"] + dict set charinfo 9483 [list desc "Box Drawings Heavy Quadruple Dash Vertical" short "boxd_hqdshv"] + dict set charinfo 9484 [list desc "Box Drawings Light Down and Right" short "boxd_ldr"] + dict set charinfo 9485 [list desc "Box Drawings Down Light and Right Heavy" short "boxd_dlrh"] + dict set charinfo 9486 [list desc "Box Drawings Down Heavy and Right Light" short "boxd_dhrl"] + dict set charinfo 9487 [list desc "Box Drawings Heavy Down and Right" short "boxd_hdr"] + dict set charinfo 9488 [list desc "Box Drawings Light Down and Left" short "boxd_ldl"] + dict set charinfo 9489 [list desc "Box Drawings Down Light and Left Heavy" short "boxd_dllh"] + dict set charinfo 9490 [list desc "Box Drawings Down Heavy and Left Light" short "boxd_dhll"] + dict set charinfo 9491 [list desc "Box Drawings Heavy Down and Left" short "boxd_hdl"] + dict set charinfo 9492 [list desc "Box Drawings Light Up and Right" short "boxd_lur"] + dict set charinfo 9493 [list desc "Box Drawings Up Light and Right Heavy" short "boxd_ulrh"] + dict set charinfo 9494 [list desc "Box Drawings Up Heavy and Right Light" short "boxd_uhrl"] + dict set charinfo 9495 [list desc "Box Drawings Heavy Up and Right" short "boxd_hur"] + dict set charinfo 9496 [list desc "Box Drawings Light Up and Left" short "boxd_lul"] + dict set charinfo 9497 [list desc "Box Drawings Up Light and Left Heavy" short "boxd_ullh"] + dict set charinfo 9498 [list desc "Box Drawings Up Heavy and Left Light" short "boxd_uhll"] + dict set charinfo 9499 [list desc "Box Drawings Heavy Up and Left" short "boxd_hul"] + dict set charinfo 9500 [list desc "Box Drawings Light Vertical and Right" short "boxd_lvr"] + dict set charinfo 9501 [list desc "Box Drawings Vertical Light and Right Heavy" short "boxd_vlrh"] + dict set charinfo 9502 [list desc 
"Box Drawings Up Heavy and Right Down Light" short "boxd_uhrdl"] + dict set charinfo 9503 [list desc "Box Drawings Down Heavy and Right Up Light" short "boxd_dhrul"] + dict set charinfo 9504 [list desc "Box Drawings Vertical Heavy and Right Light" short "boxd_vhrl"] + dict set charinfo 9505 [list desc "Box Drawings Down Light and Right Up Heavy" short "boxd_dlruh"] + dict set charinfo 9506 [list desc "Box Drawings Up Light and Right Down Heavy" short "boxd_ulrdh"] + dict set charinfo 9507 [list desc "Box Drawings Heavy Vertical and Right" short "boxd_hvr"] + dict set charinfo 9508 [list desc "Box Drawings Light Vertical and Left" short "boxd_lvl"] + dict set charinfo 9509 [list desc "Box Drawings Vertical Light and Left Heavy" short "boxd_vllh"] + dict set charinfo 9510 [list desc "Box Drawings Up Heavy and Let Down Light" short "boxd_uhldl"] + dict set charinfo 9511 [list desc "Box Drawings Down Heavy and Left Up Light" short "boxd_dhlul"] + dict set charinfo 9512 [list desc "Box Drawings Vertical Heavy and Left Light" short "boxd_vhll"] + dict set charinfo 9513 [list desc "Box Drawings Down Light and left Up Heavy" short "boxd_dlluh"] + dict set charinfo 9514 [list desc "Box Drawings Up Light and Left Down Heavy" short "boxd_ulldh"] + dict set charinfo 9515 [list desc "Box Drawings Heavy Vertical and Left" short "boxd_hvl"] + dict set charinfo 9516 [list desc "Box Drawings Light Down and Horizontal" short "boxd_ldhz"] + dict set charinfo 9517 [list desc "Box Drawings Left Heavy and Right Down Light" short "boxd_lhrdl"] + dict set charinfo 9518 [list desc "Box Drawings Right Heavy and Left Down Light" short "boxd_rhldl"] + dict set charinfo 9519 [list desc "Box Drawings Down Light and Horizontal Heavy" short "boxd_dlhzh"] + dict set charinfo 9520 [list desc "Box Drawings Down Heavy and Horizontal Light" short "boxd_dhhzl"] + dict set charinfo 9521 [list desc "Box Drawings Right Light and Left Down Heavy" short "boxd_rlldh"] + dict set charinfo 9522 [list desc "Box Drawings Left Light and Right Down Heavy" short "boxd_llrdh"] + dict set charinfo 9523 [list desc "Box Drawings Heavy Down and Horizontal" short "boxd_hdhz"] + dict set charinfo 9524 [list desc "Box Drawings Light Up and Horizontal" short "boxd_luhz"] + dict set charinfo 9525 [list desc "Box Drawings Left Heavy and Right Up Light" short "boxd_lhrul"] + dict set charinfo 9526 [list desc "Box Drawings Right Heavy and Left Up Light" short "boxd_rhlul"] + dict set charinfo 9527 [list desc "Box Drawings Up Light and Horizontal Heavy" short "boxd_ulhzh"] + dict set charinfo 9528 [list desc "Box Drawings Up Heavy and Horizontal Light" short "boxd_uhhzl"] + dict set charinfo 9529 [list desc "Box Drawings Right Light and Left Up Heavy" short "boxd_rlluh"] + dict set charinfo 9530 [list desc "Box Drawings Left Light and Right Up Heavy" short "boxd_llruh"] + dict set charinfo 9531 [list desc "Box Drawings Heavy Up and Horizontal" short "boxd_huhz"] + dict set charinfo 9532 [list desc "Box Drawings Light Vertical and Horizontal" short "boxd_lvhz"] + dict set charinfo 9533 [list desc "Box Drawings Left Heavy and Right Vertical Light" short "boxd_lhrvl"] + dict set charinfo 9534 [list desc "Box Drawings Right Heavy and Left Vertical Light" short "boxd_rhlvl"] + dict set charinfo 9535 [list desc "Box Drawings Vertical Light and Horizontal Heavy" short "boxd_vlhzh"] + dict set charinfo 9536 [list desc "Box Drawings Up Heavy and Down Horizontal Light" short "boxd_uhdhzl"] + dict set charinfo 9537 [list desc "Box Drawings Down Heavy and Up 
Horizontal Light" short "boxd_dhuhzl"] + dict set charinfo 9538 [list desc "Box Drawings Vertical Heavy and Horizontal Light" short "boxd_vhhzl"] + dict set charinfo 9539 [list desc "Box Drawings Left Up Heavy and Right Down Light" short "boxd_luhrdl"] + dict set charinfo 9540 [list desc "Box Drawings Right Up Heavy and Left Down Light" short "boxd_ruhldl"] + dict set charinfo 9541 [list desc "Box Drawings Left Down Heavy and Right Up Light" short "boxd_ldhrul"] + dict set charinfo 9542 [list desc "Box Drawings Right Down Heavy and Left Up Light" short "boxd_rdhlul"] + dict set charinfo 9543 [list desc "Box Drawings Down Light and Up Horizontal Heavy" short "boxd_dluhzh"] + dict set charinfo 9544 [list desc "Box Drawings Up Light and Down Horizontal Heavy" short "boxd_dldhzh"] + dict set charinfo 9545 [list desc "Box Drawings Right Light and Left Vertical Heavy" short "boxd_rllvh"] + dict set charinfo 9546 [list desc "Box Drawings Left Light and Right Vertical Heavy" short "boxd_llrvh"] + dict set charinfo 9547 [list desc "Box Drawings Heavy Vertical and Horizontal" short "boxd_hvhz"] + dict set charinfo 9548 [list desc "Box Drawings Light Double Dash Horizontal" short "boxd_lddshhz"] + dict set charinfo 9549 [list desc "Box Drawings Heavy Double Dash Horizontal" short "boxd_hddshhz"] + dict set charinfo 9550 [list desc "Box Drawings Light Double Dash Vertical" short "boxd_lddshv"] + dict set charinfo 9551 [list desc "Box Drawings Heavy Double Dash Vertical" short "boxd_hddshv"] + dict set charinfo 9552 [list desc "Box Drawings Double Horizontal" short "boxd_dhz"] + dict set charinfo 9553 [list desc "Box Drawings Double Vertical" short "boxd_dv"] + dict set charinfo 9554 [list desc "Box Drawings Down Single and Right Double" short "boxd_dsrd"] + dict set charinfo 9555 [list desc "Box Drawings Down Double and Right Single" short "boxd_ddrs"] + dict set charinfo 9556 [list desc "Box Drawings Double Down and Right" short "boxd_ddr"] + dict set charinfo 9557 [list desc "Box Drawings Down Single and Left Double" short "boxd_dsld"] + dict set charinfo 9558 [list desc "Box Drawings Down Double and Left Single" short "boxd_ddls"] + dict set charinfo 9559 [list desc "Box Drawings Double Down and Left" short "boxd_ddl"] + dict set charinfo 9560 [list desc "Box Drawings Up Single and Right Double" short "boxd_usrd"] + dict set charinfo 9561 [list desc "Box Drawings Up Double and Right Single" short "boxd_udrs"] + dict set charinfo 9562 [list desc "Box Drawings Double Up and Right" short "boxd_dur"] + dict set charinfo 9563 [list desc "Box Drawings Up Single and Left Double" short "boxd_usld"] + dict set charinfo 9564 [list desc "Box Drawings Up Double and Left Single" short "boxd_udls"] + dict set charinfo 9565 [list desc "Box Drawings Double Up and Left" short "boxd_dul"] + dict set charinfo 9566 [list desc "Box Drawings Vertical Single and Right Double" short "boxd_vsrd"] + dict set charinfo 9567 [list desc "Box Drawings Vertical Double and Right Single" short "boxd_vdrs"] + dict set charinfo 9568 [list desc "Box Drawings Double Vertical and Right" short "boxd_dvr"] + dict set charinfo 9569 [list desc "Box Drawings Vertical Single and Left Double" short "boxd_vsld"] + dict set charinfo 9570 [list desc "Box Drawings Vertical Double and Left Single" short "boxd_vdls"] + dict set charinfo 9571 [list desc "Box Drawings Double Vertical and Left" short "boxd_dvl"] + dict set charinfo 9572 [list desc "Box Drawings Down Single and Horizontal Double" short "boxd_dshzd"] + dict set charinfo 9573 [list desc 
"Box Drawings Down Double and Horizontal Single" short "boxd_ddhzs"] + dict set charinfo 9574 [list desc "Box Drawings Double Down and Horizontal" short "boxd_ddhz"] + dict set charinfo 9575 [list desc "Box Drawings Up Single and Horizontal Double" short "boxd_ushzd"] + dict set charinfo 9576 [list desc "Box Drawings Up Double and Horizontal Single" short "boxd_udhzs"] + dict set charinfo 9577 [list desc "Box Drawings Double Up and Horizontal" short "boxd_duhz"] + dict set charinfo 9578 [list desc "Box Drawings Vertical Single and Horizontal Double" short "boxd_vshzd"] + dict set charinfo 9579 [list desc "Box Drawings Vertical Double and Horizontal Single" short "boxd_vdhzs"] + dict set charinfo 9580 [list desc "Box Drawings Double Vertical and Horizontal" short "boxd_dvhz"] + dict set charinfo 9581 [list desc "Box Drawings Light Arc Down and Right" short "boxd_ladr"] + dict set charinfo 9582 [list desc "Box Drawings Light Arc Down and Left" short "boxd_ladl"] + dict set charinfo 9583 [list desc "Box Drawings Light Arc Up and Left" short "boxd_laul"] + dict set charinfo 9584 [list desc "Box Drawings Light Arc Up and Right" short "boxd_laur"] + dict set charinfo 9585 [list desc "Box Drawings Light Diagonal Upper Right To Lower Left" short "boxd_ldgurll"] + dict set charinfo 9586 [list desc "Box Drawings Light Diagonal Upper Left To Lower Right" short "boxd_ldgullr"] + dict set charinfo 9587 [list desc "Box Drawings Light Diagonal Cross" short "boxd_ldc"] + dict set charinfo 9588 [list desc "Box Drawings Light Left" short "boxd_ll"] + dict set charinfo 9589 [list desc "Box Drawings Light Up" short "boxd_lu"] + dict set charinfo 9590 [list desc "Box Drawings Light Right" short "boxd_lr"] + dict set charinfo 9591 [list desc "Box Drawings Light Down" short "boxd_ld"] + dict set charinfo 9592 [list desc "Box Drawings Heavy Left" short "boxd_hl"] + dict set charinfo 9593 [list desc "Box Drawings Heavy Up" short "boxd_hu"] + dict set charinfo 9594 [list desc "Box Drawings Heavy Right" short "boxd_hr"] + dict set charinfo 9595 [list desc "Box Drawings Heavy Down" short "boxd_hd"] + dict set charinfo 9596 [list desc "Box Drawings Light Left and Heavy Right" short "boxd_llhr"] + dict set charinfo 9597 [list desc "Box Drawings Light Up and Heavy Down" short "boxd_luhd"] + dict set charinfo 9598 [list desc "Box Drawings Heavy Left and Light Right" short "boxd_hllr"] + dict set charinfo 9599 [list desc "Box Drawings Heavy Up and Light Down" short "boxd_huld"] + + + dict set charsets "Halfwidth and Fullwidth Forms" [list ranges [list {start 65280 end 65519}] description "Halfwidth and Fullwidth Forms (variants)" settype "tcl_supplemental"] + dict set charsets "ascii_fullwidth" [list ranges [list {start 65281 end 65374}] description "Ascii 21 to 7E fullwidth" parentblock "halfwidth_and_fullwidth_forms" settype "other"] + + dict set charsets "Specials" [list ranges [list {start 65520 end 65535}] description "Specials" settype "tcl_supplemental"] + + dict set charsets "noncharacters" [list ranges [list\ + {start 64976 end 65007 note "BMP FDD0..FDEF"}\ + {start 65534 end 65535 note "BMP FFFE,FFFF"}\ + {start 131070 end 131071 note "plane1 1FFFE,1FFFF"}\ + {start 196606 end 196607 note "plane2 2FFFE,2FFFF"}\ + {start 262142 end 262143 note "plane3 3FFFE,3FFFF"}\ + {start 327678 end 327679 note "plane4 4FFFE,4FFFF"}\ + {start 393214 end 393215 note "plane5 5FFFE,5FFFF"}\ + {start 458750 end 458751 note "plane6 6FFFE,6FFFF"}\ + {start 524286 end 524287 note "plane7 7FFFE,7FFFF"}\ + {start 589822 end 589823 note 
"plane8 8FFFE,8FFFF"}\ + {start 655358 end 655359 note "plane9 9FFFE,9FFFF"}\ + {start 720894 end 720895 note "plane10 AFFFE,AFFFF"}\ + {start 786430 end 786431 note "plane11 BFFFE,BFFFF"}\ + {start 851966 end 851967 note "plane12 CFFFE,CFFFF"}\ + {start 917502 end 917503 note "plane13 DFFFE,DFFFF"}\ + {start 983038 end 983039 note "plane14 EFFFE,EFFFF"}\ + {start 1048574 end 1048575 note "plane15 FFFFE,FFFFF"}\ + {start 1114110 end 1114111 note "plane16 10FFFE,10FFFF"}\ + ] description "non-characters" settype "tcl_supplemental"] + + #build dicts keyed on short + variable charshort + proc _build_charshort {} { + variable charshort + set charshort [dict create] + variable charinfo + dict for {k v} $charinfo { + if {[dict exists $v short]} { + set sh [dict get $v short] + if {[dict exists $charshort $sh]} { + puts stderr "_build_charshort WARNING character data load duplicate shortcode '$sh'" + } + dict set charshort $sh [format %c $k] + } + } + return [dict size $charshort] + } + _build_charshort + + variable charset_extents_startpoints ;#stores endpoints associated with each startpoint - but named after key which is startpoint. + variable charset_extents_endpoints ;#stores startpoints assoicated with each endpoint - but named after key which is endpoint. + variable charset_extents_rangenames ;# dict keyed on start,end pointing to list of 2-tuples {setname rangeindex_within_set} + #build 2 indexes for each range.(charsets are not just unicode blocks so can have multiple ranges) + #Note that a range could be as small as a single char (startpoint = endpoint) so there can be many ranges with same start and end if charsets use some of the same subsets. + #as each charset_extents_startpoins,charset_extents_endpoints is built - the associated range name and index is appended to the rangenames dict + #startpoints - key is startpoint of a range, value is list of endpoints one for each range starting at this startpoint-key + #endpoints - key is endpoint of a range, value is list of startpoints one for each range ending at this endpoint-key + proc _build_charset_extents {} { + variable charsets + variable charset_extents_startpoints + variable charset_extents_endpoints + variable charset_extents_rangenames + set charset_extents_startpoints [dict create] + set charset_extents_endpoints [dict create] + set charset_extents_rangenames [dict create] + dict for {setname setinfo} $charsets { + set ranges [dict get $setinfo ranges] + if {[dict get $setinfo settype] eq "block"} { + #unicode block must have a single range + #we consider a char a member of the block even if unassigned/reserved (as per unicode documentation) + set start [dict get [lindex $ranges 0] start] + set end [dict get [lindex $ranges 0] end] + if {![dict exists $charset_extents_startpoints $start] || $end ni [dict get $charset_extents_startpoints $start]} { + #assert if end wasn't in startpoits list - then start won't be in endpoints list + dict lappend charset_extents_startpoints $start $end + dict lappend charset_extents_endpoints $end $start + } + dict lappend charset_extents_rangenames ${start},${end} [list $setname 1] + } else { + #multirange sets/scripts. have holes. Char not a member if it's not explicitly in a defined range. 
+ #They should be in order within a set - but we don't assume so + set r 1 + foreach range $ranges { + set start [dict get $range start] + set end [dict get $range end] + if {![dict exists $charset_extents_startpoints $start] || $end ni [dict get $charset_extents_startpoints $start]} { + #assert if end wasn't in startpoits list - then start won't be in endpoints list + dict lappend charset_extents_startpoints $start $end + dict lappend charset_extents_endpoints $end $start + } + dict lappend charset_extents_rangenames ${start},${end} [list $setname $r] + incr r + } + } + } + #maintain in sorted order + #-stride is available in lsort even at tcl8.6 - but not in lsearch + set charset_extents_startpoints [lsort -stride 2 -integer $charset_extents_startpoints] + set charset_extents_endpoints [lsort -stride 2 -integer $charset_extents_endpoints] + #no need to sort charset_extents_rangenames - lookup only done using dict methods + return [dict size $charset_extents_startpoints] + } + _build_charset_extents ;#rebuilds for all charsets + + #nerdfonts are within the Private use E000 - F8FF range + proc load_nerdfonts {} { + variable charsets + variable charinfo + package require fileutil + set ver [package provide punk::char] + if {$ver ne ""} { + set ifneeded [package ifneeded punk::char [package provide punk::char]] + #puts stderr "punk::char ifneeded script: $ifneeded" + lassign [split $ifneeded ";"] _ sourceinfo + set basedir [file dirname [lindex $sourceinfo end]] + } else { + #review - will only work at package load time + set scr [info script] + if {$scr eq ""} { + error "load_nerdfonts unable to determine package folder" + } + set basedir [file dirname [info script]] + } + set pkg_data_dir [file join $basedir char] + set fname [file join $pkg_data_dir nerd-fonts-glyph-list.txt] + if {[file exists $fname]} { + #puts stderr "load_nerdfonts loading $fname" + set data [fileutil::cat -translation binary $fname] + set short_seen [dict create] + set current_set_range [dict create] + set filesets_loading [list] + foreach ln [split $data \n] { + set ln [string trim $ln] + if {$ln eq ""} {continue} + set desc [lassign $ln hex rawsetname] + set hexnum 0x$hex + set dec [expr $hexnum] + set setname "nf_$rawsetname" ;#Ensure nerdfont set names are prefixed. 
+ + if {$setname ni $filesets_loading} { + if {![dict exists $charsets $setname]} { + #set exists - but not in our filesets_loading list - therefore this set has been previously loaded, so clear old data first + dict unset charset $setname + } + set newrange [list start $dec end $dec] + dict set current_set_range $setname $newrange + dict set charsets $setname [list ranges [list $newrange] description "nerd fonts $rawsetname" settype "nerdfonts privateuse"] + + lappend filesets_loading $setname + } + #expects ordered glyph list + set existing_range [dict get $current_set_range $setname] + set existing_end [dict get $existing_range end] + if {$dec - $existing_end == 1} { + #part of current range + dict set current_set_range $setname end $dec + #overwrite last ranges element + set rangelist [lrange [dict get $charsets $setname ranges] 0 end-1] + lappend rangelist [dict get $current_set_range $setname] + dict set charsets $setname ranges $rangelist + } else { + #new range for set + dict set current_set_range $setname start $dec + dict set current_set_range $setname end $dec + set rangelist [dict get $charsets $setname ranges] + lappend rangelist [dict get $current_set_range $setname] + dict set charsets $setname ranges $rangelist + } + + if {![dict exists $charinfo $dec]} { + # -- --- + #review + set map [list beaufort bf gibbous gb crescent cr thunderstorm tstorm thermometer thermom] + lappend map {*}[list directory dir creativecommons ccom creative_commons ccom forwardslash fs] + lappend map {*}[list multimedia mm multiple multi outline outl language lang] + lappend map {*}[list odnoklassniki okru] + # -- --- + #consider other ways to unambiguously shorten names? + #normalize nf_fa & nf_fa 'o' element to 'outl' so outlines can be searched across sets more easily (o not necessarily at last position) + set normdesc [list] + foreach el $desc { + if {$el eq "o"} { + set el "outl" + } + lappend normdesc $el + } + set joined_desc [join $normdesc _] + #map after join so we can normalize some underscored elements e.g creativecommons & creative_commons + set mapped_desc [string map $map $joined_desc] + set s nf_${rawsetname}_$mapped_desc + + if {![dict exists $short_seen $s]} { + dict set short_seen $s {} + } else { + #duplicate in the data file (e.g 2023 weather night alt rain mix) + set s ${s}_$hex + } + dict set charinfo $dec [list desc "$desc" short $s] + } + } + _build_charshort + _build_charset_extents + } else { + puts stderr "unable to find glyph file. Tried $fname" + } + } + + proc package_base {} { + #assume punk::char is in .tm form and we can use the package provide statement to determine base location + #review + set pkgver [package present punk::char] + set pkginfo [package ifneeded punk::char $pkgver] + set tmfile [lindex $pkginfo end] + set pkg_base [file dirname $tmfile] + return $pkg_base + } + namespace eval internal { + proc unicode_folder {} { + set parent [file join [punk::char::package_base] char] + set candidates [glob -nocomplain -type d -dir $parent -tail unicode*] + set candidates [lsort -increasing $candidates] ;#review - dictionary sort - how are unicode versions ranked/compared?? + if {![llength $candidates]} { + error "Failed to find unicode data folder in folder '$parent'" + } + set folder [file join $parent [lindex $candidates end]] + return $folder + } + proc dict_getdef {dictValue args} { + if {[llength $args] < 2} { + error {wrong # args: should be "dict_getdef dictValue ?key ...? 
key default"} + } + set keys [lrange $args 0 end-1] + if {[dict exists $dictValue {*}$keys]} { + return [dict get $dictValue {*}$keys] + } else { + return [lindex $args end] + } + } + } + + + + #charsets structure + #dict set charsets "halfwidth_and_fullwidth_forms" [list ranges [list {start 65280 end 65519}] description "Halfwidth and Fullwidth Forms (variants) settype block"] + + #unicode Blocks.txt + #load the defined blocks into 'charsets' and mark as type 'block'. Unicode blocks have only one range - and don't overlap. + #We don't treat unassigned/reserved codes within a block specially at this stage - ie we will not chop a block into subranges on that basis. + #unassigned code points should get certain default properties (e.g bidirectionality ) according to their block - so it makes sense to treat them as belonging to the block. + #They also get the general property of Cn (Other,not assigned or Other,reserved) and a "Basic Type" of Noncharacter or Reserved + proc load_unicode_blocks {} { + #sample data line + #0000..007F; Basic Latin + variable charsets + set file [file join [internal::unicode_folder] Blocks.txt] + if {![file exists $file]} { + error "Unicode Blocks.txt file not found at path '$file'" + } + puts "ok.. loading" + set fd [open $file r] + fconfigure $fd -translation binary + set data [read $fd] + close $fd + set block_count 0 + foreach ln [split $data \n] { + set ln [string trim $ln] + if {[string match #* $ln]} { + continue + } + if {[set pcolon [string first ";" $ln]] > 0} { + set lhs [string trim [string range $ln 0 $pcolon-1]] + set name [string trim [string range $ln $pcolon+1 end]] + set lhsparts [split $lhs .] + set start [lindex $lhsparts 0] + set end [lindex $lhsparts end] + #puts "$start -> $end '$name'" + set decimal_start [expr {"0x$start"}] + set decimal_end [expr {"0x$end"}] + dict set charsets $name [list ranges [list [list start $decimal_start end $decimal_end note "unicode block $lhs"]] description "" settype block] + incr block_count + } + } + _build_charset_extents + return $block_count + } + + #unicode scripts + + #unicode UnicodeData.txt + + + + #https://www.unicode.org/reports/tr44/#Property_Values + + + #unicode EastAsianWidth.txt + #classify width of character - which is contextual in some cases + ##### + #Review - this is initial naive assumption that should get us mostly what we want for layout purposes in a utf-8-centric world. + #We will just load the values and treat H,N,Na as 1-wide and A,F,W as 2-wide for functions such as char::string_width on the basis that those using legacy sets can query the property and make their own determinations in those contexts. + #### + # -- --- + #A = Ambiguous - All characters that can be sometimes wide and sometimes narrow. (wide in east asian legacy sets, narrow in non-east asian usage) (private use chars considered ambiguous) + #F = East Asian Full-width + #H = East Asian Half-width + #N = Not east Asian (Neutral) - all other characters. 
(do not occur in legacy East Asian character sets) - treated like Na + #Na = East Asian Narrow - all other characters that are always narrow and have explicit full-width counterparts (e.g includes all of ASCII) + #W = East Asian Wide - all other characters that are always wide (Occur only in the context of Eas Asian Typography) + # -- --- + + + + + proc charshort {shortname} { + variable charshort + return [dict get $charshort $shortname] + } + + proc box_drawing {args} { + return [charset "Box Drawing" {*}$args] + } + proc box_drawing_dict {} { + return [charset_dict "Box Drawing"] + } + + proc char_info_hex {hex args} { + set hex [string map [list _ ""] $hex] + if {[string is xdigit -strict $hex]} { + #has no leading 0x + set dec [expr {"0x$hex"}] + } else { + set dec [expr {$hex}] + } + return [char_info_dec $dec {*}$args] + } + proc char_info {char args} { + #Note - on some versions of Tcl -e.g 8.6 use could supply something like \U1f600 (smiley icon) but we receive fffd (replacement special) + #there is no way to detect what the user intended ie we can't distinguish if they actually typed \UFFFD + #we can test if such mapping happens in general - and warn if codepoint is FFFD in the result dict + set returninfo [dict create] + if {[string equal \UFFFD $char] && [string equal \U1F600 \UFFFD]} { + dict set returninfo WARNING "this tcl maps multiple to FFFD" + } + lassign [scan $char %c%s] dec_char remainder + if {[string length $remainder]} { + error "char_info requires a single character" + } + set result [dict merge $returninfo [char_info_dec $dec_char {*}$args]] + } + proc char_info_dec {dec args} { + set dec_char [expr {$dec}] + set defaults [dict create\ + -fields {default}\ + -except {}\ + ] + set known_opts [dict keys $defaults] + #testwidth is so named because it peforms an actual test on the console using ansi escapes - and the name gives a hint that it is a little slow + set known_fields [list all default dec hex desc short testwidth char memberof] ;#maint fields from charinfo 'desc' 'short' + #todo - unicode properties + # tclwhitespace (different to unicode concept of whitespace. review ) + + foreach {k v} $args { + if {![dict exists $defaults $k]} { + error "char_info unrecognised option '$k'. Known options:'$known_opts' known_fields: $known_fields usage: char_info ?-fields {}? ?-except {}?" + } + } + set opts [dict merge $defaults $args] + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_fields [dict get $opts -fields] + set opt_except [dict get $opts -except] + # -- --- --- --- --- --- --- --- --- --- --- --- + set initial_fields [list] + if {"default" in $opt_fields} { + set initial_fields $known_fields + if {"testwidth" ni $opt_fields} { + if {"testwidth" ni $opt_except} { + lappend opt_except testwidth + } + } + if {"char" ni $opt_fields} { + if {"char" ni $opt_except} { + lappend opt_except char + } + } + } elseif {"all" in $opt_fields} { + set initial_fields $known_fields + } else { + foreach f $opt_fields { + if {$f in $known_fields} { + lappend initial_fields $f + } else { + error "char_info unknown field name: '$f' known fields: '$known_fields'" + } + } + } + foreach e $opt_except { + if {$e ni $known_fields} { + error "char_info unknown field name $e in -except. 
known fields: '$known_fields'" + } + } + set fields [list] + foreach f $initial_fields { + if {$f ne "all" && $f ni $opt_except} { + lappend fields $f + } + } + if {![llength $fields]} { + return + } + + variable charinfo + variable charsets + set hex_char [format %04x $dec_char] + set returninfo [dict create] + if {"dec" in $fields} { + dict set returninfo dec $dec_char + } + if {"hex" in $fields} { + dict set returninfo hex $hex_char + } + if {"desc" in $fields} { + if {[dict exists $charinfo $dec_char desc]} { + dict set returninfo desc [dict get $charinfo $dec_char desc] + } else { + dict set returninfo desc "" + } + } + if {"short" in $fields} { + if {[dict exists $charinfo $dec_char short]} { + dict set returninfo desc [dict get $charinfo $dec_char short] + } else { + dict set returninfo short "" + } + } + + #todo - expectedwidth - lookup the printing width it is *supposed* to have from unicode tables + + #testwidth is one of the main ones likely to be worthwhile excluding as querying the console via ansi takes time + if {"testwidth" in $fields} { + set existing_testwidth "" + if {[dict exists $charinfo $dec_char testwidth]} { + set existing_testwidth [dict get $charinfo $dec_char testwidth] + } + if {$existing_testwidth eq ""} { + #no cached data - do expensive cursor-position test (Note this might not be 100% reliable - terminals lie e.g if ansi escape sequence has set to wide printing.) + set char [format %c $dec_char] + set chwidth [char_info_testwidth $char] + + dict set returninfo testwidth $chwidth + #cache it. todo - -verify flag to force recalc in case font/terminal changed in some way? + dict set charinfo $dec_char testwidth $chwidth + } else { + dict set returninfo testwidth $existing_testwidth + } + } + if {"char" in $fields} { + set char [format %c $dec_char] + dict set returninfo char $char + } + + #memberof takes in the order of a few hundred microseconds if a simple scan of all ranges is taken - possibly worthwhile caching/optimising + #note that memberof is not just unicode blocks - but scripts and other non-contiguous sets consisting of multiple ranges - some of which may include ranges of a single character. (e.g WGL4) + #This means there probably isn't a really efficient means of calculating membership other than scanning all the defined ranges. + #We could potentially populate it using a side-thread - but it seems reasonable to just cache result after first use here. + #some efficiency could also be gained by pre-calculating the extents for each charset which isn't a simple unicode block. 
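+        #Illustrative field selection (values grounded in the tables defined in this file):
+        #  char_info_dec 9472 -fields {hex desc}                     ;# -> hex 2500 desc {Box Drawings Light Horizontal}
+        #  char_info_dec 9472 -fields {default} -except {memberof}   ;# default fields minus the slower memberof scan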
(and perhaps sorting by max char) + if {"memberof" in $fields} { + set memberof [list] + dict for {setname setinfo} $charsets { + foreach r [dict get $setinfo ranges] { + set s [dict get $r start] + set e [dict get $r end] + if {$dec_char >= $s && $dec_char <= $e} { + lappend memberof $setname + break + } + } + } + dict set returninfo memberof $memberof + } + + return $returninfo + } + + proc _char_info_dec_memberof_scan {dec} { + variable charsets + set memberof [list] + dict for {setname setinfo} $charsets { + foreach r [dict get $setinfo ranges] { + set s [dict get $r start] + set e [dict get $r end] + if {$dec >= $s && $dec <= $e} { + lappend memberof $setname + break + } + } + } + return $memberof + } + proc range_split_info {dec} { + variable charset_extents_startpoints + variable charset_extents_endpoints + set skeys [dict keys $charset_extents_startpoints] + set ekeys [dict keys $charset_extents_endpoints] + set splen [dict size $charset_extents_startpoints] + set eplen [dict size $charset_extents_endpoints] + set s [lsearch -bisect -integer $skeys $dec] + set s_at_or_below [lrange $skeys 0 $s] + set e_of_s [list] + foreach sk $s_at_or_below { + lappend e_of_s {*}[dict get $charset_extents_startpoints $sk] + } + set e_of_s [lsort -integer $e_of_s] + set splitposn [lsearch -bisect -integer $e_of_s $dec] + if {[lindex $e_of_s $splitposn] < $dec} {incr splitposn} + #set lhs_endpoints_to_check [expr {[llength $e_of_s] - $splitposn}] + set reduced_endpoints [lrange $e_of_s $splitposn end] + set sps [list] + foreach ep $reduced_endpoints { + lappend sps {*}[dict get $charset_extents_endpoints $ep] + } + + + + set e [lsearch -bisect -integer $ekeys $dec] + if {$e >= 0} { + set e_at_or_above [lrange $ekeys $e end] + set s_of_e [list] + foreach ek $e_at_or_above { + lappend s_of_e {*}[dict get $charset_extents_endpoints $ek] + } + set startpoints_of_above [llength $s_of_e] + set splitposn [lsearch -bisect -integer $s_of_e $dec] + set reduced_startpoints [lrange $s_of_e 0 $splitposn] + set eps [list] + foreach sp $reduced_startpoints { + lappend eps {*}[dict get $charset_extents_startpoints $sp] + } + } else { + set s_of_e [list] + set reduced_startpoints [list] + set eps [list] + + } + + return [dict create startpoints $splen endpoints $eplen midpoint [expr {floor($eplen/2)}] posn $e lhs_endpoints_to_check "[llength $reduced_endpoints]/[llength $e_of_s]=[llength $sps]ranges" rhs_startpoints_to_check "[llength $reduced_startpoints]/[llength $s_of_e]=[llength $eps]"] + } + #for just a few extra sets such as wgl4 and unicode blocks loaded - this gives e.g 5x + better performance than the simple search above, and up to twice as slow for tcl 8.6 + #performance biased towards lower numbered characters (which is not too bad in the context of unicode) + #todo - could be tuned to perform better at end by assuming a fairly even distribution of ranges - and so searching startpoint ranges first for items left of middle, endpoint ranges first for items right of middle + #review with scripts loaded and more defined ranges.. + #This algorithm supports arbitrary overlapping ranges and ranges with same start & endpoints + #Should be much better than O(n) for n sets except for pathological case of member of all or nearly-all intervals ? + #review - compare with 'interval tree' algorithms. 
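+    #A minimal sketch (hypothetical data, for illustration only) of the shape of the extent structures the proc below relies on:
+    #   charset_extents_startpoints  dict: range-start -> list of range-ends    e.g {0 {127} 32 {126 255}}
+    #   charset_extents_endpoints    dict: range-end   -> list of range-starts  e.g {126 {32} 127 {0} 255 {32}}
+    #   charset_extents_rangenames   dict: "start,end" -> range/set name        e.g {0,127 {Basic Latin} 32,126 {setA range1} 32,255 {setB range1}}
+    #With data like that, [char_info_dec_memberof 65] would return the names of every range containing codepoint 65 (all three in this hypothetical example).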
+ proc char_info_dec_memberof {dec} { + variable charset_extents_startpoints + variable charset_extents_endpoints + variable charset_extents_rangenames + if {[package vcompare [info tclversion] 8.7a5] >= 0} { + #algorithm should theoretically be a little better with -stride + set last_smaller_or_equal_startposn [lsearch -stride 2 -bisect -integer $charset_extents_startpoints $dec] + set sets_starting_below [lrange $charset_extents_startpoints 0 $last_smaller_or_equal_startposn+1] ;#+1 to include 2nd element of stridden pair + set endpoints_of_starting_below [lsort -integer [concat {*}[dict values $sets_starting_below]]] + } else { + #no -stride available + set startkeys [dict keys $charset_extents_startpoints] + set last_smaller_or_equal_startkeyposn [lsearch -bisect -integer $startkeys $dec] ;#assert will always return one of the keys if number >=0 supplied (last key if > all) + #set startkey_found [lindex $startkeys $last_smaller_or_equal_startkeyposn] + set start_below_keys [lrange $startkeys 0 $last_smaller_or_equal_startkeyposn] ;#These are the keys of sets which start at or below dec + #puts "start_below_keys: '$start_below_keys'" + set endpoints_of_starting_below [list] + foreach belowkey $start_below_keys { + lappend endpoints_of_starting_below {*}[dict get $charset_extents_startpoints $belowkey] + } + set endpoints_of_starting_below [lsort -integer $endpoints_of_starting_below[unset endpoints_of_starting_below]] + } + + set splitposn [lsearch -bisect -integer $endpoints_of_starting_below $dec] ;#splitposn = last smaller or equal endposn + if {[lindex $endpoints_of_starting_below $splitposn] < $dec} { incr splitposn} + set reduced_opposite_limit [lrange $endpoints_of_starting_below $splitposn end] + ################ + #note each endpoint points to multiple startpoints which may still include some that are not in range. 
(e.g range y can share endpoint with x that starts in-range - but y starts above character ) + # x1 x2 + # y1 y2 + # c + ################ + #we have reduced our set of endpoints sufficiently (to those at or above dec) to run through and test each startpoint + set ranges [list] + foreach ep $reduced_opposite_limit { + foreach s [dict get $charset_extents_endpoints $ep] { + if {$s <= $dec} { + lappend ranges [dict get $charset_extents_rangenames $s,$ep] + } + } + } + return $ranges + + } + + + #with glob searching of description and short + proc char_range_dict {start end args} { + if {![string is integer -strict $start] || ![string is integer -strict $end]} { + error "char_range_dict error start and end must be integers" + } + set and_globs [list] + if {![llength $args]} { + set args [list *] + } + foreach glob $args { + if {![regexp {[*?]} $glob]} { + lappend and_globs "*$glob*" + } else { + lappend and_globs $glob + } + } + variable charinfo + set cdict [dict create] + set start [expr {$start}] ;#force string rep to decimal - otherwise first use of i as string could be hex or other rep whilst other i values will be decimal string rep due to incr + for {set i $start} {$i <= $end} {incr i} { + set hx [format %04x $i] + set ch [format %c $i] + if {[dict exists $charinfo $i desc]} { + set d [dict get $charinfo $i desc] + } else { + set d "" + } + if {[dict exists $charinfo $i short]} { + set s [dict get $charinfo $i short] + } else { + set s "" + } + set matchcount 0 + foreach glob $and_globs { + if {[string match -nocase $glob $s] || [string match -nocase $glob $d]} { + incr matchcount + } + } + if {$matchcount == [llength $and_globs]} { + if {[dict exists $charinfo $i]} { + dict set cdict $hx [dict merge [dict create dec $i hex $hx char $ch] [dict get $charinfo $i]] + } else { + dict set cdict $hx [list dec $i hex $hx char $ch desc $d short $s] + } + } + } + return $cdict + } + #with glob searches of desc and short + proc char_range {start end args} { + package require overtype + if {![string is integer -strict $start] || ![string is integer -strict $end]} { + error "char_range error start and end must be integers" + } + set charset_dict [char_range_dict $start $end {*}$args] + set out "" + set col3 [string repeat " " 12] + dict for {k inf} $charset_dict { + set s [internal::dict_getdef $inf short ""] + set d [internal::dict_getdef $inf desc ""] + set s_col [overtype::left $col3 $s] + append out "$k [dict get $inf dec] [dict get $inf char] $s_col $d" \n + } + return $out + } + + + #non-overlapping unicode blocks + proc char_blocks {name_or_glob} { + error "unicode block searching unimplemented" + #todo - search only charsets that have settype = block + } + + #major named sets such as unicode blocks, scripts, and other sets such as microsoft WGL4 + #case insensitive search - possibly with globs + proc charset_names {{namesearch *}} { + variable charsets + set sortedkeys [lsort -increasing -dictionary [dict keys $charsets]] ;#NOTE must use -dictionary to use -sorted flag below + if {$namesearch eq "*"} { + return $sortedkeys + } + if {[regexp {[?*]} $namesearch]} { + #name glob search + set matched_names [lsearch -all -inline -nocase $sortedkeys $namesearch] ;#no point using -sorted flag when -all is used + } else { + set matched [lsearch -sorted -inline -nocase $sortedkeys $namesearch] ;#no globs - stop on first match + if {[llength $matched]} { + return [list $matched] + } else { + return [list] + } + } + return $matched_names + } + proc charsets {{namesearch *}} { + package require 
textblock + variable charsets + set charset_names [charset_names $namesearch] + set settype_list [list] + foreach setname $charset_names { + lappend settype_list [dict get $charsets $setname settype] + } + + set charset_names [linsert $charset_names 0 "Set Name"] + set settype_list [linsert $settype_list 0 "Set Type"] + + return [textblock::join [list_as_lines -- $charset_names] " " [list_as_lines $settype_list]] + } + proc charset_defget {exactname} { + variable charsets + return [dict get $charsets $exactname] + } + proc charset_defs {charsetname} { + variable charsets + set matches [charset_names $charsetname] + set def_list [list] + foreach setname $matches { + lappend def_list [dict create $setname [dict get $charsets $setname]] + } + return [join $def_list \n] + } + proc charset_dictget {exactname} { + variable charsets + set setinfo [dict get $charsets $exactname] + set ranges [dict get $setinfo ranges] + set charset_dict [dict create] + foreach r $ranges { + set start [dict get $r start] + set end [dict get $r end] + set charset_dict [dict merge $charset_dict [char_range_dict $start $end]] + } + return $charset_dict + } + proc charset_dicts {searchname} { + variable charsets + set matches [charset_names $searchname] + if {![llength $matches]} { + error "No charset found matching name '$searchname' - use 'charset_names' to get list" + } + set dict_list [list] + foreach m $matches { + lappend dict_list [dict create $m [charset_dictget $name]] + } + #return $dict_list + return [join $dict_list \n] + } + proc charset_page {namesearch args} { + _charset_page_search $namesearch $args ;#pass args to descsearch argument + } + proc _charset_page_search {namesearch search_this_and_that args} { + variable charsets + variable charinfo + set matched_names [charset_names $namesearch] + if {![llength $matched_names]} { + error "charset_page no charset matched pattern '$namesearch' - use 'charset_names' to get list" + } + set defaults [dict create\ + -ansi 0\ + -lined 1\ + ] + set opts [dict merge $defaults $args] + # -- --- --- --- + set opt_ansi [dict get $opts -ansi] + set opt_lined [dict get $opts -lined] + # -- --- --- --- + set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} + + if {$opt_ansi} { + set a1 [a BLACK white bold] + set a2 [a] + } else { + set a1 "" + set a2 "" + } + set cols 16 + set prefix " " + append out $prefix + foreach charsetname $matched_names { + if {[llength $search_this_and_that]} { + set setinfo [dict get $charsets $charsetname] + set ranges [dict get $setinfo ranges] + set charset_dict [dict create] + foreach r $ranges { + set start [dict get $r start] + set end [dict get $r end] + set charset_dict [dict merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]] + } + } else { + set charset_dict [charset_dictget $charsetname] + } + if {![dict size $charset_dict]} { + continue + } + set i 1 + append out \n $prefix $charsetname + append out \n + + set marker_line $prefix + set line $prefix + dict for {hex inf} $charset_dict { + set ch [dict get $inf char] + set twidth "" + set dec [expr {"0x$hex"}] + if {[dict exists $charinfo $dec testwidth]} { + set twidth [dict get $charinfo $dec testwidth] + } + if {$twidth eq ""} { + set width [string_width $ch] ;#based on unicode props + } else { + set width $twidth + } + if {$width == 0} { + set marker " " + if {[regexp $re_diacritics $ch]} { + #attempt to combine with space to get 3-wide displayv with diacritic showing at left space + #todo - dualchar 
diacritics? + set displayv " $ch " + } else { + set displayv " " + } + } elseif {$width == 1} { + set marker "_ " + set displayv "${a1}$ch${a2} " + } else { + #presumed 2 + set marker "__ " + set displayv "${a1}$ch${a2} " + } + set hexlen [string length $hex] + append marker_line "[string repeat " " $hexlen] $marker" + append line "$hex $displayv" + if {$i == [dict size $charset_dict] || $i % $cols == 0} { + if {$opt_lined} { + append out $marker_line \n + } + append out $line \n + set marker_line $prefix + set line $prefix + #set out [string range $out 0 end-2] + #append out \n " " + } + incr i + } + } + set out [string trimright $out " "] + return $out + } + + #allows search on both name and an anded list of globs to be applied to description & short + proc charset {namesearch args} { + package require overtype + variable charsets + set matched_names [charset_names $namesearch] + if {![llength $matched_names]} { + error "No charset matched pattern '$namesearch' - use 'charset_names' to get list" + } + set search_this_and_that $args + + set out "" + + foreach charsetname $matched_names { + if {[llength $search_this_and_that]} { + set setinfo [dict get $charsets $charsetname] + set ranges [dict get $setinfo ranges] + set charset_dict [dict create] + foreach r $ranges { + set start [dict get $r start] + set end [dict get $r end] + set charset_dict [dict merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]] + } + } else { + set charset_dict [charset_dictget $charsetname] + } + + set col_items_short [list] + set col_items_desc [list] + dict for {k inf} $charset_dict { + lappend col_items_desc [internal::dict_getdef $inf desc ""] + lappend col_items_short [internal::dict_getdef $inf short ""] + } + if {[llength $col_items_desc]} { + set widest3 [tcl::mathfunc::max {*}[lmap v $col_items_short {string length $v}]] + if {$widest3 == 0} { + set col3 " " + } else { + set col3 [string repeat " " $widest3] + } + dict for {k inf} $charset_dict { + set s [internal::dict_getdef $inf short ""] + set d [internal::dict_getdef $inf desc ""] + set s_col [overtype::left $col3 $s] + append out "$k [dict get $inf char] $s_col $d" \n + } + } + } + + return $out + } + + #use console cursor movements to test and cache the column-width of each char in the set of characters returned by the search criteria + proc charset_calibrate {namesearch args} { + variable charsets + variable charinfo + set matched_names [charset_names $namesearch] + if {![llength $matched_names]} { + error "No charset matched pattern '$namesearch' - use 'charset_names' to get list" + } + set search_this_and_that $args + set charcount 0 + set width_results [dict create] + puts stdout "calibrating using terminal cursor movements.." + foreach charsetname $matched_names { + if {[llength $search_this_and_that]} { + set setinfo [dict get $charsets $charsetname] + set ranges [dict get $setinfo ranges] + set charset_dict [dict create] + foreach r $ranges { + set start [dict get $r start] + set end [dict get $r end] + set charset_dict [dict merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]] + } + } else { + set charset_dict [charset_dictget $charsetname] + } + if {![dict size $charset_dict]} { + continue + } + dict for {hex inf} $charset_dict { + set ch [format %c 0x$hex] + set twidth "" + set dec [expr {"0x$hex"}] + if {[dict exists $charinfo $dec testwidth]} { + set twidth [dict get $charinfo $dec testwidth] + } + if {$twidth eq ""} { + #puts -nonewline stdout "." 
;#this + set width [char_info_testwidth $ch] ;#based on unicode props + dict set charinfo $dec testwidth $width + } else { + set width $twidth + } + dict incr width_results $width + incr charcount + } + } + puts stdout "\ncalibration done - results cached in charinfo dictionary" + return [dict create charcount $charcount widths $width_results] + } + #prerequisites - no ansi escapes - no newlines + #review - what about \r \t \b ? + proc string_width {text} { + #review is detecting \033 enough? what about 8-bit escapes? + if {[string first \n $text] >= 0} { + error "string_width accepts only a single line" + } + if {[string first \033 $text] >= 0} { + error "string_width doesn't accept ansi escape sequences. Use punk::ansi::stripansi first" + } + #todo - check double-width chars in unicode blocks.. try to do reasonably quicky + #short-circuit basic cases + if {![regexp {[\uFF-\U10FFFF]} $text]} { + #control chars? + return [string length $text] + } + + + #todo - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc + #as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence. + + + #- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block} + #- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block} + #- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block} + #- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block} + #- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block} + # + # initial simplistic approach is just to strip these ... todo REVIEW + + set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} + set text [regsub -all $re_diacritics $text ""] + + set re_ascii_fullwidth {[\uFF01-\uFF5e]} + + set doublewidth_char_count 0 + set zerowidth_char_count 0 + #split just to get the standalone character widths - and then scan for other combiners (?) + #review + #set can_regex_high_unicode [string match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525] + #tcl pre 2023-11 - braced high unicode regexes don't work + #fixed in bug-4ed788c618 2023-11 + #set uc_sequences [regexp -all -inline -indices {[\u0100-\U10FFFF]} $text] + set uc_sequences [regexp -all -inline -indices "\[\u0100-\U10FFFF\]" $text] + foreach uc_range $uc_sequences { + set chars [string range $text {*}$uc_range] + foreach c $chars { + if {[regexp $re_ascii_fullwidth $c]} { + incr doublewidth_char_count + } else { + #todo - replace with function that doesn't use console - just unicode data + set width [char_info_testwidth_cached $c] + if {$width == 0} { + incr zerowidth_char_count + } elseif {$width == 2} { + incr doublewidth_char_count + } + } + } + } + #todo - work out what to do with anomalies like grave combiner which print at different lengths on different terminals (fonts?) and for which cursor-query escape sequence lies. + return [expr {[string length $text] + $doublewidth_char_count - $zerowidth_char_count}] + } + + #This shouldn't be called on text containing ansi codes! 
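+    #Illustrative example of the mapping below (not an exhaustive treatment of non-printing chars):
+    #   strip_nonprinting_ascii "a\x07b\x00c\x7f"   ;# -> "abc"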
+ proc strip_nonprinting_ascii {str} { + #review - some single-byte 'control' chars have visual representations e.g ETX as heart + #It is currently used for screen display width calculations + #equivalent for various unicode combining chars etc? + set map [list\ + \007 ""\ + [format %c 0] ""\ + [format %c 0x7f] ""\ + ] + return [string map $map $str] + } + + proc char_width {char} { + error "char_width unimplemented" + } + #return N Na W etc from unicode data + proc char_uc_width_prop {char} { + error "char_uc_width unimplemented" + } + + # -- --- --- --- --- + #will accept a single char or a string - test using console cursor position reporting + proc char_info_testwidth {ch {emit 0}} { + package require punk::console + #uses cursor movement and relies on console to report position.. which it may do properly for single chars - but may misreport for combinations that combine into a single glyph + tailcall punk::console::test_char_width $ch $emit + } + proc char_info_testwidth_cached {char} { + variable charinfo + set dec [scan $char %c] + set twidth "" + if {[dict exists $charinfo $dec testwidth]} { + set twidth [dict get $charinfo $dec testwidth] + } + if {$twidth eq ""} { + set width [char_info_testwidth $char] + dict set charinfo $dec testwidth $width + return $width + } else { + return $twidth + } + } + # -- --- --- --- --- + + +} + + + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Ready +package provide punk::char [namespace eval punk::char { + variable version + set version 0.1.0 +}] +return + +#*** !doctools +#[manpage_end] + + diff --git a/src/bootsupport/modules/punk/console-0.1.0.tm b/src/bootsupport/modules/punk/console-0.1.0.tm new file mode 100644 index 0000000..b6e7e92 --- /dev/null +++ b/src/bootsupport/modules/punk/console-0.1.0.tm @@ -0,0 +1,916 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) 2023 +# +# @@ Meta Begin +# Application punk::console 0.1.0 +# Meta platform tcl +# Meta license +# @@ Meta End + + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +##e.g package require frobz +package require punk::ansi + + +if {"windows" eq $::tcl_platform(platform)} { + #package require zzzload + #zzzload::pkg_require twapi +} + +#see https://learn.microsoft.com/en-us/windows/console/classic-vs-vt +#https://learn.microsoft.com/en-us/windows/console/creating-a-pseudoconsole-session + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval punk::console { + variable has_twapi 0 + + #punk::console namespace - contains *directly* acting functions - some based on ansi escapes from the 'ansi' sub namespace, some on local system calls or executable calls wrapped in the 'local' sub namespace + #directly acting means they write to stdout to cause the console to peform the action, or they perform the action immediately via other means. + #punk::console::ansi contains a subset of punk::ansi, but with emission to stdout as opposed to simply returning the ansi sequence. 
+ #punk::console::local functions are used by punk::console commands when there is no ansi equivalent + #ansi escape sequences are possibly preferable esp if terminal is remote to process running punk::console + # punk::local commands may be more performant in some circumstances where console is directly attached, but it shouldn't be assumed. e.g ansi::titleset outperforms local::titleset on windows with twapi. + + namespace eval ansi { + #ansi escape sequence based terminal/console control functions + namespace export * + } + namespace eval local { + #non-ansi terminal/console control functions + #e.g external utils system API's. + namespace export * + } + + if {"windows" eq $::tcl_platform(platform)} { + proc enableAnsi {} { + #loopavoidancetoken (don't remove) + internal::define_windows_procs + internal::abort_if_loop + tailcall enableAnsi + } + proc enableRaw {{channel stdin}} { + #loopavoidancetoken (don't remove) + internal::define_windows_procs + internal::abort_if_loop + tailcall enableRaw $channel + } + proc disableRaw {{channel stdin}} { + #loopavoidancetoken (don't remove) + internal::define_windows_procs + internal::abort_if_loop + tailcall disableRaw $channel + } + } else { + proc enableAnsi {} { + #todo? + } + proc enableRaw {{channel stdin}} { + set sttycmd [auto_execok stty] + exec {*}$sttycmd raw -echo <@$channel + } + proc disableRaw {{channel stdin}} { + set sttycmd [auto_execok stty] + exec {*}$sttycmd raw echo <@$channel + } + } + + proc enable_mouse {} { + puts -nonewline stdout \x1b\[?1000h + puts -nonewline stdout \x1b\[?1003h + puts -nonewline stdout \x1b\[?1015h + puts -nonewline stdout \x1b\[?1006h + flush stdout + } + proc disable_mouse {} { + puts -nonewline stdout \x1b\[?1000l + puts -nonewline stdout \x1b\[?1003l + puts -nonewline stdout \x1b\[?1015l + puts -nonewline stdout \x1b\[?1006l + flush stdout + } + proc enable_bracketed_paste {} { + puts -nonewline stdout \x1b\[?2004h + } + proc disable_bracketed_paste {} { + puts -nonewline stdout \x1b\[?2004l + } + proc start_application_mode {} { + #need loop to read events? + puts -nonewline stdout \x1b\[?1049h ;#alt screen + enable_mouse + #puts -nonewline stdout \x1b\[?25l ;#hide cursor + puts -nonewline stdout \x1b\[?1003h\n + enable_bracketed_paste + + } + + namespace eval internal { + proc abort_if_loop {{failmsg ""}} { + #puts "il1 [info level 1]" + #puts "thisproc: [lindex [info level 0] 0]" + set would_loop [uplevel 1 {expr {[string match *loopavoidancetoken* [info body [namespace tail [lindex [info level 0] 0]]]]}}] + #puts "would_loop: $would_loop" + if {$would_loop} { + set procname [uplevel 1 {namespace tail [lindex [info level 0] 0]}] + if {$failmsg eq ""} { + set errmsg "[namespace current] Failed to redefine procedure $procname" + } else { + set errmsg $failmsg + } + error $errmsg + } + } + proc define_windows_procs {} { + package require zzzload + set loadstate [zzzload::pkg_require twapi] + if {$loadstate ni [list failed]} { + #review zzzload usage + #puts stdout "=========== console loading twapi =============" + zzzload::pkg_wait twapi + package require twapi ;#should be fast once twapi dll loaded in zzzload thread + set ::punk::console::has_twapi 1 + + #todo - move some of these to the punk::console::local sub-namespace - as they use APIs rather than in-band ANSI to do their work. + #enableAnsi seems like it should be directly under punk::console .. but then it seems inconsistent if other local console-mode setting functions aren't. + #Find a compromise to organise things somewhat sensibly.. 
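+                #The mode changes below use plain bit arithmetic on the flag values listed in the comments within the procs:
+                #   output: oldmode | 5  ensures ENABLE_PROCESSED_OUTPUT (0x1) and ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4) are set
+                #   input:  oldmode | 8  ensures ENABLE_WINDOW_INPUT (0x8) is set
+                #A hedged sketch of setting a single flag explicitly (same twapi calls as used below - verify flag values against the Windows console docs):
+                #   set h    [twapi::get_console_handle stdout]
+                #   set mode [twapi::GetConsoleMode $h]
+                #   twapi::SetConsoleMode $h [expr {$mode | 0x0004}]   ;# just ENABLE_VIRTUAL_TERMINAL_PROCESSING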
+ + proc [namespace parent]::enableAnsi {} { + #output handle modes + #Enable virtual terminal processing (sometimes off in older windows terminals) + #ENABLE_PROCESSED_OUTPUT = 0x0001 + #ENABLE_WRAP_AT_EOL_OUTPUT = 0x0002 + #ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004 + #DISABLE_NEWLINE_AUTO_RETURN = 0x0008 + set h_out [twapi::get_console_handle stdout] + set oldmode_out [twapi::GetConsoleMode $h_out] + set newmode_out [expr {$oldmode_out | 5}] ;#5? + + twapi::SetConsoleMode $h_out $newmode_out + + #input handle modes + #ENABLE_PROCESSED_INPUT 0x0001 + #ENABLE_LINE_INPUT 0x0002 + #ENABLE_ECHO_INPUT 0x0004 + #ENABLE_WINDOW_INPUT 0x0008 (default off when a terminal created) + #ENABLE_MOUSE_INPUT 0x0010 + #ENABLE_INSERT_MODE 0X0020 + #ENABLE_QUICK_EDIT_MODE 0x0040 + #ENABLE_VIRTUAL_TERMINAL_INPUT 0x0200 (default off when a terminal created) (512) + set h_in [twapi::get_console_handle stdin] + set oldmode_in [twapi::GetConsoleMode $h_in] + set newmode_in [expr {$oldmode_in | 8}] + + twapi::SetConsoleMode $h_in $newmode_in + + return [list stdout [list from $oldmode_out to $newmode_out] stdin [list from $oldmode_in to $newmode_in]] + } + proc [namespace parent]::disableAnsi {} { + set h_out [twapi::get_console_handle stdout] + set oldmode_out [twapi::GetConsoleMode $h_out] + set newmode_out [expr {$oldmode_out & ~5}] + twapi::SetConsoleMode $h_out $newmode_out + + + set h_in [twapi::get_console_handle stdin] + set oldmode_in [twapi::GetConsoleMode $h_in] + set newmode_in [expr {$oldmode_in & ~8}] + twapi::SetConsoleMode $h_in $newmode_in + + + return [list stdout [list from $oldmode_out to $newmode_out] stdin [list from $oldmode_in to $newmode_in]] + } + + proc [namespace parent]::enableProcessedInput {} { + set h_in [twapi::get_console_handle stdin] + set oldmode_in [twapi::GetConsoleMode $h_in] + set newmode_in [expr {$oldmode_in | 1}] + twapi::SetConsoleMode $h_in $newmode_in + return [list stdin [list from $oldmode_in to $newmode_in]] + } + proc [namespace parent]::disableProcessedInput {} { + set h_in [twapi::get_console_handle stdin] + set oldmode_in [twapi::GetConsoleMode $h_in] + set newmode_in [expr {$oldmode_in & ~1}] + twapi::SetConsoleMode $h_in $newmode_in + return [list stdin [list from $oldmode_in to $newmode_in]] + } + + + proc [namespace parent]::enableRaw {{channel stdin}} { + #review - change to modify_console_input_mode + set console_handle [twapi::GetStdHandle -10] + set oldmode [twapi::GetConsoleMode $console_handle] + set newmode [expr {$oldmode & ~6}] ;# Turn off the echo and line-editing bits + twapi::SetConsoleMode $console_handle $newmode + return [list stdin [list from $oldmode to $newmode]] + } + proc [namespace parent]::disableRaw {{channel stdin}} { + set console_handle [twapi::GetStdHandle -10] + set oldmode [twapi::GetConsoleMode $console_handle] + set newmode [expr {$oldmode | 6}] ;# Turn on the echo and line-editing bits + twapi::SetConsoleMode $console_handle $newmode + return [list stdin [list from $oldmode to $newmode]] + } + + } else { + if {$loadstate eq "failed"} { + puts stderr "punk::console falling back to stty because twapi load failed" + proc [namespace parent]::enableAnsi {} { + puts stderr "punk::console::enableAnsi todo" + } + proc [namespace parent]::enableRaw {{channel stdin}} { + set sttycmd [auto_execok stty] + exec {*}$sttycmd raw -echo <@$channel + } + proc [namespace parent]::disableRaw {{channel stdin}} { + set sttycmd [auto_execok stty] + exec {*}$sttycmd raw echo <@$channel + } + } + } + } + + proc ansi_response_handler {chan 
accumulatorvar waitvar} { + set status [catch {read $chan 1} bytes] + if { $status != 0 } { + # Error on the channel + fileevent stdin readable {} + puts "error reading $chan: $bytes" + set $waitvar [list error_read status $status bytes $bytes] + } elseif {$bytes ne ""} { + # Successfully read the channel + #puts "got: [string length $bytes]" + upvar $accumulatorvar chunk + append chunk $bytes + if {$bytes eq "R"} { + fileevent stdin readable {} + set $waitvar ok + } + } elseif { [eof $chan] } { + fileevent stdin readable {} + # End of file on the channel + #review + puts "ansi_response_handler end of file" + set $waitvar eof + } elseif { [fblocked $chan] } { + # Read blocked. Just return + } else { + fileevent stdin readable {} + # Something else + puts "ansi_response_handler can't happen" + set $waitvar error_unknown + } + } + } ;#end namespace eval internal + + variable colour_disabled 0 + # https://no-color.org + if {[info exists ::env(NO_COLOR)]} { + if {$::env(NO_COLOR) ne ""} { + set colour_disabled 1 + } + } + + namespace eval ansi { + proc a+ {args} { + puts -nonewline [::punk::ansi::a+ {*}$args] + } + } + proc ansi+ {args} { + variable colour_disabled + if {$colour_disabled == 1} { + return + } + #stdout + tailcall ansi::a+ {*}$args + } + proc get_ansi+ {args} { + variable colour_disabled + if {$colour_disabled == 1} { + return + } + tailcall punk::ansi::a+ {*}$args + } + + namespace eval ansi { + proc a {args} { + puts -nonewline [::punk::ansi::a {*}$args] + } + } + proc ansi {args} { + variable colour_disabled + if {$colour_disabled == 1} { + return + } + #stdout + tailcall ansi::a {*}$args + } + proc get_ansi {args} { + variable colour_disabled + if {$colour_disabled == 1} { + return + } + tailcall punk::ansi::a {*}$args + } + + namespace eval ansi { + proc a? {args} { + puts -nonewline stdout [::punk::ansi::a? {*}$args] + } + } + proc ansi? {args} { + #stdout + tailcall ansi::a? {*}$args + } + proc get_ansi? {args} { + tailcall ::punk::ansi::a? 
{*}$args + } + + proc colour {{onoff {}}} { + variable colour_disabled + if {[string length $onoff]} { + set onoff [string tolower $onoff] + if {$onoff in [list 1 on true yes]} { + interp alias "" a+ "" punk::console::ansi+ + set colour_disabled 0 + } elseif {$onoff in [list 0 off false no]} { + interp alias "" a+ "" control::no-op + set colour_disabled 1 + } else { + error "punk::console::colour expected 0|1|on|off|true|false|yes|no" + } + } + catch {repl::reset_prompt} + return [expr {!$colour_disabled}] + } + + namespace eval ansi { + proc reset {} { + puts -nonewline stdout [punk::ansi::reset] + } + } + namespace import ansi::reset + + namespace eval ansi { + proc clear {} { + puts -nonewline stdout [punk::ansi::clear] + } + proc clear_above {} { + puts -nonewline stdout [punk::ansi::clear_above] + } + proc clear_below {} { + puts -nonewline stdout [punk::ansi::clear_below] + } + proc clear_all {} { + puts -nonewline stdout [punk::ansi::clear_all] + } + } + namespace import ansi::clear + namespace import ansi::clear_above + namespace import ansi::clear_below + namespace import ansi::clear_all + + namespace eval local { + proc set_codepage_output {cpname} { + #todo + if {"windows" eq $::tcl_platform(platform)} { + twapi::set_console_output_codepage $cpname + } else { + error "set_codepage_output unimplemented on $::tcl_platform(platform)" + } + } + proc set_codepage_input {cpname} { + #todo + if {"windows" eq $::tcl_platform(platform)} { + twapi::set_console_input_codepage $cpname + } else { + error "set_codepage_input unimplemented on $::tcl_platform(platform)" + } + } + } + namespace import local::set_codepage_output + namespace import local::set_codepage_input + + + proc get_cursor_pos {} { + set ::punk::console::chunk "" + + set accumulator ::punk::console::chunk + set waitvar ::punk::console::chunkdone + set existing_handler [fileevent stdin readable] + set $waitvar "" + #todo - test and save rawstate so we don't disableRaw if terminal was already raw + enableRaw + fconfigure stdin -blocking 0 + fileevent stdin readable [list ::punk::console::internal::ansi_response_handler stdin $accumulator $waitvar] + puts -nonewline stdout \033\[6n ;flush stdout + after 0 {update idletasks} + #e.g \033\[46;1R + #todo - reset + set info "" + if {[set $waitvar] eq ""} { + vwait $waitvar + } + disableRaw + if {[string length $existing_handler]} { + fileevent stdin readable $existing_handler + } + + set info [set $accumulator] + #set punk::console::chunk "" + set data [string range $info 2 end-1] + return $data + } + proc get_cursor_pos_list {} { + return [split [get_cursor_pos] ";"] + } + + #terminals lie. This should be a reasonable (albeit relatively slow) test of actual width - but some terminals seem to miscalculate. + #todo - a visual interactive test/questionnaire to ask user if things are lining up or if the terminal is telling fibs about cursor position. + #todo - determine if these anomalies are independent of font + #punk::ansi should be able to glean widths from unicode data files - but this may be incomplete - todo - compare with what terminal actually does. 
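+    #Illustrative usage (interactive terminal required - results depend on terminal/font, per the caveats above):
+    #   punk::console::test_char_width "a"       ;# normally 1
+    #   punk::console::test_char_width \uFF21    ;# FULLWIDTH LATIN CAPITAL LETTER A - normally 2 on a conforming terminal
+    #   punk::console::test_char_width a\u0300   ;# 'a' plus combining grave - ideally 1, but some terminals report otherwise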
+ proc test_char_width {char_or_string {emit 0}} { + if {!$emit} { + puts -nonewline stdout \033\[2K\033\[1G ;#2K erase line 1G cursor at col1 + } + lassign [split [punk::console::get_cursor_pos] ";"] _row1 col1 + puts -nonewline stdout $char_or_string + lassign [split [punk::console::get_cursor_pos] ";"] _row2 col2 + if {!$emit} { + puts -nonewline stdout \033\[2K\033\[1G + } + flush stdout;#if we don't flush - a subsequent stderr write could move the cursor to a newline and interfere with our 2K1G erasure and cursor repositioning. + return [expr {$col2 - $col1}] + } + + namespace eval ansi { + proc cursor_on {} { + puts -nonewline stdout [punk::ansi::cursor_on] + } + proc cursor_off {} { + puts -nonewline stdout [punk::ansi::cursor_off] + } + } + namespace import ansi::cursor_on + namespace import ansi::cursor_off + + namespace eval local { + proc titleset {windowtitle} { + if {"windows" eq $::tcl_platform(platform)} { + if {![catch {twapi::set_console_title $windowtitle} result]} { + return $windowtitle + } else { + error "punk::console::titleset failed to set title - try punk::console::ansi::titleset" + } + } else { + error "punk::console::titleset has no local mechanism to set the window title on this platform. try punk::console::ansi::titleset" + } + } + proc titleget {} { + if {"windows" eq $::tcl_platform(platform)} { + if {![catch {twapi::get_console_title} result]} { + return $result + } else { + error "punk::console::titleset failed to set title - ensure twapi is available" + } + } else { + #titleget - https://invisible-island.net/xterm/xterm.faq.html#how2_title + # won't work on all platforms/terminals - but may be worth implementing + error "punk::console::titleget has no local mechanism to get the window title on this platform." + } + } + } + + namespace eval ansi { + proc titleset {windowtitle} { + puts -nonewline stdout [punk::ansi::titleset $windowtitle] + } + } + namespace import ansi::titleset + #no known pure-ansi solution + proc titleget {} { + return [local::titleget] + } + + proc infocmp_test {} { + set cmd1 [auto_execok infocmp] + if {[string length $cmd1]} { + puts stderr "infocmp seems to be available" + return [exec {*}$cmd1] + } else { + puts stderr "infcmp doesn't seem to be present" + set tcmd [auto_execok tput] + if {[string length $tcmd]} { + puts stderr "tput seems to be available. 
Try something like: tput -S - (freebsd)" + } + } + } + + proc test_cursor_pos {} { + enableRaw + puts -nonewline stdout \033\[6n ;flush stdout + fconfigure stdin -blocking 0 + set info [read stdin 20] ;# + after 1 + if {[string first "R" $info] <=0} { + append info [read stdin 20] + } + disableRaw + set data [string range [string trim $info] 2 end-1] + return [split $data ";"] + } + + namespace eval ansi { + proc move {row col} { + puts -nonewline stdout [punk::ansi::move $row $col] + } + proc move_forward {row col} { + puts -nonewline stdout [punk::ansi::move_forward $row $col] + } + proc move_back {row col} { + puts -nonewline stdout [punk::ansi::move_back $row $col] + } + proc move_up {row col} { + puts -nonewline stdout [punk::ansi::move_up $row $col] + } + proc move_down {row col} { + puts -nonewline stdout [punk::ansi::move_down $row $col] + } + proc move_emit {row col data args} { + puts -nonewline stdout [punk::ansi::move_emit $row $col $data {*}$args] + } + proc move_emit_return {row col data args} { + lassign [punk::console::get_cursor_pos_list] orig_row orig_col + set out "" + append out [punk::ansi::move_emit $row $col $data {*}$args] + if {!$is_in_raw} { + incr orig_row -1 + } + move $orig_row $orig_col + } + } + namespace import ansi::move + namespace import ansi::move_emit + namespace import ansi::move_forward + namespace import ansi::move_back + namespace import ansi::move_up + namespace import ansi::move_down + + proc move_emit_return {row col data args} { + #todo detect if in raw mode or not? + set is_in_raw 0 + lassign [punk::console::get_cursor_pos_list] orig_row orig_col + + move_emit $row $col $data + foreach {row col data} $args { + move_emit $row $col $data + } + + if {!$is_in_raw} { + incr orig_row -1 + } + move $orig_row $orig_col + return "" + } + proc move_call_return {row col script} { + lassign [punk::console::get_cursor_pos_list] orig_row orig_col + move $row $col + uplevel 1 $script + move $orig_row $orig_col + } + + #this doesn't work - we would need an internal virtual screen structure to pick up cursor attributes from arbitrary locations + # ncurses and its ilk may have something like that - but we specifically want to avoid curses libraries + proc pick {row col} { + lassign [punk::console::get_cursor_pos_list] orig_row orig_col + set test "" + #set test [a green Yellow] + move_emit $row $col $test\0337 + puts -nonewline \0338\033\[${orig_row}\;${orig_col}H + } + proc pick_emit {row col data} { + set test "" + #set test [a green Purple] + lassign [punk::console::get_cursor_pos_list] orig_row orig_col + move_emit $row $col $test\0337 + puts -nonewline \0338\033\[${orig_row}\;${orig_col}H$data + } + + # -- --- --- --- --- --- + namespace eval ansi { + proc test_decaln {} { + puts -nonewline stdout [punk::ansi::test_decaln] + } + } + namespace import ansi::test_decaln + + namespace eval clock { + + #map chars of chars "0" to "?"" ie 0x30 to x3f + variable fontmap1 { + 7C CE DE F6 E6 C6 7C 00 + 30 70 30 30 30 30 FC 00 + 78 CC 0C 38 60 CC FC 00 + 78 CC 0C 38 0C CC 78 00 + 1C 3C 6C CC FE 0C 1E 00 + FC C0 F8 0C 0C CC 78 00 + 38 60 C0 F8 CC CC 78 00 + FC CC 0C 18 30 30 30 00 + 78 CC CC 78 CC CC 78 00 + 78 CC CC 7C 0C 18 70 00 + 00 18 18 00 00 18 18 00 + 00 18 18 00 00 18 18 30 + 18 30 60 C0 60 30 18 00 + 00 00 7E 00 7E 00 00 00 + 60 30 18 0C 18 30 60 00 + 3C 66 0C 18 18 00 18 00 + } + #libungif extras + append fontmap1 { + 7c 82 9a aa aa 9e 7c 00 + 38 6c c6 c6 fe c6 c6 00 + fc c6 c6 fc c6 c6 fc 00 + } + + 
#https://github.com/Distrotech/libungif/blob/master/lib/gif_font.c + variable fontmap { + } + #ascii row 0x00 to 0x1F control chars + #(cp437 glyphs) + append fontmap { + 00 00 00 00 00 00 00 00 + 3c 42 a5 81 bd 42 3c 00 + 3c 7e db ff c3 7e 3c 00 + 00 ee fe fe 7c 38 10 00 + 10 38 7c fe 7c 38 10 00 + 00 3c 18 ff ff 08 18 00 + 10 38 7c fe fe 10 38 00 + 00 00 18 3c 18 00 00 00 + ff ff e7 c3 e7 ff ff ff + 00 3c 42 81 81 42 3c 00 + ff c3 bd 7e 7e bd c3 ff + 1f 07 0d 7c c6 c6 7c 00 + 00 7e c3 c3 7e 18 7e 18 + 04 06 07 04 04 fc f8 00 + 0c 0a 0d 0b f9 f9 1f 1f + 00 92 7c 44 c6 7c 92 00 + 00 00 60 78 7e 78 60 00 + 00 00 06 1e 7e 1e 06 00 + 18 7e 18 18 18 18 7e 18 + 66 66 66 66 66 00 66 00 + ff b6 76 36 36 36 36 00 + 7e c1 dc 22 22 1f 83 7e + 00 00 00 7e 7e 00 00 00 + 18 7e 18 18 7e 18 00 ff + 18 7e 18 18 18 18 18 00 + 18 18 18 18 18 7e 18 00 + 00 04 06 ff 06 04 00 00 + 00 20 60 ff 60 20 00 00 + 00 00 00 c0 c0 c0 ff 00 + 00 24 66 ff 66 24 00 00 + 00 00 10 38 7c fe 00 00 + 00 00 00 fe 7c 38 10 00 + } + #chars SP to "/" row 0x20 to 0x2f + append fontmap { + 00 00 00 00 00 00 00 00 + 30 30 30 30 30 00 30 00 + 66 66 00 00 00 00 00 00 + 6c 6c fe 6c fe 6c 6c 00 + 10 7c d2 7c 86 7c 10 00 + f0 96 fc 18 3e 72 de 00 + 30 48 30 78 ce cc 78 00 + 0c 0c 18 00 00 00 00 00 + 10 60 c0 c0 c0 60 10 00 + 10 0c 06 06 06 0c 10 00 + 00 54 38 fe 38 54 00 00 + 00 18 18 7e 18 18 00 00 + 00 00 00 00 00 00 18 70 + 00 00 00 7e 00 00 00 00 + 00 00 00 00 00 00 18 00 + 02 06 0c 18 30 60 c0 00 + } + #chars "0" to "?"" row 0x30 to 0x3f + append fontmap { + 7c c6 c6 c6 c6 c6 7c 00 + 18 38 78 18 18 18 3c 00 + 7c c6 06 0c 30 60 fe 00 + 7c c6 06 3c 06 c6 7c 00 + 0e 1e 36 66 fe 06 06 00 + fe c0 c0 fc 06 06 fc 00 + 7c c6 c0 fc c6 c6 7c 00 + fe 06 0c 18 30 60 60 00 + 7c c6 c6 7c c6 c6 7c 00 + 7c c6 c6 7e 06 c6 7c 00 + 00 30 00 00 00 30 00 00 + 00 30 00 00 00 30 20 00 + 00 1c 30 60 30 1c 00 00 + 00 00 7e 00 7e 00 00 00 + 00 70 18 0c 18 70 00 00 + 7c c6 0c 18 30 00 30 00 + } + #chars "@" to "O" row 0x40 to 0x4f + append fontmap { + 7c 82 9a aa aa 9e 7c 00 + 38 6c c6 c6 fe c6 c6 00 + fc c6 c6 fc c6 c6 fc 00 + 7c c6 c6 c0 c0 c6 7c 00 + f8 cc c6 c6 c6 cc f8 00 + fe c0 c0 fc c0 c0 fe 00 + fe c0 c0 fc c0 c0 c0 00 + 7c c6 c0 ce c6 c6 7e 00 + c6 c6 c6 fe c6 c6 c6 00 + 78 30 30 30 30 30 78 00 + 1e 06 06 06 c6 c6 7c 00 + c6 cc d8 f0 d8 cc c6 00 + c0 c0 c0 c0 c0 c0 fe 00 + c6 ee fe d6 c6 c6 c6 00 + c6 e6 f6 de ce c6 c6 00 + 7c c6 c6 c6 c6 c6 7c 00 + } + #chars "P" to "_" row 0x50 to 0x5f + append fontmap { + fc c6 c6 fc c0 c0 c0 00 + 7c c6 c6 c6 c6 c6 7c 06 + fc c6 c6 fc c6 c6 c6 00 + 78 cc 60 30 18 cc 78 00 + fc 30 30 30 30 30 30 00 + c6 c6 c6 c6 c6 c6 7c 00 + c6 c6 c6 c6 c6 6c 38 00 + c6 c6 c6 d6 fe ee c6 00 + c6 c6 6c 38 6c c6 c6 00 + c3 c3 66 3c 18 18 18 00 + fe 0c 18 30 60 c0 fe 00 + 3c 30 30 30 30 30 3c 00 + c0 60 30 18 0c 06 03 00 + 3c 0c 0c 0c 0c 0c 3c 00 + 00 38 6c c6 00 00 00 00 + 00 00 00 00 00 00 00 ff + } + #chars "`" to "o" row 0x60 to 0x6f + append fontmap { + 30 30 18 00 00 00 00 00 + 00 00 7c 06 7e c6 7e 00 + c0 c0 fc c6 c6 e6 dc 00 + 00 00 7c c6 c0 c0 7e 00 + 06 06 7e c6 c6 ce 76 00 + 00 00 7c c6 fe c0 7e 00 + 1e 30 7c 30 30 30 30 00 + 00 00 7e c6 ce 76 06 7c + c0 c0 fc c6 c6 c6 c6 00 + 18 00 38 18 18 18 3c 00 + 18 00 38 18 18 18 18 f0 + c0 c0 cc d8 f0 d8 cc 00 + 38 18 18 18 18 18 3c 00 + 00 00 cc fe d6 c6 c6 00 + 00 00 fc c6 c6 c6 c6 00 + 00 00 7c c6 c6 c6 7c 00 + } + #chars "p" to DEL row 0x70 to 0x7f + append fontmap { + 00 00 fc c6 c6 e6 dc c0 + 00 00 7e c6 c6 ce 76 06 + 00 00 6e 70 60 60 60 00 + 00 00 7c c0 7c 06 fc 00 + 30 30 
7c 30 30 30 1c 00 + 00 00 c6 c6 c6 c6 7e 00 + 00 00 c6 c6 c6 6c 38 00 + 00 00 c6 c6 d6 fe 6c 00 + 00 00 c6 6c 38 6c c6 00 + 00 00 c6 c6 ce 76 06 7c + 00 00 fc 18 30 60 fc 00 + 0e 18 18 70 18 18 0e 00 + 18 18 18 00 18 18 18 00 + e0 30 30 1c 30 30 e0 00 + 00 00 70 9a 0e 00 00 00 + 00 00 18 3c 66 ff 00 00 + } + + proc bigstr {str row col} { + variable fontmap + #curses attr off reverse + #a noreverse + set reverse 0 + set output "" + set charno 0 + foreach char [split $str {}] { + binary scan $char c f + set index [expr {$f * 8}] + for {set line 0} {$line < 8} {incr line} { + set bitline 0x[lindex $fontmap [expr {$index + $line}]] + binary scan [binary format c $bitline] B8 charline + set cix 0 + foreach c [split $charline {}] { + if {$c} { + append output [punk::ansi::move_emit [expr {$row + $line}] [expr {$col + $charno * 8 + $cix}] "[a reverse] [a noreverse]"] + #curses attr on reverse + #curses move [expr $row + $line] [expr $col + $charno * 8 + $cix] + #curses puts " " + } + incr cix + } + } + incr charno + } + return $output + } + proc display1 {} { + #punk::console::clear + punk::console::move_call_return 20 20 {punk::console::clear_above} + flush stdout + punk::console::move_call_return 0 0 {puts stdout [bigstr [clock format [clock seconds] -format %H:%M:%S] 10 5]} + after 2000 {punk::console::clock::display} + } + proc display {} { + lassign [punk::console::get_cursor_pos_list] orig_row orig_col + punk::console::move 20 20 + punk::console::clear_above + punk::console::move 0 0 + puts -nonewline [bigstr [clock format [clock seconds] -format %H:%M:%S] 10 5] + + punk::console::move $orig_row $orig_col + #after 2000 {punk::console::clock::display} + } + + proc displaystr {str} { + lassign [punk::console::get_cursor_pos_list] orig_row orig_col + punk::console::move 20 20 + punk::console::clear_above + punk::console::move 0 0 + puts -nonewline [bigstr $str 10 5] + + punk::console::move $orig_row $orig_col + } + + + } + + proc test {} { + set high_unicode_length [string length \U00010000] + set can_high_unicode 0 + set can_regex_high_unicode 0 + set can_terminal_report_dingbat_width 0 + set can_terminal_report_diacritic_width 0 + if {$high_unicode_length != 1} { + puts stderr "punk::console WARNING: no modern unicode support in this Tcl version. High unicode values not properly supported. (string length \\U00010000 : $high_unicode_length should be 1)" + } else { + set can_high_unicode 1 + set can_regex_high_unicode [string match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525] + if {!$can_regex_high_unicode} { + puts stderr "punk::console warning: TCL version cannot perform braced regex of high unicode" + } + } + set dingbat_heavy_plus_width [punk::console::test_char_width \U2795] ;#review - may be font dependent. We chose a wide dingbat as a glyph that is hopefully commonly renderable - and should display 2 wide. + #This will give a false report that terminal can't report width if the glyph (or replacement glyph) is actually being rendered 1 wide. + #we can't distinguish without user interaction? + if {$dingbat_heavy_plus_width == 2} { + set can_terminal_report_dingbat_width 1 + } else { + puts stderr "punk::console warning: terminal either not displaying wide unicode as wide, or unable to report width properly." + } + set diacritic_width [punk::console::test_char_width a\u0300] + if {$diacritic_width == 1} { + set can_terminal_report_diacritic_width 1 + } else { + puts stderr "punk::console warning: terminal unable to report diacritic width properly." 
+ } + + if {$can_high_unicode && $can_regex_high_unicode && $can_terminal_report_dingbat_width && $can_terminal_report_diacritic_width} { + set result [list result ok] + } else { + set result [list result error] + } + return $result + } + #run the test and allow warnings to be emitted to stderr on package load. User should know the terminal and/or Tcl version are not optimal for unicode character work + #set testresult [test1] +} + + + + + + + + + + + + + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Ready +package provide punk::console [namespace eval punk::console { + variable version + set version 0.1.0 +}] +return \ No newline at end of file diff --git a/src/bootsupport/modules/punk/fileline-0.1.0.tm b/src/bootsupport/modules/punk/fileline-0.1.0.tm new file mode 100644 index 0000000..c3a7b5b --- /dev/null +++ b/src/bootsupport/modules/punk/fileline-0.1.0.tm @@ -0,0 +1,1710 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) 2024 +# +# @@ Meta Begin +# Application punk::fileline 0.1.0 +# Meta platform tcl +# Meta license BSD +# @@ Meta End + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# doctools header +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[manpage_begin punkshell_module_punk::fileline 0 0.1.0] +#[copyright "2024"] +#[titledesc {file line-handling utilities}] [comment {-- Name section and table of contents description --}] +#[moddesc {punk fileline}] [comment {-- Description at end of page heading --}] +#[require punk::fileline] +#[keywords module text parse file encoding BOM] +#[description] +#[para] - + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section Overview] +#[para]Utilities for in-memory analysis of text file data as both line data and byte/char-counted data whilst preserving the line-endings (even if mixed) +#[para]This is important for certain text files where examining the number of chars/bytes is important +#[para]For example - windows .cmd/.bat files need some byte counting to determine if labels lie on chunk boundaries and need to be moved. +#[para]This chunk-size counting will depend on the character encoding. +#[para]Despite including the word 'file', the library doesn't necessarily deal with reading/writing to the filesystem - +#[para]The raw data can be supplied as a string, or loaded from a file using punk::fileline::get_textinfo -file +#[subsection Concepts] +#[para]A chunk of textfile data (possibly representing a whole file - but usually at least a complete set of lines) is loaded into a punk::fileline::class::textinfo instance at object creation. +#[example_begin] +# package require punk::fileline +# package require fileutil +# set rawdata [lb]fileutil::cat data.txt -translation binary[rb] +# punk::fileline::class::textinfo create obj_data $rawdata +# puts stdout [lb]obj_data linecount[rb] +#[example_end] +#[subsection Notes] +#[para]Line records are referred to by a zero-based index instead of a one-based index as is commonly used when displaying files. +#[para]This is for programming consistency and convenience, and the module user should do their own conversion to one-based indexing for line display or messaging if desired. 
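+#[para]For example, a minimal conversion when reporting a record to a user (illustrative only - recordindex is a hypothetical variable name):
+#[example_begin]
+# # zero-based record index as used by this module -> one-based line number for display
+# set displaynum [lb]expr {$recordindex + 1}[rb]
+#[example_end]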
+#[para]No support for lone carriage-returns being interpreted as line-endings. +#[para]CR line-endings that are intended to be interpreted as such should be mapped to something else before the data is supplied to this module. + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[subsection dependencies] +#[para] packages needed by punk::fileline +#[list_begin itemized] + + package require Tcl 8.6 + package require punk::args + #*** !doctools + #[item] [package {Tcl 8.6}] + #[item] [package {punk::args}] + + + # #package require frobz + # #*** !doctools + # #[item] [package {frobz}] + +#*** !doctools +#[list_end] [comment {- end dependencies list -}] + +#*** !doctools +#[subsection {optional dependencies}] +#[para] packages that add functionality but aren't strictly required +#[list_begin itemized] + + #*** !doctools + #[item] [package {punk::ansi}] + #[para] - recommended for class::textinfo [method chunk_boundary_display] + #[item] [package {punk::char}] + #[para] - recommended for class::textinfo [method chunk_boundary_display] + #[item] [package {overtype}] + #[para] - recommended for class::textinfo [method chunk_boundary_display] + + +#*** !doctools +#[list_end] [comment {- end optional dependencies list -}] + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section API] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# oo::class namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval punk::fileline::class { + namespace export * + #*** !doctools + #[subsection {Namespace punk::fileline::class}] + #[para] class definitions + if {[info commands [namespace current]::textinfo] eq ""} { + #*** !doctools + #[list_begin enumerated] + + # oo::class create interface_sample1 { + # #*** !doctools + # #[enum] CLASS [class interface_sample1] + # #[list_begin definitions] + + # method test {arg1} { + # #*** !doctools + # #[call class::interface_sample1 [method test] [arg arg1]] + # #[para] test method + # puts "test: $arg1" + # } + + # #*** !doctools + # #[list_end] [comment {-- end definitions interface_sample1}] + # } + + + #uses zero based indexing. Caller can add 1 for line numbers + oo::class create [namespace current]::textinfo { + #*** !doctools + #[enum] CLASS [class textinfo] + #[list_begin definitions] + # [para] [emph METHODS] + + variable o_chunk ;#current state + variable o_chunkop_store + variable o_lineop_store + + variable o_chunk_epoch + variable o_line_epoch + variable o_payloadlist + variable o_linemap + variable o_LF_C + variable o_CRLF_C + + + variable o_bom_id + variable o_bom + variable o_bom_map + + #review - for now we expect datachunk to be data without BOM and already encoded appropriately + #fileline::get_textinfo has support for interpreting BOM - but we currently have no way to do that for data not coming from a file + #refactor to allow that code to be called from here? + constructor {datachunk args} { + #*** !doctools + #[call class::textinfo [method constructor] [arg datachunk] [opt {option value...}]] + #[para] Constructor for textinfo object which represents a chunk or all of a file + #[para] datachunk should be passed with the file data including line-endings as-is for full functionality. 
ie use something like: + #[example_begin] + # fconfigure $fd -translation binary + # set chunkdata [lb]read $fd[rb]] + #or + # set chunkdata [lb]fileutil::cat -translation binary[rb] + #[example_end] + #[para] when loading the data + namespace eval [namespace current] { + set nspath [namespace path] + foreach p [list ::punk::fileline ::punk::fileline::ansi] { + if {$p ni $nspath} { + lappend nspath $p + } + } + namespace path $nspath + } + + set o_bom_map [list\ + utf-8 \u00ef\u00bb\u00bf\ + utf-16be \u00fe\u00ff\ + utf-16le \u00ff\u00fe\ + utf-32be \u0000\u0000\u00fe\u00ff\ + utf-32le \u00ff\u00fe\u0000\u0000\ + utf-7 \u002b\u002f\u0076\ + utf-1 \u00f7\u0064\u004c\ + utf-ebcdic \u00dd\u0073\u0066\u0073\ + utf-scsu \u0003\u00fe\u00ff\ + utf-bocu-1 \u00fb\u00ee\u0028\ + utf-gb18030 \u0084\u0031\u0095\u0033\ + ] + set o_bom_id "" + set o_bom "" ;#review + + set o_chunk $datachunk + set o_line_epoch [list] + set o_chunk_epoch [list "fromchunkchange-at-[clock micros]"] + set crlf_lf_placeholders [list \uFFFF \uFFFE] ;#defaults - if already exist in file - error out with message + set defaults [dict create\ + -substitutionmap {}\ + -crlf_lf_placeholders $crlf_lf_placeholders\ + -userid ""\ + ] + set known_opts [dict keys $defaults] + foreach {k v} $args { + if {$k ni $known_opts} { + error "[self] constructor error: unknown option '$k'. Known options: $known_opts" + } + } + set opts [dict merge $defaults $args] + # -- --- --- --- --- --- --- + set opt_substitutionmap [dict get $opts -substitutionmap] ;#review - can be done by caller - or a loadable -policy + set opt_crlf_lf_placeholders [dict get $opts -crlf_lf_placeholders] + set opt_userid [dict get $opts -userid] + # -- --- --- --- --- --- --- + + if {[llength $opt_crlf_lf_placeholders] != 2 || [string length [lindex $opt_crlf_lf_placeholders 0]] !=1 || [string length [lindex $opt_crlf_lf_placeholders 1]] !=1} { + error "textinfo::constructor error: -crlf_lf_placeholders requires a list of exactly 2 chars" + } + lassign $opt_crlf_lf_placeholders o_LF_C o_CRLF_C + if {[string first $o_LF_C $o_chunk] >=0} { + set decval [scan $o_LF_C %c] + if {$decval < 32 || $decval > 127} { + set char_desc "(decimal value $decval)" + } else { + set char_desc "'$o_LF_C' (decimal value $decval)" + } + error "textinfo::constructor error: rawfiledata already contains linefeed substitution character $char_desc specified as first element of -crlf_lf_placeholders" + } + if {[string first $o_CRLF_C $o_chunk] >=0} { + set decval [scan $o_CRLF_C %c] + if {$decval < 32 || $decval > 127} { + set char_desc "(decimal value $decval)" + } else { + set char_desc "'$o_CRLF_C' (decimal value $decval)" + } + error "textinfo::constructor error: rawfiledata already contains carriagereturn-linefeed substitution character $char_desc specified as second element of -crlf_lf_placeholders" + } + if {$o_LF_C eq $o_CRLF_C} { + puts stderr "WARNING: same substitution character used for both elements of -crlf_lf_placeholders - byte counting may be off if file contains mixed line-endings" + } + + my regenerate_lines + + } + + method set_bomid {bomid} { + if {$bomid ni [dict keys $o_bom_map]} { + error "Unrecognised bom-id $bomid. 
Known values: [dict keys $o_bom_map]" + } + set o_bom_id $bomid + set o_bom [dict get $o_bom_map $bomid] + } + method get_bomid {} { + return $o_bom_id + } + method get_bom {} { + return $o_bom + } + + method chunk {chunkstart chunkend} { + #*** !doctools + #[call class::textinfo [method chunk] [arg chunkstart] [arg chunkend]] + #[para]Return a range of bytes from the underlying raw chunk data. + #[para] e.g The following retrieves the entire chunk + #[para] objName chunk 0 end + return [string range $o_chunk $chunkstart $chunkend] + } + method chunklen {} { + #*** !doctools + #[call class::textinfo [method chunklen]] + #[para] Number of bytes/characters in the raw data of the file + return [string length $o_chunk] + } + method chunk_boundary_display {chunkstart chunkend chunksize args} { + #*** !doctools + #[call class::textinfo [method chunk_boundary_display]] + #[para]Returns a string displaying the boundaries at chunksize bytes between chunkstart and chunkend + #[para]Defaults to using ansi colour if punk::ansi module is available. Use -ansi 0 to disable colour + set defaults [dict create\ + -ansi $::punk::fileline::ansi::enabled\ + -offset 0\ + -displaybytes 200\ + -truncatedmark "..."\ + -completemark "---"\ + -moremark " + "\ + -continuemark " > "\ + -linemaxwidth 100\ + -linebase 0\ + -limit -1\ + -boundaries {}\ + -showconfig 0\ + -boundaryheader {Boundary %i% at %b%}\ + ] + set known_opts [dict keys $defaults] + foreach {k v} $args { + if {$k ni $known_opts} { + error "[self]::chunk_boundary error: unknown option '$k'. Known options: $known_opts" + } + } + set opts [dict merge $defaults $args] + # -- --- --- --- --- --- + set opt_ansi [dict get $opts -ansi] + set opt_offset [dict get $opts -offset] + set opt_displaybytes [dict get $opts -displaybytes] + set opt_tmark [dict get $opts -truncatedmark] + set opt_cmark [dict get $opts -completemark] + set opt_linemax [dict get $opts -linemaxwidth] + set opt_linebase [dict get $opts -linebase] + set opt_linebase [string map [list _ ""] $opt_linebase] + set opt_limit [dict get $opts -limit] ;#limit number of boundaries to display + set opt_boundaries [dict get $opts -boundaries] ;#use pre-calculated boundaries if supplied + set opt_showconfig [dict get $opts -showconfig] + set opt_boundaryheader [dict get $opts -boundaryheader] + # -- --- --- --- --- --- + package require overtype + # will require punk::char and punk::ansi + + if {"::punk::fileline::ansi::stripansi" ne [info commands ::punk::fileline::ansi::stripansi]} { + namespace eval ::punk::fileline::ansi { + namespace import ::punk::ansi::* + } + } + + #This mechanism for enabling/disabling ansi is a bit clumsy - prone to errors with regard to keeping in sync with any api changes in punk ansi + #It's done here to allow this to be used without the full set of punk modules and/or shell - REVIEW + + #risk of failing to reset on error + set pre_ansi_enabled $::punk::fileline::ansi::enabled + if {$opt_ansi} { + set ::punk::fileline::ansi::enabled 1 + } else { + set ::punk::fileline::ansi::enabled 0 + } + if {"::punk::fileline::stripansi" ne [info commands ::punk::fileline::stripansi]} { + proc ::punk::fileline::a {args} { + if {$::punk::fileline::ansi::enabled} { + tailcall ::punk::fileline::ansi::a {*}$args + } else { + return "" + } + } + proc ::punk::fileline::a+ {args} { + if {$::punk::fileline::ansi::enabled} { + tailcall ::punk::fileline::ansi::a+ {*}$args + } else { + return "" + } + } + proc ::punk::fileline::stripansi {str} { + if {$::punk::fileline::ansi::enabled} { + 
tailcall ::punk::fileline::ansi::stripansi $str + } else { + return $str + } + } + } + set maxline [lindex [my chunkrange_to_linerange $chunkend $chunkend] 0] + set minline [lindex [my chunkrange_to_linerange $chunkstart $chunkstart] 0] + + #suport simple end+-int (+-)start(+-)int to set linebase to line corresponding to chunkstart or chunkend + #also simple int+int and int-int - nothing more complicated (similar to Tcl lrange etc in that regard) + #commonly this will be something like -start or -end + if {![string is integer -strict $opt_linebase]} { + set sign "" + set errunrecognised "unrecognised -linebase value '$opt_linebase'. Expected positive or negative integer or -start -start-int -start+int -end -end-int -end+int or -eof (where leading - is optional but probably desirable) " + if {[string index $opt_linebase 0] eq "-"} { + set sign - + set tail [string range $opt_linebase 1 end] + } else { + set tail [string trimleft $opt_linebase +];#ignore + + } + if {[string match eof* $tail]} { + set endmath [string range $tail 3 end] + #todo endmath? + if {$tail eq "eof"} { + set lastline [lindex [my chunkrange_to_linerange end end] 0] + set linebase ${sign}$lastline + } else { + error $errunrecognised + } + } elseif {[string match end* $tail]} { + set endmath [string range $tail 3 end] + if {[string length $endmath]} { + set op [string index $endmath 0] + if {$op in {+ -}} { + set operand [string range $endmath 1 end] + if {[string is integer -strict $operand]} { + if {$op eq "+"} { + set linebase [expr {$maxline + $operand}] + } else { + set linebase [expr {$maxline - $operand}] + } + } else { + error $errunrecognised + } + } else { + error $errunrecognised + } + } else { + set linebase $maxline + } + set linebase ${sign}$linebase + } elseif {[string match start* $tail]} { + set endmath [string range $tail 5 end] + if {[string length $endmath]} { + set op [string index $endmath 0] + if {$op in {+ -}} { + set operand [string range $endmath 1 end] + if {[string is integer -strict $operand]} { + if {$op eq "+"} { + set linebase [expr {$minline + $operand}] + } else { + set linebase [expr {$minline - $operand}] + } + } else { + error $errunrecognised + } + } else { + error $errunrecognised + } + } else { + set linebase $minline + } + set linebase ${sign}$linebase + } elseif {[string match *-* $tail]} { + set extras [lassign [split $tail -] int1 int2] + if {[llength $extras]} { + error $errunrecognised + } + if {![string is integer -strict $int1] || ![string is integer -strict $int2]} { + error $errunrecognised + } + set linebase [expr {$int1 - $int2}] + set linebase ${sign}$linebase + } elseif {[string match *+* $tail]} { + set extras [lassign [split $tail +] int1 int2] + if {[llength $extras]} { + error $errunrecognised + } + if {![string is integer -strict $int1] || ![string is integer -strict $int2]} { + error $errunrecognised + } + set linebase [expr {$int1 + $int2}] + set linebase ${sign}$linebase + } else { + error $errunrecognised + } + + } else { + set linebase $opt_linebase + } + + lassign [my numeric_chunkrange $chunkstart $chunkend] chunkstart chunkend + + if {![llength $opt_boundaries]} { + set binfo [lib::range_spans_chunk_boundaries $chunkstart $chunkend $chunksize -offset $opt_offset] + set boundaries [dict get $binfo boundaries] + } else { + set boundaries [list] + foreach b $opt_boundaries { + if {$chunkstart <= $b && $chunkend >= $b} { + lappend boundaries [expr {$b + $opt_offset}] + } + } + } + + + if {![llength $boundaries]} { + return "No boundaries found between 
$chunkstart and $chunkend for chunksize $chunksize (when offset $opt_offset)" + } + if {$opt_showconfig} { + set result "chunk range $chunkstart $chunkend line range $minline $maxline linebase $linebase limit $opt_limit\n" + } else { + set result "" + } + set pre_bytes [expr {$opt_displaybytes /2}] + set post_bytes $pre_bytes + set max_bytes [expr {[my chunklen] -1}] + if {$opt_limit > 0} { + set boundaries [lrange $boundaries[unset boundaries] 0 $opt_limit-1] + } + + set i 0 + foreach b $boundaries { + if {$opt_boundaryheader ne ""} { + set j [expr {$i+1}] + append result [string map [list %b% $b %i% $i %j% $j] $opt_boundaryheader] \n + } + set low [expr {max(($b - $pre_bytes),0)}] + set high [expr {min(($b + $post_bytes),$max_bytes)}] + + set lineinfolist [my chunkrange_to_lineinfolist $low $high -show_truncated 1] + set le_map [list \r\n \r \n ] + set result_list [list] + foreach lineinfo $lineinfolist { + set lineidx [dict get $lineinfo lineindex] + + set linenum [expr {$lineidx + $linebase}] + set s [dict get $lineinfo start] + set e [dict get $lineinfo end] + + set boundarymarker "" + set displayidx "" + set linenum_display $linenum + if {$s <= $b && $e >= $b} { + set idx [expr {$b - $s}] ;#index into whole position in whole line - not so useful if we're viewing a small section of a line + set char [string index [my line $lineidx] $idx] + set char_display [string map [list \r \n ] $char] + if {[dict get $lineinfo is_truncated]} { + set tside [dict get $lineinfo truncatedside] + set truncated [dict get $lineinfo truncated] + set tlen [string length $truncated] + if {"left" in $tside} { + set tleft [dict get $lineinfo truncatedleft] + set tleftlen [string length $tleft] + set displayidx [expr {$idx - $tleftlen}] + } elseif {"right" in $tside} { + set displayidx $idx + } + } else { + set displayidx $idx + } + set boundarymarker "'[a+ green bold]$char_display[a]'@$displayidx" + set linenum_display ${linenum_display},$idx + } + + set lhs_status $opt_cmark ;#default + set rhs_status $opt_cmark ;#default + if {[dict get $lineinfo is_truncated]} { + set line [dict get $lineinfo truncated] + set tside [dict get $lineinfo truncatedside] + if {"left" in $tside && "right" in $tside } { + set lhs_status $opt_tmark + set rhs_status $opt_tmark + } elseif {"left" in $tside} { + set lhs_status $opt_tmark + } elseif {"right" in $tside} { + set rhs_status $opt_tmark + } + + + } else { + set line [my line $lineidx] + } + if {$displayidx ne ""} { + set line [string replace $line $displayidx $displayidx [a+ White green bold]$char_display[a]] + } + set displayline [string map $le_map $line] + lappend result_list [list $linenum_display $boundarymarker $lhs_status $displayline $rhs_status] + } + set title_linenum "LNUM" + set linenums [lsearch -index 0 -all -inline -subindices $result_list *] + set markers [lsearch -index 1 -all -inline -subindices $result_list *] + set lines [lsearch -index 3 -all -inline -subindices $result_list *] + set title_marker "" + set title_line "Line" + #todo - use punk::char for unicode support of wide chars etc? 
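+                    #the column widths below are taken as the max over each column's title and all of its values
+                    #ansi sequences are stripped from the marker column first so that colouring doesn't inflate its measured width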
+ set widest_linenum [tcl::mathfunc::max {*}[lmap v [concat [list $title_linenum] $linenums] {string length $v}]] + set widest_marker [tcl::mathfunc::max {*}[lmap v [concat [list $title_marker] $markers] {string length [stripansi $v]}]] + set widest_status [expr {max([string length $opt_cmark], [string length $opt_tmark])}] + set widest_line [tcl::mathfunc::max {*}[lmap v [concat [list $title_line] $lines] {string length $v}]] + foreach row $result_list { + lassign $row linenum marker lhs_status line rhs_status + append result [format " %-*s " $widest_linenum $linenum] + append result [format " %-*s " $widest_marker $marker] + append result [format " %-*s " $widest_status $lhs_status] + append result [format " %-*s " $widest_line $line] + append result [format " %-*s " $widest_status $rhs_status] \n + } + incr i + } + set ::punk::fileline::ansi::enabled $pre_ansi_enabled + return $result + } + method linecount {} { + #*** !doctools + #[call class::textinfo [method linecount]] + #[para] Number of lines in the raw data of the file, counted as per the policy in effect + return [llength $o_payloadlist] + } + + + method line {lineindex} { + #*** !doctools + #[call class::textinfo [method line] [arg lineindex]] + #[para]Reconstructs and returns the raw line using the payload and per-line stored line-ending metadata + #[para]A 'line' may be returned without a line-ending if the unerlying chunk had trailing data without a line-ending (or the chunk was loaded under a non-standard -policy setting) + #[para]Whilst such data may not conform to definitions (e.g POSIX) of the terms 'textfile' and 'line' - it is useful here to represent it as a line with metadata le set to "none" + #[para]To return just the data which might more commonly be needed for dealing with lines, use the [method linepayload] method - which returns the line data minus line-ending + + lassign [my numeric_linerange $lineindex 0] lineindex + + set le [dict get $o_linemap $lineindex le] + set le_chars [dict get [dict create lf \n crlf \r\n none ""] $le] + return [lindex $o_payloadlist $lineindex]$le_chars + } + method chunk_find_glob {globsearch args} { + #todo - use linepayload_find_glob when -ignore_lineendings is 0 - but check truncations for 1st and last line + error "unimplemented" + } + method linepayload_find_glob {globsearch args} { + #*** !doctools + #[call class::textinfo [method linepayload_find_glob] [arg globsearch] [opt {option value...}]] + #[para]Return a lineinfolist (see [method lineinfo] and [method lineinfolist]) of lines where payload matches the [arg globsearch] string + #[para]To limit the returned results use the -limit n option - where -limit 0 means return all matches. + #[para]For example: [method linepayload_find_glob] "*test*" -limit 1 + #[para]The result is always a list of lineinfo dictionaries even if one item is returned + #[para] -limitfrom can be start|end + #[para]The order of results is always the order as they occur in the data - even if -limitfrom end is specified. + #[para]-limitfrom end means that only the last -limit items are returned + #[para]Note that as glob accepts [lb]chars[rb]] to mean match any character in the set given by chars, searching for literal square brackets should be done by escaping the bracket with a backslash + #[para]This is true even if only a single square bracket is being searched for. e.g {*[lb]file*} will not find the word file followed by a left square-bracket - even though the search didn't close the square brackets. 
+ #[para]In the above case - the literal search should be {*\[lb]file*} + + set defaults [dict create\ + -limit 0\ + -strategy 1\ + -start 0\ + -end end\ + -limitfrom start\ + ] + set known_opts [dict keys $defaults] + dict for {k v} $args { + if {$k ni $known_opts} { + error "linepayload_find_glob unknown option '$k'. Known options: $known_opts" + } + } + set opts [dict merge $defaults $args] + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + set opt_limit [dict get $opts -limit] + if {![string is integer -strict $opt_limit] || $opt_limit < 0} { + error "linepayload_find_glob -limit must be positive integer" + } + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + set opt_strategy [dict get $opts -strategy] + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + set opt_start [dict get $opts -start] + set opt_start [expr {$opt_start}] + if {$opt_start != 0} {error "-start unimplemented"} + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + set opt_end [dict get $opts -end] + set max_line_index [expr {[llength $o_payloadlist]-1}] + if {$opt_end eq "end"} { + set opt_end $max_line_index + } + #TODO + if {$opt_end < $max_line_index} {error "-end less than max_line_index unimplemented"} + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + set opt_limitfrom [dict get $opts -limitfrom] + #-limitfrom start|end only + #TODO + if {$opt_limitfrom ne "start"} {error "-limitfrom unimplemented"} + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + + + set lineinfolist [list] + + if {$opt_limit == 1} { + set idx [lsearch -glob [lrange $o_payloadlist $opt_start $opt_end] $globsearch] + if {$idx >=0} { + set i [expr {$opt_start + $idx}] + lappend lineinfolist [list lineindex $i [dict get $o_linemap $i] payload [lindex $o_payloadlist $i]] + } + } elseif {$opt_limit == 0} { + set indices [lsearch -all -glob [lrange $o_payloadlist $opt_start $opt_end] $globsearch] + foreach irel $indices { + set i [expr {$opt_start + $irel}] + lappend lineinfolist [list lineindex $i [dict get $o_linemap $i] payload [lindex $o_payloadlist $i]] + } + } else { + #todo - auto-strategy based on limit vs number of lines + if {$opt_strategy == 0} { + set posn 0 + for {set r 0} {$r < $opt_limit} {incr r} { + set n [lsearch [lrange $o_payloadlist $posn+$opt_start end] $globsearch] + if {$n >=0} { + set irel [expr {$posn + $n}] + set i [expr {$irel + $opt_start}] + lappend lineinfolist [list lineindex $i {*}[dict get $o_linemap $i] payload [lindex $o_payloadlist $i]] + set posn [expr {$irel+1}] + } + } + } else { + set indices [lsearch -all -glob [lrange $o_payloadlist $opt_start $opt_end] $globsearch] + set limited [lrange $indices 0 $opt_limit-1] + foreach irel $limited { + set i [expr {$opt_start + $irel}] + lappend lineinfolist [list lineindex $i {*}[dict get $o_linemap $i] payload [lindex $o_payloadlist $i]] + } + } + } + return $lineinfolist + } + method linepayload {lineindex} { + #*** !doctools + #[call class::textinfo [method linepayload] [arg lineindex]] + #[para]Return the text of the line indicated by the zero-based lineindex + #[para]The line-ending is not returned in the data - but is still stored against this lineindex + #[para]Line Metadata such as the line-ending for a particular line and the byte/character range it occupies within the chunk can be retrieved with the [method linemeta] method + #[para]To retrieve both the line text and metadata in a single call the [method lineinfo] method can be used 
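+            #[para]e.g (illustrative) if the underlying chunk begins with the text hello followed by a crlf line-ending, objName linepayload 0 returns hello without the crlf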
+ #[para]To retrieve an entire line including line-ending use the [method line] method. + lassign [my numeric_linerange $lineindex 0] lineindex + return [lindex $o_payloadlist $lineindex] + } + method linepayloads {startindex endindex} { + #*** !doctools + #[call class::textinfo [method linepayloads] [arg startindex] [arg endindex]] + #[para]Return a list of just the payloads in the specified linindex range, with no metadata. + return [lrange $o_payloadlist $startindex $endindex] + } + method linemeta {lineindex} { + #*** !doctools + #[call class::textinfo [method linemeta] [arg lineindex]] + #[para]Return a dict of the metadata for the line indicated by the zero-based lineindex + #[para]Keys returned include + #[list_begin itemized] + #[item] le + #[para] A string representing the type of line-ending: crlf|lf|none + #[item] linelen + #[para] The number of characters/bytes in the whole line including line-ending if any + #[item] payloadlen + #[para] The number of character/bytes in the line excluding line-ending + #[item] start + #[para] The zero-based index into the associated raw file data indicating at which byte/character index this line begins + #[item] end + #[para] The zero-based index into the associated raw file data indicating at which byte/character index this line ends + #[para] This end-point corresponds to the last character of the line-ending if any - not necessarily the last character of the line's payload + #[list_end] + lassign [my numeric_linerange $lineindex 0] lineindex + dict get $o_linemap $lineindex + } + method lineinfo {lineindex} { + #*** !doctools + #[call class::textinfo [method lineinfo] [arg lineindex]] + #[para]Return a dict of the metadata and text for the line indicated by the zero-based lineindex + #[para]This returns the same info as the [method linemeta] with an added key of 'payload' which is the text of the line without line-ending. + #[para]The 'payload' value is the same as is returned from the [method linepayload] method. 
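+            #[para]e.g (illustrative) if the first line of the chunk is the text hello terminated by a crlf, lineinfo 0 returns a dict of the form:
+            #[example_begin]
+            # lineindex 0 le crlf linelen 7 payloadlen 5 start 0 end 6 payload hello
+            #[example_end]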
+ lassign [my numeric_linerange $lineindex 0] lineindex ;#convert lineindex to canonical number e.g 1_000 -> 1000 end -> highest index + return [dict create lineindex $lineindex {*}[dict get $o_linemap $lineindex] payload [lindex $o_payloadlist $lineindex]] + } + method lineinfolist {startidx endidx} { + #*** !doctools + #[call class::textinfo [method lineinfolist] [arg startidx] [arg endidx]] + #[para]Returns list of lineinfo dicts for each line in line index range startidx to endidx + lassign [my numeric_linerange $startidx $endidx] startidx endidx + set chunkstart [dict get $o_linemap $startidx start] + set chunkend [dict get $o_linemap $endidx end] + set line_list [my chunkrange_to_lineinfolist $chunkstart $chunkend] ;# assert - no need to view truncations as we've picked start and end of complete lines + #verify sanity + set l_start [lindex $line_list 0] + if {[set idx_start [dict get $l_start lineindex]] ne $startidx} { + error "lineinfolist first lineindex $idx_start doesn't match startidx $startidx" + } + set l_end [lindex $line_list end] + if {[set idx_end [dict get $l_end lineindex]] ne $endidx} { + error "lineinfolist last lineindex $idx_end doesn't match endidx $endidx" + } + return $line_list + } + + method linerange_to_chunkrange {startidx endidx} { + #*** !doctools + #[call class::textinfo [method linerange_to_chunkrange] [arg startidx] [arg endidx]] + + lassign [my numeric_linerange $startidx $endidx] startidx endidx + #inclusive range + return [list [dict get $o_linemap $startidx start] [dict get $o_linemap $endidx end]] + } + method linerange_to_chunk {startidx endidx} { + #*** !doctools + #[call class::textinfo [method linerange_to_chunk] [arg startidx] [arg endidx]] + set chunkrange [my linerange_to_chunkrange $startidx $endidx] + return [string range $o_chunk [lindex $chunkrange 0] [lindex $chunkrange 1]] + } + method lines {startidx endidx} { + #*** !doctools + #[call class::textinfo [method lines] [arg startidx] [arg endidx]] + lassign [my numeric_linerange $startidx $endidx] startidx endidx + set linelist [list] + set le_map [dict create lf \n crlf \r\n none ""] + for {set i $startidx} {$i <= $endidx} {incr i} { + lappend linelist "[lindex $o_payloadlist $i][dict get $le_map [dict get $o_linemap $i le]]" + } + return $linelist + } + method linepayloads {startidx endidx} { + #*** !doctools + #[call class::textinfo [method linepayloads] [arg startidx] [arg endidx]] + return [lrange $o_payloadlist $startidx $endidx] + } + method chunkrange_to_linerange {chunkstart chunkend} { + #*** !doctools + #[call class::textinfo [method chunkrange_to_linerange] [arg chunkstart] [arg chunkend]] + lassign [my numeric_chunkrange $chunkstart $chunkend] chunkstart chunkend + + set linestart -1 + for {set i 0} {$i < [llength $o_payloadlist]} {incr i} { + if {($chunkstart >= [dict get $o_linemap $i start]) && ($chunkstart <= [dict get $o_linemap $i end])} { + set linestart $i + break + } + } + if {$linestart == -1} { + error "Line with range in chunk spanning start index $chunkstart not found" + } + set lineend -1 + for {set i [expr {[llength $o_payloadlist] -1}]} {$i >=0} {incr i -1} { + if {($chunkend >= [dict get $o_linemap $i start]) && ($chunkend <= [dict get $o_linemap $i end])} { + set lineend $i + break + } + } + if {$lineend == -1} { + error "Line with range spanning end index $chunkend not found" + } + return [list $linestart $lineend] + } + method chunkrange_to_lineinfolist {chunkstart chunkend args} { + #*** !doctools + #[call class::textinfo [method 
chunkrange_to_lineinfolist] [arg chunkstart] [arg chunkend] [opt {option value...}]] + #[para]Return a list of dicts each with structure like the result of the [method lineinfo] method - but possibly with extra keys for truncation information if -show_truncated 1 is supplied + #[para]The truncation key in a lineinfo dict may be returned for first and/or last line in the resulting list. + #[para]truncation shows the shortened (missing bytes on left and/or right side) part of the entire line (potentially including line-ending or even partial line-ending) + #[para]Note that this truncation info is only in the return value of this method - and will not be reflected in [method lineinfo] queries to the main chunk. + + lassign [my numeric_chunkrange $chunkstart $chunkend] chunkstart chunkend + set defaults [dict create\ + -show_truncated 0\ + ] + set known_opts [dict keys $defaults] + foreach {k v} $args { + if {$k ni $known_opts} { + error "chunkrange_to_lines error: unknown option '$k'. Known options: $known_opts" + } + } + set opts [dict merge $defaults $args] + # -- --- --- --- --- --- --- --- + set opt_show_truncated [dict get $opts -show_truncated] + # -- --- --- --- --- --- --- --- + + set infolist [list] + set linerange [my chunkrange_to_linerange $chunkstart $chunkend] + lassign $linerange start_lineindex end_lineindex + + #if -show_truncated + #return extra keys for first and last items (which may be the same item if chunkrange is entirely within a line) + #add is_truncated 0|1 to all lines + #Even if the start/end line is not fully within the chunkrange ie truncated - the 'payload' key will contain the original untruncated data + ########################### + # first line may have payload tail truncated - or just linefeed, or even a split linefeed + ########################### + set first [dict create lineindex $start_lineindex {*}[dict get $o_linemap $start_lineindex] payload [lindex $o_payloadlist $start_lineindex]] + set start_info [dict get $o_linemap $start_lineindex] + + + if {$chunkstart > [dict get $start_info start]} { + dict set first is_truncated 1 + dict set first truncatedside [list left] ;#truncatedside is a list which may have 'right' added if last line is same as first line + } else { + dict set first is_truncated 0 + } + + if {$opt_show_truncated} { + #line1 + if {$chunkstart > [dict get $start_info start]} { + #there is lhs truncation + set payload [lindex $o_payloadlist $start_lineindex] + set line_start [dict get $start_info start] + set le_chars [dict get [dict create lf \n crlf \r\n none ""] [dict get $start_info le]] + set payload_and_le "${payload}${le_chars}" + set split [expr {$chunkstart - $line_start}] + set truncated [string range $payload_and_le $split end] + set lhs [string range $payload_and_le 0 $split-1] + + dict set first truncated $truncated + dict set first truncatedleft $lhs + } + } + ########################### + + ########################### + # middle lines if any - no truncation + ########################### + #difference in indexes of 1 would only mean 2 items to return + set middle_list [list] + if {($end_lineindex - $start_lineindex) > 1} { + for {set i [expr {$start_lineindex +1}]} {$i <= [expr {$end_lineindex -1}] } {incr i} { + #lineindex is key into main list + lappend middle_list [dict create lineindex $i {*}[dict get $o_linemap $i] payload [lindex $o_payloadlist $i] is_truncated 0] + } + } + ########################### + + ########################### + # tail line may have beginning or all of payload truncated - linefeed may be split 
if crlf + # may be same line as first line - in which case truncation at beginning as well + if {$end_lineindex == $start_lineindex} { + #same record + set end_info $start_info + + + if {$chunkend < [dict get $end_info end]} { + #there is rhs truncation + if {[dict get $first is_truncated]} { + dict set first truncatedside [list left right] + } else { + dict set first is_truncated 1 + dict set first truncatedside [list right] + } + } + + if {$opt_show_truncated} { + if {$chunkend < [dict get $end_info end]} { + #there is rhs truncation and we need to return the splits + #do rhs truncation - possibly in addition to existing lhs truncation + # ... + set payload [lindex $o_payloadlist $end_lineindex] + set line_start [dict get $end_info start] + set le_chars [dict get [dict create lf \n crlf \r\n none ""] [dict get $end_info le]] + set payload_and_le "${payload}${le_chars}" + set split [expr {$chunkend - $line_start}] + set truncated [string range $payload_and_le 0 $split] + set rhs [string range $payload_and_le $split+1 end] + dict set first truncatedright $rhs + if {"left" ni [dict get $first truncatedside]} { + #rhs truncation only + puts "payload_and_le: $payload_and_le" + puts "LENGTH: [string length $payload_and_le]" + #--- + #--- + dict set first truncated $truncated + dict set first truncatedside [list right] + } else { + #truncated on both sides + set lhslen [string length [dict get $first truncatedleft]] + #re-truncate the truncation to reapply the original lhs truncation + set truncated [string range $truncated $lhslen end] + dict set first truncated $truncated + } + } + } + #no middle or last to append + lappend infolist $first + } else { + set last [dict create lineindex $end_lineindex {*}[dict get $o_linemap $end_lineindex] payload [lindex $o_payloadlist $end_lineindex]] + set end_info [dict get $o_linemap $end_lineindex] + + + if {$chunkend < [dict get $end_info end]} { + dict set last is_truncated 1 + dict set last truncatedside [list right] + } else { + dict set last is_truncated 0 + } + + if {$opt_show_truncated} { + if {$chunkend < [dict get $end_info end]} { + #there is rhs truncation - and last line in range is a different line to first one + set payload [lindex $o_payloadlist $end_lineindex] + set line_start [dict get $end_info start] + set line_end [dict get $end_info end] + set le [dict get $end_info le] + set le_size [dict get {lf 1 crlf 2 none 0} $le] + set le_chars [dict get [dict create lf \n crlf \r\n none ""] $le] + set payload_and_le "${payload}${le_chars}" + + set split [expr {$chunkend - $line_start}] + set truncated [string range $payload_and_le 0 $split] + set rhs [string range $payload_and_le $split+1 end] + + dict set last truncated $truncated + dict set last truncatedright $rhs + #this has the effect that truncating the rhs by 1 can result in truncated being larger than original payload for crlf lines - as payload now sees the cr + #this is a bit unintuitive - but probably best reflects the reality. The truncated value is the truncated 'line' rather than the truncated 'payload' + } + } + + + lappend infolist $first + if {[llength $middle_list]} { + lappend infolist {*}$middle_list + } + lappend infolist $last + } + ########################### + #assert all records have is_truncated key. + #assert if is_truncated == 1 truncatedside should contain a list of either left, right or both left and right + #assert If not opt_show_truncated - then truncated records will not have truncated,truncatedleft,truncatedright keys. 
+ return $infolist + } + + #need to check truncations so that any split \r\n is counted precisely todo + method chunk_le_counts {chunkstart chunkend} { + set infolines [my chunkrange_to_lineinfolist $chunkstart $chunkend -show_truncated 1] + set lf_count 0 + set crlf_count 0 + set none_count 0 + foreach d $infolines { + set le [dict get $d le] + if {$le eq "lf"} { + incr lf_count + } elseif {$le eq "crlf"} { + incr crlf_count + } else { + incr none_count + } + } + #even without split crlf - this can overcount by counting the lf or crlf in a line which had an ending not in the chunk range specified + + #check first and last infoline for truncations + #Also check if the truncation is directly between an crlf + #both an lhs split and an rhs split could land between cr and lf + #to be precise - we should presumably count the part within our chunk as either a none for cr or an lf + #This means a caller counting chunk by chunk using this method will sometimes get the wrong answer depending on where crlfs lie relative to their chosen chunk size + #This is presumably ok - as it should be a well known thing to watch out for. + #If we're only receiving chunk by chunk we can't reliably detect splits vs lone s in the data + #There are surely more efficient ways for a caller to count line-endings in the way that makes sense for them + #but we should makes things as easy as possible for users of this line/chunk structure anyway. + + set first [lindex $infolines 0] + if {[dict get $first is_truncated]} { + #could be the only line - and truncated at one or both ends. + #both a left and a right truncation could split a crlf + + } + set last [lindex $infolines end] + if {[dict get $first lineindex] != [dict get $last lineindex]} { + #only need to process last if it is a different line + #if so - then split can only be left side + + } + + + return [dict create lf $lf_count crlf $crlf_count unterminated $none_count warning line_ending_splits_unimplemented] + } + + #todo - test last line and merge as necessary with first line from new chunk - generate line data only for appended chunk + method append_chunk {rawchunk} { + error "sorry - unimplemented" + } + + method numeric_linerange {startidx endidx} { + #*** !doctools + #[call class::textinfo [method numeric_linerange] [arg startidx] [arg endidx]] + #[para]A helper to return any Tcl-style end end-x values given to startidx or endidx; converted to their specific values based on the current state of the underlying line data + #[para]This is used internally by API functions such as [method line] to enable it to accept more expressive indices + return [my normalize_indices $startidx $endidx [expr {[dict size $o_linemap]-1}]] + } + method numeric_chunkrange {startidx endidx} { + #*** !doctools + #[call class::textinfo [method numeric_chunkrange] [arg startidx] [arg endidx]] + #[para]A helper to return any Tcl-style end end-x entries supplied to startidx or endidx; converted to their specific values based on the current state of the underlying chunk data + return [my normalize_indices $startidx $endidx [expr {[string length $o_chunk]-1}]] + } + method normalize_indices {startidx endidx max} { + #*** !doctools + #[call class::textinfo [method normalize_indices] [arg startidx] [arg endidx] [arg max]] + #[para]A utility to convert some of the of Tcl-style list-index expressions such as end, end-1 etc to valid indices in the range 0 to the supplied max + #[para]Basic addition and subtraction expressions such as 4-1 5+2 are accepted + #[para]startidx higher than endidx 
is allowed + #[para]Unlike Tcl's index expressions - we raise an error if the calculated index is out of bounds 0 to max + set original_startidx $startidx + set original_endidx $endidx + set startidx [string map [list _ ""] $startidx] ;#don't barf on Tcl 8.7+ underscores in numbers - we can't just use expr because it will not handle end-x + set endidx [string map [list _ ""] $endidx] + if {![string is digit -strict "$startidx$endidx"]} { + foreach whichvar [list start end] { + upvar 0 ${whichvar}idx index + if {![string is digit -strict $index]} { + if {"end" eq $index} { + set index $max + } elseif {[string match "*-*" $index]} { + #end-int or int-int - like lrange etc we don't accept arbitrarily complex expressions + lassign [split $index -] A B + if {$A eq "end"} { + set index [expr {$max - $B}] + } else { + set index [expr {$A - $B}] + } + } elseif {[string match "*+*" $index]} { + lassign [split $index +] A B + if {$A eq "end"} { + #review - this will just result in out of bounds error in final test - as desired + #By calculating here - we will see the result in the error message - but it's probably not particularly useful - as we don't really need end+ support at all. + set index [expr {$max + $B}] + } else { + set index [expr {$A + $B}] + } + } else { + #May be something like +2 or -0 which braced expr can hanle + #we would like to avoid unbraced expr here - as we're potentially dealing with ranges that may come from external sources. + if {[catch {expr {$index}} index]} { + #could be end+x - but we don't want out of bounds to be valid + #set it to something that the final bounds expr test can deal with + set index Inf + } + } + } + } + } + #Unlike Tcl lrange,lindex etc - we don't want to support out of bound indices. + #show the supplied index and what it was mapped to in the error message. + if {$startidx < 0 || $startidx > $max} { + error "Bad start index '$original_startidx'. $startidx out of bounds 0 - $max" + } + if {$endidx < 0 || $endidx > $max} { + error "Bad end index '$original_endidx'. $endidx out of bounds 0 - $max (try $max or end)" + } + return [list $startidx $endidx] + } + + method regenerate_lines {args} { + #*** !doctools + #[call class::textinfo [method regenerate_lines]] + #[para]generate a list of lines from the current state of the stored raw data chunk and keep a map of line-endings indexed by lineindex + #[para]This is called automatically by the Constructor during object creation + #[para]It is exposed in the API experimentally - as chunk and line manipulation functions are considered. + #[para]TODO - review whether such manual control will be necessary/desirable + + #we don't store the actual line-endings as characters (for better layout of debug/display of data) - instead we store names lf|crlf|none + + # first split on lf - then crlf. As we've replaced with single substution chars - the order doesn't matter. 
+ set o_payloadlist [list] + set o_linemap [dict create] + set crlf_replace [list \r\n $o_CRLF_C \n $o_LF_C] + set normalised_data [string map $crlf_replace $o_chunk] + + set lf_lines [split $normalised_data $o_LF_C] + + set idx 0 + set lf_count 0 + set crlf_count 0 + set filedata_offset 0 + set i 0 + set imax [expr {[llength $lf_lines]-1}] + foreach lfln $lf_lines { + set crlf_parts [split $lfln $o_CRLF_C] + if {[llength $crlf_parts] <= 1} { + #no crlf + set payloadlen [string length $lfln] + set le_size 1 + set le lf + if {$i == $imax} { + #no more lf segments - and no crlfs + if {$payloadlen > 0} { + #last line in split has chars - therefore there was no trailing line-ending + set le_size 0 + set le none + } else { + #empty space after last line-ending + #not really a line - we get here from splitting on our lf-replacement char + #An editor might display this pseudo-line with a line number - but we won't treat it as one here + break + } + } + lappend o_payloadlist $lfln + set linelen [expr {$payloadlen + $le_size}] + #we include line-ending in byte count for a line. + dict set o_linemap $idx [list le $le linelen $linelen payloadlen $payloadlen start $filedata_offset end [expr {$filedata_offset + $linelen -1}]] + incr filedata_offset $linelen + incr lf_count + incr idx + } else { + foreach crlfpart [lrange $crlf_parts 0 end-1] { + lappend o_payloadlist $crlfpart + set payloadlen [string length $crlfpart] + set linelen [expr {$payloadlen + 2}] + dict set o_linemap $idx [list le crlf linelen $linelen payloadlen $payloadlen start $filedata_offset end [expr {$filedata_offset + $linelen -1}]] + incr filedata_offset $linelen + incr crlf_count + incr idx + } + set lfpart [lindex $crlf_parts end] + set payloadlen [string length $lfpart] + if {$i == $imax} { + #no more lf segments - but we did find crlf in last (or perhaps only) lf line + #last element in our split has no le + if {$payloadlen > 0} { + set le_size 0 + set le none + } else { + #set le_size 2 + #set le crlf + break + } + } else { + #more lf segments to come + set le_size 1 + set le lf + } + + lappend o_payloadlist $lfpart + set linelen [expr {$payloadlen + $le_size}] + dict set o_linemap $idx [list le $le linelen $linelen payloadlen $payloadlen start $filedata_offset end [expr {$filedata_offset + $linelen -1}]] + incr filedata_offset $linelen + incr lf_count + incr idx + } + incr i + #incr filedata_offset ;#move up 1 so start entry for next line is greater than end entry for previous line + } + set le_count [expr {$lf_count + $crlf_count}] + if {$le_count != [llength $o_payloadlist]} { + puts stderr "fileline::class::textinfo warning. 
regenerate_lines lf_count: $lf_count + crlf_count: $crlf_count does not equal length of lines stored: [llength $o_payloadlist]" + } + + } + method regenerate_chunk {} { + #o_payloadlist + #o_linemap + set oldsize [string length $o_chunk] + set newchunk "" + dict for {idx lineinfo} $o_linemap { + set + + } + + return [list newsize [string length $newchunk] oldsize $oldsize] + } + + + #*** !doctools + #[list_end] + } + #*** !doctools + #[list_end] [comment {--- end class enumeration ---}] + } +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# Base namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval punk::fileline { + namespace export * + #variable xyz + + #*** !doctools + #[subsection {Namespace punk::fileline}] + #[para] Core API functions for punk::fileline + #[list_begin definitions] + + proc get_textinfo {args} { + #*** !doctools + #[call get_textinfo [opt {option value...}] [opt datachunk]] + #[para]Returns textinfo object instance representing data in string datachunk or if -file filename supplied - data loaded from a file + #[para]The encoding used is as specified in the -encoding option - or from the Byte Order Mark (bom) at the beginning of the data + #[para]For Tcl 8.6 - encodings such as utf-16le may not be available - so the bytes are swapped appropriately depending on the platform byteOrder and encoding 'unicode' is used. + #[para]encoding defaults to utf-8 if no -encoding specified and no BOM was found + #[para]Specify -encoding binary to perform no encoding conversion + #[para]Whether -encoding was specified or not - by default the BOM characters are not retained in the line-data + #[para]If -includebom 1 is specified - the bom will be retained in the stored chunk and the data for line 1, but will undergo the same encoding transformation as the rest of the data + #[para]The get_bomid method of the returned object will contain an identifier for any BOM encountered. + #[para] e.g utf-8,utf-16be, utf-16le, utf-32be, utf32-le, SCSU, BOCU-1,GB18030, UTF-EBCDIC, utf-1, utf-7 + #[para]If the encoding specified in the BOM isn't recognised by Tcl - the resulting data is likely to remain as the raw bytes (binary translation) + #[para]Currently only utf-8, utf-16* and utf-32* are properly supported even though the other BOMs are detected, reported via get_bomid, and stripped from the data. + #[para]GB18030 falls back to cp936/gbk (unless a gb18030 encoding has been installed). Use -encoding binary if this isn't suitable and you need to do your own processing of the raw data. + + set defaults { + -file -default {} -type existingfile + -translation -default binary + -encoding -default "\uFFFF" + -includebom -default 0 + } + lassign [dict values [punk::args::opts_values $defaults $args -minvalues 0 -maxvalues 1]] opts values + # -- --- --- --- + set opt_file [dict get $opts -file] + set opt_translation [dict get $opts -translation] + set opt_encoding [dict get $opts -encoding] + set opt_includebom [dict get $opts -includebom] + # -- --- --- --- + + if {$opt_file ne ""} { + set filename $opt_file + set fd [open $filename r] + fconfigure $fd -translation binary -encoding $opt_translation;#should use translation binary to get actual line-endings - but we allow caller to override + #Always read encoding in binary - check for bom below and/or apply chosen opt_encoding + set rawchunk [read $fd] + close $fd + if {[llength $values]} { + puts stderr "Ignoring trailing argument [string length [lindex $values 0]] bytes. 
Not used when -file is specified" + } + } else { + set rawchunk [lindex $values 0] + } + set rawlen [string length $rawchunk] + #examine first 4 bytes for possible BOM + #big-endian BOMs + # ----------------------------------- + #EFBBBF - utf-8 reliabletxt + #FEFF - utf-16be reliabletxt + #FFFE - utf-16le reliabletxt + #0000FEFF - utf-32be reliabletxt + #FFFE0000 - utf-32le + #0000FFFE - utf-32be(2143) non-standard! (not supported) + #FEFF0000 - utf-32le(3412) non-standard! (not supported - will detect as utf-16be) + #2B2F76 - utf-7 (not supported) + #F7644C - utf-1 (not supported) + #DD736673 - UTF-EBCDIC (not supported) + #0EFEFF - SCSU (not supported) + #FBEE28 - BOCU-1 Binary Ordered Compression for Unicode (mime-compatible) - (not supported - fall back to utf-8) + #84319533 - GB18030 - Chinese gov standard (fall back to cp936 with warning if no encoding name) + # ----------------------------------- + + set first32 [string range $rawchunk 0 3] + #scan using capital H for big-endian order + set first32_be [binary scan $first32 H* maybe_bom] ;#we use H* instead of H8 for 8 nibbles (4 bytes) - because our first32 may contain less than 4 bytes - in which case we won't match + set bomid "" + set bomenc "" + set is_reliabletxt 0 ;#see http://reliabletxt.com - only utf-8 with bom, utf-16be, utf-16le, utf-32be supported as at 2024 + set startdata 0 + if {[string match "efbbbf*" $maybe_bom]} { + set bomid utf-8 + set bomenc utf-8 + set is_reliabletxt 1 + set startdata 3 + } elseif {$maybe_bom eq "0000feff"} { + set bomid utf-32be + set bomenc utf-32be + set is_reliabletxt 1 + set startdata 4 + } elseif {$maybe_bom eq "fffe0000"} { + #Technically ambiguous - could be utf-16le bom followed by utf-16 null character (2 byte null) + puts stderr "WARNING - ambiguous BOM fffe0000 found. Treating as utf-32le - but could be utf-16le - consider manually setting -encoding or converting data to another encoding." + set bomid utf-32le + set bomenc utf-32le + set startdata 4 + } elseif {[string match "feff*" $maybe_bom]} { + set bomid utf-16be + set bomenc utf-16be + set is_reliabletxt 1 + set startdata 2 + } elseif {[string match "fffe*" $maybe_bom]} { + set bomid utf-16le + set bomenc utf-16le + set is_reliabletxt 1 + set startdata 2 + } elseif {$maybe_bom eq "0efeff"} { + set bomid scsu + set bomenc "binary" + set startdata 3 + } elseif {$maybe_bom eq "fbee28"} { + set bomid bocu-1 + puts stderr "WARNING - bocu-1 BOM FBEE28 found. Not supported - back to binary" + set bomenc "binary" ;# utf-8??? + set startdata 3 + } elseif {$maybe_bom eq "84319533"} { + if {![dict exists [punk::char::page_names_dict gb18030]]} { + puts stderr "WARNING - no direct support for GB18030 (chinese) - falling back to cp936/gbk" + set bomenc cp936 + } else { + set bomenc [dict get [punk::char::page_names_dict gb18030]] ;#review - this may never exist in Tcl or may be named differently - create a handler? + } + set bomid gb18030 + set startdata 4 + } elseif {$maybe_bom eq "f7644c"} { + puts stderr "WARNING utf-1 BOM F7644C found - not supported. Falling back to binary" + set bomid utf-1 + set bomenc binary + set startdata 3 + } elseif {[string match "2b2f76*" $maybe_bom]} { + puts stderr "WARNING utf-7 BOM 2b2f76 found - not supported. Falling back to binary and leaving BOM in data!" 
+ #review - work out how to strip bom - last 2 bits of 4th byte belong to following character + set bomid utf-7 + set bomenc binary + set startdata 0 + } + + #todo - check xml encoding attribute / html content-type + #todo - a separate chardet (https://chardet.readthedocs.io/ ) or mozilla like mechanism that can be manually called to autodetect character encoding + #This should be an explicit operation - not automatially done here unless we provide a flag for it. + + + if {$opt_includebom} { + set startdata 0 + } + + if {$opt_encoding eq "\uFFFF"} { + if {$bomenc ne "" && $bomenc ne "binary"} { + if {[package vcompare [package provide Tcl] 8.7] < 0} { + #tcl 8.6 has unicode encoding but not utf-16le etc + if {$bomenc ni [encoding names]} { + if {$bomenc eq "utf-16le"} { + if {$::tcl_platform(byteOrder) eq "littleEndian"} { + set datachunk [encoding convertfrom unicode [string range $rawchunk $startdata end]] + set encoding_selected unicode + } else { + set datachunk [encoding convertfrom unicode [system::wordswap16 [string range $rawchunk $startdata end]]] + set encoding_selected "unicode (wordswapped 16)" + } + } elseif {$bomenc eq "utf-16be"} { + if {$::tcl_platform(byteOrder) eq "littleEndian"} { + set datachunk [encoding convertfrom unicode [system::wordswap16 [string range $rawchunk $startdata end]]] + set encoding_selected "unicode (wordswapped 16)" + } else { + set datachunk [encoding convertfrom unicode [string range $rawchunk $startdata end]] + set encoding_selected unicode + } + } elseif {$bomenc eq "utf-32le"} { + if {$::tcl_platform(byteOrder) eq "littleEndian"} { + set datachunk [encoding convertfrom unicode [string range $rawchunk $startdata end]] + set encoding_selected unicode + } else { + set datachunk [encoding convertfrom unicode [system::wordswap32 [string range $rawchunk $startdata end]]] + set encoding_selected "unicode (wordswapped 32)" + } + } elseif {$bomenc eq "utf-32be"} { + if {$::tcl_platform(byteOrder) eq "littleEndian"} { + set datachunk [encoding convertfrom unicode [system::wordswap32 [string range $rawchunk $startdata end]]] + set encoding_selected "unicode (wordswapped 32)" + } else { + set datachunk [encoding convertfrom unicode [string range $rawchunk $startdata end]] + set encoding_selected unicode + } + } else { + error "Encoding $bomenc unavailable in this version of Tcl" + } + } else { + set datachunk [encoding convertfrom $bomenc [string range $rawchunk $startdata end]] + set encoding_selected $bomenc + } + } else { + #tcl 8.7 plus has utf-16le etc + set datachunk [encoding convertfrom $bomenc [string range $rawchunk $startdata end]] + set encoding_selected $bomenc + } + } else { + if {$bomenc eq "binary"} { + set datachunk [string range $rawchunk $startdata end] + set encoding_selected binary + } else { + set datachunk [encoding convertfrom utf-8 [string range $rawchunk $startdata end]] + set encoding_selected utf-8 + } + } + } else { + #manually specified encoding overrides bom - but still remove bom-chars REVIEW + #e.g we still want bom info - but specify binary encoding + + if {$opt_encoding eq "binary"} { + set datachunk [string range $rawchunk $startdata end] + } else { + set datachunk [encoding convertfrom $opt_encoding [string range $rawchunk $startdata end]] + } + set encoding_selected $opt_encoding + } + + set textobj [class::textinfo new $datachunk] + if {$bomid ne ""} { + $textobj set_bomid $bomid + } + + + + + set summary "" + append summary "Bytes loaded : $rawlen" \n + append summary "BOM ID : $bomid" \n + append summary "Encoding 
selected : $encoding_selected" \n + append summary "Characters : [$textobj chunklen]" \n + append summary "Lines recognised : [$textobj linecount]" \n + set leinfo [$textobj chunk_le_counts 0 end] + append summary "crlf endings (windows) : [dict get $leinfo crlf]" \n + append summary "lf endings (unix) : [dict get $leinfo lf]" \n + append summary "unterminated lines : [dict get $leinfo unterminated]" \n + puts stdout $summary + return $textobj + } + + proc file_boundary_display {filename startbyte endbyte chunksize args} { + set fd [open $filename r] ;#use default error if file not readable + fconfigure $fd -translation binary + set rawfiledata [read $fd] + close $fd + set textobj [class::textinfo new $rawfiledata] + set result [$textobj chunk_boundary_display $startbyte $endbyte $chunksize {*}$args] + $textobj destroy + return $result + } + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace punk::fileline ---}] +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# Secondary API namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval punk::fileline::lib { + namespace export * + namespace path [namespace parent] + #*** !doctools + #[subsection {Namespace punk::fileline::lib}] + #[para] Secondary functions that are part of the API + #[list_begin definitions] + + + proc range_spans_chunk_boundaries {start end chunksize args} { + #*** !doctools + #[call [fun lib::range_spans_chunk_boundaries] [arg start] [arg end] [arg chunksize]] + #[para]Takes start and end offset, generally representing bytes or character indices, and computes a list of boundaries at multiples of the chunksize that are spanned by the start and end range. + #[list_begin arguments] + # [arg_def integer start] + # [para] zero-based start index of range + # [arg_def integer end] + # [para] zero-based end index of range + # [arg_def integer chunksize] + # [para] Number of bytes/characters in chunk - must be positive and > 0 + #[list_end] + #[para]returns a dict with the keys is_span and boundaries + #[para]is_span 0|1 indicates if the range specified spans a boundary of chunksize + #[para]boundaries contains a list of the spanned boundaries - which are always multiples of the chunksize + #[para]e.g + #[example_begin] + # range_spans_chunk_boundaries 10 1750 512 + # is_span 1 boundaries {512 1024 1536} + #[example_end] + #[para]The -offset option + #[example_begin] + # range_spans_chunk_boundaries 10 1750 512 -offset 2 + # is_span 1 boundaries {514 1026 1538} + #[example_end] + #[para] This function automatically uses lseq (if Tcl >= 8.7) when number of boundaries spanned is approximately greater than 75 + if {[catch {package require Tcl 8.7}]} { + #only one implementation available for older Tcl + tailcall punk::fileline::system::_range_spans_chunk_boundaries_tcl $start $end $chunksize {*}$args + } + if {$chunksize < 1} { + error "range_spans_chunk_boundaries chunksize must be >= 1" + } + + if {(abs($end - $start) / $chunksize) < 75} { + tailcall punk::fileline::system::_range_spans_chunk_boundaries_tcl $start $end $chunksize {*}$args + } else { + tailcall punk::fileline::system::_range_spans_chunk_boundaries_lseq $start $end $chunksize {*}$args + } + } + + proc range_boundaries {start end chunksizes args} { + lassign [punk::get_leading_opts_and_values {\ + -offset 0\ + } $args] _opts opts _vals remainingargs + + + } + + #*** !doctools + #[list_end] [comment {--- end definitions namespace punk::fileline::lib ---}] +} +# ++ +++ +++ +++ +++ +++ 
+++ +++ +++ +++ +++ + + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[section Internal] +namespace eval punk::fileline::system { + #*** !doctools + #[subsection {Namespace punk::fileline::system}] + #[para] Internal functions that are not part of the API + + proc wordswap16 {data} { + #scan in one endianness - format in the other. Whether we scan le/be first doesn't matter as long as we format using the opposite endianness + binary scan $data s* elements ;#scan little endian + return [binary format S* $elements] ;#format big endian + } + proc wordswap32 {data} { + binary scan $data i* elements + return [binary format I* $elements] + } + + proc scan32bit_be {i32} { + if {[binary scan $i32 I x]} { + return $x + } else { + error "couldn't scan $i32" + } + } + + #for 8.7+ using lseq + #much faster when resultant boundary size is large (at least when offset 0) + proc _range_spans_chunk_boundaries_lseq {start end chunksize args} { + if {$chunksize < 1} {error "chunksize must be > 0"} ;#sanitycheck in case called directly + set defaults [dict create\ + -offset 0\ + ] + set known_opts [dict keys $defaults] + foreach {k v} $args { + if {$k ni $known_opts} { + error "unknown option '$k'. Known options: $known_opts" + } + } + set opts [dict merge $defaults $args] + # -- --- --- --- + set opt_offset [dict get $opts -offset] + # -- --- --- --- + + set smod [expr {$start % $chunksize}] + if {$smod != 0} { + set start [expr {$start + ($chunksize - $smod)}] + if {$start > $end} { + return [list is_span 0 boundaries {}] + } + } + set boundaries [lseq $start to $end $chunksize] + #offset can be negative + if {$opt_offset} { + if {$opt_offset + [lindex $boundaries end] > $end || $opt_offset + [lindex $boundaries 0] < $start} { + set overflow 1 + } else { + set overflow 0 + } + set boundaries [lmap v $boundaries[unset boundaries] {expr {$v + $opt_offset}}] + if {$overflow} { + #we don't know how many overflowed.. + set inrange [list] + foreach b $boundaries { + if {$b >= $start && $b <= $end} { + lappend inrange $b + } + } + set boundaries $inrange + } + } + return [list is_span [expr {[llength $boundaries]>0}] boundaries $boundaries] + } + + #faster than lseq for small number of resultant boundaries (~< 75) (which is a common use case) + #gets very slow (comparitively) with large resultsets + proc _range_spans_chunk_boundaries_tcl {start end chunksize args} { + if {$chunksize < 1} {error "chunksize must be > 0"} ;#sanitycheck in case called directly + set defaults [dict create\ + -offset 0\ + ] + set known_opts [dict keys $defaults] + foreach {k v} $args { + if {$k ni $known_opts} { + error "unknown option '$k'. Known options: $known_opts" + } + } + set opts [dict merge $defaults $args] + # -- --- --- --- + set opt_offset [dict get $opts -offset] + # -- --- --- --- + + set is_span 0 + set smod [expr {$start % $chunksize}] + if {$smod != 0} { + set start [expr {$start + ($chunksize - $smod)}] + } + set boundaries [list] + + #we only need to pre-check the result-range for negative offsets - as our main loop stops before end? 
+ if {$opt_offset < 0} { + #set btrack [expr {$start + $opt_offset}] ;#start back one to make sure we catch the first boundary + set btrack $bstart + set boff [expr {$btrack + $opt_offset}] ;#must be growing even if start and offset are negative - as chunksize is at least 1 + while {$boff < $start} { + incr btrack $chunksize + set boff [expr {$btrack + $opt_offset}] + } + set bstart $btrack + } else { + set bstart $start + } + for {set b $bstart} {[set boff [expr {$b + $opt_offset}]] <= $end} {incr b $chunksize} { + lappend boundaries $boff + } + + return [list is_span [expr {[llength $boundaries]>0}] boundaries $boundaries offset $opt_offset] + } + + proc _range_spans_chunk_boundaries_TIMEIT {start end chunksize {repeat 1}} { + puts "main : [time {punk::fileline::lib::range_spans_chunk_boundaries $start $end $chunksize} $repeat]" + puts "tcl : [time {punk::fileline::system::_range_spans_chunk_boundaries_tcl $start $end $chunksize} $repeat]" + if {![catch {package require Tcl 8.7}]} { + puts "lseq : [time {punk::fileline::system::_range_spans_chunk_boundaries_lseq $start $end $chunksize} $repeat]" + } + } +} +namespace eval punk::fileline::ansi { + #*** !doctools + #[subsection {Namespace punk::fileline::ansi}] + #[para]These are ansi functions imported from punk::ansi - or no-ops if that package is unavailable + #[para]See [package punk::ansi] for documentation + #[list_begin definitions] + variable enabled 1 + #*** !doctools + #[call [fun ansi::a]] + #[call [fun ansi::a+]] + #[call [fun ansi::stripansi]] + + #*** !doctools + #[list_end] [comment {--- end definitions namespace punk::fileline::ansi ---}] +} + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Ready +package provide punk::fileline [namespace eval punk::fileline { + variable pkg punk::fileline + variable version + set version 0.1.0 +}] +return + +#*** !doctools +#[manpage_end] + diff --git a/src/bootsupport/modules/punk/lib-0.1.0.tm b/src/bootsupport/modules/punk/lib-0.1.0.tm new file mode 100644 index 0000000..f05e87e --- /dev/null +++ b/src/bootsupport/modules/punk/lib-0.1.0.tm @@ -0,0 +1,619 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use punkshell 'pmix make' or bin/punkmake to update from -buildversion.txt +# module template: punkshell/src/decktemplates/vendor/punk/modules/template_module-0.0.2.tm +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) 2024 +# +# @@ Meta Begin +# Application punk::lib 0.1.0 +# Meta platform tcl +# Meta license BSD +# @@ Meta End + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# doctools header +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[manpage_begin punkshell_module_punk::lib 0 0.1.0] +#[copyright "2024"] +#[titledesc {punk general utility functions}] [comment {-- Name section and table of contents description --}] +#[moddesc {punk library}] [comment {-- Description at end of page heading --}] +#[require punk::lib] +#[keywords module utility lib] +#[description] +#[para]This is a set of utility functions that are commonly used across punk modules or are just considered to be general-purpose functions. 
+#[para]The base set includes string and math functions but has no specific theme + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section Overview] +#[para] overview of punk::lib +#[subsection Concepts] +#[para]The punk::lib modules should have no strong dependencies other than Tcl +#[para]Dependendencies that only affect display or additional functionality may be included - but should fail gracefully if not present, and only when a function is called that uses one of these soft dependencies. +#[para]This requirement for no strong dependencies, means that many utility functions that might otherwise seem worthy of inclusion here are not present. + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[subsection dependencies] +#[para] packages used by punk::lib +#[list_begin itemized] + +package require Tcl 8.6 +#*** !doctools +#[item] [package {Tcl 8.6}] + +# #package require frobz +# #*** !doctools +# #[item] [package {frobz}] + +#*** !doctools +#[list_end] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section API] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# oo::class namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval punk::lib::class { + #*** !doctools + #[subsection {Namespace punk::lib::class}] + #[para] class definitions + if {[info commands [namespace current]::interface_sample1] eq ""} { + #*** !doctools + #[list_begin enumerated] + + # oo::class create interface_sample1 { + # #*** !doctools + # #[enum] CLASS [class interface_sample1] + # #[list_begin definitions] + + # method test {arg1} { + # #*** !doctools + # #[call class::interface_sample1 [method test] [arg arg1]] + # #[para] test method + # puts "test: $arg1" + # } + + # #*** !doctools + # #[list_end] [comment {-- end definitions interface_sample1}] + # } + + #*** !doctools + #[list_end] [comment {--- end class enumeration ---}] + } +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# Base namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval punk::lib { + namespace export * + #variable xyz + + #*** !doctools + #[subsection {Namespace punk::lib}] + #[para] Core API functions for punk::lib + #[list_begin definitions] + + + + #proc sample1 {p1 n args} { + # #*** !doctools + # #[call [fun sample1] [arg p1] [arg n] [opt {option value...}]] + # #[para]Description of sample1 + # #[para] Arguments: + # # [list_begin arguments] + # # [arg_def tring p1] A description of string argument p1. + # # [arg_def integer n] A description of integer argument n. + # # [list_end] + # return "ok" + #} + + proc K {x y} {return $x} + #*** !doctools + #[call [fun K] [arg x] [arg y]] + #[para]The K-combinator function - returns the first argument, x and discards y + #[para]see [uri https://wiki.tcl-lang.org/page/K] + #[para]It is used in cases where command-substitution at the calling-point performs some desired effect. + + proc hex2dec {args} { + #*** !doctools + #[call [fun hex2dec] [opt {option value...}] [arg list_largeHex]] + #[para]Convert a list of (possibly large) unprefixed hex strings to their decimal values + #[para]hex2dec accepts and ignores internal underscores in the same manner as Tcl 8.7+ numbers e.g hex2dec FF_FF returns 65535 + #[para]Leading and trailing underscores are ignored as a matter of implementation convenience - but this shouldn't be relied upon. 
+ #[para]Leading or trailing whitespace in each list member is allowed e.g hex2dec " F" returns 15 + #[para]Internal whitespace e.g "F F" is not permitted - but a completely empty element "" is allowed and will return 0 + + set list_largeHex [lindex $args end] + set argopts [lrange $args 0 end-1] + if {[llength $argopts]%2 !=0} { + error "[namespace current]::hex2dec arguments prior to list_largeHex must be option/value pairs - received '$argopts'" + } + set defaults [dict create\ + -validate 1\ + -empty_as_hex "INVALID set -empty_as_hex to a hex string e.g FF if empty values should be replaced"\ + ] + set known_opts [dict keys $defaults] + set fullopts [dict create] + dict for {k v} $argopts { + dict set fullopts [tcl::prefix match -message "options for hex2dec. Unexpected option" $known_opts $k] $v + } + set opts [dict merge $defaults $fullopts] + # -- --- --- --- + set opt_validate [dict get $opts -validate] + set opt_empty [dict get $opts -empty_as_hex] + # -- --- --- --- + + set list_largeHex [lmap h $list_largeHex[unset list_largeHex] {string map [list _ ""] [string trim $h]}] + if {$opt_validate} { + #Note appended F so that we accept list of empty strings as per the documentation + if {![string is xdigit -strict [join $list_largeHex ""]F ]} { + error "[namespace current]::hex2dec error: non-hex digits encountered after stripping underscores and leading/trailing whitespace for each element\n $list_largeHex" + } + } + if {![string is xdigit -strict [string map [list _ ""] $opt_empty]]} { + #mapping empty string to a value destroys any advantage of -scanonly + #todo - document that -scanonly has 2 restrictions - each element must be valid hex and less than 7 chars long + #set list_largeHex [lmap v $list_largeHex[set list_largeHex {}] {expr {$v eq ""} ? {0} : {[set v]}}] + if {[lsearch $list_largeHex ""] >=0} { + error "[namespace current]::hex2dec error: empty values in list cannot be mapped to non-hex $opt_empty" + } + } else { + set opt_empty [string trim [string map [list _ ""] $opt_empty]] + if {[set first_empty [lsearch $list_largeHex ""]] >= 0} { + #set list_largeHex [lmap v $list_largeHex[set list_largeHex {}] {expr {$v eq ""} ? {$opt_empty} : {$v}}] + set nonempty_head [lrange $list_largeHex 0 $first_empty-1] + set list_largeHex [concat $nonempty_head [lmap v [lrange $list_largeHex $first_empty end] {expr {$v eq ""} ? {$opt_empty} : {$v}}]] + } + } + return [scan $list_largeHex [string repeat %llx [llength $list_largeHex]]] + } + + proc dec2hex {args} { + #*** !doctools + #[call [fun dex2hex] [opt {option value...}] [arg list_decimals]] + #[para]Convert a list of decimal integers to a list of hex values + #[para] -width can be used to make each hex value at least int characters wide, with leading zeroes. + #[para] -case upper|lower determines the case of the hex letters in the output + set list_decimals [lindex $args end] + set argopts [lrange $args 0 end-1] + if {[llength $argopts]%2 !=0} { + error "[namespace current]::dec2hex arguments prior to list_decimals must be option/value pairs - received '$argopts'" + } + set defaults [dict create\ + -width 1\ + -case upper\ + -empty_as_decimal "INVALID set -empty_as_decimal to a number if empty values should be replaced"\ + ] + set known_opts [dict keys $defaults] + set fullopts [dict create] + dict for {k v} $argopts { + dict set fullopts [tcl::prefix match -message "options for [namespace current]::dec2hex. 
Unexpected option" $known_opts $k] $v + } + set opts [dict merge $defaults $fullopts] + # -- --- --- --- + set opt_width [dict get $opts -width] + set opt_case [dict get $opts -case] + set opt_empty [dict get $opts -empty_as_decimal] + # -- --- --- --- + + + set resultlist [list] + if {[string tolower $opt_case] eq "upper"} { + set spec X + } elseif {[string tolower $opt_case] eq "lower"} { + set spec x + } else { + error "[namespace current]::dec2hex unknown value '$opt_case' for -case expected upper|lower" + } + set fmt "%${opt_width}.${opt_width}ll${spec}" + + set list_decimals [lmap d $list_decimals[unset list_decimals] {string map [list _ ""] [string trim $d]}] + if {![string is digit -strict [string map [list _ ""] $opt_empty]]} { + if {[lsearch $list_decimals ""] >=0} { + error "[namespace current]::dec2hex error: empty values in list cannot be mapped to non-decimal $opt_empty" + } + } else { + set opt_empty [string map [list _ ""] $opt_empty] + if {[set first_empty [lsearch $list_decimals ""]] >= 0} { + set nonempty_head [lrange $list_decimals 0 $first_empty-1] + set list_decimals [concat $nonempty_head [lmap v [lrange $list_decimals $first_empty end] {expr {$v eq ""} ? {$opt_empty} : {$v}}]] + } + } + return [format [lrepeat [llength $list_decimals] $fmt] {*}$list_decimals] + } + + proc log2 x "expr {log(\$x)/[expr log(2)]}" + #*** !doctools + #[call [fun log2] [arg x]] + #[para]log base2 of x + #[para]This uses a 'live' proc body - the divisor for the change of base is computed once at definition time + #[para](courtesy of RS [uri https://wiki.tcl-lang.org/page/Additional+math+functions]) + + proc logbase {b x} { + #*** !doctools + #[call [fun logbase] [arg b] [arg x]] + #[para]log base b of x + #[para]This function uses expr's natural log and the change of base division. + #[para]This means for example that we can get results like: logbase 10 1000 = 2.9999999999999996 + #[para]Use expr's log10() function or tcl::mathfunc::log10 for base 10 + expr {log($x)/log($b)} + } + proc factors {x} { + #*** !doctools + #[call [fun factors] [arg x]] + #[para]Return a sorted list of the positive factors of x where x > 0 + #[para]For x = 0 we return only 0 and 1 as technically any number divides zero and there are an infinite number of factors. (including zero itself in this context)* + #[para]This is a simple brute-force implementation that iterates all numbers below the square root of x to check the factors + #[para]Because the implementation is so simple - the performance is very reasonable for numbers below at least a few 10's of millions + #[para]See tcllib math::numtheory::factors for a more complex implementation - which seems to be slower for 'small' numbers + #[para]Comparisons were done with some numbers below 17 digits long + #[para]For seriously big numbers - this simple algorithm would no doubt be outperformed by more complex algorithms. + #[para]The numtheory library stores some data about primes etc with each call - so may become faster when being used on more numbers + #but has the disadvantage of being slower for 'small' numbers and using more memory. + #[para]If the largest factor below x is needed - the greatestOddFactorBelow and GreatestFactorBelow functions are a faster way to get there than computing the whole list, even for small values of x + #[para]* Taking x=0; Notion of x being divisible by integer y being: There exists an integer p such that x = py + #[para] In other mathematical contexts zero may be considered not to divide anything. 
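+        #Worked examples (results per the brute-force loop below):
+        # factors 12 -> {1 2 3 4 6 12}    factors 7 -> {1 7}    factors 0 -> {0 1}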
+ set factors [list 1] + set j 2 + set max [expr {sqrt($x)}] + while {$j <= $max} { + if {($x % $j) == 0} { + lappend factors $j [expr {$x / $j}] + } + incr j + } + lappend factors $x + return [lsort -unique -integer $factors] + } + proc oddFactors {x} { + #*** !doctools + #[call [fun oddFactors] [arg x]] + #[para]Return a list of odd integer factors of x, sorted in ascending order + set j 2 + set max [expr {sqrt($x)}] + set factors [list 1] + while {$j <= $max} { + if {$x % $j == 0} { + set other [expr {$x / $j}] + if {$other % 2 != 0} { + if {$other ni $factors} { + lappend factors $other + } + } + if {$j % 2 != 0} { + if {$j ni $factors} { + lappend factors $j + } + } + } + incr j + } + return [lsort -integer -increasing $factors] + } + proc greatestFactorBelow {x} { + #*** !doctools + #[call [fun greatestFactorBelow] [arg x]] + #[para]Return the largest factor of x excluding itself + #[para]factor functions can be useful for console layout calculations + #[para]See Tcllib math::numtheory for more extensive implementations + if {$x % 2 == 0 || $x == 0} { + return [expr {$x / 2}] + } + set j 3 + set max [expr {sqrt($x)}] + while {$j <= $max} { + if {$x % $j == 0} { + return [expr {$x / $j}] + } + incr j 2 + } + return 1 + } + proc greatestOddFactorBelow {x} { + #*** !doctools + #[call [fun greatestOddFactorBelow] [arg x]] + #[para]Return the largest odd integer factor of x excluding x itself + if {$x %2 == 0} { + return [greatestOddFactor $x] + } + set j 3 + #dumb brute force - time taken to compute is wildly variable on big numbers + #todo - use a (memoized?) generator of primes to reduce the search space + #tcllib math::numtheory has suitable functions - but do we want that dependency here? Testing shows brute-force often faster for small numbers. 
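+        #Worked examples: greatestOddFactorBelow 45 -> 15    greatestOddFactorBelow 12 -> 3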
+ set god 1 + set max [expr {sqrt($x)}] + while { $j <= $max} { + if {$x % $j == 0} { + set other [expr {$x / $j}] + if {$other % 2 == 0} { + set god $j + } else { + set god [expr {$x / $j}] + #lowest j - so other side must be highest + break + } + } + incr j 2 + } + return $god + } + proc greatestOddFactor {x} { + #*** !doctools + #[call [fun greatestOddFactor] [arg x]] + #[para]Return the largest odd integer factor of x + #[para]For an odd value of x - this will always return x + if {$x % 2 != 0 || $x == 0} { + return $x + } + set r [expr {$x / 2}] + while {$r % 2 == 0} { + set r [expr {$r / 2}] + } + return $r + } + proc gcd {n m} { + #*** !doctools + #[call [fun gcd] [arg n] [arg m]] + #[para]Return the greatest common divisor of m and n + #[para]Straight from Lars Hellström's math::numtheory library in Tcllib + #[para]Graphical use: + #[para]An a by b rectangle can be covered with square tiles of side-length c, + #[para]only if c is a common divisor of a and b + + # + # Apply Euclid's good old algorithm + # + if { $n > $m } { + set t $n + set n $m + set m $t + } + + while { $n > 0 } { + set r [expr {$m % $n}] + set m $n + set n $r + } + + return $m + } + proc lcm {n m} { + #*** !doctools + #[call [fun gcd] [arg n] [arg m]] + #[para]Return the lowest common multiple of m and n + #[para]Straight from Lars Hellström's math::numtheory library in Tcllib + #[para] + set gcd [gcd $n $m] + return [expr {$n*$m/$gcd}] + } + proc commonDivisors {x y} { + #*** !doctools + #[call [fun commonDivisors] [arg x] [arg y]] + #[para]Return a list of all the common factors of x and y + #[para](equivalent to factors of their gcd) + return [factors [gcd $x $y]] + } + + #experimental only - there are better/faster ways + proc sieve n { + set primes [list] + if {$n < 2} {return $primes} + set nums [dict create] + for {set i 2} {$i <= $n} {incr i} { + dict set nums $i "" + } + set next 2 + set limit [expr {sqrt($n)}] + while {$next <= $limit} { + for {set i $next} {$i <= $n} {incr i $next} {dict unset nums $i} + lappend primes $next + dict for {next -} $nums break + } + return [concat $primes [dict keys $nums]] + } + proc sieve2 n { + set primes [list] + if {$n < 2} {return $primes} + set nums [dict create] + for {set i 2} {$i <= $n} {incr i} { + dict set nums $i "" + } + set next 2 + set limit [expr {sqrt($n)}] + while {$next <= $limit} { + for {set i $next} {$i <= $n} {incr i $next} {dict unset nums $i} + lappend primes $next + #dict for {next -} $nums break + set next [lindex $nums 0] + } + return [concat $primes [dict keys $nums]] + } + + proc hasglobs {str} { + #*** !doctools + #[call [fun hasglobs] [arg str]] + #[para]Return a boolean indicating whether str contains any of the glob characters: * ? [lb] [rb] + #[para]hasglobs uses append to preserve Tcls internal representation for str - so it should help avoid shimmering in the few cases where this may matter. 
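+        #Worked examples: hasglobs {file*.txt} -> 1    hasglobs {C:/temp/plain.txt} -> 0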
+ regexp {[*?\[\]]} [append obj2 $str {}] ;# int-rep preserving + } + + proc trimzero {number} { + #*** !doctools + #[call [fun trimzero] [arg number]] + #[para]Return number with left-hand-side zeros trimmed off - unless all zero + #[para]If number is all zero - a single 0 is returned + set trimmed [string trimleft $number 0] + if {[string length $trimmed] == 0} { + set trimmed 0 + } + return $trimmed + } + proc substring_count {str substring} { + #*** !doctools + #[call [fun substring_count] [arg str] [arg substring]] + #[para]Search str and return number of occurrences of substring + + #faster than lsearch on split for str of a few K + if {$substring eq ""} {return 0} + set occurrences [expr {[string length $str]-[string length [string map [list $substring {}] $str]]}] + return [expr {$occurrences / [string length $substring]}] + } + + proc dict_merge_ordered {defaults main} { + #*** !doctools + #[call [fun dict_merge_ordered] [arg defaults] [arg main]] + #[para]The standard dict merge accepts multiple dicts with values from dicts to the right (2nd argument) taking precedence. + #[para]When merging with a dict of default values - this means that any default key/vals that weren't in the main dict appear in the output before the main data. + #[para]This function merges the two dicts whilst maintaining the key order of main followed by defaults. + + #1st merge (inner merge) with wrong values taking precedence - but right key-order - then (outer merge) restore values + return [dict merge [dict merge $main $defaults] $main] + } + + proc askuser {question} { + #*** !doctools + #[call [fun askuser] [arg question]] + #[para]A very basic utility to read an answer from stdin + #[para]The prompt is written to the terminal and then it waits for a user to type something + #[para]stdin is temporarily configured to blocking and then put back in its original state in case it wasn't already so. + #[para]The user must hit enter to submit the response + #[para]The return value is the string if any that was typed prior to hitting enter. + #[para]The question argument can be manually colourised using the various punk::ansi funcitons + #[example_begin] + # set answer [lb]punk::lib::askuser "[lb]a+ green bold[rb]Do you want to proceed? (Y|N)[lb]a[rb]"[rb] + # if {[lb]string match y* [lb]string tolower $answer[rb][rb]} { + # puts "Proceeding" + # } else { + # puts "Cancelled by user" + # } + #[example_end] + puts stdout $question + flush stdout + set stdin_state [fconfigure stdin] + try { + fconfigure stdin -blocking 1 + set answer [gets stdin] + } finally { + fconfigure stdin -blocking [dict get $stdin_state -blocking] + } + return $answer + } + + #*** !doctools + #[list_end] [comment {--- end definitions namespace punk::lib ---}] +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + + + + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#todo - way to generate 'internal' docs separately? 
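+# A brief usage sketch of some of the helpers above (illustrative only - results are
+# derived from the doc comments and implementations, not from generated output):
+#   punk::lib::hex2dec {FF_FF 0A}                        ;# -> 65535 10
+#   punk::lib::dec2hex -width 4 {255 10}                 ;# -> 00FF 000A
+#   punk::lib::gcd 12 18                                 ;# -> 6
+#   punk::lib::lcm 4 6                                   ;# -> 12
+#   punk::lib::dict_merge_ordered {a 1 b 2} {b 20 c 3}   ;# -> b 20 c 3 a 1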
+#*** !doctools +#[section Internal] +namespace eval punk::lib::system { + #*** !doctools + #[subsection {Namespace punk::lib::system}] + #[para] Internal functions that are not part of the API + #[list_begin definitions] + + proc mostFactorsBelow {n} { + ##*** !doctools + #[call [fun mostFactorsBelow] [arg n]] + #[para]Find the number below $n which has the greatest number of factors + #[para]This will get slow quickly as n increases (100K = 1s+ 2024) + set most 0 + set mostcount 0 + for {set i 1} {$i < $n} {incr i} { + set fc [llength [punk::lib::factors $i]] + if {$fc > $mostcount} { + set most $i + set mostcount $fc + } + } + return [list number $most numfactors $mostcount] + } + proc factorCountBelow_punk {n} { + ##*** !doctools + #[call [fun factorCountBelow] [arg n]] + #[para]For numbers 1 to n - keep a tally of the total count of factors + #[para]This is not useful other than a quick and dirty check that different algorithms return *probably* the same result + #[para]and as a rudimentary performance comparison + #[para]gets slow quickly! + set tally 0 + for {set i 1} {$i <= $n} {incr i} { + incr tally [llength [punk::lib::factors $i]] + } + return $tally + } + proc factorCountBelow_numtheory {n} { + ##*** !doctools + #[call [fun factorCountBelow] [arg n]] + #[para]For numbers 1 to n - keep a tally of the total count of factors + #[para]This is not useful other than a quick and dirty check that different algorithms return *probably* the same result + #[para]and as a rudimentary performance comparison + #[para]gets slow quickly! (significantly slower than factorCountBelow_punk) + package require math::numtheory + set tally 0 + for {set i 1} {$i <= $n} {incr i} { + incr tally [llength [math::numtheory::factors $i]] + } + return $tally + } + + proc factors2 {x} { + ##*** !doctools + #[call [fun factors2] [arg x]] + #[para]Return a sorted list of factors of x + #[para]A similar brute-force mechanism to factors - but keeps result ordering as we go. 
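+        #Worked examples: factors2 12 -> {1 2 3 4 6 12}    factors2 36 -> {1 2 3 4 6 9 12 18 36}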
+ set smallfactors [list 1] + set j 2 + set max [expr {sqrt($x)}] + while {$j < $max} { + if {($x % $j) == 0} { + lappend smallfactors $j + lappend largefactors [expr {$x / $j}] + } + incr j + } + #handle sqrt outside loop so we don't have to sort/dedup or check list membership in main loop + if {($x % $j) == 0} { + if {$j == ($x / $j)} { + lappend smallfactors $j + } + } + return [concat $smallfactors [lreverse $largefactors] $x] + } + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace punk::lib::system ---}] +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Ready +package provide punk::lib [namespace eval punk::lib { + variable pkg punk::lib + variable version + set version 0.1.0 +}] +return + +#*** !doctools +#[manpage_end] + diff --git a/src/embedded/man/files/punk/_module_ansi-0.1.0.tm.n b/src/embedded/man/files/punk/_module_ansi-0.1.0.tm.n index e524213..7b7ed9e 100644 --- a/src/embedded/man/files/punk/_module_ansi-0.1.0.tm.n +++ b/src/embedded/man/files/punk/_module_ansi-0.1.0.tm.n @@ -334,6 +334,8 @@ package require \fBpunk::ansi \fR .sp \fBlength\fR \fItext\fR .sp +\fBindex\fR \fIstring\fR \fIindex\fR +.sp .BE .SH DESCRIPTION .PP @@ -550,6 +552,40 @@ Return the character length after stripping ansi codes - not the printing length .SS "NAMESPACE PUNK::ANSI::ANSISTRING" .PP punk::ansi::string ensemble +.TP +\fBindex\fR \fIstring\fR \fIindex\fR +.sp +Takes a string that possibly contains ansi codes such as colour,underline etc (SGR codes) +.sp +Returns the character (with applied ansi effect) at position index +.sp +The string could contain non SGR ansi codes - and these will (mostly) be ignored, so shouldn't affect the output\&. +.sp +Some terminals don't hide 'privacy message' and other strings within an ESC X ESC ^ or ESC _ sequence (terminated by ST) +.sp +It's arguable some of these are application specific - but this function takes the view that they are probably non-displaying - so index won't see them\&. +.sp +If the caller wants just the character - they should use a normal string index after calling stripansi, or call stripansi afterwards\&. +.sp +As any operation using end-+ will need to strip ansi to precalculate the length anyway; the caller should probably just use stripansi and standard string index if the ansi coded output isn't required and they are using and end-based index\&. +.sp +In fact, any operation where the ansi info isn't required in the output would probably be slightly more efficiently obtained by using stripansi and normal string operations on that\&. +.sp +The returned character will (possibly) have a leading ansi escape sequence but no trailing escape sequence - even if the string was taken from a position immediately before a reset or other SGR ansi code +.sp +The ansi-code prefix in the returned string is built up by concatenating previous SGR ansi codes seen - but it is optimised to re-start the process if any full SGR reset is encountered\&. +.sp +The code sequence doesn't detect individual properties being turned on and then off again, only full resets; so in some cases the ansi-prefix may not be as short as it could be\&. +.sp +This shouldn't make any difference to the visual output - but a possible future enhancement is something to produce the shortest ansi sequence possible +.sp +Notes: +.sp +This function has to split the whole string into plaintext & ansi codes even for a very low index +.sp +Some sort of generator that parses more of the string as required might be more efficient for large chunks\&. 
+.sp +For end-x operations we have to pre-calculate the content-length by stripping the ansi - which is also potentially sub-optimal .PP .SH KEYWORDS ansi, console, module, string, terminal diff --git a/src/embedded/md/doc/files/punk/_module_ansi-0.1.0.tm.md b/src/embedded/md/doc/files/punk/_module_ansi-0.1.0.tm.md index 06bc52c..453e50c 100644 --- a/src/embedded/md/doc/files/punk/_module_ansi-0.1.0.tm.md +++ b/src/embedded/md/doc/files/punk/_module_ansi-0.1.0.tm.md @@ -71,6 +71,7 @@ package require punk::ansi [__detect\_sgr__ *text*](#27) [__strip__ *text*](#28) [__length__ *text*](#29) +[__index__ *string* *index*](#30) # DESCRIPTION @@ -322,6 +323,62 @@ https://github\.com/perlancar/perl\-Text\-ANSI\-Util/blob/master/lib/Text/ANSI/B punk::ansi::string ensemble + - __index__ *string* *index* + + Takes a string that possibly contains ansi codes such as colour,underline + etc \(SGR codes\) + + Returns the character \(with applied ansi effect\) at position index + + The string could contain non SGR ansi codes \- and these will \(mostly\) be + ignored, so shouldn't affect the output\. + + Some terminals don't hide 'privacy message' and other strings within an ESC + X ESC ^ or ESC \_ sequence \(terminated by ST\) + + It's arguable some of these are application specific \- but this function + takes the view that they are probably non\-displaying \- so index won't see + them\. + + If the caller wants just the character \- they should use a normal string + index after calling stripansi, or call stripansi afterwards\. + + As any operation using end\-\+ will need to strip ansi to precalculate + the length anyway; the caller should probably just use stripansi and + standard string index if the ansi coded output isn't required and they are + using and end\-based index\. + + In fact, any operation where the ansi info isn't required in the output + would probably be slightly more efficiently obtained by using stripansi and + normal string operations on that\. + + The returned character will \(possibly\) have a leading ansi escape sequence + but no trailing escape sequence \- even if the string was taken from a + position immediately before a reset or other SGR ansi code + + The ansi\-code prefix in the returned string is built up by concatenating + previous SGR ansi codes seen \- but it is optimised to re\-start the process + if any full SGR reset is encountered\. + + The code sequence doesn't detect individual properties being turned on and + then off again, only full resets; so in some cases the ansi\-prefix may not + be as short as it could be\. + + This shouldn't make any difference to the visual output \- but a possible + future enhancement is something to produce the shortest ansi sequence + possible + + Notes: + + This function has to split the whole string into plaintext & ansi codes even + for a very low index + + Some sort of generator that parses more of the string as required might be + more efficient for large chunks\. + + For end\-x operations we have to pre\-calculate the content\-length by + stripping the ansi \- which is also potentially sub\-optimal + # KEYWORDS [ansi](\.\./\.\./\.\./index\.md\#ansi), [console](\.\./\.\./\.\./index\.md\#console), diff --git a/src/embedded/www/doc/files/punk/_module_ansi-0.1.0.tm.html b/src/embedded/www/doc/files/punk/_module_ansi-0.1.0.tm.html index 99b0bf5..9c2e547 100644 --- a/src/embedded/www/doc/files/punk/_module_ansi-0.1.0.tm.html +++ b/src/embedded/www/doc/files/punk/_module_ansi-0.1.0.tm.html @@ -164,6 +164,7 @@
  • detect_sgr text
  • strip text
  • length text
  • +
  • index string index
  • @@ -300,6 +301,23 @@

    Namespace punk::ansi::ansistring

    punk::ansi::string ensemble

    +
    index string index
    +

    Takes a string that possibly contains ansi codes such as colour, underline etc (SGR codes)

    +

    Returns the character (with applied ansi effect) at position index

    +

    The string could contain non SGR ansi codes - and these will (mostly) be ignored, so shouldn't affect the output.

    +

    Some terminals don't hide 'privacy message' and other strings within an ESC X ESC ^ or ESC _ sequence (terminated by ST)

    +

    It's arguable some of these are application specific - but this function takes the view that they are probably non-displaying - so index won't see them.

    +

    If the caller wants just the character - they should use a normal string index after calling stripansi, or call stripansi afterwards.

    +

    As any operation using end-+<int> will need to strip ansi to precalculate the length anyway, the caller should probably just use stripansi and a standard string index if the ansi-coded output isn't required and they are using an end-based index.

    +

    In fact, any operation where the ansi info isn't required in the output would probably be slightly more efficiently obtained by using stripansi and normal string operations on that.

    +

    The returned character will (possibly) have a leading ansi escape sequence but no trailing escape sequence - even if the string was taken from a position immediately before a reset or other SGR ansi code

    +

    The ansi-code prefix in the returned string is built up by concatenating previous SGR ansi codes seen - but it is optimised to re-start the process if any full SGR reset is encountered.

    +

    The code sequence doesn't detect individual properties being turned on and then off again, only full resets; so in some cases the ansi-prefix may not be as short as it could be.

    +

    This shouldn't make any difference to the visual output - but a possible future enhancement is something to produce the shortest ansi sequence possible

    +

    Notes:

    +

    This function has to split the whole string into plaintext & ansi codes even for a very low index

    +

    Some sort of generator that parses more of the string as required might be more efficient for large chunks.

    +

    For end-x operations we have to pre-calculate the content-length by stripping the ansi - which is also potentially sub-optimal
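
    A short usage sketch of the ansistring index subcommand documented above (an
    illustration based on the description - the exact SGR prefix in the result depends
    on the codes present in the input string):

        package require punk::ansi
        set s "[punk::ansi::a+ green]OK[punk::ansi::a] done"
        #payload is "OK done"; index 1 is "K", returned with the accumulated green SGR prefix
        punk::ansi::ansistring index $s 1    ;# green SGR code followed by "K"
        punk::ansi::ansistring index $s end  ;# "e" (assuming [a] emits a full SGR reset that clears earlier codes)
        punk::ansi::ansistring index $s 100  ;# "" - past the end of the payload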

    diff --git a/src/modules/punk/ansi-999999.0a1.0.tm b/src/modules/punk/ansi-999999.0a1.0.tm index 53e8ee4..aec4ee4 100644 --- a/src/modules/punk/ansi-999999.0a1.0.tm +++ b/src/modules/punk/ansi-999999.0a1.0.tm @@ -105,8 +105,12 @@ namespace eval punk::ansi { #NOTE - we are assuming an OSC or DCS started with one type of sequence (7 or 8bit) can be terminated by either 7 or 8 bit ST (or BEL e.g wezterm ) #This using a different type of ST to that of the opening sequence is presumably unlikely in the wild - but who knows? + variable standalone_codes + set standalone_codes [list \x1bc "" \x1b7 "" \x1b8 "" \x1bM "" \x1bE "" \x1bD "" \x1bH "" \x1b= "" \x1b> "" \x1b#3 "" \x1b#4 "" \x1b#5 "" \x1b#6 "" \x1b#8 ""] + #review - there doesn't seem to be an \x1b#7 + # https://espterm.github.io/docs/VT100%20escape%20codes.html + #self-contained 2 byte ansi escape sequences - review more? - variable ansi_2byte_codes_dict set ansi_2byte_codes_dict [dict create\ "reset_terminal" "\u001bc"\ "save_cursor_posn" "\u001b7"\ @@ -133,6 +137,7 @@ namespace eval punk::ansi { #todo - character set selection - SS2 SS3 - how are they terminated? REVIEW variable escape_terminals ;#dict + variable standalone_codes ;#map to empty string set text [convert_g0 $text] @@ -145,9 +150,7 @@ namespace eval punk::ansi { #\x1b#6 double-width line #\x1b#8 dec test fill screen - set clean_map_2b [list \x1bc "" \x1b7 "" \x1b8 "" \x1bM "" \x1bE "" \x1bD "" \x1bH "" \x1b= "" \x1b> ""] - set clean_map_3b [list \x1b#3 "" \x1b#4 "" \x1b#5 "" \x1b#6 "" \x1b#8 ""] - set text [string map [concat $clean_map_2b $clean_map_3b] $text] + set text [string map $standalone_codes $text] #we process char by char - line-endings whether \r\n or \n should be processed as per any other character. #line endings can theoretically occur within an ansi escape sequence payload (review e.g title?) 
@@ -169,7 +172,7 @@ namespace eval punk::ansi { if {$u in $endseq} { set in_escapesequence 0 } elseif {$uv in $endseq} { - set in_escapseequence 2b ;#flag next byte as last in sequence + set in_escapesequence 2b ;#flag next byte as last in sequence } } else { #handle both 7-bit and 8-bit CSI and OSC @@ -179,7 +182,7 @@ namespace eval punk::ansi { set in_escapesequence OSC } elseif {[regexp {^(?:\033P|\u0090)} $uv]} { set in_escapesequence DCS - } elseif {[regexp {^(?:\033X|\u0098|\033^|\u009E|\033_|\u009F)} $uv]} { + } elseif {[regexp {^(?:\033X|\u0098|\033\^|\u009E|\033_|\u009F)} $uv]} { #SOS,PM,APC - all terminated with ST set in_escapesequence MISC } else { @@ -248,7 +251,7 @@ namespace eval punk::ansi { proc colourmap1 {{bgname White}} { package require textblock - set bg [textblock::block 3 33 "[a+ $bgname] [a]"] + set bg [textblock::block 33 3 "[a+ $bgname] [a]"] set colormap "" for {set i 0} {$i <= 7} {incr i} { append colormap "_[a+ white bold 48\;5\;$i] $i [a]" @@ -258,7 +261,7 @@ namespace eval punk::ansi { } proc colourmap2 {{bgname White}} { package require textblock - set bg [textblock::block 3 39 "[a+ $bgname] [a]"] + set bg [textblock::block 39 3 "[a+ $bgname] [a]"] set colormap "" for {set i 8} {$i <= 15} {incr i} { append colormap "_[a+ black normal 48\;5\;$i] $i [a]" ;#black normal is blacker than black bold - which often displays as a grey @@ -691,6 +694,7 @@ namespace eval punk::ansi { return "\u0090+q$payload\u009c" } namespace eval codetype { + #Functions that operate on a single ansi code sequence - not a sequence, and not codes embedded in another string proc is_sgr {code} { #SGR (Select Graphic Rendition) - codes ending in 'm' - e.g colour/underline #we will accept and pass through the less common colon separator (ITU Open Document Architecture) @@ -707,7 +711,7 @@ namespace eval punk::ansi { regexp {\033\[0*m$} $code } #whether this code has 0 (or equivalently empty) parameter (but may set others) - #if an SGR code as a reset in it - we don't need to carry forward any previous SGR codes + #if an SGR code has a reset in it - we don't need to carry forward any previous SGR codes #it generally only makes sense for the reset to be the first entry - otherwise the code has ineffective portions #However - detecting zero or empty parameter in other positions requires knowing all other codes that may allow zero or empty params. #We will only look at initial parameter as this is the well-formed normal case. @@ -773,14 +777,35 @@ namespace eval punk::ansi::ta { #OSC - termnate with BEL (\a \007) or ST (string terminator \033\\) # 8-byte string terminator is \x9c (\u009c) - #test - non-greedy - variable re_esc_osc1 {(?:\033\]).*?\007} - variable re_esc_osc2 {(?:\033\]).*?\033\\} - variable re_esc_osc3 {(?:\u009d).*?\u009c} + #non-greedy via "*?" doesn't seem to work like this.. + #variable re_esc_osc1 {(?:\033\]).*?\007} + #variable re_esc_osc2 {(?:\033\]).*?\033\\} + #variable re_esc_osc3 {(?:\u009d).*?\u009c} + + #non-greedy by excluding ST terminators + #TODO - FIX? 
see re_ST below + variable re_esc_osc1 {(?:\033\])(?:[^\007]*)\007} + variable re_esc_osc2 {(?:\033\])(?:[^\033]*)\033\\} + variable re_esc_osc3 {(?:\u009d)(?:[^\u009c]*)?\u009c} variable re_osc_open {(?:\033\]|\u009d).*} - variable re_ansi_detect "${re_csi_open}|${re_esc_osc1}|${re_esc_osc2}" + #standalone_codes [list \x1bc "" \x1b7 "" \x1b8 "" \x1bM "" \x1bE "" \x1bD "" \x1bH "" \x1b= "" \x1b> "" \x1b#3 "" \x1b#4 "" \x1b#5 "" \x1b#5 "" \x1b#6 "" \x1b#8 ""] + variable re_standalones {(?:\x1bc|\x1b7|\x1b8|\x1bM|\x1bE|\x1bD|\x1bD|\x1bH|\x1b=|\x1b>|\x1b#3|\x1b#4|\x1b#5|\x1b#6|\x1b#8)} + + #see stripansi + set re_start_ST {^(?:\033X|\u0098|\033\^|\u009E|\033_|\u009F)} + #ST terminators [list \007 \033\\ \u009c] + + #regex to capture the start of string/privacy message/application command block including the contents and string terminator (ST) + #non-greedy by exclusion of ST terminators in body + #!!! + #TODO - fix. we need to match \033\\ not just \033 ! could be colour codes nested in a privacy msg/string + #This will currently terminate the code too early in this case + #we also need to track the start of ST terminated code and not add it for replay (in the ansistring functions) + variable re_ST {(?:\033X|\u0098|\033\^|\u009E|\033_|\u009F)(?:[^\033\007\u009c]*)(?:\033\\|\007|\u009c)} + + variable re_ansi_detect "${re_csi_open}|${re_esc_osc1}|${re_esc_osc2}|${re_standalones}|${re_start_ST}" #detect any ansi escapes #review - only detect 'complete' codes - or just use the opening escapes for performance? @@ -851,7 +876,9 @@ namespace eval punk::ansi::ta { variable re_esc_osc1 variable re_esc_osc2 variable re_csi_code - punk::ansi::internal::splitx $text "${re_csi_code}|${re_esc_osc1}|${re_esc_osc2}" + variable re_standalones + variable re_ST + punk::ansi::internal::splitx $text "${re_csi_code}|${re_esc_osc1}|${re_esc_osc2}|${re_standalones}|${re_ST}" } # -- --- --- --- --- --- @@ -871,7 +898,9 @@ namespace eval punk::ansi::ta { variable re_esc_osc1 variable re_esc_osc2 variable re_csi_code - set re "(?:${re_csi_code}|${re_esc_osc1}|${re_esc_osc2})+" + variable re_standalones + variable re_ST + set re "(?:${re_csi_code}|${re_standalones}|${re_ST}|${re_esc_osc1}|${re_esc_osc2})+" return [_perlish_split $re $text] } #like split_codes - but each ansi-escape is split out separately (with empty string of plaintext between codes so odd/even plain ansi still holds) @@ -879,7 +908,9 @@ namespace eval punk::ansi::ta { variable re_esc_osc1 variable re_esc_osc2 variable re_csi_code - set re "${re_csi_code}|${re_esc_osc1}|${re_esc_osc2}" + variable re_standalones + variable re_ST + set re "${re_csi_code}|${re_standalones}|${re_ST}|${re_esc_osc1}|${re_esc_osc2}" return [_perlish_split $re $text] } @@ -890,10 +921,26 @@ namespace eval punk::ansi::ta { } set list [list] set start 0 + + #We can get $matchEnd < $matchStart; we need to ensure there is an exit condition for non-greedy empty results REVIEW while {[regexp -start $start -indices -- $re $text match]} { lassign $match matchStart matchEnd + #puts "->start $start ->match $matchStart $matchEnd" + if {$matchEnd < $matchStart} { + lappend list [string range $text $start $matchStart-1] [string index $text $matchStart] + incr start + if {$start >= [string length $text]} { + break + } + continue + } lappend list [string range $text $start $matchStart-1] [string range $text $matchStart $matchEnd] set start [expr {$matchEnd+1}] + + #? 
+ if {$start >= [string length $text]} { + break + } } lappend list [string range $text $start end] return $list @@ -915,15 +962,210 @@ namespace eval punk::ansi::ansistring { #[list_begin definitions] namespace path [list ::punk::ansi ::punk::ansi::ta] namespace ensemble create - namespace export length + namespace export length trim trimleft trimright index + #todo - expose _splits_ methods so caller can work efficiently with the splits themselves + #we need to consider whether these can be agnostic towards splits from split_codes vs split_codes_single proc length {string} { - string length [ansistrip $string] + string length [stripansi $string] } + + proc trimleft {string args} { + set intext 0 + set out "" + #for split_codes only first or last pt can be empty string + foreach {pt ansiblock} [split_codes $string] { + if {!$intext} { + if {$pt eq "" || [regexp {^\s+$} $pt]} { + append out $ansiblock + } else { + append out [string trimleft $pt]$ansiblock + set intext 1 + } + } else { + append out $pt$ansiblock + } + } + return $out + } + proc trimright {string} { + if {$string eq ""} {return ""} ;#excludes the case where split_codes would return nothing + set rtrimmed_list [lreverse [_splits_trimleft [lreverse [split_codes $string]]]] + return [join $rtrimmed_list ""] + } + proc trim {string} { + #make sure we do our ansi-scanning split only once - so use list-based trim operations + #order of left vs right probably makes zero difference - as any reduction the first operation can do is only in terms of characters at other end of list - not in total list length + #we save a single function call by calling both here rather than _splits_trim + join [_splits_trimright [_splits_trimleft [split_codes $string]]] "" + } + + proc index {string index} { + #*** !doctools + #[call [fun index] [arg string] [arg index]] + #[para]Takes a string that possibly contains ansi codes such as colour,underline etc (SGR codes) + #[para]Returns the character (with applied ansi effect) at position index + #[para]The string could contain non SGR ansi codes - and these will (mostly) be ignored, so shouldn't affect the output. + #[para]Some terminals don't hide 'privacy message' and other strings within an ESC X ESC ^ or ESC _ sequence (terminated by ST) + #[para]It's arguable some of these are application specific - but this function takes the view that they are probably non-displaying - so index won't see them. + #[para]todo: SGR codes within ST-terminated strings not yet ignored properly + #[para]If the caller wants just the character - they should use a normal string index after calling stripansi, or call stripansi afterwards. + #[para]As any operation using end-+ will need to strip ansi to precalculate the length anyway; the caller should probably just use stripansi and standard string index if the ansi coded output isn't required and they are using and end-based index. + #[para]In fact, any operation where the ansi info isn't required in the output would probably be slightly more efficiently obtained by using stripansi and normal string operations on that. + #[para]The returned character will (possibly) have a leading ansi escape sequence but no trailing escape sequence - even if the string was taken from a position immediately before a reset or other SGR ansi code + #[para]The ansi-code prefix in the returned string is built up by concatenating previous SGR ansi codes seen - but it is optimised to re-start the process if any full SGR reset is encountered. 
+ #[para]The code sequence doesn't detect individual properties being turned on and then off again, only full resets; so in some cases the ansi-prefix may not be as short as it could be. + #[para]This shouldn't make any difference to the visual output - but a possible future enhancement is something to produce the shortest ansi sequence possible + #[para]Notes: + #[para]This function has to split the whole string into plaintext & ansi codes even for a very low index + #[para]Some sort of generator that parses more of the string as required might be more efficient for large chunks. + #[para]For end-x operations we have to pre-calculate the content-length by stripping the ansi - which is also potentially sub-optimal + + set splits [split_codes_single $string]; #we get empty pt(plaintext) between each ansi code that is in a run + + #todo - end-x +/-x+/-x etc + set original_index $index + + set index [string map [list _ ""] $index] + #short-circuit some trivial cases + if {[string is integer -strict $index]} { + if {$index < 0} {return ""} + #this only short-circuits an index greater than length including ansi-chars + #we don't want to spend cycles stripping ansi for this test so code below will still have to handle index just larger than content-length but still less than entire length + if {$index > [string length $string]} {return ""} + } else { + if {[string match end* $index]} { + #for end- we will probably have to blow a few cycles stripping first and calculate the length + if {$index ne "end"} { + set op [string index $index 3] + set offset [string range $index 4 end] + if {$op ni {+ -} || ![string is integer -strict $offset]} {error "bad index '$index': must be integer?\[+-\]integer? or end?\[+-\]integer?"} + if {$op eq "+" && $offset != 0} { + return "" + } + } else { + set offset 0 + } + #by now, if op = + then offset = 0 so we only need to handle the minus case + set payload_len [punk::ansi::ansistring::length $string] ;#a little bit wasteful - but hopefully no big deal + if {$offset == 0} { + set index [expr {$payload_len-1}] + } else { + set index [expr {($payload_len-1) - $offset}] + } + if {$index < 0} { + #don't waste time splitting and looping the string + return "" + } + } else { + #we are trying to avoid evaluating unbraced expr of potentially insecure origin + regexp {^([+-]{0,1})(.*)} $index _match sign tail ;#should always match - even empty string + if {[string is integer -strict $tail]} { + #plain +- + if {$op eq "-"} { + #return nothing for negative indices as per Tcl's lindex etc + return "" + } + set index $tail + } else { + if {[regexp {(.*)([+-])(.*)} $index _match a op b]} { + if {[string is integer -strict $a] && [string is integer -strict $b]} { + if {$op eq "-"} { + set index [expr {$a - $b}] + } else { + set index [expr {$a + $b}] + } + } else { + error "bad index '$index': must be integer?\[+-\]integer? or end?\[+-\]integer?" + } + } else { + error "bad index '$index': must be integer?\[+-\]integer? or end?\[+-\]integer?" 
+ } + } + } + } + + #any pt could be empty if using split_codes_single (or just first and last pt if split_codes) + set low -1 + set high -1 + set pt_index -2 + set pt_found -1 + set char "" + set codes_in_effect "" + #we can't only apply leading sequence from previous code - as there may be codes in effect from earlier, so we have to track as we go + #(this would apply even if we used split_codes - but then we would need to do further splitting of each codeset anyway) + foreach {pt code} $splits { + incr pt_index 2 + if {$pt ne ""} { + set low [expr {$high + 1}] ;#last high + incr high [string length $pt] + } + + if {$pt ne "" && ($index >= $low && $index <= $high)} { + set pt_found $pt_index + set char [string index $pt $index-$low] + break + } + + if {[punk::ansi::codetype::is_sgr_reset $code]} { + #we can throw away previous codes_in_effect + set codes_in_effect "" + } else { + #may have partial resets - but we don't want to track individual states of SGR features + #A possible feature would be some function to optimise an ansi code sequence - which we could then apply at the end. + #we don't apply non SGR codes to our output. This is probably what is wanted - but should be reviewed. + #Review - consider if any other types of code make sense to retain in the output in this context. + if {[punk::ansi::codetype::is_sgr $code]} { + append codes_in_effect $code + } + } + } + if {$pt_found >= 0} { + return $codes_in_effect$char + } else { + return "" + } } + proc _splits_trimleft {sclist} { + set intext 0 + set outlist [list] + foreach {pt ansiblock} $sclist { + if {!$intext} { + if {$pt eq "" || [regexp {^\s+$} $pt]} { + lappend outlist "" $ansiblock + } else { + lappend outlist [string trimleft $pt] $ansiblock + set intext 1 + } + } else { + lappend outlist $pt $ansiblock + } + } + return $outlist + } + proc _splits_trimright {sclist} { + set intext 0 + set outlist [list] + foreach {pt ansiblock} [lreverse $sclist] { + if {!$intext} { + if {$pt eq "" || [regexp {^\s+$} $pt]} { + lappend outlist "" $ansiblock + } else { + lappend outlist [string trimright $pt] $ansiblock + set intext 1 + } + } else { + lappend outlist $pt $ansiblock + } + } + return [lreverse $outlist] + } + proc _splits_trim {sclist} { + return [_splits_trimright [_splits_trimleft $sclist]] + } #*** !doctools #[list_end] [comment {--- end definitions namespace punk::ansi::ta ---}] diff --git a/src/modules/shellfilter-0.1.8.tm b/src/modules/shellfilter-0.1.8.tm index 0498b59..4224e1e 100644 --- a/src/modules/shellfilter-0.1.8.tm +++ b/src/modules/shellfilter-0.1.8.tm @@ -147,71 +147,6 @@ namespace eval shellfilter::ansi2 { variable SGR_map set SGR_map [dict merge $SGR_colour_map $SGR_setting_map] - proc colourmap1 {{bgname White}} { - package require textblock - - set bg [textblock::block 3 33 "[a+ $bgname] [a=]"] - set colormap "" - for {set i 0} {$i <= 7} {incr i} { - append colormap "_[a+ white bold 48\;5\;$i] $i [a=]" - } - set map1 [overtype::left -transparent _ $bg "\n$colormap"] - return $map1 - } - proc colourmap2 {{bgname White}} { - package require textblock - set bg [textblock::block 3 39 "[a+ $bgname] [a=]"] - set colormap "" - for {set i 8} {$i <= 15} {incr i} { - append colormap "_[a+ black normal 48\;5\;$i] $i [a=]" ;#black normal is blacker than black bold - which often displays as a grey - } - set map2 [overtype::left -transparent _ $bg "\n$colormap"] - return $map2 - } - proc ? 
{args} { - variable SGR_setting_map - variable SGR_colour_map - - if {![llength $args]} { - set out "" - append out $SGR_setting_map \n - append out $SGR_colour_map \n - - try { - set bgname "White" - set map1 [colourmap1 $bgname] - set map1 [overtype::centre -transparent 1 $map1 "[a= black $bgname]Standard colours[a=]"] - set map2 [colourmap2 $bgname] - set map2 [overtype::centre -transparent 1 $map2 "[a= black $bgname]High-intensity colours[a=]"] - append out [textblock::join [textblock::join $map1 " "] $map2] \n - #append out $map1[a=] \n - #append out $map2[a=] \n - - - - } on error {result options} { - puts stderr "Failed to draw colormap" - puts stderr "$result" - } finally { - return $out - } - } else { - set result [list] - set rmap [lreverse $map] - foreach i $args { - if {[string is integer -strict $i]} { - if {[dict exists $rmap $i]} { - lappend result $i [dict get $rmap $i] - } - } else { - if {[dict exists $map $i]} { - lappend result $i [dict get $map $i] - } - } - } - return $result - } - } proc + {args} { #don't disable ansi here. #we want this to be available to call even if ansi is off