diff --git a/punkproject.toml b/punkproject.toml new file mode 100644 index 00000000..1cc9add2 --- /dev/null +++ b/punkproject.toml @@ -0,0 +1,2 @@ +[project] +name = "punkshell" diff --git a/src/bootsupport/modules/overtype-1.6.6.tm b/src/bootsupport/modules/overtype-1.6.6.tm new file mode 100644 index 00000000..b4e59ec6 --- /dev/null +++ b/src/bootsupport/modules/overtype-1.6.6.tm @@ -0,0 +1,4774 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) Julian Noble 2003-2023 +# +# @@ Meta Begin +# Application overtype 1.6.6 +# Meta platform tcl +# Meta license BSD +# @@ Meta End + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# doctools header +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[manpage_begin overtype_module_overtype 0 1.6.6] +#[copyright "2024"] +#[titledesc {overtype text layout - ansi aware}] [comment {-- Name section and table of contents description --}] +#[moddesc {overtype text layout}] [comment {-- Description at end of page heading --}] +#[require overtype] +#[keywords module text ansi] +#[description] +#[para] - + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section Overview] +#[para] overview of overtype +#[subsection Concepts] +#[para] - + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[subsection dependencies] +#[para] packages used by overtype +#[list_begin itemized] + +package require Tcl 8.6- +package require textutil +package require punk::lib ;#required for lines_as_list +package require punk::ansi ;#required to detect, split, strip and calculate lengths +package require punk::char ;#box drawing - and also unicode character width determination for proper layout of text with double-column-width chars +package require punk::assertion +#*** !doctools +#[item] [package {Tcl 8.6}] +#[item] [package textutil] +#[item] [package punk::ansi] +#[para] - required to detect, split, strip and calculate lengths of text possibly containing ansi codes +#[item] [package punk::char] +#[para] - box drawing - and also unicode character width determination for proper layout of text with double-column-width chars + +# #package require frobz +# #*** !doctools +# #[item] [package {frobz}] + +#*** !doctools +#[list_end] + +#PERFORMANCE notes +#overtype is very performance sensitive - used in ansi output all over the place ie needs to be optimised +#NOTE use of tcl::dict::for tcl::string::range etc instead of ensemble versions. This is for the many tcl 8.6/8.7 interps which don't compile ensemble commands when in safe interps +#similar for tcl::namespace::eval - but this is at least on some versions of Tcl - faster even in a normal interp. Review to see if that holds for Tcl 9. +#for string map: when there are exactly 2 elements - it is faster to use a literal which has a special case optimisation in the c code +#ie use tcl::string::map {\n ""} ... instead of tcl::string::map [list \n ""] ... +#note that we can use unicode (e.g \uFF31) and other escapes such as \t within these curly braces - we don't have to use double quotes +#generally using 'list' is preferred for the map as less error prone. +#can also use: tcl::string::map "token $var" .. but be careful regarding quoting and whitespace in var. This should be used sparingly if at all. + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[section API] + + +#Julian Noble - 2003 +#Released under standard 'BSD license' conditions. +# +#todo - ellipsis truncation indicator for center,right + +#v1.4 2023-07 - naive ansi color handling - todo - fix tcl::string::range +# - need to extract and replace ansi codes? + +tcl::namespace::eval overtype { + namespace import ::punk::assertion::assert + punk::assertion::active true + + namespace path ::punk::lib + + namespace export * + variable default_ellipsis_horizontal "..." ;#fallback + variable default_ellipsis_vertical "..." + tcl::namespace::eval priv { + proc _init {} { + upvar ::overtype::default_ellipsis_horizontal e_h + upvar ::overtype::default_ellipsis_vertical e_v + set e_h [format %c 0x2026] ;#Unicode Horizontal Ellipsis + set e_v [format %c 0x22EE] + #The unicode ellipsis looks more natural than triple-dash which is centred vertically whereas ellipsis is at floorline of text + #Also - unicode ellipsis has semantic meaning that other processors can interpret + #unicode does also provide a midline horizontal ellipsis 0x22EF + + #set e [format %c 0x2504] ;#punk::char::charshort boxd_ltdshhz - Box Drawings Light Triple Dash Horizontal + #if {![catch {package require punk::char}]} { + # set e [punk::char::charshort boxd_ltdshhz] + #} + } + } + priv::_init +} +proc overtype::about {} { + return "Simple text formatting. Author JMN. BSD-License" +} + +tcl::namespace::eval overtype { + variable grapheme_widths [tcl::dict::create] + + variable escape_terminals + #single "final byte" in the range 0x40–0x7E (ASCII @A–Z[\]^_`a–z{|}~). + tcl::dict::set escape_terminals CSI [list @ \\ ^ _ ` | ~ a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z "\{" "\}"] + #tcl::dict::set escape_terminals CSI [list J K m n A B C D E F G s u] ;#basic + tcl::dict::set escape_terminals OSC [list \007 \033\\] ;#note mix of 1 and 2-byte terminals + + #self-contained 2 byte ansi escape sequences - review more? + variable ansi_2byte_codes_dict + set ansi_2byte_codes_dict [tcl::dict::create\ + "reset_terminal" "\u001bc"\ + "save_cursor_posn" "\u001b7"\ + "restore_cursor_posn" "\u001b8"\ + "cursor_up_one" "\u001bM"\ + "NEL - Next Line" "\u001bE"\ + "IND - Down one line" "\u001bD"\ + "HTS - Set Tab Stop" "\u001bH"\ + ] + + #debatable whether strip should reveal the somethinghidden - some terminals don't hide it anyway. + # "PM - Privacy Message" "\u001b^somethinghidden\033\\"\ +} + + + + +proc overtype::string_columns {text} { + if {[punk::ansi::ta::detect $text]} { + #error "error string_columns is for calculating character length of string - ansi codes must be stripped/rendered first e.g with punk::ansi::ansistrip. Alternatively try punk::ansi::printing_length" + set text [punk::ansi::ansistrip $text] + } + return [punk::char::ansifreestring_width $text] +} + +#todo - consider a way to merge overtype::left/centre/right +#These have similar algorithms/requirements - and should be refactored to be argument-wrappers over a function called something like overtype::renderblock +#overtype::renderblock could render the input to a defined (possibly overflowing in x or y) rectangle overlapping the underlay. +#(i.e not even necessariy having it's top left within the underlay) +tcl::namespace::eval overtype::priv { +} + +#could return larger than renderwidth +proc _get_row_append_column {row} { + #obsolete? + upvar outputlines outputlines + set idx [expr {$row -1}] + if {$row <= 1 || $row > [llength $outputlines]} { + return 1 + } else { + upvar opt_expand_right expand_right + upvar renderwidth renderwidth + set existinglen [punk::ansi::printing_length [lindex $outputlines $idx]] + set endpos [expr {$existinglen +1}] + if {$expand_right} { + return $endpos + } else { + if {$endpos > $renderwidth} { + return $renderwidth + 1 + } else { + return $endpos + } + } + } +} + +tcl::namespace::eval overtype { + #*** !doctools + #[subsection {Namespace overtype}] + #[para] Core API functions for overtype + #[list_begin definitions] + + + + #tcl::string::range should generally be avoided for both undertext and overtext which contain ansi escapes and other cursor affecting chars such as \b and \r + #render onto an already-rendered (ansi already processed) 'underlay' string, a possibly ansi-laden 'overlay' string. + #The underlay and overlay can be multiline blocks of text of varying line lengths. + #The overlay may just be an ansi-colourised block - or may contain ansi cursor movements and cursor save/restore calls - in which case the apparent length and width of the overlay can't be determined as if it was a block of text. + #This is a single-shot rendering of strings - ie there is no way to chain another call containing a cursor-restore to previously rendered output and have it know about any cursor-saves in the first call. + # a cursor start position other than top-left is a possible addition to consider. + #see editbuf in punk::repl for a more stateful ansi-processor. Both systems use loops over overtype::renderline + proc renderspace {args} { + #*** !doctools + #[call [fun overtype::renderspace] [arg args] ] + #[para] usage: ?-transparent [lb]0|1[rb]? ?-expand_right [lb]1|0[rb]? ?-ellipsis [lb]1|0[rb]? ?-ellipsistext ...? undertext overtext + + # @c overtype starting at left (overstrike) + # @c can/should we use something like this?: 'format "%-*s" $len $overtext + variable default_ellipsis_horizontal + + if {[llength $args] < 2} { + error {usage: ?-width ? ?-startcolumn ? ?-transparent [0|1|]? ?-expand_right [1|0]? ?-ellipsis [1|0]? ?-ellipsistext ...? undertext overtext} + } + set optargs [lrange $args 0 end-2] + if {[llength $optargs] % 2 == 0} { + set overblock [lindex $args end] + set underblock [lindex $args end-1] + #lassign [lrange $args end-1 end] underblock overblock + set argsflags [lrange $args 0 end-2] + } else { + set optargs [lrange $args 0 end-1] + if {[llength $optargs] %2 == 0} { + set overblock [lindex $args end] + set underblock "" + set argsflags [lrange $args 0 end-1] + } else { + error "renderspace expects opt-val pairs followed by: or just " + } + } + set opts [tcl::dict::create\ + -bias ignored\ + -width \uFFEF\ + -height \uFFEF\ + -startcolumn 1\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -expand_right 0\ + -appendlines 1\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -experimental 0\ + -cp437 0\ + -looplimit \uFFEF\ + -crm_mode 0\ + -reverse_mode 0\ + -insert_mode 0\ + -wrap 0\ + -info 0\ + -console {stdin stdout stderr}\ + ] + #expand_right is perhaps consistent with the idea of the page_size being allowed to grow horizontally.. + # it does not necessarily mean the viewport grows. (which further implies need for horizontal scrolling) + # - it does need to be within some concept of terminal width - as columns must be addressable by ansi sequences. + # - This implies the -width option value must grow if it is tied to the concept of renderspace terminal width! REVIEW. + # - further implication is that if expand_right grows the virtual renderspace terminal width - + # then some sort of reflow/rerender needs to be done for preceeding lines? + # possibly not - as expand_right is distinct from a normal terminal-width change event, + # expand_right being primarily to support other operations such as textblock::table + + #todo - viewport width/height as separate concept to terminal width/height? + #-ellipsis args not used if -wrap is true + foreach {k v} $argsflags { + switch -- $k { + -looplimit - -width - -height - -startcolumn - -bias - -ellipsis - -ellipsistext - -ellipsiswhitespace + - -transparent - -exposed1 - -exposed2 - -experimental + - -expand_right - -appendlines + - -reverse_mode - -crm_mode - -insert_mode + - -cp437 + - -info - -console { + tcl::dict::set opts $k $v + } + -wrap - -autowrap_mode { + #temp alias -autowrap_mode for consistency with renderline + #todo - + tcl::dict::set opts -wrap $v + } + default { + error "overtype::renderspace unknown option '$k'. Known options: [tcl::dict::keys $opts]" + } + } + } + #set opts [tcl::dict::merge $defaults $argsflags] + # -- --- --- --- --- --- + #review - expand_left for RTL text? + set opt_expand_right [tcl::dict::get $opts -expand_right] + #for repl - standard output line indicator is a dash - todo, add a different indicator for a continued line. + set opt_width [tcl::dict::get $opts -width] + set opt_height [tcl::dict::get $opts -height] + set opt_startcolumn [tcl::dict::get $opts -startcolumn] + set opt_appendlines [tcl::dict::get $opts -appendlines] + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_exposed1 [tcl::dict::get $opts -exposed1] ;#widechar_exposed_left - todo + set opt_exposed2 [tcl::dict::get $opts -exposed2] ;#widechar_exposed_right - todo + # -- --- --- --- --- --- + set opt_crm_mode [tcl::dict::get $opts -crm_mode] + set opt_reverse_mode [tcl::dict::get $opts -reverse_mode] + set opt_insert_mode [tcl::dict::get $opts -insert_mode] + ##### + # review -wrap should map onto DECAWM terminal mode - the wrap 2 idea may not fit in with this?. + set opt_autowrap_mode [tcl::dict::get $opts -wrap] + #??? wrap 1 is hard wrap cutting word at exact column, or 1 column early for 2w-glyph, wrap 2 is for language-based word-wrap algorithm (todo) + ##### + # -- --- --- --- --- --- + set opt_cp437 [tcl::dict::get $opts -cp437] + set opt_info [tcl::dict::get $opts -info] + + + + # ---------------------------- + # -experimental dev flag to set flags etc + # ---------------------------- + set data_mode 0 + set edit_mode 0 + set opt_experimental [tcl::dict::get $opts -experimental] + foreach o $opt_experimental { + switch -- $o { + data_mode { + set data_mode 1 + } + edit_mode { + set edit_mode 1 + } + } + } + # ---------------------------- + + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + + #set underlines [split $underblock \n] + + #underblock is a 'rendered' block - so width height make sense + #only non-cursor affecting and non-width occupying ANSI codes should be present. + #ie SGR codes and perhaps things such as PM - although generally those should have been pushed to the application already + #renderwidth & renderheight were originally used with reference to rendering into a 'column' of output e.g a table column - before cursor row/col was implemented. + + if {$opt_width eq "\uFFEF" || $opt_height eq "\uFFEF"} { + lassign [blocksize $underblock] _w renderwidth _h renderheight + if {$opt_width ne "\uFFEF"} { + set renderwidth $opt_width + } + if {$opt_height ne "\uFFEF"} { + set renderheight $opt_height + } + } else { + set renderwidth $opt_width + set renderheight $opt_height + } + #initial state for renderspace 'terminal' reset + set initial_state [dict create\ + renderwidth $renderwidth\ + renderheight $renderheight\ + crm_mode $opt_crm_mode\ + reverse_mode $opt_reverse_mode\ + insert_mode $opt_insert_mode\ + autowrap_mode $opt_autowrap_mode\ + cp437 $opt_cp437\ + ] + #modes + #e.g insert_mode can be toggled by insert key or ansi IRM sequence CSI 4 h|l + #opt_startcolumn ?? - DECSLRM ? + set vtstate $initial_state + + # -- --- --- --- + #REVIEW - do we need ansi resets in the underblock? + if {$underblock eq ""} { + set underlines [lrepeat $renderheight ""] + } else { + set underblock [textblock::join_basic -- $underblock] ;#ensure properly rendered - ansi per-line resets & replays + set underlines [split $underblock \n] + } + #if {$underblock eq ""} { + # set blank "\x1b\[0m\x1b\[0m" + # #set underlines [list "\x1b\[0m\x1b\[0m"] + # set underlines [lrepeat $renderheight $blank] + #} else { + # #lines_as_list -ansiresets 1 will do nothing if -ansiresets 1 isn't specified - REVIEW + # set underlines [lines_as_list -ansiresets 1 $underblock] + #} + # -- --- --- --- + + #todo - reconsider the 'line' as the natural chunking mechanism for the overlay. + #In practice an overlay ANSI stream can be a single line with ansi moves/restores etc - or even have no moves or newlines, just relying on wrapping at the output renderwidth + #In such cases - we process the whole shebazzle for the first output line - only reducing by the applied amount at the head each time, reprocessing the long tail each time. + #(in cases where there are interline moves or cursor jumps anyway) + #This works - but doesn't seem efficient. + #On the other hand.. maybe it depends on the data. For simpler files it's more efficient than splitting first + + #a hack until we work out how to avoid infinite loops... + # + set looplimit [tcl::dict::get $opts -looplimit] + if {$looplimit eq "\uFFEF"} { + #looping for each char is worst case (all newlines?) - anything over that is an indication of something broken? + #do we need any margin above the length? (telnet mapscii.me test) + set looplimit [expr {[tcl::string::length $overblock] + 10}] + } + + #overblock height/width isn't useful in the presence of an ansi input overlay with movements. The number of lines may bear little relationship to the output height + #lassign [blocksize $overblock] _w overblock_width _h overblock_height + + set scheme 4 + switch -- $scheme { + 0 { + #one big chunk + set inputchunks [list $overblock] + } + 1 { + set inputchunks [punk::ansi::ta::split_codes $overblock] + } + 2 { + + #split into lines if possible first - then into plaintext/ansi-sequence chunks ? + set inputchunks [list ""] ;#put an empty plaintext split in for starters + set i 1 + set lines [split $overblock \n] + foreach ln $lines { + if {$i < [llength $lines]} { + append ln \n + } + set sequence_split [punk::ansi::ta::split_codes_single $ln] ;#use split_codes Not split_codes_single? + set lastpt [lindex $inputchunks end] + lset inputchunks end [tcl::string::cat $lastpt [lindex $sequence_split 0]] + lappend inputchunks {*}[lrange $sequence_split 1 end] + incr i + } + } + 3 { + #it turns out line based chunks are faster than the above.. probably because some of those end up doing the regex splitting twice + set lflines [list] + set inputchunks [split $overblock \n] + foreach ln $inputchunks { + append ln \n + lappend lflines $ln + } + if {[llength $lflines]} { + lset lflines end [tcl::string::range [lindex $lflines end] 0 end-1] + } + #set inputchunks $lflines[unset lflines] + set inputchunks [lindex [list $lflines [unset lflines]] 0] + + } + 4 { + set inputchunks [list] + foreach ln [split $overblock \n] { + lappend inputchunks $ln\n + } + if {[llength $inputchunks]} { + lset inputchunks end [tcl::string::range [lindex $inputchunks end] 0 end-1] + } + } + } + + + + + set replay_codes_underlay [tcl::dict::create 1 ""] + #lappend replay_codes_overlay "" + set replay_codes_overlay "[punk::ansi::a]" + set unapplied "" + set cursor_saved_position [tcl::dict::create] + set cursor_saved_attributes "" + + + set outputlines $underlines + set overidx 0 + + #underlines are not necessarily processed in order - depending on cursor-moves applied from overtext + set row 1 + #if {$data_mode} { + # set col [_get_row_append_column $row] + #} else { + set col $opt_startcolumn + #} + + set instruction_stats [tcl::dict::create] + + set loop 0 + #while {$overidx < [llength $inputchunks]} { } + + while {[llength $inputchunks]} { + #set overtext [lindex $inputchunks $overidx]; lset inputchunks $overidx "" + set overtext [lpop inputchunks 0] + if {![tcl::string::length $overtext]} { + incr loop + continue + } + #puts "----->[ansistring VIEW -lf 1 -vt 1 -nul 1 $overtext]<----" + set undertext [lindex $outputlines [expr {$row -1}]] + set renderedrow $row + + #renderline pads each underaly line to width with spaces and should track where end of data is + + + #set overtext [tcl::string::cat [lindex $replay_codes_overlay $overidx] $overtext] + set overtext $replay_codes_overlay$overtext + if {[tcl::dict::exists $replay_codes_underlay $row]} { + set undertext [tcl::dict::get $replay_codes_underlay $row]$undertext + } + #review insert_mode. As an 'overtype' function whose main function is not interactive keystrokes - insert is secondary - + #but even if we didn't want it as an option to the function call - to process ansi adequately we need to support IRM (insertion-replacement mode) ESC [ 4 h|l + set renderopts [list -experimental $opt_experimental\ + -cp437 $opt_cp437\ + -info 1\ + -crm_mode [tcl::dict::get $vtstate crm_mode]\ + -insert_mode [tcl::dict::get $vtstate insert_mode]\ + -autowrap_mode [tcl::dict::get $vtstate autowrap_mode]\ + -reverse_mode [tcl::dict::get $vtstate reverse_mode]\ + -cursor_restore_attributes $cursor_saved_attributes\ + -transparent $opt_transparent\ + -width [tcl::dict::get $vtstate renderwidth]\ + -exposed1 $opt_exposed1\ + -exposed2 $opt_exposed2\ + -expand_right $opt_expand_right\ + -cursor_column $col\ + -cursor_row $row\ + ] + set rinfo [renderline {*}$renderopts $undertext $overtext] + + set instruction [tcl::dict::get $rinfo instruction] + tcl::dict::set vtstate crm_mode [tcl::dict::get $rinfo crm_mode] + tcl::dict::set vtstate insert_mode [tcl::dict::get $rinfo insert_mode] + tcl::dict::set vtstate autowrap_mode [tcl::dict::get $rinfo autowrap_mode] ;# + tcl::dict::set vtstate reverse_mode [tcl::dict::get $rinfo reverse_mode] + #how to support reverse_mode in rendered linelist? we need to examine all pt/code blocks and flip each SGR stack? + # - review - the answer is probably that we don't need to - it is set/reset only during application of overtext + + #Note carefully the difference betw overflow_right and unapplied. + #overflow_right may need to be included in next run before the unapplied data + #overflow_right most commonly has data when in insert_mode + set rendered [tcl::dict::get $rinfo result] + set overflow_right [tcl::dict::get $rinfo overflow_right] + set overflow_right_column [tcl::dict::get $rinfo overflow_right_column] + set unapplied [tcl::dict::get $rinfo unapplied] + set unapplied_list [tcl::dict::get $rinfo unapplied_list] + set post_render_col [tcl::dict::get $rinfo cursor_column] + set post_render_row [tcl::dict::get $rinfo cursor_row] + set c_saved_pos [tcl::dict::get $rinfo cursor_saved_position] + set c_saved_attributes [tcl::dict::get $rinfo cursor_saved_attributes] + set visualwidth [tcl::dict::get $rinfo visualwidth] ;#column width of what is 'rendered' for the line + set insert_lines_above [tcl::dict::get $rinfo insert_lines_above] + set insert_lines_below [tcl::dict::get $rinfo insert_lines_below] + tcl::dict::set replay_codes_underlay [expr {$renderedrow+1}] [tcl::dict::get $rinfo replay_codes_underlay] + + #lset replay_codes_overlay [expr $overidx+1] [tcl::dict::get $rinfo replay_codes_overlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + if {0 && [tcl::dict::get $vtstate reverse_mode]} { + #test branch - todo - prune + puts stderr "---->[ansistring VIEW $replay_codes_overlay] rendered: $rendered" + #review + #JMN3 + set existing_reverse_state 0 + #split_codes_single is single esc sequence - but could have multiple sgr codes within one esc sequence + #e.g \x1b\[0;31;7m has a reset,colour red and reverse + set codeinfo [punk::ansi::codetype::sgr_merge [list $replay_codes_overlay] -info 1] + set codestate_reverse [dict get $codeinfo codestate reverse] + switch -- $codestate_reverse { + 7 { + set existing_reverse_state 1 + } + 27 { + set existing_reverse_state 0 + } + "" { + } + } + if {$existing_reverse_state == 0} { + set rflip [a+ reverse] + } else { + #reverse of reverse + set rflip [a+ noreverse] + } + #note that mergeresult can have multiple esc (due to unmergeables or non sgr codes) + set replay_codes_overlay [punk::ansi::codetype::sgr_merge [list [dict get $codeinfo mergeresult] $rflip]] + puts stderr "---->[ansistring VIEW $replay_codes_overlay] rendered: $rendered" + } + + + + #-- todo - detect looping properly + if {$row > 1 && $overtext ne "" && $unapplied eq $overtext && $post_render_row == $row && $instruction eq ""} { + puts stderr "overtype::renderspace loop?" + puts [ansistring VIEW $rinfo] + break + } + #-- + + if {[tcl::dict::size $c_saved_pos] >= 1} { + set cursor_saved_position $c_saved_pos + set cursor_saved_attributes $c_saved_attributes + } + + + set overflow_handled 0 + + + + set nextprefix "" + + + #todo - handle potential insertion mode as above for cursor restore? + #keeping separate branches for debugging - review and merge as appropriate when stable + set instruction_type [lindex $instruction 0] ;#some instructions have params + tcl::dict::incr instruction_stats $instruction_type + switch -- $instruction_type { + reset { + #reset the 'renderspace terminal' (not underlying terminal) + set row 1 + set col 1 + set vtstate [tcl::dict::merge $vtstate $initial_state] + #todo - clear screen + } + {} { + #end of supplied line input + #lf included in data + set row $post_render_row + set col $post_render_col + if {![llength $unapplied_list]} { + if {$overflow_right ne ""} { + incr row + } + } else { + puts stderr "renderspace end of input line - has unapplied: [ansistring VIEW $unapplied] (review)" + } + set col $opt_startcolumn + } + up { + + #renderline knows it's own line number, and knows not to go above row l + #it knows that a move whilst 1-beyond the width conflicts with the linefeed and reduces the move by one accordingly. + #row returned should be correct. + #column may be the overflow column - as it likes to report that to the caller. + + #Note that an ansi up sequence after last column going up to a previous line and also beyond the last column, will result in the next grapheme going onto the following line. + #this seems correct - as the column remains beyond the right margin so subsequent chars wrap (?) review + #puts stderr "up $post_render_row" + #puts stderr "$rinfo" + + #puts stdout "1 row:$row col $col" + set row $post_render_row + #data_mode (naming?) determines if we move to end of existing data or not. + #data_mode 0 means ignore existing line length and go to exact column + #set by -experimental flag + if {$data_mode == 0} { + set col $post_render_col + } else { + #This doesn't really work if columns are pre-filled with spaces..we can't distinguish them from data + #we need renderline to return the number of the maximum column filled (or min if we ever do r-to-l) + set existingdata [lindex $outputlines [expr {$post_render_row -1}]] + set lastdatacol [punk::ansi::printing_length $existingdata] + if {$lastdatacol < $renderwidth} { + set col [expr {$lastdatacol+1}] + } else { + set col $renderwidth + } + } + + #puts stdout "2 row:$row col $col" + #puts stdout "-----------------------" + #puts stdout $rinfo + #flush stdout + } + down { + if {$data_mode == 0} { + #renderline doesn't know how far down we can go.. + if {$post_render_row > [llength $outputlines]} { + if {$opt_appendlines} { + set diff [expr {$post_render_row - [llength $outputlines]}] + if {$diff > 0} { + lappend outputlines {*}[lrepeat $diff ""] + } + lappend outputlines "" + } + } + set row $post_render_row + set col $post_render_col + } else { + if {$post_render_row > [llength $outputlines]} { + if {$opt_appendlines} { + set diff [expr {$post_render_row - [llength $outputlines]}] + if {$diff > 0} { + lappend outputlines {*}[lrepeat $diff ""] + } + lappend outputlines "" + } + } + set existingdata [lindex $outputlines [expr {$post_render_row -1}]] + set lastdatacol [punk::ansi::printing_length $existingdata] + if {$lastdatacol < $renderwidth} { + set col [expr {$lastdatacol+1}] + } else { + set col $renderwidth + } + + } + } + restore_cursor { + #testfile belinda.ans uses this + + #puts stdout "[a+ blue bold]CURSOR_RESTORE[a]" + if {[tcl::dict::exists $cursor_saved_position row]} { + set row [tcl::dict::get $cursor_saved_position row] + set col [tcl::dict::get $cursor_saved_position column] + #puts stdout "restoring: row $row col $col [ansistring VIEW $cursor_saved_attributes] [a] unapplied [ansistring VIEWCODES $unapplied]" + #set nextprefix $cursor_saved_attributes + #lset replay_codes_overlay [expr $overidx+1] $cursor_saved_attributes + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay]$cursor_saved_attributes + #set replay_codes_overlay $cursor_saved_attributes + set cursor_saved_position [tcl::dict::create] + set cursor_saved_attributes "" + } else { + #TODO + #?restore without save? + #should move to home position and reset ansi SGR? + #puts stderr "overtype::renderspace cursor_restore without save data available" + } + #If we were inserting prior to hitting the cursor_restore - there could be overflow_right data - generally the overtype functions aren't for inserting - but ansi can enable it + #if we were already in overflow when cursor_restore was hit - it shouldn't have been processed as an action - just stored. + if {!$overflow_handled && $overflow_right ne ""} { + #wrap before restore? - possible effect on saved cursor position + #this overflow data has previously been rendered so has no cursor movements or further save/restore operations etc + #we can just insert another call to renderline to solve this.. ? + #It would perhaps be more properly handled as a queue of instructions from our initial renderline call + #we don't need to worry about overflow next call (?)- but we should carry forward our gx and ansi stacks + + puts stdout ">>>[a+ red bold]overflow_right during restore_cursor[a]" + + set sub_info [overtype::renderline\ + -info 1\ + -width [tcl::dict::get $vtstate renderwidth]\ + -insert_mode [tcl::dict::get $vtstate insert_mode]\ + -autowrap_mode [tcl::dict::get $vtstate autowrap_mode]\ + -expand_right [tcl::dict::get $opts -expand_right]\ + ""\ + $overflow_right\ + ] + set foldline [tcl::dict::get $sub_info result] + tcl::dict::set vtstate insert_mode [tcl::dict::get $sub_info insert_mode] ;#probably not needed..? + tcl::dict::set vtstate autowrap_mode [tcl::dict::get $sub_info autowrap_mode] ;#nor this.. + linsert outputlines $renderedrow $foldline + #review - row & col set by restore - but not if there was no save.. + } + set overflow_handled 1 + + } + move { + ######## + if {$post_render_row > [llength $outputlines]} { + #Ansi moves need to create new lines ? + #if {$opt_appendlines} { + # set diff [expr {$post_render_row - [llength $outputlines]}] + # if {$diff > 0} { + # lappend outputlines {*}[lrepeat $diff ""] + # } + # set row $post_render_row + #} else { + set row [llength $outputlines] + #} + } else { + set row $post_render_row + } + ####### + set col $post_render_col + #overflow + unapplied? + } + clear_and_move { + #e.g 2J + if {$post_render_row > [llength $outputlines]} { + set row [llength $outputlines] + } else { + set row $post_render_row + } + set col $post_render_col + set overflow_right "" ;#if we're clearing - any overflow due to insert_mode is irrelevant + set clearedlines [list] + foreach ln $outputlines { + lappend clearedlines \x1b\[0m$replay_codes_overlay[string repeat \000 $renderwidth]\x1b\[0m + if 0 { + + set lineparts [punk::ansi::ta::split_codes $ln] + set numcells 0 + foreach {pt _code} $lineparts { + if {$pt ne ""} { + foreach grapheme [punk::char::grapheme_split $pt] { + switch -- $grapheme { + " " - - - _ - ! - @ - # - $ - % - ^ - & - * - = - + - : - . - , - / - | - ? - + a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - + z - A - B - C - D - E - F - G - H - I - J - K - L - M - N - O - P - Q - R - S - T - U - V - W - X - Y - Z - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 { + incr numcells 1 + } + default { + if {$grapheme eq "\u0000"} { + incr numcells 1 + } else { + incr numcells [grapheme_width_cached $grapheme] + } + } + } + + } + } + } + #replays/resets each line + lappend clearedlines \x1b\[0m$replay_codes_overlay[string repeat \000 $numcells]\x1b\[0m + } + } + set outputlines $clearedlines + #todo - determine background/default to be in effect - DECECM ? + puts stderr "replay_codes_overlay: [ansistring VIEW $replay_codes_overlay]" + #lset outputlines 0 $replay_codes_overlay[lindex $outputlines 0] + + } + lf_start { + #raw newlines + # ---------------------- + #test with fruit.ans + #test - treating as newline below... + #append rendered $overflow_right + #set overflow_right "" + set row $renderedrow + incr row + if {$row > [llength $outputlines]} { + lappend outputlines "" + } + set col $opt_startcolumn + # ---------------------- + } + lf_mid { + + set edit_mode 0 + if {$edit_mode} { + set inputchunks [linsert $inputchunks 0 $overflow_right$unapplied] + set overflow_right "" + set unapplied "" + set row $post_render_row + #set col $post_render_col + set col $opt_startcolumn + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + } else { + if 1 { + if {$overflow_right ne ""} { + if {$opt_expand_right} { + append rendered $overflow_right + set overflow_right "" + } else { + #review - we should really make renderline do the work...? + set overflow_width [punk::ansi::printing_length $overflow_right] + if {$visualwidth + $overflow_width <= $renderwidth} { + append rendered $overflow_right + set overflow_right "" + } else { + if {[tcl::dict::get $vtstate autowrap_mode]} { + set outputlines [linsert $outputlines $renderedrow $overflow_right] + set overflow_right "" + set row [expr {$renderedrow + 2}] + } else { + set overflow_right "" ;#abandon + } + + if {0 && $visualwidth < $renderwidth} { + puts stderr "visualwidth: $visualwidth < renderwidth:$renderwidth" + error "incomplete - abandon?" + set overflowparts [punk::ansi::ta::split_codes $overflow_right] + set remaining_overflow $overflowparts + set filled 0 + foreach {pt code} $overflowparts { + lpop remaining_overflow 0 + if {$pt ne ""} { + set graphemes [punk::char::grapheme_split $pt] + set add "" + set addlen $visualwidth + foreach g $graphemes { + set w [overtype::grapheme_width_cached $g] + if {$addlen + $w <= $renderwidth} { + append add $g + incr addlen $w + } else { + set filled 1 + break + } + } + append rendered $add + } + if {!$filled} { + lpop remaining_overflow 0 ;#pop code + } + } + set overflow_right [join $remaining_overflow ""] + } + } + } + } + set row $post_render_row + set col $opt_startcolumn + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + } else { + #old version - known to work with various ansi graphics - e.g fruit.ans + # - but fails to limit lines to renderwidth when expand_right == 0 + append rendered $overflow_right + set overflow_right "" + set row $post_render_row + set col $opt_startcolumn + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + } + } + } + lf_overflow { + #linefeed after renderwidth e.g at column 81 for an 80 col width + #we may also have other control sequences that came after col 80 e.g cursor save + + if 0 { + set lhs [overtype::left -width 40 -wrap 1 "" [ansistring VIEWSTYLE -nul 1 -lf 1 -vt 1 $rendered]] + set lhs [textblock::frame -title "rendered $visualwidth cols" -subtitle "row-$renderedrow" $lhs] + set rhs "" + + #assertion - there should be no overflow.. + puts $lhs + } + if {![tcl::dict::get $vtstate insert_mode]} { + assert {$overflow_right eq ""} lf_overflow should not get data in overflow_right when not insert_mode + } + + set row $post_render_row + #set row $renderedrow + #incr row + #only add newline if we're at the bottom + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + set col $opt_startcolumn + + } + newlines_above { + #we get a newlines_above instruction when received at column 1 + #In some cases we want to treat that as request to insert a new blank line above, and move our row 1 down (staying with the data) + #in other cases - we want to treat at column 1 the same as any other + + puts "--->newlines_above" + puts "rinfo: $rinfo" + #renderline doesn't advance the row for us - the caller has the choice to implement or not + set row $post_render_row + set col $post_render_col + if {$insert_lines_above > 0} { + set row $renderedrow + set outputlines [linsert $outputlines $renderedrow-1 {*}[lrepeat $insert_lines_above ""]] + incr row [expr {$insert_lines_above -1}] ;#we should end up on the same line of text (at a different index), with new empties inserted above + #? set row $post_render_row #can renderline tell us? + } + } + newlines_below { + #obsolete? - use for ANSI insert lines sequence + if {$data_mode == 0} { + puts --->nl_below + set row $post_render_row + set col $post_render_col + if {$insert_lines_below == 1} { + #set lhs [overtype::left -width 40 -wrap 1 "" [ansistring VIEWSTYLE -lf 1 -vt 1 $rendered]] + #set lhs [textblock::frame -title rendered -subtitle "row-$renderedrow" $lhs] + #set rhs "" + #if {$overflow_right ne ""} { + # set rhs [overtype::left -width 40 -wrap 1 "" [ansistring VIEWSTYLE -lf 1 -vt 1 $overflow_right]] + # set rhs [textblock::frame -title overflow_right $rhs] + #} + #puts [textblock::join $lhs $rhs] + + #rendered + append rendered $overflow_right + # + + + set overflow_right "" + set row $renderedrow + #only add newline if we're at the bottom + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat $insert_lines_below ""] + } + incr row $insert_lines_below + set col $opt_startcolumn + } + } else { + set row $post_render_row + if {$post_render_row > [llength $outputlines]} { + if {$opt_appendlines} { + set diff [expr {$post_render_row - [llength $outputlines]}] + if {$diff > 0} { + lappend outputlines {*}[lrepeat $diff ""] + } + lappend outputlines "" + } + } else { + set existingdata [lindex $outputlines [expr {$post_render_row -1}]] + set lastdatacol [punk::ansi::printing_length $existingdata] + if {$lastdatacol < $renderwidth} { + set col [expr {$lastdatacol+1}] + } else { + set col $renderwidth + } + } + } + } + wrapmoveforward { + #doesn't seem to be used by fruit.ans testfile + #used by dzds.ans + #note that cursor_forward may move deep into the next line - or even span multiple lines !TODO + set c $renderwidth + set r $post_render_row + if {$post_render_col > $renderwidth} { + set i $c + while {$i <= $post_render_col} { + if {$c == $renderwidth+1} { + incr r + if {$opt_appendlines} { + if {$r < [llength $outputlines]} { + lappend outputlines "" + } + } + set c $opt_startcolumn + } else { + incr c + } + incr i + } + set col $c + } else { + #why are we getting this instruction then? + puts stderr "wrapmoveforward - test" + set r [expr {$post_render_row +1}] + set c $post_render_col + } + set row $r + set col $c + } + wrapmovebackward { + set c $renderwidth + set r $post_render_row + if {$post_render_col < 1} { + set c 1 + set i $c + while {$i >= $post_render_col} { + if {$c == 0} { + if {$r > 1} { + incr r -1 + set c $renderwidth + } else { + #leave r at 1 set c 1 + #testfile besthpav.ans first line top left border alignment + set c 1 + break + } + } else { + incr c -1 + } + incr i -1 + } + set col $c + } else { + puts stderr "Wrapmovebackward - but postrendercol >= 1???" + } + set row $r + set col $c + } + overflow { + #normal single-width grapheme overflow + #puts "----normal overflow --- [ansistring VIEWSTYLE -lf 1 -nul 1 -vt 1 $rendered]" + set row $post_render_row ;#renderline will not advance row when reporting overflow char + if {[tcl::dict::get $vtstate autowrap_mode]} { + incr row + set col $opt_startcolumn ;#whether wrap or not - next data is at column 1 ?? + } else { + set col $post_render_col + #set unapplied "" ;#this seems wrong? + #set unapplied [tcl::string::range $unapplied 1 end] + #The overflow can only be triggered by a grapheme (todo cluster?) - but our unapplied could contain SGR codes prior to the grapheme that triggered overflow - so we need to skip beyond any SGRs + #There may be more than one, because although the stack leading up to overflow may have been merged - codes between the last column and the overflowing grapheme will remain separate + #We don't expect any movement or other ANSI codes - as if they came before the grapheme, they would have triggered a different instruction to 'overflow' + set idx 0 + set next_grapheme_index -1 + foreach u $unapplied_list { + if {![punk::ansi::ta::detect $u]} { + set next_grapheme_index $idx + break + } + incr idx + } + assert {$next_grapheme_index >= 0} + #drop the overflow grapheme - keeping all codes in place. + set unapplied [join [lreplace $unapplied_list $next_grapheme_index $next_grapheme_index] ""] + #we need to run the reduced unapplied on the same line - further graphemes will just overflow again, but codes or control chars could trigger jumps to other lines + + set overflow_handled 1 + #handled by dropping overflow if any + } + } + overflow_splitchar { + set row $post_render_row ;#renderline will not advance row when reporting overflow char + + #2nd half of grapheme would overflow - treggering grapheme is returned in unapplied. There may also be overflow_right from earlier inserts + #todo - consider various options .. re-render a single trailing space or placeholder on same output line, etc + if {[tcl::dict::get $vtstate autowrap_mode]} { + if {$renderwidth < 2} { + #edge case of rendering to a single column output - any 2w char will just cause a loop if we don't substitute with something, or drop the character + set idx 0 + set triggering_grapheme_index -1 + foreach u $unapplied_list { + if {![punk::ansi::ta::detect $u]} { + set triggering_grapheme_index $idx + break + } + incr idx + } + set unapplied [join [lreplace $unapplied_list $triggering_grapheme_index $triggering_grapheme_index $opt_exposed1] ""] + } else { + set col $opt_startcolumn + incr row + } + } else { + set overflow_handled 1 + #handled by dropping entire overflow if any + if {$renderwidth < 2} { + set idx 0 + set triggering_grapheme_index -1 + foreach u $unapplied_list { + if {![punk::ansi::ta::detect $u]} { + set triggering_grapheme_index $idx + break + } + incr idx + } + set unapplied [join [lreplace $unapplied_list $triggering_grapheme_index $triggering_grapheme_index $opt_exposed1] ""] + } + } + + } + vt { + + #can vt add a line like a linefeed can? + set row $post_render_row + set col $post_render_col + } + set_window_title { + set newtitle [lindex $instruction 1] + puts stderr "overtype::renderspace set_window_title [ansistring VIEW $newtitle] instruction '$instruction'" + # + } + reset_colour_palette { + puts stderr "overtype::renderspace instruction '$instruction' unimplemented" + } + default { + puts stderr "overtype::renderspace unhandled renderline instruction '$instruction'" + } + + } + + + if {!$opt_expand_right && ![tcl::dict::get $vtstate autowrap_mode]} { + #not allowed to overflow column or wrap therefore we get overflow data to truncate + if {[tcl::dict::get $opts -ellipsis]} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + set lostdata "" + if {$overflow_right ne ""} { + append lostdata $overflow_right + } + if {$unapplied ne ""} { + append lostdata $unapplied + } + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + #set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + if {[tcl::string::trim [punk::ansi::ansistrip $lostdata]] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set rendered [overtype::right $rendered $opt_ellipsistext] + } + set overflow_handled 1 + } else { + #no wrap - no ellipsis - silently truncate + set overflow_handled 1 + } + } + + + + if {$renderedrow <= [llength $outputlines]} { + lset outputlines [expr {$renderedrow-1}] $rendered + } else { + if {$opt_appendlines} { + lappend outputlines $rendered + } else { + #? + lset outputlines [expr {$renderedrow-1}] $rendered + } + } + + if {!$overflow_handled} { + append nextprefix $overflow_right + } + + append nextprefix $unapplied + + if 0 { + if {$nextprefix ne ""} { + set nextoveridx [expr {$overidx+1}] + if {$nextoveridx >= [llength $inputchunks]} { + lappend inputchunks $nextprefix + } else { + #lset overlines $nextoveridx $nextprefix[lindex $overlines $nextoveridx] + set inputchunks [linsert $inputchunks $nextoveridx $nextprefix] + } + } + } + + if {$nextprefix ne ""} { + set inputchunks [linsert $inputchunks 0 $nextprefix] + } + + + incr overidx + incr loop + if {$loop >= $looplimit} { + puts stderr "overtype::renderspace looplimit reached ($looplimit)" + lappend outputlines "[a+ red bold] - looplimit $looplimit reached[a]" + set Y [a+ yellow bold] + set RST [a] + set sep_header ----DEBUG----- + set debugmsg "" + append debugmsg "${Y}${sep_header}${RST}" \n + append debugmsg "looplimit $looplimit reached\n" + append debugmsg "data_mode:$data_mode\n" + append debugmsg "opt_appendlines:$opt_appendlines\n" + append debugmsg "prev_row :[tcl::dict::get $renderopts -cursor_row]\n" + append debugmsg "prev_col :[tcl::dict::get $renderopts -cursor_column]\n" + tcl::dict::for {k v} $rinfo { + append debugmsg "${Y}$k [ansistring VIEW -lf 1 -vt 1 $v]$RST" \n + } + append debugmsg "${Y}[string repeat - [string length $sep_header]]$RST" \n + + puts stdout $debugmsg + #todo - config regarding error dumps rather than just dumping in working dir + set fd [open [pwd]/error_overtype.txt w] + puts $fd $debugmsg + close $fd + error $debugmsg + break + } + } + + set result [join $outputlines \n] + if {!$opt_info} { + return $result + } else { + #emit to debug window like basictelnet does? make debug configurable as syslog or even a telnet server to allow on 2nd window? + #append result \n$instruction_stats\n + set inforesult [dict create\ + result $result\ + last_instruction $instruction\ + instruction_stats $instruction_stats\ + ] + if {$opt_info == 2} { + return [pdict -channel none inforesult] + } else { + return $inforesult + } + } + } + + #todo - left-right ellipsis ? + proc centre {args} { + variable default_ellipsis_horizontal + if {[llength $args] < 2} { + error {usage: ?-transparent [0|1]? ?-bias [left|right]? ?-overflow [1|0]? undertext overtext} + } + + foreach {underblock overblock} [lrange $args end-1 end] break + + #todo - vertical vs horizontal overflow for blocks + set opts [tcl::dict::create\ + -bias left\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -overflow 0\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + ] + set argsflags [lrange $args 0 end-2] + foreach {k v} $argsflags { + switch -- $k { + -bias - -ellipsis - -ellipsistext - -ellipsiswhitespace - -overflow - -transparent - -exposed1 - -exposed2 { + tcl::dict::set opts $k $v + } + default { + set known_opts [tcl::dict::keys $opts] + error "overtype::centre unknown option '$k'. Known options: $known_opts" + } + } + } + #set opts [tcl::dict::merge $defaults $argsflags] + # -- --- --- --- --- --- + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsis [tcl::dict::get $opts -ellipsis] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + # -- --- --- --- --- --- + + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + set underlines [split $underblock \n] + #set renderwidth [tcl::mathfunc::max {*}[lmap v $underlines {punk::ansi::printing_length $v}]] + lassign [blocksize $underblock] _w renderwidth _h renderheight + set overlines [split $overblock \n] + lassign [blocksize $overblock] _w overblock_width _h overblock_height + set under_exposed_max [expr {$renderwidth - $overblock_width}] + if {$under_exposed_max > 0} { + #background block is wider + if {$under_exposed_max % 2 == 0} { + #even left/right exposure + set left_exposed [expr {$under_exposed_max / 2}] + } else { + set beforehalf [expr {$under_exposed_max / 2}] ;#1 less than half due to integer division + if {[tcl::string::tolower [tcl::dict::get $opts -bias]] eq "left"} { + set left_exposed $beforehalf + } else { + #bias to the right + set left_exposed [expr {$beforehalf + 1}] + } + } + } else { + set left_exposed 0 + } + + set outputlines [list] + if {[punk::ansi::ta::detect_sgr [lindex $overlines 0]]} { + set replay_codes "[punk::ansi::a]" + } else { + set replay_codes "" + } + set replay_codes_underlay "" + set replay_codes_overlay "" + foreach undertext $underlines overtext $overlines { + set overtext_datalen [punk::ansi::printing_length $overtext] + set ulen [punk::ansi::printing_length $undertext] + if {$ulen < $renderwidth} { + set udiff [expr {$renderwidth - $ulen}] + set undertext "$undertext[string repeat { } $udiff]" + } + set undertext $replay_codes_underlay$undertext + set overtext $replay_codes_overlay$overtext + + set overflowlength [expr {$overtext_datalen - $renderwidth}] + #review - right-to-left langs should elide on left! - extra option required + + if {$overflowlength > 0} { + #overlay line wider or equal + #review - we still expand_right for centred for now.. possibly should do something like -expand_leftright with ellipsis each end? + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -expand_right [tcl::dict::get $opts -overflow] -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 $undertext $overtext] + set rendered [tcl::dict::get $rinfo result] + set overflow_right [tcl::dict::get $rinfo overflow_right] + set unapplied [tcl::dict::get $rinfo unapplied] + #todo - get replay_codes from overflow_right instead of wherever it was truncated? + + #overlay line data is wider - trim if overflow not specified in opts - and overtype an ellipsis at right if it was specified + if {![tcl::dict::get $opts -overflow]} { + #lappend outputlines [tcl::string::range $overtext 0 [expr {$renderwidth - 1}]] + #set overtext [tcl::string::range $overtext 0 $renderwidth-1 ] + if {$opt_ellipsis} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + #don't use tcl::string::range on ANSI data + #set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + set lostdata "" + if {$overflow_right ne ""} { + append lostdata $overflow_right + } + if {$unapplied ne ""} { + append lostdata $unapplied + } + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set rendered [overtype::right $rendered $opt_ellipsistext] + } + } + } + lappend outputlines $rendered + #lappend outputlines [renderline -insert_mode 0 -transparent $opt_transparent $undertext $overtext] + } else { + #background block is wider than or equal to data for this line + #lappend outputlines [renderline -insert_mode 0 -startcolumn [expr {$left_exposed + 1}] -transparent $opt_transparent -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 $undertext $overtext] + set rinfo [renderline -info 1 -insert_mode 0 -startcolumn [expr {$left_exposed + 1}] -transparent $opt_transparent -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 $undertext $overtext] + lappend outputlines [tcl::dict::get $rinfo result] + } + set replay_codes_underlay [tcl::dict::get $rinfo replay_codes_underlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + } + return [join $outputlines \n] + } + + #overtype::right is for a rendered ragged underblock and a rendered ragged overblock + #ie we can determine the block width for bost by examining the lines and picking the longest. + # + proc right {args} { + #NOT the same as align-right - which should be done to the overblock first if required + variable default_ellipsis_horizontal + # @d !todo - implement overflow, length checks etc + + if {[llength $args] < 2} { + error {usage: ?-overflow [1|0]? ?-transparent 0|? undertext overtext} + } + foreach {underblock overblock} [lrange $args end-1 end] break + + set opts [tcl::dict::create\ + -bias ignored\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -overflow 0\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -align "left"\ + ] + set argsflags [lrange $args 0 end-2] + tcl::dict::for {k v} $argsflags { + switch -- $k { + -bias - -ellipsis - -ellipsistext - -ellipsiswhitespace - -overflow - -transparent - -exposed1 - -exposed2 - -align { + tcl::dict::set opts $k $v + } + default { + set known_opts [tcl::dict::keys $opts] + error "overtype::centre unknown option '$k'. Known options: $known_opts" + } + } + } + #set opts [tcl::dict::merge $defaults $argsflags] + # -- --- --- --- --- --- + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsis [tcl::dict::get $opts -ellipsis] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_overflow [tcl::dict::get $opts -overflow] + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + set opt_align [tcl::dict::get $opts -align] + # -- --- --- --- --- --- + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + set underlines [split $underblock \n] + lassign [blocksize $underblock] _w renderwidth _h renderheight + set overlines [split $overblock \n] + #set overblock_width [tcl::mathfunc::max {*}[lmap v $overlines {punk::ansi::printing_length $v}]] + lassign [blocksize $overblock] _w overblock_width _h overblock_height + set under_exposed_max [expr {max(0,$renderwidth - $overblock_width)}] + set left_exposed $under_exposed_max + + + + set outputlines [list] + if {[punk::ansi::ta::detect_sgr [lindex $overlines 0]]} { + set replay_codes "[punk::ansi::a]" + } else { + set replay_codes "" + } + set replay_codes_underlay "" + set replay_codes_overlay "" + foreach undertext $underlines overtext $overlines { + set overtext_datalen [punk::ansi::printing_length $overtext] + set ulen [punk::ansi::printing_length $undertext] + if {$ulen < $renderwidth} { + set udiff [expr {$renderwidth - $ulen}] + #puts xxx + append undertext [string repeat { } $udiff] + } + if {$overtext_datalen < $overblock_width} { + set odiff [expr {$overblock_width - $overtext_datalen}] + switch -- $opt_align { + left { + set startoffset 0 + } + right { + set startoffset $odiff + } + default { + set half [expr {$odiff / 2}] + #set lhs [string repeat { } $half] + #set righthalf [expr {$odiff - $half}] ;#remainder - may be one more - so we are biased left + #set rhs [string repeat { } $righthalf] + set startoffset $half + } + } + } else { + set startoffset 0 ;#negative? + } + + set undertext $replay_codes_underlay$undertext + set overtext $replay_codes_overlay$overtext + + set overflowlength [expr {$overtext_datalen - $renderwidth}] + if {$overflowlength > 0} { + #raw overtext wider than undertext column + set rinfo [renderline\ + -info 1\ + -insert_mode 0\ + -transparent $opt_transparent\ + -exposed1 $opt_exposed1 -exposed2 $opt_exposed2\ + -overflow $opt_overflow\ + -startcolumn [expr {1 + $startoffset}]\ + $undertext $overtext] + set replay_codes [tcl::dict::get $rinfo replay_codes] + set rendered [tcl::dict::get $rinfo result] + if {!$opt_overflow} { + if {$opt_ellipsis} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set ellipsis $replay_codes$opt_ellipsistext + #todo - overflow on left if allign = right?? + set rendered [overtype::right $rendered $ellipsis] + } + } + } + lappend outputlines $rendered + } else { + #padded overtext + #lappend outputlines [renderline -insert_mode 0 -transparent $opt_transparent -startcolumn [expr {$left_exposed + 1}] $undertext $overtext] + #Note - we still need overflow(exapnd_right) here - as although the overtext is short - it may oveflow due to the startoffset + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -expand_right $opt_overflow -startcolumn [expr {$left_exposed + 1 + $startoffset}] $undertext $overtext] + lappend outputlines [tcl::dict::get $rinfo result] + } + set replay_codes [tcl::dict::get $rinfo replay_codes] + set replay_codes_underlay [tcl::dict::get $rinfo replay_codes_underlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + } + + return [join $outputlines \n] + } + + proc left {args} { + overtype::block -blockalign left {*}$args + } + #overtype a (possibly ragged) underblock with a (possibly ragged) overblock + proc block {args} { + variable default_ellipsis_horizontal + # @d !todo - implement overflow, length checks etc + + if {[llength $args] < 2} { + error {usage: ?-blockalign left|centre|right? ?-textalign left|centre|right? ?-overflow [1|0]? ?-transparent 0|? undertext overtext} + } + #foreach {underblock overblock} [lrange $args end-1 end] break + lassign [lrange $args end-1 end] underblock overblock + + set opts [tcl::dict::create\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -overflow 0\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -textalign "left"\ + -textvertical "top"\ + -blockalign "left"\ + -blockalignbias left\ + -blockvertical "top"\ + ] + set argsflags [lrange $args 0 end-2] + tcl::dict::for {k v} $argsflags { + switch -- $k { + -blockalignbias - -ellipsis - -ellipsistext - -ellipsiswhitespace - -overflow - -transparent - -exposed1 - -exposed2 - -textalign - -blockalign - -blockvertical { + tcl::dict::set opts $k $v + } + default { + error "overtype::block unknown option '$k'. Known options: [tcl::dict::keys $opts]" + } + } + } + # -- --- --- --- --- --- + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsis [tcl::dict::get $opts -ellipsis] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_overflow [tcl::dict::get $opts -overflow] + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + set opt_textalign [tcl::dict::get $opts -textalign] + set opt_blockalign [tcl::dict::get $opts -blockalign] + if {$opt_blockalign eq "center"} { + set opt_blockalign "centre" + } + # -- --- --- --- --- --- + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + set underlines [split $underblock \n] + lassign [blocksize $underblock] _w renderwidth _h renderheight + set overlines [split $overblock \n] + #set overblock_width [tcl::mathfunc::max {*}[lmap v $overlines {punk::ansi::printing_length $v}]] + lassign [blocksize $overblock] _w overblock_width _h overblock_height + set under_exposed_max [expr {max(0,$renderwidth - $overblock_width)}] + + switch -- $opt_blockalign { + left { + set left_exposed 0 + } + right { + set left_exposed $under_exposed_max + } + centre { + if {$under_exposed_max > 0} { + #background block is wider + if {$under_exposed_max % 2 == 0} { + #even left/right exposure + set left_exposed [expr {$under_exposed_max / 2}] + } else { + set beforehalf [expr {$under_exposed_max / 2}] ;#1 less than half due to integer division + if {[tcl::string::tolower [tcl::dict::get $opts -blockalignbias]] eq "left"} { + set left_exposed $beforehalf + } else { + #bias to the right + set left_exposed [expr {$beforehalf + 1}] + } + } + } else { + set left_exposed 0 + } + } + default { + set left_exposed 0 + } + } + + + + set outputlines [list] + if {[punk::ansi::ta::detect_sgr [lindex $overlines 0]]} { + set replay_codes "[punk::ansi::a]" + } else { + set replay_codes "" + } + set replay_codes_underlay "" + set replay_codes_overlay "" + foreach undertext $underlines overtext $overlines { + set overtext_datalen [punk::ansi::printing_length $overtext] + set ulen [punk::ansi::printing_length $undertext] + if {$ulen < $renderwidth} { + set udiff [expr {$renderwidth - $ulen}] + #puts xxx + append undertext [string repeat { } $udiff] + } + if {$overtext_datalen < $overblock_width} { + set odiff [expr {$overblock_width - $overtext_datalen}] + switch -- $opt_textalign { + left { + set startoffset 0 + } + right { + set startoffset $odiff + } + default { + set half [expr {$odiff / 2}] + #set lhs [string repeat { } $half] + #set righthalf [expr {$odiff - $half}] ;#remainder - may be one more - so we are biased left + #set rhs [string repeat { } $righthalf] + set startoffset $half + } + } + } else { + set startoffset 0 ;#negative? + } + + set undertext $replay_codes_underlay$undertext + set overtext $replay_codes_overlay$overtext + + set overflowlength [expr {$overtext_datalen - $renderwidth}] + if {$overflowlength > 0} { + #raw overtext wider than undertext column + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 -expand_right $opt_overflow -startcolumn [expr {1 + $startoffset}] $undertext $overtext] + set replay_codes [tcl::dict::get $rinfo replay_codes] + set rendered [tcl::dict::get $rinfo result] + set overflow_right [tcl::dict::get $rinfo overflow_right] + set unapplied [tcl::dict::get $rinfo unapplied] + if {!$opt_overflow} { + if {$opt_ellipsis} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + #don't use tcl::string::range on ANSI data + #set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + set lostdata "" + if {$overflow_right ne ""} { + append lostdata $overflow_right + } + if {$unapplied ne ""} { + append lostdata $unapplied + } + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set rendered [overtype::block -blockalign right $rendered $opt_ellipsistext] + } + } + + #if {$opt_ellipsis} { + # set show_ellipsis 1 + # if {!$opt_ellipsiswhitespace} { + # #we don't want ellipsis if only whitespace was lost + # set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + # if {[tcl::string::trim $lostdata] eq ""} { + # set show_ellipsis 0 + # } + # } + # if {$show_ellipsis} { + # set ellipsis [tcl::string::cat $replay_codes $opt_ellipsistext] + # #todo - overflow on left if allign = right?? + # set rendered [overtype::right $rendered $ellipsis] + # } + #} + } + lappend outputlines $rendered + } else { + #padded overtext + #lappend outputlines [renderline -insert_mode 0 -transparent $opt_transparent -startcolumn [expr {$left_exposed + 1}] $undertext $overtext] + #Note - we still need expand_right here - as although the overtext is short - it may oveflow due to the startoffset + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -expand_right $opt_overflow -startcolumn [expr {$left_exposed + 1 + $startoffset}] $undertext $overtext] + #puts stderr "--> [ansistring VIEW -lf 1 -nul 1 $rinfo] <--" + set overflow_right [tcl::dict::get $rinfo overflow_right] + set unapplied [tcl::dict::get $rinfo unapplied] + lappend outputlines [tcl::dict::get $rinfo result] + } + set replay_codes [tcl::dict::get $rinfo replay_codes] + set replay_codes_underlay [tcl::dict::get $rinfo replay_codes_underlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + } + + return [join $outputlines \n] + } + + variable optimise_ptruns 10 ;# can be set to zero to disable the ptruns branches + + # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### + # renderline written from a left-right line orientation perspective as a first-shot at getting something useful. + # ultimately right-to-left, top-to-bottom and bottom-to-top are probably needed. + # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### + # + # + #-returnextra enables returning of overflow and length + #review - use punk::ansi::ta::detect to short-circuit processing and do simpler string calcs as an optimisation? + #review - DECSWL/DECDWL double width line codes - very difficult/impossible to align and compose with other elements + #(could render it by faking it with sixels and a lot of work - find/make a sixel font and ensure it's exactly 2 cols per char? + # This would probably be impractical to support for different fonts) + #todo - review transparency issues with single/double width characters + #bidi - need a base direction and concept of directional runs for RTL vs LTR - may be best handled at another layer? + proc renderline {args} { + #*** !doctools + #[call [fun overtype::renderline] [arg args] ] + #[para] renderline is the core engine for overtype string processing (frames & textblocks), and the raw mode commandline repl for the Tcl Punk Shell + #[para] It is also a central part of an ansi (micro) virtual terminal-emulator of sorts + #[para] This system does a half decent job at rendering 90's ANSI art to manipulable colour text blocks that can be joined & framed for layout display within a unix or windows terminal + #[para] Renderline helps maintain ANSI text styling reset/replay codes so that the styling of one block doesn't affect another. + #[para] Calling on the punk::ansi library - it can coalesce codes to keep the size down. + #[para] It is a giant mess of doing exactly what common wisdom says not to do... lots at once. + #[para] renderline is part of the Unicode and ANSI aware Overtype system which 'renders' a block of text onto a static underlay + #[para] The underlay is generally expected to be an ordered set of lines or a rectangular text block analogous to a terminal screen - but it can also be ragged in line length, or just blank. + #[para] The overlay couuld be similar - in which case it may often be used to overwrite a column or section of the underlay. + #[para] The overlay could however be a sequence of ANSI-laden text that jumps all over the place. + # + #[para] renderline itself only deals with a single line - or sometimes a single character. It is generally called from a loop that does further terminal-like or textblock processing. + #[para] By suppyling the -info 1 option - it can return various fields indicating the state of the render. + #[para] The main 3 are the result, overflow_right, and unapplied. + #[para] Renderline handles cursor movements from either keystrokes or ANSI sequences but for a full system the aforementioned loop will need to be in place to manage the set of lines under manipulation. + + #puts stderr "renderline '$args'" + variable optimise_ptruns + + if {[llength $args] < 2} { + error {usage: ?-info 0|1? ?-startcolumn ? ?-cursor_column ? ?-cursor_row |""? ?-transparent [0|1|]? ?-expand_right [1|0]? undertext overtext} + } + set under [lindex $args end-1] + set over [lindex $args end] + #lassign [lrange $args end-1 end] under over + if {[string last \n $under] >= 0} { + error "overtype::renderline not allowed to contain newlines in undertext" + } + #if {[string first \n $over] >=0 || [string first \n $under] >= 0} { + # error "overtype::renderline not allowed to contain newlines" + #} + + #generally faster to create a new dict in the proc than to use a namespace variable to create dict once and link to variable (2024 8.6/8.7) + set opts [tcl::dict::create\ + -etabs 0\ + -width \uFFEF\ + -expand_right 0\ + -transparent 0\ + -startcolumn 1\ + -cursor_column 1\ + -cursor_row ""\ + -insert_mode 1\ + -crm_mode 0\ + -autowrap_mode 1\ + -reverse_mode 0\ + -info 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -cursor_restore_attributes ""\ + -cp437 0\ + -experimental {}\ + ] + #-cursor_restore_attributes only - for replay stack - position and actual setting/restoring handled by throwback to caller + + #cursor_row, when numeric will allow detection of certain row moves that are still within our row - allowing us to avoid an early return + #An empty string for cursor_row tells us we have no info about our own row context, and to return with an unapplied string if any row move occurs + + #exposed1 and exposed2 for first and second col of underying 2wide char which is truncated by transparency, currsor movements to 2nd charcol, or overflow/expand_right + #todo - return info about such grapheme 'cuts' in -info structure and/or create option to raise an error + + set argsflags [lrange $args 0 end-2] + tcl::dict::for {k v} $argsflags { + switch -- $k { + -experimental - -cp437 - -width - -expand_right - -transparent - -startcolumn - -cursor_column - -cursor_row + - -crm_mode - -insert_mode - -autowrap_mode - -reverse_mode + - -info - -exposed1 - -exposed2 - -cursor_restore_attributes { + tcl::dict::set opts $k $v + } + default { + error "overtype::renderline unknown option '$k'. Known options: [tcl::dict::keys $opts]" + } + } + } + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_width [tcl::dict::get $opts -width] + set opt_etabs [tcl::dict::get $opts -etabs] + set opt_expand_right [tcl::dict::get $opts -expand_right] + set opt_colstart [tcl::dict::get $opts -startcolumn] ;#lhs limit for overlay - an offset to cursor_column - first visible column is 1. 0 or < 0 are before the start of the underlay + set opt_colcursor [tcl::dict::get $opts -cursor_column];#start cursor column relative to overlay + set opt_row_context [tcl::dict::get $opts -cursor_row] + if {[string length $opt_row_context]} { + if {![tcl::string::is integer -strict $opt_row_context] || $opt_row_context <1 } { + error "overtype::renderline -cursor_row must be empty for unspecified/unknown or a non-zero positive integer. received: '$opt_row_context'" + } + } + # -- --- --- --- --- --- --- --- --- --- --- --- + #The _mode flags correspond to terminal modes that can be set/reset via escape sequences (e.g DECAWM wraparound mode) + set opt_insert_mode [tcl::dict::get $opts -insert_mode];#should usually be 1 for each new line in editor mode but must be initialised to 1 externally (review) + #default is for overtype + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_autowrap_mode [tcl::dict::get $opts -autowrap_mode] ;#DECAWM - char or movement can go beyond leftmost/rightmost col to prev/next line + set opt_reverse_mode [tcl::dict::get $opts -reverse_mode] ;#DECSNM + set opt_crm_mode [tcl::dict::get $opts -crm_mode];# CRM - show control character mode + # -- --- --- --- --- --- --- --- --- --- --- --- + set temp_cursor_saved [tcl::dict::get $opts -cursor_restore_attributes] + + set cp437_glyphs [tcl::dict::get $opts -cp437] + set cp437_map [tcl::dict::create] + if {$cp437_glyphs} { + set cp437_map [set ::punk::ansi::cp437_map] + #for cp437 images we need to map these *after* splitting ansi + #some old files might use newline for its glyph.. but we can't easily support that. + #Not sure how old files did it.. maybe cr lf in sequence was newline and any lone cr or lf were displayed as glyphs? + tcl::dict::unset cp437_map \n + } + + set opt_transparent [tcl::dict::get $opts -transparent] + if {$opt_transparent eq "0"} { + set do_transparency 0 + } else { + set do_transparency 1 + if {$opt_transparent eq "1"} { + set opt_transparent {[\s]} + } + } + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_returnextra [tcl::dict::get $opts -info] + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + # -- --- --- --- --- --- --- --- --- --- --- --- + + if {$opt_row_context eq ""} { + set cursor_row 1 + } else { + set cursor_row $opt_row_context + } + + set insert_mode $opt_insert_mode ;#default 1 + set autowrap_mode $opt_autowrap_mode ;#default 1 + set crm_mode $opt_crm_mode ;#default 0 (Show Control Character mode) + set reverse_mode $opt_reverse_mode + + #----- + # + if {[info exists punk::console::tabwidth]} { + #punk console is updated if punk::console::set_tabstop_width is used or rep is started/restarted + #It is way too slow to test the current width by querying the terminal here - so it could conceivably get out of sync + #todo - we also need to handle the new threaded repl where console config is in a different thread. + # - also - concept of sub-regions being mini-consoles with their own settings - 'id' for console, or use in/out channels as id? + set tw $::punk::console::tabwidth + } else { + set tw 8 + } + + set overdata $over + if {!$cp437_glyphs} { + #REVIEW! tabify will give different answers for an ANSI colourised string vs plain text + if {!$opt_etabs} { + if {[string first \t $under] >= 0} { + #set under [textutil::tabify::untabify2 $under] + set under [textutil::tabify::untabifyLine $under $tw] + } + if {[string first \t $over] >= 0} { + #set overdata [textutil::tabify::untabify2 $over] + set overdata [textutil::tabify::untabifyLine $over $tw] + } + } + } + #------- + + #ta_detect ansi and do simpler processing? + + #we repeat tests for grapheme width in different loops - rather than create another datastructure to store widths based on column, + #we'll use the grapheme_width_cached function as a lookup table of all graphemes encountered - as there will often be repeats in different positions anyway. + + # -- --- --- --- --- --- --- --- + if {$under ne ""} { + if {[punk::ansi::ta::detect $under]} { + set undermap [punk::ansi::ta::split_codes_single $under] + } else { + #single plaintext part + set undermap [list $under] + } + } else { + set undermap [list] + } + set understacks [list] + set understacks_gx [list] + set pm_list [list] + + set i_u -1 ;#underlay may legitimately be empty + set undercols [list] + set u_codestack [list] + #u_gx_stack probably isn't really a stack - I don't know if g0 g1 can stack or not - for now we support only g0 anyway + set u_gx_stack [list] ;#separate stack for g0 (g1 g2 g3?) graphics on and off (DEC special graphics) + #set pt_underchars "" ;#for string_columns length calculation for expand_right 0 truncation + set remainder [list] ;#for returnextra + foreach {pt code} $undermap { + #pt = plain text + #append pt_underchars $pt + if {$pt ne ""} { + if {$cp437_glyphs} { + set pt [tcl::string::map $cp437_map $pt] + } + set is_ptrun 0 + if {$optimise_ptruns && [tcl::string::length $pt] >= $optimise_ptruns} { + set p1 [tcl::string::index $pt 0] + set hex [format %x [scan $p1 %c]] ;#punk::char::char_hex + set re [tcl::string::cat {^[} \\U$hex {]+$}] + set is_ptrun [regexp $re $pt] + } + if {$is_ptrun} { + #switch -- $p1 { + # " " - - - _ - ! - @ - # - $ - % - ^ - & - * - = - + - : - . - , - / - | - ? - + # a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - + # z - A - B - C - D - E - F - G - H - I - J - K - L - M - N - O - P - Q - R - S - T - U - V - W - X - Y - Z - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 { + # set width 1 + # } + # default { + # if {$p1 eq "\u0000"} { + # #use null as empty cell representation - review + # #use of this will probably collide with some application at some point + # #consider an option to set the empty cell character + # set width 1 + # } else { + # set width [grapheme_width_cached $p1] ;# when zero??? + # } + # } + #} + set width [grapheme_width_cached $p1] ;# when zero??? + set ptlen [string length $pt] + if {$width <= 1} { + #review - 0 and 1? + incr i_u $ptlen + lappend understacks {*}[lrepeat $ptlen $u_codestack] + lappend understacks_gx {*}[lrepeat $ptlen $u_gx_stack] + lappend undercols {*}[lrepeat $ptlen $p1] + } else { + incr i_u $ptlen ;#2nd col empty str - so same as above + set 2ptlen [expr {$ptlen * 2}] + lappend understacks {*}[lrepeat $2ptlen $u_codestack] + lappend understacks_gx {*}[lrepeat $2ptlen $u_gx_stack] + set l [concat {*}[lrepeat $ptlen [list $p1 ""]] + lappend undercols {*}$l + unset l + } + + } else { + foreach grapheme [punk::char::grapheme_split $pt] { + #an ugly but easy hack to serve *some* common case ascii quickly with byte-compiled literal switch - feels dirty. + #.. but even 0.5uS per char (grapheme_width_cached) adds up quickly when stitching lots of lines together. + #todo - test decimal value instead, compare performance + switch -- $grapheme { + " " - - - _ - ! - @ - # - $ - % - ^ - & - * - = - + - : - . - , - / - | - ? - + a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - + z - A - B - C - D - E - F - G - H - I - J - K - L - M - N - O - P - Q - R - S - T - U - V - W - X - Y - Z - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 { + set width 1 + } + default { + if {$grapheme eq "\u0000"} { + #use null as empty cell representation - review + #use of this will probably collide with some application at some point + #consider an option to set the empty cell character + set width 1 + } else { + #zero width still acts as 1 below??? review what should happen + set width [grapheme_width_cached $grapheme] + #we still want most controls and other zero-length codepoints such as \u200d (zero width joiner) to stay zero-length + #we substitute lone ESC that weren't captured within ANSI context as a debugging aid to see malformed ANSI + #todo - default to off and add a flag (?) to enable this substitution + set sub_stray_escapes 0 + if {$sub_stray_escapes && $width == 0} { + if {$grapheme eq "\x1b"} { + set gvis [ansistring VIEW $grapheme] ;#can only use with graphemes that have a single replacement char.. + set grapheme $gvis + set width 1 + } + } + } + } + } + + #set width [grapheme_width_cached $grapheme] + incr i_u + lappend understacks $u_codestack + lappend understacks_gx $u_gx_stack + + lappend undercols $grapheme + if {$width > 1} { + #presumably there are no triple-column or wider unicode chars.. until the aliens arrive.(?) + #but what about emoji combinations etc - can they be wider than 2? + #todo - if -etabs enabled - then we treat \t as the width determined by our elastic tabstop + incr i_u + lappend understacks $u_codestack + lappend understacks_gx $u_gx_stack + lappend undercols "" + } + } + + } + } + #underlay should already have been rendered and not have non-sgr codes - but let's retain the check for them and not stack them if other codes are here + + #only stack SGR (graphics rendition) codes - not title sets, cursor moves etc + #keep any remaining PMs in place + if {$code ne ""} { + set c1c2 [tcl::string::range $code 0 1] + + set leadernorm [tcl::string::range [tcl::string::map [list\ + \x1b\[ 7CSI\ + \x9b 8CSI\ + \x1b\( 7GFX\ + \x1b^ 7PMX\ + \x1bX 7SOS\ + ] $c1c2] 0 3];# leadernorm is 1st 2 chars mapped to normalised indicator - or is original 2 chars + + switch -- $leadernorm { + 7CSI - 8CSI { + #need to exclude certain leaders after the lb e.g < for SGR 1006 mouse + #REVIEW - what else could end in m but be mistaken as a normal SGR code here? + set maybemouse "" + if {[tcl::string::index $c1c2 0] eq "\x1b"} { + set maybemouse [tcl::string::index $code 2] + } + + if {$maybemouse ne "<" && [tcl::string::index $code end] eq "m"} { + if {[punk::ansi::codetype::is_sgr_reset $code]} { + set u_codestack [list "\x1b\[m"] + } elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + set u_codestack [list $code] + } else { + #basic simplification first.. straight dups + set dup_posns [lsearch -all -exact $u_codestack $code] ;#-exact because of square-bracket glob chars + set u_codestack [lremove $u_codestack {*}$dup_posns] + lappend u_codestack $code + } + } + } + 7GFX { + switch -- [tcl::string::index $code 2] { + "0" { + set u_gx_stack [list gx0_on] ;#we'd better use a placeholder - or debugging will probably get into a big mess + } + B { + set u_gx_stack [list] + } + } + } + 7PMX - 7SOS { + #we can have PMs or SOS (start of string) in the underlay - though mostly the PMs should have been processed.. + #attach the PM/SOS (entire ANSI sequence) to the previous grapheme! + #It should not affect the size - but terminal display may get thrown out if terminal doesn't support them. + + #note that there may in theory already be ANSI stored - we don't assume it was a pure grapheme string + set graphemeplus [lindex $undercols end] + if {$graphemeplus ne "\0"} { + append graphemeplus $code + } else { + set graphemeplus $code + } + lset undercols end $graphemeplus + #The grapheme_width_cached function will be called on this later - and doesn't account for ansi. + #we need to manually cache the item with it's proper width + variable grapheme_widths + #stripped and plus version keys pointing to same length + dict set grapheme_widths $graphemeplus [grapheme_width_cached [::punk::ansi::ansistrip $graphemeplus]] + + } + default { + + } + + } + + #if {[punk::ansi::codetype::is_sgr_reset $code]} { + # #set u_codestack [list] + #} elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + #} elseif {[punk::ansi::codetype::is_sgr $code]} { + #} else { + # #leave SGR stack as is + # if {[punk::ansi::codetype::is_gx_open $code]} { + # } elseif {[punk::ansi::codetype::is_gx_close $code]} { + # } + #} + } + #consider also if there are other codes that should be stacked..? + } + + #NULL empty cell indicator + if {$opt_width ne "\uFFEF"} { + if {[llength $understacks]} { + set cs $u_codestack + set gs $u_gx_stack + } else { + set cs [list] + set gs [list] + } + if {[llength $undercols]< $opt_width} { + set diff [expr {$opt_width- [llength $undercols]}] + if {$diff > 0} { + #set undercols [list {*}$undercols {*}[lrepeat $diff "\u0000"]] ;#2024 - much slower + lappend undercols {*}[lrepeat $diff "\u0000"] + lappend understacks {*}[lrepeat $diff $cs] + lappend understacks_gx {*}[lrepeat $diff $gs] + } + } + } + + if {$opt_width ne "\uFFEF"} { + set renderwidth $opt_width + } else { + set renderwidth [llength $undercols] + } + + + if 0 { + # ----------------- + # if we aren't extending understacks & understacks_gx each time we incr idx above the undercols length.. this doesn't really serve a purpose + # Review. + # ----------------- + #replay code for last overlay position in input line + # whether or not we get that far - we need to return it for possible replay on next line + if {[llength $understacks]} { + lappend understacks $u_codestack + lappend understacks_gx $u_gx_stack + } else { + #in case overlay onto emptystring as underlay + lappend understacks [list] + lappend understacks_gx [list] + } + # ----------------- + } + + #trailing codes in effect for underlay + if {[llength $u_codestack]} { + #set replay_codes_underlay [join $u_codestack ""] + set replay_codes_underlay [punk::ansi::codetype::sgr_merge_list {*}$u_codestack] + } else { + set replay_codes_underlay "" + } + + + # -- --- --- --- --- --- --- --- + #### + #if opt_colstart - we need to build a space (or any singlewidth char ?) padding on the left containing the right number of columns. + #this will be processed as transparent - and handle doublewidth underlay characters appropriately + set startpadding [string repeat " " [expr {$opt_colstart -1}]] + #overdata with left padding spaces based on col-start under will show through for left-padding portion regardless of -transparency + if {$startpadding ne "" || $overdata ne ""} { + if {[punk::ansi::ta::detect $overdata]} { + set overmap [punk::ansi::ta::split_codes_single $startpadding$overdata] + } else { + #single plaintext part + set overmap [list $startpadding$overdata] + } + } else { + set overmap [list] + } + #### + + + #todo - detect plain ascii no-ansi input line (aside from leading ansi reset) + #will that allow some optimisations? + + #todo - detect repeated transparent char in overlay + #regexp {^(.)\1+$} ;#backtracking regexp - relatively slow. + # - try set c [string index $data 0]; regexp [string map [list %c% $c] {^[%c%]+$}] $data + #we should be able to optimize to pass through the underlay?? + + #??? + set colcursor $opt_colstart + #TODO - make a little virtual column object + #we need to refer to column1 or columnmin? or columnmax without calculating offsets due to to startcolumn + #need to lock-down what start column means from perspective of ANSI codes moving around - the offset perspective is unclear and a mess. + + + #set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} + #as at 2024-02 punk::char::grapheme_split uses these - not aware of more complex graphemes + + set overstacks [list] + set overstacks_gx [list] + + set o_codestack [list]; #SGR codestack (not other codes such as movement,insert key etc) + set o_gxstack [list] + set pt_overchars "" + set i_o 0 + set overlay_grapheme_control_list [list] ;#tag each with g, sgr or other. 'other' are things like cursor-movement or insert-mode or codes we don't recognise/use + #experiment + set overlay_grapheme_control_stacks [list] + foreach {pt code} $overmap { + if {$pt ne ""} { + #todo - wrap in test for empty pt (we used split_codes_single - and it may be common for sgr sequences to be unmerged and so have empty pts between) + if {$cp437_glyphs} { + set pt [tcl::string::map $cp437_map $pt] + } + append pt_overchars $pt + #will get empty pt between adjacent codes + if {!$crm_mode} { + + set is_ptrun 0 + if {$optimise_ptruns && [tcl::string::length $pt] >= $optimise_ptruns} { + set p1 [tcl::string::index $pt 0] + set re [tcl::string::cat {^[} \\U[format %x [scan $p1 %c]] {]+$}] + set is_ptrun [regexp $re $pt] + + #leading only? we would have to check for graphemes at the trailing boundary? + #set re [tcl::string::cat {^[} \\U[format %x [scan $p1 %c]] {]+}] + #set is_ptrun [regexp -indices $re $pt runrange] + #if {$is_ptrun && 1} { + #} + } + if {$is_ptrun} { + #review - in theory a run over a certain length won't be part of some grapheme combo (graphemes can be long e.g 44?, but not as runs(?)) + #could be edge cases for runs at line end? (should be ok as we include trailing \n in our data) + set len [string length $pt] + set g_element [list g $p1] + + #lappend overstacks {*}[lrepeat $len $o_codestack] + #lappend overstacks_gx {*}[lrepeat $len $o_gxstack] + #incr i_o $len + #lappend overlay_grapheme_control_list {*}[lrepeat $len [list g $p1]] + #lappend overlay_grapheme_control_stacks {*}[lrepeat $len $o_codestack] + + set pi 0 + incr i_o $len + while {$pi < $len} { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + lappend overlay_grapheme_control_list $g_element + lappend overlay_grapheme_control_stacks $o_codestack + incr pi + } + } else { + foreach grapheme [punk::char::grapheme_split $pt] { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + incr i_o + lappend overlay_grapheme_control_list [list g $grapheme] + lappend overlay_grapheme_control_stacks $o_codestack + } + } + } else { + set tsbegin [clock micros] + foreach grapheme_original [punk::char::grapheme_split $pt] { + set pt_crm [ansistring VIEW -nul 1 -lf 2 -vt 2 -ff 2 $grapheme_original] + #puts stderr "ptlen [string length $pt] graphemelen[string length $grapheme_original] pt_crmlen[string length $pt_crm] $pt_crm" + foreach grapheme [punk::char::grapheme_split $pt_crm] { + if {$grapheme eq "\n"} { + lappend overlay_grapheme_control_stacks $o_codestack + lappend overlay_grapheme_control_list [list crmcontrol "\x1b\[00001E"] + } else { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + incr i_o + lappend overlay_grapheme_control_list [list g $grapheme] + lappend overlay_grapheme_control_stacks $o_codestack + } + } + } + set elapsed [expr {[clock micros] - $tsbegin}] + puts stderr "ptlen [string length $pt] elapsedus:$elapsed" + } + } + + #only stack SGR (graphics rendition) codes - not title sets, cursor moves etc + #order of if-else based on assumptions: + # that pure resets are fairly common - more so than leading resets with other info + # that non-sgr codes are not that common, so ok to check for resets before verifying it is actually SGR at all. + if {$code ne ""} { + #we need to immediately set crm_mode here if \x1b\[3h received + if {$code eq "\x1b\[3h"} { + set crm_mode 1 + } elseif {$code eq "\x1b\[3l"} { + set crm_mode 0 + } + #else crm_mode could be set either way from options + if {$crm_mode && $code ne "\x1b\[00001E"} { + #treat the code as type 'g' like above - only allow through codes to reset mode REVIEW for now just \x1b\[3l ? + #we need to somehow convert further \n in the graphical rep to an instruction for newline that will bypass further crm_mode processing or we would loop. + set code_as_pt [ansistring VIEW -nul 1 -lf 1 -vt 1 -ff 1 $code] + #split using standard split for first foreach loop - grapheme based split when processing 2nd foreach loop + set chars [split $code_as_pt ""] + set codeparts [list] ;#list of 2-el lists each element {crmcontrol } or {g } + foreach c $chars { + if {$c eq "\n"} { + #use CNL (cursor next line) \x1b\[00001E ;#leading zeroes ok for this processor - used as debugging aid to distinguish + lappend codeparts [list crmcontrol "\x1b\[00001E"] + } else { + if {[llength $codeparts] > 0 && [lindex $codeparts end 0] eq "g"} { + set existing [lindex $codeparts end 1] + lset codeparts end [list g [string cat $existing $c]] + } else { + lappend codeparts [list g $c] + } + } + } + + set partidx 0 + foreach record $codeparts { + lassign $record rtype rval + switch -exact -- $rtype { + g { + append pt_overchars $rval + foreach grapheme [punk::char::grapheme_split $rval] { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + incr i_o + lappend overlay_grapheme_control_list [list g $grapheme] + lappend overlay_grapheme_control_stacks $o_codestack + } + } + crmcontrol { + #leave o_codestack + lappend overlay_grapheme_control_stacks $o_codestack + lappend overlay_grapheme_control_list [list crmcontrol $rval] + } + } + } + } else { + lappend overlay_grapheme_control_stacks $o_codestack + #there will always be an empty code at end due to foreach on 2 vars with odd-sized list ending with pt (overmap coming from perlish split) + if {[punk::ansi::codetype::is_sgr_reset $code]} { + set o_codestack [list "\x1b\[m"] ;#reset better than empty list - fixes some ansi art issues + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + set o_codestack [list $code] + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[priv::is_sgr $code]} { + #basic simplification first - remove straight dupes + set dup_posns [lsearch -all -exact $o_codestack $code] ;#must be -exact because of square-bracket glob chars + set o_codestack [lremove $o_codestack {*}$dup_posns] + lappend o_codestack $code + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[regexp {\x1b7|\x1b\[s} $code]} { + #experiment + #cursor_save - for the replays review. + #jmn + #set temp_cursor_saved [punk::ansi::codetype::sgr_merge_list {*}$o_codestack] + lappend overlay_grapheme_control_list [list other $code] + } elseif {[regexp {\x1b8|\x1b\[u} $code]} { + #experiment + #cursor_restore - for the replays + set o_codestack [list $temp_cursor_saved] + lappend overlay_grapheme_control_list [list other $code] + } else { + #review + if {[punk::ansi::codetype::is_gx_open $code]} { + set o_gxstack [list "gx0_on"] + lappend overlay_grapheme_control_list [list gx0 gx0_on] ;#don't store code - will complicate debugging if we spit it out and jump character sets + } elseif {[punk::ansi::codetype::is_gx_close $code]} { + set o_gxstack [list] + lappend overlay_grapheme_control_list [list gx0 gx0_off] ;#don't store code - will complicate debugging if we spit it out and jump character sets + } else { + lappend overlay_grapheme_control_list [list other $code] + } + } + } + } + + } + #replay code for last overlay position in input line - should take account of possible trailing sgr code after last grapheme + set max_overlay_grapheme_index [expr {$i_o -1}] + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + + #set replay_codes_overlay [join $o_codestack ""] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}$o_codestack] + + #if {[tcl::dict::exists $overstacks $max_overlay_grapheme_index]} { + # set replay_codes_overlay [join [tcl::dict::get $overstacks $max_overlay_grapheme_index] ""] + #} else { + # set replay_codes_overlay "" + #} + # -- --- --- --- --- --- --- --- + + + #potential problem - combinining diacritics directly following control chars like \r \b + + # -- --- --- + #we need to initialise overflow_idx before any potential row-movements - as they need to perform a loop break and force in_excess to 1 + if {$opt_expand_right} { + #expand_right true means we can have lines as long as we want, but either way there can be excess data that needs to be thrown back to the calling loop. + #we currently only support horizontal expansion to the right (review regarding RTL text!) + set overflow_idx -1 + } else { + #expand_right zero - we can't grow beyond our column width - so we get ellipsis or truncation + if {$opt_width ne "\uFFEF"} { + set overflow_idx [expr {$opt_width}] + } else { + #review - this is also the cursor position when adding a char at end of line? + set overflow_idx [expr {[llength $undercols]}] ;#index at which we would be *in* overflow a row move may still override it + } + } + # -- --- --- + + set outcols $undercols ;#leave undercols as is, outcols can potentially be appended to. + + set unapplied "" ;#if we break for move row (but not for /v ?) + set unapplied_list [list] + + set insert_lines_above 0 ;#return key + set insert_lines_below 0 + set instruction "" + + # -- --- --- + #cursor_save_dec, cursor_restore_dec etc + set cursor_restore_required 0 + set cursor_saved_attributes "" + set cursor_saved_position "" + # -- --- --- + + #set idx 0 ;# line index (cursor - 1) + #set idx [expr {$opt_colstart + $opt_colcursor} -1] + + #idx is the per column output index + set idx [expr {$opt_colcursor -1}] ;#don't use opt_colstart here - we have padded and won't start emitting until idx reaches opt_colstart-1 + #cursor_column is usually one above idx - but we have opt_colstart which is like a margin - todo: remove cursor_column from the following loop and calculate it's offset when breaking or at end. + #(for now we are incrementing/decrementing both in sync - which is a bit silly) + set cursor_column $opt_colcursor + + #idx_over is the per grapheme overlay index + set idx_over -1 + + + #movements only occur within the overlay range. + #an underlay is however not necessary.. e.g + #renderline -expand_right 1 "" data + + #set re_mode {\x1b\[\?([0-9]*)(h|l)} ;#e.g DECAWM + #set re_col_move {\x1b\[([0-9]*)(C|D|G)$} + #set re_row_move {\x1b\[([0-9]*)(A|B)$} + #set re_both_move {\x1b\[([0-9]*)(?:;){0,1}([0-9]*)H$} ;# or "f" ? + #set re_vt_sequence {\x1b\[([0-9]*)(?:;){0,1}([0-9]*)~$} + #set re_cursor_save {\x1b\[s$} ;#note probable incompatibility with DECSLRM (set left right margin)! + #set re_cursor_restore {\x1b\[u$} + #set re_cursor_save_dec {\x1b7$} + #set re_cursor_restore_dec {\x1b8$} + #set re_other_single {\x1b(D|M|E)$} + #set re_decstbm {\x1b\[([0-9]*)(?:;){0,1}([0-9]*)r$} ;#DECSTBM set top and bottom margins + + #puts "-->$overlay_grapheme_control_list<--" + #puts "-->overflow_idx: $overflow_idx" + for {set gci 0} {$gci < [llength $overlay_grapheme_control_list]} {incr gci} { + set gc [lindex $overlay_grapheme_control_list $gci] + lassign $gc type item + + #emit plaintext chars first using existing SGR codes from under/over stack as appropriate + #then check if the following code is a cursor movement within the line and adjust index if so + #foreach ch $overlay_graphemes {} + switch -- $type { + g { + set ch $item + #crm_mode affects both graphic and control + if {0 && $crm_mode} { + set chars [ansistring VIEW -nul 1 -lf 2 -vt 2 -ff 2 $ch] + set chars [string map [list \n "\x1b\[00001E"] $chars] + if {[llength [split $chars ""]] > 1} { + priv::render_unapplied $overlay_grapheme_control_list $gci + #prefix the unapplied controls with the string version of this control + set unapplied_list [linsert $unapplied_list 0 {*}[split $chars ""]] + set unapplied [join $unapplied_list ""] + #incr idx_over + break + } else { + set ch $chars + } + } + incr idx_over; #idx_over (until unapplied reached anyway) is per *grapheme* in the overlay - not per col. + if {($idx < ($opt_colstart -1))} { + incr idx [grapheme_width_cached $ch] + continue + } + #set within_undercols [expr {$idx <= [llength $undercols]-1}] ;#within our active data width + set within_undercols [expr {$idx <= $renderwidth-1}] + + #https://www.enigma.com/resources/blog/the-secret-world-of-newline-characters + #\x85 NEL in the c1 control set is treated by some terminal emulators (e.g Hyper) as a newline, + #on some it's invisble but doesn't change the line, on some it's a visible glyph of width 1. + #This is hard to process in any standard manner - but I think the Hyper behaviour of doing what it was intended is perhaps most reasonable + #We will map it to the same behaviour as lf here for now... but we need also to consider the equivalent ANSI sequence: \x1bE + + set chtest [tcl::string::map [list \n \x85 \b \r \v \x7f ] $ch] + #puts --->chtest:$chtest + #specials - each shoud have it's own test of what to do if it happens after overflow_idx reached + switch -- $chtest { + "" { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + if {$idx == 0} { + #puts "---a at col 1" + #linefeed at column 1 + #leave the overflow_idx ;#? review + set instruction lf_start ;#specific instruction for newline at column 1 + priv::render_unapplied $overlay_grapheme_control_list $gci + break + } elseif {$overflow_idx != -1 && $idx == $overflow_idx} { + #linefeed after final column + #puts "---c at overflow_idx=$overflow_idx" + incr cursor_row + set overflow_idx $idx ;#override overflow_idx even if it was set to -1 due to expand_right = 1 + set instruction lf_overflow ;#only special treatment is to give it it's own instruction in case caller needs to handle differently + priv::render_unapplied $overlay_grapheme_control_list $gci + break + } else { + #linefeed occurred in middle or at end of text + #puts "---mid-or-end-text-linefeed idx:$idx overflow_idx:$overflow_idx" + if {$insert_mode == 0} { + incr cursor_row + if {$idx == -1 || $overflow_idx > $idx} { + #don't set overflow_idx higher if it's already set lower and we're adding graphemes to overflow + set overflow_idx $idx ;#override overflow_idx even if it was set to -1 due to expand_right = 1 + } + set instruction lf_mid + priv::render_unapplied $overlay_grapheme_control_list $gci + break + } else { + incr cursor_row + #don't adjust the overflow_idx + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction lf_mid + break ;# could have overdata following the \n - don't keep processing + } + } + + } + "" { + #will we want/need to use raw for keypresses in terminal? (terminal with LNM in standard reset mode means enter= this is the usual config for terminals) + #So far we are assuming the caller has translated to and handle above.. REVIEW. + + #consider also the old space-carriagereturn softwrap convention used in some terminals. + #In the context of rendering to a block of text - this works similarly in that the space gets eaten so programs emitting space-cr at the terminal width col will pretty much get what they expect. + set idx [expr {$opt_colstart -1}] + set cursor_column $opt_colstart ;#? + } + "" { + #literal backspace char - not necessarily from keyboard + #review - backspace effect on double-width chars - we are taking a column-editing perspective in overtype + #(important for -transparent option - hence replacement chars for half-exposed etc) + #review - overstrike support as per nroff/less (generally considered an old technology replaced by unicode mechanisms and/or ansi SGR) + if {$idx > ($opt_colstart -1)} { + incr idx -1 + incr cursor_column -1 + } else { + set flag 0 + if $flag { + #review - conflicting requirements? Need a different sequence for destructive interactive backspace? + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction backspace_at_start + break + } + } + } + "" { + #literal del character - some terminals send just this for what is generally expected to be a destructive backspace + #We instead treat this as a pure delete at current cursor position - it is up to the repl or terminal to remap backspace key to a sequence that has the desired effect. + priv::render_delchar $idx + } + "" { + #end processing this overline. rest of line is remainder. cursor for column as is. + #REVIEW - this theoretically depends on terminal's vertical tabulation setting (name?) + #e.g it could be configured to jump down 6 rows. + #On the other hand I've seen indications that some modern terminal emulators treat it pretty much as a linefeed. + #todo? + incr cursor_row + set overflow_idx $idx + #idx_over has already been incremented as this is both a movement-control and in some sense a grapheme + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction vt + break + } + default { + if {$overflow_idx != -1} { + #review - how to check arbitrary length item such as tab is going to overflow .. before we get to overflow_idx? + #call grapheme_width_cached on each ch, or look for tab specifically as it's currently the only known reason to have a grapheme width > 2? + #we need to decide what a tab spanning the overflow_idx means and how it affects wrap etc etc + if {$idx == $overflow_idx-1} { + set owidth [grapheme_width_cached $ch] + if {$owidth == 2} { + #review split 2w overflow? + #we don't want to make the decision here to split a 2w into replacement characters at end of line and beginning of next line + #better to consider the overlay char as unable to be applied to the line + #render empty column(?) - and reduce overlay grapheme index by one so that the current ch goes into unapplied + #throwing back to caller with instruction complicates its job - but is necessary to avoid making decsions for it here. + priv::render_addchar $idx "" [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #change the overflow_idx + set overflow_idx $idx + incr idx + incr idx_over -1 ;#set overlay grapheme index back one so that sgr stack from previous overlay grapheme used + priv::render_unapplied $overlay_grapheme_control_list [expr {$gci-1}] ;#note $gci-1 instead of just gci + #throw back to caller's loop - add instruction to caller as this is not the usual case + #caller may for example choose to render a single replacement char to this line and omit the grapheme, or wrap it to the next line + set instruction overflow_splitchar + break + } elseif {$owidth > 2} { + #? tab? + #TODO! + puts stderr "overtype::renderline long overtext grapheme '[ansistring VIEW -lf 1 -vt 1 $ch]' not handled" + #tab of some length dependent on tabstops/elastic tabstop settings? + } + } elseif {$idx >= $overflow_idx} { + #REVIEW + set next_gc [lindex $overlay_grapheme_control_list $gci+1] ;#next grapheme or control + lassign $next_gc next_type next_item + if {$autowrap_mode || $next_type ne "g"} { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci-1]] + #set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #don't incr idx beyond the overflow_idx + #idx_over already incremented - decrement so current overlay grapheme stacks go to unapplied + incr idx_over -1 + #priv::render_unapplied $overlay_grapheme_control_list [expr {$gci-1}] ;#back one index here too + priv::render_this_unapplied $overlay_grapheme_control_list $gci ;# + set instruction overflow + break + } else { + #no point throwing back to caller for each grapheme that is overflowing + #without this branch - renderline would be called with overtext reducing only by one grapheme per call + #processing a potentially long overtext each time (ie - very slow) + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #JMN4 + + } + } + } else { + #review. + #overflow_idx = -1 + #This corresponds to expand_right being true (at least until overflow_idx is in some cases forced to a value when throwing back to calling loop) + } + + if {($do_transparency && [regexp $opt_transparent $ch])} { + #pre opt_colstart is effectively transparent (we have applied padding of required number of columns to left of overlay) + if {$idx > [llength $outcols]-1} { + lappend outcols " " + #tcl::dict::set understacks $idx [list] ;#review - use idx-1 codestack? + #lset understacks $idx [list] ;#will get index $i out of range error + lappend understacks [list] ;#REVIEW + incr idx + incr cursor_column + } else { + #todo - punk::char::char_width + set g [lindex $outcols $idx] + #JMN + set uwidth [grapheme_width_cached $g] + if {[lindex $outcols $idx] eq ""} { + #2nd col of 2-wide char in underlay + incr idx + incr cursor_column + } elseif {$uwidth == 0} { + #e.g control char ? combining diacritic ? + incr idx + incr cursor_column + } elseif {$uwidth == 1} { + set owidth [grapheme_width_cached $ch] + incr idx + incr cursor_column + if {$owidth > 1} { + incr idx + incr cursor_column + } + } elseif {$uwidth > 1} { + if {[grapheme_width_cached $ch] == 1} { + if {!$insert_mode} { + #normal singlewide transparent overlay onto double-wide underlay + set next_pt_overchar [tcl::string::index $pt_overchars $idx_over+1] ;#lookahead of next plain-text char in overlay + if {$next_pt_overchar eq ""} { + #special-case trailing transparent - no next_pt_overchar + incr idx + incr cursor_column + } else { + if {[regexp $opt_transparent $next_pt_overchar]} { + incr idx + incr cursor_column + } else { + #next overlay char is not transparent.. first-half of underlying 2wide char is exposed + #priv::render_addchar $idx $opt_exposed1 [tcl::dict::get $overstacks $idx_over] [tcl::dict::get $overstacks_gx $idx_over] $insert_mode + priv::render_addchar $idx $opt_exposed1 [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + } + } + } else { + #? todo - decide what transparency even means for insert mode + incr idx + incr cursor_column + } + } else { + #2wide transparency over 2wide in underlay - review + incr idx + incr cursor_column + } + } + } + } else { + + set idxchar [lindex $outcols $idx] + #non-transparent char in overlay or empty cell + if {$idxchar eq "\u0000"} { + #empty/erased cell indicator + set uwidth 1 + } else { + set uwidth [grapheme_width_cached $idxchar] + } + if {$within_undercols} { + if {$idxchar eq ""} { + #2nd col of 2wide char in underlay + if {!$insert_mode} { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] 0 + #JMN - this has to expose if our startposn chopped an underlay - but not if we already overwrote the first half of the widechar underlay grapheme + #e.g renderline \uFF21\uFF21--- a\uFF23\uFF23 + #vs + # renderline -startcolumn 2 \uFF21---- \uFF23 + if {[lindex $outcols $idx-1] != ""} { + #verified it's an empty following a filled - so it's a legit underlay remnant (REVIEW - when would it not be??) + #reset previous to an exposed 1st-half - but leave understacks code as is + priv::render_addchar [expr {$idx-1}] $opt_exposed1 [lindex $understacks $idx-1] [lindex $understacks_gx $idx-1] 0 + } + incr idx + } else { + set prevcolinfo [lindex $outcols $idx-1] + #for insert mode - first replace the empty 2ndhalf char with exposed2 before shifting it right + #REVIEW - this leaves a replacement character permanently in our columns.. but it is consistent regarding length (?) + #The alternative is to disallow insertion at a column cursor that is at 2nd half of 2wide char + #perhaps by inserting after the char - this may be worthwhile - but may cause other surprises + #It is perhaps best avoided at another level and try to make renderline do exactly as it's told + #the advantage of this 2w splitting method is that the inserted character ends up in exactly the column we expect. + priv::render_addchar $idx $opt_exposed2 [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] 0 ;#replace not insert + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] 1 ;#insert - same index + if {$prevcolinfo ne ""} { + #we've split the 2wide - it may already have been rendered as an exposed1 - but not for example if our startcolumn was current idx + priv::render_addchar [expr {$idx-1}] $opt_exposed1 [lindex $understacks $idx-1] [lindex $understacks_gx $idx-1] 0 ;#replace not insert + } ;# else?? + incr idx + } + if {$cursor_column < [llength $outcols] || $overflow_idx == -1} { + incr cursor_column + } + } elseif {$uwidth == 0} { + #what if this is some other c0/c1 control we haven't handled specifically? + + #by emitting a preceding empty-string column - we associate whatever this char is with the preceeding non-zero-length character and any existing zero-lengths that follow it + #e.g combining diacritic - increment before over char REVIEW + #arguably the previous overchar should have done this - ie lookahead for combiners? + #if we can get a proper grapheme_split function - this should be easier to tidy up. + priv::render_addchar $idx "" [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column 2 + + if {$cursor_column > [llength $outcols] && $overflow_idx != -1} { + set cursor_column [llength $outcols] + } + } elseif {$uwidth == 1} { + #includes null empty cells + set owidth [grapheme_width_cached $ch] + if {$owidth == 1} { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + } else { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + priv::render_addchar $idx "" [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #if next column in underlay empty - we've overwritten first half of underlying 2wide grapheme + #replace with rhs exposure in case there are no more overlay graphemes coming - use underlay's stack + if {([llength $outcols] >= $idx +2) && [lindex $outcols $idx+1] eq ""} { + priv::render_addchar [expr {$idx+1}] $opt_exposed2 [lindex $understacks $idx+1] [lindex $understacks_gx $idx+1] $insert_mode + } + incr idx + } + if {($cursor_column < [llength $outcols]) || $overflow_idx == -1} { + incr cursor_column + } + } elseif {$uwidth > 1} { + set owidth [grapheme_width_cached $ch] + if {$owidth == 1} { + #1wide over 2wide in underlay + if {!$insert_mode} { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + priv::render_addchar $idx $opt_exposed2 [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #don't incr idx - we are just putting a broken-indication in the underlay - which may get overwritten by next overlay char + } else { + #insert mode just pushes all to right - no exposition char here + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + } + } else { + #2wide over 2wide + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx 2 + incr cursor_column 2 + } + + if {$cursor_column > [llength $outcols] && $overflow_idx != -1} { + set cursor_column [llength $outcols] + } + } + } else { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + } + } + } + } ;# end switch + + + } + other - crmcontrol { + if {$crm_mode && $type ne "crmcontrol" && $item ne "\x1b\[00001E"} { + if {$item eq "\x1b\[3l"} { + set crm_mode 0 + } else { + #When our initial overlay split was done - we weren't in crm_mode - so there are codes that weren't mapped to unicode control character representations + #set within_undercols [expr {$idx <= $renderwidth-1}] + #set chars [ansistring VIEW -nul 1 -lf 2 -vt 2 -ff 2 $item] + set chars [ansistring VIEW -nul 1 -lf 1 -vt 1 -ff 1 $item] + priv::render_unapplied $overlay_grapheme_control_list $gci + #prefix the unapplied controls with the string version of this control + set unapplied_list [linsert $unapplied_list 0 {*}[split $chars ""]] + set unapplied [join $unapplied_list ""] + + break + } + } + + #todo - consider CSI s DECSLRM vs ansi.sys \x1b\[s - we need \x1b\[s for oldschool ansi art - but may have to enable only for that. + #we should possibly therefore reverse this mapping so that x1b7 x1b8 are the primary codes for save/restore? + set code [tcl::string::map [list \x1b7 \x1b\[s \x1b8 \x1b\[u ] $item] + #since this element isn't a grapheme - advance idx_over to next grapheme overlay when about to fill 'unapplied' + + + #remap of DEC cursor_save/cursor_restore from ESC sequence to equivalent CSI + #probably not ideal - consider putting cursor_save/cursor_restore in functions so they can be called from the appropriate switch branch instead of using this mapping + #review - cost/benefit of function calls within these switch-arms instead of inline code? + + set c1 [tcl::string::index $code 0] + set c1c2c3 [tcl::string::range $code 0 2] + #set re_ST_open {(?:\033P|\u0090|\033X|\u0098|\033\^|\u009e|\033_|\u009f)} + #tcl 8.7 - faster to use inline list than to store it in a local var outside of loop. + #(somewhat surprising) + set leadernorm [tcl::string::range [tcl::string::map [list\ + \x1b\[< 1006\ + \x1b\[ 7CSI\ + \x1bY 7MAP\ + \x1bP 7DCS\ + \x90 8DCS\ + \x9b 8CSI\ + \x1b\] 7OSC\ + \x9d 8OSC\ + \x1b 7ESC\ + ] $c1c2c3] 0 3] ;#leadernorm is 1st 1,2 or 3 chars mapped to 4char normalised indicator - or is original first chars (1,2 or 3 len) + + #we leave the tail of the code unmapped for now + switch -- $leadernorm { + 1006 { + #https://invisible-island.net/xterm/ctlseqs/ctlseqs.html + #SGR (1006) CSI < followed by colon separated encoded-button-value,px,py ordinates and final M for button press m for button release + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 3 end]] + } + 7CSI - 7OSC { + #set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 2 end]] + set codenorm $leadernorm[tcl::string::range $code 2 end] + } + 7DCS { + #ESC P + #Device Control String https://invisible-island.net/xterm/ctlseqs/ctlseqs.html#h4-Controls-beginning-with-ESC:ESC-F.C74 + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 2 end]] + } + 8DCS { + #8-bit Device Control String + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 1 end]] + } + 7MAP { + #map to another type of code to share implementation branch + set codenorm $leadernorm[tcl::string::range $code 1 end] + } + 7ESC { + #set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 1 end]] + set codenorm $leadernorm[tcl::string::range $code 1 end] + } + 8CSI - 8OSC { + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 1 end]] + } + default { + puts stderr "Sequence detected as ANSI, but not handled in leadernorm switch. code: [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + #we haven't made a mapping for this + #could in theory be 1,2 or 3 in len + #although we shouldn't be getting here if the regexp for ansi codes is kept in sync with our switch branches + set codenorm $code + } + } + + switch -- $leadernorm { + 7MAP { + switch -- [lindex $codenorm 4] { + Y { + #vt52 movement. we expect 2 chars representing position (limited range) + set params [tcl::string::range $codenorm 5 end] + if {[tcl::string::length $params] != 2} { + #shouldn't really get here or need this branch if ansi splitting was done correctly + puts stderr "overtype::renderline ESC Y recognised as vt52 move, but incorrect parameters length ([string length $params] vs expected 2) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented codenorm:[ansistring VIEW -lf 1 -vt 1 -nul 1 $codenorm]" + } + set line [tcl::string::index $params 5] + set column [tcl::string::index $params 1] + set r [expr {[scan $line %c] -31}] + set c [expr {[scan $column %c] -31}] + + #MAP to: + #CSI n;m H - CUP - Cursor Position + set leadernorm 7CSI + set codenorm "$leadernorm${r}\;${c}H" + } + } + } + } + + #we've mapped 7 and 8bit escapes to values we can handle as literals in switch statements to take advantange of jump tables. + switch -- $leadernorm { + 1006 { + #TODO + # + switch -- [tcl::string::index $codenorm end] { + M { + puts stderr "mousedown $codenorm" + } + m { + puts stderr "mouseup $codenorm" + } + } + + } + {7CSI} - {8CSI} { + set param [tcl::string::range $codenorm 4 end-1] + #puts stdout "--> CSI [tcl::string::index $leadernorm 0] bit param:$param" + set code_end [tcl::string::index $codenorm end] ;#used for e.g h|l set/unset mode + + switch -exact -- $code_end { + A { + #Row move - up + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #todo + lassign [split $param {;}] num modifierkey + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + + if {$num eq ""} {set num 1} + incr cursor_row -$num + + if {$cursor_row < 1} { + set cursor_row 1 + } + + #ensure rest of *overlay* is emitted to remainder + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction up + #retain cursor_column + break + } + B { + #CUD - Cursor Down + #Row move - down + lassign [split $param {;}] num modifierkey + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #move down + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + if {$num eq ""} {set num 1} + incr cursor_row $num + + + incr idx_over ;#idx_over hasn't encountered a grapheme and hasn't advanced yet + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction down + #retain cursor_column + break + } + C { + #CUF - Cursor Forward + #Col move + #puts stdout "->forward" + #todo - consider right-to-left cursor mode (e.g Hebrew).. some day. + #cursor forward + #right-arrow/move forward + lassign [split $param {;}] num modifierkey + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + if {$num eq ""} {set num 1} + + #todo - retrict to moving 1 position past datalen? restrict to column width? + #should ideally wrap to next line when interactive and not on last row + #(some ansi art seems to expect this behaviour) + #This presumably depends on the terminal's wrap mode + #e.g DECAWM autowrap mode + # CSI ? 7 h - set: autowrap (also tput smam) + # CSI ? 7 l - reset: no autowrap (also tput rmam) + set version 2 + if {$version eq "2"} { + set max [llength $outcols] + if {$overflow_idx == -1} { + incr max + } + if {$cursor_column == $max+1} { + #move_forward while in overflow + incr cursor_column -1 + } + + if {($cursor_column + $num) <= $max} { + incr idx $num + incr cursor_column $num + } else { + if {$autowrap_mode} { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #jmn + if {$idx == $overflow_idx} { + incr num + } + + #horizontal movement beyond line extent needs to wrap - throw back to caller + #we may have both overflow_right and unapplied data + #(can have overflow_right if we were in insert_mode and processed chars prior to this movement) + #leave row as is - caller will need to determine how many rows the column-movement has consumed + incr cursor_column $num ;#give our caller the necessary info as columns from start of row + #incr idx_over + #should be gci following last one applied + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction wrapmoveforward + break + } else { + set cursor_column $max + set idx [expr {$cursor_column -1}] + } + } + } else { + #review - dead branch + if {!$expand_right || ($cursor_column + $num) <= [llength $outcols+1]} { + incr idx $num + incr cursor_column $num + } else { + if {!$insert_mode} { + #block editing style with arrow keys + #overtype mode + set idxstart $idx + set idxend [llength $outcols] + set moveend [expr {$idxend - $idxstart}] + if {$moveend < 0} {set moveend 0} ;#sanity? + #puts "idxstart:$idxstart idxend:$idxend outcols[llength $outcols] undercols:[llength $undercols]" + incr idx $moveend + incr cursor_column $moveend + #if {[tcl::dict::exists $understacks $idx]} { + # set stackinfo [tcl::dict::get $understacks $idx] ;#use understack at end - which may or may not have already been replaced by stack from overtext + #} else { + # set stackinfo [list] + #} + if {$idx < [llength $understacks]} { + set stackinfo [lindex $understacks $idx] ;#use understack at end - which may or may not have already been replaced by stack from overtext + } else { + set stackinfo [list] + } + if {$idx < [llength $understacks_gx]} { + #set gxstackinfo [tcl::dict::get $understacks_gx $idx] + set gxstackinfo [lindex $understacks_gx $idx] + } else { + set gxstackinfo [list] + } + #pad outcols + set movemore [expr {$num - $moveend}] + #assert movemore always at least 1 or we wouldn't be in this branch + for {set m 1} {$m <= $movemore} {incr m} { + incr idx + incr cursor_column + priv::render_addchar $idx " " $stackinfo $gxstackinfo $insert_mode + } + } else { + #normal - insert + incr idx $num + incr cursor_column $num + if {$idx > [llength $outcols]} { + set idx [llength $outcols];#allow one beyond - for adding character at end of line + set cursor_column [expr {[llength $outcols]+1}] + } + } + } + } + } + D { + #Col move + #puts stdout "<-back" + #cursor back + #left-arrow/move-back when ltr mode + lassign [split $param {;}] num modifierkey + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + if {$num eq ""} {set num 1} + + set version 2 + if {$version eq "2"} { + #todo - startcolumn offset! + if {$cursor_column - $num >= 1} { + incr idx -$num + incr cursor_column -$num + } else { + if {!$autowrap_mode} { + set cursor_column 1 + set idx 0 + } else { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr cursor_column -$num + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction wrapmovebackward + break + } + } + } else { + incr idx -$num + incr cursor_column -$num + if {$idx < $opt_colstart-1} { + #wrap to previous line and position cursor at end of data + set idx [expr {$opt_colstart-1}] + set cursor_column $opt_colstart + } + } + } + E { + #CNL - Cursor Next Line + if {$param eq ""} { + set downmove 1 + } else { + set downmove [expr {$param}] + } + puts stderr "renderline CNL down-by-$downmove" + set cursor_column 1 + set cursor_row [expr {$cursor_row + $downmove}] + set idx [expr {$cursor_column -1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move + break + + } + F { + #CPL - Cursor Previous Line + if {$param eq ""} { + set upmove 1 + } else { + set upmove [expr {$param}] + } + puts stderr "renderline CPL up-by-$upmove" + set cursor_column 1 + set cursor_row [expr {$cursor_row -$upmove}] + if {$cursor_row < 1} { + set cursor_row 1 + } + set idx [expr {$cursor_column - 1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move + break + + } + G { + #CHA - Cursor Horizontal Absolute (move to absolute column no) + if {$param eq ""} { + set targetcol 1 + } else { + set targetcol $param + if {![string is integer -strict $targetcol]} { + puts stderr "renderline CHA (Cursor Horizontal Absolute) error. Unrecognised parameter '$param'" + } + set targetcol [expr {$param}] + set max [llength $outcols] + if {$overflow_idx == -1} { + incr max + } + if {$targetcol > $max} { + puts stderr "renderline CHA (Cursor Horizontal Absolute) error. Param '$param' > max: $max" + set targetcol $max + } + } + #adjust to colstart - as column 1 is within overlay + #??? REVIEW + set idx [expr {($targetcol -1) + $opt_colstart -1}] + + + set cursor_column $targetcol + #puts stderr "renderline absolute col move ESC G (TEST)" + } + H - f { + #CSI n;m H - CUP - Cursor Position + + #CSI n;m f - HVP - Horizontal Vertical Position REVIEW - same as CUP with differences (what?) in some terminal modes + # - 'counts as effector format function (like CR or LF) rather than an editor function (like CUD or CNL)' + # - REVIEW + #see Annex A at: https://www.ecma-international.org/wp-content/uploads/ECMA-48_5th_edition_june_1991.pdf + + #test e.g ansicat face_2.ans + #$re_both_move + lassign [split $param {;}] paramrow paramcol + #missing defaults to 1 + #CSI ;5H = CSI 1;5H -> row 1 col 5 + #CSI 17;H = CSI 17H = CSI 17;1H -> row 17 col 1 + + if {$paramcol eq ""} {set paramcol 1} + if {$paramrow eq ""} {set paramrow 1} + if {![string is integer -strict $paramcol] || ![string is integer -strict $paramrow]} { + puts stderr "renderline CUP (CSI H) unrecognised param $param" + #ignore? + } else { + set max [llength $outcols] + if {$overflow_idx == -1} { + incr max + } + if {$paramcol > $max} { + set target_column $max + } else { + set target_column [expr {$paramcol}] + } + + + if {$paramrow < 1} { + puts stderr "renderline CUP (CSI H) bad row target 0. Assuming 1" + set target_row 1 + } else { + set target_row [expr {$paramrow}] + } + if {$target_row == $cursor_row} { + #col move only - no need for break and move + #puts stderr "renderline CUP col move only to col $target_column param:$param" + set cursor_column $target_column + set idx [expr {$cursor_column -1}] + } else { + set cursor_row $target_row + set cursor_column $target_column + set idx [expr {$cursor_column -1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move + break + } + } + } + J { + set modegroup [tcl::string::index $codenorm 4] ;#e.g ? + switch -exact -- $modegroup { + ? { + #CSI ? Pn J - selective erase + puts stderr "overtype::renderline ED - SELECTIVE ERASE IN DISPLAY (UNIMPLEMENTED) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + default { + puts stderr "overtype::renderline ED - ERASE IN DISPLAY (TESTING) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + if {$param eq ""} {set param 0} + switch -exact -- $param { + 0 { + #clear from cursor to end of screen + } + 1 { + #clear from cursor to beginning of screen + } + 2 { + #clear entire screen + #ansi.sys - move cursor to upper left REVIEW + set cursor_row 1 + set cursor_column 1 + set idx [expr {$cursor_column -1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + if {[llength $outcols]} { + priv::render_erasechar 0 [llength $outcols] + } + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction clear_and_move + break + } + 3 { + #clear entire screen. presumably cursor doesn't move - otherwise it would be same as 2J ? + + } + default { + } + } + + } + } + } + K { + #see DECECM regarding background colour + set modegroup [tcl::string::index $codenorm 4] ;#e.g ? + switch -exact -- $modegroup { + ? { + puts stderr "overtype::renderline DECSEL - SELECTIVE ERASE IN LINE (UNIMPLEMENTED) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + set param [string range $param 1 end] ;#chop qmark + if {$param eq ""} {set param 0} + switch -exact -- $param { + 0 { + #clear from cursor to end of line - depending on DECSCA + } + 1 { + #clear from cursor to beginning of line - depending on DECSCA + + } + 2 { + #clear entire line - depending on DECSCA + } + default { + puts stderr "overtype::renderline DECSEL - SELECTIVE ERASE IN LINE PARAM '$param' unrecognised [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + default { + puts stderr "overtype::renderline EL - ERASE IN LINE (TESTING) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + if {$param eq ""} {set param 0} + switch -exact -- $param { + 0 { + #clear from cursor to end of line + } + 1 { + #clear from cursor to beginning of line + + } + 2 { + #clear entire line + } + default { + puts stderr "overtype::renderline EL - ERASE IN LINE PARAM '$param' unrecognised [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + } + } + } + L { + puts stderr "overtype::renderline IL - Insert Line - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + M { + #CSI Pn M - DL - Delete Line + puts stderr "overtype::renderline DL - Delete Line - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + + } + T { + #CSI Pn T - SD Pan Up (empty lines introduced at top) + #CSI Pn+T - kitty extension (lines at top come from scrollback buffer) + #Pn new lines appear at top of the display, Pn old lines disappear at the bottom of the display + if {$param eq "" || $param eq "0"} {set param 1} + if {[string index $param end] eq "+"} { + puts stderr "overtype::renderline CSI Pn + T - kitty Pan Up - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } else { + puts stderr "overtype::renderline CSI Pn T - SD Pan Up - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + X { + puts stderr "overtype::renderline X ECH ERASE CHARACTER - $param" + #ECH - erase character + if {$param eq "" || $param eq "0"} {set param 1}; #param=count of chars to erase + priv::render_erasechar $idx $param + #cursor position doesn't change. + } + q { + set code_secondlast [tcl::string::index $codenorm end-1] + switch -exact -- $code_secondlast { + {"} { + #DECSCA - Select Character Protection Attribute + #(for use with selective erase: DECSED and DECSEL) + set param [tcl::string::range $codenorm 4 end-2] + if {$param eq ""} {set param 0} + #TODO - store like SGR in stacks - replays? + switch -exact -- $param { + 0 - 2 { + #canerase + puts stderr "overtype::renderline - DECSCA canerase not implemented - [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + 1 { + #cannoterase + puts stderr "overtype::renderline - DECSCA cannoterase not implemented - [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + default { + puts stderr "overtype::renderline DECSCA param '$param' not understood [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + default { + puts stderr "overtype::renderline - CSI ... q not implemented - [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + r { + #$re_decstbm + #https://www.vt100.net/docs/vt510-rm/DECSTBM.html + #This control function sets the top and bottom margins for the current page. You cannot perform scrolling outside the margins + lassign [split $param {;}] margin_top margin_bottom + + #todo - return these for the caller to process.. + puts stderr "overtype::renderline DECSTBM set top and bottom margin not implemented" + #Also moves the cursor to col 1 line 1 of the page + set cursor_column 1 + set cursor_row 1 + + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move ;#own instruction? decstbm? + break + } + s { + #code conflict between ansi emulation and DECSLRM - REVIEW + #ANSISYSSC (when no parameters) - like other terminals - essentially treat same as DECSC + # todo - when parameters - support DECSLRM instead + + if {$param ne ""} { + #DECSLRM - should only be recognised if DECLRMM is set (vertical split screen mode) + lassign [split $param {;} margin_left margin_right + puts stderr "overtype DECSLRM not yet supported - got [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + if {$margin_left eq ""} { + set margin_left 1 + } + set columns_per_page 80 ;#todo - set to 'page width (DECSCPP set columns per page)' - could be 132 or?? + if {$margin_right eq ""} { + set margin_right $columns_per_page + } + puts stderr "DECSLRM margin left: $margin_left margin right: $margin_right" + if {![string is integer -strict $margin_left] || $margin_left < 0} { + puts stderr "DECSLRM invalid margin_left" + } + if {![string is integer -strict $margin_right] || $margin_right < 0} { + puts stderr "DECSLRM invalid margin_right" + } + set scrolling_region_size [expr {$margin_right - $margin_left}] + if {$scrolling_region_size < 2 || $scrolling_region_size > $columns_per_page} { + puts stderr "DECSLRM region size '$scrolling_regsion_size' must be between 1 and $columns_per_page" + } + #todo + + + } else { + #DECSC + #//notes on expected behaviour: + #DECSC - saves following items in terminal's memory + #cursor position + #character attributes set by the SGR command + #character sets (G0,G1,G2 or G3) currently in GL and GR + #Wrap flag (autowrap or no autowrap) + #State of origin mode (DECOM) + #selective erase attribute + #any single shift 2 (SS2) or single shift 3(SSD) functions sent + + #$re_cursor_save + #cursor save could come after last column + if {$overflow_idx != -1 && $idx == $overflow_idx} { + #bartman2.ans test file - fixes misalignment at bottom of dialog bubble + #incr cursor_row + #set cursor_column 1 + #bwings1.ans test file - breaks if we actually incr cursor (has repeated saves) + set cursor_saved_position [list row [expr {$cursor_row+1}] column 1] + } else { + set cursor_saved_position [list row $cursor_row column $cursor_column] + } + #there may be overlay stackable codes emitted that aren't in the understacks because they come between the last emmited character and the cursor_save control. + #we need the SGR and gx overlay codes prior to the cursor_save + + #a real terminal would not be able to know the state of the underlay.. so we should probably ignore it. + #set sgr_stack [lindex $understacks $idx] + #set gx_stack [lindex $understacks_gx $idx] ;#not actually a stack - just a boolean state (for now?) + + set sgr_stack [list] + set gx_stack [list] + + #we shouldn't need to scan for intermediate cursor save/restores - as restores would throw-back to the calling loop - so our overlay 'line' is since those. + #The overlay_grapheme_control_list had leading resets from previous lines - so we go back to the beginning not just the first grapheme. + + foreach gc [lrange $overlay_grapheme_control_list 0 $gci-1] { + lassign $gc type code + #types g other sgr gx0 + switch -- $type { + gx0 { + #code is actually a stand-in for the graphics on/off code - not the raw code + #It is either gx0_on or gx0_off + set gx_stack [list $code] + } + sgr { + #code is the raw code + if {[punk::ansi::codetype::is_sgr_reset $code]} { + #jmn + set sgr_stack [list "\x1b\[m"] + } elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + set sgr_stack [list $code] + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[priv::is_sgr $code]} { + #often we don't get resets - and codes just pile up. + #as a first step to simplifying - at least remove earlier straight up dupes + set dup_posns [lsearch -all -exact $sgr_stack $code] ;#needs -exact - codes have square-brackets (glob chars) + set sgr_stack [lremove $sgr_stack {*}$dup_posns] + lappend sgr_stack $code + } + } + } + } + set cursor_saved_attributes "" + switch -- [lindex $gx_stack 0] { + gx0_on { + append cursor_saved_attributes "\x1b(0" + } + gx0_off { + append cursor_saved_attributes "\x1b(B" + } + } + #append cursor_saved_attributes [join $sgr_stack ""] + append cursor_saved_attributes [punk::ansi::codetype::sgr_merge_list {*}$sgr_stack] + + #as there is apparently only one cursor storage element we don't need to throw back to the calling loop for a save. + + #don't incr index - or the save will cause cursor to move to the right + #carry on + } + } + u { + #ANSISYSRC save cursor (when no parameters) (DECSC) + + #$re_cursor_restore + #we are going to jump somewhere.. for now we will assume another line, and process accordingly. + #The caller has the cursor_saved_position/cursor_saved_attributes if any (?review - if we always pass it back it, we could save some calls for moves in same line) + #don't set overflow at this point. The existing underlay to the right must be preserved. + #we only want to jump and render the unapplied at the new location. + + #lset overstacks $idx_over [list] + #set replay_codes_overlay "" + + #if {$cursor_saved_attributes ne ""} { + # set replay_codes_overlay $cursor_saved_attributes ;#empty - or last save if it happend in this input chunk + #} else { + #jj + #set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + set replay_codes_overlay "" + #} + + #like priv::render_unapplied - but without the overlay's ansi reset or gx stacks from before the restore code + incr idx_over + + set unapplied "" + set unapplied_list [list] + foreach gc [lrange $overlay_grapheme_control_list $gci+1 end] { + lassign $gc type item + if {$type eq "gx0"} { + if {$item eq "gx0_on"} { + lappend unapplied_list "\x1b(0" + } elseif {$item eq "gx0_off"} { + lappend unapplied_list "\x1b(B" + } + } else { + lappend unapplied_list $item + } + #incr idx_over + } + set unapplied [join $unapplied_list ""] + #if the save occured within this line - that's ok - it's in the return value list and caller can prepend for the next loop. + set instruction restore_cursor + break + } + "{" { + + puts stderr "renderline warning - CSI.. - unimplemented [ansistring VIEW -lf 1 -nul 1 $code]" + } + "}" { + set code_secondlast [tcl::string::index $codenorm end-1] + switch -exact -- $code_secondlast { + ' { + puts stderr "renderline warning - DECIC - Insert Column - CSI...' - unimplemented [ansistring VIEW -lf 1 -nul 1 $code]" + } + default { + puts stderr "renderline warning - CSI.. - unimplemented [ansistring VIEW -lf 1 -nul 1 $code]" + } + } + } + ~ { + set code_secondlast [tcl::string::index $codenorm end-1] ;#used for e.g CSI x '~ + switch -exact -- $code_secondlast { + ' { + #DECDC - editing sequence - Delete Column + puts stderr "renderline warning - DECDC - unimplemented" + } + default { + #$re_vt_sequence + lassign [split $param {;}] key mod + + #Note that f1 to f4 show as ESCOP|Q|R|S (VT220?) but f5+ show as ESC\[15~ + # + #e.g esc \[2~ insert esc \[2;2~ shift-insert + #mod - subtract 1, and then use bitmask + #shift = 1, (left)Alt = 2, control=4, meta=8 (meta seems to do nothing on many terminals on windows? Intercepted by windows?) + #puts stderr "vt key:$key mod:$mod code:[ansistring VIEW $code]" + if {$key eq "1"} { + #home + } elseif {$key eq "2"} { + #Insert + if {$mod eq ""} { + #no modifier key + set insert_mode [expr {!$insert_mode}] + #rather than set the cursor - we return the insert mode state so the caller can decide + } + } elseif {$key eq "3"} { + #Delete - presumably this shifts other chars in the line, with empty cells coming in from the end + switch -- $mod { + "" { + priv::render_delchar $idx + } + "5" { + #ctrl-del - delete to end of word (pwsh) - possibly word on next line if current line empty(?) + } + } + } elseif {$key eq "4"} { + #End + } elseif {$key eq "5"} { + #pgup + } elseif {$key eq "6"} { + #pgDn + } elseif {$key eq "7"} { + #Home + #?? + set idx [expr {$opt_colstart -1}] + set cursor_column 1 + } elseif {$key eq "8"} { + #End + } elseif {$key eq "11"} { + #F1 - or ESCOP or e.g shift F1 ESC\[1;2P + } elseif {$key eq "12"} { + #F2 - or ESCOQ + } elseif {$key eq "13"} { + #F3 - or ESCOR + } elseif {$key eq "14"} { + #F4 - or ESCOS + } elseif {$key eq "15"} { + #F5 or shift F5 ESC\[15;2~ + } elseif {$key eq "17"} { + #F6 + } elseif {$key eq "18"} { + #F7 + } elseif {$key eq "19"} { + #F8 + } elseif {$key eq "20"} { + #F9 + } elseif {$key eq "21"} { + #F10 + } elseif {$key eq "23"} { + #F11 + } elseif {$key eq "24"} { + #F12 + } + + } + } + + } + h - l { + #set mode unset mode + #we are matching only last char to get to this arm - but are there other sequences ending in h|l we need to handle? + + #$re_mode if first after CSI is "?" + #some docs mention ESC=h|l - not seen on windows terminals.. review + #e.g https://www2.math.upenn.edu/~kazdan/210/computer/ansi.html + set modegroup [tcl::string::index $codenorm 4] ;#e.g ? = + switch -exact -- $modegroup { + ? { + set smparams [tcl::string::range $codenorm 5 end-1] ;#params between ? and h|l + #one or more modes can be set + set smparam_list [split $smparams {;}] + foreach num $smparam_list { + switch -- $num { + "" { + #ignore empties e.g extra/trailing semicolon in params + } + 5 { + #DECSNM - reverse video + #How we simulate this to render within a block of text is an open question. + #track all SGR stacks and constantly flip based on the current SGR reverse state? + #It is the job of the calling loop to do this - so at this stage we'll just set the states + + if {$code_end eq "h"} { + #set (enable) + set reverse_mode 1 + } else { + #reset (disable) + set reverse_mode 0 + } + + } + 7 { + #DECAWM autowrap + if {$code_end eq "h"} { + #set (enable) + set autowrap_mode 1 + if {$opt_width ne "\uFFEF"} { + set overflow_idx $opt_width + } else { + #review - this is also the cursor position when adding a char at end of line? + set overflow_idx [expr {[llength $undercols]}] ;#index at which we would be *in* overflow a row move may still override it + } + #review - can idx ever be beyond overflow_idx limit when we change e.g with a width setting and cursor movements? + # presume not usually - but sanity check with warning for now. + if {$idx >= $overflow_idx} { + puts stderr "renderline warning - idx '$idx' >= overflow_idx '$overflow_idx' - unexpected" + } + } else { + #reset (disable) + set autowrap_mode 0 + #REVIEW! + set overflow_idx -1 + } + } + 25 { + if {$code_end eq "h"} { + #visible cursor + + } else { + #invisible cursor + + } + } + 117 { + #DECECM - Erase Color Mode + #https://invisible-island.net/ncurses/ncurses.faq.html + #The Erase color selection controls the background color used when text is erased or new + #text is scrolled on to the screen. Screen background causes newly erased areas or + #scrolled text to be written using color index zero, the screen background. This is VT + #and DECterm compatible. Text background causes erased areas or scrolled text to be + #written using the current text background color. This is PC console compatible and is + #the factory default. + + #see also: https://unix.stackexchange.com/questions/251726/clear-to-end-of-line-uses-the-wrong-background-color-in-screen + } + } + } + } + = { + set num [tcl::string::range $codenorm 5 end-1] ;#param between = and h|l + puts stderr "overtype::renderline CSI=...h|l code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + default { + #e.g CSI 4 h + set num [tcl::string::range $codenorm 4 end-1] ;#param before h|l + switch -exact -- $num { + 3 { + puts stderr "CRM MODE $code_end" + #CRM - Show control character mode + # 'No control functions are executed except LF,FF and VT which are represented in the CRM FONT before a CRLF(new line) is executed' + # + #use ansistring VIEW -nul 1 -lf 2 -ff 2 -vt 2 + #https://vt100.net/docs/vt510-rm/CRM.html + #NOTE - vt100 CRM always does auto-wrap at right margin. + #disabling auto-wrap in set-up or by sequence is disabled. + #We should default to turning off auto-wrap when crm_mode enabled.. but + #displaying truncated (on rhs) crm can still be very useful - and we have optimisation in overflow to avoid excess renderline calls (per grapheme) + #we therefore could reasonably put in an exception to allow auto_wrap to be disabled after crm_mode is engaged, + #although this would be potentially an annoying difference to some.. REVIEW + if {$code_end eq "h"} { + set crm_mode 1 + set autowrap_mode 1 + if {$opt_width ne "\uFFEF"} { + set overflow_idx $opt_width + } else { + #review - this is also the cursor position when adding a char at end of line? + set overflow_idx [expr {[llength $undercols]}] ;#index at which we would be *in* overflow a row move may still override it + } + } else { + set crm_mode 0 + } + } + 4 { + #IRM - Insert/Replace Mode + if {$code_end eq "h"} { + #CSI 4 h + set insert_mode 1 + } else { + #CSI 4 l + #replace mode + set insert_mode 0 + } + } + default { + puts stderr "overtype::renderline CSI...h|l code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + } + } + } + } + | { + switch -- [tcl::string::index $codenorm end-1] { + {$} { + #CSI ... $ | DECSCPP set columns per page- (recommended in vt510 docs as preferable to DECCOLM) + #real terminals generally only supported 80/132 + #some other virtuals support any where from 2 to 65,536? + #we will allow arbitrary widths >= 2 .. to some as yet undetermined limit. + #CSI $ | + #empty or 0 param is 80 for compatibility - other numbers > 2 accepted + set page_width -1 ;#flag as unset + if {$param eq ""} { + set page_width 80 + } elseif {[string is integer -strict $param] && $param >=2 0} { + set page_width [expr {$param}] ;#we should allow leading zeros in the number - but lets normalize using expr + } else { + puts stderr "overtype::renderline unacceptable DECSPP value '$param'" + } + + if {$page_width > 2} { + puts stderr "overtype::renderline DECSCPP - not implemented - but selected width '$page_width' looks ok" + #if cursor already beyond new page_width - will move to right colum - otherwise no cursor movement + + } + + } + default { + puts stderr "overtype::renderline unrecognised CSI code ending in pipe (|) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + } + } + default { + puts stderr "overtype::renderline CSI code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + } + } + 7ESC { + # + #re_other_single {\x1b(D|M|E)$} + #also vt52 Y.. + #also PM \x1b^...(ST) + switch -- [tcl::string::index $codenorm 4] { + c { + #RIS - reset terminal to initial state - where 'terminal' in this case is the renderspace - not the underlying terminal! + puts stderr "renderline reset" + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction reset + break + } + D { + #\x84 + #index (IND) + #vt102-docs: "Moves cursor down one line in same column. If cursor is at bottom margin, screen performs a scroll-up" + puts stderr "renderline ESC D not fully implemented" + incr cursor_row + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction down + #retain cursor_column + break + } + E { + #\x85 + #review - is behaviour different to lf? + #todo - possibly(?) same logic as handling above. i.e return instruction depends on where column_cursor is at the time we get NEL + #leave implementation until logic for is set in stone... still under review + #It's arguable NEL is a pure cursor movement as opposed to the semantic meaning of crlf or lf in a file. + # + #Next Line (NEL) "Move the cursor to the left margin on the next line. If the cursor is at the bottom margin, scroll the page up" + puts stderr "overtype::renderline ESC E unimplemented" + + } + H { + #\x88 + #Tab Set + puts stderr "overtype::renderline ESC H tab set unimplemented" + } + M { + #\x8D + #Reverse Index (RI) + #vt102-docs: "Moves cursor up one line in same column. If cursor is at top margin, screen performs a scroll-down" + puts stderr "overtype::renderline ESC M not fully implemented" + + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #move up + incr cursor_row -1 + if {$cursor_row < 1} { + set cursor_row 1 + } + #ensure rest of *overlay* is emitted to remainder + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction up ;#need instruction for scroll-down? + #retain cursor_column + break + } + N { + #\x8e - affects next character only + puts stderr "overtype::renderline single shift select G2 command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + O { + #\x8f - affects next character only + puts stderr "overtype::renderline single shift select G3 command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + P { + #\x90 + #DCS - shouldn't get here - handled in 7DCS branch + #similarly \] OSC (\x9d) and \\ (\x9c) ST + } + V { + #\x96 + + } + W { + #\x97 + } + X { + #\x98 + #SOS + if {[string index $code end] eq "\007"} { + set sos_content [string range $code 2 end-1] ;#ST is \007 + } else { + set sos_content [string range $code 2 end-2] ;#ST is \x1b\\ + } + #return in some useful form to the caller + #TODO! + lappend sos_list [list string $sos_content row $cursor_row column $cursor_column] + puts stderr "overtype::renderline ESCX SOS UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + ^ { + #puts stderr "renderline PM" + #Privacy Message. + if {[string index $code end] eq "\007"} { + set pm_content [string range $code 2 end-1] ;#ST is \007 + } else { + set pm_content [string range $code 2 end-2] ;#ST is \x1b\\ + } + #We don't want to render it - but we need to make it available to the application + #see the textblock library in punk, for the exception we make here for single backspace. + #It is unlikely to be encountered as a useful PM - so we hack to pass it through as a fix + #for spacing issues on old terminals which miscalculate the single-width 'Symbols for Legacy Computing' + if {$pm_content eq "\b"} { + #puts stderr "renderline PM sole backspace special handling for \U1FB00 - \U1FBFF" + #esc^\b\007 or esc^\besc\\ + #HACKY pass-through - targeting terminals that both mis-space legacy symbols *and* don't support PMs + #The result is repair of the extra space. If the terminal is a modern one and does support PM - the \b should be hidden anyway. + #If the terminal has the space problem AND does support PMs - then this just won't fix it. + #The fix relies on the symbol-supplier to cooperate by appending esc^\b\esc\\ to the problematic symbols. + + #priv::render_addchar $idx $code [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #idx has been incremented after last grapheme added + priv::render_append_to_char [expr {$idx -1}] $code + } + #lappend to a dict element in the result for application-specific processing + lappend pm_list $pm_content + } + _ { + #APC Application Program Command + #just warn for now.. + puts stderr "overtype::renderline ESC_ APC command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + default { + puts stderr "overtype::renderline ESC code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented codenorm:[ansistring VIEW -lf 1 -vt 1 -nul 1 $codenorm]" + } + } + + } + 7DCS - 8DCS { + puts stderr "overtype::renderline DCS - DEVICE CONTROL STRING command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + #ST (string terminator) \x9c or \x1b\\ + if {[tcl::string::index $codenorm end] eq "\x9c"} { + set code_content [tcl::string::range $codenorm 4 end-1] ;#ST is 8-bit 0x9c + } else { + set code_content [tcl::string::range $codenorm 4 end-2] ;#ST is \x1b\\ + } + + } + 7OSC - 8OSC { + # OSCs are terminated with ST of either \007 or \x1b\\ - we allow either whether code was 7 or 8 bit + if {[tcl::string::index $codenorm end] eq "\007"} { + set code_content [tcl::string::range $codenorm 4 end-1] ;#ST is \007 + } else { + set code_content [tcl::string::range $codenorm 4 end-2] ;#ST is \x1b\\ + } + set first_colon [tcl::string::first {;} $code_content] + if {$first_colon == -1} { + #there probably should always be a colon - but we'll try to make sense of it without + set osc_code $code_content ;#e.g \x1b\]104\007 vs \x1b\]104\;\007 + } else { + set osc_code [tcl::string::range $code_content 0 $first_colon-1] + } + switch -exact -- $osc_code { + 2 { + set newtitle [tcl::string::range $code_content 2 end] + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction [list set_window_title $newtitle] + break + } + 4 { + #OSC 4 - set colour palette + #can take multiple params + #e.g \x1b\]4\;1\;red\;2\;green\x1b\\ + set params [tcl::string::range $code_content 2 end] ;#strip 4 and first semicolon + set cmap [dict create] + foreach {cnum spec} [split $params {;}] { + if {$cnum >= 0 and $cnum <= 255} { + #todo - parse spec from names like 'red' to RGB + #todo - accept rgb:ab/cd/ef as well as rgb:/a/b/c (as alias for aa/bb/cc) + #also - what about rgb:abcd/defg/hijk and 12-bit abc/def/ghi ? + dict set cmap $cnum $spec + } else { + #todo - log + puts stderr "overtype::renderline OSC 4 set colour palette - bad color number: $cnum must be from 0 to 255. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + puts stderr "overtype::renderline OSC 4 set colour palette unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + + + } + 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 { + #OSC 10 through 17 - so called 'dynamic colours' + #can take multiple params - each successive parameter changes the next colour in the list + #- e.g if code started at 11 - next param is for 12. 17 takes only one param because there are no more + #10 change text foreground colour + #11 change text background colour + #12 change text cursor colour + #13 change mouse foreground colour + #14 change mouse background colour + #15 change tektronix foreground colour + #16 change tektronix background colour + #17 change highlight colour + set params [tcl::string::range $code_content 2 end] + + puts stderr "overtype::renderline OSC $osc_code set dynamic colours unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + + + } + 18 { + #why is this not considered one of the dynamic colours above? + #https://www.xfree86.org/current/ctlseqs.html + #tektronix cursor color + puts stderr "overtype::renderline OSC 18 - set tektronix cursor color unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + 99 { + #kitty desktop notifications + #https://sw.kovidgoyal.net/kitty/desktop-notifications/ + # 99 ; metadata ; payload + puts stderr "overtype::renderline OSC 99 kitty desktop notification unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + 104 { + #reset colour palette + #we want to do it for the current rendering context only (vvt) - not just pass through to underlying vt + puts stderr "overtype::renderline OSC 104 reset colour palette unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction [list reset_colour_palette] + break + } + 1337 { + #iterm2 graphics and file transfer + puts stderr "overtype::renderline OSC 1337 iterm2 graphics/file_transfer unimplemented. 1st 100 chars of code [ansistring VIEW -lf 1 -vt 1 -nul 1 [string range $code 0 99]]" + } + 5113 { + puts stderr "overtype::renderline OSC 5113 kitty file transfer unimplemented. 1st 100 chars of code [ansistring VIEW -lf 1 -vt 1 -nul 1 [string range $code 0 99]]" + } + default { + puts stderr "overtype::renderline OSC - UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + default { + } + } + + + } + default { + #don't need to handle sgr or gx0 types + #we have our sgr gx0 codes already in stacks for each overlay grapheme + } + } + } + + #-------- + if {$opt_expand_right == 0} { + #need to truncate to the width of the original undertext + #review - string_width vs printing_length here. undertext requirement to be already rendered therefore punk::char::string_width ok? + #set num_under_columns [punk::char::string_width $pt_underchars] ;#plaintext underchars + } + if {$overflow_idx == -1} { + #overflow was initially unlimited and hasn't been overridden + } else { + + } + #-------- + + + #coalesce and replay codestacks for outcols grapheme list + set outstring "" ;#output prior to overflow + set overflow_right "" ;#remainder after overflow point reached + set i 0 + set cstack [list] + set prevstack [list] + set prev_g0 [list] + #note overflow_idx may already have been set lower if we had a row move above due to \v or ANSI moves + set in_overflow 0 ;#used to stop char-width scanning once in overflow + if {$overflow_idx == 0} { + #how does caller avoid an infinite loop if they have autowrap on and keep throwing graphemes to the next line? REVIEW + set in_overflow 1 + } + set trailing_nulls 0 + foreach ch [lreverse $outcols] { + if {$ch eq "\u0000"} { + incr trailing_nulls + } else { + break + } + } + if {$trailing_nulls} { + set first_tail_null_posn [expr {[llength $outcols] - $trailing_nulls}] + } else { + set first_tail_null_posn -1 + } + + #puts stderr "first_tail_null_posn: $first_tail_null_posn" + #puts stderr "colview: [ansistring VIEW $outcols]" + + foreach ch $outcols { + #puts "---- [ansistring VIEW $ch]" + + set gxleader "" + if {$i < [llength $understacks_gx]} { + #set g0 [tcl::dict::get $understacks_gx $i] + set g0 [lindex $understacks_gx $i] + if {$g0 ne $prev_g0} { + if {$g0 eq [list "gx0_on"]} { + set gxleader "\x1b(0" + } else { + set gxleader "\x1b(B" + } + } + set prev_g0 $g0 + } else { + set prev_g0 [list] + } + + set sgrleader "" + if {$i < [llength $understacks]} { + #set cstack [tcl::dict::get $understacks $i] + set cstack [lindex $understacks $i] + if {$cstack ne $prevstack} { + if {[llength $prevstack] && ![llength $cstack]} { + #This reset is important e.g testfile fruit.ans - we get overhang on rhs without it. But why is cstack empty? + append sgrleader \033\[m + } else { + append sgrleader [punk::ansi::codetype::sgr_merge_list {*}$cstack] + } + } + set prevstack $cstack + } else { + set prevstack [list] + } + + + + if {$in_overflow} { + if {$i == $overflow_idx} { + set 0 [lindex $understacks_gx $i] + set gxleader "" + if {$g0 eq [list "gx0_on"]} { + set gxleader "\x1b(0" + } elseif {$g0 eq [list "gx0_off"]} { + set gxleader "\x1b(B" + } + append overflow_right $gxleader + set cstack [lindex $understacks $i] + set sgrleader "" + #whether cstack is same or differs from previous char's stack - we must have an output at the start of the overflow_right + #if {[llength $prevstack] && ![llength $cstack]} { + # append sgrleader \033\[m + #} + append sgrleader [punk::ansi::codetype::sgr_merge_list {*}$cstack] + append overflow_right $sgrleader + append overflow_right $ch + } else { + append overflow_right $gxleader + append overflow_right $sgrleader + append overflow_right $ch + } + } else { + if {$overflow_idx != -1 && $i+1 == $overflow_idx} { + #one before overflow + #will be in overflow in next iteration + set in_overflow 1 + if {[grapheme_width_cached $ch]> 1} { + #we overflowed with second-half of a double-width char - replace first-half with user-supplied exposition char (should be 1 wide) + set ch $opt_exposed1 + } + } + append outstring $gxleader + append outstring $sgrleader + if {$ch eq "\u0000"} { + if {$cp437_glyphs} { + #map all nulls including at tail to space + append outstring " " + } else { + if {$trailing_nulls && $i < $first_tail_null_posn} { + append outstring " " ;#map inner nulls to space + } else { + append outstring \u0000 + } + } + } else { + append outstring $ch + } + } + incr i + } + #flower.ans good test for null handling - reverse line building + #review - presence of overflow_right doesn't indicate line's trailing nulls should remain. + #The cells could have been erased? + #if {!$cp437_glyphs} { + # #if {![ansistring length $overflow_right]} { + # # set outstring [tcl::string::trimright $outstring "\u0000"] + # #} + # set outstring [tcl::string::trimright $outstring "\u0000"] + # set outstring [tcl::string::map {\u0000 " "} $outstring] + #} + + + #REVIEW + #set overflow_right [tcl::string::trimright $overflow_right "\u0000"] + #set overflow_right [tcl::string::map {\u0000 " "} $overflow_right] + + set replay_codes "" + if {[llength $understacks] > 0} { + if {$overflow_idx == -1} { + #set tail_idx [tcl::dict::size $understacks] + set tail_idx [llength $understacks] + } else { + set tail_idx [llength $undercols] + } + if {$tail_idx-1 < [llength $understacks]} { + #set replay_codes [join [lindex $understacks $tail_idx-1] ""] ;#tail replay codes + set replay_codes [punk::ansi::codetype::sgr_merge_list {*}[lindex $understacks $tail_idx-1]] ;#tail replay codes + } + if {$tail_idx-1 < [llength $understacks_gx]} { + set gx0 [lindex $understacks_gx $tail_idx-1] + if {$gx0 eq [list "gx0_on"]} { + #if it was on, turn gx0 off at the point we stop processing overlay + append outstring "\x1b(B" + } + } + } + if {[string length $overflow_right]} { + #puts stderr "remainder:$overflow_right" + } + #pdict $understacks + + if {[punk::ansi::ta::detect_sgr $outstring]} { + append outstring [punk::ansi::a] ;#without this - we would get for example, trailing backgrounds after rightmost column + + #close off any open gx? + #probably should - and overflow_right reopen? + } + + if {$opt_returnextra} { + #replay_codes is the codestack at the boundary - used for ellipsis colouring to match elided text - review + #replay_codes_underlay is the set of codes in effect at the very end of the original underlay + + #review + #replay_codes_overlay is the set of codes in effect at the very end of the original overlay (even if not all overlay was applied) + #todo - replay_codes for gx0 mode + + #overflow_idx may change during ansi & character processing + if {$overflow_idx == -1} { + set overflow_right_column "" + } else { + set overflow_right_column [expr {$overflow_idx+1}] + } + set result [tcl::dict::create\ + result $outstring\ + visualwidth [punk::ansi::printing_length $outstring]\ + instruction $instruction\ + stringlen [string length $outstring]\ + overflow_right_column $overflow_right_column\ + overflow_right $overflow_right\ + unapplied $unapplied\ + unapplied_list $unapplied_list\ + insert_mode $insert_mode\ + autowrap_mode $autowrap_mode\ + crm_mode $crm_mode\ + reverse_mode $reverse_mode\ + insert_lines_above $insert_lines_above\ + insert_lines_below $insert_lines_below\ + cursor_saved_position $cursor_saved_position\ + cursor_saved_attributes $cursor_saved_attributes\ + cursor_column $cursor_column\ + cursor_row $cursor_row\ + expand_right $opt_expand_right\ + replay_codes $replay_codes\ + replay_codes_underlay $replay_codes_underlay\ + replay_codes_overlay $replay_codes_overlay\ + pm_list $pm_list\ + ] + if {$opt_returnextra == 1} { + #puts stderr "renderline: $result" + return $result + } else { + #human/debug - map special chars to visual glyphs + set viewop VIEW + switch -- $opt_returnextra { + 2 { + #codes and character data + set viewop VIEWCODES ;#ansi colorisation of codes - green for SGR, blue/blue reverse for cursor_save/cursor_restore, cyan for movements, orange for others + } + 3 { + set viewop VIEWSTYLE ;#ansi colorise the characters within the output with preceding codes, stacking codes only within each dict value - may not be same SGR effect as the effect in-situ. + } + } + tcl::dict::set result result [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result result]] + tcl::dict::set result overflow_right [ansistring VIEW -lf 1 -vt 1 [tcl::dict::get $result overflow_right]] + tcl::dict::set result unapplied [ansistring VIEW -lf 1 -vt 1 [tcl::dict::get $result unapplied]] + tcl::dict::set result unapplied_list [ansistring VIEW -lf 1 -vt 1 [tcl::dict::get $result unapplied_list]] + tcl::dict::set result replay_codes [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result replay_codes]] + tcl::dict::set result replay_codes_underlay [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result replay_codes_underlay]] + tcl::dict::set result replay_codes_overlay [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result replay_codes_overlay]] + tcl::dict::set result cursor_saved_attributes [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result cursor_saved_attributes]] + return $result + } + } else { + #puts stderr "renderline returning: result $outstring instruction $instruction unapplied $unapplied overflow_right $overflow_right" + return $outstring + } + #return [join $out ""] + } + + #*** !doctools + #[list_end] [comment {--- end definitions namespace overtype ---}] +} + +tcl::namespace::eval overtype::piper { + proc overcentre {args} { + if {[llength $args] < 2} { + error {usage: ?-bias left|right? ?-transparent [0|1|]? ?-exposed1 ? ?-exposed2 ? ?-overflow [1|0]? overtext pipelinedata} + } + lassign [lrange $args end-1 end] over under + set argsflags [lrange $args 0 end-2] + tailcall overtype::centre {*}$argsflags $under $over + } + proc overleft {args} { + if {[llength $args] < 2} { + error {usage: ?-startcolumn ? ?-transparent [0|1|]? ?-exposed1 ? ?-exposed2 ? ?-overflow [1|0]? overtext pipelinedata} + } + lassign [lrange $args end-1 end] over under + set argsflags [lrange $args 0 end-2] + tailcall overtype::left {*}$argsflags $under $over + } +} + + +# -- --- --- --- --- --- --- --- --- --- --- +proc overtype::transparentline {args} { + foreach {under over} [lrange $args end-1 end] break + set argsflags [lrange $args 0 end-2] + set defaults [tcl::dict::create\ + -transparent 1\ + -exposed 1 " "\ + -exposed 2 " "\ + ] + set newargs [tcl::dict::merge $defaults $argsflags] + tailcall overtype::renderline {*}$newargs $under $over +} +#renderline may not make sense as it is in the long run for blocks of text - but is handy in the single-line-handling form anyway. +# We are trying to handle ansi codes in a block of text which is acting like a mini-terminal in some sense. +#We can process standard cursor moves such as \b \r - but no way to respond to other cursor movements e.g moving to other lines. +# +tcl::namespace::eval overtype::piper { + proc renderline {args} { + if {[llength $args] < 2} { + error {usage: ?-start ? ?-transparent [0|1|]? ?-overflow [1|0]? overtext pipelinedata} + } + foreach {over under} [lrange $args end-1 end] break + set argsflags [lrange $args 0 end-2] + tailcall overtype::renderline {*}$argsflags $under $over + } +} +interp alias "" piper_renderline "" overtype::piper::renderline + +#intended primarily for single grapheme - but will work for multiple +#WARNING: query CAN contain ansi or newlines - but if cache was not already set manually,the answer will be incorrect! +#We deliberately allow this for PM/SOS attached within a column +#(a cache of ansifreestring_width calls - as these are quite regex heavy) +proc overtype::grapheme_width_cached {ch} { + variable grapheme_widths + if {[tcl::dict::exists $grapheme_widths $ch]} { + return [tcl::dict::get $grapheme_widths $ch] + } + set width [punk::char::ansifreestring_width $ch] + tcl::dict::set grapheme_widths $ch $width + return $width +} + + + +proc overtype::test_renderline {} { + set t \uFF5E ;#2-wide tilde + set u \uFF3F ;#2-wide underscore + set missing \uFFFD + return [list $t $u A${t}B] +} + +#maintenance warning +#same as textblock::size - but we don't want that circular dependency +#block width and height can be tricky. e.g \v handled differently on different terminal emulators and can affect both +proc overtype::blocksize {textblock} { + if {$textblock eq ""} { + return [tcl::dict::create width 0 height 1] ;#no such thing as zero-height block - for consistency with non-empty strings having no line-endings + } + if {[tcl::string::first \t $textblock] >= 0} { + if {[info exists punk::console::tabwidth]} { + set tw $::punk::console::tabwidth + } else { + set tw 8 + } + set textblock [textutil::tabify::untabify2 $textblock $tw] + } + #ansistrip on entire block in one go rather than line by line - result should be the same - review - make tests + if {[punk::ansi::ta::detect $textblock]} { + set textblock [punk::ansi::ansistrip $textblock] + } + if {[tcl::string::last \n $textblock] >= 0} { + set num_le [expr {[tcl::string::length $textblock]-[tcl::string::length [tcl::string::map {\n {}} $textblock]]}] ;#faster than splitting into single-char list + set width [tcl::mathfunc::max {*}[lmap v [split $textblock \n] {::punk::char::ansifreestring_width $v}]] + } else { + set num_le 0 + set width [punk::char::ansifreestring_width $textblock] + } + #our concept of block-height is likely to be different to other line-counting mechanisms + set height [expr {$num_le + 1}] ;# one line if no le - 2 if there is one trailing le even if no data follows le + + return [tcl::dict::create width $width height $height] ;#maintain order in 'image processing' standard width then height - caller may use lassign [dict values [blocksize ]] width height +} + +tcl::namespace::eval overtype::priv { + variable cache_is_sgr [tcl::dict::create] + + #we are likely to be asking the same question of the same ansi codes repeatedly + #caching the answer saves some regex expense - possibly a few uS to lookup vs under 1uS + #todo - test if still worthwhile after a large cache is built up. (limit cache size?) + proc is_sgr {code} { + variable cache_is_sgr + if {[tcl::dict::exists $cache_is_sgr $code]} { + return [tcl::dict::get $cache_is_sgr $code] + } + set answer [punk::ansi::codetype::is_sgr $code] + tcl::dict::set cache_is_sgr $code $answer + return $answer + } + # better named render_to_unapplied? + proc render_unapplied {overlay_grapheme_control_list gci} { + upvar idx_over idx_over + upvar unapplied unapplied + upvar unapplied_list unapplied_list ;#maintaining as a list allows caller to utilize it without having to re-split + upvar overstacks overstacks + upvar overstacks_gx overstacks_gx + upvar overlay_grapheme_control_stacks og_stacks + + #set unapplied [join [lrange $overlay_grapheme_control_list $gci+1 end]] + set unapplied "" + set unapplied_list [list] + #append unapplied [join [lindex $overstacks $idx_over] ""] + #append unapplied [punk::ansi::codetype::sgr_merge_list {*}[lindex $overstacks $idx_over]] + set sgr_merged [punk::ansi::codetype::sgr_merge_list {*}[lindex $og_stacks $gci]] + if {$sgr_merged ne ""} { + lappend unapplied_list $sgr_merged + } + switch -- [lindex $overstacks_gx $idx_over] { + "gx0_on" { + lappend unapplied_list "\x1b(0" + } + "gx0_off" { + lappend unapplied_list "\x1b(B" + } + } + + foreach gc [lrange $overlay_grapheme_control_list $gci+1 end] { + lassign $gc type item + #types g other sgr gx0 + if {$type eq "gx0"} { + if {$item eq "gx0_on"} { + lappend unapplied_list "\x1b(0" + } elseif {$item eq "gx0_off"} { + lappend unapplied_list "\x1b(B" + } + } else { + lappend unapplied_list $item + } + } + set unapplied [join $unapplied_list ""] + } + + #clearer - renders the specific gci forward as unapplied - prefixed with it's merged sgr stack + proc render_this_unapplied {overlay_grapheme_control_list gci} { + upvar idx_over idx_over + upvar unapplied unapplied + upvar unapplied_list unapplied_list + upvar overstacks overstacks + upvar overstacks_gx overstacks_gx + upvar overlay_grapheme_control_stacks og_stacks + + #set unapplied [join [lrange $overlay_grapheme_control_list $gci+1 end]] + set unapplied "" + set unapplied_list [list] + + set sgr_merged [punk::ansi::codetype::sgr_merge_list {*}[lindex $og_stacks $gci]] + if {$sgr_merged ne ""} { + lappend unapplied_list $sgr_merged + } + switch -- [lindex $overstacks_gx $idx_over] { + "gx0_on" { + lappend unapplied_list "\x1b(0" + } + "gx0_off" { + lappend unapplied_list "\x1b(B" + } + } + + foreach gc [lrange $overlay_grapheme_control_list $gci end] { + lassign $gc type item + #types g other sgr gx0 + if {$type eq "gx0"} { + if {$item eq "gx0_on"} { + lappend unapplied_list "\x1b(0" + } elseif {$item eq "gx0_off"} { + lappend unapplied_list "\x1b(B" + } + } else { + lappend unapplied_list $item + } + } + set unapplied [join $unapplied_list ""] + } + proc render_delchar {i} { + upvar outcols o + upvar understacks ustacks + upvar understacks_gx gxstacks + set nxt [llength $o] + if {$i < $nxt} { + set o [lreplace $o $i $i] + set ustacks [lreplace $ustacks $i $i] + set gxstacks [lreplace $gxstacks $i $i] + } elseif {$i == 0 || $i == $nxt} { + #nothing to do + } else { + puts stderr "render_delchar - attempt to delchar at index $i >= number of outcols $nxt - shouldn't happen" + } + } + proc render_erasechar {i count} { + upvar outcols o + upvar understacks ustacks + upvar understacks_gx gxstacks + upvar replay_codes_overlay replay + #ECH clears character attributes from erased character positions + #ECH accepts 0 or empty parameter, which is equivalent to 1. Caller of render_erasechar should do that mapping and only supply 1 or greater. + if {![tcl::string::is integer -strict $count] || $count < 1} { + error "render_erasechar count must be integer >= 1" + } + set start $i + set end [expr {$i + $count -1}] + #we restrict ECH to current line - as some terminals do - review - is that the only way it's implemented? + if {$i > [llength $o]-1} { + return + } + if {$end > [llength $o]-1} { + set end [expr {[llength $o]-1}] + } + set num [expr {$end - $start + 1}] + set o [lreplace $o $start $end {*}[lrepeat $num \u0000]] ;#or space? + #DECECM ??? + set ustacks [lreplace $ustacks $start $end {*}[lrepeat $num [list $replay]]] + set gxstacks [lreplace $gxstacks $start $end {*}[lrepeat $num [list]]] ;# ??? review + return + } + proc render_setchar {i c } { + upvar outcols o + lset o $i $c + } + + #Initial usecase is for old-terminal hack to add PM-wrapped \b + #review - can be used for other multibyte sequences that occupy one column? + #combiners? diacritics? + proc render_append_to_char {i c} { + upvar outcols o + if {$i > [llength $o]-1} { + error "render_append_to_char cannot append [ansistring VIEW -lf 1 -nul 1 $c] to existing char at index $i while $i >= llength outcols [llength $o]" + } + set existing [lindex $o $i] + if {$existing eq "\0"} { + lset o $i $c + } else { + lset o $i $existing$c + } + } + #is actually addgrapheme? + proc render_addchar {i c sgrstack gx0stack {insert_mode 0}} { + upvar outcols o + upvar understacks ustacks + upvar understacks_gx gxstacks + + # -- --- --- + #this is somewhat of a hack.. probably not really the equivalent of proper reverse video? review + #we should ideally be able to reverse the video of a sequence that already includes SGR reverse/noreverse attributes + upvar reverse_mode do_reverse + #if {$do_reverse} { + # lappend sgrstack [a+ reverse] + #} else { + # lappend sgrstack [a+ noreverse] + #} + + #JMN3 + if {$do_reverse} { + #note we can't just look for \x1b\[7m or \x1b\[27m + # it may be a more complex sequence like \x1b\[0\;\;7\;31m etc + + set existing_reverse_state 0 + set codeinfo [punk::ansi::codetype::sgr_merge $sgrstack -info 1] + set codestate_reverse [dict get $codeinfo codestate reverse] + switch -- $codestate_reverse { + 7 { + set existing_reverse_state 1 + } + 27 { + set existing_reverse_state 0 + } + "" { + } + } + if {$existing_reverse_state == 0} { + set rflip [a+ reverse] + } else { + #reverse of reverse + set rflip [a+ noreverse] + } + #note that mergeresult can have multiple esc (due to unmergeables or non sgr codes) + set sgrstack [list [dict get $codeinfo mergeresult] $rflip] + #set sgrstack [punk::ansi::codetype::sgr_merge [list [dict get $codeinfo mergeresult] $rflip]] + } + + # -- --- --- + + set nxt [llength $o] + if {!$insert_mode} { + if {$i < $nxt} { + #These lists must always be in sync + lset o $i $c + } else { + lappend o $c + } + if {$i < [llength $ustacks]} { + lset ustacks $i $sgrstack + lset gxstacks $i $gx0stack + } else { + lappend ustacks $sgrstack + lappend gxstacks $gx0stack + } + } else { + #insert of single-width vs double-width when underlying is double-width? + if {$i < $nxt} { + set o [linsert $o $i $c] + } else { + lappend o $c + } + if {$i < [llength $ustacks]} { + set ustacks [linsert $ustacks $i $sgrstack] + set gxstacks [linsert $gxstacks $i $gx0stack] + } else { + lappend ustacks $sgrstack + lappend gxstacks $gx0stack + } + } + } + +} + + + +# -- --- --- --- --- --- --- --- --- --- --- +tcl::namespace::eval overtype { + interp alias {} ::overtype::center {} ::overtype::centre +} + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Ready +package provide overtype [tcl::namespace::eval overtype { + variable version + set version 1.6.6 +}] +return + +#*** !doctools +#[manpage_end] diff --git a/src/bootsupport/modules/test/tomlish-1.1.5.tm b/src/bootsupport/modules/test/tomlish-1.1.5.tm index 536e3fa3..35de5e70 100644 Binary files a/src/bootsupport/modules/test/tomlish-1.1.5.tm and b/src/bootsupport/modules/test/tomlish-1.1.5.tm differ diff --git a/src/bootsupport/modules/tomlish-1.1.6.tm b/src/bootsupport/modules/tomlish-1.1.6.tm new file mode 100644 index 00000000..dddcd0bb --- /dev/null +++ b/src/bootsupport/modules/tomlish-1.1.6.tm @@ -0,0 +1,8408 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) 2024 +# +# @@ Meta Begin +# Application tomlish 1.1.6 +# Meta platform tcl +# Meta license +# @@ Meta End + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# doctools header +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[manpage_begin tomlish_module_tomlish 0 1.1.6] +#[copyright "2024"] +#[titledesc {tomlish toml parser}] [comment {-- Name section and table of contents description --}] +#[moddesc {tomlish}] [comment {-- Description at end of page heading --}] +#[require tomlish] +#[keywords module parsing toml configuration] +#[description] +#[para] tomlish is an intermediate representation of toml data in a tree structure (tagged lists representing type information) +#[para] The design goals are for tomlish to be whitespace and comment preserving ie byte-for byte preservation during roundtrips from toml to tomlish and back to toml +#[para] The tomlish representation can then be converted to a Tcl dict structure or to other formats such as json, +#[para] although these other formats are generally unlikely to retain whitespace or comments +#[para] The other formats also won't preserve roundtripability e.g \t and a literal tab coming from a toml file will be indistinguishable. +#[para] A further goal is to allow at least a useful subset of in-place editing operations which also preserve whitespace and comments. +#[para] e.g leaf key value editing, and table reordering/sorting, key-renaming at any level, key insertions/deletions +#[para] The API for editing (tomldoc object?) may require explicit setting of type if accessing an existing key +#[para] e.g setting a key that already exists and is a different type (especially if nested structure such as a table or array) +#[para] will need a -type option (-force ?) to force overriding with another type such as an int. + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section Overview] +#[para] overview of tomlish +#[subsection Concepts] +#[para] - + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[subsection dependencies] +#[para] packages used by tomlish +#[list_begin itemized] + +package require Tcl 8.6- +package require struct::stack +package require logger + +#*** !doctools +#[item] [package {Tcl 8.6-}] +#[item] [package {struct::stack}] + +#limit ourselves to clear, destroy, peek, pop, push, rotate, or size (e.g v 1.3 does not implement 'get') + + +#*** !doctools +#[list_end] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section API] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# Base namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval tomlish { + namespace export {[a-z]*}; # Convention: export all lowercase + variable types + + #*** !doctools + #[subsection {Namespace tomlish}] + #[para] Core API functions for tomlish + #[list_begin definitions] + + #default interp recursionlimit of 1000 is insufficient to pass 1000 deep nested structures as in certain toml tests. + #e.g https://github.com/iarna/toml-spec-tests/tree/latest/values + #1000 seems deep for a 'configuration' format - but toml sometimes used for other serialisation purposes. + #todo - review + set existing_recursionlimit [interp recursionlimit {}] + if {$existing_recursionlimit < 5000} { + interp recursionlimit {} 5000 + } + + #IDEAS: + # since get_toml produces tomlish with whitespace/comments intact: + # tomldoc object - allow (at least basic?) editing of toml whilst preserving comments & whitespace + # - setKey (set leaf only to value) how to specify type? -type option? - whole array vs index into arrays and further nested objects? - option for raw toml additions? + # - separate addKey?? + # - deleteKey (delete leaf) + # - deleteTable (delete table - if only has leaves? - option to delete with child tables?) + # - set/add Table? - position in doc based on existing tables/subtables? + + #The tomlish intermediate representation allows things such as sorting the toml document by table name or other re-ordering of tables - + # because the tables include subkeys, comments and newlines within their structure - those elements all come along with it nicely during reordering. + #The same goes for the first newline following a keyval e.g x=1 \ny=2\n\n + #The newline is part of the keyval structure so makes reordering easier + #example from_toml "a=1\nb=2\n\n\n" + # 0 = TOMLISH + # 1 = KEY a = {INT 1} {NEWLINE lf} + # 2 = NEWLINE lf + # 3 = KEY b = {INT 2} {NEWLINE lf} + # 4 = NEWLINE lf + # 5 = NEWLINE lf + + #This reordering idea is complicated by the nature of tablearrays - especially as a table header references last tablearrayname, + # and duplicate table headers are allowed in that context. + #e.g + #[[fruits]] + # name="apple" + # [fruits.metadata] + # id=1 + # + #[unrelated1] + # + #[[fruits]] + # name="pear" + # + #[unrelated2] + # silly="ordering" + # + #[fruits.metadata] + #id=2 + #The TABLEARRAY record can't be completely selfcontained on the default parsing mechanism - because it is legal (though not recommended) to have unrelated tables in between. + #If we were to 'insert' later related records (such as the 2nd [fruits.metadata] above) into the TABLEARRAY structure - then, even though it might produce 'nicer' toml, + # we would lose roundtripability toml->tomlish->toml + # ----------------------------------------------------- + #REVIEW + #todo - some sort of 'normalize'/'grouping' function on tomlish that at least makes records self-contained, and perhaps then (optionally) reorders resulting records sensibly. + #such a function on the tomlish may work - although it would be unwise to duplicate the validation aspects of dict::from_tomlish + #The most practical way might be to use dict::from_tomlish followed by from_dict - but that would lose comment info and formatting. + #In the above example - The decision by the toml author to put [unrelated1] between related tablearrays should be respected, + #but the positioning of [unrelated2] between a tablearray and one of its contained tables is suspect. + #Both [fruits.metadata] table records should theoretically be added as children to their corresponding [[fruits]] tablearray record in the tomlish. (just as their name keys are) + # ----------------------------------------------------- + + + + #ARRAY is analogous to a Tcl list + #TABLE is analogous to a Tcl dict + #WS = inline whitespace + #KEY = bare key and value + #DQKEY = double quoted key and value + #SQKEY = single quoted key and value + #ITABLE = inline table (*can* be anonymous table) + # inline table values immediately create a table with the opening brace + # inline tables are fully defined between their braces, as are dotted-key subtables defined within + # No additional subtables or arrays of tables may be defined within an inline table after the ending brace - they must be entirely self-contained + + set tags [list TOMLISH BOM ARRAY TABLE ITABLE TABLEARRAY WS NEWLINE COMMENT DOTTEDKEY KEY DQKEY SQKEY STRING STRINGPART MULTISTRING LITERAL LITERALPART MULTILITERAL INT FLOAT BOOL] + #DDDD + lappend tags {*}[list\ + DATETIME\ + DATETIME-LOCAL\ + DATE-LOCAL\ + TIME-LOCAL\ + ] + + #removed - ANONTABLE + #tomlish v1.0 should accept arbitrary 64-bit signed ints (from -2^63 to 2^63-1) + #we will restrict to this range for compatibility for now - although Tcl can handle larger (arbitrarily so?) + #todo - configurable - allow empty string for 'unlimited' + set min_int -9223372036854775808 ;#-2^63 + set max_int +9223372036854775807 ;#2^63-1 + + proc Dolog {lvl txt} { + #return "$lvl -- $txt" + set msg "[clock format [clock seconds] -format "%Y-%m-%dT%H:%M:%S"] tomlish '$txt'" + puts stderr $msg + } + logger::initNamespace ::tomlish + foreach lvl [logger::levels] { + interp alias {} tomlish_log_$lvl {} ::tomlish::Dolog $lvl + log::logproc $lvl tomlish_log_$lvl + } + + + proc tags {} { + return $::tomlish::tags + } + + proc get_dottedkey_info {dottedkeyrecord} { + set key_hierarchy [list] + set key_hierarchy_raw [list] + if {[lindex $dottedkeyrecord 0] ne "DOTTEDKEY"} { + error "tomlish::get_dottedkey_info error. Supplied list doesn't appear to be a DOTTEDKEY (tag: [lindex $dottedkeyrecord 0])" + } + set compoundkeylist [lindex $dottedkeyrecord 1] + set expect_sep 0 + foreach part $compoundkeylist { + set parttag [lindex $part 0] + if {$parttag eq "WS"} { + continue + } + if {$expect_sep} { + if {$parttag ne "DOTSEP"} { + error "DOTTEDKEY missing dot separator between parts. '$dottedkeyrecord'" + } + set expect_sep 0 + } else { + set val [lindex $part 1] + switch -exact -- $parttag { + KEY { + lappend key_hierarchy $val + lappend key_hierarchy_raw $val + } + DQKEY { + #REVIEW unescape or not? + #JJJJ + lappend key_hierarchy [::tomlish::utils::unescape_string $val] + lappend key_hierarchy_raw \"$val\" + } + SQKEY { + lappend key_hierarchy $val + lappend key_hierarchy_raw "'$val'" + } + default { + error "tomlish::get_dottedkey_info DOTTED key unexpected part '$parttag' - ensure dot separator is between key parts. '$compoundkeylist'" + } + } + set expect_sep 1 + } + } + return [dict create keys $key_hierarchy keys_raw $key_hierarchy_raw] + } + + #helper function for tomlish::dict::from_tomlish + proc _get_keyval_value {keyval_element} { + #e.g + #DOTTEDKEY {{KEY a} {WS { }}} = {WS { }} {ARRAY {INT 1} SEP {ITABLE {DOTTEDKEY {{KEY x}} = {INT 1} SEP} {DOTTEDKEY {{KEY y}} = {INT 2}}}} + + log::notice ">>> _get_keyval_value from '$keyval_element'<<<" + #find the value (or 2 values if space separated datetime - and stitch back into one) + # 3 is the earliest index at which the value could occur (depending on whitespace) + if {[lindex $keyval_element 2] ne "="} { + error "tomlish _get_keyval_value keyval_element doesn't seem to be a properly structured { = } list\n $keyval_element" + } + + #review + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info + } else { + set tablenames_info [dict create] ;#keys are @@ paths {@@parenttable @@arrayable @@etc} corresponding to parenttable.arraytable[].etc + #value is a dict with keys such as ttype, tdefined + } + set sublist [lrange $keyval_element 3 end] ;# rhs of = + + set values [list] + set value_posns [list] + set posn 0 + foreach sub $sublist { + #note that a barekey/dquotedkey won't occur directly inside a barekey/dquotedkey + #DDDD + switch -exact -- [lindex $sub 0] { + STRING - LITERAL - MULTISTRING - MULTILITERAL - INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - TABLE - ARRAY - ITABLE { + lappend values $sub + lappend value_posns $posn + } + DOTTEDKEY { + #we should never see DOTTEDKEY as a toplevel element on RHS + #sanity check in case manually manipulated tomlish - or something went very wrong + set msg "tomlish::_get_keyval_value Unexpected toplevel value element DOTTEDKEY after =" + return -code error -errorcode {TOMLISH SYNTAX UNEXPECTEDDOTTEDKEYRHS} $msg + } + WS - NEWLINE - COMMENT {} + SEP {} + default { + set msg "tomlish::_get_keyval_value Unexpected toplevel value element [lindex $sub 0] after =" + return -code error -errorcode {TOMLISH SYNTAX UNEXPECTED} $msg + } + } + incr posn + } + switch -- [llength $values] { + 0 { + error "tomlish Failed to find value element in KEY. '$keyval_element'" + } + 1 { + lassign [lindex $values 0] type value + } + 2 { + #we generally expect a single 'value' item on RHS of = + #(ignoring WS,NEWLINE,SEP + #(either a simple type, or a container which has multiple values inside) + #exception for space separated datetime which is two toplevel values + + #validate than exactly single space was between the two values + lassign $value_posns p1 p2 + if {$p2 != $p1 +2} { + #sanity check + #can probably only get here through manual manipulation of the tomlish list to an unprocessable form + error "tomlish KEY appears to have more than one part - but not separated by whitespace - invalid '$keyval_element'" + } + set between_token [lindex $sublist $p1+1] + if {[lindex $between_token 1] ne " "} { + error "tomlish KEY in 2 parts is not separated by a single space - cannot consider for datetime '$keyval_element'" + } + lassign [lindex $values 0] type_d1 value_d1 + lassign [lindex $values 1] type_d2 value_d2 + #DDDD + if {$type_d1 ne "DATE-LOCAL" || $type_d2 ni {DATETIME TIME-LOCAL}} { + #we reuse DATETIME tag for standalone time with tz offset (or zZ) + error "tomlish KEY in 2 parts does not appear to be datetime '$keyval_element'" + } + if {$type_d2 eq "TIME-LOCAL"} { + set type DATETIME-LOCAL + } else { + #extra check that 2nd part is actually a time + if {![tomlish::utils::is_timepart $value_d2]} { + error "tomlish KEY in 2 parts does not appear to be datetime. (part 2 not a time value) '$keyval_element'" + } + set type DATETIME + } + set value "${value_d1}T${value_d2}" + } + default { + error "tomlish Found multiple value elements in KEY, expected one. (or 2 for space-separated datetime) '$keyval_element'" + } + } + set sub_tablenames_info [dict create] + switch -exact -- $type { + INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + #DDDD + #simple (non-container, no-substitution) datatype + set result [list type $type value $value] + } + STRING - STRINGPART { + #JJJ + #!!! review + #set result [list type $type value [::tomlish::utils::unescape_string $value]] + set result [list type $type value $value] + } + LITERAL - LITERALPART { + #REVIEW + set result [list type $type value $value] + } + TABLE { + #invalid? + error "tomlish _get_keyval_value invalid to have type TABLE on rhs of =" + } + ITABLE { + #This one should not be returned as a type value structure! + # + set prev_tablenames_info $tablenames_info + set tablenames_info [dict create] + set result [::tomlish::dict::from_tomlish [ list [lindex $values 0] ]] + set sub_tablenames_info $tablenames_info + set tablenames_info $prev_tablenames_info + } + ARRAY { + #we need to recurse to get the corresponding dict for the contained item(s) + #pass in the whole [lindex $values 0] (type val) - not just the $value! + set prev_tablenames_info $tablenames_info + set tablenames_info [dict create] + set result [list type $type value [ ::tomlish::dict::from_tomlish [ list [lindex $values 0] ] ]] + set sub_tablenames_info $tablenames_info + set tablenames_info $prev_tablenames_info + } + MULTISTRING - MULTILITERAL { + #review - mapping these to STRING might make some conversions harder? + #if we keep the MULTI - we know we have to look for newlines for example when converting to json + #without specific types we'd have to check every STRING - and lose info about how best to map chars within it + set result [list type $type value [ ::tomlish::dict::from_tomlish [ list [lindex $values 0] ] ]] + } + default { + error "tomlish Unexpected value type '$type' found in keyval '$keyval_element'" + } + } + return [dict create result $result tablenames_info $sub_tablenames_info] + } + + + proc to_dict {tomlish} { + tomlish::dict::from_tomlish $tomlish + } + + + + proc _from_dictval_tomltype {parents tablestack keys typeval} { + set type [dict get $typeval type] + set val [dict get $typeval value] + #These are the restricted sets of typed used in the tomlish::dict representation + #They are a subset of the types in tomlish: data types plus ARRAY, arranged in a dictionary form. + #The container types: ITABLE, TABLE, TABLEARRAY are not used as they are represented as dictionary keys and ARRAY items. + #The WS, COMMENT, and NEWLINE elements are also unrepresented in the dict structure. + switch -- $type { + ARRAY { + set subitems [list] + foreach item $val { + lappend subitems [_from_dictval [list {*}$parents ARRAY] $tablestack $keys $item] SEP + } + if {[lindex $subitems end] eq "SEP"} { + set subitems [lrange $subitems 0 end-1] + } + return [list ARRAY {*}$subitems] + } + ITABLE { + error "not applicable" + if {$val eq ""} { + return ITABLE + } else { + return [_from_dictval [list {*}$parents ITABLE] $tablestack $keys $val] + } + } + STRING { + #JSJS + #if our dict came from json - we have already decided what type of STRING/LITERAL etc to use when building the dict + + #do not validate like this - important that eg json val\\ue -> dict val\ue -> tomlish/toml val\\ue + #see toml-tests + #if {![tomlish::utils::rawstring_is_valid_tomlstring $val]} { + # #todo? + # return -code error -errorcode {TOML SYNTAX INVALIDSTRING} "Unescaped controls in string or invalid escapes" + #} + return [list STRING [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $val]] + } + MULTISTRING { + #value is a raw string that isn't encoded as tomlish + #create a valid toml snippet with the raw value and decode it to the proper tomlish MULTISTRING format + #We need to convert controls in $val to escape sequences - except for newlines + # + #consider an *option* to reformat for long lines? (perhaps overcomplex - byte equiv - but may fold in ugly places) + #we could use a line-length limit to decide when to put in a "line ending backslash" + #and even format it with a reasonable indent so that proper CONT and WS entries are made (?) REVIEW + # + #TODO + set tomlpart "x=\"\"\"\\\n" + append tomlpart [tomlish::utils::rawstring_to_MultiBstring_with_escaped_controls $val] + append tomlpart "\"\"\"" + set tomlish [tomlish::from_toml $tomlpart] + #e.g if val = " etc\nblah" + #TOMLISH {DOTTEDKEY {{KEY x}} = {MULTISTRING CONT {NEWLINE LF} {WS { }} {STRINGPART etc} {NEWLINE lf} {STRINGPART blah} } } + #lindex 1 3 is the MULTISTRING tomlish list + return [lindex $tomlish 1 3] + } + MULTILITERAL { + #MLL string can contain newlines - but still no control chars + #todo - validate - e.g val can't contain more than 2 squotes in a row + if {[string first ''' $val] >=0} { + set msg "_from_dictval_tomltype error: more than 2 single quotes in a row found in MULTILITERAL - cannot encode dict to TOML-VALID TOMLISH" + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + } + + #rawstring_is_valid_multiliteral - allow newlines as lf or crlf - but not bare cr + if {![tomlish::utils::rawstring_is_valid_multiliteral $val]} { + return -code error -errorcode {TOML SYNTAX INVALIDMULTILITERAL} "Controls other than tab or newlines found in multiliteral" + } + + set tomlpart "x='''\n" + append tomlpart $val ''' + set tomlish [tomlish::from_toml $tomlpart] + return [lindex $tomlish 1 3] + } + LITERAL { + #from v1.0 spec - "Control characters other than tab are not permitted in a literal string" + #(This rules out raw ANSI SGR - which is somewhat restrictive - but perhaps justified for a config format + # as copy-pasting ansi to a config value is probably not always wise, and it's not something that can be + # easily input via a text editor. ANSI can go in Basic strings using the \e escape if that's accepted v1.1?) + #we could choose to change the type to another format here when encountering invalid chars - but that seems + #like too much magic. We elect to error out and require the dict to have valid data for the types it specifies. + if {[string first ' $val] >=0} { + set msg "_from_dictval_tomltype error: single quote found in LITERAL - cannot encode dict to TOML-VALID TOMLISH" + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + } + #JJJJ + if {![tomlish::utils::rawstring_is_valid_literal $val]} { + #has controls other than tab + #todo - squote? + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} "Controls other than tab found in literal" + } + return [list LITERAL $val] + } + INT { + if {![::tomlish::utils::is_int $val]} { + error "_from_dictval_tomltype error: bad INT value '$val' - cannot encode dict to TOML-VALID TOMLISH" + } + return [list INT $val] + } + FLOAT { + if {![::tomlish::utils::is_float $val]} { + error "_from_dictval_tomltype error: bad FLOAT value '$val' - cannot encode dict to TOML-VALID TOMLISH" + } + return [list FLOAT $val] + } + default { + if {$type ni [::tomlish::tags]} { + error "_from_dictval_tomltype error: Unrecognised typename '$type' in {type value } - cannot encode dict to TOML-VALID TOMLISH" + } + return [list $type $val] + } + } + } + + proc _from_dictval {parents tablestack keys vinfo} { + set k [lindex $keys end] + set K_PART [tomlish::dict::classify_rawkey $k] ;#get [list SQKEY ] + #puts stderr "---parents:'$parents' keys:'$keys' vinfo: $vinfo---" + #puts stderr "---tablestack: $tablestack---" + set result [list] + set lastparent [lindex $parents end] + if {$lastparent in [list "" do_inline]} { + if {[tomlish::dict::is_typeval $vinfo]} { + set type [dict get $vinfo type] + #treat ITABLE differently? + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] + lappend result DOTTEDKEY [list $K_PART {WS { }}] = {WS { }} $sublist {NEWLINE lf} + } else { + if {$vinfo ne ""} { + + #set result [list DOTTEDKEY [list [list KEY $k]] = ] + #set records [list ITABLE] + + set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] + + if {$lastparent eq "do_inline"} { + set result [list DOTTEDKEY [list $K_PART] =] + set records [list ITABLE] + } else { + set tname [tomlish::dict::join_and_quote_rawkey_list [list $k]] + set result [list TABLE $tname {NEWLINE lf}] + set tablestack [list {*}$tablestack [list T $k]] + set records [list] + } + + + + set lastidx [expr {[dict size $vinfo] -1}] + set dictidx 0 + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + #(SQKEY & DQKEY do not have the enclosing quotes in their returned val) + #if {[regexp {\s} $vk] || [string first . $vk] >= 0} { + # set VK_PART [list SQKEY $vk] + #} else { + # set VK_PART [list KEY $vk] + #} + if {[tomlish::dict::is_typeval $vv]} { + #type x value y + #REVIEW - we could detect if value is an array of objects, + #and depending on parent context - emit a series of TABLEARRAY records instead of a DOTTEDKEY record containing an ARRAY of objects + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] + set record [list DOTTEDKEY [list $VK_PART {WS { }}] = {WS { }} $sublist] + } else { + if {$vv eq ""} { + #experimental + if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + ::tomlish::log::notice "_from_dictval could uninline KEY $vk (tablestack:$tablestack)" + #set tname [tomlish::dict::name_from_tablestack [list {*}$tablestack [list T $vk]]] + + #we can't just join normalized keys - need keys with appropriate quotes and escapes + #set tname [join [list {*}$keys $vk] .] ;#WRONG + set tq [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] + + + ##wrong? results in TABLE within TABLE record?? todo pop? + #set record [list TABLE $tq {NEWLINE lf}] + #set tablestack [list {*}$tablestack [list T $vk]] + + #REVIEW!!! + + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + set tablestack [list {*}$tablestack [list I $vk]] + + } else { + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + set tablestack [list {*}$tablestack [list I $vk]] + } + } else { + if { 0 } { + #experiment.. sort of getting there. + if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + ::tomlish::log::notice "_from_dictval could uninline2 KEYS [list {*}$keys $vk] (tablestack:$tablestack)" + set tq [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] + set record [list TABLE $tq {NEWLINE lf}] + set tablestack [list {*}$tablestack [list T $vk]] + + #review - todo? + set dottedkey_value [_from_dictval [list {*}$parents TABLE] $tablestack [list {*}$keys $vk] $vv] + lappend record {*}$dottedkey_value + + } else { + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + } + } else { + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + } + } + } + if {$dictidx != $lastidx} { + #lappend record SEP + if {$lastparent eq "do_inline"} { + lappend record SEP + } else { + lappend record {NEWLINE lf} + } + } + if {[llength $record]} { + lappend records $record + } + incr dictidx + } + if {$lastparent eq "do_inline"} { + lappend result $records {NEWLINE lf} + } else { + lappend result {*}$records {NEWLINE lf} + } + } else { + if {$lastparent eq "do_inline"} { + lappend result DOTTEDKEY [list $K_PART] = ITABLE {NEWLINE lf} + } else { + set tname [tomlish::dict::join_and_quote_rawkey_list [list $k]] + #REVIEW + lappend result TABLE $tname {NEWLINE lf} + } + } + } + } else { + #lastparent is not toplevel "" or "do_inline" + if {[tomlish::dict::is_typeval $vinfo]} { + #type x value y + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] + lappend result {*}$sublist + } else { + if {$lastparent eq "TABLE"} { + #review + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + set dottedkey_value [_from_dictval [list {*}$parents DOTTEDKEY] $tablestack [list {*}$keys $vk] $vv] + lappend result [list DOTTEDKEY [list $VK_PART] = $dottedkey_value {NEWLINE lf}] + } + } else { + if {$vinfo ne ""} { + set lastidx [expr {[dict size $vinfo] -1}] + set dictidx 0 + set sub [list] + #REVIEW + #set result $lastparent ;#e.g sets ITABLE + set result ITABLE + set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + if {[tomlish::dict::is_typeval $vv]} { + #type x value y + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] + set record [list DOTTEDKEY [list $VK_PART] = $sublist] + } else { + if {$vv eq ""} { + #can't just uninline at this level + #we need a better method to query main dict for uninlinability at each level + # (including what's been inlined already) + #if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + # puts stderr "_from_dictval uninline2 KEY $keys" + # set tname [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] + # set record [list TABLE $tname {NEWLINE lf}] + # set tablestack [list {*}$tablestack [list T $vk]] + #} else { + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + #} + } else { + #set sub [_from_dictval ITABLE $vk $vv] + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + #set record [list DOTTEDKEY [list $VK_PART] = ITABLE $dottedkey_value] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + } + } + if {$dictidx != $lastidx} { + lappend record SEP + } + lappend result $record + incr dictidx + } + } else { + #e.g x=[{}] + log::debug "---> _from_dictval empty ITABLE x-1" + #lappend result DOTTEDKEY [list $K_PART] = ITABLE ;#wrong + lappend result ITABLE + } + } + } + } + return $result + } + + + proc from_dict {d} { + #consider: + # t1={a=1,b=2} + # x = 1 + + # from_dict gives us: t1 {a {type INT value 1} b {type INT value 2}} x {type INT value 1} + + #If we represent t1 as an expanded table we get + # [t1] + # a=1 + # b=2 + # x=1 + # --- which is incorrect - as x was a toplevel key like t1! + #This issue doesn't occur if x is itself an inline table + # t1={a=1,b=2} + # x= {no="problem"} + # + # (or if we were to reorder x to come before t1) + + #ie the order of the dict elements influences how the toml can be represented. + + #As the dictionary form doesn't distinguish the structure used to create tables {[table1]\nk=v} vs inline {table1={k=v}} + #Without a solution, from_dict would have to always produce the inline form for toplevel tables unless we allowed re-ordering, + #which is unpreferred here. + + #A possible solution: + #scan the top level to see if all (trailing) elements are themselves dicts + # (ie not of form {type XXX value yyy}) + # + # A further point is that if all root level values are at the 'top' - we can treat lower table-like structures as {[table]} elements + #ie we don't need to force do_inline if all the 'simple' keys are before any compound keys + + #set root_has_values 0 + #approach 1) - the naive approach - forces inline when not always necessary + #dict for {k v} $d { + # if {[llength $v] == 4 && [lindex $v 0] eq "type"} { + # set root_has_values 1 + # break + # } + #} + + + #approach 2) - track the position of last {type x value y} in the dictionary built by dict::from_tomlish + # - still not perfect. Inlines dotted tables unnecessarily + #This means from_dict doesn't produce output optimal for human editing. + set last_simple [tomlish::dict::last_tomltype_posn $d] + + + ## set parent "do_inline" ;#a value used in _from_dictval to distinguish from "" or other context based parent values + #Any keys that are themselves tables - will need to be represented inline + #to avoid reordering, or incorrect assignment of plain values to the wrong table. + + ## set parent "" + #all toplevel keys in the dict structure can represent subtables. + #we are free to use {[tablename]\n} syntax for toplevel elements. + + + set tomlish [list TOMLISH] + set dictposn 0 + set tablestack [list [list T root]] ;#todo + dict for {t tinfo} $d { + if {$last_simple > $dictposn} { + set parents [list do_inline] + } else { + set parents [list ""] + } + set keys [list $t] + #review - where to make decision on + # DOTTEDKEY containing array of objs + #vs + # list of TABLEARRAY records + #At least for the top + set trecord [_from_dictval $parents $tablestack $keys $tinfo] + lappend tomlish $trecord + incr dictposn + } + return $tomlish + } + + proc typedjson_to_toml {json} { + #*** !doctools + #[call [fun typedjson_to_toml] [arg json]] + #[para] + + set tomlish [::tomlish::from_dict_from_typedjson $json] + lappend tomlish [list NEWLINE lf] + set toml [::tomlish::to_toml $tomlish] + } + + set json1 {{ "a": {"type": "integer", "value": "42"}}} + set json2 {{ + "a": {"type": "integer", "value": "42"}, + "b": {"type": "string", "value": "test"} + }} + set json3 { +{ + "best-day-ever": {"type": "datetime", "value": "1987-07-05T17:45:00Z"}, + "numtheory": { + "boring": {"type": "bool", "value": "false"}, + "perfection": [ + {"type": "integer", "value": "6"}, + {"type": "integer", "value": "28"}, + {"type": "integer", "value": "496"} + ] + } +} + } + + set json4 { +{ + "best-day-ever": {"type": "datetime", "value": "1987-07-05T17:45:00Z"}, + "numtheory": { + "boring": {"type": "bool", "value": "false"}, + "perfection": [ + {"type": "integer", "value": "6"}, + {"type": "integer", "value": "28"}, + {"type": "integer", "value": "496"} + ] + }, + "emptyobj": {}, + "emptyarray": [] +} + } + + set json5 { +{ + "a": { + " x ": {}, + "b.c": {}, + "d.e": {}, + "b": { + "c": {} + } + } +} + } + + #surrogate pair face emoji + set json6 { +{ + "surrogatepair": {"type": "string", "value": "\uD83D\uDE10"} +} + } + + + set json7 { +{ + "escapes": {"type": "string", "value": "val\\ue"} +} + } + + + proc from_dict_from_typedjson {json} { + set d [tomlish::dict::from_typedjson $json] + tomlish::from_dict $d ;#return tomlish + } + + + proc toml_to_typedjson {toml} { + set tomlish [::tomlish::from_toml $toml] + set d [tomlish::dict::from_tomlish $tomlish] + #full validation only occurs by re-encoding dict to tomlish + set test [tomlish::from_dict $d] + + set h [tomlish::typedhuddle::from_dict $d] + #huddle jsondump $h + tomlish::huddle::jsondumpraw $h + } + + #proc get_json {tomlish} { + # package require fish::json + # set d [::tomlish::dict::from_tomlish $tomlish] + + # #return [::tomlish::dict_to_json $d] + # return [fish::json::from "struct" $d] + #} + + #return a Tcl list of tomlish tokens + #i.e get a standard list of all the toml terms in string $s + #where each element of the list is a *tomlish* term.. i.e a specially 'tagged' Tcl list. + #(simliar to a tcl 'Huddle' - but also supporting whitespace preservation) + # ---------------------------------------------------------------------------------------------- + # NOTE: the production of tomlish from toml source doesn't indicate the toml source was valid!!! + # e.g we deliberately don't check certain things such as duplicate table declarations here. + # ---------------------------------------------------------------------------------------------- + #Part of the justification for this is that as long as the syntax is toml shaped - we can load files which violate certain rules and allow programmatic manipulation. + # (e.g perhaps a toml editor to highlight violations for fixing) + # A further stage is then necessary to load the tomlish tagged list into a data structure more suitable for efficient query/reading. + # e.g dicts or an object oriented structure + #Note also - *no* escapes in quoted strings are processed. This is up to the datastructure stage + #e.g dict::from_tomlish will substitute \r \n \uHHHH \UHHHHHHH etc + #This is important for tomlish to maintain the ability to perform competely lossless round-trips from toml to tomlish and back to toml. + # (which is handy for testing as well as editing some part of the structure with absolutely no effect on other parts of the document) + #If we were to unescape a tab character for example + # - we have no way of knowing if it was originally specified as \t \u0009 or \U00000009 or directly as a tab character. + # For this reason, we also do absolutely no line-ending transformations based on platform. + # All line-endings are maintained as is, and even a file with mixed lf crlf line-endings will be correctly interpreted and can be 'roundtripped' + + proc from_toml {args} { + + namespace upvar ::tomlish::parse s s + set s [join $args \n] + namespace upvar ::tomlish::parse i i + set i 0 ;#index into s + + namespace upvar ::tomlish::parse is_parsing is_parsing + set is_parsing 1 + + if {[info command ::tomlish::parse::spacestack] eq "::tomlish::parse::spacestack"} { + tomlish::parse::spacestack destroy + } + struct::stack ::tomlish::parse::spacestack + + namespace upvar ::tomlish::parse last_space_action last_space_action + namespace upvar ::tomlish::parse last_space_type last_space_type + + namespace upvar ::tomlish::parse tok tok + set tok "" + + namespace upvar ::tomlish::parse type type + namespace upvar ::tomlish::parse tokenType tokenType + ::tomlish::parse::set_tokenType "" + namespace upvar ::tomlish::parse tokenType_list tokenType_list + set tokenType [list] ;#Flat (un-nested) list of tokentypes found + + namespace upvar ::tomlish::parse lastChar lastChar + set lastChar "" + + + set result "" + namespace upvar ::tomlish::parse nest nest + set nest 0 + + namespace upvar ::tomlish::parse v v ;#array keyed on nest level + + + set v(0) {TOMLISH} + array set s0 [list] ;#whitespace data to go in {SPACE {}} element. + set parentlevel 0 + + + namespace upvar ::tomlish::parse state state + + namespace upvar ::tomlish::parse braceCount braceCount + set barceCount 0 + namespace upvar ::tomlish::parse bracketCount bracketCount + set bracketCount 0 + + set sep 0 + set r 1 + namespace upvar ::tomlish::parse token_waiting token_waiting + set token_waiting [dict create] ;#if ::tok finds a *complete* second token during a run, it will put the 2nd one here to be returned by the next call. + + + + set state "table-space" + ::tomlish::parse::spacestack push {type space state table-space} + namespace upvar ::tomlish::parse linenum linenum;#'line number' of input data. (incremented for each literal linefeed - but not escaped ones in data) + set linenum 1 + + set ::tomlish::parse::state_list [list] + try { + while {$r} { + set r [::tomlish::parse::tok] + #puts stdout "got tok: '$tok' while parsing string '$s' " + set next_tokenType_known 0 ;#whether we begin a new token here based on what terminated the token result of 'tok' + + + #puts "got token: '$tok' tokenType='$tokenType'. while v($nest) = [set v($nest)]" + #puts "-->tok: $tok tokenType='$tokenType'" + set prevstate $state + set transition_info [::tomlish::parse::goNextState $tokenType $tok $state] + #review goNextState could perform more than one space_action + set space_action [dict get $transition_info space_action] + set newstate [dict get $transition_info newstate] ;#use of 'newstate' vs 'state' makes code clearer below + + if {[tcl::string::match "err-*" $state]} { + ::tomlish::log::warn "---- State error in state $prevstate for tokenType: $tokenType token value: $tok. $state aborting parse. [tomlish::parse::report_line]" + lappend v(0) [list ERROR tokentype $tokenType state $prevstate to $state leveldata [set v($nest)]] + return $v(0) + } + # --------------------------------------------------------- + #NOTE there may already be a token_waiting at this point + #set_token_waiting can raise an error here, + # in which case the space_action branch needs to be rewritten to handle the existing token_waiting + # --------------------------------------------------------- + + if {$space_action eq "pop"} { + #pop_trigger_tokens: newline tablename endarray endinlinetable + #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. + set parentlevel [expr {$nest -1}] + set do_append_to_parent 1 ;#most tokens will leave this alone - but some like tentative_accum_squote need to do their own append + switch -exact -- $tokenType { + tentative_accum_squote { + #should only apply within a multiliteral + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the xxx-squote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-squote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-squote-space { + } + default { + error "--- unexpected popped due to tentative_accum_squote but came from state '$prevstate' should have been trailing-squote-space" + } + } + switch -- $tok { + ' { + tomlish::parse::set_token_waiting type single_squote value $tok complete 1 startindex [expr {$i -1}] + } + '' { + #review - we should perhaps return double_squote instead? + #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_squote value "" complete 1 startindex [expr {$i - 2}] + } + ''' { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] + } + '''' { + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left squote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]'" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "'"] + } + MULTILITERAL { + #empty + lappend v($parentlevel) [list LITERALPART "'"] + } + default { + error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" + } + } + } + ''''' { + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 squotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]''" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "''"] + } + MULTILITERAL { + lappend v($parentlevel) [list LITERALPART "''"] + } + default { + error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" + } + } + } + } + } + triple_squote { + #presumably popping multiliteral-space + ::tomlish::log::debug "---- triple_squote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + switch -exact -- [lindex $part 0] { + MULTILITERAL { + lappend merged $part + } + LITERALPART { + if {$lasttype eq "LITERALPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" + } + } + set lasttype [lindex $part 0] + } + set v($nest) $merged + } + tentative_accum_dquote { + #should only apply within a multistring + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the trailing-dquote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-dquote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-dquote-space { + } + default { + error "--- unexpected popped due to tentative_accum_dquote but came from state '$prevstate' should have been trailing-dquote-space" + } + } + switch -- $tok { + {"} { + tomlish::parse::set_token_waiting type single_dquote value $tok complete 1 startindex [expr {$i -1}] + } + {""} { + #review - we should perhaps return double_dquote instead? + #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_dquote value "" complete 1 startindex [expr {$i - 2}] + } + {"""} { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 3}] + } + {""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left dquote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {"}] + } + MULTISTRING { + #empty + lappend v($parentlevel) [list STRINGPART {"}] + } + default { + error "--- don't know how to integrate extra trailing dquote with data $v($parentlevel)" + } + } + } + {"""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 dquotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {""}] + } + MULTISTRING { + lappend v($parentlevel) [list STRINGPART {""}] + } + default { + error "--- don't know how to integrate extra trailing 2 dquotes with data $v($parentlevel)" + } + } + } + } + } + triple_dquote { + #presumably popping multistring-space + ::tomlish::log::debug "---- triple_dquote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + switch -exact -- [lindex $part 0] { + MULTISTRING { + lappend merged $part + } + STRINGPART { + if {$lasttype eq "STRINGPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + CONT - WS { + lappend merged $part + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_dquote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" + } + } + set lasttype [lindex $part 0] + } + set v($nest) $merged + } + equal { + #pop caused by = + switch -exact -- $prevstate { + dottedkey-space { + tomlish::log::debug "---- equal ending dottedkey-space for last_space_action pop" + #re-emit for parent space + tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] + } + dottedkey-space-tail { + #experiment? + tomlish::log::debug "---- equal ending dottedkey-space-tail for last_space_action pop" + #re-emit for parent space + tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] + } + } + } + newline { + incr linenum + lappend v($nest) [list NEWLINE $tok] + } + tablename { + #note: a tablename only 'pops' if we are greater than zero + error "---- tablename pop should already have been handled as special case zeropoppushspace in goNextState" + } + tablearrayname { + #!review - tablearrayname different to tablename regarding push/pop? + #note: a tablename only 'pops' if we are greater than zero + error "---- tablearrayname pop should already have been handled as special case zeropoppushspace in goNextState" + } + endarray { + #nothing to do here. + } + comma { + #comma for inline table will pop the keyvalue space + lappend v($nest) "SEP" + } + endinlinetable { + ::tomlish::log::debug "---- endinlinetable for last_space_action pop" + } + default { + error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" + } + } + if {$do_append_to_parent} { + #e.g tentative_accum_squote does it's own appends as necessary - so won't get here + lappend v($parentlevel) [set v($nest)] + } + + incr nest -1 + + } elseif {$last_space_action eq "push"} { + set prevnest $nest + incr nest 1 + set v($nest) [list] + # push_trigger_tokens: barekey dquotedkey startinlinetable startarray tablename tablearrayname + + + switch -exact -- $tokenType { + tentative_trigger_squote - tentative_trigger_dquote { + #### this startok will always be tentative_accum_squote/tentative_accum_dquote starting with one accumulated squote/dquote + if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { + lassign [dict get $transition_info starttok] starttok_type starttok_val + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType $starttok_type + set tok $starttok_val + } + } + single_squote { + #JMN - REVIEW + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + triple_squote { + ::tomlish::log::debug "---- push trigger tokenType triple_squote" + set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERALPART + } + squotedkey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + triple_dquote { + set v($nest) [list MULTISTRING] ;#container for NEWLINE,STRINGPART,CONT + } + dquotedkey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + barekey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + set waiting [tomlish::parse::get_token_waiting] + if {[llength $waiting]} { + set i [dict get $waiting startindex] + tomlish::parse::clear_token_waiting + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } else { + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + } + tablename { + #note: we do not use the output of tablename_trim to produce a tablename for storage in the tomlish list! + #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish + # back to toml file will be identical. + #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. + # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, + # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names from + # a structural perspective. + + #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, + # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the + # tomlish list? + + #set trimtable [tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$trimtable'" + set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name + #note also that equivalent tablenames may have different toml representations even after being trimmed! + #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) + #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. + } + tablearrayname { + #set trimtable [tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablearrayname: '$trimtable'" + set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name + } + startarray { + set v($nest) [list ARRAY] ;#$tok is just the opening bracket - don't output. + } + startinlinetable { + set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. + } + default { + error "---- push trigger tokenType '$tokenType' not yet implemented" + } + } + + } else { + #no space level change + switch -exact -- $tokenType { + squotedkey { + #puts "---- squotedkey in state $prevstate (no space level change)" + lappend v($nest) [list SQKEY $tok] + } + dquotedkey { + #puts "---- dquotedkey in state $prevstate (no space level change)" + lappend v($nest) [list DQKEY $tok] + } + barekey { + lappend v($nest) [list KEY $tok] + } + dotsep { + lappend v($nest) [list DOTSEP] + } + starttablename { + #$tok is triggered by the opening bracket and sends nothing to output + } + starttablearrayname { + #$tok is triggered by the double opening brackets and sends nothing to output + } + tablename - tablenamearray { + error "---- did not expect 'tablename/tablearrayname' without space level change (no space level change)" + #set v($nest) [list TABLE $tok] + } + endtablename - endtablearrayname { + #no output into the tomlish list for this token + } + startinlinetable { + puts stderr "---- decode::toml error. did not expect startinlinetable without space level change (no space level change)" + } + single_dquote { + switch -exact -- $newstate { + string-state { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "string" + set tok "" + } + dquoted-key { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "dquotedkey" + set tok "" + } + multistring-space { + lappend v($nest) [list STRINGPART {"}] + #may need to be joined on pop if there are neighbouring STRINGPARTS + } + default { + error "---- single_dquote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_dquote { + #leading extra quotes - test: toml_multistring_startquote2 + switch -exact -- $prevstate { + itable-keyval-value-expected - keyval-value-expected { + puts stderr "tomlish::decode::toml double_dquote TEST" + #empty string + lappend v($nest) [list STRINGPART ""] + } + multistring-space { + #multistring-space to multistring-space + lappend v($nest) [list STRINGPART {""}] + } + default { + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" + } + } + + } + single_squote { + switch -exact -- $newstate { + literal-state { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "literal" + set tok "" + } + squoted-key { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + multiliteral-space { + #false alarm squote returned from tentative_accum_squote pop + ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" + #(single squote - not terminating space) + lappend v($nest) [list LITERALPART '] + #may need to be joined on pop if there are neighbouring LITERALPARTs + } + default { + error "---- single_squote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_squote { + switch -exact -- $prevstate { + keyval-value-expected { + lappend v($nest) [list LITERAL ""] + } + multiliteral-space { + #multiliteral-space to multiliteral-space + lappend v($nest) [list LITERALPART ''] + } + default { + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" + } + } + } + enddquote { + #nothing to do? + set tok "" + } + endsquote { + set tok "" + } + string { + #JJJJ + set tok [tomlish::from_Bstring $tok] + lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes + } + literal { + lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes + } + multistring { + #review + #JJJJ ? + lappend v($nest) [list MULTISTRING $tok] + } + stringpart { + #JJJJ + set tok [tomlish::from_Bstring $tok] + lappend v($nest) [list STRINGPART $tok] ;#will not get wrapped in dquotes directly + } + multiliteral { + lappend v($nest) [LIST MULTILITERAL $tok] + } + literalpart { + lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly + } + untyped_value { + #would be better termed unclassified_value + #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. + unset -nocomplain tag + if {$tok in {true false}} { + set tag BOOL + } else { + if {[::tomlish::utils::is_int $tok]} { + set tag INT + } else { + if {[string is integer -strict $tok]} { + #didn't qualify as a toml int - but still an int + #probably means is_int is limiting size and not accepting bigints (configurable?) + #or it didn't qualify due to more than 1 leading zero + #or other integer format issue such as repeated underscores + error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. (looks close to being an int. Formatting or range issue?) [tomlish::parse::report_line] (no space level change)" + } else { + #DDDD + if {[::tomlish::utils::is_float $tok]} { + set tag FLOAT + } elseif {[::tomlish::utils::is_localtime $tok]} { + set tag TIME-LOCAL + } elseif {[::tomlish::utils::is_timepart $tok]} { + #Note we must allow lone timepart here (not just is_localtime which doesn't allow tz offsets) in case it followed a previous localdate + set tag DATETIME ;#?? review standalone time with tz - no specific tag - only allowed as followup value from DATETIME-LOCAL + } elseif {[::tomlish::utils::is_datepart $tok]} { + set tag DATE-LOCAL + } elseif {[::tomlish::utils::is_datetime $tok]} { + #not just a date or just a time + #could be either local or have tz offset + #DDDD JJJ + set norm [string map {" " T} $tok];#prob unneeded - we won't get here if there was a space - would arrive as 2 separate tokens review. + lassign [split $norm T] dp tp + if {[::tomlish::utils::is_localtime $tp]} { + set tag DATETIME-LOCAL + } else { + set tag DATETIME + } + } elseif {[::tomlish::utils::is_datetime X$tok] || [::tomlish::utils::is_timepart X$tok]} { + # obsolete + #Note we must allow lone timepart here (not just is_localtime which doesn't allow tz offsets) in case it followed a previous localdate + #e.g x= 2025-01-01 02:34Z + #The dict::from_tomlish validation will catch an invalid standaline timepart, or combine with leading date if applicable. + set tag DATETIME + } else { + error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. [tomlish::parse::report_line] (no space level change)" + } + } + } + } + #assert either tag is set, or we errored out. + lappend v($nest) [list $tag $tok] + + } + comment { + #puts stdout "----- comment token returned '$tok'------" + #JJJJ + set tok [tomlish::from_comment $tok] + lappend v($nest) [list COMMENT "$tok"] + } + equal { + #we append '=' to the nest so that any surrounding whitespace is retained. + lappend v($nest) = + } + comma { + lappend v($nest) SEP + } + newline { + incr linenum + lappend v($nest) [list NEWLINE $tok] + } + whitespace { + lappend v($nest) [list WS $tok] + } + continuation { + lappend v($nest) CONT + } + bom { + lappend v($nest) BOM + } + eof { + #ok - nothing more to add to the tomlish list. + #!todo - check previous tokens are complete/valid? + } + default { + error "--- unknown tokenType '$tokenType' during state $prevstate [::tomlish::parse::report_line] (no space level change)" + } + } + } + + if {!$next_tokenType_known} { + ::tomlish::log::notice "---- tomlish::decode::toml - current tokenType:$tokenType Next token type not known" + ::tomlish::parse::set_tokenType "" + set tok "" + } + + if {$state eq "end-state"} { + break + } + + + } + + #while {$nest > 0} { + # lappend v([expr {$nest -1}]) [set v($nest)] + # incr nest -1 + #} + while {[::tomlish::parse::spacestack size] > 1} { + ::tomlish::parse::spacestack pop + lappend v([expr {$nest -1}]) [set v($nest)] + incr nest -1 + + #set parent [spacestack peek] ;#the level being appended to + #lassign $parent type state + #if {$type eq "space"} { + # + #} elseif {$type eq "buffer"} { + # lappend v([expr {$nest -1}]) {*}[set v($nest)] + #} else { + # error "invalid spacestack item: $parent" + #} + } + + } finally { + set is_parsing 0 + } + return $v(0) + } + + #toml dquoted string to tomlish STRING + # - only allow specified escape sequences + # - allow any unicode except those that must be escaped: dquote, bsl, and control chars(except tab) + proc from_Bstring {bstr} { + #JJJJ + if {[catch { + tomlish::utils::unescape_string $bstr + } errM]} { + return -code error -errorcode {TOML SYNTAX INVALIDESCAPE} "tomlish::from_Bstring toml Bstring contains invalid escape sequence\n$errM" ;#review + } + #assert: all escapes are now valid + + if {[regexp {[\u0000-\u0008\u000A-\u001F\u007f]} $bstr]} { + set msg "tomlish::from_Bstring toml Bstring contains controls that must be escaped" + return -code error -errorcode {TOML SYNTAX BSTRINGUNESCAPEDCONTROLS} $msg ;#review + } + return $bstr + } + #validate toml comment + # - disallow controls that must be escaped + #from spec: + # "Control characters other than tab (U+0000 to U+0008, U+000A to U+001F, U+007F) are not permitted in comments." + proc from_comment {comment} { + if {[regexp {[\u0000-\u0008\u000A-\u001F\u007f]} $comment]} { + set msg "tomlish::from_comment toml comment contains controls that must be escaped" + return -code error -errorcode {TOML SYNTAX COMMENTUNESCAPEDCONTROLS} $msg ;#review + } + return $comment + } + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish ---}] +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +namespace eval tomlish::build { + #STRING,INT,FLOAT,BOOL, DATETIME - simple wrappers for completeness + # take a value of the appropriate type and wrap as a tomlish tagged item + proc STRING {s} { + return [list STRING [::tomlish::utils::rawstring_to_Bstring_with_escaped_controls $s]] + } + proc LITERAL {litstring} { + error todo + } + + proc INT {i} { + #whole numbers, may be prefixed with a + or - + #Leading zeros are not allowed + #Hex,octal binary forms are allowed (toml 1.0) + #We will error out on encountering commas, as commas are interpreted differently depending on locale (and don't seem to be supported in the toml spec anyway) + #!todo - Tcl can handle bignums - bigger than a 64bit signed long as specified in toml. + # - We should probably raise an error for number larger than this and suggest the user supply it as a string? + if {[tcl::string::last , $i] > -1} { + error "Unable to interpret '$i' as an integer. Use underscores if you need a thousands separator [::tomlish::parse::report_line]" + } + if {![::tomlish::utils::int_validchars $i]} { + error "Unable to interpret '$i' as an integer. Only 0-9 + 1 _ characters are acceptable. [::tomlish::parse::report_line]" + } + + if {[::tomlish::utils::is_int $i]} { + return [list INT $i] + } else { + error "'$i' is not a valid integer as per the Toml spec. [::tomlish::parse::report_line]" + } + + } + + proc FLOAT {f} { + #convert any non-lower case variants of special values to lowercase for Toml + if {[::tcl::string::tolower $f] in {nan +nan -nan inf +inf -inf}} { + return [list FLOAT [tcl::string::tolower $f]] + } + if {[::tomlish::utils::is_float $f]} { + return [list FLOAT $f] + } else { + error "Unable to interpret '$f' as Toml float. Check your input, or check that tomlish is able to handle all Toml floats properly [::tomlish::parse::report_line]" + } + } + + proc DATETIME {str} { + if {[::tomlish::utils::is_datetime $str]} { + return [list DATETIME $str] + } else { + error "Unable to interpret '$str' as Toml datetime. Check your input, or check that tomlish is able to handle all Toml datetimes properly [::tomlish::parse::report_line]" + } + } + proc DATETIME-LOCAL {str} { + error "build::DATETIME-LOCAL todo" + } + + proc BOOLEAN {b} { + #convert any Tcl-acceptable boolean to boolean as accepted by toml - lower case true/false + if {![tcl::string::is boolean -strict $b]} { + error "Unable to convert '$b' to Toml boolean true|false. [::tomlish::parse::report_line]" + } else { + if {$b && 1} { + return [::list BOOL true] + } else { + return [::list BOOL false] + } + } + } + + #REVIEW + #Take tablename followed by + # a) *tomlish* name-value pairs e.g table mydata [list KEY item11 = [list STRING "test"]] {KEY item2 = [list INT 1]} + # (accept also key value {STRING }) + # b) simple 2-element tcl lists being name & *simple* value pairs for which basic heuristics will be used to determine types + proc _table {name args} { + set pairs [list] + foreach t $args { + if {[llength $t] == 4} { + if {[tcl::string::tolower [lindex $t 0]] ne "key" || [tcl::string::tolower [lindex $t 2]] ni "= value"} { + error "Only items tagged as KEY = currently accepted as name-value pairs for table command" + } + lassign $t _k keystr _eq valuepart + if {[llength $valuepart] != 2} { + error "supplied value must be typed. e.g {INT 1} or {STRING test}" + } + lappend pairs [list KEY $keystr = $valuepart] + } elseif {[llength $t] == 2} { + #!todo - type heuristics + lassign $t n v + lappend pairs [list KEY $n = [list STRING $v]] + } else { + error "'KEY = { toml but + # the first newline is not part of the data. + # we elect instead to maintain a basic LITERALPART that must not contain newlines.. + # and to compose MULTILITERAL of multiple NEWLINE LITERALPART parts, + #with the datastructure representation dropping the first newline (if immediately following opening delim) when building the value. + set literal "" + foreach part [lrange $item 1 end] { + append literal [::tomlish::encode::tomlish [list $part] $nextcontext] + } + append toml '''$literal''' + } + INT - + BOOL - + FLOAT - + DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + #DDDD + append toml [lindex $item 1] + } + INCOMPLETE { + error "cannot process tomlish term tagged as INCOMPLETE" + } + COMMENT { + append toml "#[lindex $item 1]" + } + BOM { + #Byte Order Mark may appear at beginning of a file. Needs to be preserved. + append toml "\uFEFF" + } + default { + error "Not a properly formed 'tomlish' taggedlist.\n '$list'\n Unknown tag '[lindex $item 0]'. See output of \[tomlish::tags\] command." + } + } + + } + return $toml + } + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::encode ---}] +} +#fish toml from tomlish + +#(encode tomlish as toml) +interp alias {} tomlish::to_toml {} tomlish::encode::tomlish + +# + + +namespace eval tomlish::decode { + #*** !doctools + #[subsection {Namespace tomlish::decode}] + #[para] + #[list_begin definitions] + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::decode ---}] +} +#decode toml to tomlish +#interp alias {} tomlish::from_toml {} tomlish::decode::toml + +namespace eval tomlish::utils { + #*** !doctools + #[subsection {Namespace tomlish::utils}] + #[para] + #[list_begin definitions] + + + + #basic generic quote matching for single and double quotes + #note for example that {[o'malley]} will return sq - as the single quote is not closed or wrapped in double quotes + proc tok_in_quotedpart {tok} { + set sLen [tcl::string::length $tok] + set quote_type "" + set had_slash 0 + for {set i 0} {$i < $sLen} {incr i} { + set c [tcl::string::index $tok $i] + if {$quote_type eq ""} { + if {$had_slash} { + #don't enter quote mode + #leave slash_mode because even if current char is slash - it is escaped + set had_slash 0 + } else { + set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] + switch -- $ctype { + dq { + set quote_type dq + } + sq { + set quote_type sq + } + bsl { + set had_slash 1 + } + } + } + } else { + if {$had_slash} { + #don't leave quoted mode + #leave slash_mode because even if current char is slash - it is escaped + set had_slash 0 + } else { + set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] + switch -- $ctype { + dq { + if {$quote_type eq "dq"} { + set quote_type "" + } + } + sq { + if {$quote_type eq "sq"} { + set quote_type "" + } + } + bsl { + set had_slash 1 + } + } + } + } + } + return $quote_type ;#dq | sq + } + + proc hex_escape_info {slashx} { + set exp {^\\x([0-9a-fA-F]{2}$)} + if {[regexp $exp $slashx match hex]} { + return [list ok [list char [subst -nocommand -novariable $slashx]]] + } else { + return [list err [list reason "Supplied string not of the form \\xHH where H in \[0-9a-fA-F\]"]] + } + } + proc unicode_escape_info {slashu} { + #!todo + # validate that slashu is either a \uxxxx or \Uxxxxxxxx value of the correct length and + # is a valid 'unicode scalar value' (any Unicode code point except high-surrogate and low-surrogate code points) + # ie integers in the range 0 to D7FF16 and E00016 to 10FFFF16 inclusive + #expr {(($x >= 0) && ($x <= 0xD7FF16)) || (($x >= 0xE00016) && ($x <= 0x10FFFF16))} + if {[tcl::string::match {\\u*} $slashu]} { + set exp {^\\u([0-9a-fA-F]{4}$)} + if {[regexp $exp $slashu match hex]} { + if {[scan $hex %4x dec] != 1} { + #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? + return [list err [list reason "Failed to convert '$hex' to decimal"]] + } else { + return [list ok [list char [subst -nocommand -novariable $slashu]]] + } + } else { + return [list err [list reason "Supplied string not of the form \\uHHHH where H in \[0-9a-fA-F\]"]] + } + } elseif {[tcl::string::match {\\U*} $slashu]} { + set exp {^\\U([0-9a-fA-F]{8}$)} + if {[regexp $exp $slashu match hex]} { + if {[scan $hex %8x dec] != 1} { + #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? + return [list err [list reason "Failed to convert '$hex' to decimal"]] + } else { + if {(($dec >= 0) && ($dec <= 0xD7FF16)) || (($dec >= 0xE00016) && ($dec <= 0x10FFFF16))} { + return [list ok [list char [subst -nocommand -novariable $slashu]]] + } else { + return [list err [list reason "$slashu is not within the 'unicode scalar value' ranges 0 to 0xD7FF16 or 0xE00016 to 0x10FFFF16"]] + } + } + } else { + return [list err [list reason "Supplied string not of the form \\UHHHHHHHH where H in \[0-9a-fA-F\]"]] + } + } else { + return [list err [list reason "Supplied string did not start with \\u or \\U" ]] + } + + } + + #Note that unicode characters don't *have* to be escaped. + #So if we provide a function named 'escape_string', the name implies the inverse of unescape_string which unescapes unicode \u \U values. + #- an inverse of unescape_string would encode all unicode chars unnecessarily. + #- as toml accepts a compact escape sequence for common chars such as tab,backspace,linefeed etc but also allows the full form \u009 etc + #- escape_string and unescape_string would not be reliably roundtrippable inverses anyway. + #REVIEW - provide it anyway? When would it be desirable to use? + + variable Bstring_control_map [dict create] + dict set Bstring_control_map \b {\b} + dict set Bstring_control_map \n {\n} + dict set Bstring_control_map \r {\r} + dict set Bstring_control_map \" {\"} + dict set Bstring_control_map \x1b {\e} ;#In spec it's included in the list of 'must be escaped', as well as the 'convenience' escapes - so we make it go both ways. + dict set Bstring_control_map \\ "\\\\" + + #\e for \x1b seems like it might be included - v1.1?? hard to find current state of where toml is going :/ + #for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped. + #8 = \b - already in list. + #built the remainder whilst checking for entries already hardcoded above -in case more are added to the hardcoded list + for {set cdec 0} {$cdec <= 7} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } + } + for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } + } + # \u007F = 127 + dict set Bstring_control_map [format %c 127] \\u007F + + # ------------------------------------------------------------------ + variable Literal_control_map [dict create] + #controls other than tab + for {set cdec 0} {$cdec <= 8} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Literal_control_map $char]} { + dict set Literal_control_map $char \\u$hhhh + } + } + for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Literal_control_map $char]} { + dict set Literal_control_map $char \\u$hhhh + } + } + # \u007F = 127 + dict set Literal_control_map [format %c 127] \\u007F + # ------------------------------------------------------------------ + variable Multiliteral_control_map + set Multiliteral_control_map [dict remove $Literal_control_map \n] + + variable String_control_map + set String_control_map [dict remove $Literal_control_map \\] + + + variable MultiBstring_totoml_map + #'minimally' escaped sequences of double quotes. + #e.g {""\"""\"} vs {\"\"\"\"\"} + #This produces easier to read toml - and in many cases may be more likely to match original format when roundtripped from dict datastructure + # REVIEW - should this be configurable? + set MultiBstring_totoml_map [dict remove $Bstring_control_map {"} \r \n] + dict set MultiBstring_totoml_map {"""} {""\"} ;#" editor hack: commented quote for dumb syntax highlighers + + #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! + #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) + #for example - can be used by from_dict to produce valid Bstring data for a tomlish record + proc rawstring_to_Bstring_with_escaped_controls {str} { + #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. + #we'll use a string map with an explicit list rather than algorithmic at runtime + # - the string map is probably more performant than splitting a string, especially if it's large + + upvar ::tomlish::utils::Bstring_control_map map + + return [string map $map $str] + } + proc rawstring_to_MultiBstring_with_escaped_controls {str} { + #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. + #we'll use a string map with an explicit list rather than algorithmic at runtime + # - the string map is probably more performant than splitting a string, especially if it's large + + upvar ::tomlish::utils::MultiBstring_totoml_map map + + return [string map $map $str] + } + + proc rawstring_is_valid_tomlstring {str} { + #controls are allowed in this direction dict -> toml (they get quoted) + + #check any existing escapes are valid + if {[catch { + unescape_string $str + } errM]} { + return 0 + } + return 1 + } + + proc rawstring_is_valid_literal {str} { + #detect control chars other than tab + variable Literal_control_map + set testval [string map $Literal_control_map $str] + return [expr {$testval eq $str}] + } + proc rawstring_is_valid_multiliteral {str} { + #detect control chars other than tab + variable Multiliteral_control_map + + set teststr [string map [list \r\n ok] $str] + + set testval [string map $Multiliteral_control_map $teststr] + return [expr {$testval eq $teststr}] + } + + #review - unescape what string? Bstring vs MLBstring? + #we should be specific in the function naming here + #used by dict::from_tomlish - so part of validation? - REVIEW + proc unescape_string {str} { + #note we can't just use Tcl subst because: + # it also transforms \a (audible bell) and \v (vertical tab) which are not in the toml spec. + # it would strip out backslashes inappropriately: e.g "\j" becomes just j + # it recognizes other escapes which aren't approprite e.g octal \nnn + # it replaces \ with a single whitespace (trailing backslash) + #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh + #plus \e for \x1b? + + set buffer "" + set buffer2 "" ;#buffer for 2 hex characters following a \x + set buffer4 "" ;#buffer for 4 hex characters following a \u + set buffer8 "" ;#buffer for 8 hex characters following a \u + + set sLen [tcl::string::length $str] + + #we need to handle arbitrarily long sequences of backslashes. \\\\\ etc + set slash_active 0 + set unicode2_active 0 + set unicode4_active 0 + set unicode8_active 0 + + ::tomlish::log::debug "unescape_string. got len [string length str] str $str" + + #!todo - check for invalid data in the form of a raw carriage return (decimal 13) without following linefeed? + set i 0 + for {} {$i < $sLen} {} { + if {$i > 0} { + set lastChar [tcl::string::index $str [expr {$i - 1}]] + } else { + set lastChar "" + } + + set c [tcl::string::index $str $i] + #::tomlish::log::debug "unescape_string. got char $c" ;#too much? + + ##---------------------- + ##as we are 'unescaping' - should we really be testing here for existing values that should have been escaped? + ##The answer is probably no - keep this function to a single purpose - test elsewhere for raw controls. + ##this test looks incomplete anyway REVIEW + #scan $c %c n + #if {($n <= 31) && ($n != 9) && ($n != 10) && ($n != 13)} { + # #we don't expect unescaped unicode characters from 0000 to 001F - + # #*except* for raw tab (which is whitespace) and newlines + # error "unescape_string. Invalid data for a toml string. Unescaped control character (decimal $n) [::tomlish::utils::string_to_slashu $c]" + #} + ##---------------------- + + incr i ;#must incr here because we do'returns'inside the loop + if {$c eq "\\"} { + if {$slash_active} { + append buffer "\\" + set slash_active 0 + } elseif {$unicode2_active} { + error "unescape_string. unexpected case slash during unicode2 not yet handled" + } elseif {$unicode4_active} { + error "unescape_string. unexpected case slash during unicode4 not yet handled" + } elseif {$unicode8_active} { + error "unescape_string. unexpected case slash during unicode8 not yet handled" + } else { + # don't output anything (yet) + set slash_active 1 + } + } else { + if {$unicode2_active} { + if {[tcl::string::length $buffer2] < 2} { + append buffer2 $c + } + if {[tcl::string::length $buffer2] == 2} { + #we have a \xHH to test + set unicode2_active 0 + set result [tomlish::utils::hex_escape_info "\\x$buffer2"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" + } + } + } elseif {$unicode4_active} { + if {[tcl::string::length $buffer4] < 4} { + append buffer4 $c + } + if {[tcl::string::length $buffer4] == 4} { + #we have a \uHHHH to test + set unicode4_active 0 + set result [tomlish::utils::unicode_escape_info "\\u$buffer4"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" + } + } + } elseif {$unicode8_active} { + if {[tcl::string::length $buffer8] < 8} { + append buffer8 $c + } + if {[tcl::string::length $buffer8] == 8} { + #we have a \UHHHHHHHH to test + set unicode8_active 0 + set result [tomlish::utils::unicode_escape_info "\\U$buffer8"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" + } + } + } elseif {$slash_active} { + set slash_active 0 + set ctest [tcl::string::map {{"} dq} $c] + switch -exact -- $ctest { + dq { + append buffer {"} + } + b - t - n - f - r { + append buffer [subst -nocommand -novariable "\\$c"] + } + e { + append buffer \x1b + } + x { + #introduced in 1.1.0 \xHH + set unicode2_active 1 + set buffer2 "" + } + u { + set unicode4_active 1 + set buffer4 "" + } + U { + set unicode8_active 1 + set buffer8 "" + } + default { + set slash_active 0 + #review - toml spec says all other escapes are reserved + #and if they are used TOML should produce an error. + #append buffer "\\$c" + set msg "Invalid escape sequence \\ followed by '$c'" + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} $msg + } + } + } else { + append buffer $c + } + } + } + #puts stdout "EOF 4:$unicode4_active 8:$unicode8_active slash:$slash_active" + if {$unicode2_active} { + error "End of string reached before complete hex escape sequence \xHH" + } + if {$unicode4_active} { + error "End of string reached before complete unicode escape sequence \uHHHH" + } + if {$unicode8_active} { + error "End of string reached before complete unicode escape sequence \UHHHHHHHH" + } + if {$slash_active} { + append buffer "\\" + } + try { + encoding convertto utf-8 $buffer + } trap {} {emsg eopts} { + return -code error -errorcode {TOMLISH SYNTAX ENCODINGERROR} $emsg + } + return $buffer + } + + #This does not have to do with unicode normal forms - which it seems toml has decided against regarding use in keys (review/references?) + #This is meant for internal use regarding ensuring we match equivalent keys which may have just been specified with different string mechanisms, + #e.g squoted vs dquoted vs barekey. + proc normalize_key {rawkey} { + set c1 [tcl::string::index $rawkey 0] + set c2 [tcl::string::index $rawkey end] + if {($c1 eq "'") && ($c2 eq "'")} { + #single quoted segment. No escapes allowed within it. + set key [tcl::string::range $rawkey 1 end-1] + } elseif {($c1 eq "\"") && ($c2 eq "\"")} { + #double quoted segment. Unapply escapes. + # + set keydata [tcl::string::range $rawkey 1 end-1] ;#strip outer quotes only + #e.g key could have mix of \UXXXXXXXX escapes and unicode chars + #or mix of \t and literal tabs. + #unescape to convert all to literal versions for comparison + set key [::tomlish::utils::unescape_string $keydata] + #set key [subst -nocommands -novariables $keydata] ;#wrong. Todo - create a string escape substitution function. + } else { + set key $rawkey + } + return $key + } + + proc string_to_slashu {string} { + set rv {} + foreach c [split $string {}] { + scan $c %c cdec + if {$cdec > 65535} { + append rv {\U} [format %.8X $cdec] + } else { + append rv {\u} [format %.4X $cdec] + } + } + return $rv + } + + #'nonprintable' is conservative here because some systems (e.g windows console) are very limited in what they can display. + #This is used for display purposes only (error msgs) + proc nonprintable_to_slashu {s} { + set res "" + foreach i [split $s ""] { + scan $i %c cdec + + set printable 0 + if {($cdec>31) && ($cdec<127)} { + set printable 1 + } + if {$printable} { + append res $i + } else { + if {$cdec > 65535} { + append res \\U[format %.8X $cdec] + } else { + append res \\u[format %.4X $cdec] + } + } + } + set res + } ;# initial version from tcl wiki RS + + proc rawstring_to_jsonstring {s} { + #like nonprintable_to_slashu + # - also escape every dquote + # - escape newlines + set res "" + foreach i [split $s ""] { + scan $i %c cdec + switch -- $cdec { + 34 { + #double quote + append res \\\" + } + 13 { + #carriage return + append res \\r + } + 8 { + append res \\b + } + 9 { + append res \\t + } + 10 { + #linefeed + append res \\n + } + 92 { + append res \\\\ + } + default { + set printable 0 + if {($cdec>31) && ($cdec<127)} { + set printable 1 + } + if {$printable} { + append res $i + } else { + if {$cdec > 65535} { + #append res $i + #append res \\U[format %.8X $cdec] ;#wrong + #append res "\\U{[format %.8x $cdec]}" ;#some variation of json? + package require punk::cesu + #e.g \U0001f610 emoticon face + #surrogate pair: \uD83D\uDE10 + set surrogatepair [punk::cesu::to_surrogatestring -format escape $i] + append res $surrogatepair + } else { + append res \\u[format %.4X $cdec] + } + } + } + } + } + set res + + } + + #check if str is valid for use as a toml bare key + #Early toml versions only allowed letters + underscore + dash + proc is_basic_barekey {str} { + if {[tcl::string::length $str] == 0} { + return 0 + } else { + set matches [regexp -all {[a-zA-Z0-9\_\-]} $str] + if {[tcl::string::length $str] == $matches} { + #all characters match the regexp + return 1 + } else { + return 0 + } + } + } + + #from toml.abnf in github.com/toml-lang/toml + #unquoted-key = 1*unquoted-key-char + #unquoted-key-char = ALPHA / DIGIT / %x2D / %x5F ; a-z A-Z 0-9 - _ + #unquoted-key-char =/ %xB2 / %xB3 / %xB9 / %xBC-BE ; superscript digits, fractions + #unquoted-key-char =/ %xC0-D6 / %xD8-F6 / %xF8-37D ; non-symbol chars in Latin block + #unquoted-key-char =/ %x37F-1FFF ; exclude GREEK QUESTION MARK, which is basically a semi-colon + #unquoted-key-char =/ %x200C-200D / %x203F-2040 ; from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ + #unquoted-key-char =/ %x2070-218F / %x2460-24FF ; include super-/subscripts, letterlike/numberlike forms, enclosed alphanumerics + #unquoted-key-char =/ %x2C00-2FEF / %x3001-D7FF ; skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces + #unquoted-key-char =/ %x2070-21FF / %x2300-24FF ; skip math operators + #unquoted-key-char =/ %x25A0-268B / %x2690-2757 ; skip box drawing, block elements, and some yin-yang symbols + #unquoted-key-char =/ %x2762-2767 / %x2776-27E5 ; skip some Dingbat punctuation + #unquoted-key-char =/ %x2801-297F ; skip some math brackets and arrows, and braille blank + #unquoted-key-char =/ %x2B00-2FFF / %x3001-D7FF ; skip various math operators and symbols, and ideographic space + #unquoted-key-char =/ %xF900-FDCF / %xFDF0-FFFD ; skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) + #unquoted-key-char =/ %x10000-EFFFF ; all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) + variable re_barekey + set ranges [list] + lappend ranges {a-zA-Z0-9\_\-} + lappend ranges {\u00B2} {\u00B3} {\u00B9} {\u00BC-\u00BE} ;# superscript digits, fractions + lappend ranges {\u00C0-\u00D6} {\u00D8-\u00F6} {\u00F8-\u037D} ;# non-symbol chars in Latin block + lappend ranges {\u037f-\u1FFF} ;# exclude GREEK QUESTION MARK, which is basically a semi-colon + lappend ranges {\u200C-\u200D} {\u203F-\u2040} ;# from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ + lappend ranges {\u2070-\u218f} {\u2460-\u24FF} ;# include super-subscripts, letterlike/numberlike forms, enclosed alphanumerics + lappend ranges {\u2C00-\u2FEF} {\u3001-\uD7FF} ;# skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces + lappend ranges {\u2070-\u21FF} {\u2300-\u24FF} ;# skip math operators + lappend ranges {\u25A0-\u268B} {\u2690-\u2757} ;# skip box drawing, block elements, and some yin-yang symbols + lappend ranges {\u2762-\u2767} {\u2776-\u27E5} ;# skip some Dingbat punctuation + lappend ranges {\u2801-\u297F} ;# skip some math brackets and arrows, and braille blank + lappend ranges {\u2B00-\u2FFF} {\u3001-\uD7FF} ;# skip various math operators and symbols, and ideographic space + lappend ranges {\uF900-\uFDCF} {\uFDF0-\uFFFD} ;# skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) + lappend ranges {\U10000-\UEFFFF} ;# all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) + set re_barekey {^[} + foreach r $ranges { + append re_barekey $r + } + append re_barekey {]+$} + + proc is_barekey {str} { + if {[tcl::string::length $str] == 0} { + return 0 + } + variable re_barekey + return [regexp $re_barekey $str] + } + + #test only that the characters in str are valid for the toml specified type 'integer'. + proc int_validchars1 {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[0-9\_\-\+]} $str] == $numchars} { + return 1 + } else { + return 0 + } + } + #add support for hex,octal,binary 0x.. 0o.. 0b... + proc int_validchars {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] == $numchars} { + return 1 + } else { + return 0 + } + } + + proc is_int {str} { + set matches [regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] ;#0b101 etc covered by a-f + + if {[tcl::string::length $str] == $matches} { + #all characters in legal range + + # --------------------------------------- + #check for leading zeroes in non 0x 0b 0o + #first strip any +, - or _ (just for this test) + #(but still allowing 0 -0 +0) + set check [tcl::string::map {+ "" - "" _ ""} $str] + if {([tcl::string::length $check] > 1) && ([tcl::string::index $check 0] eq "0") && ([tcl::string::index $check 1] ni {o x b})} { + return 0 + } + # --------------------------------------- + + #check +,- only occur in the first position. (excludes also +++1 etc) + if {[tcl::string::last - $str] > 0} { + return 0 + } + if {[tcl::string::last + $str] > 0} { + return 0 + } + + #------------------------------------------- + #unclear if a 'digit' includes the type specifiers x b o + #we assume the 0x 0b 0o are NOT counted as digits - as underscores here would seem + #to be likely to cause interop issues with other systems + #(e.g tcl allows 0b1_1 but not 0b_11) + #Most of this structure would be unnecessary if we could rely on string::is::integer understanding underscores (9+?) + #we still need to support earlier Tcl for now though. + + #first rule out any case with more than one underscore in a row + if {[regexp {__} $str]} { + return 0 + } + if {[string index $str 0] eq "_"} { + return 0 + } + set utest [string trimleft $str +-] + #test again for further trick like _+_0xFF + if {[string index $utest 0] eq "_"} { + return 0 + } + if {[string range $utest 0 1] in {0x 0b 0o}} { + set testnum [string range $utest 2 end] + #spec says *non-negative* integers may *also* be expressed in hex, octal or binary + #and also explicitly states + not allowed + #presumed to mean negative not allowed. + if {[string index $str 0] in {- +}} { + return 0 + } + } else { + set testnum $utest + #exclude also things like 0_x 0___b that snuck past our prefix test + if {![string is digit -strict [string map {_ ""} $testnum]]} { + return 0 + } + #assert - only digits and underscores in testnum + #still may have underscores at each end + } + #assert testnum is now the 'digits' portion of a , 0x 0b 0o number + #(+ and - already stripped) + #It may still have chars unsuitable for its type - which will be caught by the string::is::integer test below + if {[string length $testnum] != [string length [string trim $testnum _]]} { + #had non-inner underscores in 'digit' part + return 0 + } + #assert str only has solo inner underscores (if any) between 'digits' + #------------------------------------------- + + set numeric_value [tcl::string::map {_ ""} $str] ;#allow some earlier tcl versions which don't support underscores + #use Tcl's integer check to ensure we don't let things like 3e4 through - which is a float (would need to be 0x3e4 for hex) + if {![tcl::string::is integer -strict $numeric_value]} { + return 0 + } + + + + #!todo - check bounds only based on some config value + #even though Tcl can handle bignums, we won't accept anything outside of toml 1.0 minimum requirements by default (for now) + #presumably very large numbers would have to be supplied in a toml file as strings. + #Review - toml 1.0 only says that it must handle up to 2^63 - not that this is a max + #some question around implementations allowed to use lower values such as 2^31 on some systems? + if {$::tomlish::max_int ne "" && $numeric_value > $::tomlish::max_int} { + return 0 + } + if {$::tomlish::min_int ne "" && $numeric_value < $::tomlish::min_int} { + return 0 + } + } else { + return 0 + } + #Got this far - didn't find anything wrong with it. + return 1 + } + + #test only that the characters in str are valid for the toml specified type 'float'. + proc float_validchars {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[eE0-9\_\-\+\.]} $str] == $numchars} { + return 1 + } else { + #only allow lower case for these special values - as per Toml 1.0 spec + if {$str ni {inf +inf -inf nan +nan -nan}} { + return 0 + } else { + return 1 + } + } + } + + #note - Tcl's string is double will return true also for the subset of float values which are integers + #This function is to determine whether it matches the Toml float concept - so requires a . or e or E + proc is_float {str} { + #vip greenlight known literals, don't test for case variations - as Toml doesn't allow (whereas Tcl allows Inf NaN etc) + if {$str in {inf +inf -inf nan +nan -nan}} { + return 1 + } + #doorcheck the basics for floatiness vs members of that rival gang - ints + if {![regexp {[.eE]} $str]} { + #could be an integer - which isn't specifically a float for Toml purposes. + return 0 + } + + + #patdown for any contraband chars + set matches [regexp -all {[eE0-9\_\-\+\.]} $str] + if {[tcl::string::length $str] != $matches} { + return 0 + } + + #all characters in legal range + + #A leading zero is ok, but we should disallow multiple leading zeroes (same rules as toml ints) + + #Early Toml spec also disallowed leading zeros in the exponent part(?) + #... this seems less interoperable anyway (some libraries generate leading zeroes in exponents) + #we allow leading zeros in exponents here. + + #Check for leading zeros in main part + #first strip any +, - or _ (just for this test) + set check [tcl::string::map {+ "" - "" _ ""} $str] + set r {([0-9])*} + regexp $r $check intpart ;#intpart holds all numerals before the first .,e or E + #leading zero only if exactly one zero + if {$intpart ne "0" && [string match 0* $intpart]} { + return 0 + } + + #for floats, +,- may occur in multiple places + #e.g -2E-22 +3e34 + #!todo - check bounds ? + + #----------------------------------------- + if {[regexp {__} $str]} { + return 0 + } + if {[string index $str 0] eq "_" || [string index $str end] eq "_"} { + return 0 + } + set utest [string trimleft $str +-] + #test again for further trick like _+_ + if {[string index $utest 0] eq "_"} { + return 0 + } + #----------------------------------------- + + #decimal point, if used must be surrounded by at least one digit on each side + #e.g 3.e+20 also illegal + set dposn [string first . $str] + if {$dposn > -1 } { + set d3 [string range $str $dposn-1 $dposn+1] + if {![string is integer -strict [string index $d3 0]] || ![string is integer -strict [string index $d3 2]]} { + return 0 + } + } + #we've already eliminated leading/trailing underscores + #now ensure each inner underscore is surrounded by digits + if {[regexp {_[^0-9]|[^0-9]_} $str]} { + return 0 + } + + #strip underscores for tcl double check so we can support < tcl 9 versions which didn't allow underscores + set check [tcl::string::map {_ ""} $str] + #string is double accepts inf nan +NaN etc. + if {![tcl::string::is double $check]} { + return 0 + } + + #All good - seems to be a toml-approved float and not an int. + return 1 + } + + #test only that the characters in str are valid for the toml specified type 'datetime'. + proc datetime_validchars {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[zZtT0-9\-\+\.:]} $str] == $numchars} { + return 1 + } else { + return 0 + } + } + + proc is_datepart {str} { + set matches [regexp -all {[0-9\-]} $str] + if {[tcl::string::length $str] != $matches} { + return 0 + } + #seems to require yyyy-mm-dd (e.g not allowing just yyyy-mm) + if {![regexp {^([0-9]{4})-([0-9]{2})-([0-9]{2})$} $str _match y m d]} { + return 0 + } + if {$m > 12 || $m == 0} { + return 0 + } + switch -- [expr {$m}] { + 1 - 3 - 5 - 7 - 8 - 10 - 12 { + if {$d > 31 || $d == 0} { + return 0 + } + } + 2 { + if {$d > 29 || $d == 0} { + return 0 + } + if {$d == 29} { + #leapyear check + if {[catch {clock scan $str -format %Y-%m-%d} errM]} { + return 0 + } + } + } + 4 - 6 - 9 - 11 { + if {$d > 30 || $d == 0} { + return 0 + } + } + } + return 1 + } + proc is_localdate {str} { + is_datepart $str + } + + #allow only hh:mm:ss or hh:mm (no subseconds) + proc _is_hms_or_hm_time {val} { + set numchars [tcl::string::length $val] + if {[regexp -all {[0-9:]} $val] != $numchars} { + return 0 + } + #assert now digits and colons only + set hms_cparts [split $val :] + #2 or 3 parts only are valid - check contents of each part + if {[llength $hms_cparts] == 2} { + lassign $hms_cparts hr min + if {[string length $hr] != 2 || [string length $min] != 2} { + return 0 + } + if {$hr > 23 || $min > 59} { + return 0 + } + } elseif {[llength $hms_cparts] == 3} { + lassign $hms_cparts hr min sec + if {[string length $hr] != 2 || [string length $min] != 2 || [string length $sec] !=2} { + return 0 + } + #possible for sec to be 60 - leap second RFC 3339 + if {$hr > 23 || $min > 59 || $sec > 60} { + return 0 + } + } else { + return 0 + } + return 1 + } + proc is_timepart {str} { + #validate the part after the T (or space) + #we receive only that trailing part here. + + #odt1 = 1979-05-27T07:32:00Z + #odt2 = 1979-05-27T00:32:00-07:00 + #odt3 = 1979-05-27T00:32:00.5-07:00 + #odt4 = 1979-05-27T00:32:00.999999-07:00 + + set numchars [tcl::string::length $str] + #timepart can have negative or positive offsets so - and + must be accepted + if {[regexp -all {[zZt0-9\-\+\.:]} $str] == $numchars} { + #todo + #basic check that we have leading 2dig hr and 2dig min separated by colon + if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}[^0-9]{1}.*$} $str]} { + #nn:nn or nn:nnX.* where X is non digit + return 0 + } + set dotparts [split $str .] + if {[llength $dotparts] ni {1 2}} { + return 0 + } + if {[llength $dotparts] == 2} { + lassign $dotparts hms tail + #validate tail - which might have +- offset + if {[string index $tail end] ni {z Z}} { + #from hh:mm:??. + #check for +/- something + if {[regexp {(.*)[+-](.*)} $tail _match fraction offset]} { + if {![string is digit -strict $fraction]} { + return 0 + } + if {![_is_hms_or_hm_time $offset]} { + return 0 + } + } + } else { + set tail [string range $tail 0 end-1] + #expect tail nnnn (from hh:mm::ss.nnnnZ) + #had a dot and a zZ - no other offset valid (?) + if {![string is digit -strict $tail]} { + return 0 + } + } + + } else { + #no dot (fraction of second) + if {[regexp {(.*)[+-](.*)} $str _match hms offset]} { + #validate offset + if {![_is_hms_or_hm_time $offset]} { + return 0 + } + } else { + set hms $str + set offset "" + #trim a *single* z or Z off hms if present - multiple should error later + if {[string index $hms end] in {z Z}} { + set hms [string range $hms 0 end-1] + } + } + } + #hms is allowed in toml to be hh:mm:ss or hh:mm + #validate we have hh:mm:ss or hh:mm - exactly 2 digits each + if {![_is_hms_or_hm_time $hms]} { + return 0 + } + + return 1 + } else { + return 0 + } + } + proc is_localtime {str} { + #time of day without any relation to a specific day or any offset or timezone + set numchars [tcl::string::length $str] + if {[regexp -all {[0-9\.:]} $str] == $numchars} { + #todo + if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}:[0-9]{2}([.][0-9]+){0,1}$} $str]} { + #hh:mm or hh:mm:ss or hh:mm::ss.nnn + return 0 + } + set dotparts [split $str .] + if {[llength $dotparts] ni {1 2}} { + return 0 + } + if {[llength $dotparts] == 2} { + lassign $dotparts hms _tail + #validate tail - just fractional seconds - regex has confirmed at least one digit and only digits + #nothing todo? max length? + } else { + #no fractional seconds + set hms $str + } + if {![_is_hms_or_hm_time $hms]} { + return 0 + } + return 1 + } else { + return 0 + } + } + + #review + proc is_datetime {str} { + #Essentially RFC3339 formatted date-time - but: + #1) allowing seconds to be omitted (:00 assumed) + #2) T may be replaced with a single space character TODO - parser support for space in datetime! + # (RFC 3339 allows space instead of T also - but doesn't specify it *must* be a single space) + + #toml-lint @2025-04 doesn't accept t for T or z for Z - but RFC3339 does + #toml spec doesn't clarify - we will accept + + #e.g 1979-05-27 + #e.g 1979-05-27T00:32:00Z + #e.g 1979-05-27 00:32:00-07:00 + #e.g 1979-05-27 00:32:00+10:00 + #e.g 1979-05-27 00:32:00.999999-07:00 + + #review + #minimal datetimes? + # 2024 not ok - 2024T not accepted by tomlint why? + # 02:00 ok + # 02:00:00.5 ok + # 1:00 - not ok - RFC3339 requires 2-digit hr,min,sec + + #toml-lint.com accepts 2025-01 + + if {[string length $str] < 5} { + return 0 + } + + set matches [regexp -all {[zZtT0-9\ \-\+\.:]} $str] + if {[tcl::string::length $str] == $matches} { + #all characters in legal range + if {[regexp -all {\ } $str] > 1} { + #only a single space is allowed. + return 0 + } + #If we get a space - it is only valid as a convience to represent the T separator + #we can normalize by converting to T here before more tests + set str [string map {" " T t T} $str] + #a further sanity check on T + if {[regexp -all {T} $str] > 1} { + return 0 + } + + #!todo - use full RFC 3339 parser? + #!todo - what if the value is 'time only'? + + if {[string first T $str] > -1} { + lassign [split $str T] datepart timepart + if {![is_datepart $datepart]} { + return 0 + } + if {![is_timepart $timepart]} { + return 0 + } + } else { + #either a datepart or a localtime + #spec: "If you include only the time portion of an RFC 3339 formatted date-time, it will represent that time of day + # without any relation to a specific day or any offset or timezone." + if {!([is_datepart $str] || [is_localtime $str])} { + return 0 + } + } + + + #Tcl's free-form clock scan (no -format option) is deprecated + # + #if {[catch {clock scan $datepart} err]} { + # puts stderr "tcl clock scan failed err:'$err'" + # return 0 + #} + + } else { + return 0 + } + return 1 + } + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::utils ---}] +} + +namespace eval tomlish::parse { + #*** !doctools + #[subsection {Namespace tomlish::parse}] + #[para] + #[list_begin definitions] + + #This is a somewhat curly mix of a statemachine and toml-nesting-stack littered with special cases. + #The code is a pig's-nest - but it should be noted that for example trailing single double quotes in multiline strings are perhaps not so trivial to parse using more standard methods either: + # - e.g some kind of backtracking required if using an ABNF parser? + #I don't know the precise technical name for this sort of parser; probably something like "Dog's Breakfast" + #More seriously, we don't have distinct lex/parse steps - so it is basically a 'fused lexer' or 'scannerless parser' + + #It is also desirable for this system to be useful in 'interactive' use. review - would a separate lexer make this easier or harder? + + #A possible alternative more structured approach might be to use a PEG (Parsing Expression Grammar) + + + variable is_parsing 0 ;#whether we are in the middle of parsing tomlish text + + variable state + # states: + # table-space, itable-space, array-space + # array-value-expected,keyval-value-expected,itable-keyval-value-expected, keyval-syntax, + # dquoted-key, squoted-key + # string-state, literal-state, multistring... + # + # notes: + # only the -space states are also 'spaces' ie a container which is pushed/popped on the spacestack + + # + # xxx_value-expected - we also allow for leading whitespace in this state, but once a value is returned we jump to a state based on the containing space. e.g keyval-tail or array-syntax + # + #stateMatrix defines for each state, actions to take for each possible token. + #single-element actions are the name of the next state into which to transition, or a 'POPSPACE' instruction to pop a level off the spacestack and add the data to the parent container. + #dual-element actions are a push instruction and the name of the space to push on the stack. + # - PUSHSPACE is a simple push onto the spacestack, zeropoppushspace also pushes, but will first do a pop *if* the current space level is greater than zero (ie if only if not already in root table-space) + + # -- --- --- --- --- --- + #token/state naming guide + # -- --- --- --- --- --- + #tokens : underscore separated or bare name e.g newline, start_quote, start_squote + #private tokens: always have a leading underscore (These are private 'temporary state' tokens that are never returned as actual tokens e.g _start_squote_sequence + #states : always contain at least one dash e.g err-state, table-space + #instructions + # -- --- --- --- --- --- + + + #stateMatrix dict of elements mapping current state to next state based on returned tokens + # current-state {token-encountered next-state ... } + # where next-state can be a 1 or 2 element list. + #If 2 element - the first item is an instruction (ucase) + #If 1 element - it is either a lowercase dashed state name or an ucase instruction + #e.g {PUSHSPACE } or POPSPACE or SAMESPACE + + + #SAMESPACE - got to same space as parent without popping a level, but has it's own autotransition lookup - strange concept - review usecases + + variable stateMatrix + set stateMatrix [dict create] + #--------------------------------------------------------- + #WARNING + #The stateMatrix implementation here is currently messy. + #The code is a mixture of declarative via the stateMatrix and imperative via switch statements during PUSH/POP/SAMESPACE transitions. + #This means the state behaviour has to be reasoned about by looking at both in conjuction. + #--------------------------------------------------------- + + #xxx-space vs xxx-syntax inadequately documented - TODO + + #review - out of date? + # --------------------------------------------------------------------------------------------------------------# + # incomplete example of some state starting at table-space + # --------------------------------------------------------------------------------------------------------------# + # ( = -> keyval-value-expected) + # keyval-syntax (popped -> keyval-space -> keyval-tail) (autotransition on pop) + # keyval-space (autotransition on push ^) + # table-space (barekey^) (startdquote -> dquoted-key ^) + # --------------------------------------------------------------------------------------------------------------# + + dict set stateMatrix\ + table-space { + bom "table-space"\ + whitespace "table-space"\ + newline "table-space"\ + barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ + squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ + dquotedkey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ + XXXsingle_dquote "quoted-key"\ + XXXsingle_squote "squoted-key"\ + comment "table-space"\ + starttablename "tablename-state"\ + starttablearrayname "tablearrayname-state"\ + enddquote "err-state"\ + endsquote "err-state"\ + comma "err-state"\ + eof "end-state"\ + equal "err-state"\ + cr "err-lonecr"\ + } + + + + dict set stateMatrix\ + keyval-space {\ + whitespace "keyval-syntax"\ + equal "keyval-value-expected"\ + } + + # ' = ' portion of keyval + dict set stateMatrix\ + keyval-syntax {\ + whitespace "keyval-syntax"\ + barekey {PUSHSPACE "dottedkey-space"}\ + squotedkey {PUSHSPACE "dottedkey-space"}\ + dquotedkey {PUSHSPACE "dottedkey-space"}\ + equal "keyval-value-expected"\ + comma "err-state"\ + newline "err-state"\ + eof "err-state"\ + } + #### + dict set stateMatrix\ + keyval-value-expected {\ + whitespace "keyval-value-expected"\ + untyped_value {TOSTATE "keyval-untyped-sequence" note "possible datetime datepart"}\ + literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ + string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ + single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ + single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ + startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ + startarray {PUSHSPACE array-space returnstate keyval-tail}\ + } + #double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"} + + #untyped_value sequences without intervening comma are allowed for datepart timepart + #we will produce tomlish with missing SEPS and to_dict must validate whether 2 adjacent barekeys are valid + dict set stateMatrix\ + keyval-untyped-sequence {\ + whitespace "keyval-untyped-sequence"\ + untyped_value {TOSTATE "keyval-tail"}\ + literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ + string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ + single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ + single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ + startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ + startarray {PUSHSPACE array-space returnstate keyval-tail}\ + newline "POPSPACE"\ + comment "keyval-tail"\ + eof "end-state"\ + } + + #2025 - no leading-squote-space - only trailing-squote-space. + + dict set stateMatrix\ + keyval-tail {\ + whitespace "keyval-tail"\ + newline "POPSPACE"\ + comment "keyval-tail"\ + eof "end-state"\ + } + + + #itable-space/ curly-syntax : itables + # x={y=1,} + dict set stateMatrix\ + itable-space {\ + whitespace "itable-space"\ + newline "itable-space"\ + barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + endinlinetable "POPSPACE"\ + comma "err-state"\ + comment "itable-space"\ + eof "err-state"\ + } + #we don't get single_squote etc here - instead we get the resulting squotedkey token + + + # ??? review - something like this + # + # x={y =1,} + dict set stateMatrix\ + itable-keyval-syntax {\ + whitespace {TOSTATE "itable-keyval-syntax"}\ + barekey {PUSHSPACE "dottedkey-space"}\ + squotedkey {PUSHSPACE "dottedkey-space"}\ + dquotedkey {PUSHSPACE "dottedkey-space"}\ + equal {TOSTATE "itable-keyval-value-expected"}\ + newline "err-state"\ + eof "err-state"\ + } + + # x={y=1} + dict set stateMatrix\ + itable-keyval-space {\ + whitespace "itable-keyval-syntax"\ + equal {TOSTATE "itable-keyval-value-expected" note "required"}\ + } + + dict set stateMatrix\ + itable-keyval-value-expected {\ + whitespace "itable-keyval-value-expected"\ + untyped_value {TOSTATE "itable-val-tail" note ""}\ + single_dquote {TOSTATE "string-state" returnstate itable-val-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ + single_squote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ + startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ + startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ + } + #double_squote not currently generated by _start_squote_sequence - '' processed as single_squote to literal-state just like 'xxx' + # review + # double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"} + + + + # x={y=1,z="x"} + #POPSPACE is transition from itable-keyval-space to parent itable-space + dict set stateMatrix\ + itable-val-tail {\ + whitespace "itable-val-tail"\ + endinlinetable "POPSPACE"\ + comma "POPSPACE"\ + newline {TOSTATE "itable-val-tail" note "itable-space ??"}\ + comment "itable-val-tail"\ + eof "err-state"\ + } + # XXXnewline "POPSPACE" + # We shouldn't popspace on newline - as if there was no comma we need to stay in itable-val-tail + # This means the newline and subsequent whitespace, comments etc become part of the preceeding dottedkey record + #e.g + # x = { + # j=1 + # #comment within dottedkey j record + # , # comment unattached + # #comment unattached + # k=2 , #comment unattached + # l=3 #comment within l record + # , m=4 + # #comment associated with m record + # + # #still associated with m record + # } + ## - This doesn't quite correspond to what a user might expect - but seems like a consistent mechanism. + #The awkwardness is because there is no way to put in a comment that doesn't consume a trailing comma + #so we cant do: j= 1 #comment for j1 , + # and have the trailing comma recognised. + # + # To associate: j= 1, #comment for j1 + # we would need some extra processing . (not popping until next key ? extra state itable-sep-tail?) REVIEW - worth doing? + # + # The same issue occurs with multiline arrays. The most natural assumption is that a comment on same line after a comma + # is 'associated' with the previous entry. + # + # These comment issues are independent of the data dictionary being generated for conversion to json etc - as the comments don't carry through anyway, + # but are a potential oddity for manipulating the intermediate tomlish structure whilst attempting to preserve 'associated' comments + # (e.g reordering records within an itable) + #The user's intention for 'associated' isn't always clear and the specs don't really guide on this. + + + #dottedkey-space is not (currently) used within [tablename] or [[tablearrayname]] + #it is for keyval ie x.y.z = value + + #this is the state after dot + #we are expecting a complete key token or whitespace + #(initial entry to the space is by one of the keys - which will immediately go to dottedkey-space-tail) + dict set stateMatrix\ + dottedkey-space {\ + whitespace "dottedkey-space"\ + dotsep "err-state"\ + barekey "dottedkey-space-tail"\ + squotedkey "dottedkey-space-tail"\ + dquotedkey "dottedkey-space-tail"\ + newline "err-state"\ + comma "err-state"\ + comment "err-state"\ + equal "err-state"\ + } + + #dottedkeyend "POPSPACE" + #equal "POPSPACE"\ + + + #jmn 2025 + #we have 1 or more dottedkeys so far - need dotsep to add more, whitespace to maintain, equal to pop + dict set stateMatrix\ + dottedkey-space-tail {\ + whitespace "dottedkey-space-tail" + dotsep "dottedkey-space" + equal "POPSPACE"\ + eof "err-state"\ + newline "err-state"\ + } + + #-------------------------------------------------------------------------- + #scratch area + #from_toml {x=1} + # barekey tok + # table-space PUSHSPACE keyval-space state keyval-syntax + # + + + #-------------------------------------------------------------------------- + + + #REVIEW + #toml spec looks like heading towards allowing newlines within inline tables + #https://github.com/toml-lang/toml/issues/781 + + #2025 - multiline itables appear to be valid for 1.1 - which we are targeting. + #https://github.com/toml-lang/toml/blob/main/toml.md#inline-table + + #JMN2025 + #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES + #We currently allow multiline ITABLES (also with comments) in the tokenizer. + #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? + + + #JMN REVIEW + #dict set stateMatrix\ + # array-space {\ + # whitespace "array-space"\ + # newline "array-space"\ + # untyped_value "SAMESPACE"\ + # startarray {PUSHSPACE "array-space"}\ + # endarray "POPSPACE"\ + # startinlinetable {PUSHSPACE itable-space}\ + # single_dquote "string-state"\ + # single_squote "literal-state"\ + # triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ + # comma "array-space"\ + # comment "array-space"\ + # eof "err-state-array-space-got-eof"\ + # } + + ## array-space ## + set aspace [dict create] + dict set aspace whitespace "array-space" + dict set aspace newline "array-space" + #dict set aspace untyped_value "SAMESPACE" + dict set aspace untyped_value "array-syntax" + dict set aspace startarray {PUSHSPACE "array-space"} + dict set aspace endarray "POPSPACE" + dict set aspace single_dquote {TOSTATE "string-state" returnstate array-syntax} + dict set aspace triple_dquote {PUSHSPACE "multistring-space" returnstate array-syntax} + dict set aspace single_squote {TOSTATE "literal-state" returnstate array-syntax} + dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax} + dict set aspace startinlinetable {PUSHSPACE itable-space} + #dict set aspace comma "array-space" + dict set aspace comment "array-space" + dict set aspace eof "err-state-array-space-got-eof" + dict set stateMatrix array-space $aspace + + #when we pop from an inner array we get to array-syntax + #e.g {x=[[]] ??? + set tarntail [dict create] + dict set tarntail whitespace "err-state" ;#"tablearrayname-tail" ;#spec doesn't allow whitespace here + dict set tarntail newline "err-state" + dict set tarntail comment "err-state" + dict set tarntail eof "err-state" + dict set tarntail endtablename "tablearray-tail" + dict set stateMatrix tablearrayname-tail $tarntail + + #review - somewhat counterintuitive...? + # [(starttablearrayname) (endtablearrayname] + # [(starttablename) (endtablename)] + + # [[xxx]] ??? + set tartail [dict create] + dict set tartail whitespace "tablearray-tail" + dict set tartail newline "table-space" + dict set tartail comment "tablearray-tail" + dict set tartail eof "end-state" + dict set stateMatrix tablearray-tail $tartail + + + + + + + dict set stateMatrix\ + end-state {} + + set knowntokens [list] + set knownstates [list] + dict for {state transitions} $stateMatrix { + if {$state ni $knownstates} {lappend knownstates $state} + dict for {tok instructions} $transitions { + if {$tok ni $knowntokens} {lappend knowntokens $tok} + } + } + dict set stateMatrix nostate {} + foreach tok $knowntokens { + dict set stateMatrix nostate $tok "err-nostate-received-token-$tok" + } + + + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + #purpose - debugging? remove? + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + #build a list of 'push triggers' from the stateMatrix + # ie tokens which can push a new space onto spacestack + set push_trigger_tokens [list] + tcl::dict::for {s transitions} $stateMatrix { + tcl::dict::for {token transition_to} $transitions { + set instruction [lindex $transition_to 0] + switch -exact -- $instruction { + PUSHSPACE - zeropoppushspace { + if {$token ni $push_trigger_tokens} { + lappend push_trigger_tokens $token + } + } + } + } + } + ::tomlish::log::debug "push_trigger_tokens: $push_trigger_tokens" + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + + + + #This seems hacky... (deprecate in favour of explicit arguments to the instructions in stateMatrix?) + #spacePopTransitions, spacePushTransitions, spaceSameTransitions below for auto state redirections on POPSPACE,PUSHSPACE,SAMESPACE + + #mainly for the -space states: + #redirect to another state $c based on a state transition from $whatever to $b + # e.g "string {array-space array-syntax}" means when transitioning from string to array-space, jump to array-syntax instead. + #this is useful as we often don't know state $b. e.g when it is decided by 'POPSPACE' + + #use dict set to add values so we can easily add/remove/comment lines + + #Push to, next + #default first states when we push to these spaces + variable spacePushTransitions [dict create] + dict set spacePushTransitions keyval-space keyval-syntax + dict set spacePushTransitions itable-keyval-space itable-keyval-syntax + dict set spacePushTransitions array-space array-space + dict set spacePushTransitions table-space tablename-state + #dict set spacePushTransitions #itable-space itable-space + + #Pop to, next + variable spacePopTransitions [dict create] + dict set spacePopTransitions array-space array-syntax + + + #itable-keyval-space itable-val-tail + #review + #we pop to keyval-space from dottedkey-space or from keyval-value-expected? we don't always want to go to keyval-tail + #leave it out and make the POPSPACE caller explicitly specify it + #keyval-space keyval-tail + + variable spaceSameTransitions [dict create] + #JMN test + #dict set spaceSameTransitions array-space array-syntax + + #itable-keyval-space itable-val-tail + + + variable state_list ;#reset every tomlish::decode::toml + + namespace export tomlish toml + namespace ensemble create + + #goNextState has various side-effects e.g pushes and pops spacestack + #REVIEW - setting nest and v elements here is ugly + #todo - make neater, more single-purpose? + proc goNextState {tokentype tok currentstate} { + variable state + variable nest + variable v + + set prevstate $currentstate + + + variable spacePopTransitions + variable spacePushTransitions + variable spaceSameTransitions + + variable last_space_action "none" + variable last_space_type "none" + variable state_list + + set result "" + set starttok "" + + if {[dict exists $::tomlish::parse::stateMatrix $currentstate $tokentype]} { + set transition_to [dict get $::tomlish::parse::stateMatrix $currentstate $tokentype] + ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" + switch -exact -- [lindex $transition_to 0] { + POPSPACE { + set popfromspace_info [spacestack peek] + set popfromspace_state [dict get $popfromspace_info state] + spacestack pop + set parent_info [spacestack peek] + set type [dict get $parent_info type] + set parentspace [dict get $parent_info state] + + set last_space_action "pop" + set last_space_type $type + + if {[dict exists $parent_info returnstate]} { + set next [dict get $parent_info returnstate] + #clear the returnstate on current level + set existing [spacestack pop] + dict unset existing returnstate + spacestack push $existing ;#re-push modification + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected to stored returnstate $next <<---" + } else { + ### + #review - do away with spacePopTransitions - which although useful to provide a default.. + # - involve error-prone configurations distant to the main state transition configuration in stateMatrix + if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { + set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" + } else { + set next $parentspace + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace<<---" + } + } + set result $next + } + SAMESPACE { + set currentspace_info [spacestack peek] + ::tomlish::log::debug "--->> SAMESPACE got current space entry: $currentspace_info <<<<<" + set type [dict get $currentspace_info type] + set currentspace [dict get $currentspace_info state] + + if {[dict exists $currentspace_info returnstate]} { + set next [dict get $currentspace_info returnstate] + #clear the returnstate on current level + set existing [spacestack pop] + dict unset existing returnstate + spacestack push $existing ;#re-push modification + ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected to stored returnstate $next" + } else { + if {[dict exists $::tomlish::parse::spaceSameTransitions $currentspace]} { + set next [dict get $::tomlish::parse::spaceSameTransitions $currentspace] + ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected state to $next (spaceSameTransitions)" + } else { + set next $currentspace + ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace" + } + } + set result $next + } + zeropoppushspace { + if {$nest > 0} { + #pop back down to the root level (table-space) + spacestack pop + set parentinfo [spacestack peek] + set type [dict get $parentinfo type] + set target [dict get $parentinfo state] + + set last_space_action "pop" + set last_space_type $type + + #----- + #standard pop + set parentlevel [expr {$nest -1}] + lappend v($parentlevel) [set v($nest)] + incr nest -1 + #----- + } + #re-entrancy + + #set next [list PUSHSPACE [lindex $transition_to 1]] + set nexttokentype ${tokentype}2 ;#fake token type e.g tablename2 or tablearrayname2 + ::tomlish::log::debug "--->> zeropoppushspace goNextState RECURSE. calling goNextState $nexttokentype $currentstate" + set transition_info [::tomlish::parse::goNextState $nexttokentype $tok $currentstate] + set result [dict get $transition_info newstate] + } + PUSHSPACE { + set original_target [dict get $transition_to PUSHSPACE] + if {[dict exists $transition_to returnstate]} { + #adjust the existing space record on the stack. + #struct::stack doesn't really support that - so we have to pop and re-push + #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack + set currentspace [spacestack pop] + dict set currentspace returnstate [dict get $transition_to returnstate] + spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. + } + if {[dict exists $transition_to starttok]} { + set starttok [dict get $transition_to starttok] + } + spacestack push [dict create type space state $original_target] + + set last_space_action "push" + set last_space_type "space" + + if {[dict exists $transition_to state]} { + #an explicit state in the pushed space was requested in the stateMatrix - override the spacePushTransition (spacePushTransitions can be deprecated if we require explicitness?) + set next [dict get $transition_to state] + ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next by explicit 'state' entry" + } else { + #puts $::tomlish::parse::spacePushTransitions + if {[dict exists $::tomlish::parse::spacePushTransitions $original_target]} { + set next [dict get $::tomlish::parse::spacePushTransitions $original_target] + ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next (spacePushTransitions) " + } else { + set next $original_target + ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target" + } + } + set result $next + } + TOSTATE { + if {[dict exists $transition_to returnstate]} { + #adjust the existing space record on the stack. + #struct::stack doesn't really support that - so we have to pop and re-push + #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack + set currentspace [spacestack pop] + dict set currentspace returnstate [dict get $transition_to returnstate] + spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. + } + set result [dict get $transition_to TOSTATE] + } + default { + #simplified version of TOSTATE + set result [lindex $transition_to 0] ;#ignore everything but first word + } + } + } else { + ::tomlish::log::error "--->> No state transition defined from state $currentstate when tokentype $tokentype received" + set result "nostate" + } + lappend state_list [list tokentype $tokentype from $currentstate to $result] + set state $result + ::tomlish::log::notice "--->> STATE TRANSITION tokenType: '$tokentype' tok:$tok triggering '$currentstate' -> '$result' last_space_action:$last_space_action " + return [dict create prevstate $prevstate newstate $result space_action $last_space_action starttok $starttok] + } + + proc report_line {{line ""}} { + variable linenum + variable is_parsing + if {$is_parsing} { + if {$line eq ""} { + set line $linenum + } + return "Line Number: $line" + } else { + #not in the middle of parsing tomlish text - return nothing. + return "" + } + } + + #produce a *slightly* more readable string rep of the nest for puts etc. + proc nest_pretty1 {list} { + set prettier "{" + + foreach el $list { + if { [lindex $el 0] eq "NEWLINE"} { + append prettier "[list $el]\n" + } elseif {([llength $el] > 1) && ([lindex $el 0] in {KEY DQKEY SQKEY TABLE ARRAY})} { + append prettier [nest_pretty1 $el] + } else { + append prettier "[list $el] " + } + } + append prettier "}" + return $prettier + } + + proc set_tokenType {t} { + variable tokenType + variable tokenType_list + if {![info exists tokenType]} { + set tokenType "" + } + lappend tokenType_list $t + set tokenType $t + } + + proc switch_tokenType {t} { + variable tokenType + variable tokenType_list + lset tokenType_list end $t + set tokenType $t + } + + proc get_tokenType {} { + variable tokenType + return $tokenType + } + + + proc get_token_waiting {} { + variable token_waiting + return [lindex $token_waiting 0] + } + proc clear_token_waiting {} { + variable token_waiting + set token_waiting [list] + } + + #token_waiting is a list - but our standard case is to have only one + #in certain circumstances such as near eof we may have 2 + #the set_token_waiting function only allows setting when there is not already one waiting. + #we want to catch cases of inadvertently trying to set multiple + # - the reason being that the state transition triggered by the previous token may have invalidated the assumptions made when a token was added as waiting. + proc set_token_waiting {args} { + if {[llength $args] %2 != 0} { + error "tomlish set_token_waiting must have args of form: type value complete 0|1" + } + variable token_waiting + + if {[llength $token_waiting] && [dict get [lindex $token_waiting end] type] ne "eof"} { + #tokloop already set a token_waiting - but something (post tokloop processing?) is trying to set another + #we may need to remove the existing token_waiting and reset the tokloop index to the previous char so it's reprocessed in the possibly new context + #rather than attempt to make the right decision here - we raise an error and require the caller to check/handle it + set err "tomlish set_token_waiting already has token_waiting: [lindex $token_waiting 0]" + append err \n " - cannot add token_waiting: $args" + error $err + #set tomlish::parse::i [expr {[dict get $token_waiting startindex] -1}] + #set token_waiting [list] + } + + set waiting [dict create] + dict for {k v} $args { + switch -exact $k { + type - complete { + dict set waiting $k $v + } + value { + dict set waiting tok $v + } + startindex { + dict set waiting startindex $v + } + default { + error "tomlish set_token_waiting error - unrecognised key $k. known keys: [dict keys $args]" + } + } + } + if {![tcl::string::is boolean -strict [dict get $waiting complete]]} { + error "tomlish set_token_waiting error - 'complete' must be a boolean. got [dict get $waiting complete]" + } + if {![llength $token_waiting]} { + set token_waiting [list $waiting] + } else { + #an extra sanity-check that we don't have more than just the eof.. + if {[llength $token_waiting] > 1} { + set err "tomlish Unexpected. Existing token_waiting count > 1.\n" + foreach tw $token_waiting { + append err " $tw" \n + } + append err " - cannot add token_waiting: $waiting" + error $err + } + #last entry must be a waiting eof + set token_waiting [list $waiting [lindex $token_waiting end]] + } + return + } + + #returns 0 or 1 + #tomlish::parse::tok + #we attempt to do this without lookahead (potential use in streaming toml? for what benefit?) todo -final flag + # - the possible benefit is being able to more easily process in arbitrarily split chunks (although we would still have to watch crlf splitting ?) + # - interactive use? + + proc tok {} { + variable nest + variable s + variable v + variable i + variable tok + variable type ;#character type + variable state ;#FSM + + + variable tokenType + variable tokenType_list + + + variable endToken + + variable lastChar + + variable braceCount + variable bracketCount + + + #------------------------------ + #Previous run found another (presumably single-char) token + #The normal case is for there to be only one dict in the list + #multiple is an exception - primarily for eof + variable token_waiting + if {[llength $token_waiting]} { + set waiting [lindex $token_waiting 0] + + set tokenType [dict get $waiting type] + set tok [dict get $waiting tok] + #todo: dict get $token_waiting complete + set token_waiting [lrange $token_waiting 1 end] + return 1 + } + #------------------------------ + + set resultlist [list] + set sLen [tcl::string::length $s] + + set slash_active 0 + set quote 0 + set c "" + for {} {$i < $sLen} {} { + if {$i > 0} { + set lastChar [tcl::string::index $s [expr {$i - 1}]] + set start_of_data h + } else { + set lastChar "" + set start_of_data 1 + #bom-handling + if {[tcl::string::index $s 0] eq "\uFEFF"} { + #bom (could be from various encodings - now decoded as single unicode char FEFF) + #incr i 1 ;#skip over initial bom? + } + } + + + set c [tcl::string::index $s $i] + set cindex $i + + set ctest [tcl::string::map {\{ lc \} rc \[ lb \] rb \" dq ' sq \\ bsl \r cr \n lf \t tab \uFEFF bom} $c] + tomlish::log::debug "- tokloop char <$ctest> index $i tokenType:$tokenType tok:<$tok>" + #puts "got char $c during tokenType '$tokenType'" + incr i ;#must incr here because we do returns inside the loop + + + + switch -exact -- $ctest { + # { + set had_slash $slash_active + set slash_active 0 + + if {$had_slash} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + #for multiliteral, multistring - data and/or end + incr i -1 + return 1 + } + _start_squote_sequence { + #pseudo token beginning with underscore - never returned to state machine - review + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + barekey { + error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed (see tomlish::utils::is_barekey). [tomlish::parse::report_line]" + } + whitespace { + # hash marks end of whitespace token + #do a return for the whitespace, set token_waiting + #set_token_waiting type comment value "" complete 1 + incr i -1 ;#leave comment for next run + return 1 + } + untyped_value { + #REVIEW! the spec isn't clear.. is whitespace after an int,bool etc required before comment? + #we will accept a comment marker as an immediate terminator of the untyped_value. + incr i -1 + return 1 + } + starttablename - starttablearrayname { + #fix? + error "tomlish Character '#' is invalid first character for $tokenType. [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #invalid in bare parts - but allowed in quoted parts - let tablename parser sort it out + append tok $c + } + default { + #dquotedkey, string,literal, multistring + append tok $c + } + } + } else { + switch -- $state { + multistring-space { + set_tokenType stringpart + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "#" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "#" + } + default { + #start of token if we're not in a token + set_tokenType comment + set tok "" ;#The hash is not part of the comment data + } + } + } + } + lc { + #left curly brace + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i [tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - starttablearrayname { + #*bare* tablename can only contain letters,digits underscores + error "tomlish Invalid tablename first character \{ [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #valid in quoted parts + append tok $c + } + comment { + if {$had_slash} {append tok "\\"} + append tok "\{" + } + default { + #end any other token. + incr i -1 + return 1 + } + } + } else { + switch -exact -- $state { + itable-keyval-value-expected - keyval-value-expected { + #switch last key to tablename?? + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + array-space - array-syntax { + #nested anonymous inline table + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\{" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\{" + } + default { + error "tomlish state: '$state'. left brace case not implemented [tomlish::parse::report_line]" + } + } + } + + } + rc { + #right curly brace + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey - comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - tablename { + if {$had_slash} {append tok "\\"} + #invalid! - but leave for datastructure loading stage to catch + set_token_waiting type endinlinetable value "" complete 1 startindex $cindex + return 1 + } + starttablearrayname - tablearrayname { + if {$had_slash} {append tok "\\"} + #invalid! - but leave for datastructure loading stage to catch + set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex + return 1 + } + default { + #end any other token + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + itable-space { + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + tablename-state { + #e.g [] - empty tablename - allowed or not? + #empty tablename/tablearrayname ? + #error "unexpected tablename problem" + + set_tokenType "endinlinetable" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-state { + error "tomlish unexpected tablearrayname-state problem" + set_tokenType "endinlinetable" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + array-syntax - array-space { + #invalid + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + itable-val-tail { + set_tokenType "endinlinetable" + set tok "" + #we need to pop the keyval - and then reprocess to pop the inlinetable - so we incr -1 + incr i -1 + return 1 + } + itable-keyval-syntax { + error "tomlish endinlinetable unexpected at this point. Expecting key=val syntax [tomlish::parse::report_line]" + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\}" + } + multiliteral-space { + set_tokenType "literalpart" ; #review + set tok "\}" + } + default { + #JMN2024b keyval-tail? + error "tomlish state '$state'. endinlinetable case not implemented [tomlish::parse::report_line]" + } + } + } + + } + lb { + #left square bracket + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename { + #change the tokenType + switch_tokenType "starttablearrayname" + set tok "" ;#no output into the tomlish list for this token + #any following whitespace is part of the tablearrayname, so return now + return 1 + } + tablename - tablearrayname { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + #append tok "\\[" + append tok {\[} + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + #invalid at this point - state machine should disallow: + # table -> starttablearrayname + # tablearray -> starttablearrayname + set_token_waiting type starttablearrayname value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "\[" + } + } + } + comment { + if {$had_slash} {append tok "\\"} + append tok "\[" + } + default { + #end any other token. + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + keyval-value-expected - itable-keyval-value-expected { + set_tokenType "startarray" + set tok "\[" + return 1 + } + array-space - array-syntax { + #nested array? + set_tokenType "startarray" + set tok "\[" + return 1 + #error "state: array-space. startarray case not implemented [tomlish::parse::report_line]" + } + table-space { + #table name + #assume it's a single bracket - but we need to wait for non-bracket to confirm it's not a tablearray + #note that a starttablearrayname token may contain whitespace between the brackets + # e.g \[ \[ + set_tokenType "starttablename" + set tok "" ;#there is no output into the tomlish list for this token + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\[" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\[" + } + itable-space { + #handle state just to give specific error msg + error "tomlish state: '$state'. Left square bracket invalid. Cannot start array in inline table without key. Use key=\[\] syntax. [tomlish::parse::report_line]" + } + default { + error "tomlish state: '$state'. startarray case not implemented [tomlish::parse::report_line]" + } + } + } + } + rb { + #right square bracket + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + #???? + incr i -1 + if {$had_slash} {incr i -1} ;#reprocess + return 1 + } else { + incr i -1 + if {$had_slash} {incr i -1} ;#reprocess + return 1 + } + } + starttablename { + #toml-test invalid/table/empty + + set_token_waiting type tablename value "" complete 1 startindex $cindex + incr i -1 + return 1 + } + tablename { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + append tok "\\]" + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + set_token_waiting type endtablename value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "]" + } + } + } + tablearrayname { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + append tok "\\]" + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "]" + } + } + } + default { + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + array-syntax - array-space { + #invalid - but allow parser statemachine to report it. + set_tokenType "endarray" + set tok "\]" + return 1 + } + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "endarray" + set tok "\]" + return 1 + } + tablename-state { + #e.g [] - empty tablename + #tomltest 1.1.0 invalid/table/empty + #should be invalid + #we parse it and let dict::from_tomlish error when it tries to split table + + set_tokenType "endtablename" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-state { + error "tomlish unexpected tablearrayname problem" + set_tokenType "endtablearray" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-tail { + #[[xxx] + set_tokenType "endtablename" + #sequence: starttablename -> starttablearrayname -> endtablearrayname -> endtablename + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\]" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\]" + } + default { + error "tomlish state '$state'. endarray case not implemented [tomlish::parse::report_line]" + } + } + } + } + bsl { + #backslash + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + if {$state eq "multistring-space"} { + #end whitespace token + incr i -1 ;#reprocess bsl in next run + return 1 + } else { + error "tomlish Unexpected backslash during whitespace. [tomlish::parse::report_line]" + } + } + literal - literalpart - squotedkey { + #never need to set slash_active true when in single quoted tokens + append tok "\\" + set slash_active 0 + } + string - dquotedkey - comment { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + stringpart { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + starttablename - starttablearrayname { + error "tomlish backslash is invalid as first character of $tokenType [tomlish::parse::report_line]" + } + tablename - tablearrayname { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + barekey { + error "tomlish Unexpected backslash during barekey. [tomlish::parse::report_line]" + } + default { + error "tomlish Backslash unexpected during tokentype: '$tokenType'. [tomlish::parse::report_line]" + } + } + } else { + switch -exact -- $state { + multistring-space { + if {$slash_active} { + set_tokenType "stringpart" + set tok "\\\\" + set slash_active 0 + } else { + set slash_active 1 + } + } + multiliteral-space { + #nothing can be escaped in multiliteral-space - not even squotes (?) review + set_tokenType "literalpart" + set tok "\\" + } + default { + error "tomlish tok error: Unexpected backslash when no token is active. [tomlish::parse::report_line]" + } + } + } + } + sq { + #single quote + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote { + #for within multiliteral + #short tentative_accum_squote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_squote is multisquote + 2 = 5 + #return tok with value ''''' + return 1 + } + } + tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multiliteral + #switch? + append tok $c + set_tokenType triple_squote + return 1 + } + default { + #if there are more than 3 leading squotes we also enter multiliteral space and the subsequent ones are handled + #by the tentative_accum_squote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 squotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" + } + } + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + #end whitespace + incr i -1 ;#reprocess sq + return 1 + } + literal { + #slash_active always false + #terminate the literal + set_token_waiting type endsquote value "'" complete 1 startindex $cindex + return 1 + } + literalpart { + #ended by ''' - but final could be '''' or ''''' (up to 2 squotes allowed directly before ending triple squote sequence) + #todo + # idea: end this literalpart (possibly 'temporarily') + # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack + # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) + incr i -1 ;#throw the "'" back to loop - will be added to a tentative_accum_squote token for later processing + return 1 + } + XXXitablesquotedkey { + set_token_waiting type endsquote value "'" complete 1 startindex $cindex + return 1 + } + squotedkey { + ### + #set_token_waiting type endsquote value "'" complete 1 + return 1 + } + starttablename - starttablearrayname { + #!!! + incr i -1 + return 1 + } + tablename - tablearrayname { + append tok $c + } + barekey { + #barekeys now support all sorts of unicode letter/number chars for other cultures + #but not punctuation - not even for those of Irish heritage who don't object + #to the anglicised form of some names. + # o'shenanigan seems to not be a legal barekey + #The Irish will have to use an earlier form Ó - which apparently many may prefer anyway. + error "tomlish Unexpected single quote during barekey. [tomlish::parse::report_line]" + } + default { + append tok $c + } + } + } else { + switch -exact -- $state { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading squote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_squote token or triple_squote token + #It currently doesn't trigger double_squote token + #(handle '' same as 'x' ie produce a single_squote and go into processing literal) + #review - producing double_squote for empty literal may be slightly more efficient. + #This token is not used to handle squote sequences *within* a multiliteral + set_tokenType "_start_squote_sequence" + set tok "'" + } + multiliteral-space { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_squote" ;#trigger tentative_accum_squote + set tok "'" + return 1 + } + table-space - itable-space { + #tests: squotedkey.test squotedkey_itable.test + set_tokenType "squotedkey" + set tok "" + } + XXXtable-space - XXXitable-space { + #future - could there be multiline keys? MLLKEY, MLBKEY ? + #this would (almost) allow arbitrary tcl dicts to be stored in toml (aside from escaping issues) + #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files + #@2025 ABNF for toml mentions key, simple-key, unquoted-key, quoted-key and dotted-key + #where key is simple-key or dotted-key - no MLL or MLB components + #the spec states solution for arbitrary binary data is application specific involving encodings + #such as hex, base64 + set_tokenType "_start_squote_sequence" + set tok "'" + return 1 + } + tablename-state { + #first char in tablename-state/tablearrayname-state + set_tokenType "tablename" + append tok "'" + } + tablearrayname-state { + set_tokenType "tablearrayname" + append tok "'" + } + literal-state { + #shouldn't get here? review + tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" + set_tokenType "literal" + incr -1 + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "," + #error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" + } + dottedkey-space { + set_tokenType "squotedkey" + } + default { + error "tomlish unhandled squote during state '$state'. [tomlish::parse::report_line]" + } + } + } + + } + dq { + #double quote + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #review + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + tentative_accum_dquote { + #within multistring + #short tentative_accum_dquote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_dquote is multidquote + 2 = 5 + #return tok with value """"" + return 1 + } + } + _start_dquote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multistring + #switch? + append tok $c + set_tokenType triple_dquote + return 1 + } + default { + #if there are more than 3 leading dquotes we also enter multistring space and the subsequent ones are handled + #by the tentative_accum_dquote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 dquotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_dquote_sequence'" + } + } + } + literal - literalpart { + append tok $c + } + string { + if {$had_slash} { + append tok "\\" $c + } else { + #unescaped quote always terminates a string + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex + return 1 + } + } + stringpart { + #sub element of multistring + if {$had_slash} { + append tok "\\" $c + } else { + incr i -1 ;#throw the {"} back to loop - will be added to a tentative_accum_dquote token for later processing + return 1 + } + } + whitespace { + #assert: had_slash will only ever be true in multistring-space + if {$had_slash} { + incr i -2 + return 1 + } else { + #end whitespace token - throw dq back for reprocessing + incr i -1 + return 1 + } + } + comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + XXXdquotedkey { + if {$had_slash} { + append tok "\\" + append tok $c + } else { + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex + return 1 + } + } + dquotedkey { + ### + if {$had_slash} { + append tok "\\" + append tok $c + } else { + #set_token_waiting type enddquote value {"} complete 1 + return 1 + } + } + squotedkey { + append tok $c + } + tablename - tablearrayname { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - starttablearrayname { + incr i -1 ;## + return 1 + } + default { + error "tomlish got quote during tokenType '$tokenType' [tomlish::parse::report_line]" + } + } + } else { + #$slash_active not relevant when no tokenType + #token is string only if we're expecting a value at this point + switch -exact -- $state { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading dquote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_dquote token or triple_dquote token + #It currently doesn't trigger double_dquote token + #(handle "" same as "x" ie produce a single_dquote and go into processing string) + #review - producing double_dquote for empty string may be slightly more efficient. + #This token is not used to handle dquote sequences once *within* a multistring + set_tokenType "_start_dquote_sequence" + set tok {"} + } + multistring-space { + if {$had_slash} { + set_tokenType "stringpart" + set tok "\\\"" + } else { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_dquote" ;#trigger tentative_accum_dquote + set tok {"} + return 1 + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\"" + } + table-space - itable-space { + set_tokenType "dquotedkey" + set tok "" + } + dottedkey-space { + set_tokenType dquotedkey + set tok "" + + #only if complex keys become a thing + #set_tokenType dquote_seq_begin + #set tok $c + } + tablename-state { + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } + default { + error "tomlish Unexpected dquote during state '$state' [tomlish::parse::report_line]" + } + } + } + } + = { + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #review + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + #assertion had_slash 0 + append tok $c + } + string - comment - dquotedkey { + #for these tokenTypes an = is just data. + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + incr i -1 + return 1 + } else { + set_token_waiting type equal value = complete 1 startindex $cindex + return 1 + } + } + barekey { + #set_token_waiting type equal value = complete 1 + incr i -1 + return 1 + } + starttablename - starttablearrayname { + error "tomlish Character '=' is invalid first character for $tokenType. [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #invalid in bare name - but valid in quoted parts - leave for tablename parser to sort out + append tok $c + } + default { + error "tomlish unexpected = character during tokentype $tokenType. case not implemented. [tomlish::parse::report_line]" + } + } + } else { + switch -exact -- $state { + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok = + } + multiliteral-space { + set_tokenType "literalpart" + set tok "=" + } + dottedkey-space { + set_tokenType "equal" + set tok "=" + return 1 + } + default { + set_tokenType "equal" + set tok = + return 1 + } + } + } + } + cr { + #REVIEW! + # \r carriage return + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #we have received a double cr + ::tomlish::log::warn "double cr - will generate cr token. needs testing" + set_tokenType "cr" ;#lone cr token will generally raise an error - but let state machine handle it + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal { + append tok $c + } + literalpart { + #part of MLL string (multi-line literal string) + #we need to split out crlf as a separate NEWLINE to be consistent + ::tomlish::log::warn "literalpart ended by cr - needs testing" + #return literalpart temporarily - allow cr to be reprocessed from multiliteral-space + incr i -1 + return 1 + } + stringpart { + #stringpart is a part of MLB string (multi-line basic string) + #throw back the cr - if followed by lf it will become a {NEWLINE crlf} entry within the MULTISTRING list (e.g between STRINGPART entries) + incr i -1 + return 1 + } + starttablename - starttablearrayname { + error "tomlish Character is invalid first character for $tokenType. [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #could in theory be valid in quoted part of name + #review - might be better just to disallow here + append tok $c + } + whitespace { + #it should technically be part of whitespace if not followed by lf + #but outside of values we are also free to map it to be another NEWLINE instead? REVIEW + incr i -1 + return 1 + } + untyped_value { + incr i -1 + return 1 + } + comment { + #JJJJ + #review + incr i -1 + return 1 + } + default { + #!todo - error out if cr inappropriate for tokenType + append tok $c + } + } + } else { + #lf may be appended if next + #review - lone cr as newline? - this is uncommon - but so is lone cr in a string(?) + set_tokenType "newline" + set tok cr + } + } + lf { + # \n newline + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #review + #this lf is the trailing part of a crlf + append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + #multiliteral or multistring + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal { + #nl is not allowed *within* a literal - require multiliteral syntax for any literal containing a newline ''' ''' + #even though we terminate the literal without the closing quote here - the token_waiting newline should trigger a state error + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + literalpart { + #we allow newlines - but store them within the multiliteral as their own element + #This is a legitimate end to the literalpart - but not the whole multiliteral + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + stringpart { + if {$had_slash} { + #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) + set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] + incr i -1 + return 1 + } else { + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + } + starttablename - tablename - tablearrayname - starttablearrayname { + error "tomlish Character is invalid in $tokenType. [tomlish::parse::report_line]" + } + default { + #newline ends all other tokens. + #note for string: we don't add (raw unescaped) newline to simple string. (must use multi-string for this) + #note for whitespace: + # we will use the convention that \n terminates the current whitespace even if whitespace follows + # ie whitespace is split into separate whitespace tokens at each newline + + #puts "-------------- newline lf during tokenType $tokenType" + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + } + } else { + switch -exact -- $state { + multistring-space { + if {$had_slash} { + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 + } else { + set_tokenType "newline" + set tok lf + return 1 + } + } + multiliteral-space { + #assert had_slash 0 + set_tokenType "newline" + set tok "lf" + return 1 + } + default { + #ignore slash? error? + set_tokenType "newline" + set tok lf + return 1 + } + } + #if {$had_slash} { + # #CONT directly before newline - allows strings_5_byteequivalent test to pass + # set_tokenType "continuation" + # set tok "\\" + # incr i -1 + # return 1 + #} else { + # set_tokenType newline + # set tok lf + # return 1 + #} + } + } + , { + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + comment - tablename - tablearrayname { + if {$had_slash} {append tok "\\"} + append tok , + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + #stringpart can have up to 2 quotes too + if {$had_slash} {append tok "\\"} + append tok $c + } + literal - literalpart - squotedkey { + #assert had_slash always 0 + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + incr i -1 + return 1 + } else { + set_token_waiting type comma value "," complete 1 startindex $cindex + return 1 + } + } + default { + set_token_waiting type comma value "," complete 1 startindex $cindex + if {$had_slash} {append tok "\\"} + return 1 + } + } + } else { + switch -exact -- $state { + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "," + } + multiliteral-space { + #assert had_slash 0 + set_tokenType "literalpart" + set tok "," + } + default { + set_tokenType "comma" + set tok "," + return 1 + } + } + } + } + . { + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + comment - untyped_value { + if {$had_slash} {append tok "\\"} + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + literal - literalpart - squotedkey { + #assert had_slash always 0 + append tok $c + } + whitespace { + switch -exact -- $state { + multistring-space { + #review + if {$had_slash} { + incr i -2 + } else { + incr i -1 + } + return 1 + } + xxxdottedkey-space { + incr i -1 + return 1 + } + dottedkey-space-tail { + incr i -1 + return 1 + } + default { + error "tomlish Received period during tokenType 'whitespace' [tomlish::parse::report_line]" + } + } + } + starttablename - starttablearrayname { + #This would correspond to an empty table name + error "tomlish Character '.' is not allowed as first character ($tokenType). [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #subtable - split later - review + append tok $c + } + barekey { + #e.g x.y = 1 + #we need to transition the barekey to become a structured table name ??? review + #x is the tablename y is the key + set_token_waiting type dotsep value "." complete 1 startindex $cindex + return 1 + } + default { + error "tomlish Received period during tokenType '$tokenType' [tomlish::parse::report_line]" + #set_token_waiting type period value . complete 1 + #return 1 + } + } + } else { + switch -exact -- $state { + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "." + } + multiliteral-space { + set_tokenType "literalpart" + set tok "." + } + XXXdottedkey-space { + ### obs? + set_tokenType "dotsep" + set tok "." + return 1 + } + dottedkey-space-tail { + ### + set_tokenType "dotsep" + set tok "." + return 1 + } + default { + set_tokenType "untyped_value" + set tok "." + } + } + } + + } + " " - tab { + if {[tcl::string::length $tokenType]} { + set had_slash $slash_active + set slash_active 0 + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + barekey { + #todo had_slash - emit token or error + #whitespace is a terminator for bare keys + #set_token_waiting type whitespace value $c complete 1 + incr i -1 + return 1 + } + untyped_value { + #unquoted values (int,date,float etc) are terminated by whitespace + #set_token_waiting type whitespace value $c complete 1 + incr i -1 + return 1 + } + comment { + if {$had_slash} { + append tok "\\" + } + append tok $c + } + string - dquotedkey { + if {$had_slash} { append tok "\\" } + append tok $c + } + stringpart { + #for stringpart we store WS separately for ease of processing continuations (CONT stripping) + if {$had_slash} { + #REVIEW + #emit the stringpart - go back to the slash + incr i -2 + return 1 + } else { + #split into STRINGPART xxx WS " " + incr i -1 + return 1 + } + } + literal - literalpart - squotedkey { + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + append tok $c + } else { + append tok $c + } + } + starttablename - starttablearrayname { + incr i -1 + return 1 + } + tablename - tablearrayname { + #include whitespace in the tablename/tablearrayname + #Will need to be normalized upon interpreting the tomlish as a datastructure + append tok $c + } + default { + error "tomlish Received whitespace space during tokenType '$tokenType' [tomlish::parse::report_line]" + } + } + } else { + set had_slash $slash_active + set slash_active 0 + switch -exact -- $state { + tablename-state { + #tablename can have leading,trailing and interspersed whitespace! + #These will not be treated as whitespace tokens, instead forming part of the name. + set_tokenType tablename + set tok "" + if {$had_slash} {append tok "\\"} + append tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok "" + if {$had_slash} {append tok "\\"} + append tok $c + } + multistring-space { + if {$had_slash} { + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 + } else { + set_tokenType "whitespace" + append tok $c + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + default { + if {$had_slash} { + error "tomlish unexpected backslash [tomlish::parse::report_line]" + } + set_tokenType "whitespace" + append tok $c + } + } + } + } + tabX { + if {[tcl::string::length $tokenType]} { + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) + set slash_active 0 + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + barekey { + #whitespace is a terminator for bare keys + incr i -1 + #set_token_waiting type whitespace value $c complete 1 + return 1 + } + untyped_value { + #unquoted values (int,date,float etc) are terminated by whitespace + #set_token_waiting type whitespace value $c complete 1 + incr i -1 + return 1 + } + squotedkey { + append tok $c + } + dquotedkey - string - comment - whitespace { + #REVIEW + append tok $c + } + stringpart { + #for stringpart we store WS separately for ease of processing continuations (CONT stripping) + if {$had_slash} { + #REVIEW + #emit the stringpart - go back to the slash + incr i -2 + return 1 + } else { + #split into STRINGPART aaa WS " " + incr i -1 + return 1 + } + } + literal - literalpart { + append tok $c + } + starttablename - starttablearrayname { + incr i -1 + return 1 + } + tablename - tablearrayname { + #include whitespace in the tablename/tablearrayname + #Will need to be normalized upon interpreting the tomlish as a datastructure + append tok $c + } + default { + error "tomlish Received whitespace tab during tokenType '$tokenType' [tomlish::parse::report_line]" + } + } + } else { + set had_slash $slash_active + if {$slash_active} { + set slash_active 0 + } + switch -exact -- $state { + tablename-state { + #tablename can have leading,trailing and interspersed whitespace! + #These will not be treated as whitespace tokens, instead forming part of the name. + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } + multistring-space { + if {$had_slash} { + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 + } else { + set_tokenType whitespace + append tok $c + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + default { + set_tokenType "whitespace" + append tok $c + } + } + } + } + bom { + #bom encoded as single unicode codepoint \uFFEF + #BOM (Byte Order Mark) - ignored by token consumer + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + #assert - tok will be one or two squotes only + #A toml literal probably isn't allowed to contain this + #but we will parse and let the validator sort it out. + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart { + append tok $c + } + string - stringpart { + append tok $c + } + default { + #state machine will generally not have entry to accept bom - let it crash + set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex + return 1 + } + } + } else { + switch -exact -- $state { + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + multistring-space { + set_tokenType "stringpart" + set tok $c + } + default { + set_tokenType "bom" + set tok "\uFEFF" + return 1 + } + } + } + } + default { + + if {[tcl::string::length $tokenType]} { + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. + set slash_active 0 + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + if {$state eq "multistring-space"} { + incr i -1 + return 1 + } else { + #review + incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. + return 1 + } + } + barekey { + if {[tomlish::utils::is_barekey $c]} { + append tok $c + } else { + error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed. (see tomlish::utils::is_barekey) [tomlish::parse::report_line]" + } + } + starttablename - starttablearrayname { + incr i -1 + #allow statemachine to set context for subsequent chars + return 1 + } + string - stringpart { + append tok $c + } + default { + #e.g comment/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname + append tok $c + } + } + } else { + set had_slash $slash_active + set slash_active 0 + switch -exact -- $state { + table-space - itable-space { + #if no currently active token - assume another key value pair + if {[tomlish::utils::is_barekey $c]} { + set_tokenType "barekey" + append tok $c + } else { + error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" + } + } + multistring-space { + set_tokenType "stringpart" + if {$had_slash} { + set tok \\$c + } else { + set tok $c + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + tablename-state { + set_tokenType "tablename" + set tok $c + } + tablearrayname-state { + set_tokenType "tablearrayname" + set tok $c + } + dottedkey-space { + set_tokenType barekey + set tok $c + } + default { + #todo - something like ansistring VIEW to show control chars? + set cshow [string map [list \t tab \v vt] $c] + tomlish::log::debug "- tokloop char '$cshow' setting to untyped_value while state:$state [tomlish::parse::report_line]" + set_tokenType "untyped_value" + set tok $c + } + } + } + } + } + + } + + #run out of characters (eof) + if {[tcl::string::length $tokenType]} { + #check for invalid ending tokens + #if {$state eq "err-state"} { + # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" + #} + switch -exact -- $tokenType { + _start_squote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open literal + error "tomlish eof reached without closing single quote for string literal. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "literal" + set tok "" + return 1 + + ##review + #set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] + #set_tokenType "literal" + #set tok "" + #return 1 + } + } + } + _start_dquote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open string + error "tomlish eof reached without closing double quote for string. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "string" + set tok "" + return 1 + } + } + } + newline { + #The only newline token that has still not been returned should have a tok value of "cr" + puts "tomlish eof reached - with incomplete newline token '$tok'" + if {$tok eq "cr"} { + #we convert lone cr to it's own "cr" token elsewhere in the document to allow statemachine to handle it. + #(which it should generally do by not handling it ie raising an error - or emitting an ERROR list in the tomlish) + #if trailing char is a lone cr - we should encode it the same way as elsewhere that is outside of values + # ie as it's own token. + switch_tokenType "cr" + return 1 + } else { + #should be unreachable + error "tomlish eof reached - with invalid newline token. value: $tok" + } + } + } + set_token_waiting type eof value eof complete 1 startindex $i ;#review + return 1 + } else { + ::tomlish::log::debug "- No current tokenType, ran out of characters, setting tokenType to 'eof' [tomlish::parse::report_line]" + set tokenType "eof" + set tok "eof" + } + return 0 + } + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::parse ---}] +} +namespace eval tomlish::huddle { + proc from_json {json} { + package require huddle + package require huddle::json + #note - huddle may now contain raw surrogate pair - which cannot be emitted to stdout + set h [huddle::json::json2huddle parse $json] + } + proc from_dict {d} { + + } + + #raw - strings must already be processed into values suitable for json e.g surrogate pair escaping + proc jsondumpraw {huddle_object {offset " "} {newline "\n"} {begin ""}} { + upvar ::huddle::types types + set nextoff "$begin$offset" + set nlof "$newline$nextoff" + set sp " " + if {[string equal $offset ""]} {set sp ""} + + set type [huddle type $huddle_object] + + switch -- $type { + boolean - + number { + return [huddle get_stripped $huddle_object] + } + null { + return null + } + string { + set data [huddle get_stripped $huddle_object] + + # JSON permits only oneline string + #set data [string map { + # \n \\n + # \t \\t + # \r \\r + # \b \\b + # \f \\f + # \\ \\\\ + # \" \\\" + # / \\/ + # } $data + #] + return "\"$data\"" + } + list { + set inner {} + set len [huddle llength $huddle_object] + for {set i 0} {$i < $len} {incr i} { + set subobject [huddle get $huddle_object $i] + lappend inner [jsondumpraw $subobject $offset $newline $nextoff] + } + if {[llength $inner] == 1} { + return "\[[lindex $inner 0]\]" + } + return "\[$nlof[join $inner ,$nlof]$newline$begin\]" + } + dict { + set inner {} + foreach {key} [huddle keys $huddle_object] { + lappend inner [subst {"$key":$sp[jsondumpraw [huddle get $huddle_object $key] $offset $newline $nextoff]}] + } + #if {[llength $inner] == 1} { + # return $inner ;#wrong - breaks with quoted list representation + # #FAILS: toml-test valid/comment/tricky + #} + + return "\{$nlof[join $inner ,$nlof]$newline$begin\}" + } + default { + set node [unwrap $huddle_object] + #foreach {tag src} $node break + lassign $node tag src + return [$types(callback:$tag) jsondumpraw $huddle_object $offset $newline $nextoff] + } + } + } +} + +#typed as per toml-test types +namespace eval tomlish::typedhuddle { + proc from_json {json} { + set plainhuddle [tomlish::huddle::from_json $json] + + error "tomlish::typedhuddle::from_json unimplemented" + } + proc from_dict {d} { + package require huddle + set h [huddle create] + if {[tomlish::dict::is_typeval $d]} { + set dtype [dict get $d type] + switch -- $dtype { + ARRAY { + #error "typedhuddle::from_dict ARRAY not yet handled" + set h_list [huddle list] + set elements [dict get $d value] + foreach el $elements { + set sub [from_dict $el] + huddle append h_list $sub + } + return $h_list + } + default { + set tinfo [tomlish::dict::convert_typeval_to_tomltest $d] + #basic non-container types + set h_tdict [huddle create] + huddle set h_tdict type [huddle string [dict get $tinfo type]] + huddle set h_tdict value [huddle string [dict get $tinfo value]] + return $h_tdict + } + } + } else { + dict for {dictkey dictval} $d { + set jsonkey [tomlish::utils::rawstring_to_jsonstring $dictkey] + if {[tomlish::dict::is_typeval $dictval]} { + set dtype [dict get $dictval type] + switch -- $dtype { + ARRAY { + #error "typedhuddle::from_dict ARRAY not yet handled" + set h_next [huddle list] + set elements [dict get $dictval value] + foreach el $elements { + set sub [from_dict $el] + huddle append h_next $sub + } + } + default { + set tinfo [tomlish::dict::convert_typeval_to_tomltest $dictval] + set tp [dict get $tinfo type] + #basic non-container types + set h_next [huddle create] ;#dict + huddle set h_next type [huddle string [dict get $tinfo type]] + huddle set h_next value [huddle string [dict get $tinfo value]] + } + } + huddle set h $jsonkey $h_next + } else { + #dict + set sub [from_dict $dictval] + huddle set h $jsonkey $sub + } + } + } + return $h + } + proc is_typeval {huddled} { + set htype [huddle type $huddled] + if {$htype ne "dict"} { + return 0 + } + if {[huddle keys $huddled] ne {type value}} { + return 0 + } + set tp [huddle type $huddled type] + switch -- $tp { + string - integer - float - bool - datetime - datetime-local - date-local - time-local { + return 1 + } + } + return 0 + } + + #direction from typed json towards toml + proc convert_typeval_to_tomlish {huddled} { + set htype [huddle get_stripped $huddled type] + set hval [huddle get_stripped $huddled value] + switch -- $htype { + string { + #we need to decide here the type of string element to use in toml/tomlish + #STRING,MULTISTRING,LITERAL,MULTILITERAL + #set unesc [tomlish::utils::unescape_jsonstring $hval] ;#no need - json parser unescaped when creating the huddle + set unesc $hval + #(huddle::json::json2huddle parse $json) + #since it was unescaped any backslashes remaining represent themselves - reapply escape - REVIEW + #set hval [string map [list \\ \\\ ] $hval] + #JSJS + if {[string first \n $unesc] >= 0} { + #always use a MULTI + if {[string first ' $unesc] >=0} { + if {[string first ''' $unesc] >=0} { + set dtype MULTISTRING + } else { + set dtype MULTILITERAL + } + } else { + if {[string first \"\"\" $unesc] >=0} { + set dtype MULTILITERAL + } else { + set dtype MULTISTRING + } + } + } else { + #use multi if needed? + if {[string first '' $hval] >=0} { + if {[string first ''' $unesc] >=0} { + set dtype STRING + } else { + set dtype MULTILITERAL + } + } elseif {[string first ' $unesc] >= 0} { + set dtype STRING + } elseif {[string first \"\"\" $unesc] >= 0} { + set dtype LITERAL + } else { + #STRING or LITERAL? + set dtype STRING + } + } + + } + datetime - bool { + set dtype [string toupper $htype] + } + float { + set dtype FLOAT + if {[string is integer -strict $hval]} { + #json FLOAT specified as integer - must have dot for toml + set hval [expr {double($hval)}] + } + } + integer { + set dtype INT + } + datetime - datetime-local - date-local - time-local { + #DDDD + #set dtype DATETIME + set dtype [string toupper $htype] + } + default { + error "tomlish::typedhuddle::convert_typeval_to_tomlish unrecognised type $htype" + } + } + return [list type $dtype value $hval] + } + +} +namespace eval tomlish::toml { + proc from_binary {bindata} { + set bom "" + set b12 [tcl::string::range $bindata 0 1] + set b12test [string map [list \xEF\xBB utf8_12 \xFE\xFF bom16be \xFF\xFE utf32le_12 \x00\x00 utf32be_12] $b12] + switch -- $b12test { + bom16be { + #FEFF + set bom utf-16be + } + utf32le_12 { + #FFFE + set b34 [tcl::string::range $bindata 2 3] + if {$b34 eq "\x00\x00"} { + set bom utf-32le + } else { + set bom utf-16le + } + } + utf32be_12 { + #0000 + set b34 [tcl::string::range $bindata 2 3] + if {$b34 eq "\xFE\xFF"} { + set bom utf-32be + } + } + utf8_12 { + set b3 [tcl::string::index $bindata 2] + if {$b3 eq "\xBF"} { + set bom utf-8 + } + } + } + if {$bom eq ""} { + #no bom - assume utf8 - but we read in as binary + #if data wasn't actually utf8 we may error here depending on content - or we may just get wrongly encoded chars + set tomldata [encoding convertfrom utf-8 $bindata] + } elseif {$bom eq "utf-8"} { + #utf-8 bom read in as binary + set tomldata [encoding convertfrom utf-8 $bindata] + #bom now encoded as single unicode char \uFFEF + } else { + return -code error -errorcode {TOML ENCODING NOTUTF8} "Input not UTF8 encoded according to BOM. Indicated encoding is '$bom' - invalid for toml" + } + return $tomldata + } + proc from_tomlish {tomlish} { + return [tomlish::encode::tomlish $tomlish] + } + + #todo - rename to taggedjson + proc from_tomlish_from_dict_from_typedjson {json} { + set d [tomlish::dict::from_typedjson $json] + from_tomlish [tomlish::from_dict $d] ;#return tomlish + } + + proc tablename_split {tablename {normalize false}} { + #we can't just split on . because we have to handle quoted segments which may contain a dot. + #eg {dog."tater.man"} + if {$tablename eq ""} { + error "tablename_split. No table name segments found. empty tablename" + } + set sLen [tcl::string::length $tablename] + set segments [list] + set mode "preval" ;#5 modes: preval, quoted,litquoted, unquoted, postval + #quoted is for double-quotes, litquoted is for single-quotes (string literal) + set seg "" + for {set i 0} {$i < $sLen} {incr i} { + + if {$i > 0} { + set lastChar [tcl::string::index $tablename [expr {$i - 1}]] + } else { + set lastChar "" + } + + #todo - track\count backslashes properly + + set c [tcl::string::index $tablename $i] + if {$c eq "\""} { + if {($lastChar eq "\\")} { + #not strictly correct - we could have had an even number prior-backslash sequence + #the toml spec would have us error out immediately on bsl in bad location - but we're + #trying to parse to unvalidated tomlish + set ctest escq + } else { + set ctest dq + } + } else { + set ctest [string map [list " " sp \t tab] $c] + } + + switch -- $ctest { + . { + switch -exact -- $mode { + preval { + error "tablename_split. dot not allowed - expecting a value" + } + unquoted { + #dot marks end of segment. + if {![tomlish::utils::is_barekey $seg]} { + error "tablename_split. unquoted key segment $seg is not a valid toml key" + } + lappend segments $seg + set seg "" + set mode "preval" + } + quoted { + append seg $c + } + litquoted { + append seg $c + } + postval { + #got dot in an expected location + set mode "preval" + } + } + } + dq { + #unescaped dquote + switch -- $mode { + preval { + set mode "quoted" + set seg "\"" + } + unquoted { + #invalid in barekey - but we are after structure only + append seg $c + } + quoted { + append seg $c + #JJJJ + if {$normalize} { + lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" ;#make sure we only accept a dot or end-of-data now. + } + litquoted { + append seg $c + } + postval { + error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" + } + } + } + ' { + switch -- $mode { + preval { + append seg $c + set mode "litquoted" + } + unquoted { + #single quote inside e.g o'neill - ultimately invalid - but we pass through here. + append seg $c + } + quoted { + append seg $c + } + litquoted { + append seg $c + #no normalization to do aside from stripping squotes + if {$normalize} { + lappend segments [tcl::string::range $seg 1 end-1] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" + } + postval { + error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" + } + } + } + sp - tab { + switch -- $mode { + preval - postval { + #ignore + } + unquoted { + #terminates a barekey + lappend segments $seg + set seg "" + set mode "postval" + } + default { + #append to quoted or litquoted + append seg $c + } + } + } + default { + switch -- $mode { + preval { + set mode unquoted + append seg $c + } + postval { + error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" + } + default { + append seg $c + } + } + } + } + + if {$i == $sLen-1} { + #end of data + ::tomlish::log::debug "End of data: mode='$mode'" + switch -exact -- $mode { + preval { + if {[llength $segments]} { + error "tablename_split. Expected a value after last dot separator. tablename: '$tablename'" + } else { + error "tablename_split. Whitespace only? No table name segments found. tablename: '$tablename'" + } + } + unquoted { + if {![tomlish::utils::is_barekey $seg]} { + #e.g toml-test invalid/table/with-pound required to fail for invalid barekey + error "tablename_split. unquoted key segment $seg is not a valid toml key" + } + lappend segments $seg + } + quoted { + error "tablename_split. Expected a trailing double quote. tablename: '$tablename'" + } + litquoted { + error "tablename_split. Expected a trailing single quote. tablename: '$tablename'" + } + postval { + #ok - segment already lappended + } + } + } + } + + #note - we must allow 'empty' quoted strings '' & "" + # (these are 'discouraged' but valid toml keys) + + return $segments + } + + #tablenames (& tablearraynames) may contain irrelevant leading, trailing and interspersed whitespace + # tablenames can be made up of segments delimited by dots. .eg [ a.b . c ] + #trimmed, the tablename becomes {a.b.c} + # A segment may contain whitespace if it is quoted e.g [a . b . "c etc " ] + #ie whitespace is only irrelevant if it's outside a quoted segment + #trimmed, the tablename becomes {a.b."c etc "} + proc tablename_trim {tablename} { + set segments [tomlish::toml::tablename_split $tablename false] + set trimmed_segments [list] + foreach seg $segments { + lappend trimmed_segments [::string trim $seg " \t"] + } + return [join $trimmed_segments .] + } +} + +namespace eval tomlish::dict { + namespace export {[a-z]*}; # Convention: export all lowercase + namespace path [namespace parent] + + #from_taggedjson + proc from_typedjson {json} { + package require huddle + package require huddle::json + set h [huddle::json::json2huddle parse $json] + #json2huddle parse unescapes the basic json escapes \n \\ etc + #$h could now contain raw form of surrogate pair (json2huddle parse as at 2025-014 doesn't convert the surrogates - just unescapes?) + if {[catch {encoding convertto utf-8 $h} errM]} { + #This test suggests we have raw surrogate pairs - REVIEW + package require punk::cesu + set h [punk::cesu::from_surrogatestring $h] + } + tomlish::dict::from_typedhuddle $h + } + proc from_typedhuddle {h} { + set resultd [dict create] + switch -- [huddle type $h] { + dict { + foreach k [huddle keys $h] { + switch -- [huddle type $h $k] { + dict { + set huddle_d [huddle get $h $k] + #puts stderr "huddle_d: $huddle_d" + #set v [huddle get_stripped $h $k] + if {[tomlish::typedhuddle::is_typeval $huddle_d]} { + dict set resultd $k [tomlish::typedhuddle::convert_typeval_to_tomlish $huddle_d] + } else { + dict set resultd $k [from_typedhuddle $huddle_d] + } + } + list { + set items [huddle get $h $k] + + set numitems [huddle llength $items] + if {$numitems == 0} { + dict set resultd $k [list type ARRAY value {}] + } else { + set arritems [list] + for {set i 0} {$i < $numitems} {incr i} { + set item [huddle get $items $i] + #puts stderr "item: $item" + #set v [huddle get $item] + if {[tomlish::typedhuddle::is_typeval $item]} { + lappend arritems [tomlish::typedhuddle::convert_typeval_to_tomlish $item] + } else { + lappend arritems [from_typedhuddle $item] + } + } + dict set resultd $k [list type ARRAY value $arritems] + } + } + default { + error "dict_from_json unexpected subtype [huddle type $h $k] in dict" + } + } + } + } + list { + set items [huddle get $h] + set numitems [huddle llength $items] + if {$numitems == 0} { + return [list type ARRAY value {}] + } else { + set arritems [list] + for {set i 0} {$i < $numitems} {incr i} { + set item [huddle get $items $i] + #puts stderr "item: $item" + #set v [huddle get $item] + if {[tomlish::typedhuddle::is_typeval $item]} { + lappend arritems [tomlish::typedhuddle::convert_typeval_to_tomlish $item] + } else { + lappend arritems [from_typedhuddle $item] + } + } + return [list type ARRAY value $arritems] + } + + } + } + return $resultd + } + + proc is_typeval {d} { + #designed to detect {type value } e.g {type INT value 3}, {type STRING value "blah etc"} + #as a sanity check we need to avoid mistaking user data that happens to match same form + #consider x.y={type="spud",value="blah"} + #The value of type will itself have already been converted to {type STRING value spud} ie never a single element. + #check the length of the type as a quick way to see it's a tag - not something else masqerading. + expr {[string is dict $d] && [dict size $d] == 2 && [dict exists $d type] && [dict exists $d value] && [llength [dict get $d type]] == 1} + } + + #simple types only - not containers? + proc convert_typeval_to_tomltest {d} { + set dtype [dict get $d type] + set dval [dict get $d value] + switch -- $dtype { + INT { + set testtype integer + set dval [expr {$dval}] ;#convert e.g 0xDEADBEEF to 3735928559 + } + DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - FLOAT - BOOL { + #DDDD + set testtype [string tolower $dtype] + } + STRING - MULTISTRING { + set testtype string + #JJJJ + set dval [tomlish::utils::unescape_string $dval] + set dval [tomlish::utils::rawstring_to_jsonstring $dval] + } + LITERAL - MULTILITERAL { + set testtype string + #don't validate on way out to json here? + #decoder should validate by calling tomlish::from_dict + #if {![tomlish::utils::rawstring_is_valid_literal $dval]} { + # return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + #} + set dval [tomlish::utils::rawstring_to_jsonstring $dval] + } + MULTILITERAL { + #todo - escape newlines for json? + set testtype string + } + default { + error "convert_typeval_to_tomltest unhandled type $dtype" + } + } + return [list type $testtype value $dval] + } + + # Check that each leaf is a typeval or typeval dict + #importantly: must accept empty dict leaves e.g {x {}} + proc is_typeval_dict {d {checkarrays 0}} { + if {![string is dict $d]} { + return 0 + } + dict for {k v} $d { + set is_d 0 + if {!([is_typeval $v] || [set is_d [is_typeval_dict $v $checkarrays]])} { + return 0 + } + if {!$is_d} { + set vtype [dict get $v type] + switch -- $vtype { + INT - FLOAT - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - BOOL - LITERAL - STRING - MULTILITERAL - MULTISTRING {} + ARRAY { + if {$checkarrays} { + set arrdata [dict get $v value] + foreach el $arrdata { + if {![is_typeval_dict $el $checkarrays]} { + return 0 + } + } + } + } + default { + puts stderr "is_typeval_dict: Unexpected type '$vtype'" + return 0 + } + } + } + } + return 1 + } + + proc last_tomltype_posn {d} { + set last_simple -1 + set dictposn [expr {[dict size $d] -1}] + foreach k [lreverse [dict keys $d]] { + set dval [dict get $d $k] + if {[is_typeval $dval]} { + set last_simple $dictposn + break + } + incr dictposn -1 + } + return $last_simple + } + + + #review + proc name_from_tablestack {tablestack} { + set name "" + foreach tinfo [lrange $tablestack 1 end] { + lassign $tinfo type namepart + switch -- $type { + T { + if {$name eq ""} { + append name $namepart + } else { + append name .$namepart + } + } + I { + if {$name eq ""} { + append name $namepart + } else { + append name .$namepart + } + } + default { + #end at first break in the leading sequence of T & I tablenames + break + } + } + } + return $name + } + + + #tablenames_info is a flat dict with the key being an '@@' path + proc _show_tablenames {tablenames_info} { + #e.g {@l@a @@b} {ttype header_table tdefined closed} + append msg \n "tablenames_info:" \n + dict for {tkey tinfo} $tablenames_info { + append msg " " "table: $tkey" \n + dict for {field finfo} $tinfo { + append msg " " "$field $finfo" \n + } + } + return $msg + } + + #take a raw string and classify: result is a 2 element list comprised of KEY|SQKEY|DQKEY and the value being the appropriate inner string + proc classify_rawkey {rawval} { + if {![::tomlish::utils::is_barekey $rawval]} { + #requires quoting + # + #Any dot in the key would have been split by dict::from_tomlish - so if it's present here it's part of this key - not a level separator! + # + #we'll use a basic mechanisms for now to determine the type of quoting + # - whether it has any single quotes or not. + # (can't go in an SQKEY) + # - whether it has any chars that require quoting when in a Bstring + # (if so - then its visual representation might be unsuitable for a key in a toml text file, so escape and put in DQKEY instead of literal SQKEY) + #todo - more? + #REVIEW - the backslash might often be in things like a regex or windows path - which is often better expressed in a literal SQKEY + # from literal examples: + # 'c:\Users\nodejs\templates' + # '<\i\c*\s*>' + #If these are in *keys* our basic test will express these as: + # "c:\\Users\\nodejs\\templates" + # "<\\i\\c*\\s*>" + # This still works - but a smarter test might determine when SQKEY is the better form? + #when coming from external systems - can we even know if the value was already escaped? REVIEW + #Probably when coming from json - we know it's already escaped - and so we build our dict converting keys to unescaped + #TODO - clarify in documentation that keys resulting from dict::from_tomlish are in 'normalized' (unescaped) form + # + #For keys - we currently (2025) are only allowed barekeys,basic strings and literal strings. (no multiline forms) + set k_escaped [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $rawval] + if {[string length $k_escaped] != [string length $rawval]} { + #escaping made a difference + set has_escape_requirement 1 + } else { + set has_escape_requirement 0 + } + if {[string first ' $rawval] >=0 || $has_escape_requirement} { + #basic string + # (any ANSI SGR sequence will end up here in escaped form ) + return [list DQKEY $k_escaped] + } else { + #literal string + return [list SQKEY $rawval] + } + } else { + return [list KEY $rawval] + } + } + #the quoting implies the necessary escaping for DQKEYs + proc join_and_quote_rawkey_list {rawkeylist} { + set result "" + foreach rk $rawkeylist { + lassign [tomlish::dict::classify_rawkey $rk] type val + switch -- $type { + SQKEY { + append result "'$val'." + } + DQKEY { + append result "\"$val\"." + } + KEY { + append result "$val." + } + } + } + return [string range $result 0 end-1] + } + + proc _process_tomlish_dottedkey {element {context_refpath {}}} { + upvar tablenames_info tablenames_info + upvar datastructure datastructure + set dottedtables_defined [list] + set dkey_info [tomlish::get_dottedkey_info $element] + #e.g1 keys {x.y y} keys_raw {'x.y' "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) + #e.g2 keys {x.y y} keys_raw {{"x.y"} "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) + + #[a.b] + #t1.t2.dottedtable.leafkey = "val" + #we have already checked supertables a & {a b} + # - in basic case, passed in context_refpath as {@@a @@b} + # - our context_refpath could also include some combination of keys and array indices e.g {@@a @@b 3 @@subtablekey} + #We need to check {a b t1} & {a b t2} ('creation' only) + #and then 'dottedtable' is 'defined' while leafkey is an ordinary key in dottedtable + + #note we also get here as a 'dottedkey' with the following even though there is no dot in k + #[a.b] + #leafkey = "val" + + set all_dotted_keys [dict get $dkey_info keys] + set dottedkeyname [join $all_dotted_keys .] + + if {[llength $all_dotted_keys] > 1} { + #dottedtable.k=1 + #tX.dottedtable.k=1 + #etc + + #Wrap in a list so we can detect 'null' equivalent. + #We can't use empty string as that's a valid dotted key segment + set dottedtable_bag [list [lindex $all_dotted_keys end-1]] + set dotparents [lrange $all_dotted_keys 0 end-2] + } else { + #basic case - not really a 'dotted' key + #k = 1 + set dottedtable_bag [list] ;#empty bag + set dotparents [list] + } + #assert dottedtable_bag only ever holds 0 or 1 elements + set leaf_key [lindex $all_dotted_keys end] + + #see also: https://github.com/toml-lang/toml/issues/846 "Can dotted keys insert into already-defined [tables]?" + #This code was originally written with a misinterpretation of: + #"Dotted keys create and define a table for each key part before the last one, provided that such tables were not previously created." + # 'each key part before the last one' refers to each key in a single dotted key entry + # not each 2nd-to last key in a list of dotted keys. + + + #we've already tested the table/tablearray keys that got us here.. but not the dottedkey segments (if any) prior to dottedtable & leaf_key + set dottedsuper_refpath $context_refpath + foreach normkey $dotparents { + lappend dottedsuper_refpath @@$normkey + if {![dict exists $tablenames_info $dottedsuper_refpath ttype]} { + #supertable with this combined path (context_path plus parts of dottedkey) not yet 'created' + if {[tomlish::dict::path::exists $datastructure $dottedsuper_refpath]} { + #There is data so it must have been created as a keyval + set msg "Path $dottedsuper_refpath for dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #dict set tablenames_info $dottedsuper_refpath ttype unknown_table ;#REVIEW + dict set tablenames_info $dottedsuper_refpath ttype unknown_dotted ;#REVIEW + + #see note above re dotted keys insert into already defined table - we need to 'define' all the dotted supers in this block + lappend dottedtables_defined $dottedsuper_refpath + + #ensure empty tables are still represented in the datastructure + tomlish::dict::path::set_endpoint datastructure $dottedsuper_refpath {} ;#set to empty subdict + } else { + #added for fixed assumption + set ttype [dict get $tablenames_info $dottedsuper_refpath ttype] + set definedstate [dictn getdef $tablenames_info [list $dottedsuper_refpath tdefined] NULL] + switch -- $ttype { + dottedkey_table - unknown_dotted { + #'created' as dotted - but make sure it's from this header section - i.e defined not set + if {$definedstate ne "NULL"} { + #collision with some other dottedkey + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + itable { + #itables are immediately defined + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' as itable (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + default { + #header_table, header_tablearray or unknown_header + #is header_tablearray any different from header_table in this context? + #we don't set tdefined for tablearray anyway - so should be ok here. + if {$definedstate ne "NULL"} { + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' in a header (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + } + } + } + + #dottedtable being 2nd last segment was for original assumption - todo - tidy up? we are duplicating the logic above + #review - any need/advantage to treat 2nd to last key any different from other supers? ie D in a.b.c.D.key=1 + #no need for 'unknown_dotted' vs 'dottedkey_table' ?? + if {[llength $dottedtable_bag] == 1} { + set dottedtable [lindex $dottedtable_bag 0] + set dottedkey_refpath [list {*}$dottedsuper_refpath "@@$dottedtable"] + #our dotted key is attempting to define a table + if {![dict exists $tablenames_info $dottedkey_refpath ttype]} { + #first one - but check datastructure for collisions + if {[tomlish::dict::path::exists $datastructure $dottedkey_refpath]} { + set msg "Path $dottedkey_refpath for dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #'create' the table + dict set tablenames_info $dottedkey_refpath ttype dottedkey_table + #don't actually set 'defined' here.. use the end of TABLE record to close them off by looking at this list + tomlish::dict::path::set_endpoint datastructure $dottedkey_refpath {} + lappend dottedtables_defined $dottedkey_refpath + + # + } else { + #exists - but might be from another dottedkey within the current header section + #the table is open for adding keys until the next 'header' section ([tablename] / [[tablearray]]) + #check for 'defined' closed (or just existence) + set ttype [dict get $tablenames_info $dottedkey_refpath ttype] + set definedstate [dictn getdef $tablenames_info [list $dottedkey_refpath tdefined] NULL] + switch -- $ttype { + dottedkey_table - unknown_dotted { + #'created' as dotted - but make sure it's from this header section - i.e defined not set + if {$definedstate ne "NULL"} { + #collision with some other dottedkey + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + itable { + #itables are immediately defined + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' as itable (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + default { + #header_table, header_tablearray or unknown_header + #is header_tablearray any different from header_table in this context? + #we don't set tdefined for tablearray anyway - so should be ok here. + if {$definedstate ne "NULL"} { + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' in a header (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + } + } + } else { + set dottedkey_refpath $dottedsuper_refpath + } + #assert - dottedkey represents a key val pair that can be added + + + set fullkey_refpath [list {*}$dottedkey_refpath @@$leaf_key] + if {[tomlish::dict::path::exists $datastructure $fullkey_refpath]} { + set msg "Duplicate key. The key (path $fullkey_refpath) already exists at this level in the toml data. The toml data is not valid." + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + + #set keyval_dict [_get_keyval_value $element] + lassign [_get_keyval_value $element] _ keyval_dict _ sub_tablenames_info + + + #keyval_dict is either a {type value } + #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level + #punk::dict::is_typeval can distinguish + tomlish::log::debug "_process_tomlish_dottedkey>>> context:$context_refpath dottedkey $dottedkeyname kv: $keyval_dict" + tomlish::dict::path::set_endpoint datastructure $fullkey_refpath $keyval_dict + + #remove ? + #if {![tomlish::dict::is_typeval $keyval_dict]} { + # #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys + # # inner structure will contain {type value } if all leaves are not empty ITABLES + # ##set tkey [list {*}$norm_segments {*}$all_dotted_keys] + + # #by not creating a tablenames_info record - we effectively make it closed anyway? + # #it should be detected as a key + # #is there any need to store tablenames_info for it?? + # #REVIEW + + # ##TODO - update? + # #dictn incr tablenames_info [list $tkey seencount] + # ##if the keyval_dict is not a simple type x value y - then it's an inline table ? + # ##if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. + # #dictn set tablenames_info [list $tkey closed] 1 + #} + return [dict create dottedtables_defined $dottedtables_defined] + } + #tomlish::dict::from_tomlish is a *basic* programmatic datastructure for accessing the data. + # produce a dictionary of keys and values from a tomlish tagged list. + # dict::from_tomlish is primarily for read access to toml data. + #Extraneous (not within quoted sections) whitespace and comments are not preserved in this structure, + # so a roundtrip from toml to this datastructure and back to toml will lose whitespace formatting and comments. + # creating/changing toml values can be done directly on a tomlish list if preserving (or adding) formatting/comments is desired. + #A separate package 'tomlish::object' may be needed to allow easier programmatic creating/updating/deleting of data elements whilst preserving (or adding or selectively deleting/editing) such formatting. + # + + #within an ARRAY, we store a list of items such as plain dicts (possibly empty) and {type value } for simple types + #(ARRAYS can be mixed type) + #This means our dict structure should have only ARRAY and simple types which need to be in {type value } form + #A dict within an array encodeded as a type ITABLE value should also parse - but is the unpreferred form - REVIEW test? + + #Namespacing? + #ie note the difference: + #[Data] + #temp = { cpu = 79.5, case = 72.0} + # versus + #[Data] + #temps = [{cpu = 79.5, case = 72.0}] + proc from_tomlish {tomlish} { + package require dictn + + #keep track of which tablenames have already been directly defined, + # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' + #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. + #we don't error out just because a previous tablename segment has already appeared. + + #Declaring, Creating, and Defining Tables + #https://github.com/toml-lang/toml/issues/795 + #(update - only Creating and Defining are relevant terminology) + + #review + #tablenames_info keys ttype created, tdefined, createdby, definedby, closedby ??? review keys + # [tname] = header_table [[tname]] = header_tablearray + + #consider the following 2 which are legal: + #[table] #'table' created, defined=open type header_table + #x.y = 3 + #[table.x.z] #'table' tdefined=closed closedby={header_table table.x.z}, 'table.x' created, 'table.x.z' created tdefined=open tdefinedby={header_table table.x.z} + #k= 22 + # #'table.x.z' tdefined=closed closedby={eof eof} + + #equivalent datastructure + + #[table] #'table' created, tdefined=open definedby={header_table table} + #[table.x] #'table' tdefined=closed closedby={header_table table.x}, 'table.x' created tdefined=open definedby={header_table table.x} + #y = 3 + #[table.x.z] #'table.x' tdefined=closed closedby={header_table table.x.z}, 'table.x.z' created tdefined=open definedby={header_table table.x.z} + #k=22 + + #illegal + #[table] #'table' created and tdefined=open + #x.y = 3 #'table.x' created first keyval pair tdefined=open definedby={keyval x.y = 3} + #[table.x.y.z] #'table' tdefined=closed, 'table.x' closed because parent 'table' closed?, 'table.x.y' cannot be created + #k = 22 + # + ## - we would fail on encountering table.x.y because only table and table.x are effectively tables - but that table.x is closed should be detected (?) + + #illegal + #[table] + #x.y = {p=3} + #[table.x.y.z] + #k = 22 + ## we should fail because y is an inline table which is closed to further entries + + #note: it is not safe to compare normalized tablenames using join! + # e.g a.'b.c'.d is not the same as a.b.c.d + # instead compare {a b.c d} with {a b c d} + # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. + #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' + #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} + + + + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info + } else { + set tablenames_info [dict create] ;#keyed on tablepath each of which is an @@path such as {@@config @@subgroup @@etc} (corresponding to config.subgroup.etc) + #also has non @@ indexes which are list indexes as taken by tcl list commands (int or end-1 etc) + #value is a dict with keys: ttype, tdefined + } + + + log::info "---> dict::from_tomlish processing '$tomlish'<<<" + set items $tomlish + + foreach lst $items { + if {[lindex $lst 0] ni $::tomlish::tags} { + error "supplied list does not appear to be toml parsed into a tomlish tagged list. Run tomlish::decode::toml on the raw toml data to produce a tomlish list" + } + } + + if {[lindex $tomlish 0] eq "TOMLISH"} { + #ignore TOMLISH tag at beginning + set items [lrange $tomlish 1 end] + } + + set datastructure [dict create] + set dottedtables_defined [list] + foreach item $items { + set tag [lindex $item 0] + #puts "...> item:'$item' tag:'$tag'" + switch -exact -- $tag { + KEY - DQKEY - SQKEY - INT - FLOAT - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - STRING - LITERAL { + #why would we get individual key item as opposed to DOTTEDKEY? + error "tomlish::dict::from_tomlish error: invalid tag: $tag. At the toplevel, from_tomlish can only process WS NEWLINE COMMENT and compound elements DOTTEDKEY TABLE TABLEARRAY ITABLE MULTILITERAL MULTISTRING" + } + DOTTEDKEY { + #toplevel dotted key + set dkinfo [_process_tomlish_dottedkey $item] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] + #at any level - we don't expect any more DOTTEDKEY records in a tomlish structure after TABLE or TABLEARRAY are encountered + #as those records should encapsulate their own dottedkeys + + } + TABLEARRAY { + #close off any dottedtables_defined created by dottedkeys at this level + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + set dottedtables_defined [list] ;#for closing off at end by setting 'defined' + + set tablearrayname [lindex $item 1] + tomlish::log::debug "---> tomlish::dict::from_tomlish processing item TABLENAME (name: $tablearrayname): $item" + set norm_segments [::tomlish::toml::tablename_split $tablearrayname true] ;#true to normalize + #we expect repeated tablearray entries - each adding a sub-object to the value, which is an array/list. + #tablearrayname is likely to appear multiple times - so unlike a TABLE we don't check for 'defined' for the full name as an indicator of a problem + set supertable [list] + ############## + # [[a.b.c.d]] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + set refpath [list] ;#e.g @@j1 @@j2 1 @@k1 end + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + lappend refpath @@$normseg + if {![dict exists $tablenames_info $refpath ttype]} { + #supertable with this path doesn't yet exist + if {[tomlish::dict::path::exists $datastructure $refpath]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] of tablearray name $tablearrayname already has data but doesn't appear to be a table - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablearray_supertable_keycollision + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } else { + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #review - we can't later specify as tablearray so should just set ttype to header_table even though it's being created + # because of a tablearray header? + #By setting ttype to something other than table_header we can provide more precise errorCode/msg ?? + dict set tablenames_info $refpath ttype unknown_header + #ensure empty tables are still represented in the datastructure + dict set datastructure {*}$supertable [list] + } + } else { + #REVIEW!! + # what happens with from_toml {[[a.b.c]]} {[[a.b]]} ??? + #presumed that a and a.b were 'created' as tables (supertables of tablearray at a.b.c) and can't now be redefined as tablearrays + + #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable + #but if it's a tablearray - we need to point to the most 'recently defined table element of the array' + #(last member of that array - need to check type? allowed to have non-table elements ie nonhomogenous??) + set supertype [dict get $tablenames_info $refpath ttype] + if {$supertype eq "header_tablearray"} { + #exercised by toml-tests: + # valid/table/array-table-array + # valid/table/array-nest + + #puts stdout "todict!!! TABLEARRAY nesting required for supertable [join $supertable .]" + + #'refer' to the appropriate element in existing array + set arrdata [tomlish::dict::path::get $datastructure [list {*}$refpath @@value]] + set idx [expr {[llength $arrdata]-1}] + if {$idx < 0} { + #existing tablearray should have at least one entry even if empty (review) + set msg "reference to empty tablearray?" + return -code error -errorcode {TOMLISH STRUCTURE REFTOEMPTYTABLEARRAY} $msg + } + lappend refpath $idx + } + } + } + # + #puts "TABLE supertable refpath $refpath" + lappend refpath @@[lindex $norm_segments end] + tomlish::log::debug "TABLEARRAY refpath $refpath" + set tablearray_refpath $refpath + + + if {![dict exists $tablenames_info $tablearray_refpath ttype]} { + #first encounter of this tablearrayname + if {[tomlish::dict::path::exists $datastructure $tablearray_refpath]} { + #e.g from_toml {a=1} {[[a]]} + set msg "Cannot create tablearray name $tablearrayname. Key already has data but key doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablearray_direct_keycollision_error + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #no collision - we can create the tablearray and the array in the datastructure + dict set tablenames_info $tablearray_refpath ttype header_tablearray + #dict set datastructure {*}$norm_segments [list type ARRAY value {}] + #create array along with empty array-item at position zero + tomlish::dict::path::set_endpoint datastructure $tablearray_refpath [list type ARRAY value {{}}] + set arrayitem_refpath [list {*}$tablearray_refpath 0] + #set ARRAY_ELEMENTS [list] + } else { + #we have an existing tablenames_info record for this path - but is it a tablearray? + set ttype [dict get $tablenames_info $tablearray_refpath ttype] + if {$ttype ne "header_tablearray"} { + #header_table or itable + switch -- $ttype { + itable {set ttypename itable} + header_table {set ttypename table} + dottedkey_table {set ttypename dottedkey_table} + unknown_header - unknown_dotted { + #table was created e.g as supertable - but not specifically a tablearray + #violates ordering - return specific test error + set msg "Table $tablearrayname referenced as supertable before tablearray defined (ordering)" + return -code error -errorcode {TOMLISH STRUCTURE TABLEARRAYORDERING} $msg + } + default {error "unrecognised type $ttype - expected header_table or itable"} + } + set msg "tablearray name $tablearrayname already appears to be already created as '$ttypename' not tablearray - invalid?" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #EXISTING tablearray + #add to array + #error "add_to_array not implemented" + #{type ARRAY value } + #set ARRAY_ELEMENTS [dict get $datastructure {*}$norm_segments value] + tomlish::log::debug ">>>>pre-extend-array dict::from_tomlish datastructure: $datastructure" + set existing_array [tomlish::dict::path::get $datastructure [list {*}$tablearray_refpath @@value]] + set arrayitem_refpath [list {*}$tablearray_refpath [llength $existing_array]] + tomlish::dict::path::lappend datastructure $tablearray_refpath {} + tomlish::log::debug ">>>>post-extend-array dict::from_tomlish datastructure: $datastructure" + } + + + #set object [dict create] ;#array context equivalent of 'datastructure' + + #add to ARRAY_ELEMENTS and write back in to datastructure. + foreach element [lrange $item 2 end] { + set type [lindex $element 0] + tomlish::log::debug "----> todict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element $arrayitem_refpath] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] + } + NEWLINE - COMMENT - WS { + #ignore + } + TABLE { + #we *perhaps* should be able to process tablearray subtables either as part of the tablearray record, or independently. + #(or even a mixture of both, although that is somewhat an edge case, and of limited utility) + #[[fruit]] + #x=1 + # [fruit.metadata] + # [fruit.otherdata] + + #when processing a dict destined for the above - the tomlish generator (e.g from_dict) + #should create as 1 or 3 records (but could create 2 records if there was an unrelated table in between the subtables) + #choices: all in tablearray record, tablearray + 1 or 2 table records. + # + #We are going the other way here - so we just need to realise that the list of tables 'belonging' to this tablearray might not be complete. + # + #the subtable names must be prefixed with the tablearray - we should validate that for any contained TABLE records + + #The default mechanism is for from_dict to produce tomlish with separate TABLE records - and use the ordering to determine membership + #If we were to support wrapping the TABLE records within a TABLEARRAY - we should also support TABLEARRAY within TABLEARRAY + # ----------------------------------------------------------------------- + #Implementing this is not critical for standard encoding/decoding of toml! + #It would be an alternative form for the tomlish intermediate form - and adds complexity. + # + #The upside would be to provide a function for sorting/rearranging in the tomlish form if all records were fully encapsulated. + #A possible downside is that unrelated tables placed before a tablearray is fully defined (within the tablearray definition area in toml) + # would have to be re-positioned before or after the encapsulated tablearray record. + # While unrelated tables in such a position aren't a recommended way to write toml, they appear to be valid + # and preserving the author's ordering is a goal of the basic encoding/decoding operations if no explicit sorting/reordering was requested. + # + #Consider an 'encapsulate' method to this (tomlish -> tomlish) + # ----------------------------------------------------------------------- + #todo + error "tomlish::dict::from_tomlish TABLE element within TABLEARRAY not handled - TABLE should be a separate tomlish record" + } + default { + error "tomlish::dict::from_tomlish Sub element of type '$type' not understood in tablearray context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" + } + } + } + + #end of TABLEARRAY record - equivalent of EOF or next header - close off the dottedtables + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + } + TABLE { + #close off any dottedtables_defined created by dottedkeys at this level + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + set tablename [lindex $item 1] + set dottedtables_defined [list] ;#for closing off at end by setting 'defined' + #As our TABLE record contains all it's child DOTTEDKEY records - this should be equivalent to setting them as defined at EOF or next header. + + #----------------------------------------------------------------------------------- + #default assumption - our reference is to the main tablenames_info and datastructure + #Will need to append keys appropriately if we have recursed + #----------------------------------------------------------------------------------- + + log::debug "---> tomlish::dict::from_tomlish processing item TABLE (name: $tablename): $item" + set norm_segments [::tomlish::toml::tablename_split $tablename true] ;#true to normalize + + + + set name_segments [::tomlish::toml::tablename_split $tablename 0] ;#unnormalized e.g ['a'."b".c.d] -> 'a' "b" c d + #results of tablename_split 0 are 'raw' - ie some segments may be enclosed in single or double quotes. + + + set supertable [list] + ############## + # [a.b.c.d] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + + ############## + #[[a]] + #[a.b] #supertable a is tablearray + ############## + + #also consider + ############## + # [[a.b]] + # [a.b.c.d] #supertable a is a table, supertable a.b is tablearray, supertable a.b.c is elementtable + ############## + set refpath [list] ;#e.g @@j1 @@j2 1 @@k1 end + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + lappend refpath @@$normseg + if {![dict exists $tablenames_info $refpath ttype]} { + #supertable with this path doesn't yet exist + if {[tomlish::dict::path::exists $datastructure $refpath]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] of table name $tablename (path $refpath) already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #we also don't know whether it's a table or a dottedkey_table (not allowed to be tablearray - out of order?) + dict set tablenames_info $refpath ttype unknown_header + #ensure empty tables are still represented in the datastructure + #dict set datastructure {*}$supertable [list] + tomlish::dict::path::set_endpoint datastructure $refpath {} + } else { + #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable + if {[dict get $tablenames_info $refpath ttype] eq "header_tablearray"} { + #'refer' to the appropriate element in existing array + set arrdata [tomlish::dict::path::get $datastructure [list {*}$refpath @@value]] + set idx [expr {[llength $arrdata]-1}] + if {$idx < 0} { + #existing tablearray should have at least one entry even if empty (review) + set msg "reference to empty tablearray?" + return -code error -errorcode {TOMLISH STRUCTURE REFTOEMPTYTABLEARRAY} $msg + } + lappend refpath $idx + } else { + #?? + if {[dictn getdef $tablenames_info [list $refpath tdefined] NULL] eq "NULL"} { + } else { + } + } + } + } + #puts "TABLE supertable refpath $refpath" + lappend refpath @@[lindex $norm_segments end] + tomlish::log::info "TABLE refpath $refpath" + set table_refpath $refpath + + + + + #table [a.b.c.d] hasn't been defined - but may have been 'created' already by a longer tablename + # - or may have existing data from a keyval + if {![dict exists $tablenames_info $table_refpath ttype]} { + if {[tomlish::dict::path::exists $datastructure $table_refpath]} { + #e.g from_toml {a=1} {[a]} + set msg "Cannot create table name $tablename. Key already has data but key doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablename_keyval_collision_error + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #no data or previously created table + dict set tablenames_info $table_refpath ttype header_table + + #We are 'defining' this table's keys and values here (even if empty) + #dict set datastructure {*}$norm_segments [list] ;#ensure table still represented in datastructure even if we add no keyvals here + tomlish::dict::path::set_endpoint datastructure $table_refpath {} ;#ensure table still represented in datastructure even if we add no keyvals here + } else { + if {[dict get $tablenames_info $table_refpath ttype] eq "header_tablearray"} { + #e.g tomltest invalid/table/duplicate-table-array2 + #[[tbl]] + #[tbl] + set msg "Table name $tablename has already been created as a tablearray. Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } else { + #any other type tdefined is a problem + set T_DEFINED [dictn getdef $tablenames_info [list $table_refpath tdefined] NULL] + if {$T_DEFINED ne "NULL" } { + #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path + set msg "Table name $tablename has already been defined in the toml data. Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + } + dict set tablenames_info $table_refpath tdefined open + + #now add the contained elements + foreach element [lrange $item 2 end] { + set type [lindex $element 0] + log::debug "----> todict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element $table_refpath] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] + } + NEWLINE - COMMENT - WS { + #ignore + } + default { + error "Sub element of type '$type' not understood in table context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" + } + } + } + + #end of TABLE record - equivalent of EOF or next header - close off the dottedtables + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + } + ITABLE { + #As there is no other mechanism to create tables within an ITABLE than dottedkeys + # and ITABLES are fully defined/enclosed - we can rely on key collision and don't need to track dottedtables_defined - REVIEW. + set dottedtables_defined [list] + #SEP??? + #ITABLE only ever on RHS of = or inside ARRAY + set datastructure [dict create] + set tablenames_info [dict create] + + foreach element [lrange $item 1 end] { + set type [lindex $element 0] + log::debug "----> tododict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element] + } + NEWLINE - COMMENT - WS { + #ignore + } + default { + error "Sub element of type '$type' not understood in ITABLE context. Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" + } + } + } + } + ARRAY { + #arrays in toml are allowed to contain mixtures of types + set datastructure [list] + log::debug "--> processing array: $item" + + foreach element [lrange $item 1 end] { + set type [lindex $element 0] + log::debug "----> tododict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + set value [lindex $element 1] + lappend datastructure [list type $type value $value] + } + STRING { + #JJJJ + #don't unescape string! + set value [lindex $element 1] + #lappend datastructure [list type $type value [::tomlish::utils::unescape_string $value]] + lappend datastructure [list type $type value $value] + } + LITERAL { + set value [lindex $element 1] + lappend datastructure [list type $type value $value] + } + ITABLE { + #anonymous table + #lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] + lappend datastructure [::tomlish::dict::from_tomlish [list $element]] ;#store itables within arrays as raw dicts (possibly empty) + } + TABLE - TABLEARRAY { + #invalid? shouldn't be output from from_dict - but could manually be constructed as such? review + #doesn't make sense as table needs a name? + #take as synonym for ITABLE? + error "tomlish::dict::from_tomlish $type within array unexpected" + } + ARRAY - MULTISTRING - MULTILITERAL { + #set value [lindex $element 1] + lappend datastructure [list type $type value [::tomlish::dict::from_tomlish [list $element]]] + } + WS - SEP - NEWLINE - COMMENT { + #ignore whitespace, commas, newlines and comments + } + default { + error "tomlish::dict::from_tomlish Unexpected value type '$type' found in array" + } + } + } + } + MULTILITERAL { + #triple squoted string + #first newline stripped only if it is the very first element + #(ie *immediately* following the opening delims) + #All whitespace other than newlines is within LITERALPARTS + # ------------------------------------------------------------------------- + #todo - consider extension to toml to allow indent-aware multiline literals + # how - propose as issue in toml github? Use different delim? e.g ^^^ ? + #e.g + # xxx=?'''abc + # def + # etc + # ''' + # - we would like to trimleft each line to the column following the opening delim + # ------------------------------------------------------------------------- + + log::debug "---> todict processing multiliteral: $item" + set parts [lrange $item 1 end] + if {[lindex $parts 0 0] eq "NEWLINE"} { + set parts [lrange $parts 1 end] ;#skip it + } + for {set idx 0} {$idx < [llength $parts]} {incr idx} { + set element [lindex $parts $idx] + set type [lindex $element 0] + switch -exact -- $type { + LITERALPART { + append stringvalue [lindex $element 1] + } + NEWLINE { + set val [lindex $element 1] + if {$val eq "lf"} { + append stringvalue \n + } else { + append stringvalue \r\n + } + } + default { + error "Unexpected value type '$type' found in multistring" + } + } + } + set datastructure $stringvalue + } + MULTISTRING { + #triple dquoted string + log::debug "---> tomlish::dict::from_tomlish processing multistring: $item" + set stringvalue "" + set idx 0 + set parts [lrange $item 1 end] + for {set idx 0} {$idx < [llength $parts]} {incr idx} { + set element [lindex $parts $idx] + set type [lindex $element 0] + #We use STRINGPART in the tomlish representation as a distinct element to STRING - which would imply wrapping quotes to be reinserted + switch -exact -- $type { + STRING { + #todo - do away with STRING ? + #we don't build MULTISTRINGS containing STRING - but should we accept it? + tomlish::log::warn "double quoting a STRING found in MULTISTRING - should be STRINGPART?" + #append stringvalue "\"[::tomlish::utils::unescape_string [lindex $element 1]]\"" + append stringvalue "\"[lindex $element 1]\"" + } + STRINGPART { + #JJJ + #don't unescape string + #append stringvalue [::tomlish::utils::unescape_string [lindex $element 1]] + append stringvalue [lindex $element 1] + } + CONT { + #When the last non-whitespace character on a line is an unescaped backslash, + #it will be trimmed along with all whitespace (including newlines) up to the next non-whitespace character or closing delimiter + # review - we allow some whitespace in stringpart elements - can a stringpart ever be all whitespace? + set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] + if {$next_nl == -1} { + #last (or first and only) line + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + #set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] + #if {$non_ws >= 0} { + # #append stringvalue "\\" + # return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + #} else { + # #skip over ws without emitting + # set idx [llength $parts] + #} + } else { + set parts_til_nl [lrange $parts 0 $next_nl-1] + set non_ws [lsearch -index 0 -start $idx+1 -not $parts_til_nl WS] + if {$non_ws >= 0} { + #This CONT is invalid. If there had been a non-whitespace char directly following it, + #it wouldn't have come through as a CONT token + #Now that we see it isn't the last non-whitespace backslash on the line we can reject + # as an invalid escape of space or tab + #append stringvalue "\\" + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + } else { + #skip over ws on this line + set idx $next_nl + #then have to check each subsequent line until we get to first non-whitespace + set trimming 1 + while {$trimming && $idx < [llength $parts]} { + set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] + if {$next_nl == -1} { + #last line + set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] + if {$non_ws >= 0} { + set idx [expr {$non_ws -1}] + } else { + set idx [llength $parts] + } + set trimming 0 + } else { + set non_ws [lsearch -index 0 -start $idx+1 -not [lrange $parts 0 $next_nl-1] WS] + if {$non_ws >= 0} { + set idx [expr {$non_ws -1}] + set trimming 0 + } else { + set idx $next_nl + #keep trimming + } + } + } + } + } + } + NEWLINE { + #if newline is first element - it is not part of the data of a multistring + if {$idx > 0} { + set val [lindex $element 1] + if {$val eq "lf"} { + append stringvalue \n + } else { + append stringvalue \r\n + } + } + } + WS { + append stringvalue [lindex $element 1] + } + default { + error "Unexpected value type '$type' found in multistring" + } + } + } + set datastructure $stringvalue + } + WS - COMMENT - NEWLINE { + #ignore + } + BOM { + #this token is the unicode single char \uFFEF + #It doesn't tell us what encoding was originally used (though toml should only accept UTF-8 files) + #ignore at start - what about in other positions? + } + default { + error "Unexpected tag '$tag' in Tomlish list '$tomlish'" + } + } + } + return $datastructure + } +} +namespace eval tomlish::dict::path { + #access tomlish dict structure + namespace export {[a-z]*}; # Convention: export all lowercase + + #access with path such as: @@k @@k 0 @@k end where dict keys marked with @@ and plain values are list indices into in {type ARRAY value } + #leaf elements returned as structured {type value } + proc get {dictval {path {}}} { + if {$path eq ""} { + return $dictval + } + ::set data $dictval + ::set pathsofar [list] + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set data [dict get $data [string range $p 2 end]] + } else { + if {![tomlish::dict::is_typeval $data]} { + error "tomlish::dict::path::get error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::get error bad path $path. Subpath $pathsofar is not an array." + } + ::set arrdata [dict get $data value] + ::set data [lindex $arrdata $p] + } + } + return $data + } + proc exists {dictval path} { + ::set data $dictval + ::set pathsofar [list] + ::set exists 1 + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] + if {![dict exists $data $k]} { + return 0 + } + ::set data [dict get $data $k] + } else { + if {![tomlish::dict::is_typeval $data]} { + return 0 + } + if {[dict get $data type] ne "ARRAY"} { + return 0 + } + ::set arrdata [dict get $data value] + ::set intp [tomlish::system::lindex_resolve_basic $arrdata $p] ;#handle index math (end-1 etc) + if {$intp == -1} { + #out of bounds + return 0 + } + ::set data [lindex $arrdata $p] + } + } + return $exists + } + + #a restricted analogy of 'dictn set' + #set 'endpoints' - don't create intermediate paths + # can replace an existing dict with another dict + # can create a key when key at tail end of path is a key (ie @@keyname, not index) + # can replace an existing {type value value } + # with added restriction that if is ARRAY the new must also be ARRAY + proc set_endpoint {dictvariable path value} { + upvar $dictvariable dict_being_edited + ::set data $dict_being_edited + ::set pathsofar [list] + if {!([tomlish::dict::is_typeval $value] || [tomlish::dict::is_typeval_dict $value 0])} { + #failed check of supplied value as basic type, or a sub-dict structure (not checking arrays) + error "tomlish::dict::path::set_endpoint error - value must already be in the tomlish form {type value } or be a dict with such forms as leaves" + } + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] + + #if {![dict exists $data $k]} { + # error "tomlish::dict:path::set error bad path $path. Attempt to access nonexistent element at subpath $pathsofar." + #} + ::set varname v[incr v] + + if {$pathsofar eq $path} { + #see if endpoint of the path given already exists + if {[dict exists $data $k]} { + ::set endpoint [dict get $data $k] + if {[tomlish::dict::is_typeval $endpoint]} { + set existing_tp [dict get $endpoint type] + if {![tomlish::dict::is_typeval $value]} { + error "tomlish::dict::path::set_endpoint error Unable to overwrite subpath '$pathsofar' which is of type $existing_tp with sub-dict. Supplied value not {type value val } with sub-dict: $value" + } + switch -- [dict get $endpoint type] { + ARRAY { + #disallow overwriting array - unless given value is an ARRAY? REVIEW + if {[dict get $value type] ne "ARRAY"} { + error "tomlish::dict::path::set_endpoint error bad path '$path'. Cannot overwrite array with non-array: $value" + } + } + default { + # + } + } + } else { + #endpoint is a typeval dict not a plain typeval - only allow overwrite with a typeval dict + if {![tomlish::dict::is_typeval_dict $value 0]} { + error "tomlish::dict::path::set_endpoint error path '$path'. Cannot overwrite sub-dict (size: [dict size $endpoint]) with non sub-dict: $value" + } + } + ::set $varname $value + dict set vdict $pathsofar $varname + break + } else { + ::set arrdata [dict get $data value] + set idx [tomlish::system::lindex_resolve_basic $arrdata $p] + if {$idx == -1} { + error "tomlish::dict::path::set_endpoint error bad path '$path'. No existing element at $p" + } + ::set data [lindex $arrdata $p] + ::set $varname $data + dict set vdict $pathsofar $varname + } + } + } + #dict for {path varname} $vdict { + # puts "$path $varname\n" + # puts " '[::set $varname]'\n" + # puts "" + #} + ::set i 0 + ::set reverse [lreverse $vdict] + foreach {varname path} $reverse { + set newval [::set $varname] + if {$i+2 == [llength $reverse]} { + ::set k [lindex $path end] + ::set k [string range $k 2 end] ;#first key is always @@something + dict set dict_being_edited $k $newval + #puts "--result $dict_being_edited" + break + } + ::set nextvarname [lindex $reverse $i+2] + ::set nextval [::set $nextvarname] + ::set k [lindex $path end] + if {[string match @@* $k]} { + #dict key + #dict set $nextvarname $k $newval + set_endpoint $nextvarname [list $k] $newval + } else { + #list index + ::set nextarr [dict get $nextval value] + ::lset nextarr $k $newval + dict set $nextvarname value $nextarr + } + ::incr i 2 + } + + return $dict_being_edited + + } + #path must be to a {type ARRAY value } + #REVIEW - how to lappend to deep mixed dict/array structure without rewriting whole datastructure? + proc lappend {dictvariable path args} { + upvar $dictvariable dict_being_edited + ::set data $dict_being_edited + ::set pathsofar [list] + #::set newlist [list] + ::set v 0 + ::set vdict [dict create] + foreach a $args { + if {![string is dict $a]} { + error "tomlish::dict::path::lappend error - lappended arguments must already be in the tomlish form {type value } or be a dict with such forms as leaves" + } + } + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] + if {![dict exists $data $k]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access nonexistent element at subpath $pathsofar." + } + ::set varname v[incr v] + + if {$pathsofar eq $path} { + #see if endpoint of the path given is an ARRAY + ::set endpoint [dict get $data $k] + if {![tomlish::dict::is_typeval $endpoint]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $endpoint type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar is not an array." + } + ::set arrdata [dict get $endpoint value] + ::lappend arrdata {*}$args + dict set endpoint value $arrdata + ::set newlist $endpoint + ::set $varname $newlist + dict set vdict $pathsofar $varname + break + } + ::set data [dict get $data $k] + ::set $varname $data + dict set vdict $pathsofar $varname + } else { + if {![tomlish::dict::is_typeval $data]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar is not an array." + } + ::set varname v[incr v] + if {$pathsofar eq $path} { + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Parent path is not an array." + } + ::set parentarray [dict get $data value] + ::set idx [tomlish::system::lindex_resolve_basic $parentarray $p] + if {$idx == -1} { + error "tomlish::dict::path::lappend error bad path $path. Index $p does not exist." + } + ::set endpoint [lindex $parentarray $p] + if {[dict get $endpoint type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Not an array." + } + + ::set arrdata [dict get $endpoint value] + ::lappend arrdata {*}$args + dict set endpoint value $arrdata + ::set newlist $endpoint + #::lset parentarray $p $newlist + #set parentarray $newlist + ::set $varname $newlist + dict set vdict $pathsofar $varname + break + } else { + ::set arrdata [dict get $data value] + set idx [tomlish::system::lindex_resolve_basic $arrdata $p] + if {$idx == -1} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar, index $p does not exist." + } + ::set data [lindex $arrdata $p] + ::set $varname $data + dict set vdict $pathsofar $varname + } + } + } + #todo tomlish::log::debug ? + #dict for {path varname} $vdict { + # puts "$path $varname\n" + # puts " [::set $varname]\n" + # puts "" + #} + ::set i 0 + ::set reverse [lreverse $vdict] + foreach {varname path} $reverse { + set newval [::set $varname] + if {$i+2 == [llength $reverse]} { + ::set k [lindex $path end] + ::set k [string range $k 2 end] ;#first key is always @@something + dict set dict_being_edited $k $newval + #puts "--result $dict_being_edited" + break + } + ::set nextvarname [lindex $reverse $i+2] + ::set nextval [::set $nextvarname] + ::set k [lindex $path end] + if {[string match @@* $k]} { + #dict key + set k [string range $k 2 end] + dict set $nextvarname $k $newval + } else { + #list index + ::set nextarr [dict get $nextval value] + ::lset nextarr $k $newval + dict set $nextvarname value $nextarr + } + ::incr i 2 + } + return $dict_being_edited + } +} +tcl::namespace::eval tomlish::to_dict { + + + proc @@path {dictkeys} { + lmap v $dictkeys {string cat @@ $v} + } + +} + + +tcl::namespace::eval tomlish::app { + #*** !doctools + #[subsection {Namespace tomlish::app}] + #[para] + #[list_begin definitions] + + tcl::namespace::eval argdoc { + proc test_suites {} { + if {[package provide test::tomlish] eq ""} { + return [list] + } + return [test::tomlish::SUITES] + } + } + + package require punk::args + punk::args::define { + @id -id ::tomlish::app::decoder + @cmd -name tomlish::app::decoder -help\ + "Read toml on stdin until EOF + on error - returns non-zero exit code and writes error to + the errorchannel. + on success - returns zero exit code and writes JSON encoding + of the data to the outputchannel. + This decoder is intended to be compatble with toml-test." + @leaders -min 0 -max 0 + @opts + -help -type none -help\ + "Display this usage message" + -inputchannel -default stdin + -inputencoding -default "iso8859-1" -choicerestricted 0 -choices {utf-8 utf-16 iso8859-1} -help\ + "configure encoding on input channel + iso8859-1 is equivalent to binary encoding" + -outputchannel -default stdout + -errorchannel -default stderr + @values -min 0 -max 0 + } + proc decoder {args} { + set argd [punk::args::parse $args withid ::tomlish::app::decoder] + set ch_input [dict get $argd opts -inputchannel] + set ch_input_enc [dict get $argd opts -inputencoding] + set ch_output [dict get $argd opts -outputchannel] + set ch_error [dict get $argd opts -errorchannel] + if {[dict exists $argd received -help]} { + return [punk::args::usage -scheme info ::tomlish::app::decoder] + } + + chan configure $ch_input -encoding $ch_input_enc + #translation? + chan configure $ch_input -translation lf ;# toml-test invalid/control tests we need to see raw CRs to reject them properly - auto translation won't do. + + #Just slurp it all - presumably we are not handling massive amounts of data on stdin. + # - even if the input is large, we probably don't gain much (aside from possible memory savings?) by attempting to process input as it arrives. + if {[catch { + set inputdata [read $ch_input] + if {$ch_input_enc eq "iso8859-1"} { + set toml [tomlish::toml::from_binary $inputdata] + } else { + set toml $inputdata + } + } errM]} { + puts stderr "read-input error: $errM" + #toml-tests expect exit code 1 + #e.g invalid/encoding/utf16-bom + exit 1 ;#read error + } + try { + set j [::tomlish::toml_to_typedjson $toml] + } on error {em} { + puts $ch_error "decoding failed: '$em'" + exit 1 + } + puts -nonewline $ch_output $j + exit 0 + } + + package require punk::args + punk::args::define { + @id -id ::tomlish::app::encoder + @cmd -name tomlish::app::encoder -help\ + "Read JSON on input until EOF + return non-zero exitcode if JSON data cannot be converted to + a valid TOML representation. + return zero exitcode and TOML data on output if JSON data can + be converted. + This encoder is intended to be compatible with toml-test." + @leaders -min 0 -max 0 + @opts + -help -type none -help \ + "Display this usage message" + -restrict_barekeys -default 0 -help\ + "If true, keys containing unicode will be quoted. + If false, an extended range of barekeys will be used + in unquoted form." + -inputchannel -default stdin + -inputencoding -default "" -choicerestricted 0 -choices {utf-8 utf-16 iso8859-1} -help\ + "configure encoding on input channel + If not supplied, leave at Tcl default" + -outputchannel -default stdout + -errorchannel -default stderr + @values -min 0 -max 0 + } + proc encoder {args} { + set argd [punk::args::parse $args withid ::tomlish::app::encoder] + set restrict_barekeys [dict get $argd opts -restrict_barekeys] + set ch_input [dict get $argd opts -inputchannel] + set ch_input_enc [dict get $argd opts -inputencoding] + set ch_output [dict get $argd opts -outputchannel] + set ch_error [dict get $argd opts -errorchannel] + if {[dict exists $argd received -help]} { + return [punk::args::usage -scheme info ::tomlish::app::encoder] + } + #review + if {$ch_input_enc ne ""} { + chan configure $ch_input -encoding $ch_input_enc + } + #review + chan configure $ch_input -translation lf + + if {[catch { + set json [read $ch_input] + }]} { + exit 2 ;#read error + } + try { + #tomlish::typedjson_to_toml + set toml [::tomlish::toml::from_tomlish_from_dict_from_typedjson $json] + } trap {} {e eopts} { + puts $ch_error "encoding failed: '$e'" + puts $ch_error "$::errorInfo" + exit 1 + } + puts -nonewline $ch_output $toml + exit 0 + } + + punk::args::define { + @dynamic + @id -id ::tomlish::app::test + @cmd -name tomlish::app::test + @leaders + @opts -any 1 + -help -type none -help\ + "Display this usage message + or further info if more args." + -suite -default tests -choices {${[::tomlish::app::argdoc::test_suites]}} + @values -min 0 -max -1 + } + proc test {args} { + package require test::tomlish + set argd [punk::args::parse $args withid ::tomlish::app::test] + set opts [dict get $argd opts] + set values [dict get $argd values] + set received [dict get $argd received] + set solos [dict get $argd solos] + set opt_suite [dict get $opts -suite] + if {[dict exists $received -help] && ![dict exists $received -suite]} { + return [punk::args::usage -scheme info ::tomlish::app::test] + } + + test::tomlish::SUITE $opt_suite + #if {[catch {test::tomlish::SUITE $opt_suite} errM]} { + # puts stderr "Unknown test suite '$opt_suite'. Available suites: [test::tomlish::SUITES]" + # exit 1 + #} + set run_opts [dict remove $opts -suite] + set run_opts [dict remove $run_opts {*}$solos] + set result [test::tomlish::RUN {*}$run_opts {*}$solos {*}$values] + return $result + } + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::app ---}] +} + +proc ::tomlish::appnames {} { + set applist [list] + foreach cmd [info commands ::tomlish::app::*] { + lappend applist [namespace tail $cmd] + } + return $applist +} + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# Secondary API namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval tomlish::lib { + namespace export {[a-z]*}; # Convention: export all lowercase + namespace path [namespace parent] + #*** !doctools + #[subsection {Namespace tomlish::lib}] + #[para] Secondary functions that are part of the API + #[list_begin definitions] + + #proc utility1 {p1 args} { + # #*** !doctools + # #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]] + # #[para]Description of utility1 + # return 1 + #} + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::lib ---}] +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval tomlish::system { + + #taken from punk::lib + proc lindex_resolve_basic {list index} { + #*** !doctools + #[call [fun lindex_resolve_basic] [arg list] [arg index]] + #[para] Accepts index of the forms accepted by Tcl's list commands. (e.g compound indices such as 3+1 end-2) + #[para] returns -1 for out of range at either end, or a valid integer index + #[para] Unlike lindex_resolve; lindex_resolve_basic can't determine if an out of range index was out of range at the lower or upper bound + #[para] This is only likely to be faster than average over lindex_resolve for Tcl which has the builtin lseq command + #[para] The performance advantage is more likely to be present when using compound indexes such as $x+1 or end-1 + #[para] For pure integer indices the performance should be equivalent + + set index [tcl::string::map {_ {}} $index] ;#forward compatibility with integers such as 1_000 + if {[string is integer -strict $index]} { + #can match +i -i + #avoid even the lseq overhead when the index is simple + if {$index < 0 || ($index >= [llength $list])} { + #even though in this case we could return -2 or -3 like lindex_resolve; for consistency we don't, as it's not always determinable for compound indices using the lseq method. + return -1 + } else { + #integer may still have + sign - normalize with expr + return [expr {$index}] + } + } + if {[llength $list]} { + set indices [tomlish::system::range 0 [expr {[llength $list]-1}]] ;# uses lseq if available, has fallback. + #if lseq was available - $indices is an 'arithseries' - theoretically not taking up ram(?) + } else { + set indices [list] + } + set idx [lindex $indices $index] + if {$idx eq ""} { + #we have no way to determine if out of bounds is at lower vs upper end + return -1 + } else { + return $idx + } + } + + if {[info commands ::lseq] ne ""} { + #tcl 8.7+ lseq significantly faster, especially for larger ranges + #The internal rep can be an 'arithseries' with no string representation + #support minimal set from to + proc range {from to} { + lseq $from $to + } + } else { + #lseq accepts basic expressions e.g 4-2 for both arguments + #e.g we can do lseq 0 [llength $list]-1 + #if range is to be consistent with the lseq version above - it should support that, even though we don't support most lseq functionality in either wrapper. + proc range {from to} { + set to [offset_expr $to] + set from [offset_expr $from] + if {$to > $from} { + set count [expr {($to -$from) + 1}] + if {$from == 0} { + return [lsearch -all [lrepeat $count 0] *] + } else { + incr from -1 + return [lmap v [lrepeat $count 0] {incr from}] + } + #slower methods. + #2) + #set i -1 + #set L [lrepeat $count 0] + #lmap v $L {lset L [incr i] [incr from];lindex {}} + #return $L + #3) + #set L {} + #for {set i 0} {$i < $count} {incr i} { + # lappend L [incr from] + #} + #return $L + } elseif {$from > $to} { + set count [expr {$from - $to} + 1] + #1) + if {$to == 0} { + return [lreverse [lsearch -all [lrepeat $count 0] *]] + } else { + incr from + return [lmap v [lrepeat $count 0] {incr from -1}] + } + + #2) + #set i -1 + #set L [lrepeat $count 0] + #lmap v $L {lset L [incr i] [incr from -1];lindex {}} + #return $L + #3) + #set L {} + #for {set i 0} {$i < $count} {incr i} { + # lappend L [incr from -1] + #} + #return $L + } else { + return [list $from] + } + } + } + +} + +if {[info exists ::argc] && $::argc > 0} { + #puts stderr "argc: $::argc args: $::argv" + set arglist $::argv + # -------------- + #make sure any dependant packages that are sourced don't get any commandline args + set ::argv {} + set ::argc 0 + # -------------- + package require punk::args + punk::args::define { + @dynamic + @id -id tomlish::cmdline + @cmd -name tomlish -help\ + "toml encoder/decoder written in Tcl" + @opts -any 1 + -help -type none -help\ + "Display this usage message or more specific + help if further arguments provided." + -app -choices {${[tomlish::appnames]}} + } + try { + set argd [punk::args::parse $arglist withid tomlish::cmdline] + } trap {PUNKARGS VALIDATION} {msg erroropts} { + puts stderr $msg + exit 1 + } + + + lassign [dict values $argd] leaders opts values received solos + if {[dict exists $received -help] && ![dict exists $received -app]} { + #only emit cmdline help if -app not supplied as well - otherwise app function can act on -help for more specific help + #puts stdout "Usage: -app where appname one of:[tomlish::appnames]" + puts stdout [punk::args::usage -scheme info tomlish::cmdline] + exit 0 + } + if {![dict exists $received -app]} { + puts stderr [punk::args::usage -scheme error tomlish::cmdline] + exit 1 + } + + set app [dict get $opts -app] + set appnames [tomlish::appnames] + set app_opts [dict remove $opts -app {*}$solos] + try { + set result [tomlish::app::$app {*}$app_opts {*}$solos {*}$values] + } trap {PUNKARGS VALIDATION} {msg erroropts} { + #The validation error should fully describe the issue + #no need for errortrace - keep the output cleaner + puts stderr $msg + exit 1 + } trap {} {msg erroropts} { + #unexpected error - uncaught throw will produce error trace + #todo - a support msg? Otherwise we may as well just leave off this trap. + throw [dict get $erroropts -errorcode] [dict get $erroropts -errorinfo] + } + if {"-help" in $solos} { + puts stderr $result + exit 1 + } else { + if {$result ne ""} { + puts stdout $result + exit 0 + } + } +} + +## Ready +package provide tomlish [namespace eval tomlish { + variable pkg tomlish + variable version + set version 1.1.6 +}] +return + +#*** !doctools +#[manpage_end] + diff --git a/src/modules/punk/cesu-999999.0a1.0.tm b/src/modules/punk/cesu-999999.0a1.0.tm index e17acd66..fa6e9bb1 100644 --- a/src/modules/punk/cesu-999999.0a1.0.tm +++ b/src/modules/punk/cesu-999999.0a1.0.tm @@ -22,11 +22,11 @@ #[manpage_begin punkshell_module_punk::cesu 0 999999.0a1.0] #[copyright "2024"] #[titledesc {CESU compatibility ehcoding scheme for utf-16: 8-Bit (CESU-8) ??}] [comment {-- Name section and table of contents description --}] -#[moddesc {CESU experimental}] [comment {-- Description at end of page heading --}] +#[moddesc {CESU experimental}] [comment {-- Description at end of page heading --}] #[require punk::cesu] #[keywords module cesu encoding compatibility experimental unofficial] #[description] -#[para] experimental +#[para] experimental # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ @@ -34,8 +34,8 @@ #[section Overview] #[para] overview of punk::cesu #[subsection Concepts] -#[para] cesu-8 may be mistaken for utf-8 if no supplementary chars present. -#[para] see: https://www.unicode.org/reports/tr26/tr26-4.html +#[para] cesu-8 may be mistaken for utf-8 if no supplementary chars present. +#[para] see: https://www.unicode.org/reports/tr26/tr26-4.html #[para] Particulary note discouragement of use especially in external interchange. @@ -52,9 +52,6 @@ package require Tcl 8.6- #*** !doctools #[item] [package {Tcl 8.6}] -# #package require frobz -# #*** !doctools -# #[item] [package {frobz}] #*** !doctools #[list_end] @@ -70,11 +67,11 @@ package require Tcl 8.6- # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ tcl::namespace::eval punk::cesu { tcl::namespace::export {[a-z]*} ;# Convention: export all lowercase - variable PUNKARGS + variable PUNKARGS #*** !doctools #[subsection {Namespace punk::cesu}] - #[para] Core API functions for punk::cesu + #[para] Core API functions for punk::cesu #[list_begin definitions] @@ -127,7 +124,7 @@ tcl::namespace::eval punk::cesu { binary scan $1 c 1 binary scan $2 c 2 binary scan $3 c 3 - puts [list $1 $2 $3] + #puts [list $1 $2 $3] #binary scan $4 c 4 incr 1 ;#// Effectively adds 0x10000 to the codepoint ? @@ -155,7 +152,7 @@ tcl::namespace::eval punk::cesu { [expr {0x80 | (($1 & 0x3) << 4) | (($2 & 0x3C) >> 2)}] \ [expr {0x80 | (($2 & 0x3) << 4) | ($3 & 0xF)}] \ $4] - + } else { puts "Invalid sequence: $char" return $char @@ -177,26 +174,78 @@ tcl::namespace::eval punk::cesu { #e.g from_surrogatestring "note \ud83f\udd1e etc" #e.g from_surrogatestring "faces \ud83d\ude10 \ud83d\ude21 \ud83d\ude31" - #note: from_surrogatestring \U1f600 returns a mouse (\U1f400) instead of smiley + #note: from_surrogatestring \U1f600 returns a mouse (\U1f400) instead of smiley # but from_surrogatestring \U1f400 returns a mouse. # Tcl bug - fixed some time in 9.x - # surrogated_string shouldn't include non BMP chars anyway (G.I.G.O?) + # surrogated_string theoretically shouldn't include non BMP chars anyway (but may in some contexts? mixed surrogate escapes and raw nonbmp?) lappend PUNKARGS [list { @id -id ::punk::cesu::from_surrogatestring @cmd -name punk::cesu::from_surrogatestring -help\ "Convert a string containing surrogate pairs - to string with pairs converted to unicode non-BMP + to Tcl string with pairs converted to unicode non-BMP characters" - @values + @values surrogated_string -help\ "May contain a mix of surrogate pairs and other characters - only the surrogate pairs will be converted." }] - proc from_surrogatestring {surrogated_string} { + + proc from_surrogatestring {str} { + #high surrogate character rep followed by low surrogate character rep + if {[regexp {[\uD800-\uDBFF][\uDC00-\uDFFF]} $str]} { + set str [string map {\[ \\\[ \] \\\]} $str] ;#Make sure any existing commandlike structures aren't executed + return [subst -novariables -nobackslashes [regsub -all {([\uD800-\uDBFF])([\uDC00-\uDFFF])} $str {[surrogatepair_to_codepoint \1 \2]} ]] + } else { + return $str + } + } + proc surrogatepair_to_codepoint {highchar lowchar} { + if {[string length $highchar] != 1 || [string length $lowchar] !=1} { + error "surrogatepair_to_codepoint expected surrogate pair encoded as 2 characters" + } + #NOTE in tcl8 - we get oddity that 'split ""' returns a list of length 1 even though there are 2 chars + #fixed in tcl9 + #lassign [split $2_surrogate_chars ""] highSurrogateChar lowSurrogateChar + + scan $highchar %c highDecimal + scan $lowchar %c lowDecimal + + set highDecimal [expr {$highDecimal - 0xD800}] + set lowDecimal [expr {$lowDecimal - 0xDC00}] + # Combine the values and add 0x10000 to get the original code point + set codepointDecimal [expr {($highDecimal << 10) + $lowDecimal + 0x10000}] + #puts "->codepointDecimal $codepointDecimal" + + #In tcl8 - we will get \uFFFD for non BMP codepoints - todo ? + return [format %c $codepointDecimal] + } + #e.g {\ud83d\ude00} + proc escaped_surrogatepair_to_codepoint {spair} { + set spair [string map {" " ""} $spair] + if {[string length $spair] != 12} { + error "escaped_surrogatepair_to_codepoint expected input of form \\uXXXX\\uXXXX" + } + set normalised [regsub -all {\\+u} $spair ""] + set highSurrogate [string range $normalised 0 3] + set lowSurrogate [string range $normalised 4 end] + scan $highSurrogate %x highDecimal + scan $lowSurrogate %x lowDecimal + + set highDecimal [expr {$highDecimal - 0xD800}] + set lowDecimal [expr {$lowDecimal - 0xDC00}] + # Combine the values and add 0x10000 to get the original code point + set codepointDecimal [expr {($highDecimal << 10) + $lowDecimal + 0x10000}] + return [format %c $codepointDecimal] + } + + proc from_surrogatestring_via_cesu {surrogated_string} { + #we can do this without cesu (from_surrogatestring) set cesu [encoding convertto cesu-8 $surrogated_string] set x [cesu2utf $cesu] encoding convertfrom utf-8 $x } + + proc _to_test {emoji} { puts stderr "_to_test incomplete" set cesu [encoding convertto cesu-8 $e] @@ -209,7 +258,7 @@ tcl::namespace::eval punk::cesu { -format -default escape -choices {raw escape} -choicelabels { raw\ " emit raw surrogate pairs - may not be writable to + may not be writable to output channels" escape\ " emit unprocessed backslash hex @@ -224,7 +273,7 @@ tcl::namespace::eval punk::cesu { e.g >to_surrogatestring -format escape \"mouse: \\U1f400\" mouse: \\uD83D\\uDC00 - " + " }] proc to_surrogatestring {args} { set argd [punk::args::parse $args withid ::punk::cesu::to_surrogatestring] @@ -273,14 +322,14 @@ tcl::namespace::eval punk::cesu { #set lsbinfo [punk::char::char_info_dec $lsbfinal -fields all -except testwidth] ;#don't use all/testwidth will try to emit the char and fail/show error set esc "\\u$msbhex\\u$lsbhex" set raw [format %c $msbfinal][format %c $lsbfinal] - return [dict create escapes $esc msbdec $msbfinal msbhex $msbhex lsbdec $lsbfinal lsbhex $lsbhex raw $raw] + return [dict create escapes $esc msbdec $msbfinal msbhex $msbhex lsbdec $lsbfinal lsbhex $lsbhex raw $raw] } # #test_enc_equivalency \U1f400 \U1f600 proc test_enc_equivalency {c1 c2} { package require punk::ansi - namespace import ::punk::ansi::a+ ::punk::ansi::a + namespace import ::punk::ansi::a+ ::punk::ansi::a foreach enc [lsort [encoding names]] { puts stdout "testing $enc" if {$enc in "iso2022 iso2022-jp iso2022-kr"} { @@ -315,14 +364,14 @@ tcl::namespace::eval punk::cesu::lib { tcl::namespace::path [tcl::namespace::parent] #*** !doctools #[subsection {Namespace punk::cesu::lib}] - #[para] Secondary functions that are part of the API + #[para] Secondary functions that are part of the API #[list_begin definitions] #proc utility1 {p1 args} { # #*** !doctools # #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]] - # #[para]Description of utility1 - # return 1 + # #[para]Description of utility1 + # return 1 #} @@ -340,15 +389,15 @@ tcl::namespace::eval punk::cesu::lib { #tcl::namespace::eval punk::cesu::system { #*** !doctools #[subsection {Namespace punk::cesu::system}] - #[para] Internal functions that are not part of the API + #[para] Internal functions that are not part of the API #} # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# == === === === === === === === === === === === === === === +# == === === === === === === === === === === === === === === # Sample 'about' function with punk::args documentation -# == === === === === === === === === === === === === === === +# == === === === === === === === === === === === === === === tcl::namespace::eval punk::cesu { tcl::namespace::export {[a-z]*} ;# Convention: export all lowercase variable PUNKARGS @@ -371,7 +420,7 @@ tcl::namespace::eval punk::cesu { set about_topics [list] foreach f $topic_funs { set tail [namespace tail $f] - lappend about_topics [string range $tail [string length get_topic_] end] + lappend about_topics [string range $tail [string length get_topic_] end] } #Adjust this function or 'default_topics' if a different order is required return [lsort $about_topics] @@ -379,12 +428,12 @@ tcl::namespace::eval punk::cesu { proc default_topics {} {return [list Description *]} # ------------------------------------------------------------- - # get_topic_ functions add more to auto-include in about topics + # get_topic_ functions add more to auto-include in about topics # ------------------------------------------------------------- proc get_topic_Description {} { - punk::args::lib::tstr [string trim { + punk::args::lib::tstr [string trim { package punk::cesu - description to come.. + cesu and surrogate-pair processing } \n] } proc get_topic_License {} { @@ -406,7 +455,8 @@ tcl::namespace::eval punk::cesu { } proc get_topic_custom-topic {} { punk::args::lib::tstr -return string { - nothing to see here + This library can be used for surrogate-pair handling. + cesu utilities are used internally in from_surrogatestring } } # ------------------------------------------------------------- @@ -415,9 +465,9 @@ tcl::namespace::eval punk::cesu { # we re-use the argument definition from punk::args::standard_about and override some items set overrides [dict create] dict set overrides @id -id "::punk::cesu::about" - dict set overrides @cmd -name "punk::cesu::about" + dict set overrides @cmd -name "punk::cesu::about" dict set overrides @cmd -help [string trim [punk::args::lib::tstr { - About punk::cesu + About punk::cesu }] \n] dict set overrides topic -choices [list {*}[punk::cesu::argdoc::about_topics] *] dict set overrides topic -choicerestricted 1 @@ -433,7 +483,7 @@ tcl::namespace::eval punk::cesu { } } # end of sample 'about' function -# == === === === === === === === === === === === === === === +# == === === === === === === === === === === === === === === # ----------------------------------------------------------------------------- @@ -446,11 +496,11 @@ namespace eval ::punk::args::register { lappend ::punk::args::register::NAMESPACES ::punk::cesu } # ----------------------------------------------------------------------------- -## Ready +## Ready package provide punk::cesu [tcl::namespace::eval punk::cesu { variable pkg punk::cesu variable version - set version 999999.0a1.0 + set version 999999.0a1.0 }] return diff --git a/src/modules/punk/cesu-buildversion.txt b/src/modules/punk/cesu-buildversion.txt index f47d01c8..781c895b 100644 --- a/src/modules/punk/cesu-buildversion.txt +++ b/src/modules/punk/cesu-buildversion.txt @@ -1,3 +1,3 @@ -0.1.0 +0.1.1 #First line must be a semantic version number #all other lines are ignored. diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/overtype-1.6.6.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/overtype-1.6.6.tm new file mode 100644 index 00000000..b4e59ec6 --- /dev/null +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/overtype-1.6.6.tm @@ -0,0 +1,4774 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) Julian Noble 2003-2023 +# +# @@ Meta Begin +# Application overtype 1.6.6 +# Meta platform tcl +# Meta license BSD +# @@ Meta End + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# doctools header +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[manpage_begin overtype_module_overtype 0 1.6.6] +#[copyright "2024"] +#[titledesc {overtype text layout - ansi aware}] [comment {-- Name section and table of contents description --}] +#[moddesc {overtype text layout}] [comment {-- Description at end of page heading --}] +#[require overtype] +#[keywords module text ansi] +#[description] +#[para] - + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section Overview] +#[para] overview of overtype +#[subsection Concepts] +#[para] - + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[subsection dependencies] +#[para] packages used by overtype +#[list_begin itemized] + +package require Tcl 8.6- +package require textutil +package require punk::lib ;#required for lines_as_list +package require punk::ansi ;#required to detect, split, strip and calculate lengths +package require punk::char ;#box drawing - and also unicode character width determination for proper layout of text with double-column-width chars +package require punk::assertion +#*** !doctools +#[item] [package {Tcl 8.6}] +#[item] [package textutil] +#[item] [package punk::ansi] +#[para] - required to detect, split, strip and calculate lengths of text possibly containing ansi codes +#[item] [package punk::char] +#[para] - box drawing - and also unicode character width determination for proper layout of text with double-column-width chars + +# #package require frobz +# #*** !doctools +# #[item] [package {frobz}] + +#*** !doctools +#[list_end] + +#PERFORMANCE notes +#overtype is very performance sensitive - used in ansi output all over the place ie needs to be optimised +#NOTE use of tcl::dict::for tcl::string::range etc instead of ensemble versions. This is for the many tcl 8.6/8.7 interps which don't compile ensemble commands when in safe interps +#similar for tcl::namespace::eval - but this is at least on some versions of Tcl - faster even in a normal interp. Review to see if that holds for Tcl 9. +#for string map: when there are exactly 2 elements - it is faster to use a literal which has a special case optimisation in the c code +#ie use tcl::string::map {\n ""} ... instead of tcl::string::map [list \n ""] ... +#note that we can use unicode (e.g \uFF31) and other escapes such as \t within these curly braces - we don't have to use double quotes +#generally using 'list' is preferred for the map as less error prone. +#can also use: tcl::string::map "token $var" .. but be careful regarding quoting and whitespace in var. This should be used sparingly if at all. + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[section API] + + +#Julian Noble - 2003 +#Released under standard 'BSD license' conditions. +# +#todo - ellipsis truncation indicator for center,right + +#v1.4 2023-07 - naive ansi color handling - todo - fix tcl::string::range +# - need to extract and replace ansi codes? + +tcl::namespace::eval overtype { + namespace import ::punk::assertion::assert + punk::assertion::active true + + namespace path ::punk::lib + + namespace export * + variable default_ellipsis_horizontal "..." ;#fallback + variable default_ellipsis_vertical "..." + tcl::namespace::eval priv { + proc _init {} { + upvar ::overtype::default_ellipsis_horizontal e_h + upvar ::overtype::default_ellipsis_vertical e_v + set e_h [format %c 0x2026] ;#Unicode Horizontal Ellipsis + set e_v [format %c 0x22EE] + #The unicode ellipsis looks more natural than triple-dash which is centred vertically whereas ellipsis is at floorline of text + #Also - unicode ellipsis has semantic meaning that other processors can interpret + #unicode does also provide a midline horizontal ellipsis 0x22EF + + #set e [format %c 0x2504] ;#punk::char::charshort boxd_ltdshhz - Box Drawings Light Triple Dash Horizontal + #if {![catch {package require punk::char}]} { + # set e [punk::char::charshort boxd_ltdshhz] + #} + } + } + priv::_init +} +proc overtype::about {} { + return "Simple text formatting. Author JMN. BSD-License" +} + +tcl::namespace::eval overtype { + variable grapheme_widths [tcl::dict::create] + + variable escape_terminals + #single "final byte" in the range 0x40–0x7E (ASCII @A–Z[\]^_`a–z{|}~). + tcl::dict::set escape_terminals CSI [list @ \\ ^ _ ` | ~ a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z "\{" "\}"] + #tcl::dict::set escape_terminals CSI [list J K m n A B C D E F G s u] ;#basic + tcl::dict::set escape_terminals OSC [list \007 \033\\] ;#note mix of 1 and 2-byte terminals + + #self-contained 2 byte ansi escape sequences - review more? + variable ansi_2byte_codes_dict + set ansi_2byte_codes_dict [tcl::dict::create\ + "reset_terminal" "\u001bc"\ + "save_cursor_posn" "\u001b7"\ + "restore_cursor_posn" "\u001b8"\ + "cursor_up_one" "\u001bM"\ + "NEL - Next Line" "\u001bE"\ + "IND - Down one line" "\u001bD"\ + "HTS - Set Tab Stop" "\u001bH"\ + ] + + #debatable whether strip should reveal the somethinghidden - some terminals don't hide it anyway. + # "PM - Privacy Message" "\u001b^somethinghidden\033\\"\ +} + + + + +proc overtype::string_columns {text} { + if {[punk::ansi::ta::detect $text]} { + #error "error string_columns is for calculating character length of string - ansi codes must be stripped/rendered first e.g with punk::ansi::ansistrip. Alternatively try punk::ansi::printing_length" + set text [punk::ansi::ansistrip $text] + } + return [punk::char::ansifreestring_width $text] +} + +#todo - consider a way to merge overtype::left/centre/right +#These have similar algorithms/requirements - and should be refactored to be argument-wrappers over a function called something like overtype::renderblock +#overtype::renderblock could render the input to a defined (possibly overflowing in x or y) rectangle overlapping the underlay. +#(i.e not even necessariy having it's top left within the underlay) +tcl::namespace::eval overtype::priv { +} + +#could return larger than renderwidth +proc _get_row_append_column {row} { + #obsolete? + upvar outputlines outputlines + set idx [expr {$row -1}] + if {$row <= 1 || $row > [llength $outputlines]} { + return 1 + } else { + upvar opt_expand_right expand_right + upvar renderwidth renderwidth + set existinglen [punk::ansi::printing_length [lindex $outputlines $idx]] + set endpos [expr {$existinglen +1}] + if {$expand_right} { + return $endpos + } else { + if {$endpos > $renderwidth} { + return $renderwidth + 1 + } else { + return $endpos + } + } + } +} + +tcl::namespace::eval overtype { + #*** !doctools + #[subsection {Namespace overtype}] + #[para] Core API functions for overtype + #[list_begin definitions] + + + + #tcl::string::range should generally be avoided for both undertext and overtext which contain ansi escapes and other cursor affecting chars such as \b and \r + #render onto an already-rendered (ansi already processed) 'underlay' string, a possibly ansi-laden 'overlay' string. + #The underlay and overlay can be multiline blocks of text of varying line lengths. + #The overlay may just be an ansi-colourised block - or may contain ansi cursor movements and cursor save/restore calls - in which case the apparent length and width of the overlay can't be determined as if it was a block of text. + #This is a single-shot rendering of strings - ie there is no way to chain another call containing a cursor-restore to previously rendered output and have it know about any cursor-saves in the first call. + # a cursor start position other than top-left is a possible addition to consider. + #see editbuf in punk::repl for a more stateful ansi-processor. Both systems use loops over overtype::renderline + proc renderspace {args} { + #*** !doctools + #[call [fun overtype::renderspace] [arg args] ] + #[para] usage: ?-transparent [lb]0|1[rb]? ?-expand_right [lb]1|0[rb]? ?-ellipsis [lb]1|0[rb]? ?-ellipsistext ...? undertext overtext + + # @c overtype starting at left (overstrike) + # @c can/should we use something like this?: 'format "%-*s" $len $overtext + variable default_ellipsis_horizontal + + if {[llength $args] < 2} { + error {usage: ?-width ? ?-startcolumn ? ?-transparent [0|1|]? ?-expand_right [1|0]? ?-ellipsis [1|0]? ?-ellipsistext ...? undertext overtext} + } + set optargs [lrange $args 0 end-2] + if {[llength $optargs] % 2 == 0} { + set overblock [lindex $args end] + set underblock [lindex $args end-1] + #lassign [lrange $args end-1 end] underblock overblock + set argsflags [lrange $args 0 end-2] + } else { + set optargs [lrange $args 0 end-1] + if {[llength $optargs] %2 == 0} { + set overblock [lindex $args end] + set underblock "" + set argsflags [lrange $args 0 end-1] + } else { + error "renderspace expects opt-val pairs followed by: or just " + } + } + set opts [tcl::dict::create\ + -bias ignored\ + -width \uFFEF\ + -height \uFFEF\ + -startcolumn 1\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -expand_right 0\ + -appendlines 1\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -experimental 0\ + -cp437 0\ + -looplimit \uFFEF\ + -crm_mode 0\ + -reverse_mode 0\ + -insert_mode 0\ + -wrap 0\ + -info 0\ + -console {stdin stdout stderr}\ + ] + #expand_right is perhaps consistent with the idea of the page_size being allowed to grow horizontally.. + # it does not necessarily mean the viewport grows. (which further implies need for horizontal scrolling) + # - it does need to be within some concept of terminal width - as columns must be addressable by ansi sequences. + # - This implies the -width option value must grow if it is tied to the concept of renderspace terminal width! REVIEW. + # - further implication is that if expand_right grows the virtual renderspace terminal width - + # then some sort of reflow/rerender needs to be done for preceeding lines? + # possibly not - as expand_right is distinct from a normal terminal-width change event, + # expand_right being primarily to support other operations such as textblock::table + + #todo - viewport width/height as separate concept to terminal width/height? + #-ellipsis args not used if -wrap is true + foreach {k v} $argsflags { + switch -- $k { + -looplimit - -width - -height - -startcolumn - -bias - -ellipsis - -ellipsistext - -ellipsiswhitespace + - -transparent - -exposed1 - -exposed2 - -experimental + - -expand_right - -appendlines + - -reverse_mode - -crm_mode - -insert_mode + - -cp437 + - -info - -console { + tcl::dict::set opts $k $v + } + -wrap - -autowrap_mode { + #temp alias -autowrap_mode for consistency with renderline + #todo - + tcl::dict::set opts -wrap $v + } + default { + error "overtype::renderspace unknown option '$k'. Known options: [tcl::dict::keys $opts]" + } + } + } + #set opts [tcl::dict::merge $defaults $argsflags] + # -- --- --- --- --- --- + #review - expand_left for RTL text? + set opt_expand_right [tcl::dict::get $opts -expand_right] + #for repl - standard output line indicator is a dash - todo, add a different indicator for a continued line. + set opt_width [tcl::dict::get $opts -width] + set opt_height [tcl::dict::get $opts -height] + set opt_startcolumn [tcl::dict::get $opts -startcolumn] + set opt_appendlines [tcl::dict::get $opts -appendlines] + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_exposed1 [tcl::dict::get $opts -exposed1] ;#widechar_exposed_left - todo + set opt_exposed2 [tcl::dict::get $opts -exposed2] ;#widechar_exposed_right - todo + # -- --- --- --- --- --- + set opt_crm_mode [tcl::dict::get $opts -crm_mode] + set opt_reverse_mode [tcl::dict::get $opts -reverse_mode] + set opt_insert_mode [tcl::dict::get $opts -insert_mode] + ##### + # review -wrap should map onto DECAWM terminal mode - the wrap 2 idea may not fit in with this?. + set opt_autowrap_mode [tcl::dict::get $opts -wrap] + #??? wrap 1 is hard wrap cutting word at exact column, or 1 column early for 2w-glyph, wrap 2 is for language-based word-wrap algorithm (todo) + ##### + # -- --- --- --- --- --- + set opt_cp437 [tcl::dict::get $opts -cp437] + set opt_info [tcl::dict::get $opts -info] + + + + # ---------------------------- + # -experimental dev flag to set flags etc + # ---------------------------- + set data_mode 0 + set edit_mode 0 + set opt_experimental [tcl::dict::get $opts -experimental] + foreach o $opt_experimental { + switch -- $o { + data_mode { + set data_mode 1 + } + edit_mode { + set edit_mode 1 + } + } + } + # ---------------------------- + + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + + #set underlines [split $underblock \n] + + #underblock is a 'rendered' block - so width height make sense + #only non-cursor affecting and non-width occupying ANSI codes should be present. + #ie SGR codes and perhaps things such as PM - although generally those should have been pushed to the application already + #renderwidth & renderheight were originally used with reference to rendering into a 'column' of output e.g a table column - before cursor row/col was implemented. + + if {$opt_width eq "\uFFEF" || $opt_height eq "\uFFEF"} { + lassign [blocksize $underblock] _w renderwidth _h renderheight + if {$opt_width ne "\uFFEF"} { + set renderwidth $opt_width + } + if {$opt_height ne "\uFFEF"} { + set renderheight $opt_height + } + } else { + set renderwidth $opt_width + set renderheight $opt_height + } + #initial state for renderspace 'terminal' reset + set initial_state [dict create\ + renderwidth $renderwidth\ + renderheight $renderheight\ + crm_mode $opt_crm_mode\ + reverse_mode $opt_reverse_mode\ + insert_mode $opt_insert_mode\ + autowrap_mode $opt_autowrap_mode\ + cp437 $opt_cp437\ + ] + #modes + #e.g insert_mode can be toggled by insert key or ansi IRM sequence CSI 4 h|l + #opt_startcolumn ?? - DECSLRM ? + set vtstate $initial_state + + # -- --- --- --- + #REVIEW - do we need ansi resets in the underblock? + if {$underblock eq ""} { + set underlines [lrepeat $renderheight ""] + } else { + set underblock [textblock::join_basic -- $underblock] ;#ensure properly rendered - ansi per-line resets & replays + set underlines [split $underblock \n] + } + #if {$underblock eq ""} { + # set blank "\x1b\[0m\x1b\[0m" + # #set underlines [list "\x1b\[0m\x1b\[0m"] + # set underlines [lrepeat $renderheight $blank] + #} else { + # #lines_as_list -ansiresets 1 will do nothing if -ansiresets 1 isn't specified - REVIEW + # set underlines [lines_as_list -ansiresets 1 $underblock] + #} + # -- --- --- --- + + #todo - reconsider the 'line' as the natural chunking mechanism for the overlay. + #In practice an overlay ANSI stream can be a single line with ansi moves/restores etc - or even have no moves or newlines, just relying on wrapping at the output renderwidth + #In such cases - we process the whole shebazzle for the first output line - only reducing by the applied amount at the head each time, reprocessing the long tail each time. + #(in cases where there are interline moves or cursor jumps anyway) + #This works - but doesn't seem efficient. + #On the other hand.. maybe it depends on the data. For simpler files it's more efficient than splitting first + + #a hack until we work out how to avoid infinite loops... + # + set looplimit [tcl::dict::get $opts -looplimit] + if {$looplimit eq "\uFFEF"} { + #looping for each char is worst case (all newlines?) - anything over that is an indication of something broken? + #do we need any margin above the length? (telnet mapscii.me test) + set looplimit [expr {[tcl::string::length $overblock] + 10}] + } + + #overblock height/width isn't useful in the presence of an ansi input overlay with movements. The number of lines may bear little relationship to the output height + #lassign [blocksize $overblock] _w overblock_width _h overblock_height + + set scheme 4 + switch -- $scheme { + 0 { + #one big chunk + set inputchunks [list $overblock] + } + 1 { + set inputchunks [punk::ansi::ta::split_codes $overblock] + } + 2 { + + #split into lines if possible first - then into plaintext/ansi-sequence chunks ? + set inputchunks [list ""] ;#put an empty plaintext split in for starters + set i 1 + set lines [split $overblock \n] + foreach ln $lines { + if {$i < [llength $lines]} { + append ln \n + } + set sequence_split [punk::ansi::ta::split_codes_single $ln] ;#use split_codes Not split_codes_single? + set lastpt [lindex $inputchunks end] + lset inputchunks end [tcl::string::cat $lastpt [lindex $sequence_split 0]] + lappend inputchunks {*}[lrange $sequence_split 1 end] + incr i + } + } + 3 { + #it turns out line based chunks are faster than the above.. probably because some of those end up doing the regex splitting twice + set lflines [list] + set inputchunks [split $overblock \n] + foreach ln $inputchunks { + append ln \n + lappend lflines $ln + } + if {[llength $lflines]} { + lset lflines end [tcl::string::range [lindex $lflines end] 0 end-1] + } + #set inputchunks $lflines[unset lflines] + set inputchunks [lindex [list $lflines [unset lflines]] 0] + + } + 4 { + set inputchunks [list] + foreach ln [split $overblock \n] { + lappend inputchunks $ln\n + } + if {[llength $inputchunks]} { + lset inputchunks end [tcl::string::range [lindex $inputchunks end] 0 end-1] + } + } + } + + + + + set replay_codes_underlay [tcl::dict::create 1 ""] + #lappend replay_codes_overlay "" + set replay_codes_overlay "[punk::ansi::a]" + set unapplied "" + set cursor_saved_position [tcl::dict::create] + set cursor_saved_attributes "" + + + set outputlines $underlines + set overidx 0 + + #underlines are not necessarily processed in order - depending on cursor-moves applied from overtext + set row 1 + #if {$data_mode} { + # set col [_get_row_append_column $row] + #} else { + set col $opt_startcolumn + #} + + set instruction_stats [tcl::dict::create] + + set loop 0 + #while {$overidx < [llength $inputchunks]} { } + + while {[llength $inputchunks]} { + #set overtext [lindex $inputchunks $overidx]; lset inputchunks $overidx "" + set overtext [lpop inputchunks 0] + if {![tcl::string::length $overtext]} { + incr loop + continue + } + #puts "----->[ansistring VIEW -lf 1 -vt 1 -nul 1 $overtext]<----" + set undertext [lindex $outputlines [expr {$row -1}]] + set renderedrow $row + + #renderline pads each underaly line to width with spaces and should track where end of data is + + + #set overtext [tcl::string::cat [lindex $replay_codes_overlay $overidx] $overtext] + set overtext $replay_codes_overlay$overtext + if {[tcl::dict::exists $replay_codes_underlay $row]} { + set undertext [tcl::dict::get $replay_codes_underlay $row]$undertext + } + #review insert_mode. As an 'overtype' function whose main function is not interactive keystrokes - insert is secondary - + #but even if we didn't want it as an option to the function call - to process ansi adequately we need to support IRM (insertion-replacement mode) ESC [ 4 h|l + set renderopts [list -experimental $opt_experimental\ + -cp437 $opt_cp437\ + -info 1\ + -crm_mode [tcl::dict::get $vtstate crm_mode]\ + -insert_mode [tcl::dict::get $vtstate insert_mode]\ + -autowrap_mode [tcl::dict::get $vtstate autowrap_mode]\ + -reverse_mode [tcl::dict::get $vtstate reverse_mode]\ + -cursor_restore_attributes $cursor_saved_attributes\ + -transparent $opt_transparent\ + -width [tcl::dict::get $vtstate renderwidth]\ + -exposed1 $opt_exposed1\ + -exposed2 $opt_exposed2\ + -expand_right $opt_expand_right\ + -cursor_column $col\ + -cursor_row $row\ + ] + set rinfo [renderline {*}$renderopts $undertext $overtext] + + set instruction [tcl::dict::get $rinfo instruction] + tcl::dict::set vtstate crm_mode [tcl::dict::get $rinfo crm_mode] + tcl::dict::set vtstate insert_mode [tcl::dict::get $rinfo insert_mode] + tcl::dict::set vtstate autowrap_mode [tcl::dict::get $rinfo autowrap_mode] ;# + tcl::dict::set vtstate reverse_mode [tcl::dict::get $rinfo reverse_mode] + #how to support reverse_mode in rendered linelist? we need to examine all pt/code blocks and flip each SGR stack? + # - review - the answer is probably that we don't need to - it is set/reset only during application of overtext + + #Note carefully the difference betw overflow_right and unapplied. + #overflow_right may need to be included in next run before the unapplied data + #overflow_right most commonly has data when in insert_mode + set rendered [tcl::dict::get $rinfo result] + set overflow_right [tcl::dict::get $rinfo overflow_right] + set overflow_right_column [tcl::dict::get $rinfo overflow_right_column] + set unapplied [tcl::dict::get $rinfo unapplied] + set unapplied_list [tcl::dict::get $rinfo unapplied_list] + set post_render_col [tcl::dict::get $rinfo cursor_column] + set post_render_row [tcl::dict::get $rinfo cursor_row] + set c_saved_pos [tcl::dict::get $rinfo cursor_saved_position] + set c_saved_attributes [tcl::dict::get $rinfo cursor_saved_attributes] + set visualwidth [tcl::dict::get $rinfo visualwidth] ;#column width of what is 'rendered' for the line + set insert_lines_above [tcl::dict::get $rinfo insert_lines_above] + set insert_lines_below [tcl::dict::get $rinfo insert_lines_below] + tcl::dict::set replay_codes_underlay [expr {$renderedrow+1}] [tcl::dict::get $rinfo replay_codes_underlay] + + #lset replay_codes_overlay [expr $overidx+1] [tcl::dict::get $rinfo replay_codes_overlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + if {0 && [tcl::dict::get $vtstate reverse_mode]} { + #test branch - todo - prune + puts stderr "---->[ansistring VIEW $replay_codes_overlay] rendered: $rendered" + #review + #JMN3 + set existing_reverse_state 0 + #split_codes_single is single esc sequence - but could have multiple sgr codes within one esc sequence + #e.g \x1b\[0;31;7m has a reset,colour red and reverse + set codeinfo [punk::ansi::codetype::sgr_merge [list $replay_codes_overlay] -info 1] + set codestate_reverse [dict get $codeinfo codestate reverse] + switch -- $codestate_reverse { + 7 { + set existing_reverse_state 1 + } + 27 { + set existing_reverse_state 0 + } + "" { + } + } + if {$existing_reverse_state == 0} { + set rflip [a+ reverse] + } else { + #reverse of reverse + set rflip [a+ noreverse] + } + #note that mergeresult can have multiple esc (due to unmergeables or non sgr codes) + set replay_codes_overlay [punk::ansi::codetype::sgr_merge [list [dict get $codeinfo mergeresult] $rflip]] + puts stderr "---->[ansistring VIEW $replay_codes_overlay] rendered: $rendered" + } + + + + #-- todo - detect looping properly + if {$row > 1 && $overtext ne "" && $unapplied eq $overtext && $post_render_row == $row && $instruction eq ""} { + puts stderr "overtype::renderspace loop?" + puts [ansistring VIEW $rinfo] + break + } + #-- + + if {[tcl::dict::size $c_saved_pos] >= 1} { + set cursor_saved_position $c_saved_pos + set cursor_saved_attributes $c_saved_attributes + } + + + set overflow_handled 0 + + + + set nextprefix "" + + + #todo - handle potential insertion mode as above for cursor restore? + #keeping separate branches for debugging - review and merge as appropriate when stable + set instruction_type [lindex $instruction 0] ;#some instructions have params + tcl::dict::incr instruction_stats $instruction_type + switch -- $instruction_type { + reset { + #reset the 'renderspace terminal' (not underlying terminal) + set row 1 + set col 1 + set vtstate [tcl::dict::merge $vtstate $initial_state] + #todo - clear screen + } + {} { + #end of supplied line input + #lf included in data + set row $post_render_row + set col $post_render_col + if {![llength $unapplied_list]} { + if {$overflow_right ne ""} { + incr row + } + } else { + puts stderr "renderspace end of input line - has unapplied: [ansistring VIEW $unapplied] (review)" + } + set col $opt_startcolumn + } + up { + + #renderline knows it's own line number, and knows not to go above row l + #it knows that a move whilst 1-beyond the width conflicts with the linefeed and reduces the move by one accordingly. + #row returned should be correct. + #column may be the overflow column - as it likes to report that to the caller. + + #Note that an ansi up sequence after last column going up to a previous line and also beyond the last column, will result in the next grapheme going onto the following line. + #this seems correct - as the column remains beyond the right margin so subsequent chars wrap (?) review + #puts stderr "up $post_render_row" + #puts stderr "$rinfo" + + #puts stdout "1 row:$row col $col" + set row $post_render_row + #data_mode (naming?) determines if we move to end of existing data or not. + #data_mode 0 means ignore existing line length and go to exact column + #set by -experimental flag + if {$data_mode == 0} { + set col $post_render_col + } else { + #This doesn't really work if columns are pre-filled with spaces..we can't distinguish them from data + #we need renderline to return the number of the maximum column filled (or min if we ever do r-to-l) + set existingdata [lindex $outputlines [expr {$post_render_row -1}]] + set lastdatacol [punk::ansi::printing_length $existingdata] + if {$lastdatacol < $renderwidth} { + set col [expr {$lastdatacol+1}] + } else { + set col $renderwidth + } + } + + #puts stdout "2 row:$row col $col" + #puts stdout "-----------------------" + #puts stdout $rinfo + #flush stdout + } + down { + if {$data_mode == 0} { + #renderline doesn't know how far down we can go.. + if {$post_render_row > [llength $outputlines]} { + if {$opt_appendlines} { + set diff [expr {$post_render_row - [llength $outputlines]}] + if {$diff > 0} { + lappend outputlines {*}[lrepeat $diff ""] + } + lappend outputlines "" + } + } + set row $post_render_row + set col $post_render_col + } else { + if {$post_render_row > [llength $outputlines]} { + if {$opt_appendlines} { + set diff [expr {$post_render_row - [llength $outputlines]}] + if {$diff > 0} { + lappend outputlines {*}[lrepeat $diff ""] + } + lappend outputlines "" + } + } + set existingdata [lindex $outputlines [expr {$post_render_row -1}]] + set lastdatacol [punk::ansi::printing_length $existingdata] + if {$lastdatacol < $renderwidth} { + set col [expr {$lastdatacol+1}] + } else { + set col $renderwidth + } + + } + } + restore_cursor { + #testfile belinda.ans uses this + + #puts stdout "[a+ blue bold]CURSOR_RESTORE[a]" + if {[tcl::dict::exists $cursor_saved_position row]} { + set row [tcl::dict::get $cursor_saved_position row] + set col [tcl::dict::get $cursor_saved_position column] + #puts stdout "restoring: row $row col $col [ansistring VIEW $cursor_saved_attributes] [a] unapplied [ansistring VIEWCODES $unapplied]" + #set nextprefix $cursor_saved_attributes + #lset replay_codes_overlay [expr $overidx+1] $cursor_saved_attributes + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay]$cursor_saved_attributes + #set replay_codes_overlay $cursor_saved_attributes + set cursor_saved_position [tcl::dict::create] + set cursor_saved_attributes "" + } else { + #TODO + #?restore without save? + #should move to home position and reset ansi SGR? + #puts stderr "overtype::renderspace cursor_restore without save data available" + } + #If we were inserting prior to hitting the cursor_restore - there could be overflow_right data - generally the overtype functions aren't for inserting - but ansi can enable it + #if we were already in overflow when cursor_restore was hit - it shouldn't have been processed as an action - just stored. + if {!$overflow_handled && $overflow_right ne ""} { + #wrap before restore? - possible effect on saved cursor position + #this overflow data has previously been rendered so has no cursor movements or further save/restore operations etc + #we can just insert another call to renderline to solve this.. ? + #It would perhaps be more properly handled as a queue of instructions from our initial renderline call + #we don't need to worry about overflow next call (?)- but we should carry forward our gx and ansi stacks + + puts stdout ">>>[a+ red bold]overflow_right during restore_cursor[a]" + + set sub_info [overtype::renderline\ + -info 1\ + -width [tcl::dict::get $vtstate renderwidth]\ + -insert_mode [tcl::dict::get $vtstate insert_mode]\ + -autowrap_mode [tcl::dict::get $vtstate autowrap_mode]\ + -expand_right [tcl::dict::get $opts -expand_right]\ + ""\ + $overflow_right\ + ] + set foldline [tcl::dict::get $sub_info result] + tcl::dict::set vtstate insert_mode [tcl::dict::get $sub_info insert_mode] ;#probably not needed..? + tcl::dict::set vtstate autowrap_mode [tcl::dict::get $sub_info autowrap_mode] ;#nor this.. + linsert outputlines $renderedrow $foldline + #review - row & col set by restore - but not if there was no save.. + } + set overflow_handled 1 + + } + move { + ######## + if {$post_render_row > [llength $outputlines]} { + #Ansi moves need to create new lines ? + #if {$opt_appendlines} { + # set diff [expr {$post_render_row - [llength $outputlines]}] + # if {$diff > 0} { + # lappend outputlines {*}[lrepeat $diff ""] + # } + # set row $post_render_row + #} else { + set row [llength $outputlines] + #} + } else { + set row $post_render_row + } + ####### + set col $post_render_col + #overflow + unapplied? + } + clear_and_move { + #e.g 2J + if {$post_render_row > [llength $outputlines]} { + set row [llength $outputlines] + } else { + set row $post_render_row + } + set col $post_render_col + set overflow_right "" ;#if we're clearing - any overflow due to insert_mode is irrelevant + set clearedlines [list] + foreach ln $outputlines { + lappend clearedlines \x1b\[0m$replay_codes_overlay[string repeat \000 $renderwidth]\x1b\[0m + if 0 { + + set lineparts [punk::ansi::ta::split_codes $ln] + set numcells 0 + foreach {pt _code} $lineparts { + if {$pt ne ""} { + foreach grapheme [punk::char::grapheme_split $pt] { + switch -- $grapheme { + " " - - - _ - ! - @ - # - $ - % - ^ - & - * - = - + - : - . - , - / - | - ? - + a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - + z - A - B - C - D - E - F - G - H - I - J - K - L - M - N - O - P - Q - R - S - T - U - V - W - X - Y - Z - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 { + incr numcells 1 + } + default { + if {$grapheme eq "\u0000"} { + incr numcells 1 + } else { + incr numcells [grapheme_width_cached $grapheme] + } + } + } + + } + } + } + #replays/resets each line + lappend clearedlines \x1b\[0m$replay_codes_overlay[string repeat \000 $numcells]\x1b\[0m + } + } + set outputlines $clearedlines + #todo - determine background/default to be in effect - DECECM ? + puts stderr "replay_codes_overlay: [ansistring VIEW $replay_codes_overlay]" + #lset outputlines 0 $replay_codes_overlay[lindex $outputlines 0] + + } + lf_start { + #raw newlines + # ---------------------- + #test with fruit.ans + #test - treating as newline below... + #append rendered $overflow_right + #set overflow_right "" + set row $renderedrow + incr row + if {$row > [llength $outputlines]} { + lappend outputlines "" + } + set col $opt_startcolumn + # ---------------------- + } + lf_mid { + + set edit_mode 0 + if {$edit_mode} { + set inputchunks [linsert $inputchunks 0 $overflow_right$unapplied] + set overflow_right "" + set unapplied "" + set row $post_render_row + #set col $post_render_col + set col $opt_startcolumn + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + } else { + if 1 { + if {$overflow_right ne ""} { + if {$opt_expand_right} { + append rendered $overflow_right + set overflow_right "" + } else { + #review - we should really make renderline do the work...? + set overflow_width [punk::ansi::printing_length $overflow_right] + if {$visualwidth + $overflow_width <= $renderwidth} { + append rendered $overflow_right + set overflow_right "" + } else { + if {[tcl::dict::get $vtstate autowrap_mode]} { + set outputlines [linsert $outputlines $renderedrow $overflow_right] + set overflow_right "" + set row [expr {$renderedrow + 2}] + } else { + set overflow_right "" ;#abandon + } + + if {0 && $visualwidth < $renderwidth} { + puts stderr "visualwidth: $visualwidth < renderwidth:$renderwidth" + error "incomplete - abandon?" + set overflowparts [punk::ansi::ta::split_codes $overflow_right] + set remaining_overflow $overflowparts + set filled 0 + foreach {pt code} $overflowparts { + lpop remaining_overflow 0 + if {$pt ne ""} { + set graphemes [punk::char::grapheme_split $pt] + set add "" + set addlen $visualwidth + foreach g $graphemes { + set w [overtype::grapheme_width_cached $g] + if {$addlen + $w <= $renderwidth} { + append add $g + incr addlen $w + } else { + set filled 1 + break + } + } + append rendered $add + } + if {!$filled} { + lpop remaining_overflow 0 ;#pop code + } + } + set overflow_right [join $remaining_overflow ""] + } + } + } + } + set row $post_render_row + set col $opt_startcolumn + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + } else { + #old version - known to work with various ansi graphics - e.g fruit.ans + # - but fails to limit lines to renderwidth when expand_right == 0 + append rendered $overflow_right + set overflow_right "" + set row $post_render_row + set col $opt_startcolumn + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + } + } + } + lf_overflow { + #linefeed after renderwidth e.g at column 81 for an 80 col width + #we may also have other control sequences that came after col 80 e.g cursor save + + if 0 { + set lhs [overtype::left -width 40 -wrap 1 "" [ansistring VIEWSTYLE -nul 1 -lf 1 -vt 1 $rendered]] + set lhs [textblock::frame -title "rendered $visualwidth cols" -subtitle "row-$renderedrow" $lhs] + set rhs "" + + #assertion - there should be no overflow.. + puts $lhs + } + if {![tcl::dict::get $vtstate insert_mode]} { + assert {$overflow_right eq ""} lf_overflow should not get data in overflow_right when not insert_mode + } + + set row $post_render_row + #set row $renderedrow + #incr row + #only add newline if we're at the bottom + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + set col $opt_startcolumn + + } + newlines_above { + #we get a newlines_above instruction when received at column 1 + #In some cases we want to treat that as request to insert a new blank line above, and move our row 1 down (staying with the data) + #in other cases - we want to treat at column 1 the same as any other + + puts "--->newlines_above" + puts "rinfo: $rinfo" + #renderline doesn't advance the row for us - the caller has the choice to implement or not + set row $post_render_row + set col $post_render_col + if {$insert_lines_above > 0} { + set row $renderedrow + set outputlines [linsert $outputlines $renderedrow-1 {*}[lrepeat $insert_lines_above ""]] + incr row [expr {$insert_lines_above -1}] ;#we should end up on the same line of text (at a different index), with new empties inserted above + #? set row $post_render_row #can renderline tell us? + } + } + newlines_below { + #obsolete? - use for ANSI insert lines sequence + if {$data_mode == 0} { + puts --->nl_below + set row $post_render_row + set col $post_render_col + if {$insert_lines_below == 1} { + #set lhs [overtype::left -width 40 -wrap 1 "" [ansistring VIEWSTYLE -lf 1 -vt 1 $rendered]] + #set lhs [textblock::frame -title rendered -subtitle "row-$renderedrow" $lhs] + #set rhs "" + #if {$overflow_right ne ""} { + # set rhs [overtype::left -width 40 -wrap 1 "" [ansistring VIEWSTYLE -lf 1 -vt 1 $overflow_right]] + # set rhs [textblock::frame -title overflow_right $rhs] + #} + #puts [textblock::join $lhs $rhs] + + #rendered + append rendered $overflow_right + # + + + set overflow_right "" + set row $renderedrow + #only add newline if we're at the bottom + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat $insert_lines_below ""] + } + incr row $insert_lines_below + set col $opt_startcolumn + } + } else { + set row $post_render_row + if {$post_render_row > [llength $outputlines]} { + if {$opt_appendlines} { + set diff [expr {$post_render_row - [llength $outputlines]}] + if {$diff > 0} { + lappend outputlines {*}[lrepeat $diff ""] + } + lappend outputlines "" + } + } else { + set existingdata [lindex $outputlines [expr {$post_render_row -1}]] + set lastdatacol [punk::ansi::printing_length $existingdata] + if {$lastdatacol < $renderwidth} { + set col [expr {$lastdatacol+1}] + } else { + set col $renderwidth + } + } + } + } + wrapmoveforward { + #doesn't seem to be used by fruit.ans testfile + #used by dzds.ans + #note that cursor_forward may move deep into the next line - or even span multiple lines !TODO + set c $renderwidth + set r $post_render_row + if {$post_render_col > $renderwidth} { + set i $c + while {$i <= $post_render_col} { + if {$c == $renderwidth+1} { + incr r + if {$opt_appendlines} { + if {$r < [llength $outputlines]} { + lappend outputlines "" + } + } + set c $opt_startcolumn + } else { + incr c + } + incr i + } + set col $c + } else { + #why are we getting this instruction then? + puts stderr "wrapmoveforward - test" + set r [expr {$post_render_row +1}] + set c $post_render_col + } + set row $r + set col $c + } + wrapmovebackward { + set c $renderwidth + set r $post_render_row + if {$post_render_col < 1} { + set c 1 + set i $c + while {$i >= $post_render_col} { + if {$c == 0} { + if {$r > 1} { + incr r -1 + set c $renderwidth + } else { + #leave r at 1 set c 1 + #testfile besthpav.ans first line top left border alignment + set c 1 + break + } + } else { + incr c -1 + } + incr i -1 + } + set col $c + } else { + puts stderr "Wrapmovebackward - but postrendercol >= 1???" + } + set row $r + set col $c + } + overflow { + #normal single-width grapheme overflow + #puts "----normal overflow --- [ansistring VIEWSTYLE -lf 1 -nul 1 -vt 1 $rendered]" + set row $post_render_row ;#renderline will not advance row when reporting overflow char + if {[tcl::dict::get $vtstate autowrap_mode]} { + incr row + set col $opt_startcolumn ;#whether wrap or not - next data is at column 1 ?? + } else { + set col $post_render_col + #set unapplied "" ;#this seems wrong? + #set unapplied [tcl::string::range $unapplied 1 end] + #The overflow can only be triggered by a grapheme (todo cluster?) - but our unapplied could contain SGR codes prior to the grapheme that triggered overflow - so we need to skip beyond any SGRs + #There may be more than one, because although the stack leading up to overflow may have been merged - codes between the last column and the overflowing grapheme will remain separate + #We don't expect any movement or other ANSI codes - as if they came before the grapheme, they would have triggered a different instruction to 'overflow' + set idx 0 + set next_grapheme_index -1 + foreach u $unapplied_list { + if {![punk::ansi::ta::detect $u]} { + set next_grapheme_index $idx + break + } + incr idx + } + assert {$next_grapheme_index >= 0} + #drop the overflow grapheme - keeping all codes in place. + set unapplied [join [lreplace $unapplied_list $next_grapheme_index $next_grapheme_index] ""] + #we need to run the reduced unapplied on the same line - further graphemes will just overflow again, but codes or control chars could trigger jumps to other lines + + set overflow_handled 1 + #handled by dropping overflow if any + } + } + overflow_splitchar { + set row $post_render_row ;#renderline will not advance row when reporting overflow char + + #2nd half of grapheme would overflow - treggering grapheme is returned in unapplied. There may also be overflow_right from earlier inserts + #todo - consider various options .. re-render a single trailing space or placeholder on same output line, etc + if {[tcl::dict::get $vtstate autowrap_mode]} { + if {$renderwidth < 2} { + #edge case of rendering to a single column output - any 2w char will just cause a loop if we don't substitute with something, or drop the character + set idx 0 + set triggering_grapheme_index -1 + foreach u $unapplied_list { + if {![punk::ansi::ta::detect $u]} { + set triggering_grapheme_index $idx + break + } + incr idx + } + set unapplied [join [lreplace $unapplied_list $triggering_grapheme_index $triggering_grapheme_index $opt_exposed1] ""] + } else { + set col $opt_startcolumn + incr row + } + } else { + set overflow_handled 1 + #handled by dropping entire overflow if any + if {$renderwidth < 2} { + set idx 0 + set triggering_grapheme_index -1 + foreach u $unapplied_list { + if {![punk::ansi::ta::detect $u]} { + set triggering_grapheme_index $idx + break + } + incr idx + } + set unapplied [join [lreplace $unapplied_list $triggering_grapheme_index $triggering_grapheme_index $opt_exposed1] ""] + } + } + + } + vt { + + #can vt add a line like a linefeed can? + set row $post_render_row + set col $post_render_col + } + set_window_title { + set newtitle [lindex $instruction 1] + puts stderr "overtype::renderspace set_window_title [ansistring VIEW $newtitle] instruction '$instruction'" + # + } + reset_colour_palette { + puts stderr "overtype::renderspace instruction '$instruction' unimplemented" + } + default { + puts stderr "overtype::renderspace unhandled renderline instruction '$instruction'" + } + + } + + + if {!$opt_expand_right && ![tcl::dict::get $vtstate autowrap_mode]} { + #not allowed to overflow column or wrap therefore we get overflow data to truncate + if {[tcl::dict::get $opts -ellipsis]} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + set lostdata "" + if {$overflow_right ne ""} { + append lostdata $overflow_right + } + if {$unapplied ne ""} { + append lostdata $unapplied + } + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + #set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + if {[tcl::string::trim [punk::ansi::ansistrip $lostdata]] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set rendered [overtype::right $rendered $opt_ellipsistext] + } + set overflow_handled 1 + } else { + #no wrap - no ellipsis - silently truncate + set overflow_handled 1 + } + } + + + + if {$renderedrow <= [llength $outputlines]} { + lset outputlines [expr {$renderedrow-1}] $rendered + } else { + if {$opt_appendlines} { + lappend outputlines $rendered + } else { + #? + lset outputlines [expr {$renderedrow-1}] $rendered + } + } + + if {!$overflow_handled} { + append nextprefix $overflow_right + } + + append nextprefix $unapplied + + if 0 { + if {$nextprefix ne ""} { + set nextoveridx [expr {$overidx+1}] + if {$nextoveridx >= [llength $inputchunks]} { + lappend inputchunks $nextprefix + } else { + #lset overlines $nextoveridx $nextprefix[lindex $overlines $nextoveridx] + set inputchunks [linsert $inputchunks $nextoveridx $nextprefix] + } + } + } + + if {$nextprefix ne ""} { + set inputchunks [linsert $inputchunks 0 $nextprefix] + } + + + incr overidx + incr loop + if {$loop >= $looplimit} { + puts stderr "overtype::renderspace looplimit reached ($looplimit)" + lappend outputlines "[a+ red bold] - looplimit $looplimit reached[a]" + set Y [a+ yellow bold] + set RST [a] + set sep_header ----DEBUG----- + set debugmsg "" + append debugmsg "${Y}${sep_header}${RST}" \n + append debugmsg "looplimit $looplimit reached\n" + append debugmsg "data_mode:$data_mode\n" + append debugmsg "opt_appendlines:$opt_appendlines\n" + append debugmsg "prev_row :[tcl::dict::get $renderopts -cursor_row]\n" + append debugmsg "prev_col :[tcl::dict::get $renderopts -cursor_column]\n" + tcl::dict::for {k v} $rinfo { + append debugmsg "${Y}$k [ansistring VIEW -lf 1 -vt 1 $v]$RST" \n + } + append debugmsg "${Y}[string repeat - [string length $sep_header]]$RST" \n + + puts stdout $debugmsg + #todo - config regarding error dumps rather than just dumping in working dir + set fd [open [pwd]/error_overtype.txt w] + puts $fd $debugmsg + close $fd + error $debugmsg + break + } + } + + set result [join $outputlines \n] + if {!$opt_info} { + return $result + } else { + #emit to debug window like basictelnet does? make debug configurable as syslog or even a telnet server to allow on 2nd window? + #append result \n$instruction_stats\n + set inforesult [dict create\ + result $result\ + last_instruction $instruction\ + instruction_stats $instruction_stats\ + ] + if {$opt_info == 2} { + return [pdict -channel none inforesult] + } else { + return $inforesult + } + } + } + + #todo - left-right ellipsis ? + proc centre {args} { + variable default_ellipsis_horizontal + if {[llength $args] < 2} { + error {usage: ?-transparent [0|1]? ?-bias [left|right]? ?-overflow [1|0]? undertext overtext} + } + + foreach {underblock overblock} [lrange $args end-1 end] break + + #todo - vertical vs horizontal overflow for blocks + set opts [tcl::dict::create\ + -bias left\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -overflow 0\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + ] + set argsflags [lrange $args 0 end-2] + foreach {k v} $argsflags { + switch -- $k { + -bias - -ellipsis - -ellipsistext - -ellipsiswhitespace - -overflow - -transparent - -exposed1 - -exposed2 { + tcl::dict::set opts $k $v + } + default { + set known_opts [tcl::dict::keys $opts] + error "overtype::centre unknown option '$k'. Known options: $known_opts" + } + } + } + #set opts [tcl::dict::merge $defaults $argsflags] + # -- --- --- --- --- --- + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsis [tcl::dict::get $opts -ellipsis] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + # -- --- --- --- --- --- + + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + set underlines [split $underblock \n] + #set renderwidth [tcl::mathfunc::max {*}[lmap v $underlines {punk::ansi::printing_length $v}]] + lassign [blocksize $underblock] _w renderwidth _h renderheight + set overlines [split $overblock \n] + lassign [blocksize $overblock] _w overblock_width _h overblock_height + set under_exposed_max [expr {$renderwidth - $overblock_width}] + if {$under_exposed_max > 0} { + #background block is wider + if {$under_exposed_max % 2 == 0} { + #even left/right exposure + set left_exposed [expr {$under_exposed_max / 2}] + } else { + set beforehalf [expr {$under_exposed_max / 2}] ;#1 less than half due to integer division + if {[tcl::string::tolower [tcl::dict::get $opts -bias]] eq "left"} { + set left_exposed $beforehalf + } else { + #bias to the right + set left_exposed [expr {$beforehalf + 1}] + } + } + } else { + set left_exposed 0 + } + + set outputlines [list] + if {[punk::ansi::ta::detect_sgr [lindex $overlines 0]]} { + set replay_codes "[punk::ansi::a]" + } else { + set replay_codes "" + } + set replay_codes_underlay "" + set replay_codes_overlay "" + foreach undertext $underlines overtext $overlines { + set overtext_datalen [punk::ansi::printing_length $overtext] + set ulen [punk::ansi::printing_length $undertext] + if {$ulen < $renderwidth} { + set udiff [expr {$renderwidth - $ulen}] + set undertext "$undertext[string repeat { } $udiff]" + } + set undertext $replay_codes_underlay$undertext + set overtext $replay_codes_overlay$overtext + + set overflowlength [expr {$overtext_datalen - $renderwidth}] + #review - right-to-left langs should elide on left! - extra option required + + if {$overflowlength > 0} { + #overlay line wider or equal + #review - we still expand_right for centred for now.. possibly should do something like -expand_leftright with ellipsis each end? + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -expand_right [tcl::dict::get $opts -overflow] -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 $undertext $overtext] + set rendered [tcl::dict::get $rinfo result] + set overflow_right [tcl::dict::get $rinfo overflow_right] + set unapplied [tcl::dict::get $rinfo unapplied] + #todo - get replay_codes from overflow_right instead of wherever it was truncated? + + #overlay line data is wider - trim if overflow not specified in opts - and overtype an ellipsis at right if it was specified + if {![tcl::dict::get $opts -overflow]} { + #lappend outputlines [tcl::string::range $overtext 0 [expr {$renderwidth - 1}]] + #set overtext [tcl::string::range $overtext 0 $renderwidth-1 ] + if {$opt_ellipsis} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + #don't use tcl::string::range on ANSI data + #set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + set lostdata "" + if {$overflow_right ne ""} { + append lostdata $overflow_right + } + if {$unapplied ne ""} { + append lostdata $unapplied + } + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set rendered [overtype::right $rendered $opt_ellipsistext] + } + } + } + lappend outputlines $rendered + #lappend outputlines [renderline -insert_mode 0 -transparent $opt_transparent $undertext $overtext] + } else { + #background block is wider than or equal to data for this line + #lappend outputlines [renderline -insert_mode 0 -startcolumn [expr {$left_exposed + 1}] -transparent $opt_transparent -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 $undertext $overtext] + set rinfo [renderline -info 1 -insert_mode 0 -startcolumn [expr {$left_exposed + 1}] -transparent $opt_transparent -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 $undertext $overtext] + lappend outputlines [tcl::dict::get $rinfo result] + } + set replay_codes_underlay [tcl::dict::get $rinfo replay_codes_underlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + } + return [join $outputlines \n] + } + + #overtype::right is for a rendered ragged underblock and a rendered ragged overblock + #ie we can determine the block width for bost by examining the lines and picking the longest. + # + proc right {args} { + #NOT the same as align-right - which should be done to the overblock first if required + variable default_ellipsis_horizontal + # @d !todo - implement overflow, length checks etc + + if {[llength $args] < 2} { + error {usage: ?-overflow [1|0]? ?-transparent 0|? undertext overtext} + } + foreach {underblock overblock} [lrange $args end-1 end] break + + set opts [tcl::dict::create\ + -bias ignored\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -overflow 0\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -align "left"\ + ] + set argsflags [lrange $args 0 end-2] + tcl::dict::for {k v} $argsflags { + switch -- $k { + -bias - -ellipsis - -ellipsistext - -ellipsiswhitespace - -overflow - -transparent - -exposed1 - -exposed2 - -align { + tcl::dict::set opts $k $v + } + default { + set known_opts [tcl::dict::keys $opts] + error "overtype::centre unknown option '$k'. Known options: $known_opts" + } + } + } + #set opts [tcl::dict::merge $defaults $argsflags] + # -- --- --- --- --- --- + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsis [tcl::dict::get $opts -ellipsis] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_overflow [tcl::dict::get $opts -overflow] + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + set opt_align [tcl::dict::get $opts -align] + # -- --- --- --- --- --- + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + set underlines [split $underblock \n] + lassign [blocksize $underblock] _w renderwidth _h renderheight + set overlines [split $overblock \n] + #set overblock_width [tcl::mathfunc::max {*}[lmap v $overlines {punk::ansi::printing_length $v}]] + lassign [blocksize $overblock] _w overblock_width _h overblock_height + set under_exposed_max [expr {max(0,$renderwidth - $overblock_width)}] + set left_exposed $under_exposed_max + + + + set outputlines [list] + if {[punk::ansi::ta::detect_sgr [lindex $overlines 0]]} { + set replay_codes "[punk::ansi::a]" + } else { + set replay_codes "" + } + set replay_codes_underlay "" + set replay_codes_overlay "" + foreach undertext $underlines overtext $overlines { + set overtext_datalen [punk::ansi::printing_length $overtext] + set ulen [punk::ansi::printing_length $undertext] + if {$ulen < $renderwidth} { + set udiff [expr {$renderwidth - $ulen}] + #puts xxx + append undertext [string repeat { } $udiff] + } + if {$overtext_datalen < $overblock_width} { + set odiff [expr {$overblock_width - $overtext_datalen}] + switch -- $opt_align { + left { + set startoffset 0 + } + right { + set startoffset $odiff + } + default { + set half [expr {$odiff / 2}] + #set lhs [string repeat { } $half] + #set righthalf [expr {$odiff - $half}] ;#remainder - may be one more - so we are biased left + #set rhs [string repeat { } $righthalf] + set startoffset $half + } + } + } else { + set startoffset 0 ;#negative? + } + + set undertext $replay_codes_underlay$undertext + set overtext $replay_codes_overlay$overtext + + set overflowlength [expr {$overtext_datalen - $renderwidth}] + if {$overflowlength > 0} { + #raw overtext wider than undertext column + set rinfo [renderline\ + -info 1\ + -insert_mode 0\ + -transparent $opt_transparent\ + -exposed1 $opt_exposed1 -exposed2 $opt_exposed2\ + -overflow $opt_overflow\ + -startcolumn [expr {1 + $startoffset}]\ + $undertext $overtext] + set replay_codes [tcl::dict::get $rinfo replay_codes] + set rendered [tcl::dict::get $rinfo result] + if {!$opt_overflow} { + if {$opt_ellipsis} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set ellipsis $replay_codes$opt_ellipsistext + #todo - overflow on left if allign = right?? + set rendered [overtype::right $rendered $ellipsis] + } + } + } + lappend outputlines $rendered + } else { + #padded overtext + #lappend outputlines [renderline -insert_mode 0 -transparent $opt_transparent -startcolumn [expr {$left_exposed + 1}] $undertext $overtext] + #Note - we still need overflow(exapnd_right) here - as although the overtext is short - it may oveflow due to the startoffset + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -expand_right $opt_overflow -startcolumn [expr {$left_exposed + 1 + $startoffset}] $undertext $overtext] + lappend outputlines [tcl::dict::get $rinfo result] + } + set replay_codes [tcl::dict::get $rinfo replay_codes] + set replay_codes_underlay [tcl::dict::get $rinfo replay_codes_underlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + } + + return [join $outputlines \n] + } + + proc left {args} { + overtype::block -blockalign left {*}$args + } + #overtype a (possibly ragged) underblock with a (possibly ragged) overblock + proc block {args} { + variable default_ellipsis_horizontal + # @d !todo - implement overflow, length checks etc + + if {[llength $args] < 2} { + error {usage: ?-blockalign left|centre|right? ?-textalign left|centre|right? ?-overflow [1|0]? ?-transparent 0|? undertext overtext} + } + #foreach {underblock overblock} [lrange $args end-1 end] break + lassign [lrange $args end-1 end] underblock overblock + + set opts [tcl::dict::create\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -overflow 0\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -textalign "left"\ + -textvertical "top"\ + -blockalign "left"\ + -blockalignbias left\ + -blockvertical "top"\ + ] + set argsflags [lrange $args 0 end-2] + tcl::dict::for {k v} $argsflags { + switch -- $k { + -blockalignbias - -ellipsis - -ellipsistext - -ellipsiswhitespace - -overflow - -transparent - -exposed1 - -exposed2 - -textalign - -blockalign - -blockvertical { + tcl::dict::set opts $k $v + } + default { + error "overtype::block unknown option '$k'. Known options: [tcl::dict::keys $opts]" + } + } + } + # -- --- --- --- --- --- + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsis [tcl::dict::get $opts -ellipsis] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_overflow [tcl::dict::get $opts -overflow] + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + set opt_textalign [tcl::dict::get $opts -textalign] + set opt_blockalign [tcl::dict::get $opts -blockalign] + if {$opt_blockalign eq "center"} { + set opt_blockalign "centre" + } + # -- --- --- --- --- --- + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + set underlines [split $underblock \n] + lassign [blocksize $underblock] _w renderwidth _h renderheight + set overlines [split $overblock \n] + #set overblock_width [tcl::mathfunc::max {*}[lmap v $overlines {punk::ansi::printing_length $v}]] + lassign [blocksize $overblock] _w overblock_width _h overblock_height + set under_exposed_max [expr {max(0,$renderwidth - $overblock_width)}] + + switch -- $opt_blockalign { + left { + set left_exposed 0 + } + right { + set left_exposed $under_exposed_max + } + centre { + if {$under_exposed_max > 0} { + #background block is wider + if {$under_exposed_max % 2 == 0} { + #even left/right exposure + set left_exposed [expr {$under_exposed_max / 2}] + } else { + set beforehalf [expr {$under_exposed_max / 2}] ;#1 less than half due to integer division + if {[tcl::string::tolower [tcl::dict::get $opts -blockalignbias]] eq "left"} { + set left_exposed $beforehalf + } else { + #bias to the right + set left_exposed [expr {$beforehalf + 1}] + } + } + } else { + set left_exposed 0 + } + } + default { + set left_exposed 0 + } + } + + + + set outputlines [list] + if {[punk::ansi::ta::detect_sgr [lindex $overlines 0]]} { + set replay_codes "[punk::ansi::a]" + } else { + set replay_codes "" + } + set replay_codes_underlay "" + set replay_codes_overlay "" + foreach undertext $underlines overtext $overlines { + set overtext_datalen [punk::ansi::printing_length $overtext] + set ulen [punk::ansi::printing_length $undertext] + if {$ulen < $renderwidth} { + set udiff [expr {$renderwidth - $ulen}] + #puts xxx + append undertext [string repeat { } $udiff] + } + if {$overtext_datalen < $overblock_width} { + set odiff [expr {$overblock_width - $overtext_datalen}] + switch -- $opt_textalign { + left { + set startoffset 0 + } + right { + set startoffset $odiff + } + default { + set half [expr {$odiff / 2}] + #set lhs [string repeat { } $half] + #set righthalf [expr {$odiff - $half}] ;#remainder - may be one more - so we are biased left + #set rhs [string repeat { } $righthalf] + set startoffset $half + } + } + } else { + set startoffset 0 ;#negative? + } + + set undertext $replay_codes_underlay$undertext + set overtext $replay_codes_overlay$overtext + + set overflowlength [expr {$overtext_datalen - $renderwidth}] + if {$overflowlength > 0} { + #raw overtext wider than undertext column + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 -expand_right $opt_overflow -startcolumn [expr {1 + $startoffset}] $undertext $overtext] + set replay_codes [tcl::dict::get $rinfo replay_codes] + set rendered [tcl::dict::get $rinfo result] + set overflow_right [tcl::dict::get $rinfo overflow_right] + set unapplied [tcl::dict::get $rinfo unapplied] + if {!$opt_overflow} { + if {$opt_ellipsis} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + #don't use tcl::string::range on ANSI data + #set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + set lostdata "" + if {$overflow_right ne ""} { + append lostdata $overflow_right + } + if {$unapplied ne ""} { + append lostdata $unapplied + } + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set rendered [overtype::block -blockalign right $rendered $opt_ellipsistext] + } + } + + #if {$opt_ellipsis} { + # set show_ellipsis 1 + # if {!$opt_ellipsiswhitespace} { + # #we don't want ellipsis if only whitespace was lost + # set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + # if {[tcl::string::trim $lostdata] eq ""} { + # set show_ellipsis 0 + # } + # } + # if {$show_ellipsis} { + # set ellipsis [tcl::string::cat $replay_codes $opt_ellipsistext] + # #todo - overflow on left if allign = right?? + # set rendered [overtype::right $rendered $ellipsis] + # } + #} + } + lappend outputlines $rendered + } else { + #padded overtext + #lappend outputlines [renderline -insert_mode 0 -transparent $opt_transparent -startcolumn [expr {$left_exposed + 1}] $undertext $overtext] + #Note - we still need expand_right here - as although the overtext is short - it may oveflow due to the startoffset + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -expand_right $opt_overflow -startcolumn [expr {$left_exposed + 1 + $startoffset}] $undertext $overtext] + #puts stderr "--> [ansistring VIEW -lf 1 -nul 1 $rinfo] <--" + set overflow_right [tcl::dict::get $rinfo overflow_right] + set unapplied [tcl::dict::get $rinfo unapplied] + lappend outputlines [tcl::dict::get $rinfo result] + } + set replay_codes [tcl::dict::get $rinfo replay_codes] + set replay_codes_underlay [tcl::dict::get $rinfo replay_codes_underlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + } + + return [join $outputlines \n] + } + + variable optimise_ptruns 10 ;# can be set to zero to disable the ptruns branches + + # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### + # renderline written from a left-right line orientation perspective as a first-shot at getting something useful. + # ultimately right-to-left, top-to-bottom and bottom-to-top are probably needed. + # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### + # + # + #-returnextra enables returning of overflow and length + #review - use punk::ansi::ta::detect to short-circuit processing and do simpler string calcs as an optimisation? + #review - DECSWL/DECDWL double width line codes - very difficult/impossible to align and compose with other elements + #(could render it by faking it with sixels and a lot of work - find/make a sixel font and ensure it's exactly 2 cols per char? + # This would probably be impractical to support for different fonts) + #todo - review transparency issues with single/double width characters + #bidi - need a base direction and concept of directional runs for RTL vs LTR - may be best handled at another layer? + proc renderline {args} { + #*** !doctools + #[call [fun overtype::renderline] [arg args] ] + #[para] renderline is the core engine for overtype string processing (frames & textblocks), and the raw mode commandline repl for the Tcl Punk Shell + #[para] It is also a central part of an ansi (micro) virtual terminal-emulator of sorts + #[para] This system does a half decent job at rendering 90's ANSI art to manipulable colour text blocks that can be joined & framed for layout display within a unix or windows terminal + #[para] Renderline helps maintain ANSI text styling reset/replay codes so that the styling of one block doesn't affect another. + #[para] Calling on the punk::ansi library - it can coalesce codes to keep the size down. + #[para] It is a giant mess of doing exactly what common wisdom says not to do... lots at once. + #[para] renderline is part of the Unicode and ANSI aware Overtype system which 'renders' a block of text onto a static underlay + #[para] The underlay is generally expected to be an ordered set of lines or a rectangular text block analogous to a terminal screen - but it can also be ragged in line length, or just blank. + #[para] The overlay couuld be similar - in which case it may often be used to overwrite a column or section of the underlay. + #[para] The overlay could however be a sequence of ANSI-laden text that jumps all over the place. + # + #[para] renderline itself only deals with a single line - or sometimes a single character. It is generally called from a loop that does further terminal-like or textblock processing. + #[para] By suppyling the -info 1 option - it can return various fields indicating the state of the render. + #[para] The main 3 are the result, overflow_right, and unapplied. + #[para] Renderline handles cursor movements from either keystrokes or ANSI sequences but for a full system the aforementioned loop will need to be in place to manage the set of lines under manipulation. + + #puts stderr "renderline '$args'" + variable optimise_ptruns + + if {[llength $args] < 2} { + error {usage: ?-info 0|1? ?-startcolumn ? ?-cursor_column ? ?-cursor_row |""? ?-transparent [0|1|]? ?-expand_right [1|0]? undertext overtext} + } + set under [lindex $args end-1] + set over [lindex $args end] + #lassign [lrange $args end-1 end] under over + if {[string last \n $under] >= 0} { + error "overtype::renderline not allowed to contain newlines in undertext" + } + #if {[string first \n $over] >=0 || [string first \n $under] >= 0} { + # error "overtype::renderline not allowed to contain newlines" + #} + + #generally faster to create a new dict in the proc than to use a namespace variable to create dict once and link to variable (2024 8.6/8.7) + set opts [tcl::dict::create\ + -etabs 0\ + -width \uFFEF\ + -expand_right 0\ + -transparent 0\ + -startcolumn 1\ + -cursor_column 1\ + -cursor_row ""\ + -insert_mode 1\ + -crm_mode 0\ + -autowrap_mode 1\ + -reverse_mode 0\ + -info 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -cursor_restore_attributes ""\ + -cp437 0\ + -experimental {}\ + ] + #-cursor_restore_attributes only - for replay stack - position and actual setting/restoring handled by throwback to caller + + #cursor_row, when numeric will allow detection of certain row moves that are still within our row - allowing us to avoid an early return + #An empty string for cursor_row tells us we have no info about our own row context, and to return with an unapplied string if any row move occurs + + #exposed1 and exposed2 for first and second col of underying 2wide char which is truncated by transparency, currsor movements to 2nd charcol, or overflow/expand_right + #todo - return info about such grapheme 'cuts' in -info structure and/or create option to raise an error + + set argsflags [lrange $args 0 end-2] + tcl::dict::for {k v} $argsflags { + switch -- $k { + -experimental - -cp437 - -width - -expand_right - -transparent - -startcolumn - -cursor_column - -cursor_row + - -crm_mode - -insert_mode - -autowrap_mode - -reverse_mode + - -info - -exposed1 - -exposed2 - -cursor_restore_attributes { + tcl::dict::set opts $k $v + } + default { + error "overtype::renderline unknown option '$k'. Known options: [tcl::dict::keys $opts]" + } + } + } + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_width [tcl::dict::get $opts -width] + set opt_etabs [tcl::dict::get $opts -etabs] + set opt_expand_right [tcl::dict::get $opts -expand_right] + set opt_colstart [tcl::dict::get $opts -startcolumn] ;#lhs limit for overlay - an offset to cursor_column - first visible column is 1. 0 or < 0 are before the start of the underlay + set opt_colcursor [tcl::dict::get $opts -cursor_column];#start cursor column relative to overlay + set opt_row_context [tcl::dict::get $opts -cursor_row] + if {[string length $opt_row_context]} { + if {![tcl::string::is integer -strict $opt_row_context] || $opt_row_context <1 } { + error "overtype::renderline -cursor_row must be empty for unspecified/unknown or a non-zero positive integer. received: '$opt_row_context'" + } + } + # -- --- --- --- --- --- --- --- --- --- --- --- + #The _mode flags correspond to terminal modes that can be set/reset via escape sequences (e.g DECAWM wraparound mode) + set opt_insert_mode [tcl::dict::get $opts -insert_mode];#should usually be 1 for each new line in editor mode but must be initialised to 1 externally (review) + #default is for overtype + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_autowrap_mode [tcl::dict::get $opts -autowrap_mode] ;#DECAWM - char or movement can go beyond leftmost/rightmost col to prev/next line + set opt_reverse_mode [tcl::dict::get $opts -reverse_mode] ;#DECSNM + set opt_crm_mode [tcl::dict::get $opts -crm_mode];# CRM - show control character mode + # -- --- --- --- --- --- --- --- --- --- --- --- + set temp_cursor_saved [tcl::dict::get $opts -cursor_restore_attributes] + + set cp437_glyphs [tcl::dict::get $opts -cp437] + set cp437_map [tcl::dict::create] + if {$cp437_glyphs} { + set cp437_map [set ::punk::ansi::cp437_map] + #for cp437 images we need to map these *after* splitting ansi + #some old files might use newline for its glyph.. but we can't easily support that. + #Not sure how old files did it.. maybe cr lf in sequence was newline and any lone cr or lf were displayed as glyphs? + tcl::dict::unset cp437_map \n + } + + set opt_transparent [tcl::dict::get $opts -transparent] + if {$opt_transparent eq "0"} { + set do_transparency 0 + } else { + set do_transparency 1 + if {$opt_transparent eq "1"} { + set opt_transparent {[\s]} + } + } + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_returnextra [tcl::dict::get $opts -info] + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + # -- --- --- --- --- --- --- --- --- --- --- --- + + if {$opt_row_context eq ""} { + set cursor_row 1 + } else { + set cursor_row $opt_row_context + } + + set insert_mode $opt_insert_mode ;#default 1 + set autowrap_mode $opt_autowrap_mode ;#default 1 + set crm_mode $opt_crm_mode ;#default 0 (Show Control Character mode) + set reverse_mode $opt_reverse_mode + + #----- + # + if {[info exists punk::console::tabwidth]} { + #punk console is updated if punk::console::set_tabstop_width is used or rep is started/restarted + #It is way too slow to test the current width by querying the terminal here - so it could conceivably get out of sync + #todo - we also need to handle the new threaded repl where console config is in a different thread. + # - also - concept of sub-regions being mini-consoles with their own settings - 'id' for console, or use in/out channels as id? + set tw $::punk::console::tabwidth + } else { + set tw 8 + } + + set overdata $over + if {!$cp437_glyphs} { + #REVIEW! tabify will give different answers for an ANSI colourised string vs plain text + if {!$opt_etabs} { + if {[string first \t $under] >= 0} { + #set under [textutil::tabify::untabify2 $under] + set under [textutil::tabify::untabifyLine $under $tw] + } + if {[string first \t $over] >= 0} { + #set overdata [textutil::tabify::untabify2 $over] + set overdata [textutil::tabify::untabifyLine $over $tw] + } + } + } + #------- + + #ta_detect ansi and do simpler processing? + + #we repeat tests for grapheme width in different loops - rather than create another datastructure to store widths based on column, + #we'll use the grapheme_width_cached function as a lookup table of all graphemes encountered - as there will often be repeats in different positions anyway. + + # -- --- --- --- --- --- --- --- + if {$under ne ""} { + if {[punk::ansi::ta::detect $under]} { + set undermap [punk::ansi::ta::split_codes_single $under] + } else { + #single plaintext part + set undermap [list $under] + } + } else { + set undermap [list] + } + set understacks [list] + set understacks_gx [list] + set pm_list [list] + + set i_u -1 ;#underlay may legitimately be empty + set undercols [list] + set u_codestack [list] + #u_gx_stack probably isn't really a stack - I don't know if g0 g1 can stack or not - for now we support only g0 anyway + set u_gx_stack [list] ;#separate stack for g0 (g1 g2 g3?) graphics on and off (DEC special graphics) + #set pt_underchars "" ;#for string_columns length calculation for expand_right 0 truncation + set remainder [list] ;#for returnextra + foreach {pt code} $undermap { + #pt = plain text + #append pt_underchars $pt + if {$pt ne ""} { + if {$cp437_glyphs} { + set pt [tcl::string::map $cp437_map $pt] + } + set is_ptrun 0 + if {$optimise_ptruns && [tcl::string::length $pt] >= $optimise_ptruns} { + set p1 [tcl::string::index $pt 0] + set hex [format %x [scan $p1 %c]] ;#punk::char::char_hex + set re [tcl::string::cat {^[} \\U$hex {]+$}] + set is_ptrun [regexp $re $pt] + } + if {$is_ptrun} { + #switch -- $p1 { + # " " - - - _ - ! - @ - # - $ - % - ^ - & - * - = - + - : - . - , - / - | - ? - + # a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - + # z - A - B - C - D - E - F - G - H - I - J - K - L - M - N - O - P - Q - R - S - T - U - V - W - X - Y - Z - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 { + # set width 1 + # } + # default { + # if {$p1 eq "\u0000"} { + # #use null as empty cell representation - review + # #use of this will probably collide with some application at some point + # #consider an option to set the empty cell character + # set width 1 + # } else { + # set width [grapheme_width_cached $p1] ;# when zero??? + # } + # } + #} + set width [grapheme_width_cached $p1] ;# when zero??? + set ptlen [string length $pt] + if {$width <= 1} { + #review - 0 and 1? + incr i_u $ptlen + lappend understacks {*}[lrepeat $ptlen $u_codestack] + lappend understacks_gx {*}[lrepeat $ptlen $u_gx_stack] + lappend undercols {*}[lrepeat $ptlen $p1] + } else { + incr i_u $ptlen ;#2nd col empty str - so same as above + set 2ptlen [expr {$ptlen * 2}] + lappend understacks {*}[lrepeat $2ptlen $u_codestack] + lappend understacks_gx {*}[lrepeat $2ptlen $u_gx_stack] + set l [concat {*}[lrepeat $ptlen [list $p1 ""]] + lappend undercols {*}$l + unset l + } + + } else { + foreach grapheme [punk::char::grapheme_split $pt] { + #an ugly but easy hack to serve *some* common case ascii quickly with byte-compiled literal switch - feels dirty. + #.. but even 0.5uS per char (grapheme_width_cached) adds up quickly when stitching lots of lines together. + #todo - test decimal value instead, compare performance + switch -- $grapheme { + " " - - - _ - ! - @ - # - $ - % - ^ - & - * - = - + - : - . - , - / - | - ? - + a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - + z - A - B - C - D - E - F - G - H - I - J - K - L - M - N - O - P - Q - R - S - T - U - V - W - X - Y - Z - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 { + set width 1 + } + default { + if {$grapheme eq "\u0000"} { + #use null as empty cell representation - review + #use of this will probably collide with some application at some point + #consider an option to set the empty cell character + set width 1 + } else { + #zero width still acts as 1 below??? review what should happen + set width [grapheme_width_cached $grapheme] + #we still want most controls and other zero-length codepoints such as \u200d (zero width joiner) to stay zero-length + #we substitute lone ESC that weren't captured within ANSI context as a debugging aid to see malformed ANSI + #todo - default to off and add a flag (?) to enable this substitution + set sub_stray_escapes 0 + if {$sub_stray_escapes && $width == 0} { + if {$grapheme eq "\x1b"} { + set gvis [ansistring VIEW $grapheme] ;#can only use with graphemes that have a single replacement char.. + set grapheme $gvis + set width 1 + } + } + } + } + } + + #set width [grapheme_width_cached $grapheme] + incr i_u + lappend understacks $u_codestack + lappend understacks_gx $u_gx_stack + + lappend undercols $grapheme + if {$width > 1} { + #presumably there are no triple-column or wider unicode chars.. until the aliens arrive.(?) + #but what about emoji combinations etc - can they be wider than 2? + #todo - if -etabs enabled - then we treat \t as the width determined by our elastic tabstop + incr i_u + lappend understacks $u_codestack + lappend understacks_gx $u_gx_stack + lappend undercols "" + } + } + + } + } + #underlay should already have been rendered and not have non-sgr codes - but let's retain the check for them and not stack them if other codes are here + + #only stack SGR (graphics rendition) codes - not title sets, cursor moves etc + #keep any remaining PMs in place + if {$code ne ""} { + set c1c2 [tcl::string::range $code 0 1] + + set leadernorm [tcl::string::range [tcl::string::map [list\ + \x1b\[ 7CSI\ + \x9b 8CSI\ + \x1b\( 7GFX\ + \x1b^ 7PMX\ + \x1bX 7SOS\ + ] $c1c2] 0 3];# leadernorm is 1st 2 chars mapped to normalised indicator - or is original 2 chars + + switch -- $leadernorm { + 7CSI - 8CSI { + #need to exclude certain leaders after the lb e.g < for SGR 1006 mouse + #REVIEW - what else could end in m but be mistaken as a normal SGR code here? + set maybemouse "" + if {[tcl::string::index $c1c2 0] eq "\x1b"} { + set maybemouse [tcl::string::index $code 2] + } + + if {$maybemouse ne "<" && [tcl::string::index $code end] eq "m"} { + if {[punk::ansi::codetype::is_sgr_reset $code]} { + set u_codestack [list "\x1b\[m"] + } elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + set u_codestack [list $code] + } else { + #basic simplification first.. straight dups + set dup_posns [lsearch -all -exact $u_codestack $code] ;#-exact because of square-bracket glob chars + set u_codestack [lremove $u_codestack {*}$dup_posns] + lappend u_codestack $code + } + } + } + 7GFX { + switch -- [tcl::string::index $code 2] { + "0" { + set u_gx_stack [list gx0_on] ;#we'd better use a placeholder - or debugging will probably get into a big mess + } + B { + set u_gx_stack [list] + } + } + } + 7PMX - 7SOS { + #we can have PMs or SOS (start of string) in the underlay - though mostly the PMs should have been processed.. + #attach the PM/SOS (entire ANSI sequence) to the previous grapheme! + #It should not affect the size - but terminal display may get thrown out if terminal doesn't support them. + + #note that there may in theory already be ANSI stored - we don't assume it was a pure grapheme string + set graphemeplus [lindex $undercols end] + if {$graphemeplus ne "\0"} { + append graphemeplus $code + } else { + set graphemeplus $code + } + lset undercols end $graphemeplus + #The grapheme_width_cached function will be called on this later - and doesn't account for ansi. + #we need to manually cache the item with it's proper width + variable grapheme_widths + #stripped and plus version keys pointing to same length + dict set grapheme_widths $graphemeplus [grapheme_width_cached [::punk::ansi::ansistrip $graphemeplus]] + + } + default { + + } + + } + + #if {[punk::ansi::codetype::is_sgr_reset $code]} { + # #set u_codestack [list] + #} elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + #} elseif {[punk::ansi::codetype::is_sgr $code]} { + #} else { + # #leave SGR stack as is + # if {[punk::ansi::codetype::is_gx_open $code]} { + # } elseif {[punk::ansi::codetype::is_gx_close $code]} { + # } + #} + } + #consider also if there are other codes that should be stacked..? + } + + #NULL empty cell indicator + if {$opt_width ne "\uFFEF"} { + if {[llength $understacks]} { + set cs $u_codestack + set gs $u_gx_stack + } else { + set cs [list] + set gs [list] + } + if {[llength $undercols]< $opt_width} { + set diff [expr {$opt_width- [llength $undercols]}] + if {$diff > 0} { + #set undercols [list {*}$undercols {*}[lrepeat $diff "\u0000"]] ;#2024 - much slower + lappend undercols {*}[lrepeat $diff "\u0000"] + lappend understacks {*}[lrepeat $diff $cs] + lappend understacks_gx {*}[lrepeat $diff $gs] + } + } + } + + if {$opt_width ne "\uFFEF"} { + set renderwidth $opt_width + } else { + set renderwidth [llength $undercols] + } + + + if 0 { + # ----------------- + # if we aren't extending understacks & understacks_gx each time we incr idx above the undercols length.. this doesn't really serve a purpose + # Review. + # ----------------- + #replay code for last overlay position in input line + # whether or not we get that far - we need to return it for possible replay on next line + if {[llength $understacks]} { + lappend understacks $u_codestack + lappend understacks_gx $u_gx_stack + } else { + #in case overlay onto emptystring as underlay + lappend understacks [list] + lappend understacks_gx [list] + } + # ----------------- + } + + #trailing codes in effect for underlay + if {[llength $u_codestack]} { + #set replay_codes_underlay [join $u_codestack ""] + set replay_codes_underlay [punk::ansi::codetype::sgr_merge_list {*}$u_codestack] + } else { + set replay_codes_underlay "" + } + + + # -- --- --- --- --- --- --- --- + #### + #if opt_colstart - we need to build a space (or any singlewidth char ?) padding on the left containing the right number of columns. + #this will be processed as transparent - and handle doublewidth underlay characters appropriately + set startpadding [string repeat " " [expr {$opt_colstart -1}]] + #overdata with left padding spaces based on col-start under will show through for left-padding portion regardless of -transparency + if {$startpadding ne "" || $overdata ne ""} { + if {[punk::ansi::ta::detect $overdata]} { + set overmap [punk::ansi::ta::split_codes_single $startpadding$overdata] + } else { + #single plaintext part + set overmap [list $startpadding$overdata] + } + } else { + set overmap [list] + } + #### + + + #todo - detect plain ascii no-ansi input line (aside from leading ansi reset) + #will that allow some optimisations? + + #todo - detect repeated transparent char in overlay + #regexp {^(.)\1+$} ;#backtracking regexp - relatively slow. + # - try set c [string index $data 0]; regexp [string map [list %c% $c] {^[%c%]+$}] $data + #we should be able to optimize to pass through the underlay?? + + #??? + set colcursor $opt_colstart + #TODO - make a little virtual column object + #we need to refer to column1 or columnmin? or columnmax without calculating offsets due to to startcolumn + #need to lock-down what start column means from perspective of ANSI codes moving around - the offset perspective is unclear and a mess. + + + #set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} + #as at 2024-02 punk::char::grapheme_split uses these - not aware of more complex graphemes + + set overstacks [list] + set overstacks_gx [list] + + set o_codestack [list]; #SGR codestack (not other codes such as movement,insert key etc) + set o_gxstack [list] + set pt_overchars "" + set i_o 0 + set overlay_grapheme_control_list [list] ;#tag each with g, sgr or other. 'other' are things like cursor-movement or insert-mode or codes we don't recognise/use + #experiment + set overlay_grapheme_control_stacks [list] + foreach {pt code} $overmap { + if {$pt ne ""} { + #todo - wrap in test for empty pt (we used split_codes_single - and it may be common for sgr sequences to be unmerged and so have empty pts between) + if {$cp437_glyphs} { + set pt [tcl::string::map $cp437_map $pt] + } + append pt_overchars $pt + #will get empty pt between adjacent codes + if {!$crm_mode} { + + set is_ptrun 0 + if {$optimise_ptruns && [tcl::string::length $pt] >= $optimise_ptruns} { + set p1 [tcl::string::index $pt 0] + set re [tcl::string::cat {^[} \\U[format %x [scan $p1 %c]] {]+$}] + set is_ptrun [regexp $re $pt] + + #leading only? we would have to check for graphemes at the trailing boundary? + #set re [tcl::string::cat {^[} \\U[format %x [scan $p1 %c]] {]+}] + #set is_ptrun [regexp -indices $re $pt runrange] + #if {$is_ptrun && 1} { + #} + } + if {$is_ptrun} { + #review - in theory a run over a certain length won't be part of some grapheme combo (graphemes can be long e.g 44?, but not as runs(?)) + #could be edge cases for runs at line end? (should be ok as we include trailing \n in our data) + set len [string length $pt] + set g_element [list g $p1] + + #lappend overstacks {*}[lrepeat $len $o_codestack] + #lappend overstacks_gx {*}[lrepeat $len $o_gxstack] + #incr i_o $len + #lappend overlay_grapheme_control_list {*}[lrepeat $len [list g $p1]] + #lappend overlay_grapheme_control_stacks {*}[lrepeat $len $o_codestack] + + set pi 0 + incr i_o $len + while {$pi < $len} { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + lappend overlay_grapheme_control_list $g_element + lappend overlay_grapheme_control_stacks $o_codestack + incr pi + } + } else { + foreach grapheme [punk::char::grapheme_split $pt] { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + incr i_o + lappend overlay_grapheme_control_list [list g $grapheme] + lappend overlay_grapheme_control_stacks $o_codestack + } + } + } else { + set tsbegin [clock micros] + foreach grapheme_original [punk::char::grapheme_split $pt] { + set pt_crm [ansistring VIEW -nul 1 -lf 2 -vt 2 -ff 2 $grapheme_original] + #puts stderr "ptlen [string length $pt] graphemelen[string length $grapheme_original] pt_crmlen[string length $pt_crm] $pt_crm" + foreach grapheme [punk::char::grapheme_split $pt_crm] { + if {$grapheme eq "\n"} { + lappend overlay_grapheme_control_stacks $o_codestack + lappend overlay_grapheme_control_list [list crmcontrol "\x1b\[00001E"] + } else { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + incr i_o + lappend overlay_grapheme_control_list [list g $grapheme] + lappend overlay_grapheme_control_stacks $o_codestack + } + } + } + set elapsed [expr {[clock micros] - $tsbegin}] + puts stderr "ptlen [string length $pt] elapsedus:$elapsed" + } + } + + #only stack SGR (graphics rendition) codes - not title sets, cursor moves etc + #order of if-else based on assumptions: + # that pure resets are fairly common - more so than leading resets with other info + # that non-sgr codes are not that common, so ok to check for resets before verifying it is actually SGR at all. + if {$code ne ""} { + #we need to immediately set crm_mode here if \x1b\[3h received + if {$code eq "\x1b\[3h"} { + set crm_mode 1 + } elseif {$code eq "\x1b\[3l"} { + set crm_mode 0 + } + #else crm_mode could be set either way from options + if {$crm_mode && $code ne "\x1b\[00001E"} { + #treat the code as type 'g' like above - only allow through codes to reset mode REVIEW for now just \x1b\[3l ? + #we need to somehow convert further \n in the graphical rep to an instruction for newline that will bypass further crm_mode processing or we would loop. + set code_as_pt [ansistring VIEW -nul 1 -lf 1 -vt 1 -ff 1 $code] + #split using standard split for first foreach loop - grapheme based split when processing 2nd foreach loop + set chars [split $code_as_pt ""] + set codeparts [list] ;#list of 2-el lists each element {crmcontrol } or {g } + foreach c $chars { + if {$c eq "\n"} { + #use CNL (cursor next line) \x1b\[00001E ;#leading zeroes ok for this processor - used as debugging aid to distinguish + lappend codeparts [list crmcontrol "\x1b\[00001E"] + } else { + if {[llength $codeparts] > 0 && [lindex $codeparts end 0] eq "g"} { + set existing [lindex $codeparts end 1] + lset codeparts end [list g [string cat $existing $c]] + } else { + lappend codeparts [list g $c] + } + } + } + + set partidx 0 + foreach record $codeparts { + lassign $record rtype rval + switch -exact -- $rtype { + g { + append pt_overchars $rval + foreach grapheme [punk::char::grapheme_split $rval] { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + incr i_o + lappend overlay_grapheme_control_list [list g $grapheme] + lappend overlay_grapheme_control_stacks $o_codestack + } + } + crmcontrol { + #leave o_codestack + lappend overlay_grapheme_control_stacks $o_codestack + lappend overlay_grapheme_control_list [list crmcontrol $rval] + } + } + } + } else { + lappend overlay_grapheme_control_stacks $o_codestack + #there will always be an empty code at end due to foreach on 2 vars with odd-sized list ending with pt (overmap coming from perlish split) + if {[punk::ansi::codetype::is_sgr_reset $code]} { + set o_codestack [list "\x1b\[m"] ;#reset better than empty list - fixes some ansi art issues + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + set o_codestack [list $code] + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[priv::is_sgr $code]} { + #basic simplification first - remove straight dupes + set dup_posns [lsearch -all -exact $o_codestack $code] ;#must be -exact because of square-bracket glob chars + set o_codestack [lremove $o_codestack {*}$dup_posns] + lappend o_codestack $code + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[regexp {\x1b7|\x1b\[s} $code]} { + #experiment + #cursor_save - for the replays review. + #jmn + #set temp_cursor_saved [punk::ansi::codetype::sgr_merge_list {*}$o_codestack] + lappend overlay_grapheme_control_list [list other $code] + } elseif {[regexp {\x1b8|\x1b\[u} $code]} { + #experiment + #cursor_restore - for the replays + set o_codestack [list $temp_cursor_saved] + lappend overlay_grapheme_control_list [list other $code] + } else { + #review + if {[punk::ansi::codetype::is_gx_open $code]} { + set o_gxstack [list "gx0_on"] + lappend overlay_grapheme_control_list [list gx0 gx0_on] ;#don't store code - will complicate debugging if we spit it out and jump character sets + } elseif {[punk::ansi::codetype::is_gx_close $code]} { + set o_gxstack [list] + lappend overlay_grapheme_control_list [list gx0 gx0_off] ;#don't store code - will complicate debugging if we spit it out and jump character sets + } else { + lappend overlay_grapheme_control_list [list other $code] + } + } + } + } + + } + #replay code for last overlay position in input line - should take account of possible trailing sgr code after last grapheme + set max_overlay_grapheme_index [expr {$i_o -1}] + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + + #set replay_codes_overlay [join $o_codestack ""] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}$o_codestack] + + #if {[tcl::dict::exists $overstacks $max_overlay_grapheme_index]} { + # set replay_codes_overlay [join [tcl::dict::get $overstacks $max_overlay_grapheme_index] ""] + #} else { + # set replay_codes_overlay "" + #} + # -- --- --- --- --- --- --- --- + + + #potential problem - combinining diacritics directly following control chars like \r \b + + # -- --- --- + #we need to initialise overflow_idx before any potential row-movements - as they need to perform a loop break and force in_excess to 1 + if {$opt_expand_right} { + #expand_right true means we can have lines as long as we want, but either way there can be excess data that needs to be thrown back to the calling loop. + #we currently only support horizontal expansion to the right (review regarding RTL text!) + set overflow_idx -1 + } else { + #expand_right zero - we can't grow beyond our column width - so we get ellipsis or truncation + if {$opt_width ne "\uFFEF"} { + set overflow_idx [expr {$opt_width}] + } else { + #review - this is also the cursor position when adding a char at end of line? + set overflow_idx [expr {[llength $undercols]}] ;#index at which we would be *in* overflow a row move may still override it + } + } + # -- --- --- + + set outcols $undercols ;#leave undercols as is, outcols can potentially be appended to. + + set unapplied "" ;#if we break for move row (but not for /v ?) + set unapplied_list [list] + + set insert_lines_above 0 ;#return key + set insert_lines_below 0 + set instruction "" + + # -- --- --- + #cursor_save_dec, cursor_restore_dec etc + set cursor_restore_required 0 + set cursor_saved_attributes "" + set cursor_saved_position "" + # -- --- --- + + #set idx 0 ;# line index (cursor - 1) + #set idx [expr {$opt_colstart + $opt_colcursor} -1] + + #idx is the per column output index + set idx [expr {$opt_colcursor -1}] ;#don't use opt_colstart here - we have padded and won't start emitting until idx reaches opt_colstart-1 + #cursor_column is usually one above idx - but we have opt_colstart which is like a margin - todo: remove cursor_column from the following loop and calculate it's offset when breaking or at end. + #(for now we are incrementing/decrementing both in sync - which is a bit silly) + set cursor_column $opt_colcursor + + #idx_over is the per grapheme overlay index + set idx_over -1 + + + #movements only occur within the overlay range. + #an underlay is however not necessary.. e.g + #renderline -expand_right 1 "" data + + #set re_mode {\x1b\[\?([0-9]*)(h|l)} ;#e.g DECAWM + #set re_col_move {\x1b\[([0-9]*)(C|D|G)$} + #set re_row_move {\x1b\[([0-9]*)(A|B)$} + #set re_both_move {\x1b\[([0-9]*)(?:;){0,1}([0-9]*)H$} ;# or "f" ? + #set re_vt_sequence {\x1b\[([0-9]*)(?:;){0,1}([0-9]*)~$} + #set re_cursor_save {\x1b\[s$} ;#note probable incompatibility with DECSLRM (set left right margin)! + #set re_cursor_restore {\x1b\[u$} + #set re_cursor_save_dec {\x1b7$} + #set re_cursor_restore_dec {\x1b8$} + #set re_other_single {\x1b(D|M|E)$} + #set re_decstbm {\x1b\[([0-9]*)(?:;){0,1}([0-9]*)r$} ;#DECSTBM set top and bottom margins + + #puts "-->$overlay_grapheme_control_list<--" + #puts "-->overflow_idx: $overflow_idx" + for {set gci 0} {$gci < [llength $overlay_grapheme_control_list]} {incr gci} { + set gc [lindex $overlay_grapheme_control_list $gci] + lassign $gc type item + + #emit plaintext chars first using existing SGR codes from under/over stack as appropriate + #then check if the following code is a cursor movement within the line and adjust index if so + #foreach ch $overlay_graphemes {} + switch -- $type { + g { + set ch $item + #crm_mode affects both graphic and control + if {0 && $crm_mode} { + set chars [ansistring VIEW -nul 1 -lf 2 -vt 2 -ff 2 $ch] + set chars [string map [list \n "\x1b\[00001E"] $chars] + if {[llength [split $chars ""]] > 1} { + priv::render_unapplied $overlay_grapheme_control_list $gci + #prefix the unapplied controls with the string version of this control + set unapplied_list [linsert $unapplied_list 0 {*}[split $chars ""]] + set unapplied [join $unapplied_list ""] + #incr idx_over + break + } else { + set ch $chars + } + } + incr idx_over; #idx_over (until unapplied reached anyway) is per *grapheme* in the overlay - not per col. + if {($idx < ($opt_colstart -1))} { + incr idx [grapheme_width_cached $ch] + continue + } + #set within_undercols [expr {$idx <= [llength $undercols]-1}] ;#within our active data width + set within_undercols [expr {$idx <= $renderwidth-1}] + + #https://www.enigma.com/resources/blog/the-secret-world-of-newline-characters + #\x85 NEL in the c1 control set is treated by some terminal emulators (e.g Hyper) as a newline, + #on some it's invisble but doesn't change the line, on some it's a visible glyph of width 1. + #This is hard to process in any standard manner - but I think the Hyper behaviour of doing what it was intended is perhaps most reasonable + #We will map it to the same behaviour as lf here for now... but we need also to consider the equivalent ANSI sequence: \x1bE + + set chtest [tcl::string::map [list \n \x85 \b \r \v \x7f ] $ch] + #puts --->chtest:$chtest + #specials - each shoud have it's own test of what to do if it happens after overflow_idx reached + switch -- $chtest { + "" { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + if {$idx == 0} { + #puts "---a at col 1" + #linefeed at column 1 + #leave the overflow_idx ;#? review + set instruction lf_start ;#specific instruction for newline at column 1 + priv::render_unapplied $overlay_grapheme_control_list $gci + break + } elseif {$overflow_idx != -1 && $idx == $overflow_idx} { + #linefeed after final column + #puts "---c at overflow_idx=$overflow_idx" + incr cursor_row + set overflow_idx $idx ;#override overflow_idx even if it was set to -1 due to expand_right = 1 + set instruction lf_overflow ;#only special treatment is to give it it's own instruction in case caller needs to handle differently + priv::render_unapplied $overlay_grapheme_control_list $gci + break + } else { + #linefeed occurred in middle or at end of text + #puts "---mid-or-end-text-linefeed idx:$idx overflow_idx:$overflow_idx" + if {$insert_mode == 0} { + incr cursor_row + if {$idx == -1 || $overflow_idx > $idx} { + #don't set overflow_idx higher if it's already set lower and we're adding graphemes to overflow + set overflow_idx $idx ;#override overflow_idx even if it was set to -1 due to expand_right = 1 + } + set instruction lf_mid + priv::render_unapplied $overlay_grapheme_control_list $gci + break + } else { + incr cursor_row + #don't adjust the overflow_idx + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction lf_mid + break ;# could have overdata following the \n - don't keep processing + } + } + + } + "" { + #will we want/need to use raw for keypresses in terminal? (terminal with LNM in standard reset mode means enter= this is the usual config for terminals) + #So far we are assuming the caller has translated to and handle above.. REVIEW. + + #consider also the old space-carriagereturn softwrap convention used in some terminals. + #In the context of rendering to a block of text - this works similarly in that the space gets eaten so programs emitting space-cr at the terminal width col will pretty much get what they expect. + set idx [expr {$opt_colstart -1}] + set cursor_column $opt_colstart ;#? + } + "" { + #literal backspace char - not necessarily from keyboard + #review - backspace effect on double-width chars - we are taking a column-editing perspective in overtype + #(important for -transparent option - hence replacement chars for half-exposed etc) + #review - overstrike support as per nroff/less (generally considered an old technology replaced by unicode mechanisms and/or ansi SGR) + if {$idx > ($opt_colstart -1)} { + incr idx -1 + incr cursor_column -1 + } else { + set flag 0 + if $flag { + #review - conflicting requirements? Need a different sequence for destructive interactive backspace? + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction backspace_at_start + break + } + } + } + "" { + #literal del character - some terminals send just this for what is generally expected to be a destructive backspace + #We instead treat this as a pure delete at current cursor position - it is up to the repl or terminal to remap backspace key to a sequence that has the desired effect. + priv::render_delchar $idx + } + "" { + #end processing this overline. rest of line is remainder. cursor for column as is. + #REVIEW - this theoretically depends on terminal's vertical tabulation setting (name?) + #e.g it could be configured to jump down 6 rows. + #On the other hand I've seen indications that some modern terminal emulators treat it pretty much as a linefeed. + #todo? + incr cursor_row + set overflow_idx $idx + #idx_over has already been incremented as this is both a movement-control and in some sense a grapheme + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction vt + break + } + default { + if {$overflow_idx != -1} { + #review - how to check arbitrary length item such as tab is going to overflow .. before we get to overflow_idx? + #call grapheme_width_cached on each ch, or look for tab specifically as it's currently the only known reason to have a grapheme width > 2? + #we need to decide what a tab spanning the overflow_idx means and how it affects wrap etc etc + if {$idx == $overflow_idx-1} { + set owidth [grapheme_width_cached $ch] + if {$owidth == 2} { + #review split 2w overflow? + #we don't want to make the decision here to split a 2w into replacement characters at end of line and beginning of next line + #better to consider the overlay char as unable to be applied to the line + #render empty column(?) - and reduce overlay grapheme index by one so that the current ch goes into unapplied + #throwing back to caller with instruction complicates its job - but is necessary to avoid making decsions for it here. + priv::render_addchar $idx "" [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #change the overflow_idx + set overflow_idx $idx + incr idx + incr idx_over -1 ;#set overlay grapheme index back one so that sgr stack from previous overlay grapheme used + priv::render_unapplied $overlay_grapheme_control_list [expr {$gci-1}] ;#note $gci-1 instead of just gci + #throw back to caller's loop - add instruction to caller as this is not the usual case + #caller may for example choose to render a single replacement char to this line and omit the grapheme, or wrap it to the next line + set instruction overflow_splitchar + break + } elseif {$owidth > 2} { + #? tab? + #TODO! + puts stderr "overtype::renderline long overtext grapheme '[ansistring VIEW -lf 1 -vt 1 $ch]' not handled" + #tab of some length dependent on tabstops/elastic tabstop settings? + } + } elseif {$idx >= $overflow_idx} { + #REVIEW + set next_gc [lindex $overlay_grapheme_control_list $gci+1] ;#next grapheme or control + lassign $next_gc next_type next_item + if {$autowrap_mode || $next_type ne "g"} { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci-1]] + #set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #don't incr idx beyond the overflow_idx + #idx_over already incremented - decrement so current overlay grapheme stacks go to unapplied + incr idx_over -1 + #priv::render_unapplied $overlay_grapheme_control_list [expr {$gci-1}] ;#back one index here too + priv::render_this_unapplied $overlay_grapheme_control_list $gci ;# + set instruction overflow + break + } else { + #no point throwing back to caller for each grapheme that is overflowing + #without this branch - renderline would be called with overtext reducing only by one grapheme per call + #processing a potentially long overtext each time (ie - very slow) + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #JMN4 + + } + } + } else { + #review. + #overflow_idx = -1 + #This corresponds to expand_right being true (at least until overflow_idx is in some cases forced to a value when throwing back to calling loop) + } + + if {($do_transparency && [regexp $opt_transparent $ch])} { + #pre opt_colstart is effectively transparent (we have applied padding of required number of columns to left of overlay) + if {$idx > [llength $outcols]-1} { + lappend outcols " " + #tcl::dict::set understacks $idx [list] ;#review - use idx-1 codestack? + #lset understacks $idx [list] ;#will get index $i out of range error + lappend understacks [list] ;#REVIEW + incr idx + incr cursor_column + } else { + #todo - punk::char::char_width + set g [lindex $outcols $idx] + #JMN + set uwidth [grapheme_width_cached $g] + if {[lindex $outcols $idx] eq ""} { + #2nd col of 2-wide char in underlay + incr idx + incr cursor_column + } elseif {$uwidth == 0} { + #e.g control char ? combining diacritic ? + incr idx + incr cursor_column + } elseif {$uwidth == 1} { + set owidth [grapheme_width_cached $ch] + incr idx + incr cursor_column + if {$owidth > 1} { + incr idx + incr cursor_column + } + } elseif {$uwidth > 1} { + if {[grapheme_width_cached $ch] == 1} { + if {!$insert_mode} { + #normal singlewide transparent overlay onto double-wide underlay + set next_pt_overchar [tcl::string::index $pt_overchars $idx_over+1] ;#lookahead of next plain-text char in overlay + if {$next_pt_overchar eq ""} { + #special-case trailing transparent - no next_pt_overchar + incr idx + incr cursor_column + } else { + if {[regexp $opt_transparent $next_pt_overchar]} { + incr idx + incr cursor_column + } else { + #next overlay char is not transparent.. first-half of underlying 2wide char is exposed + #priv::render_addchar $idx $opt_exposed1 [tcl::dict::get $overstacks $idx_over] [tcl::dict::get $overstacks_gx $idx_over] $insert_mode + priv::render_addchar $idx $opt_exposed1 [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + } + } + } else { + #? todo - decide what transparency even means for insert mode + incr idx + incr cursor_column + } + } else { + #2wide transparency over 2wide in underlay - review + incr idx + incr cursor_column + } + } + } + } else { + + set idxchar [lindex $outcols $idx] + #non-transparent char in overlay or empty cell + if {$idxchar eq "\u0000"} { + #empty/erased cell indicator + set uwidth 1 + } else { + set uwidth [grapheme_width_cached $idxchar] + } + if {$within_undercols} { + if {$idxchar eq ""} { + #2nd col of 2wide char in underlay + if {!$insert_mode} { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] 0 + #JMN - this has to expose if our startposn chopped an underlay - but not if we already overwrote the first half of the widechar underlay grapheme + #e.g renderline \uFF21\uFF21--- a\uFF23\uFF23 + #vs + # renderline -startcolumn 2 \uFF21---- \uFF23 + if {[lindex $outcols $idx-1] != ""} { + #verified it's an empty following a filled - so it's a legit underlay remnant (REVIEW - when would it not be??) + #reset previous to an exposed 1st-half - but leave understacks code as is + priv::render_addchar [expr {$idx-1}] $opt_exposed1 [lindex $understacks $idx-1] [lindex $understacks_gx $idx-1] 0 + } + incr idx + } else { + set prevcolinfo [lindex $outcols $idx-1] + #for insert mode - first replace the empty 2ndhalf char with exposed2 before shifting it right + #REVIEW - this leaves a replacement character permanently in our columns.. but it is consistent regarding length (?) + #The alternative is to disallow insertion at a column cursor that is at 2nd half of 2wide char + #perhaps by inserting after the char - this may be worthwhile - but may cause other surprises + #It is perhaps best avoided at another level and try to make renderline do exactly as it's told + #the advantage of this 2w splitting method is that the inserted character ends up in exactly the column we expect. + priv::render_addchar $idx $opt_exposed2 [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] 0 ;#replace not insert + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] 1 ;#insert - same index + if {$prevcolinfo ne ""} { + #we've split the 2wide - it may already have been rendered as an exposed1 - but not for example if our startcolumn was current idx + priv::render_addchar [expr {$idx-1}] $opt_exposed1 [lindex $understacks $idx-1] [lindex $understacks_gx $idx-1] 0 ;#replace not insert + } ;# else?? + incr idx + } + if {$cursor_column < [llength $outcols] || $overflow_idx == -1} { + incr cursor_column + } + } elseif {$uwidth == 0} { + #what if this is some other c0/c1 control we haven't handled specifically? + + #by emitting a preceding empty-string column - we associate whatever this char is with the preceeding non-zero-length character and any existing zero-lengths that follow it + #e.g combining diacritic - increment before over char REVIEW + #arguably the previous overchar should have done this - ie lookahead for combiners? + #if we can get a proper grapheme_split function - this should be easier to tidy up. + priv::render_addchar $idx "" [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column 2 + + if {$cursor_column > [llength $outcols] && $overflow_idx != -1} { + set cursor_column [llength $outcols] + } + } elseif {$uwidth == 1} { + #includes null empty cells + set owidth [grapheme_width_cached $ch] + if {$owidth == 1} { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + } else { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + priv::render_addchar $idx "" [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #if next column in underlay empty - we've overwritten first half of underlying 2wide grapheme + #replace with rhs exposure in case there are no more overlay graphemes coming - use underlay's stack + if {([llength $outcols] >= $idx +2) && [lindex $outcols $idx+1] eq ""} { + priv::render_addchar [expr {$idx+1}] $opt_exposed2 [lindex $understacks $idx+1] [lindex $understacks_gx $idx+1] $insert_mode + } + incr idx + } + if {($cursor_column < [llength $outcols]) || $overflow_idx == -1} { + incr cursor_column + } + } elseif {$uwidth > 1} { + set owidth [grapheme_width_cached $ch] + if {$owidth == 1} { + #1wide over 2wide in underlay + if {!$insert_mode} { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + priv::render_addchar $idx $opt_exposed2 [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #don't incr idx - we are just putting a broken-indication in the underlay - which may get overwritten by next overlay char + } else { + #insert mode just pushes all to right - no exposition char here + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + } + } else { + #2wide over 2wide + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx 2 + incr cursor_column 2 + } + + if {$cursor_column > [llength $outcols] && $overflow_idx != -1} { + set cursor_column [llength $outcols] + } + } + } else { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + } + } + } + } ;# end switch + + + } + other - crmcontrol { + if {$crm_mode && $type ne "crmcontrol" && $item ne "\x1b\[00001E"} { + if {$item eq "\x1b\[3l"} { + set crm_mode 0 + } else { + #When our initial overlay split was done - we weren't in crm_mode - so there are codes that weren't mapped to unicode control character representations + #set within_undercols [expr {$idx <= $renderwidth-1}] + #set chars [ansistring VIEW -nul 1 -lf 2 -vt 2 -ff 2 $item] + set chars [ansistring VIEW -nul 1 -lf 1 -vt 1 -ff 1 $item] + priv::render_unapplied $overlay_grapheme_control_list $gci + #prefix the unapplied controls with the string version of this control + set unapplied_list [linsert $unapplied_list 0 {*}[split $chars ""]] + set unapplied [join $unapplied_list ""] + + break + } + } + + #todo - consider CSI s DECSLRM vs ansi.sys \x1b\[s - we need \x1b\[s for oldschool ansi art - but may have to enable only for that. + #we should possibly therefore reverse this mapping so that x1b7 x1b8 are the primary codes for save/restore? + set code [tcl::string::map [list \x1b7 \x1b\[s \x1b8 \x1b\[u ] $item] + #since this element isn't a grapheme - advance idx_over to next grapheme overlay when about to fill 'unapplied' + + + #remap of DEC cursor_save/cursor_restore from ESC sequence to equivalent CSI + #probably not ideal - consider putting cursor_save/cursor_restore in functions so they can be called from the appropriate switch branch instead of using this mapping + #review - cost/benefit of function calls within these switch-arms instead of inline code? + + set c1 [tcl::string::index $code 0] + set c1c2c3 [tcl::string::range $code 0 2] + #set re_ST_open {(?:\033P|\u0090|\033X|\u0098|\033\^|\u009e|\033_|\u009f)} + #tcl 8.7 - faster to use inline list than to store it in a local var outside of loop. + #(somewhat surprising) + set leadernorm [tcl::string::range [tcl::string::map [list\ + \x1b\[< 1006\ + \x1b\[ 7CSI\ + \x1bY 7MAP\ + \x1bP 7DCS\ + \x90 8DCS\ + \x9b 8CSI\ + \x1b\] 7OSC\ + \x9d 8OSC\ + \x1b 7ESC\ + ] $c1c2c3] 0 3] ;#leadernorm is 1st 1,2 or 3 chars mapped to 4char normalised indicator - or is original first chars (1,2 or 3 len) + + #we leave the tail of the code unmapped for now + switch -- $leadernorm { + 1006 { + #https://invisible-island.net/xterm/ctlseqs/ctlseqs.html + #SGR (1006) CSI < followed by colon separated encoded-button-value,px,py ordinates and final M for button press m for button release + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 3 end]] + } + 7CSI - 7OSC { + #set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 2 end]] + set codenorm $leadernorm[tcl::string::range $code 2 end] + } + 7DCS { + #ESC P + #Device Control String https://invisible-island.net/xterm/ctlseqs/ctlseqs.html#h4-Controls-beginning-with-ESC:ESC-F.C74 + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 2 end]] + } + 8DCS { + #8-bit Device Control String + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 1 end]] + } + 7MAP { + #map to another type of code to share implementation branch + set codenorm $leadernorm[tcl::string::range $code 1 end] + } + 7ESC { + #set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 1 end]] + set codenorm $leadernorm[tcl::string::range $code 1 end] + } + 8CSI - 8OSC { + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 1 end]] + } + default { + puts stderr "Sequence detected as ANSI, but not handled in leadernorm switch. code: [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + #we haven't made a mapping for this + #could in theory be 1,2 or 3 in len + #although we shouldn't be getting here if the regexp for ansi codes is kept in sync with our switch branches + set codenorm $code + } + } + + switch -- $leadernorm { + 7MAP { + switch -- [lindex $codenorm 4] { + Y { + #vt52 movement. we expect 2 chars representing position (limited range) + set params [tcl::string::range $codenorm 5 end] + if {[tcl::string::length $params] != 2} { + #shouldn't really get here or need this branch if ansi splitting was done correctly + puts stderr "overtype::renderline ESC Y recognised as vt52 move, but incorrect parameters length ([string length $params] vs expected 2) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented codenorm:[ansistring VIEW -lf 1 -vt 1 -nul 1 $codenorm]" + } + set line [tcl::string::index $params 5] + set column [tcl::string::index $params 1] + set r [expr {[scan $line %c] -31}] + set c [expr {[scan $column %c] -31}] + + #MAP to: + #CSI n;m H - CUP - Cursor Position + set leadernorm 7CSI + set codenorm "$leadernorm${r}\;${c}H" + } + } + } + } + + #we've mapped 7 and 8bit escapes to values we can handle as literals in switch statements to take advantange of jump tables. + switch -- $leadernorm { + 1006 { + #TODO + # + switch -- [tcl::string::index $codenorm end] { + M { + puts stderr "mousedown $codenorm" + } + m { + puts stderr "mouseup $codenorm" + } + } + + } + {7CSI} - {8CSI} { + set param [tcl::string::range $codenorm 4 end-1] + #puts stdout "--> CSI [tcl::string::index $leadernorm 0] bit param:$param" + set code_end [tcl::string::index $codenorm end] ;#used for e.g h|l set/unset mode + + switch -exact -- $code_end { + A { + #Row move - up + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #todo + lassign [split $param {;}] num modifierkey + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + + if {$num eq ""} {set num 1} + incr cursor_row -$num + + if {$cursor_row < 1} { + set cursor_row 1 + } + + #ensure rest of *overlay* is emitted to remainder + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction up + #retain cursor_column + break + } + B { + #CUD - Cursor Down + #Row move - down + lassign [split $param {;}] num modifierkey + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #move down + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + if {$num eq ""} {set num 1} + incr cursor_row $num + + + incr idx_over ;#idx_over hasn't encountered a grapheme and hasn't advanced yet + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction down + #retain cursor_column + break + } + C { + #CUF - Cursor Forward + #Col move + #puts stdout "->forward" + #todo - consider right-to-left cursor mode (e.g Hebrew).. some day. + #cursor forward + #right-arrow/move forward + lassign [split $param {;}] num modifierkey + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + if {$num eq ""} {set num 1} + + #todo - retrict to moving 1 position past datalen? restrict to column width? + #should ideally wrap to next line when interactive and not on last row + #(some ansi art seems to expect this behaviour) + #This presumably depends on the terminal's wrap mode + #e.g DECAWM autowrap mode + # CSI ? 7 h - set: autowrap (also tput smam) + # CSI ? 7 l - reset: no autowrap (also tput rmam) + set version 2 + if {$version eq "2"} { + set max [llength $outcols] + if {$overflow_idx == -1} { + incr max + } + if {$cursor_column == $max+1} { + #move_forward while in overflow + incr cursor_column -1 + } + + if {($cursor_column + $num) <= $max} { + incr idx $num + incr cursor_column $num + } else { + if {$autowrap_mode} { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #jmn + if {$idx == $overflow_idx} { + incr num + } + + #horizontal movement beyond line extent needs to wrap - throw back to caller + #we may have both overflow_right and unapplied data + #(can have overflow_right if we were in insert_mode and processed chars prior to this movement) + #leave row as is - caller will need to determine how many rows the column-movement has consumed + incr cursor_column $num ;#give our caller the necessary info as columns from start of row + #incr idx_over + #should be gci following last one applied + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction wrapmoveforward + break + } else { + set cursor_column $max + set idx [expr {$cursor_column -1}] + } + } + } else { + #review - dead branch + if {!$expand_right || ($cursor_column + $num) <= [llength $outcols+1]} { + incr idx $num + incr cursor_column $num + } else { + if {!$insert_mode} { + #block editing style with arrow keys + #overtype mode + set idxstart $idx + set idxend [llength $outcols] + set moveend [expr {$idxend - $idxstart}] + if {$moveend < 0} {set moveend 0} ;#sanity? + #puts "idxstart:$idxstart idxend:$idxend outcols[llength $outcols] undercols:[llength $undercols]" + incr idx $moveend + incr cursor_column $moveend + #if {[tcl::dict::exists $understacks $idx]} { + # set stackinfo [tcl::dict::get $understacks $idx] ;#use understack at end - which may or may not have already been replaced by stack from overtext + #} else { + # set stackinfo [list] + #} + if {$idx < [llength $understacks]} { + set stackinfo [lindex $understacks $idx] ;#use understack at end - which may or may not have already been replaced by stack from overtext + } else { + set stackinfo [list] + } + if {$idx < [llength $understacks_gx]} { + #set gxstackinfo [tcl::dict::get $understacks_gx $idx] + set gxstackinfo [lindex $understacks_gx $idx] + } else { + set gxstackinfo [list] + } + #pad outcols + set movemore [expr {$num - $moveend}] + #assert movemore always at least 1 or we wouldn't be in this branch + for {set m 1} {$m <= $movemore} {incr m} { + incr idx + incr cursor_column + priv::render_addchar $idx " " $stackinfo $gxstackinfo $insert_mode + } + } else { + #normal - insert + incr idx $num + incr cursor_column $num + if {$idx > [llength $outcols]} { + set idx [llength $outcols];#allow one beyond - for adding character at end of line + set cursor_column [expr {[llength $outcols]+1}] + } + } + } + } + } + D { + #Col move + #puts stdout "<-back" + #cursor back + #left-arrow/move-back when ltr mode + lassign [split $param {;}] num modifierkey + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + if {$num eq ""} {set num 1} + + set version 2 + if {$version eq "2"} { + #todo - startcolumn offset! + if {$cursor_column - $num >= 1} { + incr idx -$num + incr cursor_column -$num + } else { + if {!$autowrap_mode} { + set cursor_column 1 + set idx 0 + } else { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr cursor_column -$num + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction wrapmovebackward + break + } + } + } else { + incr idx -$num + incr cursor_column -$num + if {$idx < $opt_colstart-1} { + #wrap to previous line and position cursor at end of data + set idx [expr {$opt_colstart-1}] + set cursor_column $opt_colstart + } + } + } + E { + #CNL - Cursor Next Line + if {$param eq ""} { + set downmove 1 + } else { + set downmove [expr {$param}] + } + puts stderr "renderline CNL down-by-$downmove" + set cursor_column 1 + set cursor_row [expr {$cursor_row + $downmove}] + set idx [expr {$cursor_column -1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move + break + + } + F { + #CPL - Cursor Previous Line + if {$param eq ""} { + set upmove 1 + } else { + set upmove [expr {$param}] + } + puts stderr "renderline CPL up-by-$upmove" + set cursor_column 1 + set cursor_row [expr {$cursor_row -$upmove}] + if {$cursor_row < 1} { + set cursor_row 1 + } + set idx [expr {$cursor_column - 1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move + break + + } + G { + #CHA - Cursor Horizontal Absolute (move to absolute column no) + if {$param eq ""} { + set targetcol 1 + } else { + set targetcol $param + if {![string is integer -strict $targetcol]} { + puts stderr "renderline CHA (Cursor Horizontal Absolute) error. Unrecognised parameter '$param'" + } + set targetcol [expr {$param}] + set max [llength $outcols] + if {$overflow_idx == -1} { + incr max + } + if {$targetcol > $max} { + puts stderr "renderline CHA (Cursor Horizontal Absolute) error. Param '$param' > max: $max" + set targetcol $max + } + } + #adjust to colstart - as column 1 is within overlay + #??? REVIEW + set idx [expr {($targetcol -1) + $opt_colstart -1}] + + + set cursor_column $targetcol + #puts stderr "renderline absolute col move ESC G (TEST)" + } + H - f { + #CSI n;m H - CUP - Cursor Position + + #CSI n;m f - HVP - Horizontal Vertical Position REVIEW - same as CUP with differences (what?) in some terminal modes + # - 'counts as effector format function (like CR or LF) rather than an editor function (like CUD or CNL)' + # - REVIEW + #see Annex A at: https://www.ecma-international.org/wp-content/uploads/ECMA-48_5th_edition_june_1991.pdf + + #test e.g ansicat face_2.ans + #$re_both_move + lassign [split $param {;}] paramrow paramcol + #missing defaults to 1 + #CSI ;5H = CSI 1;5H -> row 1 col 5 + #CSI 17;H = CSI 17H = CSI 17;1H -> row 17 col 1 + + if {$paramcol eq ""} {set paramcol 1} + if {$paramrow eq ""} {set paramrow 1} + if {![string is integer -strict $paramcol] || ![string is integer -strict $paramrow]} { + puts stderr "renderline CUP (CSI H) unrecognised param $param" + #ignore? + } else { + set max [llength $outcols] + if {$overflow_idx == -1} { + incr max + } + if {$paramcol > $max} { + set target_column $max + } else { + set target_column [expr {$paramcol}] + } + + + if {$paramrow < 1} { + puts stderr "renderline CUP (CSI H) bad row target 0. Assuming 1" + set target_row 1 + } else { + set target_row [expr {$paramrow}] + } + if {$target_row == $cursor_row} { + #col move only - no need for break and move + #puts stderr "renderline CUP col move only to col $target_column param:$param" + set cursor_column $target_column + set idx [expr {$cursor_column -1}] + } else { + set cursor_row $target_row + set cursor_column $target_column + set idx [expr {$cursor_column -1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move + break + } + } + } + J { + set modegroup [tcl::string::index $codenorm 4] ;#e.g ? + switch -exact -- $modegroup { + ? { + #CSI ? Pn J - selective erase + puts stderr "overtype::renderline ED - SELECTIVE ERASE IN DISPLAY (UNIMPLEMENTED) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + default { + puts stderr "overtype::renderline ED - ERASE IN DISPLAY (TESTING) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + if {$param eq ""} {set param 0} + switch -exact -- $param { + 0 { + #clear from cursor to end of screen + } + 1 { + #clear from cursor to beginning of screen + } + 2 { + #clear entire screen + #ansi.sys - move cursor to upper left REVIEW + set cursor_row 1 + set cursor_column 1 + set idx [expr {$cursor_column -1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + if {[llength $outcols]} { + priv::render_erasechar 0 [llength $outcols] + } + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction clear_and_move + break + } + 3 { + #clear entire screen. presumably cursor doesn't move - otherwise it would be same as 2J ? + + } + default { + } + } + + } + } + } + K { + #see DECECM regarding background colour + set modegroup [tcl::string::index $codenorm 4] ;#e.g ? + switch -exact -- $modegroup { + ? { + puts stderr "overtype::renderline DECSEL - SELECTIVE ERASE IN LINE (UNIMPLEMENTED) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + set param [string range $param 1 end] ;#chop qmark + if {$param eq ""} {set param 0} + switch -exact -- $param { + 0 { + #clear from cursor to end of line - depending on DECSCA + } + 1 { + #clear from cursor to beginning of line - depending on DECSCA + + } + 2 { + #clear entire line - depending on DECSCA + } + default { + puts stderr "overtype::renderline DECSEL - SELECTIVE ERASE IN LINE PARAM '$param' unrecognised [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + default { + puts stderr "overtype::renderline EL - ERASE IN LINE (TESTING) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + if {$param eq ""} {set param 0} + switch -exact -- $param { + 0 { + #clear from cursor to end of line + } + 1 { + #clear from cursor to beginning of line + + } + 2 { + #clear entire line + } + default { + puts stderr "overtype::renderline EL - ERASE IN LINE PARAM '$param' unrecognised [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + } + } + } + L { + puts stderr "overtype::renderline IL - Insert Line - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + M { + #CSI Pn M - DL - Delete Line + puts stderr "overtype::renderline DL - Delete Line - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + + } + T { + #CSI Pn T - SD Pan Up (empty lines introduced at top) + #CSI Pn+T - kitty extension (lines at top come from scrollback buffer) + #Pn new lines appear at top of the display, Pn old lines disappear at the bottom of the display + if {$param eq "" || $param eq "0"} {set param 1} + if {[string index $param end] eq "+"} { + puts stderr "overtype::renderline CSI Pn + T - kitty Pan Up - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } else { + puts stderr "overtype::renderline CSI Pn T - SD Pan Up - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + X { + puts stderr "overtype::renderline X ECH ERASE CHARACTER - $param" + #ECH - erase character + if {$param eq "" || $param eq "0"} {set param 1}; #param=count of chars to erase + priv::render_erasechar $idx $param + #cursor position doesn't change. + } + q { + set code_secondlast [tcl::string::index $codenorm end-1] + switch -exact -- $code_secondlast { + {"} { + #DECSCA - Select Character Protection Attribute + #(for use with selective erase: DECSED and DECSEL) + set param [tcl::string::range $codenorm 4 end-2] + if {$param eq ""} {set param 0} + #TODO - store like SGR in stacks - replays? + switch -exact -- $param { + 0 - 2 { + #canerase + puts stderr "overtype::renderline - DECSCA canerase not implemented - [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + 1 { + #cannoterase + puts stderr "overtype::renderline - DECSCA cannoterase not implemented - [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + default { + puts stderr "overtype::renderline DECSCA param '$param' not understood [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + default { + puts stderr "overtype::renderline - CSI ... q not implemented - [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + r { + #$re_decstbm + #https://www.vt100.net/docs/vt510-rm/DECSTBM.html + #This control function sets the top and bottom margins for the current page. You cannot perform scrolling outside the margins + lassign [split $param {;}] margin_top margin_bottom + + #todo - return these for the caller to process.. + puts stderr "overtype::renderline DECSTBM set top and bottom margin not implemented" + #Also moves the cursor to col 1 line 1 of the page + set cursor_column 1 + set cursor_row 1 + + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move ;#own instruction? decstbm? + break + } + s { + #code conflict between ansi emulation and DECSLRM - REVIEW + #ANSISYSSC (when no parameters) - like other terminals - essentially treat same as DECSC + # todo - when parameters - support DECSLRM instead + + if {$param ne ""} { + #DECSLRM - should only be recognised if DECLRMM is set (vertical split screen mode) + lassign [split $param {;} margin_left margin_right + puts stderr "overtype DECSLRM not yet supported - got [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + if {$margin_left eq ""} { + set margin_left 1 + } + set columns_per_page 80 ;#todo - set to 'page width (DECSCPP set columns per page)' - could be 132 or?? + if {$margin_right eq ""} { + set margin_right $columns_per_page + } + puts stderr "DECSLRM margin left: $margin_left margin right: $margin_right" + if {![string is integer -strict $margin_left] || $margin_left < 0} { + puts stderr "DECSLRM invalid margin_left" + } + if {![string is integer -strict $margin_right] || $margin_right < 0} { + puts stderr "DECSLRM invalid margin_right" + } + set scrolling_region_size [expr {$margin_right - $margin_left}] + if {$scrolling_region_size < 2 || $scrolling_region_size > $columns_per_page} { + puts stderr "DECSLRM region size '$scrolling_regsion_size' must be between 1 and $columns_per_page" + } + #todo + + + } else { + #DECSC + #//notes on expected behaviour: + #DECSC - saves following items in terminal's memory + #cursor position + #character attributes set by the SGR command + #character sets (G0,G1,G2 or G3) currently in GL and GR + #Wrap flag (autowrap or no autowrap) + #State of origin mode (DECOM) + #selective erase attribute + #any single shift 2 (SS2) or single shift 3(SSD) functions sent + + #$re_cursor_save + #cursor save could come after last column + if {$overflow_idx != -1 && $idx == $overflow_idx} { + #bartman2.ans test file - fixes misalignment at bottom of dialog bubble + #incr cursor_row + #set cursor_column 1 + #bwings1.ans test file - breaks if we actually incr cursor (has repeated saves) + set cursor_saved_position [list row [expr {$cursor_row+1}] column 1] + } else { + set cursor_saved_position [list row $cursor_row column $cursor_column] + } + #there may be overlay stackable codes emitted that aren't in the understacks because they come between the last emmited character and the cursor_save control. + #we need the SGR and gx overlay codes prior to the cursor_save + + #a real terminal would not be able to know the state of the underlay.. so we should probably ignore it. + #set sgr_stack [lindex $understacks $idx] + #set gx_stack [lindex $understacks_gx $idx] ;#not actually a stack - just a boolean state (for now?) + + set sgr_stack [list] + set gx_stack [list] + + #we shouldn't need to scan for intermediate cursor save/restores - as restores would throw-back to the calling loop - so our overlay 'line' is since those. + #The overlay_grapheme_control_list had leading resets from previous lines - so we go back to the beginning not just the first grapheme. + + foreach gc [lrange $overlay_grapheme_control_list 0 $gci-1] { + lassign $gc type code + #types g other sgr gx0 + switch -- $type { + gx0 { + #code is actually a stand-in for the graphics on/off code - not the raw code + #It is either gx0_on or gx0_off + set gx_stack [list $code] + } + sgr { + #code is the raw code + if {[punk::ansi::codetype::is_sgr_reset $code]} { + #jmn + set sgr_stack [list "\x1b\[m"] + } elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + set sgr_stack [list $code] + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[priv::is_sgr $code]} { + #often we don't get resets - and codes just pile up. + #as a first step to simplifying - at least remove earlier straight up dupes + set dup_posns [lsearch -all -exact $sgr_stack $code] ;#needs -exact - codes have square-brackets (glob chars) + set sgr_stack [lremove $sgr_stack {*}$dup_posns] + lappend sgr_stack $code + } + } + } + } + set cursor_saved_attributes "" + switch -- [lindex $gx_stack 0] { + gx0_on { + append cursor_saved_attributes "\x1b(0" + } + gx0_off { + append cursor_saved_attributes "\x1b(B" + } + } + #append cursor_saved_attributes [join $sgr_stack ""] + append cursor_saved_attributes [punk::ansi::codetype::sgr_merge_list {*}$sgr_stack] + + #as there is apparently only one cursor storage element we don't need to throw back to the calling loop for a save. + + #don't incr index - or the save will cause cursor to move to the right + #carry on + } + } + u { + #ANSISYSRC save cursor (when no parameters) (DECSC) + + #$re_cursor_restore + #we are going to jump somewhere.. for now we will assume another line, and process accordingly. + #The caller has the cursor_saved_position/cursor_saved_attributes if any (?review - if we always pass it back it, we could save some calls for moves in same line) + #don't set overflow at this point. The existing underlay to the right must be preserved. + #we only want to jump and render the unapplied at the new location. + + #lset overstacks $idx_over [list] + #set replay_codes_overlay "" + + #if {$cursor_saved_attributes ne ""} { + # set replay_codes_overlay $cursor_saved_attributes ;#empty - or last save if it happend in this input chunk + #} else { + #jj + #set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + set replay_codes_overlay "" + #} + + #like priv::render_unapplied - but without the overlay's ansi reset or gx stacks from before the restore code + incr idx_over + + set unapplied "" + set unapplied_list [list] + foreach gc [lrange $overlay_grapheme_control_list $gci+1 end] { + lassign $gc type item + if {$type eq "gx0"} { + if {$item eq "gx0_on"} { + lappend unapplied_list "\x1b(0" + } elseif {$item eq "gx0_off"} { + lappend unapplied_list "\x1b(B" + } + } else { + lappend unapplied_list $item + } + #incr idx_over + } + set unapplied [join $unapplied_list ""] + #if the save occured within this line - that's ok - it's in the return value list and caller can prepend for the next loop. + set instruction restore_cursor + break + } + "{" { + + puts stderr "renderline warning - CSI.. - unimplemented [ansistring VIEW -lf 1 -nul 1 $code]" + } + "}" { + set code_secondlast [tcl::string::index $codenorm end-1] + switch -exact -- $code_secondlast { + ' { + puts stderr "renderline warning - DECIC - Insert Column - CSI...' - unimplemented [ansistring VIEW -lf 1 -nul 1 $code]" + } + default { + puts stderr "renderline warning - CSI.. - unimplemented [ansistring VIEW -lf 1 -nul 1 $code]" + } + } + } + ~ { + set code_secondlast [tcl::string::index $codenorm end-1] ;#used for e.g CSI x '~ + switch -exact -- $code_secondlast { + ' { + #DECDC - editing sequence - Delete Column + puts stderr "renderline warning - DECDC - unimplemented" + } + default { + #$re_vt_sequence + lassign [split $param {;}] key mod + + #Note that f1 to f4 show as ESCOP|Q|R|S (VT220?) but f5+ show as ESC\[15~ + # + #e.g esc \[2~ insert esc \[2;2~ shift-insert + #mod - subtract 1, and then use bitmask + #shift = 1, (left)Alt = 2, control=4, meta=8 (meta seems to do nothing on many terminals on windows? Intercepted by windows?) + #puts stderr "vt key:$key mod:$mod code:[ansistring VIEW $code]" + if {$key eq "1"} { + #home + } elseif {$key eq "2"} { + #Insert + if {$mod eq ""} { + #no modifier key + set insert_mode [expr {!$insert_mode}] + #rather than set the cursor - we return the insert mode state so the caller can decide + } + } elseif {$key eq "3"} { + #Delete - presumably this shifts other chars in the line, with empty cells coming in from the end + switch -- $mod { + "" { + priv::render_delchar $idx + } + "5" { + #ctrl-del - delete to end of word (pwsh) - possibly word on next line if current line empty(?) + } + } + } elseif {$key eq "4"} { + #End + } elseif {$key eq "5"} { + #pgup + } elseif {$key eq "6"} { + #pgDn + } elseif {$key eq "7"} { + #Home + #?? + set idx [expr {$opt_colstart -1}] + set cursor_column 1 + } elseif {$key eq "8"} { + #End + } elseif {$key eq "11"} { + #F1 - or ESCOP or e.g shift F1 ESC\[1;2P + } elseif {$key eq "12"} { + #F2 - or ESCOQ + } elseif {$key eq "13"} { + #F3 - or ESCOR + } elseif {$key eq "14"} { + #F4 - or ESCOS + } elseif {$key eq "15"} { + #F5 or shift F5 ESC\[15;2~ + } elseif {$key eq "17"} { + #F6 + } elseif {$key eq "18"} { + #F7 + } elseif {$key eq "19"} { + #F8 + } elseif {$key eq "20"} { + #F9 + } elseif {$key eq "21"} { + #F10 + } elseif {$key eq "23"} { + #F11 + } elseif {$key eq "24"} { + #F12 + } + + } + } + + } + h - l { + #set mode unset mode + #we are matching only last char to get to this arm - but are there other sequences ending in h|l we need to handle? + + #$re_mode if first after CSI is "?" + #some docs mention ESC=h|l - not seen on windows terminals.. review + #e.g https://www2.math.upenn.edu/~kazdan/210/computer/ansi.html + set modegroup [tcl::string::index $codenorm 4] ;#e.g ? = + switch -exact -- $modegroup { + ? { + set smparams [tcl::string::range $codenorm 5 end-1] ;#params between ? and h|l + #one or more modes can be set + set smparam_list [split $smparams {;}] + foreach num $smparam_list { + switch -- $num { + "" { + #ignore empties e.g extra/trailing semicolon in params + } + 5 { + #DECSNM - reverse video + #How we simulate this to render within a block of text is an open question. + #track all SGR stacks and constantly flip based on the current SGR reverse state? + #It is the job of the calling loop to do this - so at this stage we'll just set the states + + if {$code_end eq "h"} { + #set (enable) + set reverse_mode 1 + } else { + #reset (disable) + set reverse_mode 0 + } + + } + 7 { + #DECAWM autowrap + if {$code_end eq "h"} { + #set (enable) + set autowrap_mode 1 + if {$opt_width ne "\uFFEF"} { + set overflow_idx $opt_width + } else { + #review - this is also the cursor position when adding a char at end of line? + set overflow_idx [expr {[llength $undercols]}] ;#index at which we would be *in* overflow a row move may still override it + } + #review - can idx ever be beyond overflow_idx limit when we change e.g with a width setting and cursor movements? + # presume not usually - but sanity check with warning for now. + if {$idx >= $overflow_idx} { + puts stderr "renderline warning - idx '$idx' >= overflow_idx '$overflow_idx' - unexpected" + } + } else { + #reset (disable) + set autowrap_mode 0 + #REVIEW! + set overflow_idx -1 + } + } + 25 { + if {$code_end eq "h"} { + #visible cursor + + } else { + #invisible cursor + + } + } + 117 { + #DECECM - Erase Color Mode + #https://invisible-island.net/ncurses/ncurses.faq.html + #The Erase color selection controls the background color used when text is erased or new + #text is scrolled on to the screen. Screen background causes newly erased areas or + #scrolled text to be written using color index zero, the screen background. This is VT + #and DECterm compatible. Text background causes erased areas or scrolled text to be + #written using the current text background color. This is PC console compatible and is + #the factory default. + + #see also: https://unix.stackexchange.com/questions/251726/clear-to-end-of-line-uses-the-wrong-background-color-in-screen + } + } + } + } + = { + set num [tcl::string::range $codenorm 5 end-1] ;#param between = and h|l + puts stderr "overtype::renderline CSI=...h|l code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + default { + #e.g CSI 4 h + set num [tcl::string::range $codenorm 4 end-1] ;#param before h|l + switch -exact -- $num { + 3 { + puts stderr "CRM MODE $code_end" + #CRM - Show control character mode + # 'No control functions are executed except LF,FF and VT which are represented in the CRM FONT before a CRLF(new line) is executed' + # + #use ansistring VIEW -nul 1 -lf 2 -ff 2 -vt 2 + #https://vt100.net/docs/vt510-rm/CRM.html + #NOTE - vt100 CRM always does auto-wrap at right margin. + #disabling auto-wrap in set-up or by sequence is disabled. + #We should default to turning off auto-wrap when crm_mode enabled.. but + #displaying truncated (on rhs) crm can still be very useful - and we have optimisation in overflow to avoid excess renderline calls (per grapheme) + #we therefore could reasonably put in an exception to allow auto_wrap to be disabled after crm_mode is engaged, + #although this would be potentially an annoying difference to some.. REVIEW + if {$code_end eq "h"} { + set crm_mode 1 + set autowrap_mode 1 + if {$opt_width ne "\uFFEF"} { + set overflow_idx $opt_width + } else { + #review - this is also the cursor position when adding a char at end of line? + set overflow_idx [expr {[llength $undercols]}] ;#index at which we would be *in* overflow a row move may still override it + } + } else { + set crm_mode 0 + } + } + 4 { + #IRM - Insert/Replace Mode + if {$code_end eq "h"} { + #CSI 4 h + set insert_mode 1 + } else { + #CSI 4 l + #replace mode + set insert_mode 0 + } + } + default { + puts stderr "overtype::renderline CSI...h|l code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + } + } + } + } + | { + switch -- [tcl::string::index $codenorm end-1] { + {$} { + #CSI ... $ | DECSCPP set columns per page- (recommended in vt510 docs as preferable to DECCOLM) + #real terminals generally only supported 80/132 + #some other virtuals support any where from 2 to 65,536? + #we will allow arbitrary widths >= 2 .. to some as yet undetermined limit. + #CSI $ | + #empty or 0 param is 80 for compatibility - other numbers > 2 accepted + set page_width -1 ;#flag as unset + if {$param eq ""} { + set page_width 80 + } elseif {[string is integer -strict $param] && $param >=2 0} { + set page_width [expr {$param}] ;#we should allow leading zeros in the number - but lets normalize using expr + } else { + puts stderr "overtype::renderline unacceptable DECSPP value '$param'" + } + + if {$page_width > 2} { + puts stderr "overtype::renderline DECSCPP - not implemented - but selected width '$page_width' looks ok" + #if cursor already beyond new page_width - will move to right colum - otherwise no cursor movement + + } + + } + default { + puts stderr "overtype::renderline unrecognised CSI code ending in pipe (|) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + } + } + default { + puts stderr "overtype::renderline CSI code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + } + } + 7ESC { + # + #re_other_single {\x1b(D|M|E)$} + #also vt52 Y.. + #also PM \x1b^...(ST) + switch -- [tcl::string::index $codenorm 4] { + c { + #RIS - reset terminal to initial state - where 'terminal' in this case is the renderspace - not the underlying terminal! + puts stderr "renderline reset" + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction reset + break + } + D { + #\x84 + #index (IND) + #vt102-docs: "Moves cursor down one line in same column. If cursor is at bottom margin, screen performs a scroll-up" + puts stderr "renderline ESC D not fully implemented" + incr cursor_row + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction down + #retain cursor_column + break + } + E { + #\x85 + #review - is behaviour different to lf? + #todo - possibly(?) same logic as handling above. i.e return instruction depends on where column_cursor is at the time we get NEL + #leave implementation until logic for is set in stone... still under review + #It's arguable NEL is a pure cursor movement as opposed to the semantic meaning of crlf or lf in a file. + # + #Next Line (NEL) "Move the cursor to the left margin on the next line. If the cursor is at the bottom margin, scroll the page up" + puts stderr "overtype::renderline ESC E unimplemented" + + } + H { + #\x88 + #Tab Set + puts stderr "overtype::renderline ESC H tab set unimplemented" + } + M { + #\x8D + #Reverse Index (RI) + #vt102-docs: "Moves cursor up one line in same column. If cursor is at top margin, screen performs a scroll-down" + puts stderr "overtype::renderline ESC M not fully implemented" + + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #move up + incr cursor_row -1 + if {$cursor_row < 1} { + set cursor_row 1 + } + #ensure rest of *overlay* is emitted to remainder + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction up ;#need instruction for scroll-down? + #retain cursor_column + break + } + N { + #\x8e - affects next character only + puts stderr "overtype::renderline single shift select G2 command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + O { + #\x8f - affects next character only + puts stderr "overtype::renderline single shift select G3 command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + P { + #\x90 + #DCS - shouldn't get here - handled in 7DCS branch + #similarly \] OSC (\x9d) and \\ (\x9c) ST + } + V { + #\x96 + + } + W { + #\x97 + } + X { + #\x98 + #SOS + if {[string index $code end] eq "\007"} { + set sos_content [string range $code 2 end-1] ;#ST is \007 + } else { + set sos_content [string range $code 2 end-2] ;#ST is \x1b\\ + } + #return in some useful form to the caller + #TODO! + lappend sos_list [list string $sos_content row $cursor_row column $cursor_column] + puts stderr "overtype::renderline ESCX SOS UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + ^ { + #puts stderr "renderline PM" + #Privacy Message. + if {[string index $code end] eq "\007"} { + set pm_content [string range $code 2 end-1] ;#ST is \007 + } else { + set pm_content [string range $code 2 end-2] ;#ST is \x1b\\ + } + #We don't want to render it - but we need to make it available to the application + #see the textblock library in punk, for the exception we make here for single backspace. + #It is unlikely to be encountered as a useful PM - so we hack to pass it through as a fix + #for spacing issues on old terminals which miscalculate the single-width 'Symbols for Legacy Computing' + if {$pm_content eq "\b"} { + #puts stderr "renderline PM sole backspace special handling for \U1FB00 - \U1FBFF" + #esc^\b\007 or esc^\besc\\ + #HACKY pass-through - targeting terminals that both mis-space legacy symbols *and* don't support PMs + #The result is repair of the extra space. If the terminal is a modern one and does support PM - the \b should be hidden anyway. + #If the terminal has the space problem AND does support PMs - then this just won't fix it. + #The fix relies on the symbol-supplier to cooperate by appending esc^\b\esc\\ to the problematic symbols. + + #priv::render_addchar $idx $code [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #idx has been incremented after last grapheme added + priv::render_append_to_char [expr {$idx -1}] $code + } + #lappend to a dict element in the result for application-specific processing + lappend pm_list $pm_content + } + _ { + #APC Application Program Command + #just warn for now.. + puts stderr "overtype::renderline ESC_ APC command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + default { + puts stderr "overtype::renderline ESC code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented codenorm:[ansistring VIEW -lf 1 -vt 1 -nul 1 $codenorm]" + } + } + + } + 7DCS - 8DCS { + puts stderr "overtype::renderline DCS - DEVICE CONTROL STRING command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + #ST (string terminator) \x9c or \x1b\\ + if {[tcl::string::index $codenorm end] eq "\x9c"} { + set code_content [tcl::string::range $codenorm 4 end-1] ;#ST is 8-bit 0x9c + } else { + set code_content [tcl::string::range $codenorm 4 end-2] ;#ST is \x1b\\ + } + + } + 7OSC - 8OSC { + # OSCs are terminated with ST of either \007 or \x1b\\ - we allow either whether code was 7 or 8 bit + if {[tcl::string::index $codenorm end] eq "\007"} { + set code_content [tcl::string::range $codenorm 4 end-1] ;#ST is \007 + } else { + set code_content [tcl::string::range $codenorm 4 end-2] ;#ST is \x1b\\ + } + set first_colon [tcl::string::first {;} $code_content] + if {$first_colon == -1} { + #there probably should always be a colon - but we'll try to make sense of it without + set osc_code $code_content ;#e.g \x1b\]104\007 vs \x1b\]104\;\007 + } else { + set osc_code [tcl::string::range $code_content 0 $first_colon-1] + } + switch -exact -- $osc_code { + 2 { + set newtitle [tcl::string::range $code_content 2 end] + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction [list set_window_title $newtitle] + break + } + 4 { + #OSC 4 - set colour palette + #can take multiple params + #e.g \x1b\]4\;1\;red\;2\;green\x1b\\ + set params [tcl::string::range $code_content 2 end] ;#strip 4 and first semicolon + set cmap [dict create] + foreach {cnum spec} [split $params {;}] { + if {$cnum >= 0 and $cnum <= 255} { + #todo - parse spec from names like 'red' to RGB + #todo - accept rgb:ab/cd/ef as well as rgb:/a/b/c (as alias for aa/bb/cc) + #also - what about rgb:abcd/defg/hijk and 12-bit abc/def/ghi ? + dict set cmap $cnum $spec + } else { + #todo - log + puts stderr "overtype::renderline OSC 4 set colour palette - bad color number: $cnum must be from 0 to 255. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + puts stderr "overtype::renderline OSC 4 set colour palette unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + + + } + 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 { + #OSC 10 through 17 - so called 'dynamic colours' + #can take multiple params - each successive parameter changes the next colour in the list + #- e.g if code started at 11 - next param is for 12. 17 takes only one param because there are no more + #10 change text foreground colour + #11 change text background colour + #12 change text cursor colour + #13 change mouse foreground colour + #14 change mouse background colour + #15 change tektronix foreground colour + #16 change tektronix background colour + #17 change highlight colour + set params [tcl::string::range $code_content 2 end] + + puts stderr "overtype::renderline OSC $osc_code set dynamic colours unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + + + } + 18 { + #why is this not considered one of the dynamic colours above? + #https://www.xfree86.org/current/ctlseqs.html + #tektronix cursor color + puts stderr "overtype::renderline OSC 18 - set tektronix cursor color unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + 99 { + #kitty desktop notifications + #https://sw.kovidgoyal.net/kitty/desktop-notifications/ + # 99 ; metadata ; payload + puts stderr "overtype::renderline OSC 99 kitty desktop notification unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + 104 { + #reset colour palette + #we want to do it for the current rendering context only (vvt) - not just pass through to underlying vt + puts stderr "overtype::renderline OSC 104 reset colour palette unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction [list reset_colour_palette] + break + } + 1337 { + #iterm2 graphics and file transfer + puts stderr "overtype::renderline OSC 1337 iterm2 graphics/file_transfer unimplemented. 1st 100 chars of code [ansistring VIEW -lf 1 -vt 1 -nul 1 [string range $code 0 99]]" + } + 5113 { + puts stderr "overtype::renderline OSC 5113 kitty file transfer unimplemented. 1st 100 chars of code [ansistring VIEW -lf 1 -vt 1 -nul 1 [string range $code 0 99]]" + } + default { + puts stderr "overtype::renderline OSC - UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + default { + } + } + + + } + default { + #don't need to handle sgr or gx0 types + #we have our sgr gx0 codes already in stacks for each overlay grapheme + } + } + } + + #-------- + if {$opt_expand_right == 0} { + #need to truncate to the width of the original undertext + #review - string_width vs printing_length here. undertext requirement to be already rendered therefore punk::char::string_width ok? + #set num_under_columns [punk::char::string_width $pt_underchars] ;#plaintext underchars + } + if {$overflow_idx == -1} { + #overflow was initially unlimited and hasn't been overridden + } else { + + } + #-------- + + + #coalesce and replay codestacks for outcols grapheme list + set outstring "" ;#output prior to overflow + set overflow_right "" ;#remainder after overflow point reached + set i 0 + set cstack [list] + set prevstack [list] + set prev_g0 [list] + #note overflow_idx may already have been set lower if we had a row move above due to \v or ANSI moves + set in_overflow 0 ;#used to stop char-width scanning once in overflow + if {$overflow_idx == 0} { + #how does caller avoid an infinite loop if they have autowrap on and keep throwing graphemes to the next line? REVIEW + set in_overflow 1 + } + set trailing_nulls 0 + foreach ch [lreverse $outcols] { + if {$ch eq "\u0000"} { + incr trailing_nulls + } else { + break + } + } + if {$trailing_nulls} { + set first_tail_null_posn [expr {[llength $outcols] - $trailing_nulls}] + } else { + set first_tail_null_posn -1 + } + + #puts stderr "first_tail_null_posn: $first_tail_null_posn" + #puts stderr "colview: [ansistring VIEW $outcols]" + + foreach ch $outcols { + #puts "---- [ansistring VIEW $ch]" + + set gxleader "" + if {$i < [llength $understacks_gx]} { + #set g0 [tcl::dict::get $understacks_gx $i] + set g0 [lindex $understacks_gx $i] + if {$g0 ne $prev_g0} { + if {$g0 eq [list "gx0_on"]} { + set gxleader "\x1b(0" + } else { + set gxleader "\x1b(B" + } + } + set prev_g0 $g0 + } else { + set prev_g0 [list] + } + + set sgrleader "" + if {$i < [llength $understacks]} { + #set cstack [tcl::dict::get $understacks $i] + set cstack [lindex $understacks $i] + if {$cstack ne $prevstack} { + if {[llength $prevstack] && ![llength $cstack]} { + #This reset is important e.g testfile fruit.ans - we get overhang on rhs without it. But why is cstack empty? + append sgrleader \033\[m + } else { + append sgrleader [punk::ansi::codetype::sgr_merge_list {*}$cstack] + } + } + set prevstack $cstack + } else { + set prevstack [list] + } + + + + if {$in_overflow} { + if {$i == $overflow_idx} { + set 0 [lindex $understacks_gx $i] + set gxleader "" + if {$g0 eq [list "gx0_on"]} { + set gxleader "\x1b(0" + } elseif {$g0 eq [list "gx0_off"]} { + set gxleader "\x1b(B" + } + append overflow_right $gxleader + set cstack [lindex $understacks $i] + set sgrleader "" + #whether cstack is same or differs from previous char's stack - we must have an output at the start of the overflow_right + #if {[llength $prevstack] && ![llength $cstack]} { + # append sgrleader \033\[m + #} + append sgrleader [punk::ansi::codetype::sgr_merge_list {*}$cstack] + append overflow_right $sgrleader + append overflow_right $ch + } else { + append overflow_right $gxleader + append overflow_right $sgrleader + append overflow_right $ch + } + } else { + if {$overflow_idx != -1 && $i+1 == $overflow_idx} { + #one before overflow + #will be in overflow in next iteration + set in_overflow 1 + if {[grapheme_width_cached $ch]> 1} { + #we overflowed with second-half of a double-width char - replace first-half with user-supplied exposition char (should be 1 wide) + set ch $opt_exposed1 + } + } + append outstring $gxleader + append outstring $sgrleader + if {$ch eq "\u0000"} { + if {$cp437_glyphs} { + #map all nulls including at tail to space + append outstring " " + } else { + if {$trailing_nulls && $i < $first_tail_null_posn} { + append outstring " " ;#map inner nulls to space + } else { + append outstring \u0000 + } + } + } else { + append outstring $ch + } + } + incr i + } + #flower.ans good test for null handling - reverse line building + #review - presence of overflow_right doesn't indicate line's trailing nulls should remain. + #The cells could have been erased? + #if {!$cp437_glyphs} { + # #if {![ansistring length $overflow_right]} { + # # set outstring [tcl::string::trimright $outstring "\u0000"] + # #} + # set outstring [tcl::string::trimright $outstring "\u0000"] + # set outstring [tcl::string::map {\u0000 " "} $outstring] + #} + + + #REVIEW + #set overflow_right [tcl::string::trimright $overflow_right "\u0000"] + #set overflow_right [tcl::string::map {\u0000 " "} $overflow_right] + + set replay_codes "" + if {[llength $understacks] > 0} { + if {$overflow_idx == -1} { + #set tail_idx [tcl::dict::size $understacks] + set tail_idx [llength $understacks] + } else { + set tail_idx [llength $undercols] + } + if {$tail_idx-1 < [llength $understacks]} { + #set replay_codes [join [lindex $understacks $tail_idx-1] ""] ;#tail replay codes + set replay_codes [punk::ansi::codetype::sgr_merge_list {*}[lindex $understacks $tail_idx-1]] ;#tail replay codes + } + if {$tail_idx-1 < [llength $understacks_gx]} { + set gx0 [lindex $understacks_gx $tail_idx-1] + if {$gx0 eq [list "gx0_on"]} { + #if it was on, turn gx0 off at the point we stop processing overlay + append outstring "\x1b(B" + } + } + } + if {[string length $overflow_right]} { + #puts stderr "remainder:$overflow_right" + } + #pdict $understacks + + if {[punk::ansi::ta::detect_sgr $outstring]} { + append outstring [punk::ansi::a] ;#without this - we would get for example, trailing backgrounds after rightmost column + + #close off any open gx? + #probably should - and overflow_right reopen? + } + + if {$opt_returnextra} { + #replay_codes is the codestack at the boundary - used for ellipsis colouring to match elided text - review + #replay_codes_underlay is the set of codes in effect at the very end of the original underlay + + #review + #replay_codes_overlay is the set of codes in effect at the very end of the original overlay (even if not all overlay was applied) + #todo - replay_codes for gx0 mode + + #overflow_idx may change during ansi & character processing + if {$overflow_idx == -1} { + set overflow_right_column "" + } else { + set overflow_right_column [expr {$overflow_idx+1}] + } + set result [tcl::dict::create\ + result $outstring\ + visualwidth [punk::ansi::printing_length $outstring]\ + instruction $instruction\ + stringlen [string length $outstring]\ + overflow_right_column $overflow_right_column\ + overflow_right $overflow_right\ + unapplied $unapplied\ + unapplied_list $unapplied_list\ + insert_mode $insert_mode\ + autowrap_mode $autowrap_mode\ + crm_mode $crm_mode\ + reverse_mode $reverse_mode\ + insert_lines_above $insert_lines_above\ + insert_lines_below $insert_lines_below\ + cursor_saved_position $cursor_saved_position\ + cursor_saved_attributes $cursor_saved_attributes\ + cursor_column $cursor_column\ + cursor_row $cursor_row\ + expand_right $opt_expand_right\ + replay_codes $replay_codes\ + replay_codes_underlay $replay_codes_underlay\ + replay_codes_overlay $replay_codes_overlay\ + pm_list $pm_list\ + ] + if {$opt_returnextra == 1} { + #puts stderr "renderline: $result" + return $result + } else { + #human/debug - map special chars to visual glyphs + set viewop VIEW + switch -- $opt_returnextra { + 2 { + #codes and character data + set viewop VIEWCODES ;#ansi colorisation of codes - green for SGR, blue/blue reverse for cursor_save/cursor_restore, cyan for movements, orange for others + } + 3 { + set viewop VIEWSTYLE ;#ansi colorise the characters within the output with preceding codes, stacking codes only within each dict value - may not be same SGR effect as the effect in-situ. + } + } + tcl::dict::set result result [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result result]] + tcl::dict::set result overflow_right [ansistring VIEW -lf 1 -vt 1 [tcl::dict::get $result overflow_right]] + tcl::dict::set result unapplied [ansistring VIEW -lf 1 -vt 1 [tcl::dict::get $result unapplied]] + tcl::dict::set result unapplied_list [ansistring VIEW -lf 1 -vt 1 [tcl::dict::get $result unapplied_list]] + tcl::dict::set result replay_codes [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result replay_codes]] + tcl::dict::set result replay_codes_underlay [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result replay_codes_underlay]] + tcl::dict::set result replay_codes_overlay [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result replay_codes_overlay]] + tcl::dict::set result cursor_saved_attributes [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result cursor_saved_attributes]] + return $result + } + } else { + #puts stderr "renderline returning: result $outstring instruction $instruction unapplied $unapplied overflow_right $overflow_right" + return $outstring + } + #return [join $out ""] + } + + #*** !doctools + #[list_end] [comment {--- end definitions namespace overtype ---}] +} + +tcl::namespace::eval overtype::piper { + proc overcentre {args} { + if {[llength $args] < 2} { + error {usage: ?-bias left|right? ?-transparent [0|1|]? ?-exposed1 ? ?-exposed2 ? ?-overflow [1|0]? overtext pipelinedata} + } + lassign [lrange $args end-1 end] over under + set argsflags [lrange $args 0 end-2] + tailcall overtype::centre {*}$argsflags $under $over + } + proc overleft {args} { + if {[llength $args] < 2} { + error {usage: ?-startcolumn ? ?-transparent [0|1|]? ?-exposed1 ? ?-exposed2 ? ?-overflow [1|0]? overtext pipelinedata} + } + lassign [lrange $args end-1 end] over under + set argsflags [lrange $args 0 end-2] + tailcall overtype::left {*}$argsflags $under $over + } +} + + +# -- --- --- --- --- --- --- --- --- --- --- +proc overtype::transparentline {args} { + foreach {under over} [lrange $args end-1 end] break + set argsflags [lrange $args 0 end-2] + set defaults [tcl::dict::create\ + -transparent 1\ + -exposed 1 " "\ + -exposed 2 " "\ + ] + set newargs [tcl::dict::merge $defaults $argsflags] + tailcall overtype::renderline {*}$newargs $under $over +} +#renderline may not make sense as it is in the long run for blocks of text - but is handy in the single-line-handling form anyway. +# We are trying to handle ansi codes in a block of text which is acting like a mini-terminal in some sense. +#We can process standard cursor moves such as \b \r - but no way to respond to other cursor movements e.g moving to other lines. +# +tcl::namespace::eval overtype::piper { + proc renderline {args} { + if {[llength $args] < 2} { + error {usage: ?-start ? ?-transparent [0|1|]? ?-overflow [1|0]? overtext pipelinedata} + } + foreach {over under} [lrange $args end-1 end] break + set argsflags [lrange $args 0 end-2] + tailcall overtype::renderline {*}$argsflags $under $over + } +} +interp alias "" piper_renderline "" overtype::piper::renderline + +#intended primarily for single grapheme - but will work for multiple +#WARNING: query CAN contain ansi or newlines - but if cache was not already set manually,the answer will be incorrect! +#We deliberately allow this for PM/SOS attached within a column +#(a cache of ansifreestring_width calls - as these are quite regex heavy) +proc overtype::grapheme_width_cached {ch} { + variable grapheme_widths + if {[tcl::dict::exists $grapheme_widths $ch]} { + return [tcl::dict::get $grapheme_widths $ch] + } + set width [punk::char::ansifreestring_width $ch] + tcl::dict::set grapheme_widths $ch $width + return $width +} + + + +proc overtype::test_renderline {} { + set t \uFF5E ;#2-wide tilde + set u \uFF3F ;#2-wide underscore + set missing \uFFFD + return [list $t $u A${t}B] +} + +#maintenance warning +#same as textblock::size - but we don't want that circular dependency +#block width and height can be tricky. e.g \v handled differently on different terminal emulators and can affect both +proc overtype::blocksize {textblock} { + if {$textblock eq ""} { + return [tcl::dict::create width 0 height 1] ;#no such thing as zero-height block - for consistency with non-empty strings having no line-endings + } + if {[tcl::string::first \t $textblock] >= 0} { + if {[info exists punk::console::tabwidth]} { + set tw $::punk::console::tabwidth + } else { + set tw 8 + } + set textblock [textutil::tabify::untabify2 $textblock $tw] + } + #ansistrip on entire block in one go rather than line by line - result should be the same - review - make tests + if {[punk::ansi::ta::detect $textblock]} { + set textblock [punk::ansi::ansistrip $textblock] + } + if {[tcl::string::last \n $textblock] >= 0} { + set num_le [expr {[tcl::string::length $textblock]-[tcl::string::length [tcl::string::map {\n {}} $textblock]]}] ;#faster than splitting into single-char list + set width [tcl::mathfunc::max {*}[lmap v [split $textblock \n] {::punk::char::ansifreestring_width $v}]] + } else { + set num_le 0 + set width [punk::char::ansifreestring_width $textblock] + } + #our concept of block-height is likely to be different to other line-counting mechanisms + set height [expr {$num_le + 1}] ;# one line if no le - 2 if there is one trailing le even if no data follows le + + return [tcl::dict::create width $width height $height] ;#maintain order in 'image processing' standard width then height - caller may use lassign [dict values [blocksize ]] width height +} + +tcl::namespace::eval overtype::priv { + variable cache_is_sgr [tcl::dict::create] + + #we are likely to be asking the same question of the same ansi codes repeatedly + #caching the answer saves some regex expense - possibly a few uS to lookup vs under 1uS + #todo - test if still worthwhile after a large cache is built up. (limit cache size?) + proc is_sgr {code} { + variable cache_is_sgr + if {[tcl::dict::exists $cache_is_sgr $code]} { + return [tcl::dict::get $cache_is_sgr $code] + } + set answer [punk::ansi::codetype::is_sgr $code] + tcl::dict::set cache_is_sgr $code $answer + return $answer + } + # better named render_to_unapplied? + proc render_unapplied {overlay_grapheme_control_list gci} { + upvar idx_over idx_over + upvar unapplied unapplied + upvar unapplied_list unapplied_list ;#maintaining as a list allows caller to utilize it without having to re-split + upvar overstacks overstacks + upvar overstacks_gx overstacks_gx + upvar overlay_grapheme_control_stacks og_stacks + + #set unapplied [join [lrange $overlay_grapheme_control_list $gci+1 end]] + set unapplied "" + set unapplied_list [list] + #append unapplied [join [lindex $overstacks $idx_over] ""] + #append unapplied [punk::ansi::codetype::sgr_merge_list {*}[lindex $overstacks $idx_over]] + set sgr_merged [punk::ansi::codetype::sgr_merge_list {*}[lindex $og_stacks $gci]] + if {$sgr_merged ne ""} { + lappend unapplied_list $sgr_merged + } + switch -- [lindex $overstacks_gx $idx_over] { + "gx0_on" { + lappend unapplied_list "\x1b(0" + } + "gx0_off" { + lappend unapplied_list "\x1b(B" + } + } + + foreach gc [lrange $overlay_grapheme_control_list $gci+1 end] { + lassign $gc type item + #types g other sgr gx0 + if {$type eq "gx0"} { + if {$item eq "gx0_on"} { + lappend unapplied_list "\x1b(0" + } elseif {$item eq "gx0_off"} { + lappend unapplied_list "\x1b(B" + } + } else { + lappend unapplied_list $item + } + } + set unapplied [join $unapplied_list ""] + } + + #clearer - renders the specific gci forward as unapplied - prefixed with it's merged sgr stack + proc render_this_unapplied {overlay_grapheme_control_list gci} { + upvar idx_over idx_over + upvar unapplied unapplied + upvar unapplied_list unapplied_list + upvar overstacks overstacks + upvar overstacks_gx overstacks_gx + upvar overlay_grapheme_control_stacks og_stacks + + #set unapplied [join [lrange $overlay_grapheme_control_list $gci+1 end]] + set unapplied "" + set unapplied_list [list] + + set sgr_merged [punk::ansi::codetype::sgr_merge_list {*}[lindex $og_stacks $gci]] + if {$sgr_merged ne ""} { + lappend unapplied_list $sgr_merged + } + switch -- [lindex $overstacks_gx $idx_over] { + "gx0_on" { + lappend unapplied_list "\x1b(0" + } + "gx0_off" { + lappend unapplied_list "\x1b(B" + } + } + + foreach gc [lrange $overlay_grapheme_control_list $gci end] { + lassign $gc type item + #types g other sgr gx0 + if {$type eq "gx0"} { + if {$item eq "gx0_on"} { + lappend unapplied_list "\x1b(0" + } elseif {$item eq "gx0_off"} { + lappend unapplied_list "\x1b(B" + } + } else { + lappend unapplied_list $item + } + } + set unapplied [join $unapplied_list ""] + } + proc render_delchar {i} { + upvar outcols o + upvar understacks ustacks + upvar understacks_gx gxstacks + set nxt [llength $o] + if {$i < $nxt} { + set o [lreplace $o $i $i] + set ustacks [lreplace $ustacks $i $i] + set gxstacks [lreplace $gxstacks $i $i] + } elseif {$i == 0 || $i == $nxt} { + #nothing to do + } else { + puts stderr "render_delchar - attempt to delchar at index $i >= number of outcols $nxt - shouldn't happen" + } + } + proc render_erasechar {i count} { + upvar outcols o + upvar understacks ustacks + upvar understacks_gx gxstacks + upvar replay_codes_overlay replay + #ECH clears character attributes from erased character positions + #ECH accepts 0 or empty parameter, which is equivalent to 1. Caller of render_erasechar should do that mapping and only supply 1 or greater. + if {![tcl::string::is integer -strict $count] || $count < 1} { + error "render_erasechar count must be integer >= 1" + } + set start $i + set end [expr {$i + $count -1}] + #we restrict ECH to current line - as some terminals do - review - is that the only way it's implemented? + if {$i > [llength $o]-1} { + return + } + if {$end > [llength $o]-1} { + set end [expr {[llength $o]-1}] + } + set num [expr {$end - $start + 1}] + set o [lreplace $o $start $end {*}[lrepeat $num \u0000]] ;#or space? + #DECECM ??? + set ustacks [lreplace $ustacks $start $end {*}[lrepeat $num [list $replay]]] + set gxstacks [lreplace $gxstacks $start $end {*}[lrepeat $num [list]]] ;# ??? review + return + } + proc render_setchar {i c } { + upvar outcols o + lset o $i $c + } + + #Initial usecase is for old-terminal hack to add PM-wrapped \b + #review - can be used for other multibyte sequences that occupy one column? + #combiners? diacritics? + proc render_append_to_char {i c} { + upvar outcols o + if {$i > [llength $o]-1} { + error "render_append_to_char cannot append [ansistring VIEW -lf 1 -nul 1 $c] to existing char at index $i while $i >= llength outcols [llength $o]" + } + set existing [lindex $o $i] + if {$existing eq "\0"} { + lset o $i $c + } else { + lset o $i $existing$c + } + } + #is actually addgrapheme? + proc render_addchar {i c sgrstack gx0stack {insert_mode 0}} { + upvar outcols o + upvar understacks ustacks + upvar understacks_gx gxstacks + + # -- --- --- + #this is somewhat of a hack.. probably not really the equivalent of proper reverse video? review + #we should ideally be able to reverse the video of a sequence that already includes SGR reverse/noreverse attributes + upvar reverse_mode do_reverse + #if {$do_reverse} { + # lappend sgrstack [a+ reverse] + #} else { + # lappend sgrstack [a+ noreverse] + #} + + #JMN3 + if {$do_reverse} { + #note we can't just look for \x1b\[7m or \x1b\[27m + # it may be a more complex sequence like \x1b\[0\;\;7\;31m etc + + set existing_reverse_state 0 + set codeinfo [punk::ansi::codetype::sgr_merge $sgrstack -info 1] + set codestate_reverse [dict get $codeinfo codestate reverse] + switch -- $codestate_reverse { + 7 { + set existing_reverse_state 1 + } + 27 { + set existing_reverse_state 0 + } + "" { + } + } + if {$existing_reverse_state == 0} { + set rflip [a+ reverse] + } else { + #reverse of reverse + set rflip [a+ noreverse] + } + #note that mergeresult can have multiple esc (due to unmergeables or non sgr codes) + set sgrstack [list [dict get $codeinfo mergeresult] $rflip] + #set sgrstack [punk::ansi::codetype::sgr_merge [list [dict get $codeinfo mergeresult] $rflip]] + } + + # -- --- --- + + set nxt [llength $o] + if {!$insert_mode} { + if {$i < $nxt} { + #These lists must always be in sync + lset o $i $c + } else { + lappend o $c + } + if {$i < [llength $ustacks]} { + lset ustacks $i $sgrstack + lset gxstacks $i $gx0stack + } else { + lappend ustacks $sgrstack + lappend gxstacks $gx0stack + } + } else { + #insert of single-width vs double-width when underlying is double-width? + if {$i < $nxt} { + set o [linsert $o $i $c] + } else { + lappend o $c + } + if {$i < [llength $ustacks]} { + set ustacks [linsert $ustacks $i $sgrstack] + set gxstacks [linsert $gxstacks $i $gx0stack] + } else { + lappend ustacks $sgrstack + lappend gxstacks $gx0stack + } + } + } + +} + + + +# -- --- --- --- --- --- --- --- --- --- --- +tcl::namespace::eval overtype { + interp alias {} ::overtype::center {} ::overtype::centre +} + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Ready +package provide overtype [tcl::namespace::eval overtype { + variable version + set version 1.6.6 +}] +return + +#*** !doctools +#[manpage_end] diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/test/tomlish-1.1.5.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/test/tomlish-1.1.5.tm index 536e3fa3..35de5e70 100644 Binary files a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/test/tomlish-1.1.5.tm and b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/test/tomlish-1.1.5.tm differ diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/tomlish-1.1.6.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/tomlish-1.1.6.tm new file mode 100644 index 00000000..dddcd0bb --- /dev/null +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/tomlish-1.1.6.tm @@ -0,0 +1,8408 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) 2024 +# +# @@ Meta Begin +# Application tomlish 1.1.6 +# Meta platform tcl +# Meta license +# @@ Meta End + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# doctools header +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[manpage_begin tomlish_module_tomlish 0 1.1.6] +#[copyright "2024"] +#[titledesc {tomlish toml parser}] [comment {-- Name section and table of contents description --}] +#[moddesc {tomlish}] [comment {-- Description at end of page heading --}] +#[require tomlish] +#[keywords module parsing toml configuration] +#[description] +#[para] tomlish is an intermediate representation of toml data in a tree structure (tagged lists representing type information) +#[para] The design goals are for tomlish to be whitespace and comment preserving ie byte-for byte preservation during roundtrips from toml to tomlish and back to toml +#[para] The tomlish representation can then be converted to a Tcl dict structure or to other formats such as json, +#[para] although these other formats are generally unlikely to retain whitespace or comments +#[para] The other formats also won't preserve roundtripability e.g \t and a literal tab coming from a toml file will be indistinguishable. +#[para] A further goal is to allow at least a useful subset of in-place editing operations which also preserve whitespace and comments. +#[para] e.g leaf key value editing, and table reordering/sorting, key-renaming at any level, key insertions/deletions +#[para] The API for editing (tomldoc object?) may require explicit setting of type if accessing an existing key +#[para] e.g setting a key that already exists and is a different type (especially if nested structure such as a table or array) +#[para] will need a -type option (-force ?) to force overriding with another type such as an int. + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section Overview] +#[para] overview of tomlish +#[subsection Concepts] +#[para] - + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[subsection dependencies] +#[para] packages used by tomlish +#[list_begin itemized] + +package require Tcl 8.6- +package require struct::stack +package require logger + +#*** !doctools +#[item] [package {Tcl 8.6-}] +#[item] [package {struct::stack}] + +#limit ourselves to clear, destroy, peek, pop, push, rotate, or size (e.g v 1.3 does not implement 'get') + + +#*** !doctools +#[list_end] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section API] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# Base namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval tomlish { + namespace export {[a-z]*}; # Convention: export all lowercase + variable types + + #*** !doctools + #[subsection {Namespace tomlish}] + #[para] Core API functions for tomlish + #[list_begin definitions] + + #default interp recursionlimit of 1000 is insufficient to pass 1000 deep nested structures as in certain toml tests. + #e.g https://github.com/iarna/toml-spec-tests/tree/latest/values + #1000 seems deep for a 'configuration' format - but toml sometimes used for other serialisation purposes. + #todo - review + set existing_recursionlimit [interp recursionlimit {}] + if {$existing_recursionlimit < 5000} { + interp recursionlimit {} 5000 + } + + #IDEAS: + # since get_toml produces tomlish with whitespace/comments intact: + # tomldoc object - allow (at least basic?) editing of toml whilst preserving comments & whitespace + # - setKey (set leaf only to value) how to specify type? -type option? - whole array vs index into arrays and further nested objects? - option for raw toml additions? + # - separate addKey?? + # - deleteKey (delete leaf) + # - deleteTable (delete table - if only has leaves? - option to delete with child tables?) + # - set/add Table? - position in doc based on existing tables/subtables? + + #The tomlish intermediate representation allows things such as sorting the toml document by table name or other re-ordering of tables - + # because the tables include subkeys, comments and newlines within their structure - those elements all come along with it nicely during reordering. + #The same goes for the first newline following a keyval e.g x=1 \ny=2\n\n + #The newline is part of the keyval structure so makes reordering easier + #example from_toml "a=1\nb=2\n\n\n" + # 0 = TOMLISH + # 1 = KEY a = {INT 1} {NEWLINE lf} + # 2 = NEWLINE lf + # 3 = KEY b = {INT 2} {NEWLINE lf} + # 4 = NEWLINE lf + # 5 = NEWLINE lf + + #This reordering idea is complicated by the nature of tablearrays - especially as a table header references last tablearrayname, + # and duplicate table headers are allowed in that context. + #e.g + #[[fruits]] + # name="apple" + # [fruits.metadata] + # id=1 + # + #[unrelated1] + # + #[[fruits]] + # name="pear" + # + #[unrelated2] + # silly="ordering" + # + #[fruits.metadata] + #id=2 + #The TABLEARRAY record can't be completely selfcontained on the default parsing mechanism - because it is legal (though not recommended) to have unrelated tables in between. + #If we were to 'insert' later related records (such as the 2nd [fruits.metadata] above) into the TABLEARRAY structure - then, even though it might produce 'nicer' toml, + # we would lose roundtripability toml->tomlish->toml + # ----------------------------------------------------- + #REVIEW + #todo - some sort of 'normalize'/'grouping' function on tomlish that at least makes records self-contained, and perhaps then (optionally) reorders resulting records sensibly. + #such a function on the tomlish may work - although it would be unwise to duplicate the validation aspects of dict::from_tomlish + #The most practical way might be to use dict::from_tomlish followed by from_dict - but that would lose comment info and formatting. + #In the above example - The decision by the toml author to put [unrelated1] between related tablearrays should be respected, + #but the positioning of [unrelated2] between a tablearray and one of its contained tables is suspect. + #Both [fruits.metadata] table records should theoretically be added as children to their corresponding [[fruits]] tablearray record in the tomlish. (just as their name keys are) + # ----------------------------------------------------- + + + + #ARRAY is analogous to a Tcl list + #TABLE is analogous to a Tcl dict + #WS = inline whitespace + #KEY = bare key and value + #DQKEY = double quoted key and value + #SQKEY = single quoted key and value + #ITABLE = inline table (*can* be anonymous table) + # inline table values immediately create a table with the opening brace + # inline tables are fully defined between their braces, as are dotted-key subtables defined within + # No additional subtables or arrays of tables may be defined within an inline table after the ending brace - they must be entirely self-contained + + set tags [list TOMLISH BOM ARRAY TABLE ITABLE TABLEARRAY WS NEWLINE COMMENT DOTTEDKEY KEY DQKEY SQKEY STRING STRINGPART MULTISTRING LITERAL LITERALPART MULTILITERAL INT FLOAT BOOL] + #DDDD + lappend tags {*}[list\ + DATETIME\ + DATETIME-LOCAL\ + DATE-LOCAL\ + TIME-LOCAL\ + ] + + #removed - ANONTABLE + #tomlish v1.0 should accept arbitrary 64-bit signed ints (from -2^63 to 2^63-1) + #we will restrict to this range for compatibility for now - although Tcl can handle larger (arbitrarily so?) + #todo - configurable - allow empty string for 'unlimited' + set min_int -9223372036854775808 ;#-2^63 + set max_int +9223372036854775807 ;#2^63-1 + + proc Dolog {lvl txt} { + #return "$lvl -- $txt" + set msg "[clock format [clock seconds] -format "%Y-%m-%dT%H:%M:%S"] tomlish '$txt'" + puts stderr $msg + } + logger::initNamespace ::tomlish + foreach lvl [logger::levels] { + interp alias {} tomlish_log_$lvl {} ::tomlish::Dolog $lvl + log::logproc $lvl tomlish_log_$lvl + } + + + proc tags {} { + return $::tomlish::tags + } + + proc get_dottedkey_info {dottedkeyrecord} { + set key_hierarchy [list] + set key_hierarchy_raw [list] + if {[lindex $dottedkeyrecord 0] ne "DOTTEDKEY"} { + error "tomlish::get_dottedkey_info error. Supplied list doesn't appear to be a DOTTEDKEY (tag: [lindex $dottedkeyrecord 0])" + } + set compoundkeylist [lindex $dottedkeyrecord 1] + set expect_sep 0 + foreach part $compoundkeylist { + set parttag [lindex $part 0] + if {$parttag eq "WS"} { + continue + } + if {$expect_sep} { + if {$parttag ne "DOTSEP"} { + error "DOTTEDKEY missing dot separator between parts. '$dottedkeyrecord'" + } + set expect_sep 0 + } else { + set val [lindex $part 1] + switch -exact -- $parttag { + KEY { + lappend key_hierarchy $val + lappend key_hierarchy_raw $val + } + DQKEY { + #REVIEW unescape or not? + #JJJJ + lappend key_hierarchy [::tomlish::utils::unescape_string $val] + lappend key_hierarchy_raw \"$val\" + } + SQKEY { + lappend key_hierarchy $val + lappend key_hierarchy_raw "'$val'" + } + default { + error "tomlish::get_dottedkey_info DOTTED key unexpected part '$parttag' - ensure dot separator is between key parts. '$compoundkeylist'" + } + } + set expect_sep 1 + } + } + return [dict create keys $key_hierarchy keys_raw $key_hierarchy_raw] + } + + #helper function for tomlish::dict::from_tomlish + proc _get_keyval_value {keyval_element} { + #e.g + #DOTTEDKEY {{KEY a} {WS { }}} = {WS { }} {ARRAY {INT 1} SEP {ITABLE {DOTTEDKEY {{KEY x}} = {INT 1} SEP} {DOTTEDKEY {{KEY y}} = {INT 2}}}} + + log::notice ">>> _get_keyval_value from '$keyval_element'<<<" + #find the value (or 2 values if space separated datetime - and stitch back into one) + # 3 is the earliest index at which the value could occur (depending on whitespace) + if {[lindex $keyval_element 2] ne "="} { + error "tomlish _get_keyval_value keyval_element doesn't seem to be a properly structured { = } list\n $keyval_element" + } + + #review + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info + } else { + set tablenames_info [dict create] ;#keys are @@ paths {@@parenttable @@arrayable @@etc} corresponding to parenttable.arraytable[].etc + #value is a dict with keys such as ttype, tdefined + } + set sublist [lrange $keyval_element 3 end] ;# rhs of = + + set values [list] + set value_posns [list] + set posn 0 + foreach sub $sublist { + #note that a barekey/dquotedkey won't occur directly inside a barekey/dquotedkey + #DDDD + switch -exact -- [lindex $sub 0] { + STRING - LITERAL - MULTISTRING - MULTILITERAL - INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - TABLE - ARRAY - ITABLE { + lappend values $sub + lappend value_posns $posn + } + DOTTEDKEY { + #we should never see DOTTEDKEY as a toplevel element on RHS + #sanity check in case manually manipulated tomlish - or something went very wrong + set msg "tomlish::_get_keyval_value Unexpected toplevel value element DOTTEDKEY after =" + return -code error -errorcode {TOMLISH SYNTAX UNEXPECTEDDOTTEDKEYRHS} $msg + } + WS - NEWLINE - COMMENT {} + SEP {} + default { + set msg "tomlish::_get_keyval_value Unexpected toplevel value element [lindex $sub 0] after =" + return -code error -errorcode {TOMLISH SYNTAX UNEXPECTED} $msg + } + } + incr posn + } + switch -- [llength $values] { + 0 { + error "tomlish Failed to find value element in KEY. '$keyval_element'" + } + 1 { + lassign [lindex $values 0] type value + } + 2 { + #we generally expect a single 'value' item on RHS of = + #(ignoring WS,NEWLINE,SEP + #(either a simple type, or a container which has multiple values inside) + #exception for space separated datetime which is two toplevel values + + #validate than exactly single space was between the two values + lassign $value_posns p1 p2 + if {$p2 != $p1 +2} { + #sanity check + #can probably only get here through manual manipulation of the tomlish list to an unprocessable form + error "tomlish KEY appears to have more than one part - but not separated by whitespace - invalid '$keyval_element'" + } + set between_token [lindex $sublist $p1+1] + if {[lindex $between_token 1] ne " "} { + error "tomlish KEY in 2 parts is not separated by a single space - cannot consider for datetime '$keyval_element'" + } + lassign [lindex $values 0] type_d1 value_d1 + lassign [lindex $values 1] type_d2 value_d2 + #DDDD + if {$type_d1 ne "DATE-LOCAL" || $type_d2 ni {DATETIME TIME-LOCAL}} { + #we reuse DATETIME tag for standalone time with tz offset (or zZ) + error "tomlish KEY in 2 parts does not appear to be datetime '$keyval_element'" + } + if {$type_d2 eq "TIME-LOCAL"} { + set type DATETIME-LOCAL + } else { + #extra check that 2nd part is actually a time + if {![tomlish::utils::is_timepart $value_d2]} { + error "tomlish KEY in 2 parts does not appear to be datetime. (part 2 not a time value) '$keyval_element'" + } + set type DATETIME + } + set value "${value_d1}T${value_d2}" + } + default { + error "tomlish Found multiple value elements in KEY, expected one. (or 2 for space-separated datetime) '$keyval_element'" + } + } + set sub_tablenames_info [dict create] + switch -exact -- $type { + INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + #DDDD + #simple (non-container, no-substitution) datatype + set result [list type $type value $value] + } + STRING - STRINGPART { + #JJJ + #!!! review + #set result [list type $type value [::tomlish::utils::unescape_string $value]] + set result [list type $type value $value] + } + LITERAL - LITERALPART { + #REVIEW + set result [list type $type value $value] + } + TABLE { + #invalid? + error "tomlish _get_keyval_value invalid to have type TABLE on rhs of =" + } + ITABLE { + #This one should not be returned as a type value structure! + # + set prev_tablenames_info $tablenames_info + set tablenames_info [dict create] + set result [::tomlish::dict::from_tomlish [ list [lindex $values 0] ]] + set sub_tablenames_info $tablenames_info + set tablenames_info $prev_tablenames_info + } + ARRAY { + #we need to recurse to get the corresponding dict for the contained item(s) + #pass in the whole [lindex $values 0] (type val) - not just the $value! + set prev_tablenames_info $tablenames_info + set tablenames_info [dict create] + set result [list type $type value [ ::tomlish::dict::from_tomlish [ list [lindex $values 0] ] ]] + set sub_tablenames_info $tablenames_info + set tablenames_info $prev_tablenames_info + } + MULTISTRING - MULTILITERAL { + #review - mapping these to STRING might make some conversions harder? + #if we keep the MULTI - we know we have to look for newlines for example when converting to json + #without specific types we'd have to check every STRING - and lose info about how best to map chars within it + set result [list type $type value [ ::tomlish::dict::from_tomlish [ list [lindex $values 0] ] ]] + } + default { + error "tomlish Unexpected value type '$type' found in keyval '$keyval_element'" + } + } + return [dict create result $result tablenames_info $sub_tablenames_info] + } + + + proc to_dict {tomlish} { + tomlish::dict::from_tomlish $tomlish + } + + + + proc _from_dictval_tomltype {parents tablestack keys typeval} { + set type [dict get $typeval type] + set val [dict get $typeval value] + #These are the restricted sets of typed used in the tomlish::dict representation + #They are a subset of the types in tomlish: data types plus ARRAY, arranged in a dictionary form. + #The container types: ITABLE, TABLE, TABLEARRAY are not used as they are represented as dictionary keys and ARRAY items. + #The WS, COMMENT, and NEWLINE elements are also unrepresented in the dict structure. + switch -- $type { + ARRAY { + set subitems [list] + foreach item $val { + lappend subitems [_from_dictval [list {*}$parents ARRAY] $tablestack $keys $item] SEP + } + if {[lindex $subitems end] eq "SEP"} { + set subitems [lrange $subitems 0 end-1] + } + return [list ARRAY {*}$subitems] + } + ITABLE { + error "not applicable" + if {$val eq ""} { + return ITABLE + } else { + return [_from_dictval [list {*}$parents ITABLE] $tablestack $keys $val] + } + } + STRING { + #JSJS + #if our dict came from json - we have already decided what type of STRING/LITERAL etc to use when building the dict + + #do not validate like this - important that eg json val\\ue -> dict val\ue -> tomlish/toml val\\ue + #see toml-tests + #if {![tomlish::utils::rawstring_is_valid_tomlstring $val]} { + # #todo? + # return -code error -errorcode {TOML SYNTAX INVALIDSTRING} "Unescaped controls in string or invalid escapes" + #} + return [list STRING [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $val]] + } + MULTISTRING { + #value is a raw string that isn't encoded as tomlish + #create a valid toml snippet with the raw value and decode it to the proper tomlish MULTISTRING format + #We need to convert controls in $val to escape sequences - except for newlines + # + #consider an *option* to reformat for long lines? (perhaps overcomplex - byte equiv - but may fold in ugly places) + #we could use a line-length limit to decide when to put in a "line ending backslash" + #and even format it with a reasonable indent so that proper CONT and WS entries are made (?) REVIEW + # + #TODO + set tomlpart "x=\"\"\"\\\n" + append tomlpart [tomlish::utils::rawstring_to_MultiBstring_with_escaped_controls $val] + append tomlpart "\"\"\"" + set tomlish [tomlish::from_toml $tomlpart] + #e.g if val = " etc\nblah" + #TOMLISH {DOTTEDKEY {{KEY x}} = {MULTISTRING CONT {NEWLINE LF} {WS { }} {STRINGPART etc} {NEWLINE lf} {STRINGPART blah} } } + #lindex 1 3 is the MULTISTRING tomlish list + return [lindex $tomlish 1 3] + } + MULTILITERAL { + #MLL string can contain newlines - but still no control chars + #todo - validate - e.g val can't contain more than 2 squotes in a row + if {[string first ''' $val] >=0} { + set msg "_from_dictval_tomltype error: more than 2 single quotes in a row found in MULTILITERAL - cannot encode dict to TOML-VALID TOMLISH" + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + } + + #rawstring_is_valid_multiliteral - allow newlines as lf or crlf - but not bare cr + if {![tomlish::utils::rawstring_is_valid_multiliteral $val]} { + return -code error -errorcode {TOML SYNTAX INVALIDMULTILITERAL} "Controls other than tab or newlines found in multiliteral" + } + + set tomlpart "x='''\n" + append tomlpart $val ''' + set tomlish [tomlish::from_toml $tomlpart] + return [lindex $tomlish 1 3] + } + LITERAL { + #from v1.0 spec - "Control characters other than tab are not permitted in a literal string" + #(This rules out raw ANSI SGR - which is somewhat restrictive - but perhaps justified for a config format + # as copy-pasting ansi to a config value is probably not always wise, and it's not something that can be + # easily input via a text editor. ANSI can go in Basic strings using the \e escape if that's accepted v1.1?) + #we could choose to change the type to another format here when encountering invalid chars - but that seems + #like too much magic. We elect to error out and require the dict to have valid data for the types it specifies. + if {[string first ' $val] >=0} { + set msg "_from_dictval_tomltype error: single quote found in LITERAL - cannot encode dict to TOML-VALID TOMLISH" + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + } + #JJJJ + if {![tomlish::utils::rawstring_is_valid_literal $val]} { + #has controls other than tab + #todo - squote? + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} "Controls other than tab found in literal" + } + return [list LITERAL $val] + } + INT { + if {![::tomlish::utils::is_int $val]} { + error "_from_dictval_tomltype error: bad INT value '$val' - cannot encode dict to TOML-VALID TOMLISH" + } + return [list INT $val] + } + FLOAT { + if {![::tomlish::utils::is_float $val]} { + error "_from_dictval_tomltype error: bad FLOAT value '$val' - cannot encode dict to TOML-VALID TOMLISH" + } + return [list FLOAT $val] + } + default { + if {$type ni [::tomlish::tags]} { + error "_from_dictval_tomltype error: Unrecognised typename '$type' in {type value } - cannot encode dict to TOML-VALID TOMLISH" + } + return [list $type $val] + } + } + } + + proc _from_dictval {parents tablestack keys vinfo} { + set k [lindex $keys end] + set K_PART [tomlish::dict::classify_rawkey $k] ;#get [list SQKEY ] + #puts stderr "---parents:'$parents' keys:'$keys' vinfo: $vinfo---" + #puts stderr "---tablestack: $tablestack---" + set result [list] + set lastparent [lindex $parents end] + if {$lastparent in [list "" do_inline]} { + if {[tomlish::dict::is_typeval $vinfo]} { + set type [dict get $vinfo type] + #treat ITABLE differently? + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] + lappend result DOTTEDKEY [list $K_PART {WS { }}] = {WS { }} $sublist {NEWLINE lf} + } else { + if {$vinfo ne ""} { + + #set result [list DOTTEDKEY [list [list KEY $k]] = ] + #set records [list ITABLE] + + set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] + + if {$lastparent eq "do_inline"} { + set result [list DOTTEDKEY [list $K_PART] =] + set records [list ITABLE] + } else { + set tname [tomlish::dict::join_and_quote_rawkey_list [list $k]] + set result [list TABLE $tname {NEWLINE lf}] + set tablestack [list {*}$tablestack [list T $k]] + set records [list] + } + + + + set lastidx [expr {[dict size $vinfo] -1}] + set dictidx 0 + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + #(SQKEY & DQKEY do not have the enclosing quotes in their returned val) + #if {[regexp {\s} $vk] || [string first . $vk] >= 0} { + # set VK_PART [list SQKEY $vk] + #} else { + # set VK_PART [list KEY $vk] + #} + if {[tomlish::dict::is_typeval $vv]} { + #type x value y + #REVIEW - we could detect if value is an array of objects, + #and depending on parent context - emit a series of TABLEARRAY records instead of a DOTTEDKEY record containing an ARRAY of objects + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] + set record [list DOTTEDKEY [list $VK_PART {WS { }}] = {WS { }} $sublist] + } else { + if {$vv eq ""} { + #experimental + if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + ::tomlish::log::notice "_from_dictval could uninline KEY $vk (tablestack:$tablestack)" + #set tname [tomlish::dict::name_from_tablestack [list {*}$tablestack [list T $vk]]] + + #we can't just join normalized keys - need keys with appropriate quotes and escapes + #set tname [join [list {*}$keys $vk] .] ;#WRONG + set tq [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] + + + ##wrong? results in TABLE within TABLE record?? todo pop? + #set record [list TABLE $tq {NEWLINE lf}] + #set tablestack [list {*}$tablestack [list T $vk]] + + #REVIEW!!! + + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + set tablestack [list {*}$tablestack [list I $vk]] + + } else { + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + set tablestack [list {*}$tablestack [list I $vk]] + } + } else { + if { 0 } { + #experiment.. sort of getting there. + if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + ::tomlish::log::notice "_from_dictval could uninline2 KEYS [list {*}$keys $vk] (tablestack:$tablestack)" + set tq [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] + set record [list TABLE $tq {NEWLINE lf}] + set tablestack [list {*}$tablestack [list T $vk]] + + #review - todo? + set dottedkey_value [_from_dictval [list {*}$parents TABLE] $tablestack [list {*}$keys $vk] $vv] + lappend record {*}$dottedkey_value + + } else { + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + } + } else { + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + } + } + } + if {$dictidx != $lastidx} { + #lappend record SEP + if {$lastparent eq "do_inline"} { + lappend record SEP + } else { + lappend record {NEWLINE lf} + } + } + if {[llength $record]} { + lappend records $record + } + incr dictidx + } + if {$lastparent eq "do_inline"} { + lappend result $records {NEWLINE lf} + } else { + lappend result {*}$records {NEWLINE lf} + } + } else { + if {$lastparent eq "do_inline"} { + lappend result DOTTEDKEY [list $K_PART] = ITABLE {NEWLINE lf} + } else { + set tname [tomlish::dict::join_and_quote_rawkey_list [list $k]] + #REVIEW + lappend result TABLE $tname {NEWLINE lf} + } + } + } + } else { + #lastparent is not toplevel "" or "do_inline" + if {[tomlish::dict::is_typeval $vinfo]} { + #type x value y + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] + lappend result {*}$sublist + } else { + if {$lastparent eq "TABLE"} { + #review + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + set dottedkey_value [_from_dictval [list {*}$parents DOTTEDKEY] $tablestack [list {*}$keys $vk] $vv] + lappend result [list DOTTEDKEY [list $VK_PART] = $dottedkey_value {NEWLINE lf}] + } + } else { + if {$vinfo ne ""} { + set lastidx [expr {[dict size $vinfo] -1}] + set dictidx 0 + set sub [list] + #REVIEW + #set result $lastparent ;#e.g sets ITABLE + set result ITABLE + set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + if {[tomlish::dict::is_typeval $vv]} { + #type x value y + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] + set record [list DOTTEDKEY [list $VK_PART] = $sublist] + } else { + if {$vv eq ""} { + #can't just uninline at this level + #we need a better method to query main dict for uninlinability at each level + # (including what's been inlined already) + #if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + # puts stderr "_from_dictval uninline2 KEY $keys" + # set tname [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] + # set record [list TABLE $tname {NEWLINE lf}] + # set tablestack [list {*}$tablestack [list T $vk]] + #} else { + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + #} + } else { + #set sub [_from_dictval ITABLE $vk $vv] + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + #set record [list DOTTEDKEY [list $VK_PART] = ITABLE $dottedkey_value] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + } + } + if {$dictidx != $lastidx} { + lappend record SEP + } + lappend result $record + incr dictidx + } + } else { + #e.g x=[{}] + log::debug "---> _from_dictval empty ITABLE x-1" + #lappend result DOTTEDKEY [list $K_PART] = ITABLE ;#wrong + lappend result ITABLE + } + } + } + } + return $result + } + + + proc from_dict {d} { + #consider: + # t1={a=1,b=2} + # x = 1 + + # from_dict gives us: t1 {a {type INT value 1} b {type INT value 2}} x {type INT value 1} + + #If we represent t1 as an expanded table we get + # [t1] + # a=1 + # b=2 + # x=1 + # --- which is incorrect - as x was a toplevel key like t1! + #This issue doesn't occur if x is itself an inline table + # t1={a=1,b=2} + # x= {no="problem"} + # + # (or if we were to reorder x to come before t1) + + #ie the order of the dict elements influences how the toml can be represented. + + #As the dictionary form doesn't distinguish the structure used to create tables {[table1]\nk=v} vs inline {table1={k=v}} + #Without a solution, from_dict would have to always produce the inline form for toplevel tables unless we allowed re-ordering, + #which is unpreferred here. + + #A possible solution: + #scan the top level to see if all (trailing) elements are themselves dicts + # (ie not of form {type XXX value yyy}) + # + # A further point is that if all root level values are at the 'top' - we can treat lower table-like structures as {[table]} elements + #ie we don't need to force do_inline if all the 'simple' keys are before any compound keys + + #set root_has_values 0 + #approach 1) - the naive approach - forces inline when not always necessary + #dict for {k v} $d { + # if {[llength $v] == 4 && [lindex $v 0] eq "type"} { + # set root_has_values 1 + # break + # } + #} + + + #approach 2) - track the position of last {type x value y} in the dictionary built by dict::from_tomlish + # - still not perfect. Inlines dotted tables unnecessarily + #This means from_dict doesn't produce output optimal for human editing. + set last_simple [tomlish::dict::last_tomltype_posn $d] + + + ## set parent "do_inline" ;#a value used in _from_dictval to distinguish from "" or other context based parent values + #Any keys that are themselves tables - will need to be represented inline + #to avoid reordering, or incorrect assignment of plain values to the wrong table. + + ## set parent "" + #all toplevel keys in the dict structure can represent subtables. + #we are free to use {[tablename]\n} syntax for toplevel elements. + + + set tomlish [list TOMLISH] + set dictposn 0 + set tablestack [list [list T root]] ;#todo + dict for {t tinfo} $d { + if {$last_simple > $dictposn} { + set parents [list do_inline] + } else { + set parents [list ""] + } + set keys [list $t] + #review - where to make decision on + # DOTTEDKEY containing array of objs + #vs + # list of TABLEARRAY records + #At least for the top + set trecord [_from_dictval $parents $tablestack $keys $tinfo] + lappend tomlish $trecord + incr dictposn + } + return $tomlish + } + + proc typedjson_to_toml {json} { + #*** !doctools + #[call [fun typedjson_to_toml] [arg json]] + #[para] + + set tomlish [::tomlish::from_dict_from_typedjson $json] + lappend tomlish [list NEWLINE lf] + set toml [::tomlish::to_toml $tomlish] + } + + set json1 {{ "a": {"type": "integer", "value": "42"}}} + set json2 {{ + "a": {"type": "integer", "value": "42"}, + "b": {"type": "string", "value": "test"} + }} + set json3 { +{ + "best-day-ever": {"type": "datetime", "value": "1987-07-05T17:45:00Z"}, + "numtheory": { + "boring": {"type": "bool", "value": "false"}, + "perfection": [ + {"type": "integer", "value": "6"}, + {"type": "integer", "value": "28"}, + {"type": "integer", "value": "496"} + ] + } +} + } + + set json4 { +{ + "best-day-ever": {"type": "datetime", "value": "1987-07-05T17:45:00Z"}, + "numtheory": { + "boring": {"type": "bool", "value": "false"}, + "perfection": [ + {"type": "integer", "value": "6"}, + {"type": "integer", "value": "28"}, + {"type": "integer", "value": "496"} + ] + }, + "emptyobj": {}, + "emptyarray": [] +} + } + + set json5 { +{ + "a": { + " x ": {}, + "b.c": {}, + "d.e": {}, + "b": { + "c": {} + } + } +} + } + + #surrogate pair face emoji + set json6 { +{ + "surrogatepair": {"type": "string", "value": "\uD83D\uDE10"} +} + } + + + set json7 { +{ + "escapes": {"type": "string", "value": "val\\ue"} +} + } + + + proc from_dict_from_typedjson {json} { + set d [tomlish::dict::from_typedjson $json] + tomlish::from_dict $d ;#return tomlish + } + + + proc toml_to_typedjson {toml} { + set tomlish [::tomlish::from_toml $toml] + set d [tomlish::dict::from_tomlish $tomlish] + #full validation only occurs by re-encoding dict to tomlish + set test [tomlish::from_dict $d] + + set h [tomlish::typedhuddle::from_dict $d] + #huddle jsondump $h + tomlish::huddle::jsondumpraw $h + } + + #proc get_json {tomlish} { + # package require fish::json + # set d [::tomlish::dict::from_tomlish $tomlish] + + # #return [::tomlish::dict_to_json $d] + # return [fish::json::from "struct" $d] + #} + + #return a Tcl list of tomlish tokens + #i.e get a standard list of all the toml terms in string $s + #where each element of the list is a *tomlish* term.. i.e a specially 'tagged' Tcl list. + #(simliar to a tcl 'Huddle' - but also supporting whitespace preservation) + # ---------------------------------------------------------------------------------------------- + # NOTE: the production of tomlish from toml source doesn't indicate the toml source was valid!!! + # e.g we deliberately don't check certain things such as duplicate table declarations here. + # ---------------------------------------------------------------------------------------------- + #Part of the justification for this is that as long as the syntax is toml shaped - we can load files which violate certain rules and allow programmatic manipulation. + # (e.g perhaps a toml editor to highlight violations for fixing) + # A further stage is then necessary to load the tomlish tagged list into a data structure more suitable for efficient query/reading. + # e.g dicts or an object oriented structure + #Note also - *no* escapes in quoted strings are processed. This is up to the datastructure stage + #e.g dict::from_tomlish will substitute \r \n \uHHHH \UHHHHHHH etc + #This is important for tomlish to maintain the ability to perform competely lossless round-trips from toml to tomlish and back to toml. + # (which is handy for testing as well as editing some part of the structure with absolutely no effect on other parts of the document) + #If we were to unescape a tab character for example + # - we have no way of knowing if it was originally specified as \t \u0009 or \U00000009 or directly as a tab character. + # For this reason, we also do absolutely no line-ending transformations based on platform. + # All line-endings are maintained as is, and even a file with mixed lf crlf line-endings will be correctly interpreted and can be 'roundtripped' + + proc from_toml {args} { + + namespace upvar ::tomlish::parse s s + set s [join $args \n] + namespace upvar ::tomlish::parse i i + set i 0 ;#index into s + + namespace upvar ::tomlish::parse is_parsing is_parsing + set is_parsing 1 + + if {[info command ::tomlish::parse::spacestack] eq "::tomlish::parse::spacestack"} { + tomlish::parse::spacestack destroy + } + struct::stack ::tomlish::parse::spacestack + + namespace upvar ::tomlish::parse last_space_action last_space_action + namespace upvar ::tomlish::parse last_space_type last_space_type + + namespace upvar ::tomlish::parse tok tok + set tok "" + + namespace upvar ::tomlish::parse type type + namespace upvar ::tomlish::parse tokenType tokenType + ::tomlish::parse::set_tokenType "" + namespace upvar ::tomlish::parse tokenType_list tokenType_list + set tokenType [list] ;#Flat (un-nested) list of tokentypes found + + namespace upvar ::tomlish::parse lastChar lastChar + set lastChar "" + + + set result "" + namespace upvar ::tomlish::parse nest nest + set nest 0 + + namespace upvar ::tomlish::parse v v ;#array keyed on nest level + + + set v(0) {TOMLISH} + array set s0 [list] ;#whitespace data to go in {SPACE {}} element. + set parentlevel 0 + + + namespace upvar ::tomlish::parse state state + + namespace upvar ::tomlish::parse braceCount braceCount + set barceCount 0 + namespace upvar ::tomlish::parse bracketCount bracketCount + set bracketCount 0 + + set sep 0 + set r 1 + namespace upvar ::tomlish::parse token_waiting token_waiting + set token_waiting [dict create] ;#if ::tok finds a *complete* second token during a run, it will put the 2nd one here to be returned by the next call. + + + + set state "table-space" + ::tomlish::parse::spacestack push {type space state table-space} + namespace upvar ::tomlish::parse linenum linenum;#'line number' of input data. (incremented for each literal linefeed - but not escaped ones in data) + set linenum 1 + + set ::tomlish::parse::state_list [list] + try { + while {$r} { + set r [::tomlish::parse::tok] + #puts stdout "got tok: '$tok' while parsing string '$s' " + set next_tokenType_known 0 ;#whether we begin a new token here based on what terminated the token result of 'tok' + + + #puts "got token: '$tok' tokenType='$tokenType'. while v($nest) = [set v($nest)]" + #puts "-->tok: $tok tokenType='$tokenType'" + set prevstate $state + set transition_info [::tomlish::parse::goNextState $tokenType $tok $state] + #review goNextState could perform more than one space_action + set space_action [dict get $transition_info space_action] + set newstate [dict get $transition_info newstate] ;#use of 'newstate' vs 'state' makes code clearer below + + if {[tcl::string::match "err-*" $state]} { + ::tomlish::log::warn "---- State error in state $prevstate for tokenType: $tokenType token value: $tok. $state aborting parse. [tomlish::parse::report_line]" + lappend v(0) [list ERROR tokentype $tokenType state $prevstate to $state leveldata [set v($nest)]] + return $v(0) + } + # --------------------------------------------------------- + #NOTE there may already be a token_waiting at this point + #set_token_waiting can raise an error here, + # in which case the space_action branch needs to be rewritten to handle the existing token_waiting + # --------------------------------------------------------- + + if {$space_action eq "pop"} { + #pop_trigger_tokens: newline tablename endarray endinlinetable + #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. + set parentlevel [expr {$nest -1}] + set do_append_to_parent 1 ;#most tokens will leave this alone - but some like tentative_accum_squote need to do their own append + switch -exact -- $tokenType { + tentative_accum_squote { + #should only apply within a multiliteral + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the xxx-squote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-squote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-squote-space { + } + default { + error "--- unexpected popped due to tentative_accum_squote but came from state '$prevstate' should have been trailing-squote-space" + } + } + switch -- $tok { + ' { + tomlish::parse::set_token_waiting type single_squote value $tok complete 1 startindex [expr {$i -1}] + } + '' { + #review - we should perhaps return double_squote instead? + #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_squote value "" complete 1 startindex [expr {$i - 2}] + } + ''' { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] + } + '''' { + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left squote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]'" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "'"] + } + MULTILITERAL { + #empty + lappend v($parentlevel) [list LITERALPART "'"] + } + default { + error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" + } + } + } + ''''' { + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 squotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]''" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "''"] + } + MULTILITERAL { + lappend v($parentlevel) [list LITERALPART "''"] + } + default { + error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" + } + } + } + } + } + triple_squote { + #presumably popping multiliteral-space + ::tomlish::log::debug "---- triple_squote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + switch -exact -- [lindex $part 0] { + MULTILITERAL { + lappend merged $part + } + LITERALPART { + if {$lasttype eq "LITERALPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" + } + } + set lasttype [lindex $part 0] + } + set v($nest) $merged + } + tentative_accum_dquote { + #should only apply within a multistring + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the trailing-dquote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-dquote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-dquote-space { + } + default { + error "--- unexpected popped due to tentative_accum_dquote but came from state '$prevstate' should have been trailing-dquote-space" + } + } + switch -- $tok { + {"} { + tomlish::parse::set_token_waiting type single_dquote value $tok complete 1 startindex [expr {$i -1}] + } + {""} { + #review - we should perhaps return double_dquote instead? + #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_dquote value "" complete 1 startindex [expr {$i - 2}] + } + {"""} { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 3}] + } + {""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left dquote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {"}] + } + MULTISTRING { + #empty + lappend v($parentlevel) [list STRINGPART {"}] + } + default { + error "--- don't know how to integrate extra trailing dquote with data $v($parentlevel)" + } + } + } + {"""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 dquotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {""}] + } + MULTISTRING { + lappend v($parentlevel) [list STRINGPART {""}] + } + default { + error "--- don't know how to integrate extra trailing 2 dquotes with data $v($parentlevel)" + } + } + } + } + } + triple_dquote { + #presumably popping multistring-space + ::tomlish::log::debug "---- triple_dquote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + switch -exact -- [lindex $part 0] { + MULTISTRING { + lappend merged $part + } + STRINGPART { + if {$lasttype eq "STRINGPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + CONT - WS { + lappend merged $part + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_dquote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" + } + } + set lasttype [lindex $part 0] + } + set v($nest) $merged + } + equal { + #pop caused by = + switch -exact -- $prevstate { + dottedkey-space { + tomlish::log::debug "---- equal ending dottedkey-space for last_space_action pop" + #re-emit for parent space + tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] + } + dottedkey-space-tail { + #experiment? + tomlish::log::debug "---- equal ending dottedkey-space-tail for last_space_action pop" + #re-emit for parent space + tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] + } + } + } + newline { + incr linenum + lappend v($nest) [list NEWLINE $tok] + } + tablename { + #note: a tablename only 'pops' if we are greater than zero + error "---- tablename pop should already have been handled as special case zeropoppushspace in goNextState" + } + tablearrayname { + #!review - tablearrayname different to tablename regarding push/pop? + #note: a tablename only 'pops' if we are greater than zero + error "---- tablearrayname pop should already have been handled as special case zeropoppushspace in goNextState" + } + endarray { + #nothing to do here. + } + comma { + #comma for inline table will pop the keyvalue space + lappend v($nest) "SEP" + } + endinlinetable { + ::tomlish::log::debug "---- endinlinetable for last_space_action pop" + } + default { + error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" + } + } + if {$do_append_to_parent} { + #e.g tentative_accum_squote does it's own appends as necessary - so won't get here + lappend v($parentlevel) [set v($nest)] + } + + incr nest -1 + + } elseif {$last_space_action eq "push"} { + set prevnest $nest + incr nest 1 + set v($nest) [list] + # push_trigger_tokens: barekey dquotedkey startinlinetable startarray tablename tablearrayname + + + switch -exact -- $tokenType { + tentative_trigger_squote - tentative_trigger_dquote { + #### this startok will always be tentative_accum_squote/tentative_accum_dquote starting with one accumulated squote/dquote + if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { + lassign [dict get $transition_info starttok] starttok_type starttok_val + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType $starttok_type + set tok $starttok_val + } + } + single_squote { + #JMN - REVIEW + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + triple_squote { + ::tomlish::log::debug "---- push trigger tokenType triple_squote" + set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERALPART + } + squotedkey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + triple_dquote { + set v($nest) [list MULTISTRING] ;#container for NEWLINE,STRINGPART,CONT + } + dquotedkey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + barekey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + set waiting [tomlish::parse::get_token_waiting] + if {[llength $waiting]} { + set i [dict get $waiting startindex] + tomlish::parse::clear_token_waiting + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } else { + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + } + tablename { + #note: we do not use the output of tablename_trim to produce a tablename for storage in the tomlish list! + #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish + # back to toml file will be identical. + #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. + # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, + # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names from + # a structural perspective. + + #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, + # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the + # tomlish list? + + #set trimtable [tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$trimtable'" + set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name + #note also that equivalent tablenames may have different toml representations even after being trimmed! + #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) + #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. + } + tablearrayname { + #set trimtable [tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablearrayname: '$trimtable'" + set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name + } + startarray { + set v($nest) [list ARRAY] ;#$tok is just the opening bracket - don't output. + } + startinlinetable { + set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. + } + default { + error "---- push trigger tokenType '$tokenType' not yet implemented" + } + } + + } else { + #no space level change + switch -exact -- $tokenType { + squotedkey { + #puts "---- squotedkey in state $prevstate (no space level change)" + lappend v($nest) [list SQKEY $tok] + } + dquotedkey { + #puts "---- dquotedkey in state $prevstate (no space level change)" + lappend v($nest) [list DQKEY $tok] + } + barekey { + lappend v($nest) [list KEY $tok] + } + dotsep { + lappend v($nest) [list DOTSEP] + } + starttablename { + #$tok is triggered by the opening bracket and sends nothing to output + } + starttablearrayname { + #$tok is triggered by the double opening brackets and sends nothing to output + } + tablename - tablenamearray { + error "---- did not expect 'tablename/tablearrayname' without space level change (no space level change)" + #set v($nest) [list TABLE $tok] + } + endtablename - endtablearrayname { + #no output into the tomlish list for this token + } + startinlinetable { + puts stderr "---- decode::toml error. did not expect startinlinetable without space level change (no space level change)" + } + single_dquote { + switch -exact -- $newstate { + string-state { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "string" + set tok "" + } + dquoted-key { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "dquotedkey" + set tok "" + } + multistring-space { + lappend v($nest) [list STRINGPART {"}] + #may need to be joined on pop if there are neighbouring STRINGPARTS + } + default { + error "---- single_dquote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_dquote { + #leading extra quotes - test: toml_multistring_startquote2 + switch -exact -- $prevstate { + itable-keyval-value-expected - keyval-value-expected { + puts stderr "tomlish::decode::toml double_dquote TEST" + #empty string + lappend v($nest) [list STRINGPART ""] + } + multistring-space { + #multistring-space to multistring-space + lappend v($nest) [list STRINGPART {""}] + } + default { + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" + } + } + + } + single_squote { + switch -exact -- $newstate { + literal-state { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "literal" + set tok "" + } + squoted-key { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + multiliteral-space { + #false alarm squote returned from tentative_accum_squote pop + ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" + #(single squote - not terminating space) + lappend v($nest) [list LITERALPART '] + #may need to be joined on pop if there are neighbouring LITERALPARTs + } + default { + error "---- single_squote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_squote { + switch -exact -- $prevstate { + keyval-value-expected { + lappend v($nest) [list LITERAL ""] + } + multiliteral-space { + #multiliteral-space to multiliteral-space + lappend v($nest) [list LITERALPART ''] + } + default { + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" + } + } + } + enddquote { + #nothing to do? + set tok "" + } + endsquote { + set tok "" + } + string { + #JJJJ + set tok [tomlish::from_Bstring $tok] + lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes + } + literal { + lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes + } + multistring { + #review + #JJJJ ? + lappend v($nest) [list MULTISTRING $tok] + } + stringpart { + #JJJJ + set tok [tomlish::from_Bstring $tok] + lappend v($nest) [list STRINGPART $tok] ;#will not get wrapped in dquotes directly + } + multiliteral { + lappend v($nest) [LIST MULTILITERAL $tok] + } + literalpart { + lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly + } + untyped_value { + #would be better termed unclassified_value + #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. + unset -nocomplain tag + if {$tok in {true false}} { + set tag BOOL + } else { + if {[::tomlish::utils::is_int $tok]} { + set tag INT + } else { + if {[string is integer -strict $tok]} { + #didn't qualify as a toml int - but still an int + #probably means is_int is limiting size and not accepting bigints (configurable?) + #or it didn't qualify due to more than 1 leading zero + #or other integer format issue such as repeated underscores + error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. (looks close to being an int. Formatting or range issue?) [tomlish::parse::report_line] (no space level change)" + } else { + #DDDD + if {[::tomlish::utils::is_float $tok]} { + set tag FLOAT + } elseif {[::tomlish::utils::is_localtime $tok]} { + set tag TIME-LOCAL + } elseif {[::tomlish::utils::is_timepart $tok]} { + #Note we must allow lone timepart here (not just is_localtime which doesn't allow tz offsets) in case it followed a previous localdate + set tag DATETIME ;#?? review standalone time with tz - no specific tag - only allowed as followup value from DATETIME-LOCAL + } elseif {[::tomlish::utils::is_datepart $tok]} { + set tag DATE-LOCAL + } elseif {[::tomlish::utils::is_datetime $tok]} { + #not just a date or just a time + #could be either local or have tz offset + #DDDD JJJ + set norm [string map {" " T} $tok];#prob unneeded - we won't get here if there was a space - would arrive as 2 separate tokens review. + lassign [split $norm T] dp tp + if {[::tomlish::utils::is_localtime $tp]} { + set tag DATETIME-LOCAL + } else { + set tag DATETIME + } + } elseif {[::tomlish::utils::is_datetime X$tok] || [::tomlish::utils::is_timepart X$tok]} { + # obsolete + #Note we must allow lone timepart here (not just is_localtime which doesn't allow tz offsets) in case it followed a previous localdate + #e.g x= 2025-01-01 02:34Z + #The dict::from_tomlish validation will catch an invalid standaline timepart, or combine with leading date if applicable. + set tag DATETIME + } else { + error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. [tomlish::parse::report_line] (no space level change)" + } + } + } + } + #assert either tag is set, or we errored out. + lappend v($nest) [list $tag $tok] + + } + comment { + #puts stdout "----- comment token returned '$tok'------" + #JJJJ + set tok [tomlish::from_comment $tok] + lappend v($nest) [list COMMENT "$tok"] + } + equal { + #we append '=' to the nest so that any surrounding whitespace is retained. + lappend v($nest) = + } + comma { + lappend v($nest) SEP + } + newline { + incr linenum + lappend v($nest) [list NEWLINE $tok] + } + whitespace { + lappend v($nest) [list WS $tok] + } + continuation { + lappend v($nest) CONT + } + bom { + lappend v($nest) BOM + } + eof { + #ok - nothing more to add to the tomlish list. + #!todo - check previous tokens are complete/valid? + } + default { + error "--- unknown tokenType '$tokenType' during state $prevstate [::tomlish::parse::report_line] (no space level change)" + } + } + } + + if {!$next_tokenType_known} { + ::tomlish::log::notice "---- tomlish::decode::toml - current tokenType:$tokenType Next token type not known" + ::tomlish::parse::set_tokenType "" + set tok "" + } + + if {$state eq "end-state"} { + break + } + + + } + + #while {$nest > 0} { + # lappend v([expr {$nest -1}]) [set v($nest)] + # incr nest -1 + #} + while {[::tomlish::parse::spacestack size] > 1} { + ::tomlish::parse::spacestack pop + lappend v([expr {$nest -1}]) [set v($nest)] + incr nest -1 + + #set parent [spacestack peek] ;#the level being appended to + #lassign $parent type state + #if {$type eq "space"} { + # + #} elseif {$type eq "buffer"} { + # lappend v([expr {$nest -1}]) {*}[set v($nest)] + #} else { + # error "invalid spacestack item: $parent" + #} + } + + } finally { + set is_parsing 0 + } + return $v(0) + } + + #toml dquoted string to tomlish STRING + # - only allow specified escape sequences + # - allow any unicode except those that must be escaped: dquote, bsl, and control chars(except tab) + proc from_Bstring {bstr} { + #JJJJ + if {[catch { + tomlish::utils::unescape_string $bstr + } errM]} { + return -code error -errorcode {TOML SYNTAX INVALIDESCAPE} "tomlish::from_Bstring toml Bstring contains invalid escape sequence\n$errM" ;#review + } + #assert: all escapes are now valid + + if {[regexp {[\u0000-\u0008\u000A-\u001F\u007f]} $bstr]} { + set msg "tomlish::from_Bstring toml Bstring contains controls that must be escaped" + return -code error -errorcode {TOML SYNTAX BSTRINGUNESCAPEDCONTROLS} $msg ;#review + } + return $bstr + } + #validate toml comment + # - disallow controls that must be escaped + #from spec: + # "Control characters other than tab (U+0000 to U+0008, U+000A to U+001F, U+007F) are not permitted in comments." + proc from_comment {comment} { + if {[regexp {[\u0000-\u0008\u000A-\u001F\u007f]} $comment]} { + set msg "tomlish::from_comment toml comment contains controls that must be escaped" + return -code error -errorcode {TOML SYNTAX COMMENTUNESCAPEDCONTROLS} $msg ;#review + } + return $comment + } + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish ---}] +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +namespace eval tomlish::build { + #STRING,INT,FLOAT,BOOL, DATETIME - simple wrappers for completeness + # take a value of the appropriate type and wrap as a tomlish tagged item + proc STRING {s} { + return [list STRING [::tomlish::utils::rawstring_to_Bstring_with_escaped_controls $s]] + } + proc LITERAL {litstring} { + error todo + } + + proc INT {i} { + #whole numbers, may be prefixed with a + or - + #Leading zeros are not allowed + #Hex,octal binary forms are allowed (toml 1.0) + #We will error out on encountering commas, as commas are interpreted differently depending on locale (and don't seem to be supported in the toml spec anyway) + #!todo - Tcl can handle bignums - bigger than a 64bit signed long as specified in toml. + # - We should probably raise an error for number larger than this and suggest the user supply it as a string? + if {[tcl::string::last , $i] > -1} { + error "Unable to interpret '$i' as an integer. Use underscores if you need a thousands separator [::tomlish::parse::report_line]" + } + if {![::tomlish::utils::int_validchars $i]} { + error "Unable to interpret '$i' as an integer. Only 0-9 + 1 _ characters are acceptable. [::tomlish::parse::report_line]" + } + + if {[::tomlish::utils::is_int $i]} { + return [list INT $i] + } else { + error "'$i' is not a valid integer as per the Toml spec. [::tomlish::parse::report_line]" + } + + } + + proc FLOAT {f} { + #convert any non-lower case variants of special values to lowercase for Toml + if {[::tcl::string::tolower $f] in {nan +nan -nan inf +inf -inf}} { + return [list FLOAT [tcl::string::tolower $f]] + } + if {[::tomlish::utils::is_float $f]} { + return [list FLOAT $f] + } else { + error "Unable to interpret '$f' as Toml float. Check your input, or check that tomlish is able to handle all Toml floats properly [::tomlish::parse::report_line]" + } + } + + proc DATETIME {str} { + if {[::tomlish::utils::is_datetime $str]} { + return [list DATETIME $str] + } else { + error "Unable to interpret '$str' as Toml datetime. Check your input, or check that tomlish is able to handle all Toml datetimes properly [::tomlish::parse::report_line]" + } + } + proc DATETIME-LOCAL {str} { + error "build::DATETIME-LOCAL todo" + } + + proc BOOLEAN {b} { + #convert any Tcl-acceptable boolean to boolean as accepted by toml - lower case true/false + if {![tcl::string::is boolean -strict $b]} { + error "Unable to convert '$b' to Toml boolean true|false. [::tomlish::parse::report_line]" + } else { + if {$b && 1} { + return [::list BOOL true] + } else { + return [::list BOOL false] + } + } + } + + #REVIEW + #Take tablename followed by + # a) *tomlish* name-value pairs e.g table mydata [list KEY item11 = [list STRING "test"]] {KEY item2 = [list INT 1]} + # (accept also key value {STRING }) + # b) simple 2-element tcl lists being name & *simple* value pairs for which basic heuristics will be used to determine types + proc _table {name args} { + set pairs [list] + foreach t $args { + if {[llength $t] == 4} { + if {[tcl::string::tolower [lindex $t 0]] ne "key" || [tcl::string::tolower [lindex $t 2]] ni "= value"} { + error "Only items tagged as KEY = currently accepted as name-value pairs for table command" + } + lassign $t _k keystr _eq valuepart + if {[llength $valuepart] != 2} { + error "supplied value must be typed. e.g {INT 1} or {STRING test}" + } + lappend pairs [list KEY $keystr = $valuepart] + } elseif {[llength $t] == 2} { + #!todo - type heuristics + lassign $t n v + lappend pairs [list KEY $n = [list STRING $v]] + } else { + error "'KEY = { toml but + # the first newline is not part of the data. + # we elect instead to maintain a basic LITERALPART that must not contain newlines.. + # and to compose MULTILITERAL of multiple NEWLINE LITERALPART parts, + #with the datastructure representation dropping the first newline (if immediately following opening delim) when building the value. + set literal "" + foreach part [lrange $item 1 end] { + append literal [::tomlish::encode::tomlish [list $part] $nextcontext] + } + append toml '''$literal''' + } + INT - + BOOL - + FLOAT - + DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + #DDDD + append toml [lindex $item 1] + } + INCOMPLETE { + error "cannot process tomlish term tagged as INCOMPLETE" + } + COMMENT { + append toml "#[lindex $item 1]" + } + BOM { + #Byte Order Mark may appear at beginning of a file. Needs to be preserved. + append toml "\uFEFF" + } + default { + error "Not a properly formed 'tomlish' taggedlist.\n '$list'\n Unknown tag '[lindex $item 0]'. See output of \[tomlish::tags\] command." + } + } + + } + return $toml + } + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::encode ---}] +} +#fish toml from tomlish + +#(encode tomlish as toml) +interp alias {} tomlish::to_toml {} tomlish::encode::tomlish + +# + + +namespace eval tomlish::decode { + #*** !doctools + #[subsection {Namespace tomlish::decode}] + #[para] + #[list_begin definitions] + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::decode ---}] +} +#decode toml to tomlish +#interp alias {} tomlish::from_toml {} tomlish::decode::toml + +namespace eval tomlish::utils { + #*** !doctools + #[subsection {Namespace tomlish::utils}] + #[para] + #[list_begin definitions] + + + + #basic generic quote matching for single and double quotes + #note for example that {[o'malley]} will return sq - as the single quote is not closed or wrapped in double quotes + proc tok_in_quotedpart {tok} { + set sLen [tcl::string::length $tok] + set quote_type "" + set had_slash 0 + for {set i 0} {$i < $sLen} {incr i} { + set c [tcl::string::index $tok $i] + if {$quote_type eq ""} { + if {$had_slash} { + #don't enter quote mode + #leave slash_mode because even if current char is slash - it is escaped + set had_slash 0 + } else { + set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] + switch -- $ctype { + dq { + set quote_type dq + } + sq { + set quote_type sq + } + bsl { + set had_slash 1 + } + } + } + } else { + if {$had_slash} { + #don't leave quoted mode + #leave slash_mode because even if current char is slash - it is escaped + set had_slash 0 + } else { + set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] + switch -- $ctype { + dq { + if {$quote_type eq "dq"} { + set quote_type "" + } + } + sq { + if {$quote_type eq "sq"} { + set quote_type "" + } + } + bsl { + set had_slash 1 + } + } + } + } + } + return $quote_type ;#dq | sq + } + + proc hex_escape_info {slashx} { + set exp {^\\x([0-9a-fA-F]{2}$)} + if {[regexp $exp $slashx match hex]} { + return [list ok [list char [subst -nocommand -novariable $slashx]]] + } else { + return [list err [list reason "Supplied string not of the form \\xHH where H in \[0-9a-fA-F\]"]] + } + } + proc unicode_escape_info {slashu} { + #!todo + # validate that slashu is either a \uxxxx or \Uxxxxxxxx value of the correct length and + # is a valid 'unicode scalar value' (any Unicode code point except high-surrogate and low-surrogate code points) + # ie integers in the range 0 to D7FF16 and E00016 to 10FFFF16 inclusive + #expr {(($x >= 0) && ($x <= 0xD7FF16)) || (($x >= 0xE00016) && ($x <= 0x10FFFF16))} + if {[tcl::string::match {\\u*} $slashu]} { + set exp {^\\u([0-9a-fA-F]{4}$)} + if {[regexp $exp $slashu match hex]} { + if {[scan $hex %4x dec] != 1} { + #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? + return [list err [list reason "Failed to convert '$hex' to decimal"]] + } else { + return [list ok [list char [subst -nocommand -novariable $slashu]]] + } + } else { + return [list err [list reason "Supplied string not of the form \\uHHHH where H in \[0-9a-fA-F\]"]] + } + } elseif {[tcl::string::match {\\U*} $slashu]} { + set exp {^\\U([0-9a-fA-F]{8}$)} + if {[regexp $exp $slashu match hex]} { + if {[scan $hex %8x dec] != 1} { + #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? + return [list err [list reason "Failed to convert '$hex' to decimal"]] + } else { + if {(($dec >= 0) && ($dec <= 0xD7FF16)) || (($dec >= 0xE00016) && ($dec <= 0x10FFFF16))} { + return [list ok [list char [subst -nocommand -novariable $slashu]]] + } else { + return [list err [list reason "$slashu is not within the 'unicode scalar value' ranges 0 to 0xD7FF16 or 0xE00016 to 0x10FFFF16"]] + } + } + } else { + return [list err [list reason "Supplied string not of the form \\UHHHHHHHH where H in \[0-9a-fA-F\]"]] + } + } else { + return [list err [list reason "Supplied string did not start with \\u or \\U" ]] + } + + } + + #Note that unicode characters don't *have* to be escaped. + #So if we provide a function named 'escape_string', the name implies the inverse of unescape_string which unescapes unicode \u \U values. + #- an inverse of unescape_string would encode all unicode chars unnecessarily. + #- as toml accepts a compact escape sequence for common chars such as tab,backspace,linefeed etc but also allows the full form \u009 etc + #- escape_string and unescape_string would not be reliably roundtrippable inverses anyway. + #REVIEW - provide it anyway? When would it be desirable to use? + + variable Bstring_control_map [dict create] + dict set Bstring_control_map \b {\b} + dict set Bstring_control_map \n {\n} + dict set Bstring_control_map \r {\r} + dict set Bstring_control_map \" {\"} + dict set Bstring_control_map \x1b {\e} ;#In spec it's included in the list of 'must be escaped', as well as the 'convenience' escapes - so we make it go both ways. + dict set Bstring_control_map \\ "\\\\" + + #\e for \x1b seems like it might be included - v1.1?? hard to find current state of where toml is going :/ + #for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped. + #8 = \b - already in list. + #built the remainder whilst checking for entries already hardcoded above -in case more are added to the hardcoded list + for {set cdec 0} {$cdec <= 7} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } + } + for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } + } + # \u007F = 127 + dict set Bstring_control_map [format %c 127] \\u007F + + # ------------------------------------------------------------------ + variable Literal_control_map [dict create] + #controls other than tab + for {set cdec 0} {$cdec <= 8} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Literal_control_map $char]} { + dict set Literal_control_map $char \\u$hhhh + } + } + for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Literal_control_map $char]} { + dict set Literal_control_map $char \\u$hhhh + } + } + # \u007F = 127 + dict set Literal_control_map [format %c 127] \\u007F + # ------------------------------------------------------------------ + variable Multiliteral_control_map + set Multiliteral_control_map [dict remove $Literal_control_map \n] + + variable String_control_map + set String_control_map [dict remove $Literal_control_map \\] + + + variable MultiBstring_totoml_map + #'minimally' escaped sequences of double quotes. + #e.g {""\"""\"} vs {\"\"\"\"\"} + #This produces easier to read toml - and in many cases may be more likely to match original format when roundtripped from dict datastructure + # REVIEW - should this be configurable? + set MultiBstring_totoml_map [dict remove $Bstring_control_map {"} \r \n] + dict set MultiBstring_totoml_map {"""} {""\"} ;#" editor hack: commented quote for dumb syntax highlighers + + #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! + #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) + #for example - can be used by from_dict to produce valid Bstring data for a tomlish record + proc rawstring_to_Bstring_with_escaped_controls {str} { + #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. + #we'll use a string map with an explicit list rather than algorithmic at runtime + # - the string map is probably more performant than splitting a string, especially if it's large + + upvar ::tomlish::utils::Bstring_control_map map + + return [string map $map $str] + } + proc rawstring_to_MultiBstring_with_escaped_controls {str} { + #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. + #we'll use a string map with an explicit list rather than algorithmic at runtime + # - the string map is probably more performant than splitting a string, especially if it's large + + upvar ::tomlish::utils::MultiBstring_totoml_map map + + return [string map $map $str] + } + + proc rawstring_is_valid_tomlstring {str} { + #controls are allowed in this direction dict -> toml (they get quoted) + + #check any existing escapes are valid + if {[catch { + unescape_string $str + } errM]} { + return 0 + } + return 1 + } + + proc rawstring_is_valid_literal {str} { + #detect control chars other than tab + variable Literal_control_map + set testval [string map $Literal_control_map $str] + return [expr {$testval eq $str}] + } + proc rawstring_is_valid_multiliteral {str} { + #detect control chars other than tab + variable Multiliteral_control_map + + set teststr [string map [list \r\n ok] $str] + + set testval [string map $Multiliteral_control_map $teststr] + return [expr {$testval eq $teststr}] + } + + #review - unescape what string? Bstring vs MLBstring? + #we should be specific in the function naming here + #used by dict::from_tomlish - so part of validation? - REVIEW + proc unescape_string {str} { + #note we can't just use Tcl subst because: + # it also transforms \a (audible bell) and \v (vertical tab) which are not in the toml spec. + # it would strip out backslashes inappropriately: e.g "\j" becomes just j + # it recognizes other escapes which aren't approprite e.g octal \nnn + # it replaces \ with a single whitespace (trailing backslash) + #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh + #plus \e for \x1b? + + set buffer "" + set buffer2 "" ;#buffer for 2 hex characters following a \x + set buffer4 "" ;#buffer for 4 hex characters following a \u + set buffer8 "" ;#buffer for 8 hex characters following a \u + + set sLen [tcl::string::length $str] + + #we need to handle arbitrarily long sequences of backslashes. \\\\\ etc + set slash_active 0 + set unicode2_active 0 + set unicode4_active 0 + set unicode8_active 0 + + ::tomlish::log::debug "unescape_string. got len [string length str] str $str" + + #!todo - check for invalid data in the form of a raw carriage return (decimal 13) without following linefeed? + set i 0 + for {} {$i < $sLen} {} { + if {$i > 0} { + set lastChar [tcl::string::index $str [expr {$i - 1}]] + } else { + set lastChar "" + } + + set c [tcl::string::index $str $i] + #::tomlish::log::debug "unescape_string. got char $c" ;#too much? + + ##---------------------- + ##as we are 'unescaping' - should we really be testing here for existing values that should have been escaped? + ##The answer is probably no - keep this function to a single purpose - test elsewhere for raw controls. + ##this test looks incomplete anyway REVIEW + #scan $c %c n + #if {($n <= 31) && ($n != 9) && ($n != 10) && ($n != 13)} { + # #we don't expect unescaped unicode characters from 0000 to 001F - + # #*except* for raw tab (which is whitespace) and newlines + # error "unescape_string. Invalid data for a toml string. Unescaped control character (decimal $n) [::tomlish::utils::string_to_slashu $c]" + #} + ##---------------------- + + incr i ;#must incr here because we do'returns'inside the loop + if {$c eq "\\"} { + if {$slash_active} { + append buffer "\\" + set slash_active 0 + } elseif {$unicode2_active} { + error "unescape_string. unexpected case slash during unicode2 not yet handled" + } elseif {$unicode4_active} { + error "unescape_string. unexpected case slash during unicode4 not yet handled" + } elseif {$unicode8_active} { + error "unescape_string. unexpected case slash during unicode8 not yet handled" + } else { + # don't output anything (yet) + set slash_active 1 + } + } else { + if {$unicode2_active} { + if {[tcl::string::length $buffer2] < 2} { + append buffer2 $c + } + if {[tcl::string::length $buffer2] == 2} { + #we have a \xHH to test + set unicode2_active 0 + set result [tomlish::utils::hex_escape_info "\\x$buffer2"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" + } + } + } elseif {$unicode4_active} { + if {[tcl::string::length $buffer4] < 4} { + append buffer4 $c + } + if {[tcl::string::length $buffer4] == 4} { + #we have a \uHHHH to test + set unicode4_active 0 + set result [tomlish::utils::unicode_escape_info "\\u$buffer4"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" + } + } + } elseif {$unicode8_active} { + if {[tcl::string::length $buffer8] < 8} { + append buffer8 $c + } + if {[tcl::string::length $buffer8] == 8} { + #we have a \UHHHHHHHH to test + set unicode8_active 0 + set result [tomlish::utils::unicode_escape_info "\\U$buffer8"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" + } + } + } elseif {$slash_active} { + set slash_active 0 + set ctest [tcl::string::map {{"} dq} $c] + switch -exact -- $ctest { + dq { + append buffer {"} + } + b - t - n - f - r { + append buffer [subst -nocommand -novariable "\\$c"] + } + e { + append buffer \x1b + } + x { + #introduced in 1.1.0 \xHH + set unicode2_active 1 + set buffer2 "" + } + u { + set unicode4_active 1 + set buffer4 "" + } + U { + set unicode8_active 1 + set buffer8 "" + } + default { + set slash_active 0 + #review - toml spec says all other escapes are reserved + #and if they are used TOML should produce an error. + #append buffer "\\$c" + set msg "Invalid escape sequence \\ followed by '$c'" + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} $msg + } + } + } else { + append buffer $c + } + } + } + #puts stdout "EOF 4:$unicode4_active 8:$unicode8_active slash:$slash_active" + if {$unicode2_active} { + error "End of string reached before complete hex escape sequence \xHH" + } + if {$unicode4_active} { + error "End of string reached before complete unicode escape sequence \uHHHH" + } + if {$unicode8_active} { + error "End of string reached before complete unicode escape sequence \UHHHHHHHH" + } + if {$slash_active} { + append buffer "\\" + } + try { + encoding convertto utf-8 $buffer + } trap {} {emsg eopts} { + return -code error -errorcode {TOMLISH SYNTAX ENCODINGERROR} $emsg + } + return $buffer + } + + #This does not have to do with unicode normal forms - which it seems toml has decided against regarding use in keys (review/references?) + #This is meant for internal use regarding ensuring we match equivalent keys which may have just been specified with different string mechanisms, + #e.g squoted vs dquoted vs barekey. + proc normalize_key {rawkey} { + set c1 [tcl::string::index $rawkey 0] + set c2 [tcl::string::index $rawkey end] + if {($c1 eq "'") && ($c2 eq "'")} { + #single quoted segment. No escapes allowed within it. + set key [tcl::string::range $rawkey 1 end-1] + } elseif {($c1 eq "\"") && ($c2 eq "\"")} { + #double quoted segment. Unapply escapes. + # + set keydata [tcl::string::range $rawkey 1 end-1] ;#strip outer quotes only + #e.g key could have mix of \UXXXXXXXX escapes and unicode chars + #or mix of \t and literal tabs. + #unescape to convert all to literal versions for comparison + set key [::tomlish::utils::unescape_string $keydata] + #set key [subst -nocommands -novariables $keydata] ;#wrong. Todo - create a string escape substitution function. + } else { + set key $rawkey + } + return $key + } + + proc string_to_slashu {string} { + set rv {} + foreach c [split $string {}] { + scan $c %c cdec + if {$cdec > 65535} { + append rv {\U} [format %.8X $cdec] + } else { + append rv {\u} [format %.4X $cdec] + } + } + return $rv + } + + #'nonprintable' is conservative here because some systems (e.g windows console) are very limited in what they can display. + #This is used for display purposes only (error msgs) + proc nonprintable_to_slashu {s} { + set res "" + foreach i [split $s ""] { + scan $i %c cdec + + set printable 0 + if {($cdec>31) && ($cdec<127)} { + set printable 1 + } + if {$printable} { + append res $i + } else { + if {$cdec > 65535} { + append res \\U[format %.8X $cdec] + } else { + append res \\u[format %.4X $cdec] + } + } + } + set res + } ;# initial version from tcl wiki RS + + proc rawstring_to_jsonstring {s} { + #like nonprintable_to_slashu + # - also escape every dquote + # - escape newlines + set res "" + foreach i [split $s ""] { + scan $i %c cdec + switch -- $cdec { + 34 { + #double quote + append res \\\" + } + 13 { + #carriage return + append res \\r + } + 8 { + append res \\b + } + 9 { + append res \\t + } + 10 { + #linefeed + append res \\n + } + 92 { + append res \\\\ + } + default { + set printable 0 + if {($cdec>31) && ($cdec<127)} { + set printable 1 + } + if {$printable} { + append res $i + } else { + if {$cdec > 65535} { + #append res $i + #append res \\U[format %.8X $cdec] ;#wrong + #append res "\\U{[format %.8x $cdec]}" ;#some variation of json? + package require punk::cesu + #e.g \U0001f610 emoticon face + #surrogate pair: \uD83D\uDE10 + set surrogatepair [punk::cesu::to_surrogatestring -format escape $i] + append res $surrogatepair + } else { + append res \\u[format %.4X $cdec] + } + } + } + } + } + set res + + } + + #check if str is valid for use as a toml bare key + #Early toml versions only allowed letters + underscore + dash + proc is_basic_barekey {str} { + if {[tcl::string::length $str] == 0} { + return 0 + } else { + set matches [regexp -all {[a-zA-Z0-9\_\-]} $str] + if {[tcl::string::length $str] == $matches} { + #all characters match the regexp + return 1 + } else { + return 0 + } + } + } + + #from toml.abnf in github.com/toml-lang/toml + #unquoted-key = 1*unquoted-key-char + #unquoted-key-char = ALPHA / DIGIT / %x2D / %x5F ; a-z A-Z 0-9 - _ + #unquoted-key-char =/ %xB2 / %xB3 / %xB9 / %xBC-BE ; superscript digits, fractions + #unquoted-key-char =/ %xC0-D6 / %xD8-F6 / %xF8-37D ; non-symbol chars in Latin block + #unquoted-key-char =/ %x37F-1FFF ; exclude GREEK QUESTION MARK, which is basically a semi-colon + #unquoted-key-char =/ %x200C-200D / %x203F-2040 ; from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ + #unquoted-key-char =/ %x2070-218F / %x2460-24FF ; include super-/subscripts, letterlike/numberlike forms, enclosed alphanumerics + #unquoted-key-char =/ %x2C00-2FEF / %x3001-D7FF ; skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces + #unquoted-key-char =/ %x2070-21FF / %x2300-24FF ; skip math operators + #unquoted-key-char =/ %x25A0-268B / %x2690-2757 ; skip box drawing, block elements, and some yin-yang symbols + #unquoted-key-char =/ %x2762-2767 / %x2776-27E5 ; skip some Dingbat punctuation + #unquoted-key-char =/ %x2801-297F ; skip some math brackets and arrows, and braille blank + #unquoted-key-char =/ %x2B00-2FFF / %x3001-D7FF ; skip various math operators and symbols, and ideographic space + #unquoted-key-char =/ %xF900-FDCF / %xFDF0-FFFD ; skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) + #unquoted-key-char =/ %x10000-EFFFF ; all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) + variable re_barekey + set ranges [list] + lappend ranges {a-zA-Z0-9\_\-} + lappend ranges {\u00B2} {\u00B3} {\u00B9} {\u00BC-\u00BE} ;# superscript digits, fractions + lappend ranges {\u00C0-\u00D6} {\u00D8-\u00F6} {\u00F8-\u037D} ;# non-symbol chars in Latin block + lappend ranges {\u037f-\u1FFF} ;# exclude GREEK QUESTION MARK, which is basically a semi-colon + lappend ranges {\u200C-\u200D} {\u203F-\u2040} ;# from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ + lappend ranges {\u2070-\u218f} {\u2460-\u24FF} ;# include super-subscripts, letterlike/numberlike forms, enclosed alphanumerics + lappend ranges {\u2C00-\u2FEF} {\u3001-\uD7FF} ;# skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces + lappend ranges {\u2070-\u21FF} {\u2300-\u24FF} ;# skip math operators + lappend ranges {\u25A0-\u268B} {\u2690-\u2757} ;# skip box drawing, block elements, and some yin-yang symbols + lappend ranges {\u2762-\u2767} {\u2776-\u27E5} ;# skip some Dingbat punctuation + lappend ranges {\u2801-\u297F} ;# skip some math brackets and arrows, and braille blank + lappend ranges {\u2B00-\u2FFF} {\u3001-\uD7FF} ;# skip various math operators and symbols, and ideographic space + lappend ranges {\uF900-\uFDCF} {\uFDF0-\uFFFD} ;# skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) + lappend ranges {\U10000-\UEFFFF} ;# all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) + set re_barekey {^[} + foreach r $ranges { + append re_barekey $r + } + append re_barekey {]+$} + + proc is_barekey {str} { + if {[tcl::string::length $str] == 0} { + return 0 + } + variable re_barekey + return [regexp $re_barekey $str] + } + + #test only that the characters in str are valid for the toml specified type 'integer'. + proc int_validchars1 {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[0-9\_\-\+]} $str] == $numchars} { + return 1 + } else { + return 0 + } + } + #add support for hex,octal,binary 0x.. 0o.. 0b... + proc int_validchars {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] == $numchars} { + return 1 + } else { + return 0 + } + } + + proc is_int {str} { + set matches [regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] ;#0b101 etc covered by a-f + + if {[tcl::string::length $str] == $matches} { + #all characters in legal range + + # --------------------------------------- + #check for leading zeroes in non 0x 0b 0o + #first strip any +, - or _ (just for this test) + #(but still allowing 0 -0 +0) + set check [tcl::string::map {+ "" - "" _ ""} $str] + if {([tcl::string::length $check] > 1) && ([tcl::string::index $check 0] eq "0") && ([tcl::string::index $check 1] ni {o x b})} { + return 0 + } + # --------------------------------------- + + #check +,- only occur in the first position. (excludes also +++1 etc) + if {[tcl::string::last - $str] > 0} { + return 0 + } + if {[tcl::string::last + $str] > 0} { + return 0 + } + + #------------------------------------------- + #unclear if a 'digit' includes the type specifiers x b o + #we assume the 0x 0b 0o are NOT counted as digits - as underscores here would seem + #to be likely to cause interop issues with other systems + #(e.g tcl allows 0b1_1 but not 0b_11) + #Most of this structure would be unnecessary if we could rely on string::is::integer understanding underscores (9+?) + #we still need to support earlier Tcl for now though. + + #first rule out any case with more than one underscore in a row + if {[regexp {__} $str]} { + return 0 + } + if {[string index $str 0] eq "_"} { + return 0 + } + set utest [string trimleft $str +-] + #test again for further trick like _+_0xFF + if {[string index $utest 0] eq "_"} { + return 0 + } + if {[string range $utest 0 1] in {0x 0b 0o}} { + set testnum [string range $utest 2 end] + #spec says *non-negative* integers may *also* be expressed in hex, octal or binary + #and also explicitly states + not allowed + #presumed to mean negative not allowed. + if {[string index $str 0] in {- +}} { + return 0 + } + } else { + set testnum $utest + #exclude also things like 0_x 0___b that snuck past our prefix test + if {![string is digit -strict [string map {_ ""} $testnum]]} { + return 0 + } + #assert - only digits and underscores in testnum + #still may have underscores at each end + } + #assert testnum is now the 'digits' portion of a , 0x 0b 0o number + #(+ and - already stripped) + #It may still have chars unsuitable for its type - which will be caught by the string::is::integer test below + if {[string length $testnum] != [string length [string trim $testnum _]]} { + #had non-inner underscores in 'digit' part + return 0 + } + #assert str only has solo inner underscores (if any) between 'digits' + #------------------------------------------- + + set numeric_value [tcl::string::map {_ ""} $str] ;#allow some earlier tcl versions which don't support underscores + #use Tcl's integer check to ensure we don't let things like 3e4 through - which is a float (would need to be 0x3e4 for hex) + if {![tcl::string::is integer -strict $numeric_value]} { + return 0 + } + + + + #!todo - check bounds only based on some config value + #even though Tcl can handle bignums, we won't accept anything outside of toml 1.0 minimum requirements by default (for now) + #presumably very large numbers would have to be supplied in a toml file as strings. + #Review - toml 1.0 only says that it must handle up to 2^63 - not that this is a max + #some question around implementations allowed to use lower values such as 2^31 on some systems? + if {$::tomlish::max_int ne "" && $numeric_value > $::tomlish::max_int} { + return 0 + } + if {$::tomlish::min_int ne "" && $numeric_value < $::tomlish::min_int} { + return 0 + } + } else { + return 0 + } + #Got this far - didn't find anything wrong with it. + return 1 + } + + #test only that the characters in str are valid for the toml specified type 'float'. + proc float_validchars {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[eE0-9\_\-\+\.]} $str] == $numchars} { + return 1 + } else { + #only allow lower case for these special values - as per Toml 1.0 spec + if {$str ni {inf +inf -inf nan +nan -nan}} { + return 0 + } else { + return 1 + } + } + } + + #note - Tcl's string is double will return true also for the subset of float values which are integers + #This function is to determine whether it matches the Toml float concept - so requires a . or e or E + proc is_float {str} { + #vip greenlight known literals, don't test for case variations - as Toml doesn't allow (whereas Tcl allows Inf NaN etc) + if {$str in {inf +inf -inf nan +nan -nan}} { + return 1 + } + #doorcheck the basics for floatiness vs members of that rival gang - ints + if {![regexp {[.eE]} $str]} { + #could be an integer - which isn't specifically a float for Toml purposes. + return 0 + } + + + #patdown for any contraband chars + set matches [regexp -all {[eE0-9\_\-\+\.]} $str] + if {[tcl::string::length $str] != $matches} { + return 0 + } + + #all characters in legal range + + #A leading zero is ok, but we should disallow multiple leading zeroes (same rules as toml ints) + + #Early Toml spec also disallowed leading zeros in the exponent part(?) + #... this seems less interoperable anyway (some libraries generate leading zeroes in exponents) + #we allow leading zeros in exponents here. + + #Check for leading zeros in main part + #first strip any +, - or _ (just for this test) + set check [tcl::string::map {+ "" - "" _ ""} $str] + set r {([0-9])*} + regexp $r $check intpart ;#intpart holds all numerals before the first .,e or E + #leading zero only if exactly one zero + if {$intpart ne "0" && [string match 0* $intpart]} { + return 0 + } + + #for floats, +,- may occur in multiple places + #e.g -2E-22 +3e34 + #!todo - check bounds ? + + #----------------------------------------- + if {[regexp {__} $str]} { + return 0 + } + if {[string index $str 0] eq "_" || [string index $str end] eq "_"} { + return 0 + } + set utest [string trimleft $str +-] + #test again for further trick like _+_ + if {[string index $utest 0] eq "_"} { + return 0 + } + #----------------------------------------- + + #decimal point, if used must be surrounded by at least one digit on each side + #e.g 3.e+20 also illegal + set dposn [string first . $str] + if {$dposn > -1 } { + set d3 [string range $str $dposn-1 $dposn+1] + if {![string is integer -strict [string index $d3 0]] || ![string is integer -strict [string index $d3 2]]} { + return 0 + } + } + #we've already eliminated leading/trailing underscores + #now ensure each inner underscore is surrounded by digits + if {[regexp {_[^0-9]|[^0-9]_} $str]} { + return 0 + } + + #strip underscores for tcl double check so we can support < tcl 9 versions which didn't allow underscores + set check [tcl::string::map {_ ""} $str] + #string is double accepts inf nan +NaN etc. + if {![tcl::string::is double $check]} { + return 0 + } + + #All good - seems to be a toml-approved float and not an int. + return 1 + } + + #test only that the characters in str are valid for the toml specified type 'datetime'. + proc datetime_validchars {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[zZtT0-9\-\+\.:]} $str] == $numchars} { + return 1 + } else { + return 0 + } + } + + proc is_datepart {str} { + set matches [regexp -all {[0-9\-]} $str] + if {[tcl::string::length $str] != $matches} { + return 0 + } + #seems to require yyyy-mm-dd (e.g not allowing just yyyy-mm) + if {![regexp {^([0-9]{4})-([0-9]{2})-([0-9]{2})$} $str _match y m d]} { + return 0 + } + if {$m > 12 || $m == 0} { + return 0 + } + switch -- [expr {$m}] { + 1 - 3 - 5 - 7 - 8 - 10 - 12 { + if {$d > 31 || $d == 0} { + return 0 + } + } + 2 { + if {$d > 29 || $d == 0} { + return 0 + } + if {$d == 29} { + #leapyear check + if {[catch {clock scan $str -format %Y-%m-%d} errM]} { + return 0 + } + } + } + 4 - 6 - 9 - 11 { + if {$d > 30 || $d == 0} { + return 0 + } + } + } + return 1 + } + proc is_localdate {str} { + is_datepart $str + } + + #allow only hh:mm:ss or hh:mm (no subseconds) + proc _is_hms_or_hm_time {val} { + set numchars [tcl::string::length $val] + if {[regexp -all {[0-9:]} $val] != $numchars} { + return 0 + } + #assert now digits and colons only + set hms_cparts [split $val :] + #2 or 3 parts only are valid - check contents of each part + if {[llength $hms_cparts] == 2} { + lassign $hms_cparts hr min + if {[string length $hr] != 2 || [string length $min] != 2} { + return 0 + } + if {$hr > 23 || $min > 59} { + return 0 + } + } elseif {[llength $hms_cparts] == 3} { + lassign $hms_cparts hr min sec + if {[string length $hr] != 2 || [string length $min] != 2 || [string length $sec] !=2} { + return 0 + } + #possible for sec to be 60 - leap second RFC 3339 + if {$hr > 23 || $min > 59 || $sec > 60} { + return 0 + } + } else { + return 0 + } + return 1 + } + proc is_timepart {str} { + #validate the part after the T (or space) + #we receive only that trailing part here. + + #odt1 = 1979-05-27T07:32:00Z + #odt2 = 1979-05-27T00:32:00-07:00 + #odt3 = 1979-05-27T00:32:00.5-07:00 + #odt4 = 1979-05-27T00:32:00.999999-07:00 + + set numchars [tcl::string::length $str] + #timepart can have negative or positive offsets so - and + must be accepted + if {[regexp -all {[zZt0-9\-\+\.:]} $str] == $numchars} { + #todo + #basic check that we have leading 2dig hr and 2dig min separated by colon + if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}[^0-9]{1}.*$} $str]} { + #nn:nn or nn:nnX.* where X is non digit + return 0 + } + set dotparts [split $str .] + if {[llength $dotparts] ni {1 2}} { + return 0 + } + if {[llength $dotparts] == 2} { + lassign $dotparts hms tail + #validate tail - which might have +- offset + if {[string index $tail end] ni {z Z}} { + #from hh:mm:??. + #check for +/- something + if {[regexp {(.*)[+-](.*)} $tail _match fraction offset]} { + if {![string is digit -strict $fraction]} { + return 0 + } + if {![_is_hms_or_hm_time $offset]} { + return 0 + } + } + } else { + set tail [string range $tail 0 end-1] + #expect tail nnnn (from hh:mm::ss.nnnnZ) + #had a dot and a zZ - no other offset valid (?) + if {![string is digit -strict $tail]} { + return 0 + } + } + + } else { + #no dot (fraction of second) + if {[regexp {(.*)[+-](.*)} $str _match hms offset]} { + #validate offset + if {![_is_hms_or_hm_time $offset]} { + return 0 + } + } else { + set hms $str + set offset "" + #trim a *single* z or Z off hms if present - multiple should error later + if {[string index $hms end] in {z Z}} { + set hms [string range $hms 0 end-1] + } + } + } + #hms is allowed in toml to be hh:mm:ss or hh:mm + #validate we have hh:mm:ss or hh:mm - exactly 2 digits each + if {![_is_hms_or_hm_time $hms]} { + return 0 + } + + return 1 + } else { + return 0 + } + } + proc is_localtime {str} { + #time of day without any relation to a specific day or any offset or timezone + set numchars [tcl::string::length $str] + if {[regexp -all {[0-9\.:]} $str] == $numchars} { + #todo + if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}:[0-9]{2}([.][0-9]+){0,1}$} $str]} { + #hh:mm or hh:mm:ss or hh:mm::ss.nnn + return 0 + } + set dotparts [split $str .] + if {[llength $dotparts] ni {1 2}} { + return 0 + } + if {[llength $dotparts] == 2} { + lassign $dotparts hms _tail + #validate tail - just fractional seconds - regex has confirmed at least one digit and only digits + #nothing todo? max length? + } else { + #no fractional seconds + set hms $str + } + if {![_is_hms_or_hm_time $hms]} { + return 0 + } + return 1 + } else { + return 0 + } + } + + #review + proc is_datetime {str} { + #Essentially RFC3339 formatted date-time - but: + #1) allowing seconds to be omitted (:00 assumed) + #2) T may be replaced with a single space character TODO - parser support for space in datetime! + # (RFC 3339 allows space instead of T also - but doesn't specify it *must* be a single space) + + #toml-lint @2025-04 doesn't accept t for T or z for Z - but RFC3339 does + #toml spec doesn't clarify - we will accept + + #e.g 1979-05-27 + #e.g 1979-05-27T00:32:00Z + #e.g 1979-05-27 00:32:00-07:00 + #e.g 1979-05-27 00:32:00+10:00 + #e.g 1979-05-27 00:32:00.999999-07:00 + + #review + #minimal datetimes? + # 2024 not ok - 2024T not accepted by tomlint why? + # 02:00 ok + # 02:00:00.5 ok + # 1:00 - not ok - RFC3339 requires 2-digit hr,min,sec + + #toml-lint.com accepts 2025-01 + + if {[string length $str] < 5} { + return 0 + } + + set matches [regexp -all {[zZtT0-9\ \-\+\.:]} $str] + if {[tcl::string::length $str] == $matches} { + #all characters in legal range + if {[regexp -all {\ } $str] > 1} { + #only a single space is allowed. + return 0 + } + #If we get a space - it is only valid as a convience to represent the T separator + #we can normalize by converting to T here before more tests + set str [string map {" " T t T} $str] + #a further sanity check on T + if {[regexp -all {T} $str] > 1} { + return 0 + } + + #!todo - use full RFC 3339 parser? + #!todo - what if the value is 'time only'? + + if {[string first T $str] > -1} { + lassign [split $str T] datepart timepart + if {![is_datepart $datepart]} { + return 0 + } + if {![is_timepart $timepart]} { + return 0 + } + } else { + #either a datepart or a localtime + #spec: "If you include only the time portion of an RFC 3339 formatted date-time, it will represent that time of day + # without any relation to a specific day or any offset or timezone." + if {!([is_datepart $str] || [is_localtime $str])} { + return 0 + } + } + + + #Tcl's free-form clock scan (no -format option) is deprecated + # + #if {[catch {clock scan $datepart} err]} { + # puts stderr "tcl clock scan failed err:'$err'" + # return 0 + #} + + } else { + return 0 + } + return 1 + } + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::utils ---}] +} + +namespace eval tomlish::parse { + #*** !doctools + #[subsection {Namespace tomlish::parse}] + #[para] + #[list_begin definitions] + + #This is a somewhat curly mix of a statemachine and toml-nesting-stack littered with special cases. + #The code is a pig's-nest - but it should be noted that for example trailing single double quotes in multiline strings are perhaps not so trivial to parse using more standard methods either: + # - e.g some kind of backtracking required if using an ABNF parser? + #I don't know the precise technical name for this sort of parser; probably something like "Dog's Breakfast" + #More seriously, we don't have distinct lex/parse steps - so it is basically a 'fused lexer' or 'scannerless parser' + + #It is also desirable for this system to be useful in 'interactive' use. review - would a separate lexer make this easier or harder? + + #A possible alternative more structured approach might be to use a PEG (Parsing Expression Grammar) + + + variable is_parsing 0 ;#whether we are in the middle of parsing tomlish text + + variable state + # states: + # table-space, itable-space, array-space + # array-value-expected,keyval-value-expected,itable-keyval-value-expected, keyval-syntax, + # dquoted-key, squoted-key + # string-state, literal-state, multistring... + # + # notes: + # only the -space states are also 'spaces' ie a container which is pushed/popped on the spacestack + + # + # xxx_value-expected - we also allow for leading whitespace in this state, but once a value is returned we jump to a state based on the containing space. e.g keyval-tail or array-syntax + # + #stateMatrix defines for each state, actions to take for each possible token. + #single-element actions are the name of the next state into which to transition, or a 'POPSPACE' instruction to pop a level off the spacestack and add the data to the parent container. + #dual-element actions are a push instruction and the name of the space to push on the stack. + # - PUSHSPACE is a simple push onto the spacestack, zeropoppushspace also pushes, but will first do a pop *if* the current space level is greater than zero (ie if only if not already in root table-space) + + # -- --- --- --- --- --- + #token/state naming guide + # -- --- --- --- --- --- + #tokens : underscore separated or bare name e.g newline, start_quote, start_squote + #private tokens: always have a leading underscore (These are private 'temporary state' tokens that are never returned as actual tokens e.g _start_squote_sequence + #states : always contain at least one dash e.g err-state, table-space + #instructions + # -- --- --- --- --- --- + + + #stateMatrix dict of elements mapping current state to next state based on returned tokens + # current-state {token-encountered next-state ... } + # where next-state can be a 1 or 2 element list. + #If 2 element - the first item is an instruction (ucase) + #If 1 element - it is either a lowercase dashed state name or an ucase instruction + #e.g {PUSHSPACE } or POPSPACE or SAMESPACE + + + #SAMESPACE - got to same space as parent without popping a level, but has it's own autotransition lookup - strange concept - review usecases + + variable stateMatrix + set stateMatrix [dict create] + #--------------------------------------------------------- + #WARNING + #The stateMatrix implementation here is currently messy. + #The code is a mixture of declarative via the stateMatrix and imperative via switch statements during PUSH/POP/SAMESPACE transitions. + #This means the state behaviour has to be reasoned about by looking at both in conjuction. + #--------------------------------------------------------- + + #xxx-space vs xxx-syntax inadequately documented - TODO + + #review - out of date? + # --------------------------------------------------------------------------------------------------------------# + # incomplete example of some state starting at table-space + # --------------------------------------------------------------------------------------------------------------# + # ( = -> keyval-value-expected) + # keyval-syntax (popped -> keyval-space -> keyval-tail) (autotransition on pop) + # keyval-space (autotransition on push ^) + # table-space (barekey^) (startdquote -> dquoted-key ^) + # --------------------------------------------------------------------------------------------------------------# + + dict set stateMatrix\ + table-space { + bom "table-space"\ + whitespace "table-space"\ + newline "table-space"\ + barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ + squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ + dquotedkey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ + XXXsingle_dquote "quoted-key"\ + XXXsingle_squote "squoted-key"\ + comment "table-space"\ + starttablename "tablename-state"\ + starttablearrayname "tablearrayname-state"\ + enddquote "err-state"\ + endsquote "err-state"\ + comma "err-state"\ + eof "end-state"\ + equal "err-state"\ + cr "err-lonecr"\ + } + + + + dict set stateMatrix\ + keyval-space {\ + whitespace "keyval-syntax"\ + equal "keyval-value-expected"\ + } + + # ' = ' portion of keyval + dict set stateMatrix\ + keyval-syntax {\ + whitespace "keyval-syntax"\ + barekey {PUSHSPACE "dottedkey-space"}\ + squotedkey {PUSHSPACE "dottedkey-space"}\ + dquotedkey {PUSHSPACE "dottedkey-space"}\ + equal "keyval-value-expected"\ + comma "err-state"\ + newline "err-state"\ + eof "err-state"\ + } + #### + dict set stateMatrix\ + keyval-value-expected {\ + whitespace "keyval-value-expected"\ + untyped_value {TOSTATE "keyval-untyped-sequence" note "possible datetime datepart"}\ + literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ + string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ + single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ + single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ + startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ + startarray {PUSHSPACE array-space returnstate keyval-tail}\ + } + #double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"} + + #untyped_value sequences without intervening comma are allowed for datepart timepart + #we will produce tomlish with missing SEPS and to_dict must validate whether 2 adjacent barekeys are valid + dict set stateMatrix\ + keyval-untyped-sequence {\ + whitespace "keyval-untyped-sequence"\ + untyped_value {TOSTATE "keyval-tail"}\ + literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ + string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ + single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ + single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ + startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ + startarray {PUSHSPACE array-space returnstate keyval-tail}\ + newline "POPSPACE"\ + comment "keyval-tail"\ + eof "end-state"\ + } + + #2025 - no leading-squote-space - only trailing-squote-space. + + dict set stateMatrix\ + keyval-tail {\ + whitespace "keyval-tail"\ + newline "POPSPACE"\ + comment "keyval-tail"\ + eof "end-state"\ + } + + + #itable-space/ curly-syntax : itables + # x={y=1,} + dict set stateMatrix\ + itable-space {\ + whitespace "itable-space"\ + newline "itable-space"\ + barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + endinlinetable "POPSPACE"\ + comma "err-state"\ + comment "itable-space"\ + eof "err-state"\ + } + #we don't get single_squote etc here - instead we get the resulting squotedkey token + + + # ??? review - something like this + # + # x={y =1,} + dict set stateMatrix\ + itable-keyval-syntax {\ + whitespace {TOSTATE "itable-keyval-syntax"}\ + barekey {PUSHSPACE "dottedkey-space"}\ + squotedkey {PUSHSPACE "dottedkey-space"}\ + dquotedkey {PUSHSPACE "dottedkey-space"}\ + equal {TOSTATE "itable-keyval-value-expected"}\ + newline "err-state"\ + eof "err-state"\ + } + + # x={y=1} + dict set stateMatrix\ + itable-keyval-space {\ + whitespace "itable-keyval-syntax"\ + equal {TOSTATE "itable-keyval-value-expected" note "required"}\ + } + + dict set stateMatrix\ + itable-keyval-value-expected {\ + whitespace "itable-keyval-value-expected"\ + untyped_value {TOSTATE "itable-val-tail" note ""}\ + single_dquote {TOSTATE "string-state" returnstate itable-val-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ + single_squote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ + startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ + startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ + } + #double_squote not currently generated by _start_squote_sequence - '' processed as single_squote to literal-state just like 'xxx' + # review + # double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"} + + + + # x={y=1,z="x"} + #POPSPACE is transition from itable-keyval-space to parent itable-space + dict set stateMatrix\ + itable-val-tail {\ + whitespace "itable-val-tail"\ + endinlinetable "POPSPACE"\ + comma "POPSPACE"\ + newline {TOSTATE "itable-val-tail" note "itable-space ??"}\ + comment "itable-val-tail"\ + eof "err-state"\ + } + # XXXnewline "POPSPACE" + # We shouldn't popspace on newline - as if there was no comma we need to stay in itable-val-tail + # This means the newline and subsequent whitespace, comments etc become part of the preceeding dottedkey record + #e.g + # x = { + # j=1 + # #comment within dottedkey j record + # , # comment unattached + # #comment unattached + # k=2 , #comment unattached + # l=3 #comment within l record + # , m=4 + # #comment associated with m record + # + # #still associated with m record + # } + ## - This doesn't quite correspond to what a user might expect - but seems like a consistent mechanism. + #The awkwardness is because there is no way to put in a comment that doesn't consume a trailing comma + #so we cant do: j= 1 #comment for j1 , + # and have the trailing comma recognised. + # + # To associate: j= 1, #comment for j1 + # we would need some extra processing . (not popping until next key ? extra state itable-sep-tail?) REVIEW - worth doing? + # + # The same issue occurs with multiline arrays. The most natural assumption is that a comment on same line after a comma + # is 'associated' with the previous entry. + # + # These comment issues are independent of the data dictionary being generated for conversion to json etc - as the comments don't carry through anyway, + # but are a potential oddity for manipulating the intermediate tomlish structure whilst attempting to preserve 'associated' comments + # (e.g reordering records within an itable) + #The user's intention for 'associated' isn't always clear and the specs don't really guide on this. + + + #dottedkey-space is not (currently) used within [tablename] or [[tablearrayname]] + #it is for keyval ie x.y.z = value + + #this is the state after dot + #we are expecting a complete key token or whitespace + #(initial entry to the space is by one of the keys - which will immediately go to dottedkey-space-tail) + dict set stateMatrix\ + dottedkey-space {\ + whitespace "dottedkey-space"\ + dotsep "err-state"\ + barekey "dottedkey-space-tail"\ + squotedkey "dottedkey-space-tail"\ + dquotedkey "dottedkey-space-tail"\ + newline "err-state"\ + comma "err-state"\ + comment "err-state"\ + equal "err-state"\ + } + + #dottedkeyend "POPSPACE" + #equal "POPSPACE"\ + + + #jmn 2025 + #we have 1 or more dottedkeys so far - need dotsep to add more, whitespace to maintain, equal to pop + dict set stateMatrix\ + dottedkey-space-tail {\ + whitespace "dottedkey-space-tail" + dotsep "dottedkey-space" + equal "POPSPACE"\ + eof "err-state"\ + newline "err-state"\ + } + + #-------------------------------------------------------------------------- + #scratch area + #from_toml {x=1} + # barekey tok + # table-space PUSHSPACE keyval-space state keyval-syntax + # + + + #-------------------------------------------------------------------------- + + + #REVIEW + #toml spec looks like heading towards allowing newlines within inline tables + #https://github.com/toml-lang/toml/issues/781 + + #2025 - multiline itables appear to be valid for 1.1 - which we are targeting. + #https://github.com/toml-lang/toml/blob/main/toml.md#inline-table + + #JMN2025 + #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES + #We currently allow multiline ITABLES (also with comments) in the tokenizer. + #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? + + + #JMN REVIEW + #dict set stateMatrix\ + # array-space {\ + # whitespace "array-space"\ + # newline "array-space"\ + # untyped_value "SAMESPACE"\ + # startarray {PUSHSPACE "array-space"}\ + # endarray "POPSPACE"\ + # startinlinetable {PUSHSPACE itable-space}\ + # single_dquote "string-state"\ + # single_squote "literal-state"\ + # triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ + # comma "array-space"\ + # comment "array-space"\ + # eof "err-state-array-space-got-eof"\ + # } + + ## array-space ## + set aspace [dict create] + dict set aspace whitespace "array-space" + dict set aspace newline "array-space" + #dict set aspace untyped_value "SAMESPACE" + dict set aspace untyped_value "array-syntax" + dict set aspace startarray {PUSHSPACE "array-space"} + dict set aspace endarray "POPSPACE" + dict set aspace single_dquote {TOSTATE "string-state" returnstate array-syntax} + dict set aspace triple_dquote {PUSHSPACE "multistring-space" returnstate array-syntax} + dict set aspace single_squote {TOSTATE "literal-state" returnstate array-syntax} + dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax} + dict set aspace startinlinetable {PUSHSPACE itable-space} + #dict set aspace comma "array-space" + dict set aspace comment "array-space" + dict set aspace eof "err-state-array-space-got-eof" + dict set stateMatrix array-space $aspace + + #when we pop from an inner array we get to array-syntax + #e.g {x=[[]] ??? + set tarntail [dict create] + dict set tarntail whitespace "err-state" ;#"tablearrayname-tail" ;#spec doesn't allow whitespace here + dict set tarntail newline "err-state" + dict set tarntail comment "err-state" + dict set tarntail eof "err-state" + dict set tarntail endtablename "tablearray-tail" + dict set stateMatrix tablearrayname-tail $tarntail + + #review - somewhat counterintuitive...? + # [(starttablearrayname) (endtablearrayname] + # [(starttablename) (endtablename)] + + # [[xxx]] ??? + set tartail [dict create] + dict set tartail whitespace "tablearray-tail" + dict set tartail newline "table-space" + dict set tartail comment "tablearray-tail" + dict set tartail eof "end-state" + dict set stateMatrix tablearray-tail $tartail + + + + + + + dict set stateMatrix\ + end-state {} + + set knowntokens [list] + set knownstates [list] + dict for {state transitions} $stateMatrix { + if {$state ni $knownstates} {lappend knownstates $state} + dict for {tok instructions} $transitions { + if {$tok ni $knowntokens} {lappend knowntokens $tok} + } + } + dict set stateMatrix nostate {} + foreach tok $knowntokens { + dict set stateMatrix nostate $tok "err-nostate-received-token-$tok" + } + + + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + #purpose - debugging? remove? + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + #build a list of 'push triggers' from the stateMatrix + # ie tokens which can push a new space onto spacestack + set push_trigger_tokens [list] + tcl::dict::for {s transitions} $stateMatrix { + tcl::dict::for {token transition_to} $transitions { + set instruction [lindex $transition_to 0] + switch -exact -- $instruction { + PUSHSPACE - zeropoppushspace { + if {$token ni $push_trigger_tokens} { + lappend push_trigger_tokens $token + } + } + } + } + } + ::tomlish::log::debug "push_trigger_tokens: $push_trigger_tokens" + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + + + + #This seems hacky... (deprecate in favour of explicit arguments to the instructions in stateMatrix?) + #spacePopTransitions, spacePushTransitions, spaceSameTransitions below for auto state redirections on POPSPACE,PUSHSPACE,SAMESPACE + + #mainly for the -space states: + #redirect to another state $c based on a state transition from $whatever to $b + # e.g "string {array-space array-syntax}" means when transitioning from string to array-space, jump to array-syntax instead. + #this is useful as we often don't know state $b. e.g when it is decided by 'POPSPACE' + + #use dict set to add values so we can easily add/remove/comment lines + + #Push to, next + #default first states when we push to these spaces + variable spacePushTransitions [dict create] + dict set spacePushTransitions keyval-space keyval-syntax + dict set spacePushTransitions itable-keyval-space itable-keyval-syntax + dict set spacePushTransitions array-space array-space + dict set spacePushTransitions table-space tablename-state + #dict set spacePushTransitions #itable-space itable-space + + #Pop to, next + variable spacePopTransitions [dict create] + dict set spacePopTransitions array-space array-syntax + + + #itable-keyval-space itable-val-tail + #review + #we pop to keyval-space from dottedkey-space or from keyval-value-expected? we don't always want to go to keyval-tail + #leave it out and make the POPSPACE caller explicitly specify it + #keyval-space keyval-tail + + variable spaceSameTransitions [dict create] + #JMN test + #dict set spaceSameTransitions array-space array-syntax + + #itable-keyval-space itable-val-tail + + + variable state_list ;#reset every tomlish::decode::toml + + namespace export tomlish toml + namespace ensemble create + + #goNextState has various side-effects e.g pushes and pops spacestack + #REVIEW - setting nest and v elements here is ugly + #todo - make neater, more single-purpose? + proc goNextState {tokentype tok currentstate} { + variable state + variable nest + variable v + + set prevstate $currentstate + + + variable spacePopTransitions + variable spacePushTransitions + variable spaceSameTransitions + + variable last_space_action "none" + variable last_space_type "none" + variable state_list + + set result "" + set starttok "" + + if {[dict exists $::tomlish::parse::stateMatrix $currentstate $tokentype]} { + set transition_to [dict get $::tomlish::parse::stateMatrix $currentstate $tokentype] + ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" + switch -exact -- [lindex $transition_to 0] { + POPSPACE { + set popfromspace_info [spacestack peek] + set popfromspace_state [dict get $popfromspace_info state] + spacestack pop + set parent_info [spacestack peek] + set type [dict get $parent_info type] + set parentspace [dict get $parent_info state] + + set last_space_action "pop" + set last_space_type $type + + if {[dict exists $parent_info returnstate]} { + set next [dict get $parent_info returnstate] + #clear the returnstate on current level + set existing [spacestack pop] + dict unset existing returnstate + spacestack push $existing ;#re-push modification + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected to stored returnstate $next <<---" + } else { + ### + #review - do away with spacePopTransitions - which although useful to provide a default.. + # - involve error-prone configurations distant to the main state transition configuration in stateMatrix + if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { + set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" + } else { + set next $parentspace + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace<<---" + } + } + set result $next + } + SAMESPACE { + set currentspace_info [spacestack peek] + ::tomlish::log::debug "--->> SAMESPACE got current space entry: $currentspace_info <<<<<" + set type [dict get $currentspace_info type] + set currentspace [dict get $currentspace_info state] + + if {[dict exists $currentspace_info returnstate]} { + set next [dict get $currentspace_info returnstate] + #clear the returnstate on current level + set existing [spacestack pop] + dict unset existing returnstate + spacestack push $existing ;#re-push modification + ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected to stored returnstate $next" + } else { + if {[dict exists $::tomlish::parse::spaceSameTransitions $currentspace]} { + set next [dict get $::tomlish::parse::spaceSameTransitions $currentspace] + ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected state to $next (spaceSameTransitions)" + } else { + set next $currentspace + ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace" + } + } + set result $next + } + zeropoppushspace { + if {$nest > 0} { + #pop back down to the root level (table-space) + spacestack pop + set parentinfo [spacestack peek] + set type [dict get $parentinfo type] + set target [dict get $parentinfo state] + + set last_space_action "pop" + set last_space_type $type + + #----- + #standard pop + set parentlevel [expr {$nest -1}] + lappend v($parentlevel) [set v($nest)] + incr nest -1 + #----- + } + #re-entrancy + + #set next [list PUSHSPACE [lindex $transition_to 1]] + set nexttokentype ${tokentype}2 ;#fake token type e.g tablename2 or tablearrayname2 + ::tomlish::log::debug "--->> zeropoppushspace goNextState RECURSE. calling goNextState $nexttokentype $currentstate" + set transition_info [::tomlish::parse::goNextState $nexttokentype $tok $currentstate] + set result [dict get $transition_info newstate] + } + PUSHSPACE { + set original_target [dict get $transition_to PUSHSPACE] + if {[dict exists $transition_to returnstate]} { + #adjust the existing space record on the stack. + #struct::stack doesn't really support that - so we have to pop and re-push + #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack + set currentspace [spacestack pop] + dict set currentspace returnstate [dict get $transition_to returnstate] + spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. + } + if {[dict exists $transition_to starttok]} { + set starttok [dict get $transition_to starttok] + } + spacestack push [dict create type space state $original_target] + + set last_space_action "push" + set last_space_type "space" + + if {[dict exists $transition_to state]} { + #an explicit state in the pushed space was requested in the stateMatrix - override the spacePushTransition (spacePushTransitions can be deprecated if we require explicitness?) + set next [dict get $transition_to state] + ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next by explicit 'state' entry" + } else { + #puts $::tomlish::parse::spacePushTransitions + if {[dict exists $::tomlish::parse::spacePushTransitions $original_target]} { + set next [dict get $::tomlish::parse::spacePushTransitions $original_target] + ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next (spacePushTransitions) " + } else { + set next $original_target + ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target" + } + } + set result $next + } + TOSTATE { + if {[dict exists $transition_to returnstate]} { + #adjust the existing space record on the stack. + #struct::stack doesn't really support that - so we have to pop and re-push + #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack + set currentspace [spacestack pop] + dict set currentspace returnstate [dict get $transition_to returnstate] + spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. + } + set result [dict get $transition_to TOSTATE] + } + default { + #simplified version of TOSTATE + set result [lindex $transition_to 0] ;#ignore everything but first word + } + } + } else { + ::tomlish::log::error "--->> No state transition defined from state $currentstate when tokentype $tokentype received" + set result "nostate" + } + lappend state_list [list tokentype $tokentype from $currentstate to $result] + set state $result + ::tomlish::log::notice "--->> STATE TRANSITION tokenType: '$tokentype' tok:$tok triggering '$currentstate' -> '$result' last_space_action:$last_space_action " + return [dict create prevstate $prevstate newstate $result space_action $last_space_action starttok $starttok] + } + + proc report_line {{line ""}} { + variable linenum + variable is_parsing + if {$is_parsing} { + if {$line eq ""} { + set line $linenum + } + return "Line Number: $line" + } else { + #not in the middle of parsing tomlish text - return nothing. + return "" + } + } + + #produce a *slightly* more readable string rep of the nest for puts etc. + proc nest_pretty1 {list} { + set prettier "{" + + foreach el $list { + if { [lindex $el 0] eq "NEWLINE"} { + append prettier "[list $el]\n" + } elseif {([llength $el] > 1) && ([lindex $el 0] in {KEY DQKEY SQKEY TABLE ARRAY})} { + append prettier [nest_pretty1 $el] + } else { + append prettier "[list $el] " + } + } + append prettier "}" + return $prettier + } + + proc set_tokenType {t} { + variable tokenType + variable tokenType_list + if {![info exists tokenType]} { + set tokenType "" + } + lappend tokenType_list $t + set tokenType $t + } + + proc switch_tokenType {t} { + variable tokenType + variable tokenType_list + lset tokenType_list end $t + set tokenType $t + } + + proc get_tokenType {} { + variable tokenType + return $tokenType + } + + + proc get_token_waiting {} { + variable token_waiting + return [lindex $token_waiting 0] + } + proc clear_token_waiting {} { + variable token_waiting + set token_waiting [list] + } + + #token_waiting is a list - but our standard case is to have only one + #in certain circumstances such as near eof we may have 2 + #the set_token_waiting function only allows setting when there is not already one waiting. + #we want to catch cases of inadvertently trying to set multiple + # - the reason being that the state transition triggered by the previous token may have invalidated the assumptions made when a token was added as waiting. + proc set_token_waiting {args} { + if {[llength $args] %2 != 0} { + error "tomlish set_token_waiting must have args of form: type value complete 0|1" + } + variable token_waiting + + if {[llength $token_waiting] && [dict get [lindex $token_waiting end] type] ne "eof"} { + #tokloop already set a token_waiting - but something (post tokloop processing?) is trying to set another + #we may need to remove the existing token_waiting and reset the tokloop index to the previous char so it's reprocessed in the possibly new context + #rather than attempt to make the right decision here - we raise an error and require the caller to check/handle it + set err "tomlish set_token_waiting already has token_waiting: [lindex $token_waiting 0]" + append err \n " - cannot add token_waiting: $args" + error $err + #set tomlish::parse::i [expr {[dict get $token_waiting startindex] -1}] + #set token_waiting [list] + } + + set waiting [dict create] + dict for {k v} $args { + switch -exact $k { + type - complete { + dict set waiting $k $v + } + value { + dict set waiting tok $v + } + startindex { + dict set waiting startindex $v + } + default { + error "tomlish set_token_waiting error - unrecognised key $k. known keys: [dict keys $args]" + } + } + } + if {![tcl::string::is boolean -strict [dict get $waiting complete]]} { + error "tomlish set_token_waiting error - 'complete' must be a boolean. got [dict get $waiting complete]" + } + if {![llength $token_waiting]} { + set token_waiting [list $waiting] + } else { + #an extra sanity-check that we don't have more than just the eof.. + if {[llength $token_waiting] > 1} { + set err "tomlish Unexpected. Existing token_waiting count > 1.\n" + foreach tw $token_waiting { + append err " $tw" \n + } + append err " - cannot add token_waiting: $waiting" + error $err + } + #last entry must be a waiting eof + set token_waiting [list $waiting [lindex $token_waiting end]] + } + return + } + + #returns 0 or 1 + #tomlish::parse::tok + #we attempt to do this without lookahead (potential use in streaming toml? for what benefit?) todo -final flag + # - the possible benefit is being able to more easily process in arbitrarily split chunks (although we would still have to watch crlf splitting ?) + # - interactive use? + + proc tok {} { + variable nest + variable s + variable v + variable i + variable tok + variable type ;#character type + variable state ;#FSM + + + variable tokenType + variable tokenType_list + + + variable endToken + + variable lastChar + + variable braceCount + variable bracketCount + + + #------------------------------ + #Previous run found another (presumably single-char) token + #The normal case is for there to be only one dict in the list + #multiple is an exception - primarily for eof + variable token_waiting + if {[llength $token_waiting]} { + set waiting [lindex $token_waiting 0] + + set tokenType [dict get $waiting type] + set tok [dict get $waiting tok] + #todo: dict get $token_waiting complete + set token_waiting [lrange $token_waiting 1 end] + return 1 + } + #------------------------------ + + set resultlist [list] + set sLen [tcl::string::length $s] + + set slash_active 0 + set quote 0 + set c "" + for {} {$i < $sLen} {} { + if {$i > 0} { + set lastChar [tcl::string::index $s [expr {$i - 1}]] + set start_of_data h + } else { + set lastChar "" + set start_of_data 1 + #bom-handling + if {[tcl::string::index $s 0] eq "\uFEFF"} { + #bom (could be from various encodings - now decoded as single unicode char FEFF) + #incr i 1 ;#skip over initial bom? + } + } + + + set c [tcl::string::index $s $i] + set cindex $i + + set ctest [tcl::string::map {\{ lc \} rc \[ lb \] rb \" dq ' sq \\ bsl \r cr \n lf \t tab \uFEFF bom} $c] + tomlish::log::debug "- tokloop char <$ctest> index $i tokenType:$tokenType tok:<$tok>" + #puts "got char $c during tokenType '$tokenType'" + incr i ;#must incr here because we do returns inside the loop + + + + switch -exact -- $ctest { + # { + set had_slash $slash_active + set slash_active 0 + + if {$had_slash} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + #for multiliteral, multistring - data and/or end + incr i -1 + return 1 + } + _start_squote_sequence { + #pseudo token beginning with underscore - never returned to state machine - review + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + barekey { + error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed (see tomlish::utils::is_barekey). [tomlish::parse::report_line]" + } + whitespace { + # hash marks end of whitespace token + #do a return for the whitespace, set token_waiting + #set_token_waiting type comment value "" complete 1 + incr i -1 ;#leave comment for next run + return 1 + } + untyped_value { + #REVIEW! the spec isn't clear.. is whitespace after an int,bool etc required before comment? + #we will accept a comment marker as an immediate terminator of the untyped_value. + incr i -1 + return 1 + } + starttablename - starttablearrayname { + #fix? + error "tomlish Character '#' is invalid first character for $tokenType. [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #invalid in bare parts - but allowed in quoted parts - let tablename parser sort it out + append tok $c + } + default { + #dquotedkey, string,literal, multistring + append tok $c + } + } + } else { + switch -- $state { + multistring-space { + set_tokenType stringpart + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "#" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "#" + } + default { + #start of token if we're not in a token + set_tokenType comment + set tok "" ;#The hash is not part of the comment data + } + } + } + } + lc { + #left curly brace + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i [tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - starttablearrayname { + #*bare* tablename can only contain letters,digits underscores + error "tomlish Invalid tablename first character \{ [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #valid in quoted parts + append tok $c + } + comment { + if {$had_slash} {append tok "\\"} + append tok "\{" + } + default { + #end any other token. + incr i -1 + return 1 + } + } + } else { + switch -exact -- $state { + itable-keyval-value-expected - keyval-value-expected { + #switch last key to tablename?? + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + array-space - array-syntax { + #nested anonymous inline table + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\{" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\{" + } + default { + error "tomlish state: '$state'. left brace case not implemented [tomlish::parse::report_line]" + } + } + } + + } + rc { + #right curly brace + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey - comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - tablename { + if {$had_slash} {append tok "\\"} + #invalid! - but leave for datastructure loading stage to catch + set_token_waiting type endinlinetable value "" complete 1 startindex $cindex + return 1 + } + starttablearrayname - tablearrayname { + if {$had_slash} {append tok "\\"} + #invalid! - but leave for datastructure loading stage to catch + set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex + return 1 + } + default { + #end any other token + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + itable-space { + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + tablename-state { + #e.g [] - empty tablename - allowed or not? + #empty tablename/tablearrayname ? + #error "unexpected tablename problem" + + set_tokenType "endinlinetable" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-state { + error "tomlish unexpected tablearrayname-state problem" + set_tokenType "endinlinetable" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + array-syntax - array-space { + #invalid + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + itable-val-tail { + set_tokenType "endinlinetable" + set tok "" + #we need to pop the keyval - and then reprocess to pop the inlinetable - so we incr -1 + incr i -1 + return 1 + } + itable-keyval-syntax { + error "tomlish endinlinetable unexpected at this point. Expecting key=val syntax [tomlish::parse::report_line]" + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\}" + } + multiliteral-space { + set_tokenType "literalpart" ; #review + set tok "\}" + } + default { + #JMN2024b keyval-tail? + error "tomlish state '$state'. endinlinetable case not implemented [tomlish::parse::report_line]" + } + } + } + + } + lb { + #left square bracket + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename { + #change the tokenType + switch_tokenType "starttablearrayname" + set tok "" ;#no output into the tomlish list for this token + #any following whitespace is part of the tablearrayname, so return now + return 1 + } + tablename - tablearrayname { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + #append tok "\\[" + append tok {\[} + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + #invalid at this point - state machine should disallow: + # table -> starttablearrayname + # tablearray -> starttablearrayname + set_token_waiting type starttablearrayname value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "\[" + } + } + } + comment { + if {$had_slash} {append tok "\\"} + append tok "\[" + } + default { + #end any other token. + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + keyval-value-expected - itable-keyval-value-expected { + set_tokenType "startarray" + set tok "\[" + return 1 + } + array-space - array-syntax { + #nested array? + set_tokenType "startarray" + set tok "\[" + return 1 + #error "state: array-space. startarray case not implemented [tomlish::parse::report_line]" + } + table-space { + #table name + #assume it's a single bracket - but we need to wait for non-bracket to confirm it's not a tablearray + #note that a starttablearrayname token may contain whitespace between the brackets + # e.g \[ \[ + set_tokenType "starttablename" + set tok "" ;#there is no output into the tomlish list for this token + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\[" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\[" + } + itable-space { + #handle state just to give specific error msg + error "tomlish state: '$state'. Left square bracket invalid. Cannot start array in inline table without key. Use key=\[\] syntax. [tomlish::parse::report_line]" + } + default { + error "tomlish state: '$state'. startarray case not implemented [tomlish::parse::report_line]" + } + } + } + } + rb { + #right square bracket + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + #???? + incr i -1 + if {$had_slash} {incr i -1} ;#reprocess + return 1 + } else { + incr i -1 + if {$had_slash} {incr i -1} ;#reprocess + return 1 + } + } + starttablename { + #toml-test invalid/table/empty + + set_token_waiting type tablename value "" complete 1 startindex $cindex + incr i -1 + return 1 + } + tablename { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + append tok "\\]" + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + set_token_waiting type endtablename value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "]" + } + } + } + tablearrayname { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + append tok "\\]" + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "]" + } + } + } + default { + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + array-syntax - array-space { + #invalid - but allow parser statemachine to report it. + set_tokenType "endarray" + set tok "\]" + return 1 + } + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "endarray" + set tok "\]" + return 1 + } + tablename-state { + #e.g [] - empty tablename + #tomltest 1.1.0 invalid/table/empty + #should be invalid + #we parse it and let dict::from_tomlish error when it tries to split table + + set_tokenType "endtablename" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-state { + error "tomlish unexpected tablearrayname problem" + set_tokenType "endtablearray" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-tail { + #[[xxx] + set_tokenType "endtablename" + #sequence: starttablename -> starttablearrayname -> endtablearrayname -> endtablename + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\]" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\]" + } + default { + error "tomlish state '$state'. endarray case not implemented [tomlish::parse::report_line]" + } + } + } + } + bsl { + #backslash + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + if {$state eq "multistring-space"} { + #end whitespace token + incr i -1 ;#reprocess bsl in next run + return 1 + } else { + error "tomlish Unexpected backslash during whitespace. [tomlish::parse::report_line]" + } + } + literal - literalpart - squotedkey { + #never need to set slash_active true when in single quoted tokens + append tok "\\" + set slash_active 0 + } + string - dquotedkey - comment { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + stringpart { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + starttablename - starttablearrayname { + error "tomlish backslash is invalid as first character of $tokenType [tomlish::parse::report_line]" + } + tablename - tablearrayname { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + barekey { + error "tomlish Unexpected backslash during barekey. [tomlish::parse::report_line]" + } + default { + error "tomlish Backslash unexpected during tokentype: '$tokenType'. [tomlish::parse::report_line]" + } + } + } else { + switch -exact -- $state { + multistring-space { + if {$slash_active} { + set_tokenType "stringpart" + set tok "\\\\" + set slash_active 0 + } else { + set slash_active 1 + } + } + multiliteral-space { + #nothing can be escaped in multiliteral-space - not even squotes (?) review + set_tokenType "literalpart" + set tok "\\" + } + default { + error "tomlish tok error: Unexpected backslash when no token is active. [tomlish::parse::report_line]" + } + } + } + } + sq { + #single quote + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote { + #for within multiliteral + #short tentative_accum_squote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_squote is multisquote + 2 = 5 + #return tok with value ''''' + return 1 + } + } + tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multiliteral + #switch? + append tok $c + set_tokenType triple_squote + return 1 + } + default { + #if there are more than 3 leading squotes we also enter multiliteral space and the subsequent ones are handled + #by the tentative_accum_squote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 squotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" + } + } + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + #end whitespace + incr i -1 ;#reprocess sq + return 1 + } + literal { + #slash_active always false + #terminate the literal + set_token_waiting type endsquote value "'" complete 1 startindex $cindex + return 1 + } + literalpart { + #ended by ''' - but final could be '''' or ''''' (up to 2 squotes allowed directly before ending triple squote sequence) + #todo + # idea: end this literalpart (possibly 'temporarily') + # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack + # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) + incr i -1 ;#throw the "'" back to loop - will be added to a tentative_accum_squote token for later processing + return 1 + } + XXXitablesquotedkey { + set_token_waiting type endsquote value "'" complete 1 startindex $cindex + return 1 + } + squotedkey { + ### + #set_token_waiting type endsquote value "'" complete 1 + return 1 + } + starttablename - starttablearrayname { + #!!! + incr i -1 + return 1 + } + tablename - tablearrayname { + append tok $c + } + barekey { + #barekeys now support all sorts of unicode letter/number chars for other cultures + #but not punctuation - not even for those of Irish heritage who don't object + #to the anglicised form of some names. + # o'shenanigan seems to not be a legal barekey + #The Irish will have to use an earlier form Ó - which apparently many may prefer anyway. + error "tomlish Unexpected single quote during barekey. [tomlish::parse::report_line]" + } + default { + append tok $c + } + } + } else { + switch -exact -- $state { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading squote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_squote token or triple_squote token + #It currently doesn't trigger double_squote token + #(handle '' same as 'x' ie produce a single_squote and go into processing literal) + #review - producing double_squote for empty literal may be slightly more efficient. + #This token is not used to handle squote sequences *within* a multiliteral + set_tokenType "_start_squote_sequence" + set tok "'" + } + multiliteral-space { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_squote" ;#trigger tentative_accum_squote + set tok "'" + return 1 + } + table-space - itable-space { + #tests: squotedkey.test squotedkey_itable.test + set_tokenType "squotedkey" + set tok "" + } + XXXtable-space - XXXitable-space { + #future - could there be multiline keys? MLLKEY, MLBKEY ? + #this would (almost) allow arbitrary tcl dicts to be stored in toml (aside from escaping issues) + #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files + #@2025 ABNF for toml mentions key, simple-key, unquoted-key, quoted-key and dotted-key + #where key is simple-key or dotted-key - no MLL or MLB components + #the spec states solution for arbitrary binary data is application specific involving encodings + #such as hex, base64 + set_tokenType "_start_squote_sequence" + set tok "'" + return 1 + } + tablename-state { + #first char in tablename-state/tablearrayname-state + set_tokenType "tablename" + append tok "'" + } + tablearrayname-state { + set_tokenType "tablearrayname" + append tok "'" + } + literal-state { + #shouldn't get here? review + tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" + set_tokenType "literal" + incr -1 + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "," + #error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" + } + dottedkey-space { + set_tokenType "squotedkey" + } + default { + error "tomlish unhandled squote during state '$state'. [tomlish::parse::report_line]" + } + } + } + + } + dq { + #double quote + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #review + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + tentative_accum_dquote { + #within multistring + #short tentative_accum_dquote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_dquote is multidquote + 2 = 5 + #return tok with value """"" + return 1 + } + } + _start_dquote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multistring + #switch? + append tok $c + set_tokenType triple_dquote + return 1 + } + default { + #if there are more than 3 leading dquotes we also enter multistring space and the subsequent ones are handled + #by the tentative_accum_dquote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 dquotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_dquote_sequence'" + } + } + } + literal - literalpart { + append tok $c + } + string { + if {$had_slash} { + append tok "\\" $c + } else { + #unescaped quote always terminates a string + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex + return 1 + } + } + stringpart { + #sub element of multistring + if {$had_slash} { + append tok "\\" $c + } else { + incr i -1 ;#throw the {"} back to loop - will be added to a tentative_accum_dquote token for later processing + return 1 + } + } + whitespace { + #assert: had_slash will only ever be true in multistring-space + if {$had_slash} { + incr i -2 + return 1 + } else { + #end whitespace token - throw dq back for reprocessing + incr i -1 + return 1 + } + } + comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + XXXdquotedkey { + if {$had_slash} { + append tok "\\" + append tok $c + } else { + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex + return 1 + } + } + dquotedkey { + ### + if {$had_slash} { + append tok "\\" + append tok $c + } else { + #set_token_waiting type enddquote value {"} complete 1 + return 1 + } + } + squotedkey { + append tok $c + } + tablename - tablearrayname { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - starttablearrayname { + incr i -1 ;## + return 1 + } + default { + error "tomlish got quote during tokenType '$tokenType' [tomlish::parse::report_line]" + } + } + } else { + #$slash_active not relevant when no tokenType + #token is string only if we're expecting a value at this point + switch -exact -- $state { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading dquote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_dquote token or triple_dquote token + #It currently doesn't trigger double_dquote token + #(handle "" same as "x" ie produce a single_dquote and go into processing string) + #review - producing double_dquote for empty string may be slightly more efficient. + #This token is not used to handle dquote sequences once *within* a multistring + set_tokenType "_start_dquote_sequence" + set tok {"} + } + multistring-space { + if {$had_slash} { + set_tokenType "stringpart" + set tok "\\\"" + } else { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_dquote" ;#trigger tentative_accum_dquote + set tok {"} + return 1 + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\"" + } + table-space - itable-space { + set_tokenType "dquotedkey" + set tok "" + } + dottedkey-space { + set_tokenType dquotedkey + set tok "" + + #only if complex keys become a thing + #set_tokenType dquote_seq_begin + #set tok $c + } + tablename-state { + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } + default { + error "tomlish Unexpected dquote during state '$state' [tomlish::parse::report_line]" + } + } + } + } + = { + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #review + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + #assertion had_slash 0 + append tok $c + } + string - comment - dquotedkey { + #for these tokenTypes an = is just data. + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + incr i -1 + return 1 + } else { + set_token_waiting type equal value = complete 1 startindex $cindex + return 1 + } + } + barekey { + #set_token_waiting type equal value = complete 1 + incr i -1 + return 1 + } + starttablename - starttablearrayname { + error "tomlish Character '=' is invalid first character for $tokenType. [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #invalid in bare name - but valid in quoted parts - leave for tablename parser to sort out + append tok $c + } + default { + error "tomlish unexpected = character during tokentype $tokenType. case not implemented. [tomlish::parse::report_line]" + } + } + } else { + switch -exact -- $state { + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok = + } + multiliteral-space { + set_tokenType "literalpart" + set tok "=" + } + dottedkey-space { + set_tokenType "equal" + set tok "=" + return 1 + } + default { + set_tokenType "equal" + set tok = + return 1 + } + } + } + } + cr { + #REVIEW! + # \r carriage return + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #we have received a double cr + ::tomlish::log::warn "double cr - will generate cr token. needs testing" + set_tokenType "cr" ;#lone cr token will generally raise an error - but let state machine handle it + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal { + append tok $c + } + literalpart { + #part of MLL string (multi-line literal string) + #we need to split out crlf as a separate NEWLINE to be consistent + ::tomlish::log::warn "literalpart ended by cr - needs testing" + #return literalpart temporarily - allow cr to be reprocessed from multiliteral-space + incr i -1 + return 1 + } + stringpart { + #stringpart is a part of MLB string (multi-line basic string) + #throw back the cr - if followed by lf it will become a {NEWLINE crlf} entry within the MULTISTRING list (e.g between STRINGPART entries) + incr i -1 + return 1 + } + starttablename - starttablearrayname { + error "tomlish Character is invalid first character for $tokenType. [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #could in theory be valid in quoted part of name + #review - might be better just to disallow here + append tok $c + } + whitespace { + #it should technically be part of whitespace if not followed by lf + #but outside of values we are also free to map it to be another NEWLINE instead? REVIEW + incr i -1 + return 1 + } + untyped_value { + incr i -1 + return 1 + } + comment { + #JJJJ + #review + incr i -1 + return 1 + } + default { + #!todo - error out if cr inappropriate for tokenType + append tok $c + } + } + } else { + #lf may be appended if next + #review - lone cr as newline? - this is uncommon - but so is lone cr in a string(?) + set_tokenType "newline" + set tok cr + } + } + lf { + # \n newline + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #review + #this lf is the trailing part of a crlf + append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + #multiliteral or multistring + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal { + #nl is not allowed *within* a literal - require multiliteral syntax for any literal containing a newline ''' ''' + #even though we terminate the literal without the closing quote here - the token_waiting newline should trigger a state error + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + literalpart { + #we allow newlines - but store them within the multiliteral as their own element + #This is a legitimate end to the literalpart - but not the whole multiliteral + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + stringpart { + if {$had_slash} { + #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) + set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] + incr i -1 + return 1 + } else { + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + } + starttablename - tablename - tablearrayname - starttablearrayname { + error "tomlish Character is invalid in $tokenType. [tomlish::parse::report_line]" + } + default { + #newline ends all other tokens. + #note for string: we don't add (raw unescaped) newline to simple string. (must use multi-string for this) + #note for whitespace: + # we will use the convention that \n terminates the current whitespace even if whitespace follows + # ie whitespace is split into separate whitespace tokens at each newline + + #puts "-------------- newline lf during tokenType $tokenType" + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + } + } else { + switch -exact -- $state { + multistring-space { + if {$had_slash} { + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 + } else { + set_tokenType "newline" + set tok lf + return 1 + } + } + multiliteral-space { + #assert had_slash 0 + set_tokenType "newline" + set tok "lf" + return 1 + } + default { + #ignore slash? error? + set_tokenType "newline" + set tok lf + return 1 + } + } + #if {$had_slash} { + # #CONT directly before newline - allows strings_5_byteequivalent test to pass + # set_tokenType "continuation" + # set tok "\\" + # incr i -1 + # return 1 + #} else { + # set_tokenType newline + # set tok lf + # return 1 + #} + } + } + , { + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + comment - tablename - tablearrayname { + if {$had_slash} {append tok "\\"} + append tok , + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + #stringpart can have up to 2 quotes too + if {$had_slash} {append tok "\\"} + append tok $c + } + literal - literalpart - squotedkey { + #assert had_slash always 0 + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + incr i -1 + return 1 + } else { + set_token_waiting type comma value "," complete 1 startindex $cindex + return 1 + } + } + default { + set_token_waiting type comma value "," complete 1 startindex $cindex + if {$had_slash} {append tok "\\"} + return 1 + } + } + } else { + switch -exact -- $state { + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "," + } + multiliteral-space { + #assert had_slash 0 + set_tokenType "literalpart" + set tok "," + } + default { + set_tokenType "comma" + set tok "," + return 1 + } + } + } + } + . { + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + comment - untyped_value { + if {$had_slash} {append tok "\\"} + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + literal - literalpart - squotedkey { + #assert had_slash always 0 + append tok $c + } + whitespace { + switch -exact -- $state { + multistring-space { + #review + if {$had_slash} { + incr i -2 + } else { + incr i -1 + } + return 1 + } + xxxdottedkey-space { + incr i -1 + return 1 + } + dottedkey-space-tail { + incr i -1 + return 1 + } + default { + error "tomlish Received period during tokenType 'whitespace' [tomlish::parse::report_line]" + } + } + } + starttablename - starttablearrayname { + #This would correspond to an empty table name + error "tomlish Character '.' is not allowed as first character ($tokenType). [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #subtable - split later - review + append tok $c + } + barekey { + #e.g x.y = 1 + #we need to transition the barekey to become a structured table name ??? review + #x is the tablename y is the key + set_token_waiting type dotsep value "." complete 1 startindex $cindex + return 1 + } + default { + error "tomlish Received period during tokenType '$tokenType' [tomlish::parse::report_line]" + #set_token_waiting type period value . complete 1 + #return 1 + } + } + } else { + switch -exact -- $state { + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "." + } + multiliteral-space { + set_tokenType "literalpart" + set tok "." + } + XXXdottedkey-space { + ### obs? + set_tokenType "dotsep" + set tok "." + return 1 + } + dottedkey-space-tail { + ### + set_tokenType "dotsep" + set tok "." + return 1 + } + default { + set_tokenType "untyped_value" + set tok "." + } + } + } + + } + " " - tab { + if {[tcl::string::length $tokenType]} { + set had_slash $slash_active + set slash_active 0 + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + barekey { + #todo had_slash - emit token or error + #whitespace is a terminator for bare keys + #set_token_waiting type whitespace value $c complete 1 + incr i -1 + return 1 + } + untyped_value { + #unquoted values (int,date,float etc) are terminated by whitespace + #set_token_waiting type whitespace value $c complete 1 + incr i -1 + return 1 + } + comment { + if {$had_slash} { + append tok "\\" + } + append tok $c + } + string - dquotedkey { + if {$had_slash} { append tok "\\" } + append tok $c + } + stringpart { + #for stringpart we store WS separately for ease of processing continuations (CONT stripping) + if {$had_slash} { + #REVIEW + #emit the stringpart - go back to the slash + incr i -2 + return 1 + } else { + #split into STRINGPART xxx WS " " + incr i -1 + return 1 + } + } + literal - literalpart - squotedkey { + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + append tok $c + } else { + append tok $c + } + } + starttablename - starttablearrayname { + incr i -1 + return 1 + } + tablename - tablearrayname { + #include whitespace in the tablename/tablearrayname + #Will need to be normalized upon interpreting the tomlish as a datastructure + append tok $c + } + default { + error "tomlish Received whitespace space during tokenType '$tokenType' [tomlish::parse::report_line]" + } + } + } else { + set had_slash $slash_active + set slash_active 0 + switch -exact -- $state { + tablename-state { + #tablename can have leading,trailing and interspersed whitespace! + #These will not be treated as whitespace tokens, instead forming part of the name. + set_tokenType tablename + set tok "" + if {$had_slash} {append tok "\\"} + append tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok "" + if {$had_slash} {append tok "\\"} + append tok $c + } + multistring-space { + if {$had_slash} { + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 + } else { + set_tokenType "whitespace" + append tok $c + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + default { + if {$had_slash} { + error "tomlish unexpected backslash [tomlish::parse::report_line]" + } + set_tokenType "whitespace" + append tok $c + } + } + } + } + tabX { + if {[tcl::string::length $tokenType]} { + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) + set slash_active 0 + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + barekey { + #whitespace is a terminator for bare keys + incr i -1 + #set_token_waiting type whitespace value $c complete 1 + return 1 + } + untyped_value { + #unquoted values (int,date,float etc) are terminated by whitespace + #set_token_waiting type whitespace value $c complete 1 + incr i -1 + return 1 + } + squotedkey { + append tok $c + } + dquotedkey - string - comment - whitespace { + #REVIEW + append tok $c + } + stringpart { + #for stringpart we store WS separately for ease of processing continuations (CONT stripping) + if {$had_slash} { + #REVIEW + #emit the stringpart - go back to the slash + incr i -2 + return 1 + } else { + #split into STRINGPART aaa WS " " + incr i -1 + return 1 + } + } + literal - literalpart { + append tok $c + } + starttablename - starttablearrayname { + incr i -1 + return 1 + } + tablename - tablearrayname { + #include whitespace in the tablename/tablearrayname + #Will need to be normalized upon interpreting the tomlish as a datastructure + append tok $c + } + default { + error "tomlish Received whitespace tab during tokenType '$tokenType' [tomlish::parse::report_line]" + } + } + } else { + set had_slash $slash_active + if {$slash_active} { + set slash_active 0 + } + switch -exact -- $state { + tablename-state { + #tablename can have leading,trailing and interspersed whitespace! + #These will not be treated as whitespace tokens, instead forming part of the name. + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } + multistring-space { + if {$had_slash} { + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 + } else { + set_tokenType whitespace + append tok $c + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + default { + set_tokenType "whitespace" + append tok $c + } + } + } + } + bom { + #bom encoded as single unicode codepoint \uFFEF + #BOM (Byte Order Mark) - ignored by token consumer + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + #assert - tok will be one or two squotes only + #A toml literal probably isn't allowed to contain this + #but we will parse and let the validator sort it out. + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart { + append tok $c + } + string - stringpart { + append tok $c + } + default { + #state machine will generally not have entry to accept bom - let it crash + set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex + return 1 + } + } + } else { + switch -exact -- $state { + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + multistring-space { + set_tokenType "stringpart" + set tok $c + } + default { + set_tokenType "bom" + set tok "\uFEFF" + return 1 + } + } + } + } + default { + + if {[tcl::string::length $tokenType]} { + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. + set slash_active 0 + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + if {$state eq "multistring-space"} { + incr i -1 + return 1 + } else { + #review + incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. + return 1 + } + } + barekey { + if {[tomlish::utils::is_barekey $c]} { + append tok $c + } else { + error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed. (see tomlish::utils::is_barekey) [tomlish::parse::report_line]" + } + } + starttablename - starttablearrayname { + incr i -1 + #allow statemachine to set context for subsequent chars + return 1 + } + string - stringpart { + append tok $c + } + default { + #e.g comment/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname + append tok $c + } + } + } else { + set had_slash $slash_active + set slash_active 0 + switch -exact -- $state { + table-space - itable-space { + #if no currently active token - assume another key value pair + if {[tomlish::utils::is_barekey $c]} { + set_tokenType "barekey" + append tok $c + } else { + error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" + } + } + multistring-space { + set_tokenType "stringpart" + if {$had_slash} { + set tok \\$c + } else { + set tok $c + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + tablename-state { + set_tokenType "tablename" + set tok $c + } + tablearrayname-state { + set_tokenType "tablearrayname" + set tok $c + } + dottedkey-space { + set_tokenType barekey + set tok $c + } + default { + #todo - something like ansistring VIEW to show control chars? + set cshow [string map [list \t tab \v vt] $c] + tomlish::log::debug "- tokloop char '$cshow' setting to untyped_value while state:$state [tomlish::parse::report_line]" + set_tokenType "untyped_value" + set tok $c + } + } + } + } + } + + } + + #run out of characters (eof) + if {[tcl::string::length $tokenType]} { + #check for invalid ending tokens + #if {$state eq "err-state"} { + # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" + #} + switch -exact -- $tokenType { + _start_squote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open literal + error "tomlish eof reached without closing single quote for string literal. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "literal" + set tok "" + return 1 + + ##review + #set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] + #set_tokenType "literal" + #set tok "" + #return 1 + } + } + } + _start_dquote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open string + error "tomlish eof reached without closing double quote for string. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "string" + set tok "" + return 1 + } + } + } + newline { + #The only newline token that has still not been returned should have a tok value of "cr" + puts "tomlish eof reached - with incomplete newline token '$tok'" + if {$tok eq "cr"} { + #we convert lone cr to it's own "cr" token elsewhere in the document to allow statemachine to handle it. + #(which it should generally do by not handling it ie raising an error - or emitting an ERROR list in the tomlish) + #if trailing char is a lone cr - we should encode it the same way as elsewhere that is outside of values + # ie as it's own token. + switch_tokenType "cr" + return 1 + } else { + #should be unreachable + error "tomlish eof reached - with invalid newline token. value: $tok" + } + } + } + set_token_waiting type eof value eof complete 1 startindex $i ;#review + return 1 + } else { + ::tomlish::log::debug "- No current tokenType, ran out of characters, setting tokenType to 'eof' [tomlish::parse::report_line]" + set tokenType "eof" + set tok "eof" + } + return 0 + } + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::parse ---}] +} +namespace eval tomlish::huddle { + proc from_json {json} { + package require huddle + package require huddle::json + #note - huddle may now contain raw surrogate pair - which cannot be emitted to stdout + set h [huddle::json::json2huddle parse $json] + } + proc from_dict {d} { + + } + + #raw - strings must already be processed into values suitable for json e.g surrogate pair escaping + proc jsondumpraw {huddle_object {offset " "} {newline "\n"} {begin ""}} { + upvar ::huddle::types types + set nextoff "$begin$offset" + set nlof "$newline$nextoff" + set sp " " + if {[string equal $offset ""]} {set sp ""} + + set type [huddle type $huddle_object] + + switch -- $type { + boolean - + number { + return [huddle get_stripped $huddle_object] + } + null { + return null + } + string { + set data [huddle get_stripped $huddle_object] + + # JSON permits only oneline string + #set data [string map { + # \n \\n + # \t \\t + # \r \\r + # \b \\b + # \f \\f + # \\ \\\\ + # \" \\\" + # / \\/ + # } $data + #] + return "\"$data\"" + } + list { + set inner {} + set len [huddle llength $huddle_object] + for {set i 0} {$i < $len} {incr i} { + set subobject [huddle get $huddle_object $i] + lappend inner [jsondumpraw $subobject $offset $newline $nextoff] + } + if {[llength $inner] == 1} { + return "\[[lindex $inner 0]\]" + } + return "\[$nlof[join $inner ,$nlof]$newline$begin\]" + } + dict { + set inner {} + foreach {key} [huddle keys $huddle_object] { + lappend inner [subst {"$key":$sp[jsondumpraw [huddle get $huddle_object $key] $offset $newline $nextoff]}] + } + #if {[llength $inner] == 1} { + # return $inner ;#wrong - breaks with quoted list representation + # #FAILS: toml-test valid/comment/tricky + #} + + return "\{$nlof[join $inner ,$nlof]$newline$begin\}" + } + default { + set node [unwrap $huddle_object] + #foreach {tag src} $node break + lassign $node tag src + return [$types(callback:$tag) jsondumpraw $huddle_object $offset $newline $nextoff] + } + } + } +} + +#typed as per toml-test types +namespace eval tomlish::typedhuddle { + proc from_json {json} { + set plainhuddle [tomlish::huddle::from_json $json] + + error "tomlish::typedhuddle::from_json unimplemented" + } + proc from_dict {d} { + package require huddle + set h [huddle create] + if {[tomlish::dict::is_typeval $d]} { + set dtype [dict get $d type] + switch -- $dtype { + ARRAY { + #error "typedhuddle::from_dict ARRAY not yet handled" + set h_list [huddle list] + set elements [dict get $d value] + foreach el $elements { + set sub [from_dict $el] + huddle append h_list $sub + } + return $h_list + } + default { + set tinfo [tomlish::dict::convert_typeval_to_tomltest $d] + #basic non-container types + set h_tdict [huddle create] + huddle set h_tdict type [huddle string [dict get $tinfo type]] + huddle set h_tdict value [huddle string [dict get $tinfo value]] + return $h_tdict + } + } + } else { + dict for {dictkey dictval} $d { + set jsonkey [tomlish::utils::rawstring_to_jsonstring $dictkey] + if {[tomlish::dict::is_typeval $dictval]} { + set dtype [dict get $dictval type] + switch -- $dtype { + ARRAY { + #error "typedhuddle::from_dict ARRAY not yet handled" + set h_next [huddle list] + set elements [dict get $dictval value] + foreach el $elements { + set sub [from_dict $el] + huddle append h_next $sub + } + } + default { + set tinfo [tomlish::dict::convert_typeval_to_tomltest $dictval] + set tp [dict get $tinfo type] + #basic non-container types + set h_next [huddle create] ;#dict + huddle set h_next type [huddle string [dict get $tinfo type]] + huddle set h_next value [huddle string [dict get $tinfo value]] + } + } + huddle set h $jsonkey $h_next + } else { + #dict + set sub [from_dict $dictval] + huddle set h $jsonkey $sub + } + } + } + return $h + } + proc is_typeval {huddled} { + set htype [huddle type $huddled] + if {$htype ne "dict"} { + return 0 + } + if {[huddle keys $huddled] ne {type value}} { + return 0 + } + set tp [huddle type $huddled type] + switch -- $tp { + string - integer - float - bool - datetime - datetime-local - date-local - time-local { + return 1 + } + } + return 0 + } + + #direction from typed json towards toml + proc convert_typeval_to_tomlish {huddled} { + set htype [huddle get_stripped $huddled type] + set hval [huddle get_stripped $huddled value] + switch -- $htype { + string { + #we need to decide here the type of string element to use in toml/tomlish + #STRING,MULTISTRING,LITERAL,MULTILITERAL + #set unesc [tomlish::utils::unescape_jsonstring $hval] ;#no need - json parser unescaped when creating the huddle + set unesc $hval + #(huddle::json::json2huddle parse $json) + #since it was unescaped any backslashes remaining represent themselves - reapply escape - REVIEW + #set hval [string map [list \\ \\\ ] $hval] + #JSJS + if {[string first \n $unesc] >= 0} { + #always use a MULTI + if {[string first ' $unesc] >=0} { + if {[string first ''' $unesc] >=0} { + set dtype MULTISTRING + } else { + set dtype MULTILITERAL + } + } else { + if {[string first \"\"\" $unesc] >=0} { + set dtype MULTILITERAL + } else { + set dtype MULTISTRING + } + } + } else { + #use multi if needed? + if {[string first '' $hval] >=0} { + if {[string first ''' $unesc] >=0} { + set dtype STRING + } else { + set dtype MULTILITERAL + } + } elseif {[string first ' $unesc] >= 0} { + set dtype STRING + } elseif {[string first \"\"\" $unesc] >= 0} { + set dtype LITERAL + } else { + #STRING or LITERAL? + set dtype STRING + } + } + + } + datetime - bool { + set dtype [string toupper $htype] + } + float { + set dtype FLOAT + if {[string is integer -strict $hval]} { + #json FLOAT specified as integer - must have dot for toml + set hval [expr {double($hval)}] + } + } + integer { + set dtype INT + } + datetime - datetime-local - date-local - time-local { + #DDDD + #set dtype DATETIME + set dtype [string toupper $htype] + } + default { + error "tomlish::typedhuddle::convert_typeval_to_tomlish unrecognised type $htype" + } + } + return [list type $dtype value $hval] + } + +} +namespace eval tomlish::toml { + proc from_binary {bindata} { + set bom "" + set b12 [tcl::string::range $bindata 0 1] + set b12test [string map [list \xEF\xBB utf8_12 \xFE\xFF bom16be \xFF\xFE utf32le_12 \x00\x00 utf32be_12] $b12] + switch -- $b12test { + bom16be { + #FEFF + set bom utf-16be + } + utf32le_12 { + #FFFE + set b34 [tcl::string::range $bindata 2 3] + if {$b34 eq "\x00\x00"} { + set bom utf-32le + } else { + set bom utf-16le + } + } + utf32be_12 { + #0000 + set b34 [tcl::string::range $bindata 2 3] + if {$b34 eq "\xFE\xFF"} { + set bom utf-32be + } + } + utf8_12 { + set b3 [tcl::string::index $bindata 2] + if {$b3 eq "\xBF"} { + set bom utf-8 + } + } + } + if {$bom eq ""} { + #no bom - assume utf8 - but we read in as binary + #if data wasn't actually utf8 we may error here depending on content - or we may just get wrongly encoded chars + set tomldata [encoding convertfrom utf-8 $bindata] + } elseif {$bom eq "utf-8"} { + #utf-8 bom read in as binary + set tomldata [encoding convertfrom utf-8 $bindata] + #bom now encoded as single unicode char \uFFEF + } else { + return -code error -errorcode {TOML ENCODING NOTUTF8} "Input not UTF8 encoded according to BOM. Indicated encoding is '$bom' - invalid for toml" + } + return $tomldata + } + proc from_tomlish {tomlish} { + return [tomlish::encode::tomlish $tomlish] + } + + #todo - rename to taggedjson + proc from_tomlish_from_dict_from_typedjson {json} { + set d [tomlish::dict::from_typedjson $json] + from_tomlish [tomlish::from_dict $d] ;#return tomlish + } + + proc tablename_split {tablename {normalize false}} { + #we can't just split on . because we have to handle quoted segments which may contain a dot. + #eg {dog."tater.man"} + if {$tablename eq ""} { + error "tablename_split. No table name segments found. empty tablename" + } + set sLen [tcl::string::length $tablename] + set segments [list] + set mode "preval" ;#5 modes: preval, quoted,litquoted, unquoted, postval + #quoted is for double-quotes, litquoted is for single-quotes (string literal) + set seg "" + for {set i 0} {$i < $sLen} {incr i} { + + if {$i > 0} { + set lastChar [tcl::string::index $tablename [expr {$i - 1}]] + } else { + set lastChar "" + } + + #todo - track\count backslashes properly + + set c [tcl::string::index $tablename $i] + if {$c eq "\""} { + if {($lastChar eq "\\")} { + #not strictly correct - we could have had an even number prior-backslash sequence + #the toml spec would have us error out immediately on bsl in bad location - but we're + #trying to parse to unvalidated tomlish + set ctest escq + } else { + set ctest dq + } + } else { + set ctest [string map [list " " sp \t tab] $c] + } + + switch -- $ctest { + . { + switch -exact -- $mode { + preval { + error "tablename_split. dot not allowed - expecting a value" + } + unquoted { + #dot marks end of segment. + if {![tomlish::utils::is_barekey $seg]} { + error "tablename_split. unquoted key segment $seg is not a valid toml key" + } + lappend segments $seg + set seg "" + set mode "preval" + } + quoted { + append seg $c + } + litquoted { + append seg $c + } + postval { + #got dot in an expected location + set mode "preval" + } + } + } + dq { + #unescaped dquote + switch -- $mode { + preval { + set mode "quoted" + set seg "\"" + } + unquoted { + #invalid in barekey - but we are after structure only + append seg $c + } + quoted { + append seg $c + #JJJJ + if {$normalize} { + lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" ;#make sure we only accept a dot or end-of-data now. + } + litquoted { + append seg $c + } + postval { + error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" + } + } + } + ' { + switch -- $mode { + preval { + append seg $c + set mode "litquoted" + } + unquoted { + #single quote inside e.g o'neill - ultimately invalid - but we pass through here. + append seg $c + } + quoted { + append seg $c + } + litquoted { + append seg $c + #no normalization to do aside from stripping squotes + if {$normalize} { + lappend segments [tcl::string::range $seg 1 end-1] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" + } + postval { + error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" + } + } + } + sp - tab { + switch -- $mode { + preval - postval { + #ignore + } + unquoted { + #terminates a barekey + lappend segments $seg + set seg "" + set mode "postval" + } + default { + #append to quoted or litquoted + append seg $c + } + } + } + default { + switch -- $mode { + preval { + set mode unquoted + append seg $c + } + postval { + error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" + } + default { + append seg $c + } + } + } + } + + if {$i == $sLen-1} { + #end of data + ::tomlish::log::debug "End of data: mode='$mode'" + switch -exact -- $mode { + preval { + if {[llength $segments]} { + error "tablename_split. Expected a value after last dot separator. tablename: '$tablename'" + } else { + error "tablename_split. Whitespace only? No table name segments found. tablename: '$tablename'" + } + } + unquoted { + if {![tomlish::utils::is_barekey $seg]} { + #e.g toml-test invalid/table/with-pound required to fail for invalid barekey + error "tablename_split. unquoted key segment $seg is not a valid toml key" + } + lappend segments $seg + } + quoted { + error "tablename_split. Expected a trailing double quote. tablename: '$tablename'" + } + litquoted { + error "tablename_split. Expected a trailing single quote. tablename: '$tablename'" + } + postval { + #ok - segment already lappended + } + } + } + } + + #note - we must allow 'empty' quoted strings '' & "" + # (these are 'discouraged' but valid toml keys) + + return $segments + } + + #tablenames (& tablearraynames) may contain irrelevant leading, trailing and interspersed whitespace + # tablenames can be made up of segments delimited by dots. .eg [ a.b . c ] + #trimmed, the tablename becomes {a.b.c} + # A segment may contain whitespace if it is quoted e.g [a . b . "c etc " ] + #ie whitespace is only irrelevant if it's outside a quoted segment + #trimmed, the tablename becomes {a.b."c etc "} + proc tablename_trim {tablename} { + set segments [tomlish::toml::tablename_split $tablename false] + set trimmed_segments [list] + foreach seg $segments { + lappend trimmed_segments [::string trim $seg " \t"] + } + return [join $trimmed_segments .] + } +} + +namespace eval tomlish::dict { + namespace export {[a-z]*}; # Convention: export all lowercase + namespace path [namespace parent] + + #from_taggedjson + proc from_typedjson {json} { + package require huddle + package require huddle::json + set h [huddle::json::json2huddle parse $json] + #json2huddle parse unescapes the basic json escapes \n \\ etc + #$h could now contain raw form of surrogate pair (json2huddle parse as at 2025-014 doesn't convert the surrogates - just unescapes?) + if {[catch {encoding convertto utf-8 $h} errM]} { + #This test suggests we have raw surrogate pairs - REVIEW + package require punk::cesu + set h [punk::cesu::from_surrogatestring $h] + } + tomlish::dict::from_typedhuddle $h + } + proc from_typedhuddle {h} { + set resultd [dict create] + switch -- [huddle type $h] { + dict { + foreach k [huddle keys $h] { + switch -- [huddle type $h $k] { + dict { + set huddle_d [huddle get $h $k] + #puts stderr "huddle_d: $huddle_d" + #set v [huddle get_stripped $h $k] + if {[tomlish::typedhuddle::is_typeval $huddle_d]} { + dict set resultd $k [tomlish::typedhuddle::convert_typeval_to_tomlish $huddle_d] + } else { + dict set resultd $k [from_typedhuddle $huddle_d] + } + } + list { + set items [huddle get $h $k] + + set numitems [huddle llength $items] + if {$numitems == 0} { + dict set resultd $k [list type ARRAY value {}] + } else { + set arritems [list] + for {set i 0} {$i < $numitems} {incr i} { + set item [huddle get $items $i] + #puts stderr "item: $item" + #set v [huddle get $item] + if {[tomlish::typedhuddle::is_typeval $item]} { + lappend arritems [tomlish::typedhuddle::convert_typeval_to_tomlish $item] + } else { + lappend arritems [from_typedhuddle $item] + } + } + dict set resultd $k [list type ARRAY value $arritems] + } + } + default { + error "dict_from_json unexpected subtype [huddle type $h $k] in dict" + } + } + } + } + list { + set items [huddle get $h] + set numitems [huddle llength $items] + if {$numitems == 0} { + return [list type ARRAY value {}] + } else { + set arritems [list] + for {set i 0} {$i < $numitems} {incr i} { + set item [huddle get $items $i] + #puts stderr "item: $item" + #set v [huddle get $item] + if {[tomlish::typedhuddle::is_typeval $item]} { + lappend arritems [tomlish::typedhuddle::convert_typeval_to_tomlish $item] + } else { + lappend arritems [from_typedhuddle $item] + } + } + return [list type ARRAY value $arritems] + } + + } + } + return $resultd + } + + proc is_typeval {d} { + #designed to detect {type value } e.g {type INT value 3}, {type STRING value "blah etc"} + #as a sanity check we need to avoid mistaking user data that happens to match same form + #consider x.y={type="spud",value="blah"} + #The value of type will itself have already been converted to {type STRING value spud} ie never a single element. + #check the length of the type as a quick way to see it's a tag - not something else masqerading. + expr {[string is dict $d] && [dict size $d] == 2 && [dict exists $d type] && [dict exists $d value] && [llength [dict get $d type]] == 1} + } + + #simple types only - not containers? + proc convert_typeval_to_tomltest {d} { + set dtype [dict get $d type] + set dval [dict get $d value] + switch -- $dtype { + INT { + set testtype integer + set dval [expr {$dval}] ;#convert e.g 0xDEADBEEF to 3735928559 + } + DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - FLOAT - BOOL { + #DDDD + set testtype [string tolower $dtype] + } + STRING - MULTISTRING { + set testtype string + #JJJJ + set dval [tomlish::utils::unescape_string $dval] + set dval [tomlish::utils::rawstring_to_jsonstring $dval] + } + LITERAL - MULTILITERAL { + set testtype string + #don't validate on way out to json here? + #decoder should validate by calling tomlish::from_dict + #if {![tomlish::utils::rawstring_is_valid_literal $dval]} { + # return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + #} + set dval [tomlish::utils::rawstring_to_jsonstring $dval] + } + MULTILITERAL { + #todo - escape newlines for json? + set testtype string + } + default { + error "convert_typeval_to_tomltest unhandled type $dtype" + } + } + return [list type $testtype value $dval] + } + + # Check that each leaf is a typeval or typeval dict + #importantly: must accept empty dict leaves e.g {x {}} + proc is_typeval_dict {d {checkarrays 0}} { + if {![string is dict $d]} { + return 0 + } + dict for {k v} $d { + set is_d 0 + if {!([is_typeval $v] || [set is_d [is_typeval_dict $v $checkarrays]])} { + return 0 + } + if {!$is_d} { + set vtype [dict get $v type] + switch -- $vtype { + INT - FLOAT - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - BOOL - LITERAL - STRING - MULTILITERAL - MULTISTRING {} + ARRAY { + if {$checkarrays} { + set arrdata [dict get $v value] + foreach el $arrdata { + if {![is_typeval_dict $el $checkarrays]} { + return 0 + } + } + } + } + default { + puts stderr "is_typeval_dict: Unexpected type '$vtype'" + return 0 + } + } + } + } + return 1 + } + + proc last_tomltype_posn {d} { + set last_simple -1 + set dictposn [expr {[dict size $d] -1}] + foreach k [lreverse [dict keys $d]] { + set dval [dict get $d $k] + if {[is_typeval $dval]} { + set last_simple $dictposn + break + } + incr dictposn -1 + } + return $last_simple + } + + + #review + proc name_from_tablestack {tablestack} { + set name "" + foreach tinfo [lrange $tablestack 1 end] { + lassign $tinfo type namepart + switch -- $type { + T { + if {$name eq ""} { + append name $namepart + } else { + append name .$namepart + } + } + I { + if {$name eq ""} { + append name $namepart + } else { + append name .$namepart + } + } + default { + #end at first break in the leading sequence of T & I tablenames + break + } + } + } + return $name + } + + + #tablenames_info is a flat dict with the key being an '@@' path + proc _show_tablenames {tablenames_info} { + #e.g {@l@a @@b} {ttype header_table tdefined closed} + append msg \n "tablenames_info:" \n + dict for {tkey tinfo} $tablenames_info { + append msg " " "table: $tkey" \n + dict for {field finfo} $tinfo { + append msg " " "$field $finfo" \n + } + } + return $msg + } + + #take a raw string and classify: result is a 2 element list comprised of KEY|SQKEY|DQKEY and the value being the appropriate inner string + proc classify_rawkey {rawval} { + if {![::tomlish::utils::is_barekey $rawval]} { + #requires quoting + # + #Any dot in the key would have been split by dict::from_tomlish - so if it's present here it's part of this key - not a level separator! + # + #we'll use a basic mechanisms for now to determine the type of quoting + # - whether it has any single quotes or not. + # (can't go in an SQKEY) + # - whether it has any chars that require quoting when in a Bstring + # (if so - then its visual representation might be unsuitable for a key in a toml text file, so escape and put in DQKEY instead of literal SQKEY) + #todo - more? + #REVIEW - the backslash might often be in things like a regex or windows path - which is often better expressed in a literal SQKEY + # from literal examples: + # 'c:\Users\nodejs\templates' + # '<\i\c*\s*>' + #If these are in *keys* our basic test will express these as: + # "c:\\Users\\nodejs\\templates" + # "<\\i\\c*\\s*>" + # This still works - but a smarter test might determine when SQKEY is the better form? + #when coming from external systems - can we even know if the value was already escaped? REVIEW + #Probably when coming from json - we know it's already escaped - and so we build our dict converting keys to unescaped + #TODO - clarify in documentation that keys resulting from dict::from_tomlish are in 'normalized' (unescaped) form + # + #For keys - we currently (2025) are only allowed barekeys,basic strings and literal strings. (no multiline forms) + set k_escaped [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $rawval] + if {[string length $k_escaped] != [string length $rawval]} { + #escaping made a difference + set has_escape_requirement 1 + } else { + set has_escape_requirement 0 + } + if {[string first ' $rawval] >=0 || $has_escape_requirement} { + #basic string + # (any ANSI SGR sequence will end up here in escaped form ) + return [list DQKEY $k_escaped] + } else { + #literal string + return [list SQKEY $rawval] + } + } else { + return [list KEY $rawval] + } + } + #the quoting implies the necessary escaping for DQKEYs + proc join_and_quote_rawkey_list {rawkeylist} { + set result "" + foreach rk $rawkeylist { + lassign [tomlish::dict::classify_rawkey $rk] type val + switch -- $type { + SQKEY { + append result "'$val'." + } + DQKEY { + append result "\"$val\"." + } + KEY { + append result "$val." + } + } + } + return [string range $result 0 end-1] + } + + proc _process_tomlish_dottedkey {element {context_refpath {}}} { + upvar tablenames_info tablenames_info + upvar datastructure datastructure + set dottedtables_defined [list] + set dkey_info [tomlish::get_dottedkey_info $element] + #e.g1 keys {x.y y} keys_raw {'x.y' "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) + #e.g2 keys {x.y y} keys_raw {{"x.y"} "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) + + #[a.b] + #t1.t2.dottedtable.leafkey = "val" + #we have already checked supertables a & {a b} + # - in basic case, passed in context_refpath as {@@a @@b} + # - our context_refpath could also include some combination of keys and array indices e.g {@@a @@b 3 @@subtablekey} + #We need to check {a b t1} & {a b t2} ('creation' only) + #and then 'dottedtable' is 'defined' while leafkey is an ordinary key in dottedtable + + #note we also get here as a 'dottedkey' with the following even though there is no dot in k + #[a.b] + #leafkey = "val" + + set all_dotted_keys [dict get $dkey_info keys] + set dottedkeyname [join $all_dotted_keys .] + + if {[llength $all_dotted_keys] > 1} { + #dottedtable.k=1 + #tX.dottedtable.k=1 + #etc + + #Wrap in a list so we can detect 'null' equivalent. + #We can't use empty string as that's a valid dotted key segment + set dottedtable_bag [list [lindex $all_dotted_keys end-1]] + set dotparents [lrange $all_dotted_keys 0 end-2] + } else { + #basic case - not really a 'dotted' key + #k = 1 + set dottedtable_bag [list] ;#empty bag + set dotparents [list] + } + #assert dottedtable_bag only ever holds 0 or 1 elements + set leaf_key [lindex $all_dotted_keys end] + + #see also: https://github.com/toml-lang/toml/issues/846 "Can dotted keys insert into already-defined [tables]?" + #This code was originally written with a misinterpretation of: + #"Dotted keys create and define a table for each key part before the last one, provided that such tables were not previously created." + # 'each key part before the last one' refers to each key in a single dotted key entry + # not each 2nd-to last key in a list of dotted keys. + + + #we've already tested the table/tablearray keys that got us here.. but not the dottedkey segments (if any) prior to dottedtable & leaf_key + set dottedsuper_refpath $context_refpath + foreach normkey $dotparents { + lappend dottedsuper_refpath @@$normkey + if {![dict exists $tablenames_info $dottedsuper_refpath ttype]} { + #supertable with this combined path (context_path plus parts of dottedkey) not yet 'created' + if {[tomlish::dict::path::exists $datastructure $dottedsuper_refpath]} { + #There is data so it must have been created as a keyval + set msg "Path $dottedsuper_refpath for dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #dict set tablenames_info $dottedsuper_refpath ttype unknown_table ;#REVIEW + dict set tablenames_info $dottedsuper_refpath ttype unknown_dotted ;#REVIEW + + #see note above re dotted keys insert into already defined table - we need to 'define' all the dotted supers in this block + lappend dottedtables_defined $dottedsuper_refpath + + #ensure empty tables are still represented in the datastructure + tomlish::dict::path::set_endpoint datastructure $dottedsuper_refpath {} ;#set to empty subdict + } else { + #added for fixed assumption + set ttype [dict get $tablenames_info $dottedsuper_refpath ttype] + set definedstate [dictn getdef $tablenames_info [list $dottedsuper_refpath tdefined] NULL] + switch -- $ttype { + dottedkey_table - unknown_dotted { + #'created' as dotted - but make sure it's from this header section - i.e defined not set + if {$definedstate ne "NULL"} { + #collision with some other dottedkey + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + itable { + #itables are immediately defined + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' as itable (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + default { + #header_table, header_tablearray or unknown_header + #is header_tablearray any different from header_table in this context? + #we don't set tdefined for tablearray anyway - so should be ok here. + if {$definedstate ne "NULL"} { + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' in a header (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + } + } + } + + #dottedtable being 2nd last segment was for original assumption - todo - tidy up? we are duplicating the logic above + #review - any need/advantage to treat 2nd to last key any different from other supers? ie D in a.b.c.D.key=1 + #no need for 'unknown_dotted' vs 'dottedkey_table' ?? + if {[llength $dottedtable_bag] == 1} { + set dottedtable [lindex $dottedtable_bag 0] + set dottedkey_refpath [list {*}$dottedsuper_refpath "@@$dottedtable"] + #our dotted key is attempting to define a table + if {![dict exists $tablenames_info $dottedkey_refpath ttype]} { + #first one - but check datastructure for collisions + if {[tomlish::dict::path::exists $datastructure $dottedkey_refpath]} { + set msg "Path $dottedkey_refpath for dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #'create' the table + dict set tablenames_info $dottedkey_refpath ttype dottedkey_table + #don't actually set 'defined' here.. use the end of TABLE record to close them off by looking at this list + tomlish::dict::path::set_endpoint datastructure $dottedkey_refpath {} + lappend dottedtables_defined $dottedkey_refpath + + # + } else { + #exists - but might be from another dottedkey within the current header section + #the table is open for adding keys until the next 'header' section ([tablename] / [[tablearray]]) + #check for 'defined' closed (or just existence) + set ttype [dict get $tablenames_info $dottedkey_refpath ttype] + set definedstate [dictn getdef $tablenames_info [list $dottedkey_refpath tdefined] NULL] + switch -- $ttype { + dottedkey_table - unknown_dotted { + #'created' as dotted - but make sure it's from this header section - i.e defined not set + if {$definedstate ne "NULL"} { + #collision with some other dottedkey + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + itable { + #itables are immediately defined + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' as itable (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + default { + #header_table, header_tablearray or unknown_header + #is header_tablearray any different from header_table in this context? + #we don't set tdefined for tablearray anyway - so should be ok here. + if {$definedstate ne "NULL"} { + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' in a header (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + } + } + } else { + set dottedkey_refpath $dottedsuper_refpath + } + #assert - dottedkey represents a key val pair that can be added + + + set fullkey_refpath [list {*}$dottedkey_refpath @@$leaf_key] + if {[tomlish::dict::path::exists $datastructure $fullkey_refpath]} { + set msg "Duplicate key. The key (path $fullkey_refpath) already exists at this level in the toml data. The toml data is not valid." + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + + #set keyval_dict [_get_keyval_value $element] + lassign [_get_keyval_value $element] _ keyval_dict _ sub_tablenames_info + + + #keyval_dict is either a {type value } + #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level + #punk::dict::is_typeval can distinguish + tomlish::log::debug "_process_tomlish_dottedkey>>> context:$context_refpath dottedkey $dottedkeyname kv: $keyval_dict" + tomlish::dict::path::set_endpoint datastructure $fullkey_refpath $keyval_dict + + #remove ? + #if {![tomlish::dict::is_typeval $keyval_dict]} { + # #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys + # # inner structure will contain {type value } if all leaves are not empty ITABLES + # ##set tkey [list {*}$norm_segments {*}$all_dotted_keys] + + # #by not creating a tablenames_info record - we effectively make it closed anyway? + # #it should be detected as a key + # #is there any need to store tablenames_info for it?? + # #REVIEW + + # ##TODO - update? + # #dictn incr tablenames_info [list $tkey seencount] + # ##if the keyval_dict is not a simple type x value y - then it's an inline table ? + # ##if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. + # #dictn set tablenames_info [list $tkey closed] 1 + #} + return [dict create dottedtables_defined $dottedtables_defined] + } + #tomlish::dict::from_tomlish is a *basic* programmatic datastructure for accessing the data. + # produce a dictionary of keys and values from a tomlish tagged list. + # dict::from_tomlish is primarily for read access to toml data. + #Extraneous (not within quoted sections) whitespace and comments are not preserved in this structure, + # so a roundtrip from toml to this datastructure and back to toml will lose whitespace formatting and comments. + # creating/changing toml values can be done directly on a tomlish list if preserving (or adding) formatting/comments is desired. + #A separate package 'tomlish::object' may be needed to allow easier programmatic creating/updating/deleting of data elements whilst preserving (or adding or selectively deleting/editing) such formatting. + # + + #within an ARRAY, we store a list of items such as plain dicts (possibly empty) and {type value } for simple types + #(ARRAYS can be mixed type) + #This means our dict structure should have only ARRAY and simple types which need to be in {type value } form + #A dict within an array encodeded as a type ITABLE value should also parse - but is the unpreferred form - REVIEW test? + + #Namespacing? + #ie note the difference: + #[Data] + #temp = { cpu = 79.5, case = 72.0} + # versus + #[Data] + #temps = [{cpu = 79.5, case = 72.0}] + proc from_tomlish {tomlish} { + package require dictn + + #keep track of which tablenames have already been directly defined, + # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' + #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. + #we don't error out just because a previous tablename segment has already appeared. + + #Declaring, Creating, and Defining Tables + #https://github.com/toml-lang/toml/issues/795 + #(update - only Creating and Defining are relevant terminology) + + #review + #tablenames_info keys ttype created, tdefined, createdby, definedby, closedby ??? review keys + # [tname] = header_table [[tname]] = header_tablearray + + #consider the following 2 which are legal: + #[table] #'table' created, defined=open type header_table + #x.y = 3 + #[table.x.z] #'table' tdefined=closed closedby={header_table table.x.z}, 'table.x' created, 'table.x.z' created tdefined=open tdefinedby={header_table table.x.z} + #k= 22 + # #'table.x.z' tdefined=closed closedby={eof eof} + + #equivalent datastructure + + #[table] #'table' created, tdefined=open definedby={header_table table} + #[table.x] #'table' tdefined=closed closedby={header_table table.x}, 'table.x' created tdefined=open definedby={header_table table.x} + #y = 3 + #[table.x.z] #'table.x' tdefined=closed closedby={header_table table.x.z}, 'table.x.z' created tdefined=open definedby={header_table table.x.z} + #k=22 + + #illegal + #[table] #'table' created and tdefined=open + #x.y = 3 #'table.x' created first keyval pair tdefined=open definedby={keyval x.y = 3} + #[table.x.y.z] #'table' tdefined=closed, 'table.x' closed because parent 'table' closed?, 'table.x.y' cannot be created + #k = 22 + # + ## - we would fail on encountering table.x.y because only table and table.x are effectively tables - but that table.x is closed should be detected (?) + + #illegal + #[table] + #x.y = {p=3} + #[table.x.y.z] + #k = 22 + ## we should fail because y is an inline table which is closed to further entries + + #note: it is not safe to compare normalized tablenames using join! + # e.g a.'b.c'.d is not the same as a.b.c.d + # instead compare {a b.c d} with {a b c d} + # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. + #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' + #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} + + + + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info + } else { + set tablenames_info [dict create] ;#keyed on tablepath each of which is an @@path such as {@@config @@subgroup @@etc} (corresponding to config.subgroup.etc) + #also has non @@ indexes which are list indexes as taken by tcl list commands (int or end-1 etc) + #value is a dict with keys: ttype, tdefined + } + + + log::info "---> dict::from_tomlish processing '$tomlish'<<<" + set items $tomlish + + foreach lst $items { + if {[lindex $lst 0] ni $::tomlish::tags} { + error "supplied list does not appear to be toml parsed into a tomlish tagged list. Run tomlish::decode::toml on the raw toml data to produce a tomlish list" + } + } + + if {[lindex $tomlish 0] eq "TOMLISH"} { + #ignore TOMLISH tag at beginning + set items [lrange $tomlish 1 end] + } + + set datastructure [dict create] + set dottedtables_defined [list] + foreach item $items { + set tag [lindex $item 0] + #puts "...> item:'$item' tag:'$tag'" + switch -exact -- $tag { + KEY - DQKEY - SQKEY - INT - FLOAT - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - STRING - LITERAL { + #why would we get individual key item as opposed to DOTTEDKEY? + error "tomlish::dict::from_tomlish error: invalid tag: $tag. At the toplevel, from_tomlish can only process WS NEWLINE COMMENT and compound elements DOTTEDKEY TABLE TABLEARRAY ITABLE MULTILITERAL MULTISTRING" + } + DOTTEDKEY { + #toplevel dotted key + set dkinfo [_process_tomlish_dottedkey $item] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] + #at any level - we don't expect any more DOTTEDKEY records in a tomlish structure after TABLE or TABLEARRAY are encountered + #as those records should encapsulate their own dottedkeys + + } + TABLEARRAY { + #close off any dottedtables_defined created by dottedkeys at this level + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + set dottedtables_defined [list] ;#for closing off at end by setting 'defined' + + set tablearrayname [lindex $item 1] + tomlish::log::debug "---> tomlish::dict::from_tomlish processing item TABLENAME (name: $tablearrayname): $item" + set norm_segments [::tomlish::toml::tablename_split $tablearrayname true] ;#true to normalize + #we expect repeated tablearray entries - each adding a sub-object to the value, which is an array/list. + #tablearrayname is likely to appear multiple times - so unlike a TABLE we don't check for 'defined' for the full name as an indicator of a problem + set supertable [list] + ############## + # [[a.b.c.d]] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + set refpath [list] ;#e.g @@j1 @@j2 1 @@k1 end + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + lappend refpath @@$normseg + if {![dict exists $tablenames_info $refpath ttype]} { + #supertable with this path doesn't yet exist + if {[tomlish::dict::path::exists $datastructure $refpath]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] of tablearray name $tablearrayname already has data but doesn't appear to be a table - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablearray_supertable_keycollision + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } else { + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #review - we can't later specify as tablearray so should just set ttype to header_table even though it's being created + # because of a tablearray header? + #By setting ttype to something other than table_header we can provide more precise errorCode/msg ?? + dict set tablenames_info $refpath ttype unknown_header + #ensure empty tables are still represented in the datastructure + dict set datastructure {*}$supertable [list] + } + } else { + #REVIEW!! + # what happens with from_toml {[[a.b.c]]} {[[a.b]]} ??? + #presumed that a and a.b were 'created' as tables (supertables of tablearray at a.b.c) and can't now be redefined as tablearrays + + #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable + #but if it's a tablearray - we need to point to the most 'recently defined table element of the array' + #(last member of that array - need to check type? allowed to have non-table elements ie nonhomogenous??) + set supertype [dict get $tablenames_info $refpath ttype] + if {$supertype eq "header_tablearray"} { + #exercised by toml-tests: + # valid/table/array-table-array + # valid/table/array-nest + + #puts stdout "todict!!! TABLEARRAY nesting required for supertable [join $supertable .]" + + #'refer' to the appropriate element in existing array + set arrdata [tomlish::dict::path::get $datastructure [list {*}$refpath @@value]] + set idx [expr {[llength $arrdata]-1}] + if {$idx < 0} { + #existing tablearray should have at least one entry even if empty (review) + set msg "reference to empty tablearray?" + return -code error -errorcode {TOMLISH STRUCTURE REFTOEMPTYTABLEARRAY} $msg + } + lappend refpath $idx + } + } + } + # + #puts "TABLE supertable refpath $refpath" + lappend refpath @@[lindex $norm_segments end] + tomlish::log::debug "TABLEARRAY refpath $refpath" + set tablearray_refpath $refpath + + + if {![dict exists $tablenames_info $tablearray_refpath ttype]} { + #first encounter of this tablearrayname + if {[tomlish::dict::path::exists $datastructure $tablearray_refpath]} { + #e.g from_toml {a=1} {[[a]]} + set msg "Cannot create tablearray name $tablearrayname. Key already has data but key doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablearray_direct_keycollision_error + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #no collision - we can create the tablearray and the array in the datastructure + dict set tablenames_info $tablearray_refpath ttype header_tablearray + #dict set datastructure {*}$norm_segments [list type ARRAY value {}] + #create array along with empty array-item at position zero + tomlish::dict::path::set_endpoint datastructure $tablearray_refpath [list type ARRAY value {{}}] + set arrayitem_refpath [list {*}$tablearray_refpath 0] + #set ARRAY_ELEMENTS [list] + } else { + #we have an existing tablenames_info record for this path - but is it a tablearray? + set ttype [dict get $tablenames_info $tablearray_refpath ttype] + if {$ttype ne "header_tablearray"} { + #header_table or itable + switch -- $ttype { + itable {set ttypename itable} + header_table {set ttypename table} + dottedkey_table {set ttypename dottedkey_table} + unknown_header - unknown_dotted { + #table was created e.g as supertable - but not specifically a tablearray + #violates ordering - return specific test error + set msg "Table $tablearrayname referenced as supertable before tablearray defined (ordering)" + return -code error -errorcode {TOMLISH STRUCTURE TABLEARRAYORDERING} $msg + } + default {error "unrecognised type $ttype - expected header_table or itable"} + } + set msg "tablearray name $tablearrayname already appears to be already created as '$ttypename' not tablearray - invalid?" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #EXISTING tablearray + #add to array + #error "add_to_array not implemented" + #{type ARRAY value } + #set ARRAY_ELEMENTS [dict get $datastructure {*}$norm_segments value] + tomlish::log::debug ">>>>pre-extend-array dict::from_tomlish datastructure: $datastructure" + set existing_array [tomlish::dict::path::get $datastructure [list {*}$tablearray_refpath @@value]] + set arrayitem_refpath [list {*}$tablearray_refpath [llength $existing_array]] + tomlish::dict::path::lappend datastructure $tablearray_refpath {} + tomlish::log::debug ">>>>post-extend-array dict::from_tomlish datastructure: $datastructure" + } + + + #set object [dict create] ;#array context equivalent of 'datastructure' + + #add to ARRAY_ELEMENTS and write back in to datastructure. + foreach element [lrange $item 2 end] { + set type [lindex $element 0] + tomlish::log::debug "----> todict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element $arrayitem_refpath] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] + } + NEWLINE - COMMENT - WS { + #ignore + } + TABLE { + #we *perhaps* should be able to process tablearray subtables either as part of the tablearray record, or independently. + #(or even a mixture of both, although that is somewhat an edge case, and of limited utility) + #[[fruit]] + #x=1 + # [fruit.metadata] + # [fruit.otherdata] + + #when processing a dict destined for the above - the tomlish generator (e.g from_dict) + #should create as 1 or 3 records (but could create 2 records if there was an unrelated table in between the subtables) + #choices: all in tablearray record, tablearray + 1 or 2 table records. + # + #We are going the other way here - so we just need to realise that the list of tables 'belonging' to this tablearray might not be complete. + # + #the subtable names must be prefixed with the tablearray - we should validate that for any contained TABLE records + + #The default mechanism is for from_dict to produce tomlish with separate TABLE records - and use the ordering to determine membership + #If we were to support wrapping the TABLE records within a TABLEARRAY - we should also support TABLEARRAY within TABLEARRAY + # ----------------------------------------------------------------------- + #Implementing this is not critical for standard encoding/decoding of toml! + #It would be an alternative form for the tomlish intermediate form - and adds complexity. + # + #The upside would be to provide a function for sorting/rearranging in the tomlish form if all records were fully encapsulated. + #A possible downside is that unrelated tables placed before a tablearray is fully defined (within the tablearray definition area in toml) + # would have to be re-positioned before or after the encapsulated tablearray record. + # While unrelated tables in such a position aren't a recommended way to write toml, they appear to be valid + # and preserving the author's ordering is a goal of the basic encoding/decoding operations if no explicit sorting/reordering was requested. + # + #Consider an 'encapsulate' method to this (tomlish -> tomlish) + # ----------------------------------------------------------------------- + #todo + error "tomlish::dict::from_tomlish TABLE element within TABLEARRAY not handled - TABLE should be a separate tomlish record" + } + default { + error "tomlish::dict::from_tomlish Sub element of type '$type' not understood in tablearray context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" + } + } + } + + #end of TABLEARRAY record - equivalent of EOF or next header - close off the dottedtables + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + } + TABLE { + #close off any dottedtables_defined created by dottedkeys at this level + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + set tablename [lindex $item 1] + set dottedtables_defined [list] ;#for closing off at end by setting 'defined' + #As our TABLE record contains all it's child DOTTEDKEY records - this should be equivalent to setting them as defined at EOF or next header. + + #----------------------------------------------------------------------------------- + #default assumption - our reference is to the main tablenames_info and datastructure + #Will need to append keys appropriately if we have recursed + #----------------------------------------------------------------------------------- + + log::debug "---> tomlish::dict::from_tomlish processing item TABLE (name: $tablename): $item" + set norm_segments [::tomlish::toml::tablename_split $tablename true] ;#true to normalize + + + + set name_segments [::tomlish::toml::tablename_split $tablename 0] ;#unnormalized e.g ['a'."b".c.d] -> 'a' "b" c d + #results of tablename_split 0 are 'raw' - ie some segments may be enclosed in single or double quotes. + + + set supertable [list] + ############## + # [a.b.c.d] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + + ############## + #[[a]] + #[a.b] #supertable a is tablearray + ############## + + #also consider + ############## + # [[a.b]] + # [a.b.c.d] #supertable a is a table, supertable a.b is tablearray, supertable a.b.c is elementtable + ############## + set refpath [list] ;#e.g @@j1 @@j2 1 @@k1 end + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + lappend refpath @@$normseg + if {![dict exists $tablenames_info $refpath ttype]} { + #supertable with this path doesn't yet exist + if {[tomlish::dict::path::exists $datastructure $refpath]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] of table name $tablename (path $refpath) already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #we also don't know whether it's a table or a dottedkey_table (not allowed to be tablearray - out of order?) + dict set tablenames_info $refpath ttype unknown_header + #ensure empty tables are still represented in the datastructure + #dict set datastructure {*}$supertable [list] + tomlish::dict::path::set_endpoint datastructure $refpath {} + } else { + #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable + if {[dict get $tablenames_info $refpath ttype] eq "header_tablearray"} { + #'refer' to the appropriate element in existing array + set arrdata [tomlish::dict::path::get $datastructure [list {*}$refpath @@value]] + set idx [expr {[llength $arrdata]-1}] + if {$idx < 0} { + #existing tablearray should have at least one entry even if empty (review) + set msg "reference to empty tablearray?" + return -code error -errorcode {TOMLISH STRUCTURE REFTOEMPTYTABLEARRAY} $msg + } + lappend refpath $idx + } else { + #?? + if {[dictn getdef $tablenames_info [list $refpath tdefined] NULL] eq "NULL"} { + } else { + } + } + } + } + #puts "TABLE supertable refpath $refpath" + lappend refpath @@[lindex $norm_segments end] + tomlish::log::info "TABLE refpath $refpath" + set table_refpath $refpath + + + + + #table [a.b.c.d] hasn't been defined - but may have been 'created' already by a longer tablename + # - or may have existing data from a keyval + if {![dict exists $tablenames_info $table_refpath ttype]} { + if {[tomlish::dict::path::exists $datastructure $table_refpath]} { + #e.g from_toml {a=1} {[a]} + set msg "Cannot create table name $tablename. Key already has data but key doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablename_keyval_collision_error + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #no data or previously created table + dict set tablenames_info $table_refpath ttype header_table + + #We are 'defining' this table's keys and values here (even if empty) + #dict set datastructure {*}$norm_segments [list] ;#ensure table still represented in datastructure even if we add no keyvals here + tomlish::dict::path::set_endpoint datastructure $table_refpath {} ;#ensure table still represented in datastructure even if we add no keyvals here + } else { + if {[dict get $tablenames_info $table_refpath ttype] eq "header_tablearray"} { + #e.g tomltest invalid/table/duplicate-table-array2 + #[[tbl]] + #[tbl] + set msg "Table name $tablename has already been created as a tablearray. Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } else { + #any other type tdefined is a problem + set T_DEFINED [dictn getdef $tablenames_info [list $table_refpath tdefined] NULL] + if {$T_DEFINED ne "NULL" } { + #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path + set msg "Table name $tablename has already been defined in the toml data. Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + } + dict set tablenames_info $table_refpath tdefined open + + #now add the contained elements + foreach element [lrange $item 2 end] { + set type [lindex $element 0] + log::debug "----> todict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element $table_refpath] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] + } + NEWLINE - COMMENT - WS { + #ignore + } + default { + error "Sub element of type '$type' not understood in table context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" + } + } + } + + #end of TABLE record - equivalent of EOF or next header - close off the dottedtables + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + } + ITABLE { + #As there is no other mechanism to create tables within an ITABLE than dottedkeys + # and ITABLES are fully defined/enclosed - we can rely on key collision and don't need to track dottedtables_defined - REVIEW. + set dottedtables_defined [list] + #SEP??? + #ITABLE only ever on RHS of = or inside ARRAY + set datastructure [dict create] + set tablenames_info [dict create] + + foreach element [lrange $item 1 end] { + set type [lindex $element 0] + log::debug "----> tododict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element] + } + NEWLINE - COMMENT - WS { + #ignore + } + default { + error "Sub element of type '$type' not understood in ITABLE context. Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" + } + } + } + } + ARRAY { + #arrays in toml are allowed to contain mixtures of types + set datastructure [list] + log::debug "--> processing array: $item" + + foreach element [lrange $item 1 end] { + set type [lindex $element 0] + log::debug "----> tododict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + set value [lindex $element 1] + lappend datastructure [list type $type value $value] + } + STRING { + #JJJJ + #don't unescape string! + set value [lindex $element 1] + #lappend datastructure [list type $type value [::tomlish::utils::unescape_string $value]] + lappend datastructure [list type $type value $value] + } + LITERAL { + set value [lindex $element 1] + lappend datastructure [list type $type value $value] + } + ITABLE { + #anonymous table + #lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] + lappend datastructure [::tomlish::dict::from_tomlish [list $element]] ;#store itables within arrays as raw dicts (possibly empty) + } + TABLE - TABLEARRAY { + #invalid? shouldn't be output from from_dict - but could manually be constructed as such? review + #doesn't make sense as table needs a name? + #take as synonym for ITABLE? + error "tomlish::dict::from_tomlish $type within array unexpected" + } + ARRAY - MULTISTRING - MULTILITERAL { + #set value [lindex $element 1] + lappend datastructure [list type $type value [::tomlish::dict::from_tomlish [list $element]]] + } + WS - SEP - NEWLINE - COMMENT { + #ignore whitespace, commas, newlines and comments + } + default { + error "tomlish::dict::from_tomlish Unexpected value type '$type' found in array" + } + } + } + } + MULTILITERAL { + #triple squoted string + #first newline stripped only if it is the very first element + #(ie *immediately* following the opening delims) + #All whitespace other than newlines is within LITERALPARTS + # ------------------------------------------------------------------------- + #todo - consider extension to toml to allow indent-aware multiline literals + # how - propose as issue in toml github? Use different delim? e.g ^^^ ? + #e.g + # xxx=?'''abc + # def + # etc + # ''' + # - we would like to trimleft each line to the column following the opening delim + # ------------------------------------------------------------------------- + + log::debug "---> todict processing multiliteral: $item" + set parts [lrange $item 1 end] + if {[lindex $parts 0 0] eq "NEWLINE"} { + set parts [lrange $parts 1 end] ;#skip it + } + for {set idx 0} {$idx < [llength $parts]} {incr idx} { + set element [lindex $parts $idx] + set type [lindex $element 0] + switch -exact -- $type { + LITERALPART { + append stringvalue [lindex $element 1] + } + NEWLINE { + set val [lindex $element 1] + if {$val eq "lf"} { + append stringvalue \n + } else { + append stringvalue \r\n + } + } + default { + error "Unexpected value type '$type' found in multistring" + } + } + } + set datastructure $stringvalue + } + MULTISTRING { + #triple dquoted string + log::debug "---> tomlish::dict::from_tomlish processing multistring: $item" + set stringvalue "" + set idx 0 + set parts [lrange $item 1 end] + for {set idx 0} {$idx < [llength $parts]} {incr idx} { + set element [lindex $parts $idx] + set type [lindex $element 0] + #We use STRINGPART in the tomlish representation as a distinct element to STRING - which would imply wrapping quotes to be reinserted + switch -exact -- $type { + STRING { + #todo - do away with STRING ? + #we don't build MULTISTRINGS containing STRING - but should we accept it? + tomlish::log::warn "double quoting a STRING found in MULTISTRING - should be STRINGPART?" + #append stringvalue "\"[::tomlish::utils::unescape_string [lindex $element 1]]\"" + append stringvalue "\"[lindex $element 1]\"" + } + STRINGPART { + #JJJ + #don't unescape string + #append stringvalue [::tomlish::utils::unescape_string [lindex $element 1]] + append stringvalue [lindex $element 1] + } + CONT { + #When the last non-whitespace character on a line is an unescaped backslash, + #it will be trimmed along with all whitespace (including newlines) up to the next non-whitespace character or closing delimiter + # review - we allow some whitespace in stringpart elements - can a stringpart ever be all whitespace? + set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] + if {$next_nl == -1} { + #last (or first and only) line + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + #set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] + #if {$non_ws >= 0} { + # #append stringvalue "\\" + # return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + #} else { + # #skip over ws without emitting + # set idx [llength $parts] + #} + } else { + set parts_til_nl [lrange $parts 0 $next_nl-1] + set non_ws [lsearch -index 0 -start $idx+1 -not $parts_til_nl WS] + if {$non_ws >= 0} { + #This CONT is invalid. If there had been a non-whitespace char directly following it, + #it wouldn't have come through as a CONT token + #Now that we see it isn't the last non-whitespace backslash on the line we can reject + # as an invalid escape of space or tab + #append stringvalue "\\" + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + } else { + #skip over ws on this line + set idx $next_nl + #then have to check each subsequent line until we get to first non-whitespace + set trimming 1 + while {$trimming && $idx < [llength $parts]} { + set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] + if {$next_nl == -1} { + #last line + set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] + if {$non_ws >= 0} { + set idx [expr {$non_ws -1}] + } else { + set idx [llength $parts] + } + set trimming 0 + } else { + set non_ws [lsearch -index 0 -start $idx+1 -not [lrange $parts 0 $next_nl-1] WS] + if {$non_ws >= 0} { + set idx [expr {$non_ws -1}] + set trimming 0 + } else { + set idx $next_nl + #keep trimming + } + } + } + } + } + } + NEWLINE { + #if newline is first element - it is not part of the data of a multistring + if {$idx > 0} { + set val [lindex $element 1] + if {$val eq "lf"} { + append stringvalue \n + } else { + append stringvalue \r\n + } + } + } + WS { + append stringvalue [lindex $element 1] + } + default { + error "Unexpected value type '$type' found in multistring" + } + } + } + set datastructure $stringvalue + } + WS - COMMENT - NEWLINE { + #ignore + } + BOM { + #this token is the unicode single char \uFFEF + #It doesn't tell us what encoding was originally used (though toml should only accept UTF-8 files) + #ignore at start - what about in other positions? + } + default { + error "Unexpected tag '$tag' in Tomlish list '$tomlish'" + } + } + } + return $datastructure + } +} +namespace eval tomlish::dict::path { + #access tomlish dict structure + namespace export {[a-z]*}; # Convention: export all lowercase + + #access with path such as: @@k @@k 0 @@k end where dict keys marked with @@ and plain values are list indices into in {type ARRAY value } + #leaf elements returned as structured {type value } + proc get {dictval {path {}}} { + if {$path eq ""} { + return $dictval + } + ::set data $dictval + ::set pathsofar [list] + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set data [dict get $data [string range $p 2 end]] + } else { + if {![tomlish::dict::is_typeval $data]} { + error "tomlish::dict::path::get error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::get error bad path $path. Subpath $pathsofar is not an array." + } + ::set arrdata [dict get $data value] + ::set data [lindex $arrdata $p] + } + } + return $data + } + proc exists {dictval path} { + ::set data $dictval + ::set pathsofar [list] + ::set exists 1 + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] + if {![dict exists $data $k]} { + return 0 + } + ::set data [dict get $data $k] + } else { + if {![tomlish::dict::is_typeval $data]} { + return 0 + } + if {[dict get $data type] ne "ARRAY"} { + return 0 + } + ::set arrdata [dict get $data value] + ::set intp [tomlish::system::lindex_resolve_basic $arrdata $p] ;#handle index math (end-1 etc) + if {$intp == -1} { + #out of bounds + return 0 + } + ::set data [lindex $arrdata $p] + } + } + return $exists + } + + #a restricted analogy of 'dictn set' + #set 'endpoints' - don't create intermediate paths + # can replace an existing dict with another dict + # can create a key when key at tail end of path is a key (ie @@keyname, not index) + # can replace an existing {type value value } + # with added restriction that if is ARRAY the new must also be ARRAY + proc set_endpoint {dictvariable path value} { + upvar $dictvariable dict_being_edited + ::set data $dict_being_edited + ::set pathsofar [list] + if {!([tomlish::dict::is_typeval $value] || [tomlish::dict::is_typeval_dict $value 0])} { + #failed check of supplied value as basic type, or a sub-dict structure (not checking arrays) + error "tomlish::dict::path::set_endpoint error - value must already be in the tomlish form {type value } or be a dict with such forms as leaves" + } + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] + + #if {![dict exists $data $k]} { + # error "tomlish::dict:path::set error bad path $path. Attempt to access nonexistent element at subpath $pathsofar." + #} + ::set varname v[incr v] + + if {$pathsofar eq $path} { + #see if endpoint of the path given already exists + if {[dict exists $data $k]} { + ::set endpoint [dict get $data $k] + if {[tomlish::dict::is_typeval $endpoint]} { + set existing_tp [dict get $endpoint type] + if {![tomlish::dict::is_typeval $value]} { + error "tomlish::dict::path::set_endpoint error Unable to overwrite subpath '$pathsofar' which is of type $existing_tp with sub-dict. Supplied value not {type value val } with sub-dict: $value" + } + switch -- [dict get $endpoint type] { + ARRAY { + #disallow overwriting array - unless given value is an ARRAY? REVIEW + if {[dict get $value type] ne "ARRAY"} { + error "tomlish::dict::path::set_endpoint error bad path '$path'. Cannot overwrite array with non-array: $value" + } + } + default { + # + } + } + } else { + #endpoint is a typeval dict not a plain typeval - only allow overwrite with a typeval dict + if {![tomlish::dict::is_typeval_dict $value 0]} { + error "tomlish::dict::path::set_endpoint error path '$path'. Cannot overwrite sub-dict (size: [dict size $endpoint]) with non sub-dict: $value" + } + } + ::set $varname $value + dict set vdict $pathsofar $varname + break + } else { + ::set arrdata [dict get $data value] + set idx [tomlish::system::lindex_resolve_basic $arrdata $p] + if {$idx == -1} { + error "tomlish::dict::path::set_endpoint error bad path '$path'. No existing element at $p" + } + ::set data [lindex $arrdata $p] + ::set $varname $data + dict set vdict $pathsofar $varname + } + } + } + #dict for {path varname} $vdict { + # puts "$path $varname\n" + # puts " '[::set $varname]'\n" + # puts "" + #} + ::set i 0 + ::set reverse [lreverse $vdict] + foreach {varname path} $reverse { + set newval [::set $varname] + if {$i+2 == [llength $reverse]} { + ::set k [lindex $path end] + ::set k [string range $k 2 end] ;#first key is always @@something + dict set dict_being_edited $k $newval + #puts "--result $dict_being_edited" + break + } + ::set nextvarname [lindex $reverse $i+2] + ::set nextval [::set $nextvarname] + ::set k [lindex $path end] + if {[string match @@* $k]} { + #dict key + #dict set $nextvarname $k $newval + set_endpoint $nextvarname [list $k] $newval + } else { + #list index + ::set nextarr [dict get $nextval value] + ::lset nextarr $k $newval + dict set $nextvarname value $nextarr + } + ::incr i 2 + } + + return $dict_being_edited + + } + #path must be to a {type ARRAY value } + #REVIEW - how to lappend to deep mixed dict/array structure without rewriting whole datastructure? + proc lappend {dictvariable path args} { + upvar $dictvariable dict_being_edited + ::set data $dict_being_edited + ::set pathsofar [list] + #::set newlist [list] + ::set v 0 + ::set vdict [dict create] + foreach a $args { + if {![string is dict $a]} { + error "tomlish::dict::path::lappend error - lappended arguments must already be in the tomlish form {type value } or be a dict with such forms as leaves" + } + } + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] + if {![dict exists $data $k]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access nonexistent element at subpath $pathsofar." + } + ::set varname v[incr v] + + if {$pathsofar eq $path} { + #see if endpoint of the path given is an ARRAY + ::set endpoint [dict get $data $k] + if {![tomlish::dict::is_typeval $endpoint]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $endpoint type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar is not an array." + } + ::set arrdata [dict get $endpoint value] + ::lappend arrdata {*}$args + dict set endpoint value $arrdata + ::set newlist $endpoint + ::set $varname $newlist + dict set vdict $pathsofar $varname + break + } + ::set data [dict get $data $k] + ::set $varname $data + dict set vdict $pathsofar $varname + } else { + if {![tomlish::dict::is_typeval $data]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar is not an array." + } + ::set varname v[incr v] + if {$pathsofar eq $path} { + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Parent path is not an array." + } + ::set parentarray [dict get $data value] + ::set idx [tomlish::system::lindex_resolve_basic $parentarray $p] + if {$idx == -1} { + error "tomlish::dict::path::lappend error bad path $path. Index $p does not exist." + } + ::set endpoint [lindex $parentarray $p] + if {[dict get $endpoint type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Not an array." + } + + ::set arrdata [dict get $endpoint value] + ::lappend arrdata {*}$args + dict set endpoint value $arrdata + ::set newlist $endpoint + #::lset parentarray $p $newlist + #set parentarray $newlist + ::set $varname $newlist + dict set vdict $pathsofar $varname + break + } else { + ::set arrdata [dict get $data value] + set idx [tomlish::system::lindex_resolve_basic $arrdata $p] + if {$idx == -1} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar, index $p does not exist." + } + ::set data [lindex $arrdata $p] + ::set $varname $data + dict set vdict $pathsofar $varname + } + } + } + #todo tomlish::log::debug ? + #dict for {path varname} $vdict { + # puts "$path $varname\n" + # puts " [::set $varname]\n" + # puts "" + #} + ::set i 0 + ::set reverse [lreverse $vdict] + foreach {varname path} $reverse { + set newval [::set $varname] + if {$i+2 == [llength $reverse]} { + ::set k [lindex $path end] + ::set k [string range $k 2 end] ;#first key is always @@something + dict set dict_being_edited $k $newval + #puts "--result $dict_being_edited" + break + } + ::set nextvarname [lindex $reverse $i+2] + ::set nextval [::set $nextvarname] + ::set k [lindex $path end] + if {[string match @@* $k]} { + #dict key + set k [string range $k 2 end] + dict set $nextvarname $k $newval + } else { + #list index + ::set nextarr [dict get $nextval value] + ::lset nextarr $k $newval + dict set $nextvarname value $nextarr + } + ::incr i 2 + } + return $dict_being_edited + } +} +tcl::namespace::eval tomlish::to_dict { + + + proc @@path {dictkeys} { + lmap v $dictkeys {string cat @@ $v} + } + +} + + +tcl::namespace::eval tomlish::app { + #*** !doctools + #[subsection {Namespace tomlish::app}] + #[para] + #[list_begin definitions] + + tcl::namespace::eval argdoc { + proc test_suites {} { + if {[package provide test::tomlish] eq ""} { + return [list] + } + return [test::tomlish::SUITES] + } + } + + package require punk::args + punk::args::define { + @id -id ::tomlish::app::decoder + @cmd -name tomlish::app::decoder -help\ + "Read toml on stdin until EOF + on error - returns non-zero exit code and writes error to + the errorchannel. + on success - returns zero exit code and writes JSON encoding + of the data to the outputchannel. + This decoder is intended to be compatble with toml-test." + @leaders -min 0 -max 0 + @opts + -help -type none -help\ + "Display this usage message" + -inputchannel -default stdin + -inputencoding -default "iso8859-1" -choicerestricted 0 -choices {utf-8 utf-16 iso8859-1} -help\ + "configure encoding on input channel + iso8859-1 is equivalent to binary encoding" + -outputchannel -default stdout + -errorchannel -default stderr + @values -min 0 -max 0 + } + proc decoder {args} { + set argd [punk::args::parse $args withid ::tomlish::app::decoder] + set ch_input [dict get $argd opts -inputchannel] + set ch_input_enc [dict get $argd opts -inputencoding] + set ch_output [dict get $argd opts -outputchannel] + set ch_error [dict get $argd opts -errorchannel] + if {[dict exists $argd received -help]} { + return [punk::args::usage -scheme info ::tomlish::app::decoder] + } + + chan configure $ch_input -encoding $ch_input_enc + #translation? + chan configure $ch_input -translation lf ;# toml-test invalid/control tests we need to see raw CRs to reject them properly - auto translation won't do. + + #Just slurp it all - presumably we are not handling massive amounts of data on stdin. + # - even if the input is large, we probably don't gain much (aside from possible memory savings?) by attempting to process input as it arrives. + if {[catch { + set inputdata [read $ch_input] + if {$ch_input_enc eq "iso8859-1"} { + set toml [tomlish::toml::from_binary $inputdata] + } else { + set toml $inputdata + } + } errM]} { + puts stderr "read-input error: $errM" + #toml-tests expect exit code 1 + #e.g invalid/encoding/utf16-bom + exit 1 ;#read error + } + try { + set j [::tomlish::toml_to_typedjson $toml] + } on error {em} { + puts $ch_error "decoding failed: '$em'" + exit 1 + } + puts -nonewline $ch_output $j + exit 0 + } + + package require punk::args + punk::args::define { + @id -id ::tomlish::app::encoder + @cmd -name tomlish::app::encoder -help\ + "Read JSON on input until EOF + return non-zero exitcode if JSON data cannot be converted to + a valid TOML representation. + return zero exitcode and TOML data on output if JSON data can + be converted. + This encoder is intended to be compatible with toml-test." + @leaders -min 0 -max 0 + @opts + -help -type none -help \ + "Display this usage message" + -restrict_barekeys -default 0 -help\ + "If true, keys containing unicode will be quoted. + If false, an extended range of barekeys will be used + in unquoted form." + -inputchannel -default stdin + -inputencoding -default "" -choicerestricted 0 -choices {utf-8 utf-16 iso8859-1} -help\ + "configure encoding on input channel + If not supplied, leave at Tcl default" + -outputchannel -default stdout + -errorchannel -default stderr + @values -min 0 -max 0 + } + proc encoder {args} { + set argd [punk::args::parse $args withid ::tomlish::app::encoder] + set restrict_barekeys [dict get $argd opts -restrict_barekeys] + set ch_input [dict get $argd opts -inputchannel] + set ch_input_enc [dict get $argd opts -inputencoding] + set ch_output [dict get $argd opts -outputchannel] + set ch_error [dict get $argd opts -errorchannel] + if {[dict exists $argd received -help]} { + return [punk::args::usage -scheme info ::tomlish::app::encoder] + } + #review + if {$ch_input_enc ne ""} { + chan configure $ch_input -encoding $ch_input_enc + } + #review + chan configure $ch_input -translation lf + + if {[catch { + set json [read $ch_input] + }]} { + exit 2 ;#read error + } + try { + #tomlish::typedjson_to_toml + set toml [::tomlish::toml::from_tomlish_from_dict_from_typedjson $json] + } trap {} {e eopts} { + puts $ch_error "encoding failed: '$e'" + puts $ch_error "$::errorInfo" + exit 1 + } + puts -nonewline $ch_output $toml + exit 0 + } + + punk::args::define { + @dynamic + @id -id ::tomlish::app::test + @cmd -name tomlish::app::test + @leaders + @opts -any 1 + -help -type none -help\ + "Display this usage message + or further info if more args." + -suite -default tests -choices {${[::tomlish::app::argdoc::test_suites]}} + @values -min 0 -max -1 + } + proc test {args} { + package require test::tomlish + set argd [punk::args::parse $args withid ::tomlish::app::test] + set opts [dict get $argd opts] + set values [dict get $argd values] + set received [dict get $argd received] + set solos [dict get $argd solos] + set opt_suite [dict get $opts -suite] + if {[dict exists $received -help] && ![dict exists $received -suite]} { + return [punk::args::usage -scheme info ::tomlish::app::test] + } + + test::tomlish::SUITE $opt_suite + #if {[catch {test::tomlish::SUITE $opt_suite} errM]} { + # puts stderr "Unknown test suite '$opt_suite'. Available suites: [test::tomlish::SUITES]" + # exit 1 + #} + set run_opts [dict remove $opts -suite] + set run_opts [dict remove $run_opts {*}$solos] + set result [test::tomlish::RUN {*}$run_opts {*}$solos {*}$values] + return $result + } + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::app ---}] +} + +proc ::tomlish::appnames {} { + set applist [list] + foreach cmd [info commands ::tomlish::app::*] { + lappend applist [namespace tail $cmd] + } + return $applist +} + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# Secondary API namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval tomlish::lib { + namespace export {[a-z]*}; # Convention: export all lowercase + namespace path [namespace parent] + #*** !doctools + #[subsection {Namespace tomlish::lib}] + #[para] Secondary functions that are part of the API + #[list_begin definitions] + + #proc utility1 {p1 args} { + # #*** !doctools + # #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]] + # #[para]Description of utility1 + # return 1 + #} + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::lib ---}] +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval tomlish::system { + + #taken from punk::lib + proc lindex_resolve_basic {list index} { + #*** !doctools + #[call [fun lindex_resolve_basic] [arg list] [arg index]] + #[para] Accepts index of the forms accepted by Tcl's list commands. (e.g compound indices such as 3+1 end-2) + #[para] returns -1 for out of range at either end, or a valid integer index + #[para] Unlike lindex_resolve; lindex_resolve_basic can't determine if an out of range index was out of range at the lower or upper bound + #[para] This is only likely to be faster than average over lindex_resolve for Tcl which has the builtin lseq command + #[para] The performance advantage is more likely to be present when using compound indexes such as $x+1 or end-1 + #[para] For pure integer indices the performance should be equivalent + + set index [tcl::string::map {_ {}} $index] ;#forward compatibility with integers such as 1_000 + if {[string is integer -strict $index]} { + #can match +i -i + #avoid even the lseq overhead when the index is simple + if {$index < 0 || ($index >= [llength $list])} { + #even though in this case we could return -2 or -3 like lindex_resolve; for consistency we don't, as it's not always determinable for compound indices using the lseq method. + return -1 + } else { + #integer may still have + sign - normalize with expr + return [expr {$index}] + } + } + if {[llength $list]} { + set indices [tomlish::system::range 0 [expr {[llength $list]-1}]] ;# uses lseq if available, has fallback. + #if lseq was available - $indices is an 'arithseries' - theoretically not taking up ram(?) + } else { + set indices [list] + } + set idx [lindex $indices $index] + if {$idx eq ""} { + #we have no way to determine if out of bounds is at lower vs upper end + return -1 + } else { + return $idx + } + } + + if {[info commands ::lseq] ne ""} { + #tcl 8.7+ lseq significantly faster, especially for larger ranges + #The internal rep can be an 'arithseries' with no string representation + #support minimal set from to + proc range {from to} { + lseq $from $to + } + } else { + #lseq accepts basic expressions e.g 4-2 for both arguments + #e.g we can do lseq 0 [llength $list]-1 + #if range is to be consistent with the lseq version above - it should support that, even though we don't support most lseq functionality in either wrapper. + proc range {from to} { + set to [offset_expr $to] + set from [offset_expr $from] + if {$to > $from} { + set count [expr {($to -$from) + 1}] + if {$from == 0} { + return [lsearch -all [lrepeat $count 0] *] + } else { + incr from -1 + return [lmap v [lrepeat $count 0] {incr from}] + } + #slower methods. + #2) + #set i -1 + #set L [lrepeat $count 0] + #lmap v $L {lset L [incr i] [incr from];lindex {}} + #return $L + #3) + #set L {} + #for {set i 0} {$i < $count} {incr i} { + # lappend L [incr from] + #} + #return $L + } elseif {$from > $to} { + set count [expr {$from - $to} + 1] + #1) + if {$to == 0} { + return [lreverse [lsearch -all [lrepeat $count 0] *]] + } else { + incr from + return [lmap v [lrepeat $count 0] {incr from -1}] + } + + #2) + #set i -1 + #set L [lrepeat $count 0] + #lmap v $L {lset L [incr i] [incr from -1];lindex {}} + #return $L + #3) + #set L {} + #for {set i 0} {$i < $count} {incr i} { + # lappend L [incr from -1] + #} + #return $L + } else { + return [list $from] + } + } + } + +} + +if {[info exists ::argc] && $::argc > 0} { + #puts stderr "argc: $::argc args: $::argv" + set arglist $::argv + # -------------- + #make sure any dependant packages that are sourced don't get any commandline args + set ::argv {} + set ::argc 0 + # -------------- + package require punk::args + punk::args::define { + @dynamic + @id -id tomlish::cmdline + @cmd -name tomlish -help\ + "toml encoder/decoder written in Tcl" + @opts -any 1 + -help -type none -help\ + "Display this usage message or more specific + help if further arguments provided." + -app -choices {${[tomlish::appnames]}} + } + try { + set argd [punk::args::parse $arglist withid tomlish::cmdline] + } trap {PUNKARGS VALIDATION} {msg erroropts} { + puts stderr $msg + exit 1 + } + + + lassign [dict values $argd] leaders opts values received solos + if {[dict exists $received -help] && ![dict exists $received -app]} { + #only emit cmdline help if -app not supplied as well - otherwise app function can act on -help for more specific help + #puts stdout "Usage: -app where appname one of:[tomlish::appnames]" + puts stdout [punk::args::usage -scheme info tomlish::cmdline] + exit 0 + } + if {![dict exists $received -app]} { + puts stderr [punk::args::usage -scheme error tomlish::cmdline] + exit 1 + } + + set app [dict get $opts -app] + set appnames [tomlish::appnames] + set app_opts [dict remove $opts -app {*}$solos] + try { + set result [tomlish::app::$app {*}$app_opts {*}$solos {*}$values] + } trap {PUNKARGS VALIDATION} {msg erroropts} { + #The validation error should fully describe the issue + #no need for errortrace - keep the output cleaner + puts stderr $msg + exit 1 + } trap {} {msg erroropts} { + #unexpected error - uncaught throw will produce error trace + #todo - a support msg? Otherwise we may as well just leave off this trap. + throw [dict get $erroropts -errorcode] [dict get $erroropts -errorinfo] + } + if {"-help" in $solos} { + puts stderr $result + exit 1 + } else { + if {$result ne ""} { + puts stdout $result + exit 0 + } + } +} + +## Ready +package provide tomlish [namespace eval tomlish { + variable pkg tomlish + variable version + set version 1.1.6 +}] +return + +#*** !doctools +#[manpage_end] + diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/overtype-1.6.6.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/overtype-1.6.6.tm new file mode 100644 index 00000000..b4e59ec6 --- /dev/null +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/overtype-1.6.6.tm @@ -0,0 +1,4774 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) Julian Noble 2003-2023 +# +# @@ Meta Begin +# Application overtype 1.6.6 +# Meta platform tcl +# Meta license BSD +# @@ Meta End + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# doctools header +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[manpage_begin overtype_module_overtype 0 1.6.6] +#[copyright "2024"] +#[titledesc {overtype text layout - ansi aware}] [comment {-- Name section and table of contents description --}] +#[moddesc {overtype text layout}] [comment {-- Description at end of page heading --}] +#[require overtype] +#[keywords module text ansi] +#[description] +#[para] - + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section Overview] +#[para] overview of overtype +#[subsection Concepts] +#[para] - + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[subsection dependencies] +#[para] packages used by overtype +#[list_begin itemized] + +package require Tcl 8.6- +package require textutil +package require punk::lib ;#required for lines_as_list +package require punk::ansi ;#required to detect, split, strip and calculate lengths +package require punk::char ;#box drawing - and also unicode character width determination for proper layout of text with double-column-width chars +package require punk::assertion +#*** !doctools +#[item] [package {Tcl 8.6}] +#[item] [package textutil] +#[item] [package punk::ansi] +#[para] - required to detect, split, strip and calculate lengths of text possibly containing ansi codes +#[item] [package punk::char] +#[para] - box drawing - and also unicode character width determination for proper layout of text with double-column-width chars + +# #package require frobz +# #*** !doctools +# #[item] [package {frobz}] + +#*** !doctools +#[list_end] + +#PERFORMANCE notes +#overtype is very performance sensitive - used in ansi output all over the place ie needs to be optimised +#NOTE use of tcl::dict::for tcl::string::range etc instead of ensemble versions. This is for the many tcl 8.6/8.7 interps which don't compile ensemble commands when in safe interps +#similar for tcl::namespace::eval - but this is at least on some versions of Tcl - faster even in a normal interp. Review to see if that holds for Tcl 9. +#for string map: when there are exactly 2 elements - it is faster to use a literal which has a special case optimisation in the c code +#ie use tcl::string::map {\n ""} ... instead of tcl::string::map [list \n ""] ... +#note that we can use unicode (e.g \uFF31) and other escapes such as \t within these curly braces - we don't have to use double quotes +#generally using 'list' is preferred for the map as less error prone. +#can also use: tcl::string::map "token $var" .. but be careful regarding quoting and whitespace in var. This should be used sparingly if at all. + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[section API] + + +#Julian Noble - 2003 +#Released under standard 'BSD license' conditions. +# +#todo - ellipsis truncation indicator for center,right + +#v1.4 2023-07 - naive ansi color handling - todo - fix tcl::string::range +# - need to extract and replace ansi codes? + +tcl::namespace::eval overtype { + namespace import ::punk::assertion::assert + punk::assertion::active true + + namespace path ::punk::lib + + namespace export * + variable default_ellipsis_horizontal "..." ;#fallback + variable default_ellipsis_vertical "..." + tcl::namespace::eval priv { + proc _init {} { + upvar ::overtype::default_ellipsis_horizontal e_h + upvar ::overtype::default_ellipsis_vertical e_v + set e_h [format %c 0x2026] ;#Unicode Horizontal Ellipsis + set e_v [format %c 0x22EE] + #The unicode ellipsis looks more natural than triple-dash which is centred vertically whereas ellipsis is at floorline of text + #Also - unicode ellipsis has semantic meaning that other processors can interpret + #unicode does also provide a midline horizontal ellipsis 0x22EF + + #set e [format %c 0x2504] ;#punk::char::charshort boxd_ltdshhz - Box Drawings Light Triple Dash Horizontal + #if {![catch {package require punk::char}]} { + # set e [punk::char::charshort boxd_ltdshhz] + #} + } + } + priv::_init +} +proc overtype::about {} { + return "Simple text formatting. Author JMN. BSD-License" +} + +tcl::namespace::eval overtype { + variable grapheme_widths [tcl::dict::create] + + variable escape_terminals + #single "final byte" in the range 0x40–0x7E (ASCII @A–Z[\]^_`a–z{|}~). + tcl::dict::set escape_terminals CSI [list @ \\ ^ _ ` | ~ a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z "\{" "\}"] + #tcl::dict::set escape_terminals CSI [list J K m n A B C D E F G s u] ;#basic + tcl::dict::set escape_terminals OSC [list \007 \033\\] ;#note mix of 1 and 2-byte terminals + + #self-contained 2 byte ansi escape sequences - review more? + variable ansi_2byte_codes_dict + set ansi_2byte_codes_dict [tcl::dict::create\ + "reset_terminal" "\u001bc"\ + "save_cursor_posn" "\u001b7"\ + "restore_cursor_posn" "\u001b8"\ + "cursor_up_one" "\u001bM"\ + "NEL - Next Line" "\u001bE"\ + "IND - Down one line" "\u001bD"\ + "HTS - Set Tab Stop" "\u001bH"\ + ] + + #debatable whether strip should reveal the somethinghidden - some terminals don't hide it anyway. + # "PM - Privacy Message" "\u001b^somethinghidden\033\\"\ +} + + + + +proc overtype::string_columns {text} { + if {[punk::ansi::ta::detect $text]} { + #error "error string_columns is for calculating character length of string - ansi codes must be stripped/rendered first e.g with punk::ansi::ansistrip. Alternatively try punk::ansi::printing_length" + set text [punk::ansi::ansistrip $text] + } + return [punk::char::ansifreestring_width $text] +} + +#todo - consider a way to merge overtype::left/centre/right +#These have similar algorithms/requirements - and should be refactored to be argument-wrappers over a function called something like overtype::renderblock +#overtype::renderblock could render the input to a defined (possibly overflowing in x or y) rectangle overlapping the underlay. +#(i.e not even necessariy having it's top left within the underlay) +tcl::namespace::eval overtype::priv { +} + +#could return larger than renderwidth +proc _get_row_append_column {row} { + #obsolete? + upvar outputlines outputlines + set idx [expr {$row -1}] + if {$row <= 1 || $row > [llength $outputlines]} { + return 1 + } else { + upvar opt_expand_right expand_right + upvar renderwidth renderwidth + set existinglen [punk::ansi::printing_length [lindex $outputlines $idx]] + set endpos [expr {$existinglen +1}] + if {$expand_right} { + return $endpos + } else { + if {$endpos > $renderwidth} { + return $renderwidth + 1 + } else { + return $endpos + } + } + } +} + +tcl::namespace::eval overtype { + #*** !doctools + #[subsection {Namespace overtype}] + #[para] Core API functions for overtype + #[list_begin definitions] + + + + #tcl::string::range should generally be avoided for both undertext and overtext which contain ansi escapes and other cursor affecting chars such as \b and \r + #render onto an already-rendered (ansi already processed) 'underlay' string, a possibly ansi-laden 'overlay' string. + #The underlay and overlay can be multiline blocks of text of varying line lengths. + #The overlay may just be an ansi-colourised block - or may contain ansi cursor movements and cursor save/restore calls - in which case the apparent length and width of the overlay can't be determined as if it was a block of text. + #This is a single-shot rendering of strings - ie there is no way to chain another call containing a cursor-restore to previously rendered output and have it know about any cursor-saves in the first call. + # a cursor start position other than top-left is a possible addition to consider. + #see editbuf in punk::repl for a more stateful ansi-processor. Both systems use loops over overtype::renderline + proc renderspace {args} { + #*** !doctools + #[call [fun overtype::renderspace] [arg args] ] + #[para] usage: ?-transparent [lb]0|1[rb]? ?-expand_right [lb]1|0[rb]? ?-ellipsis [lb]1|0[rb]? ?-ellipsistext ...? undertext overtext + + # @c overtype starting at left (overstrike) + # @c can/should we use something like this?: 'format "%-*s" $len $overtext + variable default_ellipsis_horizontal + + if {[llength $args] < 2} { + error {usage: ?-width ? ?-startcolumn ? ?-transparent [0|1|]? ?-expand_right [1|0]? ?-ellipsis [1|0]? ?-ellipsistext ...? undertext overtext} + } + set optargs [lrange $args 0 end-2] + if {[llength $optargs] % 2 == 0} { + set overblock [lindex $args end] + set underblock [lindex $args end-1] + #lassign [lrange $args end-1 end] underblock overblock + set argsflags [lrange $args 0 end-2] + } else { + set optargs [lrange $args 0 end-1] + if {[llength $optargs] %2 == 0} { + set overblock [lindex $args end] + set underblock "" + set argsflags [lrange $args 0 end-1] + } else { + error "renderspace expects opt-val pairs followed by: or just " + } + } + set opts [tcl::dict::create\ + -bias ignored\ + -width \uFFEF\ + -height \uFFEF\ + -startcolumn 1\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -expand_right 0\ + -appendlines 1\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -experimental 0\ + -cp437 0\ + -looplimit \uFFEF\ + -crm_mode 0\ + -reverse_mode 0\ + -insert_mode 0\ + -wrap 0\ + -info 0\ + -console {stdin stdout stderr}\ + ] + #expand_right is perhaps consistent with the idea of the page_size being allowed to grow horizontally.. + # it does not necessarily mean the viewport grows. (which further implies need for horizontal scrolling) + # - it does need to be within some concept of terminal width - as columns must be addressable by ansi sequences. + # - This implies the -width option value must grow if it is tied to the concept of renderspace terminal width! REVIEW. + # - further implication is that if expand_right grows the virtual renderspace terminal width - + # then some sort of reflow/rerender needs to be done for preceeding lines? + # possibly not - as expand_right is distinct from a normal terminal-width change event, + # expand_right being primarily to support other operations such as textblock::table + + #todo - viewport width/height as separate concept to terminal width/height? + #-ellipsis args not used if -wrap is true + foreach {k v} $argsflags { + switch -- $k { + -looplimit - -width - -height - -startcolumn - -bias - -ellipsis - -ellipsistext - -ellipsiswhitespace + - -transparent - -exposed1 - -exposed2 - -experimental + - -expand_right - -appendlines + - -reverse_mode - -crm_mode - -insert_mode + - -cp437 + - -info - -console { + tcl::dict::set opts $k $v + } + -wrap - -autowrap_mode { + #temp alias -autowrap_mode for consistency with renderline + #todo - + tcl::dict::set opts -wrap $v + } + default { + error "overtype::renderspace unknown option '$k'. Known options: [tcl::dict::keys $opts]" + } + } + } + #set opts [tcl::dict::merge $defaults $argsflags] + # -- --- --- --- --- --- + #review - expand_left for RTL text? + set opt_expand_right [tcl::dict::get $opts -expand_right] + #for repl - standard output line indicator is a dash - todo, add a different indicator for a continued line. + set opt_width [tcl::dict::get $opts -width] + set opt_height [tcl::dict::get $opts -height] + set opt_startcolumn [tcl::dict::get $opts -startcolumn] + set opt_appendlines [tcl::dict::get $opts -appendlines] + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_exposed1 [tcl::dict::get $opts -exposed1] ;#widechar_exposed_left - todo + set opt_exposed2 [tcl::dict::get $opts -exposed2] ;#widechar_exposed_right - todo + # -- --- --- --- --- --- + set opt_crm_mode [tcl::dict::get $opts -crm_mode] + set opt_reverse_mode [tcl::dict::get $opts -reverse_mode] + set opt_insert_mode [tcl::dict::get $opts -insert_mode] + ##### + # review -wrap should map onto DECAWM terminal mode - the wrap 2 idea may not fit in with this?. + set opt_autowrap_mode [tcl::dict::get $opts -wrap] + #??? wrap 1 is hard wrap cutting word at exact column, or 1 column early for 2w-glyph, wrap 2 is for language-based word-wrap algorithm (todo) + ##### + # -- --- --- --- --- --- + set opt_cp437 [tcl::dict::get $opts -cp437] + set opt_info [tcl::dict::get $opts -info] + + + + # ---------------------------- + # -experimental dev flag to set flags etc + # ---------------------------- + set data_mode 0 + set edit_mode 0 + set opt_experimental [tcl::dict::get $opts -experimental] + foreach o $opt_experimental { + switch -- $o { + data_mode { + set data_mode 1 + } + edit_mode { + set edit_mode 1 + } + } + } + # ---------------------------- + + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + + #set underlines [split $underblock \n] + + #underblock is a 'rendered' block - so width height make sense + #only non-cursor affecting and non-width occupying ANSI codes should be present. + #ie SGR codes and perhaps things such as PM - although generally those should have been pushed to the application already + #renderwidth & renderheight were originally used with reference to rendering into a 'column' of output e.g a table column - before cursor row/col was implemented. + + if {$opt_width eq "\uFFEF" || $opt_height eq "\uFFEF"} { + lassign [blocksize $underblock] _w renderwidth _h renderheight + if {$opt_width ne "\uFFEF"} { + set renderwidth $opt_width + } + if {$opt_height ne "\uFFEF"} { + set renderheight $opt_height + } + } else { + set renderwidth $opt_width + set renderheight $opt_height + } + #initial state for renderspace 'terminal' reset + set initial_state [dict create\ + renderwidth $renderwidth\ + renderheight $renderheight\ + crm_mode $opt_crm_mode\ + reverse_mode $opt_reverse_mode\ + insert_mode $opt_insert_mode\ + autowrap_mode $opt_autowrap_mode\ + cp437 $opt_cp437\ + ] + #modes + #e.g insert_mode can be toggled by insert key or ansi IRM sequence CSI 4 h|l + #opt_startcolumn ?? - DECSLRM ? + set vtstate $initial_state + + # -- --- --- --- + #REVIEW - do we need ansi resets in the underblock? + if {$underblock eq ""} { + set underlines [lrepeat $renderheight ""] + } else { + set underblock [textblock::join_basic -- $underblock] ;#ensure properly rendered - ansi per-line resets & replays + set underlines [split $underblock \n] + } + #if {$underblock eq ""} { + # set blank "\x1b\[0m\x1b\[0m" + # #set underlines [list "\x1b\[0m\x1b\[0m"] + # set underlines [lrepeat $renderheight $blank] + #} else { + # #lines_as_list -ansiresets 1 will do nothing if -ansiresets 1 isn't specified - REVIEW + # set underlines [lines_as_list -ansiresets 1 $underblock] + #} + # -- --- --- --- + + #todo - reconsider the 'line' as the natural chunking mechanism for the overlay. + #In practice an overlay ANSI stream can be a single line with ansi moves/restores etc - or even have no moves or newlines, just relying on wrapping at the output renderwidth + #In such cases - we process the whole shebazzle for the first output line - only reducing by the applied amount at the head each time, reprocessing the long tail each time. + #(in cases where there are interline moves or cursor jumps anyway) + #This works - but doesn't seem efficient. + #On the other hand.. maybe it depends on the data. For simpler files it's more efficient than splitting first + + #a hack until we work out how to avoid infinite loops... + # + set looplimit [tcl::dict::get $opts -looplimit] + if {$looplimit eq "\uFFEF"} { + #looping for each char is worst case (all newlines?) - anything over that is an indication of something broken? + #do we need any margin above the length? (telnet mapscii.me test) + set looplimit [expr {[tcl::string::length $overblock] + 10}] + } + + #overblock height/width isn't useful in the presence of an ansi input overlay with movements. The number of lines may bear little relationship to the output height + #lassign [blocksize $overblock] _w overblock_width _h overblock_height + + set scheme 4 + switch -- $scheme { + 0 { + #one big chunk + set inputchunks [list $overblock] + } + 1 { + set inputchunks [punk::ansi::ta::split_codes $overblock] + } + 2 { + + #split into lines if possible first - then into plaintext/ansi-sequence chunks ? + set inputchunks [list ""] ;#put an empty plaintext split in for starters + set i 1 + set lines [split $overblock \n] + foreach ln $lines { + if {$i < [llength $lines]} { + append ln \n + } + set sequence_split [punk::ansi::ta::split_codes_single $ln] ;#use split_codes Not split_codes_single? + set lastpt [lindex $inputchunks end] + lset inputchunks end [tcl::string::cat $lastpt [lindex $sequence_split 0]] + lappend inputchunks {*}[lrange $sequence_split 1 end] + incr i + } + } + 3 { + #it turns out line based chunks are faster than the above.. probably because some of those end up doing the regex splitting twice + set lflines [list] + set inputchunks [split $overblock \n] + foreach ln $inputchunks { + append ln \n + lappend lflines $ln + } + if {[llength $lflines]} { + lset lflines end [tcl::string::range [lindex $lflines end] 0 end-1] + } + #set inputchunks $lflines[unset lflines] + set inputchunks [lindex [list $lflines [unset lflines]] 0] + + } + 4 { + set inputchunks [list] + foreach ln [split $overblock \n] { + lappend inputchunks $ln\n + } + if {[llength $inputchunks]} { + lset inputchunks end [tcl::string::range [lindex $inputchunks end] 0 end-1] + } + } + } + + + + + set replay_codes_underlay [tcl::dict::create 1 ""] + #lappend replay_codes_overlay "" + set replay_codes_overlay "[punk::ansi::a]" + set unapplied "" + set cursor_saved_position [tcl::dict::create] + set cursor_saved_attributes "" + + + set outputlines $underlines + set overidx 0 + + #underlines are not necessarily processed in order - depending on cursor-moves applied from overtext + set row 1 + #if {$data_mode} { + # set col [_get_row_append_column $row] + #} else { + set col $opt_startcolumn + #} + + set instruction_stats [tcl::dict::create] + + set loop 0 + #while {$overidx < [llength $inputchunks]} { } + + while {[llength $inputchunks]} { + #set overtext [lindex $inputchunks $overidx]; lset inputchunks $overidx "" + set overtext [lpop inputchunks 0] + if {![tcl::string::length $overtext]} { + incr loop + continue + } + #puts "----->[ansistring VIEW -lf 1 -vt 1 -nul 1 $overtext]<----" + set undertext [lindex $outputlines [expr {$row -1}]] + set renderedrow $row + + #renderline pads each underaly line to width with spaces and should track where end of data is + + + #set overtext [tcl::string::cat [lindex $replay_codes_overlay $overidx] $overtext] + set overtext $replay_codes_overlay$overtext + if {[tcl::dict::exists $replay_codes_underlay $row]} { + set undertext [tcl::dict::get $replay_codes_underlay $row]$undertext + } + #review insert_mode. As an 'overtype' function whose main function is not interactive keystrokes - insert is secondary - + #but even if we didn't want it as an option to the function call - to process ansi adequately we need to support IRM (insertion-replacement mode) ESC [ 4 h|l + set renderopts [list -experimental $opt_experimental\ + -cp437 $opt_cp437\ + -info 1\ + -crm_mode [tcl::dict::get $vtstate crm_mode]\ + -insert_mode [tcl::dict::get $vtstate insert_mode]\ + -autowrap_mode [tcl::dict::get $vtstate autowrap_mode]\ + -reverse_mode [tcl::dict::get $vtstate reverse_mode]\ + -cursor_restore_attributes $cursor_saved_attributes\ + -transparent $opt_transparent\ + -width [tcl::dict::get $vtstate renderwidth]\ + -exposed1 $opt_exposed1\ + -exposed2 $opt_exposed2\ + -expand_right $opt_expand_right\ + -cursor_column $col\ + -cursor_row $row\ + ] + set rinfo [renderline {*}$renderopts $undertext $overtext] + + set instruction [tcl::dict::get $rinfo instruction] + tcl::dict::set vtstate crm_mode [tcl::dict::get $rinfo crm_mode] + tcl::dict::set vtstate insert_mode [tcl::dict::get $rinfo insert_mode] + tcl::dict::set vtstate autowrap_mode [tcl::dict::get $rinfo autowrap_mode] ;# + tcl::dict::set vtstate reverse_mode [tcl::dict::get $rinfo reverse_mode] + #how to support reverse_mode in rendered linelist? we need to examine all pt/code blocks and flip each SGR stack? + # - review - the answer is probably that we don't need to - it is set/reset only during application of overtext + + #Note carefully the difference betw overflow_right and unapplied. + #overflow_right may need to be included in next run before the unapplied data + #overflow_right most commonly has data when in insert_mode + set rendered [tcl::dict::get $rinfo result] + set overflow_right [tcl::dict::get $rinfo overflow_right] + set overflow_right_column [tcl::dict::get $rinfo overflow_right_column] + set unapplied [tcl::dict::get $rinfo unapplied] + set unapplied_list [tcl::dict::get $rinfo unapplied_list] + set post_render_col [tcl::dict::get $rinfo cursor_column] + set post_render_row [tcl::dict::get $rinfo cursor_row] + set c_saved_pos [tcl::dict::get $rinfo cursor_saved_position] + set c_saved_attributes [tcl::dict::get $rinfo cursor_saved_attributes] + set visualwidth [tcl::dict::get $rinfo visualwidth] ;#column width of what is 'rendered' for the line + set insert_lines_above [tcl::dict::get $rinfo insert_lines_above] + set insert_lines_below [tcl::dict::get $rinfo insert_lines_below] + tcl::dict::set replay_codes_underlay [expr {$renderedrow+1}] [tcl::dict::get $rinfo replay_codes_underlay] + + #lset replay_codes_overlay [expr $overidx+1] [tcl::dict::get $rinfo replay_codes_overlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + if {0 && [tcl::dict::get $vtstate reverse_mode]} { + #test branch - todo - prune + puts stderr "---->[ansistring VIEW $replay_codes_overlay] rendered: $rendered" + #review + #JMN3 + set existing_reverse_state 0 + #split_codes_single is single esc sequence - but could have multiple sgr codes within one esc sequence + #e.g \x1b\[0;31;7m has a reset,colour red and reverse + set codeinfo [punk::ansi::codetype::sgr_merge [list $replay_codes_overlay] -info 1] + set codestate_reverse [dict get $codeinfo codestate reverse] + switch -- $codestate_reverse { + 7 { + set existing_reverse_state 1 + } + 27 { + set existing_reverse_state 0 + } + "" { + } + } + if {$existing_reverse_state == 0} { + set rflip [a+ reverse] + } else { + #reverse of reverse + set rflip [a+ noreverse] + } + #note that mergeresult can have multiple esc (due to unmergeables or non sgr codes) + set replay_codes_overlay [punk::ansi::codetype::sgr_merge [list [dict get $codeinfo mergeresult] $rflip]] + puts stderr "---->[ansistring VIEW $replay_codes_overlay] rendered: $rendered" + } + + + + #-- todo - detect looping properly + if {$row > 1 && $overtext ne "" && $unapplied eq $overtext && $post_render_row == $row && $instruction eq ""} { + puts stderr "overtype::renderspace loop?" + puts [ansistring VIEW $rinfo] + break + } + #-- + + if {[tcl::dict::size $c_saved_pos] >= 1} { + set cursor_saved_position $c_saved_pos + set cursor_saved_attributes $c_saved_attributes + } + + + set overflow_handled 0 + + + + set nextprefix "" + + + #todo - handle potential insertion mode as above for cursor restore? + #keeping separate branches for debugging - review and merge as appropriate when stable + set instruction_type [lindex $instruction 0] ;#some instructions have params + tcl::dict::incr instruction_stats $instruction_type + switch -- $instruction_type { + reset { + #reset the 'renderspace terminal' (not underlying terminal) + set row 1 + set col 1 + set vtstate [tcl::dict::merge $vtstate $initial_state] + #todo - clear screen + } + {} { + #end of supplied line input + #lf included in data + set row $post_render_row + set col $post_render_col + if {![llength $unapplied_list]} { + if {$overflow_right ne ""} { + incr row + } + } else { + puts stderr "renderspace end of input line - has unapplied: [ansistring VIEW $unapplied] (review)" + } + set col $opt_startcolumn + } + up { + + #renderline knows it's own line number, and knows not to go above row l + #it knows that a move whilst 1-beyond the width conflicts with the linefeed and reduces the move by one accordingly. + #row returned should be correct. + #column may be the overflow column - as it likes to report that to the caller. + + #Note that an ansi up sequence after last column going up to a previous line and also beyond the last column, will result in the next grapheme going onto the following line. + #this seems correct - as the column remains beyond the right margin so subsequent chars wrap (?) review + #puts stderr "up $post_render_row" + #puts stderr "$rinfo" + + #puts stdout "1 row:$row col $col" + set row $post_render_row + #data_mode (naming?) determines if we move to end of existing data or not. + #data_mode 0 means ignore existing line length and go to exact column + #set by -experimental flag + if {$data_mode == 0} { + set col $post_render_col + } else { + #This doesn't really work if columns are pre-filled with spaces..we can't distinguish them from data + #we need renderline to return the number of the maximum column filled (or min if we ever do r-to-l) + set existingdata [lindex $outputlines [expr {$post_render_row -1}]] + set lastdatacol [punk::ansi::printing_length $existingdata] + if {$lastdatacol < $renderwidth} { + set col [expr {$lastdatacol+1}] + } else { + set col $renderwidth + } + } + + #puts stdout "2 row:$row col $col" + #puts stdout "-----------------------" + #puts stdout $rinfo + #flush stdout + } + down { + if {$data_mode == 0} { + #renderline doesn't know how far down we can go.. + if {$post_render_row > [llength $outputlines]} { + if {$opt_appendlines} { + set diff [expr {$post_render_row - [llength $outputlines]}] + if {$diff > 0} { + lappend outputlines {*}[lrepeat $diff ""] + } + lappend outputlines "" + } + } + set row $post_render_row + set col $post_render_col + } else { + if {$post_render_row > [llength $outputlines]} { + if {$opt_appendlines} { + set diff [expr {$post_render_row - [llength $outputlines]}] + if {$diff > 0} { + lappend outputlines {*}[lrepeat $diff ""] + } + lappend outputlines "" + } + } + set existingdata [lindex $outputlines [expr {$post_render_row -1}]] + set lastdatacol [punk::ansi::printing_length $existingdata] + if {$lastdatacol < $renderwidth} { + set col [expr {$lastdatacol+1}] + } else { + set col $renderwidth + } + + } + } + restore_cursor { + #testfile belinda.ans uses this + + #puts stdout "[a+ blue bold]CURSOR_RESTORE[a]" + if {[tcl::dict::exists $cursor_saved_position row]} { + set row [tcl::dict::get $cursor_saved_position row] + set col [tcl::dict::get $cursor_saved_position column] + #puts stdout "restoring: row $row col $col [ansistring VIEW $cursor_saved_attributes] [a] unapplied [ansistring VIEWCODES $unapplied]" + #set nextprefix $cursor_saved_attributes + #lset replay_codes_overlay [expr $overidx+1] $cursor_saved_attributes + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay]$cursor_saved_attributes + #set replay_codes_overlay $cursor_saved_attributes + set cursor_saved_position [tcl::dict::create] + set cursor_saved_attributes "" + } else { + #TODO + #?restore without save? + #should move to home position and reset ansi SGR? + #puts stderr "overtype::renderspace cursor_restore without save data available" + } + #If we were inserting prior to hitting the cursor_restore - there could be overflow_right data - generally the overtype functions aren't for inserting - but ansi can enable it + #if we were already in overflow when cursor_restore was hit - it shouldn't have been processed as an action - just stored. + if {!$overflow_handled && $overflow_right ne ""} { + #wrap before restore? - possible effect on saved cursor position + #this overflow data has previously been rendered so has no cursor movements or further save/restore operations etc + #we can just insert another call to renderline to solve this.. ? + #It would perhaps be more properly handled as a queue of instructions from our initial renderline call + #we don't need to worry about overflow next call (?)- but we should carry forward our gx and ansi stacks + + puts stdout ">>>[a+ red bold]overflow_right during restore_cursor[a]" + + set sub_info [overtype::renderline\ + -info 1\ + -width [tcl::dict::get $vtstate renderwidth]\ + -insert_mode [tcl::dict::get $vtstate insert_mode]\ + -autowrap_mode [tcl::dict::get $vtstate autowrap_mode]\ + -expand_right [tcl::dict::get $opts -expand_right]\ + ""\ + $overflow_right\ + ] + set foldline [tcl::dict::get $sub_info result] + tcl::dict::set vtstate insert_mode [tcl::dict::get $sub_info insert_mode] ;#probably not needed..? + tcl::dict::set vtstate autowrap_mode [tcl::dict::get $sub_info autowrap_mode] ;#nor this.. + linsert outputlines $renderedrow $foldline + #review - row & col set by restore - but not if there was no save.. + } + set overflow_handled 1 + + } + move { + ######## + if {$post_render_row > [llength $outputlines]} { + #Ansi moves need to create new lines ? + #if {$opt_appendlines} { + # set diff [expr {$post_render_row - [llength $outputlines]}] + # if {$diff > 0} { + # lappend outputlines {*}[lrepeat $diff ""] + # } + # set row $post_render_row + #} else { + set row [llength $outputlines] + #} + } else { + set row $post_render_row + } + ####### + set col $post_render_col + #overflow + unapplied? + } + clear_and_move { + #e.g 2J + if {$post_render_row > [llength $outputlines]} { + set row [llength $outputlines] + } else { + set row $post_render_row + } + set col $post_render_col + set overflow_right "" ;#if we're clearing - any overflow due to insert_mode is irrelevant + set clearedlines [list] + foreach ln $outputlines { + lappend clearedlines \x1b\[0m$replay_codes_overlay[string repeat \000 $renderwidth]\x1b\[0m + if 0 { + + set lineparts [punk::ansi::ta::split_codes $ln] + set numcells 0 + foreach {pt _code} $lineparts { + if {$pt ne ""} { + foreach grapheme [punk::char::grapheme_split $pt] { + switch -- $grapheme { + " " - - - _ - ! - @ - # - $ - % - ^ - & - * - = - + - : - . - , - / - | - ? - + a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - + z - A - B - C - D - E - F - G - H - I - J - K - L - M - N - O - P - Q - R - S - T - U - V - W - X - Y - Z - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 { + incr numcells 1 + } + default { + if {$grapheme eq "\u0000"} { + incr numcells 1 + } else { + incr numcells [grapheme_width_cached $grapheme] + } + } + } + + } + } + } + #replays/resets each line + lappend clearedlines \x1b\[0m$replay_codes_overlay[string repeat \000 $numcells]\x1b\[0m + } + } + set outputlines $clearedlines + #todo - determine background/default to be in effect - DECECM ? + puts stderr "replay_codes_overlay: [ansistring VIEW $replay_codes_overlay]" + #lset outputlines 0 $replay_codes_overlay[lindex $outputlines 0] + + } + lf_start { + #raw newlines + # ---------------------- + #test with fruit.ans + #test - treating as newline below... + #append rendered $overflow_right + #set overflow_right "" + set row $renderedrow + incr row + if {$row > [llength $outputlines]} { + lappend outputlines "" + } + set col $opt_startcolumn + # ---------------------- + } + lf_mid { + + set edit_mode 0 + if {$edit_mode} { + set inputchunks [linsert $inputchunks 0 $overflow_right$unapplied] + set overflow_right "" + set unapplied "" + set row $post_render_row + #set col $post_render_col + set col $opt_startcolumn + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + } else { + if 1 { + if {$overflow_right ne ""} { + if {$opt_expand_right} { + append rendered $overflow_right + set overflow_right "" + } else { + #review - we should really make renderline do the work...? + set overflow_width [punk::ansi::printing_length $overflow_right] + if {$visualwidth + $overflow_width <= $renderwidth} { + append rendered $overflow_right + set overflow_right "" + } else { + if {[tcl::dict::get $vtstate autowrap_mode]} { + set outputlines [linsert $outputlines $renderedrow $overflow_right] + set overflow_right "" + set row [expr {$renderedrow + 2}] + } else { + set overflow_right "" ;#abandon + } + + if {0 && $visualwidth < $renderwidth} { + puts stderr "visualwidth: $visualwidth < renderwidth:$renderwidth" + error "incomplete - abandon?" + set overflowparts [punk::ansi::ta::split_codes $overflow_right] + set remaining_overflow $overflowparts + set filled 0 + foreach {pt code} $overflowparts { + lpop remaining_overflow 0 + if {$pt ne ""} { + set graphemes [punk::char::grapheme_split $pt] + set add "" + set addlen $visualwidth + foreach g $graphemes { + set w [overtype::grapheme_width_cached $g] + if {$addlen + $w <= $renderwidth} { + append add $g + incr addlen $w + } else { + set filled 1 + break + } + } + append rendered $add + } + if {!$filled} { + lpop remaining_overflow 0 ;#pop code + } + } + set overflow_right [join $remaining_overflow ""] + } + } + } + } + set row $post_render_row + set col $opt_startcolumn + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + } else { + #old version - known to work with various ansi graphics - e.g fruit.ans + # - but fails to limit lines to renderwidth when expand_right == 0 + append rendered $overflow_right + set overflow_right "" + set row $post_render_row + set col $opt_startcolumn + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + } + } + } + lf_overflow { + #linefeed after renderwidth e.g at column 81 for an 80 col width + #we may also have other control sequences that came after col 80 e.g cursor save + + if 0 { + set lhs [overtype::left -width 40 -wrap 1 "" [ansistring VIEWSTYLE -nul 1 -lf 1 -vt 1 $rendered]] + set lhs [textblock::frame -title "rendered $visualwidth cols" -subtitle "row-$renderedrow" $lhs] + set rhs "" + + #assertion - there should be no overflow.. + puts $lhs + } + if {![tcl::dict::get $vtstate insert_mode]} { + assert {$overflow_right eq ""} lf_overflow should not get data in overflow_right when not insert_mode + } + + set row $post_render_row + #set row $renderedrow + #incr row + #only add newline if we're at the bottom + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + set col $opt_startcolumn + + } + newlines_above { + #we get a newlines_above instruction when received at column 1 + #In some cases we want to treat that as request to insert a new blank line above, and move our row 1 down (staying with the data) + #in other cases - we want to treat at column 1 the same as any other + + puts "--->newlines_above" + puts "rinfo: $rinfo" + #renderline doesn't advance the row for us - the caller has the choice to implement or not + set row $post_render_row + set col $post_render_col + if {$insert_lines_above > 0} { + set row $renderedrow + set outputlines [linsert $outputlines $renderedrow-1 {*}[lrepeat $insert_lines_above ""]] + incr row [expr {$insert_lines_above -1}] ;#we should end up on the same line of text (at a different index), with new empties inserted above + #? set row $post_render_row #can renderline tell us? + } + } + newlines_below { + #obsolete? - use for ANSI insert lines sequence + if {$data_mode == 0} { + puts --->nl_below + set row $post_render_row + set col $post_render_col + if {$insert_lines_below == 1} { + #set lhs [overtype::left -width 40 -wrap 1 "" [ansistring VIEWSTYLE -lf 1 -vt 1 $rendered]] + #set lhs [textblock::frame -title rendered -subtitle "row-$renderedrow" $lhs] + #set rhs "" + #if {$overflow_right ne ""} { + # set rhs [overtype::left -width 40 -wrap 1 "" [ansistring VIEWSTYLE -lf 1 -vt 1 $overflow_right]] + # set rhs [textblock::frame -title overflow_right $rhs] + #} + #puts [textblock::join $lhs $rhs] + + #rendered + append rendered $overflow_right + # + + + set overflow_right "" + set row $renderedrow + #only add newline if we're at the bottom + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat $insert_lines_below ""] + } + incr row $insert_lines_below + set col $opt_startcolumn + } + } else { + set row $post_render_row + if {$post_render_row > [llength $outputlines]} { + if {$opt_appendlines} { + set diff [expr {$post_render_row - [llength $outputlines]}] + if {$diff > 0} { + lappend outputlines {*}[lrepeat $diff ""] + } + lappend outputlines "" + } + } else { + set existingdata [lindex $outputlines [expr {$post_render_row -1}]] + set lastdatacol [punk::ansi::printing_length $existingdata] + if {$lastdatacol < $renderwidth} { + set col [expr {$lastdatacol+1}] + } else { + set col $renderwidth + } + } + } + } + wrapmoveforward { + #doesn't seem to be used by fruit.ans testfile + #used by dzds.ans + #note that cursor_forward may move deep into the next line - or even span multiple lines !TODO + set c $renderwidth + set r $post_render_row + if {$post_render_col > $renderwidth} { + set i $c + while {$i <= $post_render_col} { + if {$c == $renderwidth+1} { + incr r + if {$opt_appendlines} { + if {$r < [llength $outputlines]} { + lappend outputlines "" + } + } + set c $opt_startcolumn + } else { + incr c + } + incr i + } + set col $c + } else { + #why are we getting this instruction then? + puts stderr "wrapmoveforward - test" + set r [expr {$post_render_row +1}] + set c $post_render_col + } + set row $r + set col $c + } + wrapmovebackward { + set c $renderwidth + set r $post_render_row + if {$post_render_col < 1} { + set c 1 + set i $c + while {$i >= $post_render_col} { + if {$c == 0} { + if {$r > 1} { + incr r -1 + set c $renderwidth + } else { + #leave r at 1 set c 1 + #testfile besthpav.ans first line top left border alignment + set c 1 + break + } + } else { + incr c -1 + } + incr i -1 + } + set col $c + } else { + puts stderr "Wrapmovebackward - but postrendercol >= 1???" + } + set row $r + set col $c + } + overflow { + #normal single-width grapheme overflow + #puts "----normal overflow --- [ansistring VIEWSTYLE -lf 1 -nul 1 -vt 1 $rendered]" + set row $post_render_row ;#renderline will not advance row when reporting overflow char + if {[tcl::dict::get $vtstate autowrap_mode]} { + incr row + set col $opt_startcolumn ;#whether wrap or not - next data is at column 1 ?? + } else { + set col $post_render_col + #set unapplied "" ;#this seems wrong? + #set unapplied [tcl::string::range $unapplied 1 end] + #The overflow can only be triggered by a grapheme (todo cluster?) - but our unapplied could contain SGR codes prior to the grapheme that triggered overflow - so we need to skip beyond any SGRs + #There may be more than one, because although the stack leading up to overflow may have been merged - codes between the last column and the overflowing grapheme will remain separate + #We don't expect any movement or other ANSI codes - as if they came before the grapheme, they would have triggered a different instruction to 'overflow' + set idx 0 + set next_grapheme_index -1 + foreach u $unapplied_list { + if {![punk::ansi::ta::detect $u]} { + set next_grapheme_index $idx + break + } + incr idx + } + assert {$next_grapheme_index >= 0} + #drop the overflow grapheme - keeping all codes in place. + set unapplied [join [lreplace $unapplied_list $next_grapheme_index $next_grapheme_index] ""] + #we need to run the reduced unapplied on the same line - further graphemes will just overflow again, but codes or control chars could trigger jumps to other lines + + set overflow_handled 1 + #handled by dropping overflow if any + } + } + overflow_splitchar { + set row $post_render_row ;#renderline will not advance row when reporting overflow char + + #2nd half of grapheme would overflow - treggering grapheme is returned in unapplied. There may also be overflow_right from earlier inserts + #todo - consider various options .. re-render a single trailing space or placeholder on same output line, etc + if {[tcl::dict::get $vtstate autowrap_mode]} { + if {$renderwidth < 2} { + #edge case of rendering to a single column output - any 2w char will just cause a loop if we don't substitute with something, or drop the character + set idx 0 + set triggering_grapheme_index -1 + foreach u $unapplied_list { + if {![punk::ansi::ta::detect $u]} { + set triggering_grapheme_index $idx + break + } + incr idx + } + set unapplied [join [lreplace $unapplied_list $triggering_grapheme_index $triggering_grapheme_index $opt_exposed1] ""] + } else { + set col $opt_startcolumn + incr row + } + } else { + set overflow_handled 1 + #handled by dropping entire overflow if any + if {$renderwidth < 2} { + set idx 0 + set triggering_grapheme_index -1 + foreach u $unapplied_list { + if {![punk::ansi::ta::detect $u]} { + set triggering_grapheme_index $idx + break + } + incr idx + } + set unapplied [join [lreplace $unapplied_list $triggering_grapheme_index $triggering_grapheme_index $opt_exposed1] ""] + } + } + + } + vt { + + #can vt add a line like a linefeed can? + set row $post_render_row + set col $post_render_col + } + set_window_title { + set newtitle [lindex $instruction 1] + puts stderr "overtype::renderspace set_window_title [ansistring VIEW $newtitle] instruction '$instruction'" + # + } + reset_colour_palette { + puts stderr "overtype::renderspace instruction '$instruction' unimplemented" + } + default { + puts stderr "overtype::renderspace unhandled renderline instruction '$instruction'" + } + + } + + + if {!$opt_expand_right && ![tcl::dict::get $vtstate autowrap_mode]} { + #not allowed to overflow column or wrap therefore we get overflow data to truncate + if {[tcl::dict::get $opts -ellipsis]} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + set lostdata "" + if {$overflow_right ne ""} { + append lostdata $overflow_right + } + if {$unapplied ne ""} { + append lostdata $unapplied + } + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + #set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + if {[tcl::string::trim [punk::ansi::ansistrip $lostdata]] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set rendered [overtype::right $rendered $opt_ellipsistext] + } + set overflow_handled 1 + } else { + #no wrap - no ellipsis - silently truncate + set overflow_handled 1 + } + } + + + + if {$renderedrow <= [llength $outputlines]} { + lset outputlines [expr {$renderedrow-1}] $rendered + } else { + if {$opt_appendlines} { + lappend outputlines $rendered + } else { + #? + lset outputlines [expr {$renderedrow-1}] $rendered + } + } + + if {!$overflow_handled} { + append nextprefix $overflow_right + } + + append nextprefix $unapplied + + if 0 { + if {$nextprefix ne ""} { + set nextoveridx [expr {$overidx+1}] + if {$nextoveridx >= [llength $inputchunks]} { + lappend inputchunks $nextprefix + } else { + #lset overlines $nextoveridx $nextprefix[lindex $overlines $nextoveridx] + set inputchunks [linsert $inputchunks $nextoveridx $nextprefix] + } + } + } + + if {$nextprefix ne ""} { + set inputchunks [linsert $inputchunks 0 $nextprefix] + } + + + incr overidx + incr loop + if {$loop >= $looplimit} { + puts stderr "overtype::renderspace looplimit reached ($looplimit)" + lappend outputlines "[a+ red bold] - looplimit $looplimit reached[a]" + set Y [a+ yellow bold] + set RST [a] + set sep_header ----DEBUG----- + set debugmsg "" + append debugmsg "${Y}${sep_header}${RST}" \n + append debugmsg "looplimit $looplimit reached\n" + append debugmsg "data_mode:$data_mode\n" + append debugmsg "opt_appendlines:$opt_appendlines\n" + append debugmsg "prev_row :[tcl::dict::get $renderopts -cursor_row]\n" + append debugmsg "prev_col :[tcl::dict::get $renderopts -cursor_column]\n" + tcl::dict::for {k v} $rinfo { + append debugmsg "${Y}$k [ansistring VIEW -lf 1 -vt 1 $v]$RST" \n + } + append debugmsg "${Y}[string repeat - [string length $sep_header]]$RST" \n + + puts stdout $debugmsg + #todo - config regarding error dumps rather than just dumping in working dir + set fd [open [pwd]/error_overtype.txt w] + puts $fd $debugmsg + close $fd + error $debugmsg + break + } + } + + set result [join $outputlines \n] + if {!$opt_info} { + return $result + } else { + #emit to debug window like basictelnet does? make debug configurable as syslog or even a telnet server to allow on 2nd window? + #append result \n$instruction_stats\n + set inforesult [dict create\ + result $result\ + last_instruction $instruction\ + instruction_stats $instruction_stats\ + ] + if {$opt_info == 2} { + return [pdict -channel none inforesult] + } else { + return $inforesult + } + } + } + + #todo - left-right ellipsis ? + proc centre {args} { + variable default_ellipsis_horizontal + if {[llength $args] < 2} { + error {usage: ?-transparent [0|1]? ?-bias [left|right]? ?-overflow [1|0]? undertext overtext} + } + + foreach {underblock overblock} [lrange $args end-1 end] break + + #todo - vertical vs horizontal overflow for blocks + set opts [tcl::dict::create\ + -bias left\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -overflow 0\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + ] + set argsflags [lrange $args 0 end-2] + foreach {k v} $argsflags { + switch -- $k { + -bias - -ellipsis - -ellipsistext - -ellipsiswhitespace - -overflow - -transparent - -exposed1 - -exposed2 { + tcl::dict::set opts $k $v + } + default { + set known_opts [tcl::dict::keys $opts] + error "overtype::centre unknown option '$k'. Known options: $known_opts" + } + } + } + #set opts [tcl::dict::merge $defaults $argsflags] + # -- --- --- --- --- --- + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsis [tcl::dict::get $opts -ellipsis] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + # -- --- --- --- --- --- + + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + set underlines [split $underblock \n] + #set renderwidth [tcl::mathfunc::max {*}[lmap v $underlines {punk::ansi::printing_length $v}]] + lassign [blocksize $underblock] _w renderwidth _h renderheight + set overlines [split $overblock \n] + lassign [blocksize $overblock] _w overblock_width _h overblock_height + set under_exposed_max [expr {$renderwidth - $overblock_width}] + if {$under_exposed_max > 0} { + #background block is wider + if {$under_exposed_max % 2 == 0} { + #even left/right exposure + set left_exposed [expr {$under_exposed_max / 2}] + } else { + set beforehalf [expr {$under_exposed_max / 2}] ;#1 less than half due to integer division + if {[tcl::string::tolower [tcl::dict::get $opts -bias]] eq "left"} { + set left_exposed $beforehalf + } else { + #bias to the right + set left_exposed [expr {$beforehalf + 1}] + } + } + } else { + set left_exposed 0 + } + + set outputlines [list] + if {[punk::ansi::ta::detect_sgr [lindex $overlines 0]]} { + set replay_codes "[punk::ansi::a]" + } else { + set replay_codes "" + } + set replay_codes_underlay "" + set replay_codes_overlay "" + foreach undertext $underlines overtext $overlines { + set overtext_datalen [punk::ansi::printing_length $overtext] + set ulen [punk::ansi::printing_length $undertext] + if {$ulen < $renderwidth} { + set udiff [expr {$renderwidth - $ulen}] + set undertext "$undertext[string repeat { } $udiff]" + } + set undertext $replay_codes_underlay$undertext + set overtext $replay_codes_overlay$overtext + + set overflowlength [expr {$overtext_datalen - $renderwidth}] + #review - right-to-left langs should elide on left! - extra option required + + if {$overflowlength > 0} { + #overlay line wider or equal + #review - we still expand_right for centred for now.. possibly should do something like -expand_leftright with ellipsis each end? + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -expand_right [tcl::dict::get $opts -overflow] -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 $undertext $overtext] + set rendered [tcl::dict::get $rinfo result] + set overflow_right [tcl::dict::get $rinfo overflow_right] + set unapplied [tcl::dict::get $rinfo unapplied] + #todo - get replay_codes from overflow_right instead of wherever it was truncated? + + #overlay line data is wider - trim if overflow not specified in opts - and overtype an ellipsis at right if it was specified + if {![tcl::dict::get $opts -overflow]} { + #lappend outputlines [tcl::string::range $overtext 0 [expr {$renderwidth - 1}]] + #set overtext [tcl::string::range $overtext 0 $renderwidth-1 ] + if {$opt_ellipsis} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + #don't use tcl::string::range on ANSI data + #set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + set lostdata "" + if {$overflow_right ne ""} { + append lostdata $overflow_right + } + if {$unapplied ne ""} { + append lostdata $unapplied + } + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set rendered [overtype::right $rendered $opt_ellipsistext] + } + } + } + lappend outputlines $rendered + #lappend outputlines [renderline -insert_mode 0 -transparent $opt_transparent $undertext $overtext] + } else { + #background block is wider than or equal to data for this line + #lappend outputlines [renderline -insert_mode 0 -startcolumn [expr {$left_exposed + 1}] -transparent $opt_transparent -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 $undertext $overtext] + set rinfo [renderline -info 1 -insert_mode 0 -startcolumn [expr {$left_exposed + 1}] -transparent $opt_transparent -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 $undertext $overtext] + lappend outputlines [tcl::dict::get $rinfo result] + } + set replay_codes_underlay [tcl::dict::get $rinfo replay_codes_underlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + } + return [join $outputlines \n] + } + + #overtype::right is for a rendered ragged underblock and a rendered ragged overblock + #ie we can determine the block width for bost by examining the lines and picking the longest. + # + proc right {args} { + #NOT the same as align-right - which should be done to the overblock first if required + variable default_ellipsis_horizontal + # @d !todo - implement overflow, length checks etc + + if {[llength $args] < 2} { + error {usage: ?-overflow [1|0]? ?-transparent 0|? undertext overtext} + } + foreach {underblock overblock} [lrange $args end-1 end] break + + set opts [tcl::dict::create\ + -bias ignored\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -overflow 0\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -align "left"\ + ] + set argsflags [lrange $args 0 end-2] + tcl::dict::for {k v} $argsflags { + switch -- $k { + -bias - -ellipsis - -ellipsistext - -ellipsiswhitespace - -overflow - -transparent - -exposed1 - -exposed2 - -align { + tcl::dict::set opts $k $v + } + default { + set known_opts [tcl::dict::keys $opts] + error "overtype::centre unknown option '$k'. Known options: $known_opts" + } + } + } + #set opts [tcl::dict::merge $defaults $argsflags] + # -- --- --- --- --- --- + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsis [tcl::dict::get $opts -ellipsis] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_overflow [tcl::dict::get $opts -overflow] + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + set opt_align [tcl::dict::get $opts -align] + # -- --- --- --- --- --- + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + set underlines [split $underblock \n] + lassign [blocksize $underblock] _w renderwidth _h renderheight + set overlines [split $overblock \n] + #set overblock_width [tcl::mathfunc::max {*}[lmap v $overlines {punk::ansi::printing_length $v}]] + lassign [blocksize $overblock] _w overblock_width _h overblock_height + set under_exposed_max [expr {max(0,$renderwidth - $overblock_width)}] + set left_exposed $under_exposed_max + + + + set outputlines [list] + if {[punk::ansi::ta::detect_sgr [lindex $overlines 0]]} { + set replay_codes "[punk::ansi::a]" + } else { + set replay_codes "" + } + set replay_codes_underlay "" + set replay_codes_overlay "" + foreach undertext $underlines overtext $overlines { + set overtext_datalen [punk::ansi::printing_length $overtext] + set ulen [punk::ansi::printing_length $undertext] + if {$ulen < $renderwidth} { + set udiff [expr {$renderwidth - $ulen}] + #puts xxx + append undertext [string repeat { } $udiff] + } + if {$overtext_datalen < $overblock_width} { + set odiff [expr {$overblock_width - $overtext_datalen}] + switch -- $opt_align { + left { + set startoffset 0 + } + right { + set startoffset $odiff + } + default { + set half [expr {$odiff / 2}] + #set lhs [string repeat { } $half] + #set righthalf [expr {$odiff - $half}] ;#remainder - may be one more - so we are biased left + #set rhs [string repeat { } $righthalf] + set startoffset $half + } + } + } else { + set startoffset 0 ;#negative? + } + + set undertext $replay_codes_underlay$undertext + set overtext $replay_codes_overlay$overtext + + set overflowlength [expr {$overtext_datalen - $renderwidth}] + if {$overflowlength > 0} { + #raw overtext wider than undertext column + set rinfo [renderline\ + -info 1\ + -insert_mode 0\ + -transparent $opt_transparent\ + -exposed1 $opt_exposed1 -exposed2 $opt_exposed2\ + -overflow $opt_overflow\ + -startcolumn [expr {1 + $startoffset}]\ + $undertext $overtext] + set replay_codes [tcl::dict::get $rinfo replay_codes] + set rendered [tcl::dict::get $rinfo result] + if {!$opt_overflow} { + if {$opt_ellipsis} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set ellipsis $replay_codes$opt_ellipsistext + #todo - overflow on left if allign = right?? + set rendered [overtype::right $rendered $ellipsis] + } + } + } + lappend outputlines $rendered + } else { + #padded overtext + #lappend outputlines [renderline -insert_mode 0 -transparent $opt_transparent -startcolumn [expr {$left_exposed + 1}] $undertext $overtext] + #Note - we still need overflow(exapnd_right) here - as although the overtext is short - it may oveflow due to the startoffset + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -expand_right $opt_overflow -startcolumn [expr {$left_exposed + 1 + $startoffset}] $undertext $overtext] + lappend outputlines [tcl::dict::get $rinfo result] + } + set replay_codes [tcl::dict::get $rinfo replay_codes] + set replay_codes_underlay [tcl::dict::get $rinfo replay_codes_underlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + } + + return [join $outputlines \n] + } + + proc left {args} { + overtype::block -blockalign left {*}$args + } + #overtype a (possibly ragged) underblock with a (possibly ragged) overblock + proc block {args} { + variable default_ellipsis_horizontal + # @d !todo - implement overflow, length checks etc + + if {[llength $args] < 2} { + error {usage: ?-blockalign left|centre|right? ?-textalign left|centre|right? ?-overflow [1|0]? ?-transparent 0|? undertext overtext} + } + #foreach {underblock overblock} [lrange $args end-1 end] break + lassign [lrange $args end-1 end] underblock overblock + + set opts [tcl::dict::create\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -overflow 0\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -textalign "left"\ + -textvertical "top"\ + -blockalign "left"\ + -blockalignbias left\ + -blockvertical "top"\ + ] + set argsflags [lrange $args 0 end-2] + tcl::dict::for {k v} $argsflags { + switch -- $k { + -blockalignbias - -ellipsis - -ellipsistext - -ellipsiswhitespace - -overflow - -transparent - -exposed1 - -exposed2 - -textalign - -blockalign - -blockvertical { + tcl::dict::set opts $k $v + } + default { + error "overtype::block unknown option '$k'. Known options: [tcl::dict::keys $opts]" + } + } + } + # -- --- --- --- --- --- + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsis [tcl::dict::get $opts -ellipsis] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_overflow [tcl::dict::get $opts -overflow] + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + set opt_textalign [tcl::dict::get $opts -textalign] + set opt_blockalign [tcl::dict::get $opts -blockalign] + if {$opt_blockalign eq "center"} { + set opt_blockalign "centre" + } + # -- --- --- --- --- --- + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + set underlines [split $underblock \n] + lassign [blocksize $underblock] _w renderwidth _h renderheight + set overlines [split $overblock \n] + #set overblock_width [tcl::mathfunc::max {*}[lmap v $overlines {punk::ansi::printing_length $v}]] + lassign [blocksize $overblock] _w overblock_width _h overblock_height + set under_exposed_max [expr {max(0,$renderwidth - $overblock_width)}] + + switch -- $opt_blockalign { + left { + set left_exposed 0 + } + right { + set left_exposed $under_exposed_max + } + centre { + if {$under_exposed_max > 0} { + #background block is wider + if {$under_exposed_max % 2 == 0} { + #even left/right exposure + set left_exposed [expr {$under_exposed_max / 2}] + } else { + set beforehalf [expr {$under_exposed_max / 2}] ;#1 less than half due to integer division + if {[tcl::string::tolower [tcl::dict::get $opts -blockalignbias]] eq "left"} { + set left_exposed $beforehalf + } else { + #bias to the right + set left_exposed [expr {$beforehalf + 1}] + } + } + } else { + set left_exposed 0 + } + } + default { + set left_exposed 0 + } + } + + + + set outputlines [list] + if {[punk::ansi::ta::detect_sgr [lindex $overlines 0]]} { + set replay_codes "[punk::ansi::a]" + } else { + set replay_codes "" + } + set replay_codes_underlay "" + set replay_codes_overlay "" + foreach undertext $underlines overtext $overlines { + set overtext_datalen [punk::ansi::printing_length $overtext] + set ulen [punk::ansi::printing_length $undertext] + if {$ulen < $renderwidth} { + set udiff [expr {$renderwidth - $ulen}] + #puts xxx + append undertext [string repeat { } $udiff] + } + if {$overtext_datalen < $overblock_width} { + set odiff [expr {$overblock_width - $overtext_datalen}] + switch -- $opt_textalign { + left { + set startoffset 0 + } + right { + set startoffset $odiff + } + default { + set half [expr {$odiff / 2}] + #set lhs [string repeat { } $half] + #set righthalf [expr {$odiff - $half}] ;#remainder - may be one more - so we are biased left + #set rhs [string repeat { } $righthalf] + set startoffset $half + } + } + } else { + set startoffset 0 ;#negative? + } + + set undertext $replay_codes_underlay$undertext + set overtext $replay_codes_overlay$overtext + + set overflowlength [expr {$overtext_datalen - $renderwidth}] + if {$overflowlength > 0} { + #raw overtext wider than undertext column + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 -expand_right $opt_overflow -startcolumn [expr {1 + $startoffset}] $undertext $overtext] + set replay_codes [tcl::dict::get $rinfo replay_codes] + set rendered [tcl::dict::get $rinfo result] + set overflow_right [tcl::dict::get $rinfo overflow_right] + set unapplied [tcl::dict::get $rinfo unapplied] + if {!$opt_overflow} { + if {$opt_ellipsis} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + #don't use tcl::string::range on ANSI data + #set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + set lostdata "" + if {$overflow_right ne ""} { + append lostdata $overflow_right + } + if {$unapplied ne ""} { + append lostdata $unapplied + } + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set rendered [overtype::block -blockalign right $rendered $opt_ellipsistext] + } + } + + #if {$opt_ellipsis} { + # set show_ellipsis 1 + # if {!$opt_ellipsiswhitespace} { + # #we don't want ellipsis if only whitespace was lost + # set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + # if {[tcl::string::trim $lostdata] eq ""} { + # set show_ellipsis 0 + # } + # } + # if {$show_ellipsis} { + # set ellipsis [tcl::string::cat $replay_codes $opt_ellipsistext] + # #todo - overflow on left if allign = right?? + # set rendered [overtype::right $rendered $ellipsis] + # } + #} + } + lappend outputlines $rendered + } else { + #padded overtext + #lappend outputlines [renderline -insert_mode 0 -transparent $opt_transparent -startcolumn [expr {$left_exposed + 1}] $undertext $overtext] + #Note - we still need expand_right here - as although the overtext is short - it may oveflow due to the startoffset + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -expand_right $opt_overflow -startcolumn [expr {$left_exposed + 1 + $startoffset}] $undertext $overtext] + #puts stderr "--> [ansistring VIEW -lf 1 -nul 1 $rinfo] <--" + set overflow_right [tcl::dict::get $rinfo overflow_right] + set unapplied [tcl::dict::get $rinfo unapplied] + lappend outputlines [tcl::dict::get $rinfo result] + } + set replay_codes [tcl::dict::get $rinfo replay_codes] + set replay_codes_underlay [tcl::dict::get $rinfo replay_codes_underlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + } + + return [join $outputlines \n] + } + + variable optimise_ptruns 10 ;# can be set to zero to disable the ptruns branches + + # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### + # renderline written from a left-right line orientation perspective as a first-shot at getting something useful. + # ultimately right-to-left, top-to-bottom and bottom-to-top are probably needed. + # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### + # + # + #-returnextra enables returning of overflow and length + #review - use punk::ansi::ta::detect to short-circuit processing and do simpler string calcs as an optimisation? + #review - DECSWL/DECDWL double width line codes - very difficult/impossible to align and compose with other elements + #(could render it by faking it with sixels and a lot of work - find/make a sixel font and ensure it's exactly 2 cols per char? + # This would probably be impractical to support for different fonts) + #todo - review transparency issues with single/double width characters + #bidi - need a base direction and concept of directional runs for RTL vs LTR - may be best handled at another layer? + proc renderline {args} { + #*** !doctools + #[call [fun overtype::renderline] [arg args] ] + #[para] renderline is the core engine for overtype string processing (frames & textblocks), and the raw mode commandline repl for the Tcl Punk Shell + #[para] It is also a central part of an ansi (micro) virtual terminal-emulator of sorts + #[para] This system does a half decent job at rendering 90's ANSI art to manipulable colour text blocks that can be joined & framed for layout display within a unix or windows terminal + #[para] Renderline helps maintain ANSI text styling reset/replay codes so that the styling of one block doesn't affect another. + #[para] Calling on the punk::ansi library - it can coalesce codes to keep the size down. + #[para] It is a giant mess of doing exactly what common wisdom says not to do... lots at once. + #[para] renderline is part of the Unicode and ANSI aware Overtype system which 'renders' a block of text onto a static underlay + #[para] The underlay is generally expected to be an ordered set of lines or a rectangular text block analogous to a terminal screen - but it can also be ragged in line length, or just blank. + #[para] The overlay couuld be similar - in which case it may often be used to overwrite a column or section of the underlay. + #[para] The overlay could however be a sequence of ANSI-laden text that jumps all over the place. + # + #[para] renderline itself only deals with a single line - or sometimes a single character. It is generally called from a loop that does further terminal-like or textblock processing. + #[para] By suppyling the -info 1 option - it can return various fields indicating the state of the render. + #[para] The main 3 are the result, overflow_right, and unapplied. + #[para] Renderline handles cursor movements from either keystrokes or ANSI sequences but for a full system the aforementioned loop will need to be in place to manage the set of lines under manipulation. + + #puts stderr "renderline '$args'" + variable optimise_ptruns + + if {[llength $args] < 2} { + error {usage: ?-info 0|1? ?-startcolumn ? ?-cursor_column ? ?-cursor_row |""? ?-transparent [0|1|]? ?-expand_right [1|0]? undertext overtext} + } + set under [lindex $args end-1] + set over [lindex $args end] + #lassign [lrange $args end-1 end] under over + if {[string last \n $under] >= 0} { + error "overtype::renderline not allowed to contain newlines in undertext" + } + #if {[string first \n $over] >=0 || [string first \n $under] >= 0} { + # error "overtype::renderline not allowed to contain newlines" + #} + + #generally faster to create a new dict in the proc than to use a namespace variable to create dict once and link to variable (2024 8.6/8.7) + set opts [tcl::dict::create\ + -etabs 0\ + -width \uFFEF\ + -expand_right 0\ + -transparent 0\ + -startcolumn 1\ + -cursor_column 1\ + -cursor_row ""\ + -insert_mode 1\ + -crm_mode 0\ + -autowrap_mode 1\ + -reverse_mode 0\ + -info 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -cursor_restore_attributes ""\ + -cp437 0\ + -experimental {}\ + ] + #-cursor_restore_attributes only - for replay stack - position and actual setting/restoring handled by throwback to caller + + #cursor_row, when numeric will allow detection of certain row moves that are still within our row - allowing us to avoid an early return + #An empty string for cursor_row tells us we have no info about our own row context, and to return with an unapplied string if any row move occurs + + #exposed1 and exposed2 for first and second col of underying 2wide char which is truncated by transparency, currsor movements to 2nd charcol, or overflow/expand_right + #todo - return info about such grapheme 'cuts' in -info structure and/or create option to raise an error + + set argsflags [lrange $args 0 end-2] + tcl::dict::for {k v} $argsflags { + switch -- $k { + -experimental - -cp437 - -width - -expand_right - -transparent - -startcolumn - -cursor_column - -cursor_row + - -crm_mode - -insert_mode - -autowrap_mode - -reverse_mode + - -info - -exposed1 - -exposed2 - -cursor_restore_attributes { + tcl::dict::set opts $k $v + } + default { + error "overtype::renderline unknown option '$k'. Known options: [tcl::dict::keys $opts]" + } + } + } + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_width [tcl::dict::get $opts -width] + set opt_etabs [tcl::dict::get $opts -etabs] + set opt_expand_right [tcl::dict::get $opts -expand_right] + set opt_colstart [tcl::dict::get $opts -startcolumn] ;#lhs limit for overlay - an offset to cursor_column - first visible column is 1. 0 or < 0 are before the start of the underlay + set opt_colcursor [tcl::dict::get $opts -cursor_column];#start cursor column relative to overlay + set opt_row_context [tcl::dict::get $opts -cursor_row] + if {[string length $opt_row_context]} { + if {![tcl::string::is integer -strict $opt_row_context] || $opt_row_context <1 } { + error "overtype::renderline -cursor_row must be empty for unspecified/unknown or a non-zero positive integer. received: '$opt_row_context'" + } + } + # -- --- --- --- --- --- --- --- --- --- --- --- + #The _mode flags correspond to terminal modes that can be set/reset via escape sequences (e.g DECAWM wraparound mode) + set opt_insert_mode [tcl::dict::get $opts -insert_mode];#should usually be 1 for each new line in editor mode but must be initialised to 1 externally (review) + #default is for overtype + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_autowrap_mode [tcl::dict::get $opts -autowrap_mode] ;#DECAWM - char or movement can go beyond leftmost/rightmost col to prev/next line + set opt_reverse_mode [tcl::dict::get $opts -reverse_mode] ;#DECSNM + set opt_crm_mode [tcl::dict::get $opts -crm_mode];# CRM - show control character mode + # -- --- --- --- --- --- --- --- --- --- --- --- + set temp_cursor_saved [tcl::dict::get $opts -cursor_restore_attributes] + + set cp437_glyphs [tcl::dict::get $opts -cp437] + set cp437_map [tcl::dict::create] + if {$cp437_glyphs} { + set cp437_map [set ::punk::ansi::cp437_map] + #for cp437 images we need to map these *after* splitting ansi + #some old files might use newline for its glyph.. but we can't easily support that. + #Not sure how old files did it.. maybe cr lf in sequence was newline and any lone cr or lf were displayed as glyphs? + tcl::dict::unset cp437_map \n + } + + set opt_transparent [tcl::dict::get $opts -transparent] + if {$opt_transparent eq "0"} { + set do_transparency 0 + } else { + set do_transparency 1 + if {$opt_transparent eq "1"} { + set opt_transparent {[\s]} + } + } + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_returnextra [tcl::dict::get $opts -info] + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + # -- --- --- --- --- --- --- --- --- --- --- --- + + if {$opt_row_context eq ""} { + set cursor_row 1 + } else { + set cursor_row $opt_row_context + } + + set insert_mode $opt_insert_mode ;#default 1 + set autowrap_mode $opt_autowrap_mode ;#default 1 + set crm_mode $opt_crm_mode ;#default 0 (Show Control Character mode) + set reverse_mode $opt_reverse_mode + + #----- + # + if {[info exists punk::console::tabwidth]} { + #punk console is updated if punk::console::set_tabstop_width is used or rep is started/restarted + #It is way too slow to test the current width by querying the terminal here - so it could conceivably get out of sync + #todo - we also need to handle the new threaded repl where console config is in a different thread. + # - also - concept of sub-regions being mini-consoles with their own settings - 'id' for console, or use in/out channels as id? + set tw $::punk::console::tabwidth + } else { + set tw 8 + } + + set overdata $over + if {!$cp437_glyphs} { + #REVIEW! tabify will give different answers for an ANSI colourised string vs plain text + if {!$opt_etabs} { + if {[string first \t $under] >= 0} { + #set under [textutil::tabify::untabify2 $under] + set under [textutil::tabify::untabifyLine $under $tw] + } + if {[string first \t $over] >= 0} { + #set overdata [textutil::tabify::untabify2 $over] + set overdata [textutil::tabify::untabifyLine $over $tw] + } + } + } + #------- + + #ta_detect ansi and do simpler processing? + + #we repeat tests for grapheme width in different loops - rather than create another datastructure to store widths based on column, + #we'll use the grapheme_width_cached function as a lookup table of all graphemes encountered - as there will often be repeats in different positions anyway. + + # -- --- --- --- --- --- --- --- + if {$under ne ""} { + if {[punk::ansi::ta::detect $under]} { + set undermap [punk::ansi::ta::split_codes_single $under] + } else { + #single plaintext part + set undermap [list $under] + } + } else { + set undermap [list] + } + set understacks [list] + set understacks_gx [list] + set pm_list [list] + + set i_u -1 ;#underlay may legitimately be empty + set undercols [list] + set u_codestack [list] + #u_gx_stack probably isn't really a stack - I don't know if g0 g1 can stack or not - for now we support only g0 anyway + set u_gx_stack [list] ;#separate stack for g0 (g1 g2 g3?) graphics on and off (DEC special graphics) + #set pt_underchars "" ;#for string_columns length calculation for expand_right 0 truncation + set remainder [list] ;#for returnextra + foreach {pt code} $undermap { + #pt = plain text + #append pt_underchars $pt + if {$pt ne ""} { + if {$cp437_glyphs} { + set pt [tcl::string::map $cp437_map $pt] + } + set is_ptrun 0 + if {$optimise_ptruns && [tcl::string::length $pt] >= $optimise_ptruns} { + set p1 [tcl::string::index $pt 0] + set hex [format %x [scan $p1 %c]] ;#punk::char::char_hex + set re [tcl::string::cat {^[} \\U$hex {]+$}] + set is_ptrun [regexp $re $pt] + } + if {$is_ptrun} { + #switch -- $p1 { + # " " - - - _ - ! - @ - # - $ - % - ^ - & - * - = - + - : - . - , - / - | - ? - + # a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - + # z - A - B - C - D - E - F - G - H - I - J - K - L - M - N - O - P - Q - R - S - T - U - V - W - X - Y - Z - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 { + # set width 1 + # } + # default { + # if {$p1 eq "\u0000"} { + # #use null as empty cell representation - review + # #use of this will probably collide with some application at some point + # #consider an option to set the empty cell character + # set width 1 + # } else { + # set width [grapheme_width_cached $p1] ;# when zero??? + # } + # } + #} + set width [grapheme_width_cached $p1] ;# when zero??? + set ptlen [string length $pt] + if {$width <= 1} { + #review - 0 and 1? + incr i_u $ptlen + lappend understacks {*}[lrepeat $ptlen $u_codestack] + lappend understacks_gx {*}[lrepeat $ptlen $u_gx_stack] + lappend undercols {*}[lrepeat $ptlen $p1] + } else { + incr i_u $ptlen ;#2nd col empty str - so same as above + set 2ptlen [expr {$ptlen * 2}] + lappend understacks {*}[lrepeat $2ptlen $u_codestack] + lappend understacks_gx {*}[lrepeat $2ptlen $u_gx_stack] + set l [concat {*}[lrepeat $ptlen [list $p1 ""]] + lappend undercols {*}$l + unset l + } + + } else { + foreach grapheme [punk::char::grapheme_split $pt] { + #an ugly but easy hack to serve *some* common case ascii quickly with byte-compiled literal switch - feels dirty. + #.. but even 0.5uS per char (grapheme_width_cached) adds up quickly when stitching lots of lines together. + #todo - test decimal value instead, compare performance + switch -- $grapheme { + " " - - - _ - ! - @ - # - $ - % - ^ - & - * - = - + - : - . - , - / - | - ? - + a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - + z - A - B - C - D - E - F - G - H - I - J - K - L - M - N - O - P - Q - R - S - T - U - V - W - X - Y - Z - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 { + set width 1 + } + default { + if {$grapheme eq "\u0000"} { + #use null as empty cell representation - review + #use of this will probably collide with some application at some point + #consider an option to set the empty cell character + set width 1 + } else { + #zero width still acts as 1 below??? review what should happen + set width [grapheme_width_cached $grapheme] + #we still want most controls and other zero-length codepoints such as \u200d (zero width joiner) to stay zero-length + #we substitute lone ESC that weren't captured within ANSI context as a debugging aid to see malformed ANSI + #todo - default to off and add a flag (?) to enable this substitution + set sub_stray_escapes 0 + if {$sub_stray_escapes && $width == 0} { + if {$grapheme eq "\x1b"} { + set gvis [ansistring VIEW $grapheme] ;#can only use with graphemes that have a single replacement char.. + set grapheme $gvis + set width 1 + } + } + } + } + } + + #set width [grapheme_width_cached $grapheme] + incr i_u + lappend understacks $u_codestack + lappend understacks_gx $u_gx_stack + + lappend undercols $grapheme + if {$width > 1} { + #presumably there are no triple-column or wider unicode chars.. until the aliens arrive.(?) + #but what about emoji combinations etc - can they be wider than 2? + #todo - if -etabs enabled - then we treat \t as the width determined by our elastic tabstop + incr i_u + lappend understacks $u_codestack + lappend understacks_gx $u_gx_stack + lappend undercols "" + } + } + + } + } + #underlay should already have been rendered and not have non-sgr codes - but let's retain the check for them and not stack them if other codes are here + + #only stack SGR (graphics rendition) codes - not title sets, cursor moves etc + #keep any remaining PMs in place + if {$code ne ""} { + set c1c2 [tcl::string::range $code 0 1] + + set leadernorm [tcl::string::range [tcl::string::map [list\ + \x1b\[ 7CSI\ + \x9b 8CSI\ + \x1b\( 7GFX\ + \x1b^ 7PMX\ + \x1bX 7SOS\ + ] $c1c2] 0 3];# leadernorm is 1st 2 chars mapped to normalised indicator - or is original 2 chars + + switch -- $leadernorm { + 7CSI - 8CSI { + #need to exclude certain leaders after the lb e.g < for SGR 1006 mouse + #REVIEW - what else could end in m but be mistaken as a normal SGR code here? + set maybemouse "" + if {[tcl::string::index $c1c2 0] eq "\x1b"} { + set maybemouse [tcl::string::index $code 2] + } + + if {$maybemouse ne "<" && [tcl::string::index $code end] eq "m"} { + if {[punk::ansi::codetype::is_sgr_reset $code]} { + set u_codestack [list "\x1b\[m"] + } elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + set u_codestack [list $code] + } else { + #basic simplification first.. straight dups + set dup_posns [lsearch -all -exact $u_codestack $code] ;#-exact because of square-bracket glob chars + set u_codestack [lremove $u_codestack {*}$dup_posns] + lappend u_codestack $code + } + } + } + 7GFX { + switch -- [tcl::string::index $code 2] { + "0" { + set u_gx_stack [list gx0_on] ;#we'd better use a placeholder - or debugging will probably get into a big mess + } + B { + set u_gx_stack [list] + } + } + } + 7PMX - 7SOS { + #we can have PMs or SOS (start of string) in the underlay - though mostly the PMs should have been processed.. + #attach the PM/SOS (entire ANSI sequence) to the previous grapheme! + #It should not affect the size - but terminal display may get thrown out if terminal doesn't support them. + + #note that there may in theory already be ANSI stored - we don't assume it was a pure grapheme string + set graphemeplus [lindex $undercols end] + if {$graphemeplus ne "\0"} { + append graphemeplus $code + } else { + set graphemeplus $code + } + lset undercols end $graphemeplus + #The grapheme_width_cached function will be called on this later - and doesn't account for ansi. + #we need to manually cache the item with it's proper width + variable grapheme_widths + #stripped and plus version keys pointing to same length + dict set grapheme_widths $graphemeplus [grapheme_width_cached [::punk::ansi::ansistrip $graphemeplus]] + + } + default { + + } + + } + + #if {[punk::ansi::codetype::is_sgr_reset $code]} { + # #set u_codestack [list] + #} elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + #} elseif {[punk::ansi::codetype::is_sgr $code]} { + #} else { + # #leave SGR stack as is + # if {[punk::ansi::codetype::is_gx_open $code]} { + # } elseif {[punk::ansi::codetype::is_gx_close $code]} { + # } + #} + } + #consider also if there are other codes that should be stacked..? + } + + #NULL empty cell indicator + if {$opt_width ne "\uFFEF"} { + if {[llength $understacks]} { + set cs $u_codestack + set gs $u_gx_stack + } else { + set cs [list] + set gs [list] + } + if {[llength $undercols]< $opt_width} { + set diff [expr {$opt_width- [llength $undercols]}] + if {$diff > 0} { + #set undercols [list {*}$undercols {*}[lrepeat $diff "\u0000"]] ;#2024 - much slower + lappend undercols {*}[lrepeat $diff "\u0000"] + lappend understacks {*}[lrepeat $diff $cs] + lappend understacks_gx {*}[lrepeat $diff $gs] + } + } + } + + if {$opt_width ne "\uFFEF"} { + set renderwidth $opt_width + } else { + set renderwidth [llength $undercols] + } + + + if 0 { + # ----------------- + # if we aren't extending understacks & understacks_gx each time we incr idx above the undercols length.. this doesn't really serve a purpose + # Review. + # ----------------- + #replay code for last overlay position in input line + # whether or not we get that far - we need to return it for possible replay on next line + if {[llength $understacks]} { + lappend understacks $u_codestack + lappend understacks_gx $u_gx_stack + } else { + #in case overlay onto emptystring as underlay + lappend understacks [list] + lappend understacks_gx [list] + } + # ----------------- + } + + #trailing codes in effect for underlay + if {[llength $u_codestack]} { + #set replay_codes_underlay [join $u_codestack ""] + set replay_codes_underlay [punk::ansi::codetype::sgr_merge_list {*}$u_codestack] + } else { + set replay_codes_underlay "" + } + + + # -- --- --- --- --- --- --- --- + #### + #if opt_colstart - we need to build a space (or any singlewidth char ?) padding on the left containing the right number of columns. + #this will be processed as transparent - and handle doublewidth underlay characters appropriately + set startpadding [string repeat " " [expr {$opt_colstart -1}]] + #overdata with left padding spaces based on col-start under will show through for left-padding portion regardless of -transparency + if {$startpadding ne "" || $overdata ne ""} { + if {[punk::ansi::ta::detect $overdata]} { + set overmap [punk::ansi::ta::split_codes_single $startpadding$overdata] + } else { + #single plaintext part + set overmap [list $startpadding$overdata] + } + } else { + set overmap [list] + } + #### + + + #todo - detect plain ascii no-ansi input line (aside from leading ansi reset) + #will that allow some optimisations? + + #todo - detect repeated transparent char in overlay + #regexp {^(.)\1+$} ;#backtracking regexp - relatively slow. + # - try set c [string index $data 0]; regexp [string map [list %c% $c] {^[%c%]+$}] $data + #we should be able to optimize to pass through the underlay?? + + #??? + set colcursor $opt_colstart + #TODO - make a little virtual column object + #we need to refer to column1 or columnmin? or columnmax without calculating offsets due to to startcolumn + #need to lock-down what start column means from perspective of ANSI codes moving around - the offset perspective is unclear and a mess. + + + #set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} + #as at 2024-02 punk::char::grapheme_split uses these - not aware of more complex graphemes + + set overstacks [list] + set overstacks_gx [list] + + set o_codestack [list]; #SGR codestack (not other codes such as movement,insert key etc) + set o_gxstack [list] + set pt_overchars "" + set i_o 0 + set overlay_grapheme_control_list [list] ;#tag each with g, sgr or other. 'other' are things like cursor-movement or insert-mode or codes we don't recognise/use + #experiment + set overlay_grapheme_control_stacks [list] + foreach {pt code} $overmap { + if {$pt ne ""} { + #todo - wrap in test for empty pt (we used split_codes_single - and it may be common for sgr sequences to be unmerged and so have empty pts between) + if {$cp437_glyphs} { + set pt [tcl::string::map $cp437_map $pt] + } + append pt_overchars $pt + #will get empty pt between adjacent codes + if {!$crm_mode} { + + set is_ptrun 0 + if {$optimise_ptruns && [tcl::string::length $pt] >= $optimise_ptruns} { + set p1 [tcl::string::index $pt 0] + set re [tcl::string::cat {^[} \\U[format %x [scan $p1 %c]] {]+$}] + set is_ptrun [regexp $re $pt] + + #leading only? we would have to check for graphemes at the trailing boundary? + #set re [tcl::string::cat {^[} \\U[format %x [scan $p1 %c]] {]+}] + #set is_ptrun [regexp -indices $re $pt runrange] + #if {$is_ptrun && 1} { + #} + } + if {$is_ptrun} { + #review - in theory a run over a certain length won't be part of some grapheme combo (graphemes can be long e.g 44?, but not as runs(?)) + #could be edge cases for runs at line end? (should be ok as we include trailing \n in our data) + set len [string length $pt] + set g_element [list g $p1] + + #lappend overstacks {*}[lrepeat $len $o_codestack] + #lappend overstacks_gx {*}[lrepeat $len $o_gxstack] + #incr i_o $len + #lappend overlay_grapheme_control_list {*}[lrepeat $len [list g $p1]] + #lappend overlay_grapheme_control_stacks {*}[lrepeat $len $o_codestack] + + set pi 0 + incr i_o $len + while {$pi < $len} { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + lappend overlay_grapheme_control_list $g_element + lappend overlay_grapheme_control_stacks $o_codestack + incr pi + } + } else { + foreach grapheme [punk::char::grapheme_split $pt] { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + incr i_o + lappend overlay_grapheme_control_list [list g $grapheme] + lappend overlay_grapheme_control_stacks $o_codestack + } + } + } else { + set tsbegin [clock micros] + foreach grapheme_original [punk::char::grapheme_split $pt] { + set pt_crm [ansistring VIEW -nul 1 -lf 2 -vt 2 -ff 2 $grapheme_original] + #puts stderr "ptlen [string length $pt] graphemelen[string length $grapheme_original] pt_crmlen[string length $pt_crm] $pt_crm" + foreach grapheme [punk::char::grapheme_split $pt_crm] { + if {$grapheme eq "\n"} { + lappend overlay_grapheme_control_stacks $o_codestack + lappend overlay_grapheme_control_list [list crmcontrol "\x1b\[00001E"] + } else { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + incr i_o + lappend overlay_grapheme_control_list [list g $grapheme] + lappend overlay_grapheme_control_stacks $o_codestack + } + } + } + set elapsed [expr {[clock micros] - $tsbegin}] + puts stderr "ptlen [string length $pt] elapsedus:$elapsed" + } + } + + #only stack SGR (graphics rendition) codes - not title sets, cursor moves etc + #order of if-else based on assumptions: + # that pure resets are fairly common - more so than leading resets with other info + # that non-sgr codes are not that common, so ok to check for resets before verifying it is actually SGR at all. + if {$code ne ""} { + #we need to immediately set crm_mode here if \x1b\[3h received + if {$code eq "\x1b\[3h"} { + set crm_mode 1 + } elseif {$code eq "\x1b\[3l"} { + set crm_mode 0 + } + #else crm_mode could be set either way from options + if {$crm_mode && $code ne "\x1b\[00001E"} { + #treat the code as type 'g' like above - only allow through codes to reset mode REVIEW for now just \x1b\[3l ? + #we need to somehow convert further \n in the graphical rep to an instruction for newline that will bypass further crm_mode processing or we would loop. + set code_as_pt [ansistring VIEW -nul 1 -lf 1 -vt 1 -ff 1 $code] + #split using standard split for first foreach loop - grapheme based split when processing 2nd foreach loop + set chars [split $code_as_pt ""] + set codeparts [list] ;#list of 2-el lists each element {crmcontrol } or {g } + foreach c $chars { + if {$c eq "\n"} { + #use CNL (cursor next line) \x1b\[00001E ;#leading zeroes ok for this processor - used as debugging aid to distinguish + lappend codeparts [list crmcontrol "\x1b\[00001E"] + } else { + if {[llength $codeparts] > 0 && [lindex $codeparts end 0] eq "g"} { + set existing [lindex $codeparts end 1] + lset codeparts end [list g [string cat $existing $c]] + } else { + lappend codeparts [list g $c] + } + } + } + + set partidx 0 + foreach record $codeparts { + lassign $record rtype rval + switch -exact -- $rtype { + g { + append pt_overchars $rval + foreach grapheme [punk::char::grapheme_split $rval] { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + incr i_o + lappend overlay_grapheme_control_list [list g $grapheme] + lappend overlay_grapheme_control_stacks $o_codestack + } + } + crmcontrol { + #leave o_codestack + lappend overlay_grapheme_control_stacks $o_codestack + lappend overlay_grapheme_control_list [list crmcontrol $rval] + } + } + } + } else { + lappend overlay_grapheme_control_stacks $o_codestack + #there will always be an empty code at end due to foreach on 2 vars with odd-sized list ending with pt (overmap coming from perlish split) + if {[punk::ansi::codetype::is_sgr_reset $code]} { + set o_codestack [list "\x1b\[m"] ;#reset better than empty list - fixes some ansi art issues + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + set o_codestack [list $code] + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[priv::is_sgr $code]} { + #basic simplification first - remove straight dupes + set dup_posns [lsearch -all -exact $o_codestack $code] ;#must be -exact because of square-bracket glob chars + set o_codestack [lremove $o_codestack {*}$dup_posns] + lappend o_codestack $code + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[regexp {\x1b7|\x1b\[s} $code]} { + #experiment + #cursor_save - for the replays review. + #jmn + #set temp_cursor_saved [punk::ansi::codetype::sgr_merge_list {*}$o_codestack] + lappend overlay_grapheme_control_list [list other $code] + } elseif {[regexp {\x1b8|\x1b\[u} $code]} { + #experiment + #cursor_restore - for the replays + set o_codestack [list $temp_cursor_saved] + lappend overlay_grapheme_control_list [list other $code] + } else { + #review + if {[punk::ansi::codetype::is_gx_open $code]} { + set o_gxstack [list "gx0_on"] + lappend overlay_grapheme_control_list [list gx0 gx0_on] ;#don't store code - will complicate debugging if we spit it out and jump character sets + } elseif {[punk::ansi::codetype::is_gx_close $code]} { + set o_gxstack [list] + lappend overlay_grapheme_control_list [list gx0 gx0_off] ;#don't store code - will complicate debugging if we spit it out and jump character sets + } else { + lappend overlay_grapheme_control_list [list other $code] + } + } + } + } + + } + #replay code for last overlay position in input line - should take account of possible trailing sgr code after last grapheme + set max_overlay_grapheme_index [expr {$i_o -1}] + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + + #set replay_codes_overlay [join $o_codestack ""] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}$o_codestack] + + #if {[tcl::dict::exists $overstacks $max_overlay_grapheme_index]} { + # set replay_codes_overlay [join [tcl::dict::get $overstacks $max_overlay_grapheme_index] ""] + #} else { + # set replay_codes_overlay "" + #} + # -- --- --- --- --- --- --- --- + + + #potential problem - combinining diacritics directly following control chars like \r \b + + # -- --- --- + #we need to initialise overflow_idx before any potential row-movements - as they need to perform a loop break and force in_excess to 1 + if {$opt_expand_right} { + #expand_right true means we can have lines as long as we want, but either way there can be excess data that needs to be thrown back to the calling loop. + #we currently only support horizontal expansion to the right (review regarding RTL text!) + set overflow_idx -1 + } else { + #expand_right zero - we can't grow beyond our column width - so we get ellipsis or truncation + if {$opt_width ne "\uFFEF"} { + set overflow_idx [expr {$opt_width}] + } else { + #review - this is also the cursor position when adding a char at end of line? + set overflow_idx [expr {[llength $undercols]}] ;#index at which we would be *in* overflow a row move may still override it + } + } + # -- --- --- + + set outcols $undercols ;#leave undercols as is, outcols can potentially be appended to. + + set unapplied "" ;#if we break for move row (but not for /v ?) + set unapplied_list [list] + + set insert_lines_above 0 ;#return key + set insert_lines_below 0 + set instruction "" + + # -- --- --- + #cursor_save_dec, cursor_restore_dec etc + set cursor_restore_required 0 + set cursor_saved_attributes "" + set cursor_saved_position "" + # -- --- --- + + #set idx 0 ;# line index (cursor - 1) + #set idx [expr {$opt_colstart + $opt_colcursor} -1] + + #idx is the per column output index + set idx [expr {$opt_colcursor -1}] ;#don't use opt_colstart here - we have padded and won't start emitting until idx reaches opt_colstart-1 + #cursor_column is usually one above idx - but we have opt_colstart which is like a margin - todo: remove cursor_column from the following loop and calculate it's offset when breaking or at end. + #(for now we are incrementing/decrementing both in sync - which is a bit silly) + set cursor_column $opt_colcursor + + #idx_over is the per grapheme overlay index + set idx_over -1 + + + #movements only occur within the overlay range. + #an underlay is however not necessary.. e.g + #renderline -expand_right 1 "" data + + #set re_mode {\x1b\[\?([0-9]*)(h|l)} ;#e.g DECAWM + #set re_col_move {\x1b\[([0-9]*)(C|D|G)$} + #set re_row_move {\x1b\[([0-9]*)(A|B)$} + #set re_both_move {\x1b\[([0-9]*)(?:;){0,1}([0-9]*)H$} ;# or "f" ? + #set re_vt_sequence {\x1b\[([0-9]*)(?:;){0,1}([0-9]*)~$} + #set re_cursor_save {\x1b\[s$} ;#note probable incompatibility with DECSLRM (set left right margin)! + #set re_cursor_restore {\x1b\[u$} + #set re_cursor_save_dec {\x1b7$} + #set re_cursor_restore_dec {\x1b8$} + #set re_other_single {\x1b(D|M|E)$} + #set re_decstbm {\x1b\[([0-9]*)(?:;){0,1}([0-9]*)r$} ;#DECSTBM set top and bottom margins + + #puts "-->$overlay_grapheme_control_list<--" + #puts "-->overflow_idx: $overflow_idx" + for {set gci 0} {$gci < [llength $overlay_grapheme_control_list]} {incr gci} { + set gc [lindex $overlay_grapheme_control_list $gci] + lassign $gc type item + + #emit plaintext chars first using existing SGR codes from under/over stack as appropriate + #then check if the following code is a cursor movement within the line and adjust index if so + #foreach ch $overlay_graphemes {} + switch -- $type { + g { + set ch $item + #crm_mode affects both graphic and control + if {0 && $crm_mode} { + set chars [ansistring VIEW -nul 1 -lf 2 -vt 2 -ff 2 $ch] + set chars [string map [list \n "\x1b\[00001E"] $chars] + if {[llength [split $chars ""]] > 1} { + priv::render_unapplied $overlay_grapheme_control_list $gci + #prefix the unapplied controls with the string version of this control + set unapplied_list [linsert $unapplied_list 0 {*}[split $chars ""]] + set unapplied [join $unapplied_list ""] + #incr idx_over + break + } else { + set ch $chars + } + } + incr idx_over; #idx_over (until unapplied reached anyway) is per *grapheme* in the overlay - not per col. + if {($idx < ($opt_colstart -1))} { + incr idx [grapheme_width_cached $ch] + continue + } + #set within_undercols [expr {$idx <= [llength $undercols]-1}] ;#within our active data width + set within_undercols [expr {$idx <= $renderwidth-1}] + + #https://www.enigma.com/resources/blog/the-secret-world-of-newline-characters + #\x85 NEL in the c1 control set is treated by some terminal emulators (e.g Hyper) as a newline, + #on some it's invisble but doesn't change the line, on some it's a visible glyph of width 1. + #This is hard to process in any standard manner - but I think the Hyper behaviour of doing what it was intended is perhaps most reasonable + #We will map it to the same behaviour as lf here for now... but we need also to consider the equivalent ANSI sequence: \x1bE + + set chtest [tcl::string::map [list \n \x85 \b \r \v \x7f ] $ch] + #puts --->chtest:$chtest + #specials - each shoud have it's own test of what to do if it happens after overflow_idx reached + switch -- $chtest { + "" { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + if {$idx == 0} { + #puts "---a at col 1" + #linefeed at column 1 + #leave the overflow_idx ;#? review + set instruction lf_start ;#specific instruction for newline at column 1 + priv::render_unapplied $overlay_grapheme_control_list $gci + break + } elseif {$overflow_idx != -1 && $idx == $overflow_idx} { + #linefeed after final column + #puts "---c at overflow_idx=$overflow_idx" + incr cursor_row + set overflow_idx $idx ;#override overflow_idx even if it was set to -1 due to expand_right = 1 + set instruction lf_overflow ;#only special treatment is to give it it's own instruction in case caller needs to handle differently + priv::render_unapplied $overlay_grapheme_control_list $gci + break + } else { + #linefeed occurred in middle or at end of text + #puts "---mid-or-end-text-linefeed idx:$idx overflow_idx:$overflow_idx" + if {$insert_mode == 0} { + incr cursor_row + if {$idx == -1 || $overflow_idx > $idx} { + #don't set overflow_idx higher if it's already set lower and we're adding graphemes to overflow + set overflow_idx $idx ;#override overflow_idx even if it was set to -1 due to expand_right = 1 + } + set instruction lf_mid + priv::render_unapplied $overlay_grapheme_control_list $gci + break + } else { + incr cursor_row + #don't adjust the overflow_idx + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction lf_mid + break ;# could have overdata following the \n - don't keep processing + } + } + + } + "" { + #will we want/need to use raw for keypresses in terminal? (terminal with LNM in standard reset mode means enter= this is the usual config for terminals) + #So far we are assuming the caller has translated to and handle above.. REVIEW. + + #consider also the old space-carriagereturn softwrap convention used in some terminals. + #In the context of rendering to a block of text - this works similarly in that the space gets eaten so programs emitting space-cr at the terminal width col will pretty much get what they expect. + set idx [expr {$opt_colstart -1}] + set cursor_column $opt_colstart ;#? + } + "" { + #literal backspace char - not necessarily from keyboard + #review - backspace effect on double-width chars - we are taking a column-editing perspective in overtype + #(important for -transparent option - hence replacement chars for half-exposed etc) + #review - overstrike support as per nroff/less (generally considered an old technology replaced by unicode mechanisms and/or ansi SGR) + if {$idx > ($opt_colstart -1)} { + incr idx -1 + incr cursor_column -1 + } else { + set flag 0 + if $flag { + #review - conflicting requirements? Need a different sequence for destructive interactive backspace? + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction backspace_at_start + break + } + } + } + "" { + #literal del character - some terminals send just this for what is generally expected to be a destructive backspace + #We instead treat this as a pure delete at current cursor position - it is up to the repl or terminal to remap backspace key to a sequence that has the desired effect. + priv::render_delchar $idx + } + "" { + #end processing this overline. rest of line is remainder. cursor for column as is. + #REVIEW - this theoretically depends on terminal's vertical tabulation setting (name?) + #e.g it could be configured to jump down 6 rows. + #On the other hand I've seen indications that some modern terminal emulators treat it pretty much as a linefeed. + #todo? + incr cursor_row + set overflow_idx $idx + #idx_over has already been incremented as this is both a movement-control and in some sense a grapheme + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction vt + break + } + default { + if {$overflow_idx != -1} { + #review - how to check arbitrary length item such as tab is going to overflow .. before we get to overflow_idx? + #call grapheme_width_cached on each ch, or look for tab specifically as it's currently the only known reason to have a grapheme width > 2? + #we need to decide what a tab spanning the overflow_idx means and how it affects wrap etc etc + if {$idx == $overflow_idx-1} { + set owidth [grapheme_width_cached $ch] + if {$owidth == 2} { + #review split 2w overflow? + #we don't want to make the decision here to split a 2w into replacement characters at end of line and beginning of next line + #better to consider the overlay char as unable to be applied to the line + #render empty column(?) - and reduce overlay grapheme index by one so that the current ch goes into unapplied + #throwing back to caller with instruction complicates its job - but is necessary to avoid making decsions for it here. + priv::render_addchar $idx "" [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #change the overflow_idx + set overflow_idx $idx + incr idx + incr idx_over -1 ;#set overlay grapheme index back one so that sgr stack from previous overlay grapheme used + priv::render_unapplied $overlay_grapheme_control_list [expr {$gci-1}] ;#note $gci-1 instead of just gci + #throw back to caller's loop - add instruction to caller as this is not the usual case + #caller may for example choose to render a single replacement char to this line and omit the grapheme, or wrap it to the next line + set instruction overflow_splitchar + break + } elseif {$owidth > 2} { + #? tab? + #TODO! + puts stderr "overtype::renderline long overtext grapheme '[ansistring VIEW -lf 1 -vt 1 $ch]' not handled" + #tab of some length dependent on tabstops/elastic tabstop settings? + } + } elseif {$idx >= $overflow_idx} { + #REVIEW + set next_gc [lindex $overlay_grapheme_control_list $gci+1] ;#next grapheme or control + lassign $next_gc next_type next_item + if {$autowrap_mode || $next_type ne "g"} { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci-1]] + #set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #don't incr idx beyond the overflow_idx + #idx_over already incremented - decrement so current overlay grapheme stacks go to unapplied + incr idx_over -1 + #priv::render_unapplied $overlay_grapheme_control_list [expr {$gci-1}] ;#back one index here too + priv::render_this_unapplied $overlay_grapheme_control_list $gci ;# + set instruction overflow + break + } else { + #no point throwing back to caller for each grapheme that is overflowing + #without this branch - renderline would be called with overtext reducing only by one grapheme per call + #processing a potentially long overtext each time (ie - very slow) + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #JMN4 + + } + } + } else { + #review. + #overflow_idx = -1 + #This corresponds to expand_right being true (at least until overflow_idx is in some cases forced to a value when throwing back to calling loop) + } + + if {($do_transparency && [regexp $opt_transparent $ch])} { + #pre opt_colstart is effectively transparent (we have applied padding of required number of columns to left of overlay) + if {$idx > [llength $outcols]-1} { + lappend outcols " " + #tcl::dict::set understacks $idx [list] ;#review - use idx-1 codestack? + #lset understacks $idx [list] ;#will get index $i out of range error + lappend understacks [list] ;#REVIEW + incr idx + incr cursor_column + } else { + #todo - punk::char::char_width + set g [lindex $outcols $idx] + #JMN + set uwidth [grapheme_width_cached $g] + if {[lindex $outcols $idx] eq ""} { + #2nd col of 2-wide char in underlay + incr idx + incr cursor_column + } elseif {$uwidth == 0} { + #e.g control char ? combining diacritic ? + incr idx + incr cursor_column + } elseif {$uwidth == 1} { + set owidth [grapheme_width_cached $ch] + incr idx + incr cursor_column + if {$owidth > 1} { + incr idx + incr cursor_column + } + } elseif {$uwidth > 1} { + if {[grapheme_width_cached $ch] == 1} { + if {!$insert_mode} { + #normal singlewide transparent overlay onto double-wide underlay + set next_pt_overchar [tcl::string::index $pt_overchars $idx_over+1] ;#lookahead of next plain-text char in overlay + if {$next_pt_overchar eq ""} { + #special-case trailing transparent - no next_pt_overchar + incr idx + incr cursor_column + } else { + if {[regexp $opt_transparent $next_pt_overchar]} { + incr idx + incr cursor_column + } else { + #next overlay char is not transparent.. first-half of underlying 2wide char is exposed + #priv::render_addchar $idx $opt_exposed1 [tcl::dict::get $overstacks $idx_over] [tcl::dict::get $overstacks_gx $idx_over] $insert_mode + priv::render_addchar $idx $opt_exposed1 [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + } + } + } else { + #? todo - decide what transparency even means for insert mode + incr idx + incr cursor_column + } + } else { + #2wide transparency over 2wide in underlay - review + incr idx + incr cursor_column + } + } + } + } else { + + set idxchar [lindex $outcols $idx] + #non-transparent char in overlay or empty cell + if {$idxchar eq "\u0000"} { + #empty/erased cell indicator + set uwidth 1 + } else { + set uwidth [grapheme_width_cached $idxchar] + } + if {$within_undercols} { + if {$idxchar eq ""} { + #2nd col of 2wide char in underlay + if {!$insert_mode} { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] 0 + #JMN - this has to expose if our startposn chopped an underlay - but not if we already overwrote the first half of the widechar underlay grapheme + #e.g renderline \uFF21\uFF21--- a\uFF23\uFF23 + #vs + # renderline -startcolumn 2 \uFF21---- \uFF23 + if {[lindex $outcols $idx-1] != ""} { + #verified it's an empty following a filled - so it's a legit underlay remnant (REVIEW - when would it not be??) + #reset previous to an exposed 1st-half - but leave understacks code as is + priv::render_addchar [expr {$idx-1}] $opt_exposed1 [lindex $understacks $idx-1] [lindex $understacks_gx $idx-1] 0 + } + incr idx + } else { + set prevcolinfo [lindex $outcols $idx-1] + #for insert mode - first replace the empty 2ndhalf char with exposed2 before shifting it right + #REVIEW - this leaves a replacement character permanently in our columns.. but it is consistent regarding length (?) + #The alternative is to disallow insertion at a column cursor that is at 2nd half of 2wide char + #perhaps by inserting after the char - this may be worthwhile - but may cause other surprises + #It is perhaps best avoided at another level and try to make renderline do exactly as it's told + #the advantage of this 2w splitting method is that the inserted character ends up in exactly the column we expect. + priv::render_addchar $idx $opt_exposed2 [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] 0 ;#replace not insert + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] 1 ;#insert - same index + if {$prevcolinfo ne ""} { + #we've split the 2wide - it may already have been rendered as an exposed1 - but not for example if our startcolumn was current idx + priv::render_addchar [expr {$idx-1}] $opt_exposed1 [lindex $understacks $idx-1] [lindex $understacks_gx $idx-1] 0 ;#replace not insert + } ;# else?? + incr idx + } + if {$cursor_column < [llength $outcols] || $overflow_idx == -1} { + incr cursor_column + } + } elseif {$uwidth == 0} { + #what if this is some other c0/c1 control we haven't handled specifically? + + #by emitting a preceding empty-string column - we associate whatever this char is with the preceeding non-zero-length character and any existing zero-lengths that follow it + #e.g combining diacritic - increment before over char REVIEW + #arguably the previous overchar should have done this - ie lookahead for combiners? + #if we can get a proper grapheme_split function - this should be easier to tidy up. + priv::render_addchar $idx "" [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column 2 + + if {$cursor_column > [llength $outcols] && $overflow_idx != -1} { + set cursor_column [llength $outcols] + } + } elseif {$uwidth == 1} { + #includes null empty cells + set owidth [grapheme_width_cached $ch] + if {$owidth == 1} { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + } else { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + priv::render_addchar $idx "" [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #if next column in underlay empty - we've overwritten first half of underlying 2wide grapheme + #replace with rhs exposure in case there are no more overlay graphemes coming - use underlay's stack + if {([llength $outcols] >= $idx +2) && [lindex $outcols $idx+1] eq ""} { + priv::render_addchar [expr {$idx+1}] $opt_exposed2 [lindex $understacks $idx+1] [lindex $understacks_gx $idx+1] $insert_mode + } + incr idx + } + if {($cursor_column < [llength $outcols]) || $overflow_idx == -1} { + incr cursor_column + } + } elseif {$uwidth > 1} { + set owidth [grapheme_width_cached $ch] + if {$owidth == 1} { + #1wide over 2wide in underlay + if {!$insert_mode} { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + priv::render_addchar $idx $opt_exposed2 [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #don't incr idx - we are just putting a broken-indication in the underlay - which may get overwritten by next overlay char + } else { + #insert mode just pushes all to right - no exposition char here + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + } + } else { + #2wide over 2wide + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx 2 + incr cursor_column 2 + } + + if {$cursor_column > [llength $outcols] && $overflow_idx != -1} { + set cursor_column [llength $outcols] + } + } + } else { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + } + } + } + } ;# end switch + + + } + other - crmcontrol { + if {$crm_mode && $type ne "crmcontrol" && $item ne "\x1b\[00001E"} { + if {$item eq "\x1b\[3l"} { + set crm_mode 0 + } else { + #When our initial overlay split was done - we weren't in crm_mode - so there are codes that weren't mapped to unicode control character representations + #set within_undercols [expr {$idx <= $renderwidth-1}] + #set chars [ansistring VIEW -nul 1 -lf 2 -vt 2 -ff 2 $item] + set chars [ansistring VIEW -nul 1 -lf 1 -vt 1 -ff 1 $item] + priv::render_unapplied $overlay_grapheme_control_list $gci + #prefix the unapplied controls with the string version of this control + set unapplied_list [linsert $unapplied_list 0 {*}[split $chars ""]] + set unapplied [join $unapplied_list ""] + + break + } + } + + #todo - consider CSI s DECSLRM vs ansi.sys \x1b\[s - we need \x1b\[s for oldschool ansi art - but may have to enable only for that. + #we should possibly therefore reverse this mapping so that x1b7 x1b8 are the primary codes for save/restore? + set code [tcl::string::map [list \x1b7 \x1b\[s \x1b8 \x1b\[u ] $item] + #since this element isn't a grapheme - advance idx_over to next grapheme overlay when about to fill 'unapplied' + + + #remap of DEC cursor_save/cursor_restore from ESC sequence to equivalent CSI + #probably not ideal - consider putting cursor_save/cursor_restore in functions so they can be called from the appropriate switch branch instead of using this mapping + #review - cost/benefit of function calls within these switch-arms instead of inline code? + + set c1 [tcl::string::index $code 0] + set c1c2c3 [tcl::string::range $code 0 2] + #set re_ST_open {(?:\033P|\u0090|\033X|\u0098|\033\^|\u009e|\033_|\u009f)} + #tcl 8.7 - faster to use inline list than to store it in a local var outside of loop. + #(somewhat surprising) + set leadernorm [tcl::string::range [tcl::string::map [list\ + \x1b\[< 1006\ + \x1b\[ 7CSI\ + \x1bY 7MAP\ + \x1bP 7DCS\ + \x90 8DCS\ + \x9b 8CSI\ + \x1b\] 7OSC\ + \x9d 8OSC\ + \x1b 7ESC\ + ] $c1c2c3] 0 3] ;#leadernorm is 1st 1,2 or 3 chars mapped to 4char normalised indicator - or is original first chars (1,2 or 3 len) + + #we leave the tail of the code unmapped for now + switch -- $leadernorm { + 1006 { + #https://invisible-island.net/xterm/ctlseqs/ctlseqs.html + #SGR (1006) CSI < followed by colon separated encoded-button-value,px,py ordinates and final M for button press m for button release + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 3 end]] + } + 7CSI - 7OSC { + #set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 2 end]] + set codenorm $leadernorm[tcl::string::range $code 2 end] + } + 7DCS { + #ESC P + #Device Control String https://invisible-island.net/xterm/ctlseqs/ctlseqs.html#h4-Controls-beginning-with-ESC:ESC-F.C74 + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 2 end]] + } + 8DCS { + #8-bit Device Control String + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 1 end]] + } + 7MAP { + #map to another type of code to share implementation branch + set codenorm $leadernorm[tcl::string::range $code 1 end] + } + 7ESC { + #set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 1 end]] + set codenorm $leadernorm[tcl::string::range $code 1 end] + } + 8CSI - 8OSC { + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 1 end]] + } + default { + puts stderr "Sequence detected as ANSI, but not handled in leadernorm switch. code: [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + #we haven't made a mapping for this + #could in theory be 1,2 or 3 in len + #although we shouldn't be getting here if the regexp for ansi codes is kept in sync with our switch branches + set codenorm $code + } + } + + switch -- $leadernorm { + 7MAP { + switch -- [lindex $codenorm 4] { + Y { + #vt52 movement. we expect 2 chars representing position (limited range) + set params [tcl::string::range $codenorm 5 end] + if {[tcl::string::length $params] != 2} { + #shouldn't really get here or need this branch if ansi splitting was done correctly + puts stderr "overtype::renderline ESC Y recognised as vt52 move, but incorrect parameters length ([string length $params] vs expected 2) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented codenorm:[ansistring VIEW -lf 1 -vt 1 -nul 1 $codenorm]" + } + set line [tcl::string::index $params 5] + set column [tcl::string::index $params 1] + set r [expr {[scan $line %c] -31}] + set c [expr {[scan $column %c] -31}] + + #MAP to: + #CSI n;m H - CUP - Cursor Position + set leadernorm 7CSI + set codenorm "$leadernorm${r}\;${c}H" + } + } + } + } + + #we've mapped 7 and 8bit escapes to values we can handle as literals in switch statements to take advantange of jump tables. + switch -- $leadernorm { + 1006 { + #TODO + # + switch -- [tcl::string::index $codenorm end] { + M { + puts stderr "mousedown $codenorm" + } + m { + puts stderr "mouseup $codenorm" + } + } + + } + {7CSI} - {8CSI} { + set param [tcl::string::range $codenorm 4 end-1] + #puts stdout "--> CSI [tcl::string::index $leadernorm 0] bit param:$param" + set code_end [tcl::string::index $codenorm end] ;#used for e.g h|l set/unset mode + + switch -exact -- $code_end { + A { + #Row move - up + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #todo + lassign [split $param {;}] num modifierkey + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + + if {$num eq ""} {set num 1} + incr cursor_row -$num + + if {$cursor_row < 1} { + set cursor_row 1 + } + + #ensure rest of *overlay* is emitted to remainder + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction up + #retain cursor_column + break + } + B { + #CUD - Cursor Down + #Row move - down + lassign [split $param {;}] num modifierkey + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #move down + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + if {$num eq ""} {set num 1} + incr cursor_row $num + + + incr idx_over ;#idx_over hasn't encountered a grapheme and hasn't advanced yet + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction down + #retain cursor_column + break + } + C { + #CUF - Cursor Forward + #Col move + #puts stdout "->forward" + #todo - consider right-to-left cursor mode (e.g Hebrew).. some day. + #cursor forward + #right-arrow/move forward + lassign [split $param {;}] num modifierkey + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + if {$num eq ""} {set num 1} + + #todo - retrict to moving 1 position past datalen? restrict to column width? + #should ideally wrap to next line when interactive and not on last row + #(some ansi art seems to expect this behaviour) + #This presumably depends on the terminal's wrap mode + #e.g DECAWM autowrap mode + # CSI ? 7 h - set: autowrap (also tput smam) + # CSI ? 7 l - reset: no autowrap (also tput rmam) + set version 2 + if {$version eq "2"} { + set max [llength $outcols] + if {$overflow_idx == -1} { + incr max + } + if {$cursor_column == $max+1} { + #move_forward while in overflow + incr cursor_column -1 + } + + if {($cursor_column + $num) <= $max} { + incr idx $num + incr cursor_column $num + } else { + if {$autowrap_mode} { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #jmn + if {$idx == $overflow_idx} { + incr num + } + + #horizontal movement beyond line extent needs to wrap - throw back to caller + #we may have both overflow_right and unapplied data + #(can have overflow_right if we were in insert_mode and processed chars prior to this movement) + #leave row as is - caller will need to determine how many rows the column-movement has consumed + incr cursor_column $num ;#give our caller the necessary info as columns from start of row + #incr idx_over + #should be gci following last one applied + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction wrapmoveforward + break + } else { + set cursor_column $max + set idx [expr {$cursor_column -1}] + } + } + } else { + #review - dead branch + if {!$expand_right || ($cursor_column + $num) <= [llength $outcols+1]} { + incr idx $num + incr cursor_column $num + } else { + if {!$insert_mode} { + #block editing style with arrow keys + #overtype mode + set idxstart $idx + set idxend [llength $outcols] + set moveend [expr {$idxend - $idxstart}] + if {$moveend < 0} {set moveend 0} ;#sanity? + #puts "idxstart:$idxstart idxend:$idxend outcols[llength $outcols] undercols:[llength $undercols]" + incr idx $moveend + incr cursor_column $moveend + #if {[tcl::dict::exists $understacks $idx]} { + # set stackinfo [tcl::dict::get $understacks $idx] ;#use understack at end - which may or may not have already been replaced by stack from overtext + #} else { + # set stackinfo [list] + #} + if {$idx < [llength $understacks]} { + set stackinfo [lindex $understacks $idx] ;#use understack at end - which may or may not have already been replaced by stack from overtext + } else { + set stackinfo [list] + } + if {$idx < [llength $understacks_gx]} { + #set gxstackinfo [tcl::dict::get $understacks_gx $idx] + set gxstackinfo [lindex $understacks_gx $idx] + } else { + set gxstackinfo [list] + } + #pad outcols + set movemore [expr {$num - $moveend}] + #assert movemore always at least 1 or we wouldn't be in this branch + for {set m 1} {$m <= $movemore} {incr m} { + incr idx + incr cursor_column + priv::render_addchar $idx " " $stackinfo $gxstackinfo $insert_mode + } + } else { + #normal - insert + incr idx $num + incr cursor_column $num + if {$idx > [llength $outcols]} { + set idx [llength $outcols];#allow one beyond - for adding character at end of line + set cursor_column [expr {[llength $outcols]+1}] + } + } + } + } + } + D { + #Col move + #puts stdout "<-back" + #cursor back + #left-arrow/move-back when ltr mode + lassign [split $param {;}] num modifierkey + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + if {$num eq ""} {set num 1} + + set version 2 + if {$version eq "2"} { + #todo - startcolumn offset! + if {$cursor_column - $num >= 1} { + incr idx -$num + incr cursor_column -$num + } else { + if {!$autowrap_mode} { + set cursor_column 1 + set idx 0 + } else { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr cursor_column -$num + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction wrapmovebackward + break + } + } + } else { + incr idx -$num + incr cursor_column -$num + if {$idx < $opt_colstart-1} { + #wrap to previous line and position cursor at end of data + set idx [expr {$opt_colstart-1}] + set cursor_column $opt_colstart + } + } + } + E { + #CNL - Cursor Next Line + if {$param eq ""} { + set downmove 1 + } else { + set downmove [expr {$param}] + } + puts stderr "renderline CNL down-by-$downmove" + set cursor_column 1 + set cursor_row [expr {$cursor_row + $downmove}] + set idx [expr {$cursor_column -1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move + break + + } + F { + #CPL - Cursor Previous Line + if {$param eq ""} { + set upmove 1 + } else { + set upmove [expr {$param}] + } + puts stderr "renderline CPL up-by-$upmove" + set cursor_column 1 + set cursor_row [expr {$cursor_row -$upmove}] + if {$cursor_row < 1} { + set cursor_row 1 + } + set idx [expr {$cursor_column - 1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move + break + + } + G { + #CHA - Cursor Horizontal Absolute (move to absolute column no) + if {$param eq ""} { + set targetcol 1 + } else { + set targetcol $param + if {![string is integer -strict $targetcol]} { + puts stderr "renderline CHA (Cursor Horizontal Absolute) error. Unrecognised parameter '$param'" + } + set targetcol [expr {$param}] + set max [llength $outcols] + if {$overflow_idx == -1} { + incr max + } + if {$targetcol > $max} { + puts stderr "renderline CHA (Cursor Horizontal Absolute) error. Param '$param' > max: $max" + set targetcol $max + } + } + #adjust to colstart - as column 1 is within overlay + #??? REVIEW + set idx [expr {($targetcol -1) + $opt_colstart -1}] + + + set cursor_column $targetcol + #puts stderr "renderline absolute col move ESC G (TEST)" + } + H - f { + #CSI n;m H - CUP - Cursor Position + + #CSI n;m f - HVP - Horizontal Vertical Position REVIEW - same as CUP with differences (what?) in some terminal modes + # - 'counts as effector format function (like CR or LF) rather than an editor function (like CUD or CNL)' + # - REVIEW + #see Annex A at: https://www.ecma-international.org/wp-content/uploads/ECMA-48_5th_edition_june_1991.pdf + + #test e.g ansicat face_2.ans + #$re_both_move + lassign [split $param {;}] paramrow paramcol + #missing defaults to 1 + #CSI ;5H = CSI 1;5H -> row 1 col 5 + #CSI 17;H = CSI 17H = CSI 17;1H -> row 17 col 1 + + if {$paramcol eq ""} {set paramcol 1} + if {$paramrow eq ""} {set paramrow 1} + if {![string is integer -strict $paramcol] || ![string is integer -strict $paramrow]} { + puts stderr "renderline CUP (CSI H) unrecognised param $param" + #ignore? + } else { + set max [llength $outcols] + if {$overflow_idx == -1} { + incr max + } + if {$paramcol > $max} { + set target_column $max + } else { + set target_column [expr {$paramcol}] + } + + + if {$paramrow < 1} { + puts stderr "renderline CUP (CSI H) bad row target 0. Assuming 1" + set target_row 1 + } else { + set target_row [expr {$paramrow}] + } + if {$target_row == $cursor_row} { + #col move only - no need for break and move + #puts stderr "renderline CUP col move only to col $target_column param:$param" + set cursor_column $target_column + set idx [expr {$cursor_column -1}] + } else { + set cursor_row $target_row + set cursor_column $target_column + set idx [expr {$cursor_column -1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move + break + } + } + } + J { + set modegroup [tcl::string::index $codenorm 4] ;#e.g ? + switch -exact -- $modegroup { + ? { + #CSI ? Pn J - selective erase + puts stderr "overtype::renderline ED - SELECTIVE ERASE IN DISPLAY (UNIMPLEMENTED) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + default { + puts stderr "overtype::renderline ED - ERASE IN DISPLAY (TESTING) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + if {$param eq ""} {set param 0} + switch -exact -- $param { + 0 { + #clear from cursor to end of screen + } + 1 { + #clear from cursor to beginning of screen + } + 2 { + #clear entire screen + #ansi.sys - move cursor to upper left REVIEW + set cursor_row 1 + set cursor_column 1 + set idx [expr {$cursor_column -1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + if {[llength $outcols]} { + priv::render_erasechar 0 [llength $outcols] + } + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction clear_and_move + break + } + 3 { + #clear entire screen. presumably cursor doesn't move - otherwise it would be same as 2J ? + + } + default { + } + } + + } + } + } + K { + #see DECECM regarding background colour + set modegroup [tcl::string::index $codenorm 4] ;#e.g ? + switch -exact -- $modegroup { + ? { + puts stderr "overtype::renderline DECSEL - SELECTIVE ERASE IN LINE (UNIMPLEMENTED) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + set param [string range $param 1 end] ;#chop qmark + if {$param eq ""} {set param 0} + switch -exact -- $param { + 0 { + #clear from cursor to end of line - depending on DECSCA + } + 1 { + #clear from cursor to beginning of line - depending on DECSCA + + } + 2 { + #clear entire line - depending on DECSCA + } + default { + puts stderr "overtype::renderline DECSEL - SELECTIVE ERASE IN LINE PARAM '$param' unrecognised [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + default { + puts stderr "overtype::renderline EL - ERASE IN LINE (TESTING) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + if {$param eq ""} {set param 0} + switch -exact -- $param { + 0 { + #clear from cursor to end of line + } + 1 { + #clear from cursor to beginning of line + + } + 2 { + #clear entire line + } + default { + puts stderr "overtype::renderline EL - ERASE IN LINE PARAM '$param' unrecognised [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + } + } + } + L { + puts stderr "overtype::renderline IL - Insert Line - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + M { + #CSI Pn M - DL - Delete Line + puts stderr "overtype::renderline DL - Delete Line - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + + } + T { + #CSI Pn T - SD Pan Up (empty lines introduced at top) + #CSI Pn+T - kitty extension (lines at top come from scrollback buffer) + #Pn new lines appear at top of the display, Pn old lines disappear at the bottom of the display + if {$param eq "" || $param eq "0"} {set param 1} + if {[string index $param end] eq "+"} { + puts stderr "overtype::renderline CSI Pn + T - kitty Pan Up - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } else { + puts stderr "overtype::renderline CSI Pn T - SD Pan Up - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + X { + puts stderr "overtype::renderline X ECH ERASE CHARACTER - $param" + #ECH - erase character + if {$param eq "" || $param eq "0"} {set param 1}; #param=count of chars to erase + priv::render_erasechar $idx $param + #cursor position doesn't change. + } + q { + set code_secondlast [tcl::string::index $codenorm end-1] + switch -exact -- $code_secondlast { + {"} { + #DECSCA - Select Character Protection Attribute + #(for use with selective erase: DECSED and DECSEL) + set param [tcl::string::range $codenorm 4 end-2] + if {$param eq ""} {set param 0} + #TODO - store like SGR in stacks - replays? + switch -exact -- $param { + 0 - 2 { + #canerase + puts stderr "overtype::renderline - DECSCA canerase not implemented - [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + 1 { + #cannoterase + puts stderr "overtype::renderline - DECSCA cannoterase not implemented - [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + default { + puts stderr "overtype::renderline DECSCA param '$param' not understood [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + default { + puts stderr "overtype::renderline - CSI ... q not implemented - [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + r { + #$re_decstbm + #https://www.vt100.net/docs/vt510-rm/DECSTBM.html + #This control function sets the top and bottom margins for the current page. You cannot perform scrolling outside the margins + lassign [split $param {;}] margin_top margin_bottom + + #todo - return these for the caller to process.. + puts stderr "overtype::renderline DECSTBM set top and bottom margin not implemented" + #Also moves the cursor to col 1 line 1 of the page + set cursor_column 1 + set cursor_row 1 + + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move ;#own instruction? decstbm? + break + } + s { + #code conflict between ansi emulation and DECSLRM - REVIEW + #ANSISYSSC (when no parameters) - like other terminals - essentially treat same as DECSC + # todo - when parameters - support DECSLRM instead + + if {$param ne ""} { + #DECSLRM - should only be recognised if DECLRMM is set (vertical split screen mode) + lassign [split $param {;} margin_left margin_right + puts stderr "overtype DECSLRM not yet supported - got [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + if {$margin_left eq ""} { + set margin_left 1 + } + set columns_per_page 80 ;#todo - set to 'page width (DECSCPP set columns per page)' - could be 132 or?? + if {$margin_right eq ""} { + set margin_right $columns_per_page + } + puts stderr "DECSLRM margin left: $margin_left margin right: $margin_right" + if {![string is integer -strict $margin_left] || $margin_left < 0} { + puts stderr "DECSLRM invalid margin_left" + } + if {![string is integer -strict $margin_right] || $margin_right < 0} { + puts stderr "DECSLRM invalid margin_right" + } + set scrolling_region_size [expr {$margin_right - $margin_left}] + if {$scrolling_region_size < 2 || $scrolling_region_size > $columns_per_page} { + puts stderr "DECSLRM region size '$scrolling_regsion_size' must be between 1 and $columns_per_page" + } + #todo + + + } else { + #DECSC + #//notes on expected behaviour: + #DECSC - saves following items in terminal's memory + #cursor position + #character attributes set by the SGR command + #character sets (G0,G1,G2 or G3) currently in GL and GR + #Wrap flag (autowrap or no autowrap) + #State of origin mode (DECOM) + #selective erase attribute + #any single shift 2 (SS2) or single shift 3(SSD) functions sent + + #$re_cursor_save + #cursor save could come after last column + if {$overflow_idx != -1 && $idx == $overflow_idx} { + #bartman2.ans test file - fixes misalignment at bottom of dialog bubble + #incr cursor_row + #set cursor_column 1 + #bwings1.ans test file - breaks if we actually incr cursor (has repeated saves) + set cursor_saved_position [list row [expr {$cursor_row+1}] column 1] + } else { + set cursor_saved_position [list row $cursor_row column $cursor_column] + } + #there may be overlay stackable codes emitted that aren't in the understacks because they come between the last emmited character and the cursor_save control. + #we need the SGR and gx overlay codes prior to the cursor_save + + #a real terminal would not be able to know the state of the underlay.. so we should probably ignore it. + #set sgr_stack [lindex $understacks $idx] + #set gx_stack [lindex $understacks_gx $idx] ;#not actually a stack - just a boolean state (for now?) + + set sgr_stack [list] + set gx_stack [list] + + #we shouldn't need to scan for intermediate cursor save/restores - as restores would throw-back to the calling loop - so our overlay 'line' is since those. + #The overlay_grapheme_control_list had leading resets from previous lines - so we go back to the beginning not just the first grapheme. + + foreach gc [lrange $overlay_grapheme_control_list 0 $gci-1] { + lassign $gc type code + #types g other sgr gx0 + switch -- $type { + gx0 { + #code is actually a stand-in for the graphics on/off code - not the raw code + #It is either gx0_on or gx0_off + set gx_stack [list $code] + } + sgr { + #code is the raw code + if {[punk::ansi::codetype::is_sgr_reset $code]} { + #jmn + set sgr_stack [list "\x1b\[m"] + } elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + set sgr_stack [list $code] + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[priv::is_sgr $code]} { + #often we don't get resets - and codes just pile up. + #as a first step to simplifying - at least remove earlier straight up dupes + set dup_posns [lsearch -all -exact $sgr_stack $code] ;#needs -exact - codes have square-brackets (glob chars) + set sgr_stack [lremove $sgr_stack {*}$dup_posns] + lappend sgr_stack $code + } + } + } + } + set cursor_saved_attributes "" + switch -- [lindex $gx_stack 0] { + gx0_on { + append cursor_saved_attributes "\x1b(0" + } + gx0_off { + append cursor_saved_attributes "\x1b(B" + } + } + #append cursor_saved_attributes [join $sgr_stack ""] + append cursor_saved_attributes [punk::ansi::codetype::sgr_merge_list {*}$sgr_stack] + + #as there is apparently only one cursor storage element we don't need to throw back to the calling loop for a save. + + #don't incr index - or the save will cause cursor to move to the right + #carry on + } + } + u { + #ANSISYSRC save cursor (when no parameters) (DECSC) + + #$re_cursor_restore + #we are going to jump somewhere.. for now we will assume another line, and process accordingly. + #The caller has the cursor_saved_position/cursor_saved_attributes if any (?review - if we always pass it back it, we could save some calls for moves in same line) + #don't set overflow at this point. The existing underlay to the right must be preserved. + #we only want to jump and render the unapplied at the new location. + + #lset overstacks $idx_over [list] + #set replay_codes_overlay "" + + #if {$cursor_saved_attributes ne ""} { + # set replay_codes_overlay $cursor_saved_attributes ;#empty - or last save if it happend in this input chunk + #} else { + #jj + #set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + set replay_codes_overlay "" + #} + + #like priv::render_unapplied - but without the overlay's ansi reset or gx stacks from before the restore code + incr idx_over + + set unapplied "" + set unapplied_list [list] + foreach gc [lrange $overlay_grapheme_control_list $gci+1 end] { + lassign $gc type item + if {$type eq "gx0"} { + if {$item eq "gx0_on"} { + lappend unapplied_list "\x1b(0" + } elseif {$item eq "gx0_off"} { + lappend unapplied_list "\x1b(B" + } + } else { + lappend unapplied_list $item + } + #incr idx_over + } + set unapplied [join $unapplied_list ""] + #if the save occured within this line - that's ok - it's in the return value list and caller can prepend for the next loop. + set instruction restore_cursor + break + } + "{" { + + puts stderr "renderline warning - CSI.. - unimplemented [ansistring VIEW -lf 1 -nul 1 $code]" + } + "}" { + set code_secondlast [tcl::string::index $codenorm end-1] + switch -exact -- $code_secondlast { + ' { + puts stderr "renderline warning - DECIC - Insert Column - CSI...' - unimplemented [ansistring VIEW -lf 1 -nul 1 $code]" + } + default { + puts stderr "renderline warning - CSI.. - unimplemented [ansistring VIEW -lf 1 -nul 1 $code]" + } + } + } + ~ { + set code_secondlast [tcl::string::index $codenorm end-1] ;#used for e.g CSI x '~ + switch -exact -- $code_secondlast { + ' { + #DECDC - editing sequence - Delete Column + puts stderr "renderline warning - DECDC - unimplemented" + } + default { + #$re_vt_sequence + lassign [split $param {;}] key mod + + #Note that f1 to f4 show as ESCOP|Q|R|S (VT220?) but f5+ show as ESC\[15~ + # + #e.g esc \[2~ insert esc \[2;2~ shift-insert + #mod - subtract 1, and then use bitmask + #shift = 1, (left)Alt = 2, control=4, meta=8 (meta seems to do nothing on many terminals on windows? Intercepted by windows?) + #puts stderr "vt key:$key mod:$mod code:[ansistring VIEW $code]" + if {$key eq "1"} { + #home + } elseif {$key eq "2"} { + #Insert + if {$mod eq ""} { + #no modifier key + set insert_mode [expr {!$insert_mode}] + #rather than set the cursor - we return the insert mode state so the caller can decide + } + } elseif {$key eq "3"} { + #Delete - presumably this shifts other chars in the line, with empty cells coming in from the end + switch -- $mod { + "" { + priv::render_delchar $idx + } + "5" { + #ctrl-del - delete to end of word (pwsh) - possibly word on next line if current line empty(?) + } + } + } elseif {$key eq "4"} { + #End + } elseif {$key eq "5"} { + #pgup + } elseif {$key eq "6"} { + #pgDn + } elseif {$key eq "7"} { + #Home + #?? + set idx [expr {$opt_colstart -1}] + set cursor_column 1 + } elseif {$key eq "8"} { + #End + } elseif {$key eq "11"} { + #F1 - or ESCOP or e.g shift F1 ESC\[1;2P + } elseif {$key eq "12"} { + #F2 - or ESCOQ + } elseif {$key eq "13"} { + #F3 - or ESCOR + } elseif {$key eq "14"} { + #F4 - or ESCOS + } elseif {$key eq "15"} { + #F5 or shift F5 ESC\[15;2~ + } elseif {$key eq "17"} { + #F6 + } elseif {$key eq "18"} { + #F7 + } elseif {$key eq "19"} { + #F8 + } elseif {$key eq "20"} { + #F9 + } elseif {$key eq "21"} { + #F10 + } elseif {$key eq "23"} { + #F11 + } elseif {$key eq "24"} { + #F12 + } + + } + } + + } + h - l { + #set mode unset mode + #we are matching only last char to get to this arm - but are there other sequences ending in h|l we need to handle? + + #$re_mode if first after CSI is "?" + #some docs mention ESC=h|l - not seen on windows terminals.. review + #e.g https://www2.math.upenn.edu/~kazdan/210/computer/ansi.html + set modegroup [tcl::string::index $codenorm 4] ;#e.g ? = + switch -exact -- $modegroup { + ? { + set smparams [tcl::string::range $codenorm 5 end-1] ;#params between ? and h|l + #one or more modes can be set + set smparam_list [split $smparams {;}] + foreach num $smparam_list { + switch -- $num { + "" { + #ignore empties e.g extra/trailing semicolon in params + } + 5 { + #DECSNM - reverse video + #How we simulate this to render within a block of text is an open question. + #track all SGR stacks and constantly flip based on the current SGR reverse state? + #It is the job of the calling loop to do this - so at this stage we'll just set the states + + if {$code_end eq "h"} { + #set (enable) + set reverse_mode 1 + } else { + #reset (disable) + set reverse_mode 0 + } + + } + 7 { + #DECAWM autowrap + if {$code_end eq "h"} { + #set (enable) + set autowrap_mode 1 + if {$opt_width ne "\uFFEF"} { + set overflow_idx $opt_width + } else { + #review - this is also the cursor position when adding a char at end of line? + set overflow_idx [expr {[llength $undercols]}] ;#index at which we would be *in* overflow a row move may still override it + } + #review - can idx ever be beyond overflow_idx limit when we change e.g with a width setting and cursor movements? + # presume not usually - but sanity check with warning for now. + if {$idx >= $overflow_idx} { + puts stderr "renderline warning - idx '$idx' >= overflow_idx '$overflow_idx' - unexpected" + } + } else { + #reset (disable) + set autowrap_mode 0 + #REVIEW! + set overflow_idx -1 + } + } + 25 { + if {$code_end eq "h"} { + #visible cursor + + } else { + #invisible cursor + + } + } + 117 { + #DECECM - Erase Color Mode + #https://invisible-island.net/ncurses/ncurses.faq.html + #The Erase color selection controls the background color used when text is erased or new + #text is scrolled on to the screen. Screen background causes newly erased areas or + #scrolled text to be written using color index zero, the screen background. This is VT + #and DECterm compatible. Text background causes erased areas or scrolled text to be + #written using the current text background color. This is PC console compatible and is + #the factory default. + + #see also: https://unix.stackexchange.com/questions/251726/clear-to-end-of-line-uses-the-wrong-background-color-in-screen + } + } + } + } + = { + set num [tcl::string::range $codenorm 5 end-1] ;#param between = and h|l + puts stderr "overtype::renderline CSI=...h|l code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + default { + #e.g CSI 4 h + set num [tcl::string::range $codenorm 4 end-1] ;#param before h|l + switch -exact -- $num { + 3 { + puts stderr "CRM MODE $code_end" + #CRM - Show control character mode + # 'No control functions are executed except LF,FF and VT which are represented in the CRM FONT before a CRLF(new line) is executed' + # + #use ansistring VIEW -nul 1 -lf 2 -ff 2 -vt 2 + #https://vt100.net/docs/vt510-rm/CRM.html + #NOTE - vt100 CRM always does auto-wrap at right margin. + #disabling auto-wrap in set-up or by sequence is disabled. + #We should default to turning off auto-wrap when crm_mode enabled.. but + #displaying truncated (on rhs) crm can still be very useful - and we have optimisation in overflow to avoid excess renderline calls (per grapheme) + #we therefore could reasonably put in an exception to allow auto_wrap to be disabled after crm_mode is engaged, + #although this would be potentially an annoying difference to some.. REVIEW + if {$code_end eq "h"} { + set crm_mode 1 + set autowrap_mode 1 + if {$opt_width ne "\uFFEF"} { + set overflow_idx $opt_width + } else { + #review - this is also the cursor position when adding a char at end of line? + set overflow_idx [expr {[llength $undercols]}] ;#index at which we would be *in* overflow a row move may still override it + } + } else { + set crm_mode 0 + } + } + 4 { + #IRM - Insert/Replace Mode + if {$code_end eq "h"} { + #CSI 4 h + set insert_mode 1 + } else { + #CSI 4 l + #replace mode + set insert_mode 0 + } + } + default { + puts stderr "overtype::renderline CSI...h|l code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + } + } + } + } + | { + switch -- [tcl::string::index $codenorm end-1] { + {$} { + #CSI ... $ | DECSCPP set columns per page- (recommended in vt510 docs as preferable to DECCOLM) + #real terminals generally only supported 80/132 + #some other virtuals support any where from 2 to 65,536? + #we will allow arbitrary widths >= 2 .. to some as yet undetermined limit. + #CSI $ | + #empty or 0 param is 80 for compatibility - other numbers > 2 accepted + set page_width -1 ;#flag as unset + if {$param eq ""} { + set page_width 80 + } elseif {[string is integer -strict $param] && $param >=2 0} { + set page_width [expr {$param}] ;#we should allow leading zeros in the number - but lets normalize using expr + } else { + puts stderr "overtype::renderline unacceptable DECSPP value '$param'" + } + + if {$page_width > 2} { + puts stderr "overtype::renderline DECSCPP - not implemented - but selected width '$page_width' looks ok" + #if cursor already beyond new page_width - will move to right colum - otherwise no cursor movement + + } + + } + default { + puts stderr "overtype::renderline unrecognised CSI code ending in pipe (|) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + } + } + default { + puts stderr "overtype::renderline CSI code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + } + } + 7ESC { + # + #re_other_single {\x1b(D|M|E)$} + #also vt52 Y.. + #also PM \x1b^...(ST) + switch -- [tcl::string::index $codenorm 4] { + c { + #RIS - reset terminal to initial state - where 'terminal' in this case is the renderspace - not the underlying terminal! + puts stderr "renderline reset" + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction reset + break + } + D { + #\x84 + #index (IND) + #vt102-docs: "Moves cursor down one line in same column. If cursor is at bottom margin, screen performs a scroll-up" + puts stderr "renderline ESC D not fully implemented" + incr cursor_row + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction down + #retain cursor_column + break + } + E { + #\x85 + #review - is behaviour different to lf? + #todo - possibly(?) same logic as handling above. i.e return instruction depends on where column_cursor is at the time we get NEL + #leave implementation until logic for is set in stone... still under review + #It's arguable NEL is a pure cursor movement as opposed to the semantic meaning of crlf or lf in a file. + # + #Next Line (NEL) "Move the cursor to the left margin on the next line. If the cursor is at the bottom margin, scroll the page up" + puts stderr "overtype::renderline ESC E unimplemented" + + } + H { + #\x88 + #Tab Set + puts stderr "overtype::renderline ESC H tab set unimplemented" + } + M { + #\x8D + #Reverse Index (RI) + #vt102-docs: "Moves cursor up one line in same column. If cursor is at top margin, screen performs a scroll-down" + puts stderr "overtype::renderline ESC M not fully implemented" + + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #move up + incr cursor_row -1 + if {$cursor_row < 1} { + set cursor_row 1 + } + #ensure rest of *overlay* is emitted to remainder + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction up ;#need instruction for scroll-down? + #retain cursor_column + break + } + N { + #\x8e - affects next character only + puts stderr "overtype::renderline single shift select G2 command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + O { + #\x8f - affects next character only + puts stderr "overtype::renderline single shift select G3 command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + P { + #\x90 + #DCS - shouldn't get here - handled in 7DCS branch + #similarly \] OSC (\x9d) and \\ (\x9c) ST + } + V { + #\x96 + + } + W { + #\x97 + } + X { + #\x98 + #SOS + if {[string index $code end] eq "\007"} { + set sos_content [string range $code 2 end-1] ;#ST is \007 + } else { + set sos_content [string range $code 2 end-2] ;#ST is \x1b\\ + } + #return in some useful form to the caller + #TODO! + lappend sos_list [list string $sos_content row $cursor_row column $cursor_column] + puts stderr "overtype::renderline ESCX SOS UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + ^ { + #puts stderr "renderline PM" + #Privacy Message. + if {[string index $code end] eq "\007"} { + set pm_content [string range $code 2 end-1] ;#ST is \007 + } else { + set pm_content [string range $code 2 end-2] ;#ST is \x1b\\ + } + #We don't want to render it - but we need to make it available to the application + #see the textblock library in punk, for the exception we make here for single backspace. + #It is unlikely to be encountered as a useful PM - so we hack to pass it through as a fix + #for spacing issues on old terminals which miscalculate the single-width 'Symbols for Legacy Computing' + if {$pm_content eq "\b"} { + #puts stderr "renderline PM sole backspace special handling for \U1FB00 - \U1FBFF" + #esc^\b\007 or esc^\besc\\ + #HACKY pass-through - targeting terminals that both mis-space legacy symbols *and* don't support PMs + #The result is repair of the extra space. If the terminal is a modern one and does support PM - the \b should be hidden anyway. + #If the terminal has the space problem AND does support PMs - then this just won't fix it. + #The fix relies on the symbol-supplier to cooperate by appending esc^\b\esc\\ to the problematic symbols. + + #priv::render_addchar $idx $code [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #idx has been incremented after last grapheme added + priv::render_append_to_char [expr {$idx -1}] $code + } + #lappend to a dict element in the result for application-specific processing + lappend pm_list $pm_content + } + _ { + #APC Application Program Command + #just warn for now.. + puts stderr "overtype::renderline ESC_ APC command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + default { + puts stderr "overtype::renderline ESC code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented codenorm:[ansistring VIEW -lf 1 -vt 1 -nul 1 $codenorm]" + } + } + + } + 7DCS - 8DCS { + puts stderr "overtype::renderline DCS - DEVICE CONTROL STRING command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + #ST (string terminator) \x9c or \x1b\\ + if {[tcl::string::index $codenorm end] eq "\x9c"} { + set code_content [tcl::string::range $codenorm 4 end-1] ;#ST is 8-bit 0x9c + } else { + set code_content [tcl::string::range $codenorm 4 end-2] ;#ST is \x1b\\ + } + + } + 7OSC - 8OSC { + # OSCs are terminated with ST of either \007 or \x1b\\ - we allow either whether code was 7 or 8 bit + if {[tcl::string::index $codenorm end] eq "\007"} { + set code_content [tcl::string::range $codenorm 4 end-1] ;#ST is \007 + } else { + set code_content [tcl::string::range $codenorm 4 end-2] ;#ST is \x1b\\ + } + set first_colon [tcl::string::first {;} $code_content] + if {$first_colon == -1} { + #there probably should always be a colon - but we'll try to make sense of it without + set osc_code $code_content ;#e.g \x1b\]104\007 vs \x1b\]104\;\007 + } else { + set osc_code [tcl::string::range $code_content 0 $first_colon-1] + } + switch -exact -- $osc_code { + 2 { + set newtitle [tcl::string::range $code_content 2 end] + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction [list set_window_title $newtitle] + break + } + 4 { + #OSC 4 - set colour palette + #can take multiple params + #e.g \x1b\]4\;1\;red\;2\;green\x1b\\ + set params [tcl::string::range $code_content 2 end] ;#strip 4 and first semicolon + set cmap [dict create] + foreach {cnum spec} [split $params {;}] { + if {$cnum >= 0 and $cnum <= 255} { + #todo - parse spec from names like 'red' to RGB + #todo - accept rgb:ab/cd/ef as well as rgb:/a/b/c (as alias for aa/bb/cc) + #also - what about rgb:abcd/defg/hijk and 12-bit abc/def/ghi ? + dict set cmap $cnum $spec + } else { + #todo - log + puts stderr "overtype::renderline OSC 4 set colour palette - bad color number: $cnum must be from 0 to 255. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + puts stderr "overtype::renderline OSC 4 set colour palette unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + + + } + 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 { + #OSC 10 through 17 - so called 'dynamic colours' + #can take multiple params - each successive parameter changes the next colour in the list + #- e.g if code started at 11 - next param is for 12. 17 takes only one param because there are no more + #10 change text foreground colour + #11 change text background colour + #12 change text cursor colour + #13 change mouse foreground colour + #14 change mouse background colour + #15 change tektronix foreground colour + #16 change tektronix background colour + #17 change highlight colour + set params [tcl::string::range $code_content 2 end] + + puts stderr "overtype::renderline OSC $osc_code set dynamic colours unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + + + } + 18 { + #why is this not considered one of the dynamic colours above? + #https://www.xfree86.org/current/ctlseqs.html + #tektronix cursor color + puts stderr "overtype::renderline OSC 18 - set tektronix cursor color unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + 99 { + #kitty desktop notifications + #https://sw.kovidgoyal.net/kitty/desktop-notifications/ + # 99 ; metadata ; payload + puts stderr "overtype::renderline OSC 99 kitty desktop notification unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + 104 { + #reset colour palette + #we want to do it for the current rendering context only (vvt) - not just pass through to underlying vt + puts stderr "overtype::renderline OSC 104 reset colour palette unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction [list reset_colour_palette] + break + } + 1337 { + #iterm2 graphics and file transfer + puts stderr "overtype::renderline OSC 1337 iterm2 graphics/file_transfer unimplemented. 1st 100 chars of code [ansistring VIEW -lf 1 -vt 1 -nul 1 [string range $code 0 99]]" + } + 5113 { + puts stderr "overtype::renderline OSC 5113 kitty file transfer unimplemented. 1st 100 chars of code [ansistring VIEW -lf 1 -vt 1 -nul 1 [string range $code 0 99]]" + } + default { + puts stderr "overtype::renderline OSC - UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + default { + } + } + + + } + default { + #don't need to handle sgr or gx0 types + #we have our sgr gx0 codes already in stacks for each overlay grapheme + } + } + } + + #-------- + if {$opt_expand_right == 0} { + #need to truncate to the width of the original undertext + #review - string_width vs printing_length here. undertext requirement to be already rendered therefore punk::char::string_width ok? + #set num_under_columns [punk::char::string_width $pt_underchars] ;#plaintext underchars + } + if {$overflow_idx == -1} { + #overflow was initially unlimited and hasn't been overridden + } else { + + } + #-------- + + + #coalesce and replay codestacks for outcols grapheme list + set outstring "" ;#output prior to overflow + set overflow_right "" ;#remainder after overflow point reached + set i 0 + set cstack [list] + set prevstack [list] + set prev_g0 [list] + #note overflow_idx may already have been set lower if we had a row move above due to \v or ANSI moves + set in_overflow 0 ;#used to stop char-width scanning once in overflow + if {$overflow_idx == 0} { + #how does caller avoid an infinite loop if they have autowrap on and keep throwing graphemes to the next line? REVIEW + set in_overflow 1 + } + set trailing_nulls 0 + foreach ch [lreverse $outcols] { + if {$ch eq "\u0000"} { + incr trailing_nulls + } else { + break + } + } + if {$trailing_nulls} { + set first_tail_null_posn [expr {[llength $outcols] - $trailing_nulls}] + } else { + set first_tail_null_posn -1 + } + + #puts stderr "first_tail_null_posn: $first_tail_null_posn" + #puts stderr "colview: [ansistring VIEW $outcols]" + + foreach ch $outcols { + #puts "---- [ansistring VIEW $ch]" + + set gxleader "" + if {$i < [llength $understacks_gx]} { + #set g0 [tcl::dict::get $understacks_gx $i] + set g0 [lindex $understacks_gx $i] + if {$g0 ne $prev_g0} { + if {$g0 eq [list "gx0_on"]} { + set gxleader "\x1b(0" + } else { + set gxleader "\x1b(B" + } + } + set prev_g0 $g0 + } else { + set prev_g0 [list] + } + + set sgrleader "" + if {$i < [llength $understacks]} { + #set cstack [tcl::dict::get $understacks $i] + set cstack [lindex $understacks $i] + if {$cstack ne $prevstack} { + if {[llength $prevstack] && ![llength $cstack]} { + #This reset is important e.g testfile fruit.ans - we get overhang on rhs without it. But why is cstack empty? + append sgrleader \033\[m + } else { + append sgrleader [punk::ansi::codetype::sgr_merge_list {*}$cstack] + } + } + set prevstack $cstack + } else { + set prevstack [list] + } + + + + if {$in_overflow} { + if {$i == $overflow_idx} { + set 0 [lindex $understacks_gx $i] + set gxleader "" + if {$g0 eq [list "gx0_on"]} { + set gxleader "\x1b(0" + } elseif {$g0 eq [list "gx0_off"]} { + set gxleader "\x1b(B" + } + append overflow_right $gxleader + set cstack [lindex $understacks $i] + set sgrleader "" + #whether cstack is same or differs from previous char's stack - we must have an output at the start of the overflow_right + #if {[llength $prevstack] && ![llength $cstack]} { + # append sgrleader \033\[m + #} + append sgrleader [punk::ansi::codetype::sgr_merge_list {*}$cstack] + append overflow_right $sgrleader + append overflow_right $ch + } else { + append overflow_right $gxleader + append overflow_right $sgrleader + append overflow_right $ch + } + } else { + if {$overflow_idx != -1 && $i+1 == $overflow_idx} { + #one before overflow + #will be in overflow in next iteration + set in_overflow 1 + if {[grapheme_width_cached $ch]> 1} { + #we overflowed with second-half of a double-width char - replace first-half with user-supplied exposition char (should be 1 wide) + set ch $opt_exposed1 + } + } + append outstring $gxleader + append outstring $sgrleader + if {$ch eq "\u0000"} { + if {$cp437_glyphs} { + #map all nulls including at tail to space + append outstring " " + } else { + if {$trailing_nulls && $i < $first_tail_null_posn} { + append outstring " " ;#map inner nulls to space + } else { + append outstring \u0000 + } + } + } else { + append outstring $ch + } + } + incr i + } + #flower.ans good test for null handling - reverse line building + #review - presence of overflow_right doesn't indicate line's trailing nulls should remain. + #The cells could have been erased? + #if {!$cp437_glyphs} { + # #if {![ansistring length $overflow_right]} { + # # set outstring [tcl::string::trimright $outstring "\u0000"] + # #} + # set outstring [tcl::string::trimright $outstring "\u0000"] + # set outstring [tcl::string::map {\u0000 " "} $outstring] + #} + + + #REVIEW + #set overflow_right [tcl::string::trimright $overflow_right "\u0000"] + #set overflow_right [tcl::string::map {\u0000 " "} $overflow_right] + + set replay_codes "" + if {[llength $understacks] > 0} { + if {$overflow_idx == -1} { + #set tail_idx [tcl::dict::size $understacks] + set tail_idx [llength $understacks] + } else { + set tail_idx [llength $undercols] + } + if {$tail_idx-1 < [llength $understacks]} { + #set replay_codes [join [lindex $understacks $tail_idx-1] ""] ;#tail replay codes + set replay_codes [punk::ansi::codetype::sgr_merge_list {*}[lindex $understacks $tail_idx-1]] ;#tail replay codes + } + if {$tail_idx-1 < [llength $understacks_gx]} { + set gx0 [lindex $understacks_gx $tail_idx-1] + if {$gx0 eq [list "gx0_on"]} { + #if it was on, turn gx0 off at the point we stop processing overlay + append outstring "\x1b(B" + } + } + } + if {[string length $overflow_right]} { + #puts stderr "remainder:$overflow_right" + } + #pdict $understacks + + if {[punk::ansi::ta::detect_sgr $outstring]} { + append outstring [punk::ansi::a] ;#without this - we would get for example, trailing backgrounds after rightmost column + + #close off any open gx? + #probably should - and overflow_right reopen? + } + + if {$opt_returnextra} { + #replay_codes is the codestack at the boundary - used for ellipsis colouring to match elided text - review + #replay_codes_underlay is the set of codes in effect at the very end of the original underlay + + #review + #replay_codes_overlay is the set of codes in effect at the very end of the original overlay (even if not all overlay was applied) + #todo - replay_codes for gx0 mode + + #overflow_idx may change during ansi & character processing + if {$overflow_idx == -1} { + set overflow_right_column "" + } else { + set overflow_right_column [expr {$overflow_idx+1}] + } + set result [tcl::dict::create\ + result $outstring\ + visualwidth [punk::ansi::printing_length $outstring]\ + instruction $instruction\ + stringlen [string length $outstring]\ + overflow_right_column $overflow_right_column\ + overflow_right $overflow_right\ + unapplied $unapplied\ + unapplied_list $unapplied_list\ + insert_mode $insert_mode\ + autowrap_mode $autowrap_mode\ + crm_mode $crm_mode\ + reverse_mode $reverse_mode\ + insert_lines_above $insert_lines_above\ + insert_lines_below $insert_lines_below\ + cursor_saved_position $cursor_saved_position\ + cursor_saved_attributes $cursor_saved_attributes\ + cursor_column $cursor_column\ + cursor_row $cursor_row\ + expand_right $opt_expand_right\ + replay_codes $replay_codes\ + replay_codes_underlay $replay_codes_underlay\ + replay_codes_overlay $replay_codes_overlay\ + pm_list $pm_list\ + ] + if {$opt_returnextra == 1} { + #puts stderr "renderline: $result" + return $result + } else { + #human/debug - map special chars to visual glyphs + set viewop VIEW + switch -- $opt_returnextra { + 2 { + #codes and character data + set viewop VIEWCODES ;#ansi colorisation of codes - green for SGR, blue/blue reverse for cursor_save/cursor_restore, cyan for movements, orange for others + } + 3 { + set viewop VIEWSTYLE ;#ansi colorise the characters within the output with preceding codes, stacking codes only within each dict value - may not be same SGR effect as the effect in-situ. + } + } + tcl::dict::set result result [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result result]] + tcl::dict::set result overflow_right [ansistring VIEW -lf 1 -vt 1 [tcl::dict::get $result overflow_right]] + tcl::dict::set result unapplied [ansistring VIEW -lf 1 -vt 1 [tcl::dict::get $result unapplied]] + tcl::dict::set result unapplied_list [ansistring VIEW -lf 1 -vt 1 [tcl::dict::get $result unapplied_list]] + tcl::dict::set result replay_codes [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result replay_codes]] + tcl::dict::set result replay_codes_underlay [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result replay_codes_underlay]] + tcl::dict::set result replay_codes_overlay [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result replay_codes_overlay]] + tcl::dict::set result cursor_saved_attributes [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result cursor_saved_attributes]] + return $result + } + } else { + #puts stderr "renderline returning: result $outstring instruction $instruction unapplied $unapplied overflow_right $overflow_right" + return $outstring + } + #return [join $out ""] + } + + #*** !doctools + #[list_end] [comment {--- end definitions namespace overtype ---}] +} + +tcl::namespace::eval overtype::piper { + proc overcentre {args} { + if {[llength $args] < 2} { + error {usage: ?-bias left|right? ?-transparent [0|1|]? ?-exposed1 ? ?-exposed2 ? ?-overflow [1|0]? overtext pipelinedata} + } + lassign [lrange $args end-1 end] over under + set argsflags [lrange $args 0 end-2] + tailcall overtype::centre {*}$argsflags $under $over + } + proc overleft {args} { + if {[llength $args] < 2} { + error {usage: ?-startcolumn ? ?-transparent [0|1|]? ?-exposed1 ? ?-exposed2 ? ?-overflow [1|0]? overtext pipelinedata} + } + lassign [lrange $args end-1 end] over under + set argsflags [lrange $args 0 end-2] + tailcall overtype::left {*}$argsflags $under $over + } +} + + +# -- --- --- --- --- --- --- --- --- --- --- +proc overtype::transparentline {args} { + foreach {under over} [lrange $args end-1 end] break + set argsflags [lrange $args 0 end-2] + set defaults [tcl::dict::create\ + -transparent 1\ + -exposed 1 " "\ + -exposed 2 " "\ + ] + set newargs [tcl::dict::merge $defaults $argsflags] + tailcall overtype::renderline {*}$newargs $under $over +} +#renderline may not make sense as it is in the long run for blocks of text - but is handy in the single-line-handling form anyway. +# We are trying to handle ansi codes in a block of text which is acting like a mini-terminal in some sense. +#We can process standard cursor moves such as \b \r - but no way to respond to other cursor movements e.g moving to other lines. +# +tcl::namespace::eval overtype::piper { + proc renderline {args} { + if {[llength $args] < 2} { + error {usage: ?-start ? ?-transparent [0|1|]? ?-overflow [1|0]? overtext pipelinedata} + } + foreach {over under} [lrange $args end-1 end] break + set argsflags [lrange $args 0 end-2] + tailcall overtype::renderline {*}$argsflags $under $over + } +} +interp alias "" piper_renderline "" overtype::piper::renderline + +#intended primarily for single grapheme - but will work for multiple +#WARNING: query CAN contain ansi or newlines - but if cache was not already set manually,the answer will be incorrect! +#We deliberately allow this for PM/SOS attached within a column +#(a cache of ansifreestring_width calls - as these are quite regex heavy) +proc overtype::grapheme_width_cached {ch} { + variable grapheme_widths + if {[tcl::dict::exists $grapheme_widths $ch]} { + return [tcl::dict::get $grapheme_widths $ch] + } + set width [punk::char::ansifreestring_width $ch] + tcl::dict::set grapheme_widths $ch $width + return $width +} + + + +proc overtype::test_renderline {} { + set t \uFF5E ;#2-wide tilde + set u \uFF3F ;#2-wide underscore + set missing \uFFFD + return [list $t $u A${t}B] +} + +#maintenance warning +#same as textblock::size - but we don't want that circular dependency +#block width and height can be tricky. e.g \v handled differently on different terminal emulators and can affect both +proc overtype::blocksize {textblock} { + if {$textblock eq ""} { + return [tcl::dict::create width 0 height 1] ;#no such thing as zero-height block - for consistency with non-empty strings having no line-endings + } + if {[tcl::string::first \t $textblock] >= 0} { + if {[info exists punk::console::tabwidth]} { + set tw $::punk::console::tabwidth + } else { + set tw 8 + } + set textblock [textutil::tabify::untabify2 $textblock $tw] + } + #ansistrip on entire block in one go rather than line by line - result should be the same - review - make tests + if {[punk::ansi::ta::detect $textblock]} { + set textblock [punk::ansi::ansistrip $textblock] + } + if {[tcl::string::last \n $textblock] >= 0} { + set num_le [expr {[tcl::string::length $textblock]-[tcl::string::length [tcl::string::map {\n {}} $textblock]]}] ;#faster than splitting into single-char list + set width [tcl::mathfunc::max {*}[lmap v [split $textblock \n] {::punk::char::ansifreestring_width $v}]] + } else { + set num_le 0 + set width [punk::char::ansifreestring_width $textblock] + } + #our concept of block-height is likely to be different to other line-counting mechanisms + set height [expr {$num_le + 1}] ;# one line if no le - 2 if there is one trailing le even if no data follows le + + return [tcl::dict::create width $width height $height] ;#maintain order in 'image processing' standard width then height - caller may use lassign [dict values [blocksize ]] width height +} + +tcl::namespace::eval overtype::priv { + variable cache_is_sgr [tcl::dict::create] + + #we are likely to be asking the same question of the same ansi codes repeatedly + #caching the answer saves some regex expense - possibly a few uS to lookup vs under 1uS + #todo - test if still worthwhile after a large cache is built up. (limit cache size?) + proc is_sgr {code} { + variable cache_is_sgr + if {[tcl::dict::exists $cache_is_sgr $code]} { + return [tcl::dict::get $cache_is_sgr $code] + } + set answer [punk::ansi::codetype::is_sgr $code] + tcl::dict::set cache_is_sgr $code $answer + return $answer + } + # better named render_to_unapplied? + proc render_unapplied {overlay_grapheme_control_list gci} { + upvar idx_over idx_over + upvar unapplied unapplied + upvar unapplied_list unapplied_list ;#maintaining as a list allows caller to utilize it without having to re-split + upvar overstacks overstacks + upvar overstacks_gx overstacks_gx + upvar overlay_grapheme_control_stacks og_stacks + + #set unapplied [join [lrange $overlay_grapheme_control_list $gci+1 end]] + set unapplied "" + set unapplied_list [list] + #append unapplied [join [lindex $overstacks $idx_over] ""] + #append unapplied [punk::ansi::codetype::sgr_merge_list {*}[lindex $overstacks $idx_over]] + set sgr_merged [punk::ansi::codetype::sgr_merge_list {*}[lindex $og_stacks $gci]] + if {$sgr_merged ne ""} { + lappend unapplied_list $sgr_merged + } + switch -- [lindex $overstacks_gx $idx_over] { + "gx0_on" { + lappend unapplied_list "\x1b(0" + } + "gx0_off" { + lappend unapplied_list "\x1b(B" + } + } + + foreach gc [lrange $overlay_grapheme_control_list $gci+1 end] { + lassign $gc type item + #types g other sgr gx0 + if {$type eq "gx0"} { + if {$item eq "gx0_on"} { + lappend unapplied_list "\x1b(0" + } elseif {$item eq "gx0_off"} { + lappend unapplied_list "\x1b(B" + } + } else { + lappend unapplied_list $item + } + } + set unapplied [join $unapplied_list ""] + } + + #clearer - renders the specific gci forward as unapplied - prefixed with it's merged sgr stack + proc render_this_unapplied {overlay_grapheme_control_list gci} { + upvar idx_over idx_over + upvar unapplied unapplied + upvar unapplied_list unapplied_list + upvar overstacks overstacks + upvar overstacks_gx overstacks_gx + upvar overlay_grapheme_control_stacks og_stacks + + #set unapplied [join [lrange $overlay_grapheme_control_list $gci+1 end]] + set unapplied "" + set unapplied_list [list] + + set sgr_merged [punk::ansi::codetype::sgr_merge_list {*}[lindex $og_stacks $gci]] + if {$sgr_merged ne ""} { + lappend unapplied_list $sgr_merged + } + switch -- [lindex $overstacks_gx $idx_over] { + "gx0_on" { + lappend unapplied_list "\x1b(0" + } + "gx0_off" { + lappend unapplied_list "\x1b(B" + } + } + + foreach gc [lrange $overlay_grapheme_control_list $gci end] { + lassign $gc type item + #types g other sgr gx0 + if {$type eq "gx0"} { + if {$item eq "gx0_on"} { + lappend unapplied_list "\x1b(0" + } elseif {$item eq "gx0_off"} { + lappend unapplied_list "\x1b(B" + } + } else { + lappend unapplied_list $item + } + } + set unapplied [join $unapplied_list ""] + } + proc render_delchar {i} { + upvar outcols o + upvar understacks ustacks + upvar understacks_gx gxstacks + set nxt [llength $o] + if {$i < $nxt} { + set o [lreplace $o $i $i] + set ustacks [lreplace $ustacks $i $i] + set gxstacks [lreplace $gxstacks $i $i] + } elseif {$i == 0 || $i == $nxt} { + #nothing to do + } else { + puts stderr "render_delchar - attempt to delchar at index $i >= number of outcols $nxt - shouldn't happen" + } + } + proc render_erasechar {i count} { + upvar outcols o + upvar understacks ustacks + upvar understacks_gx gxstacks + upvar replay_codes_overlay replay + #ECH clears character attributes from erased character positions + #ECH accepts 0 or empty parameter, which is equivalent to 1. Caller of render_erasechar should do that mapping and only supply 1 or greater. + if {![tcl::string::is integer -strict $count] || $count < 1} { + error "render_erasechar count must be integer >= 1" + } + set start $i + set end [expr {$i + $count -1}] + #we restrict ECH to current line - as some terminals do - review - is that the only way it's implemented? + if {$i > [llength $o]-1} { + return + } + if {$end > [llength $o]-1} { + set end [expr {[llength $o]-1}] + } + set num [expr {$end - $start + 1}] + set o [lreplace $o $start $end {*}[lrepeat $num \u0000]] ;#or space? + #DECECM ??? + set ustacks [lreplace $ustacks $start $end {*}[lrepeat $num [list $replay]]] + set gxstacks [lreplace $gxstacks $start $end {*}[lrepeat $num [list]]] ;# ??? review + return + } + proc render_setchar {i c } { + upvar outcols o + lset o $i $c + } + + #Initial usecase is for old-terminal hack to add PM-wrapped \b + #review - can be used for other multibyte sequences that occupy one column? + #combiners? diacritics? + proc render_append_to_char {i c} { + upvar outcols o + if {$i > [llength $o]-1} { + error "render_append_to_char cannot append [ansistring VIEW -lf 1 -nul 1 $c] to existing char at index $i while $i >= llength outcols [llength $o]" + } + set existing [lindex $o $i] + if {$existing eq "\0"} { + lset o $i $c + } else { + lset o $i $existing$c + } + } + #is actually addgrapheme? + proc render_addchar {i c sgrstack gx0stack {insert_mode 0}} { + upvar outcols o + upvar understacks ustacks + upvar understacks_gx gxstacks + + # -- --- --- + #this is somewhat of a hack.. probably not really the equivalent of proper reverse video? review + #we should ideally be able to reverse the video of a sequence that already includes SGR reverse/noreverse attributes + upvar reverse_mode do_reverse + #if {$do_reverse} { + # lappend sgrstack [a+ reverse] + #} else { + # lappend sgrstack [a+ noreverse] + #} + + #JMN3 + if {$do_reverse} { + #note we can't just look for \x1b\[7m or \x1b\[27m + # it may be a more complex sequence like \x1b\[0\;\;7\;31m etc + + set existing_reverse_state 0 + set codeinfo [punk::ansi::codetype::sgr_merge $sgrstack -info 1] + set codestate_reverse [dict get $codeinfo codestate reverse] + switch -- $codestate_reverse { + 7 { + set existing_reverse_state 1 + } + 27 { + set existing_reverse_state 0 + } + "" { + } + } + if {$existing_reverse_state == 0} { + set rflip [a+ reverse] + } else { + #reverse of reverse + set rflip [a+ noreverse] + } + #note that mergeresult can have multiple esc (due to unmergeables or non sgr codes) + set sgrstack [list [dict get $codeinfo mergeresult] $rflip] + #set sgrstack [punk::ansi::codetype::sgr_merge [list [dict get $codeinfo mergeresult] $rflip]] + } + + # -- --- --- + + set nxt [llength $o] + if {!$insert_mode} { + if {$i < $nxt} { + #These lists must always be in sync + lset o $i $c + } else { + lappend o $c + } + if {$i < [llength $ustacks]} { + lset ustacks $i $sgrstack + lset gxstacks $i $gx0stack + } else { + lappend ustacks $sgrstack + lappend gxstacks $gx0stack + } + } else { + #insert of single-width vs double-width when underlying is double-width? + if {$i < $nxt} { + set o [linsert $o $i $c] + } else { + lappend o $c + } + if {$i < [llength $ustacks]} { + set ustacks [linsert $ustacks $i $sgrstack] + set gxstacks [linsert $gxstacks $i $gx0stack] + } else { + lappend ustacks $sgrstack + lappend gxstacks $gx0stack + } + } + } + +} + + + +# -- --- --- --- --- --- --- --- --- --- --- +tcl::namespace::eval overtype { + interp alias {} ::overtype::center {} ::overtype::centre +} + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Ready +package provide overtype [tcl::namespace::eval overtype { + variable version + set version 1.6.6 +}] +return + +#*** !doctools +#[manpage_end] diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/test/tomlish-1.1.5.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/test/tomlish-1.1.5.tm index 536e3fa3..35de5e70 100644 Binary files a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/test/tomlish-1.1.5.tm and b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/test/tomlish-1.1.5.tm differ diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/tomlish-1.1.6.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/tomlish-1.1.6.tm new file mode 100644 index 00000000..dddcd0bb --- /dev/null +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/tomlish-1.1.6.tm @@ -0,0 +1,8408 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) 2024 +# +# @@ Meta Begin +# Application tomlish 1.1.6 +# Meta platform tcl +# Meta license +# @@ Meta End + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# doctools header +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[manpage_begin tomlish_module_tomlish 0 1.1.6] +#[copyright "2024"] +#[titledesc {tomlish toml parser}] [comment {-- Name section and table of contents description --}] +#[moddesc {tomlish}] [comment {-- Description at end of page heading --}] +#[require tomlish] +#[keywords module parsing toml configuration] +#[description] +#[para] tomlish is an intermediate representation of toml data in a tree structure (tagged lists representing type information) +#[para] The design goals are for tomlish to be whitespace and comment preserving ie byte-for byte preservation during roundtrips from toml to tomlish and back to toml +#[para] The tomlish representation can then be converted to a Tcl dict structure or to other formats such as json, +#[para] although these other formats are generally unlikely to retain whitespace or comments +#[para] The other formats also won't preserve roundtripability e.g \t and a literal tab coming from a toml file will be indistinguishable. +#[para] A further goal is to allow at least a useful subset of in-place editing operations which also preserve whitespace and comments. +#[para] e.g leaf key value editing, and table reordering/sorting, key-renaming at any level, key insertions/deletions +#[para] The API for editing (tomldoc object?) may require explicit setting of type if accessing an existing key +#[para] e.g setting a key that already exists and is a different type (especially if nested structure such as a table or array) +#[para] will need a -type option (-force ?) to force overriding with another type such as an int. + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section Overview] +#[para] overview of tomlish +#[subsection Concepts] +#[para] - + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[subsection dependencies] +#[para] packages used by tomlish +#[list_begin itemized] + +package require Tcl 8.6- +package require struct::stack +package require logger + +#*** !doctools +#[item] [package {Tcl 8.6-}] +#[item] [package {struct::stack}] + +#limit ourselves to clear, destroy, peek, pop, push, rotate, or size (e.g v 1.3 does not implement 'get') + + +#*** !doctools +#[list_end] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section API] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# Base namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval tomlish { + namespace export {[a-z]*}; # Convention: export all lowercase + variable types + + #*** !doctools + #[subsection {Namespace tomlish}] + #[para] Core API functions for tomlish + #[list_begin definitions] + + #default interp recursionlimit of 1000 is insufficient to pass 1000 deep nested structures as in certain toml tests. + #e.g https://github.com/iarna/toml-spec-tests/tree/latest/values + #1000 seems deep for a 'configuration' format - but toml sometimes used for other serialisation purposes. + #todo - review + set existing_recursionlimit [interp recursionlimit {}] + if {$existing_recursionlimit < 5000} { + interp recursionlimit {} 5000 + } + + #IDEAS: + # since get_toml produces tomlish with whitespace/comments intact: + # tomldoc object - allow (at least basic?) editing of toml whilst preserving comments & whitespace + # - setKey (set leaf only to value) how to specify type? -type option? - whole array vs index into arrays and further nested objects? - option for raw toml additions? + # - separate addKey?? + # - deleteKey (delete leaf) + # - deleteTable (delete table - if only has leaves? - option to delete with child tables?) + # - set/add Table? - position in doc based on existing tables/subtables? + + #The tomlish intermediate representation allows things such as sorting the toml document by table name or other re-ordering of tables - + # because the tables include subkeys, comments and newlines within their structure - those elements all come along with it nicely during reordering. + #The same goes for the first newline following a keyval e.g x=1 \ny=2\n\n + #The newline is part of the keyval structure so makes reordering easier + #example from_toml "a=1\nb=2\n\n\n" + # 0 = TOMLISH + # 1 = KEY a = {INT 1} {NEWLINE lf} + # 2 = NEWLINE lf + # 3 = KEY b = {INT 2} {NEWLINE lf} + # 4 = NEWLINE lf + # 5 = NEWLINE lf + + #This reordering idea is complicated by the nature of tablearrays - especially as a table header references last tablearrayname, + # and duplicate table headers are allowed in that context. + #e.g + #[[fruits]] + # name="apple" + # [fruits.metadata] + # id=1 + # + #[unrelated1] + # + #[[fruits]] + # name="pear" + # + #[unrelated2] + # silly="ordering" + # + #[fruits.metadata] + #id=2 + #The TABLEARRAY record can't be completely selfcontained on the default parsing mechanism - because it is legal (though not recommended) to have unrelated tables in between. + #If we were to 'insert' later related records (such as the 2nd [fruits.metadata] above) into the TABLEARRAY structure - then, even though it might produce 'nicer' toml, + # we would lose roundtripability toml->tomlish->toml + # ----------------------------------------------------- + #REVIEW + #todo - some sort of 'normalize'/'grouping' function on tomlish that at least makes records self-contained, and perhaps then (optionally) reorders resulting records sensibly. + #such a function on the tomlish may work - although it would be unwise to duplicate the validation aspects of dict::from_tomlish + #The most practical way might be to use dict::from_tomlish followed by from_dict - but that would lose comment info and formatting. + #In the above example - The decision by the toml author to put [unrelated1] between related tablearrays should be respected, + #but the positioning of [unrelated2] between a tablearray and one of its contained tables is suspect. + #Both [fruits.metadata] table records should theoretically be added as children to their corresponding [[fruits]] tablearray record in the tomlish. (just as their name keys are) + # ----------------------------------------------------- + + + + #ARRAY is analogous to a Tcl list + #TABLE is analogous to a Tcl dict + #WS = inline whitespace + #KEY = bare key and value + #DQKEY = double quoted key and value + #SQKEY = single quoted key and value + #ITABLE = inline table (*can* be anonymous table) + # inline table values immediately create a table with the opening brace + # inline tables are fully defined between their braces, as are dotted-key subtables defined within + # No additional subtables or arrays of tables may be defined within an inline table after the ending brace - they must be entirely self-contained + + set tags [list TOMLISH BOM ARRAY TABLE ITABLE TABLEARRAY WS NEWLINE COMMENT DOTTEDKEY KEY DQKEY SQKEY STRING STRINGPART MULTISTRING LITERAL LITERALPART MULTILITERAL INT FLOAT BOOL] + #DDDD + lappend tags {*}[list\ + DATETIME\ + DATETIME-LOCAL\ + DATE-LOCAL\ + TIME-LOCAL\ + ] + + #removed - ANONTABLE + #tomlish v1.0 should accept arbitrary 64-bit signed ints (from -2^63 to 2^63-1) + #we will restrict to this range for compatibility for now - although Tcl can handle larger (arbitrarily so?) + #todo - configurable - allow empty string for 'unlimited' + set min_int -9223372036854775808 ;#-2^63 + set max_int +9223372036854775807 ;#2^63-1 + + proc Dolog {lvl txt} { + #return "$lvl -- $txt" + set msg "[clock format [clock seconds] -format "%Y-%m-%dT%H:%M:%S"] tomlish '$txt'" + puts stderr $msg + } + logger::initNamespace ::tomlish + foreach lvl [logger::levels] { + interp alias {} tomlish_log_$lvl {} ::tomlish::Dolog $lvl + log::logproc $lvl tomlish_log_$lvl + } + + + proc tags {} { + return $::tomlish::tags + } + + proc get_dottedkey_info {dottedkeyrecord} { + set key_hierarchy [list] + set key_hierarchy_raw [list] + if {[lindex $dottedkeyrecord 0] ne "DOTTEDKEY"} { + error "tomlish::get_dottedkey_info error. Supplied list doesn't appear to be a DOTTEDKEY (tag: [lindex $dottedkeyrecord 0])" + } + set compoundkeylist [lindex $dottedkeyrecord 1] + set expect_sep 0 + foreach part $compoundkeylist { + set parttag [lindex $part 0] + if {$parttag eq "WS"} { + continue + } + if {$expect_sep} { + if {$parttag ne "DOTSEP"} { + error "DOTTEDKEY missing dot separator between parts. '$dottedkeyrecord'" + } + set expect_sep 0 + } else { + set val [lindex $part 1] + switch -exact -- $parttag { + KEY { + lappend key_hierarchy $val + lappend key_hierarchy_raw $val + } + DQKEY { + #REVIEW unescape or not? + #JJJJ + lappend key_hierarchy [::tomlish::utils::unescape_string $val] + lappend key_hierarchy_raw \"$val\" + } + SQKEY { + lappend key_hierarchy $val + lappend key_hierarchy_raw "'$val'" + } + default { + error "tomlish::get_dottedkey_info DOTTED key unexpected part '$parttag' - ensure dot separator is between key parts. '$compoundkeylist'" + } + } + set expect_sep 1 + } + } + return [dict create keys $key_hierarchy keys_raw $key_hierarchy_raw] + } + + #helper function for tomlish::dict::from_tomlish + proc _get_keyval_value {keyval_element} { + #e.g + #DOTTEDKEY {{KEY a} {WS { }}} = {WS { }} {ARRAY {INT 1} SEP {ITABLE {DOTTEDKEY {{KEY x}} = {INT 1} SEP} {DOTTEDKEY {{KEY y}} = {INT 2}}}} + + log::notice ">>> _get_keyval_value from '$keyval_element'<<<" + #find the value (or 2 values if space separated datetime - and stitch back into one) + # 3 is the earliest index at which the value could occur (depending on whitespace) + if {[lindex $keyval_element 2] ne "="} { + error "tomlish _get_keyval_value keyval_element doesn't seem to be a properly structured { = } list\n $keyval_element" + } + + #review + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info + } else { + set tablenames_info [dict create] ;#keys are @@ paths {@@parenttable @@arrayable @@etc} corresponding to parenttable.arraytable[].etc + #value is a dict with keys such as ttype, tdefined + } + set sublist [lrange $keyval_element 3 end] ;# rhs of = + + set values [list] + set value_posns [list] + set posn 0 + foreach sub $sublist { + #note that a barekey/dquotedkey won't occur directly inside a barekey/dquotedkey + #DDDD + switch -exact -- [lindex $sub 0] { + STRING - LITERAL - MULTISTRING - MULTILITERAL - INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - TABLE - ARRAY - ITABLE { + lappend values $sub + lappend value_posns $posn + } + DOTTEDKEY { + #we should never see DOTTEDKEY as a toplevel element on RHS + #sanity check in case manually manipulated tomlish - or something went very wrong + set msg "tomlish::_get_keyval_value Unexpected toplevel value element DOTTEDKEY after =" + return -code error -errorcode {TOMLISH SYNTAX UNEXPECTEDDOTTEDKEYRHS} $msg + } + WS - NEWLINE - COMMENT {} + SEP {} + default { + set msg "tomlish::_get_keyval_value Unexpected toplevel value element [lindex $sub 0] after =" + return -code error -errorcode {TOMLISH SYNTAX UNEXPECTED} $msg + } + } + incr posn + } + switch -- [llength $values] { + 0 { + error "tomlish Failed to find value element in KEY. '$keyval_element'" + } + 1 { + lassign [lindex $values 0] type value + } + 2 { + #we generally expect a single 'value' item on RHS of = + #(ignoring WS,NEWLINE,SEP + #(either a simple type, or a container which has multiple values inside) + #exception for space separated datetime which is two toplevel values + + #validate than exactly single space was between the two values + lassign $value_posns p1 p2 + if {$p2 != $p1 +2} { + #sanity check + #can probably only get here through manual manipulation of the tomlish list to an unprocessable form + error "tomlish KEY appears to have more than one part - but not separated by whitespace - invalid '$keyval_element'" + } + set between_token [lindex $sublist $p1+1] + if {[lindex $between_token 1] ne " "} { + error "tomlish KEY in 2 parts is not separated by a single space - cannot consider for datetime '$keyval_element'" + } + lassign [lindex $values 0] type_d1 value_d1 + lassign [lindex $values 1] type_d2 value_d2 + #DDDD + if {$type_d1 ne "DATE-LOCAL" || $type_d2 ni {DATETIME TIME-LOCAL}} { + #we reuse DATETIME tag for standalone time with tz offset (or zZ) + error "tomlish KEY in 2 parts does not appear to be datetime '$keyval_element'" + } + if {$type_d2 eq "TIME-LOCAL"} { + set type DATETIME-LOCAL + } else { + #extra check that 2nd part is actually a time + if {![tomlish::utils::is_timepart $value_d2]} { + error "tomlish KEY in 2 parts does not appear to be datetime. (part 2 not a time value) '$keyval_element'" + } + set type DATETIME + } + set value "${value_d1}T${value_d2}" + } + default { + error "tomlish Found multiple value elements in KEY, expected one. (or 2 for space-separated datetime) '$keyval_element'" + } + } + set sub_tablenames_info [dict create] + switch -exact -- $type { + INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + #DDDD + #simple (non-container, no-substitution) datatype + set result [list type $type value $value] + } + STRING - STRINGPART { + #JJJ + #!!! review + #set result [list type $type value [::tomlish::utils::unescape_string $value]] + set result [list type $type value $value] + } + LITERAL - LITERALPART { + #REVIEW + set result [list type $type value $value] + } + TABLE { + #invalid? + error "tomlish _get_keyval_value invalid to have type TABLE on rhs of =" + } + ITABLE { + #This one should not be returned as a type value structure! + # + set prev_tablenames_info $tablenames_info + set tablenames_info [dict create] + set result [::tomlish::dict::from_tomlish [ list [lindex $values 0] ]] + set sub_tablenames_info $tablenames_info + set tablenames_info $prev_tablenames_info + } + ARRAY { + #we need to recurse to get the corresponding dict for the contained item(s) + #pass in the whole [lindex $values 0] (type val) - not just the $value! + set prev_tablenames_info $tablenames_info + set tablenames_info [dict create] + set result [list type $type value [ ::tomlish::dict::from_tomlish [ list [lindex $values 0] ] ]] + set sub_tablenames_info $tablenames_info + set tablenames_info $prev_tablenames_info + } + MULTISTRING - MULTILITERAL { + #review - mapping these to STRING might make some conversions harder? + #if we keep the MULTI - we know we have to look for newlines for example when converting to json + #without specific types we'd have to check every STRING - and lose info about how best to map chars within it + set result [list type $type value [ ::tomlish::dict::from_tomlish [ list [lindex $values 0] ] ]] + } + default { + error "tomlish Unexpected value type '$type' found in keyval '$keyval_element'" + } + } + return [dict create result $result tablenames_info $sub_tablenames_info] + } + + + proc to_dict {tomlish} { + tomlish::dict::from_tomlish $tomlish + } + + + + proc _from_dictval_tomltype {parents tablestack keys typeval} { + set type [dict get $typeval type] + set val [dict get $typeval value] + #These are the restricted sets of typed used in the tomlish::dict representation + #They are a subset of the types in tomlish: data types plus ARRAY, arranged in a dictionary form. + #The container types: ITABLE, TABLE, TABLEARRAY are not used as they are represented as dictionary keys and ARRAY items. + #The WS, COMMENT, and NEWLINE elements are also unrepresented in the dict structure. + switch -- $type { + ARRAY { + set subitems [list] + foreach item $val { + lappend subitems [_from_dictval [list {*}$parents ARRAY] $tablestack $keys $item] SEP + } + if {[lindex $subitems end] eq "SEP"} { + set subitems [lrange $subitems 0 end-1] + } + return [list ARRAY {*}$subitems] + } + ITABLE { + error "not applicable" + if {$val eq ""} { + return ITABLE + } else { + return [_from_dictval [list {*}$parents ITABLE] $tablestack $keys $val] + } + } + STRING { + #JSJS + #if our dict came from json - we have already decided what type of STRING/LITERAL etc to use when building the dict + + #do not validate like this - important that eg json val\\ue -> dict val\ue -> tomlish/toml val\\ue + #see toml-tests + #if {![tomlish::utils::rawstring_is_valid_tomlstring $val]} { + # #todo? + # return -code error -errorcode {TOML SYNTAX INVALIDSTRING} "Unescaped controls in string or invalid escapes" + #} + return [list STRING [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $val]] + } + MULTISTRING { + #value is a raw string that isn't encoded as tomlish + #create a valid toml snippet with the raw value and decode it to the proper tomlish MULTISTRING format + #We need to convert controls in $val to escape sequences - except for newlines + # + #consider an *option* to reformat for long lines? (perhaps overcomplex - byte equiv - but may fold in ugly places) + #we could use a line-length limit to decide when to put in a "line ending backslash" + #and even format it with a reasonable indent so that proper CONT and WS entries are made (?) REVIEW + # + #TODO + set tomlpart "x=\"\"\"\\\n" + append tomlpart [tomlish::utils::rawstring_to_MultiBstring_with_escaped_controls $val] + append tomlpart "\"\"\"" + set tomlish [tomlish::from_toml $tomlpart] + #e.g if val = " etc\nblah" + #TOMLISH {DOTTEDKEY {{KEY x}} = {MULTISTRING CONT {NEWLINE LF} {WS { }} {STRINGPART etc} {NEWLINE lf} {STRINGPART blah} } } + #lindex 1 3 is the MULTISTRING tomlish list + return [lindex $tomlish 1 3] + } + MULTILITERAL { + #MLL string can contain newlines - but still no control chars + #todo - validate - e.g val can't contain more than 2 squotes in a row + if {[string first ''' $val] >=0} { + set msg "_from_dictval_tomltype error: more than 2 single quotes in a row found in MULTILITERAL - cannot encode dict to TOML-VALID TOMLISH" + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + } + + #rawstring_is_valid_multiliteral - allow newlines as lf or crlf - but not bare cr + if {![tomlish::utils::rawstring_is_valid_multiliteral $val]} { + return -code error -errorcode {TOML SYNTAX INVALIDMULTILITERAL} "Controls other than tab or newlines found in multiliteral" + } + + set tomlpart "x='''\n" + append tomlpart $val ''' + set tomlish [tomlish::from_toml $tomlpart] + return [lindex $tomlish 1 3] + } + LITERAL { + #from v1.0 spec - "Control characters other than tab are not permitted in a literal string" + #(This rules out raw ANSI SGR - which is somewhat restrictive - but perhaps justified for a config format + # as copy-pasting ansi to a config value is probably not always wise, and it's not something that can be + # easily input via a text editor. ANSI can go in Basic strings using the \e escape if that's accepted v1.1?) + #we could choose to change the type to another format here when encountering invalid chars - but that seems + #like too much magic. We elect to error out and require the dict to have valid data for the types it specifies. + if {[string first ' $val] >=0} { + set msg "_from_dictval_tomltype error: single quote found in LITERAL - cannot encode dict to TOML-VALID TOMLISH" + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + } + #JJJJ + if {![tomlish::utils::rawstring_is_valid_literal $val]} { + #has controls other than tab + #todo - squote? + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} "Controls other than tab found in literal" + } + return [list LITERAL $val] + } + INT { + if {![::tomlish::utils::is_int $val]} { + error "_from_dictval_tomltype error: bad INT value '$val' - cannot encode dict to TOML-VALID TOMLISH" + } + return [list INT $val] + } + FLOAT { + if {![::tomlish::utils::is_float $val]} { + error "_from_dictval_tomltype error: bad FLOAT value '$val' - cannot encode dict to TOML-VALID TOMLISH" + } + return [list FLOAT $val] + } + default { + if {$type ni [::tomlish::tags]} { + error "_from_dictval_tomltype error: Unrecognised typename '$type' in {type value } - cannot encode dict to TOML-VALID TOMLISH" + } + return [list $type $val] + } + } + } + + proc _from_dictval {parents tablestack keys vinfo} { + set k [lindex $keys end] + set K_PART [tomlish::dict::classify_rawkey $k] ;#get [list SQKEY ] + #puts stderr "---parents:'$parents' keys:'$keys' vinfo: $vinfo---" + #puts stderr "---tablestack: $tablestack---" + set result [list] + set lastparent [lindex $parents end] + if {$lastparent in [list "" do_inline]} { + if {[tomlish::dict::is_typeval $vinfo]} { + set type [dict get $vinfo type] + #treat ITABLE differently? + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] + lappend result DOTTEDKEY [list $K_PART {WS { }}] = {WS { }} $sublist {NEWLINE lf} + } else { + if {$vinfo ne ""} { + + #set result [list DOTTEDKEY [list [list KEY $k]] = ] + #set records [list ITABLE] + + set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] + + if {$lastparent eq "do_inline"} { + set result [list DOTTEDKEY [list $K_PART] =] + set records [list ITABLE] + } else { + set tname [tomlish::dict::join_and_quote_rawkey_list [list $k]] + set result [list TABLE $tname {NEWLINE lf}] + set tablestack [list {*}$tablestack [list T $k]] + set records [list] + } + + + + set lastidx [expr {[dict size $vinfo] -1}] + set dictidx 0 + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + #(SQKEY & DQKEY do not have the enclosing quotes in their returned val) + #if {[regexp {\s} $vk] || [string first . $vk] >= 0} { + # set VK_PART [list SQKEY $vk] + #} else { + # set VK_PART [list KEY $vk] + #} + if {[tomlish::dict::is_typeval $vv]} { + #type x value y + #REVIEW - we could detect if value is an array of objects, + #and depending on parent context - emit a series of TABLEARRAY records instead of a DOTTEDKEY record containing an ARRAY of objects + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] + set record [list DOTTEDKEY [list $VK_PART {WS { }}] = {WS { }} $sublist] + } else { + if {$vv eq ""} { + #experimental + if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + ::tomlish::log::notice "_from_dictval could uninline KEY $vk (tablestack:$tablestack)" + #set tname [tomlish::dict::name_from_tablestack [list {*}$tablestack [list T $vk]]] + + #we can't just join normalized keys - need keys with appropriate quotes and escapes + #set tname [join [list {*}$keys $vk] .] ;#WRONG + set tq [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] + + + ##wrong? results in TABLE within TABLE record?? todo pop? + #set record [list TABLE $tq {NEWLINE lf}] + #set tablestack [list {*}$tablestack [list T $vk]] + + #REVIEW!!! + + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + set tablestack [list {*}$tablestack [list I $vk]] + + } else { + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + set tablestack [list {*}$tablestack [list I $vk]] + } + } else { + if { 0 } { + #experiment.. sort of getting there. + if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + ::tomlish::log::notice "_from_dictval could uninline2 KEYS [list {*}$keys $vk] (tablestack:$tablestack)" + set tq [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] + set record [list TABLE $tq {NEWLINE lf}] + set tablestack [list {*}$tablestack [list T $vk]] + + #review - todo? + set dottedkey_value [_from_dictval [list {*}$parents TABLE] $tablestack [list {*}$keys $vk] $vv] + lappend record {*}$dottedkey_value + + } else { + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + } + } else { + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + } + } + } + if {$dictidx != $lastidx} { + #lappend record SEP + if {$lastparent eq "do_inline"} { + lappend record SEP + } else { + lappend record {NEWLINE lf} + } + } + if {[llength $record]} { + lappend records $record + } + incr dictidx + } + if {$lastparent eq "do_inline"} { + lappend result $records {NEWLINE lf} + } else { + lappend result {*}$records {NEWLINE lf} + } + } else { + if {$lastparent eq "do_inline"} { + lappend result DOTTEDKEY [list $K_PART] = ITABLE {NEWLINE lf} + } else { + set tname [tomlish::dict::join_and_quote_rawkey_list [list $k]] + #REVIEW + lappend result TABLE $tname {NEWLINE lf} + } + } + } + } else { + #lastparent is not toplevel "" or "do_inline" + if {[tomlish::dict::is_typeval $vinfo]} { + #type x value y + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] + lappend result {*}$sublist + } else { + if {$lastparent eq "TABLE"} { + #review + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + set dottedkey_value [_from_dictval [list {*}$parents DOTTEDKEY] $tablestack [list {*}$keys $vk] $vv] + lappend result [list DOTTEDKEY [list $VK_PART] = $dottedkey_value {NEWLINE lf}] + } + } else { + if {$vinfo ne ""} { + set lastidx [expr {[dict size $vinfo] -1}] + set dictidx 0 + set sub [list] + #REVIEW + #set result $lastparent ;#e.g sets ITABLE + set result ITABLE + set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + if {[tomlish::dict::is_typeval $vv]} { + #type x value y + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] + set record [list DOTTEDKEY [list $VK_PART] = $sublist] + } else { + if {$vv eq ""} { + #can't just uninline at this level + #we need a better method to query main dict for uninlinability at each level + # (including what's been inlined already) + #if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + # puts stderr "_from_dictval uninline2 KEY $keys" + # set tname [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] + # set record [list TABLE $tname {NEWLINE lf}] + # set tablestack [list {*}$tablestack [list T $vk]] + #} else { + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + #} + } else { + #set sub [_from_dictval ITABLE $vk $vv] + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + #set record [list DOTTEDKEY [list $VK_PART] = ITABLE $dottedkey_value] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + } + } + if {$dictidx != $lastidx} { + lappend record SEP + } + lappend result $record + incr dictidx + } + } else { + #e.g x=[{}] + log::debug "---> _from_dictval empty ITABLE x-1" + #lappend result DOTTEDKEY [list $K_PART] = ITABLE ;#wrong + lappend result ITABLE + } + } + } + } + return $result + } + + + proc from_dict {d} { + #consider: + # t1={a=1,b=2} + # x = 1 + + # from_dict gives us: t1 {a {type INT value 1} b {type INT value 2}} x {type INT value 1} + + #If we represent t1 as an expanded table we get + # [t1] + # a=1 + # b=2 + # x=1 + # --- which is incorrect - as x was a toplevel key like t1! + #This issue doesn't occur if x is itself an inline table + # t1={a=1,b=2} + # x= {no="problem"} + # + # (or if we were to reorder x to come before t1) + + #ie the order of the dict elements influences how the toml can be represented. + + #As the dictionary form doesn't distinguish the structure used to create tables {[table1]\nk=v} vs inline {table1={k=v}} + #Without a solution, from_dict would have to always produce the inline form for toplevel tables unless we allowed re-ordering, + #which is unpreferred here. + + #A possible solution: + #scan the top level to see if all (trailing) elements are themselves dicts + # (ie not of form {type XXX value yyy}) + # + # A further point is that if all root level values are at the 'top' - we can treat lower table-like structures as {[table]} elements + #ie we don't need to force do_inline if all the 'simple' keys are before any compound keys + + #set root_has_values 0 + #approach 1) - the naive approach - forces inline when not always necessary + #dict for {k v} $d { + # if {[llength $v] == 4 && [lindex $v 0] eq "type"} { + # set root_has_values 1 + # break + # } + #} + + + #approach 2) - track the position of last {type x value y} in the dictionary built by dict::from_tomlish + # - still not perfect. Inlines dotted tables unnecessarily + #This means from_dict doesn't produce output optimal for human editing. + set last_simple [tomlish::dict::last_tomltype_posn $d] + + + ## set parent "do_inline" ;#a value used in _from_dictval to distinguish from "" or other context based parent values + #Any keys that are themselves tables - will need to be represented inline + #to avoid reordering, or incorrect assignment of plain values to the wrong table. + + ## set parent "" + #all toplevel keys in the dict structure can represent subtables. + #we are free to use {[tablename]\n} syntax for toplevel elements. + + + set tomlish [list TOMLISH] + set dictposn 0 + set tablestack [list [list T root]] ;#todo + dict for {t tinfo} $d { + if {$last_simple > $dictposn} { + set parents [list do_inline] + } else { + set parents [list ""] + } + set keys [list $t] + #review - where to make decision on + # DOTTEDKEY containing array of objs + #vs + # list of TABLEARRAY records + #At least for the top + set trecord [_from_dictval $parents $tablestack $keys $tinfo] + lappend tomlish $trecord + incr dictposn + } + return $tomlish + } + + proc typedjson_to_toml {json} { + #*** !doctools + #[call [fun typedjson_to_toml] [arg json]] + #[para] + + set tomlish [::tomlish::from_dict_from_typedjson $json] + lappend tomlish [list NEWLINE lf] + set toml [::tomlish::to_toml $tomlish] + } + + set json1 {{ "a": {"type": "integer", "value": "42"}}} + set json2 {{ + "a": {"type": "integer", "value": "42"}, + "b": {"type": "string", "value": "test"} + }} + set json3 { +{ + "best-day-ever": {"type": "datetime", "value": "1987-07-05T17:45:00Z"}, + "numtheory": { + "boring": {"type": "bool", "value": "false"}, + "perfection": [ + {"type": "integer", "value": "6"}, + {"type": "integer", "value": "28"}, + {"type": "integer", "value": "496"} + ] + } +} + } + + set json4 { +{ + "best-day-ever": {"type": "datetime", "value": "1987-07-05T17:45:00Z"}, + "numtheory": { + "boring": {"type": "bool", "value": "false"}, + "perfection": [ + {"type": "integer", "value": "6"}, + {"type": "integer", "value": "28"}, + {"type": "integer", "value": "496"} + ] + }, + "emptyobj": {}, + "emptyarray": [] +} + } + + set json5 { +{ + "a": { + " x ": {}, + "b.c": {}, + "d.e": {}, + "b": { + "c": {} + } + } +} + } + + #surrogate pair face emoji + set json6 { +{ + "surrogatepair": {"type": "string", "value": "\uD83D\uDE10"} +} + } + + + set json7 { +{ + "escapes": {"type": "string", "value": "val\\ue"} +} + } + + + proc from_dict_from_typedjson {json} { + set d [tomlish::dict::from_typedjson $json] + tomlish::from_dict $d ;#return tomlish + } + + + proc toml_to_typedjson {toml} { + set tomlish [::tomlish::from_toml $toml] + set d [tomlish::dict::from_tomlish $tomlish] + #full validation only occurs by re-encoding dict to tomlish + set test [tomlish::from_dict $d] + + set h [tomlish::typedhuddle::from_dict $d] + #huddle jsondump $h + tomlish::huddle::jsondumpraw $h + } + + #proc get_json {tomlish} { + # package require fish::json + # set d [::tomlish::dict::from_tomlish $tomlish] + + # #return [::tomlish::dict_to_json $d] + # return [fish::json::from "struct" $d] + #} + + #return a Tcl list of tomlish tokens + #i.e get a standard list of all the toml terms in string $s + #where each element of the list is a *tomlish* term.. i.e a specially 'tagged' Tcl list. + #(simliar to a tcl 'Huddle' - but also supporting whitespace preservation) + # ---------------------------------------------------------------------------------------------- + # NOTE: the production of tomlish from toml source doesn't indicate the toml source was valid!!! + # e.g we deliberately don't check certain things such as duplicate table declarations here. + # ---------------------------------------------------------------------------------------------- + #Part of the justification for this is that as long as the syntax is toml shaped - we can load files which violate certain rules and allow programmatic manipulation. + # (e.g perhaps a toml editor to highlight violations for fixing) + # A further stage is then necessary to load the tomlish tagged list into a data structure more suitable for efficient query/reading. + # e.g dicts or an object oriented structure + #Note also - *no* escapes in quoted strings are processed. This is up to the datastructure stage + #e.g dict::from_tomlish will substitute \r \n \uHHHH \UHHHHHHH etc + #This is important for tomlish to maintain the ability to perform competely lossless round-trips from toml to tomlish and back to toml. + # (which is handy for testing as well as editing some part of the structure with absolutely no effect on other parts of the document) + #If we were to unescape a tab character for example + # - we have no way of knowing if it was originally specified as \t \u0009 or \U00000009 or directly as a tab character. + # For this reason, we also do absolutely no line-ending transformations based on platform. + # All line-endings are maintained as is, and even a file with mixed lf crlf line-endings will be correctly interpreted and can be 'roundtripped' + + proc from_toml {args} { + + namespace upvar ::tomlish::parse s s + set s [join $args \n] + namespace upvar ::tomlish::parse i i + set i 0 ;#index into s + + namespace upvar ::tomlish::parse is_parsing is_parsing + set is_parsing 1 + + if {[info command ::tomlish::parse::spacestack] eq "::tomlish::parse::spacestack"} { + tomlish::parse::spacestack destroy + } + struct::stack ::tomlish::parse::spacestack + + namespace upvar ::tomlish::parse last_space_action last_space_action + namespace upvar ::tomlish::parse last_space_type last_space_type + + namespace upvar ::tomlish::parse tok tok + set tok "" + + namespace upvar ::tomlish::parse type type + namespace upvar ::tomlish::parse tokenType tokenType + ::tomlish::parse::set_tokenType "" + namespace upvar ::tomlish::parse tokenType_list tokenType_list + set tokenType [list] ;#Flat (un-nested) list of tokentypes found + + namespace upvar ::tomlish::parse lastChar lastChar + set lastChar "" + + + set result "" + namespace upvar ::tomlish::parse nest nest + set nest 0 + + namespace upvar ::tomlish::parse v v ;#array keyed on nest level + + + set v(0) {TOMLISH} + array set s0 [list] ;#whitespace data to go in {SPACE {}} element. + set parentlevel 0 + + + namespace upvar ::tomlish::parse state state + + namespace upvar ::tomlish::parse braceCount braceCount + set barceCount 0 + namespace upvar ::tomlish::parse bracketCount bracketCount + set bracketCount 0 + + set sep 0 + set r 1 + namespace upvar ::tomlish::parse token_waiting token_waiting + set token_waiting [dict create] ;#if ::tok finds a *complete* second token during a run, it will put the 2nd one here to be returned by the next call. + + + + set state "table-space" + ::tomlish::parse::spacestack push {type space state table-space} + namespace upvar ::tomlish::parse linenum linenum;#'line number' of input data. (incremented for each literal linefeed - but not escaped ones in data) + set linenum 1 + + set ::tomlish::parse::state_list [list] + try { + while {$r} { + set r [::tomlish::parse::tok] + #puts stdout "got tok: '$tok' while parsing string '$s' " + set next_tokenType_known 0 ;#whether we begin a new token here based on what terminated the token result of 'tok' + + + #puts "got token: '$tok' tokenType='$tokenType'. while v($nest) = [set v($nest)]" + #puts "-->tok: $tok tokenType='$tokenType'" + set prevstate $state + set transition_info [::tomlish::parse::goNextState $tokenType $tok $state] + #review goNextState could perform more than one space_action + set space_action [dict get $transition_info space_action] + set newstate [dict get $transition_info newstate] ;#use of 'newstate' vs 'state' makes code clearer below + + if {[tcl::string::match "err-*" $state]} { + ::tomlish::log::warn "---- State error in state $prevstate for tokenType: $tokenType token value: $tok. $state aborting parse. [tomlish::parse::report_line]" + lappend v(0) [list ERROR tokentype $tokenType state $prevstate to $state leveldata [set v($nest)]] + return $v(0) + } + # --------------------------------------------------------- + #NOTE there may already be a token_waiting at this point + #set_token_waiting can raise an error here, + # in which case the space_action branch needs to be rewritten to handle the existing token_waiting + # --------------------------------------------------------- + + if {$space_action eq "pop"} { + #pop_trigger_tokens: newline tablename endarray endinlinetable + #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. + set parentlevel [expr {$nest -1}] + set do_append_to_parent 1 ;#most tokens will leave this alone - but some like tentative_accum_squote need to do their own append + switch -exact -- $tokenType { + tentative_accum_squote { + #should only apply within a multiliteral + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the xxx-squote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-squote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-squote-space { + } + default { + error "--- unexpected popped due to tentative_accum_squote but came from state '$prevstate' should have been trailing-squote-space" + } + } + switch -- $tok { + ' { + tomlish::parse::set_token_waiting type single_squote value $tok complete 1 startindex [expr {$i -1}] + } + '' { + #review - we should perhaps return double_squote instead? + #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_squote value "" complete 1 startindex [expr {$i - 2}] + } + ''' { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] + } + '''' { + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left squote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]'" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "'"] + } + MULTILITERAL { + #empty + lappend v($parentlevel) [list LITERALPART "'"] + } + default { + error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" + } + } + } + ''''' { + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 squotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]''" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "''"] + } + MULTILITERAL { + lappend v($parentlevel) [list LITERALPART "''"] + } + default { + error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" + } + } + } + } + } + triple_squote { + #presumably popping multiliteral-space + ::tomlish::log::debug "---- triple_squote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + switch -exact -- [lindex $part 0] { + MULTILITERAL { + lappend merged $part + } + LITERALPART { + if {$lasttype eq "LITERALPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" + } + } + set lasttype [lindex $part 0] + } + set v($nest) $merged + } + tentative_accum_dquote { + #should only apply within a multistring + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the trailing-dquote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-dquote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-dquote-space { + } + default { + error "--- unexpected popped due to tentative_accum_dquote but came from state '$prevstate' should have been trailing-dquote-space" + } + } + switch -- $tok { + {"} { + tomlish::parse::set_token_waiting type single_dquote value $tok complete 1 startindex [expr {$i -1}] + } + {""} { + #review - we should perhaps return double_dquote instead? + #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_dquote value "" complete 1 startindex [expr {$i - 2}] + } + {"""} { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 3}] + } + {""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left dquote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {"}] + } + MULTISTRING { + #empty + lappend v($parentlevel) [list STRINGPART {"}] + } + default { + error "--- don't know how to integrate extra trailing dquote with data $v($parentlevel)" + } + } + } + {"""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 dquotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {""}] + } + MULTISTRING { + lappend v($parentlevel) [list STRINGPART {""}] + } + default { + error "--- don't know how to integrate extra trailing 2 dquotes with data $v($parentlevel)" + } + } + } + } + } + triple_dquote { + #presumably popping multistring-space + ::tomlish::log::debug "---- triple_dquote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + switch -exact -- [lindex $part 0] { + MULTISTRING { + lappend merged $part + } + STRINGPART { + if {$lasttype eq "STRINGPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + CONT - WS { + lappend merged $part + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_dquote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" + } + } + set lasttype [lindex $part 0] + } + set v($nest) $merged + } + equal { + #pop caused by = + switch -exact -- $prevstate { + dottedkey-space { + tomlish::log::debug "---- equal ending dottedkey-space for last_space_action pop" + #re-emit for parent space + tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] + } + dottedkey-space-tail { + #experiment? + tomlish::log::debug "---- equal ending dottedkey-space-tail for last_space_action pop" + #re-emit for parent space + tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] + } + } + } + newline { + incr linenum + lappend v($nest) [list NEWLINE $tok] + } + tablename { + #note: a tablename only 'pops' if we are greater than zero + error "---- tablename pop should already have been handled as special case zeropoppushspace in goNextState" + } + tablearrayname { + #!review - tablearrayname different to tablename regarding push/pop? + #note: a tablename only 'pops' if we are greater than zero + error "---- tablearrayname pop should already have been handled as special case zeropoppushspace in goNextState" + } + endarray { + #nothing to do here. + } + comma { + #comma for inline table will pop the keyvalue space + lappend v($nest) "SEP" + } + endinlinetable { + ::tomlish::log::debug "---- endinlinetable for last_space_action pop" + } + default { + error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" + } + } + if {$do_append_to_parent} { + #e.g tentative_accum_squote does it's own appends as necessary - so won't get here + lappend v($parentlevel) [set v($nest)] + } + + incr nest -1 + + } elseif {$last_space_action eq "push"} { + set prevnest $nest + incr nest 1 + set v($nest) [list] + # push_trigger_tokens: barekey dquotedkey startinlinetable startarray tablename tablearrayname + + + switch -exact -- $tokenType { + tentative_trigger_squote - tentative_trigger_dquote { + #### this startok will always be tentative_accum_squote/tentative_accum_dquote starting with one accumulated squote/dquote + if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { + lassign [dict get $transition_info starttok] starttok_type starttok_val + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType $starttok_type + set tok $starttok_val + } + } + single_squote { + #JMN - REVIEW + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + triple_squote { + ::tomlish::log::debug "---- push trigger tokenType triple_squote" + set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERALPART + } + squotedkey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + triple_dquote { + set v($nest) [list MULTISTRING] ;#container for NEWLINE,STRINGPART,CONT + } + dquotedkey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + barekey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + set waiting [tomlish::parse::get_token_waiting] + if {[llength $waiting]} { + set i [dict get $waiting startindex] + tomlish::parse::clear_token_waiting + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } else { + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + } + tablename { + #note: we do not use the output of tablename_trim to produce a tablename for storage in the tomlish list! + #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish + # back to toml file will be identical. + #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. + # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, + # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names from + # a structural perspective. + + #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, + # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the + # tomlish list? + + #set trimtable [tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$trimtable'" + set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name + #note also that equivalent tablenames may have different toml representations even after being trimmed! + #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) + #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. + } + tablearrayname { + #set trimtable [tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablearrayname: '$trimtable'" + set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name + } + startarray { + set v($nest) [list ARRAY] ;#$tok is just the opening bracket - don't output. + } + startinlinetable { + set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. + } + default { + error "---- push trigger tokenType '$tokenType' not yet implemented" + } + } + + } else { + #no space level change + switch -exact -- $tokenType { + squotedkey { + #puts "---- squotedkey in state $prevstate (no space level change)" + lappend v($nest) [list SQKEY $tok] + } + dquotedkey { + #puts "---- dquotedkey in state $prevstate (no space level change)" + lappend v($nest) [list DQKEY $tok] + } + barekey { + lappend v($nest) [list KEY $tok] + } + dotsep { + lappend v($nest) [list DOTSEP] + } + starttablename { + #$tok is triggered by the opening bracket and sends nothing to output + } + starttablearrayname { + #$tok is triggered by the double opening brackets and sends nothing to output + } + tablename - tablenamearray { + error "---- did not expect 'tablename/tablearrayname' without space level change (no space level change)" + #set v($nest) [list TABLE $tok] + } + endtablename - endtablearrayname { + #no output into the tomlish list for this token + } + startinlinetable { + puts stderr "---- decode::toml error. did not expect startinlinetable without space level change (no space level change)" + } + single_dquote { + switch -exact -- $newstate { + string-state { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "string" + set tok "" + } + dquoted-key { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "dquotedkey" + set tok "" + } + multistring-space { + lappend v($nest) [list STRINGPART {"}] + #may need to be joined on pop if there are neighbouring STRINGPARTS + } + default { + error "---- single_dquote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_dquote { + #leading extra quotes - test: toml_multistring_startquote2 + switch -exact -- $prevstate { + itable-keyval-value-expected - keyval-value-expected { + puts stderr "tomlish::decode::toml double_dquote TEST" + #empty string + lappend v($nest) [list STRINGPART ""] + } + multistring-space { + #multistring-space to multistring-space + lappend v($nest) [list STRINGPART {""}] + } + default { + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" + } + } + + } + single_squote { + switch -exact -- $newstate { + literal-state { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "literal" + set tok "" + } + squoted-key { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + multiliteral-space { + #false alarm squote returned from tentative_accum_squote pop + ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" + #(single squote - not terminating space) + lappend v($nest) [list LITERALPART '] + #may need to be joined on pop if there are neighbouring LITERALPARTs + } + default { + error "---- single_squote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_squote { + switch -exact -- $prevstate { + keyval-value-expected { + lappend v($nest) [list LITERAL ""] + } + multiliteral-space { + #multiliteral-space to multiliteral-space + lappend v($nest) [list LITERALPART ''] + } + default { + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" + } + } + } + enddquote { + #nothing to do? + set tok "" + } + endsquote { + set tok "" + } + string { + #JJJJ + set tok [tomlish::from_Bstring $tok] + lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes + } + literal { + lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes + } + multistring { + #review + #JJJJ ? + lappend v($nest) [list MULTISTRING $tok] + } + stringpart { + #JJJJ + set tok [tomlish::from_Bstring $tok] + lappend v($nest) [list STRINGPART $tok] ;#will not get wrapped in dquotes directly + } + multiliteral { + lappend v($nest) [LIST MULTILITERAL $tok] + } + literalpart { + lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly + } + untyped_value { + #would be better termed unclassified_value + #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. + unset -nocomplain tag + if {$tok in {true false}} { + set tag BOOL + } else { + if {[::tomlish::utils::is_int $tok]} { + set tag INT + } else { + if {[string is integer -strict $tok]} { + #didn't qualify as a toml int - but still an int + #probably means is_int is limiting size and not accepting bigints (configurable?) + #or it didn't qualify due to more than 1 leading zero + #or other integer format issue such as repeated underscores + error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. (looks close to being an int. Formatting or range issue?) [tomlish::parse::report_line] (no space level change)" + } else { + #DDDD + if {[::tomlish::utils::is_float $tok]} { + set tag FLOAT + } elseif {[::tomlish::utils::is_localtime $tok]} { + set tag TIME-LOCAL + } elseif {[::tomlish::utils::is_timepart $tok]} { + #Note we must allow lone timepart here (not just is_localtime which doesn't allow tz offsets) in case it followed a previous localdate + set tag DATETIME ;#?? review standalone time with tz - no specific tag - only allowed as followup value from DATETIME-LOCAL + } elseif {[::tomlish::utils::is_datepart $tok]} { + set tag DATE-LOCAL + } elseif {[::tomlish::utils::is_datetime $tok]} { + #not just a date or just a time + #could be either local or have tz offset + #DDDD JJJ + set norm [string map {" " T} $tok];#prob unneeded - we won't get here if there was a space - would arrive as 2 separate tokens review. + lassign [split $norm T] dp tp + if {[::tomlish::utils::is_localtime $tp]} { + set tag DATETIME-LOCAL + } else { + set tag DATETIME + } + } elseif {[::tomlish::utils::is_datetime X$tok] || [::tomlish::utils::is_timepart X$tok]} { + # obsolete + #Note we must allow lone timepart here (not just is_localtime which doesn't allow tz offsets) in case it followed a previous localdate + #e.g x= 2025-01-01 02:34Z + #The dict::from_tomlish validation will catch an invalid standaline timepart, or combine with leading date if applicable. + set tag DATETIME + } else { + error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. [tomlish::parse::report_line] (no space level change)" + } + } + } + } + #assert either tag is set, or we errored out. + lappend v($nest) [list $tag $tok] + + } + comment { + #puts stdout "----- comment token returned '$tok'------" + #JJJJ + set tok [tomlish::from_comment $tok] + lappend v($nest) [list COMMENT "$tok"] + } + equal { + #we append '=' to the nest so that any surrounding whitespace is retained. + lappend v($nest) = + } + comma { + lappend v($nest) SEP + } + newline { + incr linenum + lappend v($nest) [list NEWLINE $tok] + } + whitespace { + lappend v($nest) [list WS $tok] + } + continuation { + lappend v($nest) CONT + } + bom { + lappend v($nest) BOM + } + eof { + #ok - nothing more to add to the tomlish list. + #!todo - check previous tokens are complete/valid? + } + default { + error "--- unknown tokenType '$tokenType' during state $prevstate [::tomlish::parse::report_line] (no space level change)" + } + } + } + + if {!$next_tokenType_known} { + ::tomlish::log::notice "---- tomlish::decode::toml - current tokenType:$tokenType Next token type not known" + ::tomlish::parse::set_tokenType "" + set tok "" + } + + if {$state eq "end-state"} { + break + } + + + } + + #while {$nest > 0} { + # lappend v([expr {$nest -1}]) [set v($nest)] + # incr nest -1 + #} + while {[::tomlish::parse::spacestack size] > 1} { + ::tomlish::parse::spacestack pop + lappend v([expr {$nest -1}]) [set v($nest)] + incr nest -1 + + #set parent [spacestack peek] ;#the level being appended to + #lassign $parent type state + #if {$type eq "space"} { + # + #} elseif {$type eq "buffer"} { + # lappend v([expr {$nest -1}]) {*}[set v($nest)] + #} else { + # error "invalid spacestack item: $parent" + #} + } + + } finally { + set is_parsing 0 + } + return $v(0) + } + + #toml dquoted string to tomlish STRING + # - only allow specified escape sequences + # - allow any unicode except those that must be escaped: dquote, bsl, and control chars(except tab) + proc from_Bstring {bstr} { + #JJJJ + if {[catch { + tomlish::utils::unescape_string $bstr + } errM]} { + return -code error -errorcode {TOML SYNTAX INVALIDESCAPE} "tomlish::from_Bstring toml Bstring contains invalid escape sequence\n$errM" ;#review + } + #assert: all escapes are now valid + + if {[regexp {[\u0000-\u0008\u000A-\u001F\u007f]} $bstr]} { + set msg "tomlish::from_Bstring toml Bstring contains controls that must be escaped" + return -code error -errorcode {TOML SYNTAX BSTRINGUNESCAPEDCONTROLS} $msg ;#review + } + return $bstr + } + #validate toml comment + # - disallow controls that must be escaped + #from spec: + # "Control characters other than tab (U+0000 to U+0008, U+000A to U+001F, U+007F) are not permitted in comments." + proc from_comment {comment} { + if {[regexp {[\u0000-\u0008\u000A-\u001F\u007f]} $comment]} { + set msg "tomlish::from_comment toml comment contains controls that must be escaped" + return -code error -errorcode {TOML SYNTAX COMMENTUNESCAPEDCONTROLS} $msg ;#review + } + return $comment + } + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish ---}] +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +namespace eval tomlish::build { + #STRING,INT,FLOAT,BOOL, DATETIME - simple wrappers for completeness + # take a value of the appropriate type and wrap as a tomlish tagged item + proc STRING {s} { + return [list STRING [::tomlish::utils::rawstring_to_Bstring_with_escaped_controls $s]] + } + proc LITERAL {litstring} { + error todo + } + + proc INT {i} { + #whole numbers, may be prefixed with a + or - + #Leading zeros are not allowed + #Hex,octal binary forms are allowed (toml 1.0) + #We will error out on encountering commas, as commas are interpreted differently depending on locale (and don't seem to be supported in the toml spec anyway) + #!todo - Tcl can handle bignums - bigger than a 64bit signed long as specified in toml. + # - We should probably raise an error for number larger than this and suggest the user supply it as a string? + if {[tcl::string::last , $i] > -1} { + error "Unable to interpret '$i' as an integer. Use underscores if you need a thousands separator [::tomlish::parse::report_line]" + } + if {![::tomlish::utils::int_validchars $i]} { + error "Unable to interpret '$i' as an integer. Only 0-9 + 1 _ characters are acceptable. [::tomlish::parse::report_line]" + } + + if {[::tomlish::utils::is_int $i]} { + return [list INT $i] + } else { + error "'$i' is not a valid integer as per the Toml spec. [::tomlish::parse::report_line]" + } + + } + + proc FLOAT {f} { + #convert any non-lower case variants of special values to lowercase for Toml + if {[::tcl::string::tolower $f] in {nan +nan -nan inf +inf -inf}} { + return [list FLOAT [tcl::string::tolower $f]] + } + if {[::tomlish::utils::is_float $f]} { + return [list FLOAT $f] + } else { + error "Unable to interpret '$f' as Toml float. Check your input, or check that tomlish is able to handle all Toml floats properly [::tomlish::parse::report_line]" + } + } + + proc DATETIME {str} { + if {[::tomlish::utils::is_datetime $str]} { + return [list DATETIME $str] + } else { + error "Unable to interpret '$str' as Toml datetime. Check your input, or check that tomlish is able to handle all Toml datetimes properly [::tomlish::parse::report_line]" + } + } + proc DATETIME-LOCAL {str} { + error "build::DATETIME-LOCAL todo" + } + + proc BOOLEAN {b} { + #convert any Tcl-acceptable boolean to boolean as accepted by toml - lower case true/false + if {![tcl::string::is boolean -strict $b]} { + error "Unable to convert '$b' to Toml boolean true|false. [::tomlish::parse::report_line]" + } else { + if {$b && 1} { + return [::list BOOL true] + } else { + return [::list BOOL false] + } + } + } + + #REVIEW + #Take tablename followed by + # a) *tomlish* name-value pairs e.g table mydata [list KEY item11 = [list STRING "test"]] {KEY item2 = [list INT 1]} + # (accept also key value {STRING }) + # b) simple 2-element tcl lists being name & *simple* value pairs for which basic heuristics will be used to determine types + proc _table {name args} { + set pairs [list] + foreach t $args { + if {[llength $t] == 4} { + if {[tcl::string::tolower [lindex $t 0]] ne "key" || [tcl::string::tolower [lindex $t 2]] ni "= value"} { + error "Only items tagged as KEY = currently accepted as name-value pairs for table command" + } + lassign $t _k keystr _eq valuepart + if {[llength $valuepart] != 2} { + error "supplied value must be typed. e.g {INT 1} or {STRING test}" + } + lappend pairs [list KEY $keystr = $valuepart] + } elseif {[llength $t] == 2} { + #!todo - type heuristics + lassign $t n v + lappend pairs [list KEY $n = [list STRING $v]] + } else { + error "'KEY = { toml but + # the first newline is not part of the data. + # we elect instead to maintain a basic LITERALPART that must not contain newlines.. + # and to compose MULTILITERAL of multiple NEWLINE LITERALPART parts, + #with the datastructure representation dropping the first newline (if immediately following opening delim) when building the value. + set literal "" + foreach part [lrange $item 1 end] { + append literal [::tomlish::encode::tomlish [list $part] $nextcontext] + } + append toml '''$literal''' + } + INT - + BOOL - + FLOAT - + DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + #DDDD + append toml [lindex $item 1] + } + INCOMPLETE { + error "cannot process tomlish term tagged as INCOMPLETE" + } + COMMENT { + append toml "#[lindex $item 1]" + } + BOM { + #Byte Order Mark may appear at beginning of a file. Needs to be preserved. + append toml "\uFEFF" + } + default { + error "Not a properly formed 'tomlish' taggedlist.\n '$list'\n Unknown tag '[lindex $item 0]'. See output of \[tomlish::tags\] command." + } + } + + } + return $toml + } + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::encode ---}] +} +#fish toml from tomlish + +#(encode tomlish as toml) +interp alias {} tomlish::to_toml {} tomlish::encode::tomlish + +# + + +namespace eval tomlish::decode { + #*** !doctools + #[subsection {Namespace tomlish::decode}] + #[para] + #[list_begin definitions] + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::decode ---}] +} +#decode toml to tomlish +#interp alias {} tomlish::from_toml {} tomlish::decode::toml + +namespace eval tomlish::utils { + #*** !doctools + #[subsection {Namespace tomlish::utils}] + #[para] + #[list_begin definitions] + + + + #basic generic quote matching for single and double quotes + #note for example that {[o'malley]} will return sq - as the single quote is not closed or wrapped in double quotes + proc tok_in_quotedpart {tok} { + set sLen [tcl::string::length $tok] + set quote_type "" + set had_slash 0 + for {set i 0} {$i < $sLen} {incr i} { + set c [tcl::string::index $tok $i] + if {$quote_type eq ""} { + if {$had_slash} { + #don't enter quote mode + #leave slash_mode because even if current char is slash - it is escaped + set had_slash 0 + } else { + set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] + switch -- $ctype { + dq { + set quote_type dq + } + sq { + set quote_type sq + } + bsl { + set had_slash 1 + } + } + } + } else { + if {$had_slash} { + #don't leave quoted mode + #leave slash_mode because even if current char is slash - it is escaped + set had_slash 0 + } else { + set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] + switch -- $ctype { + dq { + if {$quote_type eq "dq"} { + set quote_type "" + } + } + sq { + if {$quote_type eq "sq"} { + set quote_type "" + } + } + bsl { + set had_slash 1 + } + } + } + } + } + return $quote_type ;#dq | sq + } + + proc hex_escape_info {slashx} { + set exp {^\\x([0-9a-fA-F]{2}$)} + if {[regexp $exp $slashx match hex]} { + return [list ok [list char [subst -nocommand -novariable $slashx]]] + } else { + return [list err [list reason "Supplied string not of the form \\xHH where H in \[0-9a-fA-F\]"]] + } + } + proc unicode_escape_info {slashu} { + #!todo + # validate that slashu is either a \uxxxx or \Uxxxxxxxx value of the correct length and + # is a valid 'unicode scalar value' (any Unicode code point except high-surrogate and low-surrogate code points) + # ie integers in the range 0 to D7FF16 and E00016 to 10FFFF16 inclusive + #expr {(($x >= 0) && ($x <= 0xD7FF16)) || (($x >= 0xE00016) && ($x <= 0x10FFFF16))} + if {[tcl::string::match {\\u*} $slashu]} { + set exp {^\\u([0-9a-fA-F]{4}$)} + if {[regexp $exp $slashu match hex]} { + if {[scan $hex %4x dec] != 1} { + #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? + return [list err [list reason "Failed to convert '$hex' to decimal"]] + } else { + return [list ok [list char [subst -nocommand -novariable $slashu]]] + } + } else { + return [list err [list reason "Supplied string not of the form \\uHHHH where H in \[0-9a-fA-F\]"]] + } + } elseif {[tcl::string::match {\\U*} $slashu]} { + set exp {^\\U([0-9a-fA-F]{8}$)} + if {[regexp $exp $slashu match hex]} { + if {[scan $hex %8x dec] != 1} { + #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? + return [list err [list reason "Failed to convert '$hex' to decimal"]] + } else { + if {(($dec >= 0) && ($dec <= 0xD7FF16)) || (($dec >= 0xE00016) && ($dec <= 0x10FFFF16))} { + return [list ok [list char [subst -nocommand -novariable $slashu]]] + } else { + return [list err [list reason "$slashu is not within the 'unicode scalar value' ranges 0 to 0xD7FF16 or 0xE00016 to 0x10FFFF16"]] + } + } + } else { + return [list err [list reason "Supplied string not of the form \\UHHHHHHHH where H in \[0-9a-fA-F\]"]] + } + } else { + return [list err [list reason "Supplied string did not start with \\u or \\U" ]] + } + + } + + #Note that unicode characters don't *have* to be escaped. + #So if we provide a function named 'escape_string', the name implies the inverse of unescape_string which unescapes unicode \u \U values. + #- an inverse of unescape_string would encode all unicode chars unnecessarily. + #- as toml accepts a compact escape sequence for common chars such as tab,backspace,linefeed etc but also allows the full form \u009 etc + #- escape_string and unescape_string would not be reliably roundtrippable inverses anyway. + #REVIEW - provide it anyway? When would it be desirable to use? + + variable Bstring_control_map [dict create] + dict set Bstring_control_map \b {\b} + dict set Bstring_control_map \n {\n} + dict set Bstring_control_map \r {\r} + dict set Bstring_control_map \" {\"} + dict set Bstring_control_map \x1b {\e} ;#In spec it's included in the list of 'must be escaped', as well as the 'convenience' escapes - so we make it go both ways. + dict set Bstring_control_map \\ "\\\\" + + #\e for \x1b seems like it might be included - v1.1?? hard to find current state of where toml is going :/ + #for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped. + #8 = \b - already in list. + #built the remainder whilst checking for entries already hardcoded above -in case more are added to the hardcoded list + for {set cdec 0} {$cdec <= 7} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } + } + for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } + } + # \u007F = 127 + dict set Bstring_control_map [format %c 127] \\u007F + + # ------------------------------------------------------------------ + variable Literal_control_map [dict create] + #controls other than tab + for {set cdec 0} {$cdec <= 8} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Literal_control_map $char]} { + dict set Literal_control_map $char \\u$hhhh + } + } + for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Literal_control_map $char]} { + dict set Literal_control_map $char \\u$hhhh + } + } + # \u007F = 127 + dict set Literal_control_map [format %c 127] \\u007F + # ------------------------------------------------------------------ + variable Multiliteral_control_map + set Multiliteral_control_map [dict remove $Literal_control_map \n] + + variable String_control_map + set String_control_map [dict remove $Literal_control_map \\] + + + variable MultiBstring_totoml_map + #'minimally' escaped sequences of double quotes. + #e.g {""\"""\"} vs {\"\"\"\"\"} + #This produces easier to read toml - and in many cases may be more likely to match original format when roundtripped from dict datastructure + # REVIEW - should this be configurable? + set MultiBstring_totoml_map [dict remove $Bstring_control_map {"} \r \n] + dict set MultiBstring_totoml_map {"""} {""\"} ;#" editor hack: commented quote for dumb syntax highlighers + + #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! + #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) + #for example - can be used by from_dict to produce valid Bstring data for a tomlish record + proc rawstring_to_Bstring_with_escaped_controls {str} { + #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. + #we'll use a string map with an explicit list rather than algorithmic at runtime + # - the string map is probably more performant than splitting a string, especially if it's large + + upvar ::tomlish::utils::Bstring_control_map map + + return [string map $map $str] + } + proc rawstring_to_MultiBstring_with_escaped_controls {str} { + #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. + #we'll use a string map with an explicit list rather than algorithmic at runtime + # - the string map is probably more performant than splitting a string, especially if it's large + + upvar ::tomlish::utils::MultiBstring_totoml_map map + + return [string map $map $str] + } + + proc rawstring_is_valid_tomlstring {str} { + #controls are allowed in this direction dict -> toml (they get quoted) + + #check any existing escapes are valid + if {[catch { + unescape_string $str + } errM]} { + return 0 + } + return 1 + } + + proc rawstring_is_valid_literal {str} { + #detect control chars other than tab + variable Literal_control_map + set testval [string map $Literal_control_map $str] + return [expr {$testval eq $str}] + } + proc rawstring_is_valid_multiliteral {str} { + #detect control chars other than tab + variable Multiliteral_control_map + + set teststr [string map [list \r\n ok] $str] + + set testval [string map $Multiliteral_control_map $teststr] + return [expr {$testval eq $teststr}] + } + + #review - unescape what string? Bstring vs MLBstring? + #we should be specific in the function naming here + #used by dict::from_tomlish - so part of validation? - REVIEW + proc unescape_string {str} { + #note we can't just use Tcl subst because: + # it also transforms \a (audible bell) and \v (vertical tab) which are not in the toml spec. + # it would strip out backslashes inappropriately: e.g "\j" becomes just j + # it recognizes other escapes which aren't approprite e.g octal \nnn + # it replaces \ with a single whitespace (trailing backslash) + #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh + #plus \e for \x1b? + + set buffer "" + set buffer2 "" ;#buffer for 2 hex characters following a \x + set buffer4 "" ;#buffer for 4 hex characters following a \u + set buffer8 "" ;#buffer for 8 hex characters following a \u + + set sLen [tcl::string::length $str] + + #we need to handle arbitrarily long sequences of backslashes. \\\\\ etc + set slash_active 0 + set unicode2_active 0 + set unicode4_active 0 + set unicode8_active 0 + + ::tomlish::log::debug "unescape_string. got len [string length str] str $str" + + #!todo - check for invalid data in the form of a raw carriage return (decimal 13) without following linefeed? + set i 0 + for {} {$i < $sLen} {} { + if {$i > 0} { + set lastChar [tcl::string::index $str [expr {$i - 1}]] + } else { + set lastChar "" + } + + set c [tcl::string::index $str $i] + #::tomlish::log::debug "unescape_string. got char $c" ;#too much? + + ##---------------------- + ##as we are 'unescaping' - should we really be testing here for existing values that should have been escaped? + ##The answer is probably no - keep this function to a single purpose - test elsewhere for raw controls. + ##this test looks incomplete anyway REVIEW + #scan $c %c n + #if {($n <= 31) && ($n != 9) && ($n != 10) && ($n != 13)} { + # #we don't expect unescaped unicode characters from 0000 to 001F - + # #*except* for raw tab (which is whitespace) and newlines + # error "unescape_string. Invalid data for a toml string. Unescaped control character (decimal $n) [::tomlish::utils::string_to_slashu $c]" + #} + ##---------------------- + + incr i ;#must incr here because we do'returns'inside the loop + if {$c eq "\\"} { + if {$slash_active} { + append buffer "\\" + set slash_active 0 + } elseif {$unicode2_active} { + error "unescape_string. unexpected case slash during unicode2 not yet handled" + } elseif {$unicode4_active} { + error "unescape_string. unexpected case slash during unicode4 not yet handled" + } elseif {$unicode8_active} { + error "unescape_string. unexpected case slash during unicode8 not yet handled" + } else { + # don't output anything (yet) + set slash_active 1 + } + } else { + if {$unicode2_active} { + if {[tcl::string::length $buffer2] < 2} { + append buffer2 $c + } + if {[tcl::string::length $buffer2] == 2} { + #we have a \xHH to test + set unicode2_active 0 + set result [tomlish::utils::hex_escape_info "\\x$buffer2"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" + } + } + } elseif {$unicode4_active} { + if {[tcl::string::length $buffer4] < 4} { + append buffer4 $c + } + if {[tcl::string::length $buffer4] == 4} { + #we have a \uHHHH to test + set unicode4_active 0 + set result [tomlish::utils::unicode_escape_info "\\u$buffer4"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" + } + } + } elseif {$unicode8_active} { + if {[tcl::string::length $buffer8] < 8} { + append buffer8 $c + } + if {[tcl::string::length $buffer8] == 8} { + #we have a \UHHHHHHHH to test + set unicode8_active 0 + set result [tomlish::utils::unicode_escape_info "\\U$buffer8"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" + } + } + } elseif {$slash_active} { + set slash_active 0 + set ctest [tcl::string::map {{"} dq} $c] + switch -exact -- $ctest { + dq { + append buffer {"} + } + b - t - n - f - r { + append buffer [subst -nocommand -novariable "\\$c"] + } + e { + append buffer \x1b + } + x { + #introduced in 1.1.0 \xHH + set unicode2_active 1 + set buffer2 "" + } + u { + set unicode4_active 1 + set buffer4 "" + } + U { + set unicode8_active 1 + set buffer8 "" + } + default { + set slash_active 0 + #review - toml spec says all other escapes are reserved + #and if they are used TOML should produce an error. + #append buffer "\\$c" + set msg "Invalid escape sequence \\ followed by '$c'" + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} $msg + } + } + } else { + append buffer $c + } + } + } + #puts stdout "EOF 4:$unicode4_active 8:$unicode8_active slash:$slash_active" + if {$unicode2_active} { + error "End of string reached before complete hex escape sequence \xHH" + } + if {$unicode4_active} { + error "End of string reached before complete unicode escape sequence \uHHHH" + } + if {$unicode8_active} { + error "End of string reached before complete unicode escape sequence \UHHHHHHHH" + } + if {$slash_active} { + append buffer "\\" + } + try { + encoding convertto utf-8 $buffer + } trap {} {emsg eopts} { + return -code error -errorcode {TOMLISH SYNTAX ENCODINGERROR} $emsg + } + return $buffer + } + + #This does not have to do with unicode normal forms - which it seems toml has decided against regarding use in keys (review/references?) + #This is meant for internal use regarding ensuring we match equivalent keys which may have just been specified with different string mechanisms, + #e.g squoted vs dquoted vs barekey. + proc normalize_key {rawkey} { + set c1 [tcl::string::index $rawkey 0] + set c2 [tcl::string::index $rawkey end] + if {($c1 eq "'") && ($c2 eq "'")} { + #single quoted segment. No escapes allowed within it. + set key [tcl::string::range $rawkey 1 end-1] + } elseif {($c1 eq "\"") && ($c2 eq "\"")} { + #double quoted segment. Unapply escapes. + # + set keydata [tcl::string::range $rawkey 1 end-1] ;#strip outer quotes only + #e.g key could have mix of \UXXXXXXXX escapes and unicode chars + #or mix of \t and literal tabs. + #unescape to convert all to literal versions for comparison + set key [::tomlish::utils::unescape_string $keydata] + #set key [subst -nocommands -novariables $keydata] ;#wrong. Todo - create a string escape substitution function. + } else { + set key $rawkey + } + return $key + } + + proc string_to_slashu {string} { + set rv {} + foreach c [split $string {}] { + scan $c %c cdec + if {$cdec > 65535} { + append rv {\U} [format %.8X $cdec] + } else { + append rv {\u} [format %.4X $cdec] + } + } + return $rv + } + + #'nonprintable' is conservative here because some systems (e.g windows console) are very limited in what they can display. + #This is used for display purposes only (error msgs) + proc nonprintable_to_slashu {s} { + set res "" + foreach i [split $s ""] { + scan $i %c cdec + + set printable 0 + if {($cdec>31) && ($cdec<127)} { + set printable 1 + } + if {$printable} { + append res $i + } else { + if {$cdec > 65535} { + append res \\U[format %.8X $cdec] + } else { + append res \\u[format %.4X $cdec] + } + } + } + set res + } ;# initial version from tcl wiki RS + + proc rawstring_to_jsonstring {s} { + #like nonprintable_to_slashu + # - also escape every dquote + # - escape newlines + set res "" + foreach i [split $s ""] { + scan $i %c cdec + switch -- $cdec { + 34 { + #double quote + append res \\\" + } + 13 { + #carriage return + append res \\r + } + 8 { + append res \\b + } + 9 { + append res \\t + } + 10 { + #linefeed + append res \\n + } + 92 { + append res \\\\ + } + default { + set printable 0 + if {($cdec>31) && ($cdec<127)} { + set printable 1 + } + if {$printable} { + append res $i + } else { + if {$cdec > 65535} { + #append res $i + #append res \\U[format %.8X $cdec] ;#wrong + #append res "\\U{[format %.8x $cdec]}" ;#some variation of json? + package require punk::cesu + #e.g \U0001f610 emoticon face + #surrogate pair: \uD83D\uDE10 + set surrogatepair [punk::cesu::to_surrogatestring -format escape $i] + append res $surrogatepair + } else { + append res \\u[format %.4X $cdec] + } + } + } + } + } + set res + + } + + #check if str is valid for use as a toml bare key + #Early toml versions only allowed letters + underscore + dash + proc is_basic_barekey {str} { + if {[tcl::string::length $str] == 0} { + return 0 + } else { + set matches [regexp -all {[a-zA-Z0-9\_\-]} $str] + if {[tcl::string::length $str] == $matches} { + #all characters match the regexp + return 1 + } else { + return 0 + } + } + } + + #from toml.abnf in github.com/toml-lang/toml + #unquoted-key = 1*unquoted-key-char + #unquoted-key-char = ALPHA / DIGIT / %x2D / %x5F ; a-z A-Z 0-9 - _ + #unquoted-key-char =/ %xB2 / %xB3 / %xB9 / %xBC-BE ; superscript digits, fractions + #unquoted-key-char =/ %xC0-D6 / %xD8-F6 / %xF8-37D ; non-symbol chars in Latin block + #unquoted-key-char =/ %x37F-1FFF ; exclude GREEK QUESTION MARK, which is basically a semi-colon + #unquoted-key-char =/ %x200C-200D / %x203F-2040 ; from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ + #unquoted-key-char =/ %x2070-218F / %x2460-24FF ; include super-/subscripts, letterlike/numberlike forms, enclosed alphanumerics + #unquoted-key-char =/ %x2C00-2FEF / %x3001-D7FF ; skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces + #unquoted-key-char =/ %x2070-21FF / %x2300-24FF ; skip math operators + #unquoted-key-char =/ %x25A0-268B / %x2690-2757 ; skip box drawing, block elements, and some yin-yang symbols + #unquoted-key-char =/ %x2762-2767 / %x2776-27E5 ; skip some Dingbat punctuation + #unquoted-key-char =/ %x2801-297F ; skip some math brackets and arrows, and braille blank + #unquoted-key-char =/ %x2B00-2FFF / %x3001-D7FF ; skip various math operators and symbols, and ideographic space + #unquoted-key-char =/ %xF900-FDCF / %xFDF0-FFFD ; skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) + #unquoted-key-char =/ %x10000-EFFFF ; all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) + variable re_barekey + set ranges [list] + lappend ranges {a-zA-Z0-9\_\-} + lappend ranges {\u00B2} {\u00B3} {\u00B9} {\u00BC-\u00BE} ;# superscript digits, fractions + lappend ranges {\u00C0-\u00D6} {\u00D8-\u00F6} {\u00F8-\u037D} ;# non-symbol chars in Latin block + lappend ranges {\u037f-\u1FFF} ;# exclude GREEK QUESTION MARK, which is basically a semi-colon + lappend ranges {\u200C-\u200D} {\u203F-\u2040} ;# from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ + lappend ranges {\u2070-\u218f} {\u2460-\u24FF} ;# include super-subscripts, letterlike/numberlike forms, enclosed alphanumerics + lappend ranges {\u2C00-\u2FEF} {\u3001-\uD7FF} ;# skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces + lappend ranges {\u2070-\u21FF} {\u2300-\u24FF} ;# skip math operators + lappend ranges {\u25A0-\u268B} {\u2690-\u2757} ;# skip box drawing, block elements, and some yin-yang symbols + lappend ranges {\u2762-\u2767} {\u2776-\u27E5} ;# skip some Dingbat punctuation + lappend ranges {\u2801-\u297F} ;# skip some math brackets and arrows, and braille blank + lappend ranges {\u2B00-\u2FFF} {\u3001-\uD7FF} ;# skip various math operators and symbols, and ideographic space + lappend ranges {\uF900-\uFDCF} {\uFDF0-\uFFFD} ;# skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) + lappend ranges {\U10000-\UEFFFF} ;# all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) + set re_barekey {^[} + foreach r $ranges { + append re_barekey $r + } + append re_barekey {]+$} + + proc is_barekey {str} { + if {[tcl::string::length $str] == 0} { + return 0 + } + variable re_barekey + return [regexp $re_barekey $str] + } + + #test only that the characters in str are valid for the toml specified type 'integer'. + proc int_validchars1 {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[0-9\_\-\+]} $str] == $numchars} { + return 1 + } else { + return 0 + } + } + #add support for hex,octal,binary 0x.. 0o.. 0b... + proc int_validchars {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] == $numchars} { + return 1 + } else { + return 0 + } + } + + proc is_int {str} { + set matches [regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] ;#0b101 etc covered by a-f + + if {[tcl::string::length $str] == $matches} { + #all characters in legal range + + # --------------------------------------- + #check for leading zeroes in non 0x 0b 0o + #first strip any +, - or _ (just for this test) + #(but still allowing 0 -0 +0) + set check [tcl::string::map {+ "" - "" _ ""} $str] + if {([tcl::string::length $check] > 1) && ([tcl::string::index $check 0] eq "0") && ([tcl::string::index $check 1] ni {o x b})} { + return 0 + } + # --------------------------------------- + + #check +,- only occur in the first position. (excludes also +++1 etc) + if {[tcl::string::last - $str] > 0} { + return 0 + } + if {[tcl::string::last + $str] > 0} { + return 0 + } + + #------------------------------------------- + #unclear if a 'digit' includes the type specifiers x b o + #we assume the 0x 0b 0o are NOT counted as digits - as underscores here would seem + #to be likely to cause interop issues with other systems + #(e.g tcl allows 0b1_1 but not 0b_11) + #Most of this structure would be unnecessary if we could rely on string::is::integer understanding underscores (9+?) + #we still need to support earlier Tcl for now though. + + #first rule out any case with more than one underscore in a row + if {[regexp {__} $str]} { + return 0 + } + if {[string index $str 0] eq "_"} { + return 0 + } + set utest [string trimleft $str +-] + #test again for further trick like _+_0xFF + if {[string index $utest 0] eq "_"} { + return 0 + } + if {[string range $utest 0 1] in {0x 0b 0o}} { + set testnum [string range $utest 2 end] + #spec says *non-negative* integers may *also* be expressed in hex, octal or binary + #and also explicitly states + not allowed + #presumed to mean negative not allowed. + if {[string index $str 0] in {- +}} { + return 0 + } + } else { + set testnum $utest + #exclude also things like 0_x 0___b that snuck past our prefix test + if {![string is digit -strict [string map {_ ""} $testnum]]} { + return 0 + } + #assert - only digits and underscores in testnum + #still may have underscores at each end + } + #assert testnum is now the 'digits' portion of a , 0x 0b 0o number + #(+ and - already stripped) + #It may still have chars unsuitable for its type - which will be caught by the string::is::integer test below + if {[string length $testnum] != [string length [string trim $testnum _]]} { + #had non-inner underscores in 'digit' part + return 0 + } + #assert str only has solo inner underscores (if any) between 'digits' + #------------------------------------------- + + set numeric_value [tcl::string::map {_ ""} $str] ;#allow some earlier tcl versions which don't support underscores + #use Tcl's integer check to ensure we don't let things like 3e4 through - which is a float (would need to be 0x3e4 for hex) + if {![tcl::string::is integer -strict $numeric_value]} { + return 0 + } + + + + #!todo - check bounds only based on some config value + #even though Tcl can handle bignums, we won't accept anything outside of toml 1.0 minimum requirements by default (for now) + #presumably very large numbers would have to be supplied in a toml file as strings. + #Review - toml 1.0 only says that it must handle up to 2^63 - not that this is a max + #some question around implementations allowed to use lower values such as 2^31 on some systems? + if {$::tomlish::max_int ne "" && $numeric_value > $::tomlish::max_int} { + return 0 + } + if {$::tomlish::min_int ne "" && $numeric_value < $::tomlish::min_int} { + return 0 + } + } else { + return 0 + } + #Got this far - didn't find anything wrong with it. + return 1 + } + + #test only that the characters in str are valid for the toml specified type 'float'. + proc float_validchars {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[eE0-9\_\-\+\.]} $str] == $numchars} { + return 1 + } else { + #only allow lower case for these special values - as per Toml 1.0 spec + if {$str ni {inf +inf -inf nan +nan -nan}} { + return 0 + } else { + return 1 + } + } + } + + #note - Tcl's string is double will return true also for the subset of float values which are integers + #This function is to determine whether it matches the Toml float concept - so requires a . or e or E + proc is_float {str} { + #vip greenlight known literals, don't test for case variations - as Toml doesn't allow (whereas Tcl allows Inf NaN etc) + if {$str in {inf +inf -inf nan +nan -nan}} { + return 1 + } + #doorcheck the basics for floatiness vs members of that rival gang - ints + if {![regexp {[.eE]} $str]} { + #could be an integer - which isn't specifically a float for Toml purposes. + return 0 + } + + + #patdown for any contraband chars + set matches [regexp -all {[eE0-9\_\-\+\.]} $str] + if {[tcl::string::length $str] != $matches} { + return 0 + } + + #all characters in legal range + + #A leading zero is ok, but we should disallow multiple leading zeroes (same rules as toml ints) + + #Early Toml spec also disallowed leading zeros in the exponent part(?) + #... this seems less interoperable anyway (some libraries generate leading zeroes in exponents) + #we allow leading zeros in exponents here. + + #Check for leading zeros in main part + #first strip any +, - or _ (just for this test) + set check [tcl::string::map {+ "" - "" _ ""} $str] + set r {([0-9])*} + regexp $r $check intpart ;#intpart holds all numerals before the first .,e or E + #leading zero only if exactly one zero + if {$intpart ne "0" && [string match 0* $intpart]} { + return 0 + } + + #for floats, +,- may occur in multiple places + #e.g -2E-22 +3e34 + #!todo - check bounds ? + + #----------------------------------------- + if {[regexp {__} $str]} { + return 0 + } + if {[string index $str 0] eq "_" || [string index $str end] eq "_"} { + return 0 + } + set utest [string trimleft $str +-] + #test again for further trick like _+_ + if {[string index $utest 0] eq "_"} { + return 0 + } + #----------------------------------------- + + #decimal point, if used must be surrounded by at least one digit on each side + #e.g 3.e+20 also illegal + set dposn [string first . $str] + if {$dposn > -1 } { + set d3 [string range $str $dposn-1 $dposn+1] + if {![string is integer -strict [string index $d3 0]] || ![string is integer -strict [string index $d3 2]]} { + return 0 + } + } + #we've already eliminated leading/trailing underscores + #now ensure each inner underscore is surrounded by digits + if {[regexp {_[^0-9]|[^0-9]_} $str]} { + return 0 + } + + #strip underscores for tcl double check so we can support < tcl 9 versions which didn't allow underscores + set check [tcl::string::map {_ ""} $str] + #string is double accepts inf nan +NaN etc. + if {![tcl::string::is double $check]} { + return 0 + } + + #All good - seems to be a toml-approved float and not an int. + return 1 + } + + #test only that the characters in str are valid for the toml specified type 'datetime'. + proc datetime_validchars {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[zZtT0-9\-\+\.:]} $str] == $numchars} { + return 1 + } else { + return 0 + } + } + + proc is_datepart {str} { + set matches [regexp -all {[0-9\-]} $str] + if {[tcl::string::length $str] != $matches} { + return 0 + } + #seems to require yyyy-mm-dd (e.g not allowing just yyyy-mm) + if {![regexp {^([0-9]{4})-([0-9]{2})-([0-9]{2})$} $str _match y m d]} { + return 0 + } + if {$m > 12 || $m == 0} { + return 0 + } + switch -- [expr {$m}] { + 1 - 3 - 5 - 7 - 8 - 10 - 12 { + if {$d > 31 || $d == 0} { + return 0 + } + } + 2 { + if {$d > 29 || $d == 0} { + return 0 + } + if {$d == 29} { + #leapyear check + if {[catch {clock scan $str -format %Y-%m-%d} errM]} { + return 0 + } + } + } + 4 - 6 - 9 - 11 { + if {$d > 30 || $d == 0} { + return 0 + } + } + } + return 1 + } + proc is_localdate {str} { + is_datepart $str + } + + #allow only hh:mm:ss or hh:mm (no subseconds) + proc _is_hms_or_hm_time {val} { + set numchars [tcl::string::length $val] + if {[regexp -all {[0-9:]} $val] != $numchars} { + return 0 + } + #assert now digits and colons only + set hms_cparts [split $val :] + #2 or 3 parts only are valid - check contents of each part + if {[llength $hms_cparts] == 2} { + lassign $hms_cparts hr min + if {[string length $hr] != 2 || [string length $min] != 2} { + return 0 + } + if {$hr > 23 || $min > 59} { + return 0 + } + } elseif {[llength $hms_cparts] == 3} { + lassign $hms_cparts hr min sec + if {[string length $hr] != 2 || [string length $min] != 2 || [string length $sec] !=2} { + return 0 + } + #possible for sec to be 60 - leap second RFC 3339 + if {$hr > 23 || $min > 59 || $sec > 60} { + return 0 + } + } else { + return 0 + } + return 1 + } + proc is_timepart {str} { + #validate the part after the T (or space) + #we receive only that trailing part here. + + #odt1 = 1979-05-27T07:32:00Z + #odt2 = 1979-05-27T00:32:00-07:00 + #odt3 = 1979-05-27T00:32:00.5-07:00 + #odt4 = 1979-05-27T00:32:00.999999-07:00 + + set numchars [tcl::string::length $str] + #timepart can have negative or positive offsets so - and + must be accepted + if {[regexp -all {[zZt0-9\-\+\.:]} $str] == $numchars} { + #todo + #basic check that we have leading 2dig hr and 2dig min separated by colon + if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}[^0-9]{1}.*$} $str]} { + #nn:nn or nn:nnX.* where X is non digit + return 0 + } + set dotparts [split $str .] + if {[llength $dotparts] ni {1 2}} { + return 0 + } + if {[llength $dotparts] == 2} { + lassign $dotparts hms tail + #validate tail - which might have +- offset + if {[string index $tail end] ni {z Z}} { + #from hh:mm:??. + #check for +/- something + if {[regexp {(.*)[+-](.*)} $tail _match fraction offset]} { + if {![string is digit -strict $fraction]} { + return 0 + } + if {![_is_hms_or_hm_time $offset]} { + return 0 + } + } + } else { + set tail [string range $tail 0 end-1] + #expect tail nnnn (from hh:mm::ss.nnnnZ) + #had a dot and a zZ - no other offset valid (?) + if {![string is digit -strict $tail]} { + return 0 + } + } + + } else { + #no dot (fraction of second) + if {[regexp {(.*)[+-](.*)} $str _match hms offset]} { + #validate offset + if {![_is_hms_or_hm_time $offset]} { + return 0 + } + } else { + set hms $str + set offset "" + #trim a *single* z or Z off hms if present - multiple should error later + if {[string index $hms end] in {z Z}} { + set hms [string range $hms 0 end-1] + } + } + } + #hms is allowed in toml to be hh:mm:ss or hh:mm + #validate we have hh:mm:ss or hh:mm - exactly 2 digits each + if {![_is_hms_or_hm_time $hms]} { + return 0 + } + + return 1 + } else { + return 0 + } + } + proc is_localtime {str} { + #time of day without any relation to a specific day or any offset or timezone + set numchars [tcl::string::length $str] + if {[regexp -all {[0-9\.:]} $str] == $numchars} { + #todo + if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}:[0-9]{2}([.][0-9]+){0,1}$} $str]} { + #hh:mm or hh:mm:ss or hh:mm::ss.nnn + return 0 + } + set dotparts [split $str .] + if {[llength $dotparts] ni {1 2}} { + return 0 + } + if {[llength $dotparts] == 2} { + lassign $dotparts hms _tail + #validate tail - just fractional seconds - regex has confirmed at least one digit and only digits + #nothing todo? max length? + } else { + #no fractional seconds + set hms $str + } + if {![_is_hms_or_hm_time $hms]} { + return 0 + } + return 1 + } else { + return 0 + } + } + + #review + proc is_datetime {str} { + #Essentially RFC3339 formatted date-time - but: + #1) allowing seconds to be omitted (:00 assumed) + #2) T may be replaced with a single space character TODO - parser support for space in datetime! + # (RFC 3339 allows space instead of T also - but doesn't specify it *must* be a single space) + + #toml-lint @2025-04 doesn't accept t for T or z for Z - but RFC3339 does + #toml spec doesn't clarify - we will accept + + #e.g 1979-05-27 + #e.g 1979-05-27T00:32:00Z + #e.g 1979-05-27 00:32:00-07:00 + #e.g 1979-05-27 00:32:00+10:00 + #e.g 1979-05-27 00:32:00.999999-07:00 + + #review + #minimal datetimes? + # 2024 not ok - 2024T not accepted by tomlint why? + # 02:00 ok + # 02:00:00.5 ok + # 1:00 - not ok - RFC3339 requires 2-digit hr,min,sec + + #toml-lint.com accepts 2025-01 + + if {[string length $str] < 5} { + return 0 + } + + set matches [regexp -all {[zZtT0-9\ \-\+\.:]} $str] + if {[tcl::string::length $str] == $matches} { + #all characters in legal range + if {[regexp -all {\ } $str] > 1} { + #only a single space is allowed. + return 0 + } + #If we get a space - it is only valid as a convience to represent the T separator + #we can normalize by converting to T here before more tests + set str [string map {" " T t T} $str] + #a further sanity check on T + if {[regexp -all {T} $str] > 1} { + return 0 + } + + #!todo - use full RFC 3339 parser? + #!todo - what if the value is 'time only'? + + if {[string first T $str] > -1} { + lassign [split $str T] datepart timepart + if {![is_datepart $datepart]} { + return 0 + } + if {![is_timepart $timepart]} { + return 0 + } + } else { + #either a datepart or a localtime + #spec: "If you include only the time portion of an RFC 3339 formatted date-time, it will represent that time of day + # without any relation to a specific day or any offset or timezone." + if {!([is_datepart $str] || [is_localtime $str])} { + return 0 + } + } + + + #Tcl's free-form clock scan (no -format option) is deprecated + # + #if {[catch {clock scan $datepart} err]} { + # puts stderr "tcl clock scan failed err:'$err'" + # return 0 + #} + + } else { + return 0 + } + return 1 + } + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::utils ---}] +} + +namespace eval tomlish::parse { + #*** !doctools + #[subsection {Namespace tomlish::parse}] + #[para] + #[list_begin definitions] + + #This is a somewhat curly mix of a statemachine and toml-nesting-stack littered with special cases. + #The code is a pig's-nest - but it should be noted that for example trailing single double quotes in multiline strings are perhaps not so trivial to parse using more standard methods either: + # - e.g some kind of backtracking required if using an ABNF parser? + #I don't know the precise technical name for this sort of parser; probably something like "Dog's Breakfast" + #More seriously, we don't have distinct lex/parse steps - so it is basically a 'fused lexer' or 'scannerless parser' + + #It is also desirable for this system to be useful in 'interactive' use. review - would a separate lexer make this easier or harder? + + #A possible alternative more structured approach might be to use a PEG (Parsing Expression Grammar) + + + variable is_parsing 0 ;#whether we are in the middle of parsing tomlish text + + variable state + # states: + # table-space, itable-space, array-space + # array-value-expected,keyval-value-expected,itable-keyval-value-expected, keyval-syntax, + # dquoted-key, squoted-key + # string-state, literal-state, multistring... + # + # notes: + # only the -space states are also 'spaces' ie a container which is pushed/popped on the spacestack + + # + # xxx_value-expected - we also allow for leading whitespace in this state, but once a value is returned we jump to a state based on the containing space. e.g keyval-tail or array-syntax + # + #stateMatrix defines for each state, actions to take for each possible token. + #single-element actions are the name of the next state into which to transition, or a 'POPSPACE' instruction to pop a level off the spacestack and add the data to the parent container. + #dual-element actions are a push instruction and the name of the space to push on the stack. + # - PUSHSPACE is a simple push onto the spacestack, zeropoppushspace also pushes, but will first do a pop *if* the current space level is greater than zero (ie if only if not already in root table-space) + + # -- --- --- --- --- --- + #token/state naming guide + # -- --- --- --- --- --- + #tokens : underscore separated or bare name e.g newline, start_quote, start_squote + #private tokens: always have a leading underscore (These are private 'temporary state' tokens that are never returned as actual tokens e.g _start_squote_sequence + #states : always contain at least one dash e.g err-state, table-space + #instructions + # -- --- --- --- --- --- + + + #stateMatrix dict of elements mapping current state to next state based on returned tokens + # current-state {token-encountered next-state ... } + # where next-state can be a 1 or 2 element list. + #If 2 element - the first item is an instruction (ucase) + #If 1 element - it is either a lowercase dashed state name or an ucase instruction + #e.g {PUSHSPACE } or POPSPACE or SAMESPACE + + + #SAMESPACE - got to same space as parent without popping a level, but has it's own autotransition lookup - strange concept - review usecases + + variable stateMatrix + set stateMatrix [dict create] + #--------------------------------------------------------- + #WARNING + #The stateMatrix implementation here is currently messy. + #The code is a mixture of declarative via the stateMatrix and imperative via switch statements during PUSH/POP/SAMESPACE transitions. + #This means the state behaviour has to be reasoned about by looking at both in conjuction. + #--------------------------------------------------------- + + #xxx-space vs xxx-syntax inadequately documented - TODO + + #review - out of date? + # --------------------------------------------------------------------------------------------------------------# + # incomplete example of some state starting at table-space + # --------------------------------------------------------------------------------------------------------------# + # ( = -> keyval-value-expected) + # keyval-syntax (popped -> keyval-space -> keyval-tail) (autotransition on pop) + # keyval-space (autotransition on push ^) + # table-space (barekey^) (startdquote -> dquoted-key ^) + # --------------------------------------------------------------------------------------------------------------# + + dict set stateMatrix\ + table-space { + bom "table-space"\ + whitespace "table-space"\ + newline "table-space"\ + barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ + squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ + dquotedkey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ + XXXsingle_dquote "quoted-key"\ + XXXsingle_squote "squoted-key"\ + comment "table-space"\ + starttablename "tablename-state"\ + starttablearrayname "tablearrayname-state"\ + enddquote "err-state"\ + endsquote "err-state"\ + comma "err-state"\ + eof "end-state"\ + equal "err-state"\ + cr "err-lonecr"\ + } + + + + dict set stateMatrix\ + keyval-space {\ + whitespace "keyval-syntax"\ + equal "keyval-value-expected"\ + } + + # ' = ' portion of keyval + dict set stateMatrix\ + keyval-syntax {\ + whitespace "keyval-syntax"\ + barekey {PUSHSPACE "dottedkey-space"}\ + squotedkey {PUSHSPACE "dottedkey-space"}\ + dquotedkey {PUSHSPACE "dottedkey-space"}\ + equal "keyval-value-expected"\ + comma "err-state"\ + newline "err-state"\ + eof "err-state"\ + } + #### + dict set stateMatrix\ + keyval-value-expected {\ + whitespace "keyval-value-expected"\ + untyped_value {TOSTATE "keyval-untyped-sequence" note "possible datetime datepart"}\ + literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ + string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ + single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ + single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ + startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ + startarray {PUSHSPACE array-space returnstate keyval-tail}\ + } + #double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"} + + #untyped_value sequences without intervening comma are allowed for datepart timepart + #we will produce tomlish with missing SEPS and to_dict must validate whether 2 adjacent barekeys are valid + dict set stateMatrix\ + keyval-untyped-sequence {\ + whitespace "keyval-untyped-sequence"\ + untyped_value {TOSTATE "keyval-tail"}\ + literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ + string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ + single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ + single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ + startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ + startarray {PUSHSPACE array-space returnstate keyval-tail}\ + newline "POPSPACE"\ + comment "keyval-tail"\ + eof "end-state"\ + } + + #2025 - no leading-squote-space - only trailing-squote-space. + + dict set stateMatrix\ + keyval-tail {\ + whitespace "keyval-tail"\ + newline "POPSPACE"\ + comment "keyval-tail"\ + eof "end-state"\ + } + + + #itable-space/ curly-syntax : itables + # x={y=1,} + dict set stateMatrix\ + itable-space {\ + whitespace "itable-space"\ + newline "itable-space"\ + barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + endinlinetable "POPSPACE"\ + comma "err-state"\ + comment "itable-space"\ + eof "err-state"\ + } + #we don't get single_squote etc here - instead we get the resulting squotedkey token + + + # ??? review - something like this + # + # x={y =1,} + dict set stateMatrix\ + itable-keyval-syntax {\ + whitespace {TOSTATE "itable-keyval-syntax"}\ + barekey {PUSHSPACE "dottedkey-space"}\ + squotedkey {PUSHSPACE "dottedkey-space"}\ + dquotedkey {PUSHSPACE "dottedkey-space"}\ + equal {TOSTATE "itable-keyval-value-expected"}\ + newline "err-state"\ + eof "err-state"\ + } + + # x={y=1} + dict set stateMatrix\ + itable-keyval-space {\ + whitespace "itable-keyval-syntax"\ + equal {TOSTATE "itable-keyval-value-expected" note "required"}\ + } + + dict set stateMatrix\ + itable-keyval-value-expected {\ + whitespace "itable-keyval-value-expected"\ + untyped_value {TOSTATE "itable-val-tail" note ""}\ + single_dquote {TOSTATE "string-state" returnstate itable-val-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ + single_squote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ + startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ + startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ + } + #double_squote not currently generated by _start_squote_sequence - '' processed as single_squote to literal-state just like 'xxx' + # review + # double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"} + + + + # x={y=1,z="x"} + #POPSPACE is transition from itable-keyval-space to parent itable-space + dict set stateMatrix\ + itable-val-tail {\ + whitespace "itable-val-tail"\ + endinlinetable "POPSPACE"\ + comma "POPSPACE"\ + newline {TOSTATE "itable-val-tail" note "itable-space ??"}\ + comment "itable-val-tail"\ + eof "err-state"\ + } + # XXXnewline "POPSPACE" + # We shouldn't popspace on newline - as if there was no comma we need to stay in itable-val-tail + # This means the newline and subsequent whitespace, comments etc become part of the preceeding dottedkey record + #e.g + # x = { + # j=1 + # #comment within dottedkey j record + # , # comment unattached + # #comment unattached + # k=2 , #comment unattached + # l=3 #comment within l record + # , m=4 + # #comment associated with m record + # + # #still associated with m record + # } + ## - This doesn't quite correspond to what a user might expect - but seems like a consistent mechanism. + #The awkwardness is because there is no way to put in a comment that doesn't consume a trailing comma + #so we cant do: j= 1 #comment for j1 , + # and have the trailing comma recognised. + # + # To associate: j= 1, #comment for j1 + # we would need some extra processing . (not popping until next key ? extra state itable-sep-tail?) REVIEW - worth doing? + # + # The same issue occurs with multiline arrays. The most natural assumption is that a comment on same line after a comma + # is 'associated' with the previous entry. + # + # These comment issues are independent of the data dictionary being generated for conversion to json etc - as the comments don't carry through anyway, + # but are a potential oddity for manipulating the intermediate tomlish structure whilst attempting to preserve 'associated' comments + # (e.g reordering records within an itable) + #The user's intention for 'associated' isn't always clear and the specs don't really guide on this. + + + #dottedkey-space is not (currently) used within [tablename] or [[tablearrayname]] + #it is for keyval ie x.y.z = value + + #this is the state after dot + #we are expecting a complete key token or whitespace + #(initial entry to the space is by one of the keys - which will immediately go to dottedkey-space-tail) + dict set stateMatrix\ + dottedkey-space {\ + whitespace "dottedkey-space"\ + dotsep "err-state"\ + barekey "dottedkey-space-tail"\ + squotedkey "dottedkey-space-tail"\ + dquotedkey "dottedkey-space-tail"\ + newline "err-state"\ + comma "err-state"\ + comment "err-state"\ + equal "err-state"\ + } + + #dottedkeyend "POPSPACE" + #equal "POPSPACE"\ + + + #jmn 2025 + #we have 1 or more dottedkeys so far - need dotsep to add more, whitespace to maintain, equal to pop + dict set stateMatrix\ + dottedkey-space-tail {\ + whitespace "dottedkey-space-tail" + dotsep "dottedkey-space" + equal "POPSPACE"\ + eof "err-state"\ + newline "err-state"\ + } + + #-------------------------------------------------------------------------- + #scratch area + #from_toml {x=1} + # barekey tok + # table-space PUSHSPACE keyval-space state keyval-syntax + # + + + #-------------------------------------------------------------------------- + + + #REVIEW + #toml spec looks like heading towards allowing newlines within inline tables + #https://github.com/toml-lang/toml/issues/781 + + #2025 - multiline itables appear to be valid for 1.1 - which we are targeting. + #https://github.com/toml-lang/toml/blob/main/toml.md#inline-table + + #JMN2025 + #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES + #We currently allow multiline ITABLES (also with comments) in the tokenizer. + #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? + + + #JMN REVIEW + #dict set stateMatrix\ + # array-space {\ + # whitespace "array-space"\ + # newline "array-space"\ + # untyped_value "SAMESPACE"\ + # startarray {PUSHSPACE "array-space"}\ + # endarray "POPSPACE"\ + # startinlinetable {PUSHSPACE itable-space}\ + # single_dquote "string-state"\ + # single_squote "literal-state"\ + # triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ + # comma "array-space"\ + # comment "array-space"\ + # eof "err-state-array-space-got-eof"\ + # } + + ## array-space ## + set aspace [dict create] + dict set aspace whitespace "array-space" + dict set aspace newline "array-space" + #dict set aspace untyped_value "SAMESPACE" + dict set aspace untyped_value "array-syntax" + dict set aspace startarray {PUSHSPACE "array-space"} + dict set aspace endarray "POPSPACE" + dict set aspace single_dquote {TOSTATE "string-state" returnstate array-syntax} + dict set aspace triple_dquote {PUSHSPACE "multistring-space" returnstate array-syntax} + dict set aspace single_squote {TOSTATE "literal-state" returnstate array-syntax} + dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax} + dict set aspace startinlinetable {PUSHSPACE itable-space} + #dict set aspace comma "array-space" + dict set aspace comment "array-space" + dict set aspace eof "err-state-array-space-got-eof" + dict set stateMatrix array-space $aspace + + #when we pop from an inner array we get to array-syntax + #e.g {x=[[]] ??? + set tarntail [dict create] + dict set tarntail whitespace "err-state" ;#"tablearrayname-tail" ;#spec doesn't allow whitespace here + dict set tarntail newline "err-state" + dict set tarntail comment "err-state" + dict set tarntail eof "err-state" + dict set tarntail endtablename "tablearray-tail" + dict set stateMatrix tablearrayname-tail $tarntail + + #review - somewhat counterintuitive...? + # [(starttablearrayname) (endtablearrayname] + # [(starttablename) (endtablename)] + + # [[xxx]] ??? + set tartail [dict create] + dict set tartail whitespace "tablearray-tail" + dict set tartail newline "table-space" + dict set tartail comment "tablearray-tail" + dict set tartail eof "end-state" + dict set stateMatrix tablearray-tail $tartail + + + + + + + dict set stateMatrix\ + end-state {} + + set knowntokens [list] + set knownstates [list] + dict for {state transitions} $stateMatrix { + if {$state ni $knownstates} {lappend knownstates $state} + dict for {tok instructions} $transitions { + if {$tok ni $knowntokens} {lappend knowntokens $tok} + } + } + dict set stateMatrix nostate {} + foreach tok $knowntokens { + dict set stateMatrix nostate $tok "err-nostate-received-token-$tok" + } + + + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + #purpose - debugging? remove? + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + #build a list of 'push triggers' from the stateMatrix + # ie tokens which can push a new space onto spacestack + set push_trigger_tokens [list] + tcl::dict::for {s transitions} $stateMatrix { + tcl::dict::for {token transition_to} $transitions { + set instruction [lindex $transition_to 0] + switch -exact -- $instruction { + PUSHSPACE - zeropoppushspace { + if {$token ni $push_trigger_tokens} { + lappend push_trigger_tokens $token + } + } + } + } + } + ::tomlish::log::debug "push_trigger_tokens: $push_trigger_tokens" + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + + + + #This seems hacky... (deprecate in favour of explicit arguments to the instructions in stateMatrix?) + #spacePopTransitions, spacePushTransitions, spaceSameTransitions below for auto state redirections on POPSPACE,PUSHSPACE,SAMESPACE + + #mainly for the -space states: + #redirect to another state $c based on a state transition from $whatever to $b + # e.g "string {array-space array-syntax}" means when transitioning from string to array-space, jump to array-syntax instead. + #this is useful as we often don't know state $b. e.g when it is decided by 'POPSPACE' + + #use dict set to add values so we can easily add/remove/comment lines + + #Push to, next + #default first states when we push to these spaces + variable spacePushTransitions [dict create] + dict set spacePushTransitions keyval-space keyval-syntax + dict set spacePushTransitions itable-keyval-space itable-keyval-syntax + dict set spacePushTransitions array-space array-space + dict set spacePushTransitions table-space tablename-state + #dict set spacePushTransitions #itable-space itable-space + + #Pop to, next + variable spacePopTransitions [dict create] + dict set spacePopTransitions array-space array-syntax + + + #itable-keyval-space itable-val-tail + #review + #we pop to keyval-space from dottedkey-space or from keyval-value-expected? we don't always want to go to keyval-tail + #leave it out and make the POPSPACE caller explicitly specify it + #keyval-space keyval-tail + + variable spaceSameTransitions [dict create] + #JMN test + #dict set spaceSameTransitions array-space array-syntax + + #itable-keyval-space itable-val-tail + + + variable state_list ;#reset every tomlish::decode::toml + + namespace export tomlish toml + namespace ensemble create + + #goNextState has various side-effects e.g pushes and pops spacestack + #REVIEW - setting nest and v elements here is ugly + #todo - make neater, more single-purpose? + proc goNextState {tokentype tok currentstate} { + variable state + variable nest + variable v + + set prevstate $currentstate + + + variable spacePopTransitions + variable spacePushTransitions + variable spaceSameTransitions + + variable last_space_action "none" + variable last_space_type "none" + variable state_list + + set result "" + set starttok "" + + if {[dict exists $::tomlish::parse::stateMatrix $currentstate $tokentype]} { + set transition_to [dict get $::tomlish::parse::stateMatrix $currentstate $tokentype] + ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" + switch -exact -- [lindex $transition_to 0] { + POPSPACE { + set popfromspace_info [spacestack peek] + set popfromspace_state [dict get $popfromspace_info state] + spacestack pop + set parent_info [spacestack peek] + set type [dict get $parent_info type] + set parentspace [dict get $parent_info state] + + set last_space_action "pop" + set last_space_type $type + + if {[dict exists $parent_info returnstate]} { + set next [dict get $parent_info returnstate] + #clear the returnstate on current level + set existing [spacestack pop] + dict unset existing returnstate + spacestack push $existing ;#re-push modification + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected to stored returnstate $next <<---" + } else { + ### + #review - do away with spacePopTransitions - which although useful to provide a default.. + # - involve error-prone configurations distant to the main state transition configuration in stateMatrix + if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { + set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" + } else { + set next $parentspace + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace<<---" + } + } + set result $next + } + SAMESPACE { + set currentspace_info [spacestack peek] + ::tomlish::log::debug "--->> SAMESPACE got current space entry: $currentspace_info <<<<<" + set type [dict get $currentspace_info type] + set currentspace [dict get $currentspace_info state] + + if {[dict exists $currentspace_info returnstate]} { + set next [dict get $currentspace_info returnstate] + #clear the returnstate on current level + set existing [spacestack pop] + dict unset existing returnstate + spacestack push $existing ;#re-push modification + ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected to stored returnstate $next" + } else { + if {[dict exists $::tomlish::parse::spaceSameTransitions $currentspace]} { + set next [dict get $::tomlish::parse::spaceSameTransitions $currentspace] + ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected state to $next (spaceSameTransitions)" + } else { + set next $currentspace + ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace" + } + } + set result $next + } + zeropoppushspace { + if {$nest > 0} { + #pop back down to the root level (table-space) + spacestack pop + set parentinfo [spacestack peek] + set type [dict get $parentinfo type] + set target [dict get $parentinfo state] + + set last_space_action "pop" + set last_space_type $type + + #----- + #standard pop + set parentlevel [expr {$nest -1}] + lappend v($parentlevel) [set v($nest)] + incr nest -1 + #----- + } + #re-entrancy + + #set next [list PUSHSPACE [lindex $transition_to 1]] + set nexttokentype ${tokentype}2 ;#fake token type e.g tablename2 or tablearrayname2 + ::tomlish::log::debug "--->> zeropoppushspace goNextState RECURSE. calling goNextState $nexttokentype $currentstate" + set transition_info [::tomlish::parse::goNextState $nexttokentype $tok $currentstate] + set result [dict get $transition_info newstate] + } + PUSHSPACE { + set original_target [dict get $transition_to PUSHSPACE] + if {[dict exists $transition_to returnstate]} { + #adjust the existing space record on the stack. + #struct::stack doesn't really support that - so we have to pop and re-push + #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack + set currentspace [spacestack pop] + dict set currentspace returnstate [dict get $transition_to returnstate] + spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. + } + if {[dict exists $transition_to starttok]} { + set starttok [dict get $transition_to starttok] + } + spacestack push [dict create type space state $original_target] + + set last_space_action "push" + set last_space_type "space" + + if {[dict exists $transition_to state]} { + #an explicit state in the pushed space was requested in the stateMatrix - override the spacePushTransition (spacePushTransitions can be deprecated if we require explicitness?) + set next [dict get $transition_to state] + ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next by explicit 'state' entry" + } else { + #puts $::tomlish::parse::spacePushTransitions + if {[dict exists $::tomlish::parse::spacePushTransitions $original_target]} { + set next [dict get $::tomlish::parse::spacePushTransitions $original_target] + ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next (spacePushTransitions) " + } else { + set next $original_target + ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target" + } + } + set result $next + } + TOSTATE { + if {[dict exists $transition_to returnstate]} { + #adjust the existing space record on the stack. + #struct::stack doesn't really support that - so we have to pop and re-push + #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack + set currentspace [spacestack pop] + dict set currentspace returnstate [dict get $transition_to returnstate] + spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. + } + set result [dict get $transition_to TOSTATE] + } + default { + #simplified version of TOSTATE + set result [lindex $transition_to 0] ;#ignore everything but first word + } + } + } else { + ::tomlish::log::error "--->> No state transition defined from state $currentstate when tokentype $tokentype received" + set result "nostate" + } + lappend state_list [list tokentype $tokentype from $currentstate to $result] + set state $result + ::tomlish::log::notice "--->> STATE TRANSITION tokenType: '$tokentype' tok:$tok triggering '$currentstate' -> '$result' last_space_action:$last_space_action " + return [dict create prevstate $prevstate newstate $result space_action $last_space_action starttok $starttok] + } + + proc report_line {{line ""}} { + variable linenum + variable is_parsing + if {$is_parsing} { + if {$line eq ""} { + set line $linenum + } + return "Line Number: $line" + } else { + #not in the middle of parsing tomlish text - return nothing. + return "" + } + } + + #produce a *slightly* more readable string rep of the nest for puts etc. + proc nest_pretty1 {list} { + set prettier "{" + + foreach el $list { + if { [lindex $el 0] eq "NEWLINE"} { + append prettier "[list $el]\n" + } elseif {([llength $el] > 1) && ([lindex $el 0] in {KEY DQKEY SQKEY TABLE ARRAY})} { + append prettier [nest_pretty1 $el] + } else { + append prettier "[list $el] " + } + } + append prettier "}" + return $prettier + } + + proc set_tokenType {t} { + variable tokenType + variable tokenType_list + if {![info exists tokenType]} { + set tokenType "" + } + lappend tokenType_list $t + set tokenType $t + } + + proc switch_tokenType {t} { + variable tokenType + variable tokenType_list + lset tokenType_list end $t + set tokenType $t + } + + proc get_tokenType {} { + variable tokenType + return $tokenType + } + + + proc get_token_waiting {} { + variable token_waiting + return [lindex $token_waiting 0] + } + proc clear_token_waiting {} { + variable token_waiting + set token_waiting [list] + } + + #token_waiting is a list - but our standard case is to have only one + #in certain circumstances such as near eof we may have 2 + #the set_token_waiting function only allows setting when there is not already one waiting. + #we want to catch cases of inadvertently trying to set multiple + # - the reason being that the state transition triggered by the previous token may have invalidated the assumptions made when a token was added as waiting. + proc set_token_waiting {args} { + if {[llength $args] %2 != 0} { + error "tomlish set_token_waiting must have args of form: type value complete 0|1" + } + variable token_waiting + + if {[llength $token_waiting] && [dict get [lindex $token_waiting end] type] ne "eof"} { + #tokloop already set a token_waiting - but something (post tokloop processing?) is trying to set another + #we may need to remove the existing token_waiting and reset the tokloop index to the previous char so it's reprocessed in the possibly new context + #rather than attempt to make the right decision here - we raise an error and require the caller to check/handle it + set err "tomlish set_token_waiting already has token_waiting: [lindex $token_waiting 0]" + append err \n " - cannot add token_waiting: $args" + error $err + #set tomlish::parse::i [expr {[dict get $token_waiting startindex] -1}] + #set token_waiting [list] + } + + set waiting [dict create] + dict for {k v} $args { + switch -exact $k { + type - complete { + dict set waiting $k $v + } + value { + dict set waiting tok $v + } + startindex { + dict set waiting startindex $v + } + default { + error "tomlish set_token_waiting error - unrecognised key $k. known keys: [dict keys $args]" + } + } + } + if {![tcl::string::is boolean -strict [dict get $waiting complete]]} { + error "tomlish set_token_waiting error - 'complete' must be a boolean. got [dict get $waiting complete]" + } + if {![llength $token_waiting]} { + set token_waiting [list $waiting] + } else { + #an extra sanity-check that we don't have more than just the eof.. + if {[llength $token_waiting] > 1} { + set err "tomlish Unexpected. Existing token_waiting count > 1.\n" + foreach tw $token_waiting { + append err " $tw" \n + } + append err " - cannot add token_waiting: $waiting" + error $err + } + #last entry must be a waiting eof + set token_waiting [list $waiting [lindex $token_waiting end]] + } + return + } + + #returns 0 or 1 + #tomlish::parse::tok + #we attempt to do this without lookahead (potential use in streaming toml? for what benefit?) todo -final flag + # - the possible benefit is being able to more easily process in arbitrarily split chunks (although we would still have to watch crlf splitting ?) + # - interactive use? + + proc tok {} { + variable nest + variable s + variable v + variable i + variable tok + variable type ;#character type + variable state ;#FSM + + + variable tokenType + variable tokenType_list + + + variable endToken + + variable lastChar + + variable braceCount + variable bracketCount + + + #------------------------------ + #Previous run found another (presumably single-char) token + #The normal case is for there to be only one dict in the list + #multiple is an exception - primarily for eof + variable token_waiting + if {[llength $token_waiting]} { + set waiting [lindex $token_waiting 0] + + set tokenType [dict get $waiting type] + set tok [dict get $waiting tok] + #todo: dict get $token_waiting complete + set token_waiting [lrange $token_waiting 1 end] + return 1 + } + #------------------------------ + + set resultlist [list] + set sLen [tcl::string::length $s] + + set slash_active 0 + set quote 0 + set c "" + for {} {$i < $sLen} {} { + if {$i > 0} { + set lastChar [tcl::string::index $s [expr {$i - 1}]] + set start_of_data h + } else { + set lastChar "" + set start_of_data 1 + #bom-handling + if {[tcl::string::index $s 0] eq "\uFEFF"} { + #bom (could be from various encodings - now decoded as single unicode char FEFF) + #incr i 1 ;#skip over initial bom? + } + } + + + set c [tcl::string::index $s $i] + set cindex $i + + set ctest [tcl::string::map {\{ lc \} rc \[ lb \] rb \" dq ' sq \\ bsl \r cr \n lf \t tab \uFEFF bom} $c] + tomlish::log::debug "- tokloop char <$ctest> index $i tokenType:$tokenType tok:<$tok>" + #puts "got char $c during tokenType '$tokenType'" + incr i ;#must incr here because we do returns inside the loop + + + + switch -exact -- $ctest { + # { + set had_slash $slash_active + set slash_active 0 + + if {$had_slash} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + #for multiliteral, multistring - data and/or end + incr i -1 + return 1 + } + _start_squote_sequence { + #pseudo token beginning with underscore - never returned to state machine - review + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + barekey { + error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed (see tomlish::utils::is_barekey). [tomlish::parse::report_line]" + } + whitespace { + # hash marks end of whitespace token + #do a return for the whitespace, set token_waiting + #set_token_waiting type comment value "" complete 1 + incr i -1 ;#leave comment for next run + return 1 + } + untyped_value { + #REVIEW! the spec isn't clear.. is whitespace after an int,bool etc required before comment? + #we will accept a comment marker as an immediate terminator of the untyped_value. + incr i -1 + return 1 + } + starttablename - starttablearrayname { + #fix? + error "tomlish Character '#' is invalid first character for $tokenType. [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #invalid in bare parts - but allowed in quoted parts - let tablename parser sort it out + append tok $c + } + default { + #dquotedkey, string,literal, multistring + append tok $c + } + } + } else { + switch -- $state { + multistring-space { + set_tokenType stringpart + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "#" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "#" + } + default { + #start of token if we're not in a token + set_tokenType comment + set tok "" ;#The hash is not part of the comment data + } + } + } + } + lc { + #left curly brace + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i [tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - starttablearrayname { + #*bare* tablename can only contain letters,digits underscores + error "tomlish Invalid tablename first character \{ [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #valid in quoted parts + append tok $c + } + comment { + if {$had_slash} {append tok "\\"} + append tok "\{" + } + default { + #end any other token. + incr i -1 + return 1 + } + } + } else { + switch -exact -- $state { + itable-keyval-value-expected - keyval-value-expected { + #switch last key to tablename?? + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + array-space - array-syntax { + #nested anonymous inline table + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\{" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\{" + } + default { + error "tomlish state: '$state'. left brace case not implemented [tomlish::parse::report_line]" + } + } + } + + } + rc { + #right curly brace + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey - comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - tablename { + if {$had_slash} {append tok "\\"} + #invalid! - but leave for datastructure loading stage to catch + set_token_waiting type endinlinetable value "" complete 1 startindex $cindex + return 1 + } + starttablearrayname - tablearrayname { + if {$had_slash} {append tok "\\"} + #invalid! - but leave for datastructure loading stage to catch + set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex + return 1 + } + default { + #end any other token + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + itable-space { + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + tablename-state { + #e.g [] - empty tablename - allowed or not? + #empty tablename/tablearrayname ? + #error "unexpected tablename problem" + + set_tokenType "endinlinetable" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-state { + error "tomlish unexpected tablearrayname-state problem" + set_tokenType "endinlinetable" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + array-syntax - array-space { + #invalid + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + itable-val-tail { + set_tokenType "endinlinetable" + set tok "" + #we need to pop the keyval - and then reprocess to pop the inlinetable - so we incr -1 + incr i -1 + return 1 + } + itable-keyval-syntax { + error "tomlish endinlinetable unexpected at this point. Expecting key=val syntax [tomlish::parse::report_line]" + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\}" + } + multiliteral-space { + set_tokenType "literalpart" ; #review + set tok "\}" + } + default { + #JMN2024b keyval-tail? + error "tomlish state '$state'. endinlinetable case not implemented [tomlish::parse::report_line]" + } + } + } + + } + lb { + #left square bracket + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename { + #change the tokenType + switch_tokenType "starttablearrayname" + set tok "" ;#no output into the tomlish list for this token + #any following whitespace is part of the tablearrayname, so return now + return 1 + } + tablename - tablearrayname { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + #append tok "\\[" + append tok {\[} + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + #invalid at this point - state machine should disallow: + # table -> starttablearrayname + # tablearray -> starttablearrayname + set_token_waiting type starttablearrayname value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "\[" + } + } + } + comment { + if {$had_slash} {append tok "\\"} + append tok "\[" + } + default { + #end any other token. + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + keyval-value-expected - itable-keyval-value-expected { + set_tokenType "startarray" + set tok "\[" + return 1 + } + array-space - array-syntax { + #nested array? + set_tokenType "startarray" + set tok "\[" + return 1 + #error "state: array-space. startarray case not implemented [tomlish::parse::report_line]" + } + table-space { + #table name + #assume it's a single bracket - but we need to wait for non-bracket to confirm it's not a tablearray + #note that a starttablearrayname token may contain whitespace between the brackets + # e.g \[ \[ + set_tokenType "starttablename" + set tok "" ;#there is no output into the tomlish list for this token + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\[" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\[" + } + itable-space { + #handle state just to give specific error msg + error "tomlish state: '$state'. Left square bracket invalid. Cannot start array in inline table without key. Use key=\[\] syntax. [tomlish::parse::report_line]" + } + default { + error "tomlish state: '$state'. startarray case not implemented [tomlish::parse::report_line]" + } + } + } + } + rb { + #right square bracket + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + #???? + incr i -1 + if {$had_slash} {incr i -1} ;#reprocess + return 1 + } else { + incr i -1 + if {$had_slash} {incr i -1} ;#reprocess + return 1 + } + } + starttablename { + #toml-test invalid/table/empty + + set_token_waiting type tablename value "" complete 1 startindex $cindex + incr i -1 + return 1 + } + tablename { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + append tok "\\]" + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + set_token_waiting type endtablename value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "]" + } + } + } + tablearrayname { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + append tok "\\]" + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "]" + } + } + } + default { + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + array-syntax - array-space { + #invalid - but allow parser statemachine to report it. + set_tokenType "endarray" + set tok "\]" + return 1 + } + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "endarray" + set tok "\]" + return 1 + } + tablename-state { + #e.g [] - empty tablename + #tomltest 1.1.0 invalid/table/empty + #should be invalid + #we parse it and let dict::from_tomlish error when it tries to split table + + set_tokenType "endtablename" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-state { + error "tomlish unexpected tablearrayname problem" + set_tokenType "endtablearray" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-tail { + #[[xxx] + set_tokenType "endtablename" + #sequence: starttablename -> starttablearrayname -> endtablearrayname -> endtablename + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\]" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\]" + } + default { + error "tomlish state '$state'. endarray case not implemented [tomlish::parse::report_line]" + } + } + } + } + bsl { + #backslash + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + if {$state eq "multistring-space"} { + #end whitespace token + incr i -1 ;#reprocess bsl in next run + return 1 + } else { + error "tomlish Unexpected backslash during whitespace. [tomlish::parse::report_line]" + } + } + literal - literalpart - squotedkey { + #never need to set slash_active true when in single quoted tokens + append tok "\\" + set slash_active 0 + } + string - dquotedkey - comment { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + stringpart { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + starttablename - starttablearrayname { + error "tomlish backslash is invalid as first character of $tokenType [tomlish::parse::report_line]" + } + tablename - tablearrayname { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + barekey { + error "tomlish Unexpected backslash during barekey. [tomlish::parse::report_line]" + } + default { + error "tomlish Backslash unexpected during tokentype: '$tokenType'. [tomlish::parse::report_line]" + } + } + } else { + switch -exact -- $state { + multistring-space { + if {$slash_active} { + set_tokenType "stringpart" + set tok "\\\\" + set slash_active 0 + } else { + set slash_active 1 + } + } + multiliteral-space { + #nothing can be escaped in multiliteral-space - not even squotes (?) review + set_tokenType "literalpart" + set tok "\\" + } + default { + error "tomlish tok error: Unexpected backslash when no token is active. [tomlish::parse::report_line]" + } + } + } + } + sq { + #single quote + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote { + #for within multiliteral + #short tentative_accum_squote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_squote is multisquote + 2 = 5 + #return tok with value ''''' + return 1 + } + } + tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multiliteral + #switch? + append tok $c + set_tokenType triple_squote + return 1 + } + default { + #if there are more than 3 leading squotes we also enter multiliteral space and the subsequent ones are handled + #by the tentative_accum_squote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 squotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" + } + } + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + #end whitespace + incr i -1 ;#reprocess sq + return 1 + } + literal { + #slash_active always false + #terminate the literal + set_token_waiting type endsquote value "'" complete 1 startindex $cindex + return 1 + } + literalpart { + #ended by ''' - but final could be '''' or ''''' (up to 2 squotes allowed directly before ending triple squote sequence) + #todo + # idea: end this literalpart (possibly 'temporarily') + # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack + # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) + incr i -1 ;#throw the "'" back to loop - will be added to a tentative_accum_squote token for later processing + return 1 + } + XXXitablesquotedkey { + set_token_waiting type endsquote value "'" complete 1 startindex $cindex + return 1 + } + squotedkey { + ### + #set_token_waiting type endsquote value "'" complete 1 + return 1 + } + starttablename - starttablearrayname { + #!!! + incr i -1 + return 1 + } + tablename - tablearrayname { + append tok $c + } + barekey { + #barekeys now support all sorts of unicode letter/number chars for other cultures + #but not punctuation - not even for those of Irish heritage who don't object + #to the anglicised form of some names. + # o'shenanigan seems to not be a legal barekey + #The Irish will have to use an earlier form Ó - which apparently many may prefer anyway. + error "tomlish Unexpected single quote during barekey. [tomlish::parse::report_line]" + } + default { + append tok $c + } + } + } else { + switch -exact -- $state { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading squote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_squote token or triple_squote token + #It currently doesn't trigger double_squote token + #(handle '' same as 'x' ie produce a single_squote and go into processing literal) + #review - producing double_squote for empty literal may be slightly more efficient. + #This token is not used to handle squote sequences *within* a multiliteral + set_tokenType "_start_squote_sequence" + set tok "'" + } + multiliteral-space { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_squote" ;#trigger tentative_accum_squote + set tok "'" + return 1 + } + table-space - itable-space { + #tests: squotedkey.test squotedkey_itable.test + set_tokenType "squotedkey" + set tok "" + } + XXXtable-space - XXXitable-space { + #future - could there be multiline keys? MLLKEY, MLBKEY ? + #this would (almost) allow arbitrary tcl dicts to be stored in toml (aside from escaping issues) + #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files + #@2025 ABNF for toml mentions key, simple-key, unquoted-key, quoted-key and dotted-key + #where key is simple-key or dotted-key - no MLL or MLB components + #the spec states solution for arbitrary binary data is application specific involving encodings + #such as hex, base64 + set_tokenType "_start_squote_sequence" + set tok "'" + return 1 + } + tablename-state { + #first char in tablename-state/tablearrayname-state + set_tokenType "tablename" + append tok "'" + } + tablearrayname-state { + set_tokenType "tablearrayname" + append tok "'" + } + literal-state { + #shouldn't get here? review + tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" + set_tokenType "literal" + incr -1 + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "," + #error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" + } + dottedkey-space { + set_tokenType "squotedkey" + } + default { + error "tomlish unhandled squote during state '$state'. [tomlish::parse::report_line]" + } + } + } + + } + dq { + #double quote + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #review + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + tentative_accum_dquote { + #within multistring + #short tentative_accum_dquote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_dquote is multidquote + 2 = 5 + #return tok with value """"" + return 1 + } + } + _start_dquote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multistring + #switch? + append tok $c + set_tokenType triple_dquote + return 1 + } + default { + #if there are more than 3 leading dquotes we also enter multistring space and the subsequent ones are handled + #by the tentative_accum_dquote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 dquotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_dquote_sequence'" + } + } + } + literal - literalpart { + append tok $c + } + string { + if {$had_slash} { + append tok "\\" $c + } else { + #unescaped quote always terminates a string + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex + return 1 + } + } + stringpart { + #sub element of multistring + if {$had_slash} { + append tok "\\" $c + } else { + incr i -1 ;#throw the {"} back to loop - will be added to a tentative_accum_dquote token for later processing + return 1 + } + } + whitespace { + #assert: had_slash will only ever be true in multistring-space + if {$had_slash} { + incr i -2 + return 1 + } else { + #end whitespace token - throw dq back for reprocessing + incr i -1 + return 1 + } + } + comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + XXXdquotedkey { + if {$had_slash} { + append tok "\\" + append tok $c + } else { + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex + return 1 + } + } + dquotedkey { + ### + if {$had_slash} { + append tok "\\" + append tok $c + } else { + #set_token_waiting type enddquote value {"} complete 1 + return 1 + } + } + squotedkey { + append tok $c + } + tablename - tablearrayname { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - starttablearrayname { + incr i -1 ;## + return 1 + } + default { + error "tomlish got quote during tokenType '$tokenType' [tomlish::parse::report_line]" + } + } + } else { + #$slash_active not relevant when no tokenType + #token is string only if we're expecting a value at this point + switch -exact -- $state { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading dquote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_dquote token or triple_dquote token + #It currently doesn't trigger double_dquote token + #(handle "" same as "x" ie produce a single_dquote and go into processing string) + #review - producing double_dquote for empty string may be slightly more efficient. + #This token is not used to handle dquote sequences once *within* a multistring + set_tokenType "_start_dquote_sequence" + set tok {"} + } + multistring-space { + if {$had_slash} { + set_tokenType "stringpart" + set tok "\\\"" + } else { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_dquote" ;#trigger tentative_accum_dquote + set tok {"} + return 1 + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\"" + } + table-space - itable-space { + set_tokenType "dquotedkey" + set tok "" + } + dottedkey-space { + set_tokenType dquotedkey + set tok "" + + #only if complex keys become a thing + #set_tokenType dquote_seq_begin + #set tok $c + } + tablename-state { + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } + default { + error "tomlish Unexpected dquote during state '$state' [tomlish::parse::report_line]" + } + } + } + } + = { + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #review + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + #assertion had_slash 0 + append tok $c + } + string - comment - dquotedkey { + #for these tokenTypes an = is just data. + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + incr i -1 + return 1 + } else { + set_token_waiting type equal value = complete 1 startindex $cindex + return 1 + } + } + barekey { + #set_token_waiting type equal value = complete 1 + incr i -1 + return 1 + } + starttablename - starttablearrayname { + error "tomlish Character '=' is invalid first character for $tokenType. [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #invalid in bare name - but valid in quoted parts - leave for tablename parser to sort out + append tok $c + } + default { + error "tomlish unexpected = character during tokentype $tokenType. case not implemented. [tomlish::parse::report_line]" + } + } + } else { + switch -exact -- $state { + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok = + } + multiliteral-space { + set_tokenType "literalpart" + set tok "=" + } + dottedkey-space { + set_tokenType "equal" + set tok "=" + return 1 + } + default { + set_tokenType "equal" + set tok = + return 1 + } + } + } + } + cr { + #REVIEW! + # \r carriage return + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #we have received a double cr + ::tomlish::log::warn "double cr - will generate cr token. needs testing" + set_tokenType "cr" ;#lone cr token will generally raise an error - but let state machine handle it + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal { + append tok $c + } + literalpart { + #part of MLL string (multi-line literal string) + #we need to split out crlf as a separate NEWLINE to be consistent + ::tomlish::log::warn "literalpart ended by cr - needs testing" + #return literalpart temporarily - allow cr to be reprocessed from multiliteral-space + incr i -1 + return 1 + } + stringpart { + #stringpart is a part of MLB string (multi-line basic string) + #throw back the cr - if followed by lf it will become a {NEWLINE crlf} entry within the MULTISTRING list (e.g between STRINGPART entries) + incr i -1 + return 1 + } + starttablename - starttablearrayname { + error "tomlish Character is invalid first character for $tokenType. [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #could in theory be valid in quoted part of name + #review - might be better just to disallow here + append tok $c + } + whitespace { + #it should technically be part of whitespace if not followed by lf + #but outside of values we are also free to map it to be another NEWLINE instead? REVIEW + incr i -1 + return 1 + } + untyped_value { + incr i -1 + return 1 + } + comment { + #JJJJ + #review + incr i -1 + return 1 + } + default { + #!todo - error out if cr inappropriate for tokenType + append tok $c + } + } + } else { + #lf may be appended if next + #review - lone cr as newline? - this is uncommon - but so is lone cr in a string(?) + set_tokenType "newline" + set tok cr + } + } + lf { + # \n newline + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #review + #this lf is the trailing part of a crlf + append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + #multiliteral or multistring + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal { + #nl is not allowed *within* a literal - require multiliteral syntax for any literal containing a newline ''' ''' + #even though we terminate the literal without the closing quote here - the token_waiting newline should trigger a state error + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + literalpart { + #we allow newlines - but store them within the multiliteral as their own element + #This is a legitimate end to the literalpart - but not the whole multiliteral + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + stringpart { + if {$had_slash} { + #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) + set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] + incr i -1 + return 1 + } else { + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + } + starttablename - tablename - tablearrayname - starttablearrayname { + error "tomlish Character is invalid in $tokenType. [tomlish::parse::report_line]" + } + default { + #newline ends all other tokens. + #note for string: we don't add (raw unescaped) newline to simple string. (must use multi-string for this) + #note for whitespace: + # we will use the convention that \n terminates the current whitespace even if whitespace follows + # ie whitespace is split into separate whitespace tokens at each newline + + #puts "-------------- newline lf during tokenType $tokenType" + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + } + } else { + switch -exact -- $state { + multistring-space { + if {$had_slash} { + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 + } else { + set_tokenType "newline" + set tok lf + return 1 + } + } + multiliteral-space { + #assert had_slash 0 + set_tokenType "newline" + set tok "lf" + return 1 + } + default { + #ignore slash? error? + set_tokenType "newline" + set tok lf + return 1 + } + } + #if {$had_slash} { + # #CONT directly before newline - allows strings_5_byteequivalent test to pass + # set_tokenType "continuation" + # set tok "\\" + # incr i -1 + # return 1 + #} else { + # set_tokenType newline + # set tok lf + # return 1 + #} + } + } + , { + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + comment - tablename - tablearrayname { + if {$had_slash} {append tok "\\"} + append tok , + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + #stringpart can have up to 2 quotes too + if {$had_slash} {append tok "\\"} + append tok $c + } + literal - literalpart - squotedkey { + #assert had_slash always 0 + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + incr i -1 + return 1 + } else { + set_token_waiting type comma value "," complete 1 startindex $cindex + return 1 + } + } + default { + set_token_waiting type comma value "," complete 1 startindex $cindex + if {$had_slash} {append tok "\\"} + return 1 + } + } + } else { + switch -exact -- $state { + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "," + } + multiliteral-space { + #assert had_slash 0 + set_tokenType "literalpart" + set tok "," + } + default { + set_tokenType "comma" + set tok "," + return 1 + } + } + } + } + . { + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + comment - untyped_value { + if {$had_slash} {append tok "\\"} + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + literal - literalpart - squotedkey { + #assert had_slash always 0 + append tok $c + } + whitespace { + switch -exact -- $state { + multistring-space { + #review + if {$had_slash} { + incr i -2 + } else { + incr i -1 + } + return 1 + } + xxxdottedkey-space { + incr i -1 + return 1 + } + dottedkey-space-tail { + incr i -1 + return 1 + } + default { + error "tomlish Received period during tokenType 'whitespace' [tomlish::parse::report_line]" + } + } + } + starttablename - starttablearrayname { + #This would correspond to an empty table name + error "tomlish Character '.' is not allowed as first character ($tokenType). [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #subtable - split later - review + append tok $c + } + barekey { + #e.g x.y = 1 + #we need to transition the barekey to become a structured table name ??? review + #x is the tablename y is the key + set_token_waiting type dotsep value "." complete 1 startindex $cindex + return 1 + } + default { + error "tomlish Received period during tokenType '$tokenType' [tomlish::parse::report_line]" + #set_token_waiting type period value . complete 1 + #return 1 + } + } + } else { + switch -exact -- $state { + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "." + } + multiliteral-space { + set_tokenType "literalpart" + set tok "." + } + XXXdottedkey-space { + ### obs? + set_tokenType "dotsep" + set tok "." + return 1 + } + dottedkey-space-tail { + ### + set_tokenType "dotsep" + set tok "." + return 1 + } + default { + set_tokenType "untyped_value" + set tok "." + } + } + } + + } + " " - tab { + if {[tcl::string::length $tokenType]} { + set had_slash $slash_active + set slash_active 0 + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + barekey { + #todo had_slash - emit token or error + #whitespace is a terminator for bare keys + #set_token_waiting type whitespace value $c complete 1 + incr i -1 + return 1 + } + untyped_value { + #unquoted values (int,date,float etc) are terminated by whitespace + #set_token_waiting type whitespace value $c complete 1 + incr i -1 + return 1 + } + comment { + if {$had_slash} { + append tok "\\" + } + append tok $c + } + string - dquotedkey { + if {$had_slash} { append tok "\\" } + append tok $c + } + stringpart { + #for stringpart we store WS separately for ease of processing continuations (CONT stripping) + if {$had_slash} { + #REVIEW + #emit the stringpart - go back to the slash + incr i -2 + return 1 + } else { + #split into STRINGPART xxx WS " " + incr i -1 + return 1 + } + } + literal - literalpart - squotedkey { + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + append tok $c + } else { + append tok $c + } + } + starttablename - starttablearrayname { + incr i -1 + return 1 + } + tablename - tablearrayname { + #include whitespace in the tablename/tablearrayname + #Will need to be normalized upon interpreting the tomlish as a datastructure + append tok $c + } + default { + error "tomlish Received whitespace space during tokenType '$tokenType' [tomlish::parse::report_line]" + } + } + } else { + set had_slash $slash_active + set slash_active 0 + switch -exact -- $state { + tablename-state { + #tablename can have leading,trailing and interspersed whitespace! + #These will not be treated as whitespace tokens, instead forming part of the name. + set_tokenType tablename + set tok "" + if {$had_slash} {append tok "\\"} + append tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok "" + if {$had_slash} {append tok "\\"} + append tok $c + } + multistring-space { + if {$had_slash} { + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 + } else { + set_tokenType "whitespace" + append tok $c + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + default { + if {$had_slash} { + error "tomlish unexpected backslash [tomlish::parse::report_line]" + } + set_tokenType "whitespace" + append tok $c + } + } + } + } + tabX { + if {[tcl::string::length $tokenType]} { + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) + set slash_active 0 + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + barekey { + #whitespace is a terminator for bare keys + incr i -1 + #set_token_waiting type whitespace value $c complete 1 + return 1 + } + untyped_value { + #unquoted values (int,date,float etc) are terminated by whitespace + #set_token_waiting type whitespace value $c complete 1 + incr i -1 + return 1 + } + squotedkey { + append tok $c + } + dquotedkey - string - comment - whitespace { + #REVIEW + append tok $c + } + stringpart { + #for stringpart we store WS separately for ease of processing continuations (CONT stripping) + if {$had_slash} { + #REVIEW + #emit the stringpart - go back to the slash + incr i -2 + return 1 + } else { + #split into STRINGPART aaa WS " " + incr i -1 + return 1 + } + } + literal - literalpart { + append tok $c + } + starttablename - starttablearrayname { + incr i -1 + return 1 + } + tablename - tablearrayname { + #include whitespace in the tablename/tablearrayname + #Will need to be normalized upon interpreting the tomlish as a datastructure + append tok $c + } + default { + error "tomlish Received whitespace tab during tokenType '$tokenType' [tomlish::parse::report_line]" + } + } + } else { + set had_slash $slash_active + if {$slash_active} { + set slash_active 0 + } + switch -exact -- $state { + tablename-state { + #tablename can have leading,trailing and interspersed whitespace! + #These will not be treated as whitespace tokens, instead forming part of the name. + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } + multistring-space { + if {$had_slash} { + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 + } else { + set_tokenType whitespace + append tok $c + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + default { + set_tokenType "whitespace" + append tok $c + } + } + } + } + bom { + #bom encoded as single unicode codepoint \uFFEF + #BOM (Byte Order Mark) - ignored by token consumer + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + #assert - tok will be one or two squotes only + #A toml literal probably isn't allowed to contain this + #but we will parse and let the validator sort it out. + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart { + append tok $c + } + string - stringpart { + append tok $c + } + default { + #state machine will generally not have entry to accept bom - let it crash + set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex + return 1 + } + } + } else { + switch -exact -- $state { + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + multistring-space { + set_tokenType "stringpart" + set tok $c + } + default { + set_tokenType "bom" + set tok "\uFEFF" + return 1 + } + } + } + } + default { + + if {[tcl::string::length $tokenType]} { + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. + set slash_active 0 + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + if {$state eq "multistring-space"} { + incr i -1 + return 1 + } else { + #review + incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. + return 1 + } + } + barekey { + if {[tomlish::utils::is_barekey $c]} { + append tok $c + } else { + error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed. (see tomlish::utils::is_barekey) [tomlish::parse::report_line]" + } + } + starttablename - starttablearrayname { + incr i -1 + #allow statemachine to set context for subsequent chars + return 1 + } + string - stringpart { + append tok $c + } + default { + #e.g comment/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname + append tok $c + } + } + } else { + set had_slash $slash_active + set slash_active 0 + switch -exact -- $state { + table-space - itable-space { + #if no currently active token - assume another key value pair + if {[tomlish::utils::is_barekey $c]} { + set_tokenType "barekey" + append tok $c + } else { + error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" + } + } + multistring-space { + set_tokenType "stringpart" + if {$had_slash} { + set tok \\$c + } else { + set tok $c + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + tablename-state { + set_tokenType "tablename" + set tok $c + } + tablearrayname-state { + set_tokenType "tablearrayname" + set tok $c + } + dottedkey-space { + set_tokenType barekey + set tok $c + } + default { + #todo - something like ansistring VIEW to show control chars? + set cshow [string map [list \t tab \v vt] $c] + tomlish::log::debug "- tokloop char '$cshow' setting to untyped_value while state:$state [tomlish::parse::report_line]" + set_tokenType "untyped_value" + set tok $c + } + } + } + } + } + + } + + #run out of characters (eof) + if {[tcl::string::length $tokenType]} { + #check for invalid ending tokens + #if {$state eq "err-state"} { + # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" + #} + switch -exact -- $tokenType { + _start_squote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open literal + error "tomlish eof reached without closing single quote for string literal. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "literal" + set tok "" + return 1 + + ##review + #set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] + #set_tokenType "literal" + #set tok "" + #return 1 + } + } + } + _start_dquote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open string + error "tomlish eof reached without closing double quote for string. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "string" + set tok "" + return 1 + } + } + } + newline { + #The only newline token that has still not been returned should have a tok value of "cr" + puts "tomlish eof reached - with incomplete newline token '$tok'" + if {$tok eq "cr"} { + #we convert lone cr to it's own "cr" token elsewhere in the document to allow statemachine to handle it. + #(which it should generally do by not handling it ie raising an error - or emitting an ERROR list in the tomlish) + #if trailing char is a lone cr - we should encode it the same way as elsewhere that is outside of values + # ie as it's own token. + switch_tokenType "cr" + return 1 + } else { + #should be unreachable + error "tomlish eof reached - with invalid newline token. value: $tok" + } + } + } + set_token_waiting type eof value eof complete 1 startindex $i ;#review + return 1 + } else { + ::tomlish::log::debug "- No current tokenType, ran out of characters, setting tokenType to 'eof' [tomlish::parse::report_line]" + set tokenType "eof" + set tok "eof" + } + return 0 + } + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::parse ---}] +} +namespace eval tomlish::huddle { + proc from_json {json} { + package require huddle + package require huddle::json + #note - huddle may now contain raw surrogate pair - which cannot be emitted to stdout + set h [huddle::json::json2huddle parse $json] + } + proc from_dict {d} { + + } + + #raw - strings must already be processed into values suitable for json e.g surrogate pair escaping + proc jsondumpraw {huddle_object {offset " "} {newline "\n"} {begin ""}} { + upvar ::huddle::types types + set nextoff "$begin$offset" + set nlof "$newline$nextoff" + set sp " " + if {[string equal $offset ""]} {set sp ""} + + set type [huddle type $huddle_object] + + switch -- $type { + boolean - + number { + return [huddle get_stripped $huddle_object] + } + null { + return null + } + string { + set data [huddle get_stripped $huddle_object] + + # JSON permits only oneline string + #set data [string map { + # \n \\n + # \t \\t + # \r \\r + # \b \\b + # \f \\f + # \\ \\\\ + # \" \\\" + # / \\/ + # } $data + #] + return "\"$data\"" + } + list { + set inner {} + set len [huddle llength $huddle_object] + for {set i 0} {$i < $len} {incr i} { + set subobject [huddle get $huddle_object $i] + lappend inner [jsondumpraw $subobject $offset $newline $nextoff] + } + if {[llength $inner] == 1} { + return "\[[lindex $inner 0]\]" + } + return "\[$nlof[join $inner ,$nlof]$newline$begin\]" + } + dict { + set inner {} + foreach {key} [huddle keys $huddle_object] { + lappend inner [subst {"$key":$sp[jsondumpraw [huddle get $huddle_object $key] $offset $newline $nextoff]}] + } + #if {[llength $inner] == 1} { + # return $inner ;#wrong - breaks with quoted list representation + # #FAILS: toml-test valid/comment/tricky + #} + + return "\{$nlof[join $inner ,$nlof]$newline$begin\}" + } + default { + set node [unwrap $huddle_object] + #foreach {tag src} $node break + lassign $node tag src + return [$types(callback:$tag) jsondumpraw $huddle_object $offset $newline $nextoff] + } + } + } +} + +#typed as per toml-test types +namespace eval tomlish::typedhuddle { + proc from_json {json} { + set plainhuddle [tomlish::huddle::from_json $json] + + error "tomlish::typedhuddle::from_json unimplemented" + } + proc from_dict {d} { + package require huddle + set h [huddle create] + if {[tomlish::dict::is_typeval $d]} { + set dtype [dict get $d type] + switch -- $dtype { + ARRAY { + #error "typedhuddle::from_dict ARRAY not yet handled" + set h_list [huddle list] + set elements [dict get $d value] + foreach el $elements { + set sub [from_dict $el] + huddle append h_list $sub + } + return $h_list + } + default { + set tinfo [tomlish::dict::convert_typeval_to_tomltest $d] + #basic non-container types + set h_tdict [huddle create] + huddle set h_tdict type [huddle string [dict get $tinfo type]] + huddle set h_tdict value [huddle string [dict get $tinfo value]] + return $h_tdict + } + } + } else { + dict for {dictkey dictval} $d { + set jsonkey [tomlish::utils::rawstring_to_jsonstring $dictkey] + if {[tomlish::dict::is_typeval $dictval]} { + set dtype [dict get $dictval type] + switch -- $dtype { + ARRAY { + #error "typedhuddle::from_dict ARRAY not yet handled" + set h_next [huddle list] + set elements [dict get $dictval value] + foreach el $elements { + set sub [from_dict $el] + huddle append h_next $sub + } + } + default { + set tinfo [tomlish::dict::convert_typeval_to_tomltest $dictval] + set tp [dict get $tinfo type] + #basic non-container types + set h_next [huddle create] ;#dict + huddle set h_next type [huddle string [dict get $tinfo type]] + huddle set h_next value [huddle string [dict get $tinfo value]] + } + } + huddle set h $jsonkey $h_next + } else { + #dict + set sub [from_dict $dictval] + huddle set h $jsonkey $sub + } + } + } + return $h + } + proc is_typeval {huddled} { + set htype [huddle type $huddled] + if {$htype ne "dict"} { + return 0 + } + if {[huddle keys $huddled] ne {type value}} { + return 0 + } + set tp [huddle type $huddled type] + switch -- $tp { + string - integer - float - bool - datetime - datetime-local - date-local - time-local { + return 1 + } + } + return 0 + } + + #direction from typed json towards toml + proc convert_typeval_to_tomlish {huddled} { + set htype [huddle get_stripped $huddled type] + set hval [huddle get_stripped $huddled value] + switch -- $htype { + string { + #we need to decide here the type of string element to use in toml/tomlish + #STRING,MULTISTRING,LITERAL,MULTILITERAL + #set unesc [tomlish::utils::unescape_jsonstring $hval] ;#no need - json parser unescaped when creating the huddle + set unesc $hval + #(huddle::json::json2huddle parse $json) + #since it was unescaped any backslashes remaining represent themselves - reapply escape - REVIEW + #set hval [string map [list \\ \\\ ] $hval] + #JSJS + if {[string first \n $unesc] >= 0} { + #always use a MULTI + if {[string first ' $unesc] >=0} { + if {[string first ''' $unesc] >=0} { + set dtype MULTISTRING + } else { + set dtype MULTILITERAL + } + } else { + if {[string first \"\"\" $unesc] >=0} { + set dtype MULTILITERAL + } else { + set dtype MULTISTRING + } + } + } else { + #use multi if needed? + if {[string first '' $hval] >=0} { + if {[string first ''' $unesc] >=0} { + set dtype STRING + } else { + set dtype MULTILITERAL + } + } elseif {[string first ' $unesc] >= 0} { + set dtype STRING + } elseif {[string first \"\"\" $unesc] >= 0} { + set dtype LITERAL + } else { + #STRING or LITERAL? + set dtype STRING + } + } + + } + datetime - bool { + set dtype [string toupper $htype] + } + float { + set dtype FLOAT + if {[string is integer -strict $hval]} { + #json FLOAT specified as integer - must have dot for toml + set hval [expr {double($hval)}] + } + } + integer { + set dtype INT + } + datetime - datetime-local - date-local - time-local { + #DDDD + #set dtype DATETIME + set dtype [string toupper $htype] + } + default { + error "tomlish::typedhuddle::convert_typeval_to_tomlish unrecognised type $htype" + } + } + return [list type $dtype value $hval] + } + +} +namespace eval tomlish::toml { + proc from_binary {bindata} { + set bom "" + set b12 [tcl::string::range $bindata 0 1] + set b12test [string map [list \xEF\xBB utf8_12 \xFE\xFF bom16be \xFF\xFE utf32le_12 \x00\x00 utf32be_12] $b12] + switch -- $b12test { + bom16be { + #FEFF + set bom utf-16be + } + utf32le_12 { + #FFFE + set b34 [tcl::string::range $bindata 2 3] + if {$b34 eq "\x00\x00"} { + set bom utf-32le + } else { + set bom utf-16le + } + } + utf32be_12 { + #0000 + set b34 [tcl::string::range $bindata 2 3] + if {$b34 eq "\xFE\xFF"} { + set bom utf-32be + } + } + utf8_12 { + set b3 [tcl::string::index $bindata 2] + if {$b3 eq "\xBF"} { + set bom utf-8 + } + } + } + if {$bom eq ""} { + #no bom - assume utf8 - but we read in as binary + #if data wasn't actually utf8 we may error here depending on content - or we may just get wrongly encoded chars + set tomldata [encoding convertfrom utf-8 $bindata] + } elseif {$bom eq "utf-8"} { + #utf-8 bom read in as binary + set tomldata [encoding convertfrom utf-8 $bindata] + #bom now encoded as single unicode char \uFFEF + } else { + return -code error -errorcode {TOML ENCODING NOTUTF8} "Input not UTF8 encoded according to BOM. Indicated encoding is '$bom' - invalid for toml" + } + return $tomldata + } + proc from_tomlish {tomlish} { + return [tomlish::encode::tomlish $tomlish] + } + + #todo - rename to taggedjson + proc from_tomlish_from_dict_from_typedjson {json} { + set d [tomlish::dict::from_typedjson $json] + from_tomlish [tomlish::from_dict $d] ;#return tomlish + } + + proc tablename_split {tablename {normalize false}} { + #we can't just split on . because we have to handle quoted segments which may contain a dot. + #eg {dog."tater.man"} + if {$tablename eq ""} { + error "tablename_split. No table name segments found. empty tablename" + } + set sLen [tcl::string::length $tablename] + set segments [list] + set mode "preval" ;#5 modes: preval, quoted,litquoted, unquoted, postval + #quoted is for double-quotes, litquoted is for single-quotes (string literal) + set seg "" + for {set i 0} {$i < $sLen} {incr i} { + + if {$i > 0} { + set lastChar [tcl::string::index $tablename [expr {$i - 1}]] + } else { + set lastChar "" + } + + #todo - track\count backslashes properly + + set c [tcl::string::index $tablename $i] + if {$c eq "\""} { + if {($lastChar eq "\\")} { + #not strictly correct - we could have had an even number prior-backslash sequence + #the toml spec would have us error out immediately on bsl in bad location - but we're + #trying to parse to unvalidated tomlish + set ctest escq + } else { + set ctest dq + } + } else { + set ctest [string map [list " " sp \t tab] $c] + } + + switch -- $ctest { + . { + switch -exact -- $mode { + preval { + error "tablename_split. dot not allowed - expecting a value" + } + unquoted { + #dot marks end of segment. + if {![tomlish::utils::is_barekey $seg]} { + error "tablename_split. unquoted key segment $seg is not a valid toml key" + } + lappend segments $seg + set seg "" + set mode "preval" + } + quoted { + append seg $c + } + litquoted { + append seg $c + } + postval { + #got dot in an expected location + set mode "preval" + } + } + } + dq { + #unescaped dquote + switch -- $mode { + preval { + set mode "quoted" + set seg "\"" + } + unquoted { + #invalid in barekey - but we are after structure only + append seg $c + } + quoted { + append seg $c + #JJJJ + if {$normalize} { + lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" ;#make sure we only accept a dot or end-of-data now. + } + litquoted { + append seg $c + } + postval { + error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" + } + } + } + ' { + switch -- $mode { + preval { + append seg $c + set mode "litquoted" + } + unquoted { + #single quote inside e.g o'neill - ultimately invalid - but we pass through here. + append seg $c + } + quoted { + append seg $c + } + litquoted { + append seg $c + #no normalization to do aside from stripping squotes + if {$normalize} { + lappend segments [tcl::string::range $seg 1 end-1] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" + } + postval { + error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" + } + } + } + sp - tab { + switch -- $mode { + preval - postval { + #ignore + } + unquoted { + #terminates a barekey + lappend segments $seg + set seg "" + set mode "postval" + } + default { + #append to quoted or litquoted + append seg $c + } + } + } + default { + switch -- $mode { + preval { + set mode unquoted + append seg $c + } + postval { + error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" + } + default { + append seg $c + } + } + } + } + + if {$i == $sLen-1} { + #end of data + ::tomlish::log::debug "End of data: mode='$mode'" + switch -exact -- $mode { + preval { + if {[llength $segments]} { + error "tablename_split. Expected a value after last dot separator. tablename: '$tablename'" + } else { + error "tablename_split. Whitespace only? No table name segments found. tablename: '$tablename'" + } + } + unquoted { + if {![tomlish::utils::is_barekey $seg]} { + #e.g toml-test invalid/table/with-pound required to fail for invalid barekey + error "tablename_split. unquoted key segment $seg is not a valid toml key" + } + lappend segments $seg + } + quoted { + error "tablename_split. Expected a trailing double quote. tablename: '$tablename'" + } + litquoted { + error "tablename_split. Expected a trailing single quote. tablename: '$tablename'" + } + postval { + #ok - segment already lappended + } + } + } + } + + #note - we must allow 'empty' quoted strings '' & "" + # (these are 'discouraged' but valid toml keys) + + return $segments + } + + #tablenames (& tablearraynames) may contain irrelevant leading, trailing and interspersed whitespace + # tablenames can be made up of segments delimited by dots. .eg [ a.b . c ] + #trimmed, the tablename becomes {a.b.c} + # A segment may contain whitespace if it is quoted e.g [a . b . "c etc " ] + #ie whitespace is only irrelevant if it's outside a quoted segment + #trimmed, the tablename becomes {a.b."c etc "} + proc tablename_trim {tablename} { + set segments [tomlish::toml::tablename_split $tablename false] + set trimmed_segments [list] + foreach seg $segments { + lappend trimmed_segments [::string trim $seg " \t"] + } + return [join $trimmed_segments .] + } +} + +namespace eval tomlish::dict { + namespace export {[a-z]*}; # Convention: export all lowercase + namespace path [namespace parent] + + #from_taggedjson + proc from_typedjson {json} { + package require huddle + package require huddle::json + set h [huddle::json::json2huddle parse $json] + #json2huddle parse unescapes the basic json escapes \n \\ etc + #$h could now contain raw form of surrogate pair (json2huddle parse as at 2025-014 doesn't convert the surrogates - just unescapes?) + if {[catch {encoding convertto utf-8 $h} errM]} { + #This test suggests we have raw surrogate pairs - REVIEW + package require punk::cesu + set h [punk::cesu::from_surrogatestring $h] + } + tomlish::dict::from_typedhuddle $h + } + proc from_typedhuddle {h} { + set resultd [dict create] + switch -- [huddle type $h] { + dict { + foreach k [huddle keys $h] { + switch -- [huddle type $h $k] { + dict { + set huddle_d [huddle get $h $k] + #puts stderr "huddle_d: $huddle_d" + #set v [huddle get_stripped $h $k] + if {[tomlish::typedhuddle::is_typeval $huddle_d]} { + dict set resultd $k [tomlish::typedhuddle::convert_typeval_to_tomlish $huddle_d] + } else { + dict set resultd $k [from_typedhuddle $huddle_d] + } + } + list { + set items [huddle get $h $k] + + set numitems [huddle llength $items] + if {$numitems == 0} { + dict set resultd $k [list type ARRAY value {}] + } else { + set arritems [list] + for {set i 0} {$i < $numitems} {incr i} { + set item [huddle get $items $i] + #puts stderr "item: $item" + #set v [huddle get $item] + if {[tomlish::typedhuddle::is_typeval $item]} { + lappend arritems [tomlish::typedhuddle::convert_typeval_to_tomlish $item] + } else { + lappend arritems [from_typedhuddle $item] + } + } + dict set resultd $k [list type ARRAY value $arritems] + } + } + default { + error "dict_from_json unexpected subtype [huddle type $h $k] in dict" + } + } + } + } + list { + set items [huddle get $h] + set numitems [huddle llength $items] + if {$numitems == 0} { + return [list type ARRAY value {}] + } else { + set arritems [list] + for {set i 0} {$i < $numitems} {incr i} { + set item [huddle get $items $i] + #puts stderr "item: $item" + #set v [huddle get $item] + if {[tomlish::typedhuddle::is_typeval $item]} { + lappend arritems [tomlish::typedhuddle::convert_typeval_to_tomlish $item] + } else { + lappend arritems [from_typedhuddle $item] + } + } + return [list type ARRAY value $arritems] + } + + } + } + return $resultd + } + + proc is_typeval {d} { + #designed to detect {type value } e.g {type INT value 3}, {type STRING value "blah etc"} + #as a sanity check we need to avoid mistaking user data that happens to match same form + #consider x.y={type="spud",value="blah"} + #The value of type will itself have already been converted to {type STRING value spud} ie never a single element. + #check the length of the type as a quick way to see it's a tag - not something else masqerading. + expr {[string is dict $d] && [dict size $d] == 2 && [dict exists $d type] && [dict exists $d value] && [llength [dict get $d type]] == 1} + } + + #simple types only - not containers? + proc convert_typeval_to_tomltest {d} { + set dtype [dict get $d type] + set dval [dict get $d value] + switch -- $dtype { + INT { + set testtype integer + set dval [expr {$dval}] ;#convert e.g 0xDEADBEEF to 3735928559 + } + DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - FLOAT - BOOL { + #DDDD + set testtype [string tolower $dtype] + } + STRING - MULTISTRING { + set testtype string + #JJJJ + set dval [tomlish::utils::unescape_string $dval] + set dval [tomlish::utils::rawstring_to_jsonstring $dval] + } + LITERAL - MULTILITERAL { + set testtype string + #don't validate on way out to json here? + #decoder should validate by calling tomlish::from_dict + #if {![tomlish::utils::rawstring_is_valid_literal $dval]} { + # return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + #} + set dval [tomlish::utils::rawstring_to_jsonstring $dval] + } + MULTILITERAL { + #todo - escape newlines for json? + set testtype string + } + default { + error "convert_typeval_to_tomltest unhandled type $dtype" + } + } + return [list type $testtype value $dval] + } + + # Check that each leaf is a typeval or typeval dict + #importantly: must accept empty dict leaves e.g {x {}} + proc is_typeval_dict {d {checkarrays 0}} { + if {![string is dict $d]} { + return 0 + } + dict for {k v} $d { + set is_d 0 + if {!([is_typeval $v] || [set is_d [is_typeval_dict $v $checkarrays]])} { + return 0 + } + if {!$is_d} { + set vtype [dict get $v type] + switch -- $vtype { + INT - FLOAT - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - BOOL - LITERAL - STRING - MULTILITERAL - MULTISTRING {} + ARRAY { + if {$checkarrays} { + set arrdata [dict get $v value] + foreach el $arrdata { + if {![is_typeval_dict $el $checkarrays]} { + return 0 + } + } + } + } + default { + puts stderr "is_typeval_dict: Unexpected type '$vtype'" + return 0 + } + } + } + } + return 1 + } + + proc last_tomltype_posn {d} { + set last_simple -1 + set dictposn [expr {[dict size $d] -1}] + foreach k [lreverse [dict keys $d]] { + set dval [dict get $d $k] + if {[is_typeval $dval]} { + set last_simple $dictposn + break + } + incr dictposn -1 + } + return $last_simple + } + + + #review + proc name_from_tablestack {tablestack} { + set name "" + foreach tinfo [lrange $tablestack 1 end] { + lassign $tinfo type namepart + switch -- $type { + T { + if {$name eq ""} { + append name $namepart + } else { + append name .$namepart + } + } + I { + if {$name eq ""} { + append name $namepart + } else { + append name .$namepart + } + } + default { + #end at first break in the leading sequence of T & I tablenames + break + } + } + } + return $name + } + + + #tablenames_info is a flat dict with the key being an '@@' path + proc _show_tablenames {tablenames_info} { + #e.g {@l@a @@b} {ttype header_table tdefined closed} + append msg \n "tablenames_info:" \n + dict for {tkey tinfo} $tablenames_info { + append msg " " "table: $tkey" \n + dict for {field finfo} $tinfo { + append msg " " "$field $finfo" \n + } + } + return $msg + } + + #take a raw string and classify: result is a 2 element list comprised of KEY|SQKEY|DQKEY and the value being the appropriate inner string + proc classify_rawkey {rawval} { + if {![::tomlish::utils::is_barekey $rawval]} { + #requires quoting + # + #Any dot in the key would have been split by dict::from_tomlish - so if it's present here it's part of this key - not a level separator! + # + #we'll use a basic mechanisms for now to determine the type of quoting + # - whether it has any single quotes or not. + # (can't go in an SQKEY) + # - whether it has any chars that require quoting when in a Bstring + # (if so - then its visual representation might be unsuitable for a key in a toml text file, so escape and put in DQKEY instead of literal SQKEY) + #todo - more? + #REVIEW - the backslash might often be in things like a regex or windows path - which is often better expressed in a literal SQKEY + # from literal examples: + # 'c:\Users\nodejs\templates' + # '<\i\c*\s*>' + #If these are in *keys* our basic test will express these as: + # "c:\\Users\\nodejs\\templates" + # "<\\i\\c*\\s*>" + # This still works - but a smarter test might determine when SQKEY is the better form? + #when coming from external systems - can we even know if the value was already escaped? REVIEW + #Probably when coming from json - we know it's already escaped - and so we build our dict converting keys to unescaped + #TODO - clarify in documentation that keys resulting from dict::from_tomlish are in 'normalized' (unescaped) form + # + #For keys - we currently (2025) are only allowed barekeys,basic strings and literal strings. (no multiline forms) + set k_escaped [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $rawval] + if {[string length $k_escaped] != [string length $rawval]} { + #escaping made a difference + set has_escape_requirement 1 + } else { + set has_escape_requirement 0 + } + if {[string first ' $rawval] >=0 || $has_escape_requirement} { + #basic string + # (any ANSI SGR sequence will end up here in escaped form ) + return [list DQKEY $k_escaped] + } else { + #literal string + return [list SQKEY $rawval] + } + } else { + return [list KEY $rawval] + } + } + #the quoting implies the necessary escaping for DQKEYs + proc join_and_quote_rawkey_list {rawkeylist} { + set result "" + foreach rk $rawkeylist { + lassign [tomlish::dict::classify_rawkey $rk] type val + switch -- $type { + SQKEY { + append result "'$val'." + } + DQKEY { + append result "\"$val\"." + } + KEY { + append result "$val." + } + } + } + return [string range $result 0 end-1] + } + + proc _process_tomlish_dottedkey {element {context_refpath {}}} { + upvar tablenames_info tablenames_info + upvar datastructure datastructure + set dottedtables_defined [list] + set dkey_info [tomlish::get_dottedkey_info $element] + #e.g1 keys {x.y y} keys_raw {'x.y' "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) + #e.g2 keys {x.y y} keys_raw {{"x.y"} "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) + + #[a.b] + #t1.t2.dottedtable.leafkey = "val" + #we have already checked supertables a & {a b} + # - in basic case, passed in context_refpath as {@@a @@b} + # - our context_refpath could also include some combination of keys and array indices e.g {@@a @@b 3 @@subtablekey} + #We need to check {a b t1} & {a b t2} ('creation' only) + #and then 'dottedtable' is 'defined' while leafkey is an ordinary key in dottedtable + + #note we also get here as a 'dottedkey' with the following even though there is no dot in k + #[a.b] + #leafkey = "val" + + set all_dotted_keys [dict get $dkey_info keys] + set dottedkeyname [join $all_dotted_keys .] + + if {[llength $all_dotted_keys] > 1} { + #dottedtable.k=1 + #tX.dottedtable.k=1 + #etc + + #Wrap in a list so we can detect 'null' equivalent. + #We can't use empty string as that's a valid dotted key segment + set dottedtable_bag [list [lindex $all_dotted_keys end-1]] + set dotparents [lrange $all_dotted_keys 0 end-2] + } else { + #basic case - not really a 'dotted' key + #k = 1 + set dottedtable_bag [list] ;#empty bag + set dotparents [list] + } + #assert dottedtable_bag only ever holds 0 or 1 elements + set leaf_key [lindex $all_dotted_keys end] + + #see also: https://github.com/toml-lang/toml/issues/846 "Can dotted keys insert into already-defined [tables]?" + #This code was originally written with a misinterpretation of: + #"Dotted keys create and define a table for each key part before the last one, provided that such tables were not previously created." + # 'each key part before the last one' refers to each key in a single dotted key entry + # not each 2nd-to last key in a list of dotted keys. + + + #we've already tested the table/tablearray keys that got us here.. but not the dottedkey segments (if any) prior to dottedtable & leaf_key + set dottedsuper_refpath $context_refpath + foreach normkey $dotparents { + lappend dottedsuper_refpath @@$normkey + if {![dict exists $tablenames_info $dottedsuper_refpath ttype]} { + #supertable with this combined path (context_path plus parts of dottedkey) not yet 'created' + if {[tomlish::dict::path::exists $datastructure $dottedsuper_refpath]} { + #There is data so it must have been created as a keyval + set msg "Path $dottedsuper_refpath for dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #dict set tablenames_info $dottedsuper_refpath ttype unknown_table ;#REVIEW + dict set tablenames_info $dottedsuper_refpath ttype unknown_dotted ;#REVIEW + + #see note above re dotted keys insert into already defined table - we need to 'define' all the dotted supers in this block + lappend dottedtables_defined $dottedsuper_refpath + + #ensure empty tables are still represented in the datastructure + tomlish::dict::path::set_endpoint datastructure $dottedsuper_refpath {} ;#set to empty subdict + } else { + #added for fixed assumption + set ttype [dict get $tablenames_info $dottedsuper_refpath ttype] + set definedstate [dictn getdef $tablenames_info [list $dottedsuper_refpath tdefined] NULL] + switch -- $ttype { + dottedkey_table - unknown_dotted { + #'created' as dotted - but make sure it's from this header section - i.e defined not set + if {$definedstate ne "NULL"} { + #collision with some other dottedkey + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + itable { + #itables are immediately defined + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' as itable (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + default { + #header_table, header_tablearray or unknown_header + #is header_tablearray any different from header_table in this context? + #we don't set tdefined for tablearray anyway - so should be ok here. + if {$definedstate ne "NULL"} { + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' in a header (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + } + } + } + + #dottedtable being 2nd last segment was for original assumption - todo - tidy up? we are duplicating the logic above + #review - any need/advantage to treat 2nd to last key any different from other supers? ie D in a.b.c.D.key=1 + #no need for 'unknown_dotted' vs 'dottedkey_table' ?? + if {[llength $dottedtable_bag] == 1} { + set dottedtable [lindex $dottedtable_bag 0] + set dottedkey_refpath [list {*}$dottedsuper_refpath "@@$dottedtable"] + #our dotted key is attempting to define a table + if {![dict exists $tablenames_info $dottedkey_refpath ttype]} { + #first one - but check datastructure for collisions + if {[tomlish::dict::path::exists $datastructure $dottedkey_refpath]} { + set msg "Path $dottedkey_refpath for dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #'create' the table + dict set tablenames_info $dottedkey_refpath ttype dottedkey_table + #don't actually set 'defined' here.. use the end of TABLE record to close them off by looking at this list + tomlish::dict::path::set_endpoint datastructure $dottedkey_refpath {} + lappend dottedtables_defined $dottedkey_refpath + + # + } else { + #exists - but might be from another dottedkey within the current header section + #the table is open for adding keys until the next 'header' section ([tablename] / [[tablearray]]) + #check for 'defined' closed (or just existence) + set ttype [dict get $tablenames_info $dottedkey_refpath ttype] + set definedstate [dictn getdef $tablenames_info [list $dottedkey_refpath tdefined] NULL] + switch -- $ttype { + dottedkey_table - unknown_dotted { + #'created' as dotted - but make sure it's from this header section - i.e defined not set + if {$definedstate ne "NULL"} { + #collision with some other dottedkey + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + itable { + #itables are immediately defined + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' as itable (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + default { + #header_table, header_tablearray or unknown_header + #is header_tablearray any different from header_table in this context? + #we don't set tdefined for tablearray anyway - so should be ok here. + if {$definedstate ne "NULL"} { + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' in a header (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + } + } + } else { + set dottedkey_refpath $dottedsuper_refpath + } + #assert - dottedkey represents a key val pair that can be added + + + set fullkey_refpath [list {*}$dottedkey_refpath @@$leaf_key] + if {[tomlish::dict::path::exists $datastructure $fullkey_refpath]} { + set msg "Duplicate key. The key (path $fullkey_refpath) already exists at this level in the toml data. The toml data is not valid." + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + + #set keyval_dict [_get_keyval_value $element] + lassign [_get_keyval_value $element] _ keyval_dict _ sub_tablenames_info + + + #keyval_dict is either a {type value } + #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level + #punk::dict::is_typeval can distinguish + tomlish::log::debug "_process_tomlish_dottedkey>>> context:$context_refpath dottedkey $dottedkeyname kv: $keyval_dict" + tomlish::dict::path::set_endpoint datastructure $fullkey_refpath $keyval_dict + + #remove ? + #if {![tomlish::dict::is_typeval $keyval_dict]} { + # #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys + # # inner structure will contain {type value } if all leaves are not empty ITABLES + # ##set tkey [list {*}$norm_segments {*}$all_dotted_keys] + + # #by not creating a tablenames_info record - we effectively make it closed anyway? + # #it should be detected as a key + # #is there any need to store tablenames_info for it?? + # #REVIEW + + # ##TODO - update? + # #dictn incr tablenames_info [list $tkey seencount] + # ##if the keyval_dict is not a simple type x value y - then it's an inline table ? + # ##if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. + # #dictn set tablenames_info [list $tkey closed] 1 + #} + return [dict create dottedtables_defined $dottedtables_defined] + } + #tomlish::dict::from_tomlish is a *basic* programmatic datastructure for accessing the data. + # produce a dictionary of keys and values from a tomlish tagged list. + # dict::from_tomlish is primarily for read access to toml data. + #Extraneous (not within quoted sections) whitespace and comments are not preserved in this structure, + # so a roundtrip from toml to this datastructure and back to toml will lose whitespace formatting and comments. + # creating/changing toml values can be done directly on a tomlish list if preserving (or adding) formatting/comments is desired. + #A separate package 'tomlish::object' may be needed to allow easier programmatic creating/updating/deleting of data elements whilst preserving (or adding or selectively deleting/editing) such formatting. + # + + #within an ARRAY, we store a list of items such as plain dicts (possibly empty) and {type value } for simple types + #(ARRAYS can be mixed type) + #This means our dict structure should have only ARRAY and simple types which need to be in {type value } form + #A dict within an array encodeded as a type ITABLE value should also parse - but is the unpreferred form - REVIEW test? + + #Namespacing? + #ie note the difference: + #[Data] + #temp = { cpu = 79.5, case = 72.0} + # versus + #[Data] + #temps = [{cpu = 79.5, case = 72.0}] + proc from_tomlish {tomlish} { + package require dictn + + #keep track of which tablenames have already been directly defined, + # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' + #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. + #we don't error out just because a previous tablename segment has already appeared. + + #Declaring, Creating, and Defining Tables + #https://github.com/toml-lang/toml/issues/795 + #(update - only Creating and Defining are relevant terminology) + + #review + #tablenames_info keys ttype created, tdefined, createdby, definedby, closedby ??? review keys + # [tname] = header_table [[tname]] = header_tablearray + + #consider the following 2 which are legal: + #[table] #'table' created, defined=open type header_table + #x.y = 3 + #[table.x.z] #'table' tdefined=closed closedby={header_table table.x.z}, 'table.x' created, 'table.x.z' created tdefined=open tdefinedby={header_table table.x.z} + #k= 22 + # #'table.x.z' tdefined=closed closedby={eof eof} + + #equivalent datastructure + + #[table] #'table' created, tdefined=open definedby={header_table table} + #[table.x] #'table' tdefined=closed closedby={header_table table.x}, 'table.x' created tdefined=open definedby={header_table table.x} + #y = 3 + #[table.x.z] #'table.x' tdefined=closed closedby={header_table table.x.z}, 'table.x.z' created tdefined=open definedby={header_table table.x.z} + #k=22 + + #illegal + #[table] #'table' created and tdefined=open + #x.y = 3 #'table.x' created first keyval pair tdefined=open definedby={keyval x.y = 3} + #[table.x.y.z] #'table' tdefined=closed, 'table.x' closed because parent 'table' closed?, 'table.x.y' cannot be created + #k = 22 + # + ## - we would fail on encountering table.x.y because only table and table.x are effectively tables - but that table.x is closed should be detected (?) + + #illegal + #[table] + #x.y = {p=3} + #[table.x.y.z] + #k = 22 + ## we should fail because y is an inline table which is closed to further entries + + #note: it is not safe to compare normalized tablenames using join! + # e.g a.'b.c'.d is not the same as a.b.c.d + # instead compare {a b.c d} with {a b c d} + # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. + #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' + #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} + + + + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info + } else { + set tablenames_info [dict create] ;#keyed on tablepath each of which is an @@path such as {@@config @@subgroup @@etc} (corresponding to config.subgroup.etc) + #also has non @@ indexes which are list indexes as taken by tcl list commands (int or end-1 etc) + #value is a dict with keys: ttype, tdefined + } + + + log::info "---> dict::from_tomlish processing '$tomlish'<<<" + set items $tomlish + + foreach lst $items { + if {[lindex $lst 0] ni $::tomlish::tags} { + error "supplied list does not appear to be toml parsed into a tomlish tagged list. Run tomlish::decode::toml on the raw toml data to produce a tomlish list" + } + } + + if {[lindex $tomlish 0] eq "TOMLISH"} { + #ignore TOMLISH tag at beginning + set items [lrange $tomlish 1 end] + } + + set datastructure [dict create] + set dottedtables_defined [list] + foreach item $items { + set tag [lindex $item 0] + #puts "...> item:'$item' tag:'$tag'" + switch -exact -- $tag { + KEY - DQKEY - SQKEY - INT - FLOAT - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - STRING - LITERAL { + #why would we get individual key item as opposed to DOTTEDKEY? + error "tomlish::dict::from_tomlish error: invalid tag: $tag. At the toplevel, from_tomlish can only process WS NEWLINE COMMENT and compound elements DOTTEDKEY TABLE TABLEARRAY ITABLE MULTILITERAL MULTISTRING" + } + DOTTEDKEY { + #toplevel dotted key + set dkinfo [_process_tomlish_dottedkey $item] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] + #at any level - we don't expect any more DOTTEDKEY records in a tomlish structure after TABLE or TABLEARRAY are encountered + #as those records should encapsulate their own dottedkeys + + } + TABLEARRAY { + #close off any dottedtables_defined created by dottedkeys at this level + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + set dottedtables_defined [list] ;#for closing off at end by setting 'defined' + + set tablearrayname [lindex $item 1] + tomlish::log::debug "---> tomlish::dict::from_tomlish processing item TABLENAME (name: $tablearrayname): $item" + set norm_segments [::tomlish::toml::tablename_split $tablearrayname true] ;#true to normalize + #we expect repeated tablearray entries - each adding a sub-object to the value, which is an array/list. + #tablearrayname is likely to appear multiple times - so unlike a TABLE we don't check for 'defined' for the full name as an indicator of a problem + set supertable [list] + ############## + # [[a.b.c.d]] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + set refpath [list] ;#e.g @@j1 @@j2 1 @@k1 end + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + lappend refpath @@$normseg + if {![dict exists $tablenames_info $refpath ttype]} { + #supertable with this path doesn't yet exist + if {[tomlish::dict::path::exists $datastructure $refpath]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] of tablearray name $tablearrayname already has data but doesn't appear to be a table - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablearray_supertable_keycollision + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } else { + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #review - we can't later specify as tablearray so should just set ttype to header_table even though it's being created + # because of a tablearray header? + #By setting ttype to something other than table_header we can provide more precise errorCode/msg ?? + dict set tablenames_info $refpath ttype unknown_header + #ensure empty tables are still represented in the datastructure + dict set datastructure {*}$supertable [list] + } + } else { + #REVIEW!! + # what happens with from_toml {[[a.b.c]]} {[[a.b]]} ??? + #presumed that a and a.b were 'created' as tables (supertables of tablearray at a.b.c) and can't now be redefined as tablearrays + + #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable + #but if it's a tablearray - we need to point to the most 'recently defined table element of the array' + #(last member of that array - need to check type? allowed to have non-table elements ie nonhomogenous??) + set supertype [dict get $tablenames_info $refpath ttype] + if {$supertype eq "header_tablearray"} { + #exercised by toml-tests: + # valid/table/array-table-array + # valid/table/array-nest + + #puts stdout "todict!!! TABLEARRAY nesting required for supertable [join $supertable .]" + + #'refer' to the appropriate element in existing array + set arrdata [tomlish::dict::path::get $datastructure [list {*}$refpath @@value]] + set idx [expr {[llength $arrdata]-1}] + if {$idx < 0} { + #existing tablearray should have at least one entry even if empty (review) + set msg "reference to empty tablearray?" + return -code error -errorcode {TOMLISH STRUCTURE REFTOEMPTYTABLEARRAY} $msg + } + lappend refpath $idx + } + } + } + # + #puts "TABLE supertable refpath $refpath" + lappend refpath @@[lindex $norm_segments end] + tomlish::log::debug "TABLEARRAY refpath $refpath" + set tablearray_refpath $refpath + + + if {![dict exists $tablenames_info $tablearray_refpath ttype]} { + #first encounter of this tablearrayname + if {[tomlish::dict::path::exists $datastructure $tablearray_refpath]} { + #e.g from_toml {a=1} {[[a]]} + set msg "Cannot create tablearray name $tablearrayname. Key already has data but key doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablearray_direct_keycollision_error + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #no collision - we can create the tablearray and the array in the datastructure + dict set tablenames_info $tablearray_refpath ttype header_tablearray + #dict set datastructure {*}$norm_segments [list type ARRAY value {}] + #create array along with empty array-item at position zero + tomlish::dict::path::set_endpoint datastructure $tablearray_refpath [list type ARRAY value {{}}] + set arrayitem_refpath [list {*}$tablearray_refpath 0] + #set ARRAY_ELEMENTS [list] + } else { + #we have an existing tablenames_info record for this path - but is it a tablearray? + set ttype [dict get $tablenames_info $tablearray_refpath ttype] + if {$ttype ne "header_tablearray"} { + #header_table or itable + switch -- $ttype { + itable {set ttypename itable} + header_table {set ttypename table} + dottedkey_table {set ttypename dottedkey_table} + unknown_header - unknown_dotted { + #table was created e.g as supertable - but not specifically a tablearray + #violates ordering - return specific test error + set msg "Table $tablearrayname referenced as supertable before tablearray defined (ordering)" + return -code error -errorcode {TOMLISH STRUCTURE TABLEARRAYORDERING} $msg + } + default {error "unrecognised type $ttype - expected header_table or itable"} + } + set msg "tablearray name $tablearrayname already appears to be already created as '$ttypename' not tablearray - invalid?" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #EXISTING tablearray + #add to array + #error "add_to_array not implemented" + #{type ARRAY value } + #set ARRAY_ELEMENTS [dict get $datastructure {*}$norm_segments value] + tomlish::log::debug ">>>>pre-extend-array dict::from_tomlish datastructure: $datastructure" + set existing_array [tomlish::dict::path::get $datastructure [list {*}$tablearray_refpath @@value]] + set arrayitem_refpath [list {*}$tablearray_refpath [llength $existing_array]] + tomlish::dict::path::lappend datastructure $tablearray_refpath {} + tomlish::log::debug ">>>>post-extend-array dict::from_tomlish datastructure: $datastructure" + } + + + #set object [dict create] ;#array context equivalent of 'datastructure' + + #add to ARRAY_ELEMENTS and write back in to datastructure. + foreach element [lrange $item 2 end] { + set type [lindex $element 0] + tomlish::log::debug "----> todict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element $arrayitem_refpath] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] + } + NEWLINE - COMMENT - WS { + #ignore + } + TABLE { + #we *perhaps* should be able to process tablearray subtables either as part of the tablearray record, or independently. + #(or even a mixture of both, although that is somewhat an edge case, and of limited utility) + #[[fruit]] + #x=1 + # [fruit.metadata] + # [fruit.otherdata] + + #when processing a dict destined for the above - the tomlish generator (e.g from_dict) + #should create as 1 or 3 records (but could create 2 records if there was an unrelated table in between the subtables) + #choices: all in tablearray record, tablearray + 1 or 2 table records. + # + #We are going the other way here - so we just need to realise that the list of tables 'belonging' to this tablearray might not be complete. + # + #the subtable names must be prefixed with the tablearray - we should validate that for any contained TABLE records + + #The default mechanism is for from_dict to produce tomlish with separate TABLE records - and use the ordering to determine membership + #If we were to support wrapping the TABLE records within a TABLEARRAY - we should also support TABLEARRAY within TABLEARRAY + # ----------------------------------------------------------------------- + #Implementing this is not critical for standard encoding/decoding of toml! + #It would be an alternative form for the tomlish intermediate form - and adds complexity. + # + #The upside would be to provide a function for sorting/rearranging in the tomlish form if all records were fully encapsulated. + #A possible downside is that unrelated tables placed before a tablearray is fully defined (within the tablearray definition area in toml) + # would have to be re-positioned before or after the encapsulated tablearray record. + # While unrelated tables in such a position aren't a recommended way to write toml, they appear to be valid + # and preserving the author's ordering is a goal of the basic encoding/decoding operations if no explicit sorting/reordering was requested. + # + #Consider an 'encapsulate' method to this (tomlish -> tomlish) + # ----------------------------------------------------------------------- + #todo + error "tomlish::dict::from_tomlish TABLE element within TABLEARRAY not handled - TABLE should be a separate tomlish record" + } + default { + error "tomlish::dict::from_tomlish Sub element of type '$type' not understood in tablearray context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" + } + } + } + + #end of TABLEARRAY record - equivalent of EOF or next header - close off the dottedtables + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + } + TABLE { + #close off any dottedtables_defined created by dottedkeys at this level + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + set tablename [lindex $item 1] + set dottedtables_defined [list] ;#for closing off at end by setting 'defined' + #As our TABLE record contains all it's child DOTTEDKEY records - this should be equivalent to setting them as defined at EOF or next header. + + #----------------------------------------------------------------------------------- + #default assumption - our reference is to the main tablenames_info and datastructure + #Will need to append keys appropriately if we have recursed + #----------------------------------------------------------------------------------- + + log::debug "---> tomlish::dict::from_tomlish processing item TABLE (name: $tablename): $item" + set norm_segments [::tomlish::toml::tablename_split $tablename true] ;#true to normalize + + + + set name_segments [::tomlish::toml::tablename_split $tablename 0] ;#unnormalized e.g ['a'."b".c.d] -> 'a' "b" c d + #results of tablename_split 0 are 'raw' - ie some segments may be enclosed in single or double quotes. + + + set supertable [list] + ############## + # [a.b.c.d] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + + ############## + #[[a]] + #[a.b] #supertable a is tablearray + ############## + + #also consider + ############## + # [[a.b]] + # [a.b.c.d] #supertable a is a table, supertable a.b is tablearray, supertable a.b.c is elementtable + ############## + set refpath [list] ;#e.g @@j1 @@j2 1 @@k1 end + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + lappend refpath @@$normseg + if {![dict exists $tablenames_info $refpath ttype]} { + #supertable with this path doesn't yet exist + if {[tomlish::dict::path::exists $datastructure $refpath]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] of table name $tablename (path $refpath) already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #we also don't know whether it's a table or a dottedkey_table (not allowed to be tablearray - out of order?) + dict set tablenames_info $refpath ttype unknown_header + #ensure empty tables are still represented in the datastructure + #dict set datastructure {*}$supertable [list] + tomlish::dict::path::set_endpoint datastructure $refpath {} + } else { + #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable + if {[dict get $tablenames_info $refpath ttype] eq "header_tablearray"} { + #'refer' to the appropriate element in existing array + set arrdata [tomlish::dict::path::get $datastructure [list {*}$refpath @@value]] + set idx [expr {[llength $arrdata]-1}] + if {$idx < 0} { + #existing tablearray should have at least one entry even if empty (review) + set msg "reference to empty tablearray?" + return -code error -errorcode {TOMLISH STRUCTURE REFTOEMPTYTABLEARRAY} $msg + } + lappend refpath $idx + } else { + #?? + if {[dictn getdef $tablenames_info [list $refpath tdefined] NULL] eq "NULL"} { + } else { + } + } + } + } + #puts "TABLE supertable refpath $refpath" + lappend refpath @@[lindex $norm_segments end] + tomlish::log::info "TABLE refpath $refpath" + set table_refpath $refpath + + + + + #table [a.b.c.d] hasn't been defined - but may have been 'created' already by a longer tablename + # - or may have existing data from a keyval + if {![dict exists $tablenames_info $table_refpath ttype]} { + if {[tomlish::dict::path::exists $datastructure $table_refpath]} { + #e.g from_toml {a=1} {[a]} + set msg "Cannot create table name $tablename. Key already has data but key doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablename_keyval_collision_error + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #no data or previously created table + dict set tablenames_info $table_refpath ttype header_table + + #We are 'defining' this table's keys and values here (even if empty) + #dict set datastructure {*}$norm_segments [list] ;#ensure table still represented in datastructure even if we add no keyvals here + tomlish::dict::path::set_endpoint datastructure $table_refpath {} ;#ensure table still represented in datastructure even if we add no keyvals here + } else { + if {[dict get $tablenames_info $table_refpath ttype] eq "header_tablearray"} { + #e.g tomltest invalid/table/duplicate-table-array2 + #[[tbl]] + #[tbl] + set msg "Table name $tablename has already been created as a tablearray. Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } else { + #any other type tdefined is a problem + set T_DEFINED [dictn getdef $tablenames_info [list $table_refpath tdefined] NULL] + if {$T_DEFINED ne "NULL" } { + #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path + set msg "Table name $tablename has already been defined in the toml data. Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + } + dict set tablenames_info $table_refpath tdefined open + + #now add the contained elements + foreach element [lrange $item 2 end] { + set type [lindex $element 0] + log::debug "----> todict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element $table_refpath] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] + } + NEWLINE - COMMENT - WS { + #ignore + } + default { + error "Sub element of type '$type' not understood in table context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" + } + } + } + + #end of TABLE record - equivalent of EOF or next header - close off the dottedtables + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + } + ITABLE { + #As there is no other mechanism to create tables within an ITABLE than dottedkeys + # and ITABLES are fully defined/enclosed - we can rely on key collision and don't need to track dottedtables_defined - REVIEW. + set dottedtables_defined [list] + #SEP??? + #ITABLE only ever on RHS of = or inside ARRAY + set datastructure [dict create] + set tablenames_info [dict create] + + foreach element [lrange $item 1 end] { + set type [lindex $element 0] + log::debug "----> tododict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element] + } + NEWLINE - COMMENT - WS { + #ignore + } + default { + error "Sub element of type '$type' not understood in ITABLE context. Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" + } + } + } + } + ARRAY { + #arrays in toml are allowed to contain mixtures of types + set datastructure [list] + log::debug "--> processing array: $item" + + foreach element [lrange $item 1 end] { + set type [lindex $element 0] + log::debug "----> tododict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + set value [lindex $element 1] + lappend datastructure [list type $type value $value] + } + STRING { + #JJJJ + #don't unescape string! + set value [lindex $element 1] + #lappend datastructure [list type $type value [::tomlish::utils::unescape_string $value]] + lappend datastructure [list type $type value $value] + } + LITERAL { + set value [lindex $element 1] + lappend datastructure [list type $type value $value] + } + ITABLE { + #anonymous table + #lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] + lappend datastructure [::tomlish::dict::from_tomlish [list $element]] ;#store itables within arrays as raw dicts (possibly empty) + } + TABLE - TABLEARRAY { + #invalid? shouldn't be output from from_dict - but could manually be constructed as such? review + #doesn't make sense as table needs a name? + #take as synonym for ITABLE? + error "tomlish::dict::from_tomlish $type within array unexpected" + } + ARRAY - MULTISTRING - MULTILITERAL { + #set value [lindex $element 1] + lappend datastructure [list type $type value [::tomlish::dict::from_tomlish [list $element]]] + } + WS - SEP - NEWLINE - COMMENT { + #ignore whitespace, commas, newlines and comments + } + default { + error "tomlish::dict::from_tomlish Unexpected value type '$type' found in array" + } + } + } + } + MULTILITERAL { + #triple squoted string + #first newline stripped only if it is the very first element + #(ie *immediately* following the opening delims) + #All whitespace other than newlines is within LITERALPARTS + # ------------------------------------------------------------------------- + #todo - consider extension to toml to allow indent-aware multiline literals + # how - propose as issue in toml github? Use different delim? e.g ^^^ ? + #e.g + # xxx=?'''abc + # def + # etc + # ''' + # - we would like to trimleft each line to the column following the opening delim + # ------------------------------------------------------------------------- + + log::debug "---> todict processing multiliteral: $item" + set parts [lrange $item 1 end] + if {[lindex $parts 0 0] eq "NEWLINE"} { + set parts [lrange $parts 1 end] ;#skip it + } + for {set idx 0} {$idx < [llength $parts]} {incr idx} { + set element [lindex $parts $idx] + set type [lindex $element 0] + switch -exact -- $type { + LITERALPART { + append stringvalue [lindex $element 1] + } + NEWLINE { + set val [lindex $element 1] + if {$val eq "lf"} { + append stringvalue \n + } else { + append stringvalue \r\n + } + } + default { + error "Unexpected value type '$type' found in multistring" + } + } + } + set datastructure $stringvalue + } + MULTISTRING { + #triple dquoted string + log::debug "---> tomlish::dict::from_tomlish processing multistring: $item" + set stringvalue "" + set idx 0 + set parts [lrange $item 1 end] + for {set idx 0} {$idx < [llength $parts]} {incr idx} { + set element [lindex $parts $idx] + set type [lindex $element 0] + #We use STRINGPART in the tomlish representation as a distinct element to STRING - which would imply wrapping quotes to be reinserted + switch -exact -- $type { + STRING { + #todo - do away with STRING ? + #we don't build MULTISTRINGS containing STRING - but should we accept it? + tomlish::log::warn "double quoting a STRING found in MULTISTRING - should be STRINGPART?" + #append stringvalue "\"[::tomlish::utils::unescape_string [lindex $element 1]]\"" + append stringvalue "\"[lindex $element 1]\"" + } + STRINGPART { + #JJJ + #don't unescape string + #append stringvalue [::tomlish::utils::unescape_string [lindex $element 1]] + append stringvalue [lindex $element 1] + } + CONT { + #When the last non-whitespace character on a line is an unescaped backslash, + #it will be trimmed along with all whitespace (including newlines) up to the next non-whitespace character or closing delimiter + # review - we allow some whitespace in stringpart elements - can a stringpart ever be all whitespace? + set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] + if {$next_nl == -1} { + #last (or first and only) line + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + #set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] + #if {$non_ws >= 0} { + # #append stringvalue "\\" + # return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + #} else { + # #skip over ws without emitting + # set idx [llength $parts] + #} + } else { + set parts_til_nl [lrange $parts 0 $next_nl-1] + set non_ws [lsearch -index 0 -start $idx+1 -not $parts_til_nl WS] + if {$non_ws >= 0} { + #This CONT is invalid. If there had been a non-whitespace char directly following it, + #it wouldn't have come through as a CONT token + #Now that we see it isn't the last non-whitespace backslash on the line we can reject + # as an invalid escape of space or tab + #append stringvalue "\\" + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + } else { + #skip over ws on this line + set idx $next_nl + #then have to check each subsequent line until we get to first non-whitespace + set trimming 1 + while {$trimming && $idx < [llength $parts]} { + set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] + if {$next_nl == -1} { + #last line + set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] + if {$non_ws >= 0} { + set idx [expr {$non_ws -1}] + } else { + set idx [llength $parts] + } + set trimming 0 + } else { + set non_ws [lsearch -index 0 -start $idx+1 -not [lrange $parts 0 $next_nl-1] WS] + if {$non_ws >= 0} { + set idx [expr {$non_ws -1}] + set trimming 0 + } else { + set idx $next_nl + #keep trimming + } + } + } + } + } + } + NEWLINE { + #if newline is first element - it is not part of the data of a multistring + if {$idx > 0} { + set val [lindex $element 1] + if {$val eq "lf"} { + append stringvalue \n + } else { + append stringvalue \r\n + } + } + } + WS { + append stringvalue [lindex $element 1] + } + default { + error "Unexpected value type '$type' found in multistring" + } + } + } + set datastructure $stringvalue + } + WS - COMMENT - NEWLINE { + #ignore + } + BOM { + #this token is the unicode single char \uFFEF + #It doesn't tell us what encoding was originally used (though toml should only accept UTF-8 files) + #ignore at start - what about in other positions? + } + default { + error "Unexpected tag '$tag' in Tomlish list '$tomlish'" + } + } + } + return $datastructure + } +} +namespace eval tomlish::dict::path { + #access tomlish dict structure + namespace export {[a-z]*}; # Convention: export all lowercase + + #access with path such as: @@k @@k 0 @@k end where dict keys marked with @@ and plain values are list indices into in {type ARRAY value } + #leaf elements returned as structured {type value } + proc get {dictval {path {}}} { + if {$path eq ""} { + return $dictval + } + ::set data $dictval + ::set pathsofar [list] + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set data [dict get $data [string range $p 2 end]] + } else { + if {![tomlish::dict::is_typeval $data]} { + error "tomlish::dict::path::get error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::get error bad path $path. Subpath $pathsofar is not an array." + } + ::set arrdata [dict get $data value] + ::set data [lindex $arrdata $p] + } + } + return $data + } + proc exists {dictval path} { + ::set data $dictval + ::set pathsofar [list] + ::set exists 1 + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] + if {![dict exists $data $k]} { + return 0 + } + ::set data [dict get $data $k] + } else { + if {![tomlish::dict::is_typeval $data]} { + return 0 + } + if {[dict get $data type] ne "ARRAY"} { + return 0 + } + ::set arrdata [dict get $data value] + ::set intp [tomlish::system::lindex_resolve_basic $arrdata $p] ;#handle index math (end-1 etc) + if {$intp == -1} { + #out of bounds + return 0 + } + ::set data [lindex $arrdata $p] + } + } + return $exists + } + + #a restricted analogy of 'dictn set' + #set 'endpoints' - don't create intermediate paths + # can replace an existing dict with another dict + # can create a key when key at tail end of path is a key (ie @@keyname, not index) + # can replace an existing {type value value } + # with added restriction that if is ARRAY the new must also be ARRAY + proc set_endpoint {dictvariable path value} { + upvar $dictvariable dict_being_edited + ::set data $dict_being_edited + ::set pathsofar [list] + if {!([tomlish::dict::is_typeval $value] || [tomlish::dict::is_typeval_dict $value 0])} { + #failed check of supplied value as basic type, or a sub-dict structure (not checking arrays) + error "tomlish::dict::path::set_endpoint error - value must already be in the tomlish form {type value } or be a dict with such forms as leaves" + } + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] + + #if {![dict exists $data $k]} { + # error "tomlish::dict:path::set error bad path $path. Attempt to access nonexistent element at subpath $pathsofar." + #} + ::set varname v[incr v] + + if {$pathsofar eq $path} { + #see if endpoint of the path given already exists + if {[dict exists $data $k]} { + ::set endpoint [dict get $data $k] + if {[tomlish::dict::is_typeval $endpoint]} { + set existing_tp [dict get $endpoint type] + if {![tomlish::dict::is_typeval $value]} { + error "tomlish::dict::path::set_endpoint error Unable to overwrite subpath '$pathsofar' which is of type $existing_tp with sub-dict. Supplied value not {type value val } with sub-dict: $value" + } + switch -- [dict get $endpoint type] { + ARRAY { + #disallow overwriting array - unless given value is an ARRAY? REVIEW + if {[dict get $value type] ne "ARRAY"} { + error "tomlish::dict::path::set_endpoint error bad path '$path'. Cannot overwrite array with non-array: $value" + } + } + default { + # + } + } + } else { + #endpoint is a typeval dict not a plain typeval - only allow overwrite with a typeval dict + if {![tomlish::dict::is_typeval_dict $value 0]} { + error "tomlish::dict::path::set_endpoint error path '$path'. Cannot overwrite sub-dict (size: [dict size $endpoint]) with non sub-dict: $value" + } + } + ::set $varname $value + dict set vdict $pathsofar $varname + break + } else { + ::set arrdata [dict get $data value] + set idx [tomlish::system::lindex_resolve_basic $arrdata $p] + if {$idx == -1} { + error "tomlish::dict::path::set_endpoint error bad path '$path'. No existing element at $p" + } + ::set data [lindex $arrdata $p] + ::set $varname $data + dict set vdict $pathsofar $varname + } + } + } + #dict for {path varname} $vdict { + # puts "$path $varname\n" + # puts " '[::set $varname]'\n" + # puts "" + #} + ::set i 0 + ::set reverse [lreverse $vdict] + foreach {varname path} $reverse { + set newval [::set $varname] + if {$i+2 == [llength $reverse]} { + ::set k [lindex $path end] + ::set k [string range $k 2 end] ;#first key is always @@something + dict set dict_being_edited $k $newval + #puts "--result $dict_being_edited" + break + } + ::set nextvarname [lindex $reverse $i+2] + ::set nextval [::set $nextvarname] + ::set k [lindex $path end] + if {[string match @@* $k]} { + #dict key + #dict set $nextvarname $k $newval + set_endpoint $nextvarname [list $k] $newval + } else { + #list index + ::set nextarr [dict get $nextval value] + ::lset nextarr $k $newval + dict set $nextvarname value $nextarr + } + ::incr i 2 + } + + return $dict_being_edited + + } + #path must be to a {type ARRAY value } + #REVIEW - how to lappend to deep mixed dict/array structure without rewriting whole datastructure? + proc lappend {dictvariable path args} { + upvar $dictvariable dict_being_edited + ::set data $dict_being_edited + ::set pathsofar [list] + #::set newlist [list] + ::set v 0 + ::set vdict [dict create] + foreach a $args { + if {![string is dict $a]} { + error "tomlish::dict::path::lappend error - lappended arguments must already be in the tomlish form {type value } or be a dict with such forms as leaves" + } + } + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] + if {![dict exists $data $k]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access nonexistent element at subpath $pathsofar." + } + ::set varname v[incr v] + + if {$pathsofar eq $path} { + #see if endpoint of the path given is an ARRAY + ::set endpoint [dict get $data $k] + if {![tomlish::dict::is_typeval $endpoint]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $endpoint type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar is not an array." + } + ::set arrdata [dict get $endpoint value] + ::lappend arrdata {*}$args + dict set endpoint value $arrdata + ::set newlist $endpoint + ::set $varname $newlist + dict set vdict $pathsofar $varname + break + } + ::set data [dict get $data $k] + ::set $varname $data + dict set vdict $pathsofar $varname + } else { + if {![tomlish::dict::is_typeval $data]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar is not an array." + } + ::set varname v[incr v] + if {$pathsofar eq $path} { + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Parent path is not an array." + } + ::set parentarray [dict get $data value] + ::set idx [tomlish::system::lindex_resolve_basic $parentarray $p] + if {$idx == -1} { + error "tomlish::dict::path::lappend error bad path $path. Index $p does not exist." + } + ::set endpoint [lindex $parentarray $p] + if {[dict get $endpoint type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Not an array." + } + + ::set arrdata [dict get $endpoint value] + ::lappend arrdata {*}$args + dict set endpoint value $arrdata + ::set newlist $endpoint + #::lset parentarray $p $newlist + #set parentarray $newlist + ::set $varname $newlist + dict set vdict $pathsofar $varname + break + } else { + ::set arrdata [dict get $data value] + set idx [tomlish::system::lindex_resolve_basic $arrdata $p] + if {$idx == -1} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar, index $p does not exist." + } + ::set data [lindex $arrdata $p] + ::set $varname $data + dict set vdict $pathsofar $varname + } + } + } + #todo tomlish::log::debug ? + #dict for {path varname} $vdict { + # puts "$path $varname\n" + # puts " [::set $varname]\n" + # puts "" + #} + ::set i 0 + ::set reverse [lreverse $vdict] + foreach {varname path} $reverse { + set newval [::set $varname] + if {$i+2 == [llength $reverse]} { + ::set k [lindex $path end] + ::set k [string range $k 2 end] ;#first key is always @@something + dict set dict_being_edited $k $newval + #puts "--result $dict_being_edited" + break + } + ::set nextvarname [lindex $reverse $i+2] + ::set nextval [::set $nextvarname] + ::set k [lindex $path end] + if {[string match @@* $k]} { + #dict key + set k [string range $k 2 end] + dict set $nextvarname $k $newval + } else { + #list index + ::set nextarr [dict get $nextval value] + ::lset nextarr $k $newval + dict set $nextvarname value $nextarr + } + ::incr i 2 + } + return $dict_being_edited + } +} +tcl::namespace::eval tomlish::to_dict { + + + proc @@path {dictkeys} { + lmap v $dictkeys {string cat @@ $v} + } + +} + + +tcl::namespace::eval tomlish::app { + #*** !doctools + #[subsection {Namespace tomlish::app}] + #[para] + #[list_begin definitions] + + tcl::namespace::eval argdoc { + proc test_suites {} { + if {[package provide test::tomlish] eq ""} { + return [list] + } + return [test::tomlish::SUITES] + } + } + + package require punk::args + punk::args::define { + @id -id ::tomlish::app::decoder + @cmd -name tomlish::app::decoder -help\ + "Read toml on stdin until EOF + on error - returns non-zero exit code and writes error to + the errorchannel. + on success - returns zero exit code and writes JSON encoding + of the data to the outputchannel. + This decoder is intended to be compatble with toml-test." + @leaders -min 0 -max 0 + @opts + -help -type none -help\ + "Display this usage message" + -inputchannel -default stdin + -inputencoding -default "iso8859-1" -choicerestricted 0 -choices {utf-8 utf-16 iso8859-1} -help\ + "configure encoding on input channel + iso8859-1 is equivalent to binary encoding" + -outputchannel -default stdout + -errorchannel -default stderr + @values -min 0 -max 0 + } + proc decoder {args} { + set argd [punk::args::parse $args withid ::tomlish::app::decoder] + set ch_input [dict get $argd opts -inputchannel] + set ch_input_enc [dict get $argd opts -inputencoding] + set ch_output [dict get $argd opts -outputchannel] + set ch_error [dict get $argd opts -errorchannel] + if {[dict exists $argd received -help]} { + return [punk::args::usage -scheme info ::tomlish::app::decoder] + } + + chan configure $ch_input -encoding $ch_input_enc + #translation? + chan configure $ch_input -translation lf ;# toml-test invalid/control tests we need to see raw CRs to reject them properly - auto translation won't do. + + #Just slurp it all - presumably we are not handling massive amounts of data on stdin. + # - even if the input is large, we probably don't gain much (aside from possible memory savings?) by attempting to process input as it arrives. + if {[catch { + set inputdata [read $ch_input] + if {$ch_input_enc eq "iso8859-1"} { + set toml [tomlish::toml::from_binary $inputdata] + } else { + set toml $inputdata + } + } errM]} { + puts stderr "read-input error: $errM" + #toml-tests expect exit code 1 + #e.g invalid/encoding/utf16-bom + exit 1 ;#read error + } + try { + set j [::tomlish::toml_to_typedjson $toml] + } on error {em} { + puts $ch_error "decoding failed: '$em'" + exit 1 + } + puts -nonewline $ch_output $j + exit 0 + } + + package require punk::args + punk::args::define { + @id -id ::tomlish::app::encoder + @cmd -name tomlish::app::encoder -help\ + "Read JSON on input until EOF + return non-zero exitcode if JSON data cannot be converted to + a valid TOML representation. + return zero exitcode and TOML data on output if JSON data can + be converted. + This encoder is intended to be compatible with toml-test." + @leaders -min 0 -max 0 + @opts + -help -type none -help \ + "Display this usage message" + -restrict_barekeys -default 0 -help\ + "If true, keys containing unicode will be quoted. + If false, an extended range of barekeys will be used + in unquoted form." + -inputchannel -default stdin + -inputencoding -default "" -choicerestricted 0 -choices {utf-8 utf-16 iso8859-1} -help\ + "configure encoding on input channel + If not supplied, leave at Tcl default" + -outputchannel -default stdout + -errorchannel -default stderr + @values -min 0 -max 0 + } + proc encoder {args} { + set argd [punk::args::parse $args withid ::tomlish::app::encoder] + set restrict_barekeys [dict get $argd opts -restrict_barekeys] + set ch_input [dict get $argd opts -inputchannel] + set ch_input_enc [dict get $argd opts -inputencoding] + set ch_output [dict get $argd opts -outputchannel] + set ch_error [dict get $argd opts -errorchannel] + if {[dict exists $argd received -help]} { + return [punk::args::usage -scheme info ::tomlish::app::encoder] + } + #review + if {$ch_input_enc ne ""} { + chan configure $ch_input -encoding $ch_input_enc + } + #review + chan configure $ch_input -translation lf + + if {[catch { + set json [read $ch_input] + }]} { + exit 2 ;#read error + } + try { + #tomlish::typedjson_to_toml + set toml [::tomlish::toml::from_tomlish_from_dict_from_typedjson $json] + } trap {} {e eopts} { + puts $ch_error "encoding failed: '$e'" + puts $ch_error "$::errorInfo" + exit 1 + } + puts -nonewline $ch_output $toml + exit 0 + } + + punk::args::define { + @dynamic + @id -id ::tomlish::app::test + @cmd -name tomlish::app::test + @leaders + @opts -any 1 + -help -type none -help\ + "Display this usage message + or further info if more args." + -suite -default tests -choices {${[::tomlish::app::argdoc::test_suites]}} + @values -min 0 -max -1 + } + proc test {args} { + package require test::tomlish + set argd [punk::args::parse $args withid ::tomlish::app::test] + set opts [dict get $argd opts] + set values [dict get $argd values] + set received [dict get $argd received] + set solos [dict get $argd solos] + set opt_suite [dict get $opts -suite] + if {[dict exists $received -help] && ![dict exists $received -suite]} { + return [punk::args::usage -scheme info ::tomlish::app::test] + } + + test::tomlish::SUITE $opt_suite + #if {[catch {test::tomlish::SUITE $opt_suite} errM]} { + # puts stderr "Unknown test suite '$opt_suite'. Available suites: [test::tomlish::SUITES]" + # exit 1 + #} + set run_opts [dict remove $opts -suite] + set run_opts [dict remove $run_opts {*}$solos] + set result [test::tomlish::RUN {*}$run_opts {*}$solos {*}$values] + return $result + } + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::app ---}] +} + +proc ::tomlish::appnames {} { + set applist [list] + foreach cmd [info commands ::tomlish::app::*] { + lappend applist [namespace tail $cmd] + } + return $applist +} + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# Secondary API namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval tomlish::lib { + namespace export {[a-z]*}; # Convention: export all lowercase + namespace path [namespace parent] + #*** !doctools + #[subsection {Namespace tomlish::lib}] + #[para] Secondary functions that are part of the API + #[list_begin definitions] + + #proc utility1 {p1 args} { + # #*** !doctools + # #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]] + # #[para]Description of utility1 + # return 1 + #} + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::lib ---}] +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval tomlish::system { + + #taken from punk::lib + proc lindex_resolve_basic {list index} { + #*** !doctools + #[call [fun lindex_resolve_basic] [arg list] [arg index]] + #[para] Accepts index of the forms accepted by Tcl's list commands. (e.g compound indices such as 3+1 end-2) + #[para] returns -1 for out of range at either end, or a valid integer index + #[para] Unlike lindex_resolve; lindex_resolve_basic can't determine if an out of range index was out of range at the lower or upper bound + #[para] This is only likely to be faster than average over lindex_resolve for Tcl which has the builtin lseq command + #[para] The performance advantage is more likely to be present when using compound indexes such as $x+1 or end-1 + #[para] For pure integer indices the performance should be equivalent + + set index [tcl::string::map {_ {}} $index] ;#forward compatibility with integers such as 1_000 + if {[string is integer -strict $index]} { + #can match +i -i + #avoid even the lseq overhead when the index is simple + if {$index < 0 || ($index >= [llength $list])} { + #even though in this case we could return -2 or -3 like lindex_resolve; for consistency we don't, as it's not always determinable for compound indices using the lseq method. + return -1 + } else { + #integer may still have + sign - normalize with expr + return [expr {$index}] + } + } + if {[llength $list]} { + set indices [tomlish::system::range 0 [expr {[llength $list]-1}]] ;# uses lseq if available, has fallback. + #if lseq was available - $indices is an 'arithseries' - theoretically not taking up ram(?) + } else { + set indices [list] + } + set idx [lindex $indices $index] + if {$idx eq ""} { + #we have no way to determine if out of bounds is at lower vs upper end + return -1 + } else { + return $idx + } + } + + if {[info commands ::lseq] ne ""} { + #tcl 8.7+ lseq significantly faster, especially for larger ranges + #The internal rep can be an 'arithseries' with no string representation + #support minimal set from to + proc range {from to} { + lseq $from $to + } + } else { + #lseq accepts basic expressions e.g 4-2 for both arguments + #e.g we can do lseq 0 [llength $list]-1 + #if range is to be consistent with the lseq version above - it should support that, even though we don't support most lseq functionality in either wrapper. + proc range {from to} { + set to [offset_expr $to] + set from [offset_expr $from] + if {$to > $from} { + set count [expr {($to -$from) + 1}] + if {$from == 0} { + return [lsearch -all [lrepeat $count 0] *] + } else { + incr from -1 + return [lmap v [lrepeat $count 0] {incr from}] + } + #slower methods. + #2) + #set i -1 + #set L [lrepeat $count 0] + #lmap v $L {lset L [incr i] [incr from];lindex {}} + #return $L + #3) + #set L {} + #for {set i 0} {$i < $count} {incr i} { + # lappend L [incr from] + #} + #return $L + } elseif {$from > $to} { + set count [expr {$from - $to} + 1] + #1) + if {$to == 0} { + return [lreverse [lsearch -all [lrepeat $count 0] *]] + } else { + incr from + return [lmap v [lrepeat $count 0] {incr from -1}] + } + + #2) + #set i -1 + #set L [lrepeat $count 0] + #lmap v $L {lset L [incr i] [incr from -1];lindex {}} + #return $L + #3) + #set L {} + #for {set i 0} {$i < $count} {incr i} { + # lappend L [incr from -1] + #} + #return $L + } else { + return [list $from] + } + } + } + +} + +if {[info exists ::argc] && $::argc > 0} { + #puts stderr "argc: $::argc args: $::argv" + set arglist $::argv + # -------------- + #make sure any dependant packages that are sourced don't get any commandline args + set ::argv {} + set ::argc 0 + # -------------- + package require punk::args + punk::args::define { + @dynamic + @id -id tomlish::cmdline + @cmd -name tomlish -help\ + "toml encoder/decoder written in Tcl" + @opts -any 1 + -help -type none -help\ + "Display this usage message or more specific + help if further arguments provided." + -app -choices {${[tomlish::appnames]}} + } + try { + set argd [punk::args::parse $arglist withid tomlish::cmdline] + } trap {PUNKARGS VALIDATION} {msg erroropts} { + puts stderr $msg + exit 1 + } + + + lassign [dict values $argd] leaders opts values received solos + if {[dict exists $received -help] && ![dict exists $received -app]} { + #only emit cmdline help if -app not supplied as well - otherwise app function can act on -help for more specific help + #puts stdout "Usage: -app where appname one of:[tomlish::appnames]" + puts stdout [punk::args::usage -scheme info tomlish::cmdline] + exit 0 + } + if {![dict exists $received -app]} { + puts stderr [punk::args::usage -scheme error tomlish::cmdline] + exit 1 + } + + set app [dict get $opts -app] + set appnames [tomlish::appnames] + set app_opts [dict remove $opts -app {*}$solos] + try { + set result [tomlish::app::$app {*}$app_opts {*}$solos {*}$values] + } trap {PUNKARGS VALIDATION} {msg erroropts} { + #The validation error should fully describe the issue + #no need for errortrace - keep the output cleaner + puts stderr $msg + exit 1 + } trap {} {msg erroropts} { + #unexpected error - uncaught throw will produce error trace + #todo - a support msg? Otherwise we may as well just leave off this trap. + throw [dict get $erroropts -errorcode] [dict get $erroropts -errorinfo] + } + if {"-help" in $solos} { + puts stderr $result + exit 1 + } else { + if {$result ne ""} { + puts stdout $result + exit 0 + } + } +} + +## Ready +package provide tomlish [namespace eval tomlish { + variable pkg tomlish + variable version + set version 1.1.6 +}] +return + +#*** !doctools +#[manpage_end] + diff --git a/src/vendormodules/test/tomlish-1.1.5.tm b/src/vendormodules/test/tomlish-1.1.5.tm index 642b935b..35de5e70 100644 Binary files a/src/vendormodules/test/tomlish-1.1.5.tm and b/src/vendormodules/test/tomlish-1.1.5.tm differ diff --git a/src/vendormodules/tomlish-1.1.6.tm b/src/vendormodules/tomlish-1.1.6.tm index f3251660..dddcd0bb 100644 --- a/src/vendormodules/tomlish-1.1.6.tm +++ b/src/vendormodules/tomlish-1.1.6.tm @@ -81,6 +81,11 @@ namespace eval tomlish { namespace export {[a-z]*}; # Convention: export all lowercase variable types + #*** !doctools + #[subsection {Namespace tomlish}] + #[para] Core API functions for tomlish + #[list_begin definitions] + #default interp recursionlimit of 1000 is insufficient to pass 1000 deep nested structures as in certain toml tests. #e.g https://github.com/iarna/toml-spec-tests/tree/latest/values #1000 seems deep for a 'configuration' format - but toml sometimes used for other serialisation purposes. @@ -135,8 +140,8 @@ namespace eval tomlish { # ----------------------------------------------------- #REVIEW #todo - some sort of 'normalize'/'grouping' function on tomlish that at least makes records self-contained, and perhaps then (optionally) reorders resulting records sensibly. - #such a function on the tomlish may work - although it would be unwise to duplicate the validation aspects of to_dict - #The most practical way might be to use to_dict followed by from_dict - but that would lose comment info and formatting. + #such a function on the tomlish may work - although it would be unwise to duplicate the validation aspects of dict::from_tomlish + #The most practical way might be to use dict::from_tomlish followed by from_dict - but that would lose comment info and formatting. #In the above example - The decision by the toml author to put [unrelated1] between related tablearrays should be respected, #but the positioning of [unrelated2] between a tablearray and one of its contained tables is suspect. #Both [fruits.metadata] table records should theoretically be added as children to their corresponding [[fruits]] tablearray record in the tomlish. (just as their name keys are) @@ -155,7 +160,15 @@ namespace eval tomlish { # inline tables are fully defined between their braces, as are dotted-key subtables defined within # No additional subtables or arrays of tables may be defined within an inline table after the ending brace - they must be entirely self-contained - set tags [list TOMLISH ARRAY TABLE ITABLE TABLEARRAY WS NEWLINE COMMENT DOTTEDKEY KEY DQKEY SQKEY STRING STRINGPART MULTISTRING LITERAL LITERALPART MULTILITERAL INT FLOAT BOOL DATETIME] + set tags [list TOMLISH BOM ARRAY TABLE ITABLE TABLEARRAY WS NEWLINE COMMENT DOTTEDKEY KEY DQKEY SQKEY STRING STRINGPART MULTISTRING LITERAL LITERALPART MULTILITERAL INT FLOAT BOOL] + #DDDD + lappend tags {*}[list\ + DATETIME\ + DATETIME-LOCAL\ + DATE-LOCAL\ + TIME-LOCAL\ + ] + #removed - ANONTABLE #tomlish v1.0 should accept arbitrary 64-bit signed ints (from -2^63 to 2^63-1) #we will restrict to this range for compatibility for now - although Tcl can handle larger (arbitrarily so?) @@ -174,16 +187,57 @@ namespace eval tomlish { log::logproc $lvl tomlish_log_$lvl } - #*** !doctools - #[subsection {Namespace tomlish}] - #[para] Core API functions for tomlish - #[list_begin definitions] proc tags {} { return $::tomlish::tags } - #helper function for to_dict + proc get_dottedkey_info {dottedkeyrecord} { + set key_hierarchy [list] + set key_hierarchy_raw [list] + if {[lindex $dottedkeyrecord 0] ne "DOTTEDKEY"} { + error "tomlish::get_dottedkey_info error. Supplied list doesn't appear to be a DOTTEDKEY (tag: [lindex $dottedkeyrecord 0])" + } + set compoundkeylist [lindex $dottedkeyrecord 1] + set expect_sep 0 + foreach part $compoundkeylist { + set parttag [lindex $part 0] + if {$parttag eq "WS"} { + continue + } + if {$expect_sep} { + if {$parttag ne "DOTSEP"} { + error "DOTTEDKEY missing dot separator between parts. '$dottedkeyrecord'" + } + set expect_sep 0 + } else { + set val [lindex $part 1] + switch -exact -- $parttag { + KEY { + lappend key_hierarchy $val + lappend key_hierarchy_raw $val + } + DQKEY { + #REVIEW unescape or not? + #JJJJ + lappend key_hierarchy [::tomlish::utils::unescape_string $val] + lappend key_hierarchy_raw \"$val\" + } + SQKEY { + lappend key_hierarchy $val + lappend key_hierarchy_raw "'$val'" + } + default { + error "tomlish::get_dottedkey_info DOTTED key unexpected part '$parttag' - ensure dot separator is between key parts. '$compoundkeylist'" + } + } + set expect_sep 1 + } + } + return [dict create keys $key_hierarchy keys_raw $key_hierarchy_raw] + } + + #helper function for tomlish::dict::from_tomlish proc _get_keyval_value {keyval_element} { #e.g #DOTTEDKEY {{KEY a} {WS { }}} = {WS { }} {ARRAY {INT 1} SEP {ITABLE {DOTTEDKEY {{KEY x}} = {INT 1} SEP} {DOTTEDKEY {{KEY y}} = {INT 2}}}} @@ -209,21 +263,22 @@ namespace eval tomlish { set posn 0 foreach sub $sublist { #note that a barekey/dquotedkey won't occur directly inside a barekey/dquotedkey + #DDDD switch -exact -- [lindex $sub 0] { - STRING - LITERAL - MULTISTRING - MULTILITERAL - INT - FLOAT - BOOL - DATETIME - TABLE - ARRAY - ITABLE { + STRING - LITERAL - MULTISTRING - MULTILITERAL - INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - TABLE - ARRAY - ITABLE { lappend values $sub lappend value_posns $posn } DOTTEDKEY { #we should never see DOTTEDKEY as a toplevel element on RHS #sanity check in case manually manipulated tomlish - or something went very wrong - set msg "tomlish::_get_keyval_value Unexpected toplevel DOTTEDKEY after =" + set msg "tomlish::_get_keyval_value Unexpected toplevel value element DOTTEDKEY after =" return -code error -errorcode {TOMLISH SYNTAX UNEXPECTEDDOTTEDKEYRHS} $msg } - WS - NEWLINE {} + WS - NEWLINE - COMMENT {} SEP {} default { - set msg "tomlish::_get_keyval_value Unexpected toplevel element [lindex $sub 0]" + set msg "tomlish::_get_keyval_value Unexpected toplevel value element [lindex $sub 0] after =" return -code error -errorcode {TOMLISH SYNTAX UNEXPECTED} $msg } } @@ -255,10 +310,20 @@ namespace eval tomlish { } lassign [lindex $values 0] type_d1 value_d1 lassign [lindex $values 1] type_d2 value_d2 - if {$type_d1 ne "DATETIME" || $type_d2 ne "DATETIME"} { + #DDDD + if {$type_d1 ne "DATE-LOCAL" || $type_d2 ni {DATETIME TIME-LOCAL}} { + #we reuse DATETIME tag for standalone time with tz offset (or zZ) error "tomlish KEY in 2 parts does not appear to be datetime '$keyval_element'" } - set type DATETIME + if {$type_d2 eq "TIME-LOCAL"} { + set type DATETIME-LOCAL + } else { + #extra check that 2nd part is actually a time + if {![tomlish::utils::is_timepart $value_d2]} { + error "tomlish KEY in 2 parts does not appear to be datetime. (part 2 not a time value) '$keyval_element'" + } + set type DATETIME + } set value "${value_d1}T${value_d2}" } default { @@ -267,12 +332,16 @@ namespace eval tomlish { } set sub_tablenames_info [dict create] switch -exact -- $type { - INT - FLOAT - BOOL - DATETIME { + INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + #DDDD #simple (non-container, no-substitution) datatype set result [list type $type value $value] } STRING - STRINGPART { - set result [list type $type value [::tomlish::utils::unescape_string $value]] + #JJJ + #!!! review + #set result [list type $type value [::tomlish::utils::unescape_string $value]] + set result [list type $type value $value] } LITERAL - LITERALPART { #REVIEW @@ -287,7 +356,7 @@ namespace eval tomlish { # set prev_tablenames_info $tablenames_info set tablenames_info [dict create] - set result [::tomlish::to_dict [ list [lindex $values 0] ]] + set result [::tomlish::dict::from_tomlish [ list [lindex $values 0] ]] set sub_tablenames_info $tablenames_info set tablenames_info $prev_tablenames_info } @@ -296,7 +365,7 @@ namespace eval tomlish { #pass in the whole [lindex $values 0] (type val) - not just the $value! set prev_tablenames_info $tablenames_info set tablenames_info [dict create] - set result [list type $type value [ ::tomlish::to_dict [ list [lindex $values 0] ] ]] + set result [list type $type value [ ::tomlish::dict::from_tomlish [ list [lindex $values 0] ] ]] set sub_tablenames_info $tablenames_info set tablenames_info $prev_tablenames_info } @@ -304,7 +373,7 @@ namespace eval tomlish { #review - mapping these to STRING might make some conversions harder? #if we keep the MULTI - we know we have to look for newlines for example when converting to json #without specific types we'd have to check every STRING - and lose info about how best to map chars within it - set result [list type $type value [ ::tomlish::to_dict [ list [lindex $values 0] ] ]] + set result [list type $type value [ ::tomlish::dict::from_tomlish [ list [lindex $values 0] ] ]] } default { error "tomlish Unexpected value type '$type' found in keyval '$keyval_element'" @@ -314,1556 +383,1325 @@ namespace eval tomlish { } + proc to_dict {tomlish} { + tomlish::dict::from_tomlish $tomlish + } - #to_dict is a *basic* programmatic datastructure for accessing the data. - # produce a dictionary of keys and values from a tomlish tagged list. - # to_dict is primarily for reading toml data. - #Extraneous (not within quoted sections) whitespace and comments are not preserved in this structure, - # so a roundtrip from toml to this datastructure and back to toml will lose whitespace formatting and comments. - # creating/changing toml values can be done directly on a tomlish list if preserving (or adding) formatting/comments is desired. - #A separate package 'tomlish::object' may be needed to allow easier programmatic creating/updating/deleting of data elements whilst preserving (or adding or selectively deleting/editing) such formatting. - # - - #within an ARRAY, we store a list of items such as plain dicts (possibly empty) and {type value } for simple types - #(ARRAYS can be mixed type) - #This means our dict structure should have only ARRAY and simple types which need to be in {type value } form - #A dict within an array encodeded as a type ITABLE value should also parse - but is the unpreferred form - REVIEW test? - #Namespacing? - #ie note the difference: - #[Data] - #temp = { cpu = 79.5, case = 72.0} - # versus - #[Data] - #temps = [{cpu = 79.5, case = 72.0}] - proc to_dict {tomlish} { - package require dictn - #keep track of which tablenames have already been directly defined, - # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' - #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. - #we don't error out just because a previous tablename segment has already appeared. + proc _from_dictval_tomltype {parents tablestack keys typeval} { + set type [dict get $typeval type] + set val [dict get $typeval value] + #These are the restricted sets of typed used in the tomlish::dict representation + #They are a subset of the types in tomlish: data types plus ARRAY, arranged in a dictionary form. + #The container types: ITABLE, TABLE, TABLEARRAY are not used as they are represented as dictionary keys and ARRAY items. + #The WS, COMMENT, and NEWLINE elements are also unrepresented in the dict structure. + switch -- $type { + ARRAY { + set subitems [list] + foreach item $val { + lappend subitems [_from_dictval [list {*}$parents ARRAY] $tablestack $keys $item] SEP + } + if {[lindex $subitems end] eq "SEP"} { + set subitems [lrange $subitems 0 end-1] + } + return [list ARRAY {*}$subitems] + } + ITABLE { + error "not applicable" + if {$val eq ""} { + return ITABLE + } else { + return [_from_dictval [list {*}$parents ITABLE] $tablestack $keys $val] + } + } + STRING { + #JSJS + #if our dict came from json - we have already decided what type of STRING/LITERAL etc to use when building the dict + + #do not validate like this - important that eg json val\\ue -> dict val\ue -> tomlish/toml val\\ue + #see toml-tests + #if {![tomlish::utils::rawstring_is_valid_tomlstring $val]} { + # #todo? + # return -code error -errorcode {TOML SYNTAX INVALIDSTRING} "Unescaped controls in string or invalid escapes" + #} + return [list STRING [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $val]] + } + MULTISTRING { + #value is a raw string that isn't encoded as tomlish + #create a valid toml snippet with the raw value and decode it to the proper tomlish MULTISTRING format + #We need to convert controls in $val to escape sequences - except for newlines + # + #consider an *option* to reformat for long lines? (perhaps overcomplex - byte equiv - but may fold in ugly places) + #we could use a line-length limit to decide when to put in a "line ending backslash" + #and even format it with a reasonable indent so that proper CONT and WS entries are made (?) REVIEW + # + #TODO + set tomlpart "x=\"\"\"\\\n" + append tomlpart [tomlish::utils::rawstring_to_MultiBstring_with_escaped_controls $val] + append tomlpart "\"\"\"" + set tomlish [tomlish::from_toml $tomlpart] + #e.g if val = " etc\nblah" + #TOMLISH {DOTTEDKEY {{KEY x}} = {MULTISTRING CONT {NEWLINE LF} {WS { }} {STRINGPART etc} {NEWLINE lf} {STRINGPART blah} } } + #lindex 1 3 is the MULTISTRING tomlish list + return [lindex $tomlish 1 3] + } + MULTILITERAL { + #MLL string can contain newlines - but still no control chars + #todo - validate - e.g val can't contain more than 2 squotes in a row + if {[string first ''' $val] >=0} { + set msg "_from_dictval_tomltype error: more than 2 single quotes in a row found in MULTILITERAL - cannot encode dict to TOML-VALID TOMLISH" + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + } - #Declaring, Creating, and Defining Tables - #https://github.com/toml-lang/toml/issues/795 - #(update - only Creating and Defining are relevant terminology) + #rawstring_is_valid_multiliteral - allow newlines as lf or crlf - but not bare cr + if {![tomlish::utils::rawstring_is_valid_multiliteral $val]} { + return -code error -errorcode {TOML SYNTAX INVALIDMULTILITERAL} "Controls other than tab or newlines found in multiliteral" + } - #review - #tablenames_info keys ttype created, tdefined, createdby, definedby, closedby ??? review keys - # [tname] = header_table [[tname]] = header_tablearray + set tomlpart "x='''\n" + append tomlpart $val ''' + set tomlish [tomlish::from_toml $tomlpart] + return [lindex $tomlish 1 3] + } + LITERAL { + #from v1.0 spec - "Control characters other than tab are not permitted in a literal string" + #(This rules out raw ANSI SGR - which is somewhat restrictive - but perhaps justified for a config format + # as copy-pasting ansi to a config value is probably not always wise, and it's not something that can be + # easily input via a text editor. ANSI can go in Basic strings using the \e escape if that's accepted v1.1?) + #we could choose to change the type to another format here when encountering invalid chars - but that seems + #like too much magic. We elect to error out and require the dict to have valid data for the types it specifies. + if {[string first ' $val] >=0} { + set msg "_from_dictval_tomltype error: single quote found in LITERAL - cannot encode dict to TOML-VALID TOMLISH" + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + } + #JJJJ + if {![tomlish::utils::rawstring_is_valid_literal $val]} { + #has controls other than tab + #todo - squote? + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} "Controls other than tab found in literal" + } + return [list LITERAL $val] + } + INT { + if {![::tomlish::utils::is_int $val]} { + error "_from_dictval_tomltype error: bad INT value '$val' - cannot encode dict to TOML-VALID TOMLISH" + } + return [list INT $val] + } + FLOAT { + if {![::tomlish::utils::is_float $val]} { + error "_from_dictval_tomltype error: bad FLOAT value '$val' - cannot encode dict to TOML-VALID TOMLISH" + } + return [list FLOAT $val] + } + default { + if {$type ni [::tomlish::tags]} { + error "_from_dictval_tomltype error: Unrecognised typename '$type' in {type value } - cannot encode dict to TOML-VALID TOMLISH" + } + return [list $type $val] + } + } + } - #consider the following 2 which are legal: - #[table] #'table' created, defined=open type header_table - #x.y = 3 - #[table.x.z] #'table' tdefined=closed closedby={header_table table.x.z}, 'table.x' created, 'table.x.z' created tdefined=open tdefinedby={header_table table.x.z} - #k= 22 - # #'table.x.z' tdefined=closed closedby={eof eof} + proc _from_dictval {parents tablestack keys vinfo} { + set k [lindex $keys end] + set K_PART [tomlish::dict::classify_rawkey $k] ;#get [list SQKEY ] + #puts stderr "---parents:'$parents' keys:'$keys' vinfo: $vinfo---" + #puts stderr "---tablestack: $tablestack---" + set result [list] + set lastparent [lindex $parents end] + if {$lastparent in [list "" do_inline]} { + if {[tomlish::dict::is_typeval $vinfo]} { + set type [dict get $vinfo type] + #treat ITABLE differently? + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] + lappend result DOTTEDKEY [list $K_PART {WS { }}] = {WS { }} $sublist {NEWLINE lf} + } else { + if {$vinfo ne ""} { - #equivalent datastructure + #set result [list DOTTEDKEY [list [list KEY $k]] = ] + #set records [list ITABLE] - #[table] #'table' created, tdefined=open definedby={header_table table} - #[table.x] #'table' tdefined=closed closedby={header_table table.x}, 'table.x' created tdefined=open definedby={header_table table.x} - #y = 3 - #[table.x.z] #'table.x' tdefined=closed closedby={header_table table.x.z}, 'table.x.z' created tdefined=open definedby={header_table table.x.z} - #k=22 + set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] - #illegal - #[table] #'table' created and tdefined=open - #x.y = 3 #'table.x' created first keyval pair tdefined=open definedby={keyval x.y = 3} - #[table.x.y.z] #'table' tdefined=closed, 'table.x' closed because parent 'table' closed?, 'table.x.y' cannot be created - #k = 22 - # - ## - we would fail on encountering table.x.y because only table and table.x are effectively tables - but that table.x is closed should be detected (?) + if {$lastparent eq "do_inline"} { + set result [list DOTTEDKEY [list $K_PART] =] + set records [list ITABLE] + } else { + set tname [tomlish::dict::join_and_quote_rawkey_list [list $k]] + set result [list TABLE $tname {NEWLINE lf}] + set tablestack [list {*}$tablestack [list T $k]] + set records [list] + } - #illegal - #[table] - #x.y = {p=3} - #[table.x.y.z] - #k = 22 - ## we should fail because y is an inline table which is closed to further entries - #note: it is not safe to compare normalized tablenames using join! - # e.g a.'b.c'.d is not the same as a.b.c.d - # instead compare {a b.c d} with {a b c d} - # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. - #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' - #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} + set lastidx [expr {[dict size $vinfo] -1}] + set dictidx 0 + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + #(SQKEY & DQKEY do not have the enclosing quotes in their returned val) + #if {[regexp {\s} $vk] || [string first . $vk] >= 0} { + # set VK_PART [list SQKEY $vk] + #} else { + # set VK_PART [list KEY $vk] + #} + if {[tomlish::dict::is_typeval $vv]} { + #type x value y + #REVIEW - we could detect if value is an array of objects, + #and depending on parent context - emit a series of TABLEARRAY records instead of a DOTTEDKEY record containing an ARRAY of objects + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] + set record [list DOTTEDKEY [list $VK_PART {WS { }}] = {WS { }} $sublist] + } else { + if {$vv eq ""} { + #experimental + if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + ::tomlish::log::notice "_from_dictval could uninline KEY $vk (tablestack:$tablestack)" + #set tname [tomlish::dict::name_from_tablestack [list {*}$tablestack [list T $vk]]] + #we can't just join normalized keys - need keys with appropriate quotes and escapes + #set tname [join [list {*}$keys $vk] .] ;#WRONG + set tq [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] - if {[uplevel 1 [list info exists tablenames_info]]} { - upvar tablenames_info tablenames_info - } else { - set tablenames_info [dict create] ;#keyed on tablepath each of which is an @@path such as {@@config @@subgroup @@etc} (corresponding to config.subgroup.etc) - #also has non @@ indexes which are list indexes as taken by tcl list commands (int or end-1 etc) - #value is a dict with keys: ttype, tdefined - } + ##wrong? results in TABLE within TABLE record?? todo pop? + #set record [list TABLE $tq {NEWLINE lf}] + #set tablestack [list {*}$tablestack [list T $vk]] - log::info "---> to_dict processing '$tomlish'<<<" - set items $tomlish + #REVIEW!!! - foreach lst $items { - if {[lindex $lst 0] ni $::tomlish::tags} { - error "supplied string does not appear to be toml parsed into a tomlish tagged list. Run tomlish::decode::toml on the raw toml data to produce a tomlish list" - } - } + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + set tablestack [list {*}$tablestack [list I $vk]] - if {[lindex $tomlish 0] eq "TOMLISH"} { - #ignore TOMLISH tag at beginning - set items [lrange $tomlish 1 end] - } - - set datastructure [dict create] - foreach item $items { - set tag [lindex $item 0] - #puts "...> item:'$item' tag:'$tag'" - switch -exact -- $tag { - KEY - DQKEY - SQKEY { - log::debug "---> to_dict item: processing $tag: $item" - set key [lindex $item 1] - if {$tag eq "DQKEY"} { - set key [::tomlish::utils::unescape_string $key] - } - #!todo - normalize key. (may be quoted/doublequoted) + } else { + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + set tablestack [list {*}$tablestack [list I $vk]] + } + } else { + if { 0 } { + #experiment.. sort of getting there. + if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + ::tomlish::log::notice "_from_dictval could uninline2 KEYS [list {*}$keys $vk] (tablestack:$tablestack)" + set tq [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] + set record [list TABLE $tq {NEWLINE lf}] + set tablestack [list {*}$tablestack [list T $vk]] - if {[dict exists $datastructure $key]} { - error "Duplicate key '$key'. The key already exists at this level in the toml data. The toml data is not valid." - } + #review - todo? + set dottedkey_value [_from_dictval [list {*}$parents TABLE] $tablestack [list {*}$keys $vk] $vv] + lappend record {*}$dottedkey_value - #lassign [_get_keyval_value $item] type val - #set keyval_dict [_get_keyval_value $item] - lassign [_get_keyval_value $item] _ keyval_dict _ sub_tablenames_info - dict set datastructure $key $keyval_dict - } - DOTTEDKEY { - log::debug "---> to_dict item processing $tag: $item" - set dkey_info [tomlish::to_dict::get_dottedkey_info $item] - set all_dotted_keys [dict get $dkey_info keys] - set dottedkeyname [join $all_dotted_keys .] - - if {[llength $all_dotted_keys] == 0} { - #empty?? probably invalid. review - #This is different to '' = 1 or ''.'' = 1 which have lengths 1 and 2 respectively - #presumably only reachanble via manual manipulation of tomlish elements - set msg "DOTTED key has no parts - invalid? '$item'" - return -code error -errorcode {TOMLISH SYNTAX DOTTEDKEYINVALID} $msg - } elseif {[llength $all_dotted_keys] > 1} { - #dottedtable.k=1 - #tX.dottedtable.k=1 - #etc - - set defines_a_table 1 - #Wrap in a list so we can detect 'null' equivalent. - #We can't use empty string as that's a valid dotted key segment - set dottedtable_bag [list [lindex $all_dotted_keys end-1]] - set dotparents [lrange $all_dotted_keys 0 end-2] - } else { - #basic case - not really a 'dotted' key - #a = 1 - set defines_a_table 0 - set dottedtable_bag [list] ;#empty bag - set dotparents [list] - } - - #assert dottedtable_bag only ever holds 0 or 1 elements - set leaf_key [lindex $all_dotted_keys end] - set supertable [list] ;#disconnect from main structure - each array element is a new context for key paths! - foreach normkey $dotparents { - lappend supertable $normkey - if {![dict exists $tablenames_info [tomlish::to_dict::@@path $supertable] ttype]} { - #supertable not yet 'created' - if {[dict exists $datastructure {*}$supertable]} { - #There is data so it must have been created as a keyval - set msg "Supertable [join $supertable .] of dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here - dict set tablenames_info [tomlish::to_dict::@@path $supertable] ttype dottedkey_table ;#REVIEW - #ensure empty tables are still represented in the datastructure - dict set datastructure {*}$supertable [list] - } - } - if {[llength $dottedtable_bag] == 1} { - set dottedtable [lindex $dottedtable_bag 0] - set dottedpath [list {*}$dotparents $dottedtable] ;#full path to dottedtable (in this context) - #our dotted key is attempting to define a table (2nd last element) - if {![dict exists $tablenames_info [tomlish::to_dict::@@path $dottedpath] ttype]} { - #first one - but check datastructure for collisions - if {[dict exists $datastructure {*}$dottedpath]} { - set msg "Supertable [join $dottedpath .] of dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #'create' the table - dict set tablenames_info [tomlish::to_dict::@@path $dottedpath] ttype dottedkey_table - #don't actually set 'defined' here.. use the end of TABLE record to close them off by looking at this list - lappend dottedtables_defined $dottedpath - # - } else { - #exists - but might be from another dottedkey within the current header section - #the table is open for adding keys until the next 'header' section ([tablename] / [[tablearray]]) - #check for 'defined' closed (or just existence) - if {[dict get $tablenames_info [tomlish::to_dict::@@path $dottedpath] ttype] eq "dottedkey_table"} { - #right ttype - but make sure it's from this header section - i.e defined not set - set definedstate [dictn getdef $tablenames_info [list [tomlish::to_dict::@@path $dottedpath] tdefined] NULL] - if {$definedstate ne "NULL"} { - #collision with some other dottedkey - set msg "Table $dottedpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } else { + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + } + } else { + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] } } } - } - if {[dict exists $datastructure {*}$all_dotted_keys]} { - set msg "Duplicate key '$all_dotted_keys'. The key already exists at this level in the toml data. The toml data is not valid." - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - - #set keyval_dict [_get_keyval_value $item] - lassign [_get_keyval_value $item] _ keyval_dict _ sub_tablenames_info - #keyval_dict is either a {type value } - #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level - #punk::dict::is_tomlish_typeval can distinguish - puts stdout "to_dict>>> root/dottedkey $dottedkeyname kv: $keyval_dict" - dict set datastructure {*}$all_dotted_keys $keyval_dict - #assert - dottedkey represents a key val pair that can be added - - - - - # if {[llength $all_dotted] == 0} { - # } elseif {[llength $all_dotted] == 1} { - # #dottedkey is only a key - no table component - # set table_hierarchy [list] - # set tleaf [lindex $all_dotted 0] - # } else { - # set table_hierarchy [lrange $all_dotted 0 end-1] - # set tleaf [lindex $all_dotted end] - # } - - # #ensure empty tables are still represented in the datastructure - # #review - this seems unnecessary? - # set pathkeys [list] - # foreach k $table_hierarchy { - # lappend pathkeys $k - # if {![dict exists $datastructure {*}$pathkeys]} { - # dict set datastructure {*}$pathkeys [list] - # } else { - # tomlish::log::notice "to_dict datastructure at key path $pathkeys already had data: [dict get $datastructure {*}$pathkeys]" - # } - # } - # #review? - # if {[dict exists $datastructure {*}$table_hierarchy $tleaf]} { - # error "Duplicate key '$table_hierarchy $tleaf'. The key already exists at this level in the toml data. The toml data is not valid." - # } - - # #JMN test 2025 - # if {[llength $table_hierarchy]} { - # dictn incr tablenames_info [list $table_hierarchy seencount] - # } - - # #review - item is an ITABLE - we recurse here without datastructure context - # set keyval_dict [_get_keyval_value $item] - # if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - # set t [list {*}$table_hierarchy $tleaf] - # dictn incr tablenames_info [list $t seencount] - # dictn set tablenames_info [list $t closed] 1 - - # #overwriting keys? todo ? - # dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict - # } else { - # dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict - # } - - } - TABLEARRAY { - set dottedtables_defined [list] ;#for closing off at end by setting 'defined' - - set NEST_DICT [dict create] ;#first blush attempt at referencing supertable tablearrays - set tablearrayname [lindex $item 1] - log::debug "---> to_dict processing item TABLENAME (name: $tablearrayname): $item" - set norm_segments [::tomlish::to_dict::tablename_split $tablearrayname true] ;#true to normalize - #we expect repeated tablearray entries - each adding a sub-object to the value, which is an array/list. - #tablearrayname is likely to appear multiple times - so unlike a TABLE we don't check for 'defined' for the full name as an indicator of a problem - set supertable [list] - ############## - # [[a.b.c.d]] - # norm_segments = {a b c d} - #check a {a b} {a b c} <---- supertables of a.b.c.d - ############## - foreach normseg [lrange $norm_segments 0 end-1] { - lappend supertable $normseg - if {![dictn exists $tablenames_info [list [tomlish::to_dict::@@path $supertable] ttype]]} { - #supertable with this path doesn't yet exist - if {[dict exists $datastructure {*}$supertable]} { - #There is data though - so it must have been created as a keyval - set msg "Supertable [join $supertable .] of tablearray name $tablearrayname already has data but doesn't appear to be a table - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - #test: datastructure_tablearray_supertable_keycollision - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + if {$dictidx != $lastidx} { + #lappend record SEP + if {$lastparent eq "do_inline"} { + lappend record SEP } else { - #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here - #REVIEW!! - # what happens with from_toml {[[a.b.c]]} {[[a]]} ??? - dictn set tablenames_info [list [tomlish::to_dict::@@path $supertable] ttype] header_table ;#how do we know it's not going to be a tablearray? - #ensure empty tables are still represented in the datastructure - dict set datastructure {*}$supertable [list] - } - } else { - #supertable has already been created - and maybe defined - but even if defined we can add subtables unless it is of type itable - #but if it's a tablearray - we need to point to the most 'recently defined table element of the array' - #(another way of saying last member of that array)?? - set supertype [dictn get $tablenames_info [list [tomlish::to_dict::@@path $supertable] ttype]] - if {$supertype eq "header_tablearray"} { - puts stdout "todict!!! TABLEARRAY nesting required for supertable [join $supertable .]" - puts stdout "todict!!! todo.." - #how to do multilevel nesting?? - set EXISTING_SUPERARRAY_ELEMENTS [dict get $datastructure {*}$supertable value] - dict set NEST_DICT $supertable $EXISTING_SUPERARRAY_ELEMENTS - puts stdout "todict!!! supertable '[join $supertable .]' elements $EXISTING_SUPERARRAY_ELEMENTS" + lappend record {NEWLINE lf} } } - } - # - if {![dictn exists $tablenames_info [list [tomlish::to_dict::@@path $norm_segments] ttype]]} { - #first encounter of this tablearrayname - if {[dict exists $datastructure {*}$norm_segments]} { - #e.g from_toml {a=1} {[[a]]} - set msg "Cannot create tablearray name $tablearrayname. Key already has data but key doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - #test: datastructure_tablearray_direct_keycollision_error - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + if {[llength $record]} { + lappend records $record } - #no collision - we can create the tablearray and the array in the datastructure - dictn set tablenames_info [list [tomlish::to_dict::@@path $norm_segments] ttype] header_tablearray - dict set datastructure {*}$norm_segments [list type ARRAY value {}] - set ARRAY_ELEMENTS [list] + incr dictidx + } + if {$lastparent eq "do_inline"} { + lappend result $records {NEWLINE lf} } else { - #we have an existing tablenames_info record for this path - but is it a tablearray? - set ttype [dictn get $tablenames_info [list [::tomlish::to_dict::@@path $norm_segments] ttype]] - #we use a header_unknown type for previous 'created' only tables - - if {$ttype eq "header_unknown"} { - dictn set tablenames_info [list [tomlish::to_dict::@@path $norm_segments] ttype] header_tablearray - set ttype header_tablearray - #assert - must not be 'defined' - #we have seen it before as a supertable ie 'created' only - #Not 'defined' but could still have subtables - treat it as a dict - set ARRAY_ELEMENTS [dict get $datastructure {*}$norm_segments] - } else { - if {$ttype ne "header_tablearray"} { - #header_table or itable - switch -- $ttype { - itable {set ttypename itable} - header_table {set ttypename table} - default {error "unrecognised type $ttype - expected header_table or itable"} - } - set msg "tablearray name $tablearrayname already appears to be already created as '$ttypename' not tablearray - invalid?" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #EXISTING tablearray - #add to array - #error "add_to_array not implemented" - #{type ARRAY value } - set ARRAY_ELEMENTS [dict get $datastructure {*}$norm_segments value] - } + lappend result {*}$records {NEWLINE lf} } - - - set object [dict create] ;#array context equivalent of 'datastructure' - set objectnames_info [dict create] ;#array contex equivalent of tablenames_info - - #add to ARRAY_ELEMENTS and write back in to datastructure. - foreach element [lrange $item 2 end] { - set type [lindex $element 0] - log::debug "----> todict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - DOTTEDKEY { - #MAINTENANCE: temp copy from TABLE - #convert to function: data needed? element, norm_segments refs to: tablename_info datastructure ??? - set dkey_info [tomlish::to_dict::get_dottedkey_info $element] - #e.g1 keys {x.y y} keys_raw {'x.y' "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) - #e.g2 keys {x.y y} keys_raw {{"x.y"} "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) - - #[a.b] - #t1.t2.dottedtable.k = "val" - #we have already checked supertables a {a b} - #We need to check {a b t1} & {a b t2} ('creation' only) - #and then 'dottedtable' is 'defined' while k is an ordinary key in dottedtable - - #note we also get here as a 'dottedkey' with a simple - #[a.b] - #k = "val" - - set all_dotted_keys [dict get $dkey_info keys] - set dottedkeyname [join $all_dotted_keys .] - #obsolete - set nonleaf_keys [lrange $all_dotted_keys 0 end-1] ;#may be empty - - if {[llength $all_dotted_keys] > 1} { - #dottedtable.k=1 - #tX.dottedtable.k=1 - #etc - - set defines_a_table 1 - #Wrap in a list so we can detect 'null' equivalent. - #We can't use empty string as that's a valid dotted key segment - set dottedtable_bag [list [lindex $all_dotted_keys end-1]] - set dotparents [lrange $all_dotted_keys 0 end-2] + } else { + if {$lastparent eq "do_inline"} { + lappend result DOTTEDKEY [list $K_PART] = ITABLE {NEWLINE lf} + } else { + set tname [tomlish::dict::join_and_quote_rawkey_list [list $k]] + #REVIEW + lappend result TABLE $tname {NEWLINE lf} + } + } + } + } else { + #lastparent is not toplevel "" or "do_inline" + if {[tomlish::dict::is_typeval $vinfo]} { + #type x value y + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] + lappend result {*}$sublist + } else { + if {$lastparent eq "TABLE"} { + #review + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + set dottedkey_value [_from_dictval [list {*}$parents DOTTEDKEY] $tablestack [list {*}$keys $vk] $vv] + lappend result [list DOTTEDKEY [list $VK_PART] = $dottedkey_value {NEWLINE lf}] + } + } else { + if {$vinfo ne ""} { + set lastidx [expr {[dict size $vinfo] -1}] + set dictidx 0 + set sub [list] + #REVIEW + #set result $lastparent ;#e.g sets ITABLE + set result ITABLE + set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + if {[tomlish::dict::is_typeval $vv]} { + #type x value y + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] + set record [list DOTTEDKEY [list $VK_PART] = $sublist] + } else { + if {$vv eq ""} { + #can't just uninline at this level + #we need a better method to query main dict for uninlinability at each level + # (including what's been inlined already) + #if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + # puts stderr "_from_dictval uninline2 KEY $keys" + # set tname [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] + # set record [list TABLE $tname {NEWLINE lf}] + # set tablestack [list {*}$tablestack [list T $vk]] + #} else { + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + #} } else { - #basic case - not really a 'dotted' key - #a = 1 - set defines_a_table 0 - set dottedtable_bag [list] ;#empty bag - set dotparents [list] - } - #assert dottedtable_bag only ever holds 0 or 1 elements - set leaf_key [lindex $all_dotted_keys end] - - #we've already tested the table keys that got us here.. but not the dottedkey segments (if any) prior to dottedtable & leaf_key - #set supertable $norm_segments - set supertable [list] ;#disconnect from main structure - each array element is a new context for key paths! - foreach normkey $dotparents { - lappend supertable $normkey - if {![dictn exists $tablenames_info [list [tomlish::to_dict::@@path $supertable] ttype]]} { - #supertable with this combined path (table norm_segments plus parts of dottedkey) not yet 'created' - if {[dict exists $datastructure {*}$supertable]} { - #There is data so it must have been created as a keyval - set msg "Supertable [join $supertable .] of dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here - dict set tablenames_info [tomlish::to_dict::@@path $supertable] ttype unknown_table ;#REVIEW - #ensure empty tables are still represented in the datastructure - dict set datastructure {*}$supertable [list] - } - } - if {[llength $dottedtable_bag] == 1} { - set dottedtable [lindex $dottedtable_bag 0] - set dottedpath [list {*}$norm_segments {*}$dotparents $dottedtable] ;#full path to dottedtable - #our dotted key is attempting to define a table - if {![dict exists $tablenames_info [tomlish::to_dict::@@path $dottedpath] ttype]} { - #first one - but check datastructure for collisions - if {[dict exists $datastructure {*}$dottedpath]} { - set msg "Supertable [join $dottedpath .] of dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #'create' the table - dict set tablenames_info [tomlish::to_dict::@@path $dottedpath] ttype dottedkey_table - #don't actually set 'tdefined' here.. use the end of TABLE record to close them off by looking at this list - lappend dottedtables_defined $dottedpath - # - } else { - #exists - but might be from another dottedkey within the current header section - #the table is open for adding keys until the next 'header' section ([tablename] / [[tablearray]]) - #check for 'defined' closed (or just existence) - if {[dict get $tablenames_info [tomlish::to_dict::@@path $dottedpath] type] eq "dottedkey_table"} { - #right type - but make sure it's from this header section - i.e defined not set - set definedstate [dictn getdef $tablenames_info [list [tomlish::to_dict::@@path $dottedpath] tdefined] NULL] - if {$definedstate ne "NULL"} { - #collision with some other dottedkey - set msg "Table $dottedpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - } - } - } - #assert - dottedkey represents a key val pair that can be added - - - if {[dict exists $datastructure {*}$norm_segments {*}$all_dotted_keys]} { - set msg "Duplicate key '$norm_segments $all_dotted_keys'. The key already exists at this level in the toml data. The toml data is not valid." - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - - #set keyval_dict [_get_keyval_value $element] - lassign [_get_keyval_value $element] _ keyval_dict _ sub_tablenames_info - - #keyval_dict is either a {type value } - #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level - #punk::dict::is_tomlish_typeval can distinguish - puts stdout "to_dict>>> TABLE/dottedkey '$dottedkeyname' kv: $keyval_dict" - #dict set datastructure {*}$norm_segments {*}$all_dotted_keys $keyval_dict - #wrong - #TODO!!!!!!!!!!!!! - #lappend ARRAY_ELEMENTS [list $dottedkeyname $keyval_dict] - dict set object $dottedkeyname $keyval_dict - - #remove ? - if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys - # inner structure will contain {type value } if all leaves are not empty ITABLES - set tkey [list {*}$norm_segments {*}$all_dotted_keys] - #lappend tablenames_info [list {*}$norm_segments {*}$dkeys $leaf_key] - - #by not creating a tablenames_info record - we effectively make it closed anyway? - #it should be detected as a key - #is there any need to store tablenames_info for it?? - #REVIEW - - ##TODO - update? - #dictn incr tablenames_info [list $tkey seencount] - ##if the keyval_dict is not a simple type x value y - then it's an inline table ? - ##if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. - #dictn set tablenames_info [list $tkey closed] 1 + #set sub [_from_dictval ITABLE $vk $vv] + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + #set record [list DOTTEDKEY [list $VK_PART] = ITABLE $dottedkey_value] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] } - - } - NEWLINE - COMMENT - WS { - #ignore - } - TABLE { - #we should be able to process tablearray subtables either as part of the tablearray record, or independently. - #(or even a mixture of both, although that is somewhat an edge case) - #[[fruit]] - #x=1 - # [fruit.metadata] - # [fruit.otherdata] - - #when processing a dict destined for the above - the tomlish generator (e.g from_dict) - #should create as 1 or 3 records (but could create 2 records if there was an unrelated table in between the subtables) - #choices: all in tablearray record, tablearray + 1 or 2 table records. - # - #We are going the other way here - so we just need to realize the list of tables 'belonging' to this tablearray might not be complete. - # - #the subtable names must be prefixed with the tablearray - we should validate that for any contained TABLE records - } - default { - error "Sub element of type '$type' not understood in tablearray context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" + if {$dictidx != $lastidx} { + lappend record SEP } - } - } - - #todo? - ##end of TABLEARRAY record - equivalent of EOF or next header - close off the dottedtables - #foreach dtablepath $dottedtables_defined { - # dictn set tablename_info [list $dtablepath tdefined] closed - #} - - if {[dict size $NEST_DICT]} { - puts "reintegrate?? $NEST_DICT" - #todo - more - what if multiple in hierarchy? - dict for {superpath existing_elements} $NEST_DICT { - #objects stored directly as dicts in ARRAY value - set lastd [lindex $existing_elements end] - #insufficient.. - #dict set lastd [lindex $norm_segments end] [list type ITABLE value $object] - dict set lastd [lindex $norm_segments end] $object - #set lastd [dict merge $lastd $object] - lset existing_elements end $lastd - dict set datastructure {*}$superpath [list type ARRAY value $existing_elements] + lappend result $record + incr dictidx } } else { - #lappend ARRAY_ELEMENTS [list type ITABLE value $object] - lappend ARRAY_ELEMENTS $object - dict set datastructure {*}$norm_segments [list type ARRAY value $ARRAY_ELEMENTS] + #e.g x=[{}] + log::debug "---> _from_dictval empty ITABLE x-1" + #lappend result DOTTEDKEY [list $K_PART] = ITABLE ;#wrong + lappend result ITABLE } } - TABLE { - set tablename [lindex $item 1] - set dottedtables_defined [list] ;#for closing off at end by setting 'defined' - #As our TABLE record contains all it's child DOTTEDKEY records - this should be equivalent to setting them as defined at EOF or next header. + } + } + return $result + } - #----------------------------------------------------------------------------------- - #default assumption - our reference is to the main tablenames_info and datastructure - #Will need to append keys appropriately if we have recursed - #----------------------------------------------------------------------------------- - log::debug "---> to_dict processing item TABLE (name: $tablename): $item" - set norm_segments [::tomlish::to_dict::tablename_split $tablename true] ;#true to normalize + proc from_dict {d} { + #consider: + # t1={a=1,b=2} + # x = 1 - set T_DEFINED [dictn getdef $tablenames_info [list [tomlish::to_dict::@@path $norm_segments] tdefined] NULL] - if {$T_DEFINED ne "NULL" } { - #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path - set msg "Table name $tablename has already been directly defined in the toml data. Invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg - } + # from_dict gives us: t1 {a {type INT value 1} b {type INT value 2}} x {type INT value 1} + #If we represent t1 as an expanded table we get + # [t1] + # a=1 + # b=2 + # x=1 + # --- which is incorrect - as x was a toplevel key like t1! + #This issue doesn't occur if x is itself an inline table + # t1={a=1,b=2} + # x= {no="problem"} + # + # (or if we were to reorder x to come before t1) - set name_segments [::tomlish::to_dict::tablename_split $tablename 0] ;#unnormalized e.g ['a'."b".c.d] -> 'a' "b" c d - #results of tablename_split 0 are 'raw' - ie some segments may be enclosed in single or double quotes. + #ie the order of the dict elements influences how the toml can be represented. + #As the dictionary form doesn't distinguish the structure used to create tables {[table1]\nk=v} vs inline {table1={k=v}} + #Without a solution, from_dict would have to always produce the inline form for toplevel tables unless we allowed re-ordering, + #which is unpreferred here. - set supertable [list] - ############## - # [a.b.c.d] - # norm_segments = {a b c d} - #check a {a b} {a b c} <---- supertables of a.b.c.d - ############## + #A possible solution: + #scan the top level to see if all (trailing) elements are themselves dicts + # (ie not of form {type XXX value yyy}) + # + # A further point is that if all root level values are at the 'top' - we can treat lower table-like structures as {[table]} elements + #ie we don't need to force do_inline if all the 'simple' keys are before any compound keys - ############## - #[[a]] - #[a.b] #supertable a is tablearray - ############## + #set root_has_values 0 + #approach 1) - the naive approach - forces inline when not always necessary + #dict for {k v} $d { + # if {[llength $v] == 4 && [lindex $v 0] eq "type"} { + # set root_has_values 1 + # break + # } + #} - #also consider - ############## - # [[a.b]] - # [a.b.c.d] #supertable a is a table, supertable a.b is tablearray, supertable a.b.c is elementtable - ############## - set refpath [list] ;#e.g @@j1 @@j2 1 @@k1 end - foreach normseg [lrange $norm_segments 0 end-1] { - lappend supertable $normseg - lappend refpath @@$normseg - if {![dict exists $tablenames_info $refpath ttype]} { - #supertable with this path doesn't yet exist - if {[tomlish::tdictn::exists $datastructure $refpath]} { - #There is data though - so it must have been created as a keyval - set msg "Supertable [join $supertable .] of table name $tablename (path $refpath) already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here - #we also don't know whether it's a table or a dottedkey_table (not allowed to be tablearray - out of order?) - dict set tablenames_info $refpath ttype unknown_table - #ensure empty tables are still represented in the datastructure - dict set datastructure {*}$supertable [list] - } else { - #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable - if {[dict get $tablenames_info $refpath ttype] eq "header_tablearray"} { - #'refer' to the appropriate element in existing array - #lappend refpath {*}[tomlish::to_dict::@@path $supertable] end - set arrdata [tomlish::tdictn::get $datastructure [list {*}$refpath @@value]] - set idx [expr {[llength $arrdata]-1}] - if {$idx < 0} { - #existing tablearray should have at least one entry even if empty (review) - set msg "reference to empty tablearray?" - return -code error -errorcode {TOMLISH STRUCTURE REFTOEMPTYTABLEARRAY} $msg - } - lappend refpath $idx - } else { - #?? - if {[dictn getdef $tablenames_info [list $refpath tdefined] NULL] eq "NULL"} { - } else { - } - } - } - } - #puts "TABLE supertable refpath $refpath" - lappend refpath @@[lindex $norm_segments end] - puts "TABLE refpath $refpath" - set table_refpath $refpath - #table [a.b.c.d] hasn't been defined - but may have been 'created' already by a longer tablename - # - or may have existing data from a keyval - if {![dict exists $tablenames_info $table_refpath ttype]} { - if {[tomlish::tdictn::exists $datastructure $table_refpath]} { - #e.g from_toml {a=1} {[a]} - set msg "Cannot create table name $tablename. Key already has data but key doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - #test: datastructure_tablename_keyval_collision_error - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #no data or previously created table - dict set tablenames_info $table_refpath ttype header_table + #approach 2) - track the position of last {type x value y} in the dictionary built by dict::from_tomlish + # - still not perfect. Inlines dotted tables unnecessarily + #This means from_dict doesn't produce output optimal for human editing. + set last_simple [tomlish::dict::last_tomltype_posn $d] - #We are 'defining' this table's keys and values here (even if empty) - #todo tomlish::tdictn::set - dict set datastructure {*}$norm_segments [list] ;#ensure table still represented in datastructure even if we add no keyvals here - } - dict set tablenames_info $table_refpath tdefined open - #now add the contained elements - foreach element [lrange $item 2 end] { - set type [lindex $element 0] - log::debug "----> todict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - DOTTEDKEY { - #convert to function: data needed? element, norm_segments refs to: tablename_info datastructure ??? - set dkey_info [tomlish::to_dict::get_dottedkey_info $element] - #e.g1 keys {x.y y} keys_raw {'x.y' "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) - #e.g2 keys {x.y y} keys_raw {{"x.y"} "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) - - #[a.b] - #t1.t2.dottedtable.k = "val" - #we have already checked supertables a {a b} - #We need to check {a b t1} & {a b t2} ('creation' only) - #and then 'dottedtable' is 'defined' while k is an ordinary key in dottedtable - - #note we also get here as a 'dottedkey' with a simple - #[a.b] - #k = "val" - - set all_dotted_keys [dict get $dkey_info keys] - set dottedkeyname [join $all_dotted_keys .] - #obsolete - set nonleaf_keys [lrange $all_dotted_keys 0 end-1] ;#may be empty - - if {[llength $all_dotted_keys] > 1} { - #dottedtable.k=1 - #tX.dottedtable.k=1 - #etc - - set defines_a_table 1 - #Wrap in a list so we can detect 'null' equivalent. - #We can't use empty string as that's a valid dotted key segment - set dottedtable_bag [list [lindex $all_dotted_keys end-1]] - set dotparents [lrange $all_dotted_keys 0 end-2] - } else { - #basic case - not really a 'dotted' key - #a = 1 - set defines_a_table 0 - set dottedtable_bag [list] ;#empty bag - set dotparents [list] - } - #assert dottedtable_bag only ever holds 0 or 1 elements - set leaf_key [lindex $all_dotted_keys end] - - #we've already tested the table keys that got us here.. but not the dottedkey segments (if any) prior to dottedtable & leaf_key - set supertable $norm_segments - set dottedsuper_refpath $table_refpath - foreach normkey $dotparents { - lappend supertable $normkey - lappend dottedsuper_refpath @@$normkey - if {![dict exists $tablenames_info $dottedsuper_refpath ttype]} { - #supertable with this combined path (table norm_segments plus parts of dottedkey) not yet 'created' - if {[tomlish::tdictn::exists $datastructure $dottedsuper_refpath]} { - #There is data so it must have been created as a keyval - set msg "Supertable [join $supertable .] of dotted key $dottedkeyname (path $dottedsuper_refpath) already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here - dict set tablenames_info $dottedsuper_refpath ttype unknown_table ;#REVIEW - #ensure empty tables are still represented in the datastructure - #todo - dict set datastructure {*}$supertable [list] - } - } - if {[llength $dottedtable_bag] == 1} { - set dottedtable [lindex $dottedtable_bag 0] - set dottedpath [list {*}$norm_segments {*}$dotparents $dottedtable] ;#full path to dottedtable - set dottedkey_refpath [list {*}$dottedsuper_refpath @@$dottedtable] - #our dotted key is attempting to define a table - if {![dict exists $tablenames_info $dottedkey_refpath ttype]} { - #first one - but check datastructure for collisions - if {[tomlish::tdictn::exists $datastructure $dottedkey_refpath]} { - set msg "Supertable [join $dottedpath .] of dotted key $dottedkeyname (path $dottedkey_refpath) already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #'create' the table - dict set tablenames_info $dottedkey_refpath ttype dottedkey_table - #don't actually set 'defined' here.. use the end of TABLE record to close them off by looking at this list - lappend dottedtables_defined $dottedkey_refpath - # - } else { - #exists - but might be from another dottedkey within the current header section - #the table is open for adding keys until the next 'header' section ([tablename] / [[tablearray]]) - #check for 'defined' closed (or just existence) - if {[dict get $tablenames_info $dottedkey_refpath ttype] eq "dottedkey_table"} { - #right type - but make sure it's from this header section - i.e defined not set - set definedstate [dictn getdef $tablenames_info [list $dottedkey_refpath tdefined] NULL] - if {$definedstate ne "NULL"} { - #collision with some other dottedkey - set msg "Table $dottedpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - } - } - } else { - set dottedkey_refpath $dottedsuper_refpath - } - #assert - dottedkey represents a key val pair that can be added + ## set parent "do_inline" ;#a value used in _from_dictval to distinguish from "" or other context based parent values + #Any keys that are themselves tables - will need to be represented inline + #to avoid reordering, or incorrect assignment of plain values to the wrong table. + ## set parent "" + #all toplevel keys in the dict structure can represent subtables. + #we are free to use {[tablename]\n} syntax for toplevel elements. - #if {[dict exists $datastructure {*}$norm_segments {*}$all_dotted_keys]} {} - set fullkey_refpath [list {*}$dottedkey_refpath @@[lindex $all_dotted_keys end]] - if {[tomlish::tdictn::exists $datastructure $fullkey_refpath]} { - set msg "Duplicate key '$norm_segments $all_dotted_keys'. The key (path $fullkey_refpath) already exists at this level in the toml data. The toml data is not valid." - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #set keyval_dict [_get_keyval_value $element] - lassign [_get_keyval_value $element] _ keyval_dict _ sub_tablenames_info + set tomlish [list TOMLISH] + set dictposn 0 + set tablestack [list [list T root]] ;#todo + dict for {t tinfo} $d { + if {$last_simple > $dictposn} { + set parents [list do_inline] + } else { + set parents [list ""] + } + set keys [list $t] + #review - where to make decision on + # DOTTEDKEY containing array of objs + #vs + # list of TABLEARRAY records + #At least for the top + set trecord [_from_dictval $parents $tablestack $keys $tinfo] + lappend tomlish $trecord + incr dictposn + } + return $tomlish + } + proc typedjson_to_toml {json} { + #*** !doctools + #[call [fun typedjson_to_toml] [arg json]] + #[para] - #keyval_dict is either a {type value } - #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level - #punk::dict::is_tomlish_typeval can distinguish - puts stdout "to_dict>>> TABLE/dottedkey $dottedkeyname kv: $keyval_dict" - dict set datastructure {*}$norm_segments {*}$all_dotted_keys $keyval_dict + set tomlish [::tomlish::from_dict_from_typedjson $json] + lappend tomlish [list NEWLINE lf] + set toml [::tomlish::to_toml $tomlish] + } - #remove ? - if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys - # inner structure will contain {type value } if all leaves are not empty ITABLES - set tkey [list {*}$norm_segments {*}$all_dotted_keys] + set json1 {{ "a": {"type": "integer", "value": "42"}}} + set json2 {{ + "a": {"type": "integer", "value": "42"}, + "b": {"type": "string", "value": "test"} + }} + set json3 { +{ + "best-day-ever": {"type": "datetime", "value": "1987-07-05T17:45:00Z"}, + "numtheory": { + "boring": {"type": "bool", "value": "false"}, + "perfection": [ + {"type": "integer", "value": "6"}, + {"type": "integer", "value": "28"}, + {"type": "integer", "value": "496"} + ] + } +} + } - #by not creating a tablenames_info record - we effectively make it closed anyway? - #it should be detected as a key - #is there any need to store tablenames_info for it?? - #REVIEW + set json4 { +{ + "best-day-ever": {"type": "datetime", "value": "1987-07-05T17:45:00Z"}, + "numtheory": { + "boring": {"type": "bool", "value": "false"}, + "perfection": [ + {"type": "integer", "value": "6"}, + {"type": "integer", "value": "28"}, + {"type": "integer", "value": "496"} + ] + }, + "emptyobj": {}, + "emptyarray": [] +} + } - ##TODO - update? - #dictn incr tablenames_info [list $tkey seencount] - ##if the keyval_dict is not a simple type x value y - then it's an inline table ? - ##if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. - #dictn set tablenames_info [list $tkey closed] 1 - } + set json5 { +{ + "a": { + " x ": {}, + "b.c": {}, + "d.e": {}, + "b": { + "c": {} + } + } +} + } - } - NEWLINE - COMMENT - WS { - #ignore - } - default { - error "Sub element of type '$type' not understood in table context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" - } - } - } + #surrogate pair face emoji + set json6 { +{ + "surrogatepair": {"type": "string", "value": "\uD83D\uDE10"} +} + } - #end of TABLE record - equivalent of EOF or next header - close off the dottedtables - foreach dtablepath $dottedtables_defined { - dict set tablename_info $dtablepath tdefined closed - } + set json7 { +{ + "escapes": {"type": "string", "value": "val\\ue"} +} + } - } - ITABLE { - #SEP??? - set datastructure [list] - foreach element [lrange $item 1 end] { - set type [lindex $element 0] - log::debug "----> tododict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - DOTTEDKEY { - set dkey_info [tomlish::to_dict::get_dottedkey_info $element] - set dotted_key_hierarchy [dict get $dkey_info keys] - set leaf_key [lindex $dotted_key_hierarchy end] - set dkeys [lrange $dotted_key_hierarchy 0 end-1] - - #ensure empty keys are still represented in the datastructure - set table_keys [list] ;#We don't know the context - next level up will have to check for key collisions? - set test_keys $table_keys - foreach k $dkeys { - lappend test_keys $k - if {![dict exists $datastructure {*}$test_keys]} { - dict set datastructure {*}$test_keys [list] - } else { - tomlish::log::notice "to_dict datastructure at (DOTTEDKEY) subkey $test_keys already had data: [dict get $datastructure {*}$test_keys]" - } - } - if {[dict exists $datastructure {*}$table_keys {*}$dkeys $leaf_key]} { - error "Duplicate key '$table_keys $dkeys $leaf_key'. The key already exists at this level in the toml data. The toml data is not valid." - } - #set keyval_dict [_get_keyval_value $element] - lassign [_get_keyval_value $element] _ keyval_dict _ sub_tablenames_info - dict set datastructure {*}$table_keys {*}$dkeys $leaf_key $keyval_dict - } - NEWLINE - COMMENT - WS { - #ignore - } - default { - error "Sub element of type '$type' not understood in ITABLE context. Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" - } - } - } - } - ARRAY { - #arrays in toml are allowed to contain mixtures of types - set datastructure [list] - log::debug "--> processing array: $item" + proc from_dict_from_typedjson {json} { + set d [tomlish::dict::from_typedjson $json] + tomlish::from_dict $d ;#return tomlish + } - foreach element [lrange $item 1 end] { - set type [lindex $element 0] - log::debug "----> tododict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - INT - FLOAT - BOOL - DATETIME { - set value [lindex $element 1] - lappend datastructure [list type $type value $value] - } - STRING { - set value [lindex $element 1] - lappend datastructure [list type $type value [::tomlish::utils::unescape_string $value]] - } - LITERAL { - set value [lindex $element 1] - lappend datastructure [list type $type value $value] - } - ITABLE { - #anonymous table - #lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] - lappend datastructure [::tomlish::to_dict [list $element]] ;#store itables within arrays as raw dicts (possibly empty) - } - TABLE { - #invalid? shouldn't be output from from_dict - but could manually be constructed as such? review - #doesn't make sense as table needs a name? - #take as synonym for ITABLE? - error "to_dict TABLE within array unexpected" - } - ARRAY - MULTISTRING - MULTILITERAL { - #set value [lindex $element 1] - lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] - } - WS - SEP - NEWLINE - COMMENT { - #ignore whitespace, commas, newlines and comments - } - default { - error "Unexpected value type '$type' found in array" - } - } - } - } - MULTILITERAL { - #triple squoted string - #first newline stripped only if it is the very first element - #(ie *immediately* following the opening delims) - #All whitespace other than newlines is within LITERALPARTS - # ------------------------------------------------------------------------- - #todo - consider extension to toml to allow indent-aware multiline literals - # how - propose as issue in toml github? Use different delim? e.g ^^^ ? - #e.g - # xxx=?'''abc - # def - # etc - # ''' - # - we would like to trimleft each line to the column following the opening delim - # ------------------------------------------------------------------------- - log::debug "---> todict processing multiliteral: $item" - set parts [lrange $item 1 end] - if {[lindex $parts 0 0] eq "NEWLINE"} { - set parts [lrange $parts 1 end] ;#skip it - } - for {set idx 0} {$idx < [llength $parts]} {incr idx} { - set element [lindex $parts $idx] - set type [lindex $element 0] - switch -exact -- $type { - LITERALPART { - append stringvalue [lindex $element 1] - } - NEWLINE { - set val [lindex $element 1] - if {$val eq "nl"} { - append stringvalue \n - } else { - append stringvalue \r\n - } - } - default { - error "Unexpected value type '$type' found in multistring" - } - } - } - set datastructure $stringvalue - } - MULTISTRING { - #triple dquoted string - log::debug "---> to_dict processing multistring: $item" - set stringvalue "" - set idx 0 - set parts [lrange $item 1 end] - for {set idx 0} {$idx < [llength $parts]} {incr idx} { - set element [lindex $parts $idx] - set type [lindex $element 0] - #We use STRINGPART in the tomlish representation as a distinct element to STRING - which would imply wrapping quotes to be reinserted - switch -exact -- $type { - STRING { - #todo - do away with STRING ? - #we don't build MULTISTRINGS containing STRING - but should we accept it? - tomlish::log::warn "double quoting a STRING found in MULTISTRING - should be STRINGPART?" - append stringvalue "\"[::tomlish::utils::unescape_string [lindex $element 1]]\"" - } - STRINGPART { - append stringvalue [::tomlish::utils::unescape_string [lindex $element 1]] + proc toml_to_typedjson {toml} { + set tomlish [::tomlish::from_toml $toml] + set d [tomlish::dict::from_tomlish $tomlish] + #full validation only occurs by re-encoding dict to tomlish + set test [tomlish::from_dict $d] + + set h [tomlish::typedhuddle::from_dict $d] + #huddle jsondump $h + tomlish::huddle::jsondumpraw $h + } + + #proc get_json {tomlish} { + # package require fish::json + # set d [::tomlish::dict::from_tomlish $tomlish] + + # #return [::tomlish::dict_to_json $d] + # return [fish::json::from "struct" $d] + #} + + #return a Tcl list of tomlish tokens + #i.e get a standard list of all the toml terms in string $s + #where each element of the list is a *tomlish* term.. i.e a specially 'tagged' Tcl list. + #(simliar to a tcl 'Huddle' - but also supporting whitespace preservation) + # ---------------------------------------------------------------------------------------------- + # NOTE: the production of tomlish from toml source doesn't indicate the toml source was valid!!! + # e.g we deliberately don't check certain things such as duplicate table declarations here. + # ---------------------------------------------------------------------------------------------- + #Part of the justification for this is that as long as the syntax is toml shaped - we can load files which violate certain rules and allow programmatic manipulation. + # (e.g perhaps a toml editor to highlight violations for fixing) + # A further stage is then necessary to load the tomlish tagged list into a data structure more suitable for efficient query/reading. + # e.g dicts or an object oriented structure + #Note also - *no* escapes in quoted strings are processed. This is up to the datastructure stage + #e.g dict::from_tomlish will substitute \r \n \uHHHH \UHHHHHHH etc + #This is important for tomlish to maintain the ability to perform competely lossless round-trips from toml to tomlish and back to toml. + # (which is handy for testing as well as editing some part of the structure with absolutely no effect on other parts of the document) + #If we were to unescape a tab character for example + # - we have no way of knowing if it was originally specified as \t \u0009 or \U00000009 or directly as a tab character. + # For this reason, we also do absolutely no line-ending transformations based on platform. + # All line-endings are maintained as is, and even a file with mixed lf crlf line-endings will be correctly interpreted and can be 'roundtripped' + + proc from_toml {args} { + + namespace upvar ::tomlish::parse s s + set s [join $args \n] + namespace upvar ::tomlish::parse i i + set i 0 ;#index into s + + namespace upvar ::tomlish::parse is_parsing is_parsing + set is_parsing 1 + + if {[info command ::tomlish::parse::spacestack] eq "::tomlish::parse::spacestack"} { + tomlish::parse::spacestack destroy + } + struct::stack ::tomlish::parse::spacestack + + namespace upvar ::tomlish::parse last_space_action last_space_action + namespace upvar ::tomlish::parse last_space_type last_space_type + + namespace upvar ::tomlish::parse tok tok + set tok "" + + namespace upvar ::tomlish::parse type type + namespace upvar ::tomlish::parse tokenType tokenType + ::tomlish::parse::set_tokenType "" + namespace upvar ::tomlish::parse tokenType_list tokenType_list + set tokenType [list] ;#Flat (un-nested) list of tokentypes found + + namespace upvar ::tomlish::parse lastChar lastChar + set lastChar "" + + + set result "" + namespace upvar ::tomlish::parse nest nest + set nest 0 + + namespace upvar ::tomlish::parse v v ;#array keyed on nest level + + + set v(0) {TOMLISH} + array set s0 [list] ;#whitespace data to go in {SPACE {}} element. + set parentlevel 0 + + + namespace upvar ::tomlish::parse state state + + namespace upvar ::tomlish::parse braceCount braceCount + set barceCount 0 + namespace upvar ::tomlish::parse bracketCount bracketCount + set bracketCount 0 + + set sep 0 + set r 1 + namespace upvar ::tomlish::parse token_waiting token_waiting + set token_waiting [dict create] ;#if ::tok finds a *complete* second token during a run, it will put the 2nd one here to be returned by the next call. + + + + set state "table-space" + ::tomlish::parse::spacestack push {type space state table-space} + namespace upvar ::tomlish::parse linenum linenum;#'line number' of input data. (incremented for each literal linefeed - but not escaped ones in data) + set linenum 1 + + set ::tomlish::parse::state_list [list] + try { + while {$r} { + set r [::tomlish::parse::tok] + #puts stdout "got tok: '$tok' while parsing string '$s' " + set next_tokenType_known 0 ;#whether we begin a new token here based on what terminated the token result of 'tok' + + + #puts "got token: '$tok' tokenType='$tokenType'. while v($nest) = [set v($nest)]" + #puts "-->tok: $tok tokenType='$tokenType'" + set prevstate $state + set transition_info [::tomlish::parse::goNextState $tokenType $tok $state] + #review goNextState could perform more than one space_action + set space_action [dict get $transition_info space_action] + set newstate [dict get $transition_info newstate] ;#use of 'newstate' vs 'state' makes code clearer below + + if {[tcl::string::match "err-*" $state]} { + ::tomlish::log::warn "---- State error in state $prevstate for tokenType: $tokenType token value: $tok. $state aborting parse. [tomlish::parse::report_line]" + lappend v(0) [list ERROR tokentype $tokenType state $prevstate to $state leveldata [set v($nest)]] + return $v(0) + } + # --------------------------------------------------------- + #NOTE there may already be a token_waiting at this point + #set_token_waiting can raise an error here, + # in which case the space_action branch needs to be rewritten to handle the existing token_waiting + # --------------------------------------------------------- + + if {$space_action eq "pop"} { + #pop_trigger_tokens: newline tablename endarray endinlinetable + #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. + set parentlevel [expr {$nest -1}] + set do_append_to_parent 1 ;#most tokens will leave this alone - but some like tentative_accum_squote need to do their own append + switch -exact -- $tokenType { + tentative_accum_squote { + #should only apply within a multiliteral + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the xxx-squote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-squote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-squote-space { + } + default { + error "--- unexpected popped due to tentative_accum_squote but came from state '$prevstate' should have been trailing-squote-space" + } } - CONT { - #When the last non-whitespace character on a line is an unescaped backslash, - #it will be trimmed along with all whitespace (including newlines) up to the next non-whitespace character or closing delimiter - # review - we allow some whitespace in stringpart elements - can a stringpart ever be all whitespace? - set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] - if {$next_nl == -1} { - #last line - set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] - if {$non_ws >= 0} { - append stringvalue "\\" ;#add the sep - } else { - #skip over ws without emitting - set idx [llength $parts] - } - } else { - set parts_til_nl [lrange $parts 0 $next_nl-1] - set non_ws [lsearch -index 0 -start $idx+1 -not $parts_til_nl WS] - if {$non_ws >= 0} { - append stringvalue "\\" - } else { - #skip over ws on this line - set idx $next_nl - #then have to check each subsequent line until we get to first non-whitespace - set trimming 1 - while {$trimming && $idx < [llength $parts]} { - set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] - if {$next_nl == -1} { - #last line - set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] - if {$non_ws >= 0} { - set idx [expr {$non_ws -1}] - } else { - set idx [llength $parts] - } - set trimming 0 - } else { - set non_ws [lsearch -index 0 -start $idx+1 -not [lrange $parts 0 $next_nl-1] WS] - if {$non_ws >= 0} { - set idx [expr {$non_ws -1}] - set trimming 0 - } else { - set idx $next_nl - #keep trimming - } - } + switch -- $tok { + ' { + tomlish::parse::set_token_waiting type single_squote value $tok complete 1 startindex [expr {$i -1}] + } + '' { + #review - we should perhaps return double_squote instead? + #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_squote value "" complete 1 startindex [expr {$i - 2}] + } + ''' { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] + } + '''' { + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left squote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]'" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "'"] + } + MULTILITERAL { + #empty + lappend v($parentlevel) [list LITERALPART "'"] + } + default { + error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" } } } - } - NEWLINE { - #if newline is first element - it is not part of the data of a multistring - if {$idx > 0} { - set val [lindex $element 1] - if {$val eq "nl"} { - append stringvalue \n - } else { - append stringvalue \r\n + ''''' { + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 squotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]''" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "''"] + } + MULTILITERAL { + lappend v($parentlevel) [list LITERALPART "''"] + } + default { + error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" + } } } } - WS { - append stringvalue [lindex $element 1] - } - default { - error "Unexpected value type '$type' found in multistring" - } } - } - set datastructure $stringvalue - } - WS - COMMENT - NEWLINE { - #ignore - } - default { - error "Unexpected tag '$tag' in Tomlish list '$tomlish'" - } - } - } - return $datastructure - } - - - proc _from_dictval_tomltype {parents tablestack keys typeval} { - set type [dict get $typeval type] - set val [dict get $typeval value] - switch -- $type { - ARRAY { - set subitems [list] - foreach item $val { - lappend subitems [_from_dictval [list {*}$parents ARRAY] $tablestack $keys $item] SEP - } - if {[lindex $subitems end] eq "SEP"} { - set subitems [lrange $subitems 0 end-1] - } - return [list ARRAY {*}$subitems] - } - ITABLE { - if {$val eq ""} { - return ITABLE - } else { - return [_from_dictval [list {*}$parents ITABLE] $tablestack $keys $val] - } - } - MULTISTRING { - #value is a raw string that isn't encoded as tomlish - #create a valid toml snippet with the raw value and decode it to the proper tomlish MULTISTRING format - #We need to convert controls in $val to escape sequences - except for newlines - # - #consider an *option* to reformat for long lines? (perhaps overcomplex - byte equiv - but may fold in ugly places) - #we could use a line-length limit to decide when to put in a "line ending backslash" - #and even format it with a reasonable indent so that proper CONT and WS entries are made (?) REVIEW - # - #TODO - set tomlpart "x=\"\"\"\\\n" - append tomlpart $val "\"\"\"" - set tomlish [tomlish::decode::toml $tomlpart] - #e.g if val = " etc\nblah" - #TOMLISH {DOTTEDKEY {{KEY x}} = {MULTISTRING CONT {NEWLINE LF} {WS { }} {STRINGPART etc} {NEWLINE lf} {STRINGPART blah} } } - #lindex 1 3 is the MULTISTRING tomlish list - return [lindex $tomlish 1 3] - } - MULTILITERAL { - #MLL string can contain newlines - but still no control chars - #todo - validate - set tomlpart "x='''\n" - append tomlpart $val ''' - set tomlish [tomlish::decode::toml $tomlpart] - return [lindex $tomlish 1 3] - } - LITERAL { - #from v1.0 spec - "Control characters other than tab are not permitted in a literal string" - #(This rules out raw ANSI SGR - which is somewhat restrictive - but perhaps justified for a config format - # as copy-pasting ansi to a config value is probably not always wise, and it's not something that can be - # easily input via a text editor. ANSI can go in Basic strings using the \e escape if that's accepted v1.1?) - #we could choose to change the type to another format here when encountering invalid chars - but that seems - #like too much magic. We elect to error out and require the dict to have valid data for the types it specifies. - if {[string first ' $val] >=0} { - error "_from_dictval_tomltype error: single quote found in LITERAL - cannot encode dict to TOML-VALID TOMLISH" - } - #detect control chars other than tab - #for this we can use rawstring_to_Bstring_with_escaped_controls - even though this isn't a Bstring - #we are just using the map to detect a difference. - set testval [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $val] - if {$testval ne $val} { - #some escaping would have to be done if this value was destined for a Bstring... - #therefor this string has controls and isn't suitable for a LITERAL according to the specs. - error "_from_dictval_tomltype error: control chars (other than tab) found in LITERAL value - cannot encode dict to TOML-VALID TOMLISH" - } - return [list LITERAL $val] - } - STRING { - return [list STRING [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $val]] - } - INT { - if {![::tomlish::utils::is_int $val]} { - error "_from_dictval_tomltype error: bad INT value '$val' - cannot encode dict to TOML-VALID TOMLISH" - } - return [list INT $val] - } - FLOAT { - if {![::tomlish::utils::is_float $val]} { - error "_from_dictval_tomltype error: bad FLOAT value '$val' - cannot encode dict to TOML-VALID TOMLISH" - } - return [list FLOAT $val] - } - default { - if {$type ni [::tomlish::tags]} { - error "_from_dictval_tomltype error: Unrecognised typename '$type' in {type value } - cannot encode dict to TOML-VALID TOMLISH" - } - return [list $type $val] - } - } - } - - #take a raw string and classify: result is a 2 element list comprised of KEY|SQKEY|DQKEY and the value being the appropriate inner string - proc _from_dict_classify_key {rawval} { - if {![::tomlish::utils::is_barekey $rawval]} { - #requires quoting - # - #Any dot in the key would have been split by to_dict - so if it's present here it's part of this key - not a level separator! - # - #we'll use a basic mechanisms for now to determine the type of quoting - # - whether it has any single quotes or not. - # (can't go in an SQKEY) - # - whether it has any chars that require quoting when in a Bstring - # (if so - then its visual representation might be unsuitable for a key in a toml text file, so escape and put in DQKEY instead of literal SQKEY) - #todo - more? - #REVIEW - the backslash might often be in things like a regex or windows path - which is often better expressed in a literal SQKEY - # from literal examples: - # 'c:\Users\nodejs\templates' - # '<\i\c*\s*>' - #If these are in *keys* our basic test will express these as: - # "c:\\Users\\nodejs\\templates" - # "<\\i\\c*\\s*>" - # This still works - but a smarter test might determine when SQKEY is the better form? - #when coming from external systems - can we even know if the value was already escaped? REVIEW - #Probably when coming from json - we know it's already escaped - and so we build our dict converting keys to unescaped - #TODO - clarify in documentation that keys resulting from to_dict are in 'normalized' (unescaped) form - # - #For keys - we currently (2025) are only allowed barekeys,basic strings and literal strings. (no multiline forms) - set k_escaped [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $rawval] - if {[string length $k_escaped] != [string length $rawval]} { - #escaping made a difference - set has_escape_requirement 1 - } else { - set has_escape_requirement 0 - } - if {[string first ' $rawval] >=0 || $has_escape_requirement} { - #basic string - # (any ANSI SGR sequence will end up here in escaped form ) - return [list DQKEY $k_escaped] - } else { - #literal string - return [list SQKEY $rawval] - } - } else { - return [list KEY $rawval] - } - } - - #the quoting implies the necessary escaping for DQKEYs - proc _from_dict_join_and_quote_raw_keys {rawkeylist} { - set result "" - foreach rk $rawkeylist { - lassign [_from_dict_classify_key $rk] type val - switch -- $type { - SQKEY { - append result "'$val'." - } - DQKEY { - append result "\"$val\"." - } - KEY { - append result "$val." - } - } - } - return [string range $result 0 end-1] - } - proc _from_dictval {parents tablestack keys vinfo} { - set k [lindex $keys end] - set K_PART [_from_dict_classify_key $k] ;#get [list SQKEY ] - puts stderr "---parents:'$parents' keys:'$keys' vinfo: $vinfo---" - puts stderr "---tablestack: $tablestack---" - set result [list] - set lastparent [lindex $parents end] - if {$lastparent in [list "" do_inline]} { - if {[tomlish::dict::is_tomlish_typeval $vinfo]} { - set type [dict get $vinfo type] - #treat ITABLE differently? - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] - lappend result DOTTEDKEY [list $K_PART {WS { }}] = {WS { }} $sublist {NEWLINE lf} - } else { - if {$vinfo ne ""} { - - #set result [list DOTTEDKEY [list [list KEY $k]] = ] - #set records [list ITABLE] - - set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] - - if {$lastparent eq "do_inline"} { - set result [list DOTTEDKEY [list $K_PART] =] - set records [list ITABLE] - } else { - set tname [_from_dict_join_and_quote_raw_keys [list $k]] - set result [list TABLE $tname {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $k]] - set records [list] - } - - - - set lastidx [expr {[dict size $vinfo] -1}] - set dictidx 0 - dict for {vk vv} $vinfo { - set VK_PART [_from_dict_classify_key $vk] ;#get [list SQKEY ] - #(SQKEY & DQKEY do not have the enclosing quotes in their returned val) - #if {[regexp {\s} $vk] || [string first . $vk] >= 0} { - # set VK_PART [list SQKEY $vk] - #} else { - # set VK_PART [list KEY $vk] - #} - if {[tomlish::dict::is_tomlish_typeval $vv]} { - #type x value y - #REVIEW - we could detect if value is an array of objects, - #and depending on parent context - emit a series of TABLEARRAY records instead of a DOTTEDKEY record containing an ARRAY of objects - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] - set record [list DOTTEDKEY [list $VK_PART {WS { }}] = {WS { }} $sublist] - } else { - if {$vv eq ""} { - #experimental - if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - puts stderr "_from_dictval could uninline KEY $vk (tablestack:$tablestack)" - #set tname [tomlish::dict::name_from_tablestack [list {*}$tablestack [list T $vk]]] - - #we can't just join normalized keys - need keys with appropriate quotes and escapes - #set tname [join [list {*}$keys $vk] .] ;#WRONG - set tq [_from_dict_join_and_quote_raw_keys [list {*}$keys $vk]] - set record [list TABLE $tq {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $vk]] - } else { - set record [list DOTTEDKEY [list $VK_PART] = ITABLE] - set tablestack [list {*}$tablestack [list I $vk]] - } - } else { - if { 0 } { - #experiment.. sort of getting there. - if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - puts stderr "_from_dictval could uninline2 KEYS [list {*}$keys $vk] (tablestack:$tablestack)" - set tq [_from_dict_join_and_quote_raw_keys [list {*}$keys $vk]] - set record [list TABLE $tq {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $vk]] - - #review - todo? - set dottedkey_value [_from_dictval [list {*}$parents TABLE] $tablestack [list {*}$keys $vk] $vv] - lappend record {*}$dottedkey_value - - } else { - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + triple_squote { + #presumably popping multiliteral-space + ::tomlish::log::debug "---- triple_squote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + switch -exact -- [lindex $part 0] { + MULTILITERAL { + lappend merged $part + } + LITERALPART { + if {$lasttype eq "LITERALPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" } - } else { - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] } + set lasttype [lindex $part 0] } + set v($nest) $merged } - if {$dictidx != $lastidx} { - #lappend record SEP - if {$lastparent eq "do_inline"} { - lappend record SEP - } else { - lappend record {NEWLINE lf} - } - } - lappend records $record - incr dictidx - } - if {$lastparent eq "do_inline"} { - lappend result $records {NEWLINE lf} - } else { - lappend result {*}$records {NEWLINE lf} - } - } else { - if {$lastparent eq "do_inline"} { - lappend result DOTTEDKEY [list $K_PART] = ITABLE {NEWLINE lf} - } else { - set tname [_from_dict_join_and_quote_raw_keys [list $k]] - lappend result TABLE $tname {NEWLINE lf} - } - } - } - } else { - #lastparent is not toplevel "" or "do_inline" - if {[tomlish::dict::is_tomlish_typeval $vinfo]} { - #type x value y - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] - lappend result {*}$sublist - } else { - if {$lastparent eq "TABLE"} { - #review - dict for {vk vv} $vinfo { - set VK_PART [_from_dict_classify_key $vk] ;#get [list SQKEY ] - set dottedkey_value [_from_dictval [list {*}$parents DOTTEDKEY] $tablestack [list {*}$keys $vk] $vv] - lappend result [list DOTTEDKEY [list $VK_PART] = $dottedkey_value {NEWLINE lf}] - } - } else { - if {$vinfo ne ""} { - set lastidx [expr {[dict size $vinfo] -1}] - set dictidx 0 - set sub [list] - #REVIEW - #set result $lastparent ;#e.g sets ITABLE - set result ITABLE - set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] - dict for {vk vv} $vinfo { - set VK_PART [_from_dict_classify_key $vk] ;#get [list SQKEY ] - if {[tomlish::dict::is_tomlish_typeval $vv]} { - #type x value y - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] - set record [list DOTTEDKEY [list $VK_PART] = $sublist] - } else { - if {$vv eq ""} { - #can't just uninline at this level - #we need a better method to query main dict for uninlinability at each level - # (including what's been inlined already) - #if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - # puts stderr "_from_dictval uninline2 KEY $keys" - # set tname [_from_dict_join_and_quote_raw_keys [list {*}$keys $vk]] - # set record [list TABLE $tname {NEWLINE lf}] - # set tablestack [list {*}$tablestack [list T $vk]] - #} else { - set record [list DOTTEDKEY [list $VK_PART] = ITABLE] - #} - } else { - #set sub [_from_dictval ITABLE $vk $vv] - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - #set record [list DOTTEDKEY [list $VK_PART] = ITABLE $dottedkey_value] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + tentative_accum_dquote { + #should only apply within a multistring + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the trailing-dquote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-dquote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-dquote-space { + } + default { + error "--- unexpected popped due to tentative_accum_dquote but came from state '$prevstate' should have been trailing-dquote-space" } } - if {$dictidx != $lastidx} { - lappend record SEP + switch -- $tok { + {"} { + tomlish::parse::set_token_waiting type single_dquote value $tok complete 1 startindex [expr {$i -1}] + } + {""} { + #review - we should perhaps return double_dquote instead? + #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_dquote value "" complete 1 startindex [expr {$i - 2}] + } + {"""} { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 3}] + } + {""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left dquote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {"}] + } + MULTISTRING { + #empty + lappend v($parentlevel) [list STRINGPART {"}] + } + default { + error "--- don't know how to integrate extra trailing dquote with data $v($parentlevel)" + } + } + } + {"""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 dquotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {""}] + } + MULTISTRING { + lappend v($parentlevel) [list STRINGPART {""}] + } + default { + error "--- don't know how to integrate extra trailing 2 dquotes with data $v($parentlevel)" + } + } + } } - lappend result $record - incr dictidx } - } else { - puts stderr "table x-1" - lappend result DOTTEDKEY [list $K_PART] = ITABLE + triple_dquote { + #presumably popping multistring-space + ::tomlish::log::debug "---- triple_dquote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + switch -exact -- [lindex $part 0] { + MULTISTRING { + lappend merged $part + } + STRINGPART { + if {$lasttype eq "STRINGPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + CONT - WS { + lappend merged $part + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_dquote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" + } + } + set lasttype [lindex $part 0] + } + set v($nest) $merged + } + equal { + #pop caused by = + switch -exact -- $prevstate { + dottedkey-space { + tomlish::log::debug "---- equal ending dottedkey-space for last_space_action pop" + #re-emit for parent space + tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] + } + dottedkey-space-tail { + #experiment? + tomlish::log::debug "---- equal ending dottedkey-space-tail for last_space_action pop" + #re-emit for parent space + tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] + } + } + } + newline { + incr linenum + lappend v($nest) [list NEWLINE $tok] + } + tablename { + #note: a tablename only 'pops' if we are greater than zero + error "---- tablename pop should already have been handled as special case zeropoppushspace in goNextState" + } + tablearrayname { + #!review - tablearrayname different to tablename regarding push/pop? + #note: a tablename only 'pops' if we are greater than zero + error "---- tablearrayname pop should already have been handled as special case zeropoppushspace in goNextState" + } + endarray { + #nothing to do here. + } + comma { + #comma for inline table will pop the keyvalue space + lappend v($nest) "SEP" + } + endinlinetable { + ::tomlish::log::debug "---- endinlinetable for last_space_action pop" + } + default { + error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" + } + } + if {$do_append_to_parent} { + #e.g tentative_accum_squote does it's own appends as necessary - so won't get here + lappend v($parentlevel) [set v($nest)] } - } - } - } - return $result - } - - proc from_dict {d} { - #consider: - # t1={a=1,b=2} - # x = 1 - #If we represent t1 as an expanded table we get - # [t1] - # a=1 - # b=2 - # x=1 - # --- which is incorrect - as x was a toplevel key like t1! - #This issue doesn't occur if x is itself an inline table - # t1={a=1,b=2} - # x= {no="problem"} - # - # (or if we were to reorder x to come before t1) + incr nest -1 - #As the dictionary form doesn't distinguish the structure used to create tables {[table1]\nk=v} vs inline {table1={k=v}} - #Without a solution, from_dict would have to always produce the inline form for toplevel tables unless we allowed re-ordering, - #which is unpreferred here. + } elseif {$last_space_action eq "push"} { + set prevnest $nest + incr nest 1 + set v($nest) [list] + # push_trigger_tokens: barekey dquotedkey startinlinetable startarray tablename tablearrayname - #A possible solution: - #scan the top level to see if all (trailing) elements are themselves dicts - # (ie not of form {type XXX value yyy}) - # - # A further point is that if all root level values are at the 'top' - we can treat lower table-like structures as {[table]} elements - #ie we don't need to force do_inline if all the 'simple' keys are before any compound keys - #set root_has_values 0 - #approach 1) - the naive approach - forces inline when not always necessary - #dict for {k v} $d { - # if {[llength $v] == 4 && [lindex $v 0] eq "type"} { - # set root_has_values 1 - # break - # } - #} - - - #approach 2) - track the position of last {type x value y} in the dictionary built by to_dict - # - still not perfect. Inlines dotted tables unnecessarily - #This means from_dict doesn't produce output optimal for human editing. - set last_simple [tomlish::dict::last_tomltype_posn $d] - - - ## set parent "do_inline" ;#a value used in _from_dictval to distinguish from "" or other context based parent values - #Any keys that are themselves tables - will need to be represented inline - #to avoid reordering, or incorrect assignment of plain values to the wrong table. - - ## set parent "" - #all toplevel keys in the dict structure can represent subtables. - #we are free to use {[tablename]\n} syntax for toplevel elements. - - - set tomlish [list TOMLISH] - set dictposn 0 - set tablestack [list [list T root]] ;#todo - dict for {t tinfo} $d { - if {$last_simple > $dictposn} { - set parents [list do_inline] - } else { - set parents [list ""] - } - set keys [list $t] - #review - where to make decision on - # DOTTEDKEY containing array of objs - #vs - # list of TABLEARRAY records - #At least for the top - set trecord [_from_dictval $parents $tablestack $keys $tinfo] - lappend tomlish $trecord - incr dictposn - } - return $tomlish - } - - proc json_to_toml {json} { - #*** !doctools - #[call [fun json_to_toml] [arg json]] - #[para] - - set tomlish [::tomlish::from_json $json] - set toml [::tomlish::to_toml $tomlish] - } - - #TODO use huddle? - proc from_json {json} { - #set jstruct [::tomlish::json_struct $json] - #return [::tomlish::from_json_struct $jstruct] - package require huddle - package require huddle::json - set h [huddle::json::json2huddle parse $json] - - } - - proc from_json_struct {jstruct} { - package require fish::json_toml - return [fish::json_toml::jsonstruct2tomlish $jstruct] - } - - proc toml_to_json {toml} { - set tomlish [::tomlish::from_toml $toml] - return [::tomlish::get_json $tomlish] - } - - proc get_json {tomlish} { - package require fish::json - set d [::tomlish::to_dict $tomlish] - - #return [::tomlish::dict_to_json $d] - return [fish::json::from "struct" $d] - } - - - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish ---}] -} -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -namespace eval tomlish::build { - #STRING,INT,FLOAT,BOOL, DATETIME - simple wrappers for completeness - # take a value of the appropriate type and wrap as a tomlish tagged item - proc STRING {s} { - return [list STRING [::tomlish::utils::rawstring_to_Bstring_with_escaped_controls $s]] - } - proc LITERAL {litstring} { - error todo - } + switch -exact -- $tokenType { + tentative_trigger_squote - tentative_trigger_dquote { + #### this startok will always be tentative_accum_squote/tentative_accum_dquote starting with one accumulated squote/dquote + if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { + lassign [dict get $transition_info starttok] starttok_type starttok_val + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType $starttok_type + set tok $starttok_val + } + } + single_squote { + #JMN - REVIEW + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + triple_squote { + ::tomlish::log::debug "---- push trigger tokenType triple_squote" + set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERALPART + } + squotedkey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + triple_dquote { + set v($nest) [list MULTISTRING] ;#container for NEWLINE,STRINGPART,CONT + } + dquotedkey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + barekey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + set waiting [tomlish::parse::get_token_waiting] + if {[llength $waiting]} { + set i [dict get $waiting startindex] + tomlish::parse::clear_token_waiting + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } else { + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + } + tablename { + #note: we do not use the output of tablename_trim to produce a tablename for storage in the tomlish list! + #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish + # back to toml file will be identical. + #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. + # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, + # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names from + # a structural perspective. - proc INT {i} { - #whole numbers, may be prefixed with a + or - - #Leading zeros are not allowed - #Hex,octal binary forms are allowed (toml 1.0) - #We will error out on encountering commas, as commas are interpreted differently depending on locale (and don't seem to be supported in the toml spec anyway) - #!todo - Tcl can handle bignums - bigger than a 64bit signed long as specified in toml. - # - We should probably raise an error for number larger than this and suggest the user supply it as a string? - if {[tcl::string::last , $i] > -1} { - error "Unable to interpret '$i' as an integer. Use underscores if you need a thousands separator [::tomlish::parse::report_line]" - } - if {![::tomlish::utils::int_validchars $i]} { - error "Unable to interpret '$i' as an integer. Only 0-9 + 1 _ characters are acceptable. [::tomlish::parse::report_line]" - } + #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, + # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the + # tomlish list? - if {[::tomlish::utils::is_int $i]} { - return [list INT $i] - } else { - error "'$i' is not a valid integer as per the Toml spec. [::tomlish::parse::report_line]" - } + #set trimtable [tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$trimtable'" + set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name + #note also that equivalent tablenames may have different toml representations even after being trimmed! + #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) + #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. + } + tablearrayname { + #set trimtable [tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablearrayname: '$trimtable'" + set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name + } + startarray { + set v($nest) [list ARRAY] ;#$tok is just the opening bracket - don't output. + } + startinlinetable { + set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. + } + default { + error "---- push trigger tokenType '$tokenType' not yet implemented" + } + } - } + } else { + #no space level change + switch -exact -- $tokenType { + squotedkey { + #puts "---- squotedkey in state $prevstate (no space level change)" + lappend v($nest) [list SQKEY $tok] + } + dquotedkey { + #puts "---- dquotedkey in state $prevstate (no space level change)" + lappend v($nest) [list DQKEY $tok] + } + barekey { + lappend v($nest) [list KEY $tok] + } + dotsep { + lappend v($nest) [list DOTSEP] + } + starttablename { + #$tok is triggered by the opening bracket and sends nothing to output + } + starttablearrayname { + #$tok is triggered by the double opening brackets and sends nothing to output + } + tablename - tablenamearray { + error "---- did not expect 'tablename/tablearrayname' without space level change (no space level change)" + #set v($nest) [list TABLE $tok] + } + endtablename - endtablearrayname { + #no output into the tomlish list for this token + } + startinlinetable { + puts stderr "---- decode::toml error. did not expect startinlinetable without space level change (no space level change)" + } + single_dquote { + switch -exact -- $newstate { + string-state { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "string" + set tok "" + } + dquoted-key { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "dquotedkey" + set tok "" + } + multistring-space { + lappend v($nest) [list STRINGPART {"}] + #may need to be joined on pop if there are neighbouring STRINGPARTS + } + default { + error "---- single_dquote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_dquote { + #leading extra quotes - test: toml_multistring_startquote2 + switch -exact -- $prevstate { + itable-keyval-value-expected - keyval-value-expected { + puts stderr "tomlish::decode::toml double_dquote TEST" + #empty string + lappend v($nest) [list STRINGPART ""] + } + multistring-space { + #multistring-space to multistring-space + lappend v($nest) [list STRINGPART {""}] + } + default { + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" + } + } - proc FLOAT {f} { - #convert any non-lower case variants of special values to lowercase for Toml + } + single_squote { + switch -exact -- $newstate { + literal-state { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "literal" + set tok "" + } + squoted-key { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + multiliteral-space { + #false alarm squote returned from tentative_accum_squote pop + ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" + #(single squote - not terminating space) + lappend v($nest) [list LITERALPART '] + #may need to be joined on pop if there are neighbouring LITERALPARTs + } + default { + error "---- single_squote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_squote { + switch -exact -- $prevstate { + keyval-value-expected { + lappend v($nest) [list LITERAL ""] + } + multiliteral-space { + #multiliteral-space to multiliteral-space + lappend v($nest) [list LITERALPART ''] + } + default { + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" + } + } + } + enddquote { + #nothing to do? + set tok "" + } + endsquote { + set tok "" + } + string { + #JJJJ + set tok [tomlish::from_Bstring $tok] + lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes + } + literal { + lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes + } + multistring { + #review + #JJJJ ? + lappend v($nest) [list MULTISTRING $tok] + } + stringpart { + #JJJJ + set tok [tomlish::from_Bstring $tok] + lappend v($nest) [list STRINGPART $tok] ;#will not get wrapped in dquotes directly + } + multiliteral { + lappend v($nest) [LIST MULTILITERAL $tok] + } + literalpart { + lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly + } + untyped_value { + #would be better termed unclassified_value + #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. + unset -nocomplain tag + if {$tok in {true false}} { + set tag BOOL + } else { + if {[::tomlish::utils::is_int $tok]} { + set tag INT + } else { + if {[string is integer -strict $tok]} { + #didn't qualify as a toml int - but still an int + #probably means is_int is limiting size and not accepting bigints (configurable?) + #or it didn't qualify due to more than 1 leading zero + #or other integer format issue such as repeated underscores + error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. (looks close to being an int. Formatting or range issue?) [tomlish::parse::report_line] (no space level change)" + } else { + #DDDD + if {[::tomlish::utils::is_float $tok]} { + set tag FLOAT + } elseif {[::tomlish::utils::is_localtime $tok]} { + set tag TIME-LOCAL + } elseif {[::tomlish::utils::is_timepart $tok]} { + #Note we must allow lone timepart here (not just is_localtime which doesn't allow tz offsets) in case it followed a previous localdate + set tag DATETIME ;#?? review standalone time with tz - no specific tag - only allowed as followup value from DATETIME-LOCAL + } elseif {[::tomlish::utils::is_datepart $tok]} { + set tag DATE-LOCAL + } elseif {[::tomlish::utils::is_datetime $tok]} { + #not just a date or just a time + #could be either local or have tz offset + #DDDD JJJ + set norm [string map {" " T} $tok];#prob unneeded - we won't get here if there was a space - would arrive as 2 separate tokens review. + lassign [split $norm T] dp tp + if {[::tomlish::utils::is_localtime $tp]} { + set tag DATETIME-LOCAL + } else { + set tag DATETIME + } + } elseif {[::tomlish::utils::is_datetime X$tok] || [::tomlish::utils::is_timepart X$tok]} { + # obsolete + #Note we must allow lone timepart here (not just is_localtime which doesn't allow tz offsets) in case it followed a previous localdate + #e.g x= 2025-01-01 02:34Z + #The dict::from_tomlish validation will catch an invalid standaline timepart, or combine with leading date if applicable. + set tag DATETIME + } else { + error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. [tomlish::parse::report_line] (no space level change)" + } + } + } + } + #assert either tag is set, or we errored out. + lappend v($nest) [list $tag $tok] + + } + comment { + #puts stdout "----- comment token returned '$tok'------" + #JJJJ + set tok [tomlish::from_comment $tok] + lappend v($nest) [list COMMENT "$tok"] + } + equal { + #we append '=' to the nest so that any surrounding whitespace is retained. + lappend v($nest) = + } + comma { + lappend v($nest) SEP + } + newline { + incr linenum + lappend v($nest) [list NEWLINE $tok] + } + whitespace { + lappend v($nest) [list WS $tok] + } + continuation { + lappend v($nest) CONT + } + bom { + lappend v($nest) BOM + } + eof { + #ok - nothing more to add to the tomlish list. + #!todo - check previous tokens are complete/valid? + } + default { + error "--- unknown tokenType '$tokenType' during state $prevstate [::tomlish::parse::report_line] (no space level change)" + } + } + } + + if {!$next_tokenType_known} { + ::tomlish::log::notice "---- tomlish::decode::toml - current tokenType:$tokenType Next token type not known" + ::tomlish::parse::set_tokenType "" + set tok "" + } + + if {$state eq "end-state"} { + break + } + + + } + + #while {$nest > 0} { + # lappend v([expr {$nest -1}]) [set v($nest)] + # incr nest -1 + #} + while {[::tomlish::parse::spacestack size] > 1} { + ::tomlish::parse::spacestack pop + lappend v([expr {$nest -1}]) [set v($nest)] + incr nest -1 + + #set parent [spacestack peek] ;#the level being appended to + #lassign $parent type state + #if {$type eq "space"} { + # + #} elseif {$type eq "buffer"} { + # lappend v([expr {$nest -1}]) {*}[set v($nest)] + #} else { + # error "invalid spacestack item: $parent" + #} + } + + } finally { + set is_parsing 0 + } + return $v(0) + } + + #toml dquoted string to tomlish STRING + # - only allow specified escape sequences + # - allow any unicode except those that must be escaped: dquote, bsl, and control chars(except tab) + proc from_Bstring {bstr} { + #JJJJ + if {[catch { + tomlish::utils::unescape_string $bstr + } errM]} { + return -code error -errorcode {TOML SYNTAX INVALIDESCAPE} "tomlish::from_Bstring toml Bstring contains invalid escape sequence\n$errM" ;#review + } + #assert: all escapes are now valid + + if {[regexp {[\u0000-\u0008\u000A-\u001F\u007f]} $bstr]} { + set msg "tomlish::from_Bstring toml Bstring contains controls that must be escaped" + return -code error -errorcode {TOML SYNTAX BSTRINGUNESCAPEDCONTROLS} $msg ;#review + } + return $bstr + } + #validate toml comment + # - disallow controls that must be escaped + #from spec: + # "Control characters other than tab (U+0000 to U+0008, U+000A to U+001F, U+007F) are not permitted in comments." + proc from_comment {comment} { + if {[regexp {[\u0000-\u0008\u000A-\u001F\u007f]} $comment]} { + set msg "tomlish::from_comment toml comment contains controls that must be escaped" + return -code error -errorcode {TOML SYNTAX COMMENTUNESCAPEDCONTROLS} $msg ;#review + } + return $comment + } + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish ---}] +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +namespace eval tomlish::build { + #STRING,INT,FLOAT,BOOL, DATETIME - simple wrappers for completeness + # take a value of the appropriate type and wrap as a tomlish tagged item + proc STRING {s} { + return [list STRING [::tomlish::utils::rawstring_to_Bstring_with_escaped_controls $s]] + } + proc LITERAL {litstring} { + error todo + } + + proc INT {i} { + #whole numbers, may be prefixed with a + or - + #Leading zeros are not allowed + #Hex,octal binary forms are allowed (toml 1.0) + #We will error out on encountering commas, as commas are interpreted differently depending on locale (and don't seem to be supported in the toml spec anyway) + #!todo - Tcl can handle bignums - bigger than a 64bit signed long as specified in toml. + # - We should probably raise an error for number larger than this and suggest the user supply it as a string? + if {[tcl::string::last , $i] > -1} { + error "Unable to interpret '$i' as an integer. Use underscores if you need a thousands separator [::tomlish::parse::report_line]" + } + if {![::tomlish::utils::int_validchars $i]} { + error "Unable to interpret '$i' as an integer. Only 0-9 + 1 _ characters are acceptable. [::tomlish::parse::report_line]" + } + + if {[::tomlish::utils::is_int $i]} { + return [list INT $i] + } else { + error "'$i' is not a valid integer as per the Toml spec. [::tomlish::parse::report_line]" + } + + } + + proc FLOAT {f} { + #convert any non-lower case variants of special values to lowercase for Toml if {[::tcl::string::tolower $f] in {nan +nan -nan inf +inf -inf}} { return [list FLOAT [tcl::string::tolower $f]] } @@ -1881,6 +1719,9 @@ namespace eval tomlish::build { error "Unable to interpret '$str' as Toml datetime. Check your input, or check that tomlish is able to handle all Toml datetimes properly [::tomlish::parse::report_line]" } } + proc DATETIME-LOCAL {str} { + error "build::DATETIME-LOCAL todo" + } proc BOOLEAN {b} { #convert any Tcl-acceptable boolean to boolean as accepted by toml - lower case true/false @@ -1959,7 +1800,7 @@ namespace eval tomlish::encode { # (e.g from_dict could make formatting decisions in the tomlish it produces) # #e.g duplicate keys etc can exist in the toml output. - #The to_dict from_dict (or any equivalent processor pair) is responsible for validation and conversion + #The dict::from_tomlish tomlish::from_dict (or any equivalent processor pair) is responsible for validation and conversion #back and forth of escape sequences where appropriate. #--------------------------------------------------------------------------------------------------------- proc tomlish {list {context ""}} { @@ -2131,7 +1972,8 @@ namespace eval tomlish::encode { INT - BOOL - FLOAT - - DATETIME { + DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + #DDDD append toml [lindex $item 1] } INCOMPLETE { @@ -2172,1208 +2014,649 @@ namespace eval tomlish::decode { #[para] #[list_begin definitions] - #return a Tcl list of tomlish tokens - #i.e get a standard list of all the toml terms in string $s - #where each element of the list is a *tomlish* term.. i.e a specially 'tagged' Tcl list. - #(simliar to a tcl 'Huddle' - but also supporting whitespace preservation) - # ---------------------------------------------------------------------------------------------- - # NOTE: the production of tomlish from toml source doesn't indicate the toml source was valid!!! - # e.g we deliberately don't check certain things such as duplicate table declarations here. - # ---------------------------------------------------------------------------------------------- - #Part of the justification for this is that as long as the syntax is toml shaped - we can load files which violate certain rules and allow programmatic manipulation. - # (e.g perhaps a toml editor to highlight violations for fixing) - # A further stage is then necessary to load the tomlish tagged list into a data structure more suitable for efficient query/reading. - # e.g dicts or an object oriented structure - #Note also - *no* escapes in quoted strings are processed. This is up to the datastructure stage - #e.g to_dict will substitute \r \n \uHHHH \UHHHHHHH etc - #This is important for tomlish to maintain the ability to perform competely lossless round-trips from toml to tomlish and back to toml. - # (which is handy for testing as well as editing some part of the structure with absolutely no effect on other parts of the document) - #If we were to unescape a tab character for example - # - we have no way of knowing if it was originally specified as \t \u0009 or \U00000009 or directly as a tab character. - # For this reason, we also do absolutely no line-ending transformations based on platform. - # All line-endings are maintained as is, and even a file with mixed lf crlf line-endings will be correctly interpreted and can be 'roundtripped' - proc toml {args} { - #*** !doctools - #[call [fun toml] [arg arg...]] - #[para] return a Tcl list of tomlish tokens + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::decode ---}] +} +#decode toml to tomlish +#interp alias {} tomlish::from_toml {} tomlish::decode::toml - set s [join $args \n] +namespace eval tomlish::utils { + #*** !doctools + #[subsection {Namespace tomlish::utils}] + #[para] + #[list_begin definitions] - namespace upvar ::tomlish::parse is_parsing is_parsing - set is_parsing 1 - if {[info command ::tomlish::parse::spacestack] eq "::tomlish::parse::spacestack"} { - tomlish::parse::spacestack destroy - } - struct::stack ::tomlish::parse::spacestack - namespace upvar ::tomlish::parse last_space_action last_space_action - namespace upvar ::tomlish::parse last_space_type last_space_type + #basic generic quote matching for single and double quotes + #note for example that {[o'malley]} will return sq - as the single quote is not closed or wrapped in double quotes + proc tok_in_quotedpart {tok} { + set sLen [tcl::string::length $tok] + set quote_type "" + set had_slash 0 + for {set i 0} {$i < $sLen} {incr i} { + set c [tcl::string::index $tok $i] + if {$quote_type eq ""} { + if {$had_slash} { + #don't enter quote mode + #leave slash_mode because even if current char is slash - it is escaped + set had_slash 0 + } else { + set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] + switch -- $ctype { + dq { + set quote_type dq + } + sq { + set quote_type sq + } + bsl { + set had_slash 1 + } + } + } + } else { + if {$had_slash} { + #don't leave quoted mode + #leave slash_mode because even if current char is slash - it is escaped + set had_slash 0 + } else { + set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] + switch -- $ctype { + dq { + if {$quote_type eq "dq"} { + set quote_type "" + } + } + sq { + if {$quote_type eq "sq"} { + set quote_type "" + } + } + bsl { + set had_slash 1 + } + } + } + } + } + return $quote_type ;#dq | sq + } - namespace upvar ::tomlish::parse tok tok - set tok "" + proc hex_escape_info {slashx} { + set exp {^\\x([0-9a-fA-F]{2}$)} + if {[regexp $exp $slashx match hex]} { + return [list ok [list char [subst -nocommand -novariable $slashx]]] + } else { + return [list err [list reason "Supplied string not of the form \\xHH where H in \[0-9a-fA-F\]"]] + } + } + proc unicode_escape_info {slashu} { + #!todo + # validate that slashu is either a \uxxxx or \Uxxxxxxxx value of the correct length and + # is a valid 'unicode scalar value' (any Unicode code point except high-surrogate and low-surrogate code points) + # ie integers in the range 0 to D7FF16 and E00016 to 10FFFF16 inclusive + #expr {(($x >= 0) && ($x <= 0xD7FF16)) || (($x >= 0xE00016) && ($x <= 0x10FFFF16))} + if {[tcl::string::match {\\u*} $slashu]} { + set exp {^\\u([0-9a-fA-F]{4}$)} + if {[regexp $exp $slashu match hex]} { + if {[scan $hex %4x dec] != 1} { + #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? + return [list err [list reason "Failed to convert '$hex' to decimal"]] + } else { + return [list ok [list char [subst -nocommand -novariable $slashu]]] + } + } else { + return [list err [list reason "Supplied string not of the form \\uHHHH where H in \[0-9a-fA-F\]"]] + } + } elseif {[tcl::string::match {\\U*} $slashu]} { + set exp {^\\U([0-9a-fA-F]{8}$)} + if {[regexp $exp $slashu match hex]} { + if {[scan $hex %8x dec] != 1} { + #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? + return [list err [list reason "Failed to convert '$hex' to decimal"]] + } else { + if {(($dec >= 0) && ($dec <= 0xD7FF16)) || (($dec >= 0xE00016) && ($dec <= 0x10FFFF16))} { + return [list ok [list char [subst -nocommand -novariable $slashu]]] + } else { + return [list err [list reason "$slashu is not within the 'unicode scalar value' ranges 0 to 0xD7FF16 or 0xE00016 to 0x10FFFF16"]] + } + } + } else { + return [list err [list reason "Supplied string not of the form \\UHHHHHHHH where H in \[0-9a-fA-F\]"]] + } + } else { + return [list err [list reason "Supplied string did not start with \\u or \\U" ]] + } - namespace upvar ::tomlish::parse type type - namespace upvar ::tomlish::parse tokenType tokenType - ::tomlish::parse::set_tokenType "" - namespace upvar ::tomlish::parse tokenType_list tokenType_list - set tokenType [list] ;#Flat (un-nested) list of tokentypes found + } - namespace upvar ::tomlish::parse lastChar lastChar - set lastChar "" + #Note that unicode characters don't *have* to be escaped. + #So if we provide a function named 'escape_string', the name implies the inverse of unescape_string which unescapes unicode \u \U values. + #- an inverse of unescape_string would encode all unicode chars unnecessarily. + #- as toml accepts a compact escape sequence for common chars such as tab,backspace,linefeed etc but also allows the full form \u009 etc + #- escape_string and unescape_string would not be reliably roundtrippable inverses anyway. + #REVIEW - provide it anyway? When would it be desirable to use? + variable Bstring_control_map [dict create] + dict set Bstring_control_map \b {\b} + dict set Bstring_control_map \n {\n} + dict set Bstring_control_map \r {\r} + dict set Bstring_control_map \" {\"} + dict set Bstring_control_map \x1b {\e} ;#In spec it's included in the list of 'must be escaped', as well as the 'convenience' escapes - so we make it go both ways. + dict set Bstring_control_map \\ "\\\\" - set result "" - namespace upvar ::tomlish::parse nest nest - set nest 0 + #\e for \x1b seems like it might be included - v1.1?? hard to find current state of where toml is going :/ + #for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped. + #8 = \b - already in list. + #built the remainder whilst checking for entries already hardcoded above -in case more are added to the hardcoded list + for {set cdec 0} {$cdec <= 7} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } + } + for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } + } + # \u007F = 127 + dict set Bstring_control_map [format %c 127] \\u007F - namespace upvar ::tomlish::parse v v ;#array keyed on nest level + # ------------------------------------------------------------------ + variable Literal_control_map [dict create] + #controls other than tab + for {set cdec 0} {$cdec <= 8} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Literal_control_map $char]} { + dict set Literal_control_map $char \\u$hhhh + } + } + for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Literal_control_map $char]} { + dict set Literal_control_map $char \\u$hhhh + } + } + # \u007F = 127 + dict set Literal_control_map [format %c 127] \\u007F + # ------------------------------------------------------------------ + variable Multiliteral_control_map + set Multiliteral_control_map [dict remove $Literal_control_map \n] + variable String_control_map + set String_control_map [dict remove $Literal_control_map \\] - set v(0) {TOMLISH} - array set s0 [list] ;#whitespace data to go in {SPACE {}} element. - set parentlevel 0 - namespace upvar ::tomlish::parse i i - set i 0 + variable MultiBstring_totoml_map + #'minimally' escaped sequences of double quotes. + #e.g {""\"""\"} vs {\"\"\"\"\"} + #This produces easier to read toml - and in many cases may be more likely to match original format when roundtripped from dict datastructure + # REVIEW - should this be configurable? + set MultiBstring_totoml_map [dict remove $Bstring_control_map {"} \r \n] + dict set MultiBstring_totoml_map {"""} {""\"} ;#" editor hack: commented quote for dumb syntax highlighers - namespace upvar ::tomlish::parse state state + #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! + #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) + #for example - can be used by from_dict to produce valid Bstring data for a tomlish record + proc rawstring_to_Bstring_with_escaped_controls {str} { + #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. + #we'll use a string map with an explicit list rather than algorithmic at runtime + # - the string map is probably more performant than splitting a string, especially if it's large - namespace upvar ::tomlish::parse braceCount braceCount - set barceCount 0 - namespace upvar ::tomlish::parse bracketCount bracketCount - set bracketCount 0 + upvar ::tomlish::utils::Bstring_control_map map - set sep 0 - set r 1 - namespace upvar ::tomlish::parse token_waiting token_waiting - set token_waiting [dict create] ;#if ::tok finds a *complete* second token during a run, it will put the 2nd one here to be returned by the next call. + return [string map $map $str] + } + proc rawstring_to_MultiBstring_with_escaped_controls {str} { + #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. + #we'll use a string map with an explicit list rather than algorithmic at runtime + # - the string map is probably more performant than splitting a string, especially if it's large + upvar ::tomlish::utils::MultiBstring_totoml_map map - set state "table-space" - ::tomlish::parse::spacestack push {type space state table-space} - namespace upvar ::tomlish::parse linenum linenum;#'line number' of input data. (incremented for each literal linefeed - but not escaped ones in data) - set linenum 1 + return [string map $map $str] + } - set ::tomlish::parse::state_list [list] - try { - while {$r} { - set r [::tomlish::parse::tok $s] - #puts stdout "got tok: '$tok' while parsing string '$s' " - set next_tokenType_known 0 ;#whether we begin a new token here based on what terminated the token result of 'tok' + proc rawstring_is_valid_tomlstring {str} { + #controls are allowed in this direction dict -> toml (they get quoted) + #check any existing escapes are valid + if {[catch { + unescape_string $str + } errM]} { + return 0 + } + return 1 + } - #puts "got token: '$tok' tokenType='$tokenType'. while v($nest) = [set v($nest)]" - #puts "-->tok: $tok tokenType='$tokenType'" - set prevstate $state - set transition_info [::tomlish::parse::goNextState $tokenType $tok $state] - #review goNextState could perform more than one space_action - set space_action [dict get $transition_info space_action] - set newstate [dict get $transition_info newstate] ;#use of 'newstate' vs 'state' makes code clearer below + proc rawstring_is_valid_literal {str} { + #detect control chars other than tab + variable Literal_control_map + set testval [string map $Literal_control_map $str] + return [expr {$testval eq $str}] + } + proc rawstring_is_valid_multiliteral {str} { + #detect control chars other than tab + variable Multiliteral_control_map - if {[tcl::string::match "err-*" $state]} { - ::tomlish::log::warn "---- State error in state $prevstate for tokenType: $tokenType token value: $tok. $state aborting parse. [tomlish::parse::report_line]" - lappend v(0) [list ERROR tokentype $tokenType state $prevstate to $state leveldata [set v($nest)]] - return $v(0) - } - # --------------------------------------------------------- - #NOTE there may already be a token_waiting at this point - #set_token_waiting can raise an error here, - # in which case the space_action branch needs to be rewritten to handle the existing token_waiting - # --------------------------------------------------------- + set teststr [string map [list \r\n ok] $str] - if {$space_action eq "pop"} { - #pop_trigger_tokens: newline tablename endarray endinlinetable - #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. - set parentlevel [expr {$nest -1}] - set do_append_to_parent 1 ;#most tokens will leave this alone - but some like tentative_accum_squote need to do their own append - switch -exact -- $tokenType { - tentative_accum_squote { - #should only apply within a multiliteral - #### - set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed - #Without this - we would get extraneous empty list entries in the parent - # - as the xxx-squote-space isn't a space level from the toml perspective - # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop - #assert prevstate always trailing-squote-space - #dev guardrail - remove? assertion lib? - switch -exact -- $prevstate { - trailing-squote-space { - } - default { - error "--- unexpected popped due to tentative_accum_squote but came from state '$prevstate' should have been trailing-squote-space" - } - } - switch -- $tok { - ' { - tomlish::parse::set_token_waiting type single_squote value $tok complete 1 startindex [expr {$i -1}] - } - '' { - #review - we should perhaps return double_squote instead? - #tomlish::parse::set_token_waiting type literal value "" complete 1 - tomlish::parse::set_token_waiting type double_squote value "" complete 1 startindex [expr {$i - 2}] - } - ''' { - #### - #if already an eof in token_waiting - set_token_waiting will insert before it - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] - } - '''' { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] - #todo integrate left squote with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]'" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "'"] - } - MULTILITERAL { - #empty - lappend v($parentlevel) [list LITERALPART "'"] - } - default { - error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" - } - } - } - ''''' { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] - #todo integrate left 2 squotes with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]''" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "''"] - } - MULTILITERAL { - lappend v($parentlevel) [list LITERALPART "''"] - } - default { - error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" - } - } - } - } - } - triple_squote { - #presumably popping multiliteral-space - ::tomlish::log::debug "---- triple_squote for last_space_action pop leveldata: $v($nest)" - set merged [list] - set lasttype "" - foreach part $v($nest) { - switch -exact -- [lindex $part 0] { - MULTILITERAL { - lappend merged $part - } - LITERALPART { - if {$lasttype eq "LITERALPART"} { - set prevpart [lindex $merged end] - lset prevpart 1 [lindex $prevpart 1][lindex $part 1] - lset merged end $prevpart - } else { - lappend merged $part - } - } - NEWLINE { - #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here - #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. - lappend merged $part - } - default { - error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" - } - } - set lasttype [lindex $part 0] - } - set v($nest) $merged - } - tentative_accum_dquote { - #should only apply within a multistring - #### - set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed - #Without this - we would get extraneous empty list entries in the parent - # - as the trailing-dquote-space isn't a space level from the toml perspective - # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop - #assert prevstate always trailing-dquote-space - #dev guardrail - remove? assertion lib? - switch -exact -- $prevstate { - trailing-dquote-space { - } - default { - error "--- unexpected popped due to tentative_accum_dquote but came from state '$prevstate' should have been trailing-dquote-space" - } - } - switch -- $tok { - {"} { - tomlish::parse::set_token_waiting type single_dquote value $tok complete 1 startindex [expr {$i -1}] - } - {""} { - #review - we should perhaps return double_dquote instead? - #tomlish::parse::set_token_waiting type literal value "" complete 1 - tomlish::parse::set_token_waiting type double_dquote value "" complete 1 startindex [expr {$i - 2}] - } - {"""} { - #### - #if already an eof in token_waiting - set_token_waiting will insert before it - tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 3}] - } - {""""} { - tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 4}] - #todo integrate left dquote with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - STRINGPART { - set newval "[lindex $lastpart 1]\"" - set parentdata $v($parentlevel) - lset parentdata end [list STRINGPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE - CONT - WS { - lappend v($parentlevel) [list STRINGPART {"}] - } - MULTISTRING { - #empty - lappend v($parentlevel) [list STRINGPART {"}] - } - default { - error "--- don't know how to integrate extra trailing dquote with data $v($parentlevel)" - } - } - } - {"""""} { - tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i-5}] - #todo integrate left 2 dquotes with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - STRINGPART { - set newval "[lindex $lastpart 1]\"\"" - set parentdata $v($parentlevel) - lset parentdata end [list STRINGPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE - CONT - WS { - lappend v($parentlevel) [list STRINGPART {""}] - } - MULTISTRING { - lappend v($parentlevel) [list STRINGPART {""}] - } - default { - error "--- don't know how to integrate extra trailing 2 dquotes with data $v($parentlevel)" - } - } - } - } - } - triple_dquote { - #presumably popping multistring-space - ::tomlish::log::debug "---- triple_dquote for last_space_action pop leveldata: $v($nest)" - set merged [list] - set lasttype "" - foreach part $v($nest) { - switch -exact -- [lindex $part 0] { - MULTISTRING { - lappend merged $part - } - STRINGPART { - if {$lasttype eq "STRINGPART"} { - set prevpart [lindex $merged end] - lset prevpart 1 [lindex $prevpart 1][lindex $part 1] - lset merged end $prevpart - } else { - lappend merged $part - } - } - CONT - WS { - lappend merged $part - } - NEWLINE { - #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here - #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. - lappend merged $part - } - default { - error "---- triple_dquote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" - } - } - set lasttype [lindex $part 0] - } - set v($nest) $merged - } - equal { - #pop caused by = - switch -exact -- $prevstate { - dottedkey-space { - tomlish::log::debug "---- equal ending dottedkey-space for last_space_action pop" - #re-emit for parent space - tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] - } - dottedkey-space-tail { - #experiment? - tomlish::log::debug "---- equal ending dottedkey-space-tail for last_space_action pop" - #re-emit for parent space - tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] - } - } - } - newline { - incr linenum - lappend v($nest) [list NEWLINE $tok] - } - tablename { - #note: a tablename only 'pops' if we are greater than zero - error "---- tablename pop should already have been handled as special case zeropoppushspace in goNextState" - } - tablearrayname { - #!review - tablearrayname different to tablename regarding push/pop? - #note: a tablename only 'pops' if we are greater than zero - error "---- tablearrayname pop should already have been handled as special case zeropoppushspace in goNextState" - } - endarray { - #nothing to do here. - } - comma { - #comma for inline table will pop the keyvalue space - lappend v($nest) "SEP" - } - endinlinetable { - ::tomlish::log::debug "---- endinlinetable for last_space_action pop" - } - default { - error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" - } - } - if {$do_append_to_parent} { - #e.g tentative_accum_squote does it's own appends as necessary - so won't get here - lappend v($parentlevel) [set v($nest)] - } + set testval [string map $Multiliteral_control_map $teststr] + return [expr {$testval eq $teststr}] + } - incr nest -1 + #review - unescape what string? Bstring vs MLBstring? + #we should be specific in the function naming here + #used by dict::from_tomlish - so part of validation? - REVIEW + proc unescape_string {str} { + #note we can't just use Tcl subst because: + # it also transforms \a (audible bell) and \v (vertical tab) which are not in the toml spec. + # it would strip out backslashes inappropriately: e.g "\j" becomes just j + # it recognizes other escapes which aren't approprite e.g octal \nnn + # it replaces \ with a single whitespace (trailing backslash) + #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh + #plus \e for \x1b? - } elseif {$last_space_action eq "push"} { - set prevnest $nest - incr nest 1 - set v($nest) [list] - # push_trigger_tokens: barekey dquotedkey startinlinetable startarray tablename tablearrayname + set buffer "" + set buffer2 "" ;#buffer for 2 hex characters following a \x + set buffer4 "" ;#buffer for 4 hex characters following a \u + set buffer8 "" ;#buffer for 8 hex characters following a \u + set sLen [tcl::string::length $str] - switch -exact -- $tokenType { - tentative_trigger_squote - tentative_trigger_dquote { - #### this startok will always be tentative_accum_squote/tentative_accum_dquote starting with one accumulated squote/dquote - if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { - lassign [dict get $transition_info starttok] starttok_type starttok_val - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType $starttok_type - set tok $starttok_val - } - } - single_squote { - #JMN - REVIEW - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } - triple_squote { - ::tomlish::log::debug "---- push trigger tokenType triple_squote" - set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERALPART - } - squotedkey { - switch -exact -- $prevstate { - table-space - itable-space { - set v($nest) [list DOTTEDKEY] - } - } - #todo - check not something already waiting? - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } - triple_dquote { - set v($nest) [list MULTISTRING] ;#container for NEWLINE,STRINGPART,CONT - } - dquotedkey { - switch -exact -- $prevstate { - table-space - itable-space { - set v($nest) [list DOTTEDKEY] - } - } - #todo - check not something already waiting? - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } - barekey { - switch -exact -- $prevstate { - table-space - itable-space { - set v($nest) [list DOTTEDKEY] - } - } - #todo - check not something already waiting? - set waiting [tomlish::parse::get_token_waiting] - if {[llength $waiting]} { - set i [dict get $waiting startindex] - tomlish::parse::clear_token_waiting - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } else { - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } - } - tablename { - #note: we do not use the output of tomlish::to_dict::tablename_trim to produce a tablename for storage in the tomlish list! - #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish - # back to toml file will be identical. - #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. - # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, - # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names from - # a structural perspective. + #we need to handle arbitrarily long sequences of backslashes. \\\\\ etc + set slash_active 0 + set unicode2_active 0 + set unicode4_active 0 + set unicode8_active 0 - #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, - # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the - # tomlish list? + ::tomlish::log::debug "unescape_string. got len [string length str] str $str" - #set trimtable [::tomlish::to_dict::tablename_trim $tok] - #::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$trimtable'" - set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name - #note also that equivalent tablenames may have different toml representations even after being trimmed! - #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) - #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. - } - tablearrayname { - #set trimtable [::tomlish::to_dict::tablename_trim $tok] - #::tomlish::log::debug "---- trimmed (but not normalized) tablearrayname: '$trimtable'" - set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name - } - startarray { - set v($nest) [list ARRAY] ;#$tok is just the opening bracket - don't output. - } - startinlinetable { - set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. - } - default { - error "---- push trigger tokenType '$tokenType' not yet implemented" - } - } + #!todo - check for invalid data in the form of a raw carriage return (decimal 13) without following linefeed? + set i 0 + for {} {$i < $sLen} {} { + if {$i > 0} { + set lastChar [tcl::string::index $str [expr {$i - 1}]] + } else { + set lastChar "" + } + + set c [tcl::string::index $str $i] + #::tomlish::log::debug "unescape_string. got char $c" ;#too much? + + ##---------------------- + ##as we are 'unescaping' - should we really be testing here for existing values that should have been escaped? + ##The answer is probably no - keep this function to a single purpose - test elsewhere for raw controls. + ##this test looks incomplete anyway REVIEW + #scan $c %c n + #if {($n <= 31) && ($n != 9) && ($n != 10) && ($n != 13)} { + # #we don't expect unescaped unicode characters from 0000 to 001F - + # #*except* for raw tab (which is whitespace) and newlines + # error "unescape_string. Invalid data for a toml string. Unescaped control character (decimal $n) [::tomlish::utils::string_to_slashu $c]" + #} + ##---------------------- + incr i ;#must incr here because we do'returns'inside the loop + if {$c eq "\\"} { + if {$slash_active} { + append buffer "\\" + set slash_active 0 + } elseif {$unicode2_active} { + error "unescape_string. unexpected case slash during unicode2 not yet handled" + } elseif {$unicode4_active} { + error "unescape_string. unexpected case slash during unicode4 not yet handled" + } elseif {$unicode8_active} { + error "unescape_string. unexpected case slash during unicode8 not yet handled" } else { - #no space level change - switch -exact -- $tokenType { - squotedkey { - #puts "---- squotedkey in state $prevstate (no space level change)" - lappend v($nest) [list SQKEY $tok] + # don't output anything (yet) + set slash_active 1 + } + } else { + if {$unicode2_active} { + if {[tcl::string::length $buffer2] < 2} { + append buffer2 $c + } + if {[tcl::string::length $buffer2] == 2} { + #we have a \xHH to test + set unicode2_active 0 + set result [tomlish::utils::hex_escape_info "\\x$buffer2"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" } - dquotedkey { - #puts "---- dquotedkey in state $prevstate (no space level change)" - lappend v($nest) [list DQKEY $tok] + } + } elseif {$unicode4_active} { + if {[tcl::string::length $buffer4] < 4} { + append buffer4 $c + } + if {[tcl::string::length $buffer4] == 4} { + #we have a \uHHHH to test + set unicode4_active 0 + set result [tomlish::utils::unicode_escape_info "\\u$buffer4"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" } - barekey { - lappend v($nest) [list KEY $tok] + } + } elseif {$unicode8_active} { + if {[tcl::string::length $buffer8] < 8} { + append buffer8 $c + } + if {[tcl::string::length $buffer8] == 8} { + #we have a \UHHHHHHHH to test + set unicode8_active 0 + set result [tomlish::utils::unicode_escape_info "\\U$buffer8"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" } - dotsep { - lappend v($nest) [list DOTSEP] + } + } elseif {$slash_active} { + set slash_active 0 + set ctest [tcl::string::map {{"} dq} $c] + switch -exact -- $ctest { + dq { + append buffer {"} } - starttablename { - #$tok is triggered by the opening bracket and sends nothing to output + b - t - n - f - r { + append buffer [subst -nocommand -novariable "\\$c"] } - starttablearrayname { - #$tok is triggered by the double opening brackets and sends nothing to output + e { + append buffer \x1b } - tablename - tablenamearray { - error "---- did not expect 'tablename/tablearrayname' without space level change (no space level change)" - #set v($nest) [list TABLE $tok] + x { + #introduced in 1.1.0 \xHH + set unicode2_active 1 + set buffer2 "" } - endtablename - endtablearrayname { - #no output into the tomlish list for this token + u { + set unicode4_active 1 + set buffer4 "" } - startinlinetable { - puts stderr "---- decode::toml error. did not expect startinlinetable without space level change (no space level change)" + U { + set unicode8_active 1 + set buffer8 "" } - single_dquote { - switch -exact -- $newstate { - string-state { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "string" - set tok "" - } - dquoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "dquotedkey" - set tok "" - } - multistring-space { - lappend v($nest) [list STRINGPART {"}] - #may need to be joined on pop if there are neighbouring STRINGPARTS - } - default { - error "---- single_dquote switch case not implemented for nextstate: $newstate (no space level change)" - } - } + default { + set slash_active 0 + #review - toml spec says all other escapes are reserved + #and if they are used TOML should produce an error. + #append buffer "\\$c" + set msg "Invalid escape sequence \\ followed by '$c'" + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} $msg } - double_dquote { - #leading extra quotes - test: toml_multistring_startquote2 - switch -exact -- $prevstate { - itable-keyval-value-expected - keyval-value-expected { - puts stderr "tomlish::decode::toml double_dquote TEST" - #empty string - lappend v($nest) [list STRINGPART ""] - } - multistring-space { - #multistring-space to multistring-space - lappend v($nest) [list STRINGPART {""}] - } - default { - error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" - } - } + } + } else { + append buffer $c + } + } + } + #puts stdout "EOF 4:$unicode4_active 8:$unicode8_active slash:$slash_active" + if {$unicode2_active} { + error "End of string reached before complete hex escape sequence \xHH" + } + if {$unicode4_active} { + error "End of string reached before complete unicode escape sequence \uHHHH" + } + if {$unicode8_active} { + error "End of string reached before complete unicode escape sequence \UHHHHHHHH" + } + if {$slash_active} { + append buffer "\\" + } + try { + encoding convertto utf-8 $buffer + } trap {} {emsg eopts} { + return -code error -errorcode {TOMLISH SYNTAX ENCODINGERROR} $emsg + } + return $buffer + } - } - single_squote { - switch -exact -- $newstate { - literal-state { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "literal" - set tok "" - } - squoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } - multiliteral-space { - #false alarm squote returned from tentative_accum_squote pop - ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" - #(single squote - not terminating space) - lappend v($nest) [list LITERALPART '] - #may need to be joined on pop if there are neighbouring LITERALPARTs - } - default { - error "---- single_squote switch case not implemented for nextstate: $newstate (no space level change)" - } - } - } - double_squote { - switch -exact -- $prevstate { - keyval-value-expected { - lappend v($nest) [list LITERAL ""] - } - multiliteral-space { - #multiliteral-space to multiliteral-space - lappend v($nest) [list LITERALPART ''] - } - default { - error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" - } - } - } - enddquote { - #nothing to do? - set tok "" - } - endsquote { - set tok "" - } - string { - lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes - } - literal { - lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes - } - multistring { - #review - lappend v($nest) [list MULTISTRING $tok] - } - stringpart { - lappend v($nest) [list STRINGPART $tok] ;#will not get wrapped in dquotes directly - } - multiliteral { - lappend v($nest) [LIST MULTILITERAL $tok] - } - literalpart { - lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly - } - untyped_value { - #would be better termed unclassified_value - #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. - unset -nocomplain tag - if {$tok in {true false}} { - set tag BOOL - } else { - if {[::tomlish::utils::is_int $tok]} { - set tag INT - } else { - if {[string is integer -strict $tok]} { - #didn't qualify as a toml int - but still an int - #probably means is_int is limiting size and not accepting bigints (configurable?) - #or it didn't qualify due to more than 1 leading zero - #or other integer format issue such as repeated underscores - error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. (looks close to being an int. Formatting or range issue?) [tomlish::parse::report_line] (no space level change)" - } else { - if {[::tomlish::utils::is_float $tok]} { - set tag FLOAT - } elseif {[::tomlish::utils::is_datetime $tok] || [::tomlish::utils::is_timepart $tok]} { - #Note we must allow lone timepart here (not just is_localtime which doesn't allow tz offsets) in case it followed a localdate - #e.g x= 2025-01-01 02:34Z - #The to_dict validation will catch an invalid standaline timepart, or combine with leading date if applicable. - set tag DATETIME - } else { - error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. [tomlish::parse::report_line] (no space level change)" - } - } - } - } - #assert either tag is set, or we errored out. - lappend v($nest) [list $tag $tok] + #This does not have to do with unicode normal forms - which it seems toml has decided against regarding use in keys (review/references?) + #This is meant for internal use regarding ensuring we match equivalent keys which may have just been specified with different string mechanisms, + #e.g squoted vs dquoted vs barekey. + proc normalize_key {rawkey} { + set c1 [tcl::string::index $rawkey 0] + set c2 [tcl::string::index $rawkey end] + if {($c1 eq "'") && ($c2 eq "'")} { + #single quoted segment. No escapes allowed within it. + set key [tcl::string::range $rawkey 1 end-1] + } elseif {($c1 eq "\"") && ($c2 eq "\"")} { + #double quoted segment. Unapply escapes. + # + set keydata [tcl::string::range $rawkey 1 end-1] ;#strip outer quotes only + #e.g key could have mix of \UXXXXXXXX escapes and unicode chars + #or mix of \t and literal tabs. + #unescape to convert all to literal versions for comparison + set key [::tomlish::utils::unescape_string $keydata] + #set key [subst -nocommands -novariables $keydata] ;#wrong. Todo - create a string escape substitution function. + } else { + set key $rawkey + } + return $key + } - } - comment { - #puts stdout "----- comment token returned '$tok'------" - lappend v($nest) [list COMMENT "$tok"] - } - equal { - #we append '=' to the nest so that any surrounding whitespace is retained. - lappend v($nest) = - } - comma { - lappend v($nest) SEP - } - newline { - incr linenum - lappend v($nest) [list NEWLINE $tok] - } - whitespace { - lappend v($nest) [list WS $tok] - } - continuation { - lappend v($nest) CONT - } - bom { - lappend v($nest) BOM - } - eof { - #ok - nothing more to add to the tomlish list. - #!todo - check previous tokens are complete/valid? - } - default { - error "--- unknown tokenType '$tokenType' during state $prevstate [::tomlish::parse::report_line] (no space level change)" - } - } - } + proc string_to_slashu {string} { + set rv {} + foreach c [split $string {}] { + scan $c %c cdec + if {$cdec > 65535} { + append rv {\U} [format %.8X $cdec] + } else { + append rv {\u} [format %.4X $cdec] + } + } + return $rv + } - if {!$next_tokenType_known} { - ::tomlish::log::notice "---- tomlish::decode::toml - current tokenType:$tokenType Next token type not known" - ::tomlish::parse::set_tokenType "" - set tok "" - } + #'nonprintable' is conservative here because some systems (e.g windows console) are very limited in what they can display. + #This is used for display purposes only (error msgs) + proc nonprintable_to_slashu {s} { + set res "" + foreach i [split $s ""] { + scan $i %c cdec - if {$state eq "end-state"} { - break + set printable 0 + if {($cdec>31) && ($cdec<127)} { + set printable 1 + } + if {$printable} { + append res $i + } else { + if {$cdec > 65535} { + append res \\U[format %.8X $cdec] + } else { + append res \\u[format %.4X $cdec] } + } + } + set res + } ;# initial version from tcl wiki RS - + proc rawstring_to_jsonstring {s} { + #like nonprintable_to_slashu + # - also escape every dquote + # - escape newlines + set res "" + foreach i [split $s ""] { + scan $i %c cdec + switch -- $cdec { + 34 { + #double quote + append res \\\" + } + 13 { + #carriage return + append res \\r + } + 8 { + append res \\b + } + 9 { + append res \\t + } + 10 { + #linefeed + append res \\n + } + 92 { + append res \\\\ + } + default { + set printable 0 + if {($cdec>31) && ($cdec<127)} { + set printable 1 + } + if {$printable} { + append res $i + } else { + if {$cdec > 65535} { + #append res $i + #append res \\U[format %.8X $cdec] ;#wrong + #append res "\\U{[format %.8x $cdec]}" ;#some variation of json? + package require punk::cesu + #e.g \U0001f610 emoticon face + #surrogate pair: \uD83D\uDE10 + set surrogatepair [punk::cesu::to_surrogatestring -format escape $i] + append res $surrogatepair + } else { + append res \\u[format %.4X $cdec] + } + } + } } + } + set res - #while {$nest > 0} { - # lappend v([expr {$nest -1}]) [set v($nest)] - # incr nest -1 - #} - while {[::tomlish::parse::spacestack size] > 1} { - ::tomlish::parse::spacestack pop - lappend v([expr {$nest -1}]) [set v($nest)] - incr nest -1 + } - #set parent [spacestack peek] ;#the level being appended to - #lassign $parent type state - #if {$type eq "space"} { - # - #} elseif {$type eq "buffer"} { - # lappend v([expr {$nest -1}]) {*}[set v($nest)] - #} else { - # error "invalid spacestack item: $parent" - #} + #check if str is valid for use as a toml bare key + #Early toml versions only allowed letters + underscore + dash + proc is_basic_barekey {str} { + if {[tcl::string::length $str] == 0} { + return 0 + } else { + set matches [regexp -all {[a-zA-Z0-9\_\-]} $str] + if {[tcl::string::length $str] == $matches} { + #all characters match the regexp + return 1 + } else { + return 0 } - - } finally { - set is_parsing 0 } - return $v(0) } - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::decode ---}] -} -#decode toml to tomlish -interp alias {} tomlish::from_toml {} tomlish::decode::toml - -namespace eval tomlish::utils { - #*** !doctools - #[subsection {Namespace tomlish::utils}] - #[para] - #[list_begin definitions] - - - - #basic generic quote matching for single and double quotes - #note for example that {[o'malley]} will return sq - as the single quote is not closed or wrapped in double quotes - proc tok_in_quotedpart {tok} { - set sLen [tcl::string::length $tok] - set quote_type "" - set had_slash 0 - for {set i 0} {$i < $sLen} {incr i} { - set c [tcl::string::index $tok $i] - if {$quote_type eq ""} { - if {$had_slash} { - #don't enter quote mode - #leave slash_mode because even if current char is slash - it is escaped - set had_slash 0 - } else { - set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] - switch -- $ctype { - dq { - set quote_type dq - } - sq { - set quote_type sq - } - bsl { - set had_slash 1 - } - } - } - } else { - if {$had_slash} { - #don't leave quoted mode - #leave slash_mode because even if current char is slash - it is escaped - set had_slash 0 - } else { - set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] - switch -- $ctype { - dq { - if {$quote_type eq "dq"} { - set quote_type "" - } - } - sq { - if {$quote_type eq "sq"} { - set quote_type "" - } - } - bsl { - set had_slash 1 - } - } - } - } - } - return $quote_type ;#dq | sq + #from toml.abnf in github.com/toml-lang/toml + #unquoted-key = 1*unquoted-key-char + #unquoted-key-char = ALPHA / DIGIT / %x2D / %x5F ; a-z A-Z 0-9 - _ + #unquoted-key-char =/ %xB2 / %xB3 / %xB9 / %xBC-BE ; superscript digits, fractions + #unquoted-key-char =/ %xC0-D6 / %xD8-F6 / %xF8-37D ; non-symbol chars in Latin block + #unquoted-key-char =/ %x37F-1FFF ; exclude GREEK QUESTION MARK, which is basically a semi-colon + #unquoted-key-char =/ %x200C-200D / %x203F-2040 ; from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ + #unquoted-key-char =/ %x2070-218F / %x2460-24FF ; include super-/subscripts, letterlike/numberlike forms, enclosed alphanumerics + #unquoted-key-char =/ %x2C00-2FEF / %x3001-D7FF ; skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces + #unquoted-key-char =/ %x2070-21FF / %x2300-24FF ; skip math operators + #unquoted-key-char =/ %x25A0-268B / %x2690-2757 ; skip box drawing, block elements, and some yin-yang symbols + #unquoted-key-char =/ %x2762-2767 / %x2776-27E5 ; skip some Dingbat punctuation + #unquoted-key-char =/ %x2801-297F ; skip some math brackets and arrows, and braille blank + #unquoted-key-char =/ %x2B00-2FFF / %x3001-D7FF ; skip various math operators and symbols, and ideographic space + #unquoted-key-char =/ %xF900-FDCF / %xFDF0-FFFD ; skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) + #unquoted-key-char =/ %x10000-EFFFF ; all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) + variable re_barekey + set ranges [list] + lappend ranges {a-zA-Z0-9\_\-} + lappend ranges {\u00B2} {\u00B3} {\u00B9} {\u00BC-\u00BE} ;# superscript digits, fractions + lappend ranges {\u00C0-\u00D6} {\u00D8-\u00F6} {\u00F8-\u037D} ;# non-symbol chars in Latin block + lappend ranges {\u037f-\u1FFF} ;# exclude GREEK QUESTION MARK, which is basically a semi-colon + lappend ranges {\u200C-\u200D} {\u203F-\u2040} ;# from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ + lappend ranges {\u2070-\u218f} {\u2460-\u24FF} ;# include super-subscripts, letterlike/numberlike forms, enclosed alphanumerics + lappend ranges {\u2C00-\u2FEF} {\u3001-\uD7FF} ;# skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces + lappend ranges {\u2070-\u21FF} {\u2300-\u24FF} ;# skip math operators + lappend ranges {\u25A0-\u268B} {\u2690-\u2757} ;# skip box drawing, block elements, and some yin-yang symbols + lappend ranges {\u2762-\u2767} {\u2776-\u27E5} ;# skip some Dingbat punctuation + lappend ranges {\u2801-\u297F} ;# skip some math brackets and arrows, and braille blank + lappend ranges {\u2B00-\u2FFF} {\u3001-\uD7FF} ;# skip various math operators and symbols, and ideographic space + lappend ranges {\uF900-\uFDCF} {\uFDF0-\uFFFD} ;# skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) + lappend ranges {\U10000-\UEFFFF} ;# all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) + set re_barekey {^[} + foreach r $ranges { + append re_barekey $r } + append re_barekey {]+$} - - proc unicode_escape_info {slashu} { - #!todo - # validate that slashu is either a \uxxxx or \Uxxxxxxxx value of the correct length and - # is a valid 'unicode scalar value' (any Unicode code point except high-surrogate and low-surrogate code points) - # ie integers in the range 0 to D7FF16 and E00016 to 10FFFF16 inclusive - #expr {(($x >= 0) && ($x <= 0xD7FF16)) || (($x >= 0xE00016) && ($x <= 0x10FFFF16))} - if {[tcl::string::match {\\u*} $slashu]} { - set exp {^\\u([0-9a-fA-F]{4}$)} - if {[regexp $exp $slashu match hex]} { - if {[scan $hex %4x dec] != 1} { - #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? - return [list err [list reason "Failed to convert '$hex' to decimal"]] - } else { - return [list ok [list char [subst -nocommand -novariable $slashu]]] - } - } else { - return [list err [list reason "Supplied string not of the form \\uHHHH where H in \[0-9a-fA-F\]"]] - } - } elseif {[tcl::string::match {\\U*} $slashu]} { - set exp {^\\U([0-9a-fA-F]{8}$)} - if {[regexp $exp $slashu match hex]} { - if {[scan $hex %8x dec] != 1} { - #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? - return [list err [list reason "Failed to convert '$hex' to decimal"]] - } else { - if {(($dec >= 0) && ($dec <= 0xD7FF16)) || (($dec >= 0xE00016) && ($dec <= 0x10FFFF16))} { - return [list ok [list char [subst -nocommand -novariable $slashu]]] - } else { - return [list err [list reason "$slashu is not within the 'unicode scalar value' ranges 0 to 0xD7FF16 or 0xE00016 to 0x10FFFF16"]] - } - } - } else { - return [list err [list reason "Supplied string not of the form \\UHHHHHHHH where H in \[0-9a-fA-F\]"]] - } - } else { - return [list err [list reason "Supplied string did not start with \\u or \\U" ]] - } - + proc is_barekey {str} { + if {[tcl::string::length $str] == 0} { + return 0 + } + variable re_barekey + return [regexp $re_barekey $str] } - #Note that unicode characters don't *have* to be escaped. - #So if we provide a function named 'escape_string', the name implies the inverse of unescape_string which unescapes unicode \u \U values. - #- an inverse of unescape_string would encode all unicode chars unnecessarily. - #- as toml accepts a compact escape sequence for common chars such as tab,backspace,linefeed etc but also allows the full form \u009 etc - #- escape_string and unescape_string would not be reliably roundtrippable inverses anyway. - #REVIEW - provide it anyway? When would it be desirable to use? - - variable Bstring_control_map [dict create] - dict set Bstring_control_map \b {\b} - dict set Bstring_control_map \n {\n} - dict set Bstring_control_map \r {\r} - dict set Bstring_control_map \" {\"} - dict set Bstring_control_map \x1b {\e} ;#In spec it's included in the list of 'must be escaped', as well as the 'convenience' escapes - so we make it go both ways. - dict set Bstring_control_map \\ "\\\\" - - #\e for \x1b seems like it might be included - v1.1?? hard to find current state of where toml is going :/ - #for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped. - #8 = \b - already in list. - #built the remainder whilst checking for entries already hardcoded above -in case more are added to the hardcoded list - for {set cdec 0} {$cdec <= 7} {incr cdec} { - set hhhh [format %.4X $cdec] - set char [format %c $cdec] - if {![dict exists $Bstring_control_map $char]} { - dict set Bstring_control_map $char \\u$hhhh + #test only that the characters in str are valid for the toml specified type 'integer'. + proc int_validchars1 {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[0-9\_\-\+]} $str] == $numchars} { + return 1 + } else { + return 0 } } - for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { - set hhhh [format %.4X $cdec] - set char [format %c $cdec] - if {![dict exists $Bstring_control_map $char]} { - dict set Bstring_control_map $char \\u$hhhh + #add support for hex,octal,binary 0x.. 0o.. 0b... + proc int_validchars {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] == $numchars} { + return 1 + } else { + return 0 } } - # \u007F = 127 - dict set Bstring_control_map [format %c 127] \\u007F - - #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! - #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) - #for example - can be used by from_dict to produce valid Bstring data for a tomlish record - proc rawstring_to_Bstring_with_escaped_controls {str} { - #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. - #we'll use a string map with an explicit list rather than algorithmic at runtime - # - the string map is probably more performant than splitting a string, especially if it's large - variable Bstring_control_map - return [string map $Bstring_control_map $str] - } - - #review - unescape what string? Bstring vs MLBstring? - #we should be specific in the function naming here - #used by to_dict - so part of validation? - REVIEW - proc unescape_string {str} { - #note we can't just use Tcl subst because: - # it also transforms \a (audible bell) and \v (vertical tab) which are not in the toml spec. - # it would strip out backslashes inappropriately: e.g "\j" becomes just j - # it recognizes other escapes which aren't approprite e.g \xhh and octal \nnn - # it replaces \ with a single whitespace (trailing backslash) - #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh - #plus \e for \x1b? - - set buffer "" - set buffer4 "" ;#buffer for 4 hex characters following a \u - set buffer8 "" ;#buffer for 8 hex characters following a \u - set sLen [tcl::string::length $str] + proc is_int {str} { + set matches [regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] ;#0b101 etc covered by a-f - #we need to handle arbitrarily long sequences of backslashes. \\\\\ etc - set slash_active 0 - set unicode4_active 0 - set unicode8_active 0 + if {[tcl::string::length $str] == $matches} { + #all characters in legal range - ::tomlish::log::debug "unescape_string. got len [string length str] str $str" + # --------------------------------------- + #check for leading zeroes in non 0x 0b 0o + #first strip any +, - or _ (just for this test) + #(but still allowing 0 -0 +0) + set check [tcl::string::map {+ "" - "" _ ""} $str] + if {([tcl::string::length $check] > 1) && ([tcl::string::index $check 0] eq "0") && ([tcl::string::index $check 1] ni {o x b})} { + return 0 + } + # --------------------------------------- - #!todo - check for invalid data in the form of a raw carriage return (decimal 13) without following linefeed? - set i 0 - for {} {$i < $sLen} {} { - if {$i > 0} { - set lastChar [tcl::string::index $str [expr {$i - 1}]] - } else { - set lastChar "" + #check +,- only occur in the first position. (excludes also +++1 etc) + if {[tcl::string::last - $str] > 0} { + return 0 + } + if {[tcl::string::last + $str] > 0} { + return 0 } - set c [tcl::string::index $str $i] - #::tomlish::log::debug "unescape_string. got char $c" ;#too much? + #------------------------------------------- + #unclear if a 'digit' includes the type specifiers x b o + #we assume the 0x 0b 0o are NOT counted as digits - as underscores here would seem + #to be likely to cause interop issues with other systems + #(e.g tcl allows 0b1_1 but not 0b_11) + #Most of this structure would be unnecessary if we could rely on string::is::integer understanding underscores (9+?) + #we still need to support earlier Tcl for now though. - #---------------------- - #as we are 'unescaping' - should we really be testing for existing values that should have been escaped? - #this test looks incomplete anyway REVIEW - scan $c %c n - if {($n <= 31) && ($n != 9) && ($n != 10) && ($n != 13)} { - #we don't expect unescaped unicode characters from 0000 to 001F - - #*except* for raw tab (which is whitespace) and newlines - error "unescape_string. Invalid data for a toml string. Unescaped control character (decimal $n) [::tomlish::utils::string_to_slashu $c]" - } - #---------------------- - - incr i ;#must incr here because we do'returns'inside the loop - if {$c eq "\\"} { - if {$slash_active} { - append buffer "\\" - set slash_active 0 - } elseif {$unicode4_active} { - error "unescape_string. unexpected case slash during unicode4 not yet handled" - } elseif {$unicode8_active} { - error "unescape_string. unexpected case slash during unicode8 not yet handled" - } else { - # don't output anything (yet) - set slash_active 1 - } - } else { - if {$unicode4_active} { - if {[tcl::string::length $buffer4] < 4} { - append buffer4 $c - } - if {[tcl::string::length $buffer4] == 4} { - #we have a \uHHHH to test - set unicode4_active 0 - set result [tomlish::utils::unicode_escape_info "\\u$buffer4"] - if {[lindex $result 0] eq "ok"} { - append buffer [dict get $result ok char] - } else { - error "unescape_string error: [lindex $result 1]" - } - } - } elseif {$unicode8_active} { - if {[tcl::string::length $buffer8] < 8} { - append buffer8 $c - } - if {[tcl::string::length $buffer8] == 8} { - #we have a \UHHHHHHHH to test - set unicode8_active 0 - set result [tomlish::utils::unicode_escape_info "\\U$buffer8"] - if {[lindex $result 0] eq "ok"} { - append buffer [dict get $result ok char] - } else { - error "unescape_string error: [lindex $result 1]" - } - } - } elseif {$slash_active} { - set slash_active 0 - set ctest [tcl::string::map {{"} dq} $c] - switch -exact -- $ctest { - dq { - append buffer {"} - } - b - t - n - f - r { - append buffer [subst -nocommand -novariable "\\$c"] - } - e { - append buffer \x1b - } - u { - set unicode4_active 1 - set buffer4 "" - } - U { - set unicode8_active 1 - set buffer8 "" - } - default { - set slash_active 0 - #review - toml spec says all other escapes are reserved - #and if they are used TOML should produce an error. - #we leave detecting this for caller for now - REVIEW - append buffer "\\$c" - } - } - } else { - append buffer $c - } - } - } - #puts stdout "EOF 4:$unicode4_active 8:$unicode8_active slash:$slash_active" - if {$unicode4_active} { - error "End of string reached before complete unicode escape sequence \uHHHH" - } - if {$unicode8_active} { - error "End of string reached before complete unicode escape sequence \UHHHHHHHH" - } - if {$slash_active} { - append buffer "\\" - } - return $buffer - } - - #This does not have to do with unicode normal forms - which it seems toml has decided against regarding use in keys (review/references?) - #This is meant for internal use regarding ensuring we match equivalent keys which may have just been specified with different string mechanisms, - #e.g squoted vs dquoted vs barekey. - proc normalize_key {rawkey} { - set c1 [tcl::string::index $rawkey 0] - set c2 [tcl::string::index $rawkey end] - if {($c1 eq "'") && ($c2 eq "'")} { - #single quoted segment. No escapes allowed within it. - set key [tcl::string::range $rawkey 1 end-1] - } elseif {($c1 eq "\"") && ($c2 eq "\"")} { - #double quoted segment. Apply escapes. - # - set keydata [tcl::string::range $rawkey 1 end-1] ;#strip outer quotes only - #e.g key could have mix of \UXXXXXXXX escapes and unicode chars - #or mix of \t and literal tabs. - #unescape to convert all to literal versions for comparison - set key [::tomlish::utils::unescape_string $keydata] - #set key [subst -nocommands -novariables $keydata] ;#wrong. Todo - create a string escape substitution function. - } else { - set key $rawkey - } - return $key - } - - proc string_to_slashu {string} { - set rv {} - foreach c [split $string {}] { - scan $c %c cdec - if {$cdec > 65535} { - append rv {\U} [format %.8X $cdec] - } else { - append rv {\u} [format %.4X $cdec] - } - } - return $rv - } - - #'nonprintable' is conservative here because some systems (e.g windows console) are very limited in what they can display. - #This is used for display purposes only (error msgs) - proc nonprintable_to_slashu {s} { - set res "" - foreach i [split $s ""] { - scan $i %c cdec - - set printable 0 - if {($cdec>31) && ($cdec<127)} { - set printable 1 - } - if {$printable} { - append res $i - } else { - if {$cdec > 65535} { - append res \\U[format %.8X $cdec] - } else { - append res \\u[format %.4X $cdec] - } - } - } - set res - } ;# initial version from tcl wiki RS - - #check if str is valid for use as a toml bare key - #Early toml versions? only allowed letters + underscore + dash - proc is_barekey1 {str} { - if {[tcl::string::length $str] == 0} { - return 0 - } else { - set matches [regexp -all {[a-zA-Z0-9\_\-]} $str] - if {[tcl::string::length $str] == $matches} { - #all characters match the regexp - return 1 - } else { - return 0 - } - } - } - - #from toml.abnf in github.com/toml-lang/toml - #unquoted-key = 1*unquoted-key-char - #unquoted-key-char = ALPHA / DIGIT / %x2D / %x5F ; a-z A-Z 0-9 - _ - #unquoted-key-char =/ %xB2 / %xB3 / %xB9 / %xBC-BE ; superscript digits, fractions - #unquoted-key-char =/ %xC0-D6 / %xD8-F6 / %xF8-37D ; non-symbol chars in Latin block - #unquoted-key-char =/ %x37F-1FFF ; exclude GREEK QUESTION MARK, which is basically a semi-colon - #unquoted-key-char =/ %x200C-200D / %x203F-2040 ; from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ - #unquoted-key-char =/ %x2070-218F / %x2460-24FF ; include super-/subscripts, letterlike/numberlike forms, enclosed alphanumerics - #unquoted-key-char =/ %x2C00-2FEF / %x3001-D7FF ; skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces - #unquoted-key-char =/ %x2070-21FF / %x2300-24FF ; skip math operators - #unquoted-key-char =/ %x25A0-268B / %x2690-2757 ; skip box drawing, block elements, and some yin-yang symbols - #unquoted-key-char =/ %x2762-2767 / %x2776-27E5 ; skip some Dingbat punctuation - #unquoted-key-char =/ %x2801-297F ; skip some math brackets and arrows, and braille blank - #unquoted-key-char =/ %x2B00-2FFF / %x3001-D7FF ; skip various math operators and symbols, and ideographic space - #unquoted-key-char =/ %xF900-FDCF / %xFDF0-FFFD ; skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) - #unquoted-key-char =/ %x10000-EFFFF ; all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) - variable re_barekey - set ranges [list] - lappend ranges {a-zA-Z0-9\_\-} - lappend ranges {\u00B2} {\u00B3} {\u00B9} {\u00BC-\u00BE} ;# superscript digits, fractions - lappend ranges {\u00C0-\u00D6} {\u00D8-\u00F6} {\u00F8-\u037D} ;# non-symbol chars in Latin block - lappend ranges {\u037f-\u1FFF} ;# exclude GREEK QUESTION MARK, which is basically a semi-colon - lappend ranges {\u200C-\u200D} {\u203F-\u2040} ;# from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ - lappend ranges {\u2070-\u218f} {\u2460-\u24FF} ;# include super-subscripts, letterlike/numberlike forms, enclosed alphanumerics - lappend ranges {\u2C00-\u2FEF} {\u3001-\uD7FF} ;# skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces - lappend ranges {\u2070-\u21FF} {\u2300-\u24FF} ;# skip math operators - lappend ranges {\u25A0-\u268B} {\u2690-\u2757} ;# skip box drawing, block elements, and some yin-yang symbols - lappend ranges {\u2762-\u2767} {\u2776-\u27E5} ;# skip some Dingbat punctuation - lappend ranges {\u2801-\u297F} ;# skip some math brackets and arrows, and braille blank - lappend ranges {\u2B00-\u2FFF} {\u3001-\uD7FF} ;# skip various math operators and symbols, and ideographic space - lappend ranges {\uF900-\uFDCF} {\uFDF0-\uFFFD} ;# skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) - lappend ranges {\U10000-\UEFFFF} ;# all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) - set re_barekey {^[} - foreach r $ranges { - append re_barekey $r - } - append re_barekey {]+$} - - proc is_barekey {str} { - if {[tcl::string::length $str] == 0} { - return 0 - } - variable re_barekey - return [regexp $re_barekey $str] - } - - #test only that the characters in str are valid for the toml specified type 'integer'. - proc int_validchars1 {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[0-9\_\-\+]} $str] == $numchars} { - return 1 - } else { - return 0 - } - } - #add support for hex,octal,binary 0x.. 0o.. 0b... - proc int_validchars {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] == $numchars} { - return 1 - } else { - return 0 - } - } - - proc is_int {str} { - set matches [regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] ;#0b101 etc covered by a-f - - if {[tcl::string::length $str] == $matches} { - #all characters in legal range - - # --------------------------------------- - #check for leading zeroes in non 0x 0b 0o - #first strip any +, - or _ (just for this test) - #(but still allowing 0 -0 +0) - set check [tcl::string::map {+ "" - "" _ ""} $str] - if {([tcl::string::length $check] > 1) && ([tcl::string::index $check 0] eq "0") && ([tcl::string::index $check 1] ni {o x b})} { - return 0 - } - # --------------------------------------- - - #check +,- only occur in the first position. (excludes also +++1 etc) - if {[tcl::string::last - $str] > 0} { - return 0 - } - if {[tcl::string::last + $str] > 0} { - return 0 - } - - #------------------------------------------- - #unclear if a 'digit' includes the type specifiers x b o - #we assume the 0x 0b 0o are NOT counted as digits - as underscores here would seem - #to be likely to cause interop issues with other systems - #(e.g tcl allows 0b1_1 but not 0b_11) - #Most of this structure would be unnecessary if we could rely on string::is::integer understanding underscores (9+?) - #we still need to support earlier Tcl for now though. - - #first rule out any case with more than one underscore in a row - if {[regexp {__} $str]} { - return 0 + #first rule out any case with more than one underscore in a row + if {[regexp {__} $str]} { + return 0 } if {[string index $str 0] eq "_"} { return 0 @@ -3385,6 +2668,12 @@ namespace eval tomlish::utils { } if {[string range $utest 0 1] in {0x 0b 0o}} { set testnum [string range $utest 2 end] + #spec says *non-negative* integers may *also* be expressed in hex, octal or binary + #and also explicitly states + not allowed + #presumed to mean negative not allowed. + if {[string index $str 0] in {- +}} { + return 0 + } } else { set testnum $utest #exclude also things like 0_x 0___b that snuck past our prefix test @@ -3478,9 +2767,8 @@ namespace eval tomlish::utils { set check [tcl::string::map {+ "" - "" _ ""} $str] set r {([0-9])*} regexp $r $check intpart ;#intpart holds all numerals before the first .,e or E - set z {([0])*} - regexp $z $intpart leadingzeros - if {[tcl::string::length $leadingzeros] > 1} { + #leading zero only if exactly one zero + if {$intpart ne "0" && [string match 0* $intpart]} { return 0 } @@ -3578,30 +2866,134 @@ namespace eval tomlish::utils { proc is_localdate {str} { is_datepart $str } - proc is_timepart {str} { - set numchars [tcl::string::length $str] - #timepart can have negative or positive offsets so - and + must be accepted - if {[regexp -all {[zZt0-9\-\+\.:]} $str] == $numchars} { - #todo - #basic check that we have leading 2dig hr and 2dig min separated by colon - if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}[^0-9]{1}.*$} $str]} { - #nn:nn or nn:nnX.* where X is non digit + + #allow only hh:mm:ss or hh:mm (no subseconds) + proc _is_hms_or_hm_time {val} { + set numchars [tcl::string::length $val] + if {[regexp -all {[0-9:]} $val] != $numchars} { + return 0 + } + #assert now digits and colons only + set hms_cparts [split $val :] + #2 or 3 parts only are valid - check contents of each part + if {[llength $hms_cparts] == 2} { + lassign $hms_cparts hr min + if {[string length $hr] != 2 || [string length $min] != 2} { return 0 } - return 1 - } else { - return 0 - } - } + if {$hr > 23 || $min > 59} { + return 0 + } + } elseif {[llength $hms_cparts] == 3} { + lassign $hms_cparts hr min sec + if {[string length $hr] != 2 || [string length $min] != 2 || [string length $sec] !=2} { + return 0 + } + #possible for sec to be 60 - leap second RFC 3339 + if {$hr > 23 || $min > 59 || $sec > 60} { + return 0 + } + } else { + return 0 + } + return 1 + } + proc is_timepart {str} { + #validate the part after the T (or space) + #we receive only that trailing part here. + + #odt1 = 1979-05-27T07:32:00Z + #odt2 = 1979-05-27T00:32:00-07:00 + #odt3 = 1979-05-27T00:32:00.5-07:00 + #odt4 = 1979-05-27T00:32:00.999999-07:00 + + set numchars [tcl::string::length $str] + #timepart can have negative or positive offsets so - and + must be accepted + if {[regexp -all {[zZt0-9\-\+\.:]} $str] == $numchars} { + #todo + #basic check that we have leading 2dig hr and 2dig min separated by colon + if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}[^0-9]{1}.*$} $str]} { + #nn:nn or nn:nnX.* where X is non digit + return 0 + } + set dotparts [split $str .] + if {[llength $dotparts] ni {1 2}} { + return 0 + } + if {[llength $dotparts] == 2} { + lassign $dotparts hms tail + #validate tail - which might have +- offset + if {[string index $tail end] ni {z Z}} { + #from hh:mm:??. + #check for +/- something + if {[regexp {(.*)[+-](.*)} $tail _match fraction offset]} { + if {![string is digit -strict $fraction]} { + return 0 + } + if {![_is_hms_or_hm_time $offset]} { + return 0 + } + } + } else { + set tail [string range $tail 0 end-1] + #expect tail nnnn (from hh:mm::ss.nnnnZ) + #had a dot and a zZ - no other offset valid (?) + if {![string is digit -strict $tail]} { + return 0 + } + } + + } else { + #no dot (fraction of second) + if {[regexp {(.*)[+-](.*)} $str _match hms offset]} { + #validate offset + if {![_is_hms_or_hm_time $offset]} { + return 0 + } + } else { + set hms $str + set offset "" + #trim a *single* z or Z off hms if present - multiple should error later + if {[string index $hms end] in {z Z}} { + set hms [string range $hms 0 end-1] + } + } + } + #hms is allowed in toml to be hh:mm:ss or hh:mm + #validate we have hh:mm:ss or hh:mm - exactly 2 digits each + if {![_is_hms_or_hm_time $hms]} { + return 0 + } + + return 1 + } else { + return 0 + } + } proc is_localtime {str} { #time of day without any relation to a specific day or any offset or timezone set numchars [tcl::string::length $str] if {[regexp -all {[0-9\.:]} $str] == $numchars} { #todo - if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}:[0-9]{2}([.][0-9]*){0,1}$} $str]} { + if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}:[0-9]{2}([.][0-9]+){0,1}$} $str]} { #hh:mm or hh:mm:ss or hh:mm::ss.nnn return 0 } + set dotparts [split $str .] + if {[llength $dotparts] ni {1 2}} { + return 0 + } + if {[llength $dotparts] == 2} { + lassign $dotparts hms _tail + #validate tail - just fractional seconds - regex has confirmed at least one digit and only digits + #nothing todo? max length? + } else { + #no fractional seconds + set hms $str + } + if {![_is_hms_or_hm_time $hms]} { + return 0 + } return 1 } else { return 0 @@ -4152,7 +3544,8 @@ namespace eval tomlish::parse { set tnamestate [dict create] dict set tnamestate whitespace "NA" dict set tnamestate tablename {zeropoppushspace table-space} - dict set tnamestate tablename2 {PUSHSPACE table-space returnstate tablearrayname-tail} + #dict set tnamestate tablename2 {PUSHSPACE table-space returnstate tablearrayname-tail} + dict set tnamestate tablename2 {PUSHSPACE table-space returnstate tablename-tail} dict set tnamestate endtablename "tablename-tail" dict set tnamestate endtablearrayname "tablearrayname-tail" dict set tnamestate comma "err-state" @@ -4197,7 +3590,7 @@ namespace eval tomlish::parse { # [[xxx]] ??? set tarntail [dict create] - dict set tarntail whitespace "tablearrayname-tail" + dict set tarntail whitespace "err-state" ;#"tablearrayname-tail" ;#spec doesn't allow whitespace here dict set tarntail newline "err-state" dict set tarntail comment "err-state" dict set tarntail eof "err-state" @@ -4600,8 +3993,9 @@ namespace eval tomlish::parse { # - the possible benefit is being able to more easily process in arbitrarily split chunks (although we would still have to watch crlf splitting ?) # - interactive use? - proc tok {s} { + proc tok {} { variable nest + variable s variable v variable i variable tok @@ -4646,100 +4040,1128 @@ namespace eval tomlish::parse { for {} {$i < $sLen} {} { if {$i > 0} { set lastChar [tcl::string::index $s [expr {$i - 1}]] + set start_of_data h } else { set lastChar "" + set start_of_data 1 + #bom-handling + if {[tcl::string::index $s 0] eq "\uFEFF"} { + #bom (could be from various encodings - now decoded as single unicode char FEFF) + #incr i 1 ;#skip over initial bom? + } } + set c [tcl::string::index $s $i] set cindex $i + set ctest [tcl::string::map {\{ lc \} rc \[ lb \] rb \" dq ' sq \\ bsl \r cr \n lf \t tab \uFEFF bom} $c] tomlish::log::debug "- tokloop char <$ctest> index $i tokenType:$tokenType tok:<$tok>" #puts "got char $c during tokenType '$tokenType'" incr i ;#must incr here because we do returns inside the loop - switch -exact -- $ctest { - # { + + + switch -exact -- $ctest { + # { + set had_slash $slash_active + set slash_active 0 + + if {$had_slash} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + #for multiliteral, multistring - data and/or end + incr i -1 + return 1 + } + _start_squote_sequence { + #pseudo token beginning with underscore - never returned to state machine - review + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + barekey { + error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed (see tomlish::utils::is_barekey). [tomlish::parse::report_line]" + } + whitespace { + # hash marks end of whitespace token + #do a return for the whitespace, set token_waiting + #set_token_waiting type comment value "" complete 1 + incr i -1 ;#leave comment for next run + return 1 + } + untyped_value { + #REVIEW! the spec isn't clear.. is whitespace after an int,bool etc required before comment? + #we will accept a comment marker as an immediate terminator of the untyped_value. + incr i -1 + return 1 + } + starttablename - starttablearrayname { + #fix? + error "tomlish Character '#' is invalid first character for $tokenType. [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #invalid in bare parts - but allowed in quoted parts - let tablename parser sort it out + append tok $c + } + default { + #dquotedkey, string,literal, multistring + append tok $c + } + } + } else { + switch -- $state { + multistring-space { + set_tokenType stringpart + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "#" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "#" + } + default { + #start of token if we're not in a token + set_tokenType comment + set tok "" ;#The hash is not part of the comment data + } + } + } + } + lc { + #left curly brace + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i [tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - starttablearrayname { + #*bare* tablename can only contain letters,digits underscores + error "tomlish Invalid tablename first character \{ [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #valid in quoted parts + append tok $c + } + comment { + if {$had_slash} {append tok "\\"} + append tok "\{" + } + default { + #end any other token. + incr i -1 + return 1 + } + } + } else { + switch -exact -- $state { + itable-keyval-value-expected - keyval-value-expected { + #switch last key to tablename?? + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + array-space - array-syntax { + #nested anonymous inline table + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\{" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\{" + } + default { + error "tomlish state: '$state'. left brace case not implemented [tomlish::parse::report_line]" + } + } + } + + } + rc { + #right curly brace + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey - comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - tablename { + if {$had_slash} {append tok "\\"} + #invalid! - but leave for datastructure loading stage to catch + set_token_waiting type endinlinetable value "" complete 1 startindex $cindex + return 1 + } + starttablearrayname - tablearrayname { + if {$had_slash} {append tok "\\"} + #invalid! - but leave for datastructure loading stage to catch + set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex + return 1 + } + default { + #end any other token + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + itable-space { + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + tablename-state { + #e.g [] - empty tablename - allowed or not? + #empty tablename/tablearrayname ? + #error "unexpected tablename problem" + + set_tokenType "endinlinetable" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-state { + error "tomlish unexpected tablearrayname-state problem" + set_tokenType "endinlinetable" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + array-syntax - array-space { + #invalid + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + itable-val-tail { + set_tokenType "endinlinetable" + set tok "" + #we need to pop the keyval - and then reprocess to pop the inlinetable - so we incr -1 + incr i -1 + return 1 + } + itable-keyval-syntax { + error "tomlish endinlinetable unexpected at this point. Expecting key=val syntax [tomlish::parse::report_line]" + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\}" + } + multiliteral-space { + set_tokenType "literalpart" ; #review + set tok "\}" + } + default { + #JMN2024b keyval-tail? + error "tomlish state '$state'. endinlinetable case not implemented [tomlish::parse::report_line]" + } + } + } + + } + lb { + #left square bracket + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename { + #change the tokenType + switch_tokenType "starttablearrayname" + set tok "" ;#no output into the tomlish list for this token + #any following whitespace is part of the tablearrayname, so return now + return 1 + } + tablename - tablearrayname { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + #append tok "\\[" + append tok {\[} + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + #invalid at this point - state machine should disallow: + # table -> starttablearrayname + # tablearray -> starttablearrayname + set_token_waiting type starttablearrayname value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "\[" + } + } + } + comment { + if {$had_slash} {append tok "\\"} + append tok "\[" + } + default { + #end any other token. + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + keyval-value-expected - itable-keyval-value-expected { + set_tokenType "startarray" + set tok "\[" + return 1 + } + array-space - array-syntax { + #nested array? + set_tokenType "startarray" + set tok "\[" + return 1 + #error "state: array-space. startarray case not implemented [tomlish::parse::report_line]" + } + table-space { + #table name + #assume it's a single bracket - but we need to wait for non-bracket to confirm it's not a tablearray + #note that a starttablearrayname token may contain whitespace between the brackets + # e.g \[ \[ + set_tokenType "starttablename" + set tok "" ;#there is no output into the tomlish list for this token + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\[" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\[" + } + itable-space { + #handle state just to give specific error msg + error "tomlish state: '$state'. Left square bracket invalid. Cannot start array in inline table without key. Use key=\[\] syntax. [tomlish::parse::report_line]" + } + default { + error "tomlish state: '$state'. startarray case not implemented [tomlish::parse::report_line]" + } + } + } + } + rb { + #right square bracket + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + #???? + incr i -1 + if {$had_slash} {incr i -1} ;#reprocess + return 1 + } else { + incr i -1 + if {$had_slash} {incr i -1} ;#reprocess + return 1 + } + } + starttablename { + #toml-test invalid/table/empty + + set_token_waiting type tablename value "" complete 1 startindex $cindex + incr i -1 + return 1 + } + tablename { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + append tok "\\]" + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + set_token_waiting type endtablename value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "]" + } + } + } + tablearrayname { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + append tok "\\]" + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "]" + } + } + } + default { + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + array-syntax - array-space { + #invalid - but allow parser statemachine to report it. + set_tokenType "endarray" + set tok "\]" + return 1 + } + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "endarray" + set tok "\]" + return 1 + } + tablename-state { + #e.g [] - empty tablename + #tomltest 1.1.0 invalid/table/empty + #should be invalid + #we parse it and let dict::from_tomlish error when it tries to split table + + set_tokenType "endtablename" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-state { + error "tomlish unexpected tablearrayname problem" + set_tokenType "endtablearray" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-tail { + #[[xxx] + set_tokenType "endtablename" + #sequence: starttablename -> starttablearrayname -> endtablearrayname -> endtablename + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\]" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\]" + } + default { + error "tomlish state '$state'. endarray case not implemented [tomlish::parse::report_line]" + } + } + } + } + bsl { + #backslash + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + if {$state eq "multistring-space"} { + #end whitespace token + incr i -1 ;#reprocess bsl in next run + return 1 + } else { + error "tomlish Unexpected backslash during whitespace. [tomlish::parse::report_line]" + } + } + literal - literalpart - squotedkey { + #never need to set slash_active true when in single quoted tokens + append tok "\\" + set slash_active 0 + } + string - dquotedkey - comment { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + stringpart { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + starttablename - starttablearrayname { + error "tomlish backslash is invalid as first character of $tokenType [tomlish::parse::report_line]" + } + tablename - tablearrayname { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + barekey { + error "tomlish Unexpected backslash during barekey. [tomlish::parse::report_line]" + } + default { + error "tomlish Backslash unexpected during tokentype: '$tokenType'. [tomlish::parse::report_line]" + } + } + } else { + switch -exact -- $state { + multistring-space { + if {$slash_active} { + set_tokenType "stringpart" + set tok "\\\\" + set slash_active 0 + } else { + set slash_active 1 + } + } + multiliteral-space { + #nothing can be escaped in multiliteral-space - not even squotes (?) review + set_tokenType "literalpart" + set tok "\\" + } + default { + error "tomlish tok error: Unexpected backslash when no token is active. [tomlish::parse::report_line]" + } + } + } + } + sq { + #single quote + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote { + #for within multiliteral + #short tentative_accum_squote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_squote is multisquote + 2 = 5 + #return tok with value ''''' + return 1 + } + } + tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multiliteral + #switch? + append tok $c + set_tokenType triple_squote + return 1 + } + default { + #if there are more than 3 leading squotes we also enter multiliteral space and the subsequent ones are handled + #by the tentative_accum_squote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 squotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" + } + } + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + #end whitespace + incr i -1 ;#reprocess sq + return 1 + } + literal { + #slash_active always false + #terminate the literal + set_token_waiting type endsquote value "'" complete 1 startindex $cindex + return 1 + } + literalpart { + #ended by ''' - but final could be '''' or ''''' (up to 2 squotes allowed directly before ending triple squote sequence) + #todo + # idea: end this literalpart (possibly 'temporarily') + # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack + # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) + incr i -1 ;#throw the "'" back to loop - will be added to a tentative_accum_squote token for later processing + return 1 + } + XXXitablesquotedkey { + set_token_waiting type endsquote value "'" complete 1 startindex $cindex + return 1 + } + squotedkey { + ### + #set_token_waiting type endsquote value "'" complete 1 + return 1 + } + starttablename - starttablearrayname { + #!!! + incr i -1 + return 1 + } + tablename - tablearrayname { + append tok $c + } + barekey { + #barekeys now support all sorts of unicode letter/number chars for other cultures + #but not punctuation - not even for those of Irish heritage who don't object + #to the anglicised form of some names. + # o'shenanigan seems to not be a legal barekey + #The Irish will have to use an earlier form Ó - which apparently many may prefer anyway. + error "tomlish Unexpected single quote during barekey. [tomlish::parse::report_line]" + } + default { + append tok $c + } + } + } else { + switch -exact -- $state { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading squote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_squote token or triple_squote token + #It currently doesn't trigger double_squote token + #(handle '' same as 'x' ie produce a single_squote and go into processing literal) + #review - producing double_squote for empty literal may be slightly more efficient. + #This token is not used to handle squote sequences *within* a multiliteral + set_tokenType "_start_squote_sequence" + set tok "'" + } + multiliteral-space { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_squote" ;#trigger tentative_accum_squote + set tok "'" + return 1 + } + table-space - itable-space { + #tests: squotedkey.test squotedkey_itable.test + set_tokenType "squotedkey" + set tok "" + } + XXXtable-space - XXXitable-space { + #future - could there be multiline keys? MLLKEY, MLBKEY ? + #this would (almost) allow arbitrary tcl dicts to be stored in toml (aside from escaping issues) + #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files + #@2025 ABNF for toml mentions key, simple-key, unquoted-key, quoted-key and dotted-key + #where key is simple-key or dotted-key - no MLL or MLB components + #the spec states solution for arbitrary binary data is application specific involving encodings + #such as hex, base64 + set_tokenType "_start_squote_sequence" + set tok "'" + return 1 + } + tablename-state { + #first char in tablename-state/tablearrayname-state + set_tokenType "tablename" + append tok "'" + } + tablearrayname-state { + set_tokenType "tablearrayname" + append tok "'" + } + literal-state { + #shouldn't get here? review + tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" + set_tokenType "literal" + incr -1 + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "," + #error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" + } + dottedkey-space { + set_tokenType "squotedkey" + } + default { + error "tomlish unhandled squote during state '$state'. [tomlish::parse::report_line]" + } + } + } + + } + dq { + #double quote + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #review + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + tentative_accum_dquote { + #within multistring + #short tentative_accum_dquote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_dquote is multidquote + 2 = 5 + #return tok with value """"" + return 1 + } + } + _start_dquote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multistring + #switch? + append tok $c + set_tokenType triple_dquote + return 1 + } + default { + #if there are more than 3 leading dquotes we also enter multistring space and the subsequent ones are handled + #by the tentative_accum_dquote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 dquotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_dquote_sequence'" + } + } + } + literal - literalpart { + append tok $c + } + string { + if {$had_slash} { + append tok "\\" $c + } else { + #unescaped quote always terminates a string + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex + return 1 + } + } + stringpart { + #sub element of multistring + if {$had_slash} { + append tok "\\" $c + } else { + incr i -1 ;#throw the {"} back to loop - will be added to a tentative_accum_dquote token for later processing + return 1 + } + } + whitespace { + #assert: had_slash will only ever be true in multistring-space + if {$had_slash} { + incr i -2 + return 1 + } else { + #end whitespace token - throw dq back for reprocessing + incr i -1 + return 1 + } + } + comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + XXXdquotedkey { + if {$had_slash} { + append tok "\\" + append tok $c + } else { + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex + return 1 + } + } + dquotedkey { + ### + if {$had_slash} { + append tok "\\" + append tok $c + } else { + #set_token_waiting type enddquote value {"} complete 1 + return 1 + } + } + squotedkey { + append tok $c + } + tablename - tablearrayname { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - starttablearrayname { + incr i -1 ;## + return 1 + } + default { + error "tomlish got quote during tokenType '$tokenType' [tomlish::parse::report_line]" + } + } + } else { + #$slash_active not relevant when no tokenType + #token is string only if we're expecting a value at this point + switch -exact -- $state { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading dquote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_dquote token or triple_dquote token + #It currently doesn't trigger double_dquote token + #(handle "" same as "x" ie produce a single_dquote and go into processing string) + #review - producing double_dquote for empty string may be slightly more efficient. + #This token is not used to handle dquote sequences once *within* a multistring + set_tokenType "_start_dquote_sequence" + set tok {"} + } + multistring-space { + if {$had_slash} { + set_tokenType "stringpart" + set tok "\\\"" + } else { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_dquote" ;#trigger tentative_accum_dquote + set tok {"} + return 1 + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\"" + } + table-space - itable-space { + set_tokenType "dquotedkey" + set tok "" + } + dottedkey-space { + set_tokenType dquotedkey + set tok "" + + #only if complex keys become a thing + #set_tokenType dquote_seq_begin + #set tok $c + } + tablename-state { + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } + default { + error "tomlish Unexpected dquote during state '$state' [tomlish::parse::report_line]" + } + } + } + } + = { set had_slash $slash_active set slash_active 0 - if {$had_slash} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. - if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { + newline { + #review + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } tentative_accum_squote - tentative_accum_dquote { - #for multiliteral, multistring - data and/or end incr i -1 return 1 } _start_squote_sequence { - #pseudo token beginning with underscore - never returned to state machine - review incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { - incr i [tcl::string::length $tok] + incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } - barekey { - error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed (see tomlish::utils::is_barekey). [tomlish::parse::report_line]" + literal - literalpart - squotedkey { + #assertion had_slash 0 + append tok $c + } + string - comment - dquotedkey { + #for these tokenTypes an = is just data. + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c } whitespace { - # hash marks end of whitespace token - #do a return for the whitespace, set token_waiting - #set_token_waiting type comment value "" complete 1 - incr i -1 ;#leave comment for next run - return 1 + if {$state eq "multistring-space"} { + incr i -1 + return 1 + } else { + set_token_waiting type equal value = complete 1 startindex $cindex + return 1 + } } - untyped_value { - #REVIEW! the spec isn't clear.. is whitespace after an int,bool etc required before comment? - #we will accept a comment marker as an immediate terminator of the untyped_value. + barekey { + #set_token_waiting type equal value = complete 1 incr i -1 return 1 } starttablename - starttablearrayname { - #fix! - error "tomlish Character '#' is invalid first character for $tokenType. [tomlish::parse::report_line]" + error "tomlish Character '=' is invalid first character for $tokenType. [tomlish::parse::report_line]" } tablename - tablearrayname { - #invalid in bare parts - but allowed in quoted parts - let tablename parser sort it out + #invalid in bare name - but valid in quoted parts - leave for tablename parser to sort out append tok $c } default { - #dquotedkey, string,literal, multistring - append tok $c + error "tomlish unexpected = character during tokentype $tokenType. case not implemented. [tomlish::parse::report_line]" } } } else { - switch -- $state { + switch -exact -- $state { multistring-space { - set_tokenType stringpart + set_tokenType "stringpart" set tok "" if {$had_slash} { append tok "\\" } - append tok "#" + append tok = } multiliteral-space { set_tokenType "literalpart" - set tok "#" + set tok "=" + } + dottedkey-space { + set_tokenType "equal" + set tok "=" + return 1 } default { - #start of token if we're not in a token - set_tokenType comment - set tok "" ;#The hash is not part of the comment data + set_tokenType "equal" + set tok = + return 1 } } } } - lc { - #left curly brace - set had_slash $slash_active + cr { + #REVIEW! + # \r carriage return + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. set slash_active 0 - if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { + newline { + #we have received a double cr + ::tomlish::log::warn "double cr - will generate cr token. needs testing" + set_tokenType "cr" ;#lone cr token will generally raise an error - but let state machine handle it + incr i -1 + return 1 + } tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 @@ -4750,86 +5172,77 @@ namespace eval tomlish::parse { return 1 } _start_dquote_sequence { - incr i [tcl::string::length $tok] + incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } - literal - literalpart - squotedkey { + literal { append tok $c } - string - dquotedkey { - if {$had_slash} {append tok "\\"} - append tok $c + literalpart { + #part of MLL string (multi-line literal string) + #we need to split out crlf as a separate NEWLINE to be consistent + ::tomlish::log::warn "literalpart ended by cr - needs testing" + #return literalpart temporarily - allow cr to be reprocessed from multiliteral-space + incr i -1 + return 1 } stringpart { - if {$had_slash} {append tok "\\"} - append tok $c + #stringpart is a part of MLB string (multi-line basic string) + #throw back the cr - if followed by lf it will become a {NEWLINE crlf} entry within the MULTISTRING list (e.g between STRINGPART entries) + incr i -1 + return 1 } starttablename - starttablearrayname { - #*bare* tablename can only contain letters,digits underscores - error "tomlish Invalid tablename first character \{ [tomlish::parse::report_line]" + error "tomlish Character is invalid first character for $tokenType. [tomlish::parse::report_line]" } tablename - tablearrayname { - #valid in quoted parts + #could in theory be valid in quoted part of name + #review - might be better just to disallow here append tok $c } - comment { - if {$had_slash} {append tok "\\"} - append tok "\{" - } - default { - #end any other token. - incr i -1 - return 1 - } - } - } else { - switch -exact -- $state { - itable-keyval-value-expected - keyval-value-expected { - #switch last key to tablename?? - set_tokenType "startinlinetable" - set tok "\{" - return 1 - } - array-space - array-syntax { - #nested anonymous inline table - set_tokenType "startinlinetable" - set tok "\{" - return 1 - } - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "startinlinetable" - set tok "\{" + whitespace { + #it should technically be part of whitespace if not followed by lf + #but outside of values we are also free to map it to be another NEWLINE instead? REVIEW + incr i -1 return 1 } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "\{" + untyped_value { + incr i -1 + return 1 } - multiliteral-space { - set_tokenType "literalpart" - set tok "\{" + comment { + #JJJJ + #review + incr i -1 + return 1 } default { - error "tomlish state: '$state'. left brace case not implemented [tomlish::parse::report_line]" + #!todo - error out if cr inappropriate for tokenType + append tok $c } } + } else { + #lf may be appended if next + #review - lone cr as newline? - this is uncommon - but so is lone cr in a string(?) + set_tokenType "newline" + set tok cr } - } - rc { - #right curly brace + lf { + # \n newline set had_slash $slash_active set slash_active 0 - if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { + newline { + #review + #this lf is the trailing part of a crlf + append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok + return 1 + } tentative_accum_squote - tentative_accum_dquote { + #multiliteral or multistring incr i -1 return 1 } @@ -4843,107 +5256,95 @@ namespace eval tomlish::parse { set_tokenType "single_dquote" return 1 } - literal - literalpart - squotedkey { - append tok $c + literal { + #nl is not allowed *within* a literal - require multiliteral syntax for any literal containing a newline ''' ''' + #even though we terminate the literal without the closing quote here - the token_waiting newline should trigger a state error + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 } - string - dquotedkey - comment { - if {$had_slash} {append tok "\\"} - append tok $c + literalpart { + #we allow newlines - but store them within the multiliteral as their own element + #This is a legitimate end to the literalpart - but not the whole multiliteral + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 } stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - starttablename - tablename { - if {$had_slash} {append tok "\\"} - #invalid! - but leave for datastructure loading stage to catch - set_token_waiting type endinlinetable value "" complete 1 startindex $cindex - return 1 + if {$had_slash} { + #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) + set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] + incr i -1 + return 1 + } else { + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } } - starttablearrayname - tablearrayname { - if {$had_slash} {append tok "\\"} - #invalid! - but leave for datastructure loading stage to catch - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex - return 1 + starttablename - tablename - tablearrayname - starttablearrayname { + error "tomlish Character is invalid in $tokenType. [tomlish::parse::report_line]" } default { - #end any other token - incr i -1 + #newline ends all other tokens. + #note for string: we don't add (raw unescaped) newline to simple string. (must use multi-string for this) + #note for whitespace: + # we will use the convention that \n terminates the current whitespace even if whitespace follows + # ie whitespace is split into separate whitespace tokens at each newline + + #puts "-------------- newline lf during tokenType $tokenType" + set_token_waiting type newline value lf complete 1 startindex $cindex return 1 } } } else { - #$slash_active not relevant when no tokenType switch -exact -- $state { - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - itable-space { - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - tablename-state { - #e.g [] - empty tablename - allowed or not? - #empty tablename/tablearrayname ? - #error "unexpected tablename problem" - - set_tokenType "endinlinetable" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - tablearrayname-state { - error "tomlish unexpected tablearrayname-state problem" - set_tokenType "endinlinetable" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - array-syntax - array-space { - #invalid - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - itable-val-tail { - set_tokenType "endinlinetable" - set tok "" - #we need to pop the keyval - and then reprocess to pop the inlinetable - so we incr -1 - incr i -1 - return 1 - } - itable-keyval-syntax { - error "tomlish endinlinetable unexpected at this point. Expecting key=val syntax [tomlish::parse::report_line]" - } multistring-space { - set_tokenType "stringpart" - set tok "" if {$had_slash} { - append tok "\\" + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 + } else { + set_tokenType "newline" + set tok lf + return 1 } - append tok "\}" } multiliteral-space { - set_tokenType "literalpart" ; #review - set tok "\}" + #assert had_slash 0 + set_tokenType "newline" + set tok "lf" + return 1 } default { - #JMN2024b keyval-tail? - error "tomlish state '$state'. endinlinetable case not implemented [tomlish::parse::report_line]" + #ignore slash? error? + set_tokenType "newline" + set tok lf + return 1 } } + #if {$had_slash} { + # #CONT directly before newline - allows strings_5_byteequivalent test to pass + # set_tokenType "continuation" + # set tok "\\" + # incr i -1 + # return 1 + #} else { + # set_tokenType newline + # set tok lf + # return 1 + #} } - } - lb { - #left square bracket + , { set had_slash $slash_active set slash_active 0 - if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 @@ -4958,105 +5359,70 @@ namespace eval tomlish::parse { set_tokenType "single_dquote" return 1 } - literal - literalpart - squotedkey { - append tok $c + comment - tablename - tablearrayname { + if {$had_slash} {append tok "\\"} + append tok , } string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { + #stringpart can have up to 2 quotes too if {$had_slash} {append tok "\\"} append tok $c } - starttablename { - #change the tokenType - switch_tokenType "starttablearrayname" - set tok "" ;#no output into the tomlish list for this token - #any following whitespace is part of the tablearrayname, so return now - return 1 + literal - literalpart - squotedkey { + #assert had_slash always 0 + append tok $c } - tablename - tablearrayname { - #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token - if {$had_slash} { - #resultant tablename may be invalid - but leave for datastructure loading stage to catch - #append tok "\\[" - append tok {\[} + whitespace { + if {$state eq "multistring-space"} { + incr i -1 + return 1 } else { - if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { - #invalid at this point - state machine should disallow: - # table -> starttablearrayname - # tablearray -> starttablearrayname - set_token_waiting type starttablearrayname value "" complete 1 startindex $cindex - return 1 - } else { - #we appear to still be in single or double quoted section - append tok "\[" - } + set_token_waiting type comma value "," complete 1 startindex $cindex + return 1 } } - comment { - if {$had_slash} {append tok "\\"} - append tok "\[" - } default { - #end any other token. - incr i -1 + set_token_waiting type comma value "," complete 1 startindex $cindex + if {$had_slash} {append tok "\\"} return 1 } } } else { - #$slash_active not relevant when no tokenType switch -exact -- $state { - keyval-value-expected - itable-keyval-value-expected { - set_tokenType "startarray" - set tok "\[" - return 1 - } - array-space - array-syntax { - #nested array? - set_tokenType "startarray" - set tok "\[" - return 1 - #error "state: array-space. startarray case not implemented [tomlish::parse::report_line]" - } - table-space { - #table name - #assume it's a single bracket - but we need to wait for non-bracket to confirm it's not a tablearray - #note that a starttablearrayname token may contain whitespace between the brackets - # e.g \[ \[ - set_tokenType "starttablename" - set tok "" ;#there is no output into the tomlish list for this token - } multistring-space { set_tokenType "stringpart" set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "\[" + if {$had_slash} {append tok "\\"} + append tok "," } multiliteral-space { + #assert had_slash 0 set_tokenType "literalpart" - set tok "\[" - } - itable-space { - #handle state just to give specific error msg - error "tomlish state: '$state'. Left square bracket invalid. Cannot start array in inline table without key. Use key=\[\] syntax. [tomlish::parse::report_line]" + set tok "," } default { - error "tomlish state: '$state'. startarray case not implemented [tomlish::parse::report_line]" + set_tokenType "comma" + set tok "," + return 1 } } } } - rb { - #right square bracket + . { set had_slash $slash_active set slash_active 0 - if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 @@ -5071,7 +5437,8 @@ namespace eval tomlish::parse { set_tokenType "single_dquote" return 1 } - literal - literalpart - squotedkey { + comment - untyped_value { + if {$had_slash} {append tok "\\"} append tok $c } string - dquotedkey { @@ -5082,115 +5449,98 @@ namespace eval tomlish::parse { if {$had_slash} {append tok "\\"} append tok $c } - comment { - if {$had_slash} {append tok "\\"} + literal - literalpart - squotedkey { + #assert had_slash always 0 append tok $c } whitespace { - if {$state eq "multistring-space"} { - #???? - incr i -1 - if {$had_slash} {incr i -1} ;#reprocess - return 1 - } else { - incr i -1 - if {$had_slash} {incr i -1} ;#reprocess - return 1 - } - } - tablename { - #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token - if {$had_slash} { - #resultant tablename may be invalid - but leave for datastructure loading stage to catch - append tok "\\]" - } else { - if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { - set_token_waiting type endtablename value "" complete 1 startindex $cindex + switch -exact -- $state { + multistring-space { + #review + if {$had_slash} { + incr i -2 + } else { + incr i -1 + } return 1 - } else { - #we appear to still be in single or double quoted section - append tok "]" } - } - } - tablearrayname { - #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token - if {$had_slash} { - #resultant tablename may be invalid - but leave for datastructure loading stage to catch - append tok "\\]" - } else { - if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex + xxxdottedkey-space { + incr i -1 return 1 - } else { - #we appear to still be in single or double quoted section - append tok "]" + } + dottedkey-space-tail { + incr i -1 + return 1 + } + default { + error "tomlish Received period during tokenType 'whitespace' [tomlish::parse::report_line]" } } } - default { - incr i -1 - return 1 - } - } - } else { - #$slash_active not relevant when no tokenType - switch -exact -- $state { - array-syntax - array-space { - #invalid - but allow parser statemachine to report it. - set_tokenType "endarray" - set tok "\]" - return 1 - } - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "endarray" - set tok "\]" - return 1 + starttablename - starttablearrayname { + #This would correspond to an empty table name + error "tomlish Character '.' is not allowed as first character ($tokenType). [tomlish::parse::report_line]" } - tablename-state { - #e.g [] - empty tablename - allowed or not? - #empty tablename/tablearrayname ? - #error "unexpected tablename problem" - - set_tokenType "endtablename" - set tok "" ;#no output into the tomlish list for this token - return 1 + tablename - tablearrayname { + #subtable - split later - review + append tok $c } - tablearrayname-state { - error "tomlish unexpected tablearrayname problem" - set_tokenType "endtablearray" - set tok "" ;#no output into the tomlish list for this token + barekey { + #e.g x.y = 1 + #we need to transition the barekey to become a structured table name ??? review + #x is the tablename y is the key + set_token_waiting type dotsep value "." complete 1 startindex $cindex return 1 } - tablearrayname-tail { - #[[xxx] - set_tokenType "endtablename" - #sequence: starttablename -> starttablearrayname -> endtablearrayname -> endtablename - return 1 + default { + error "tomlish Received period during tokenType '$tokenType' [tomlish::parse::report_line]" + #set_token_waiting type period value . complete 1 + #return 1 } + } + } else { + switch -exact -- $state { multistring-space { set_tokenType "stringpart" set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "\]" + if {$had_slash} {append tok "\\"} + append tok "." } multiliteral-space { set_tokenType "literalpart" - set tok "\]" + set tok "." + } + XXXdottedkey-space { + ### obs? + set_tokenType "dotsep" + set tok "." + return 1 + } + dottedkey-space-tail { + ### + set_tokenType "dotsep" + set tok "." + return 1 } default { - error "tomlish state '$state'. endarray case not implemented [tomlish::parse::report_line]" + set_tokenType "untyped_value" + set tok "." } } } + } - bsl { - #backslash + " " - tab { if {[tcl::string::length $tokenType]} { + set had_slash $slash_active + set slash_active 0 switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 @@ -5205,258 +5555,284 @@ namespace eval tomlish::parse { set_tokenType "single_dquote" return 1 } - whitespace { - if {$state eq "multistring-space"} { - #end whitespace token - incr i -1 ;#reprocess bsl in next run - return 1 - } else { - error "tomlish Unexpected backslash during whitespace. [tomlish::parse::report_line]" - } + barekey { + #todo had_slash - emit token or error + #whitespace is a terminator for bare keys + #set_token_waiting type whitespace value $c complete 1 + incr i -1 + return 1 } - literal - literalpart - squotedkey { - #never need to set slash_active true when in single quoted tokens - append tok "\\" - set slash_active 0 + untyped_value { + #unquoted values (int,date,float etc) are terminated by whitespace + #set_token_waiting type whitespace value $c complete 1 + incr i -1 + return 1 } - string - dquotedkey - comment { - if {$slash_active} { - set slash_active 0 - append tok "\\\\" - } else { - set slash_active 1 + comment { + if {$had_slash} { + append tok "\\" } + append tok $c + } + string - dquotedkey { + if {$had_slash} { append tok "\\" } + append tok $c } stringpart { - if {$slash_active} { - #assert - quotes empty - or we wouldn't have slash_active - set slash_active 0 - append tok "\\\\" + #for stringpart we store WS separately for ease of processing continuations (CONT stripping) + if {$had_slash} { + #REVIEW + #emit the stringpart - go back to the slash + incr i -2 + return 1 } else { - set slash_active 1 + #split into STRINGPART xxx WS " " + incr i -1 + return 1 } } - starttablename - starttablearrayname { - error "tomlish backslash is invalid as first character of $tokenType [tomlish::parse::report_line]" + literal - literalpart - squotedkey { + append tok $c } - tablename - tablearrayname { - if {$slash_active} { - set slash_active 0 - append tok "\\\\" + whitespace { + if {$state eq "multistring-space"} { + append tok $c } else { - set slash_active 1 + append tok $c } } - barekey { - error "tomlish Unexpected backslash during barekey. [tomlish::parse::report_line]" + starttablename - starttablearrayname { + incr i -1 + return 1 + } + tablename - tablearrayname { + #include whitespace in the tablename/tablearrayname + #Will need to be normalized upon interpreting the tomlish as a datastructure + append tok $c } default { - error "tomlish Backslash unexpected during tokentype: '$tokenType'. [tomlish::parse::report_line]" + error "tomlish Received whitespace space during tokenType '$tokenType' [tomlish::parse::report_line]" + } + } + } else { + set had_slash $slash_active + set slash_active 0 + switch -exact -- $state { + tablename-state { + #tablename can have leading,trailing and interspersed whitespace! + #These will not be treated as whitespace tokens, instead forming part of the name. + set_tokenType tablename + set tok "" + if {$had_slash} {append tok "\\"} + append tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok "" + if {$had_slash} {append tok "\\"} + append tok $c } - } - } else { - switch -exact -- $state { multistring-space { - if {$slash_active} { - set_tokenType "stringpart" - set tok "\\\\" - set slash_active 0 + if {$had_slash} { + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 } else { - set slash_active 1 + set_tokenType "whitespace" + append tok $c } } multiliteral-space { - #nothing can be escaped in multiliteral-space - not even squotes (?) review set_tokenType "literalpart" - set tok "\\" + set tok $c } default { - error "tomlish tok error: Unexpected backslash when no token is active. [tomlish::parse::report_line]" + if {$had_slash} { + error "tomlish unexpected backslash [tomlish::parse::report_line]" + } + set_tokenType "whitespace" + append tok $c } } } } - sq { - #single quote - set had_slash $slash_active - set slash_active 0 + tabX { if {[tcl::string::length $tokenType]} { + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) + set slash_active 0 switch -exact -- $tokenType { - tentative_accum_squote { - #for within multiliteral - #short tentative_accum_squote tokens are returned if active upon receipt of any other character - #longest allowable for leading/trailing are returned here - #### - set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote - #assert state = trailing-squote-space - append tok $c - if {$existingtoklen == 4} { - #maxlen to be a tentative_accum_squote is multisquote + 2 = 5 - #return tok with value ''''' - return 1 - } + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 } - tentative_accum_dquote { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { - #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space - switch -- [tcl::string::length $tok] { - 1 { - #no conclusion can yet be reached - append tok $c - } - 2 { - #enter multiliteral - #switch? - append tok $c - set_tokenType triple_squote - return 1 - } - default { - #if there are more than 3 leading squotes we also enter multiliteral space and the subsequent ones are handled - #by the tentative_accum_squote check for ending sequence which can accept up to 5 and reintegrate the - #extra 1 or 2 squotes as data. - error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" - } - } + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } - whitespace { - #end whitespace - incr i -1 ;#reprocess sq + barekey { + #whitespace is a terminator for bare keys + incr i -1 + #set_token_waiting type whitespace value $c complete 1 return 1 } - literal { - #slash_active always false - #terminate the literal - set_token_waiting type endsquote value "'" complete 1 startindex $cindex + untyped_value { + #unquoted values (int,date,float etc) are terminated by whitespace + #set_token_waiting type whitespace value $c complete 1 + incr i -1 return 1 } - literalpart { - #ended by ''' - but final could be '''' or ''''' (up to 2 squotes allowed directly before ending triple squote sequence) - #todo - # idea: end this literalpart (possibly 'temporarily') - # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack - # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) - incr i -1 ;#throw the "'" back to loop - will be added to a tentative_accum_squote token for later processing - return 1 + squotedkey { + append tok $c } - XXXitablesquotedkey { - set_token_waiting type endsquote value "'" complete 1 startindex $cindex - return 1 + dquotedkey - string - comment - whitespace { + #REVIEW + append tok $c } - squotedkey { - ### - #set_token_waiting type endsquote value "'" complete 1 - return 1 + stringpart { + #for stringpart we store WS separately for ease of processing continuations (CONT stripping) + if {$had_slash} { + #REVIEW + #emit the stringpart - go back to the slash + incr i -2 + return 1 + } else { + #split into STRINGPART aaa WS " " + incr i -1 + return 1 + } + } + literal - literalpart { + append tok $c } starttablename - starttablearrayname { - #!!! incr i -1 return 1 } tablename - tablearrayname { + #include whitespace in the tablename/tablearrayname + #Will need to be normalized upon interpreting the tomlish as a datastructure append tok $c } - barekey { - #barekeys now support all sorts of unicode letter/number chars for other cultures - #but not punctuation - not even for those of Irish heritage who don't object - #to the anglicised form of some names. - # o'shenanigan seems to not be a legal barekey - #The Irish will have to use an earlier form Ó - which apparently many may prefer anyway. - error "tomlish Unexpected single quote during barekey. [tomlish::parse::report_line]" - } default { - append tok $c + error "tomlish Received whitespace tab during tokenType '$tokenType' [tomlish::parse::report_line]" } } } else { + set had_slash $slash_active + if {$slash_active} { + set slash_active 0 + } switch -exact -- $state { - array-space - keyval-value-expected - itable-keyval-value-expected { - #leading squote - #pseudo-token _start_squote_sequence ss not received by state machine - #This pseudotoken will trigger production of single_squote token or triple_squote token - #It currently doesn't trigger double_squote token - #(handle '' same as 'x' ie produce a single_squote and go into processing literal) - #review - producing double_squote for empty literal may be slightly more efficient. - #This token is not used to handle squote sequences *within* a multiliteral - set_tokenType "_start_squote_sequence" - set tok "'" + tablename-state { + #tablename can have leading,trailing and interspersed whitespace! + #These will not be treated as whitespace tokens, instead forming part of the name. + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } + multistring-space { + if {$had_slash} { + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 + } else { + set_tokenType whitespace + append tok $c + } } multiliteral-space { - #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row - #we are building up a tentative_accum_squote to determine if - #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines - #b) it is exactly ''' and we can terminate the whole multiliteral - #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space - set_tokenType "tentative_trigger_squote" ;#trigger tentative_accum_squote - set tok "'" + set_tokenType "literalpart" + set tok $c + } + default { + set_tokenType "whitespace" + append tok $c + } + } + } + } + bom { + #bom encoded as single unicode codepoint \uFFEF + #BOM (Byte Order Mark) - ignored by token consumer + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 return 1 } - table-space - itable-space { - #tests: squotedkey.test squotedkey_itable.test - set_tokenType "squotedkey" - set tok "" + _start_squote_sequence { + #assert - tok will be one or two squotes only + #A toml literal probably isn't allowed to contain this + #but we will parse and let the validator sort it out. + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 } - XXXtable-space - XXXitable-space { - #future - could there be multiline keys? MLLKEY, MLBKEY ? - #this would (almost) allow arbitrary tcl dicts to be stored in toml (aside from escaping issues) - #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files - #@2025 ABNF for toml mentions key, simple-key, unquoted-key, quoted-key and dotted-key - #where key is simple-key or dotted-key - no MLL or MLB components - #the spec states solution for arbitrary binary data is application specific involving encodings - #such as hex, base64 - set_tokenType "_start_squote_sequence" - set tok "'" + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } - tablename-state { - #first char in tablename-state/tablearrayname-state - set_tokenType "tablename" - append tok "'" + literal - literalpart { + append tok $c } - tablearrayname-state { - set_tokenType "tablearrayname" - append tok "'" + string - stringpart { + append tok $c } - literal-state { - #shouldn't get here? review - tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" - set_tokenType "literal" - incr -1 + default { + #state machine will generally not have entry to accept bom - let it crash + set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex return 1 } + } + } else { + switch -exact -- $state { + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } multistring-space { set_tokenType "stringpart" - set tok "" - if {$had_slash} {append tok "\\"} - append tok "," - #error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" - } - dottedkey-space { - set_tokenType "squotedkey" + set tok $c } default { - error "tomlish unhandled squote during state '$state'. [tomlish::parse::report_line]" + set_tokenType "bom" + set tok "\uFEFF" + return 1 } } } - } - dq { - #double quote - set had_slash $slash_active - set slash_active 0 + default { if {[tcl::string::length $tokenType]} { + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. + set slash_active 0 switch -exact -- $tokenType { - tentative_accum_squote { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } @@ -5465,1702 +5841,2177 @@ namespace eval tomlish::parse { set_tokenType "single_squote" return 1 } - tentative_accum_dquote { - #within multistring - #short tentative_accum_dquote tokens are returned if active upon receipt of any other character - #longest allowable for leading/trailing are returned here - #### - set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote - #assert state = trailing-squote-space - append tok $c - if {$existingtoklen == 4} { - #maxlen to be a tentative_accum_dquote is multidquote + 2 = 5 - #return tok with value """"" - return 1 - } - } _start_dquote_sequence { - #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space - switch -- [tcl::string::length $tok] { - 1 { - #no conclusion can yet be reached - append tok $c - } - 2 { - #enter multistring - #switch? - append tok $c - set_tokenType triple_dquote - return 1 - } - default { - #if there are more than 3 leading dquotes we also enter multistring space and the subsequent ones are handled - #by the tentative_accum_dquote check for ending sequence which can accept up to 5 and reintegrate the - #extra 1 or 2 dquotes as data. - error "tomlish unexpected token length [tcl::string::length $tok] in '_start_dquote_sequence'" - } - } - } - literal - literalpart { - append tok $c - } - string { - if {$had_slash} { - append tok "\\" $c - } else { - #unescaped quote always terminates a string - set_token_waiting type enddquote value "\"" complete 1 startindex $cindex - return 1 - } - } - stringpart { - #sub element of multistring - if {$had_slash} { - append tok "\\" $c - } else { - incr i -1 ;#throw the {"} back to loop - will be added to a tentative_accum_dquote token for later processing - return 1 - } + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 } whitespace { - #assert: had_slash will only ever be true in multistring-space - if {$had_slash} { - incr i -2 - return 1 - } else { - #end whitespace token - throw dq back for reprocessing + if {$state eq "multistring-space"} { incr i -1 return 1 - } - } - comment { - if {$had_slash} {append tok "\\"} - append tok $c - } - XXXdquotedkey { - if {$had_slash} { - append tok "\\" - append tok $c } else { - set_token_waiting type enddquote value "\"" complete 1 startindex $cindex + #review + incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. return 1 } } - dquotedkey { - ### - if {$had_slash} { - append tok "\\" + barekey { + if {[tomlish::utils::is_barekey $c]} { append tok $c } else { - #set_token_waiting type enddquote value {"} complete 1 - return 1 + error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed. (see tomlish::utils::is_barekey) [tomlish::parse::report_line]" } } - squotedkey { - append tok $c - } - tablename - tablearrayname { - if {$had_slash} {append tok "\\"} - append tok $c - } starttablename - starttablearrayname { - incr i -1 ;## + incr i -1 + #allow statemachine to set context for subsequent chars return 1 } + string - stringpart { + append tok $c + } default { - error "tomlish got quote during tokenType '$tokenType' [tomlish::parse::report_line]" + #e.g comment/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname + append tok $c } } } else { - #$slash_active not relevant when no tokenType - #token is string only if we're expecting a value at this point + set had_slash $slash_active + set slash_active 0 switch -exact -- $state { - array-space - keyval-value-expected - itable-keyval-value-expected { - #leading dquote - #pseudo-token _start_squote_sequence ss not received by state machine - #This pseudotoken will trigger production of single_dquote token or triple_dquote token - #It currently doesn't trigger double_dquote token - #(handle "" same as "x" ie produce a single_dquote and go into processing string) - #review - producing double_dquote for empty string may be slightly more efficient. - #This token is not used to handle dquote sequences once *within* a multistring - set_tokenType "_start_dquote_sequence" - set tok {"} + table-space - itable-space { + #if no currently active token - assume another key value pair + if {[tomlish::utils::is_barekey $c]} { + set_tokenType "barekey" + append tok $c + } else { + error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" + } } multistring-space { + set_tokenType "stringpart" if {$had_slash} { - set_tokenType "stringpart" - set tok "\\\"" + set tok \\$c } else { - #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row - #we are building up a tentative_accum_squote to determine if - #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines - #b) it is exactly ''' and we can terminate the whole multiliteral - #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space - set_tokenType "tentative_trigger_dquote" ;#trigger tentative_accum_dquote - set tok {"} - return 1 + set tok $c } } multiliteral-space { set_tokenType "literalpart" - set tok "\"" + set tok $c } - table-space - itable-space { - set_tokenType "dquotedkey" - set tok "" + tablename-state { + set_tokenType "tablename" + set tok $c + } + tablearrayname-state { + set_tokenType "tablearrayname" + set tok $c } dottedkey-space { - set_tokenType dquotedkey - set tok "" + set_tokenType barekey + set tok $c + } + default { + #todo - something like ansistring VIEW to show control chars? + set cshow [string map [list \t tab \v vt] $c] + tomlish::log::debug "- tokloop char '$cshow' setting to untyped_value while state:$state [tomlish::parse::report_line]" + set_tokenType "untyped_value" + set tok $c + } + } + } + } + } + + } + + #run out of characters (eof) + if {[tcl::string::length $tokenType]} { + #check for invalid ending tokens + #if {$state eq "err-state"} { + # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" + #} + switch -exact -- $tokenType { + _start_squote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open literal + error "tomlish eof reached without closing single quote for string literal. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "literal" + set tok "" + return 1 + + ##review + #set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] + #set_tokenType "literal" + #set tok "" + #return 1 + } + } + } + _start_dquote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open string + error "tomlish eof reached without closing double quote for string. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "string" + set tok "" + return 1 + } + } + } + newline { + #The only newline token that has still not been returned should have a tok value of "cr" + puts "tomlish eof reached - with incomplete newline token '$tok'" + if {$tok eq "cr"} { + #we convert lone cr to it's own "cr" token elsewhere in the document to allow statemachine to handle it. + #(which it should generally do by not handling it ie raising an error - or emitting an ERROR list in the tomlish) + #if trailing char is a lone cr - we should encode it the same way as elsewhere that is outside of values + # ie as it's own token. + switch_tokenType "cr" + return 1 + } else { + #should be unreachable + error "tomlish eof reached - with invalid newline token. value: $tok" + } + } + } + set_token_waiting type eof value eof complete 1 startindex $i ;#review + return 1 + } else { + ::tomlish::log::debug "- No current tokenType, ran out of characters, setting tokenType to 'eof' [tomlish::parse::report_line]" + set tokenType "eof" + set tok "eof" + } + return 0 + } + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::parse ---}] +} +namespace eval tomlish::huddle { + proc from_json {json} { + package require huddle + package require huddle::json + #note - huddle may now contain raw surrogate pair - which cannot be emitted to stdout + set h [huddle::json::json2huddle parse $json] + } + proc from_dict {d} { - #only if complex keys become a thing - #set_tokenType dquote_seq_begin - #set tok $c - } - tablename-state { - set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } - default { - error "tomlish Unexpected dquote during state '$state' [tomlish::parse::report_line]" + } + + #raw - strings must already be processed into values suitable for json e.g surrogate pair escaping + proc jsondumpraw {huddle_object {offset " "} {newline "\n"} {begin ""}} { + upvar ::huddle::types types + set nextoff "$begin$offset" + set nlof "$newline$nextoff" + set sp " " + if {[string equal $offset ""]} {set sp ""} + + set type [huddle type $huddle_object] + + switch -- $type { + boolean - + number { + return [huddle get_stripped $huddle_object] + } + null { + return null + } + string { + set data [huddle get_stripped $huddle_object] + + # JSON permits only oneline string + #set data [string map { + # \n \\n + # \t \\t + # \r \\r + # \b \\b + # \f \\f + # \\ \\\\ + # \" \\\" + # / \\/ + # } $data + #] + return "\"$data\"" + } + list { + set inner {} + set len [huddle llength $huddle_object] + for {set i 0} {$i < $len} {incr i} { + set subobject [huddle get $huddle_object $i] + lappend inner [jsondumpraw $subobject $offset $newline $nextoff] + } + if {[llength $inner] == 1} { + return "\[[lindex $inner 0]\]" + } + return "\[$nlof[join $inner ,$nlof]$newline$begin\]" + } + dict { + set inner {} + foreach {key} [huddle keys $huddle_object] { + lappend inner [subst {"$key":$sp[jsondumpraw [huddle get $huddle_object $key] $offset $newline $nextoff]}] + } + #if {[llength $inner] == 1} { + # return $inner ;#wrong - breaks with quoted list representation + # #FAILS: toml-test valid/comment/tricky + #} + + return "\{$nlof[join $inner ,$nlof]$newline$begin\}" + } + default { + set node [unwrap $huddle_object] + #foreach {tag src} $node break + lassign $node tag src + return [$types(callback:$tag) jsondumpraw $huddle_object $offset $newline $nextoff] + } + } + } +} + +#typed as per toml-test types +namespace eval tomlish::typedhuddle { + proc from_json {json} { + set plainhuddle [tomlish::huddle::from_json $json] + + error "tomlish::typedhuddle::from_json unimplemented" + } + proc from_dict {d} { + package require huddle + set h [huddle create] + if {[tomlish::dict::is_typeval $d]} { + set dtype [dict get $d type] + switch -- $dtype { + ARRAY { + #error "typedhuddle::from_dict ARRAY not yet handled" + set h_list [huddle list] + set elements [dict get $d value] + foreach el $elements { + set sub [from_dict $el] + huddle append h_list $sub + } + return $h_list + } + default { + set tinfo [tomlish::dict::convert_typeval_to_tomltest $d] + #basic non-container types + set h_tdict [huddle create] + huddle set h_tdict type [huddle string [dict get $tinfo type]] + huddle set h_tdict value [huddle string [dict get $tinfo value]] + return $h_tdict + } + } + } else { + dict for {dictkey dictval} $d { + set jsonkey [tomlish::utils::rawstring_to_jsonstring $dictkey] + if {[tomlish::dict::is_typeval $dictval]} { + set dtype [dict get $dictval type] + switch -- $dtype { + ARRAY { + #error "typedhuddle::from_dict ARRAY not yet handled" + set h_next [huddle list] + set elements [dict get $dictval value] + foreach el $elements { + set sub [from_dict $el] + huddle append h_next $sub } } + default { + set tinfo [tomlish::dict::convert_typeval_to_tomltest $dictval] + set tp [dict get $tinfo type] + #basic non-container types + set h_next [huddle create] ;#dict + huddle set h_next type [huddle string [dict get $tinfo type]] + huddle set h_next value [huddle string [dict get $tinfo value]] + } } + huddle set h $jsonkey $h_next + } else { + #dict + set sub [from_dict $dictval] + huddle set h $jsonkey $sub } - = { - set had_slash $slash_active - set slash_active 0 + } + } + return $h + } + proc is_typeval {huddled} { + set htype [huddle type $huddled] + if {$htype ne "dict"} { + return 0 + } + if {[huddle keys $huddled] ne {type value}} { + return 0 + } + set tp [huddle type $huddled type] + switch -- $tp { + string - integer - float - bool - datetime - datetime-local - date-local - time-local { + return 1 + } + } + return 0 + } - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal - literalpart - squotedkey { - #assertion had_slash 0 - append tok $c - } - string - comment - dquotedkey { - #for these tokenTypes an = is just data. - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - incr i -1 - return 1 - } else { - set_token_waiting type equal value = complete 1 startindex $cindex - return 1 - } - } - barekey { - #set_token_waiting type equal value = complete 1 - incr i -1 - return 1 - } - starttablename - starttablearrayname { - error "tomlish Character '=' is invalid first character for $tokenType. [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #invalid in bare name - but valid in quoted parts - leave for tablename parser to sort out - append tok $c - } - default { - error "tomlish unexpected = character during tokentype $tokenType. case not implemented. [tomlish::parse::report_line]" - } + #direction from typed json towards toml + proc convert_typeval_to_tomlish {huddled} { + set htype [huddle get_stripped $huddled type] + set hval [huddle get_stripped $huddled value] + switch -- $htype { + string { + #we need to decide here the type of string element to use in toml/tomlish + #STRING,MULTISTRING,LITERAL,MULTILITERAL + #set unesc [tomlish::utils::unescape_jsonstring $hval] ;#no need - json parser unescaped when creating the huddle + set unesc $hval + #(huddle::json::json2huddle parse $json) + #since it was unescaped any backslashes remaining represent themselves - reapply escape - REVIEW + #set hval [string map [list \\ \\\ ] $hval] + #JSJS + if {[string first \n $unesc] >= 0} { + #always use a MULTI + if {[string first ' $unesc] >=0} { + if {[string first ''' $unesc] >=0} { + set dtype MULTISTRING + } else { + set dtype MULTILITERAL } } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok = - } - multiliteral-space { - set_tokenType "literalpart" - set tok "=" - } - dottedkey-space { - set_tokenType "equal" - set tok "=" - return 1 - } - default { - set_tokenType "equal" - set tok = - return 1 - } + if {[string first \"\"\" $unesc] >=0} { + set dtype MULTILITERAL + } else { + set dtype MULTISTRING + } + } + } else { + #use multi if needed? + if {[string first '' $hval] >=0} { + if {[string first ''' $unesc] >=0} { + set dtype STRING + } else { + set dtype MULTILITERAL } + } elseif {[string first ' $unesc] >= 0} { + set dtype STRING + } elseif {[string first \"\"\" $unesc] >= 0} { + set dtype LITERAL + } else { + #STRING or LITERAL? + set dtype STRING } } - cr { - #REVIEW! - # \r carriage return - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #we have received a double cr - ::tomlish::log::warn "double cr - will generate cr token. needs testing" - set_tokenType "cr" ;#lone cr token will generally raise an error - but let state machine handle it - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal { - append tok $c - } - literalpart { - #part of MLL string (multi-line literal string) - #we need to split out crlf as a separate NEWLINE to be consistent - ::tomlish::log::warn "literalpart ended by cr - needs testing" - #return literalpart temporarily - allow cr to be reprocessed from multiliteral-space - incr i -1 - return 1 - } - stringpart { - #stringpart is a part of MLB string (multi-line basic string) - #throw back the cr - if followed by lf it will become a {NEWLINE crlf} entry within the MULTISTRING list (e.g between STRINGPART entries) - incr i -1 - return 1 - } - starttablename - starttablearrayname { - error "tomlish Character is invalid first character for $tokenType. [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #could in theory be valid in quoted part of name - #review - might be better just to disallow here - append tok $c - } - whitespace { - #it should technically be part of whitespace if not followed by lf - #but outside of values we are also free to map it to be another NEWLINE instead? REVIEW - incr i -1 - return 1 - } - untyped_value { - incr i -1 - return 1 - } - default { - #!todo - error out if cr inappropriate for tokenType - append tok $c + + } + datetime - bool { + set dtype [string toupper $htype] + } + float { + set dtype FLOAT + if {[string is integer -strict $hval]} { + #json FLOAT specified as integer - must have dot for toml + set hval [expr {double($hval)}] + } + } + integer { + set dtype INT + } + datetime - datetime-local - date-local - time-local { + #DDDD + #set dtype DATETIME + set dtype [string toupper $htype] + } + default { + error "tomlish::typedhuddle::convert_typeval_to_tomlish unrecognised type $htype" + } + } + return [list type $dtype value $hval] + } + +} +namespace eval tomlish::toml { + proc from_binary {bindata} { + set bom "" + set b12 [tcl::string::range $bindata 0 1] + set b12test [string map [list \xEF\xBB utf8_12 \xFE\xFF bom16be \xFF\xFE utf32le_12 \x00\x00 utf32be_12] $b12] + switch -- $b12test { + bom16be { + #FEFF + set bom utf-16be + } + utf32le_12 { + #FFFE + set b34 [tcl::string::range $bindata 2 3] + if {$b34 eq "\x00\x00"} { + set bom utf-32le + } else { + set bom utf-16le + } + } + utf32be_12 { + #0000 + set b34 [tcl::string::range $bindata 2 3] + if {$b34 eq "\xFE\xFF"} { + set bom utf-32be + } + } + utf8_12 { + set b3 [tcl::string::index $bindata 2] + if {$b3 eq "\xBF"} { + set bom utf-8 + } + } + } + if {$bom eq ""} { + #no bom - assume utf8 - but we read in as binary + #if data wasn't actually utf8 we may error here depending on content - or we may just get wrongly encoded chars + set tomldata [encoding convertfrom utf-8 $bindata] + } elseif {$bom eq "utf-8"} { + #utf-8 bom read in as binary + set tomldata [encoding convertfrom utf-8 $bindata] + #bom now encoded as single unicode char \uFFEF + } else { + return -code error -errorcode {TOML ENCODING NOTUTF8} "Input not UTF8 encoded according to BOM. Indicated encoding is '$bom' - invalid for toml" + } + return $tomldata + } + proc from_tomlish {tomlish} { + return [tomlish::encode::tomlish $tomlish] + } + + #todo - rename to taggedjson + proc from_tomlish_from_dict_from_typedjson {json} { + set d [tomlish::dict::from_typedjson $json] + from_tomlish [tomlish::from_dict $d] ;#return tomlish + } + + proc tablename_split {tablename {normalize false}} { + #we can't just split on . because we have to handle quoted segments which may contain a dot. + #eg {dog."tater.man"} + if {$tablename eq ""} { + error "tablename_split. No table name segments found. empty tablename" + } + set sLen [tcl::string::length $tablename] + set segments [list] + set mode "preval" ;#5 modes: preval, quoted,litquoted, unquoted, postval + #quoted is for double-quotes, litquoted is for single-quotes (string literal) + set seg "" + for {set i 0} {$i < $sLen} {incr i} { + + if {$i > 0} { + set lastChar [tcl::string::index $tablename [expr {$i - 1}]] + } else { + set lastChar "" + } + + #todo - track\count backslashes properly + + set c [tcl::string::index $tablename $i] + if {$c eq "\""} { + if {($lastChar eq "\\")} { + #not strictly correct - we could have had an even number prior-backslash sequence + #the toml spec would have us error out immediately on bsl in bad location - but we're + #trying to parse to unvalidated tomlish + set ctest escq + } else { + set ctest dq + } + } else { + set ctest [string map [list " " sp \t tab] $c] + } + + switch -- $ctest { + . { + switch -exact -- $mode { + preval { + error "tablename_split. dot not allowed - expecting a value" + } + unquoted { + #dot marks end of segment. + if {![tomlish::utils::is_barekey $seg]} { + error "tablename_split. unquoted key segment $seg is not a valid toml key" } + lappend segments $seg + set seg "" + set mode "preval" + } + quoted { + append seg $c + } + litquoted { + append seg $c + } + postval { + #got dot in an expected location + set mode "preval" } - } else { - #lf may be appended if next - #review - lone cr as newline? - this is uncommon - but so is lone cr in a string(?) - set_tokenType "newline" - set tok cr } } - lf { - # \n newline - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #review - #this lf is the trailing part of a crlf - append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - #multiliteral or multistring - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal { - #nl is not allowed *within* a literal - require multiliteral syntax for any literal containing a newline ''' ''' - #even though we terminate the literal without the closing quote here - the token_waiting newline should trigger a state error - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - literalpart { - #we allow newlines - but store them within the multiliteral as their own element - #This is a legitimate end to the literalpart - but not the whole multiliteral - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - stringpart { - if {$had_slash} { - #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) - set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] - incr i -1 - return 1 - } else { - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - } - starttablename - tablename - tablearrayname - starttablearrayname { - error "tomlish Character is invalid in $tokenType. [tomlish::parse::report_line]" - } - default { - #newline ends all other tokens. - #note for string: we don't add (raw unescaped) newline to simple string. (must use multi-string for this) - #note for whitespace: - # we will use the convention that \n terminates the current whitespace even if whitespace follows - # ie whitespace is split into separate whitespace tokens at each newline - - #puts "-------------- newline lf during tokenType $tokenType" - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } + dq { + #unescaped dquote + switch -- $mode { + preval { + set mode "quoted" + set seg "\"" } - } else { - switch -exact -- $state { - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - set_tokenType "newline" - set tok lf - return 1 - } - } - multiliteral-space { - #assert had_slash 0 - set_tokenType "newline" - set tok "lf" - return 1 - } - default { - #ignore slash? error? - set_tokenType "newline" - set tok lf - return 1 + unquoted { + #invalid in barekey - but we are after structure only + append seg $c + } + quoted { + append seg $c + #JJJJ + if {$normalize} { + lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] + } else { + lappend segments $seg } + set seg "" + set mode "postval" ;#make sure we only accept a dot or end-of-data now. + } + litquoted { + append seg $c + } + postval { + error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" } - #if {$had_slash} { - # #CONT directly before newline - allows strings_5_byteequivalent test to pass - # set_tokenType "continuation" - # set tok "\\" - # incr i -1 - # return 1 - #} else { - # set_tokenType newline - # set tok lf - # return 1 - #} } } - , { - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - comment - tablename - tablearrayname { - if {$had_slash} {append tok "\\"} - append tok , - } - string - dquotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - #stringpart can have up to 2 quotes too - if {$had_slash} {append tok "\\"} - append tok $c - } - literal - literalpart - squotedkey { - #assert had_slash always 0 - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - incr i -1 - return 1 - } else { - set_token_waiting type comma value "," complete 1 startindex $cindex - return 1 - } - } - default { - set_token_waiting type comma value "," complete 1 startindex $cindex - if {$had_slash} {append tok "\\"} - return 1 - } + ' { + switch -- $mode { + preval { + append seg $c + set mode "litquoted" } - } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} {append tok "\\"} - append tok "," - } - multiliteral-space { - #assert had_slash 0 - set_tokenType "literalpart" - set tok "," - } - default { - set_tokenType "comma" - set tok "," - return 1 + unquoted { + #single quote inside e.g o'neill - ultimately invalid - but we pass through here. + append seg $c + } + quoted { + append seg $c + } + litquoted { + append seg $c + #no normalization to do aside from stripping squotes + if {$normalize} { + lappend segments [tcl::string::range $seg 1 end-1] + } else { + lappend segments $seg } + set seg "" + set mode "postval" + } + postval { + error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" } } } - . { - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - comment - untyped_value { - if {$had_slash} {append tok "\\"} - append tok $c - } - string - dquotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - literal - literalpart - squotedkey { - #assert had_slash always 0 - append tok $c - } - whitespace { - switch -exact -- $state { - multistring-space { - #review - if {$had_slash} { - incr i -2 - } else { - incr i -1 - } - return 1 - } - xxxdottedkey-space { - incr i -1 - return 1 - } - dottedkey-space-tail { - incr i -1 - return 1 - } - default { - error "tomlish Received period during tokenType 'whitespace' [tomlish::parse::report_line]" - } - } - } - starttablename - starttablearrayname { - #This would correspond to an empty table name - error "tomlish Character '.' is not allowed as first character ($tokenType). [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #subtable - split later - review - append tok $c - } - barekey { - #e.g x.y = 1 - #we need to transition the barekey to become a structured table name ??? review - #x is the tablename y is the key - set_token_waiting type dotsep value "." complete 1 startindex $cindex - return 1 - } - default { - error "tomlish Received period during tokenType '$tokenType' [tomlish::parse::report_line]" - #set_token_waiting type period value . complete 1 - #return 1 - } + sp - tab { + switch -- $mode { + preval - postval { + #ignore } - } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} {append tok "\\"} - append tok "." - } - multiliteral-space { - set_tokenType "literalpart" - set tok "." - } - XXXdottedkey-space { - ### obs? - set_tokenType "dotsep" - set tok "." - return 1 - } - dottedkey-space-tail { - ### - set_tokenType "dotsep" - set tok "." - return 1 - } - default { - set_tokenType "untyped_value" - set tok "." - } + unquoted { + #terminates a barekey + lappend segments $seg + set seg "" + set mode "postval" + } + default { + #append to quoted or litquoted + append seg $c + } + } + } + default { + switch -- $mode { + preval { + set mode unquoted + append seg $c + } + postval { + error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" + } + default { + append seg $c } } + } + } + if {$i == $sLen-1} { + #end of data + ::tomlish::log::debug "End of data: mode='$mode'" + switch -exact -- $mode { + preval { + if {[llength $segments]} { + error "tablename_split. Expected a value after last dot separator. tablename: '$tablename'" + } else { + error "tablename_split. Whitespace only? No table name segments found. tablename: '$tablename'" + } + } + unquoted { + if {![tomlish::utils::is_barekey $seg]} { + #e.g toml-test invalid/table/with-pound required to fail for invalid barekey + error "tablename_split. unquoted key segment $seg is not a valid toml key" + } + lappend segments $seg + } + quoted { + error "tablename_split. Expected a trailing double quote. tablename: '$tablename'" + } + litquoted { + error "tablename_split. Expected a trailing single quote. tablename: '$tablename'" + } + postval { + #ok - segment already lappended + } } - " " { - if {[tcl::string::length $tokenType]} { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - barekey { - #todo had_slash - emit token or error - #whitespace is a terminator for bare keys - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - untyped_value { - #unquoted values (int,date,float etc) are terminated by whitespace - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - comment { - if {$had_slash} { - append tok "\\" - } - append tok $c - } - string - dquotedkey { - if {$had_slash} { append tok "\\" } - append tok $c + } + } + + #note - we must allow 'empty' quoted strings '' & "" + # (these are 'discouraged' but valid toml keys) + + return $segments + } + + #tablenames (& tablearraynames) may contain irrelevant leading, trailing and interspersed whitespace + # tablenames can be made up of segments delimited by dots. .eg [ a.b . c ] + #trimmed, the tablename becomes {a.b.c} + # A segment may contain whitespace if it is quoted e.g [a . b . "c etc " ] + #ie whitespace is only irrelevant if it's outside a quoted segment + #trimmed, the tablename becomes {a.b."c etc "} + proc tablename_trim {tablename} { + set segments [tomlish::toml::tablename_split $tablename false] + set trimmed_segments [list] + foreach seg $segments { + lappend trimmed_segments [::string trim $seg " \t"] + } + return [join $trimmed_segments .] + } +} + +namespace eval tomlish::dict { + namespace export {[a-z]*}; # Convention: export all lowercase + namespace path [namespace parent] + + #from_taggedjson + proc from_typedjson {json} { + package require huddle + package require huddle::json + set h [huddle::json::json2huddle parse $json] + #json2huddle parse unescapes the basic json escapes \n \\ etc + #$h could now contain raw form of surrogate pair (json2huddle parse as at 2025-014 doesn't convert the surrogates - just unescapes?) + if {[catch {encoding convertto utf-8 $h} errM]} { + #This test suggests we have raw surrogate pairs - REVIEW + package require punk::cesu + set h [punk::cesu::from_surrogatestring $h] + } + tomlish::dict::from_typedhuddle $h + } + proc from_typedhuddle {h} { + set resultd [dict create] + switch -- [huddle type $h] { + dict { + foreach k [huddle keys $h] { + switch -- [huddle type $h $k] { + dict { + set huddle_d [huddle get $h $k] + #puts stderr "huddle_d: $huddle_d" + #set v [huddle get_stripped $h $k] + if {[tomlish::typedhuddle::is_typeval $huddle_d]} { + dict set resultd $k [tomlish::typedhuddle::convert_typeval_to_tomlish $huddle_d] + } else { + dict set resultd $k [from_typedhuddle $huddle_d] } - stringpart { - #for stringpart we store WS separately for ease of processing continuations (CONT stripping) - if {$had_slash} { - #REVIEW - #emit the stringpart - go back to the slash - incr i -2 - return 1 - } else { - #split into STRINGPART xxx WS " " - incr i -1 - return 1 + } + list { + set items [huddle get $h $k] + + set numitems [huddle llength $items] + if {$numitems == 0} { + dict set resultd $k [list type ARRAY value {}] + } else { + set arritems [list] + for {set i 0} {$i < $numitems} {incr i} { + set item [huddle get $items $i] + #puts stderr "item: $item" + #set v [huddle get $item] + if {[tomlish::typedhuddle::is_typeval $item]} { + lappend arritems [tomlish::typedhuddle::convert_typeval_to_tomlish $item] + } else { + lappend arritems [from_typedhuddle $item] + } } + dict set resultd $k [list type ARRAY value $arritems] } - literal - literalpart - squotedkey { - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - append tok $c - } else { - append tok $c + } + default { + error "dict_from_json unexpected subtype [huddle type $h $k] in dict" + } + } + } + } + list { + set items [huddle get $h] + set numitems [huddle llength $items] + if {$numitems == 0} { + return [list type ARRAY value {}] + } else { + set arritems [list] + for {set i 0} {$i < $numitems} {incr i} { + set item [huddle get $items $i] + #puts stderr "item: $item" + #set v [huddle get $item] + if {[tomlish::typedhuddle::is_typeval $item]} { + lappend arritems [tomlish::typedhuddle::convert_typeval_to_tomlish $item] + } else { + lappend arritems [from_typedhuddle $item] + } + } + return [list type ARRAY value $arritems] + } + + } + } + return $resultd + } + + proc is_typeval {d} { + #designed to detect {type value } e.g {type INT value 3}, {type STRING value "blah etc"} + #as a sanity check we need to avoid mistaking user data that happens to match same form + #consider x.y={type="spud",value="blah"} + #The value of type will itself have already been converted to {type STRING value spud} ie never a single element. + #check the length of the type as a quick way to see it's a tag - not something else masqerading. + expr {[string is dict $d] && [dict size $d] == 2 && [dict exists $d type] && [dict exists $d value] && [llength [dict get $d type]] == 1} + } + + #simple types only - not containers? + proc convert_typeval_to_tomltest {d} { + set dtype [dict get $d type] + set dval [dict get $d value] + switch -- $dtype { + INT { + set testtype integer + set dval [expr {$dval}] ;#convert e.g 0xDEADBEEF to 3735928559 + } + DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - FLOAT - BOOL { + #DDDD + set testtype [string tolower $dtype] + } + STRING - MULTISTRING { + set testtype string + #JJJJ + set dval [tomlish::utils::unescape_string $dval] + set dval [tomlish::utils::rawstring_to_jsonstring $dval] + } + LITERAL - MULTILITERAL { + set testtype string + #don't validate on way out to json here? + #decoder should validate by calling tomlish::from_dict + #if {![tomlish::utils::rawstring_is_valid_literal $dval]} { + # return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + #} + set dval [tomlish::utils::rawstring_to_jsonstring $dval] + } + MULTILITERAL { + #todo - escape newlines for json? + set testtype string + } + default { + error "convert_typeval_to_tomltest unhandled type $dtype" + } + } + return [list type $testtype value $dval] + } + + # Check that each leaf is a typeval or typeval dict + #importantly: must accept empty dict leaves e.g {x {}} + proc is_typeval_dict {d {checkarrays 0}} { + if {![string is dict $d]} { + return 0 + } + dict for {k v} $d { + set is_d 0 + if {!([is_typeval $v] || [set is_d [is_typeval_dict $v $checkarrays]])} { + return 0 + } + if {!$is_d} { + set vtype [dict get $v type] + switch -- $vtype { + INT - FLOAT - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - BOOL - LITERAL - STRING - MULTILITERAL - MULTISTRING {} + ARRAY { + if {$checkarrays} { + set arrdata [dict get $v value] + foreach el $arrdata { + if {![is_typeval_dict $el $checkarrays]} { + return 0 } } - starttablename - starttablearrayname { - incr i -1 - return 1 - } - tablename - tablearrayname { - #include whitespace in the tablename/tablearrayname - #Will need to be normalized upon interpreting the tomlish as a datastructure - append tok $c - } - default { - error "tomlish Received whitespace space during tokenType '$tokenType' [tomlish::parse::report_line]" - } } + } + default { + puts stderr "is_typeval_dict: Unexpected type '$vtype'" + return 0 + } + } + } + } + return 1 + } + + proc last_tomltype_posn {d} { + set last_simple -1 + set dictposn [expr {[dict size $d] -1}] + foreach k [lreverse [dict keys $d]] { + set dval [dict get $d $k] + if {[is_typeval $dval]} { + set last_simple $dictposn + break + } + incr dictposn -1 + } + return $last_simple + } + + + #review + proc name_from_tablestack {tablestack} { + set name "" + foreach tinfo [lrange $tablestack 1 end] { + lassign $tinfo type namepart + switch -- $type { + T { + if {$name eq ""} { + append name $namepart } else { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $state { - tablename-state { - #tablename can have leading,trailing and interspersed whitespace! - #These will not be treated as whitespace tokens, instead forming part of the name. - set_tokenType tablename - set tok "" - if {$had_slash} {append tok "\\"} - append tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok "" - if {$had_slash} {append tok "\\"} - append tok $c - } - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - set_tokenType "whitespace" - append tok $c - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - default { - if {$had_slash} { - error "tomlish unexpected backslash [tomlish::parse::report_line]" - } - set_tokenType "whitespace" - append tok $c - } + append name .$namepart + } + } + I { + if {$name eq ""} { + append name $namepart + } else { + append name .$namepart + } + } + default { + #end at first break in the leading sequence of T & I tablenames + break + } + } + } + return $name + } + + + #tablenames_info is a flat dict with the key being an '@@' path + proc _show_tablenames {tablenames_info} { + #e.g {@l@a @@b} {ttype header_table tdefined closed} + append msg \n "tablenames_info:" \n + dict for {tkey tinfo} $tablenames_info { + append msg " " "table: $tkey" \n + dict for {field finfo} $tinfo { + append msg " " "$field $finfo" \n + } + } + return $msg + } + + #take a raw string and classify: result is a 2 element list comprised of KEY|SQKEY|DQKEY and the value being the appropriate inner string + proc classify_rawkey {rawval} { + if {![::tomlish::utils::is_barekey $rawval]} { + #requires quoting + # + #Any dot in the key would have been split by dict::from_tomlish - so if it's present here it's part of this key - not a level separator! + # + #we'll use a basic mechanisms for now to determine the type of quoting + # - whether it has any single quotes or not. + # (can't go in an SQKEY) + # - whether it has any chars that require quoting when in a Bstring + # (if so - then its visual representation might be unsuitable for a key in a toml text file, so escape and put in DQKEY instead of literal SQKEY) + #todo - more? + #REVIEW - the backslash might often be in things like a regex or windows path - which is often better expressed in a literal SQKEY + # from literal examples: + # 'c:\Users\nodejs\templates' + # '<\i\c*\s*>' + #If these are in *keys* our basic test will express these as: + # "c:\\Users\\nodejs\\templates" + # "<\\i\\c*\\s*>" + # This still works - but a smarter test might determine when SQKEY is the better form? + #when coming from external systems - can we even know if the value was already escaped? REVIEW + #Probably when coming from json - we know it's already escaped - and so we build our dict converting keys to unescaped + #TODO - clarify in documentation that keys resulting from dict::from_tomlish are in 'normalized' (unescaped) form + # + #For keys - we currently (2025) are only allowed barekeys,basic strings and literal strings. (no multiline forms) + set k_escaped [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $rawval] + if {[string length $k_escaped] != [string length $rawval]} { + #escaping made a difference + set has_escape_requirement 1 + } else { + set has_escape_requirement 0 + } + if {[string first ' $rawval] >=0 || $has_escape_requirement} { + #basic string + # (any ANSI SGR sequence will end up here in escaped form ) + return [list DQKEY $k_escaped] + } else { + #literal string + return [list SQKEY $rawval] + } + } else { + return [list KEY $rawval] + } + } + #the quoting implies the necessary escaping for DQKEYs + proc join_and_quote_rawkey_list {rawkeylist} { + set result "" + foreach rk $rawkeylist { + lassign [tomlish::dict::classify_rawkey $rk] type val + switch -- $type { + SQKEY { + append result "'$val'." + } + DQKEY { + append result "\"$val\"." + } + KEY { + append result "$val." + } + } + } + return [string range $result 0 end-1] + } + + proc _process_tomlish_dottedkey {element {context_refpath {}}} { + upvar tablenames_info tablenames_info + upvar datastructure datastructure + set dottedtables_defined [list] + set dkey_info [tomlish::get_dottedkey_info $element] + #e.g1 keys {x.y y} keys_raw {'x.y' "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) + #e.g2 keys {x.y y} keys_raw {{"x.y"} "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) + + #[a.b] + #t1.t2.dottedtable.leafkey = "val" + #we have already checked supertables a & {a b} + # - in basic case, passed in context_refpath as {@@a @@b} + # - our context_refpath could also include some combination of keys and array indices e.g {@@a @@b 3 @@subtablekey} + #We need to check {a b t1} & {a b t2} ('creation' only) + #and then 'dottedtable' is 'defined' while leafkey is an ordinary key in dottedtable + + #note we also get here as a 'dottedkey' with the following even though there is no dot in k + #[a.b] + #leafkey = "val" + + set all_dotted_keys [dict get $dkey_info keys] + set dottedkeyname [join $all_dotted_keys .] + + if {[llength $all_dotted_keys] > 1} { + #dottedtable.k=1 + #tX.dottedtable.k=1 + #etc + + #Wrap in a list so we can detect 'null' equivalent. + #We can't use empty string as that's a valid dotted key segment + set dottedtable_bag [list [lindex $all_dotted_keys end-1]] + set dotparents [lrange $all_dotted_keys 0 end-2] + } else { + #basic case - not really a 'dotted' key + #k = 1 + set dottedtable_bag [list] ;#empty bag + set dotparents [list] + } + #assert dottedtable_bag only ever holds 0 or 1 elements + set leaf_key [lindex $all_dotted_keys end] + + #see also: https://github.com/toml-lang/toml/issues/846 "Can dotted keys insert into already-defined [tables]?" + #This code was originally written with a misinterpretation of: + #"Dotted keys create and define a table for each key part before the last one, provided that such tables were not previously created." + # 'each key part before the last one' refers to each key in a single dotted key entry + # not each 2nd-to last key in a list of dotted keys. + + + #we've already tested the table/tablearray keys that got us here.. but not the dottedkey segments (if any) prior to dottedtable & leaf_key + set dottedsuper_refpath $context_refpath + foreach normkey $dotparents { + lappend dottedsuper_refpath @@$normkey + if {![dict exists $tablenames_info $dottedsuper_refpath ttype]} { + #supertable with this combined path (context_path plus parts of dottedkey) not yet 'created' + if {[tomlish::dict::path::exists $datastructure $dottedsuper_refpath]} { + #There is data so it must have been created as a keyval + set msg "Path $dottedsuper_refpath for dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #dict set tablenames_info $dottedsuper_refpath ttype unknown_table ;#REVIEW + dict set tablenames_info $dottedsuper_refpath ttype unknown_dotted ;#REVIEW + + #see note above re dotted keys insert into already defined table - we need to 'define' all the dotted supers in this block + lappend dottedtables_defined $dottedsuper_refpath + + #ensure empty tables are still represented in the datastructure + tomlish::dict::path::set_endpoint datastructure $dottedsuper_refpath {} ;#set to empty subdict + } else { + #added for fixed assumption + set ttype [dict get $tablenames_info $dottedsuper_refpath ttype] + set definedstate [dictn getdef $tablenames_info [list $dottedsuper_refpath tdefined] NULL] + switch -- $ttype { + dottedkey_table - unknown_dotted { + #'created' as dotted - but make sure it's from this header section - i.e defined not set + if {$definedstate ne "NULL"} { + #collision with some other dottedkey + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + itable { + #itables are immediately defined + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' as itable (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + default { + #header_table, header_tablearray or unknown_header + #is header_tablearray any different from header_table in this context? + #we don't set tdefined for tablearray anyway - so should be ok here. + if {$definedstate ne "NULL"} { + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' in a header (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg } } } - tab { - if {[tcl::string::length $tokenType]} { - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) - set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - barekey { - #whitespace is a terminator for bare keys - incr i -1 - #set_token_waiting type whitespace value $c complete 1 - return 1 - } - untyped_value { - #unquoted values (int,date,float etc) are terminated by whitespace - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - squotedkey { - append tok $c - } - dquotedkey - string - comment - whitespace { - #REVIEW - append tok $c - } - stringpart { - #for stringpart we store WS separately for ease of processing continuations (CONT stripping) - if {$had_slash} { - #REVIEW - #emit the stringpart - go back to the slash - incr i -2 - return 1 - } else { - #split into STRINGPART aaa WS " " - incr i -1 - return 1 - } - } - literal - literalpart { - append tok $c - } - starttablename - starttablearrayname { - incr i -1 - return 1 - } - tablename - tablearrayname { - #include whitespace in the tablename/tablearrayname - #Will need to be normalized upon interpreting the tomlish as a datastructure - append tok $c - } - default { - error "tomlish Received whitespace tab during tokenType '$tokenType' [tomlish::parse::report_line]" - } + } + } + + #dottedtable being 2nd last segment was for original assumption - todo - tidy up? we are duplicating the logic above + #review - any need/advantage to treat 2nd to last key any different from other supers? ie D in a.b.c.D.key=1 + #no need for 'unknown_dotted' vs 'dottedkey_table' ?? + if {[llength $dottedtable_bag] == 1} { + set dottedtable [lindex $dottedtable_bag 0] + set dottedkey_refpath [list {*}$dottedsuper_refpath "@@$dottedtable"] + #our dotted key is attempting to define a table + if {![dict exists $tablenames_info $dottedkey_refpath ttype]} { + #first one - but check datastructure for collisions + if {[tomlish::dict::path::exists $datastructure $dottedkey_refpath]} { + set msg "Path $dottedkey_refpath for dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #'create' the table + dict set tablenames_info $dottedkey_refpath ttype dottedkey_table + #don't actually set 'defined' here.. use the end of TABLE record to close them off by looking at this list + tomlish::dict::path::set_endpoint datastructure $dottedkey_refpath {} + lappend dottedtables_defined $dottedkey_refpath + + # + } else { + #exists - but might be from another dottedkey within the current header section + #the table is open for adding keys until the next 'header' section ([tablename] / [[tablearray]]) + #check for 'defined' closed (or just existence) + set ttype [dict get $tablenames_info $dottedkey_refpath ttype] + set definedstate [dictn getdef $tablenames_info [list $dottedkey_refpath tdefined] NULL] + switch -- $ttype { + dottedkey_table - unknown_dotted { + #'created' as dotted - but make sure it's from this header section - i.e defined not set + if {$definedstate ne "NULL"} { + #collision with some other dottedkey + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg } - } else { - set had_slash $slash_active - if {$slash_active} { - set slash_active 0 + } + itable { + #itables are immediately defined + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' as itable (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + default { + #header_table, header_tablearray or unknown_header + #is header_tablearray any different from header_table in this context? + #we don't set tdefined for tablearray anyway - so should be ok here. + if {$definedstate ne "NULL"} { + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' in a header (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg } - switch -exact -- $state { - tablename-state { - #tablename can have leading,trailing and interspersed whitespace! - #These will not be treated as whitespace tokens, instead forming part of the name. - set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - set_tokenType whitespace - append tok $c - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c + } + } + } + } else { + set dottedkey_refpath $dottedsuper_refpath + } + #assert - dottedkey represents a key val pair that can be added + + + set fullkey_refpath [list {*}$dottedkey_refpath @@$leaf_key] + if {[tomlish::dict::path::exists $datastructure $fullkey_refpath]} { + set msg "Duplicate key. The key (path $fullkey_refpath) already exists at this level in the toml data. The toml data is not valid." + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + + #set keyval_dict [_get_keyval_value $element] + lassign [_get_keyval_value $element] _ keyval_dict _ sub_tablenames_info + + + #keyval_dict is either a {type value } + #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level + #punk::dict::is_typeval can distinguish + tomlish::log::debug "_process_tomlish_dottedkey>>> context:$context_refpath dottedkey $dottedkeyname kv: $keyval_dict" + tomlish::dict::path::set_endpoint datastructure $fullkey_refpath $keyval_dict + + #remove ? + #if {![tomlish::dict::is_typeval $keyval_dict]} { + # #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys + # # inner structure will contain {type value } if all leaves are not empty ITABLES + # ##set tkey [list {*}$norm_segments {*}$all_dotted_keys] + + # #by not creating a tablenames_info record - we effectively make it closed anyway? + # #it should be detected as a key + # #is there any need to store tablenames_info for it?? + # #REVIEW + + # ##TODO - update? + # #dictn incr tablenames_info [list $tkey seencount] + # ##if the keyval_dict is not a simple type x value y - then it's an inline table ? + # ##if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. + # #dictn set tablenames_info [list $tkey closed] 1 + #} + return [dict create dottedtables_defined $dottedtables_defined] + } + #tomlish::dict::from_tomlish is a *basic* programmatic datastructure for accessing the data. + # produce a dictionary of keys and values from a tomlish tagged list. + # dict::from_tomlish is primarily for read access to toml data. + #Extraneous (not within quoted sections) whitespace and comments are not preserved in this structure, + # so a roundtrip from toml to this datastructure and back to toml will lose whitespace formatting and comments. + # creating/changing toml values can be done directly on a tomlish list if preserving (or adding) formatting/comments is desired. + #A separate package 'tomlish::object' may be needed to allow easier programmatic creating/updating/deleting of data elements whilst preserving (or adding or selectively deleting/editing) such formatting. + # + + #within an ARRAY, we store a list of items such as plain dicts (possibly empty) and {type value } for simple types + #(ARRAYS can be mixed type) + #This means our dict structure should have only ARRAY and simple types which need to be in {type value } form + #A dict within an array encodeded as a type ITABLE value should also parse - but is the unpreferred form - REVIEW test? + + #Namespacing? + #ie note the difference: + #[Data] + #temp = { cpu = 79.5, case = 72.0} + # versus + #[Data] + #temps = [{cpu = 79.5, case = 72.0}] + proc from_tomlish {tomlish} { + package require dictn + + #keep track of which tablenames have already been directly defined, + # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' + #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. + #we don't error out just because a previous tablename segment has already appeared. + + #Declaring, Creating, and Defining Tables + #https://github.com/toml-lang/toml/issues/795 + #(update - only Creating and Defining are relevant terminology) + + #review + #tablenames_info keys ttype created, tdefined, createdby, definedby, closedby ??? review keys + # [tname] = header_table [[tname]] = header_tablearray + + #consider the following 2 which are legal: + #[table] #'table' created, defined=open type header_table + #x.y = 3 + #[table.x.z] #'table' tdefined=closed closedby={header_table table.x.z}, 'table.x' created, 'table.x.z' created tdefined=open tdefinedby={header_table table.x.z} + #k= 22 + # #'table.x.z' tdefined=closed closedby={eof eof} + + #equivalent datastructure + + #[table] #'table' created, tdefined=open definedby={header_table table} + #[table.x] #'table' tdefined=closed closedby={header_table table.x}, 'table.x' created tdefined=open definedby={header_table table.x} + #y = 3 + #[table.x.z] #'table.x' tdefined=closed closedby={header_table table.x.z}, 'table.x.z' created tdefined=open definedby={header_table table.x.z} + #k=22 + + #illegal + #[table] #'table' created and tdefined=open + #x.y = 3 #'table.x' created first keyval pair tdefined=open definedby={keyval x.y = 3} + #[table.x.y.z] #'table' tdefined=closed, 'table.x' closed because parent 'table' closed?, 'table.x.y' cannot be created + #k = 22 + # + ## - we would fail on encountering table.x.y because only table and table.x are effectively tables - but that table.x is closed should be detected (?) + + #illegal + #[table] + #x.y = {p=3} + #[table.x.y.z] + #k = 22 + ## we should fail because y is an inline table which is closed to further entries + + #note: it is not safe to compare normalized tablenames using join! + # e.g a.'b.c'.d is not the same as a.b.c.d + # instead compare {a b.c d} with {a b c d} + # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. + #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' + #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} + + + + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info + } else { + set tablenames_info [dict create] ;#keyed on tablepath each of which is an @@path such as {@@config @@subgroup @@etc} (corresponding to config.subgroup.etc) + #also has non @@ indexes which are list indexes as taken by tcl list commands (int or end-1 etc) + #value is a dict with keys: ttype, tdefined + } + + + log::info "---> dict::from_tomlish processing '$tomlish'<<<" + set items $tomlish + + foreach lst $items { + if {[lindex $lst 0] ni $::tomlish::tags} { + error "supplied list does not appear to be toml parsed into a tomlish tagged list. Run tomlish::decode::toml on the raw toml data to produce a tomlish list" + } + } + + if {[lindex $tomlish 0] eq "TOMLISH"} { + #ignore TOMLISH tag at beginning + set items [lrange $tomlish 1 end] + } + + set datastructure [dict create] + set dottedtables_defined [list] + foreach item $items { + set tag [lindex $item 0] + #puts "...> item:'$item' tag:'$tag'" + switch -exact -- $tag { + KEY - DQKEY - SQKEY - INT - FLOAT - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - STRING - LITERAL { + #why would we get individual key item as opposed to DOTTEDKEY? + error "tomlish::dict::from_tomlish error: invalid tag: $tag. At the toplevel, from_tomlish can only process WS NEWLINE COMMENT and compound elements DOTTEDKEY TABLE TABLEARRAY ITABLE MULTILITERAL MULTISTRING" + } + DOTTEDKEY { + #toplevel dotted key + set dkinfo [_process_tomlish_dottedkey $item] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] + #at any level - we don't expect any more DOTTEDKEY records in a tomlish structure after TABLE or TABLEARRAY are encountered + #as those records should encapsulate their own dottedkeys + + } + TABLEARRAY { + #close off any dottedtables_defined created by dottedkeys at this level + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + set dottedtables_defined [list] ;#for closing off at end by setting 'defined' + + set tablearrayname [lindex $item 1] + tomlish::log::debug "---> tomlish::dict::from_tomlish processing item TABLENAME (name: $tablearrayname): $item" + set norm_segments [::tomlish::toml::tablename_split $tablearrayname true] ;#true to normalize + #we expect repeated tablearray entries - each adding a sub-object to the value, which is an array/list. + #tablearrayname is likely to appear multiple times - so unlike a TABLE we don't check for 'defined' for the full name as an indicator of a problem + set supertable [list] + ############## + # [[a.b.c.d]] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + set refpath [list] ;#e.g @@j1 @@j2 1 @@k1 end + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + lappend refpath @@$normseg + if {![dict exists $tablenames_info $refpath ttype]} { + #supertable with this path doesn't yet exist + if {[tomlish::dict::path::exists $datastructure $refpath]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] of tablearray name $tablearrayname already has data but doesn't appear to be a table - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablearray_supertable_keycollision + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } else { + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #review - we can't later specify as tablearray so should just set ttype to header_table even though it's being created + # because of a tablearray header? + #By setting ttype to something other than table_header we can provide more precise errorCode/msg ?? + dict set tablenames_info $refpath ttype unknown_header + #ensure empty tables are still represented in the datastructure + dict set datastructure {*}$supertable [list] } - default { - set_tokenType "whitespace" - append tok $c + } else { + #REVIEW!! + # what happens with from_toml {[[a.b.c]]} {[[a.b]]} ??? + #presumed that a and a.b were 'created' as tables (supertables of tablearray at a.b.c) and can't now be redefined as tablearrays + + #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable + #but if it's a tablearray - we need to point to the most 'recently defined table element of the array' + #(last member of that array - need to check type? allowed to have non-table elements ie nonhomogenous??) + set supertype [dict get $tablenames_info $refpath ttype] + if {$supertype eq "header_tablearray"} { + #exercised by toml-tests: + # valid/table/array-table-array + # valid/table/array-nest + + #puts stdout "todict!!! TABLEARRAY nesting required for supertable [join $supertable .]" + + #'refer' to the appropriate element in existing array + set arrdata [tomlish::dict::path::get $datastructure [list {*}$refpath @@value]] + set idx [expr {[llength $arrdata]-1}] + if {$idx < 0} { + #existing tablearray should have at least one entry even if empty (review) + set msg "reference to empty tablearray?" + return -code error -errorcode {TOMLISH STRUCTURE REFTOEMPTYTABLEARRAY} $msg + } + lappend refpath $idx } } } - } - bom { - #BOM (Byte Order Mark) - ignored by token consumer - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - #assert - tok will be one or two squotes only - #A toml literal probably isn't allowed to contain this - #but we will parse and let the validator sort it out. - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal - literalpart { - append tok $c - } - string - stringpart { - append tok $c - } - default { - #state machine will generally not have entry to accept bom - let it crash - set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex - return 1 - } + # + #puts "TABLE supertable refpath $refpath" + lappend refpath @@[lindex $norm_segments end] + tomlish::log::debug "TABLEARRAY refpath $refpath" + set tablearray_refpath $refpath + + + if {![dict exists $tablenames_info $tablearray_refpath ttype]} { + #first encounter of this tablearrayname + if {[tomlish::dict::path::exists $datastructure $tablearray_refpath]} { + #e.g from_toml {a=1} {[[a]]} + set msg "Cannot create tablearray name $tablearrayname. Key already has data but key doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablearray_direct_keycollision_error + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg } + #no collision - we can create the tablearray and the array in the datastructure + dict set tablenames_info $tablearray_refpath ttype header_tablearray + #dict set datastructure {*}$norm_segments [list type ARRAY value {}] + #create array along with empty array-item at position zero + tomlish::dict::path::set_endpoint datastructure $tablearray_refpath [list type ARRAY value {{}}] + set arrayitem_refpath [list {*}$tablearray_refpath 0] + #set ARRAY_ELEMENTS [list] } else { - switch -exact -- $state { - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - multistring-space { - set_tokenType "stringpart" - set tok $c - } - default { - set_tokenType "bom" - set tok "\uFEFF" - return 1 + #we have an existing tablenames_info record for this path - but is it a tablearray? + set ttype [dict get $tablenames_info $tablearray_refpath ttype] + if {$ttype ne "header_tablearray"} { + #header_table or itable + switch -- $ttype { + itable {set ttypename itable} + header_table {set ttypename table} + dottedkey_table {set ttypename dottedkey_table} + unknown_header - unknown_dotted { + #table was created e.g as supertable - but not specifically a tablearray + #violates ordering - return specific test error + set msg "Table $tablearrayname referenced as supertable before tablearray defined (ordering)" + return -code error -errorcode {TOMLISH STRUCTURE TABLEARRAYORDERING} $msg + } + default {error "unrecognised type $ttype - expected header_table or itable"} } + set msg "tablearray name $tablearrayname already appears to be already created as '$ttypename' not tablearray - invalid?" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg } + #EXISTING tablearray + #add to array + #error "add_to_array not implemented" + #{type ARRAY value } + #set ARRAY_ELEMENTS [dict get $datastructure {*}$norm_segments value] + tomlish::log::debug ">>>>pre-extend-array dict::from_tomlish datastructure: $datastructure" + set existing_array [tomlish::dict::path::get $datastructure [list {*}$tablearray_refpath @@value]] + set arrayitem_refpath [list {*}$tablearray_refpath [llength $existing_array]] + tomlish::dict::path::lappend datastructure $tablearray_refpath {} + tomlish::log::debug ">>>>post-extend-array dict::from_tomlish datastructure: $datastructure" } - } - default { - if {[tcl::string::length $tokenType]} { - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. - set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - whitespace { - if {$state eq "multistring-space"} { - incr i -1 - return 1 - } else { - #review - incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. - return 1 - } - } - barekey { - if {[tomlish::utils::is_barekey $c]} { - append tok $c - } else { - error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed. (see tomlish::utils::is_barekey) [tomlish::parse::report_line]" - } + + #set object [dict create] ;#array context equivalent of 'datastructure' + + #add to ARRAY_ELEMENTS and write back in to datastructure. + foreach element [lrange $item 2 end] { + set type [lindex $element 0] + tomlish::log::debug "----> todict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element $arrayitem_refpath] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] } - starttablename - starttablearrayname { - incr i -1 - #allow statemachine to set context for subsequent chars - return 1 + NEWLINE - COMMENT - WS { + #ignore } - stringpart { - append tok $c + TABLE { + #we *perhaps* should be able to process tablearray subtables either as part of the tablearray record, or independently. + #(or even a mixture of both, although that is somewhat an edge case, and of limited utility) + #[[fruit]] + #x=1 + # [fruit.metadata] + # [fruit.otherdata] + + #when processing a dict destined for the above - the tomlish generator (e.g from_dict) + #should create as 1 or 3 records (but could create 2 records if there was an unrelated table in between the subtables) + #choices: all in tablearray record, tablearray + 1 or 2 table records. + # + #We are going the other way here - so we just need to realise that the list of tables 'belonging' to this tablearray might not be complete. + # + #the subtable names must be prefixed with the tablearray - we should validate that for any contained TABLE records + + #The default mechanism is for from_dict to produce tomlish with separate TABLE records - and use the ordering to determine membership + #If we were to support wrapping the TABLE records within a TABLEARRAY - we should also support TABLEARRAY within TABLEARRAY + # ----------------------------------------------------------------------- + #Implementing this is not critical for standard encoding/decoding of toml! + #It would be an alternative form for the tomlish intermediate form - and adds complexity. + # + #The upside would be to provide a function for sorting/rearranging in the tomlish form if all records were fully encapsulated. + #A possible downside is that unrelated tables placed before a tablearray is fully defined (within the tablearray definition area in toml) + # would have to be re-positioned before or after the encapsulated tablearray record. + # While unrelated tables in such a position aren't a recommended way to write toml, they appear to be valid + # and preserving the author's ordering is a goal of the basic encoding/decoding operations if no explicit sorting/reordering was requested. + # + #Consider an 'encapsulate' method to this (tomlish -> tomlish) + # ----------------------------------------------------------------------- + #todo + error "tomlish::dict::from_tomlish TABLE element within TABLEARRAY not handled - TABLE should be a separate tomlish record" } default { - #e.g comment/string/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname - append tok $c + error "tomlish::dict::from_tomlish Sub element of type '$type' not understood in tablearray context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" } } - } else { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $state { - table-space - itable-space { - #if no currently active token - assume another key value pair - if {[tomlish::utils::is_barekey $c]} { - set_tokenType "barekey" - append tok $c - } else { - error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" - } - } - multistring-space { - set_tokenType "stringpart" - if {$had_slash} { - set tok \\$c - } else { - set tok $c - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - tablename-state { - set_tokenType "tablename" - set tok $c - } - tablearrayname-state { - set_tokenType "tablearrayname" - set tok $c - } - dottedkey-space { - set_tokenType barekey - set tok $c - } - default { - #todo - something like ansistring VIEW to show control chars? - set cshow [string map [list \t tab \v vt] $c] - tomlish::log::debug "- tokloop char '$cshow' setting to untyped_value while state:$state [tomlish::parse::report_line]" - set_tokenType "untyped_value" - set tok $c + } + + #end of TABLEARRAY record - equivalent of EOF or next header - close off the dottedtables + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + } + TABLE { + #close off any dottedtables_defined created by dottedkeys at this level + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + set tablename [lindex $item 1] + set dottedtables_defined [list] ;#for closing off at end by setting 'defined' + #As our TABLE record contains all it's child DOTTEDKEY records - this should be equivalent to setting them as defined at EOF or next header. + + #----------------------------------------------------------------------------------- + #default assumption - our reference is to the main tablenames_info and datastructure + #Will need to append keys appropriately if we have recursed + #----------------------------------------------------------------------------------- + + log::debug "---> tomlish::dict::from_tomlish processing item TABLE (name: $tablename): $item" + set norm_segments [::tomlish::toml::tablename_split $tablename true] ;#true to normalize + + + + set name_segments [::tomlish::toml::tablename_split $tablename 0] ;#unnormalized e.g ['a'."b".c.d] -> 'a' "b" c d + #results of tablename_split 0 are 'raw' - ie some segments may be enclosed in single or double quotes. + + + set supertable [list] + ############## + # [a.b.c.d] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + + ############## + #[[a]] + #[a.b] #supertable a is tablearray + ############## + + #also consider + ############## + # [[a.b]] + # [a.b.c.d] #supertable a is a table, supertable a.b is tablearray, supertable a.b.c is elementtable + ############## + set refpath [list] ;#e.g @@j1 @@j2 1 @@k1 end + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + lappend refpath @@$normseg + if {![dict exists $tablenames_info $refpath ttype]} { + #supertable with this path doesn't yet exist + if {[tomlish::dict::path::exists $datastructure $refpath]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] of table name $tablename (path $refpath) already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #we also don't know whether it's a table or a dottedkey_table (not allowed to be tablearray - out of order?) + dict set tablenames_info $refpath ttype unknown_header + #ensure empty tables are still represented in the datastructure + #dict set datastructure {*}$supertable [list] + tomlish::dict::path::set_endpoint datastructure $refpath {} + } else { + #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable + if {[dict get $tablenames_info $refpath ttype] eq "header_tablearray"} { + #'refer' to the appropriate element in existing array + set arrdata [tomlish::dict::path::get $datastructure [list {*}$refpath @@value]] + set idx [expr {[llength $arrdata]-1}] + if {$idx < 0} { + #existing tablearray should have at least one entry even if empty (review) + set msg "reference to empty tablearray?" + return -code error -errorcode {TOMLISH STRUCTURE REFTOEMPTYTABLEARRAY} $msg + } + lappend refpath $idx + } else { + #?? + if {[dictn getdef $tablenames_info [list $refpath tdefined] NULL] eq "NULL"} { + } else { + } } } } - } - } + #puts "TABLE supertable refpath $refpath" + lappend refpath @@[lindex $norm_segments end] + tomlish::log::info "TABLE refpath $refpath" + set table_refpath $refpath - } - #run out of characters (eof) - if {[tcl::string::length $tokenType]} { - #check for invalid ending tokens - #if {$state eq "err-state"} { - # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" - #} - switch -exact -- $tokenType { - _start_squote_sequence { - set toklen [tcl::string::length $tok] - switch -- $toklen { - 1 { - #invalid eof with open literal - error "tomlish eof reached without closing single quote for string literal. [tomlish::parse::report_line]" + + + #table [a.b.c.d] hasn't been defined - but may have been 'created' already by a longer tablename + # - or may have existing data from a keyval + if {![dict exists $tablenames_info $table_refpath ttype]} { + if {[tomlish::dict::path::exists $datastructure $table_refpath]} { + #e.g from_toml {a=1} {[a]} + set msg "Cannot create table name $tablename. Key already has data but key doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablename_keyval_collision_error + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg } - 2 { - set_tokenType "literal" - set tok "" - return 1 + #no data or previously created table + dict set tablenames_info $table_refpath ttype header_table - ##review - #set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] - #set_tokenType "literal" - #set tok "" - #return 1 + #We are 'defining' this table's keys and values here (even if empty) + #dict set datastructure {*}$norm_segments [list] ;#ensure table still represented in datastructure even if we add no keyvals here + tomlish::dict::path::set_endpoint datastructure $table_refpath {} ;#ensure table still represented in datastructure even if we add no keyvals here + } else { + if {[dict get $tablenames_info $table_refpath ttype] eq "header_tablearray"} { + #e.g tomltest invalid/table/duplicate-table-array2 + #[[tbl]] + #[tbl] + set msg "Table name $tablename has already been created as a tablearray. Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } else { + #any other type tdefined is a problem + set T_DEFINED [dictn getdef $tablenames_info [list $table_refpath tdefined] NULL] + if {$T_DEFINED ne "NULL" } { + #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path + set msg "Table name $tablename has already been defined in the toml data. Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } } } - } - _start_dquote_sequence { - set toklen [tcl::string::length $tok] - switch -- $toklen { - 1 { - #invalid eof with open string - error "tomlish eof reached without closing double quote for string. [tomlish::parse::report_line]" - } - 2 { - set_tokenType "string" - set tok "" - return 1 + dict set tablenames_info $table_refpath tdefined open + + #now add the contained elements + foreach element [lrange $item 2 end] { + set type [lindex $element 0] + log::debug "----> todict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element $table_refpath] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] + } + NEWLINE - COMMENT - WS { + #ignore + } + default { + error "Sub element of type '$type' not understood in table context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" + } } } - } - newline { - #The only newline token that has still not been returned should have a tok value of "cr" - puts "tomlish eof reached - with incomplete newline token '$tok'" - if {$tok eq "cr"} { - #we convert lone cr to it's own "cr" token elsewhere in the document to allow statemachine to handle it. - #(which it should generally do by not handling it ie raising an error - or emitting an ERROR list in the tomlish) - #if trailing char is a lone cr - we should encode it the same way as elsewhere that is outside of values - # ie as it's own token. - switch_tokenType "cr" - return 1 - } else { - #should be unreachable - error "tomlish eof reached - with invalid newline token. value: $tok" - } - } - } - set_token_waiting type eof value eof complete 1 startindex $i ;#review - return 1 - } else { - ::tomlish::log::debug "- No current tokenType, ran out of characters, setting tokenType to 'eof' [tomlish::parse::report_line]" - set tokenType "eof" - set tok "eof" - } - return 0 - } - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::parse ---}] -} -namespace eval tomlish::tdictn { - #access tomlish dict structure - namespace export {[a-z]*}; # Convention: export all lowercase - #access with path such as: @@k @@k 0 @@k end where dict keys marked with @@ and plain values are list indices into in {type ARRAY value } - #leaf elements returned as structured {type value } - proc get {dictval {path {}}} { - if {$path eq ""} { - return $dictval - } - ::set data $dictval - ::set pathsofar [list] - foreach p $path { - ::lappend pathsofar $p - if {[string range $p 0 1] eq "@@"} { - ::set data [dict get $data [string range $p 2 end]] - } else { - if {![tomlish::dict::is_tomlish_typeval $data]} { - error "tdictn::get error bad path $path. Attempt to access table as array at subpath $pathsofar." - } - if {[dict get $data type] ne "ARRAY"} { - error "tdictn::get error bad path $path. Subpath $pathsofar is not an array." - } - ::set arrdata [dict get $data value] - ::set data [lindex $arrdata $p] - } - } - return $data - } - proc exists {dictval path} { - ::set data $dictval - ::set pathsofar [list] - ::set exists 1 - foreach p $path { - ::lappend pathsofar $p - if {[string range $p 0 1] eq "@@"} { - ::set k [string range $p 2 end] - if {![dict exists $data $k]} { - return 0 - } - ::set data [dict get $data $k] - } else { - if {![tomlish::dict::is_tomlish_typeval $data]} { - return 0 - } - if {[dict get $data type] ne "ARRAY"} { - return 0 - } - ::set arrdata [dict get $data value] - ::set intp [punk::lib::lindex_resolve_basic $arrdata $p] ;#handle index math (end-1 etc) - if {$intp == -1} { - #out of bounds - return 0 - } - ::set data [lindex $arrdata $p] - } - } - return $exists - } - - #only endpoints - don't create intermediate paths? - proc set_endpoint {dictvariable path value} { - upvar $dictvariable originaldata - ::set data $originaldata - ::set pathsofar [list] - if {![tomlish::dict::is_tomlish_typeval $val]} { - error "tdictn::set error - value must already be in the tomlish form {type value }" - } - foreach p $path { - ::lappend pathsofar $p - if {[string range $p 0 1] eq "@@"} { - ::set k [string range $p 2 end] - if {![dict exists $data $k]} { - error "tdictn::set error bad path $path. Attempt to access nonexistent element at subpath $pathsofar." + #end of TABLE record - equivalent of EOF or next header - close off the dottedtables + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } } - ::set varname v[incr v] + ITABLE { + #As there is no other mechanism to create tables within an ITABLE than dottedkeys + # and ITABLES are fully defined/enclosed - we can rely on key collision and don't need to track dottedtables_defined - REVIEW. + set dottedtables_defined [list] + #SEP??? + #ITABLE only ever on RHS of = or inside ARRAY + set datastructure [dict create] + set tablenames_info [dict create] - if {$pathsofar eq $path} { - #see if endpoint of the path given already exists - if {[dict exists $data $k]} { - ::set endpoint [dict get $data $k] - if {[tomlish::dict::is_tomlish_typeval $endpoint]} { - error "tdictn::set_endpoint error Unable to overwrite subpath $pathsofar. Existing value not {type value tododict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element] + } + NEWLINE - COMMENT - WS { + #ignore + } + default { + error "Sub element of type '$type' not understood in ITABLE context. Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" + } } - #temp debug - puts stderr "overwriting [dict get $data k]" } - ::set $varname $value - dict set vdict $pathsofar $varname - break - } - ::set data [dict get $data $k] - ::set $varname $data - dict set vdict $pathsofar $varname - } else { - if {![tomlish::dict::is_tomlish_typeval $data]} { - error "tdictn::set_endpoint error bad path $path. Attempt to access table as array at subpath $pathsofar." - } - if {[dict get $data type] ne "ARRAY"} { - error "tdictn::set_endpoint error bad path $path. Subpath $pathsofar is not an array." } - ::set varname v[incr v] - if {$pathsofar eq $path} { - if {[dict get $data type] ne "ARRAY"} { - error "tdictn::lappend error bad path $path. Parent of subpath $pathsofar is not an array." - } - ::set parentarray [dict get $data value] - set idx [punk::lib::lindex_resolve_basic $parentarray $p] - if {$idx == -1} { - error "tdictn::set_endpoint error bad path $path. No existing element at $p" - } - ::set endpoint [lindex $parentarray $p] - if {[dict get $endpoint type] eq "ARRAY"} { - #disallow overwriting array - unless given value is an ARRAY? REVIEW - if {[dict get $value type] ne "ARRAY"} { - error "tdictn::set_endpoint error bad path $path. Cannot overwrite array with non-array $value" + ARRAY { + #arrays in toml are allowed to contain mixtures of types + set datastructure [list] + log::debug "--> processing array: $item" + + foreach element [lrange $item 1 end] { + set type [lindex $element 0] + log::debug "----> tododict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + set value [lindex $element 1] + lappend datastructure [list type $type value $value] + } + STRING { + #JJJJ + #don't unescape string! + set value [lindex $element 1] + #lappend datastructure [list type $type value [::tomlish::utils::unescape_string $value]] + lappend datastructure [list type $type value $value] + } + LITERAL { + set value [lindex $element 1] + lappend datastructure [list type $type value $value] + } + ITABLE { + #anonymous table + #lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] + lappend datastructure [::tomlish::dict::from_tomlish [list $element]] ;#store itables within arrays as raw dicts (possibly empty) + } + TABLE - TABLEARRAY { + #invalid? shouldn't be output from from_dict - but could manually be constructed as such? review + #doesn't make sense as table needs a name? + #take as synonym for ITABLE? + error "tomlish::dict::from_tomlish $type within array unexpected" + } + ARRAY - MULTISTRING - MULTILITERAL { + #set value [lindex $element 1] + lappend datastructure [list type $type value [::tomlish::dict::from_tomlish [list $element]]] + } + WS - SEP - NEWLINE - COMMENT { + #ignore whitespace, commas, newlines and comments + } + default { + error "tomlish::dict::from_tomlish Unexpected value type '$type' found in array" + } } } - ::set $varname $value - dict set vdict $pathsofar $varname - break - } else { - ::set arrdata [dict get $data value] - set idx [punk::lib::lindex_resolve_basic $arrdata $p] - if {$idx == -1} { - error "tdictn::set_endpoint error bad path $path. No existing element at $p" - } - ::set data [lindex $arrdata $p] - ::set $varname $data - dict set vdict $pathsofar $varname - } - } - } - dict for {path varname} $vdict { - puts "$path $varname\n" - puts " [::set $varname]\n" - puts "" - } - ::set i 0 - ::set reverse [lreverse $vdict] - foreach {varname path} $reverse { - set newval [::set $varname] - if {$i+2 == [llength $reverse]} { - ::set k [lindex $path end] - ::set k [string range $k 2 end] ;#first key is always @@something - dict set originaldata $k $newval - puts "--result $originaldata" - break - } - ::set nextvarname [lindex $reverse $i+2] - ::set nextval [::set $nextvarname] - ::set k [lindex $path end] - if {[string match @@* $k]} { - #dict key - dict set $nextvarname $k $newval - } else { - #list index - ::set nextarr [dict get $nextval value] - ::lset nextarr $k $newval - dict set $nextvarname value $nextarr - } - ::incr i 2 - } - - return $value - - } - #path must be to a {type ARRAY value } - #REVIEW - how to lappend to deep mixed dict/array structure without rewriting whole datastructure? - proc lappend {dictvariable path args} { - upvar $dictvariable originaldata - ::set data $originaldata - ::set pathsofar [list] - #::set newlist [list] - ::set v 0 - ::set vdict [dict create] - foreach a $args { - if {![tomlish::dict::is_tomlish_typeval $a]} { - error "tdictn::lappend error - lappended arguments must already be in the tomlish form {type value }" - } - } - foreach p $path { - ::lappend pathsofar $p - if {[string range $p 0 1] eq "@@"} { - ::set k [string range $p 2 end] - if {![dict exists $data $k]} { - error "tdictn::lappend error bad path $path. Attempt to access nonexistent element at subpath $pathsofar." } - ::set varname v[incr v] + MULTILITERAL { + #triple squoted string + #first newline stripped only if it is the very first element + #(ie *immediately* following the opening delims) + #All whitespace other than newlines is within LITERALPARTS + # ------------------------------------------------------------------------- + #todo - consider extension to toml to allow indent-aware multiline literals + # how - propose as issue in toml github? Use different delim? e.g ^^^ ? + #e.g + # xxx=?'''abc + # def + # etc + # ''' + # - we would like to trimleft each line to the column following the opening delim + # ------------------------------------------------------------------------- - if {$pathsofar eq $path} { - #see if endpoint of the path given is an ARRAY - ::set endpoint [dict get $data $k] - if {![tomlish::dict::is_tomlish_typeval $endpoint]} { - error "tdictn::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar." + log::debug "---> todict processing multiliteral: $item" + set parts [lrange $item 1 end] + if {[lindex $parts 0 0] eq "NEWLINE"} { + set parts [lrange $parts 1 end] ;#skip it } - if {[dict get $endpoint type] ne "ARRAY"} { - error "tdictn::lappend error bad path $path. Subpath $pathsofar is not an array." + for {set idx 0} {$idx < [llength $parts]} {incr idx} { + set element [lindex $parts $idx] + set type [lindex $element 0] + switch -exact -- $type { + LITERALPART { + append stringvalue [lindex $element 1] + } + NEWLINE { + set val [lindex $element 1] + if {$val eq "lf"} { + append stringvalue \n + } else { + append stringvalue \r\n + } + } + default { + error "Unexpected value type '$type' found in multistring" + } + } } - ::set arrdata [dict get $endpoint value] - ::lappend arrdata {*}$args - dict set endpoint value $arrdata - ::set newlist $endpoint - ::set $varname $newlist - dict set vdict $pathsofar $varname - break + set datastructure $stringvalue } - ::set data [dict get $data $k] - ::set $varname $data - dict set vdict $pathsofar $varname - } else { - if {![tomlish::dict::is_tomlish_typeval $data]} { - error "tdictn::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar." + MULTISTRING { + #triple dquoted string + log::debug "---> tomlish::dict::from_tomlish processing multistring: $item" + set stringvalue "" + set idx 0 + set parts [lrange $item 1 end] + for {set idx 0} {$idx < [llength $parts]} {incr idx} { + set element [lindex $parts $idx] + set type [lindex $element 0] + #We use STRINGPART in the tomlish representation as a distinct element to STRING - which would imply wrapping quotes to be reinserted + switch -exact -- $type { + STRING { + #todo - do away with STRING ? + #we don't build MULTISTRINGS containing STRING - but should we accept it? + tomlish::log::warn "double quoting a STRING found in MULTISTRING - should be STRINGPART?" + #append stringvalue "\"[::tomlish::utils::unescape_string [lindex $element 1]]\"" + append stringvalue "\"[lindex $element 1]\"" + } + STRINGPART { + #JJJ + #don't unescape string + #append stringvalue [::tomlish::utils::unescape_string [lindex $element 1]] + append stringvalue [lindex $element 1] + } + CONT { + #When the last non-whitespace character on a line is an unescaped backslash, + #it will be trimmed along with all whitespace (including newlines) up to the next non-whitespace character or closing delimiter + # review - we allow some whitespace in stringpart elements - can a stringpart ever be all whitespace? + set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] + if {$next_nl == -1} { + #last (or first and only) line + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + #set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] + #if {$non_ws >= 0} { + # #append stringvalue "\\" + # return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + #} else { + # #skip over ws without emitting + # set idx [llength $parts] + #} + } else { + set parts_til_nl [lrange $parts 0 $next_nl-1] + set non_ws [lsearch -index 0 -start $idx+1 -not $parts_til_nl WS] + if {$non_ws >= 0} { + #This CONT is invalid. If there had been a non-whitespace char directly following it, + #it wouldn't have come through as a CONT token + #Now that we see it isn't the last non-whitespace backslash on the line we can reject + # as an invalid escape of space or tab + #append stringvalue "\\" + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + } else { + #skip over ws on this line + set idx $next_nl + #then have to check each subsequent line until we get to first non-whitespace + set trimming 1 + while {$trimming && $idx < [llength $parts]} { + set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] + if {$next_nl == -1} { + #last line + set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] + if {$non_ws >= 0} { + set idx [expr {$non_ws -1}] + } else { + set idx [llength $parts] + } + set trimming 0 + } else { + set non_ws [lsearch -index 0 -start $idx+1 -not [lrange $parts 0 $next_nl-1] WS] + if {$non_ws >= 0} { + set idx [expr {$non_ws -1}] + set trimming 0 + } else { + set idx $next_nl + #keep trimming + } + } + } + } + } + } + NEWLINE { + #if newline is first element - it is not part of the data of a multistring + if {$idx > 0} { + set val [lindex $element 1] + if {$val eq "lf"} { + append stringvalue \n + } else { + append stringvalue \r\n + } + } + } + WS { + append stringvalue [lindex $element 1] + } + default { + error "Unexpected value type '$type' found in multistring" + } + } + } + set datastructure $stringvalue } - if {[dict get $data type] ne "ARRAY"} { - error "tdictn::lappend error bad path $path. Subpath $pathsofar is not an array." + WS - COMMENT - NEWLINE { + #ignore } - ::set varname v[incr v] - if {$pathsofar eq $path} { - if {[dict get $data type] ne "ARRAY"} { - error "tdictn::lappend error bad path $path. Parent path is not an array." - } - ::set parentarray [dict get $data value] - ::set idx [punk::lib::lindex_resolve_basic $parentarray $p] - if {$idx == -1} { - error "tdictn::lappend error bad path $path. Index $p does not exist." - } - ::set endpoint [lindex $parentarray $p] - if {[dict get $endpoint type] ne "ARRAY"} { - error "tdictn::lappend error bad path $path. Not an array." - } - - ::set arrdata [dict get $endpoint value] - ::lappend arrdata {*}$args - dict set endpoint value $arrdata - ::set newlist $endpoint - #::lset parentarray $p $newlist - #set parentarray $newlist - ::set $varname $newlist - dict set vdict $pathsofar $varname - break - } else { - ::set arrdata [dict get $data value] - set idx [punk::lib::lindex_resolve_basic $arrdata $p] - if {$idx == -1} { - error "tdictn::lappend error bad path $path. Subpath $pathsofar, index $p does not exist." - } - ::set data [lindex $arrdata $p] - ::set $varname $data - dict set vdict $pathsofar $varname + BOM { + #this token is the unicode single char \uFFEF + #It doesn't tell us what encoding was originally used (though toml should only accept UTF-8 files) + #ignore at start - what about in other positions? + } + default { + error "Unexpected tag '$tag' in Tomlish list '$tomlish'" } } } - #dict for {path varname} $vdict { - # puts "$path $varname\n" - # puts " [::set $varname]\n" - # puts "" - #} - ::set i 0 - ::set reverse [lreverse $vdict] - foreach {varname path} $reverse { - set newval [::set $varname] - if {$i+2 == [llength $reverse]} { - ::set k [lindex $path end] - ::set k [string range $k 2 end] ;#first key is always @@something - dict set originaldata $k $newval - puts "--result $originaldata" - break - } - ::set nextvarname [lindex $reverse $i+2] - ::set nextval [::set $nextvarname] - ::set k [lindex $path end] - if {[string match @@* $k]} { - #dict key - dict set $nextvarname $k $newval - } else { - #list index - ::set nextarr [dict get $nextval value] - ::lset nextarr $k $newval - dict set $nextvarname value $nextarr - } - ::incr i 2 - } - return $newlist + return $datastructure } } -namespace eval tomlish::dict { +namespace eval tomlish::dict::path { + #access tomlish dict structure namespace export {[a-z]*}; # Convention: export all lowercase - namespace path [namespace parent] - proc is_tomlish_typeval {d} { - #designed to detect {type value } e.g {type INT value 3}, {type STRING value "blah etc"} - #as a sanity check we need to avoid mistaking user data that happens to match same form - #consider x.y={type="spud",value="blah"} - #The value of type will itself have already been converted to {type STRING value spud} ie never a single element. - #check the length of the type as a quick way to see it's a tag - not something else masqerading. - expr {[dict size $d] == 2 && [dict exists $d type] && [dict exists $d value] && [llength [dict get $d type]] == 1} - } - proc is_tomlish_typeval2 {d} { - upvar ::tomlish::tags tags - expr {[lindex $d 0] eq "type" && [lindex $d 1] in $tags} - } - proc last_tomltype_posn {d} { - set last_simple -1 - set dictposn [expr {[dict size $d] -1}] - foreach k [lreverse [dict keys $d]] { - set dval [dict get $d $k] - if {[is_tomlish_typeval $dval]} { - set last_simple $dictposn - break + #access with path such as: @@k @@k 0 @@k end where dict keys marked with @@ and plain values are list indices into in {type ARRAY value } + #leaf elements returned as structured {type value } + proc get {dictval {path {}}} { + if {$path eq ""} { + return $dictval + } + ::set data $dictval + ::set pathsofar [list] + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set data [dict get $data [string range $p 2 end]] + } else { + if {![tomlish::dict::is_typeval $data]} { + error "tomlish::dict::path::get error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::get error bad path $path. Subpath $pathsofar is not an array." + } + ::set arrdata [dict get $data value] + ::set data [lindex $arrdata $p] } - incr dictposn -1 } - return $last_simple + return $data } - - - #review - proc name_from_tablestack {tablestack} { - set name "" - foreach tinfo [lrange $tablestack 1 end] { - lassign $tinfo type namepart - switch -- $type { - T { - if {$name eq ""} { - append name $namepart - } else { - append name .$namepart - } + proc exists {dictval path} { + ::set data $dictval + ::set pathsofar [list] + ::set exists 1 + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] + if {![dict exists $data $k]} { + return 0 } - I { - if {$name eq ""} { - append name $namepart - } else { - append name .$namepart - } + ::set data [dict get $data $k] + } else { + if {![tomlish::dict::is_typeval $data]} { + return 0 } - default { - #end at first break in the leading sequence of T & I tablenames - break + if {[dict get $data type] ne "ARRAY"} { + return 0 + } + ::set arrdata [dict get $data value] + ::set intp [tomlish::system::lindex_resolve_basic $arrdata $p] ;#handle index math (end-1 etc) + if {$intp == -1} { + #out of bounds + return 0 } + ::set data [lindex $arrdata $p] } } - return $name + return $exists } - - #tablenames_info is a flat dict with the key being an '@@' path - proc _show_tablenames {tablenames_info} { - #e.g {@l@a @@b} {ttype header_table tdefined closed} - append msg \n "tablenames_info:" \n - dict for {tkey tinfo} $tablenames_info { - append msg " " "table: $tkey" \n - dict for {field finfo} $tinfo { - append msg " " "$field $finfo" \n - } + #a restricted analogy of 'dictn set' + #set 'endpoints' - don't create intermediate paths + # can replace an existing dict with another dict + # can create a key when key at tail end of path is a key (ie @@keyname, not index) + # can replace an existing {type value value } + # with added restriction that if is ARRAY the new must also be ARRAY + proc set_endpoint {dictvariable path value} { + upvar $dictvariable dict_being_edited + ::set data $dict_being_edited + ::set pathsofar [list] + if {!([tomlish::dict::is_typeval $value] || [tomlish::dict::is_typeval_dict $value 0])} { + #failed check of supplied value as basic type, or a sub-dict structure (not checking arrays) + error "tomlish::dict::path::set_endpoint error - value must already be in the tomlish form {type value } or be a dict with such forms as leaves" } - return $msg - } -} -tcl::namespace::eval tomlish::to_dict { - proc tablename_split {tablename {normalize false}} { - #we can't just split on . because we have to handle quoted segments which may contain a dot. - #eg {dog."tater.man"} - set sLen [tcl::string::length $tablename] - set segments [list] - set mode "preval" ;#5 modes: preval, quoted,litquoted, unquoted, postval - #quoted is for double-quotes, litquoted is for single-quotes (string literal) - set seg "" - for {set i 0} {$i < $sLen} {incr i} { - - if {$i > 0} { - set lastChar [tcl::string::index $tablename [expr {$i - 1}]] - } else { - set lastChar "" - } - - #todo - track\count backslashes properly + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] - set c [tcl::string::index $tablename $i] - if {$c eq "\""} { - if {($lastChar eq "\\")} { - #not strictly correct - we could have had an even number prior-backslash sequence - #the toml spec would have us error out immediately on bsl in bad location - but we're - #trying to parse to unvalidated tomlish - set ctest escq - } else { - set ctest dq - } - } else { - set ctest [string map [list " " sp \t tab] $c] - } + #if {![dict exists $data $k]} { + # error "tomlish::dict:path::set error bad path $path. Attempt to access nonexistent element at subpath $pathsofar." + #} + ::set varname v[incr v] - switch -- $ctest { - . { - switch -exact -- $mode { - preval { - error "tablename_split. dot not allowed - expecting a value" - } - unquoted { - #dot marks end of segment. - if {![tomlish::utils::is_barekey $seg]} { - error "tablename_split. unquoted key segment $seg is not a valid toml key" + if {$pathsofar eq $path} { + #see if endpoint of the path given already exists + if {[dict exists $data $k]} { + ::set endpoint [dict get $data $k] + if {[tomlish::dict::is_typeval $endpoint]} { + set existing_tp [dict get $endpoint type] + if {![tomlish::dict::is_typeval $value]} { + error "tomlish::dict::path::set_endpoint error Unable to overwrite subpath '$pathsofar' which is of type $existing_tp with sub-dict. Supplied value not {type value val } with sub-dict: $value" + } + switch -- [dict get $endpoint type] { + ARRAY { + #disallow overwriting array - unless given value is an ARRAY? REVIEW + if {[dict get $value type] ne "ARRAY"} { + error "tomlish::dict::path::set_endpoint error bad path '$path'. Cannot overwrite array with non-array: $value" + } + } + default { + # + } + } + } else { + #endpoint is a typeval dict not a plain typeval - only allow overwrite with a typeval dict + if {![tomlish::dict::is_typeval_dict $value 0]} { + error "tomlish::dict::path::set_endpoint error path '$path'. Cannot overwrite sub-dict (size: [dict size $endpoint]) with non sub-dict: $value" + } } - postval { - #ok - segment already lappended + ::set $varname $value + dict set vdict $pathsofar $varname + break + } else { + ::set arrdata [dict get $data value] + set idx [tomlish::system::lindex_resolve_basic $arrdata $p] + if {$idx == -1} { + error "tomlish::dict::path::set_endpoint error bad path '$path'. No existing element at $p" } + ::set data [lindex $arrdata $p] + ::set $varname $data + dict set vdict $pathsofar $varname } } } + #dict for {path varname} $vdict { + # puts "$path $varname\n" + # puts " '[::set $varname]'\n" + # puts "" + #} + ::set i 0 + ::set reverse [lreverse $vdict] + foreach {varname path} $reverse { + set newval [::set $varname] + if {$i+2 == [llength $reverse]} { + ::set k [lindex $path end] + ::set k [string range $k 2 end] ;#first key is always @@something + dict set dict_being_edited $k $newval + #puts "--result $dict_being_edited" + break + } + ::set nextvarname [lindex $reverse $i+2] + ::set nextval [::set $nextvarname] + ::set k [lindex $path end] + if {[string match @@* $k]} { + #dict key + #dict set $nextvarname $k $newval + set_endpoint $nextvarname [list $k] $newval + } else { + #list index + ::set nextarr [dict get $nextval value] + ::lset nextarr $k $newval + dict set $nextvarname value $nextarr + } + ::incr i 2 + } - #note - we must allow 'empty' quoted strings '' & "" - # (these are 'discouraged' but valid toml keys) + return $dict_being_edited - return $segments } - - #tablenames (& tablearraynames) may contain irrelevant leading, trailing and interspersed whitespace - # tablenames can be made up of segments delimited by dots. .eg [ a.b . c ] - #trimmed, the tablename becomes {a.b.c} - # A segment may contain whitespace if it is quoted e.g [a . b . "c etc " ] - #ie whitespace is only irrelevant if it's outside a quoted segment - #trimmed, the tablename becomes {a.b."c etc "} - proc tablename_trim {tablename} { - set segments [tablename_split $tablename false] - set trimmed_segments [list] - foreach seg $segments { - lappend trimmed_segments [::string trim $seg " \t"] + #path must be to a {type ARRAY value } + #REVIEW - how to lappend to deep mixed dict/array structure without rewriting whole datastructure? + proc lappend {dictvariable path args} { + upvar $dictvariable dict_being_edited + ::set data $dict_being_edited + ::set pathsofar [list] + #::set newlist [list] + ::set v 0 + ::set vdict [dict create] + foreach a $args { + if {![string is dict $a]} { + error "tomlish::dict::path::lappend error - lappended arguments must already be in the tomlish form {type value } or be a dict with such forms as leaves" + } } - return [join $trimmed_segments .] - } + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] + if {![dict exists $data $k]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access nonexistent element at subpath $pathsofar." + } + ::set varname v[incr v] - proc @@path {dictkeys} { - lmap v $dictkeys {list @@$v} - } - proc get_dottedkey_info {dottedkeyrecord} { - set key_hierarchy [list] - set key_hierarchy_raw [list] - if {[lindex $dottedkeyrecord 0] ne "DOTTEDKEY"} { - error "tomlish::to_dict::get_dottedkey_info error. Supplied list doesn't appear to be a DOTTEDKEY (tag: [lindex $dottedkeyrecord 0])" - } - set compoundkeylist [lindex $dottedkeyrecord 1] - set expect_sep 0 - foreach part $compoundkeylist { - set parttag [lindex $part 0] - if {$parttag eq "WS"} { - continue - } - if {$expect_sep} { - if {$parttag ne "DOTSEP"} { - error "DOTTEDKEY missing dot separator between parts. '$dottedkeyrecord'" + if {$pathsofar eq $path} { + #see if endpoint of the path given is an ARRAY + ::set endpoint [dict get $data $k] + if {![tomlish::dict::is_typeval $endpoint]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $endpoint type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar is not an array." + } + ::set arrdata [dict get $endpoint value] + ::lappend arrdata {*}$args + dict set endpoint value $arrdata + ::set newlist $endpoint + ::set $varname $newlist + dict set vdict $pathsofar $varname + break } - set expect_sep 0 + ::set data [dict get $data $k] + ::set $varname $data + dict set vdict $pathsofar $varname } else { - set val [lindex $part 1] - switch -exact -- $parttag { - KEY { - lappend key_hierarchy $val - lappend key_hierarchy_raw $val + if {![tomlish::dict::is_typeval $data]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar is not an array." + } + ::set varname v[incr v] + if {$pathsofar eq $path} { + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Parent path is not an array." } - DQKEY { - lappend key_hierarchy [::tomlish::utils::unescape_string $val] - lappend key_hierarchy_raw \"$val\" + ::set parentarray [dict get $data value] + ::set idx [tomlish::system::lindex_resolve_basic $parentarray $p] + if {$idx == -1} { + error "tomlish::dict::path::lappend error bad path $path. Index $p does not exist." } - SQKEY { - lappend key_hierarchy $val - lappend key_hierarchy_raw "'$val'" + ::set endpoint [lindex $parentarray $p] + if {[dict get $endpoint type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Not an array." } - default { - error "tomlish::to_dict::get_dottedkey_info DOTTED key unexpected part '$parttag' - ensure dot separator is between key parts. '$item'" + + ::set arrdata [dict get $endpoint value] + ::lappend arrdata {*}$args + dict set endpoint value $arrdata + ::set newlist $endpoint + #::lset parentarray $p $newlist + #set parentarray $newlist + ::set $varname $newlist + dict set vdict $pathsofar $varname + break + } else { + ::set arrdata [dict get $data value] + set idx [tomlish::system::lindex_resolve_basic $arrdata $p] + if {$idx == -1} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar, index $p does not exist." } + ::set data [lindex $arrdata $p] + ::set $varname $data + dict set vdict $pathsofar $varname } - set expect_sep 1 } } - return [dict create keys $key_hierarchy keys_raw $key_hierarchy_raw] + #todo tomlish::log::debug ? + #dict for {path varname} $vdict { + # puts "$path $varname\n" + # puts " [::set $varname]\n" + # puts "" + #} + ::set i 0 + ::set reverse [lreverse $vdict] + foreach {varname path} $reverse { + set newval [::set $varname] + if {$i+2 == [llength $reverse]} { + ::set k [lindex $path end] + ::set k [string range $k 2 end] ;#first key is always @@something + dict set dict_being_edited $k $newval + #puts "--result $dict_being_edited" + break + } + ::set nextvarname [lindex $reverse $i+2] + ::set nextval [::set $nextvarname] + ::set k [lindex $path end] + if {[string match @@* $k]} { + #dict key + set k [string range $k 2 end] + dict set $nextvarname $k $newval + } else { + #list index + ::set nextarr [dict get $nextval value] + ::lset nextarr $k $newval + dict set $nextvarname value $nextarr + } + ::incr i 2 + } + return $dict_being_edited + } +} +tcl::namespace::eval tomlish::to_dict { + + + proc @@path {dictkeys} { + lmap v $dictkeys {string cat @@ $v} } } @@ -7196,30 +8047,44 @@ tcl::namespace::eval tomlish::app { -help -type none -help\ "Display this usage message" -inputchannel -default stdin + -inputencoding -default "iso8859-1" -choicerestricted 0 -choices {utf-8 utf-16 iso8859-1} -help\ + "configure encoding on input channel + iso8859-1 is equivalent to binary encoding" -outputchannel -default stdout -errorchannel -default stderr @values -min 0 -max 0 } proc decoder {args} { set argd [punk::args::parse $args withid ::tomlish::app::decoder] - set ch_input [dict get $argd opts -inputchannel] - set ch_output [dict get $argd opts -outputchannel] - set ch_error [dict get $argd opts -errorchannel] + set ch_input [dict get $argd opts -inputchannel] + set ch_input_enc [dict get $argd opts -inputencoding] + set ch_output [dict get $argd opts -outputchannel] + set ch_error [dict get $argd opts -errorchannel] if {[dict exists $argd received -help]} { return [punk::args::usage -scheme info ::tomlish::app::decoder] } - #fconfigure stdin -encoding utf-8 - fconfigure $ch_input -translation binary + chan configure $ch_input -encoding $ch_input_enc + #translation? + chan configure $ch_input -translation lf ;# toml-test invalid/control tests we need to see raw CRs to reject them properly - auto translation won't do. + #Just slurp it all - presumably we are not handling massive amounts of data on stdin. # - even if the input is large, we probably don't gain much (aside from possible memory savings?) by attempting to process input as it arrives. if {[catch { - set toml [read $ch_input] - }]} { - exit 2 ;#read error + set inputdata [read $ch_input] + if {$ch_input_enc eq "iso8859-1"} { + set toml [tomlish::toml::from_binary $inputdata] + } else { + set toml $inputdata + } + } errM]} { + puts stderr "read-input error: $errM" + #toml-tests expect exit code 1 + #e.g invalid/encoding/utf16-bom + exit 1 ;#read error } try { - set j [::tomlish::toml_to_json $toml] + set j [::tomlish::toml_to_typedjson $toml] } on error {em} { puts $ch_error "decoding failed: '$em'" exit 1 @@ -7242,30 +8107,46 @@ tcl::namespace::eval tomlish::app { @opts -help -type none -help \ "Display this usage message" + -restrict_barekeys -default 0 -help\ + "If true, keys containing unicode will be quoted. + If false, an extended range of barekeys will be used + in unquoted form." -inputchannel -default stdin + -inputencoding -default "" -choicerestricted 0 -choices {utf-8 utf-16 iso8859-1} -help\ + "configure encoding on input channel + If not supplied, leave at Tcl default" -outputchannel -default stdout -errorchannel -default stderr @values -min 0 -max 0 } proc encoder {args} { set argd [punk::args::parse $args withid ::tomlish::app::encoder] - set ch_input [dict get $argd opts -inputchannel] - set ch_output [dict get $argd opts -outputchannel] - set ch_error [dict get $argd opts -errorchannel] + set restrict_barekeys [dict get $argd opts -restrict_barekeys] + set ch_input [dict get $argd opts -inputchannel] + set ch_input_enc [dict get $argd opts -inputencoding] + set ch_output [dict get $argd opts -outputchannel] + set ch_error [dict get $argd opts -errorchannel] if {[dict exists $argd received -help]} { return [punk::args::usage -scheme info ::tomlish::app::encoder] } #review - fconfigure $ch_input -translation binary + if {$ch_input_enc ne ""} { + chan configure $ch_input -encoding $ch_input_enc + } + #review + chan configure $ch_input -translation lf + if {[catch { set json [read $ch_input] }]} { exit 2 ;#read error } try { - set toml [::tomlish::json_to_toml $json] - } on error {em} { - puts $ch_error "encoding failed: '$em'" + #tomlish::typedjson_to_toml + set toml [::tomlish::toml::from_tomlish_from_dict_from_typedjson $json] + } trap {} {e eopts} { + puts $ch_error "encoding failed: '$e'" + puts $ch_error "$::errorInfo" exit 1 } puts -nonewline $ch_output $toml @@ -7345,6 +8226,108 @@ namespace eval tomlish::lib { #[list_end] [comment {--- end definitions namespace tomlish::lib ---}] } # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval tomlish::system { + + #taken from punk::lib + proc lindex_resolve_basic {list index} { + #*** !doctools + #[call [fun lindex_resolve_basic] [arg list] [arg index]] + #[para] Accepts index of the forms accepted by Tcl's list commands. (e.g compound indices such as 3+1 end-2) + #[para] returns -1 for out of range at either end, or a valid integer index + #[para] Unlike lindex_resolve; lindex_resolve_basic can't determine if an out of range index was out of range at the lower or upper bound + #[para] This is only likely to be faster than average over lindex_resolve for Tcl which has the builtin lseq command + #[para] The performance advantage is more likely to be present when using compound indexes such as $x+1 or end-1 + #[para] For pure integer indices the performance should be equivalent + + set index [tcl::string::map {_ {}} $index] ;#forward compatibility with integers such as 1_000 + if {[string is integer -strict $index]} { + #can match +i -i + #avoid even the lseq overhead when the index is simple + if {$index < 0 || ($index >= [llength $list])} { + #even though in this case we could return -2 or -3 like lindex_resolve; for consistency we don't, as it's not always determinable for compound indices using the lseq method. + return -1 + } else { + #integer may still have + sign - normalize with expr + return [expr {$index}] + } + } + if {[llength $list]} { + set indices [tomlish::system::range 0 [expr {[llength $list]-1}]] ;# uses lseq if available, has fallback. + #if lseq was available - $indices is an 'arithseries' - theoretically not taking up ram(?) + } else { + set indices [list] + } + set idx [lindex $indices $index] + if {$idx eq ""} { + #we have no way to determine if out of bounds is at lower vs upper end + return -1 + } else { + return $idx + } + } + + if {[info commands ::lseq] ne ""} { + #tcl 8.7+ lseq significantly faster, especially for larger ranges + #The internal rep can be an 'arithseries' with no string representation + #support minimal set from to + proc range {from to} { + lseq $from $to + } + } else { + #lseq accepts basic expressions e.g 4-2 for both arguments + #e.g we can do lseq 0 [llength $list]-1 + #if range is to be consistent with the lseq version above - it should support that, even though we don't support most lseq functionality in either wrapper. + proc range {from to} { + set to [offset_expr $to] + set from [offset_expr $from] + if {$to > $from} { + set count [expr {($to -$from) + 1}] + if {$from == 0} { + return [lsearch -all [lrepeat $count 0] *] + } else { + incr from -1 + return [lmap v [lrepeat $count 0] {incr from}] + } + #slower methods. + #2) + #set i -1 + #set L [lrepeat $count 0] + #lmap v $L {lset L [incr i] [incr from];lindex {}} + #return $L + #3) + #set L {} + #for {set i 0} {$i < $count} {incr i} { + # lappend L [incr from] + #} + #return $L + } elseif {$from > $to} { + set count [expr {$from - $to} + 1] + #1) + if {$to == 0} { + return [lreverse [lsearch -all [lrepeat $count 0] *]] + } else { + incr from + return [lmap v [lrepeat $count 0] {incr from -1}] + } + + #2) + #set i -1 + #set L [lrepeat $count 0] + #lmap v $L {lset L [incr i] [incr from -1];lindex {}} + #return $L + #3) + #set L {} + #for {set i 0} {$i < $count} {incr i} { + # lappend L [incr from -1] + #} + #return $L + } else { + return [list $from] + } + } + } + +} if {[info exists ::argc] && $::argc > 0} { #puts stderr "argc: $::argc args: $::argv" @@ -7410,30 +8393,6 @@ if {[info exists ::argc] && $::argc > 0} { exit 0 } } - - #set opts [dict create] - #set opts [dict merge $opts $::argv] - - #set opts_understood [list -app ] - #if {"-app" in [dict keys $opts]} { - # #Don't vet the remaining opts - as they are interpreted by each app - #} else { - # foreach key [dict keys $opts] { - # if {$key ni $opts_understood} { - # puts stderr "Option '$key' not understood" - # exit 1 - # } - # } - #} - #if {[dict exists $opts -app]} { - # set app [dict get $opts -app] - # set appnames [tomlish::appnames] - # if {$app ni $appnames} { - # puts stderr "app '[dict get $opts -app]' not found. Available apps: $appnames" - # exit 1 - # } - # tomlish::app::$app {*}$opts - #} } ## Ready diff --git a/src/vfs/_vfscommon.vfs/modules/overtype-1.6.6.tm b/src/vfs/_vfscommon.vfs/modules/overtype-1.6.6.tm new file mode 100644 index 00000000..b4e59ec6 --- /dev/null +++ b/src/vfs/_vfscommon.vfs/modules/overtype-1.6.6.tm @@ -0,0 +1,4774 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) Julian Noble 2003-2023 +# +# @@ Meta Begin +# Application overtype 1.6.6 +# Meta platform tcl +# Meta license BSD +# @@ Meta End + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# doctools header +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[manpage_begin overtype_module_overtype 0 1.6.6] +#[copyright "2024"] +#[titledesc {overtype text layout - ansi aware}] [comment {-- Name section and table of contents description --}] +#[moddesc {overtype text layout}] [comment {-- Description at end of page heading --}] +#[require overtype] +#[keywords module text ansi] +#[description] +#[para] - + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section Overview] +#[para] overview of overtype +#[subsection Concepts] +#[para] - + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[subsection dependencies] +#[para] packages used by overtype +#[list_begin itemized] + +package require Tcl 8.6- +package require textutil +package require punk::lib ;#required for lines_as_list +package require punk::ansi ;#required to detect, split, strip and calculate lengths +package require punk::char ;#box drawing - and also unicode character width determination for proper layout of text with double-column-width chars +package require punk::assertion +#*** !doctools +#[item] [package {Tcl 8.6}] +#[item] [package textutil] +#[item] [package punk::ansi] +#[para] - required to detect, split, strip and calculate lengths of text possibly containing ansi codes +#[item] [package punk::char] +#[para] - box drawing - and also unicode character width determination for proper layout of text with double-column-width chars + +# #package require frobz +# #*** !doctools +# #[item] [package {frobz}] + +#*** !doctools +#[list_end] + +#PERFORMANCE notes +#overtype is very performance sensitive - used in ansi output all over the place ie needs to be optimised +#NOTE use of tcl::dict::for tcl::string::range etc instead of ensemble versions. This is for the many tcl 8.6/8.7 interps which don't compile ensemble commands when in safe interps +#similar for tcl::namespace::eval - but this is at least on some versions of Tcl - faster even in a normal interp. Review to see if that holds for Tcl 9. +#for string map: when there are exactly 2 elements - it is faster to use a literal which has a special case optimisation in the c code +#ie use tcl::string::map {\n ""} ... instead of tcl::string::map [list \n ""] ... +#note that we can use unicode (e.g \uFF31) and other escapes such as \t within these curly braces - we don't have to use double quotes +#generally using 'list' is preferred for the map as less error prone. +#can also use: tcl::string::map "token $var" .. but be careful regarding quoting and whitespace in var. This should be used sparingly if at all. + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[section API] + + +#Julian Noble - 2003 +#Released under standard 'BSD license' conditions. +# +#todo - ellipsis truncation indicator for center,right + +#v1.4 2023-07 - naive ansi color handling - todo - fix tcl::string::range +# - need to extract and replace ansi codes? + +tcl::namespace::eval overtype { + namespace import ::punk::assertion::assert + punk::assertion::active true + + namespace path ::punk::lib + + namespace export * + variable default_ellipsis_horizontal "..." ;#fallback + variable default_ellipsis_vertical "..." + tcl::namespace::eval priv { + proc _init {} { + upvar ::overtype::default_ellipsis_horizontal e_h + upvar ::overtype::default_ellipsis_vertical e_v + set e_h [format %c 0x2026] ;#Unicode Horizontal Ellipsis + set e_v [format %c 0x22EE] + #The unicode ellipsis looks more natural than triple-dash which is centred vertically whereas ellipsis is at floorline of text + #Also - unicode ellipsis has semantic meaning that other processors can interpret + #unicode does also provide a midline horizontal ellipsis 0x22EF + + #set e [format %c 0x2504] ;#punk::char::charshort boxd_ltdshhz - Box Drawings Light Triple Dash Horizontal + #if {![catch {package require punk::char}]} { + # set e [punk::char::charshort boxd_ltdshhz] + #} + } + } + priv::_init +} +proc overtype::about {} { + return "Simple text formatting. Author JMN. BSD-License" +} + +tcl::namespace::eval overtype { + variable grapheme_widths [tcl::dict::create] + + variable escape_terminals + #single "final byte" in the range 0x40–0x7E (ASCII @A–Z[\]^_`a–z{|}~). + tcl::dict::set escape_terminals CSI [list @ \\ ^ _ ` | ~ a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z "\{" "\}"] + #tcl::dict::set escape_terminals CSI [list J K m n A B C D E F G s u] ;#basic + tcl::dict::set escape_terminals OSC [list \007 \033\\] ;#note mix of 1 and 2-byte terminals + + #self-contained 2 byte ansi escape sequences - review more? + variable ansi_2byte_codes_dict + set ansi_2byte_codes_dict [tcl::dict::create\ + "reset_terminal" "\u001bc"\ + "save_cursor_posn" "\u001b7"\ + "restore_cursor_posn" "\u001b8"\ + "cursor_up_one" "\u001bM"\ + "NEL - Next Line" "\u001bE"\ + "IND - Down one line" "\u001bD"\ + "HTS - Set Tab Stop" "\u001bH"\ + ] + + #debatable whether strip should reveal the somethinghidden - some terminals don't hide it anyway. + # "PM - Privacy Message" "\u001b^somethinghidden\033\\"\ +} + + + + +proc overtype::string_columns {text} { + if {[punk::ansi::ta::detect $text]} { + #error "error string_columns is for calculating character length of string - ansi codes must be stripped/rendered first e.g with punk::ansi::ansistrip. Alternatively try punk::ansi::printing_length" + set text [punk::ansi::ansistrip $text] + } + return [punk::char::ansifreestring_width $text] +} + +#todo - consider a way to merge overtype::left/centre/right +#These have similar algorithms/requirements - and should be refactored to be argument-wrappers over a function called something like overtype::renderblock +#overtype::renderblock could render the input to a defined (possibly overflowing in x or y) rectangle overlapping the underlay. +#(i.e not even necessariy having it's top left within the underlay) +tcl::namespace::eval overtype::priv { +} + +#could return larger than renderwidth +proc _get_row_append_column {row} { + #obsolete? + upvar outputlines outputlines + set idx [expr {$row -1}] + if {$row <= 1 || $row > [llength $outputlines]} { + return 1 + } else { + upvar opt_expand_right expand_right + upvar renderwidth renderwidth + set existinglen [punk::ansi::printing_length [lindex $outputlines $idx]] + set endpos [expr {$existinglen +1}] + if {$expand_right} { + return $endpos + } else { + if {$endpos > $renderwidth} { + return $renderwidth + 1 + } else { + return $endpos + } + } + } +} + +tcl::namespace::eval overtype { + #*** !doctools + #[subsection {Namespace overtype}] + #[para] Core API functions for overtype + #[list_begin definitions] + + + + #tcl::string::range should generally be avoided for both undertext and overtext which contain ansi escapes and other cursor affecting chars such as \b and \r + #render onto an already-rendered (ansi already processed) 'underlay' string, a possibly ansi-laden 'overlay' string. + #The underlay and overlay can be multiline blocks of text of varying line lengths. + #The overlay may just be an ansi-colourised block - or may contain ansi cursor movements and cursor save/restore calls - in which case the apparent length and width of the overlay can't be determined as if it was a block of text. + #This is a single-shot rendering of strings - ie there is no way to chain another call containing a cursor-restore to previously rendered output and have it know about any cursor-saves in the first call. + # a cursor start position other than top-left is a possible addition to consider. + #see editbuf in punk::repl for a more stateful ansi-processor. Both systems use loops over overtype::renderline + proc renderspace {args} { + #*** !doctools + #[call [fun overtype::renderspace] [arg args] ] + #[para] usage: ?-transparent [lb]0|1[rb]? ?-expand_right [lb]1|0[rb]? ?-ellipsis [lb]1|0[rb]? ?-ellipsistext ...? undertext overtext + + # @c overtype starting at left (overstrike) + # @c can/should we use something like this?: 'format "%-*s" $len $overtext + variable default_ellipsis_horizontal + + if {[llength $args] < 2} { + error {usage: ?-width ? ?-startcolumn ? ?-transparent [0|1|]? ?-expand_right [1|0]? ?-ellipsis [1|0]? ?-ellipsistext ...? undertext overtext} + } + set optargs [lrange $args 0 end-2] + if {[llength $optargs] % 2 == 0} { + set overblock [lindex $args end] + set underblock [lindex $args end-1] + #lassign [lrange $args end-1 end] underblock overblock + set argsflags [lrange $args 0 end-2] + } else { + set optargs [lrange $args 0 end-1] + if {[llength $optargs] %2 == 0} { + set overblock [lindex $args end] + set underblock "" + set argsflags [lrange $args 0 end-1] + } else { + error "renderspace expects opt-val pairs followed by: or just " + } + } + set opts [tcl::dict::create\ + -bias ignored\ + -width \uFFEF\ + -height \uFFEF\ + -startcolumn 1\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -expand_right 0\ + -appendlines 1\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -experimental 0\ + -cp437 0\ + -looplimit \uFFEF\ + -crm_mode 0\ + -reverse_mode 0\ + -insert_mode 0\ + -wrap 0\ + -info 0\ + -console {stdin stdout stderr}\ + ] + #expand_right is perhaps consistent with the idea of the page_size being allowed to grow horizontally.. + # it does not necessarily mean the viewport grows. (which further implies need for horizontal scrolling) + # - it does need to be within some concept of terminal width - as columns must be addressable by ansi sequences. + # - This implies the -width option value must grow if it is tied to the concept of renderspace terminal width! REVIEW. + # - further implication is that if expand_right grows the virtual renderspace terminal width - + # then some sort of reflow/rerender needs to be done for preceeding lines? + # possibly not - as expand_right is distinct from a normal terminal-width change event, + # expand_right being primarily to support other operations such as textblock::table + + #todo - viewport width/height as separate concept to terminal width/height? + #-ellipsis args not used if -wrap is true + foreach {k v} $argsflags { + switch -- $k { + -looplimit - -width - -height - -startcolumn - -bias - -ellipsis - -ellipsistext - -ellipsiswhitespace + - -transparent - -exposed1 - -exposed2 - -experimental + - -expand_right - -appendlines + - -reverse_mode - -crm_mode - -insert_mode + - -cp437 + - -info - -console { + tcl::dict::set opts $k $v + } + -wrap - -autowrap_mode { + #temp alias -autowrap_mode for consistency with renderline + #todo - + tcl::dict::set opts -wrap $v + } + default { + error "overtype::renderspace unknown option '$k'. Known options: [tcl::dict::keys $opts]" + } + } + } + #set opts [tcl::dict::merge $defaults $argsflags] + # -- --- --- --- --- --- + #review - expand_left for RTL text? + set opt_expand_right [tcl::dict::get $opts -expand_right] + #for repl - standard output line indicator is a dash - todo, add a different indicator for a continued line. + set opt_width [tcl::dict::get $opts -width] + set opt_height [tcl::dict::get $opts -height] + set opt_startcolumn [tcl::dict::get $opts -startcolumn] + set opt_appendlines [tcl::dict::get $opts -appendlines] + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_exposed1 [tcl::dict::get $opts -exposed1] ;#widechar_exposed_left - todo + set opt_exposed2 [tcl::dict::get $opts -exposed2] ;#widechar_exposed_right - todo + # -- --- --- --- --- --- + set opt_crm_mode [tcl::dict::get $opts -crm_mode] + set opt_reverse_mode [tcl::dict::get $opts -reverse_mode] + set opt_insert_mode [tcl::dict::get $opts -insert_mode] + ##### + # review -wrap should map onto DECAWM terminal mode - the wrap 2 idea may not fit in with this?. + set opt_autowrap_mode [tcl::dict::get $opts -wrap] + #??? wrap 1 is hard wrap cutting word at exact column, or 1 column early for 2w-glyph, wrap 2 is for language-based word-wrap algorithm (todo) + ##### + # -- --- --- --- --- --- + set opt_cp437 [tcl::dict::get $opts -cp437] + set opt_info [tcl::dict::get $opts -info] + + + + # ---------------------------- + # -experimental dev flag to set flags etc + # ---------------------------- + set data_mode 0 + set edit_mode 0 + set opt_experimental [tcl::dict::get $opts -experimental] + foreach o $opt_experimental { + switch -- $o { + data_mode { + set data_mode 1 + } + edit_mode { + set edit_mode 1 + } + } + } + # ---------------------------- + + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + + #set underlines [split $underblock \n] + + #underblock is a 'rendered' block - so width height make sense + #only non-cursor affecting and non-width occupying ANSI codes should be present. + #ie SGR codes and perhaps things such as PM - although generally those should have been pushed to the application already + #renderwidth & renderheight were originally used with reference to rendering into a 'column' of output e.g a table column - before cursor row/col was implemented. + + if {$opt_width eq "\uFFEF" || $opt_height eq "\uFFEF"} { + lassign [blocksize $underblock] _w renderwidth _h renderheight + if {$opt_width ne "\uFFEF"} { + set renderwidth $opt_width + } + if {$opt_height ne "\uFFEF"} { + set renderheight $opt_height + } + } else { + set renderwidth $opt_width + set renderheight $opt_height + } + #initial state for renderspace 'terminal' reset + set initial_state [dict create\ + renderwidth $renderwidth\ + renderheight $renderheight\ + crm_mode $opt_crm_mode\ + reverse_mode $opt_reverse_mode\ + insert_mode $opt_insert_mode\ + autowrap_mode $opt_autowrap_mode\ + cp437 $opt_cp437\ + ] + #modes + #e.g insert_mode can be toggled by insert key or ansi IRM sequence CSI 4 h|l + #opt_startcolumn ?? - DECSLRM ? + set vtstate $initial_state + + # -- --- --- --- + #REVIEW - do we need ansi resets in the underblock? + if {$underblock eq ""} { + set underlines [lrepeat $renderheight ""] + } else { + set underblock [textblock::join_basic -- $underblock] ;#ensure properly rendered - ansi per-line resets & replays + set underlines [split $underblock \n] + } + #if {$underblock eq ""} { + # set blank "\x1b\[0m\x1b\[0m" + # #set underlines [list "\x1b\[0m\x1b\[0m"] + # set underlines [lrepeat $renderheight $blank] + #} else { + # #lines_as_list -ansiresets 1 will do nothing if -ansiresets 1 isn't specified - REVIEW + # set underlines [lines_as_list -ansiresets 1 $underblock] + #} + # -- --- --- --- + + #todo - reconsider the 'line' as the natural chunking mechanism for the overlay. + #In practice an overlay ANSI stream can be a single line with ansi moves/restores etc - or even have no moves or newlines, just relying on wrapping at the output renderwidth + #In such cases - we process the whole shebazzle for the first output line - only reducing by the applied amount at the head each time, reprocessing the long tail each time. + #(in cases where there are interline moves or cursor jumps anyway) + #This works - but doesn't seem efficient. + #On the other hand.. maybe it depends on the data. For simpler files it's more efficient than splitting first + + #a hack until we work out how to avoid infinite loops... + # + set looplimit [tcl::dict::get $opts -looplimit] + if {$looplimit eq "\uFFEF"} { + #looping for each char is worst case (all newlines?) - anything over that is an indication of something broken? + #do we need any margin above the length? (telnet mapscii.me test) + set looplimit [expr {[tcl::string::length $overblock] + 10}] + } + + #overblock height/width isn't useful in the presence of an ansi input overlay with movements. The number of lines may bear little relationship to the output height + #lassign [blocksize $overblock] _w overblock_width _h overblock_height + + set scheme 4 + switch -- $scheme { + 0 { + #one big chunk + set inputchunks [list $overblock] + } + 1 { + set inputchunks [punk::ansi::ta::split_codes $overblock] + } + 2 { + + #split into lines if possible first - then into plaintext/ansi-sequence chunks ? + set inputchunks [list ""] ;#put an empty plaintext split in for starters + set i 1 + set lines [split $overblock \n] + foreach ln $lines { + if {$i < [llength $lines]} { + append ln \n + } + set sequence_split [punk::ansi::ta::split_codes_single $ln] ;#use split_codes Not split_codes_single? + set lastpt [lindex $inputchunks end] + lset inputchunks end [tcl::string::cat $lastpt [lindex $sequence_split 0]] + lappend inputchunks {*}[lrange $sequence_split 1 end] + incr i + } + } + 3 { + #it turns out line based chunks are faster than the above.. probably because some of those end up doing the regex splitting twice + set lflines [list] + set inputchunks [split $overblock \n] + foreach ln $inputchunks { + append ln \n + lappend lflines $ln + } + if {[llength $lflines]} { + lset lflines end [tcl::string::range [lindex $lflines end] 0 end-1] + } + #set inputchunks $lflines[unset lflines] + set inputchunks [lindex [list $lflines [unset lflines]] 0] + + } + 4 { + set inputchunks [list] + foreach ln [split $overblock \n] { + lappend inputchunks $ln\n + } + if {[llength $inputchunks]} { + lset inputchunks end [tcl::string::range [lindex $inputchunks end] 0 end-1] + } + } + } + + + + + set replay_codes_underlay [tcl::dict::create 1 ""] + #lappend replay_codes_overlay "" + set replay_codes_overlay "[punk::ansi::a]" + set unapplied "" + set cursor_saved_position [tcl::dict::create] + set cursor_saved_attributes "" + + + set outputlines $underlines + set overidx 0 + + #underlines are not necessarily processed in order - depending on cursor-moves applied from overtext + set row 1 + #if {$data_mode} { + # set col [_get_row_append_column $row] + #} else { + set col $opt_startcolumn + #} + + set instruction_stats [tcl::dict::create] + + set loop 0 + #while {$overidx < [llength $inputchunks]} { } + + while {[llength $inputchunks]} { + #set overtext [lindex $inputchunks $overidx]; lset inputchunks $overidx "" + set overtext [lpop inputchunks 0] + if {![tcl::string::length $overtext]} { + incr loop + continue + } + #puts "----->[ansistring VIEW -lf 1 -vt 1 -nul 1 $overtext]<----" + set undertext [lindex $outputlines [expr {$row -1}]] + set renderedrow $row + + #renderline pads each underaly line to width with spaces and should track where end of data is + + + #set overtext [tcl::string::cat [lindex $replay_codes_overlay $overidx] $overtext] + set overtext $replay_codes_overlay$overtext + if {[tcl::dict::exists $replay_codes_underlay $row]} { + set undertext [tcl::dict::get $replay_codes_underlay $row]$undertext + } + #review insert_mode. As an 'overtype' function whose main function is not interactive keystrokes - insert is secondary - + #but even if we didn't want it as an option to the function call - to process ansi adequately we need to support IRM (insertion-replacement mode) ESC [ 4 h|l + set renderopts [list -experimental $opt_experimental\ + -cp437 $opt_cp437\ + -info 1\ + -crm_mode [tcl::dict::get $vtstate crm_mode]\ + -insert_mode [tcl::dict::get $vtstate insert_mode]\ + -autowrap_mode [tcl::dict::get $vtstate autowrap_mode]\ + -reverse_mode [tcl::dict::get $vtstate reverse_mode]\ + -cursor_restore_attributes $cursor_saved_attributes\ + -transparent $opt_transparent\ + -width [tcl::dict::get $vtstate renderwidth]\ + -exposed1 $opt_exposed1\ + -exposed2 $opt_exposed2\ + -expand_right $opt_expand_right\ + -cursor_column $col\ + -cursor_row $row\ + ] + set rinfo [renderline {*}$renderopts $undertext $overtext] + + set instruction [tcl::dict::get $rinfo instruction] + tcl::dict::set vtstate crm_mode [tcl::dict::get $rinfo crm_mode] + tcl::dict::set vtstate insert_mode [tcl::dict::get $rinfo insert_mode] + tcl::dict::set vtstate autowrap_mode [tcl::dict::get $rinfo autowrap_mode] ;# + tcl::dict::set vtstate reverse_mode [tcl::dict::get $rinfo reverse_mode] + #how to support reverse_mode in rendered linelist? we need to examine all pt/code blocks and flip each SGR stack? + # - review - the answer is probably that we don't need to - it is set/reset only during application of overtext + + #Note carefully the difference betw overflow_right and unapplied. + #overflow_right may need to be included in next run before the unapplied data + #overflow_right most commonly has data when in insert_mode + set rendered [tcl::dict::get $rinfo result] + set overflow_right [tcl::dict::get $rinfo overflow_right] + set overflow_right_column [tcl::dict::get $rinfo overflow_right_column] + set unapplied [tcl::dict::get $rinfo unapplied] + set unapplied_list [tcl::dict::get $rinfo unapplied_list] + set post_render_col [tcl::dict::get $rinfo cursor_column] + set post_render_row [tcl::dict::get $rinfo cursor_row] + set c_saved_pos [tcl::dict::get $rinfo cursor_saved_position] + set c_saved_attributes [tcl::dict::get $rinfo cursor_saved_attributes] + set visualwidth [tcl::dict::get $rinfo visualwidth] ;#column width of what is 'rendered' for the line + set insert_lines_above [tcl::dict::get $rinfo insert_lines_above] + set insert_lines_below [tcl::dict::get $rinfo insert_lines_below] + tcl::dict::set replay_codes_underlay [expr {$renderedrow+1}] [tcl::dict::get $rinfo replay_codes_underlay] + + #lset replay_codes_overlay [expr $overidx+1] [tcl::dict::get $rinfo replay_codes_overlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + if {0 && [tcl::dict::get $vtstate reverse_mode]} { + #test branch - todo - prune + puts stderr "---->[ansistring VIEW $replay_codes_overlay] rendered: $rendered" + #review + #JMN3 + set existing_reverse_state 0 + #split_codes_single is single esc sequence - but could have multiple sgr codes within one esc sequence + #e.g \x1b\[0;31;7m has a reset,colour red and reverse + set codeinfo [punk::ansi::codetype::sgr_merge [list $replay_codes_overlay] -info 1] + set codestate_reverse [dict get $codeinfo codestate reverse] + switch -- $codestate_reverse { + 7 { + set existing_reverse_state 1 + } + 27 { + set existing_reverse_state 0 + } + "" { + } + } + if {$existing_reverse_state == 0} { + set rflip [a+ reverse] + } else { + #reverse of reverse + set rflip [a+ noreverse] + } + #note that mergeresult can have multiple esc (due to unmergeables or non sgr codes) + set replay_codes_overlay [punk::ansi::codetype::sgr_merge [list [dict get $codeinfo mergeresult] $rflip]] + puts stderr "---->[ansistring VIEW $replay_codes_overlay] rendered: $rendered" + } + + + + #-- todo - detect looping properly + if {$row > 1 && $overtext ne "" && $unapplied eq $overtext && $post_render_row == $row && $instruction eq ""} { + puts stderr "overtype::renderspace loop?" + puts [ansistring VIEW $rinfo] + break + } + #-- + + if {[tcl::dict::size $c_saved_pos] >= 1} { + set cursor_saved_position $c_saved_pos + set cursor_saved_attributes $c_saved_attributes + } + + + set overflow_handled 0 + + + + set nextprefix "" + + + #todo - handle potential insertion mode as above for cursor restore? + #keeping separate branches for debugging - review and merge as appropriate when stable + set instruction_type [lindex $instruction 0] ;#some instructions have params + tcl::dict::incr instruction_stats $instruction_type + switch -- $instruction_type { + reset { + #reset the 'renderspace terminal' (not underlying terminal) + set row 1 + set col 1 + set vtstate [tcl::dict::merge $vtstate $initial_state] + #todo - clear screen + } + {} { + #end of supplied line input + #lf included in data + set row $post_render_row + set col $post_render_col + if {![llength $unapplied_list]} { + if {$overflow_right ne ""} { + incr row + } + } else { + puts stderr "renderspace end of input line - has unapplied: [ansistring VIEW $unapplied] (review)" + } + set col $opt_startcolumn + } + up { + + #renderline knows it's own line number, and knows not to go above row l + #it knows that a move whilst 1-beyond the width conflicts with the linefeed and reduces the move by one accordingly. + #row returned should be correct. + #column may be the overflow column - as it likes to report that to the caller. + + #Note that an ansi up sequence after last column going up to a previous line and also beyond the last column, will result in the next grapheme going onto the following line. + #this seems correct - as the column remains beyond the right margin so subsequent chars wrap (?) review + #puts stderr "up $post_render_row" + #puts stderr "$rinfo" + + #puts stdout "1 row:$row col $col" + set row $post_render_row + #data_mode (naming?) determines if we move to end of existing data or not. + #data_mode 0 means ignore existing line length and go to exact column + #set by -experimental flag + if {$data_mode == 0} { + set col $post_render_col + } else { + #This doesn't really work if columns are pre-filled with spaces..we can't distinguish them from data + #we need renderline to return the number of the maximum column filled (or min if we ever do r-to-l) + set existingdata [lindex $outputlines [expr {$post_render_row -1}]] + set lastdatacol [punk::ansi::printing_length $existingdata] + if {$lastdatacol < $renderwidth} { + set col [expr {$lastdatacol+1}] + } else { + set col $renderwidth + } + } + + #puts stdout "2 row:$row col $col" + #puts stdout "-----------------------" + #puts stdout $rinfo + #flush stdout + } + down { + if {$data_mode == 0} { + #renderline doesn't know how far down we can go.. + if {$post_render_row > [llength $outputlines]} { + if {$opt_appendlines} { + set diff [expr {$post_render_row - [llength $outputlines]}] + if {$diff > 0} { + lappend outputlines {*}[lrepeat $diff ""] + } + lappend outputlines "" + } + } + set row $post_render_row + set col $post_render_col + } else { + if {$post_render_row > [llength $outputlines]} { + if {$opt_appendlines} { + set diff [expr {$post_render_row - [llength $outputlines]}] + if {$diff > 0} { + lappend outputlines {*}[lrepeat $diff ""] + } + lappend outputlines "" + } + } + set existingdata [lindex $outputlines [expr {$post_render_row -1}]] + set lastdatacol [punk::ansi::printing_length $existingdata] + if {$lastdatacol < $renderwidth} { + set col [expr {$lastdatacol+1}] + } else { + set col $renderwidth + } + + } + } + restore_cursor { + #testfile belinda.ans uses this + + #puts stdout "[a+ blue bold]CURSOR_RESTORE[a]" + if {[tcl::dict::exists $cursor_saved_position row]} { + set row [tcl::dict::get $cursor_saved_position row] + set col [tcl::dict::get $cursor_saved_position column] + #puts stdout "restoring: row $row col $col [ansistring VIEW $cursor_saved_attributes] [a] unapplied [ansistring VIEWCODES $unapplied]" + #set nextprefix $cursor_saved_attributes + #lset replay_codes_overlay [expr $overidx+1] $cursor_saved_attributes + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay]$cursor_saved_attributes + #set replay_codes_overlay $cursor_saved_attributes + set cursor_saved_position [tcl::dict::create] + set cursor_saved_attributes "" + } else { + #TODO + #?restore without save? + #should move to home position and reset ansi SGR? + #puts stderr "overtype::renderspace cursor_restore without save data available" + } + #If we were inserting prior to hitting the cursor_restore - there could be overflow_right data - generally the overtype functions aren't for inserting - but ansi can enable it + #if we were already in overflow when cursor_restore was hit - it shouldn't have been processed as an action - just stored. + if {!$overflow_handled && $overflow_right ne ""} { + #wrap before restore? - possible effect on saved cursor position + #this overflow data has previously been rendered so has no cursor movements or further save/restore operations etc + #we can just insert another call to renderline to solve this.. ? + #It would perhaps be more properly handled as a queue of instructions from our initial renderline call + #we don't need to worry about overflow next call (?)- but we should carry forward our gx and ansi stacks + + puts stdout ">>>[a+ red bold]overflow_right during restore_cursor[a]" + + set sub_info [overtype::renderline\ + -info 1\ + -width [tcl::dict::get $vtstate renderwidth]\ + -insert_mode [tcl::dict::get $vtstate insert_mode]\ + -autowrap_mode [tcl::dict::get $vtstate autowrap_mode]\ + -expand_right [tcl::dict::get $opts -expand_right]\ + ""\ + $overflow_right\ + ] + set foldline [tcl::dict::get $sub_info result] + tcl::dict::set vtstate insert_mode [tcl::dict::get $sub_info insert_mode] ;#probably not needed..? + tcl::dict::set vtstate autowrap_mode [tcl::dict::get $sub_info autowrap_mode] ;#nor this.. + linsert outputlines $renderedrow $foldline + #review - row & col set by restore - but not if there was no save.. + } + set overflow_handled 1 + + } + move { + ######## + if {$post_render_row > [llength $outputlines]} { + #Ansi moves need to create new lines ? + #if {$opt_appendlines} { + # set diff [expr {$post_render_row - [llength $outputlines]}] + # if {$diff > 0} { + # lappend outputlines {*}[lrepeat $diff ""] + # } + # set row $post_render_row + #} else { + set row [llength $outputlines] + #} + } else { + set row $post_render_row + } + ####### + set col $post_render_col + #overflow + unapplied? + } + clear_and_move { + #e.g 2J + if {$post_render_row > [llength $outputlines]} { + set row [llength $outputlines] + } else { + set row $post_render_row + } + set col $post_render_col + set overflow_right "" ;#if we're clearing - any overflow due to insert_mode is irrelevant + set clearedlines [list] + foreach ln $outputlines { + lappend clearedlines \x1b\[0m$replay_codes_overlay[string repeat \000 $renderwidth]\x1b\[0m + if 0 { + + set lineparts [punk::ansi::ta::split_codes $ln] + set numcells 0 + foreach {pt _code} $lineparts { + if {$pt ne ""} { + foreach grapheme [punk::char::grapheme_split $pt] { + switch -- $grapheme { + " " - - - _ - ! - @ - # - $ - % - ^ - & - * - = - + - : - . - , - / - | - ? - + a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - + z - A - B - C - D - E - F - G - H - I - J - K - L - M - N - O - P - Q - R - S - T - U - V - W - X - Y - Z - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 { + incr numcells 1 + } + default { + if {$grapheme eq "\u0000"} { + incr numcells 1 + } else { + incr numcells [grapheme_width_cached $grapheme] + } + } + } + + } + } + } + #replays/resets each line + lappend clearedlines \x1b\[0m$replay_codes_overlay[string repeat \000 $numcells]\x1b\[0m + } + } + set outputlines $clearedlines + #todo - determine background/default to be in effect - DECECM ? + puts stderr "replay_codes_overlay: [ansistring VIEW $replay_codes_overlay]" + #lset outputlines 0 $replay_codes_overlay[lindex $outputlines 0] + + } + lf_start { + #raw newlines + # ---------------------- + #test with fruit.ans + #test - treating as newline below... + #append rendered $overflow_right + #set overflow_right "" + set row $renderedrow + incr row + if {$row > [llength $outputlines]} { + lappend outputlines "" + } + set col $opt_startcolumn + # ---------------------- + } + lf_mid { + + set edit_mode 0 + if {$edit_mode} { + set inputchunks [linsert $inputchunks 0 $overflow_right$unapplied] + set overflow_right "" + set unapplied "" + set row $post_render_row + #set col $post_render_col + set col $opt_startcolumn + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + } else { + if 1 { + if {$overflow_right ne ""} { + if {$opt_expand_right} { + append rendered $overflow_right + set overflow_right "" + } else { + #review - we should really make renderline do the work...? + set overflow_width [punk::ansi::printing_length $overflow_right] + if {$visualwidth + $overflow_width <= $renderwidth} { + append rendered $overflow_right + set overflow_right "" + } else { + if {[tcl::dict::get $vtstate autowrap_mode]} { + set outputlines [linsert $outputlines $renderedrow $overflow_right] + set overflow_right "" + set row [expr {$renderedrow + 2}] + } else { + set overflow_right "" ;#abandon + } + + if {0 && $visualwidth < $renderwidth} { + puts stderr "visualwidth: $visualwidth < renderwidth:$renderwidth" + error "incomplete - abandon?" + set overflowparts [punk::ansi::ta::split_codes $overflow_right] + set remaining_overflow $overflowparts + set filled 0 + foreach {pt code} $overflowparts { + lpop remaining_overflow 0 + if {$pt ne ""} { + set graphemes [punk::char::grapheme_split $pt] + set add "" + set addlen $visualwidth + foreach g $graphemes { + set w [overtype::grapheme_width_cached $g] + if {$addlen + $w <= $renderwidth} { + append add $g + incr addlen $w + } else { + set filled 1 + break + } + } + append rendered $add + } + if {!$filled} { + lpop remaining_overflow 0 ;#pop code + } + } + set overflow_right [join $remaining_overflow ""] + } + } + } + } + set row $post_render_row + set col $opt_startcolumn + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + } else { + #old version - known to work with various ansi graphics - e.g fruit.ans + # - but fails to limit lines to renderwidth when expand_right == 0 + append rendered $overflow_right + set overflow_right "" + set row $post_render_row + set col $opt_startcolumn + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + } + } + } + lf_overflow { + #linefeed after renderwidth e.g at column 81 for an 80 col width + #we may also have other control sequences that came after col 80 e.g cursor save + + if 0 { + set lhs [overtype::left -width 40 -wrap 1 "" [ansistring VIEWSTYLE -nul 1 -lf 1 -vt 1 $rendered]] + set lhs [textblock::frame -title "rendered $visualwidth cols" -subtitle "row-$renderedrow" $lhs] + set rhs "" + + #assertion - there should be no overflow.. + puts $lhs + } + if {![tcl::dict::get $vtstate insert_mode]} { + assert {$overflow_right eq ""} lf_overflow should not get data in overflow_right when not insert_mode + } + + set row $post_render_row + #set row $renderedrow + #incr row + #only add newline if we're at the bottom + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat 1 ""] + } + set col $opt_startcolumn + + } + newlines_above { + #we get a newlines_above instruction when received at column 1 + #In some cases we want to treat that as request to insert a new blank line above, and move our row 1 down (staying with the data) + #in other cases - we want to treat at column 1 the same as any other + + puts "--->newlines_above" + puts "rinfo: $rinfo" + #renderline doesn't advance the row for us - the caller has the choice to implement or not + set row $post_render_row + set col $post_render_col + if {$insert_lines_above > 0} { + set row $renderedrow + set outputlines [linsert $outputlines $renderedrow-1 {*}[lrepeat $insert_lines_above ""]] + incr row [expr {$insert_lines_above -1}] ;#we should end up on the same line of text (at a different index), with new empties inserted above + #? set row $post_render_row #can renderline tell us? + } + } + newlines_below { + #obsolete? - use for ANSI insert lines sequence + if {$data_mode == 0} { + puts --->nl_below + set row $post_render_row + set col $post_render_col + if {$insert_lines_below == 1} { + #set lhs [overtype::left -width 40 -wrap 1 "" [ansistring VIEWSTYLE -lf 1 -vt 1 $rendered]] + #set lhs [textblock::frame -title rendered -subtitle "row-$renderedrow" $lhs] + #set rhs "" + #if {$overflow_right ne ""} { + # set rhs [overtype::left -width 40 -wrap 1 "" [ansistring VIEWSTYLE -lf 1 -vt 1 $overflow_right]] + # set rhs [textblock::frame -title overflow_right $rhs] + #} + #puts [textblock::join $lhs $rhs] + + #rendered + append rendered $overflow_right + # + + + set overflow_right "" + set row $renderedrow + #only add newline if we're at the bottom + if {$row > [llength $outputlines]} { + lappend outputlines {*}[lrepeat $insert_lines_below ""] + } + incr row $insert_lines_below + set col $opt_startcolumn + } + } else { + set row $post_render_row + if {$post_render_row > [llength $outputlines]} { + if {$opt_appendlines} { + set diff [expr {$post_render_row - [llength $outputlines]}] + if {$diff > 0} { + lappend outputlines {*}[lrepeat $diff ""] + } + lappend outputlines "" + } + } else { + set existingdata [lindex $outputlines [expr {$post_render_row -1}]] + set lastdatacol [punk::ansi::printing_length $existingdata] + if {$lastdatacol < $renderwidth} { + set col [expr {$lastdatacol+1}] + } else { + set col $renderwidth + } + } + } + } + wrapmoveforward { + #doesn't seem to be used by fruit.ans testfile + #used by dzds.ans + #note that cursor_forward may move deep into the next line - or even span multiple lines !TODO + set c $renderwidth + set r $post_render_row + if {$post_render_col > $renderwidth} { + set i $c + while {$i <= $post_render_col} { + if {$c == $renderwidth+1} { + incr r + if {$opt_appendlines} { + if {$r < [llength $outputlines]} { + lappend outputlines "" + } + } + set c $opt_startcolumn + } else { + incr c + } + incr i + } + set col $c + } else { + #why are we getting this instruction then? + puts stderr "wrapmoveforward - test" + set r [expr {$post_render_row +1}] + set c $post_render_col + } + set row $r + set col $c + } + wrapmovebackward { + set c $renderwidth + set r $post_render_row + if {$post_render_col < 1} { + set c 1 + set i $c + while {$i >= $post_render_col} { + if {$c == 0} { + if {$r > 1} { + incr r -1 + set c $renderwidth + } else { + #leave r at 1 set c 1 + #testfile besthpav.ans first line top left border alignment + set c 1 + break + } + } else { + incr c -1 + } + incr i -1 + } + set col $c + } else { + puts stderr "Wrapmovebackward - but postrendercol >= 1???" + } + set row $r + set col $c + } + overflow { + #normal single-width grapheme overflow + #puts "----normal overflow --- [ansistring VIEWSTYLE -lf 1 -nul 1 -vt 1 $rendered]" + set row $post_render_row ;#renderline will not advance row when reporting overflow char + if {[tcl::dict::get $vtstate autowrap_mode]} { + incr row + set col $opt_startcolumn ;#whether wrap or not - next data is at column 1 ?? + } else { + set col $post_render_col + #set unapplied "" ;#this seems wrong? + #set unapplied [tcl::string::range $unapplied 1 end] + #The overflow can only be triggered by a grapheme (todo cluster?) - but our unapplied could contain SGR codes prior to the grapheme that triggered overflow - so we need to skip beyond any SGRs + #There may be more than one, because although the stack leading up to overflow may have been merged - codes between the last column and the overflowing grapheme will remain separate + #We don't expect any movement or other ANSI codes - as if they came before the grapheme, they would have triggered a different instruction to 'overflow' + set idx 0 + set next_grapheme_index -1 + foreach u $unapplied_list { + if {![punk::ansi::ta::detect $u]} { + set next_grapheme_index $idx + break + } + incr idx + } + assert {$next_grapheme_index >= 0} + #drop the overflow grapheme - keeping all codes in place. + set unapplied [join [lreplace $unapplied_list $next_grapheme_index $next_grapheme_index] ""] + #we need to run the reduced unapplied on the same line - further graphemes will just overflow again, but codes or control chars could trigger jumps to other lines + + set overflow_handled 1 + #handled by dropping overflow if any + } + } + overflow_splitchar { + set row $post_render_row ;#renderline will not advance row when reporting overflow char + + #2nd half of grapheme would overflow - treggering grapheme is returned in unapplied. There may also be overflow_right from earlier inserts + #todo - consider various options .. re-render a single trailing space or placeholder on same output line, etc + if {[tcl::dict::get $vtstate autowrap_mode]} { + if {$renderwidth < 2} { + #edge case of rendering to a single column output - any 2w char will just cause a loop if we don't substitute with something, or drop the character + set idx 0 + set triggering_grapheme_index -1 + foreach u $unapplied_list { + if {![punk::ansi::ta::detect $u]} { + set triggering_grapheme_index $idx + break + } + incr idx + } + set unapplied [join [lreplace $unapplied_list $triggering_grapheme_index $triggering_grapheme_index $opt_exposed1] ""] + } else { + set col $opt_startcolumn + incr row + } + } else { + set overflow_handled 1 + #handled by dropping entire overflow if any + if {$renderwidth < 2} { + set idx 0 + set triggering_grapheme_index -1 + foreach u $unapplied_list { + if {![punk::ansi::ta::detect $u]} { + set triggering_grapheme_index $idx + break + } + incr idx + } + set unapplied [join [lreplace $unapplied_list $triggering_grapheme_index $triggering_grapheme_index $opt_exposed1] ""] + } + } + + } + vt { + + #can vt add a line like a linefeed can? + set row $post_render_row + set col $post_render_col + } + set_window_title { + set newtitle [lindex $instruction 1] + puts stderr "overtype::renderspace set_window_title [ansistring VIEW $newtitle] instruction '$instruction'" + # + } + reset_colour_palette { + puts stderr "overtype::renderspace instruction '$instruction' unimplemented" + } + default { + puts stderr "overtype::renderspace unhandled renderline instruction '$instruction'" + } + + } + + + if {!$opt_expand_right && ![tcl::dict::get $vtstate autowrap_mode]} { + #not allowed to overflow column or wrap therefore we get overflow data to truncate + if {[tcl::dict::get $opts -ellipsis]} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + set lostdata "" + if {$overflow_right ne ""} { + append lostdata $overflow_right + } + if {$unapplied ne ""} { + append lostdata $unapplied + } + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + #set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + if {[tcl::string::trim [punk::ansi::ansistrip $lostdata]] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set rendered [overtype::right $rendered $opt_ellipsistext] + } + set overflow_handled 1 + } else { + #no wrap - no ellipsis - silently truncate + set overflow_handled 1 + } + } + + + + if {$renderedrow <= [llength $outputlines]} { + lset outputlines [expr {$renderedrow-1}] $rendered + } else { + if {$opt_appendlines} { + lappend outputlines $rendered + } else { + #? + lset outputlines [expr {$renderedrow-1}] $rendered + } + } + + if {!$overflow_handled} { + append nextprefix $overflow_right + } + + append nextprefix $unapplied + + if 0 { + if {$nextprefix ne ""} { + set nextoveridx [expr {$overidx+1}] + if {$nextoveridx >= [llength $inputchunks]} { + lappend inputchunks $nextprefix + } else { + #lset overlines $nextoveridx $nextprefix[lindex $overlines $nextoveridx] + set inputchunks [linsert $inputchunks $nextoveridx $nextprefix] + } + } + } + + if {$nextprefix ne ""} { + set inputchunks [linsert $inputchunks 0 $nextprefix] + } + + + incr overidx + incr loop + if {$loop >= $looplimit} { + puts stderr "overtype::renderspace looplimit reached ($looplimit)" + lappend outputlines "[a+ red bold] - looplimit $looplimit reached[a]" + set Y [a+ yellow bold] + set RST [a] + set sep_header ----DEBUG----- + set debugmsg "" + append debugmsg "${Y}${sep_header}${RST}" \n + append debugmsg "looplimit $looplimit reached\n" + append debugmsg "data_mode:$data_mode\n" + append debugmsg "opt_appendlines:$opt_appendlines\n" + append debugmsg "prev_row :[tcl::dict::get $renderopts -cursor_row]\n" + append debugmsg "prev_col :[tcl::dict::get $renderopts -cursor_column]\n" + tcl::dict::for {k v} $rinfo { + append debugmsg "${Y}$k [ansistring VIEW -lf 1 -vt 1 $v]$RST" \n + } + append debugmsg "${Y}[string repeat - [string length $sep_header]]$RST" \n + + puts stdout $debugmsg + #todo - config regarding error dumps rather than just dumping in working dir + set fd [open [pwd]/error_overtype.txt w] + puts $fd $debugmsg + close $fd + error $debugmsg + break + } + } + + set result [join $outputlines \n] + if {!$opt_info} { + return $result + } else { + #emit to debug window like basictelnet does? make debug configurable as syslog or even a telnet server to allow on 2nd window? + #append result \n$instruction_stats\n + set inforesult [dict create\ + result $result\ + last_instruction $instruction\ + instruction_stats $instruction_stats\ + ] + if {$opt_info == 2} { + return [pdict -channel none inforesult] + } else { + return $inforesult + } + } + } + + #todo - left-right ellipsis ? + proc centre {args} { + variable default_ellipsis_horizontal + if {[llength $args] < 2} { + error {usage: ?-transparent [0|1]? ?-bias [left|right]? ?-overflow [1|0]? undertext overtext} + } + + foreach {underblock overblock} [lrange $args end-1 end] break + + #todo - vertical vs horizontal overflow for blocks + set opts [tcl::dict::create\ + -bias left\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -overflow 0\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + ] + set argsflags [lrange $args 0 end-2] + foreach {k v} $argsflags { + switch -- $k { + -bias - -ellipsis - -ellipsistext - -ellipsiswhitespace - -overflow - -transparent - -exposed1 - -exposed2 { + tcl::dict::set opts $k $v + } + default { + set known_opts [tcl::dict::keys $opts] + error "overtype::centre unknown option '$k'. Known options: $known_opts" + } + } + } + #set opts [tcl::dict::merge $defaults $argsflags] + # -- --- --- --- --- --- + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsis [tcl::dict::get $opts -ellipsis] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + # -- --- --- --- --- --- + + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + set underlines [split $underblock \n] + #set renderwidth [tcl::mathfunc::max {*}[lmap v $underlines {punk::ansi::printing_length $v}]] + lassign [blocksize $underblock] _w renderwidth _h renderheight + set overlines [split $overblock \n] + lassign [blocksize $overblock] _w overblock_width _h overblock_height + set under_exposed_max [expr {$renderwidth - $overblock_width}] + if {$under_exposed_max > 0} { + #background block is wider + if {$under_exposed_max % 2 == 0} { + #even left/right exposure + set left_exposed [expr {$under_exposed_max / 2}] + } else { + set beforehalf [expr {$under_exposed_max / 2}] ;#1 less than half due to integer division + if {[tcl::string::tolower [tcl::dict::get $opts -bias]] eq "left"} { + set left_exposed $beforehalf + } else { + #bias to the right + set left_exposed [expr {$beforehalf + 1}] + } + } + } else { + set left_exposed 0 + } + + set outputlines [list] + if {[punk::ansi::ta::detect_sgr [lindex $overlines 0]]} { + set replay_codes "[punk::ansi::a]" + } else { + set replay_codes "" + } + set replay_codes_underlay "" + set replay_codes_overlay "" + foreach undertext $underlines overtext $overlines { + set overtext_datalen [punk::ansi::printing_length $overtext] + set ulen [punk::ansi::printing_length $undertext] + if {$ulen < $renderwidth} { + set udiff [expr {$renderwidth - $ulen}] + set undertext "$undertext[string repeat { } $udiff]" + } + set undertext $replay_codes_underlay$undertext + set overtext $replay_codes_overlay$overtext + + set overflowlength [expr {$overtext_datalen - $renderwidth}] + #review - right-to-left langs should elide on left! - extra option required + + if {$overflowlength > 0} { + #overlay line wider or equal + #review - we still expand_right for centred for now.. possibly should do something like -expand_leftright with ellipsis each end? + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -expand_right [tcl::dict::get $opts -overflow] -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 $undertext $overtext] + set rendered [tcl::dict::get $rinfo result] + set overflow_right [tcl::dict::get $rinfo overflow_right] + set unapplied [tcl::dict::get $rinfo unapplied] + #todo - get replay_codes from overflow_right instead of wherever it was truncated? + + #overlay line data is wider - trim if overflow not specified in opts - and overtype an ellipsis at right if it was specified + if {![tcl::dict::get $opts -overflow]} { + #lappend outputlines [tcl::string::range $overtext 0 [expr {$renderwidth - 1}]] + #set overtext [tcl::string::range $overtext 0 $renderwidth-1 ] + if {$opt_ellipsis} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + #don't use tcl::string::range on ANSI data + #set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + set lostdata "" + if {$overflow_right ne ""} { + append lostdata $overflow_right + } + if {$unapplied ne ""} { + append lostdata $unapplied + } + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set rendered [overtype::right $rendered $opt_ellipsistext] + } + } + } + lappend outputlines $rendered + #lappend outputlines [renderline -insert_mode 0 -transparent $opt_transparent $undertext $overtext] + } else { + #background block is wider than or equal to data for this line + #lappend outputlines [renderline -insert_mode 0 -startcolumn [expr {$left_exposed + 1}] -transparent $opt_transparent -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 $undertext $overtext] + set rinfo [renderline -info 1 -insert_mode 0 -startcolumn [expr {$left_exposed + 1}] -transparent $opt_transparent -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 $undertext $overtext] + lappend outputlines [tcl::dict::get $rinfo result] + } + set replay_codes_underlay [tcl::dict::get $rinfo replay_codes_underlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + } + return [join $outputlines \n] + } + + #overtype::right is for a rendered ragged underblock and a rendered ragged overblock + #ie we can determine the block width for bost by examining the lines and picking the longest. + # + proc right {args} { + #NOT the same as align-right - which should be done to the overblock first if required + variable default_ellipsis_horizontal + # @d !todo - implement overflow, length checks etc + + if {[llength $args] < 2} { + error {usage: ?-overflow [1|0]? ?-transparent 0|? undertext overtext} + } + foreach {underblock overblock} [lrange $args end-1 end] break + + set opts [tcl::dict::create\ + -bias ignored\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -overflow 0\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -align "left"\ + ] + set argsflags [lrange $args 0 end-2] + tcl::dict::for {k v} $argsflags { + switch -- $k { + -bias - -ellipsis - -ellipsistext - -ellipsiswhitespace - -overflow - -transparent - -exposed1 - -exposed2 - -align { + tcl::dict::set opts $k $v + } + default { + set known_opts [tcl::dict::keys $opts] + error "overtype::centre unknown option '$k'. Known options: $known_opts" + } + } + } + #set opts [tcl::dict::merge $defaults $argsflags] + # -- --- --- --- --- --- + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsis [tcl::dict::get $opts -ellipsis] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_overflow [tcl::dict::get $opts -overflow] + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + set opt_align [tcl::dict::get $opts -align] + # -- --- --- --- --- --- + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + set underlines [split $underblock \n] + lassign [blocksize $underblock] _w renderwidth _h renderheight + set overlines [split $overblock \n] + #set overblock_width [tcl::mathfunc::max {*}[lmap v $overlines {punk::ansi::printing_length $v}]] + lassign [blocksize $overblock] _w overblock_width _h overblock_height + set under_exposed_max [expr {max(0,$renderwidth - $overblock_width)}] + set left_exposed $under_exposed_max + + + + set outputlines [list] + if {[punk::ansi::ta::detect_sgr [lindex $overlines 0]]} { + set replay_codes "[punk::ansi::a]" + } else { + set replay_codes "" + } + set replay_codes_underlay "" + set replay_codes_overlay "" + foreach undertext $underlines overtext $overlines { + set overtext_datalen [punk::ansi::printing_length $overtext] + set ulen [punk::ansi::printing_length $undertext] + if {$ulen < $renderwidth} { + set udiff [expr {$renderwidth - $ulen}] + #puts xxx + append undertext [string repeat { } $udiff] + } + if {$overtext_datalen < $overblock_width} { + set odiff [expr {$overblock_width - $overtext_datalen}] + switch -- $opt_align { + left { + set startoffset 0 + } + right { + set startoffset $odiff + } + default { + set half [expr {$odiff / 2}] + #set lhs [string repeat { } $half] + #set righthalf [expr {$odiff - $half}] ;#remainder - may be one more - so we are biased left + #set rhs [string repeat { } $righthalf] + set startoffset $half + } + } + } else { + set startoffset 0 ;#negative? + } + + set undertext $replay_codes_underlay$undertext + set overtext $replay_codes_overlay$overtext + + set overflowlength [expr {$overtext_datalen - $renderwidth}] + if {$overflowlength > 0} { + #raw overtext wider than undertext column + set rinfo [renderline\ + -info 1\ + -insert_mode 0\ + -transparent $opt_transparent\ + -exposed1 $opt_exposed1 -exposed2 $opt_exposed2\ + -overflow $opt_overflow\ + -startcolumn [expr {1 + $startoffset}]\ + $undertext $overtext] + set replay_codes [tcl::dict::get $rinfo replay_codes] + set rendered [tcl::dict::get $rinfo result] + if {!$opt_overflow} { + if {$opt_ellipsis} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set ellipsis $replay_codes$opt_ellipsistext + #todo - overflow on left if allign = right?? + set rendered [overtype::right $rendered $ellipsis] + } + } + } + lappend outputlines $rendered + } else { + #padded overtext + #lappend outputlines [renderline -insert_mode 0 -transparent $opt_transparent -startcolumn [expr {$left_exposed + 1}] $undertext $overtext] + #Note - we still need overflow(exapnd_right) here - as although the overtext is short - it may oveflow due to the startoffset + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -expand_right $opt_overflow -startcolumn [expr {$left_exposed + 1 + $startoffset}] $undertext $overtext] + lappend outputlines [tcl::dict::get $rinfo result] + } + set replay_codes [tcl::dict::get $rinfo replay_codes] + set replay_codes_underlay [tcl::dict::get $rinfo replay_codes_underlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + } + + return [join $outputlines \n] + } + + proc left {args} { + overtype::block -blockalign left {*}$args + } + #overtype a (possibly ragged) underblock with a (possibly ragged) overblock + proc block {args} { + variable default_ellipsis_horizontal + # @d !todo - implement overflow, length checks etc + + if {[llength $args] < 2} { + error {usage: ?-blockalign left|centre|right? ?-textalign left|centre|right? ?-overflow [1|0]? ?-transparent 0|? undertext overtext} + } + #foreach {underblock overblock} [lrange $args end-1 end] break + lassign [lrange $args end-1 end] underblock overblock + + set opts [tcl::dict::create\ + -ellipsis 0\ + -ellipsistext $default_ellipsis_horizontal\ + -ellipsiswhitespace 0\ + -overflow 0\ + -transparent 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -textalign "left"\ + -textvertical "top"\ + -blockalign "left"\ + -blockalignbias left\ + -blockvertical "top"\ + ] + set argsflags [lrange $args 0 end-2] + tcl::dict::for {k v} $argsflags { + switch -- $k { + -blockalignbias - -ellipsis - -ellipsistext - -ellipsiswhitespace - -overflow - -transparent - -exposed1 - -exposed2 - -textalign - -blockalign - -blockvertical { + tcl::dict::set opts $k $v + } + default { + error "overtype::block unknown option '$k'. Known options: [tcl::dict::keys $opts]" + } + } + } + # -- --- --- --- --- --- + set opt_transparent [tcl::dict::get $opts -transparent] + set opt_ellipsis [tcl::dict::get $opts -ellipsis] + set opt_ellipsistext [tcl::dict::get $opts -ellipsistext] + set opt_ellipsiswhitespace [tcl::dict::get $opts -ellipsiswhitespace] + set opt_overflow [tcl::dict::get $opts -overflow] + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + set opt_textalign [tcl::dict::get $opts -textalign] + set opt_blockalign [tcl::dict::get $opts -blockalign] + if {$opt_blockalign eq "center"} { + set opt_blockalign "centre" + } + # -- --- --- --- --- --- + + set underblock [tcl::string::map {\r\n \n} $underblock] + set overblock [tcl::string::map {\r\n \n} $overblock] + + set underlines [split $underblock \n] + lassign [blocksize $underblock] _w renderwidth _h renderheight + set overlines [split $overblock \n] + #set overblock_width [tcl::mathfunc::max {*}[lmap v $overlines {punk::ansi::printing_length $v}]] + lassign [blocksize $overblock] _w overblock_width _h overblock_height + set under_exposed_max [expr {max(0,$renderwidth - $overblock_width)}] + + switch -- $opt_blockalign { + left { + set left_exposed 0 + } + right { + set left_exposed $under_exposed_max + } + centre { + if {$under_exposed_max > 0} { + #background block is wider + if {$under_exposed_max % 2 == 0} { + #even left/right exposure + set left_exposed [expr {$under_exposed_max / 2}] + } else { + set beforehalf [expr {$under_exposed_max / 2}] ;#1 less than half due to integer division + if {[tcl::string::tolower [tcl::dict::get $opts -blockalignbias]] eq "left"} { + set left_exposed $beforehalf + } else { + #bias to the right + set left_exposed [expr {$beforehalf + 1}] + } + } + } else { + set left_exposed 0 + } + } + default { + set left_exposed 0 + } + } + + + + set outputlines [list] + if {[punk::ansi::ta::detect_sgr [lindex $overlines 0]]} { + set replay_codes "[punk::ansi::a]" + } else { + set replay_codes "" + } + set replay_codes_underlay "" + set replay_codes_overlay "" + foreach undertext $underlines overtext $overlines { + set overtext_datalen [punk::ansi::printing_length $overtext] + set ulen [punk::ansi::printing_length $undertext] + if {$ulen < $renderwidth} { + set udiff [expr {$renderwidth - $ulen}] + #puts xxx + append undertext [string repeat { } $udiff] + } + if {$overtext_datalen < $overblock_width} { + set odiff [expr {$overblock_width - $overtext_datalen}] + switch -- $opt_textalign { + left { + set startoffset 0 + } + right { + set startoffset $odiff + } + default { + set half [expr {$odiff / 2}] + #set lhs [string repeat { } $half] + #set righthalf [expr {$odiff - $half}] ;#remainder - may be one more - so we are biased left + #set rhs [string repeat { } $righthalf] + set startoffset $half + } + } + } else { + set startoffset 0 ;#negative? + } + + set undertext $replay_codes_underlay$undertext + set overtext $replay_codes_overlay$overtext + + set overflowlength [expr {$overtext_datalen - $renderwidth}] + if {$overflowlength > 0} { + #raw overtext wider than undertext column + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -exposed1 $opt_exposed1 -exposed2 $opt_exposed2 -expand_right $opt_overflow -startcolumn [expr {1 + $startoffset}] $undertext $overtext] + set replay_codes [tcl::dict::get $rinfo replay_codes] + set rendered [tcl::dict::get $rinfo result] + set overflow_right [tcl::dict::get $rinfo overflow_right] + set unapplied [tcl::dict::get $rinfo unapplied] + if {!$opt_overflow} { + if {$opt_ellipsis} { + set show_ellipsis 1 + if {!$opt_ellipsiswhitespace} { + #we don't want ellipsis if only whitespace was lost + #don't use tcl::string::range on ANSI data + #set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + set lostdata "" + if {$overflow_right ne ""} { + append lostdata $overflow_right + } + if {$unapplied ne ""} { + append lostdata $unapplied + } + if {[tcl::string::trim $lostdata] eq ""} { + set show_ellipsis 0 + } + } + if {$show_ellipsis} { + set rendered [overtype::block -blockalign right $rendered $opt_ellipsistext] + } + } + + #if {$opt_ellipsis} { + # set show_ellipsis 1 + # if {!$opt_ellipsiswhitespace} { + # #we don't want ellipsis if only whitespace was lost + # set lostdata [tcl::string::range $overtext end-[expr {$overflowlength-1}] end] + # if {[tcl::string::trim $lostdata] eq ""} { + # set show_ellipsis 0 + # } + # } + # if {$show_ellipsis} { + # set ellipsis [tcl::string::cat $replay_codes $opt_ellipsistext] + # #todo - overflow on left if allign = right?? + # set rendered [overtype::right $rendered $ellipsis] + # } + #} + } + lappend outputlines $rendered + } else { + #padded overtext + #lappend outputlines [renderline -insert_mode 0 -transparent $opt_transparent -startcolumn [expr {$left_exposed + 1}] $undertext $overtext] + #Note - we still need expand_right here - as although the overtext is short - it may oveflow due to the startoffset + set rinfo [renderline -info 1 -insert_mode 0 -transparent $opt_transparent -expand_right $opt_overflow -startcolumn [expr {$left_exposed + 1 + $startoffset}] $undertext $overtext] + #puts stderr "--> [ansistring VIEW -lf 1 -nul 1 $rinfo] <--" + set overflow_right [tcl::dict::get $rinfo overflow_right] + set unapplied [tcl::dict::get $rinfo unapplied] + lappend outputlines [tcl::dict::get $rinfo result] + } + set replay_codes [tcl::dict::get $rinfo replay_codes] + set replay_codes_underlay [tcl::dict::get $rinfo replay_codes_underlay] + set replay_codes_overlay [tcl::dict::get $rinfo replay_codes_overlay] + } + + return [join $outputlines \n] + } + + variable optimise_ptruns 10 ;# can be set to zero to disable the ptruns branches + + # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### + # renderline written from a left-right line orientation perspective as a first-shot at getting something useful. + # ultimately right-to-left, top-to-bottom and bottom-to-top are probably needed. + # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### + # + # + #-returnextra enables returning of overflow and length + #review - use punk::ansi::ta::detect to short-circuit processing and do simpler string calcs as an optimisation? + #review - DECSWL/DECDWL double width line codes - very difficult/impossible to align and compose with other elements + #(could render it by faking it with sixels and a lot of work - find/make a sixel font and ensure it's exactly 2 cols per char? + # This would probably be impractical to support for different fonts) + #todo - review transparency issues with single/double width characters + #bidi - need a base direction and concept of directional runs for RTL vs LTR - may be best handled at another layer? + proc renderline {args} { + #*** !doctools + #[call [fun overtype::renderline] [arg args] ] + #[para] renderline is the core engine for overtype string processing (frames & textblocks), and the raw mode commandline repl for the Tcl Punk Shell + #[para] It is also a central part of an ansi (micro) virtual terminal-emulator of sorts + #[para] This system does a half decent job at rendering 90's ANSI art to manipulable colour text blocks that can be joined & framed for layout display within a unix or windows terminal + #[para] Renderline helps maintain ANSI text styling reset/replay codes so that the styling of one block doesn't affect another. + #[para] Calling on the punk::ansi library - it can coalesce codes to keep the size down. + #[para] It is a giant mess of doing exactly what common wisdom says not to do... lots at once. + #[para] renderline is part of the Unicode and ANSI aware Overtype system which 'renders' a block of text onto a static underlay + #[para] The underlay is generally expected to be an ordered set of lines or a rectangular text block analogous to a terminal screen - but it can also be ragged in line length, or just blank. + #[para] The overlay couuld be similar - in which case it may often be used to overwrite a column or section of the underlay. + #[para] The overlay could however be a sequence of ANSI-laden text that jumps all over the place. + # + #[para] renderline itself only deals with a single line - or sometimes a single character. It is generally called from a loop that does further terminal-like or textblock processing. + #[para] By suppyling the -info 1 option - it can return various fields indicating the state of the render. + #[para] The main 3 are the result, overflow_right, and unapplied. + #[para] Renderline handles cursor movements from either keystrokes or ANSI sequences but for a full system the aforementioned loop will need to be in place to manage the set of lines under manipulation. + + #puts stderr "renderline '$args'" + variable optimise_ptruns + + if {[llength $args] < 2} { + error {usage: ?-info 0|1? ?-startcolumn ? ?-cursor_column ? ?-cursor_row |""? ?-transparent [0|1|]? ?-expand_right [1|0]? undertext overtext} + } + set under [lindex $args end-1] + set over [lindex $args end] + #lassign [lrange $args end-1 end] under over + if {[string last \n $under] >= 0} { + error "overtype::renderline not allowed to contain newlines in undertext" + } + #if {[string first \n $over] >=0 || [string first \n $under] >= 0} { + # error "overtype::renderline not allowed to contain newlines" + #} + + #generally faster to create a new dict in the proc than to use a namespace variable to create dict once and link to variable (2024 8.6/8.7) + set opts [tcl::dict::create\ + -etabs 0\ + -width \uFFEF\ + -expand_right 0\ + -transparent 0\ + -startcolumn 1\ + -cursor_column 1\ + -cursor_row ""\ + -insert_mode 1\ + -crm_mode 0\ + -autowrap_mode 1\ + -reverse_mode 0\ + -info 0\ + -exposed1 \uFFFD\ + -exposed2 \uFFFD\ + -cursor_restore_attributes ""\ + -cp437 0\ + -experimental {}\ + ] + #-cursor_restore_attributes only - for replay stack - position and actual setting/restoring handled by throwback to caller + + #cursor_row, when numeric will allow detection of certain row moves that are still within our row - allowing us to avoid an early return + #An empty string for cursor_row tells us we have no info about our own row context, and to return with an unapplied string if any row move occurs + + #exposed1 and exposed2 for first and second col of underying 2wide char which is truncated by transparency, currsor movements to 2nd charcol, or overflow/expand_right + #todo - return info about such grapheme 'cuts' in -info structure and/or create option to raise an error + + set argsflags [lrange $args 0 end-2] + tcl::dict::for {k v} $argsflags { + switch -- $k { + -experimental - -cp437 - -width - -expand_right - -transparent - -startcolumn - -cursor_column - -cursor_row + - -crm_mode - -insert_mode - -autowrap_mode - -reverse_mode + - -info - -exposed1 - -exposed2 - -cursor_restore_attributes { + tcl::dict::set opts $k $v + } + default { + error "overtype::renderline unknown option '$k'. Known options: [tcl::dict::keys $opts]" + } + } + } + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_width [tcl::dict::get $opts -width] + set opt_etabs [tcl::dict::get $opts -etabs] + set opt_expand_right [tcl::dict::get $opts -expand_right] + set opt_colstart [tcl::dict::get $opts -startcolumn] ;#lhs limit for overlay - an offset to cursor_column - first visible column is 1. 0 or < 0 are before the start of the underlay + set opt_colcursor [tcl::dict::get $opts -cursor_column];#start cursor column relative to overlay + set opt_row_context [tcl::dict::get $opts -cursor_row] + if {[string length $opt_row_context]} { + if {![tcl::string::is integer -strict $opt_row_context] || $opt_row_context <1 } { + error "overtype::renderline -cursor_row must be empty for unspecified/unknown or a non-zero positive integer. received: '$opt_row_context'" + } + } + # -- --- --- --- --- --- --- --- --- --- --- --- + #The _mode flags correspond to terminal modes that can be set/reset via escape sequences (e.g DECAWM wraparound mode) + set opt_insert_mode [tcl::dict::get $opts -insert_mode];#should usually be 1 for each new line in editor mode but must be initialised to 1 externally (review) + #default is for overtype + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_autowrap_mode [tcl::dict::get $opts -autowrap_mode] ;#DECAWM - char or movement can go beyond leftmost/rightmost col to prev/next line + set opt_reverse_mode [tcl::dict::get $opts -reverse_mode] ;#DECSNM + set opt_crm_mode [tcl::dict::get $opts -crm_mode];# CRM - show control character mode + # -- --- --- --- --- --- --- --- --- --- --- --- + set temp_cursor_saved [tcl::dict::get $opts -cursor_restore_attributes] + + set cp437_glyphs [tcl::dict::get $opts -cp437] + set cp437_map [tcl::dict::create] + if {$cp437_glyphs} { + set cp437_map [set ::punk::ansi::cp437_map] + #for cp437 images we need to map these *after* splitting ansi + #some old files might use newline for its glyph.. but we can't easily support that. + #Not sure how old files did it.. maybe cr lf in sequence was newline and any lone cr or lf were displayed as glyphs? + tcl::dict::unset cp437_map \n + } + + set opt_transparent [tcl::dict::get $opts -transparent] + if {$opt_transparent eq "0"} { + set do_transparency 0 + } else { + set do_transparency 1 + if {$opt_transparent eq "1"} { + set opt_transparent {[\s]} + } + } + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_returnextra [tcl::dict::get $opts -info] + # -- --- --- --- --- --- --- --- --- --- --- --- + set opt_exposed1 [tcl::dict::get $opts -exposed1] + set opt_exposed2 [tcl::dict::get $opts -exposed2] + # -- --- --- --- --- --- --- --- --- --- --- --- + + if {$opt_row_context eq ""} { + set cursor_row 1 + } else { + set cursor_row $opt_row_context + } + + set insert_mode $opt_insert_mode ;#default 1 + set autowrap_mode $opt_autowrap_mode ;#default 1 + set crm_mode $opt_crm_mode ;#default 0 (Show Control Character mode) + set reverse_mode $opt_reverse_mode + + #----- + # + if {[info exists punk::console::tabwidth]} { + #punk console is updated if punk::console::set_tabstop_width is used or rep is started/restarted + #It is way too slow to test the current width by querying the terminal here - so it could conceivably get out of sync + #todo - we also need to handle the new threaded repl where console config is in a different thread. + # - also - concept of sub-regions being mini-consoles with their own settings - 'id' for console, or use in/out channels as id? + set tw $::punk::console::tabwidth + } else { + set tw 8 + } + + set overdata $over + if {!$cp437_glyphs} { + #REVIEW! tabify will give different answers for an ANSI colourised string vs plain text + if {!$opt_etabs} { + if {[string first \t $under] >= 0} { + #set under [textutil::tabify::untabify2 $under] + set under [textutil::tabify::untabifyLine $under $tw] + } + if {[string first \t $over] >= 0} { + #set overdata [textutil::tabify::untabify2 $over] + set overdata [textutil::tabify::untabifyLine $over $tw] + } + } + } + #------- + + #ta_detect ansi and do simpler processing? + + #we repeat tests for grapheme width in different loops - rather than create another datastructure to store widths based on column, + #we'll use the grapheme_width_cached function as a lookup table of all graphemes encountered - as there will often be repeats in different positions anyway. + + # -- --- --- --- --- --- --- --- + if {$under ne ""} { + if {[punk::ansi::ta::detect $under]} { + set undermap [punk::ansi::ta::split_codes_single $under] + } else { + #single plaintext part + set undermap [list $under] + } + } else { + set undermap [list] + } + set understacks [list] + set understacks_gx [list] + set pm_list [list] + + set i_u -1 ;#underlay may legitimately be empty + set undercols [list] + set u_codestack [list] + #u_gx_stack probably isn't really a stack - I don't know if g0 g1 can stack or not - for now we support only g0 anyway + set u_gx_stack [list] ;#separate stack for g0 (g1 g2 g3?) graphics on and off (DEC special graphics) + #set pt_underchars "" ;#for string_columns length calculation for expand_right 0 truncation + set remainder [list] ;#for returnextra + foreach {pt code} $undermap { + #pt = plain text + #append pt_underchars $pt + if {$pt ne ""} { + if {$cp437_glyphs} { + set pt [tcl::string::map $cp437_map $pt] + } + set is_ptrun 0 + if {$optimise_ptruns && [tcl::string::length $pt] >= $optimise_ptruns} { + set p1 [tcl::string::index $pt 0] + set hex [format %x [scan $p1 %c]] ;#punk::char::char_hex + set re [tcl::string::cat {^[} \\U$hex {]+$}] + set is_ptrun [regexp $re $pt] + } + if {$is_ptrun} { + #switch -- $p1 { + # " " - - - _ - ! - @ - # - $ - % - ^ - & - * - = - + - : - . - , - / - | - ? - + # a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - + # z - A - B - C - D - E - F - G - H - I - J - K - L - M - N - O - P - Q - R - S - T - U - V - W - X - Y - Z - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 { + # set width 1 + # } + # default { + # if {$p1 eq "\u0000"} { + # #use null as empty cell representation - review + # #use of this will probably collide with some application at some point + # #consider an option to set the empty cell character + # set width 1 + # } else { + # set width [grapheme_width_cached $p1] ;# when zero??? + # } + # } + #} + set width [grapheme_width_cached $p1] ;# when zero??? + set ptlen [string length $pt] + if {$width <= 1} { + #review - 0 and 1? + incr i_u $ptlen + lappend understacks {*}[lrepeat $ptlen $u_codestack] + lappend understacks_gx {*}[lrepeat $ptlen $u_gx_stack] + lappend undercols {*}[lrepeat $ptlen $p1] + } else { + incr i_u $ptlen ;#2nd col empty str - so same as above + set 2ptlen [expr {$ptlen * 2}] + lappend understacks {*}[lrepeat $2ptlen $u_codestack] + lappend understacks_gx {*}[lrepeat $2ptlen $u_gx_stack] + set l [concat {*}[lrepeat $ptlen [list $p1 ""]] + lappend undercols {*}$l + unset l + } + + } else { + foreach grapheme [punk::char::grapheme_split $pt] { + #an ugly but easy hack to serve *some* common case ascii quickly with byte-compiled literal switch - feels dirty. + #.. but even 0.5uS per char (grapheme_width_cached) adds up quickly when stitching lots of lines together. + #todo - test decimal value instead, compare performance + switch -- $grapheme { + " " - - - _ - ! - @ - # - $ - % - ^ - & - * - = - + - : - . - , - / - | - ? - + a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - + z - A - B - C - D - E - F - G - H - I - J - K - L - M - N - O - P - Q - R - S - T - U - V - W - X - Y - Z - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 { + set width 1 + } + default { + if {$grapheme eq "\u0000"} { + #use null as empty cell representation - review + #use of this will probably collide with some application at some point + #consider an option to set the empty cell character + set width 1 + } else { + #zero width still acts as 1 below??? review what should happen + set width [grapheme_width_cached $grapheme] + #we still want most controls and other zero-length codepoints such as \u200d (zero width joiner) to stay zero-length + #we substitute lone ESC that weren't captured within ANSI context as a debugging aid to see malformed ANSI + #todo - default to off and add a flag (?) to enable this substitution + set sub_stray_escapes 0 + if {$sub_stray_escapes && $width == 0} { + if {$grapheme eq "\x1b"} { + set gvis [ansistring VIEW $grapheme] ;#can only use with graphemes that have a single replacement char.. + set grapheme $gvis + set width 1 + } + } + } + } + } + + #set width [grapheme_width_cached $grapheme] + incr i_u + lappend understacks $u_codestack + lappend understacks_gx $u_gx_stack + + lappend undercols $grapheme + if {$width > 1} { + #presumably there are no triple-column or wider unicode chars.. until the aliens arrive.(?) + #but what about emoji combinations etc - can they be wider than 2? + #todo - if -etabs enabled - then we treat \t as the width determined by our elastic tabstop + incr i_u + lappend understacks $u_codestack + lappend understacks_gx $u_gx_stack + lappend undercols "" + } + } + + } + } + #underlay should already have been rendered and not have non-sgr codes - but let's retain the check for them and not stack them if other codes are here + + #only stack SGR (graphics rendition) codes - not title sets, cursor moves etc + #keep any remaining PMs in place + if {$code ne ""} { + set c1c2 [tcl::string::range $code 0 1] + + set leadernorm [tcl::string::range [tcl::string::map [list\ + \x1b\[ 7CSI\ + \x9b 8CSI\ + \x1b\( 7GFX\ + \x1b^ 7PMX\ + \x1bX 7SOS\ + ] $c1c2] 0 3];# leadernorm is 1st 2 chars mapped to normalised indicator - or is original 2 chars + + switch -- $leadernorm { + 7CSI - 8CSI { + #need to exclude certain leaders after the lb e.g < for SGR 1006 mouse + #REVIEW - what else could end in m but be mistaken as a normal SGR code here? + set maybemouse "" + if {[tcl::string::index $c1c2 0] eq "\x1b"} { + set maybemouse [tcl::string::index $code 2] + } + + if {$maybemouse ne "<" && [tcl::string::index $code end] eq "m"} { + if {[punk::ansi::codetype::is_sgr_reset $code]} { + set u_codestack [list "\x1b\[m"] + } elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + set u_codestack [list $code] + } else { + #basic simplification first.. straight dups + set dup_posns [lsearch -all -exact $u_codestack $code] ;#-exact because of square-bracket glob chars + set u_codestack [lremove $u_codestack {*}$dup_posns] + lappend u_codestack $code + } + } + } + 7GFX { + switch -- [tcl::string::index $code 2] { + "0" { + set u_gx_stack [list gx0_on] ;#we'd better use a placeholder - or debugging will probably get into a big mess + } + B { + set u_gx_stack [list] + } + } + } + 7PMX - 7SOS { + #we can have PMs or SOS (start of string) in the underlay - though mostly the PMs should have been processed.. + #attach the PM/SOS (entire ANSI sequence) to the previous grapheme! + #It should not affect the size - but terminal display may get thrown out if terminal doesn't support them. + + #note that there may in theory already be ANSI stored - we don't assume it was a pure grapheme string + set graphemeplus [lindex $undercols end] + if {$graphemeplus ne "\0"} { + append graphemeplus $code + } else { + set graphemeplus $code + } + lset undercols end $graphemeplus + #The grapheme_width_cached function will be called on this later - and doesn't account for ansi. + #we need to manually cache the item with it's proper width + variable grapheme_widths + #stripped and plus version keys pointing to same length + dict set grapheme_widths $graphemeplus [grapheme_width_cached [::punk::ansi::ansistrip $graphemeplus]] + + } + default { + + } + + } + + #if {[punk::ansi::codetype::is_sgr_reset $code]} { + # #set u_codestack [list] + #} elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + #} elseif {[punk::ansi::codetype::is_sgr $code]} { + #} else { + # #leave SGR stack as is + # if {[punk::ansi::codetype::is_gx_open $code]} { + # } elseif {[punk::ansi::codetype::is_gx_close $code]} { + # } + #} + } + #consider also if there are other codes that should be stacked..? + } + + #NULL empty cell indicator + if {$opt_width ne "\uFFEF"} { + if {[llength $understacks]} { + set cs $u_codestack + set gs $u_gx_stack + } else { + set cs [list] + set gs [list] + } + if {[llength $undercols]< $opt_width} { + set diff [expr {$opt_width- [llength $undercols]}] + if {$diff > 0} { + #set undercols [list {*}$undercols {*}[lrepeat $diff "\u0000"]] ;#2024 - much slower + lappend undercols {*}[lrepeat $diff "\u0000"] + lappend understacks {*}[lrepeat $diff $cs] + lappend understacks_gx {*}[lrepeat $diff $gs] + } + } + } + + if {$opt_width ne "\uFFEF"} { + set renderwidth $opt_width + } else { + set renderwidth [llength $undercols] + } + + + if 0 { + # ----------------- + # if we aren't extending understacks & understacks_gx each time we incr idx above the undercols length.. this doesn't really serve a purpose + # Review. + # ----------------- + #replay code for last overlay position in input line + # whether or not we get that far - we need to return it for possible replay on next line + if {[llength $understacks]} { + lappend understacks $u_codestack + lappend understacks_gx $u_gx_stack + } else { + #in case overlay onto emptystring as underlay + lappend understacks [list] + lappend understacks_gx [list] + } + # ----------------- + } + + #trailing codes in effect for underlay + if {[llength $u_codestack]} { + #set replay_codes_underlay [join $u_codestack ""] + set replay_codes_underlay [punk::ansi::codetype::sgr_merge_list {*}$u_codestack] + } else { + set replay_codes_underlay "" + } + + + # -- --- --- --- --- --- --- --- + #### + #if opt_colstart - we need to build a space (or any singlewidth char ?) padding on the left containing the right number of columns. + #this will be processed as transparent - and handle doublewidth underlay characters appropriately + set startpadding [string repeat " " [expr {$opt_colstart -1}]] + #overdata with left padding spaces based on col-start under will show through for left-padding portion regardless of -transparency + if {$startpadding ne "" || $overdata ne ""} { + if {[punk::ansi::ta::detect $overdata]} { + set overmap [punk::ansi::ta::split_codes_single $startpadding$overdata] + } else { + #single plaintext part + set overmap [list $startpadding$overdata] + } + } else { + set overmap [list] + } + #### + + + #todo - detect plain ascii no-ansi input line (aside from leading ansi reset) + #will that allow some optimisations? + + #todo - detect repeated transparent char in overlay + #regexp {^(.)\1+$} ;#backtracking regexp - relatively slow. + # - try set c [string index $data 0]; regexp [string map [list %c% $c] {^[%c%]+$}] $data + #we should be able to optimize to pass through the underlay?? + + #??? + set colcursor $opt_colstart + #TODO - make a little virtual column object + #we need to refer to column1 or columnmin? or columnmax without calculating offsets due to to startcolumn + #need to lock-down what start column means from perspective of ANSI codes moving around - the offset perspective is unclear and a mess. + + + #set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} + #as at 2024-02 punk::char::grapheme_split uses these - not aware of more complex graphemes + + set overstacks [list] + set overstacks_gx [list] + + set o_codestack [list]; #SGR codestack (not other codes such as movement,insert key etc) + set o_gxstack [list] + set pt_overchars "" + set i_o 0 + set overlay_grapheme_control_list [list] ;#tag each with g, sgr or other. 'other' are things like cursor-movement or insert-mode or codes we don't recognise/use + #experiment + set overlay_grapheme_control_stacks [list] + foreach {pt code} $overmap { + if {$pt ne ""} { + #todo - wrap in test for empty pt (we used split_codes_single - and it may be common for sgr sequences to be unmerged and so have empty pts between) + if {$cp437_glyphs} { + set pt [tcl::string::map $cp437_map $pt] + } + append pt_overchars $pt + #will get empty pt between adjacent codes + if {!$crm_mode} { + + set is_ptrun 0 + if {$optimise_ptruns && [tcl::string::length $pt] >= $optimise_ptruns} { + set p1 [tcl::string::index $pt 0] + set re [tcl::string::cat {^[} \\U[format %x [scan $p1 %c]] {]+$}] + set is_ptrun [regexp $re $pt] + + #leading only? we would have to check for graphemes at the trailing boundary? + #set re [tcl::string::cat {^[} \\U[format %x [scan $p1 %c]] {]+}] + #set is_ptrun [regexp -indices $re $pt runrange] + #if {$is_ptrun && 1} { + #} + } + if {$is_ptrun} { + #review - in theory a run over a certain length won't be part of some grapheme combo (graphemes can be long e.g 44?, but not as runs(?)) + #could be edge cases for runs at line end? (should be ok as we include trailing \n in our data) + set len [string length $pt] + set g_element [list g $p1] + + #lappend overstacks {*}[lrepeat $len $o_codestack] + #lappend overstacks_gx {*}[lrepeat $len $o_gxstack] + #incr i_o $len + #lappend overlay_grapheme_control_list {*}[lrepeat $len [list g $p1]] + #lappend overlay_grapheme_control_stacks {*}[lrepeat $len $o_codestack] + + set pi 0 + incr i_o $len + while {$pi < $len} { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + lappend overlay_grapheme_control_list $g_element + lappend overlay_grapheme_control_stacks $o_codestack + incr pi + } + } else { + foreach grapheme [punk::char::grapheme_split $pt] { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + incr i_o + lappend overlay_grapheme_control_list [list g $grapheme] + lappend overlay_grapheme_control_stacks $o_codestack + } + } + } else { + set tsbegin [clock micros] + foreach grapheme_original [punk::char::grapheme_split $pt] { + set pt_crm [ansistring VIEW -nul 1 -lf 2 -vt 2 -ff 2 $grapheme_original] + #puts stderr "ptlen [string length $pt] graphemelen[string length $grapheme_original] pt_crmlen[string length $pt_crm] $pt_crm" + foreach grapheme [punk::char::grapheme_split $pt_crm] { + if {$grapheme eq "\n"} { + lappend overlay_grapheme_control_stacks $o_codestack + lappend overlay_grapheme_control_list [list crmcontrol "\x1b\[00001E"] + } else { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + incr i_o + lappend overlay_grapheme_control_list [list g $grapheme] + lappend overlay_grapheme_control_stacks $o_codestack + } + } + } + set elapsed [expr {[clock micros] - $tsbegin}] + puts stderr "ptlen [string length $pt] elapsedus:$elapsed" + } + } + + #only stack SGR (graphics rendition) codes - not title sets, cursor moves etc + #order of if-else based on assumptions: + # that pure resets are fairly common - more so than leading resets with other info + # that non-sgr codes are not that common, so ok to check for resets before verifying it is actually SGR at all. + if {$code ne ""} { + #we need to immediately set crm_mode here if \x1b\[3h received + if {$code eq "\x1b\[3h"} { + set crm_mode 1 + } elseif {$code eq "\x1b\[3l"} { + set crm_mode 0 + } + #else crm_mode could be set either way from options + if {$crm_mode && $code ne "\x1b\[00001E"} { + #treat the code as type 'g' like above - only allow through codes to reset mode REVIEW for now just \x1b\[3l ? + #we need to somehow convert further \n in the graphical rep to an instruction for newline that will bypass further crm_mode processing or we would loop. + set code_as_pt [ansistring VIEW -nul 1 -lf 1 -vt 1 -ff 1 $code] + #split using standard split for first foreach loop - grapheme based split when processing 2nd foreach loop + set chars [split $code_as_pt ""] + set codeparts [list] ;#list of 2-el lists each element {crmcontrol } or {g } + foreach c $chars { + if {$c eq "\n"} { + #use CNL (cursor next line) \x1b\[00001E ;#leading zeroes ok for this processor - used as debugging aid to distinguish + lappend codeparts [list crmcontrol "\x1b\[00001E"] + } else { + if {[llength $codeparts] > 0 && [lindex $codeparts end 0] eq "g"} { + set existing [lindex $codeparts end 1] + lset codeparts end [list g [string cat $existing $c]] + } else { + lappend codeparts [list g $c] + } + } + } + + set partidx 0 + foreach record $codeparts { + lassign $record rtype rval + switch -exact -- $rtype { + g { + append pt_overchars $rval + foreach grapheme [punk::char::grapheme_split $rval] { + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + incr i_o + lappend overlay_grapheme_control_list [list g $grapheme] + lappend overlay_grapheme_control_stacks $o_codestack + } + } + crmcontrol { + #leave o_codestack + lappend overlay_grapheme_control_stacks $o_codestack + lappend overlay_grapheme_control_list [list crmcontrol $rval] + } + } + } + } else { + lappend overlay_grapheme_control_stacks $o_codestack + #there will always be an empty code at end due to foreach on 2 vars with odd-sized list ending with pt (overmap coming from perlish split) + if {[punk::ansi::codetype::is_sgr_reset $code]} { + set o_codestack [list "\x1b\[m"] ;#reset better than empty list - fixes some ansi art issues + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + set o_codestack [list $code] + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[priv::is_sgr $code]} { + #basic simplification first - remove straight dupes + set dup_posns [lsearch -all -exact $o_codestack $code] ;#must be -exact because of square-bracket glob chars + set o_codestack [lremove $o_codestack {*}$dup_posns] + lappend o_codestack $code + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[regexp {\x1b7|\x1b\[s} $code]} { + #experiment + #cursor_save - for the replays review. + #jmn + #set temp_cursor_saved [punk::ansi::codetype::sgr_merge_list {*}$o_codestack] + lappend overlay_grapheme_control_list [list other $code] + } elseif {[regexp {\x1b8|\x1b\[u} $code]} { + #experiment + #cursor_restore - for the replays + set o_codestack [list $temp_cursor_saved] + lappend overlay_grapheme_control_list [list other $code] + } else { + #review + if {[punk::ansi::codetype::is_gx_open $code]} { + set o_gxstack [list "gx0_on"] + lappend overlay_grapheme_control_list [list gx0 gx0_on] ;#don't store code - will complicate debugging if we spit it out and jump character sets + } elseif {[punk::ansi::codetype::is_gx_close $code]} { + set o_gxstack [list] + lappend overlay_grapheme_control_list [list gx0 gx0_off] ;#don't store code - will complicate debugging if we spit it out and jump character sets + } else { + lappend overlay_grapheme_control_list [list other $code] + } + } + } + } + + } + #replay code for last overlay position in input line - should take account of possible trailing sgr code after last grapheme + set max_overlay_grapheme_index [expr {$i_o -1}] + lappend overstacks $o_codestack + lappend overstacks_gx $o_gxstack + + #set replay_codes_overlay [join $o_codestack ""] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}$o_codestack] + + #if {[tcl::dict::exists $overstacks $max_overlay_grapheme_index]} { + # set replay_codes_overlay [join [tcl::dict::get $overstacks $max_overlay_grapheme_index] ""] + #} else { + # set replay_codes_overlay "" + #} + # -- --- --- --- --- --- --- --- + + + #potential problem - combinining diacritics directly following control chars like \r \b + + # -- --- --- + #we need to initialise overflow_idx before any potential row-movements - as they need to perform a loop break and force in_excess to 1 + if {$opt_expand_right} { + #expand_right true means we can have lines as long as we want, but either way there can be excess data that needs to be thrown back to the calling loop. + #we currently only support horizontal expansion to the right (review regarding RTL text!) + set overflow_idx -1 + } else { + #expand_right zero - we can't grow beyond our column width - so we get ellipsis or truncation + if {$opt_width ne "\uFFEF"} { + set overflow_idx [expr {$opt_width}] + } else { + #review - this is also the cursor position when adding a char at end of line? + set overflow_idx [expr {[llength $undercols]}] ;#index at which we would be *in* overflow a row move may still override it + } + } + # -- --- --- + + set outcols $undercols ;#leave undercols as is, outcols can potentially be appended to. + + set unapplied "" ;#if we break for move row (but not for /v ?) + set unapplied_list [list] + + set insert_lines_above 0 ;#return key + set insert_lines_below 0 + set instruction "" + + # -- --- --- + #cursor_save_dec, cursor_restore_dec etc + set cursor_restore_required 0 + set cursor_saved_attributes "" + set cursor_saved_position "" + # -- --- --- + + #set idx 0 ;# line index (cursor - 1) + #set idx [expr {$opt_colstart + $opt_colcursor} -1] + + #idx is the per column output index + set idx [expr {$opt_colcursor -1}] ;#don't use opt_colstart here - we have padded and won't start emitting until idx reaches opt_colstart-1 + #cursor_column is usually one above idx - but we have opt_colstart which is like a margin - todo: remove cursor_column from the following loop and calculate it's offset when breaking or at end. + #(for now we are incrementing/decrementing both in sync - which is a bit silly) + set cursor_column $opt_colcursor + + #idx_over is the per grapheme overlay index + set idx_over -1 + + + #movements only occur within the overlay range. + #an underlay is however not necessary.. e.g + #renderline -expand_right 1 "" data + + #set re_mode {\x1b\[\?([0-9]*)(h|l)} ;#e.g DECAWM + #set re_col_move {\x1b\[([0-9]*)(C|D|G)$} + #set re_row_move {\x1b\[([0-9]*)(A|B)$} + #set re_both_move {\x1b\[([0-9]*)(?:;){0,1}([0-9]*)H$} ;# or "f" ? + #set re_vt_sequence {\x1b\[([0-9]*)(?:;){0,1}([0-9]*)~$} + #set re_cursor_save {\x1b\[s$} ;#note probable incompatibility with DECSLRM (set left right margin)! + #set re_cursor_restore {\x1b\[u$} + #set re_cursor_save_dec {\x1b7$} + #set re_cursor_restore_dec {\x1b8$} + #set re_other_single {\x1b(D|M|E)$} + #set re_decstbm {\x1b\[([0-9]*)(?:;){0,1}([0-9]*)r$} ;#DECSTBM set top and bottom margins + + #puts "-->$overlay_grapheme_control_list<--" + #puts "-->overflow_idx: $overflow_idx" + for {set gci 0} {$gci < [llength $overlay_grapheme_control_list]} {incr gci} { + set gc [lindex $overlay_grapheme_control_list $gci] + lassign $gc type item + + #emit plaintext chars first using existing SGR codes from under/over stack as appropriate + #then check if the following code is a cursor movement within the line and adjust index if so + #foreach ch $overlay_graphemes {} + switch -- $type { + g { + set ch $item + #crm_mode affects both graphic and control + if {0 && $crm_mode} { + set chars [ansistring VIEW -nul 1 -lf 2 -vt 2 -ff 2 $ch] + set chars [string map [list \n "\x1b\[00001E"] $chars] + if {[llength [split $chars ""]] > 1} { + priv::render_unapplied $overlay_grapheme_control_list $gci + #prefix the unapplied controls with the string version of this control + set unapplied_list [linsert $unapplied_list 0 {*}[split $chars ""]] + set unapplied [join $unapplied_list ""] + #incr idx_over + break + } else { + set ch $chars + } + } + incr idx_over; #idx_over (until unapplied reached anyway) is per *grapheme* in the overlay - not per col. + if {($idx < ($opt_colstart -1))} { + incr idx [grapheme_width_cached $ch] + continue + } + #set within_undercols [expr {$idx <= [llength $undercols]-1}] ;#within our active data width + set within_undercols [expr {$idx <= $renderwidth-1}] + + #https://www.enigma.com/resources/blog/the-secret-world-of-newline-characters + #\x85 NEL in the c1 control set is treated by some terminal emulators (e.g Hyper) as a newline, + #on some it's invisble but doesn't change the line, on some it's a visible glyph of width 1. + #This is hard to process in any standard manner - but I think the Hyper behaviour of doing what it was intended is perhaps most reasonable + #We will map it to the same behaviour as lf here for now... but we need also to consider the equivalent ANSI sequence: \x1bE + + set chtest [tcl::string::map [list \n \x85 \b \r \v \x7f ] $ch] + #puts --->chtest:$chtest + #specials - each shoud have it's own test of what to do if it happens after overflow_idx reached + switch -- $chtest { + "" { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + if {$idx == 0} { + #puts "---a at col 1" + #linefeed at column 1 + #leave the overflow_idx ;#? review + set instruction lf_start ;#specific instruction for newline at column 1 + priv::render_unapplied $overlay_grapheme_control_list $gci + break + } elseif {$overflow_idx != -1 && $idx == $overflow_idx} { + #linefeed after final column + #puts "---c at overflow_idx=$overflow_idx" + incr cursor_row + set overflow_idx $idx ;#override overflow_idx even if it was set to -1 due to expand_right = 1 + set instruction lf_overflow ;#only special treatment is to give it it's own instruction in case caller needs to handle differently + priv::render_unapplied $overlay_grapheme_control_list $gci + break + } else { + #linefeed occurred in middle or at end of text + #puts "---mid-or-end-text-linefeed idx:$idx overflow_idx:$overflow_idx" + if {$insert_mode == 0} { + incr cursor_row + if {$idx == -1 || $overflow_idx > $idx} { + #don't set overflow_idx higher if it's already set lower and we're adding graphemes to overflow + set overflow_idx $idx ;#override overflow_idx even if it was set to -1 due to expand_right = 1 + } + set instruction lf_mid + priv::render_unapplied $overlay_grapheme_control_list $gci + break + } else { + incr cursor_row + #don't adjust the overflow_idx + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction lf_mid + break ;# could have overdata following the \n - don't keep processing + } + } + + } + "" { + #will we want/need to use raw for keypresses in terminal? (terminal with LNM in standard reset mode means enter= this is the usual config for terminals) + #So far we are assuming the caller has translated to and handle above.. REVIEW. + + #consider also the old space-carriagereturn softwrap convention used in some terminals. + #In the context of rendering to a block of text - this works similarly in that the space gets eaten so programs emitting space-cr at the terminal width col will pretty much get what they expect. + set idx [expr {$opt_colstart -1}] + set cursor_column $opt_colstart ;#? + } + "" { + #literal backspace char - not necessarily from keyboard + #review - backspace effect on double-width chars - we are taking a column-editing perspective in overtype + #(important for -transparent option - hence replacement chars for half-exposed etc) + #review - overstrike support as per nroff/less (generally considered an old technology replaced by unicode mechanisms and/or ansi SGR) + if {$idx > ($opt_colstart -1)} { + incr idx -1 + incr cursor_column -1 + } else { + set flag 0 + if $flag { + #review - conflicting requirements? Need a different sequence for destructive interactive backspace? + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction backspace_at_start + break + } + } + } + "" { + #literal del character - some terminals send just this for what is generally expected to be a destructive backspace + #We instead treat this as a pure delete at current cursor position - it is up to the repl or terminal to remap backspace key to a sequence that has the desired effect. + priv::render_delchar $idx + } + "" { + #end processing this overline. rest of line is remainder. cursor for column as is. + #REVIEW - this theoretically depends on terminal's vertical tabulation setting (name?) + #e.g it could be configured to jump down 6 rows. + #On the other hand I've seen indications that some modern terminal emulators treat it pretty much as a linefeed. + #todo? + incr cursor_row + set overflow_idx $idx + #idx_over has already been incremented as this is both a movement-control and in some sense a grapheme + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction vt + break + } + default { + if {$overflow_idx != -1} { + #review - how to check arbitrary length item such as tab is going to overflow .. before we get to overflow_idx? + #call grapheme_width_cached on each ch, or look for tab specifically as it's currently the only known reason to have a grapheme width > 2? + #we need to decide what a tab spanning the overflow_idx means and how it affects wrap etc etc + if {$idx == $overflow_idx-1} { + set owidth [grapheme_width_cached $ch] + if {$owidth == 2} { + #review split 2w overflow? + #we don't want to make the decision here to split a 2w into replacement characters at end of line and beginning of next line + #better to consider the overlay char as unable to be applied to the line + #render empty column(?) - and reduce overlay grapheme index by one so that the current ch goes into unapplied + #throwing back to caller with instruction complicates its job - but is necessary to avoid making decsions for it here. + priv::render_addchar $idx "" [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #change the overflow_idx + set overflow_idx $idx + incr idx + incr idx_over -1 ;#set overlay grapheme index back one so that sgr stack from previous overlay grapheme used + priv::render_unapplied $overlay_grapheme_control_list [expr {$gci-1}] ;#note $gci-1 instead of just gci + #throw back to caller's loop - add instruction to caller as this is not the usual case + #caller may for example choose to render a single replacement char to this line and omit the grapheme, or wrap it to the next line + set instruction overflow_splitchar + break + } elseif {$owidth > 2} { + #? tab? + #TODO! + puts stderr "overtype::renderline long overtext grapheme '[ansistring VIEW -lf 1 -vt 1 $ch]' not handled" + #tab of some length dependent on tabstops/elastic tabstop settings? + } + } elseif {$idx >= $overflow_idx} { + #REVIEW + set next_gc [lindex $overlay_grapheme_control_list $gci+1] ;#next grapheme or control + lassign $next_gc next_type next_item + if {$autowrap_mode || $next_type ne "g"} { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci-1]] + #set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #don't incr idx beyond the overflow_idx + #idx_over already incremented - decrement so current overlay grapheme stacks go to unapplied + incr idx_over -1 + #priv::render_unapplied $overlay_grapheme_control_list [expr {$gci-1}] ;#back one index here too + priv::render_this_unapplied $overlay_grapheme_control_list $gci ;# + set instruction overflow + break + } else { + #no point throwing back to caller for each grapheme that is overflowing + #without this branch - renderline would be called with overtext reducing only by one grapheme per call + #processing a potentially long overtext each time (ie - very slow) + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #JMN4 + + } + } + } else { + #review. + #overflow_idx = -1 + #This corresponds to expand_right being true (at least until overflow_idx is in some cases forced to a value when throwing back to calling loop) + } + + if {($do_transparency && [regexp $opt_transparent $ch])} { + #pre opt_colstart is effectively transparent (we have applied padding of required number of columns to left of overlay) + if {$idx > [llength $outcols]-1} { + lappend outcols " " + #tcl::dict::set understacks $idx [list] ;#review - use idx-1 codestack? + #lset understacks $idx [list] ;#will get index $i out of range error + lappend understacks [list] ;#REVIEW + incr idx + incr cursor_column + } else { + #todo - punk::char::char_width + set g [lindex $outcols $idx] + #JMN + set uwidth [grapheme_width_cached $g] + if {[lindex $outcols $idx] eq ""} { + #2nd col of 2-wide char in underlay + incr idx + incr cursor_column + } elseif {$uwidth == 0} { + #e.g control char ? combining diacritic ? + incr idx + incr cursor_column + } elseif {$uwidth == 1} { + set owidth [grapheme_width_cached $ch] + incr idx + incr cursor_column + if {$owidth > 1} { + incr idx + incr cursor_column + } + } elseif {$uwidth > 1} { + if {[grapheme_width_cached $ch] == 1} { + if {!$insert_mode} { + #normal singlewide transparent overlay onto double-wide underlay + set next_pt_overchar [tcl::string::index $pt_overchars $idx_over+1] ;#lookahead of next plain-text char in overlay + if {$next_pt_overchar eq ""} { + #special-case trailing transparent - no next_pt_overchar + incr idx + incr cursor_column + } else { + if {[regexp $opt_transparent $next_pt_overchar]} { + incr idx + incr cursor_column + } else { + #next overlay char is not transparent.. first-half of underlying 2wide char is exposed + #priv::render_addchar $idx $opt_exposed1 [tcl::dict::get $overstacks $idx_over] [tcl::dict::get $overstacks_gx $idx_over] $insert_mode + priv::render_addchar $idx $opt_exposed1 [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + } + } + } else { + #? todo - decide what transparency even means for insert mode + incr idx + incr cursor_column + } + } else { + #2wide transparency over 2wide in underlay - review + incr idx + incr cursor_column + } + } + } + } else { + + set idxchar [lindex $outcols $idx] + #non-transparent char in overlay or empty cell + if {$idxchar eq "\u0000"} { + #empty/erased cell indicator + set uwidth 1 + } else { + set uwidth [grapheme_width_cached $idxchar] + } + if {$within_undercols} { + if {$idxchar eq ""} { + #2nd col of 2wide char in underlay + if {!$insert_mode} { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] 0 + #JMN - this has to expose if our startposn chopped an underlay - but not if we already overwrote the first half of the widechar underlay grapheme + #e.g renderline \uFF21\uFF21--- a\uFF23\uFF23 + #vs + # renderline -startcolumn 2 \uFF21---- \uFF23 + if {[lindex $outcols $idx-1] != ""} { + #verified it's an empty following a filled - so it's a legit underlay remnant (REVIEW - when would it not be??) + #reset previous to an exposed 1st-half - but leave understacks code as is + priv::render_addchar [expr {$idx-1}] $opt_exposed1 [lindex $understacks $idx-1] [lindex $understacks_gx $idx-1] 0 + } + incr idx + } else { + set prevcolinfo [lindex $outcols $idx-1] + #for insert mode - first replace the empty 2ndhalf char with exposed2 before shifting it right + #REVIEW - this leaves a replacement character permanently in our columns.. but it is consistent regarding length (?) + #The alternative is to disallow insertion at a column cursor that is at 2nd half of 2wide char + #perhaps by inserting after the char - this may be worthwhile - but may cause other surprises + #It is perhaps best avoided at another level and try to make renderline do exactly as it's told + #the advantage of this 2w splitting method is that the inserted character ends up in exactly the column we expect. + priv::render_addchar $idx $opt_exposed2 [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] 0 ;#replace not insert + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] 1 ;#insert - same index + if {$prevcolinfo ne ""} { + #we've split the 2wide - it may already have been rendered as an exposed1 - but not for example if our startcolumn was current idx + priv::render_addchar [expr {$idx-1}] $opt_exposed1 [lindex $understacks $idx-1] [lindex $understacks_gx $idx-1] 0 ;#replace not insert + } ;# else?? + incr idx + } + if {$cursor_column < [llength $outcols] || $overflow_idx == -1} { + incr cursor_column + } + } elseif {$uwidth == 0} { + #what if this is some other c0/c1 control we haven't handled specifically? + + #by emitting a preceding empty-string column - we associate whatever this char is with the preceeding non-zero-length character and any existing zero-lengths that follow it + #e.g combining diacritic - increment before over char REVIEW + #arguably the previous overchar should have done this - ie lookahead for combiners? + #if we can get a proper grapheme_split function - this should be easier to tidy up. + priv::render_addchar $idx "" [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column 2 + + if {$cursor_column > [llength $outcols] && $overflow_idx != -1} { + set cursor_column [llength $outcols] + } + } elseif {$uwidth == 1} { + #includes null empty cells + set owidth [grapheme_width_cached $ch] + if {$owidth == 1} { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + } else { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + priv::render_addchar $idx "" [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #if next column in underlay empty - we've overwritten first half of underlying 2wide grapheme + #replace with rhs exposure in case there are no more overlay graphemes coming - use underlay's stack + if {([llength $outcols] >= $idx +2) && [lindex $outcols $idx+1] eq ""} { + priv::render_addchar [expr {$idx+1}] $opt_exposed2 [lindex $understacks $idx+1] [lindex $understacks_gx $idx+1] $insert_mode + } + incr idx + } + if {($cursor_column < [llength $outcols]) || $overflow_idx == -1} { + incr cursor_column + } + } elseif {$uwidth > 1} { + set owidth [grapheme_width_cached $ch] + if {$owidth == 1} { + #1wide over 2wide in underlay + if {!$insert_mode} { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + priv::render_addchar $idx $opt_exposed2 [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #don't incr idx - we are just putting a broken-indication in the underlay - which may get overwritten by next overlay char + } else { + #insert mode just pushes all to right - no exposition char here + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + } + } else { + #2wide over 2wide + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx 2 + incr cursor_column 2 + } + + if {$cursor_column > [llength $outcols] && $overflow_idx != -1} { + set cursor_column [llength $outcols] + } + } + } else { + priv::render_addchar $idx $ch [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + incr idx + incr cursor_column + } + } + } + } ;# end switch + + + } + other - crmcontrol { + if {$crm_mode && $type ne "crmcontrol" && $item ne "\x1b\[00001E"} { + if {$item eq "\x1b\[3l"} { + set crm_mode 0 + } else { + #When our initial overlay split was done - we weren't in crm_mode - so there are codes that weren't mapped to unicode control character representations + #set within_undercols [expr {$idx <= $renderwidth-1}] + #set chars [ansistring VIEW -nul 1 -lf 2 -vt 2 -ff 2 $item] + set chars [ansistring VIEW -nul 1 -lf 1 -vt 1 -ff 1 $item] + priv::render_unapplied $overlay_grapheme_control_list $gci + #prefix the unapplied controls with the string version of this control + set unapplied_list [linsert $unapplied_list 0 {*}[split $chars ""]] + set unapplied [join $unapplied_list ""] + + break + } + } + + #todo - consider CSI s DECSLRM vs ansi.sys \x1b\[s - we need \x1b\[s for oldschool ansi art - but may have to enable only for that. + #we should possibly therefore reverse this mapping so that x1b7 x1b8 are the primary codes for save/restore? + set code [tcl::string::map [list \x1b7 \x1b\[s \x1b8 \x1b\[u ] $item] + #since this element isn't a grapheme - advance idx_over to next grapheme overlay when about to fill 'unapplied' + + + #remap of DEC cursor_save/cursor_restore from ESC sequence to equivalent CSI + #probably not ideal - consider putting cursor_save/cursor_restore in functions so they can be called from the appropriate switch branch instead of using this mapping + #review - cost/benefit of function calls within these switch-arms instead of inline code? + + set c1 [tcl::string::index $code 0] + set c1c2c3 [tcl::string::range $code 0 2] + #set re_ST_open {(?:\033P|\u0090|\033X|\u0098|\033\^|\u009e|\033_|\u009f)} + #tcl 8.7 - faster to use inline list than to store it in a local var outside of loop. + #(somewhat surprising) + set leadernorm [tcl::string::range [tcl::string::map [list\ + \x1b\[< 1006\ + \x1b\[ 7CSI\ + \x1bY 7MAP\ + \x1bP 7DCS\ + \x90 8DCS\ + \x9b 8CSI\ + \x1b\] 7OSC\ + \x9d 8OSC\ + \x1b 7ESC\ + ] $c1c2c3] 0 3] ;#leadernorm is 1st 1,2 or 3 chars mapped to 4char normalised indicator - or is original first chars (1,2 or 3 len) + + #we leave the tail of the code unmapped for now + switch -- $leadernorm { + 1006 { + #https://invisible-island.net/xterm/ctlseqs/ctlseqs.html + #SGR (1006) CSI < followed by colon separated encoded-button-value,px,py ordinates and final M for button press m for button release + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 3 end]] + } + 7CSI - 7OSC { + #set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 2 end]] + set codenorm $leadernorm[tcl::string::range $code 2 end] + } + 7DCS { + #ESC P + #Device Control String https://invisible-island.net/xterm/ctlseqs/ctlseqs.html#h4-Controls-beginning-with-ESC:ESC-F.C74 + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 2 end]] + } + 8DCS { + #8-bit Device Control String + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 1 end]] + } + 7MAP { + #map to another type of code to share implementation branch + set codenorm $leadernorm[tcl::string::range $code 1 end] + } + 7ESC { + #set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 1 end]] + set codenorm $leadernorm[tcl::string::range $code 1 end] + } + 8CSI - 8OSC { + set codenorm [tcl::string::cat $leadernorm [tcl::string::range $code 1 end]] + } + default { + puts stderr "Sequence detected as ANSI, but not handled in leadernorm switch. code: [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + #we haven't made a mapping for this + #could in theory be 1,2 or 3 in len + #although we shouldn't be getting here if the regexp for ansi codes is kept in sync with our switch branches + set codenorm $code + } + } + + switch -- $leadernorm { + 7MAP { + switch -- [lindex $codenorm 4] { + Y { + #vt52 movement. we expect 2 chars representing position (limited range) + set params [tcl::string::range $codenorm 5 end] + if {[tcl::string::length $params] != 2} { + #shouldn't really get here or need this branch if ansi splitting was done correctly + puts stderr "overtype::renderline ESC Y recognised as vt52 move, but incorrect parameters length ([string length $params] vs expected 2) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented codenorm:[ansistring VIEW -lf 1 -vt 1 -nul 1 $codenorm]" + } + set line [tcl::string::index $params 5] + set column [tcl::string::index $params 1] + set r [expr {[scan $line %c] -31}] + set c [expr {[scan $column %c] -31}] + + #MAP to: + #CSI n;m H - CUP - Cursor Position + set leadernorm 7CSI + set codenorm "$leadernorm${r}\;${c}H" + } + } + } + } + + #we've mapped 7 and 8bit escapes to values we can handle as literals in switch statements to take advantange of jump tables. + switch -- $leadernorm { + 1006 { + #TODO + # + switch -- [tcl::string::index $codenorm end] { + M { + puts stderr "mousedown $codenorm" + } + m { + puts stderr "mouseup $codenorm" + } + } + + } + {7CSI} - {8CSI} { + set param [tcl::string::range $codenorm 4 end-1] + #puts stdout "--> CSI [tcl::string::index $leadernorm 0] bit param:$param" + set code_end [tcl::string::index $codenorm end] ;#used for e.g h|l set/unset mode + + switch -exact -- $code_end { + A { + #Row move - up + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #todo + lassign [split $param {;}] num modifierkey + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + + if {$num eq ""} {set num 1} + incr cursor_row -$num + + if {$cursor_row < 1} { + set cursor_row 1 + } + + #ensure rest of *overlay* is emitted to remainder + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction up + #retain cursor_column + break + } + B { + #CUD - Cursor Down + #Row move - down + lassign [split $param {;}] num modifierkey + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #move down + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + if {$num eq ""} {set num 1} + incr cursor_row $num + + + incr idx_over ;#idx_over hasn't encountered a grapheme and hasn't advanced yet + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction down + #retain cursor_column + break + } + C { + #CUF - Cursor Forward + #Col move + #puts stdout "->forward" + #todo - consider right-to-left cursor mode (e.g Hebrew).. some day. + #cursor forward + #right-arrow/move forward + lassign [split $param {;}] num modifierkey + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + if {$num eq ""} {set num 1} + + #todo - retrict to moving 1 position past datalen? restrict to column width? + #should ideally wrap to next line when interactive and not on last row + #(some ansi art seems to expect this behaviour) + #This presumably depends on the terminal's wrap mode + #e.g DECAWM autowrap mode + # CSI ? 7 h - set: autowrap (also tput smam) + # CSI ? 7 l - reset: no autowrap (also tput rmam) + set version 2 + if {$version eq "2"} { + set max [llength $outcols] + if {$overflow_idx == -1} { + incr max + } + if {$cursor_column == $max+1} { + #move_forward while in overflow + incr cursor_column -1 + } + + if {($cursor_column + $num) <= $max} { + incr idx $num + incr cursor_column $num + } else { + if {$autowrap_mode} { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #jmn + if {$idx == $overflow_idx} { + incr num + } + + #horizontal movement beyond line extent needs to wrap - throw back to caller + #we may have both overflow_right and unapplied data + #(can have overflow_right if we were in insert_mode and processed chars prior to this movement) + #leave row as is - caller will need to determine how many rows the column-movement has consumed + incr cursor_column $num ;#give our caller the necessary info as columns from start of row + #incr idx_over + #should be gci following last one applied + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction wrapmoveforward + break + } else { + set cursor_column $max + set idx [expr {$cursor_column -1}] + } + } + } else { + #review - dead branch + if {!$expand_right || ($cursor_column + $num) <= [llength $outcols+1]} { + incr idx $num + incr cursor_column $num + } else { + if {!$insert_mode} { + #block editing style with arrow keys + #overtype mode + set idxstart $idx + set idxend [llength $outcols] + set moveend [expr {$idxend - $idxstart}] + if {$moveend < 0} {set moveend 0} ;#sanity? + #puts "idxstart:$idxstart idxend:$idxend outcols[llength $outcols] undercols:[llength $undercols]" + incr idx $moveend + incr cursor_column $moveend + #if {[tcl::dict::exists $understacks $idx]} { + # set stackinfo [tcl::dict::get $understacks $idx] ;#use understack at end - which may or may not have already been replaced by stack from overtext + #} else { + # set stackinfo [list] + #} + if {$idx < [llength $understacks]} { + set stackinfo [lindex $understacks $idx] ;#use understack at end - which may or may not have already been replaced by stack from overtext + } else { + set stackinfo [list] + } + if {$idx < [llength $understacks_gx]} { + #set gxstackinfo [tcl::dict::get $understacks_gx $idx] + set gxstackinfo [lindex $understacks_gx $idx] + } else { + set gxstackinfo [list] + } + #pad outcols + set movemore [expr {$num - $moveend}] + #assert movemore always at least 1 or we wouldn't be in this branch + for {set m 1} {$m <= $movemore} {incr m} { + incr idx + incr cursor_column + priv::render_addchar $idx " " $stackinfo $gxstackinfo $insert_mode + } + } else { + #normal - insert + incr idx $num + incr cursor_column $num + if {$idx > [llength $outcols]} { + set idx [llength $outcols];#allow one beyond - for adding character at end of line + set cursor_column [expr {[llength $outcols]+1}] + } + } + } + } + } + D { + #Col move + #puts stdout "<-back" + #cursor back + #left-arrow/move-back when ltr mode + lassign [split $param {;}] num modifierkey + if {$modifierkey ne ""} { + puts stderr "modifierkey:$modifierkey" + } + if {$num eq ""} {set num 1} + + set version 2 + if {$version eq "2"} { + #todo - startcolumn offset! + if {$cursor_column - $num >= 1} { + incr idx -$num + incr cursor_column -$num + } else { + if {!$autowrap_mode} { + set cursor_column 1 + set idx 0 + } else { + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr cursor_column -$num + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction wrapmovebackward + break + } + } + } else { + incr idx -$num + incr cursor_column -$num + if {$idx < $opt_colstart-1} { + #wrap to previous line and position cursor at end of data + set idx [expr {$opt_colstart-1}] + set cursor_column $opt_colstart + } + } + } + E { + #CNL - Cursor Next Line + if {$param eq ""} { + set downmove 1 + } else { + set downmove [expr {$param}] + } + puts stderr "renderline CNL down-by-$downmove" + set cursor_column 1 + set cursor_row [expr {$cursor_row + $downmove}] + set idx [expr {$cursor_column -1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move + break + + } + F { + #CPL - Cursor Previous Line + if {$param eq ""} { + set upmove 1 + } else { + set upmove [expr {$param}] + } + puts stderr "renderline CPL up-by-$upmove" + set cursor_column 1 + set cursor_row [expr {$cursor_row -$upmove}] + if {$cursor_row < 1} { + set cursor_row 1 + } + set idx [expr {$cursor_column - 1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move + break + + } + G { + #CHA - Cursor Horizontal Absolute (move to absolute column no) + if {$param eq ""} { + set targetcol 1 + } else { + set targetcol $param + if {![string is integer -strict $targetcol]} { + puts stderr "renderline CHA (Cursor Horizontal Absolute) error. Unrecognised parameter '$param'" + } + set targetcol [expr {$param}] + set max [llength $outcols] + if {$overflow_idx == -1} { + incr max + } + if {$targetcol > $max} { + puts stderr "renderline CHA (Cursor Horizontal Absolute) error. Param '$param' > max: $max" + set targetcol $max + } + } + #adjust to colstart - as column 1 is within overlay + #??? REVIEW + set idx [expr {($targetcol -1) + $opt_colstart -1}] + + + set cursor_column $targetcol + #puts stderr "renderline absolute col move ESC G (TEST)" + } + H - f { + #CSI n;m H - CUP - Cursor Position + + #CSI n;m f - HVP - Horizontal Vertical Position REVIEW - same as CUP with differences (what?) in some terminal modes + # - 'counts as effector format function (like CR or LF) rather than an editor function (like CUD or CNL)' + # - REVIEW + #see Annex A at: https://www.ecma-international.org/wp-content/uploads/ECMA-48_5th_edition_june_1991.pdf + + #test e.g ansicat face_2.ans + #$re_both_move + lassign [split $param {;}] paramrow paramcol + #missing defaults to 1 + #CSI ;5H = CSI 1;5H -> row 1 col 5 + #CSI 17;H = CSI 17H = CSI 17;1H -> row 17 col 1 + + if {$paramcol eq ""} {set paramcol 1} + if {$paramrow eq ""} {set paramrow 1} + if {![string is integer -strict $paramcol] || ![string is integer -strict $paramrow]} { + puts stderr "renderline CUP (CSI H) unrecognised param $param" + #ignore? + } else { + set max [llength $outcols] + if {$overflow_idx == -1} { + incr max + } + if {$paramcol > $max} { + set target_column $max + } else { + set target_column [expr {$paramcol}] + } + + + if {$paramrow < 1} { + puts stderr "renderline CUP (CSI H) bad row target 0. Assuming 1" + set target_row 1 + } else { + set target_row [expr {$paramrow}] + } + if {$target_row == $cursor_row} { + #col move only - no need for break and move + #puts stderr "renderline CUP col move only to col $target_column param:$param" + set cursor_column $target_column + set idx [expr {$cursor_column -1}] + } else { + set cursor_row $target_row + set cursor_column $target_column + set idx [expr {$cursor_column -1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move + break + } + } + } + J { + set modegroup [tcl::string::index $codenorm 4] ;#e.g ? + switch -exact -- $modegroup { + ? { + #CSI ? Pn J - selective erase + puts stderr "overtype::renderline ED - SELECTIVE ERASE IN DISPLAY (UNIMPLEMENTED) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + default { + puts stderr "overtype::renderline ED - ERASE IN DISPLAY (TESTING) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + if {$param eq ""} {set param 0} + switch -exact -- $param { + 0 { + #clear from cursor to end of screen + } + 1 { + #clear from cursor to beginning of screen + } + 2 { + #clear entire screen + #ansi.sys - move cursor to upper left REVIEW + set cursor_row 1 + set cursor_column 1 + set idx [expr {$cursor_column -1}] + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + incr idx_over + if {[llength $outcols]} { + priv::render_erasechar 0 [llength $outcols] + } + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction clear_and_move + break + } + 3 { + #clear entire screen. presumably cursor doesn't move - otherwise it would be same as 2J ? + + } + default { + } + } + + } + } + } + K { + #see DECECM regarding background colour + set modegroup [tcl::string::index $codenorm 4] ;#e.g ? + switch -exact -- $modegroup { + ? { + puts stderr "overtype::renderline DECSEL - SELECTIVE ERASE IN LINE (UNIMPLEMENTED) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + set param [string range $param 1 end] ;#chop qmark + if {$param eq ""} {set param 0} + switch -exact -- $param { + 0 { + #clear from cursor to end of line - depending on DECSCA + } + 1 { + #clear from cursor to beginning of line - depending on DECSCA + + } + 2 { + #clear entire line - depending on DECSCA + } + default { + puts stderr "overtype::renderline DECSEL - SELECTIVE ERASE IN LINE PARAM '$param' unrecognised [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + default { + puts stderr "overtype::renderline EL - ERASE IN LINE (TESTING) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + if {$param eq ""} {set param 0} + switch -exact -- $param { + 0 { + #clear from cursor to end of line + } + 1 { + #clear from cursor to beginning of line + + } + 2 { + #clear entire line + } + default { + puts stderr "overtype::renderline EL - ERASE IN LINE PARAM '$param' unrecognised [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + } + } + } + L { + puts stderr "overtype::renderline IL - Insert Line - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + M { + #CSI Pn M - DL - Delete Line + puts stderr "overtype::renderline DL - Delete Line - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + + } + T { + #CSI Pn T - SD Pan Up (empty lines introduced at top) + #CSI Pn+T - kitty extension (lines at top come from scrollback buffer) + #Pn new lines appear at top of the display, Pn old lines disappear at the bottom of the display + if {$param eq "" || $param eq "0"} {set param 1} + if {[string index $param end] eq "+"} { + puts stderr "overtype::renderline CSI Pn + T - kitty Pan Up - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } else { + puts stderr "overtype::renderline CSI Pn T - SD Pan Up - not implemented [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + X { + puts stderr "overtype::renderline X ECH ERASE CHARACTER - $param" + #ECH - erase character + if {$param eq "" || $param eq "0"} {set param 1}; #param=count of chars to erase + priv::render_erasechar $idx $param + #cursor position doesn't change. + } + q { + set code_secondlast [tcl::string::index $codenorm end-1] + switch -exact -- $code_secondlast { + {"} { + #DECSCA - Select Character Protection Attribute + #(for use with selective erase: DECSED and DECSEL) + set param [tcl::string::range $codenorm 4 end-2] + if {$param eq ""} {set param 0} + #TODO - store like SGR in stacks - replays? + switch -exact -- $param { + 0 - 2 { + #canerase + puts stderr "overtype::renderline - DECSCA canerase not implemented - [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + 1 { + #cannoterase + puts stderr "overtype::renderline - DECSCA cannoterase not implemented - [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + default { + puts stderr "overtype::renderline DECSCA param '$param' not understood [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + default { + puts stderr "overtype::renderline - CSI ... q not implemented - [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + r { + #$re_decstbm + #https://www.vt100.net/docs/vt510-rm/DECSTBM.html + #This control function sets the top and bottom margins for the current page. You cannot perform scrolling outside the margins + lassign [split $param {;}] margin_top margin_bottom + + #todo - return these for the caller to process.. + puts stderr "overtype::renderline DECSTBM set top and bottom margin not implemented" + #Also moves the cursor to col 1 line 1 of the page + set cursor_column 1 + set cursor_row 1 + + incr idx_over + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction move ;#own instruction? decstbm? + break + } + s { + #code conflict between ansi emulation and DECSLRM - REVIEW + #ANSISYSSC (when no parameters) - like other terminals - essentially treat same as DECSC + # todo - when parameters - support DECSLRM instead + + if {$param ne ""} { + #DECSLRM - should only be recognised if DECLRMM is set (vertical split screen mode) + lassign [split $param {;} margin_left margin_right + puts stderr "overtype DECSLRM not yet supported - got [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + if {$margin_left eq ""} { + set margin_left 1 + } + set columns_per_page 80 ;#todo - set to 'page width (DECSCPP set columns per page)' - could be 132 or?? + if {$margin_right eq ""} { + set margin_right $columns_per_page + } + puts stderr "DECSLRM margin left: $margin_left margin right: $margin_right" + if {![string is integer -strict $margin_left] || $margin_left < 0} { + puts stderr "DECSLRM invalid margin_left" + } + if {![string is integer -strict $margin_right] || $margin_right < 0} { + puts stderr "DECSLRM invalid margin_right" + } + set scrolling_region_size [expr {$margin_right - $margin_left}] + if {$scrolling_region_size < 2 || $scrolling_region_size > $columns_per_page} { + puts stderr "DECSLRM region size '$scrolling_regsion_size' must be between 1 and $columns_per_page" + } + #todo + + + } else { + #DECSC + #//notes on expected behaviour: + #DECSC - saves following items in terminal's memory + #cursor position + #character attributes set by the SGR command + #character sets (G0,G1,G2 or G3) currently in GL and GR + #Wrap flag (autowrap or no autowrap) + #State of origin mode (DECOM) + #selective erase attribute + #any single shift 2 (SS2) or single shift 3(SSD) functions sent + + #$re_cursor_save + #cursor save could come after last column + if {$overflow_idx != -1 && $idx == $overflow_idx} { + #bartman2.ans test file - fixes misalignment at bottom of dialog bubble + #incr cursor_row + #set cursor_column 1 + #bwings1.ans test file - breaks if we actually incr cursor (has repeated saves) + set cursor_saved_position [list row [expr {$cursor_row+1}] column 1] + } else { + set cursor_saved_position [list row $cursor_row column $cursor_column] + } + #there may be overlay stackable codes emitted that aren't in the understacks because they come between the last emmited character and the cursor_save control. + #we need the SGR and gx overlay codes prior to the cursor_save + + #a real terminal would not be able to know the state of the underlay.. so we should probably ignore it. + #set sgr_stack [lindex $understacks $idx] + #set gx_stack [lindex $understacks_gx $idx] ;#not actually a stack - just a boolean state (for now?) + + set sgr_stack [list] + set gx_stack [list] + + #we shouldn't need to scan for intermediate cursor save/restores - as restores would throw-back to the calling loop - so our overlay 'line' is since those. + #The overlay_grapheme_control_list had leading resets from previous lines - so we go back to the beginning not just the first grapheme. + + foreach gc [lrange $overlay_grapheme_control_list 0 $gci-1] { + lassign $gc type code + #types g other sgr gx0 + switch -- $type { + gx0 { + #code is actually a stand-in for the graphics on/off code - not the raw code + #It is either gx0_on or gx0_off + set gx_stack [list $code] + } + sgr { + #code is the raw code + if {[punk::ansi::codetype::is_sgr_reset $code]} { + #jmn + set sgr_stack [list "\x1b\[m"] + } elseif {[punk::ansi::codetype::has_sgr_leadingreset $code]} { + set sgr_stack [list $code] + lappend overlay_grapheme_control_list [list sgr $code] + } elseif {[priv::is_sgr $code]} { + #often we don't get resets - and codes just pile up. + #as a first step to simplifying - at least remove earlier straight up dupes + set dup_posns [lsearch -all -exact $sgr_stack $code] ;#needs -exact - codes have square-brackets (glob chars) + set sgr_stack [lremove $sgr_stack {*}$dup_posns] + lappend sgr_stack $code + } + } + } + } + set cursor_saved_attributes "" + switch -- [lindex $gx_stack 0] { + gx0_on { + append cursor_saved_attributes "\x1b(0" + } + gx0_off { + append cursor_saved_attributes "\x1b(B" + } + } + #append cursor_saved_attributes [join $sgr_stack ""] + append cursor_saved_attributes [punk::ansi::codetype::sgr_merge_list {*}$sgr_stack] + + #as there is apparently only one cursor storage element we don't need to throw back to the calling loop for a save. + + #don't incr index - or the save will cause cursor to move to the right + #carry on + } + } + u { + #ANSISYSRC save cursor (when no parameters) (DECSC) + + #$re_cursor_restore + #we are going to jump somewhere.. for now we will assume another line, and process accordingly. + #The caller has the cursor_saved_position/cursor_saved_attributes if any (?review - if we always pass it back it, we could save some calls for moves in same line) + #don't set overflow at this point. The existing underlay to the right must be preserved. + #we only want to jump and render the unapplied at the new location. + + #lset overstacks $idx_over [list] + #set replay_codes_overlay "" + + #if {$cursor_saved_attributes ne ""} { + # set replay_codes_overlay $cursor_saved_attributes ;#empty - or last save if it happend in this input chunk + #} else { + #jj + #set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + set replay_codes_overlay "" + #} + + #like priv::render_unapplied - but without the overlay's ansi reset or gx stacks from before the restore code + incr idx_over + + set unapplied "" + set unapplied_list [list] + foreach gc [lrange $overlay_grapheme_control_list $gci+1 end] { + lassign $gc type item + if {$type eq "gx0"} { + if {$item eq "gx0_on"} { + lappend unapplied_list "\x1b(0" + } elseif {$item eq "gx0_off"} { + lappend unapplied_list "\x1b(B" + } + } else { + lappend unapplied_list $item + } + #incr idx_over + } + set unapplied [join $unapplied_list ""] + #if the save occured within this line - that's ok - it's in the return value list and caller can prepend for the next loop. + set instruction restore_cursor + break + } + "{" { + + puts stderr "renderline warning - CSI.. - unimplemented [ansistring VIEW -lf 1 -nul 1 $code]" + } + "}" { + set code_secondlast [tcl::string::index $codenorm end-1] + switch -exact -- $code_secondlast { + ' { + puts stderr "renderline warning - DECIC - Insert Column - CSI...' - unimplemented [ansistring VIEW -lf 1 -nul 1 $code]" + } + default { + puts stderr "renderline warning - CSI.. - unimplemented [ansistring VIEW -lf 1 -nul 1 $code]" + } + } + } + ~ { + set code_secondlast [tcl::string::index $codenorm end-1] ;#used for e.g CSI x '~ + switch -exact -- $code_secondlast { + ' { + #DECDC - editing sequence - Delete Column + puts stderr "renderline warning - DECDC - unimplemented" + } + default { + #$re_vt_sequence + lassign [split $param {;}] key mod + + #Note that f1 to f4 show as ESCOP|Q|R|S (VT220?) but f5+ show as ESC\[15~ + # + #e.g esc \[2~ insert esc \[2;2~ shift-insert + #mod - subtract 1, and then use bitmask + #shift = 1, (left)Alt = 2, control=4, meta=8 (meta seems to do nothing on many terminals on windows? Intercepted by windows?) + #puts stderr "vt key:$key mod:$mod code:[ansistring VIEW $code]" + if {$key eq "1"} { + #home + } elseif {$key eq "2"} { + #Insert + if {$mod eq ""} { + #no modifier key + set insert_mode [expr {!$insert_mode}] + #rather than set the cursor - we return the insert mode state so the caller can decide + } + } elseif {$key eq "3"} { + #Delete - presumably this shifts other chars in the line, with empty cells coming in from the end + switch -- $mod { + "" { + priv::render_delchar $idx + } + "5" { + #ctrl-del - delete to end of word (pwsh) - possibly word on next line if current line empty(?) + } + } + } elseif {$key eq "4"} { + #End + } elseif {$key eq "5"} { + #pgup + } elseif {$key eq "6"} { + #pgDn + } elseif {$key eq "7"} { + #Home + #?? + set idx [expr {$opt_colstart -1}] + set cursor_column 1 + } elseif {$key eq "8"} { + #End + } elseif {$key eq "11"} { + #F1 - or ESCOP or e.g shift F1 ESC\[1;2P + } elseif {$key eq "12"} { + #F2 - or ESCOQ + } elseif {$key eq "13"} { + #F3 - or ESCOR + } elseif {$key eq "14"} { + #F4 - or ESCOS + } elseif {$key eq "15"} { + #F5 or shift F5 ESC\[15;2~ + } elseif {$key eq "17"} { + #F6 + } elseif {$key eq "18"} { + #F7 + } elseif {$key eq "19"} { + #F8 + } elseif {$key eq "20"} { + #F9 + } elseif {$key eq "21"} { + #F10 + } elseif {$key eq "23"} { + #F11 + } elseif {$key eq "24"} { + #F12 + } + + } + } + + } + h - l { + #set mode unset mode + #we are matching only last char to get to this arm - but are there other sequences ending in h|l we need to handle? + + #$re_mode if first after CSI is "?" + #some docs mention ESC=h|l - not seen on windows terminals.. review + #e.g https://www2.math.upenn.edu/~kazdan/210/computer/ansi.html + set modegroup [tcl::string::index $codenorm 4] ;#e.g ? = + switch -exact -- $modegroup { + ? { + set smparams [tcl::string::range $codenorm 5 end-1] ;#params between ? and h|l + #one or more modes can be set + set smparam_list [split $smparams {;}] + foreach num $smparam_list { + switch -- $num { + "" { + #ignore empties e.g extra/trailing semicolon in params + } + 5 { + #DECSNM - reverse video + #How we simulate this to render within a block of text is an open question. + #track all SGR stacks and constantly flip based on the current SGR reverse state? + #It is the job of the calling loop to do this - so at this stage we'll just set the states + + if {$code_end eq "h"} { + #set (enable) + set reverse_mode 1 + } else { + #reset (disable) + set reverse_mode 0 + } + + } + 7 { + #DECAWM autowrap + if {$code_end eq "h"} { + #set (enable) + set autowrap_mode 1 + if {$opt_width ne "\uFFEF"} { + set overflow_idx $opt_width + } else { + #review - this is also the cursor position when adding a char at end of line? + set overflow_idx [expr {[llength $undercols]}] ;#index at which we would be *in* overflow a row move may still override it + } + #review - can idx ever be beyond overflow_idx limit when we change e.g with a width setting and cursor movements? + # presume not usually - but sanity check with warning for now. + if {$idx >= $overflow_idx} { + puts stderr "renderline warning - idx '$idx' >= overflow_idx '$overflow_idx' - unexpected" + } + } else { + #reset (disable) + set autowrap_mode 0 + #REVIEW! + set overflow_idx -1 + } + } + 25 { + if {$code_end eq "h"} { + #visible cursor + + } else { + #invisible cursor + + } + } + 117 { + #DECECM - Erase Color Mode + #https://invisible-island.net/ncurses/ncurses.faq.html + #The Erase color selection controls the background color used when text is erased or new + #text is scrolled on to the screen. Screen background causes newly erased areas or + #scrolled text to be written using color index zero, the screen background. This is VT + #and DECterm compatible. Text background causes erased areas or scrolled text to be + #written using the current text background color. This is PC console compatible and is + #the factory default. + + #see also: https://unix.stackexchange.com/questions/251726/clear-to-end-of-line-uses-the-wrong-background-color-in-screen + } + } + } + } + = { + set num [tcl::string::range $codenorm 5 end-1] ;#param between = and h|l + puts stderr "overtype::renderline CSI=...h|l code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + default { + #e.g CSI 4 h + set num [tcl::string::range $codenorm 4 end-1] ;#param before h|l + switch -exact -- $num { + 3 { + puts stderr "CRM MODE $code_end" + #CRM - Show control character mode + # 'No control functions are executed except LF,FF and VT which are represented in the CRM FONT before a CRLF(new line) is executed' + # + #use ansistring VIEW -nul 1 -lf 2 -ff 2 -vt 2 + #https://vt100.net/docs/vt510-rm/CRM.html + #NOTE - vt100 CRM always does auto-wrap at right margin. + #disabling auto-wrap in set-up or by sequence is disabled. + #We should default to turning off auto-wrap when crm_mode enabled.. but + #displaying truncated (on rhs) crm can still be very useful - and we have optimisation in overflow to avoid excess renderline calls (per grapheme) + #we therefore could reasonably put in an exception to allow auto_wrap to be disabled after crm_mode is engaged, + #although this would be potentially an annoying difference to some.. REVIEW + if {$code_end eq "h"} { + set crm_mode 1 + set autowrap_mode 1 + if {$opt_width ne "\uFFEF"} { + set overflow_idx $opt_width + } else { + #review - this is also the cursor position when adding a char at end of line? + set overflow_idx [expr {[llength $undercols]}] ;#index at which we would be *in* overflow a row move may still override it + } + } else { + set crm_mode 0 + } + } + 4 { + #IRM - Insert/Replace Mode + if {$code_end eq "h"} { + #CSI 4 h + set insert_mode 1 + } else { + #CSI 4 l + #replace mode + set insert_mode 0 + } + } + default { + puts stderr "overtype::renderline CSI...h|l code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + } + } + } + } + | { + switch -- [tcl::string::index $codenorm end-1] { + {$} { + #CSI ... $ | DECSCPP set columns per page- (recommended in vt510 docs as preferable to DECCOLM) + #real terminals generally only supported 80/132 + #some other virtuals support any where from 2 to 65,536? + #we will allow arbitrary widths >= 2 .. to some as yet undetermined limit. + #CSI $ | + #empty or 0 param is 80 for compatibility - other numbers > 2 accepted + set page_width -1 ;#flag as unset + if {$param eq ""} { + set page_width 80 + } elseif {[string is integer -strict $param] && $param >=2 0} { + set page_width [expr {$param}] ;#we should allow leading zeros in the number - but lets normalize using expr + } else { + puts stderr "overtype::renderline unacceptable DECSPP value '$param'" + } + + if {$page_width > 2} { + puts stderr "overtype::renderline DECSCPP - not implemented - but selected width '$page_width' looks ok" + #if cursor already beyond new page_width - will move to right colum - otherwise no cursor movement + + } + + } + default { + puts stderr "overtype::renderline unrecognised CSI code ending in pipe (|) [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + } + } + default { + puts stderr "overtype::renderline CSI code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented" + } + } + } + 7ESC { + # + #re_other_single {\x1b(D|M|E)$} + #also vt52 Y.. + #also PM \x1b^...(ST) + switch -- [tcl::string::index $codenorm 4] { + c { + #RIS - reset terminal to initial state - where 'terminal' in this case is the renderspace - not the underlying terminal! + puts stderr "renderline reset" + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction reset + break + } + D { + #\x84 + #index (IND) + #vt102-docs: "Moves cursor down one line in same column. If cursor is at bottom margin, screen performs a scroll-up" + puts stderr "renderline ESC D not fully implemented" + incr cursor_row + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction down + #retain cursor_column + break + } + E { + #\x85 + #review - is behaviour different to lf? + #todo - possibly(?) same logic as handling above. i.e return instruction depends on where column_cursor is at the time we get NEL + #leave implementation until logic for is set in stone... still under review + #It's arguable NEL is a pure cursor movement as opposed to the semantic meaning of crlf or lf in a file. + # + #Next Line (NEL) "Move the cursor to the left margin on the next line. If the cursor is at the bottom margin, scroll the page up" + puts stderr "overtype::renderline ESC E unimplemented" + + } + H { + #\x88 + #Tab Set + puts stderr "overtype::renderline ESC H tab set unimplemented" + } + M { + #\x8D + #Reverse Index (RI) + #vt102-docs: "Moves cursor up one line in same column. If cursor is at top margin, screen performs a scroll-down" + puts stderr "overtype::renderline ESC M not fully implemented" + + set replay_codes_overlay [punk::ansi::codetype::sgr_merge_list {*}[lindex $overlay_grapheme_control_stacks $gci]] + #move up + incr cursor_row -1 + if {$cursor_row < 1} { + set cursor_row 1 + } + #ensure rest of *overlay* is emitted to remainder + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction up ;#need instruction for scroll-down? + #retain cursor_column + break + } + N { + #\x8e - affects next character only + puts stderr "overtype::renderline single shift select G2 command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + O { + #\x8f - affects next character only + puts stderr "overtype::renderline single shift select G3 command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + P { + #\x90 + #DCS - shouldn't get here - handled in 7DCS branch + #similarly \] OSC (\x9d) and \\ (\x9c) ST + } + V { + #\x96 + + } + W { + #\x97 + } + X { + #\x98 + #SOS + if {[string index $code end] eq "\007"} { + set sos_content [string range $code 2 end-1] ;#ST is \007 + } else { + set sos_content [string range $code 2 end-2] ;#ST is \x1b\\ + } + #return in some useful form to the caller + #TODO! + lappend sos_list [list string $sos_content row $cursor_row column $cursor_column] + puts stderr "overtype::renderline ESCX SOS UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + ^ { + #puts stderr "renderline PM" + #Privacy Message. + if {[string index $code end] eq "\007"} { + set pm_content [string range $code 2 end-1] ;#ST is \007 + } else { + set pm_content [string range $code 2 end-2] ;#ST is \x1b\\ + } + #We don't want to render it - but we need to make it available to the application + #see the textblock library in punk, for the exception we make here for single backspace. + #It is unlikely to be encountered as a useful PM - so we hack to pass it through as a fix + #for spacing issues on old terminals which miscalculate the single-width 'Symbols for Legacy Computing' + if {$pm_content eq "\b"} { + #puts stderr "renderline PM sole backspace special handling for \U1FB00 - \U1FBFF" + #esc^\b\007 or esc^\besc\\ + #HACKY pass-through - targeting terminals that both mis-space legacy symbols *and* don't support PMs + #The result is repair of the extra space. If the terminal is a modern one and does support PM - the \b should be hidden anyway. + #If the terminal has the space problem AND does support PMs - then this just won't fix it. + #The fix relies on the symbol-supplier to cooperate by appending esc^\b\esc\\ to the problematic symbols. + + #priv::render_addchar $idx $code [lindex $overstacks $idx_over] [lindex $overstacks_gx $idx_over] $insert_mode + #idx has been incremented after last grapheme added + priv::render_append_to_char [expr {$idx -1}] $code + } + #lappend to a dict element in the result for application-specific processing + lappend pm_list $pm_content + } + _ { + #APC Application Program Command + #just warn for now.. + puts stderr "overtype::renderline ESC_ APC command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + default { + puts stderr "overtype::renderline ESC code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code] not implemented codenorm:[ansistring VIEW -lf 1 -vt 1 -nul 1 $codenorm]" + } + } + + } + 7DCS - 8DCS { + puts stderr "overtype::renderline DCS - DEVICE CONTROL STRING command UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + #ST (string terminator) \x9c or \x1b\\ + if {[tcl::string::index $codenorm end] eq "\x9c"} { + set code_content [tcl::string::range $codenorm 4 end-1] ;#ST is 8-bit 0x9c + } else { + set code_content [tcl::string::range $codenorm 4 end-2] ;#ST is \x1b\\ + } + + } + 7OSC - 8OSC { + # OSCs are terminated with ST of either \007 or \x1b\\ - we allow either whether code was 7 or 8 bit + if {[tcl::string::index $codenorm end] eq "\007"} { + set code_content [tcl::string::range $codenorm 4 end-1] ;#ST is \007 + } else { + set code_content [tcl::string::range $codenorm 4 end-2] ;#ST is \x1b\\ + } + set first_colon [tcl::string::first {;} $code_content] + if {$first_colon == -1} { + #there probably should always be a colon - but we'll try to make sense of it without + set osc_code $code_content ;#e.g \x1b\]104\007 vs \x1b\]104\;\007 + } else { + set osc_code [tcl::string::range $code_content 0 $first_colon-1] + } + switch -exact -- $osc_code { + 2 { + set newtitle [tcl::string::range $code_content 2 end] + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction [list set_window_title $newtitle] + break + } + 4 { + #OSC 4 - set colour palette + #can take multiple params + #e.g \x1b\]4\;1\;red\;2\;green\x1b\\ + set params [tcl::string::range $code_content 2 end] ;#strip 4 and first semicolon + set cmap [dict create] + foreach {cnum spec} [split $params {;}] { + if {$cnum >= 0 and $cnum <= 255} { + #todo - parse spec from names like 'red' to RGB + #todo - accept rgb:ab/cd/ef as well as rgb:/a/b/c (as alias for aa/bb/cc) + #also - what about rgb:abcd/defg/hijk and 12-bit abc/def/ghi ? + dict set cmap $cnum $spec + } else { + #todo - log + puts stderr "overtype::renderline OSC 4 set colour palette - bad color number: $cnum must be from 0 to 255. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + puts stderr "overtype::renderline OSC 4 set colour palette unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + + + } + 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 { + #OSC 10 through 17 - so called 'dynamic colours' + #can take multiple params - each successive parameter changes the next colour in the list + #- e.g if code started at 11 - next param is for 12. 17 takes only one param because there are no more + #10 change text foreground colour + #11 change text background colour + #12 change text cursor colour + #13 change mouse foreground colour + #14 change mouse background colour + #15 change tektronix foreground colour + #16 change tektronix background colour + #17 change highlight colour + set params [tcl::string::range $code_content 2 end] + + puts stderr "overtype::renderline OSC $osc_code set dynamic colours unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + + + } + 18 { + #why is this not considered one of the dynamic colours above? + #https://www.xfree86.org/current/ctlseqs.html + #tektronix cursor color + puts stderr "overtype::renderline OSC 18 - set tektronix cursor color unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + 99 { + #kitty desktop notifications + #https://sw.kovidgoyal.net/kitty/desktop-notifications/ + # 99 ; metadata ; payload + puts stderr "overtype::renderline OSC 99 kitty desktop notification unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + 104 { + #reset colour palette + #we want to do it for the current rendering context only (vvt) - not just pass through to underlying vt + puts stderr "overtype::renderline OSC 104 reset colour palette unimplemented. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + priv::render_unapplied $overlay_grapheme_control_list $gci + set instruction [list reset_colour_palette] + break + } + 1337 { + #iterm2 graphics and file transfer + puts stderr "overtype::renderline OSC 1337 iterm2 graphics/file_transfer unimplemented. 1st 100 chars of code [ansistring VIEW -lf 1 -vt 1 -nul 1 [string range $code 0 99]]" + } + 5113 { + puts stderr "overtype::renderline OSC 5113 kitty file transfer unimplemented. 1st 100 chars of code [ansistring VIEW -lf 1 -vt 1 -nul 1 [string range $code 0 99]]" + } + default { + puts stderr "overtype::renderline OSC - UNIMPLEMENTED. code [ansistring VIEW -lf 1 -vt 1 -nul 1 $code]" + } + } + + } + default { + } + } + + + } + default { + #don't need to handle sgr or gx0 types + #we have our sgr gx0 codes already in stacks for each overlay grapheme + } + } + } + + #-------- + if {$opt_expand_right == 0} { + #need to truncate to the width of the original undertext + #review - string_width vs printing_length here. undertext requirement to be already rendered therefore punk::char::string_width ok? + #set num_under_columns [punk::char::string_width $pt_underchars] ;#plaintext underchars + } + if {$overflow_idx == -1} { + #overflow was initially unlimited and hasn't been overridden + } else { + + } + #-------- + + + #coalesce and replay codestacks for outcols grapheme list + set outstring "" ;#output prior to overflow + set overflow_right "" ;#remainder after overflow point reached + set i 0 + set cstack [list] + set prevstack [list] + set prev_g0 [list] + #note overflow_idx may already have been set lower if we had a row move above due to \v or ANSI moves + set in_overflow 0 ;#used to stop char-width scanning once in overflow + if {$overflow_idx == 0} { + #how does caller avoid an infinite loop if they have autowrap on and keep throwing graphemes to the next line? REVIEW + set in_overflow 1 + } + set trailing_nulls 0 + foreach ch [lreverse $outcols] { + if {$ch eq "\u0000"} { + incr trailing_nulls + } else { + break + } + } + if {$trailing_nulls} { + set first_tail_null_posn [expr {[llength $outcols] - $trailing_nulls}] + } else { + set first_tail_null_posn -1 + } + + #puts stderr "first_tail_null_posn: $first_tail_null_posn" + #puts stderr "colview: [ansistring VIEW $outcols]" + + foreach ch $outcols { + #puts "---- [ansistring VIEW $ch]" + + set gxleader "" + if {$i < [llength $understacks_gx]} { + #set g0 [tcl::dict::get $understacks_gx $i] + set g0 [lindex $understacks_gx $i] + if {$g0 ne $prev_g0} { + if {$g0 eq [list "gx0_on"]} { + set gxleader "\x1b(0" + } else { + set gxleader "\x1b(B" + } + } + set prev_g0 $g0 + } else { + set prev_g0 [list] + } + + set sgrleader "" + if {$i < [llength $understacks]} { + #set cstack [tcl::dict::get $understacks $i] + set cstack [lindex $understacks $i] + if {$cstack ne $prevstack} { + if {[llength $prevstack] && ![llength $cstack]} { + #This reset is important e.g testfile fruit.ans - we get overhang on rhs without it. But why is cstack empty? + append sgrleader \033\[m + } else { + append sgrleader [punk::ansi::codetype::sgr_merge_list {*}$cstack] + } + } + set prevstack $cstack + } else { + set prevstack [list] + } + + + + if {$in_overflow} { + if {$i == $overflow_idx} { + set 0 [lindex $understacks_gx $i] + set gxleader "" + if {$g0 eq [list "gx0_on"]} { + set gxleader "\x1b(0" + } elseif {$g0 eq [list "gx0_off"]} { + set gxleader "\x1b(B" + } + append overflow_right $gxleader + set cstack [lindex $understacks $i] + set sgrleader "" + #whether cstack is same or differs from previous char's stack - we must have an output at the start of the overflow_right + #if {[llength $prevstack] && ![llength $cstack]} { + # append sgrleader \033\[m + #} + append sgrleader [punk::ansi::codetype::sgr_merge_list {*}$cstack] + append overflow_right $sgrleader + append overflow_right $ch + } else { + append overflow_right $gxleader + append overflow_right $sgrleader + append overflow_right $ch + } + } else { + if {$overflow_idx != -1 && $i+1 == $overflow_idx} { + #one before overflow + #will be in overflow in next iteration + set in_overflow 1 + if {[grapheme_width_cached $ch]> 1} { + #we overflowed with second-half of a double-width char - replace first-half with user-supplied exposition char (should be 1 wide) + set ch $opt_exposed1 + } + } + append outstring $gxleader + append outstring $sgrleader + if {$ch eq "\u0000"} { + if {$cp437_glyphs} { + #map all nulls including at tail to space + append outstring " " + } else { + if {$trailing_nulls && $i < $first_tail_null_posn} { + append outstring " " ;#map inner nulls to space + } else { + append outstring \u0000 + } + } + } else { + append outstring $ch + } + } + incr i + } + #flower.ans good test for null handling - reverse line building + #review - presence of overflow_right doesn't indicate line's trailing nulls should remain. + #The cells could have been erased? + #if {!$cp437_glyphs} { + # #if {![ansistring length $overflow_right]} { + # # set outstring [tcl::string::trimright $outstring "\u0000"] + # #} + # set outstring [tcl::string::trimright $outstring "\u0000"] + # set outstring [tcl::string::map {\u0000 " "} $outstring] + #} + + + #REVIEW + #set overflow_right [tcl::string::trimright $overflow_right "\u0000"] + #set overflow_right [tcl::string::map {\u0000 " "} $overflow_right] + + set replay_codes "" + if {[llength $understacks] > 0} { + if {$overflow_idx == -1} { + #set tail_idx [tcl::dict::size $understacks] + set tail_idx [llength $understacks] + } else { + set tail_idx [llength $undercols] + } + if {$tail_idx-1 < [llength $understacks]} { + #set replay_codes [join [lindex $understacks $tail_idx-1] ""] ;#tail replay codes + set replay_codes [punk::ansi::codetype::sgr_merge_list {*}[lindex $understacks $tail_idx-1]] ;#tail replay codes + } + if {$tail_idx-1 < [llength $understacks_gx]} { + set gx0 [lindex $understacks_gx $tail_idx-1] + if {$gx0 eq [list "gx0_on"]} { + #if it was on, turn gx0 off at the point we stop processing overlay + append outstring "\x1b(B" + } + } + } + if {[string length $overflow_right]} { + #puts stderr "remainder:$overflow_right" + } + #pdict $understacks + + if {[punk::ansi::ta::detect_sgr $outstring]} { + append outstring [punk::ansi::a] ;#without this - we would get for example, trailing backgrounds after rightmost column + + #close off any open gx? + #probably should - and overflow_right reopen? + } + + if {$opt_returnextra} { + #replay_codes is the codestack at the boundary - used for ellipsis colouring to match elided text - review + #replay_codes_underlay is the set of codes in effect at the very end of the original underlay + + #review + #replay_codes_overlay is the set of codes in effect at the very end of the original overlay (even if not all overlay was applied) + #todo - replay_codes for gx0 mode + + #overflow_idx may change during ansi & character processing + if {$overflow_idx == -1} { + set overflow_right_column "" + } else { + set overflow_right_column [expr {$overflow_idx+1}] + } + set result [tcl::dict::create\ + result $outstring\ + visualwidth [punk::ansi::printing_length $outstring]\ + instruction $instruction\ + stringlen [string length $outstring]\ + overflow_right_column $overflow_right_column\ + overflow_right $overflow_right\ + unapplied $unapplied\ + unapplied_list $unapplied_list\ + insert_mode $insert_mode\ + autowrap_mode $autowrap_mode\ + crm_mode $crm_mode\ + reverse_mode $reverse_mode\ + insert_lines_above $insert_lines_above\ + insert_lines_below $insert_lines_below\ + cursor_saved_position $cursor_saved_position\ + cursor_saved_attributes $cursor_saved_attributes\ + cursor_column $cursor_column\ + cursor_row $cursor_row\ + expand_right $opt_expand_right\ + replay_codes $replay_codes\ + replay_codes_underlay $replay_codes_underlay\ + replay_codes_overlay $replay_codes_overlay\ + pm_list $pm_list\ + ] + if {$opt_returnextra == 1} { + #puts stderr "renderline: $result" + return $result + } else { + #human/debug - map special chars to visual glyphs + set viewop VIEW + switch -- $opt_returnextra { + 2 { + #codes and character data + set viewop VIEWCODES ;#ansi colorisation of codes - green for SGR, blue/blue reverse for cursor_save/cursor_restore, cyan for movements, orange for others + } + 3 { + set viewop VIEWSTYLE ;#ansi colorise the characters within the output with preceding codes, stacking codes only within each dict value - may not be same SGR effect as the effect in-situ. + } + } + tcl::dict::set result result [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result result]] + tcl::dict::set result overflow_right [ansistring VIEW -lf 1 -vt 1 [tcl::dict::get $result overflow_right]] + tcl::dict::set result unapplied [ansistring VIEW -lf 1 -vt 1 [tcl::dict::get $result unapplied]] + tcl::dict::set result unapplied_list [ansistring VIEW -lf 1 -vt 1 [tcl::dict::get $result unapplied_list]] + tcl::dict::set result replay_codes [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result replay_codes]] + tcl::dict::set result replay_codes_underlay [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result replay_codes_underlay]] + tcl::dict::set result replay_codes_overlay [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result replay_codes_overlay]] + tcl::dict::set result cursor_saved_attributes [ansistring $viewop -lf 1 -vt 1 [tcl::dict::get $result cursor_saved_attributes]] + return $result + } + } else { + #puts stderr "renderline returning: result $outstring instruction $instruction unapplied $unapplied overflow_right $overflow_right" + return $outstring + } + #return [join $out ""] + } + + #*** !doctools + #[list_end] [comment {--- end definitions namespace overtype ---}] +} + +tcl::namespace::eval overtype::piper { + proc overcentre {args} { + if {[llength $args] < 2} { + error {usage: ?-bias left|right? ?-transparent [0|1|]? ?-exposed1 ? ?-exposed2 ? ?-overflow [1|0]? overtext pipelinedata} + } + lassign [lrange $args end-1 end] over under + set argsflags [lrange $args 0 end-2] + tailcall overtype::centre {*}$argsflags $under $over + } + proc overleft {args} { + if {[llength $args] < 2} { + error {usage: ?-startcolumn ? ?-transparent [0|1|]? ?-exposed1 ? ?-exposed2 ? ?-overflow [1|0]? overtext pipelinedata} + } + lassign [lrange $args end-1 end] over under + set argsflags [lrange $args 0 end-2] + tailcall overtype::left {*}$argsflags $under $over + } +} + + +# -- --- --- --- --- --- --- --- --- --- --- +proc overtype::transparentline {args} { + foreach {under over} [lrange $args end-1 end] break + set argsflags [lrange $args 0 end-2] + set defaults [tcl::dict::create\ + -transparent 1\ + -exposed 1 " "\ + -exposed 2 " "\ + ] + set newargs [tcl::dict::merge $defaults $argsflags] + tailcall overtype::renderline {*}$newargs $under $over +} +#renderline may not make sense as it is in the long run for blocks of text - but is handy in the single-line-handling form anyway. +# We are trying to handle ansi codes in a block of text which is acting like a mini-terminal in some sense. +#We can process standard cursor moves such as \b \r - but no way to respond to other cursor movements e.g moving to other lines. +# +tcl::namespace::eval overtype::piper { + proc renderline {args} { + if {[llength $args] < 2} { + error {usage: ?-start ? ?-transparent [0|1|]? ?-overflow [1|0]? overtext pipelinedata} + } + foreach {over under} [lrange $args end-1 end] break + set argsflags [lrange $args 0 end-2] + tailcall overtype::renderline {*}$argsflags $under $over + } +} +interp alias "" piper_renderline "" overtype::piper::renderline + +#intended primarily for single grapheme - but will work for multiple +#WARNING: query CAN contain ansi or newlines - but if cache was not already set manually,the answer will be incorrect! +#We deliberately allow this for PM/SOS attached within a column +#(a cache of ansifreestring_width calls - as these are quite regex heavy) +proc overtype::grapheme_width_cached {ch} { + variable grapheme_widths + if {[tcl::dict::exists $grapheme_widths $ch]} { + return [tcl::dict::get $grapheme_widths $ch] + } + set width [punk::char::ansifreestring_width $ch] + tcl::dict::set grapheme_widths $ch $width + return $width +} + + + +proc overtype::test_renderline {} { + set t \uFF5E ;#2-wide tilde + set u \uFF3F ;#2-wide underscore + set missing \uFFFD + return [list $t $u A${t}B] +} + +#maintenance warning +#same as textblock::size - but we don't want that circular dependency +#block width and height can be tricky. e.g \v handled differently on different terminal emulators and can affect both +proc overtype::blocksize {textblock} { + if {$textblock eq ""} { + return [tcl::dict::create width 0 height 1] ;#no such thing as zero-height block - for consistency with non-empty strings having no line-endings + } + if {[tcl::string::first \t $textblock] >= 0} { + if {[info exists punk::console::tabwidth]} { + set tw $::punk::console::tabwidth + } else { + set tw 8 + } + set textblock [textutil::tabify::untabify2 $textblock $tw] + } + #ansistrip on entire block in one go rather than line by line - result should be the same - review - make tests + if {[punk::ansi::ta::detect $textblock]} { + set textblock [punk::ansi::ansistrip $textblock] + } + if {[tcl::string::last \n $textblock] >= 0} { + set num_le [expr {[tcl::string::length $textblock]-[tcl::string::length [tcl::string::map {\n {}} $textblock]]}] ;#faster than splitting into single-char list + set width [tcl::mathfunc::max {*}[lmap v [split $textblock \n] {::punk::char::ansifreestring_width $v}]] + } else { + set num_le 0 + set width [punk::char::ansifreestring_width $textblock] + } + #our concept of block-height is likely to be different to other line-counting mechanisms + set height [expr {$num_le + 1}] ;# one line if no le - 2 if there is one trailing le even if no data follows le + + return [tcl::dict::create width $width height $height] ;#maintain order in 'image processing' standard width then height - caller may use lassign [dict values [blocksize ]] width height +} + +tcl::namespace::eval overtype::priv { + variable cache_is_sgr [tcl::dict::create] + + #we are likely to be asking the same question of the same ansi codes repeatedly + #caching the answer saves some regex expense - possibly a few uS to lookup vs under 1uS + #todo - test if still worthwhile after a large cache is built up. (limit cache size?) + proc is_sgr {code} { + variable cache_is_sgr + if {[tcl::dict::exists $cache_is_sgr $code]} { + return [tcl::dict::get $cache_is_sgr $code] + } + set answer [punk::ansi::codetype::is_sgr $code] + tcl::dict::set cache_is_sgr $code $answer + return $answer + } + # better named render_to_unapplied? + proc render_unapplied {overlay_grapheme_control_list gci} { + upvar idx_over idx_over + upvar unapplied unapplied + upvar unapplied_list unapplied_list ;#maintaining as a list allows caller to utilize it without having to re-split + upvar overstacks overstacks + upvar overstacks_gx overstacks_gx + upvar overlay_grapheme_control_stacks og_stacks + + #set unapplied [join [lrange $overlay_grapheme_control_list $gci+1 end]] + set unapplied "" + set unapplied_list [list] + #append unapplied [join [lindex $overstacks $idx_over] ""] + #append unapplied [punk::ansi::codetype::sgr_merge_list {*}[lindex $overstacks $idx_over]] + set sgr_merged [punk::ansi::codetype::sgr_merge_list {*}[lindex $og_stacks $gci]] + if {$sgr_merged ne ""} { + lappend unapplied_list $sgr_merged + } + switch -- [lindex $overstacks_gx $idx_over] { + "gx0_on" { + lappend unapplied_list "\x1b(0" + } + "gx0_off" { + lappend unapplied_list "\x1b(B" + } + } + + foreach gc [lrange $overlay_grapheme_control_list $gci+1 end] { + lassign $gc type item + #types g other sgr gx0 + if {$type eq "gx0"} { + if {$item eq "gx0_on"} { + lappend unapplied_list "\x1b(0" + } elseif {$item eq "gx0_off"} { + lappend unapplied_list "\x1b(B" + } + } else { + lappend unapplied_list $item + } + } + set unapplied [join $unapplied_list ""] + } + + #clearer - renders the specific gci forward as unapplied - prefixed with it's merged sgr stack + proc render_this_unapplied {overlay_grapheme_control_list gci} { + upvar idx_over idx_over + upvar unapplied unapplied + upvar unapplied_list unapplied_list + upvar overstacks overstacks + upvar overstacks_gx overstacks_gx + upvar overlay_grapheme_control_stacks og_stacks + + #set unapplied [join [lrange $overlay_grapheme_control_list $gci+1 end]] + set unapplied "" + set unapplied_list [list] + + set sgr_merged [punk::ansi::codetype::sgr_merge_list {*}[lindex $og_stacks $gci]] + if {$sgr_merged ne ""} { + lappend unapplied_list $sgr_merged + } + switch -- [lindex $overstacks_gx $idx_over] { + "gx0_on" { + lappend unapplied_list "\x1b(0" + } + "gx0_off" { + lappend unapplied_list "\x1b(B" + } + } + + foreach gc [lrange $overlay_grapheme_control_list $gci end] { + lassign $gc type item + #types g other sgr gx0 + if {$type eq "gx0"} { + if {$item eq "gx0_on"} { + lappend unapplied_list "\x1b(0" + } elseif {$item eq "gx0_off"} { + lappend unapplied_list "\x1b(B" + } + } else { + lappend unapplied_list $item + } + } + set unapplied [join $unapplied_list ""] + } + proc render_delchar {i} { + upvar outcols o + upvar understacks ustacks + upvar understacks_gx gxstacks + set nxt [llength $o] + if {$i < $nxt} { + set o [lreplace $o $i $i] + set ustacks [lreplace $ustacks $i $i] + set gxstacks [lreplace $gxstacks $i $i] + } elseif {$i == 0 || $i == $nxt} { + #nothing to do + } else { + puts stderr "render_delchar - attempt to delchar at index $i >= number of outcols $nxt - shouldn't happen" + } + } + proc render_erasechar {i count} { + upvar outcols o + upvar understacks ustacks + upvar understacks_gx gxstacks + upvar replay_codes_overlay replay + #ECH clears character attributes from erased character positions + #ECH accepts 0 or empty parameter, which is equivalent to 1. Caller of render_erasechar should do that mapping and only supply 1 or greater. + if {![tcl::string::is integer -strict $count] || $count < 1} { + error "render_erasechar count must be integer >= 1" + } + set start $i + set end [expr {$i + $count -1}] + #we restrict ECH to current line - as some terminals do - review - is that the only way it's implemented? + if {$i > [llength $o]-1} { + return + } + if {$end > [llength $o]-1} { + set end [expr {[llength $o]-1}] + } + set num [expr {$end - $start + 1}] + set o [lreplace $o $start $end {*}[lrepeat $num \u0000]] ;#or space? + #DECECM ??? + set ustacks [lreplace $ustacks $start $end {*}[lrepeat $num [list $replay]]] + set gxstacks [lreplace $gxstacks $start $end {*}[lrepeat $num [list]]] ;# ??? review + return + } + proc render_setchar {i c } { + upvar outcols o + lset o $i $c + } + + #Initial usecase is for old-terminal hack to add PM-wrapped \b + #review - can be used for other multibyte sequences that occupy one column? + #combiners? diacritics? + proc render_append_to_char {i c} { + upvar outcols o + if {$i > [llength $o]-1} { + error "render_append_to_char cannot append [ansistring VIEW -lf 1 -nul 1 $c] to existing char at index $i while $i >= llength outcols [llength $o]" + } + set existing [lindex $o $i] + if {$existing eq "\0"} { + lset o $i $c + } else { + lset o $i $existing$c + } + } + #is actually addgrapheme? + proc render_addchar {i c sgrstack gx0stack {insert_mode 0}} { + upvar outcols o + upvar understacks ustacks + upvar understacks_gx gxstacks + + # -- --- --- + #this is somewhat of a hack.. probably not really the equivalent of proper reverse video? review + #we should ideally be able to reverse the video of a sequence that already includes SGR reverse/noreverse attributes + upvar reverse_mode do_reverse + #if {$do_reverse} { + # lappend sgrstack [a+ reverse] + #} else { + # lappend sgrstack [a+ noreverse] + #} + + #JMN3 + if {$do_reverse} { + #note we can't just look for \x1b\[7m or \x1b\[27m + # it may be a more complex sequence like \x1b\[0\;\;7\;31m etc + + set existing_reverse_state 0 + set codeinfo [punk::ansi::codetype::sgr_merge $sgrstack -info 1] + set codestate_reverse [dict get $codeinfo codestate reverse] + switch -- $codestate_reverse { + 7 { + set existing_reverse_state 1 + } + 27 { + set existing_reverse_state 0 + } + "" { + } + } + if {$existing_reverse_state == 0} { + set rflip [a+ reverse] + } else { + #reverse of reverse + set rflip [a+ noreverse] + } + #note that mergeresult can have multiple esc (due to unmergeables or non sgr codes) + set sgrstack [list [dict get $codeinfo mergeresult] $rflip] + #set sgrstack [punk::ansi::codetype::sgr_merge [list [dict get $codeinfo mergeresult] $rflip]] + } + + # -- --- --- + + set nxt [llength $o] + if {!$insert_mode} { + if {$i < $nxt} { + #These lists must always be in sync + lset o $i $c + } else { + lappend o $c + } + if {$i < [llength $ustacks]} { + lset ustacks $i $sgrstack + lset gxstacks $i $gx0stack + } else { + lappend ustacks $sgrstack + lappend gxstacks $gx0stack + } + } else { + #insert of single-width vs double-width when underlying is double-width? + if {$i < $nxt} { + set o [linsert $o $i $c] + } else { + lappend o $c + } + if {$i < [llength $ustacks]} { + set ustacks [linsert $ustacks $i $sgrstack] + set gxstacks [linsert $gxstacks $i $gx0stack] + } else { + lappend ustacks $sgrstack + lappend gxstacks $gx0stack + } + } + } + +} + + + +# -- --- --- --- --- --- --- --- --- --- --- +tcl::namespace::eval overtype { + interp alias {} ::overtype::center {} ::overtype::centre +} + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Ready +package provide overtype [tcl::namespace::eval overtype { + variable version + set version 1.6.6 +}] +return + +#*** !doctools +#[manpage_end] diff --git a/src/vfs/_vfscommon.vfs/modules/punk/cesu-0.1.0.tm b/src/vfs/_vfscommon.vfs/modules/punk/cesu-0.1.1.tm similarity index 79% rename from src/vfs/_vfscommon.vfs/modules/punk/cesu-0.1.0.tm rename to src/vfs/_vfscommon.vfs/modules/punk/cesu-0.1.1.tm index 5f1b813a..872cf36e 100644 --- a/src/vfs/_vfscommon.vfs/modules/punk/cesu-0.1.0.tm +++ b/src/vfs/_vfscommon.vfs/modules/punk/cesu-0.1.1.tm @@ -9,7 +9,7 @@ # http://paste.tclers.tk/5977 # # @@ Meta Begin -# Application punk::cesu 0.1.0 +# Application punk::cesu 0.1.1 # Meta platform tcl # Meta license # @@ Meta End @@ -19,14 +19,14 @@ # doctools header # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ #*** !doctools -#[manpage_begin punkshell_module_punk::cesu 0 0.1.0] +#[manpage_begin punkshell_module_punk::cesu 0 0.1.1] #[copyright "2024"] #[titledesc {CESU compatibility ehcoding scheme for utf-16: 8-Bit (CESU-8) ??}] [comment {-- Name section and table of contents description --}] -#[moddesc {CESU experimental}] [comment {-- Description at end of page heading --}] +#[moddesc {CESU experimental}] [comment {-- Description at end of page heading --}] #[require punk::cesu] #[keywords module cesu encoding compatibility experimental unofficial] #[description] -#[para] experimental +#[para] experimental # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ @@ -34,8 +34,8 @@ #[section Overview] #[para] overview of punk::cesu #[subsection Concepts] -#[para] cesu-8 may be mistaken for utf-8 if no supplementary chars present. -#[para] see: https://www.unicode.org/reports/tr26/tr26-4.html +#[para] cesu-8 may be mistaken for utf-8 if no supplementary chars present. +#[para] see: https://www.unicode.org/reports/tr26/tr26-4.html #[para] Particulary note discouragement of use especially in external interchange. @@ -52,9 +52,6 @@ package require Tcl 8.6- #*** !doctools #[item] [package {Tcl 8.6}] -# #package require frobz -# #*** !doctools -# #[item] [package {frobz}] #*** !doctools #[list_end] @@ -70,11 +67,11 @@ package require Tcl 8.6- # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ tcl::namespace::eval punk::cesu { tcl::namespace::export {[a-z]*} ;# Convention: export all lowercase - variable PUNKARGS + variable PUNKARGS #*** !doctools #[subsection {Namespace punk::cesu}] - #[para] Core API functions for punk::cesu + #[para] Core API functions for punk::cesu #[list_begin definitions] @@ -127,7 +124,7 @@ tcl::namespace::eval punk::cesu { binary scan $1 c 1 binary scan $2 c 2 binary scan $3 c 3 - puts [list $1 $2 $3] + #puts [list $1 $2 $3] #binary scan $4 c 4 incr 1 ;#// Effectively adds 0x10000 to the codepoint ? @@ -155,7 +152,7 @@ tcl::namespace::eval punk::cesu { [expr {0x80 | (($1 & 0x3) << 4) | (($2 & 0x3C) >> 2)}] \ [expr {0x80 | (($2 & 0x3) << 4) | ($3 & 0xF)}] \ $4] - + } else { puts "Invalid sequence: $char" return $char @@ -177,26 +174,78 @@ tcl::namespace::eval punk::cesu { #e.g from_surrogatestring "note \ud83f\udd1e etc" #e.g from_surrogatestring "faces \ud83d\ude10 \ud83d\ude21 \ud83d\ude31" - #note: from_surrogatestring \U1f600 returns a mouse (\U1f400) instead of smiley + #note: from_surrogatestring \U1f600 returns a mouse (\U1f400) instead of smiley # but from_surrogatestring \U1f400 returns a mouse. # Tcl bug - fixed some time in 9.x - # surrogated_string shouldn't include non BMP chars anyway (G.I.G.O?) + # surrogated_string theoretically shouldn't include non BMP chars anyway (but may in some contexts? mixed surrogate escapes and raw nonbmp?) lappend PUNKARGS [list { @id -id ::punk::cesu::from_surrogatestring @cmd -name punk::cesu::from_surrogatestring -help\ "Convert a string containing surrogate pairs - to string with pairs converted to unicode non-BMP + to Tcl string with pairs converted to unicode non-BMP characters" - @values + @values surrogated_string -help\ "May contain a mix of surrogate pairs and other characters - only the surrogate pairs will be converted." }] - proc from_surrogatestring {surrogated_string} { + + proc from_surrogatestring {str} { + #high surrogate character rep followed by low surrogate character rep + if {[regexp {[\uD800-\uDBFF][\uDC00-\uDFFF]} $str]} { + set str [string map {\[ \\\[ \] \\\]} $str] ;#Make sure any existing commandlike structures aren't executed + return [subst -novariables -nobackslashes [regsub -all {([\uD800-\uDBFF])([\uDC00-\uDFFF])} $str {[surrogatepair_to_codepoint \1 \2]} ]] + } else { + return $str + } + } + proc surrogatepair_to_codepoint {highchar lowchar} { + if {[string length $highchar] != 1 || [string length $lowchar] !=1} { + error "surrogatepair_to_codepoint expected surrogate pair encoded as 2 characters" + } + #NOTE in tcl8 - we get oddity that 'split ""' returns a list of length 1 even though there are 2 chars + #fixed in tcl9 + #lassign [split $2_surrogate_chars ""] highSurrogateChar lowSurrogateChar + + scan $highchar %c highDecimal + scan $lowchar %c lowDecimal + + set highDecimal [expr {$highDecimal - 0xD800}] + set lowDecimal [expr {$lowDecimal - 0xDC00}] + # Combine the values and add 0x10000 to get the original code point + set codepointDecimal [expr {($highDecimal << 10) + $lowDecimal + 0x10000}] + #puts "->codepointDecimal $codepointDecimal" + + #In tcl8 - we will get \uFFFD for non BMP codepoints - todo ? + return [format %c $codepointDecimal] + } + #e.g {\ud83d\ude00} + proc escaped_surrogatepair_to_codepoint {spair} { + set spair [string map {" " ""} $spair] + if {[string length $spair] != 12} { + error "escaped_surrogatepair_to_codepoint expected input of form \\uXXXX\\uXXXX" + } + set normalised [regsub -all {\\+u} $spair ""] + set highSurrogate [string range $normalised 0 3] + set lowSurrogate [string range $normalised 4 end] + scan $highSurrogate %x highDecimal + scan $lowSurrogate %x lowDecimal + + set highDecimal [expr {$highDecimal - 0xD800}] + set lowDecimal [expr {$lowDecimal - 0xDC00}] + # Combine the values and add 0x10000 to get the original code point + set codepointDecimal [expr {($highDecimal << 10) + $lowDecimal + 0x10000}] + return [format %c $codepointDecimal] + } + + proc from_surrogatestring_via_cesu {surrogated_string} { + #we can do this without cesu (from_surrogatestring) set cesu [encoding convertto cesu-8 $surrogated_string] set x [cesu2utf $cesu] encoding convertfrom utf-8 $x } + + proc _to_test {emoji} { puts stderr "_to_test incomplete" set cesu [encoding convertto cesu-8 $e] @@ -209,7 +258,7 @@ tcl::namespace::eval punk::cesu { -format -default escape -choices {raw escape} -choicelabels { raw\ " emit raw surrogate pairs - may not be writable to + may not be writable to output channels" escape\ " emit unprocessed backslash hex @@ -224,7 +273,7 @@ tcl::namespace::eval punk::cesu { e.g >to_surrogatestring -format escape \"mouse: \\U1f400\" mouse: \\uD83D\\uDC00 - " + " }] proc to_surrogatestring {args} { set argd [punk::args::parse $args withid ::punk::cesu::to_surrogatestring] @@ -273,14 +322,14 @@ tcl::namespace::eval punk::cesu { #set lsbinfo [punk::char::char_info_dec $lsbfinal -fields all -except testwidth] ;#don't use all/testwidth will try to emit the char and fail/show error set esc "\\u$msbhex\\u$lsbhex" set raw [format %c $msbfinal][format %c $lsbfinal] - return [dict create escapes $esc msbdec $msbfinal msbhex $msbhex lsbdec $lsbfinal lsbhex $lsbhex raw $raw] + return [dict create escapes $esc msbdec $msbfinal msbhex $msbhex lsbdec $lsbfinal lsbhex $lsbhex raw $raw] } # #test_enc_equivalency \U1f400 \U1f600 proc test_enc_equivalency {c1 c2} { package require punk::ansi - namespace import ::punk::ansi::a+ ::punk::ansi::a + namespace import ::punk::ansi::a+ ::punk::ansi::a foreach enc [lsort [encoding names]] { puts stdout "testing $enc" if {$enc in "iso2022 iso2022-jp iso2022-kr"} { @@ -315,14 +364,14 @@ tcl::namespace::eval punk::cesu::lib { tcl::namespace::path [tcl::namespace::parent] #*** !doctools #[subsection {Namespace punk::cesu::lib}] - #[para] Secondary functions that are part of the API + #[para] Secondary functions that are part of the API #[list_begin definitions] #proc utility1 {p1 args} { # #*** !doctools # #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]] - # #[para]Description of utility1 - # return 1 + # #[para]Description of utility1 + # return 1 #} @@ -340,15 +389,15 @@ tcl::namespace::eval punk::cesu::lib { #tcl::namespace::eval punk::cesu::system { #*** !doctools #[subsection {Namespace punk::cesu::system}] - #[para] Internal functions that are not part of the API + #[para] Internal functions that are not part of the API #} # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# == === === === === === === === === === === === === === === +# == === === === === === === === === === === === === === === # Sample 'about' function with punk::args documentation -# == === === === === === === === === === === === === === === +# == === === === === === === === === === === === === === === tcl::namespace::eval punk::cesu { tcl::namespace::export {[a-z]*} ;# Convention: export all lowercase variable PUNKARGS @@ -371,7 +420,7 @@ tcl::namespace::eval punk::cesu { set about_topics [list] foreach f $topic_funs { set tail [namespace tail $f] - lappend about_topics [string range $tail [string length get_topic_] end] + lappend about_topics [string range $tail [string length get_topic_] end] } #Adjust this function or 'default_topics' if a different order is required return [lsort $about_topics] @@ -379,12 +428,12 @@ tcl::namespace::eval punk::cesu { proc default_topics {} {return [list Description *]} # ------------------------------------------------------------- - # get_topic_ functions add more to auto-include in about topics + # get_topic_ functions add more to auto-include in about topics # ------------------------------------------------------------- proc get_topic_Description {} { - punk::args::lib::tstr [string trim { + punk::args::lib::tstr [string trim { package punk::cesu - description to come.. + cesu and surrogate-pair processing } \n] } proc get_topic_License {} { @@ -406,7 +455,8 @@ tcl::namespace::eval punk::cesu { } proc get_topic_custom-topic {} { punk::args::lib::tstr -return string { - nothing to see here + This library can be used for surrogate-pair handling. + cesu utilities are used internally in from_surrogatestring } } # ------------------------------------------------------------- @@ -415,9 +465,9 @@ tcl::namespace::eval punk::cesu { # we re-use the argument definition from punk::args::standard_about and override some items set overrides [dict create] dict set overrides @id -id "::punk::cesu::about" - dict set overrides @cmd -name "punk::cesu::about" + dict set overrides @cmd -name "punk::cesu::about" dict set overrides @cmd -help [string trim [punk::args::lib::tstr { - About punk::cesu + About punk::cesu }] \n] dict set overrides topic -choices [list {*}[punk::cesu::argdoc::about_topics] *] dict set overrides topic -choicerestricted 1 @@ -433,7 +483,7 @@ tcl::namespace::eval punk::cesu { } } # end of sample 'about' function -# == === === === === === === === === === === === === === === +# == === === === === === === === === === === === === === === # ----------------------------------------------------------------------------- @@ -446,11 +496,11 @@ namespace eval ::punk::args::register { lappend ::punk::args::register::NAMESPACES ::punk::cesu } # ----------------------------------------------------------------------------- -## Ready +## Ready package provide punk::cesu [tcl::namespace::eval punk::cesu { variable pkg punk::cesu variable version - set version 0.1.0 + set version 0.1.1 }] return diff --git a/src/vfs/_vfscommon.vfs/modules/test/tomlish-1.1.5.tm b/src/vfs/_vfscommon.vfs/modules/test/tomlish-1.1.5.tm index 536e3fa3..35de5e70 100644 Binary files a/src/vfs/_vfscommon.vfs/modules/test/tomlish-1.1.5.tm and b/src/vfs/_vfscommon.vfs/modules/test/tomlish-1.1.5.tm differ diff --git a/src/vfs/_vfscommon.vfs/modules/test/tomlish-1.1.5.tm.x b/src/vfs/_vfscommon.vfs/modules/test/tomlish-1.1.5.tm.x new file mode 100644 index 00000000..536e3fa3 Binary files /dev/null and b/src/vfs/_vfscommon.vfs/modules/test/tomlish-1.1.5.tm.x differ diff --git a/src/vfs/_vfscommon.vfs/modules/tomlish-1.1.5.tm b/src/vfs/_vfscommon.vfs/modules/tomlish-1.1.5.tm index 7ff93c3e..a8f33d38 100644 --- a/src/vfs/_vfscommon.vfs/modules/tomlish-1.1.5.tm +++ b/src/vfs/_vfscommon.vfs/modules/tomlish-1.1.5.tm @@ -531,17 +531,34 @@ namespace eval tomlish { } else { #we have a table - but is it a tablearray? set ttype [dictn get $tablenames_info [list $norm_segments type]] - #use a tabletype_unknown type for previous 'created' only tables? - if {$ttype ne "header_tablearray"} { - set msg "tablearray name $tablearrayname already appears to be already created as a table not a tablearray - invalid?" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + #we use a header_unknown type for previous 'created' only tables + + if {$ttype eq "header_unknown"} { + dictn set tablenames_info [list $norm_segments type] header_tablearray + set ttype header_tablearray + #assert - must not be 'defined' + #we have seen it before as a supertable ie 'created' only + #Not 'defined' but could still have subtables - treat it as a dict + set ARRAY_ELEMENTS [dict get $datastructure {*}$norm_segments] + } else { + if {$ttype ne "header_tablearray"} { + #header_table or itable + switch -- $ttype { + itable {set ttypename itable} + header_table {set ttypename table} + default {error "unrecognised type - expected header_table or itable"} + } + set msg "tablearray name $tablearrayname already appears to be already created as '$ttypename' not tablearray - invalid?" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #EXISTING tablearray + #add to array + #error "add_to_array not implemented" + #{type ARRAY value } + set ARRAY_ELEMENTS [dict get $datastructure {*}$norm_segments value] } - #add to array - #error "add_to_array not implemented" - #{type ARRAY value } - set ARRAY_ELEMENTS [dict get $datastructure {*}$norm_segments value] } @@ -748,7 +765,7 @@ namespace eval tomlish { set norm_segments [::tomlish::to_dict::tablename_split $tablename true] ;#true to normalize set T_DEFINED [dictn getdef $tablenames_info [list $norm_segments defined] NULL] - if {$T_DEFINED ne "NULL"} { + if {$T_DEFINED ni [list NULL header_tablearray]} { #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path set msg "Table name $tablename has already been directly defined in the toml data. Invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] @@ -779,7 +796,8 @@ namespace eval tomlish { return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg } #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here - dictn set tablenames_info [list $supertable type] header_table + #we also don't know whether it's a table or a tablearray + dictn set tablenames_info [list $supertable type] header_unknown #ensure empty tables are still represented in the datastructure dict set datastructure {*}$supertable [list] } else { diff --git a/src/vfs/_vfscommon.vfs/modules/tomlish-1.1.6.tm b/src/vfs/_vfscommon.vfs/modules/tomlish-1.1.6.tm new file mode 100644 index 00000000..dddcd0bb --- /dev/null +++ b/src/vfs/_vfscommon.vfs/modules/tomlish-1.1.6.tm @@ -0,0 +1,8408 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) 2024 +# +# @@ Meta Begin +# Application tomlish 1.1.6 +# Meta platform tcl +# Meta license +# @@ Meta End + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# doctools header +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +#*** !doctools +#[manpage_begin tomlish_module_tomlish 0 1.1.6] +#[copyright "2024"] +#[titledesc {tomlish toml parser}] [comment {-- Name section and table of contents description --}] +#[moddesc {tomlish}] [comment {-- Description at end of page heading --}] +#[require tomlish] +#[keywords module parsing toml configuration] +#[description] +#[para] tomlish is an intermediate representation of toml data in a tree structure (tagged lists representing type information) +#[para] The design goals are for tomlish to be whitespace and comment preserving ie byte-for byte preservation during roundtrips from toml to tomlish and back to toml +#[para] The tomlish representation can then be converted to a Tcl dict structure or to other formats such as json, +#[para] although these other formats are generally unlikely to retain whitespace or comments +#[para] The other formats also won't preserve roundtripability e.g \t and a literal tab coming from a toml file will be indistinguishable. +#[para] A further goal is to allow at least a useful subset of in-place editing operations which also preserve whitespace and comments. +#[para] e.g leaf key value editing, and table reordering/sorting, key-renaming at any level, key insertions/deletions +#[para] The API for editing (tomldoc object?) may require explicit setting of type if accessing an existing key +#[para] e.g setting a key that already exists and is a different type (especially if nested structure such as a table or array) +#[para] will need a -type option (-force ?) to force overriding with another type such as an int. + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section Overview] +#[para] overview of tomlish +#[subsection Concepts] +#[para] - + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[subsection dependencies] +#[para] packages used by tomlish +#[list_begin itemized] + +package require Tcl 8.6- +package require struct::stack +package require logger + +#*** !doctools +#[item] [package {Tcl 8.6-}] +#[item] [package {struct::stack}] + +#limit ourselves to clear, destroy, peek, pop, push, rotate, or size (e.g v 1.3 does not implement 'get') + + +#*** !doctools +#[list_end] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +#*** !doctools +#[section API] + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# Base namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval tomlish { + namespace export {[a-z]*}; # Convention: export all lowercase + variable types + + #*** !doctools + #[subsection {Namespace tomlish}] + #[para] Core API functions for tomlish + #[list_begin definitions] + + #default interp recursionlimit of 1000 is insufficient to pass 1000 deep nested structures as in certain toml tests. + #e.g https://github.com/iarna/toml-spec-tests/tree/latest/values + #1000 seems deep for a 'configuration' format - but toml sometimes used for other serialisation purposes. + #todo - review + set existing_recursionlimit [interp recursionlimit {}] + if {$existing_recursionlimit < 5000} { + interp recursionlimit {} 5000 + } + + #IDEAS: + # since get_toml produces tomlish with whitespace/comments intact: + # tomldoc object - allow (at least basic?) editing of toml whilst preserving comments & whitespace + # - setKey (set leaf only to value) how to specify type? -type option? - whole array vs index into arrays and further nested objects? - option for raw toml additions? + # - separate addKey?? + # - deleteKey (delete leaf) + # - deleteTable (delete table - if only has leaves? - option to delete with child tables?) + # - set/add Table? - position in doc based on existing tables/subtables? + + #The tomlish intermediate representation allows things such as sorting the toml document by table name or other re-ordering of tables - + # because the tables include subkeys, comments and newlines within their structure - those elements all come along with it nicely during reordering. + #The same goes for the first newline following a keyval e.g x=1 \ny=2\n\n + #The newline is part of the keyval structure so makes reordering easier + #example from_toml "a=1\nb=2\n\n\n" + # 0 = TOMLISH + # 1 = KEY a = {INT 1} {NEWLINE lf} + # 2 = NEWLINE lf + # 3 = KEY b = {INT 2} {NEWLINE lf} + # 4 = NEWLINE lf + # 5 = NEWLINE lf + + #This reordering idea is complicated by the nature of tablearrays - especially as a table header references last tablearrayname, + # and duplicate table headers are allowed in that context. + #e.g + #[[fruits]] + # name="apple" + # [fruits.metadata] + # id=1 + # + #[unrelated1] + # + #[[fruits]] + # name="pear" + # + #[unrelated2] + # silly="ordering" + # + #[fruits.metadata] + #id=2 + #The TABLEARRAY record can't be completely selfcontained on the default parsing mechanism - because it is legal (though not recommended) to have unrelated tables in between. + #If we were to 'insert' later related records (such as the 2nd [fruits.metadata] above) into the TABLEARRAY structure - then, even though it might produce 'nicer' toml, + # we would lose roundtripability toml->tomlish->toml + # ----------------------------------------------------- + #REVIEW + #todo - some sort of 'normalize'/'grouping' function on tomlish that at least makes records self-contained, and perhaps then (optionally) reorders resulting records sensibly. + #such a function on the tomlish may work - although it would be unwise to duplicate the validation aspects of dict::from_tomlish + #The most practical way might be to use dict::from_tomlish followed by from_dict - but that would lose comment info and formatting. + #In the above example - The decision by the toml author to put [unrelated1] between related tablearrays should be respected, + #but the positioning of [unrelated2] between a tablearray and one of its contained tables is suspect. + #Both [fruits.metadata] table records should theoretically be added as children to their corresponding [[fruits]] tablearray record in the tomlish. (just as their name keys are) + # ----------------------------------------------------- + + + + #ARRAY is analogous to a Tcl list + #TABLE is analogous to a Tcl dict + #WS = inline whitespace + #KEY = bare key and value + #DQKEY = double quoted key and value + #SQKEY = single quoted key and value + #ITABLE = inline table (*can* be anonymous table) + # inline table values immediately create a table with the opening brace + # inline tables are fully defined between their braces, as are dotted-key subtables defined within + # No additional subtables or arrays of tables may be defined within an inline table after the ending brace - they must be entirely self-contained + + set tags [list TOMLISH BOM ARRAY TABLE ITABLE TABLEARRAY WS NEWLINE COMMENT DOTTEDKEY KEY DQKEY SQKEY STRING STRINGPART MULTISTRING LITERAL LITERALPART MULTILITERAL INT FLOAT BOOL] + #DDDD + lappend tags {*}[list\ + DATETIME\ + DATETIME-LOCAL\ + DATE-LOCAL\ + TIME-LOCAL\ + ] + + #removed - ANONTABLE + #tomlish v1.0 should accept arbitrary 64-bit signed ints (from -2^63 to 2^63-1) + #we will restrict to this range for compatibility for now - although Tcl can handle larger (arbitrarily so?) + #todo - configurable - allow empty string for 'unlimited' + set min_int -9223372036854775808 ;#-2^63 + set max_int +9223372036854775807 ;#2^63-1 + + proc Dolog {lvl txt} { + #return "$lvl -- $txt" + set msg "[clock format [clock seconds] -format "%Y-%m-%dT%H:%M:%S"] tomlish '$txt'" + puts stderr $msg + } + logger::initNamespace ::tomlish + foreach lvl [logger::levels] { + interp alias {} tomlish_log_$lvl {} ::tomlish::Dolog $lvl + log::logproc $lvl tomlish_log_$lvl + } + + + proc tags {} { + return $::tomlish::tags + } + + proc get_dottedkey_info {dottedkeyrecord} { + set key_hierarchy [list] + set key_hierarchy_raw [list] + if {[lindex $dottedkeyrecord 0] ne "DOTTEDKEY"} { + error "tomlish::get_dottedkey_info error. Supplied list doesn't appear to be a DOTTEDKEY (tag: [lindex $dottedkeyrecord 0])" + } + set compoundkeylist [lindex $dottedkeyrecord 1] + set expect_sep 0 + foreach part $compoundkeylist { + set parttag [lindex $part 0] + if {$parttag eq "WS"} { + continue + } + if {$expect_sep} { + if {$parttag ne "DOTSEP"} { + error "DOTTEDKEY missing dot separator between parts. '$dottedkeyrecord'" + } + set expect_sep 0 + } else { + set val [lindex $part 1] + switch -exact -- $parttag { + KEY { + lappend key_hierarchy $val + lappend key_hierarchy_raw $val + } + DQKEY { + #REVIEW unescape or not? + #JJJJ + lappend key_hierarchy [::tomlish::utils::unescape_string $val] + lappend key_hierarchy_raw \"$val\" + } + SQKEY { + lappend key_hierarchy $val + lappend key_hierarchy_raw "'$val'" + } + default { + error "tomlish::get_dottedkey_info DOTTED key unexpected part '$parttag' - ensure dot separator is between key parts. '$compoundkeylist'" + } + } + set expect_sep 1 + } + } + return [dict create keys $key_hierarchy keys_raw $key_hierarchy_raw] + } + + #helper function for tomlish::dict::from_tomlish + proc _get_keyval_value {keyval_element} { + #e.g + #DOTTEDKEY {{KEY a} {WS { }}} = {WS { }} {ARRAY {INT 1} SEP {ITABLE {DOTTEDKEY {{KEY x}} = {INT 1} SEP} {DOTTEDKEY {{KEY y}} = {INT 2}}}} + + log::notice ">>> _get_keyval_value from '$keyval_element'<<<" + #find the value (or 2 values if space separated datetime - and stitch back into one) + # 3 is the earliest index at which the value could occur (depending on whitespace) + if {[lindex $keyval_element 2] ne "="} { + error "tomlish _get_keyval_value keyval_element doesn't seem to be a properly structured { = } list\n $keyval_element" + } + + #review + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info + } else { + set tablenames_info [dict create] ;#keys are @@ paths {@@parenttable @@arrayable @@etc} corresponding to parenttable.arraytable[].etc + #value is a dict with keys such as ttype, tdefined + } + set sublist [lrange $keyval_element 3 end] ;# rhs of = + + set values [list] + set value_posns [list] + set posn 0 + foreach sub $sublist { + #note that a barekey/dquotedkey won't occur directly inside a barekey/dquotedkey + #DDDD + switch -exact -- [lindex $sub 0] { + STRING - LITERAL - MULTISTRING - MULTILITERAL - INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - TABLE - ARRAY - ITABLE { + lappend values $sub + lappend value_posns $posn + } + DOTTEDKEY { + #we should never see DOTTEDKEY as a toplevel element on RHS + #sanity check in case manually manipulated tomlish - or something went very wrong + set msg "tomlish::_get_keyval_value Unexpected toplevel value element DOTTEDKEY after =" + return -code error -errorcode {TOMLISH SYNTAX UNEXPECTEDDOTTEDKEYRHS} $msg + } + WS - NEWLINE - COMMENT {} + SEP {} + default { + set msg "tomlish::_get_keyval_value Unexpected toplevel value element [lindex $sub 0] after =" + return -code error -errorcode {TOMLISH SYNTAX UNEXPECTED} $msg + } + } + incr posn + } + switch -- [llength $values] { + 0 { + error "tomlish Failed to find value element in KEY. '$keyval_element'" + } + 1 { + lassign [lindex $values 0] type value + } + 2 { + #we generally expect a single 'value' item on RHS of = + #(ignoring WS,NEWLINE,SEP + #(either a simple type, or a container which has multiple values inside) + #exception for space separated datetime which is two toplevel values + + #validate than exactly single space was between the two values + lassign $value_posns p1 p2 + if {$p2 != $p1 +2} { + #sanity check + #can probably only get here through manual manipulation of the tomlish list to an unprocessable form + error "tomlish KEY appears to have more than one part - but not separated by whitespace - invalid '$keyval_element'" + } + set between_token [lindex $sublist $p1+1] + if {[lindex $between_token 1] ne " "} { + error "tomlish KEY in 2 parts is not separated by a single space - cannot consider for datetime '$keyval_element'" + } + lassign [lindex $values 0] type_d1 value_d1 + lassign [lindex $values 1] type_d2 value_d2 + #DDDD + if {$type_d1 ne "DATE-LOCAL" || $type_d2 ni {DATETIME TIME-LOCAL}} { + #we reuse DATETIME tag for standalone time with tz offset (or zZ) + error "tomlish KEY in 2 parts does not appear to be datetime '$keyval_element'" + } + if {$type_d2 eq "TIME-LOCAL"} { + set type DATETIME-LOCAL + } else { + #extra check that 2nd part is actually a time + if {![tomlish::utils::is_timepart $value_d2]} { + error "tomlish KEY in 2 parts does not appear to be datetime. (part 2 not a time value) '$keyval_element'" + } + set type DATETIME + } + set value "${value_d1}T${value_d2}" + } + default { + error "tomlish Found multiple value elements in KEY, expected one. (or 2 for space-separated datetime) '$keyval_element'" + } + } + set sub_tablenames_info [dict create] + switch -exact -- $type { + INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + #DDDD + #simple (non-container, no-substitution) datatype + set result [list type $type value $value] + } + STRING - STRINGPART { + #JJJ + #!!! review + #set result [list type $type value [::tomlish::utils::unescape_string $value]] + set result [list type $type value $value] + } + LITERAL - LITERALPART { + #REVIEW + set result [list type $type value $value] + } + TABLE { + #invalid? + error "tomlish _get_keyval_value invalid to have type TABLE on rhs of =" + } + ITABLE { + #This one should not be returned as a type value structure! + # + set prev_tablenames_info $tablenames_info + set tablenames_info [dict create] + set result [::tomlish::dict::from_tomlish [ list [lindex $values 0] ]] + set sub_tablenames_info $tablenames_info + set tablenames_info $prev_tablenames_info + } + ARRAY { + #we need to recurse to get the corresponding dict for the contained item(s) + #pass in the whole [lindex $values 0] (type val) - not just the $value! + set prev_tablenames_info $tablenames_info + set tablenames_info [dict create] + set result [list type $type value [ ::tomlish::dict::from_tomlish [ list [lindex $values 0] ] ]] + set sub_tablenames_info $tablenames_info + set tablenames_info $prev_tablenames_info + } + MULTISTRING - MULTILITERAL { + #review - mapping these to STRING might make some conversions harder? + #if we keep the MULTI - we know we have to look for newlines for example when converting to json + #without specific types we'd have to check every STRING - and lose info about how best to map chars within it + set result [list type $type value [ ::tomlish::dict::from_tomlish [ list [lindex $values 0] ] ]] + } + default { + error "tomlish Unexpected value type '$type' found in keyval '$keyval_element'" + } + } + return [dict create result $result tablenames_info $sub_tablenames_info] + } + + + proc to_dict {tomlish} { + tomlish::dict::from_tomlish $tomlish + } + + + + proc _from_dictval_tomltype {parents tablestack keys typeval} { + set type [dict get $typeval type] + set val [dict get $typeval value] + #These are the restricted sets of typed used in the tomlish::dict representation + #They are a subset of the types in tomlish: data types plus ARRAY, arranged in a dictionary form. + #The container types: ITABLE, TABLE, TABLEARRAY are not used as they are represented as dictionary keys and ARRAY items. + #The WS, COMMENT, and NEWLINE elements are also unrepresented in the dict structure. + switch -- $type { + ARRAY { + set subitems [list] + foreach item $val { + lappend subitems [_from_dictval [list {*}$parents ARRAY] $tablestack $keys $item] SEP + } + if {[lindex $subitems end] eq "SEP"} { + set subitems [lrange $subitems 0 end-1] + } + return [list ARRAY {*}$subitems] + } + ITABLE { + error "not applicable" + if {$val eq ""} { + return ITABLE + } else { + return [_from_dictval [list {*}$parents ITABLE] $tablestack $keys $val] + } + } + STRING { + #JSJS + #if our dict came from json - we have already decided what type of STRING/LITERAL etc to use when building the dict + + #do not validate like this - important that eg json val\\ue -> dict val\ue -> tomlish/toml val\\ue + #see toml-tests + #if {![tomlish::utils::rawstring_is_valid_tomlstring $val]} { + # #todo? + # return -code error -errorcode {TOML SYNTAX INVALIDSTRING} "Unescaped controls in string or invalid escapes" + #} + return [list STRING [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $val]] + } + MULTISTRING { + #value is a raw string that isn't encoded as tomlish + #create a valid toml snippet with the raw value and decode it to the proper tomlish MULTISTRING format + #We need to convert controls in $val to escape sequences - except for newlines + # + #consider an *option* to reformat for long lines? (perhaps overcomplex - byte equiv - but may fold in ugly places) + #we could use a line-length limit to decide when to put in a "line ending backslash" + #and even format it with a reasonable indent so that proper CONT and WS entries are made (?) REVIEW + # + #TODO + set tomlpart "x=\"\"\"\\\n" + append tomlpart [tomlish::utils::rawstring_to_MultiBstring_with_escaped_controls $val] + append tomlpart "\"\"\"" + set tomlish [tomlish::from_toml $tomlpart] + #e.g if val = " etc\nblah" + #TOMLISH {DOTTEDKEY {{KEY x}} = {MULTISTRING CONT {NEWLINE LF} {WS { }} {STRINGPART etc} {NEWLINE lf} {STRINGPART blah} } } + #lindex 1 3 is the MULTISTRING tomlish list + return [lindex $tomlish 1 3] + } + MULTILITERAL { + #MLL string can contain newlines - but still no control chars + #todo - validate - e.g val can't contain more than 2 squotes in a row + if {[string first ''' $val] >=0} { + set msg "_from_dictval_tomltype error: more than 2 single quotes in a row found in MULTILITERAL - cannot encode dict to TOML-VALID TOMLISH" + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + } + + #rawstring_is_valid_multiliteral - allow newlines as lf or crlf - but not bare cr + if {![tomlish::utils::rawstring_is_valid_multiliteral $val]} { + return -code error -errorcode {TOML SYNTAX INVALIDMULTILITERAL} "Controls other than tab or newlines found in multiliteral" + } + + set tomlpart "x='''\n" + append tomlpart $val ''' + set tomlish [tomlish::from_toml $tomlpart] + return [lindex $tomlish 1 3] + } + LITERAL { + #from v1.0 spec - "Control characters other than tab are not permitted in a literal string" + #(This rules out raw ANSI SGR - which is somewhat restrictive - but perhaps justified for a config format + # as copy-pasting ansi to a config value is probably not always wise, and it's not something that can be + # easily input via a text editor. ANSI can go in Basic strings using the \e escape if that's accepted v1.1?) + #we could choose to change the type to another format here when encountering invalid chars - but that seems + #like too much magic. We elect to error out and require the dict to have valid data for the types it specifies. + if {[string first ' $val] >=0} { + set msg "_from_dictval_tomltype error: single quote found in LITERAL - cannot encode dict to TOML-VALID TOMLISH" + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + } + #JJJJ + if {![tomlish::utils::rawstring_is_valid_literal $val]} { + #has controls other than tab + #todo - squote? + return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} "Controls other than tab found in literal" + } + return [list LITERAL $val] + } + INT { + if {![::tomlish::utils::is_int $val]} { + error "_from_dictval_tomltype error: bad INT value '$val' - cannot encode dict to TOML-VALID TOMLISH" + } + return [list INT $val] + } + FLOAT { + if {![::tomlish::utils::is_float $val]} { + error "_from_dictval_tomltype error: bad FLOAT value '$val' - cannot encode dict to TOML-VALID TOMLISH" + } + return [list FLOAT $val] + } + default { + if {$type ni [::tomlish::tags]} { + error "_from_dictval_tomltype error: Unrecognised typename '$type' in {type value } - cannot encode dict to TOML-VALID TOMLISH" + } + return [list $type $val] + } + } + } + + proc _from_dictval {parents tablestack keys vinfo} { + set k [lindex $keys end] + set K_PART [tomlish::dict::classify_rawkey $k] ;#get [list SQKEY ] + #puts stderr "---parents:'$parents' keys:'$keys' vinfo: $vinfo---" + #puts stderr "---tablestack: $tablestack---" + set result [list] + set lastparent [lindex $parents end] + if {$lastparent in [list "" do_inline]} { + if {[tomlish::dict::is_typeval $vinfo]} { + set type [dict get $vinfo type] + #treat ITABLE differently? + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] + lappend result DOTTEDKEY [list $K_PART {WS { }}] = {WS { }} $sublist {NEWLINE lf} + } else { + if {$vinfo ne ""} { + + #set result [list DOTTEDKEY [list [list KEY $k]] = ] + #set records [list ITABLE] + + set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] + + if {$lastparent eq "do_inline"} { + set result [list DOTTEDKEY [list $K_PART] =] + set records [list ITABLE] + } else { + set tname [tomlish::dict::join_and_quote_rawkey_list [list $k]] + set result [list TABLE $tname {NEWLINE lf}] + set tablestack [list {*}$tablestack [list T $k]] + set records [list] + } + + + + set lastidx [expr {[dict size $vinfo] -1}] + set dictidx 0 + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + #(SQKEY & DQKEY do not have the enclosing quotes in their returned val) + #if {[regexp {\s} $vk] || [string first . $vk] >= 0} { + # set VK_PART [list SQKEY $vk] + #} else { + # set VK_PART [list KEY $vk] + #} + if {[tomlish::dict::is_typeval $vv]} { + #type x value y + #REVIEW - we could detect if value is an array of objects, + #and depending on parent context - emit a series of TABLEARRAY records instead of a DOTTEDKEY record containing an ARRAY of objects + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] + set record [list DOTTEDKEY [list $VK_PART {WS { }}] = {WS { }} $sublist] + } else { + if {$vv eq ""} { + #experimental + if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + ::tomlish::log::notice "_from_dictval could uninline KEY $vk (tablestack:$tablestack)" + #set tname [tomlish::dict::name_from_tablestack [list {*}$tablestack [list T $vk]]] + + #we can't just join normalized keys - need keys with appropriate quotes and escapes + #set tname [join [list {*}$keys $vk] .] ;#WRONG + set tq [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] + + + ##wrong? results in TABLE within TABLE record?? todo pop? + #set record [list TABLE $tq {NEWLINE lf}] + #set tablestack [list {*}$tablestack [list T $vk]] + + #REVIEW!!! + + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + set tablestack [list {*}$tablestack [list I $vk]] + + } else { + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + set tablestack [list {*}$tablestack [list I $vk]] + } + } else { + if { 0 } { + #experiment.. sort of getting there. + if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + ::tomlish::log::notice "_from_dictval could uninline2 KEYS [list {*}$keys $vk] (tablestack:$tablestack)" + set tq [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] + set record [list TABLE $tq {NEWLINE lf}] + set tablestack [list {*}$tablestack [list T $vk]] + + #review - todo? + set dottedkey_value [_from_dictval [list {*}$parents TABLE] $tablestack [list {*}$keys $vk] $vv] + lappend record {*}$dottedkey_value + + } else { + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + } + } else { + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + } + } + } + if {$dictidx != $lastidx} { + #lappend record SEP + if {$lastparent eq "do_inline"} { + lappend record SEP + } else { + lappend record {NEWLINE lf} + } + } + if {[llength $record]} { + lappend records $record + } + incr dictidx + } + if {$lastparent eq "do_inline"} { + lappend result $records {NEWLINE lf} + } else { + lappend result {*}$records {NEWLINE lf} + } + } else { + if {$lastparent eq "do_inline"} { + lappend result DOTTEDKEY [list $K_PART] = ITABLE {NEWLINE lf} + } else { + set tname [tomlish::dict::join_and_quote_rawkey_list [list $k]] + #REVIEW + lappend result TABLE $tname {NEWLINE lf} + } + } + } + } else { + #lastparent is not toplevel "" or "do_inline" + if {[tomlish::dict::is_typeval $vinfo]} { + #type x value y + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] + lappend result {*}$sublist + } else { + if {$lastparent eq "TABLE"} { + #review + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + set dottedkey_value [_from_dictval [list {*}$parents DOTTEDKEY] $tablestack [list {*}$keys $vk] $vv] + lappend result [list DOTTEDKEY [list $VK_PART] = $dottedkey_value {NEWLINE lf}] + } + } else { + if {$vinfo ne ""} { + set lastidx [expr {[dict size $vinfo] -1}] + set dictidx 0 + set sub [list] + #REVIEW + #set result $lastparent ;#e.g sets ITABLE + set result ITABLE + set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] + dict for {vk vv} $vinfo { + set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] + if {[tomlish::dict::is_typeval $vv]} { + #type x value y + set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] + set record [list DOTTEDKEY [list $VK_PART] = $sublist] + } else { + if {$vv eq ""} { + #can't just uninline at this level + #we need a better method to query main dict for uninlinability at each level + # (including what's been inlined already) + #if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { + # puts stderr "_from_dictval uninline2 KEY $keys" + # set tname [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] + # set record [list TABLE $tname {NEWLINE lf}] + # set tablestack [list {*}$tablestack [list T $vk]] + #} else { + set record [list DOTTEDKEY [list $VK_PART] = ITABLE] + #} + } else { + #set sub [_from_dictval ITABLE $vk $vv] + set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] + #set record [list DOTTEDKEY [list $VK_PART] = ITABLE $dottedkey_value] + set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] + } + } + if {$dictidx != $lastidx} { + lappend record SEP + } + lappend result $record + incr dictidx + } + } else { + #e.g x=[{}] + log::debug "---> _from_dictval empty ITABLE x-1" + #lappend result DOTTEDKEY [list $K_PART] = ITABLE ;#wrong + lappend result ITABLE + } + } + } + } + return $result + } + + + proc from_dict {d} { + #consider: + # t1={a=1,b=2} + # x = 1 + + # from_dict gives us: t1 {a {type INT value 1} b {type INT value 2}} x {type INT value 1} + + #If we represent t1 as an expanded table we get + # [t1] + # a=1 + # b=2 + # x=1 + # --- which is incorrect - as x was a toplevel key like t1! + #This issue doesn't occur if x is itself an inline table + # t1={a=1,b=2} + # x= {no="problem"} + # + # (or if we were to reorder x to come before t1) + + #ie the order of the dict elements influences how the toml can be represented. + + #As the dictionary form doesn't distinguish the structure used to create tables {[table1]\nk=v} vs inline {table1={k=v}} + #Without a solution, from_dict would have to always produce the inline form for toplevel tables unless we allowed re-ordering, + #which is unpreferred here. + + #A possible solution: + #scan the top level to see if all (trailing) elements are themselves dicts + # (ie not of form {type XXX value yyy}) + # + # A further point is that if all root level values are at the 'top' - we can treat lower table-like structures as {[table]} elements + #ie we don't need to force do_inline if all the 'simple' keys are before any compound keys + + #set root_has_values 0 + #approach 1) - the naive approach - forces inline when not always necessary + #dict for {k v} $d { + # if {[llength $v] == 4 && [lindex $v 0] eq "type"} { + # set root_has_values 1 + # break + # } + #} + + + #approach 2) - track the position of last {type x value y} in the dictionary built by dict::from_tomlish + # - still not perfect. Inlines dotted tables unnecessarily + #This means from_dict doesn't produce output optimal for human editing. + set last_simple [tomlish::dict::last_tomltype_posn $d] + + + ## set parent "do_inline" ;#a value used in _from_dictval to distinguish from "" or other context based parent values + #Any keys that are themselves tables - will need to be represented inline + #to avoid reordering, or incorrect assignment of plain values to the wrong table. + + ## set parent "" + #all toplevel keys in the dict structure can represent subtables. + #we are free to use {[tablename]\n} syntax for toplevel elements. + + + set tomlish [list TOMLISH] + set dictposn 0 + set tablestack [list [list T root]] ;#todo + dict for {t tinfo} $d { + if {$last_simple > $dictposn} { + set parents [list do_inline] + } else { + set parents [list ""] + } + set keys [list $t] + #review - where to make decision on + # DOTTEDKEY containing array of objs + #vs + # list of TABLEARRAY records + #At least for the top + set trecord [_from_dictval $parents $tablestack $keys $tinfo] + lappend tomlish $trecord + incr dictposn + } + return $tomlish + } + + proc typedjson_to_toml {json} { + #*** !doctools + #[call [fun typedjson_to_toml] [arg json]] + #[para] + + set tomlish [::tomlish::from_dict_from_typedjson $json] + lappend tomlish [list NEWLINE lf] + set toml [::tomlish::to_toml $tomlish] + } + + set json1 {{ "a": {"type": "integer", "value": "42"}}} + set json2 {{ + "a": {"type": "integer", "value": "42"}, + "b": {"type": "string", "value": "test"} + }} + set json3 { +{ + "best-day-ever": {"type": "datetime", "value": "1987-07-05T17:45:00Z"}, + "numtheory": { + "boring": {"type": "bool", "value": "false"}, + "perfection": [ + {"type": "integer", "value": "6"}, + {"type": "integer", "value": "28"}, + {"type": "integer", "value": "496"} + ] + } +} + } + + set json4 { +{ + "best-day-ever": {"type": "datetime", "value": "1987-07-05T17:45:00Z"}, + "numtheory": { + "boring": {"type": "bool", "value": "false"}, + "perfection": [ + {"type": "integer", "value": "6"}, + {"type": "integer", "value": "28"}, + {"type": "integer", "value": "496"} + ] + }, + "emptyobj": {}, + "emptyarray": [] +} + } + + set json5 { +{ + "a": { + " x ": {}, + "b.c": {}, + "d.e": {}, + "b": { + "c": {} + } + } +} + } + + #surrogate pair face emoji + set json6 { +{ + "surrogatepair": {"type": "string", "value": "\uD83D\uDE10"} +} + } + + + set json7 { +{ + "escapes": {"type": "string", "value": "val\\ue"} +} + } + + + proc from_dict_from_typedjson {json} { + set d [tomlish::dict::from_typedjson $json] + tomlish::from_dict $d ;#return tomlish + } + + + proc toml_to_typedjson {toml} { + set tomlish [::tomlish::from_toml $toml] + set d [tomlish::dict::from_tomlish $tomlish] + #full validation only occurs by re-encoding dict to tomlish + set test [tomlish::from_dict $d] + + set h [tomlish::typedhuddle::from_dict $d] + #huddle jsondump $h + tomlish::huddle::jsondumpraw $h + } + + #proc get_json {tomlish} { + # package require fish::json + # set d [::tomlish::dict::from_tomlish $tomlish] + + # #return [::tomlish::dict_to_json $d] + # return [fish::json::from "struct" $d] + #} + + #return a Tcl list of tomlish tokens + #i.e get a standard list of all the toml terms in string $s + #where each element of the list is a *tomlish* term.. i.e a specially 'tagged' Tcl list. + #(simliar to a tcl 'Huddle' - but also supporting whitespace preservation) + # ---------------------------------------------------------------------------------------------- + # NOTE: the production of tomlish from toml source doesn't indicate the toml source was valid!!! + # e.g we deliberately don't check certain things such as duplicate table declarations here. + # ---------------------------------------------------------------------------------------------- + #Part of the justification for this is that as long as the syntax is toml shaped - we can load files which violate certain rules and allow programmatic manipulation. + # (e.g perhaps a toml editor to highlight violations for fixing) + # A further stage is then necessary to load the tomlish tagged list into a data structure more suitable for efficient query/reading. + # e.g dicts or an object oriented structure + #Note also - *no* escapes in quoted strings are processed. This is up to the datastructure stage + #e.g dict::from_tomlish will substitute \r \n \uHHHH \UHHHHHHH etc + #This is important for tomlish to maintain the ability to perform competely lossless round-trips from toml to tomlish and back to toml. + # (which is handy for testing as well as editing some part of the structure with absolutely no effect on other parts of the document) + #If we were to unescape a tab character for example + # - we have no way of knowing if it was originally specified as \t \u0009 or \U00000009 or directly as a tab character. + # For this reason, we also do absolutely no line-ending transformations based on platform. + # All line-endings are maintained as is, and even a file with mixed lf crlf line-endings will be correctly interpreted and can be 'roundtripped' + + proc from_toml {args} { + + namespace upvar ::tomlish::parse s s + set s [join $args \n] + namespace upvar ::tomlish::parse i i + set i 0 ;#index into s + + namespace upvar ::tomlish::parse is_parsing is_parsing + set is_parsing 1 + + if {[info command ::tomlish::parse::spacestack] eq "::tomlish::parse::spacestack"} { + tomlish::parse::spacestack destroy + } + struct::stack ::tomlish::parse::spacestack + + namespace upvar ::tomlish::parse last_space_action last_space_action + namespace upvar ::tomlish::parse last_space_type last_space_type + + namespace upvar ::tomlish::parse tok tok + set tok "" + + namespace upvar ::tomlish::parse type type + namespace upvar ::tomlish::parse tokenType tokenType + ::tomlish::parse::set_tokenType "" + namespace upvar ::tomlish::parse tokenType_list tokenType_list + set tokenType [list] ;#Flat (un-nested) list of tokentypes found + + namespace upvar ::tomlish::parse lastChar lastChar + set lastChar "" + + + set result "" + namespace upvar ::tomlish::parse nest nest + set nest 0 + + namespace upvar ::tomlish::parse v v ;#array keyed on nest level + + + set v(0) {TOMLISH} + array set s0 [list] ;#whitespace data to go in {SPACE {}} element. + set parentlevel 0 + + + namespace upvar ::tomlish::parse state state + + namespace upvar ::tomlish::parse braceCount braceCount + set barceCount 0 + namespace upvar ::tomlish::parse bracketCount bracketCount + set bracketCount 0 + + set sep 0 + set r 1 + namespace upvar ::tomlish::parse token_waiting token_waiting + set token_waiting [dict create] ;#if ::tok finds a *complete* second token during a run, it will put the 2nd one here to be returned by the next call. + + + + set state "table-space" + ::tomlish::parse::spacestack push {type space state table-space} + namespace upvar ::tomlish::parse linenum linenum;#'line number' of input data. (incremented for each literal linefeed - but not escaped ones in data) + set linenum 1 + + set ::tomlish::parse::state_list [list] + try { + while {$r} { + set r [::tomlish::parse::tok] + #puts stdout "got tok: '$tok' while parsing string '$s' " + set next_tokenType_known 0 ;#whether we begin a new token here based on what terminated the token result of 'tok' + + + #puts "got token: '$tok' tokenType='$tokenType'. while v($nest) = [set v($nest)]" + #puts "-->tok: $tok tokenType='$tokenType'" + set prevstate $state + set transition_info [::tomlish::parse::goNextState $tokenType $tok $state] + #review goNextState could perform more than one space_action + set space_action [dict get $transition_info space_action] + set newstate [dict get $transition_info newstate] ;#use of 'newstate' vs 'state' makes code clearer below + + if {[tcl::string::match "err-*" $state]} { + ::tomlish::log::warn "---- State error in state $prevstate for tokenType: $tokenType token value: $tok. $state aborting parse. [tomlish::parse::report_line]" + lappend v(0) [list ERROR tokentype $tokenType state $prevstate to $state leveldata [set v($nest)]] + return $v(0) + } + # --------------------------------------------------------- + #NOTE there may already be a token_waiting at this point + #set_token_waiting can raise an error here, + # in which case the space_action branch needs to be rewritten to handle the existing token_waiting + # --------------------------------------------------------- + + if {$space_action eq "pop"} { + #pop_trigger_tokens: newline tablename endarray endinlinetable + #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. + set parentlevel [expr {$nest -1}] + set do_append_to_parent 1 ;#most tokens will leave this alone - but some like tentative_accum_squote need to do their own append + switch -exact -- $tokenType { + tentative_accum_squote { + #should only apply within a multiliteral + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the xxx-squote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-squote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-squote-space { + } + default { + error "--- unexpected popped due to tentative_accum_squote but came from state '$prevstate' should have been trailing-squote-space" + } + } + switch -- $tok { + ' { + tomlish::parse::set_token_waiting type single_squote value $tok complete 1 startindex [expr {$i -1}] + } + '' { + #review - we should perhaps return double_squote instead? + #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_squote value "" complete 1 startindex [expr {$i - 2}] + } + ''' { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] + } + '''' { + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left squote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]'" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "'"] + } + MULTILITERAL { + #empty + lappend v($parentlevel) [list LITERALPART "'"] + } + default { + error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" + } + } + } + ''''' { + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 squotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]''" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "''"] + } + MULTILITERAL { + lappend v($parentlevel) [list LITERALPART "''"] + } + default { + error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" + } + } + } + } + } + triple_squote { + #presumably popping multiliteral-space + ::tomlish::log::debug "---- triple_squote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + switch -exact -- [lindex $part 0] { + MULTILITERAL { + lappend merged $part + } + LITERALPART { + if {$lasttype eq "LITERALPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" + } + } + set lasttype [lindex $part 0] + } + set v($nest) $merged + } + tentative_accum_dquote { + #should only apply within a multistring + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the trailing-dquote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-dquote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-dquote-space { + } + default { + error "--- unexpected popped due to tentative_accum_dquote but came from state '$prevstate' should have been trailing-dquote-space" + } + } + switch -- $tok { + {"} { + tomlish::parse::set_token_waiting type single_dquote value $tok complete 1 startindex [expr {$i -1}] + } + {""} { + #review - we should perhaps return double_dquote instead? + #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_dquote value "" complete 1 startindex [expr {$i - 2}] + } + {"""} { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 3}] + } + {""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left dquote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {"}] + } + MULTISTRING { + #empty + lappend v($parentlevel) [list STRINGPART {"}] + } + default { + error "--- don't know how to integrate extra trailing dquote with data $v($parentlevel)" + } + } + } + {"""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 dquotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {""}] + } + MULTISTRING { + lappend v($parentlevel) [list STRINGPART {""}] + } + default { + error "--- don't know how to integrate extra trailing 2 dquotes with data $v($parentlevel)" + } + } + } + } + } + triple_dquote { + #presumably popping multistring-space + ::tomlish::log::debug "---- triple_dquote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + switch -exact -- [lindex $part 0] { + MULTISTRING { + lappend merged $part + } + STRINGPART { + if {$lasttype eq "STRINGPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + CONT - WS { + lappend merged $part + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_dquote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" + } + } + set lasttype [lindex $part 0] + } + set v($nest) $merged + } + equal { + #pop caused by = + switch -exact -- $prevstate { + dottedkey-space { + tomlish::log::debug "---- equal ending dottedkey-space for last_space_action pop" + #re-emit for parent space + tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] + } + dottedkey-space-tail { + #experiment? + tomlish::log::debug "---- equal ending dottedkey-space-tail for last_space_action pop" + #re-emit for parent space + tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] + } + } + } + newline { + incr linenum + lappend v($nest) [list NEWLINE $tok] + } + tablename { + #note: a tablename only 'pops' if we are greater than zero + error "---- tablename pop should already have been handled as special case zeropoppushspace in goNextState" + } + tablearrayname { + #!review - tablearrayname different to tablename regarding push/pop? + #note: a tablename only 'pops' if we are greater than zero + error "---- tablearrayname pop should already have been handled as special case zeropoppushspace in goNextState" + } + endarray { + #nothing to do here. + } + comma { + #comma for inline table will pop the keyvalue space + lappend v($nest) "SEP" + } + endinlinetable { + ::tomlish::log::debug "---- endinlinetable for last_space_action pop" + } + default { + error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" + } + } + if {$do_append_to_parent} { + #e.g tentative_accum_squote does it's own appends as necessary - so won't get here + lappend v($parentlevel) [set v($nest)] + } + + incr nest -1 + + } elseif {$last_space_action eq "push"} { + set prevnest $nest + incr nest 1 + set v($nest) [list] + # push_trigger_tokens: barekey dquotedkey startinlinetable startarray tablename tablearrayname + + + switch -exact -- $tokenType { + tentative_trigger_squote - tentative_trigger_dquote { + #### this startok will always be tentative_accum_squote/tentative_accum_dquote starting with one accumulated squote/dquote + if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { + lassign [dict get $transition_info starttok] starttok_type starttok_val + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType $starttok_type + set tok $starttok_val + } + } + single_squote { + #JMN - REVIEW + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + triple_squote { + ::tomlish::log::debug "---- push trigger tokenType triple_squote" + set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERALPART + } + squotedkey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + triple_dquote { + set v($nest) [list MULTISTRING] ;#container for NEWLINE,STRINGPART,CONT + } + dquotedkey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + barekey { + switch -exact -- $prevstate { + table-space - itable-space { + set v($nest) [list DOTTEDKEY] + } + } + #todo - check not something already waiting? + set waiting [tomlish::parse::get_token_waiting] + if {[llength $waiting]} { + set i [dict get $waiting startindex] + tomlish::parse::clear_token_waiting + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } else { + tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space + } + } + tablename { + #note: we do not use the output of tablename_trim to produce a tablename for storage in the tomlish list! + #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish + # back to toml file will be identical. + #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. + # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, + # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names from + # a structural perspective. + + #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, + # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the + # tomlish list? + + #set trimtable [tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$trimtable'" + set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name + #note also that equivalent tablenames may have different toml representations even after being trimmed! + #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) + #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. + } + tablearrayname { + #set trimtable [tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablearrayname: '$trimtable'" + set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name + } + startarray { + set v($nest) [list ARRAY] ;#$tok is just the opening bracket - don't output. + } + startinlinetable { + set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. + } + default { + error "---- push trigger tokenType '$tokenType' not yet implemented" + } + } + + } else { + #no space level change + switch -exact -- $tokenType { + squotedkey { + #puts "---- squotedkey in state $prevstate (no space level change)" + lappend v($nest) [list SQKEY $tok] + } + dquotedkey { + #puts "---- dquotedkey in state $prevstate (no space level change)" + lappend v($nest) [list DQKEY $tok] + } + barekey { + lappend v($nest) [list KEY $tok] + } + dotsep { + lappend v($nest) [list DOTSEP] + } + starttablename { + #$tok is triggered by the opening bracket and sends nothing to output + } + starttablearrayname { + #$tok is triggered by the double opening brackets and sends nothing to output + } + tablename - tablenamearray { + error "---- did not expect 'tablename/tablearrayname' without space level change (no space level change)" + #set v($nest) [list TABLE $tok] + } + endtablename - endtablearrayname { + #no output into the tomlish list for this token + } + startinlinetable { + puts stderr "---- decode::toml error. did not expect startinlinetable without space level change (no space level change)" + } + single_dquote { + switch -exact -- $newstate { + string-state { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "string" + set tok "" + } + dquoted-key { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "dquotedkey" + set tok "" + } + multistring-space { + lappend v($nest) [list STRINGPART {"}] + #may need to be joined on pop if there are neighbouring STRINGPARTS + } + default { + error "---- single_dquote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_dquote { + #leading extra quotes - test: toml_multistring_startquote2 + switch -exact -- $prevstate { + itable-keyval-value-expected - keyval-value-expected { + puts stderr "tomlish::decode::toml double_dquote TEST" + #empty string + lappend v($nest) [list STRINGPART ""] + } + multistring-space { + #multistring-space to multistring-space + lappend v($nest) [list STRINGPART {""}] + } + default { + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" + } + } + + } + single_squote { + switch -exact -- $newstate { + literal-state { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "literal" + set tok "" + } + squoted-key { + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + multiliteral-space { + #false alarm squote returned from tentative_accum_squote pop + ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" + #(single squote - not terminating space) + lappend v($nest) [list LITERALPART '] + #may need to be joined on pop if there are neighbouring LITERALPARTs + } + default { + error "---- single_squote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_squote { + switch -exact -- $prevstate { + keyval-value-expected { + lappend v($nest) [list LITERAL ""] + } + multiliteral-space { + #multiliteral-space to multiliteral-space + lappend v($nest) [list LITERALPART ''] + } + default { + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" + } + } + } + enddquote { + #nothing to do? + set tok "" + } + endsquote { + set tok "" + } + string { + #JJJJ + set tok [tomlish::from_Bstring $tok] + lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes + } + literal { + lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes + } + multistring { + #review + #JJJJ ? + lappend v($nest) [list MULTISTRING $tok] + } + stringpart { + #JJJJ + set tok [tomlish::from_Bstring $tok] + lappend v($nest) [list STRINGPART $tok] ;#will not get wrapped in dquotes directly + } + multiliteral { + lappend v($nest) [LIST MULTILITERAL $tok] + } + literalpart { + lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly + } + untyped_value { + #would be better termed unclassified_value + #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. + unset -nocomplain tag + if {$tok in {true false}} { + set tag BOOL + } else { + if {[::tomlish::utils::is_int $tok]} { + set tag INT + } else { + if {[string is integer -strict $tok]} { + #didn't qualify as a toml int - but still an int + #probably means is_int is limiting size and not accepting bigints (configurable?) + #or it didn't qualify due to more than 1 leading zero + #or other integer format issue such as repeated underscores + error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. (looks close to being an int. Formatting or range issue?) [tomlish::parse::report_line] (no space level change)" + } else { + #DDDD + if {[::tomlish::utils::is_float $tok]} { + set tag FLOAT + } elseif {[::tomlish::utils::is_localtime $tok]} { + set tag TIME-LOCAL + } elseif {[::tomlish::utils::is_timepart $tok]} { + #Note we must allow lone timepart here (not just is_localtime which doesn't allow tz offsets) in case it followed a previous localdate + set tag DATETIME ;#?? review standalone time with tz - no specific tag - only allowed as followup value from DATETIME-LOCAL + } elseif {[::tomlish::utils::is_datepart $tok]} { + set tag DATE-LOCAL + } elseif {[::tomlish::utils::is_datetime $tok]} { + #not just a date or just a time + #could be either local or have tz offset + #DDDD JJJ + set norm [string map {" " T} $tok];#prob unneeded - we won't get here if there was a space - would arrive as 2 separate tokens review. + lassign [split $norm T] dp tp + if {[::tomlish::utils::is_localtime $tp]} { + set tag DATETIME-LOCAL + } else { + set tag DATETIME + } + } elseif {[::tomlish::utils::is_datetime X$tok] || [::tomlish::utils::is_timepart X$tok]} { + # obsolete + #Note we must allow lone timepart here (not just is_localtime which doesn't allow tz offsets) in case it followed a previous localdate + #e.g x= 2025-01-01 02:34Z + #The dict::from_tomlish validation will catch an invalid standaline timepart, or combine with leading date if applicable. + set tag DATETIME + } else { + error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. [tomlish::parse::report_line] (no space level change)" + } + } + } + } + #assert either tag is set, or we errored out. + lappend v($nest) [list $tag $tok] + + } + comment { + #puts stdout "----- comment token returned '$tok'------" + #JJJJ + set tok [tomlish::from_comment $tok] + lappend v($nest) [list COMMENT "$tok"] + } + equal { + #we append '=' to the nest so that any surrounding whitespace is retained. + lappend v($nest) = + } + comma { + lappend v($nest) SEP + } + newline { + incr linenum + lappend v($nest) [list NEWLINE $tok] + } + whitespace { + lappend v($nest) [list WS $tok] + } + continuation { + lappend v($nest) CONT + } + bom { + lappend v($nest) BOM + } + eof { + #ok - nothing more to add to the tomlish list. + #!todo - check previous tokens are complete/valid? + } + default { + error "--- unknown tokenType '$tokenType' during state $prevstate [::tomlish::parse::report_line] (no space level change)" + } + } + } + + if {!$next_tokenType_known} { + ::tomlish::log::notice "---- tomlish::decode::toml - current tokenType:$tokenType Next token type not known" + ::tomlish::parse::set_tokenType "" + set tok "" + } + + if {$state eq "end-state"} { + break + } + + + } + + #while {$nest > 0} { + # lappend v([expr {$nest -1}]) [set v($nest)] + # incr nest -1 + #} + while {[::tomlish::parse::spacestack size] > 1} { + ::tomlish::parse::spacestack pop + lappend v([expr {$nest -1}]) [set v($nest)] + incr nest -1 + + #set parent [spacestack peek] ;#the level being appended to + #lassign $parent type state + #if {$type eq "space"} { + # + #} elseif {$type eq "buffer"} { + # lappend v([expr {$nest -1}]) {*}[set v($nest)] + #} else { + # error "invalid spacestack item: $parent" + #} + } + + } finally { + set is_parsing 0 + } + return $v(0) + } + + #toml dquoted string to tomlish STRING + # - only allow specified escape sequences + # - allow any unicode except those that must be escaped: dquote, bsl, and control chars(except tab) + proc from_Bstring {bstr} { + #JJJJ + if {[catch { + tomlish::utils::unescape_string $bstr + } errM]} { + return -code error -errorcode {TOML SYNTAX INVALIDESCAPE} "tomlish::from_Bstring toml Bstring contains invalid escape sequence\n$errM" ;#review + } + #assert: all escapes are now valid + + if {[regexp {[\u0000-\u0008\u000A-\u001F\u007f]} $bstr]} { + set msg "tomlish::from_Bstring toml Bstring contains controls that must be escaped" + return -code error -errorcode {TOML SYNTAX BSTRINGUNESCAPEDCONTROLS} $msg ;#review + } + return $bstr + } + #validate toml comment + # - disallow controls that must be escaped + #from spec: + # "Control characters other than tab (U+0000 to U+0008, U+000A to U+001F, U+007F) are not permitted in comments." + proc from_comment {comment} { + if {[regexp {[\u0000-\u0008\u000A-\u001F\u007f]} $comment]} { + set msg "tomlish::from_comment toml comment contains controls that must be escaped" + return -code error -errorcode {TOML SYNTAX COMMENTUNESCAPEDCONTROLS} $msg ;#review + } + return $comment + } + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish ---}] +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ + +namespace eval tomlish::build { + #STRING,INT,FLOAT,BOOL, DATETIME - simple wrappers for completeness + # take a value of the appropriate type and wrap as a tomlish tagged item + proc STRING {s} { + return [list STRING [::tomlish::utils::rawstring_to_Bstring_with_escaped_controls $s]] + } + proc LITERAL {litstring} { + error todo + } + + proc INT {i} { + #whole numbers, may be prefixed with a + or - + #Leading zeros are not allowed + #Hex,octal binary forms are allowed (toml 1.0) + #We will error out on encountering commas, as commas are interpreted differently depending on locale (and don't seem to be supported in the toml spec anyway) + #!todo - Tcl can handle bignums - bigger than a 64bit signed long as specified in toml. + # - We should probably raise an error for number larger than this and suggest the user supply it as a string? + if {[tcl::string::last , $i] > -1} { + error "Unable to interpret '$i' as an integer. Use underscores if you need a thousands separator [::tomlish::parse::report_line]" + } + if {![::tomlish::utils::int_validchars $i]} { + error "Unable to interpret '$i' as an integer. Only 0-9 + 1 _ characters are acceptable. [::tomlish::parse::report_line]" + } + + if {[::tomlish::utils::is_int $i]} { + return [list INT $i] + } else { + error "'$i' is not a valid integer as per the Toml spec. [::tomlish::parse::report_line]" + } + + } + + proc FLOAT {f} { + #convert any non-lower case variants of special values to lowercase for Toml + if {[::tcl::string::tolower $f] in {nan +nan -nan inf +inf -inf}} { + return [list FLOAT [tcl::string::tolower $f]] + } + if {[::tomlish::utils::is_float $f]} { + return [list FLOAT $f] + } else { + error "Unable to interpret '$f' as Toml float. Check your input, or check that tomlish is able to handle all Toml floats properly [::tomlish::parse::report_line]" + } + } + + proc DATETIME {str} { + if {[::tomlish::utils::is_datetime $str]} { + return [list DATETIME $str] + } else { + error "Unable to interpret '$str' as Toml datetime. Check your input, or check that tomlish is able to handle all Toml datetimes properly [::tomlish::parse::report_line]" + } + } + proc DATETIME-LOCAL {str} { + error "build::DATETIME-LOCAL todo" + } + + proc BOOLEAN {b} { + #convert any Tcl-acceptable boolean to boolean as accepted by toml - lower case true/false + if {![tcl::string::is boolean -strict $b]} { + error "Unable to convert '$b' to Toml boolean true|false. [::tomlish::parse::report_line]" + } else { + if {$b && 1} { + return [::list BOOL true] + } else { + return [::list BOOL false] + } + } + } + + #REVIEW + #Take tablename followed by + # a) *tomlish* name-value pairs e.g table mydata [list KEY item11 = [list STRING "test"]] {KEY item2 = [list INT 1]} + # (accept also key value {STRING }) + # b) simple 2-element tcl lists being name & *simple* value pairs for which basic heuristics will be used to determine types + proc _table {name args} { + set pairs [list] + foreach t $args { + if {[llength $t] == 4} { + if {[tcl::string::tolower [lindex $t 0]] ne "key" || [tcl::string::tolower [lindex $t 2]] ni "= value"} { + error "Only items tagged as KEY = currently accepted as name-value pairs for table command" + } + lassign $t _k keystr _eq valuepart + if {[llength $valuepart] != 2} { + error "supplied value must be typed. e.g {INT 1} or {STRING test}" + } + lappend pairs [list KEY $keystr = $valuepart] + } elseif {[llength $t] == 2} { + #!todo - type heuristics + lassign $t n v + lappend pairs [list KEY $n = [list STRING $v]] + } else { + error "'KEY = { toml but + # the first newline is not part of the data. + # we elect instead to maintain a basic LITERALPART that must not contain newlines.. + # and to compose MULTILITERAL of multiple NEWLINE LITERALPART parts, + #with the datastructure representation dropping the first newline (if immediately following opening delim) when building the value. + set literal "" + foreach part [lrange $item 1 end] { + append literal [::tomlish::encode::tomlish [list $part] $nextcontext] + } + append toml '''$literal''' + } + INT - + BOOL - + FLOAT - + DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + #DDDD + append toml [lindex $item 1] + } + INCOMPLETE { + error "cannot process tomlish term tagged as INCOMPLETE" + } + COMMENT { + append toml "#[lindex $item 1]" + } + BOM { + #Byte Order Mark may appear at beginning of a file. Needs to be preserved. + append toml "\uFEFF" + } + default { + error "Not a properly formed 'tomlish' taggedlist.\n '$list'\n Unknown tag '[lindex $item 0]'. See output of \[tomlish::tags\] command." + } + } + + } + return $toml + } + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::encode ---}] +} +#fish toml from tomlish + +#(encode tomlish as toml) +interp alias {} tomlish::to_toml {} tomlish::encode::tomlish + +# + + +namespace eval tomlish::decode { + #*** !doctools + #[subsection {Namespace tomlish::decode}] + #[para] + #[list_begin definitions] + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::decode ---}] +} +#decode toml to tomlish +#interp alias {} tomlish::from_toml {} tomlish::decode::toml + +namespace eval tomlish::utils { + #*** !doctools + #[subsection {Namespace tomlish::utils}] + #[para] + #[list_begin definitions] + + + + #basic generic quote matching for single and double quotes + #note for example that {[o'malley]} will return sq - as the single quote is not closed or wrapped in double quotes + proc tok_in_quotedpart {tok} { + set sLen [tcl::string::length $tok] + set quote_type "" + set had_slash 0 + for {set i 0} {$i < $sLen} {incr i} { + set c [tcl::string::index $tok $i] + if {$quote_type eq ""} { + if {$had_slash} { + #don't enter quote mode + #leave slash_mode because even if current char is slash - it is escaped + set had_slash 0 + } else { + set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] + switch -- $ctype { + dq { + set quote_type dq + } + sq { + set quote_type sq + } + bsl { + set had_slash 1 + } + } + } + } else { + if {$had_slash} { + #don't leave quoted mode + #leave slash_mode because even if current char is slash - it is escaped + set had_slash 0 + } else { + set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] + switch -- $ctype { + dq { + if {$quote_type eq "dq"} { + set quote_type "" + } + } + sq { + if {$quote_type eq "sq"} { + set quote_type "" + } + } + bsl { + set had_slash 1 + } + } + } + } + } + return $quote_type ;#dq | sq + } + + proc hex_escape_info {slashx} { + set exp {^\\x([0-9a-fA-F]{2}$)} + if {[regexp $exp $slashx match hex]} { + return [list ok [list char [subst -nocommand -novariable $slashx]]] + } else { + return [list err [list reason "Supplied string not of the form \\xHH where H in \[0-9a-fA-F\]"]] + } + } + proc unicode_escape_info {slashu} { + #!todo + # validate that slashu is either a \uxxxx or \Uxxxxxxxx value of the correct length and + # is a valid 'unicode scalar value' (any Unicode code point except high-surrogate and low-surrogate code points) + # ie integers in the range 0 to D7FF16 and E00016 to 10FFFF16 inclusive + #expr {(($x >= 0) && ($x <= 0xD7FF16)) || (($x >= 0xE00016) && ($x <= 0x10FFFF16))} + if {[tcl::string::match {\\u*} $slashu]} { + set exp {^\\u([0-9a-fA-F]{4}$)} + if {[regexp $exp $slashu match hex]} { + if {[scan $hex %4x dec] != 1} { + #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? + return [list err [list reason "Failed to convert '$hex' to decimal"]] + } else { + return [list ok [list char [subst -nocommand -novariable $slashu]]] + } + } else { + return [list err [list reason "Supplied string not of the form \\uHHHH where H in \[0-9a-fA-F\]"]] + } + } elseif {[tcl::string::match {\\U*} $slashu]} { + set exp {^\\U([0-9a-fA-F]{8}$)} + if {[regexp $exp $slashu match hex]} { + if {[scan $hex %8x dec] != 1} { + #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? + return [list err [list reason "Failed to convert '$hex' to decimal"]] + } else { + if {(($dec >= 0) && ($dec <= 0xD7FF16)) || (($dec >= 0xE00016) && ($dec <= 0x10FFFF16))} { + return [list ok [list char [subst -nocommand -novariable $slashu]]] + } else { + return [list err [list reason "$slashu is not within the 'unicode scalar value' ranges 0 to 0xD7FF16 or 0xE00016 to 0x10FFFF16"]] + } + } + } else { + return [list err [list reason "Supplied string not of the form \\UHHHHHHHH where H in \[0-9a-fA-F\]"]] + } + } else { + return [list err [list reason "Supplied string did not start with \\u or \\U" ]] + } + + } + + #Note that unicode characters don't *have* to be escaped. + #So if we provide a function named 'escape_string', the name implies the inverse of unescape_string which unescapes unicode \u \U values. + #- an inverse of unescape_string would encode all unicode chars unnecessarily. + #- as toml accepts a compact escape sequence for common chars such as tab,backspace,linefeed etc but also allows the full form \u009 etc + #- escape_string and unescape_string would not be reliably roundtrippable inverses anyway. + #REVIEW - provide it anyway? When would it be desirable to use? + + variable Bstring_control_map [dict create] + dict set Bstring_control_map \b {\b} + dict set Bstring_control_map \n {\n} + dict set Bstring_control_map \r {\r} + dict set Bstring_control_map \" {\"} + dict set Bstring_control_map \x1b {\e} ;#In spec it's included in the list of 'must be escaped', as well as the 'convenience' escapes - so we make it go both ways. + dict set Bstring_control_map \\ "\\\\" + + #\e for \x1b seems like it might be included - v1.1?? hard to find current state of where toml is going :/ + #for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped. + #8 = \b - already in list. + #built the remainder whilst checking for entries already hardcoded above -in case more are added to the hardcoded list + for {set cdec 0} {$cdec <= 7} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } + } + for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } + } + # \u007F = 127 + dict set Bstring_control_map [format %c 127] \\u007F + + # ------------------------------------------------------------------ + variable Literal_control_map [dict create] + #controls other than tab + for {set cdec 0} {$cdec <= 8} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Literal_control_map $char]} { + dict set Literal_control_map $char \\u$hhhh + } + } + for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { + set hhhh [format %.4X $cdec] + set char [format %c $cdec] + if {![dict exists $Literal_control_map $char]} { + dict set Literal_control_map $char \\u$hhhh + } + } + # \u007F = 127 + dict set Literal_control_map [format %c 127] \\u007F + # ------------------------------------------------------------------ + variable Multiliteral_control_map + set Multiliteral_control_map [dict remove $Literal_control_map \n] + + variable String_control_map + set String_control_map [dict remove $Literal_control_map \\] + + + variable MultiBstring_totoml_map + #'minimally' escaped sequences of double quotes. + #e.g {""\"""\"} vs {\"\"\"\"\"} + #This produces easier to read toml - and in many cases may be more likely to match original format when roundtripped from dict datastructure + # REVIEW - should this be configurable? + set MultiBstring_totoml_map [dict remove $Bstring_control_map {"} \r \n] + dict set MultiBstring_totoml_map {"""} {""\"} ;#" editor hack: commented quote for dumb syntax highlighers + + #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! + #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) + #for example - can be used by from_dict to produce valid Bstring data for a tomlish record + proc rawstring_to_Bstring_with_escaped_controls {str} { + #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. + #we'll use a string map with an explicit list rather than algorithmic at runtime + # - the string map is probably more performant than splitting a string, especially if it's large + + upvar ::tomlish::utils::Bstring_control_map map + + return [string map $map $str] + } + proc rawstring_to_MultiBstring_with_escaped_controls {str} { + #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. + #we'll use a string map with an explicit list rather than algorithmic at runtime + # - the string map is probably more performant than splitting a string, especially if it's large + + upvar ::tomlish::utils::MultiBstring_totoml_map map + + return [string map $map $str] + } + + proc rawstring_is_valid_tomlstring {str} { + #controls are allowed in this direction dict -> toml (they get quoted) + + #check any existing escapes are valid + if {[catch { + unescape_string $str + } errM]} { + return 0 + } + return 1 + } + + proc rawstring_is_valid_literal {str} { + #detect control chars other than tab + variable Literal_control_map + set testval [string map $Literal_control_map $str] + return [expr {$testval eq $str}] + } + proc rawstring_is_valid_multiliteral {str} { + #detect control chars other than tab + variable Multiliteral_control_map + + set teststr [string map [list \r\n ok] $str] + + set testval [string map $Multiliteral_control_map $teststr] + return [expr {$testval eq $teststr}] + } + + #review - unescape what string? Bstring vs MLBstring? + #we should be specific in the function naming here + #used by dict::from_tomlish - so part of validation? - REVIEW + proc unescape_string {str} { + #note we can't just use Tcl subst because: + # it also transforms \a (audible bell) and \v (vertical tab) which are not in the toml spec. + # it would strip out backslashes inappropriately: e.g "\j" becomes just j + # it recognizes other escapes which aren't approprite e.g octal \nnn + # it replaces \ with a single whitespace (trailing backslash) + #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh + #plus \e for \x1b? + + set buffer "" + set buffer2 "" ;#buffer for 2 hex characters following a \x + set buffer4 "" ;#buffer for 4 hex characters following a \u + set buffer8 "" ;#buffer for 8 hex characters following a \u + + set sLen [tcl::string::length $str] + + #we need to handle arbitrarily long sequences of backslashes. \\\\\ etc + set slash_active 0 + set unicode2_active 0 + set unicode4_active 0 + set unicode8_active 0 + + ::tomlish::log::debug "unescape_string. got len [string length str] str $str" + + #!todo - check for invalid data in the form of a raw carriage return (decimal 13) without following linefeed? + set i 0 + for {} {$i < $sLen} {} { + if {$i > 0} { + set lastChar [tcl::string::index $str [expr {$i - 1}]] + } else { + set lastChar "" + } + + set c [tcl::string::index $str $i] + #::tomlish::log::debug "unescape_string. got char $c" ;#too much? + + ##---------------------- + ##as we are 'unescaping' - should we really be testing here for existing values that should have been escaped? + ##The answer is probably no - keep this function to a single purpose - test elsewhere for raw controls. + ##this test looks incomplete anyway REVIEW + #scan $c %c n + #if {($n <= 31) && ($n != 9) && ($n != 10) && ($n != 13)} { + # #we don't expect unescaped unicode characters from 0000 to 001F - + # #*except* for raw tab (which is whitespace) and newlines + # error "unescape_string. Invalid data for a toml string. Unescaped control character (decimal $n) [::tomlish::utils::string_to_slashu $c]" + #} + ##---------------------- + + incr i ;#must incr here because we do'returns'inside the loop + if {$c eq "\\"} { + if {$slash_active} { + append buffer "\\" + set slash_active 0 + } elseif {$unicode2_active} { + error "unescape_string. unexpected case slash during unicode2 not yet handled" + } elseif {$unicode4_active} { + error "unescape_string. unexpected case slash during unicode4 not yet handled" + } elseif {$unicode8_active} { + error "unescape_string. unexpected case slash during unicode8 not yet handled" + } else { + # don't output anything (yet) + set slash_active 1 + } + } else { + if {$unicode2_active} { + if {[tcl::string::length $buffer2] < 2} { + append buffer2 $c + } + if {[tcl::string::length $buffer2] == 2} { + #we have a \xHH to test + set unicode2_active 0 + set result [tomlish::utils::hex_escape_info "\\x$buffer2"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" + } + } + } elseif {$unicode4_active} { + if {[tcl::string::length $buffer4] < 4} { + append buffer4 $c + } + if {[tcl::string::length $buffer4] == 4} { + #we have a \uHHHH to test + set unicode4_active 0 + set result [tomlish::utils::unicode_escape_info "\\u$buffer4"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" + } + } + } elseif {$unicode8_active} { + if {[tcl::string::length $buffer8] < 8} { + append buffer8 $c + } + if {[tcl::string::length $buffer8] == 8} { + #we have a \UHHHHHHHH to test + set unicode8_active 0 + set result [tomlish::utils::unicode_escape_info "\\U$buffer8"] + if {[lindex $result 0] eq "ok"} { + append buffer [dict get $result ok char] + } else { + error "unescape_string error: [lindex $result 1]" + } + } + } elseif {$slash_active} { + set slash_active 0 + set ctest [tcl::string::map {{"} dq} $c] + switch -exact -- $ctest { + dq { + append buffer {"} + } + b - t - n - f - r { + append buffer [subst -nocommand -novariable "\\$c"] + } + e { + append buffer \x1b + } + x { + #introduced in 1.1.0 \xHH + set unicode2_active 1 + set buffer2 "" + } + u { + set unicode4_active 1 + set buffer4 "" + } + U { + set unicode8_active 1 + set buffer8 "" + } + default { + set slash_active 0 + #review - toml spec says all other escapes are reserved + #and if they are used TOML should produce an error. + #append buffer "\\$c" + set msg "Invalid escape sequence \\ followed by '$c'" + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} $msg + } + } + } else { + append buffer $c + } + } + } + #puts stdout "EOF 4:$unicode4_active 8:$unicode8_active slash:$slash_active" + if {$unicode2_active} { + error "End of string reached before complete hex escape sequence \xHH" + } + if {$unicode4_active} { + error "End of string reached before complete unicode escape sequence \uHHHH" + } + if {$unicode8_active} { + error "End of string reached before complete unicode escape sequence \UHHHHHHHH" + } + if {$slash_active} { + append buffer "\\" + } + try { + encoding convertto utf-8 $buffer + } trap {} {emsg eopts} { + return -code error -errorcode {TOMLISH SYNTAX ENCODINGERROR} $emsg + } + return $buffer + } + + #This does not have to do with unicode normal forms - which it seems toml has decided against regarding use in keys (review/references?) + #This is meant for internal use regarding ensuring we match equivalent keys which may have just been specified with different string mechanisms, + #e.g squoted vs dquoted vs barekey. + proc normalize_key {rawkey} { + set c1 [tcl::string::index $rawkey 0] + set c2 [tcl::string::index $rawkey end] + if {($c1 eq "'") && ($c2 eq "'")} { + #single quoted segment. No escapes allowed within it. + set key [tcl::string::range $rawkey 1 end-1] + } elseif {($c1 eq "\"") && ($c2 eq "\"")} { + #double quoted segment. Unapply escapes. + # + set keydata [tcl::string::range $rawkey 1 end-1] ;#strip outer quotes only + #e.g key could have mix of \UXXXXXXXX escapes and unicode chars + #or mix of \t and literal tabs. + #unescape to convert all to literal versions for comparison + set key [::tomlish::utils::unescape_string $keydata] + #set key [subst -nocommands -novariables $keydata] ;#wrong. Todo - create a string escape substitution function. + } else { + set key $rawkey + } + return $key + } + + proc string_to_slashu {string} { + set rv {} + foreach c [split $string {}] { + scan $c %c cdec + if {$cdec > 65535} { + append rv {\U} [format %.8X $cdec] + } else { + append rv {\u} [format %.4X $cdec] + } + } + return $rv + } + + #'nonprintable' is conservative here because some systems (e.g windows console) are very limited in what they can display. + #This is used for display purposes only (error msgs) + proc nonprintable_to_slashu {s} { + set res "" + foreach i [split $s ""] { + scan $i %c cdec + + set printable 0 + if {($cdec>31) && ($cdec<127)} { + set printable 1 + } + if {$printable} { + append res $i + } else { + if {$cdec > 65535} { + append res \\U[format %.8X $cdec] + } else { + append res \\u[format %.4X $cdec] + } + } + } + set res + } ;# initial version from tcl wiki RS + + proc rawstring_to_jsonstring {s} { + #like nonprintable_to_slashu + # - also escape every dquote + # - escape newlines + set res "" + foreach i [split $s ""] { + scan $i %c cdec + switch -- $cdec { + 34 { + #double quote + append res \\\" + } + 13 { + #carriage return + append res \\r + } + 8 { + append res \\b + } + 9 { + append res \\t + } + 10 { + #linefeed + append res \\n + } + 92 { + append res \\\\ + } + default { + set printable 0 + if {($cdec>31) && ($cdec<127)} { + set printable 1 + } + if {$printable} { + append res $i + } else { + if {$cdec > 65535} { + #append res $i + #append res \\U[format %.8X $cdec] ;#wrong + #append res "\\U{[format %.8x $cdec]}" ;#some variation of json? + package require punk::cesu + #e.g \U0001f610 emoticon face + #surrogate pair: \uD83D\uDE10 + set surrogatepair [punk::cesu::to_surrogatestring -format escape $i] + append res $surrogatepair + } else { + append res \\u[format %.4X $cdec] + } + } + } + } + } + set res + + } + + #check if str is valid for use as a toml bare key + #Early toml versions only allowed letters + underscore + dash + proc is_basic_barekey {str} { + if {[tcl::string::length $str] == 0} { + return 0 + } else { + set matches [regexp -all {[a-zA-Z0-9\_\-]} $str] + if {[tcl::string::length $str] == $matches} { + #all characters match the regexp + return 1 + } else { + return 0 + } + } + } + + #from toml.abnf in github.com/toml-lang/toml + #unquoted-key = 1*unquoted-key-char + #unquoted-key-char = ALPHA / DIGIT / %x2D / %x5F ; a-z A-Z 0-9 - _ + #unquoted-key-char =/ %xB2 / %xB3 / %xB9 / %xBC-BE ; superscript digits, fractions + #unquoted-key-char =/ %xC0-D6 / %xD8-F6 / %xF8-37D ; non-symbol chars in Latin block + #unquoted-key-char =/ %x37F-1FFF ; exclude GREEK QUESTION MARK, which is basically a semi-colon + #unquoted-key-char =/ %x200C-200D / %x203F-2040 ; from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ + #unquoted-key-char =/ %x2070-218F / %x2460-24FF ; include super-/subscripts, letterlike/numberlike forms, enclosed alphanumerics + #unquoted-key-char =/ %x2C00-2FEF / %x3001-D7FF ; skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces + #unquoted-key-char =/ %x2070-21FF / %x2300-24FF ; skip math operators + #unquoted-key-char =/ %x25A0-268B / %x2690-2757 ; skip box drawing, block elements, and some yin-yang symbols + #unquoted-key-char =/ %x2762-2767 / %x2776-27E5 ; skip some Dingbat punctuation + #unquoted-key-char =/ %x2801-297F ; skip some math brackets and arrows, and braille blank + #unquoted-key-char =/ %x2B00-2FFF / %x3001-D7FF ; skip various math operators and symbols, and ideographic space + #unquoted-key-char =/ %xF900-FDCF / %xFDF0-FFFD ; skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) + #unquoted-key-char =/ %x10000-EFFFF ; all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) + variable re_barekey + set ranges [list] + lappend ranges {a-zA-Z0-9\_\-} + lappend ranges {\u00B2} {\u00B3} {\u00B9} {\u00BC-\u00BE} ;# superscript digits, fractions + lappend ranges {\u00C0-\u00D6} {\u00D8-\u00F6} {\u00F8-\u037D} ;# non-symbol chars in Latin block + lappend ranges {\u037f-\u1FFF} ;# exclude GREEK QUESTION MARK, which is basically a semi-colon + lappend ranges {\u200C-\u200D} {\u203F-\u2040} ;# from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ + lappend ranges {\u2070-\u218f} {\u2460-\u24FF} ;# include super-subscripts, letterlike/numberlike forms, enclosed alphanumerics + lappend ranges {\u2C00-\u2FEF} {\u3001-\uD7FF} ;# skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces + lappend ranges {\u2070-\u21FF} {\u2300-\u24FF} ;# skip math operators + lappend ranges {\u25A0-\u268B} {\u2690-\u2757} ;# skip box drawing, block elements, and some yin-yang symbols + lappend ranges {\u2762-\u2767} {\u2776-\u27E5} ;# skip some Dingbat punctuation + lappend ranges {\u2801-\u297F} ;# skip some math brackets and arrows, and braille blank + lappend ranges {\u2B00-\u2FFF} {\u3001-\uD7FF} ;# skip various math operators and symbols, and ideographic space + lappend ranges {\uF900-\uFDCF} {\uFDF0-\uFFFD} ;# skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) + lappend ranges {\U10000-\UEFFFF} ;# all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) + set re_barekey {^[} + foreach r $ranges { + append re_barekey $r + } + append re_barekey {]+$} + + proc is_barekey {str} { + if {[tcl::string::length $str] == 0} { + return 0 + } + variable re_barekey + return [regexp $re_barekey $str] + } + + #test only that the characters in str are valid for the toml specified type 'integer'. + proc int_validchars1 {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[0-9\_\-\+]} $str] == $numchars} { + return 1 + } else { + return 0 + } + } + #add support for hex,octal,binary 0x.. 0o.. 0b... + proc int_validchars {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] == $numchars} { + return 1 + } else { + return 0 + } + } + + proc is_int {str} { + set matches [regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] ;#0b101 etc covered by a-f + + if {[tcl::string::length $str] == $matches} { + #all characters in legal range + + # --------------------------------------- + #check for leading zeroes in non 0x 0b 0o + #first strip any +, - or _ (just for this test) + #(but still allowing 0 -0 +0) + set check [tcl::string::map {+ "" - "" _ ""} $str] + if {([tcl::string::length $check] > 1) && ([tcl::string::index $check 0] eq "0") && ([tcl::string::index $check 1] ni {o x b})} { + return 0 + } + # --------------------------------------- + + #check +,- only occur in the first position. (excludes also +++1 etc) + if {[tcl::string::last - $str] > 0} { + return 0 + } + if {[tcl::string::last + $str] > 0} { + return 0 + } + + #------------------------------------------- + #unclear if a 'digit' includes the type specifiers x b o + #we assume the 0x 0b 0o are NOT counted as digits - as underscores here would seem + #to be likely to cause interop issues with other systems + #(e.g tcl allows 0b1_1 but not 0b_11) + #Most of this structure would be unnecessary if we could rely on string::is::integer understanding underscores (9+?) + #we still need to support earlier Tcl for now though. + + #first rule out any case with more than one underscore in a row + if {[regexp {__} $str]} { + return 0 + } + if {[string index $str 0] eq "_"} { + return 0 + } + set utest [string trimleft $str +-] + #test again for further trick like _+_0xFF + if {[string index $utest 0] eq "_"} { + return 0 + } + if {[string range $utest 0 1] in {0x 0b 0o}} { + set testnum [string range $utest 2 end] + #spec says *non-negative* integers may *also* be expressed in hex, octal or binary + #and also explicitly states + not allowed + #presumed to mean negative not allowed. + if {[string index $str 0] in {- +}} { + return 0 + } + } else { + set testnum $utest + #exclude also things like 0_x 0___b that snuck past our prefix test + if {![string is digit -strict [string map {_ ""} $testnum]]} { + return 0 + } + #assert - only digits and underscores in testnum + #still may have underscores at each end + } + #assert testnum is now the 'digits' portion of a , 0x 0b 0o number + #(+ and - already stripped) + #It may still have chars unsuitable for its type - which will be caught by the string::is::integer test below + if {[string length $testnum] != [string length [string trim $testnum _]]} { + #had non-inner underscores in 'digit' part + return 0 + } + #assert str only has solo inner underscores (if any) between 'digits' + #------------------------------------------- + + set numeric_value [tcl::string::map {_ ""} $str] ;#allow some earlier tcl versions which don't support underscores + #use Tcl's integer check to ensure we don't let things like 3e4 through - which is a float (would need to be 0x3e4 for hex) + if {![tcl::string::is integer -strict $numeric_value]} { + return 0 + } + + + + #!todo - check bounds only based on some config value + #even though Tcl can handle bignums, we won't accept anything outside of toml 1.0 minimum requirements by default (for now) + #presumably very large numbers would have to be supplied in a toml file as strings. + #Review - toml 1.0 only says that it must handle up to 2^63 - not that this is a max + #some question around implementations allowed to use lower values such as 2^31 on some systems? + if {$::tomlish::max_int ne "" && $numeric_value > $::tomlish::max_int} { + return 0 + } + if {$::tomlish::min_int ne "" && $numeric_value < $::tomlish::min_int} { + return 0 + } + } else { + return 0 + } + #Got this far - didn't find anything wrong with it. + return 1 + } + + #test only that the characters in str are valid for the toml specified type 'float'. + proc float_validchars {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[eE0-9\_\-\+\.]} $str] == $numchars} { + return 1 + } else { + #only allow lower case for these special values - as per Toml 1.0 spec + if {$str ni {inf +inf -inf nan +nan -nan}} { + return 0 + } else { + return 1 + } + } + } + + #note - Tcl's string is double will return true also for the subset of float values which are integers + #This function is to determine whether it matches the Toml float concept - so requires a . or e or E + proc is_float {str} { + #vip greenlight known literals, don't test for case variations - as Toml doesn't allow (whereas Tcl allows Inf NaN etc) + if {$str in {inf +inf -inf nan +nan -nan}} { + return 1 + } + #doorcheck the basics for floatiness vs members of that rival gang - ints + if {![regexp {[.eE]} $str]} { + #could be an integer - which isn't specifically a float for Toml purposes. + return 0 + } + + + #patdown for any contraband chars + set matches [regexp -all {[eE0-9\_\-\+\.]} $str] + if {[tcl::string::length $str] != $matches} { + return 0 + } + + #all characters in legal range + + #A leading zero is ok, but we should disallow multiple leading zeroes (same rules as toml ints) + + #Early Toml spec also disallowed leading zeros in the exponent part(?) + #... this seems less interoperable anyway (some libraries generate leading zeroes in exponents) + #we allow leading zeros in exponents here. + + #Check for leading zeros in main part + #first strip any +, - or _ (just for this test) + set check [tcl::string::map {+ "" - "" _ ""} $str] + set r {([0-9])*} + regexp $r $check intpart ;#intpart holds all numerals before the first .,e or E + #leading zero only if exactly one zero + if {$intpart ne "0" && [string match 0* $intpart]} { + return 0 + } + + #for floats, +,- may occur in multiple places + #e.g -2E-22 +3e34 + #!todo - check bounds ? + + #----------------------------------------- + if {[regexp {__} $str]} { + return 0 + } + if {[string index $str 0] eq "_" || [string index $str end] eq "_"} { + return 0 + } + set utest [string trimleft $str +-] + #test again for further trick like _+_ + if {[string index $utest 0] eq "_"} { + return 0 + } + #----------------------------------------- + + #decimal point, if used must be surrounded by at least one digit on each side + #e.g 3.e+20 also illegal + set dposn [string first . $str] + if {$dposn > -1 } { + set d3 [string range $str $dposn-1 $dposn+1] + if {![string is integer -strict [string index $d3 0]] || ![string is integer -strict [string index $d3 2]]} { + return 0 + } + } + #we've already eliminated leading/trailing underscores + #now ensure each inner underscore is surrounded by digits + if {[regexp {_[^0-9]|[^0-9]_} $str]} { + return 0 + } + + #strip underscores for tcl double check so we can support < tcl 9 versions which didn't allow underscores + set check [tcl::string::map {_ ""} $str] + #string is double accepts inf nan +NaN etc. + if {![tcl::string::is double $check]} { + return 0 + } + + #All good - seems to be a toml-approved float and not an int. + return 1 + } + + #test only that the characters in str are valid for the toml specified type 'datetime'. + proc datetime_validchars {str} { + set numchars [tcl::string::length $str] + if {[regexp -all {[zZtT0-9\-\+\.:]} $str] == $numchars} { + return 1 + } else { + return 0 + } + } + + proc is_datepart {str} { + set matches [regexp -all {[0-9\-]} $str] + if {[tcl::string::length $str] != $matches} { + return 0 + } + #seems to require yyyy-mm-dd (e.g not allowing just yyyy-mm) + if {![regexp {^([0-9]{4})-([0-9]{2})-([0-9]{2})$} $str _match y m d]} { + return 0 + } + if {$m > 12 || $m == 0} { + return 0 + } + switch -- [expr {$m}] { + 1 - 3 - 5 - 7 - 8 - 10 - 12 { + if {$d > 31 || $d == 0} { + return 0 + } + } + 2 { + if {$d > 29 || $d == 0} { + return 0 + } + if {$d == 29} { + #leapyear check + if {[catch {clock scan $str -format %Y-%m-%d} errM]} { + return 0 + } + } + } + 4 - 6 - 9 - 11 { + if {$d > 30 || $d == 0} { + return 0 + } + } + } + return 1 + } + proc is_localdate {str} { + is_datepart $str + } + + #allow only hh:mm:ss or hh:mm (no subseconds) + proc _is_hms_or_hm_time {val} { + set numchars [tcl::string::length $val] + if {[regexp -all {[0-9:]} $val] != $numchars} { + return 0 + } + #assert now digits and colons only + set hms_cparts [split $val :] + #2 or 3 parts only are valid - check contents of each part + if {[llength $hms_cparts] == 2} { + lassign $hms_cparts hr min + if {[string length $hr] != 2 || [string length $min] != 2} { + return 0 + } + if {$hr > 23 || $min > 59} { + return 0 + } + } elseif {[llength $hms_cparts] == 3} { + lassign $hms_cparts hr min sec + if {[string length $hr] != 2 || [string length $min] != 2 || [string length $sec] !=2} { + return 0 + } + #possible for sec to be 60 - leap second RFC 3339 + if {$hr > 23 || $min > 59 || $sec > 60} { + return 0 + } + } else { + return 0 + } + return 1 + } + proc is_timepart {str} { + #validate the part after the T (or space) + #we receive only that trailing part here. + + #odt1 = 1979-05-27T07:32:00Z + #odt2 = 1979-05-27T00:32:00-07:00 + #odt3 = 1979-05-27T00:32:00.5-07:00 + #odt4 = 1979-05-27T00:32:00.999999-07:00 + + set numchars [tcl::string::length $str] + #timepart can have negative or positive offsets so - and + must be accepted + if {[regexp -all {[zZt0-9\-\+\.:]} $str] == $numchars} { + #todo + #basic check that we have leading 2dig hr and 2dig min separated by colon + if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}[^0-9]{1}.*$} $str]} { + #nn:nn or nn:nnX.* where X is non digit + return 0 + } + set dotparts [split $str .] + if {[llength $dotparts] ni {1 2}} { + return 0 + } + if {[llength $dotparts] == 2} { + lassign $dotparts hms tail + #validate tail - which might have +- offset + if {[string index $tail end] ni {z Z}} { + #from hh:mm:??. + #check for +/- something + if {[regexp {(.*)[+-](.*)} $tail _match fraction offset]} { + if {![string is digit -strict $fraction]} { + return 0 + } + if {![_is_hms_or_hm_time $offset]} { + return 0 + } + } + } else { + set tail [string range $tail 0 end-1] + #expect tail nnnn (from hh:mm::ss.nnnnZ) + #had a dot and a zZ - no other offset valid (?) + if {![string is digit -strict $tail]} { + return 0 + } + } + + } else { + #no dot (fraction of second) + if {[regexp {(.*)[+-](.*)} $str _match hms offset]} { + #validate offset + if {![_is_hms_or_hm_time $offset]} { + return 0 + } + } else { + set hms $str + set offset "" + #trim a *single* z or Z off hms if present - multiple should error later + if {[string index $hms end] in {z Z}} { + set hms [string range $hms 0 end-1] + } + } + } + #hms is allowed in toml to be hh:mm:ss or hh:mm + #validate we have hh:mm:ss or hh:mm - exactly 2 digits each + if {![_is_hms_or_hm_time $hms]} { + return 0 + } + + return 1 + } else { + return 0 + } + } + proc is_localtime {str} { + #time of day without any relation to a specific day or any offset or timezone + set numchars [tcl::string::length $str] + if {[regexp -all {[0-9\.:]} $str] == $numchars} { + #todo + if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}:[0-9]{2}([.][0-9]+){0,1}$} $str]} { + #hh:mm or hh:mm:ss or hh:mm::ss.nnn + return 0 + } + set dotparts [split $str .] + if {[llength $dotparts] ni {1 2}} { + return 0 + } + if {[llength $dotparts] == 2} { + lassign $dotparts hms _tail + #validate tail - just fractional seconds - regex has confirmed at least one digit and only digits + #nothing todo? max length? + } else { + #no fractional seconds + set hms $str + } + if {![_is_hms_or_hm_time $hms]} { + return 0 + } + return 1 + } else { + return 0 + } + } + + #review + proc is_datetime {str} { + #Essentially RFC3339 formatted date-time - but: + #1) allowing seconds to be omitted (:00 assumed) + #2) T may be replaced with a single space character TODO - parser support for space in datetime! + # (RFC 3339 allows space instead of T also - but doesn't specify it *must* be a single space) + + #toml-lint @2025-04 doesn't accept t for T or z for Z - but RFC3339 does + #toml spec doesn't clarify - we will accept + + #e.g 1979-05-27 + #e.g 1979-05-27T00:32:00Z + #e.g 1979-05-27 00:32:00-07:00 + #e.g 1979-05-27 00:32:00+10:00 + #e.g 1979-05-27 00:32:00.999999-07:00 + + #review + #minimal datetimes? + # 2024 not ok - 2024T not accepted by tomlint why? + # 02:00 ok + # 02:00:00.5 ok + # 1:00 - not ok - RFC3339 requires 2-digit hr,min,sec + + #toml-lint.com accepts 2025-01 + + if {[string length $str] < 5} { + return 0 + } + + set matches [regexp -all {[zZtT0-9\ \-\+\.:]} $str] + if {[tcl::string::length $str] == $matches} { + #all characters in legal range + if {[regexp -all {\ } $str] > 1} { + #only a single space is allowed. + return 0 + } + #If we get a space - it is only valid as a convience to represent the T separator + #we can normalize by converting to T here before more tests + set str [string map {" " T t T} $str] + #a further sanity check on T + if {[regexp -all {T} $str] > 1} { + return 0 + } + + #!todo - use full RFC 3339 parser? + #!todo - what if the value is 'time only'? + + if {[string first T $str] > -1} { + lassign [split $str T] datepart timepart + if {![is_datepart $datepart]} { + return 0 + } + if {![is_timepart $timepart]} { + return 0 + } + } else { + #either a datepart or a localtime + #spec: "If you include only the time portion of an RFC 3339 formatted date-time, it will represent that time of day + # without any relation to a specific day or any offset or timezone." + if {!([is_datepart $str] || [is_localtime $str])} { + return 0 + } + } + + + #Tcl's free-form clock scan (no -format option) is deprecated + # + #if {[catch {clock scan $datepart} err]} { + # puts stderr "tcl clock scan failed err:'$err'" + # return 0 + #} + + } else { + return 0 + } + return 1 + } + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::utils ---}] +} + +namespace eval tomlish::parse { + #*** !doctools + #[subsection {Namespace tomlish::parse}] + #[para] + #[list_begin definitions] + + #This is a somewhat curly mix of a statemachine and toml-nesting-stack littered with special cases. + #The code is a pig's-nest - but it should be noted that for example trailing single double quotes in multiline strings are perhaps not so trivial to parse using more standard methods either: + # - e.g some kind of backtracking required if using an ABNF parser? + #I don't know the precise technical name for this sort of parser; probably something like "Dog's Breakfast" + #More seriously, we don't have distinct lex/parse steps - so it is basically a 'fused lexer' or 'scannerless parser' + + #It is also desirable for this system to be useful in 'interactive' use. review - would a separate lexer make this easier or harder? + + #A possible alternative more structured approach might be to use a PEG (Parsing Expression Grammar) + + + variable is_parsing 0 ;#whether we are in the middle of parsing tomlish text + + variable state + # states: + # table-space, itable-space, array-space + # array-value-expected,keyval-value-expected,itable-keyval-value-expected, keyval-syntax, + # dquoted-key, squoted-key + # string-state, literal-state, multistring... + # + # notes: + # only the -space states are also 'spaces' ie a container which is pushed/popped on the spacestack + + # + # xxx_value-expected - we also allow for leading whitespace in this state, but once a value is returned we jump to a state based on the containing space. e.g keyval-tail or array-syntax + # + #stateMatrix defines for each state, actions to take for each possible token. + #single-element actions are the name of the next state into which to transition, or a 'POPSPACE' instruction to pop a level off the spacestack and add the data to the parent container. + #dual-element actions are a push instruction and the name of the space to push on the stack. + # - PUSHSPACE is a simple push onto the spacestack, zeropoppushspace also pushes, but will first do a pop *if* the current space level is greater than zero (ie if only if not already in root table-space) + + # -- --- --- --- --- --- + #token/state naming guide + # -- --- --- --- --- --- + #tokens : underscore separated or bare name e.g newline, start_quote, start_squote + #private tokens: always have a leading underscore (These are private 'temporary state' tokens that are never returned as actual tokens e.g _start_squote_sequence + #states : always contain at least one dash e.g err-state, table-space + #instructions + # -- --- --- --- --- --- + + + #stateMatrix dict of elements mapping current state to next state based on returned tokens + # current-state {token-encountered next-state ... } + # where next-state can be a 1 or 2 element list. + #If 2 element - the first item is an instruction (ucase) + #If 1 element - it is either a lowercase dashed state name or an ucase instruction + #e.g {PUSHSPACE } or POPSPACE or SAMESPACE + + + #SAMESPACE - got to same space as parent without popping a level, but has it's own autotransition lookup - strange concept - review usecases + + variable stateMatrix + set stateMatrix [dict create] + #--------------------------------------------------------- + #WARNING + #The stateMatrix implementation here is currently messy. + #The code is a mixture of declarative via the stateMatrix and imperative via switch statements during PUSH/POP/SAMESPACE transitions. + #This means the state behaviour has to be reasoned about by looking at both in conjuction. + #--------------------------------------------------------- + + #xxx-space vs xxx-syntax inadequately documented - TODO + + #review - out of date? + # --------------------------------------------------------------------------------------------------------------# + # incomplete example of some state starting at table-space + # --------------------------------------------------------------------------------------------------------------# + # ( = -> keyval-value-expected) + # keyval-syntax (popped -> keyval-space -> keyval-tail) (autotransition on pop) + # keyval-space (autotransition on push ^) + # table-space (barekey^) (startdquote -> dquoted-key ^) + # --------------------------------------------------------------------------------------------------------------# + + dict set stateMatrix\ + table-space { + bom "table-space"\ + whitespace "table-space"\ + newline "table-space"\ + barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ + squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ + dquotedkey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ + XXXsingle_dquote "quoted-key"\ + XXXsingle_squote "squoted-key"\ + comment "table-space"\ + starttablename "tablename-state"\ + starttablearrayname "tablearrayname-state"\ + enddquote "err-state"\ + endsquote "err-state"\ + comma "err-state"\ + eof "end-state"\ + equal "err-state"\ + cr "err-lonecr"\ + } + + + + dict set stateMatrix\ + keyval-space {\ + whitespace "keyval-syntax"\ + equal "keyval-value-expected"\ + } + + # ' = ' portion of keyval + dict set stateMatrix\ + keyval-syntax {\ + whitespace "keyval-syntax"\ + barekey {PUSHSPACE "dottedkey-space"}\ + squotedkey {PUSHSPACE "dottedkey-space"}\ + dquotedkey {PUSHSPACE "dottedkey-space"}\ + equal "keyval-value-expected"\ + comma "err-state"\ + newline "err-state"\ + eof "err-state"\ + } + #### + dict set stateMatrix\ + keyval-value-expected {\ + whitespace "keyval-value-expected"\ + untyped_value {TOSTATE "keyval-untyped-sequence" note "possible datetime datepart"}\ + literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ + string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ + single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ + single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ + startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ + startarray {PUSHSPACE array-space returnstate keyval-tail}\ + } + #double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"} + + #untyped_value sequences without intervening comma are allowed for datepart timepart + #we will produce tomlish with missing SEPS and to_dict must validate whether 2 adjacent barekeys are valid + dict set stateMatrix\ + keyval-untyped-sequence {\ + whitespace "keyval-untyped-sequence"\ + untyped_value {TOSTATE "keyval-tail"}\ + literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ + string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ + single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ + single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ + startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ + startarray {PUSHSPACE array-space returnstate keyval-tail}\ + newline "POPSPACE"\ + comment "keyval-tail"\ + eof "end-state"\ + } + + #2025 - no leading-squote-space - only trailing-squote-space. + + dict set stateMatrix\ + keyval-tail {\ + whitespace "keyval-tail"\ + newline "POPSPACE"\ + comment "keyval-tail"\ + eof "end-state"\ + } + + + #itable-space/ curly-syntax : itables + # x={y=1,} + dict set stateMatrix\ + itable-space {\ + whitespace "itable-space"\ + newline "itable-space"\ + barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + endinlinetable "POPSPACE"\ + comma "err-state"\ + comment "itable-space"\ + eof "err-state"\ + } + #we don't get single_squote etc here - instead we get the resulting squotedkey token + + + # ??? review - something like this + # + # x={y =1,} + dict set stateMatrix\ + itable-keyval-syntax {\ + whitespace {TOSTATE "itable-keyval-syntax"}\ + barekey {PUSHSPACE "dottedkey-space"}\ + squotedkey {PUSHSPACE "dottedkey-space"}\ + dquotedkey {PUSHSPACE "dottedkey-space"}\ + equal {TOSTATE "itable-keyval-value-expected"}\ + newline "err-state"\ + eof "err-state"\ + } + + # x={y=1} + dict set stateMatrix\ + itable-keyval-space {\ + whitespace "itable-keyval-syntax"\ + equal {TOSTATE "itable-keyval-value-expected" note "required"}\ + } + + dict set stateMatrix\ + itable-keyval-value-expected {\ + whitespace "itable-keyval-value-expected"\ + untyped_value {TOSTATE "itable-val-tail" note ""}\ + single_dquote {TOSTATE "string-state" returnstate itable-val-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ + single_squote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ + startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ + startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ + } + #double_squote not currently generated by _start_squote_sequence - '' processed as single_squote to literal-state just like 'xxx' + # review + # double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"} + + + + # x={y=1,z="x"} + #POPSPACE is transition from itable-keyval-space to parent itable-space + dict set stateMatrix\ + itable-val-tail {\ + whitespace "itable-val-tail"\ + endinlinetable "POPSPACE"\ + comma "POPSPACE"\ + newline {TOSTATE "itable-val-tail" note "itable-space ??"}\ + comment "itable-val-tail"\ + eof "err-state"\ + } + # XXXnewline "POPSPACE" + # We shouldn't popspace on newline - as if there was no comma we need to stay in itable-val-tail + # This means the newline and subsequent whitespace, comments etc become part of the preceeding dottedkey record + #e.g + # x = { + # j=1 + # #comment within dottedkey j record + # , # comment unattached + # #comment unattached + # k=2 , #comment unattached + # l=3 #comment within l record + # , m=4 + # #comment associated with m record + # + # #still associated with m record + # } + ## - This doesn't quite correspond to what a user might expect - but seems like a consistent mechanism. + #The awkwardness is because there is no way to put in a comment that doesn't consume a trailing comma + #so we cant do: j= 1 #comment for j1 , + # and have the trailing comma recognised. + # + # To associate: j= 1, #comment for j1 + # we would need some extra processing . (not popping until next key ? extra state itable-sep-tail?) REVIEW - worth doing? + # + # The same issue occurs with multiline arrays. The most natural assumption is that a comment on same line after a comma + # is 'associated' with the previous entry. + # + # These comment issues are independent of the data dictionary being generated for conversion to json etc - as the comments don't carry through anyway, + # but are a potential oddity for manipulating the intermediate tomlish structure whilst attempting to preserve 'associated' comments + # (e.g reordering records within an itable) + #The user's intention for 'associated' isn't always clear and the specs don't really guide on this. + + + #dottedkey-space is not (currently) used within [tablename] or [[tablearrayname]] + #it is for keyval ie x.y.z = value + + #this is the state after dot + #we are expecting a complete key token or whitespace + #(initial entry to the space is by one of the keys - which will immediately go to dottedkey-space-tail) + dict set stateMatrix\ + dottedkey-space {\ + whitespace "dottedkey-space"\ + dotsep "err-state"\ + barekey "dottedkey-space-tail"\ + squotedkey "dottedkey-space-tail"\ + dquotedkey "dottedkey-space-tail"\ + newline "err-state"\ + comma "err-state"\ + comment "err-state"\ + equal "err-state"\ + } + + #dottedkeyend "POPSPACE" + #equal "POPSPACE"\ + + + #jmn 2025 + #we have 1 or more dottedkeys so far - need dotsep to add more, whitespace to maintain, equal to pop + dict set stateMatrix\ + dottedkey-space-tail {\ + whitespace "dottedkey-space-tail" + dotsep "dottedkey-space" + equal "POPSPACE"\ + eof "err-state"\ + newline "err-state"\ + } + + #-------------------------------------------------------------------------- + #scratch area + #from_toml {x=1} + # barekey tok + # table-space PUSHSPACE keyval-space state keyval-syntax + # + + + #-------------------------------------------------------------------------- + + + #REVIEW + #toml spec looks like heading towards allowing newlines within inline tables + #https://github.com/toml-lang/toml/issues/781 + + #2025 - multiline itables appear to be valid for 1.1 - which we are targeting. + #https://github.com/toml-lang/toml/blob/main/toml.md#inline-table + + #JMN2025 + #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES + #We currently allow multiline ITABLES (also with comments) in the tokenizer. + #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? + + + #JMN REVIEW + #dict set stateMatrix\ + # array-space {\ + # whitespace "array-space"\ + # newline "array-space"\ + # untyped_value "SAMESPACE"\ + # startarray {PUSHSPACE "array-space"}\ + # endarray "POPSPACE"\ + # startinlinetable {PUSHSPACE itable-space}\ + # single_dquote "string-state"\ + # single_squote "literal-state"\ + # triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ + # comma "array-space"\ + # comment "array-space"\ + # eof "err-state-array-space-got-eof"\ + # } + + ## array-space ## + set aspace [dict create] + dict set aspace whitespace "array-space" + dict set aspace newline "array-space" + #dict set aspace untyped_value "SAMESPACE" + dict set aspace untyped_value "array-syntax" + dict set aspace startarray {PUSHSPACE "array-space"} + dict set aspace endarray "POPSPACE" + dict set aspace single_dquote {TOSTATE "string-state" returnstate array-syntax} + dict set aspace triple_dquote {PUSHSPACE "multistring-space" returnstate array-syntax} + dict set aspace single_squote {TOSTATE "literal-state" returnstate array-syntax} + dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax} + dict set aspace startinlinetable {PUSHSPACE itable-space} + #dict set aspace comma "array-space" + dict set aspace comment "array-space" + dict set aspace eof "err-state-array-space-got-eof" + dict set stateMatrix array-space $aspace + + #when we pop from an inner array we get to array-syntax + #e.g {x=[[]] ??? + set tarntail [dict create] + dict set tarntail whitespace "err-state" ;#"tablearrayname-tail" ;#spec doesn't allow whitespace here + dict set tarntail newline "err-state" + dict set tarntail comment "err-state" + dict set tarntail eof "err-state" + dict set tarntail endtablename "tablearray-tail" + dict set stateMatrix tablearrayname-tail $tarntail + + #review - somewhat counterintuitive...? + # [(starttablearrayname) (endtablearrayname] + # [(starttablename) (endtablename)] + + # [[xxx]] ??? + set tartail [dict create] + dict set tartail whitespace "tablearray-tail" + dict set tartail newline "table-space" + dict set tartail comment "tablearray-tail" + dict set tartail eof "end-state" + dict set stateMatrix tablearray-tail $tartail + + + + + + + dict set stateMatrix\ + end-state {} + + set knowntokens [list] + set knownstates [list] + dict for {state transitions} $stateMatrix { + if {$state ni $knownstates} {lappend knownstates $state} + dict for {tok instructions} $transitions { + if {$tok ni $knowntokens} {lappend knowntokens $tok} + } + } + dict set stateMatrix nostate {} + foreach tok $knowntokens { + dict set stateMatrix nostate $tok "err-nostate-received-token-$tok" + } + + + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + #purpose - debugging? remove? + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + #build a list of 'push triggers' from the stateMatrix + # ie tokens which can push a new space onto spacestack + set push_trigger_tokens [list] + tcl::dict::for {s transitions} $stateMatrix { + tcl::dict::for {token transition_to} $transitions { + set instruction [lindex $transition_to 0] + switch -exact -- $instruction { + PUSHSPACE - zeropoppushspace { + if {$token ni $push_trigger_tokens} { + lappend push_trigger_tokens $token + } + } + } + } + } + ::tomlish::log::debug "push_trigger_tokens: $push_trigger_tokens" + # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- + + + + #This seems hacky... (deprecate in favour of explicit arguments to the instructions in stateMatrix?) + #spacePopTransitions, spacePushTransitions, spaceSameTransitions below for auto state redirections on POPSPACE,PUSHSPACE,SAMESPACE + + #mainly for the -space states: + #redirect to another state $c based on a state transition from $whatever to $b + # e.g "string {array-space array-syntax}" means when transitioning from string to array-space, jump to array-syntax instead. + #this is useful as we often don't know state $b. e.g when it is decided by 'POPSPACE' + + #use dict set to add values so we can easily add/remove/comment lines + + #Push to, next + #default first states when we push to these spaces + variable spacePushTransitions [dict create] + dict set spacePushTransitions keyval-space keyval-syntax + dict set spacePushTransitions itable-keyval-space itable-keyval-syntax + dict set spacePushTransitions array-space array-space + dict set spacePushTransitions table-space tablename-state + #dict set spacePushTransitions #itable-space itable-space + + #Pop to, next + variable spacePopTransitions [dict create] + dict set spacePopTransitions array-space array-syntax + + + #itable-keyval-space itable-val-tail + #review + #we pop to keyval-space from dottedkey-space or from keyval-value-expected? we don't always want to go to keyval-tail + #leave it out and make the POPSPACE caller explicitly specify it + #keyval-space keyval-tail + + variable spaceSameTransitions [dict create] + #JMN test + #dict set spaceSameTransitions array-space array-syntax + + #itable-keyval-space itable-val-tail + + + variable state_list ;#reset every tomlish::decode::toml + + namespace export tomlish toml + namespace ensemble create + + #goNextState has various side-effects e.g pushes and pops spacestack + #REVIEW - setting nest and v elements here is ugly + #todo - make neater, more single-purpose? + proc goNextState {tokentype tok currentstate} { + variable state + variable nest + variable v + + set prevstate $currentstate + + + variable spacePopTransitions + variable spacePushTransitions + variable spaceSameTransitions + + variable last_space_action "none" + variable last_space_type "none" + variable state_list + + set result "" + set starttok "" + + if {[dict exists $::tomlish::parse::stateMatrix $currentstate $tokentype]} { + set transition_to [dict get $::tomlish::parse::stateMatrix $currentstate $tokentype] + ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" + switch -exact -- [lindex $transition_to 0] { + POPSPACE { + set popfromspace_info [spacestack peek] + set popfromspace_state [dict get $popfromspace_info state] + spacestack pop + set parent_info [spacestack peek] + set type [dict get $parent_info type] + set parentspace [dict get $parent_info state] + + set last_space_action "pop" + set last_space_type $type + + if {[dict exists $parent_info returnstate]} { + set next [dict get $parent_info returnstate] + #clear the returnstate on current level + set existing [spacestack pop] + dict unset existing returnstate + spacestack push $existing ;#re-push modification + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected to stored returnstate $next <<---" + } else { + ### + #review - do away with spacePopTransitions - which although useful to provide a default.. + # - involve error-prone configurations distant to the main state transition configuration in stateMatrix + if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { + set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" + } else { + set next $parentspace + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace<<---" + } + } + set result $next + } + SAMESPACE { + set currentspace_info [spacestack peek] + ::tomlish::log::debug "--->> SAMESPACE got current space entry: $currentspace_info <<<<<" + set type [dict get $currentspace_info type] + set currentspace [dict get $currentspace_info state] + + if {[dict exists $currentspace_info returnstate]} { + set next [dict get $currentspace_info returnstate] + #clear the returnstate on current level + set existing [spacestack pop] + dict unset existing returnstate + spacestack push $existing ;#re-push modification + ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected to stored returnstate $next" + } else { + if {[dict exists $::tomlish::parse::spaceSameTransitions $currentspace]} { + set next [dict get $::tomlish::parse::spaceSameTransitions $currentspace] + ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected state to $next (spaceSameTransitions)" + } else { + set next $currentspace + ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace" + } + } + set result $next + } + zeropoppushspace { + if {$nest > 0} { + #pop back down to the root level (table-space) + spacestack pop + set parentinfo [spacestack peek] + set type [dict get $parentinfo type] + set target [dict get $parentinfo state] + + set last_space_action "pop" + set last_space_type $type + + #----- + #standard pop + set parentlevel [expr {$nest -1}] + lappend v($parentlevel) [set v($nest)] + incr nest -1 + #----- + } + #re-entrancy + + #set next [list PUSHSPACE [lindex $transition_to 1]] + set nexttokentype ${tokentype}2 ;#fake token type e.g tablename2 or tablearrayname2 + ::tomlish::log::debug "--->> zeropoppushspace goNextState RECURSE. calling goNextState $nexttokentype $currentstate" + set transition_info [::tomlish::parse::goNextState $nexttokentype $tok $currentstate] + set result [dict get $transition_info newstate] + } + PUSHSPACE { + set original_target [dict get $transition_to PUSHSPACE] + if {[dict exists $transition_to returnstate]} { + #adjust the existing space record on the stack. + #struct::stack doesn't really support that - so we have to pop and re-push + #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack + set currentspace [spacestack pop] + dict set currentspace returnstate [dict get $transition_to returnstate] + spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. + } + if {[dict exists $transition_to starttok]} { + set starttok [dict get $transition_to starttok] + } + spacestack push [dict create type space state $original_target] + + set last_space_action "push" + set last_space_type "space" + + if {[dict exists $transition_to state]} { + #an explicit state in the pushed space was requested in the stateMatrix - override the spacePushTransition (spacePushTransitions can be deprecated if we require explicitness?) + set next [dict get $transition_to state] + ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next by explicit 'state' entry" + } else { + #puts $::tomlish::parse::spacePushTransitions + if {[dict exists $::tomlish::parse::spacePushTransitions $original_target]} { + set next [dict get $::tomlish::parse::spacePushTransitions $original_target] + ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next (spacePushTransitions) " + } else { + set next $original_target + ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target" + } + } + set result $next + } + TOSTATE { + if {[dict exists $transition_to returnstate]} { + #adjust the existing space record on the stack. + #struct::stack doesn't really support that - so we have to pop and re-push + #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack + set currentspace [spacestack pop] + dict set currentspace returnstate [dict get $transition_to returnstate] + spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. + } + set result [dict get $transition_to TOSTATE] + } + default { + #simplified version of TOSTATE + set result [lindex $transition_to 0] ;#ignore everything but first word + } + } + } else { + ::tomlish::log::error "--->> No state transition defined from state $currentstate when tokentype $tokentype received" + set result "nostate" + } + lappend state_list [list tokentype $tokentype from $currentstate to $result] + set state $result + ::tomlish::log::notice "--->> STATE TRANSITION tokenType: '$tokentype' tok:$tok triggering '$currentstate' -> '$result' last_space_action:$last_space_action " + return [dict create prevstate $prevstate newstate $result space_action $last_space_action starttok $starttok] + } + + proc report_line {{line ""}} { + variable linenum + variable is_parsing + if {$is_parsing} { + if {$line eq ""} { + set line $linenum + } + return "Line Number: $line" + } else { + #not in the middle of parsing tomlish text - return nothing. + return "" + } + } + + #produce a *slightly* more readable string rep of the nest for puts etc. + proc nest_pretty1 {list} { + set prettier "{" + + foreach el $list { + if { [lindex $el 0] eq "NEWLINE"} { + append prettier "[list $el]\n" + } elseif {([llength $el] > 1) && ([lindex $el 0] in {KEY DQKEY SQKEY TABLE ARRAY})} { + append prettier [nest_pretty1 $el] + } else { + append prettier "[list $el] " + } + } + append prettier "}" + return $prettier + } + + proc set_tokenType {t} { + variable tokenType + variable tokenType_list + if {![info exists tokenType]} { + set tokenType "" + } + lappend tokenType_list $t + set tokenType $t + } + + proc switch_tokenType {t} { + variable tokenType + variable tokenType_list + lset tokenType_list end $t + set tokenType $t + } + + proc get_tokenType {} { + variable tokenType + return $tokenType + } + + + proc get_token_waiting {} { + variable token_waiting + return [lindex $token_waiting 0] + } + proc clear_token_waiting {} { + variable token_waiting + set token_waiting [list] + } + + #token_waiting is a list - but our standard case is to have only one + #in certain circumstances such as near eof we may have 2 + #the set_token_waiting function only allows setting when there is not already one waiting. + #we want to catch cases of inadvertently trying to set multiple + # - the reason being that the state transition triggered by the previous token may have invalidated the assumptions made when a token was added as waiting. + proc set_token_waiting {args} { + if {[llength $args] %2 != 0} { + error "tomlish set_token_waiting must have args of form: type value complete 0|1" + } + variable token_waiting + + if {[llength $token_waiting] && [dict get [lindex $token_waiting end] type] ne "eof"} { + #tokloop already set a token_waiting - but something (post tokloop processing?) is trying to set another + #we may need to remove the existing token_waiting and reset the tokloop index to the previous char so it's reprocessed in the possibly new context + #rather than attempt to make the right decision here - we raise an error and require the caller to check/handle it + set err "tomlish set_token_waiting already has token_waiting: [lindex $token_waiting 0]" + append err \n " - cannot add token_waiting: $args" + error $err + #set tomlish::parse::i [expr {[dict get $token_waiting startindex] -1}] + #set token_waiting [list] + } + + set waiting [dict create] + dict for {k v} $args { + switch -exact $k { + type - complete { + dict set waiting $k $v + } + value { + dict set waiting tok $v + } + startindex { + dict set waiting startindex $v + } + default { + error "tomlish set_token_waiting error - unrecognised key $k. known keys: [dict keys $args]" + } + } + } + if {![tcl::string::is boolean -strict [dict get $waiting complete]]} { + error "tomlish set_token_waiting error - 'complete' must be a boolean. got [dict get $waiting complete]" + } + if {![llength $token_waiting]} { + set token_waiting [list $waiting] + } else { + #an extra sanity-check that we don't have more than just the eof.. + if {[llength $token_waiting] > 1} { + set err "tomlish Unexpected. Existing token_waiting count > 1.\n" + foreach tw $token_waiting { + append err " $tw" \n + } + append err " - cannot add token_waiting: $waiting" + error $err + } + #last entry must be a waiting eof + set token_waiting [list $waiting [lindex $token_waiting end]] + } + return + } + + #returns 0 or 1 + #tomlish::parse::tok + #we attempt to do this without lookahead (potential use in streaming toml? for what benefit?) todo -final flag + # - the possible benefit is being able to more easily process in arbitrarily split chunks (although we would still have to watch crlf splitting ?) + # - interactive use? + + proc tok {} { + variable nest + variable s + variable v + variable i + variable tok + variable type ;#character type + variable state ;#FSM + + + variable tokenType + variable tokenType_list + + + variable endToken + + variable lastChar + + variable braceCount + variable bracketCount + + + #------------------------------ + #Previous run found another (presumably single-char) token + #The normal case is for there to be only one dict in the list + #multiple is an exception - primarily for eof + variable token_waiting + if {[llength $token_waiting]} { + set waiting [lindex $token_waiting 0] + + set tokenType [dict get $waiting type] + set tok [dict get $waiting tok] + #todo: dict get $token_waiting complete + set token_waiting [lrange $token_waiting 1 end] + return 1 + } + #------------------------------ + + set resultlist [list] + set sLen [tcl::string::length $s] + + set slash_active 0 + set quote 0 + set c "" + for {} {$i < $sLen} {} { + if {$i > 0} { + set lastChar [tcl::string::index $s [expr {$i - 1}]] + set start_of_data h + } else { + set lastChar "" + set start_of_data 1 + #bom-handling + if {[tcl::string::index $s 0] eq "\uFEFF"} { + #bom (could be from various encodings - now decoded as single unicode char FEFF) + #incr i 1 ;#skip over initial bom? + } + } + + + set c [tcl::string::index $s $i] + set cindex $i + + set ctest [tcl::string::map {\{ lc \} rc \[ lb \] rb \" dq ' sq \\ bsl \r cr \n lf \t tab \uFEFF bom} $c] + tomlish::log::debug "- tokloop char <$ctest> index $i tokenType:$tokenType tok:<$tok>" + #puts "got char $c during tokenType '$tokenType'" + incr i ;#must incr here because we do returns inside the loop + + + + switch -exact -- $ctest { + # { + set had_slash $slash_active + set slash_active 0 + + if {$had_slash} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + #for multiliteral, multistring - data and/or end + incr i -1 + return 1 + } + _start_squote_sequence { + #pseudo token beginning with underscore - never returned to state machine - review + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + barekey { + error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed (see tomlish::utils::is_barekey). [tomlish::parse::report_line]" + } + whitespace { + # hash marks end of whitespace token + #do a return for the whitespace, set token_waiting + #set_token_waiting type comment value "" complete 1 + incr i -1 ;#leave comment for next run + return 1 + } + untyped_value { + #REVIEW! the spec isn't clear.. is whitespace after an int,bool etc required before comment? + #we will accept a comment marker as an immediate terminator of the untyped_value. + incr i -1 + return 1 + } + starttablename - starttablearrayname { + #fix? + error "tomlish Character '#' is invalid first character for $tokenType. [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #invalid in bare parts - but allowed in quoted parts - let tablename parser sort it out + append tok $c + } + default { + #dquotedkey, string,literal, multistring + append tok $c + } + } + } else { + switch -- $state { + multistring-space { + set_tokenType stringpart + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "#" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "#" + } + default { + #start of token if we're not in a token + set_tokenType comment + set tok "" ;#The hash is not part of the comment data + } + } + } + } + lc { + #left curly brace + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i [tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - starttablearrayname { + #*bare* tablename can only contain letters,digits underscores + error "tomlish Invalid tablename first character \{ [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #valid in quoted parts + append tok $c + } + comment { + if {$had_slash} {append tok "\\"} + append tok "\{" + } + default { + #end any other token. + incr i -1 + return 1 + } + } + } else { + switch -exact -- $state { + itable-keyval-value-expected - keyval-value-expected { + #switch last key to tablename?? + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + array-space - array-syntax { + #nested anonymous inline table + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "startinlinetable" + set tok "\{" + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\{" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\{" + } + default { + error "tomlish state: '$state'. left brace case not implemented [tomlish::parse::report_line]" + } + } + } + + } + rc { + #right curly brace + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey - comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - tablename { + if {$had_slash} {append tok "\\"} + #invalid! - but leave for datastructure loading stage to catch + set_token_waiting type endinlinetable value "" complete 1 startindex $cindex + return 1 + } + starttablearrayname - tablearrayname { + if {$had_slash} {append tok "\\"} + #invalid! - but leave for datastructure loading stage to catch + set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex + return 1 + } + default { + #end any other token + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + itable-space { + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + tablename-state { + #e.g [] - empty tablename - allowed or not? + #empty tablename/tablearrayname ? + #error "unexpected tablename problem" + + set_tokenType "endinlinetable" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-state { + error "tomlish unexpected tablearrayname-state problem" + set_tokenType "endinlinetable" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + array-syntax - array-space { + #invalid + set_tokenType "endinlinetable" + set tok "\}" + return 1 + } + itable-val-tail { + set_tokenType "endinlinetable" + set tok "" + #we need to pop the keyval - and then reprocess to pop the inlinetable - so we incr -1 + incr i -1 + return 1 + } + itable-keyval-syntax { + error "tomlish endinlinetable unexpected at this point. Expecting key=val syntax [tomlish::parse::report_line]" + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\}" + } + multiliteral-space { + set_tokenType "literalpart" ; #review + set tok "\}" + } + default { + #JMN2024b keyval-tail? + error "tomlish state '$state'. endinlinetable case not implemented [tomlish::parse::report_line]" + } + } + } + + } + lb { + #left square bracket + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename { + #change the tokenType + switch_tokenType "starttablearrayname" + set tok "" ;#no output into the tomlish list for this token + #any following whitespace is part of the tablearrayname, so return now + return 1 + } + tablename - tablearrayname { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + #append tok "\\[" + append tok {\[} + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + #invalid at this point - state machine should disallow: + # table -> starttablearrayname + # tablearray -> starttablearrayname + set_token_waiting type starttablearrayname value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "\[" + } + } + } + comment { + if {$had_slash} {append tok "\\"} + append tok "\[" + } + default { + #end any other token. + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + keyval-value-expected - itable-keyval-value-expected { + set_tokenType "startarray" + set tok "\[" + return 1 + } + array-space - array-syntax { + #nested array? + set_tokenType "startarray" + set tok "\[" + return 1 + #error "state: array-space. startarray case not implemented [tomlish::parse::report_line]" + } + table-space { + #table name + #assume it's a single bracket - but we need to wait for non-bracket to confirm it's not a tablearray + #note that a starttablearrayname token may contain whitespace between the brackets + # e.g \[ \[ + set_tokenType "starttablename" + set tok "" ;#there is no output into the tomlish list for this token + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\[" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\[" + } + itable-space { + #handle state just to give specific error msg + error "tomlish state: '$state'. Left square bracket invalid. Cannot start array in inline table without key. Use key=\[\] syntax. [tomlish::parse::report_line]" + } + default { + error "tomlish state: '$state'. startarray case not implemented [tomlish::parse::report_line]" + } + } + } + } + rb { + #right square bracket + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + #???? + incr i -1 + if {$had_slash} {incr i -1} ;#reprocess + return 1 + } else { + incr i -1 + if {$had_slash} {incr i -1} ;#reprocess + return 1 + } + } + starttablename { + #toml-test invalid/table/empty + + set_token_waiting type tablename value "" complete 1 startindex $cindex + incr i -1 + return 1 + } + tablename { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + append tok "\\]" + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + set_token_waiting type endtablename value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "]" + } + } + } + tablearrayname { + #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token + if {$had_slash} { + #resultant tablename may be invalid - but leave for datastructure loading stage to catch + append tok "\\]" + } else { + if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { + set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex + return 1 + } else { + #we appear to still be in single or double quoted section + append tok "]" + } + } + } + default { + incr i -1 + return 1 + } + } + } else { + #$slash_active not relevant when no tokenType + switch -exact -- $state { + array-syntax - array-space { + #invalid - but allow parser statemachine to report it. + set_tokenType "endarray" + set tok "\]" + return 1 + } + table-space { + #invalid - but allow parser statemachine to report it. ? + set_tokenType "endarray" + set tok "\]" + return 1 + } + tablename-state { + #e.g [] - empty tablename + #tomltest 1.1.0 invalid/table/empty + #should be invalid + #we parse it and let dict::from_tomlish error when it tries to split table + + set_tokenType "endtablename" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-state { + error "tomlish unexpected tablearrayname problem" + set_tokenType "endtablearray" + set tok "" ;#no output into the tomlish list for this token + return 1 + } + tablearrayname-tail { + #[[xxx] + set_tokenType "endtablename" + #sequence: starttablename -> starttablearrayname -> endtablearrayname -> endtablename + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok "\]" + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\]" + } + default { + error "tomlish state '$state'. endarray case not implemented [tomlish::parse::report_line]" + } + } + } + } + bsl { + #backslash + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + if {$state eq "multistring-space"} { + #end whitespace token + incr i -1 ;#reprocess bsl in next run + return 1 + } else { + error "tomlish Unexpected backslash during whitespace. [tomlish::parse::report_line]" + } + } + literal - literalpart - squotedkey { + #never need to set slash_active true when in single quoted tokens + append tok "\\" + set slash_active 0 + } + string - dquotedkey - comment { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + stringpart { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + starttablename - starttablearrayname { + error "tomlish backslash is invalid as first character of $tokenType [tomlish::parse::report_line]" + } + tablename - tablearrayname { + if {$slash_active} { + set slash_active 0 + append tok "\\\\" + } else { + set slash_active 1 + } + } + barekey { + error "tomlish Unexpected backslash during barekey. [tomlish::parse::report_line]" + } + default { + error "tomlish Backslash unexpected during tokentype: '$tokenType'. [tomlish::parse::report_line]" + } + } + } else { + switch -exact -- $state { + multistring-space { + if {$slash_active} { + set_tokenType "stringpart" + set tok "\\\\" + set slash_active 0 + } else { + set slash_active 1 + } + } + multiliteral-space { + #nothing can be escaped in multiliteral-space - not even squotes (?) review + set_tokenType "literalpart" + set tok "\\" + } + default { + error "tomlish tok error: Unexpected backslash when no token is active. [tomlish::parse::report_line]" + } + } + } + } + sq { + #single quote + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote { + #for within multiliteral + #short tentative_accum_squote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_squote is multisquote + 2 = 5 + #return tok with value ''''' + return 1 + } + } + tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multiliteral + #switch? + append tok $c + set_tokenType triple_squote + return 1 + } + default { + #if there are more than 3 leading squotes we also enter multiliteral space and the subsequent ones are handled + #by the tentative_accum_squote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 squotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" + } + } + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + #end whitespace + incr i -1 ;#reprocess sq + return 1 + } + literal { + #slash_active always false + #terminate the literal + set_token_waiting type endsquote value "'" complete 1 startindex $cindex + return 1 + } + literalpart { + #ended by ''' - but final could be '''' or ''''' (up to 2 squotes allowed directly before ending triple squote sequence) + #todo + # idea: end this literalpart (possibly 'temporarily') + # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack + # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) + incr i -1 ;#throw the "'" back to loop - will be added to a tentative_accum_squote token for later processing + return 1 + } + XXXitablesquotedkey { + set_token_waiting type endsquote value "'" complete 1 startindex $cindex + return 1 + } + squotedkey { + ### + #set_token_waiting type endsquote value "'" complete 1 + return 1 + } + starttablename - starttablearrayname { + #!!! + incr i -1 + return 1 + } + tablename - tablearrayname { + append tok $c + } + barekey { + #barekeys now support all sorts of unicode letter/number chars for other cultures + #but not punctuation - not even for those of Irish heritage who don't object + #to the anglicised form of some names. + # o'shenanigan seems to not be a legal barekey + #The Irish will have to use an earlier form Ó - which apparently many may prefer anyway. + error "tomlish Unexpected single quote during barekey. [tomlish::parse::report_line]" + } + default { + append tok $c + } + } + } else { + switch -exact -- $state { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading squote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_squote token or triple_squote token + #It currently doesn't trigger double_squote token + #(handle '' same as 'x' ie produce a single_squote and go into processing literal) + #review - producing double_squote for empty literal may be slightly more efficient. + #This token is not used to handle squote sequences *within* a multiliteral + set_tokenType "_start_squote_sequence" + set tok "'" + } + multiliteral-space { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_squote" ;#trigger tentative_accum_squote + set tok "'" + return 1 + } + table-space - itable-space { + #tests: squotedkey.test squotedkey_itable.test + set_tokenType "squotedkey" + set tok "" + } + XXXtable-space - XXXitable-space { + #future - could there be multiline keys? MLLKEY, MLBKEY ? + #this would (almost) allow arbitrary tcl dicts to be stored in toml (aside from escaping issues) + #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files + #@2025 ABNF for toml mentions key, simple-key, unquoted-key, quoted-key and dotted-key + #where key is simple-key or dotted-key - no MLL or MLB components + #the spec states solution for arbitrary binary data is application specific involving encodings + #such as hex, base64 + set_tokenType "_start_squote_sequence" + set tok "'" + return 1 + } + tablename-state { + #first char in tablename-state/tablearrayname-state + set_tokenType "tablename" + append tok "'" + } + tablearrayname-state { + set_tokenType "tablearrayname" + append tok "'" + } + literal-state { + #shouldn't get here? review + tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" + set_tokenType "literal" + incr -1 + return 1 + } + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "," + #error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" + } + dottedkey-space { + set_tokenType "squotedkey" + } + default { + error "tomlish unhandled squote during state '$state'. [tomlish::parse::report_line]" + } + } + } + + } + dq { + #double quote + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #review + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + tentative_accum_dquote { + #within multistring + #short tentative_accum_dquote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_dquote is multidquote + 2 = 5 + #return tok with value """"" + return 1 + } + } + _start_dquote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multistring + #switch? + append tok $c + set_tokenType triple_dquote + return 1 + } + default { + #if there are more than 3 leading dquotes we also enter multistring space and the subsequent ones are handled + #by the tentative_accum_dquote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 dquotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_dquote_sequence'" + } + } + } + literal - literalpart { + append tok $c + } + string { + if {$had_slash} { + append tok "\\" $c + } else { + #unescaped quote always terminates a string + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex + return 1 + } + } + stringpart { + #sub element of multistring + if {$had_slash} { + append tok "\\" $c + } else { + incr i -1 ;#throw the {"} back to loop - will be added to a tentative_accum_dquote token for later processing + return 1 + } + } + whitespace { + #assert: had_slash will only ever be true in multistring-space + if {$had_slash} { + incr i -2 + return 1 + } else { + #end whitespace token - throw dq back for reprocessing + incr i -1 + return 1 + } + } + comment { + if {$had_slash} {append tok "\\"} + append tok $c + } + XXXdquotedkey { + if {$had_slash} { + append tok "\\" + append tok $c + } else { + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex + return 1 + } + } + dquotedkey { + ### + if {$had_slash} { + append tok "\\" + append tok $c + } else { + #set_token_waiting type enddquote value {"} complete 1 + return 1 + } + } + squotedkey { + append tok $c + } + tablename - tablearrayname { + if {$had_slash} {append tok "\\"} + append tok $c + } + starttablename - starttablearrayname { + incr i -1 ;## + return 1 + } + default { + error "tomlish got quote during tokenType '$tokenType' [tomlish::parse::report_line]" + } + } + } else { + #$slash_active not relevant when no tokenType + #token is string only if we're expecting a value at this point + switch -exact -- $state { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading dquote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_dquote token or triple_dquote token + #It currently doesn't trigger double_dquote token + #(handle "" same as "x" ie produce a single_dquote and go into processing string) + #review - producing double_dquote for empty string may be slightly more efficient. + #This token is not used to handle dquote sequences once *within* a multistring + set_tokenType "_start_dquote_sequence" + set tok {"} + } + multistring-space { + if {$had_slash} { + set_tokenType "stringpart" + set tok "\\\"" + } else { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_dquote" ;#trigger tentative_accum_dquote + set tok {"} + return 1 + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok "\"" + } + table-space - itable-space { + set_tokenType "dquotedkey" + set tok "" + } + dottedkey-space { + set_tokenType dquotedkey + set tok "" + + #only if complex keys become a thing + #set_tokenType dquote_seq_begin + #set tok $c + } + tablename-state { + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } + default { + error "tomlish Unexpected dquote during state '$state' [tomlish::parse::report_line]" + } + } + } + } + = { + set had_slash $slash_active + set slash_active 0 + + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #review + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart - squotedkey { + #assertion had_slash 0 + append tok $c + } + string - comment - dquotedkey { + #for these tokenTypes an = is just data. + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + incr i -1 + return 1 + } else { + set_token_waiting type equal value = complete 1 startindex $cindex + return 1 + } + } + barekey { + #set_token_waiting type equal value = complete 1 + incr i -1 + return 1 + } + starttablename - starttablearrayname { + error "tomlish Character '=' is invalid first character for $tokenType. [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #invalid in bare name - but valid in quoted parts - leave for tablename parser to sort out + append tok $c + } + default { + error "tomlish unexpected = character during tokentype $tokenType. case not implemented. [tomlish::parse::report_line]" + } + } + } else { + switch -exact -- $state { + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} { + append tok "\\" + } + append tok = + } + multiliteral-space { + set_tokenType "literalpart" + set tok "=" + } + dottedkey-space { + set_tokenType "equal" + set tok "=" + return 1 + } + default { + set_tokenType "equal" + set tok = + return 1 + } + } + } + } + cr { + #REVIEW! + # \r carriage return + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #we have received a double cr + ::tomlish::log::warn "double cr - will generate cr token. needs testing" + set_tokenType "cr" ;#lone cr token will generally raise an error - but let state machine handle it + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal { + append tok $c + } + literalpart { + #part of MLL string (multi-line literal string) + #we need to split out crlf as a separate NEWLINE to be consistent + ::tomlish::log::warn "literalpart ended by cr - needs testing" + #return literalpart temporarily - allow cr to be reprocessed from multiliteral-space + incr i -1 + return 1 + } + stringpart { + #stringpart is a part of MLB string (multi-line basic string) + #throw back the cr - if followed by lf it will become a {NEWLINE crlf} entry within the MULTISTRING list (e.g between STRINGPART entries) + incr i -1 + return 1 + } + starttablename - starttablearrayname { + error "tomlish Character is invalid first character for $tokenType. [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #could in theory be valid in quoted part of name + #review - might be better just to disallow here + append tok $c + } + whitespace { + #it should technically be part of whitespace if not followed by lf + #but outside of values we are also free to map it to be another NEWLINE instead? REVIEW + incr i -1 + return 1 + } + untyped_value { + incr i -1 + return 1 + } + comment { + #JJJJ + #review + incr i -1 + return 1 + } + default { + #!todo - error out if cr inappropriate for tokenType + append tok $c + } + } + } else { + #lf may be appended if next + #review - lone cr as newline? - this is uncommon - but so is lone cr in a string(?) + set_tokenType "newline" + set tok cr + } + } + lf { + # \n newline + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #review + #this lf is the trailing part of a crlf + append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + #multiliteral or multistring + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal { + #nl is not allowed *within* a literal - require multiliteral syntax for any literal containing a newline ''' ''' + #even though we terminate the literal without the closing quote here - the token_waiting newline should trigger a state error + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + literalpart { + #we allow newlines - but store them within the multiliteral as their own element + #This is a legitimate end to the literalpart - but not the whole multiliteral + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + stringpart { + if {$had_slash} { + #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) + set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] + incr i -1 + return 1 + } else { + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + } + starttablename - tablename - tablearrayname - starttablearrayname { + error "tomlish Character is invalid in $tokenType. [tomlish::parse::report_line]" + } + default { + #newline ends all other tokens. + #note for string: we don't add (raw unescaped) newline to simple string. (must use multi-string for this) + #note for whitespace: + # we will use the convention that \n terminates the current whitespace even if whitespace follows + # ie whitespace is split into separate whitespace tokens at each newline + + #puts "-------------- newline lf during tokenType $tokenType" + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 + } + } + } else { + switch -exact -- $state { + multistring-space { + if {$had_slash} { + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 + } else { + set_tokenType "newline" + set tok lf + return 1 + } + } + multiliteral-space { + #assert had_slash 0 + set_tokenType "newline" + set tok "lf" + return 1 + } + default { + #ignore slash? error? + set_tokenType "newline" + set tok lf + return 1 + } + } + #if {$had_slash} { + # #CONT directly before newline - allows strings_5_byteequivalent test to pass + # set_tokenType "continuation" + # set tok "\\" + # incr i -1 + # return 1 + #} else { + # set_tokenType newline + # set tok lf + # return 1 + #} + } + } + , { + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + comment - tablename - tablearrayname { + if {$had_slash} {append tok "\\"} + append tok , + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + #stringpart can have up to 2 quotes too + if {$had_slash} {append tok "\\"} + append tok $c + } + literal - literalpart - squotedkey { + #assert had_slash always 0 + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + incr i -1 + return 1 + } else { + set_token_waiting type comma value "," complete 1 startindex $cindex + return 1 + } + } + default { + set_token_waiting type comma value "," complete 1 startindex $cindex + if {$had_slash} {append tok "\\"} + return 1 + } + } + } else { + switch -exact -- $state { + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "," + } + multiliteral-space { + #assert had_slash 0 + set_tokenType "literalpart" + set tok "," + } + default { + set_tokenType "comma" + set tok "," + return 1 + } + } + } + } + . { + set had_slash $slash_active + set slash_active 0 + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + comment - untyped_value { + if {$had_slash} {append tok "\\"} + append tok $c + } + string - dquotedkey { + if {$had_slash} {append tok "\\"} + append tok $c + } + stringpart { + if {$had_slash} {append tok "\\"} + append tok $c + } + literal - literalpart - squotedkey { + #assert had_slash always 0 + append tok $c + } + whitespace { + switch -exact -- $state { + multistring-space { + #review + if {$had_slash} { + incr i -2 + } else { + incr i -1 + } + return 1 + } + xxxdottedkey-space { + incr i -1 + return 1 + } + dottedkey-space-tail { + incr i -1 + return 1 + } + default { + error "tomlish Received period during tokenType 'whitespace' [tomlish::parse::report_line]" + } + } + } + starttablename - starttablearrayname { + #This would correspond to an empty table name + error "tomlish Character '.' is not allowed as first character ($tokenType). [tomlish::parse::report_line]" + } + tablename - tablearrayname { + #subtable - split later - review + append tok $c + } + barekey { + #e.g x.y = 1 + #we need to transition the barekey to become a structured table name ??? review + #x is the tablename y is the key + set_token_waiting type dotsep value "." complete 1 startindex $cindex + return 1 + } + default { + error "tomlish Received period during tokenType '$tokenType' [tomlish::parse::report_line]" + #set_token_waiting type period value . complete 1 + #return 1 + } + } + } else { + switch -exact -- $state { + multistring-space { + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "." + } + multiliteral-space { + set_tokenType "literalpart" + set tok "." + } + XXXdottedkey-space { + ### obs? + set_tokenType "dotsep" + set tok "." + return 1 + } + dottedkey-space-tail { + ### + set_tokenType "dotsep" + set tok "." + return 1 + } + default { + set_tokenType "untyped_value" + set tok "." + } + } + } + + } + " " - tab { + if {[tcl::string::length $tokenType]} { + set had_slash $slash_active + set slash_active 0 + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + barekey { + #todo had_slash - emit token or error + #whitespace is a terminator for bare keys + #set_token_waiting type whitespace value $c complete 1 + incr i -1 + return 1 + } + untyped_value { + #unquoted values (int,date,float etc) are terminated by whitespace + #set_token_waiting type whitespace value $c complete 1 + incr i -1 + return 1 + } + comment { + if {$had_slash} { + append tok "\\" + } + append tok $c + } + string - dquotedkey { + if {$had_slash} { append tok "\\" } + append tok $c + } + stringpart { + #for stringpart we store WS separately for ease of processing continuations (CONT stripping) + if {$had_slash} { + #REVIEW + #emit the stringpart - go back to the slash + incr i -2 + return 1 + } else { + #split into STRINGPART xxx WS " " + incr i -1 + return 1 + } + } + literal - literalpart - squotedkey { + append tok $c + } + whitespace { + if {$state eq "multistring-space"} { + append tok $c + } else { + append tok $c + } + } + starttablename - starttablearrayname { + incr i -1 + return 1 + } + tablename - tablearrayname { + #include whitespace in the tablename/tablearrayname + #Will need to be normalized upon interpreting the tomlish as a datastructure + append tok $c + } + default { + error "tomlish Received whitespace space during tokenType '$tokenType' [tomlish::parse::report_line]" + } + } + } else { + set had_slash $slash_active + set slash_active 0 + switch -exact -- $state { + tablename-state { + #tablename can have leading,trailing and interspersed whitespace! + #These will not be treated as whitespace tokens, instead forming part of the name. + set_tokenType tablename + set tok "" + if {$had_slash} {append tok "\\"} + append tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok "" + if {$had_slash} {append tok "\\"} + append tok $c + } + multistring-space { + if {$had_slash} { + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 + } else { + set_tokenType "whitespace" + append tok $c + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + default { + if {$had_slash} { + error "tomlish unexpected backslash [tomlish::parse::report_line]" + } + set_tokenType "whitespace" + append tok $c + } + } + } + } + tabX { + if {[tcl::string::length $tokenType]} { + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) + set slash_active 0 + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + barekey { + #whitespace is a terminator for bare keys + incr i -1 + #set_token_waiting type whitespace value $c complete 1 + return 1 + } + untyped_value { + #unquoted values (int,date,float etc) are terminated by whitespace + #set_token_waiting type whitespace value $c complete 1 + incr i -1 + return 1 + } + squotedkey { + append tok $c + } + dquotedkey - string - comment - whitespace { + #REVIEW + append tok $c + } + stringpart { + #for stringpart we store WS separately for ease of processing continuations (CONT stripping) + if {$had_slash} { + #REVIEW + #emit the stringpart - go back to the slash + incr i -2 + return 1 + } else { + #split into STRINGPART aaa WS " " + incr i -1 + return 1 + } + } + literal - literalpart { + append tok $c + } + starttablename - starttablearrayname { + incr i -1 + return 1 + } + tablename - tablearrayname { + #include whitespace in the tablename/tablearrayname + #Will need to be normalized upon interpreting the tomlish as a datastructure + append tok $c + } + default { + error "tomlish Received whitespace tab during tokenType '$tokenType' [tomlish::parse::report_line]" + } + } + } else { + set had_slash $slash_active + if {$slash_active} { + set slash_active 0 + } + switch -exact -- $state { + tablename-state { + #tablename can have leading,trailing and interspersed whitespace! + #These will not be treated as whitespace tokens, instead forming part of the name. + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } + multistring-space { + if {$had_slash} { + set_tokenType "continuation" + set tok "\\" + incr i -1 + return 1 + } else { + set_tokenType whitespace + append tok $c + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + default { + set_tokenType "whitespace" + append tok $c + } + } + } + } + bom { + #bom encoded as single unicode codepoint \uFFEF + #BOM (Byte Order Mark) - ignored by token consumer + if {[tcl::string::length $tokenType]} { + switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + #assert - tok will be one or two squotes only + #A toml literal probably isn't allowed to contain this + #but we will parse and let the validator sort it out. + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + literal - literalpart { + append tok $c + } + string - stringpart { + append tok $c + } + default { + #state machine will generally not have entry to accept bom - let it crash + set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex + return 1 + } + } + } else { + switch -exact -- $state { + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + multistring-space { + set_tokenType "stringpart" + set tok $c + } + default { + set_tokenType "bom" + set tok "\uFEFF" + return 1 + } + } + } + } + default { + + if {[tcl::string::length $tokenType]} { + if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. + set slash_active 0 + switch -exact -- $tokenType { + newline { + #incomplete newline + set_tokenType "cr" + incr i -1 + return 1 + } + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } + _start_squote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + if {$state eq "multistring-space"} { + incr i -1 + return 1 + } else { + #review + incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. + return 1 + } + } + barekey { + if {[tomlish::utils::is_barekey $c]} { + append tok $c + } else { + error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed. (see tomlish::utils::is_barekey) [tomlish::parse::report_line]" + } + } + starttablename - starttablearrayname { + incr i -1 + #allow statemachine to set context for subsequent chars + return 1 + } + string - stringpart { + append tok $c + } + default { + #e.g comment/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname + append tok $c + } + } + } else { + set had_slash $slash_active + set slash_active 0 + switch -exact -- $state { + table-space - itable-space { + #if no currently active token - assume another key value pair + if {[tomlish::utils::is_barekey $c]} { + set_tokenType "barekey" + append tok $c + } else { + error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" + } + } + multistring-space { + set_tokenType "stringpart" + if {$had_slash} { + set tok \\$c + } else { + set tok $c + } + } + multiliteral-space { + set_tokenType "literalpart" + set tok $c + } + tablename-state { + set_tokenType "tablename" + set tok $c + } + tablearrayname-state { + set_tokenType "tablearrayname" + set tok $c + } + dottedkey-space { + set_tokenType barekey + set tok $c + } + default { + #todo - something like ansistring VIEW to show control chars? + set cshow [string map [list \t tab \v vt] $c] + tomlish::log::debug "- tokloop char '$cshow' setting to untyped_value while state:$state [tomlish::parse::report_line]" + set_tokenType "untyped_value" + set tok $c + } + } + } + } + } + + } + + #run out of characters (eof) + if {[tcl::string::length $tokenType]} { + #check for invalid ending tokens + #if {$state eq "err-state"} { + # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" + #} + switch -exact -- $tokenType { + _start_squote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open literal + error "tomlish eof reached without closing single quote for string literal. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "literal" + set tok "" + return 1 + + ##review + #set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] + #set_tokenType "literal" + #set tok "" + #return 1 + } + } + } + _start_dquote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open string + error "tomlish eof reached without closing double quote for string. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "string" + set tok "" + return 1 + } + } + } + newline { + #The only newline token that has still not been returned should have a tok value of "cr" + puts "tomlish eof reached - with incomplete newline token '$tok'" + if {$tok eq "cr"} { + #we convert lone cr to it's own "cr" token elsewhere in the document to allow statemachine to handle it. + #(which it should generally do by not handling it ie raising an error - or emitting an ERROR list in the tomlish) + #if trailing char is a lone cr - we should encode it the same way as elsewhere that is outside of values + # ie as it's own token. + switch_tokenType "cr" + return 1 + } else { + #should be unreachable + error "tomlish eof reached - with invalid newline token. value: $tok" + } + } + } + set_token_waiting type eof value eof complete 1 startindex $i ;#review + return 1 + } else { + ::tomlish::log::debug "- No current tokenType, ran out of characters, setting tokenType to 'eof' [tomlish::parse::report_line]" + set tokenType "eof" + set tok "eof" + } + return 0 + } + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::parse ---}] +} +namespace eval tomlish::huddle { + proc from_json {json} { + package require huddle + package require huddle::json + #note - huddle may now contain raw surrogate pair - which cannot be emitted to stdout + set h [huddle::json::json2huddle parse $json] + } + proc from_dict {d} { + + } + + #raw - strings must already be processed into values suitable for json e.g surrogate pair escaping + proc jsondumpraw {huddle_object {offset " "} {newline "\n"} {begin ""}} { + upvar ::huddle::types types + set nextoff "$begin$offset" + set nlof "$newline$nextoff" + set sp " " + if {[string equal $offset ""]} {set sp ""} + + set type [huddle type $huddle_object] + + switch -- $type { + boolean - + number { + return [huddle get_stripped $huddle_object] + } + null { + return null + } + string { + set data [huddle get_stripped $huddle_object] + + # JSON permits only oneline string + #set data [string map { + # \n \\n + # \t \\t + # \r \\r + # \b \\b + # \f \\f + # \\ \\\\ + # \" \\\" + # / \\/ + # } $data + #] + return "\"$data\"" + } + list { + set inner {} + set len [huddle llength $huddle_object] + for {set i 0} {$i < $len} {incr i} { + set subobject [huddle get $huddle_object $i] + lappend inner [jsondumpraw $subobject $offset $newline $nextoff] + } + if {[llength $inner] == 1} { + return "\[[lindex $inner 0]\]" + } + return "\[$nlof[join $inner ,$nlof]$newline$begin\]" + } + dict { + set inner {} + foreach {key} [huddle keys $huddle_object] { + lappend inner [subst {"$key":$sp[jsondumpraw [huddle get $huddle_object $key] $offset $newline $nextoff]}] + } + #if {[llength $inner] == 1} { + # return $inner ;#wrong - breaks with quoted list representation + # #FAILS: toml-test valid/comment/tricky + #} + + return "\{$nlof[join $inner ,$nlof]$newline$begin\}" + } + default { + set node [unwrap $huddle_object] + #foreach {tag src} $node break + lassign $node tag src + return [$types(callback:$tag) jsondumpraw $huddle_object $offset $newline $nextoff] + } + } + } +} + +#typed as per toml-test types +namespace eval tomlish::typedhuddle { + proc from_json {json} { + set plainhuddle [tomlish::huddle::from_json $json] + + error "tomlish::typedhuddle::from_json unimplemented" + } + proc from_dict {d} { + package require huddle + set h [huddle create] + if {[tomlish::dict::is_typeval $d]} { + set dtype [dict get $d type] + switch -- $dtype { + ARRAY { + #error "typedhuddle::from_dict ARRAY not yet handled" + set h_list [huddle list] + set elements [dict get $d value] + foreach el $elements { + set sub [from_dict $el] + huddle append h_list $sub + } + return $h_list + } + default { + set tinfo [tomlish::dict::convert_typeval_to_tomltest $d] + #basic non-container types + set h_tdict [huddle create] + huddle set h_tdict type [huddle string [dict get $tinfo type]] + huddle set h_tdict value [huddle string [dict get $tinfo value]] + return $h_tdict + } + } + } else { + dict for {dictkey dictval} $d { + set jsonkey [tomlish::utils::rawstring_to_jsonstring $dictkey] + if {[tomlish::dict::is_typeval $dictval]} { + set dtype [dict get $dictval type] + switch -- $dtype { + ARRAY { + #error "typedhuddle::from_dict ARRAY not yet handled" + set h_next [huddle list] + set elements [dict get $dictval value] + foreach el $elements { + set sub [from_dict $el] + huddle append h_next $sub + } + } + default { + set tinfo [tomlish::dict::convert_typeval_to_tomltest $dictval] + set tp [dict get $tinfo type] + #basic non-container types + set h_next [huddle create] ;#dict + huddle set h_next type [huddle string [dict get $tinfo type]] + huddle set h_next value [huddle string [dict get $tinfo value]] + } + } + huddle set h $jsonkey $h_next + } else { + #dict + set sub [from_dict $dictval] + huddle set h $jsonkey $sub + } + } + } + return $h + } + proc is_typeval {huddled} { + set htype [huddle type $huddled] + if {$htype ne "dict"} { + return 0 + } + if {[huddle keys $huddled] ne {type value}} { + return 0 + } + set tp [huddle type $huddled type] + switch -- $tp { + string - integer - float - bool - datetime - datetime-local - date-local - time-local { + return 1 + } + } + return 0 + } + + #direction from typed json towards toml + proc convert_typeval_to_tomlish {huddled} { + set htype [huddle get_stripped $huddled type] + set hval [huddle get_stripped $huddled value] + switch -- $htype { + string { + #we need to decide here the type of string element to use in toml/tomlish + #STRING,MULTISTRING,LITERAL,MULTILITERAL + #set unesc [tomlish::utils::unescape_jsonstring $hval] ;#no need - json parser unescaped when creating the huddle + set unesc $hval + #(huddle::json::json2huddle parse $json) + #since it was unescaped any backslashes remaining represent themselves - reapply escape - REVIEW + #set hval [string map [list \\ \\\ ] $hval] + #JSJS + if {[string first \n $unesc] >= 0} { + #always use a MULTI + if {[string first ' $unesc] >=0} { + if {[string first ''' $unesc] >=0} { + set dtype MULTISTRING + } else { + set dtype MULTILITERAL + } + } else { + if {[string first \"\"\" $unesc] >=0} { + set dtype MULTILITERAL + } else { + set dtype MULTISTRING + } + } + } else { + #use multi if needed? + if {[string first '' $hval] >=0} { + if {[string first ''' $unesc] >=0} { + set dtype STRING + } else { + set dtype MULTILITERAL + } + } elseif {[string first ' $unesc] >= 0} { + set dtype STRING + } elseif {[string first \"\"\" $unesc] >= 0} { + set dtype LITERAL + } else { + #STRING or LITERAL? + set dtype STRING + } + } + + } + datetime - bool { + set dtype [string toupper $htype] + } + float { + set dtype FLOAT + if {[string is integer -strict $hval]} { + #json FLOAT specified as integer - must have dot for toml + set hval [expr {double($hval)}] + } + } + integer { + set dtype INT + } + datetime - datetime-local - date-local - time-local { + #DDDD + #set dtype DATETIME + set dtype [string toupper $htype] + } + default { + error "tomlish::typedhuddle::convert_typeval_to_tomlish unrecognised type $htype" + } + } + return [list type $dtype value $hval] + } + +} +namespace eval tomlish::toml { + proc from_binary {bindata} { + set bom "" + set b12 [tcl::string::range $bindata 0 1] + set b12test [string map [list \xEF\xBB utf8_12 \xFE\xFF bom16be \xFF\xFE utf32le_12 \x00\x00 utf32be_12] $b12] + switch -- $b12test { + bom16be { + #FEFF + set bom utf-16be + } + utf32le_12 { + #FFFE + set b34 [tcl::string::range $bindata 2 3] + if {$b34 eq "\x00\x00"} { + set bom utf-32le + } else { + set bom utf-16le + } + } + utf32be_12 { + #0000 + set b34 [tcl::string::range $bindata 2 3] + if {$b34 eq "\xFE\xFF"} { + set bom utf-32be + } + } + utf8_12 { + set b3 [tcl::string::index $bindata 2] + if {$b3 eq "\xBF"} { + set bom utf-8 + } + } + } + if {$bom eq ""} { + #no bom - assume utf8 - but we read in as binary + #if data wasn't actually utf8 we may error here depending on content - or we may just get wrongly encoded chars + set tomldata [encoding convertfrom utf-8 $bindata] + } elseif {$bom eq "utf-8"} { + #utf-8 bom read in as binary + set tomldata [encoding convertfrom utf-8 $bindata] + #bom now encoded as single unicode char \uFFEF + } else { + return -code error -errorcode {TOML ENCODING NOTUTF8} "Input not UTF8 encoded according to BOM. Indicated encoding is '$bom' - invalid for toml" + } + return $tomldata + } + proc from_tomlish {tomlish} { + return [tomlish::encode::tomlish $tomlish] + } + + #todo - rename to taggedjson + proc from_tomlish_from_dict_from_typedjson {json} { + set d [tomlish::dict::from_typedjson $json] + from_tomlish [tomlish::from_dict $d] ;#return tomlish + } + + proc tablename_split {tablename {normalize false}} { + #we can't just split on . because we have to handle quoted segments which may contain a dot. + #eg {dog."tater.man"} + if {$tablename eq ""} { + error "tablename_split. No table name segments found. empty tablename" + } + set sLen [tcl::string::length $tablename] + set segments [list] + set mode "preval" ;#5 modes: preval, quoted,litquoted, unquoted, postval + #quoted is for double-quotes, litquoted is for single-quotes (string literal) + set seg "" + for {set i 0} {$i < $sLen} {incr i} { + + if {$i > 0} { + set lastChar [tcl::string::index $tablename [expr {$i - 1}]] + } else { + set lastChar "" + } + + #todo - track\count backslashes properly + + set c [tcl::string::index $tablename $i] + if {$c eq "\""} { + if {($lastChar eq "\\")} { + #not strictly correct - we could have had an even number prior-backslash sequence + #the toml spec would have us error out immediately on bsl in bad location - but we're + #trying to parse to unvalidated tomlish + set ctest escq + } else { + set ctest dq + } + } else { + set ctest [string map [list " " sp \t tab] $c] + } + + switch -- $ctest { + . { + switch -exact -- $mode { + preval { + error "tablename_split. dot not allowed - expecting a value" + } + unquoted { + #dot marks end of segment. + if {![tomlish::utils::is_barekey $seg]} { + error "tablename_split. unquoted key segment $seg is not a valid toml key" + } + lappend segments $seg + set seg "" + set mode "preval" + } + quoted { + append seg $c + } + litquoted { + append seg $c + } + postval { + #got dot in an expected location + set mode "preval" + } + } + } + dq { + #unescaped dquote + switch -- $mode { + preval { + set mode "quoted" + set seg "\"" + } + unquoted { + #invalid in barekey - but we are after structure only + append seg $c + } + quoted { + append seg $c + #JJJJ + if {$normalize} { + lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" ;#make sure we only accept a dot or end-of-data now. + } + litquoted { + append seg $c + } + postval { + error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" + } + } + } + ' { + switch -- $mode { + preval { + append seg $c + set mode "litquoted" + } + unquoted { + #single quote inside e.g o'neill - ultimately invalid - but we pass through here. + append seg $c + } + quoted { + append seg $c + } + litquoted { + append seg $c + #no normalization to do aside from stripping squotes + if {$normalize} { + lappend segments [tcl::string::range $seg 1 end-1] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" + } + postval { + error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" + } + } + } + sp - tab { + switch -- $mode { + preval - postval { + #ignore + } + unquoted { + #terminates a barekey + lappend segments $seg + set seg "" + set mode "postval" + } + default { + #append to quoted or litquoted + append seg $c + } + } + } + default { + switch -- $mode { + preval { + set mode unquoted + append seg $c + } + postval { + error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" + } + default { + append seg $c + } + } + } + } + + if {$i == $sLen-1} { + #end of data + ::tomlish::log::debug "End of data: mode='$mode'" + switch -exact -- $mode { + preval { + if {[llength $segments]} { + error "tablename_split. Expected a value after last dot separator. tablename: '$tablename'" + } else { + error "tablename_split. Whitespace only? No table name segments found. tablename: '$tablename'" + } + } + unquoted { + if {![tomlish::utils::is_barekey $seg]} { + #e.g toml-test invalid/table/with-pound required to fail for invalid barekey + error "tablename_split. unquoted key segment $seg is not a valid toml key" + } + lappend segments $seg + } + quoted { + error "tablename_split. Expected a trailing double quote. tablename: '$tablename'" + } + litquoted { + error "tablename_split. Expected a trailing single quote. tablename: '$tablename'" + } + postval { + #ok - segment already lappended + } + } + } + } + + #note - we must allow 'empty' quoted strings '' & "" + # (these are 'discouraged' but valid toml keys) + + return $segments + } + + #tablenames (& tablearraynames) may contain irrelevant leading, trailing and interspersed whitespace + # tablenames can be made up of segments delimited by dots. .eg [ a.b . c ] + #trimmed, the tablename becomes {a.b.c} + # A segment may contain whitespace if it is quoted e.g [a . b . "c etc " ] + #ie whitespace is only irrelevant if it's outside a quoted segment + #trimmed, the tablename becomes {a.b."c etc "} + proc tablename_trim {tablename} { + set segments [tomlish::toml::tablename_split $tablename false] + set trimmed_segments [list] + foreach seg $segments { + lappend trimmed_segments [::string trim $seg " \t"] + } + return [join $trimmed_segments .] + } +} + +namespace eval tomlish::dict { + namespace export {[a-z]*}; # Convention: export all lowercase + namespace path [namespace parent] + + #from_taggedjson + proc from_typedjson {json} { + package require huddle + package require huddle::json + set h [huddle::json::json2huddle parse $json] + #json2huddle parse unescapes the basic json escapes \n \\ etc + #$h could now contain raw form of surrogate pair (json2huddle parse as at 2025-014 doesn't convert the surrogates - just unescapes?) + if {[catch {encoding convertto utf-8 $h} errM]} { + #This test suggests we have raw surrogate pairs - REVIEW + package require punk::cesu + set h [punk::cesu::from_surrogatestring $h] + } + tomlish::dict::from_typedhuddle $h + } + proc from_typedhuddle {h} { + set resultd [dict create] + switch -- [huddle type $h] { + dict { + foreach k [huddle keys $h] { + switch -- [huddle type $h $k] { + dict { + set huddle_d [huddle get $h $k] + #puts stderr "huddle_d: $huddle_d" + #set v [huddle get_stripped $h $k] + if {[tomlish::typedhuddle::is_typeval $huddle_d]} { + dict set resultd $k [tomlish::typedhuddle::convert_typeval_to_tomlish $huddle_d] + } else { + dict set resultd $k [from_typedhuddle $huddle_d] + } + } + list { + set items [huddle get $h $k] + + set numitems [huddle llength $items] + if {$numitems == 0} { + dict set resultd $k [list type ARRAY value {}] + } else { + set arritems [list] + for {set i 0} {$i < $numitems} {incr i} { + set item [huddle get $items $i] + #puts stderr "item: $item" + #set v [huddle get $item] + if {[tomlish::typedhuddle::is_typeval $item]} { + lappend arritems [tomlish::typedhuddle::convert_typeval_to_tomlish $item] + } else { + lappend arritems [from_typedhuddle $item] + } + } + dict set resultd $k [list type ARRAY value $arritems] + } + } + default { + error "dict_from_json unexpected subtype [huddle type $h $k] in dict" + } + } + } + } + list { + set items [huddle get $h] + set numitems [huddle llength $items] + if {$numitems == 0} { + return [list type ARRAY value {}] + } else { + set arritems [list] + for {set i 0} {$i < $numitems} {incr i} { + set item [huddle get $items $i] + #puts stderr "item: $item" + #set v [huddle get $item] + if {[tomlish::typedhuddle::is_typeval $item]} { + lappend arritems [tomlish::typedhuddle::convert_typeval_to_tomlish $item] + } else { + lappend arritems [from_typedhuddle $item] + } + } + return [list type ARRAY value $arritems] + } + + } + } + return $resultd + } + + proc is_typeval {d} { + #designed to detect {type value } e.g {type INT value 3}, {type STRING value "blah etc"} + #as a sanity check we need to avoid mistaking user data that happens to match same form + #consider x.y={type="spud",value="blah"} + #The value of type will itself have already been converted to {type STRING value spud} ie never a single element. + #check the length of the type as a quick way to see it's a tag - not something else masqerading. + expr {[string is dict $d] && [dict size $d] == 2 && [dict exists $d type] && [dict exists $d value] && [llength [dict get $d type]] == 1} + } + + #simple types only - not containers? + proc convert_typeval_to_tomltest {d} { + set dtype [dict get $d type] + set dval [dict get $d value] + switch -- $dtype { + INT { + set testtype integer + set dval [expr {$dval}] ;#convert e.g 0xDEADBEEF to 3735928559 + } + DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - FLOAT - BOOL { + #DDDD + set testtype [string tolower $dtype] + } + STRING - MULTISTRING { + set testtype string + #JJJJ + set dval [tomlish::utils::unescape_string $dval] + set dval [tomlish::utils::rawstring_to_jsonstring $dval] + } + LITERAL - MULTILITERAL { + set testtype string + #don't validate on way out to json here? + #decoder should validate by calling tomlish::from_dict + #if {![tomlish::utils::rawstring_is_valid_literal $dval]} { + # return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg + #} + set dval [tomlish::utils::rawstring_to_jsonstring $dval] + } + MULTILITERAL { + #todo - escape newlines for json? + set testtype string + } + default { + error "convert_typeval_to_tomltest unhandled type $dtype" + } + } + return [list type $testtype value $dval] + } + + # Check that each leaf is a typeval or typeval dict + #importantly: must accept empty dict leaves e.g {x {}} + proc is_typeval_dict {d {checkarrays 0}} { + if {![string is dict $d]} { + return 0 + } + dict for {k v} $d { + set is_d 0 + if {!([is_typeval $v] || [set is_d [is_typeval_dict $v $checkarrays]])} { + return 0 + } + if {!$is_d} { + set vtype [dict get $v type] + switch -- $vtype { + INT - FLOAT - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - BOOL - LITERAL - STRING - MULTILITERAL - MULTISTRING {} + ARRAY { + if {$checkarrays} { + set arrdata [dict get $v value] + foreach el $arrdata { + if {![is_typeval_dict $el $checkarrays]} { + return 0 + } + } + } + } + default { + puts stderr "is_typeval_dict: Unexpected type '$vtype'" + return 0 + } + } + } + } + return 1 + } + + proc last_tomltype_posn {d} { + set last_simple -1 + set dictposn [expr {[dict size $d] -1}] + foreach k [lreverse [dict keys $d]] { + set dval [dict get $d $k] + if {[is_typeval $dval]} { + set last_simple $dictposn + break + } + incr dictposn -1 + } + return $last_simple + } + + + #review + proc name_from_tablestack {tablestack} { + set name "" + foreach tinfo [lrange $tablestack 1 end] { + lassign $tinfo type namepart + switch -- $type { + T { + if {$name eq ""} { + append name $namepart + } else { + append name .$namepart + } + } + I { + if {$name eq ""} { + append name $namepart + } else { + append name .$namepart + } + } + default { + #end at first break in the leading sequence of T & I tablenames + break + } + } + } + return $name + } + + + #tablenames_info is a flat dict with the key being an '@@' path + proc _show_tablenames {tablenames_info} { + #e.g {@l@a @@b} {ttype header_table tdefined closed} + append msg \n "tablenames_info:" \n + dict for {tkey tinfo} $tablenames_info { + append msg " " "table: $tkey" \n + dict for {field finfo} $tinfo { + append msg " " "$field $finfo" \n + } + } + return $msg + } + + #take a raw string and classify: result is a 2 element list comprised of KEY|SQKEY|DQKEY and the value being the appropriate inner string + proc classify_rawkey {rawval} { + if {![::tomlish::utils::is_barekey $rawval]} { + #requires quoting + # + #Any dot in the key would have been split by dict::from_tomlish - so if it's present here it's part of this key - not a level separator! + # + #we'll use a basic mechanisms for now to determine the type of quoting + # - whether it has any single quotes or not. + # (can't go in an SQKEY) + # - whether it has any chars that require quoting when in a Bstring + # (if so - then its visual representation might be unsuitable for a key in a toml text file, so escape and put in DQKEY instead of literal SQKEY) + #todo - more? + #REVIEW - the backslash might often be in things like a regex or windows path - which is often better expressed in a literal SQKEY + # from literal examples: + # 'c:\Users\nodejs\templates' + # '<\i\c*\s*>' + #If these are in *keys* our basic test will express these as: + # "c:\\Users\\nodejs\\templates" + # "<\\i\\c*\\s*>" + # This still works - but a smarter test might determine when SQKEY is the better form? + #when coming from external systems - can we even know if the value was already escaped? REVIEW + #Probably when coming from json - we know it's already escaped - and so we build our dict converting keys to unescaped + #TODO - clarify in documentation that keys resulting from dict::from_tomlish are in 'normalized' (unescaped) form + # + #For keys - we currently (2025) are only allowed barekeys,basic strings and literal strings. (no multiline forms) + set k_escaped [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $rawval] + if {[string length $k_escaped] != [string length $rawval]} { + #escaping made a difference + set has_escape_requirement 1 + } else { + set has_escape_requirement 0 + } + if {[string first ' $rawval] >=0 || $has_escape_requirement} { + #basic string + # (any ANSI SGR sequence will end up here in escaped form ) + return [list DQKEY $k_escaped] + } else { + #literal string + return [list SQKEY $rawval] + } + } else { + return [list KEY $rawval] + } + } + #the quoting implies the necessary escaping for DQKEYs + proc join_and_quote_rawkey_list {rawkeylist} { + set result "" + foreach rk $rawkeylist { + lassign [tomlish::dict::classify_rawkey $rk] type val + switch -- $type { + SQKEY { + append result "'$val'." + } + DQKEY { + append result "\"$val\"." + } + KEY { + append result "$val." + } + } + } + return [string range $result 0 end-1] + } + + proc _process_tomlish_dottedkey {element {context_refpath {}}} { + upvar tablenames_info tablenames_info + upvar datastructure datastructure + set dottedtables_defined [list] + set dkey_info [tomlish::get_dottedkey_info $element] + #e.g1 keys {x.y y} keys_raw {'x.y' "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) + #e.g2 keys {x.y y} keys_raw {{"x.y"} "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) + + #[a.b] + #t1.t2.dottedtable.leafkey = "val" + #we have already checked supertables a & {a b} + # - in basic case, passed in context_refpath as {@@a @@b} + # - our context_refpath could also include some combination of keys and array indices e.g {@@a @@b 3 @@subtablekey} + #We need to check {a b t1} & {a b t2} ('creation' only) + #and then 'dottedtable' is 'defined' while leafkey is an ordinary key in dottedtable + + #note we also get here as a 'dottedkey' with the following even though there is no dot in k + #[a.b] + #leafkey = "val" + + set all_dotted_keys [dict get $dkey_info keys] + set dottedkeyname [join $all_dotted_keys .] + + if {[llength $all_dotted_keys] > 1} { + #dottedtable.k=1 + #tX.dottedtable.k=1 + #etc + + #Wrap in a list so we can detect 'null' equivalent. + #We can't use empty string as that's a valid dotted key segment + set dottedtable_bag [list [lindex $all_dotted_keys end-1]] + set dotparents [lrange $all_dotted_keys 0 end-2] + } else { + #basic case - not really a 'dotted' key + #k = 1 + set dottedtable_bag [list] ;#empty bag + set dotparents [list] + } + #assert dottedtable_bag only ever holds 0 or 1 elements + set leaf_key [lindex $all_dotted_keys end] + + #see also: https://github.com/toml-lang/toml/issues/846 "Can dotted keys insert into already-defined [tables]?" + #This code was originally written with a misinterpretation of: + #"Dotted keys create and define a table for each key part before the last one, provided that such tables were not previously created." + # 'each key part before the last one' refers to each key in a single dotted key entry + # not each 2nd-to last key in a list of dotted keys. + + + #we've already tested the table/tablearray keys that got us here.. but not the dottedkey segments (if any) prior to dottedtable & leaf_key + set dottedsuper_refpath $context_refpath + foreach normkey $dotparents { + lappend dottedsuper_refpath @@$normkey + if {![dict exists $tablenames_info $dottedsuper_refpath ttype]} { + #supertable with this combined path (context_path plus parts of dottedkey) not yet 'created' + if {[tomlish::dict::path::exists $datastructure $dottedsuper_refpath]} { + #There is data so it must have been created as a keyval + set msg "Path $dottedsuper_refpath for dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #dict set tablenames_info $dottedsuper_refpath ttype unknown_table ;#REVIEW + dict set tablenames_info $dottedsuper_refpath ttype unknown_dotted ;#REVIEW + + #see note above re dotted keys insert into already defined table - we need to 'define' all the dotted supers in this block + lappend dottedtables_defined $dottedsuper_refpath + + #ensure empty tables are still represented in the datastructure + tomlish::dict::path::set_endpoint datastructure $dottedsuper_refpath {} ;#set to empty subdict + } else { + #added for fixed assumption + set ttype [dict get $tablenames_info $dottedsuper_refpath ttype] + set definedstate [dictn getdef $tablenames_info [list $dottedsuper_refpath tdefined] NULL] + switch -- $ttype { + dottedkey_table - unknown_dotted { + #'created' as dotted - but make sure it's from this header section - i.e defined not set + if {$definedstate ne "NULL"} { + #collision with some other dottedkey + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + itable { + #itables are immediately defined + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' as itable (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + default { + #header_table, header_tablearray or unknown_header + #is header_tablearray any different from header_table in this context? + #we don't set tdefined for tablearray anyway - so should be ok here. + if {$definedstate ne "NULL"} { + set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' in a header (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + } + } + } + + #dottedtable being 2nd last segment was for original assumption - todo - tidy up? we are duplicating the logic above + #review - any need/advantage to treat 2nd to last key any different from other supers? ie D in a.b.c.D.key=1 + #no need for 'unknown_dotted' vs 'dottedkey_table' ?? + if {[llength $dottedtable_bag] == 1} { + set dottedtable [lindex $dottedtable_bag 0] + set dottedkey_refpath [list {*}$dottedsuper_refpath "@@$dottedtable"] + #our dotted key is attempting to define a table + if {![dict exists $tablenames_info $dottedkey_refpath ttype]} { + #first one - but check datastructure for collisions + if {[tomlish::dict::path::exists $datastructure $dottedkey_refpath]} { + set msg "Path $dottedkey_refpath for dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #'create' the table + dict set tablenames_info $dottedkey_refpath ttype dottedkey_table + #don't actually set 'defined' here.. use the end of TABLE record to close them off by looking at this list + tomlish::dict::path::set_endpoint datastructure $dottedkey_refpath {} + lappend dottedtables_defined $dottedkey_refpath + + # + } else { + #exists - but might be from another dottedkey within the current header section + #the table is open for adding keys until the next 'header' section ([tablename] / [[tablearray]]) + #check for 'defined' closed (or just existence) + set ttype [dict get $tablenames_info $dottedkey_refpath ttype] + set definedstate [dictn getdef $tablenames_info [list $dottedkey_refpath tdefined] NULL] + switch -- $ttype { + dottedkey_table - unknown_dotted { + #'created' as dotted - but make sure it's from this header section - i.e defined not set + if {$definedstate ne "NULL"} { + #collision with some other dottedkey + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + itable { + #itables are immediately defined + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' as itable (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + default { + #header_table, header_tablearray or unknown_header + #is header_tablearray any different from header_table in this context? + #we don't set tdefined for tablearray anyway - so should be ok here. + if {$definedstate ne "NULL"} { + set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' in a header (table redefinition) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + } + } + } else { + set dottedkey_refpath $dottedsuper_refpath + } + #assert - dottedkey represents a key val pair that can be added + + + set fullkey_refpath [list {*}$dottedkey_refpath @@$leaf_key] + if {[tomlish::dict::path::exists $datastructure $fullkey_refpath]} { + set msg "Duplicate key. The key (path $fullkey_refpath) already exists at this level in the toml data. The toml data is not valid." + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + + #set keyval_dict [_get_keyval_value $element] + lassign [_get_keyval_value $element] _ keyval_dict _ sub_tablenames_info + + + #keyval_dict is either a {type value } + #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level + #punk::dict::is_typeval can distinguish + tomlish::log::debug "_process_tomlish_dottedkey>>> context:$context_refpath dottedkey $dottedkeyname kv: $keyval_dict" + tomlish::dict::path::set_endpoint datastructure $fullkey_refpath $keyval_dict + + #remove ? + #if {![tomlish::dict::is_typeval $keyval_dict]} { + # #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys + # # inner structure will contain {type value } if all leaves are not empty ITABLES + # ##set tkey [list {*}$norm_segments {*}$all_dotted_keys] + + # #by not creating a tablenames_info record - we effectively make it closed anyway? + # #it should be detected as a key + # #is there any need to store tablenames_info for it?? + # #REVIEW + + # ##TODO - update? + # #dictn incr tablenames_info [list $tkey seencount] + # ##if the keyval_dict is not a simple type x value y - then it's an inline table ? + # ##if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. + # #dictn set tablenames_info [list $tkey closed] 1 + #} + return [dict create dottedtables_defined $dottedtables_defined] + } + #tomlish::dict::from_tomlish is a *basic* programmatic datastructure for accessing the data. + # produce a dictionary of keys and values from a tomlish tagged list. + # dict::from_tomlish is primarily for read access to toml data. + #Extraneous (not within quoted sections) whitespace and comments are not preserved in this structure, + # so a roundtrip from toml to this datastructure and back to toml will lose whitespace formatting and comments. + # creating/changing toml values can be done directly on a tomlish list if preserving (or adding) formatting/comments is desired. + #A separate package 'tomlish::object' may be needed to allow easier programmatic creating/updating/deleting of data elements whilst preserving (or adding or selectively deleting/editing) such formatting. + # + + #within an ARRAY, we store a list of items such as plain dicts (possibly empty) and {type value } for simple types + #(ARRAYS can be mixed type) + #This means our dict structure should have only ARRAY and simple types which need to be in {type value } form + #A dict within an array encodeded as a type ITABLE value should also parse - but is the unpreferred form - REVIEW test? + + #Namespacing? + #ie note the difference: + #[Data] + #temp = { cpu = 79.5, case = 72.0} + # versus + #[Data] + #temps = [{cpu = 79.5, case = 72.0}] + proc from_tomlish {tomlish} { + package require dictn + + #keep track of which tablenames have already been directly defined, + # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' + #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. + #we don't error out just because a previous tablename segment has already appeared. + + #Declaring, Creating, and Defining Tables + #https://github.com/toml-lang/toml/issues/795 + #(update - only Creating and Defining are relevant terminology) + + #review + #tablenames_info keys ttype created, tdefined, createdby, definedby, closedby ??? review keys + # [tname] = header_table [[tname]] = header_tablearray + + #consider the following 2 which are legal: + #[table] #'table' created, defined=open type header_table + #x.y = 3 + #[table.x.z] #'table' tdefined=closed closedby={header_table table.x.z}, 'table.x' created, 'table.x.z' created tdefined=open tdefinedby={header_table table.x.z} + #k= 22 + # #'table.x.z' tdefined=closed closedby={eof eof} + + #equivalent datastructure + + #[table] #'table' created, tdefined=open definedby={header_table table} + #[table.x] #'table' tdefined=closed closedby={header_table table.x}, 'table.x' created tdefined=open definedby={header_table table.x} + #y = 3 + #[table.x.z] #'table.x' tdefined=closed closedby={header_table table.x.z}, 'table.x.z' created tdefined=open definedby={header_table table.x.z} + #k=22 + + #illegal + #[table] #'table' created and tdefined=open + #x.y = 3 #'table.x' created first keyval pair tdefined=open definedby={keyval x.y = 3} + #[table.x.y.z] #'table' tdefined=closed, 'table.x' closed because parent 'table' closed?, 'table.x.y' cannot be created + #k = 22 + # + ## - we would fail on encountering table.x.y because only table and table.x are effectively tables - but that table.x is closed should be detected (?) + + #illegal + #[table] + #x.y = {p=3} + #[table.x.y.z] + #k = 22 + ## we should fail because y is an inline table which is closed to further entries + + #note: it is not safe to compare normalized tablenames using join! + # e.g a.'b.c'.d is not the same as a.b.c.d + # instead compare {a b.c d} with {a b c d} + # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. + #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' + #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} + + + + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info + } else { + set tablenames_info [dict create] ;#keyed on tablepath each of which is an @@path such as {@@config @@subgroup @@etc} (corresponding to config.subgroup.etc) + #also has non @@ indexes which are list indexes as taken by tcl list commands (int or end-1 etc) + #value is a dict with keys: ttype, tdefined + } + + + log::info "---> dict::from_tomlish processing '$tomlish'<<<" + set items $tomlish + + foreach lst $items { + if {[lindex $lst 0] ni $::tomlish::tags} { + error "supplied list does not appear to be toml parsed into a tomlish tagged list. Run tomlish::decode::toml on the raw toml data to produce a tomlish list" + } + } + + if {[lindex $tomlish 0] eq "TOMLISH"} { + #ignore TOMLISH tag at beginning + set items [lrange $tomlish 1 end] + } + + set datastructure [dict create] + set dottedtables_defined [list] + foreach item $items { + set tag [lindex $item 0] + #puts "...> item:'$item' tag:'$tag'" + switch -exact -- $tag { + KEY - DQKEY - SQKEY - INT - FLOAT - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - STRING - LITERAL { + #why would we get individual key item as opposed to DOTTEDKEY? + error "tomlish::dict::from_tomlish error: invalid tag: $tag. At the toplevel, from_tomlish can only process WS NEWLINE COMMENT and compound elements DOTTEDKEY TABLE TABLEARRAY ITABLE MULTILITERAL MULTISTRING" + } + DOTTEDKEY { + #toplevel dotted key + set dkinfo [_process_tomlish_dottedkey $item] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] + #at any level - we don't expect any more DOTTEDKEY records in a tomlish structure after TABLE or TABLEARRAY are encountered + #as those records should encapsulate their own dottedkeys + + } + TABLEARRAY { + #close off any dottedtables_defined created by dottedkeys at this level + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + set dottedtables_defined [list] ;#for closing off at end by setting 'defined' + + set tablearrayname [lindex $item 1] + tomlish::log::debug "---> tomlish::dict::from_tomlish processing item TABLENAME (name: $tablearrayname): $item" + set norm_segments [::tomlish::toml::tablename_split $tablearrayname true] ;#true to normalize + #we expect repeated tablearray entries - each adding a sub-object to the value, which is an array/list. + #tablearrayname is likely to appear multiple times - so unlike a TABLE we don't check for 'defined' for the full name as an indicator of a problem + set supertable [list] + ############## + # [[a.b.c.d]] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + set refpath [list] ;#e.g @@j1 @@j2 1 @@k1 end + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + lappend refpath @@$normseg + if {![dict exists $tablenames_info $refpath ttype]} { + #supertable with this path doesn't yet exist + if {[tomlish::dict::path::exists $datastructure $refpath]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] of tablearray name $tablearrayname already has data but doesn't appear to be a table - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablearray_supertable_keycollision + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } else { + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #review - we can't later specify as tablearray so should just set ttype to header_table even though it's being created + # because of a tablearray header? + #By setting ttype to something other than table_header we can provide more precise errorCode/msg ?? + dict set tablenames_info $refpath ttype unknown_header + #ensure empty tables are still represented in the datastructure + dict set datastructure {*}$supertable [list] + } + } else { + #REVIEW!! + # what happens with from_toml {[[a.b.c]]} {[[a.b]]} ??? + #presumed that a and a.b were 'created' as tables (supertables of tablearray at a.b.c) and can't now be redefined as tablearrays + + #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable + #but if it's a tablearray - we need to point to the most 'recently defined table element of the array' + #(last member of that array - need to check type? allowed to have non-table elements ie nonhomogenous??) + set supertype [dict get $tablenames_info $refpath ttype] + if {$supertype eq "header_tablearray"} { + #exercised by toml-tests: + # valid/table/array-table-array + # valid/table/array-nest + + #puts stdout "todict!!! TABLEARRAY nesting required for supertable [join $supertable .]" + + #'refer' to the appropriate element in existing array + set arrdata [tomlish::dict::path::get $datastructure [list {*}$refpath @@value]] + set idx [expr {[llength $arrdata]-1}] + if {$idx < 0} { + #existing tablearray should have at least one entry even if empty (review) + set msg "reference to empty tablearray?" + return -code error -errorcode {TOMLISH STRUCTURE REFTOEMPTYTABLEARRAY} $msg + } + lappend refpath $idx + } + } + } + # + #puts "TABLE supertable refpath $refpath" + lappend refpath @@[lindex $norm_segments end] + tomlish::log::debug "TABLEARRAY refpath $refpath" + set tablearray_refpath $refpath + + + if {![dict exists $tablenames_info $tablearray_refpath ttype]} { + #first encounter of this tablearrayname + if {[tomlish::dict::path::exists $datastructure $tablearray_refpath]} { + #e.g from_toml {a=1} {[[a]]} + set msg "Cannot create tablearray name $tablearrayname. Key already has data but key doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablearray_direct_keycollision_error + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #no collision - we can create the tablearray and the array in the datastructure + dict set tablenames_info $tablearray_refpath ttype header_tablearray + #dict set datastructure {*}$norm_segments [list type ARRAY value {}] + #create array along with empty array-item at position zero + tomlish::dict::path::set_endpoint datastructure $tablearray_refpath [list type ARRAY value {{}}] + set arrayitem_refpath [list {*}$tablearray_refpath 0] + #set ARRAY_ELEMENTS [list] + } else { + #we have an existing tablenames_info record for this path - but is it a tablearray? + set ttype [dict get $tablenames_info $tablearray_refpath ttype] + if {$ttype ne "header_tablearray"} { + #header_table or itable + switch -- $ttype { + itable {set ttypename itable} + header_table {set ttypename table} + dottedkey_table {set ttypename dottedkey_table} + unknown_header - unknown_dotted { + #table was created e.g as supertable - but not specifically a tablearray + #violates ordering - return specific test error + set msg "Table $tablearrayname referenced as supertable before tablearray defined (ordering)" + return -code error -errorcode {TOMLISH STRUCTURE TABLEARRAYORDERING} $msg + } + default {error "unrecognised type $ttype - expected header_table or itable"} + } + set msg "tablearray name $tablearrayname already appears to be already created as '$ttypename' not tablearray - invalid?" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #EXISTING tablearray + #add to array + #error "add_to_array not implemented" + #{type ARRAY value } + #set ARRAY_ELEMENTS [dict get $datastructure {*}$norm_segments value] + tomlish::log::debug ">>>>pre-extend-array dict::from_tomlish datastructure: $datastructure" + set existing_array [tomlish::dict::path::get $datastructure [list {*}$tablearray_refpath @@value]] + set arrayitem_refpath [list {*}$tablearray_refpath [llength $existing_array]] + tomlish::dict::path::lappend datastructure $tablearray_refpath {} + tomlish::log::debug ">>>>post-extend-array dict::from_tomlish datastructure: $datastructure" + } + + + #set object [dict create] ;#array context equivalent of 'datastructure' + + #add to ARRAY_ELEMENTS and write back in to datastructure. + foreach element [lrange $item 2 end] { + set type [lindex $element 0] + tomlish::log::debug "----> todict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element $arrayitem_refpath] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] + } + NEWLINE - COMMENT - WS { + #ignore + } + TABLE { + #we *perhaps* should be able to process tablearray subtables either as part of the tablearray record, or independently. + #(or even a mixture of both, although that is somewhat an edge case, and of limited utility) + #[[fruit]] + #x=1 + # [fruit.metadata] + # [fruit.otherdata] + + #when processing a dict destined for the above - the tomlish generator (e.g from_dict) + #should create as 1 or 3 records (but could create 2 records if there was an unrelated table in between the subtables) + #choices: all in tablearray record, tablearray + 1 or 2 table records. + # + #We are going the other way here - so we just need to realise that the list of tables 'belonging' to this tablearray might not be complete. + # + #the subtable names must be prefixed with the tablearray - we should validate that for any contained TABLE records + + #The default mechanism is for from_dict to produce tomlish with separate TABLE records - and use the ordering to determine membership + #If we were to support wrapping the TABLE records within a TABLEARRAY - we should also support TABLEARRAY within TABLEARRAY + # ----------------------------------------------------------------------- + #Implementing this is not critical for standard encoding/decoding of toml! + #It would be an alternative form for the tomlish intermediate form - and adds complexity. + # + #The upside would be to provide a function for sorting/rearranging in the tomlish form if all records were fully encapsulated. + #A possible downside is that unrelated tables placed before a tablearray is fully defined (within the tablearray definition area in toml) + # would have to be re-positioned before or after the encapsulated tablearray record. + # While unrelated tables in such a position aren't a recommended way to write toml, they appear to be valid + # and preserving the author's ordering is a goal of the basic encoding/decoding operations if no explicit sorting/reordering was requested. + # + #Consider an 'encapsulate' method to this (tomlish -> tomlish) + # ----------------------------------------------------------------------- + #todo + error "tomlish::dict::from_tomlish TABLE element within TABLEARRAY not handled - TABLE should be a separate tomlish record" + } + default { + error "tomlish::dict::from_tomlish Sub element of type '$type' not understood in tablearray context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" + } + } + } + + #end of TABLEARRAY record - equivalent of EOF or next header - close off the dottedtables + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + } + TABLE { + #close off any dottedtables_defined created by dottedkeys at this level + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + set tablename [lindex $item 1] + set dottedtables_defined [list] ;#for closing off at end by setting 'defined' + #As our TABLE record contains all it's child DOTTEDKEY records - this should be equivalent to setting them as defined at EOF or next header. + + #----------------------------------------------------------------------------------- + #default assumption - our reference is to the main tablenames_info and datastructure + #Will need to append keys appropriately if we have recursed + #----------------------------------------------------------------------------------- + + log::debug "---> tomlish::dict::from_tomlish processing item TABLE (name: $tablename): $item" + set norm_segments [::tomlish::toml::tablename_split $tablename true] ;#true to normalize + + + + set name_segments [::tomlish::toml::tablename_split $tablename 0] ;#unnormalized e.g ['a'."b".c.d] -> 'a' "b" c d + #results of tablename_split 0 are 'raw' - ie some segments may be enclosed in single or double quotes. + + + set supertable [list] + ############## + # [a.b.c.d] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + + ############## + #[[a]] + #[a.b] #supertable a is tablearray + ############## + + #also consider + ############## + # [[a.b]] + # [a.b.c.d] #supertable a is a table, supertable a.b is tablearray, supertable a.b.c is elementtable + ############## + set refpath [list] ;#e.g @@j1 @@j2 1 @@k1 end + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + lappend refpath @@$normseg + if {![dict exists $tablenames_info $refpath ttype]} { + #supertable with this path doesn't yet exist + if {[tomlish::dict::path::exists $datastructure $refpath]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] of table name $tablename (path $refpath) already has data but doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + #we also don't know whether it's a table or a dottedkey_table (not allowed to be tablearray - out of order?) + dict set tablenames_info $refpath ttype unknown_header + #ensure empty tables are still represented in the datastructure + #dict set datastructure {*}$supertable [list] + tomlish::dict::path::set_endpoint datastructure $refpath {} + } else { + #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable + if {[dict get $tablenames_info $refpath ttype] eq "header_tablearray"} { + #'refer' to the appropriate element in existing array + set arrdata [tomlish::dict::path::get $datastructure [list {*}$refpath @@value]] + set idx [expr {[llength $arrdata]-1}] + if {$idx < 0} { + #existing tablearray should have at least one entry even if empty (review) + set msg "reference to empty tablearray?" + return -code error -errorcode {TOMLISH STRUCTURE REFTOEMPTYTABLEARRAY} $msg + } + lappend refpath $idx + } else { + #?? + if {[dictn getdef $tablenames_info [list $refpath tdefined] NULL] eq "NULL"} { + } else { + } + } + } + } + #puts "TABLE supertable refpath $refpath" + lappend refpath @@[lindex $norm_segments end] + tomlish::log::info "TABLE refpath $refpath" + set table_refpath $refpath + + + + + #table [a.b.c.d] hasn't been defined - but may have been 'created' already by a longer tablename + # - or may have existing data from a keyval + if {![dict exists $tablenames_info $table_refpath ttype]} { + if {[tomlish::dict::path::exists $datastructure $table_refpath]} { + #e.g from_toml {a=1} {[a]} + set msg "Cannot create table name $tablename. Key already has data but key doesn't appear to be a table (keycollision) - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + #test: datastructure_tablename_keyval_collision_error + return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg + } + #no data or previously created table + dict set tablenames_info $table_refpath ttype header_table + + #We are 'defining' this table's keys and values here (even if empty) + #dict set datastructure {*}$norm_segments [list] ;#ensure table still represented in datastructure even if we add no keyvals here + tomlish::dict::path::set_endpoint datastructure $table_refpath {} ;#ensure table still represented in datastructure even if we add no keyvals here + } else { + if {[dict get $tablenames_info $table_refpath ttype] eq "header_tablearray"} { + #e.g tomltest invalid/table/duplicate-table-array2 + #[[tbl]] + #[tbl] + set msg "Table name $tablename has already been created as a tablearray. Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } else { + #any other type tdefined is a problem + set T_DEFINED [dictn getdef $tablenames_info [list $table_refpath tdefined] NULL] + if {$T_DEFINED ne "NULL" } { + #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path + set msg "Table name $tablename has already been defined in the toml data. Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + #raise a specific type of error for tests to check + return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg + } + } + } + dict set tablenames_info $table_refpath tdefined open + + #now add the contained elements + foreach element [lrange $item 2 end] { + set type [lindex $element 0] + log::debug "----> todict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element $table_refpath] + lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] + } + NEWLINE - COMMENT - WS { + #ignore + } + default { + error "Sub element of type '$type' not understood in table context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" + } + } + } + + #end of TABLE record - equivalent of EOF or next header - close off the dottedtables + foreach dtablepath $dottedtables_defined { + dict set tablenames_info $dtablepath tdefined closed + } + } + ITABLE { + #As there is no other mechanism to create tables within an ITABLE than dottedkeys + # and ITABLES are fully defined/enclosed - we can rely on key collision and don't need to track dottedtables_defined - REVIEW. + set dottedtables_defined [list] + #SEP??? + #ITABLE only ever on RHS of = or inside ARRAY + set datastructure [dict create] + set tablenames_info [dict create] + + foreach element [lrange $item 1 end] { + set type [lindex $element 0] + log::debug "----> tododict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + DOTTEDKEY { + set dkinfo [_process_tomlish_dottedkey $element] + } + NEWLINE - COMMENT - WS { + #ignore + } + default { + error "Sub element of type '$type' not understood in ITABLE context. Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" + } + } + } + } + ARRAY { + #arrays in toml are allowed to contain mixtures of types + set datastructure [list] + log::debug "--> processing array: $item" + + foreach element [lrange $item 1 end] { + set type [lindex $element 0] + log::debug "----> tododict processing $tag subitem $type processing contained element $element" + switch -exact -- $type { + INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { + set value [lindex $element 1] + lappend datastructure [list type $type value $value] + } + STRING { + #JJJJ + #don't unescape string! + set value [lindex $element 1] + #lappend datastructure [list type $type value [::tomlish::utils::unescape_string $value]] + lappend datastructure [list type $type value $value] + } + LITERAL { + set value [lindex $element 1] + lappend datastructure [list type $type value $value] + } + ITABLE { + #anonymous table + #lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] + lappend datastructure [::tomlish::dict::from_tomlish [list $element]] ;#store itables within arrays as raw dicts (possibly empty) + } + TABLE - TABLEARRAY { + #invalid? shouldn't be output from from_dict - but could manually be constructed as such? review + #doesn't make sense as table needs a name? + #take as synonym for ITABLE? + error "tomlish::dict::from_tomlish $type within array unexpected" + } + ARRAY - MULTISTRING - MULTILITERAL { + #set value [lindex $element 1] + lappend datastructure [list type $type value [::tomlish::dict::from_tomlish [list $element]]] + } + WS - SEP - NEWLINE - COMMENT { + #ignore whitespace, commas, newlines and comments + } + default { + error "tomlish::dict::from_tomlish Unexpected value type '$type' found in array" + } + } + } + } + MULTILITERAL { + #triple squoted string + #first newline stripped only if it is the very first element + #(ie *immediately* following the opening delims) + #All whitespace other than newlines is within LITERALPARTS + # ------------------------------------------------------------------------- + #todo - consider extension to toml to allow indent-aware multiline literals + # how - propose as issue in toml github? Use different delim? e.g ^^^ ? + #e.g + # xxx=?'''abc + # def + # etc + # ''' + # - we would like to trimleft each line to the column following the opening delim + # ------------------------------------------------------------------------- + + log::debug "---> todict processing multiliteral: $item" + set parts [lrange $item 1 end] + if {[lindex $parts 0 0] eq "NEWLINE"} { + set parts [lrange $parts 1 end] ;#skip it + } + for {set idx 0} {$idx < [llength $parts]} {incr idx} { + set element [lindex $parts $idx] + set type [lindex $element 0] + switch -exact -- $type { + LITERALPART { + append stringvalue [lindex $element 1] + } + NEWLINE { + set val [lindex $element 1] + if {$val eq "lf"} { + append stringvalue \n + } else { + append stringvalue \r\n + } + } + default { + error "Unexpected value type '$type' found in multistring" + } + } + } + set datastructure $stringvalue + } + MULTISTRING { + #triple dquoted string + log::debug "---> tomlish::dict::from_tomlish processing multistring: $item" + set stringvalue "" + set idx 0 + set parts [lrange $item 1 end] + for {set idx 0} {$idx < [llength $parts]} {incr idx} { + set element [lindex $parts $idx] + set type [lindex $element 0] + #We use STRINGPART in the tomlish representation as a distinct element to STRING - which would imply wrapping quotes to be reinserted + switch -exact -- $type { + STRING { + #todo - do away with STRING ? + #we don't build MULTISTRINGS containing STRING - but should we accept it? + tomlish::log::warn "double quoting a STRING found in MULTISTRING - should be STRINGPART?" + #append stringvalue "\"[::tomlish::utils::unescape_string [lindex $element 1]]\"" + append stringvalue "\"[lindex $element 1]\"" + } + STRINGPART { + #JJJ + #don't unescape string + #append stringvalue [::tomlish::utils::unescape_string [lindex $element 1]] + append stringvalue [lindex $element 1] + } + CONT { + #When the last non-whitespace character on a line is an unescaped backslash, + #it will be trimmed along with all whitespace (including newlines) up to the next non-whitespace character or closing delimiter + # review - we allow some whitespace in stringpart elements - can a stringpart ever be all whitespace? + set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] + if {$next_nl == -1} { + #last (or first and only) line + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + #set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] + #if {$non_ws >= 0} { + # #append stringvalue "\\" + # return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + #} else { + # #skip over ws without emitting + # set idx [llength $parts] + #} + } else { + set parts_til_nl [lrange $parts 0 $next_nl-1] + set non_ws [lsearch -index 0 -start $idx+1 -not $parts_til_nl WS] + if {$non_ws >= 0} { + #This CONT is invalid. If there had been a non-whitespace char directly following it, + #it wouldn't have come through as a CONT token + #Now that we see it isn't the last non-whitespace backslash on the line we can reject + # as an invalid escape of space or tab + #append stringvalue "\\" + return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" + } else { + #skip over ws on this line + set idx $next_nl + #then have to check each subsequent line until we get to first non-whitespace + set trimming 1 + while {$trimming && $idx < [llength $parts]} { + set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] + if {$next_nl == -1} { + #last line + set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] + if {$non_ws >= 0} { + set idx [expr {$non_ws -1}] + } else { + set idx [llength $parts] + } + set trimming 0 + } else { + set non_ws [lsearch -index 0 -start $idx+1 -not [lrange $parts 0 $next_nl-1] WS] + if {$non_ws >= 0} { + set idx [expr {$non_ws -1}] + set trimming 0 + } else { + set idx $next_nl + #keep trimming + } + } + } + } + } + } + NEWLINE { + #if newline is first element - it is not part of the data of a multistring + if {$idx > 0} { + set val [lindex $element 1] + if {$val eq "lf"} { + append stringvalue \n + } else { + append stringvalue \r\n + } + } + } + WS { + append stringvalue [lindex $element 1] + } + default { + error "Unexpected value type '$type' found in multistring" + } + } + } + set datastructure $stringvalue + } + WS - COMMENT - NEWLINE { + #ignore + } + BOM { + #this token is the unicode single char \uFFEF + #It doesn't tell us what encoding was originally used (though toml should only accept UTF-8 files) + #ignore at start - what about in other positions? + } + default { + error "Unexpected tag '$tag' in Tomlish list '$tomlish'" + } + } + } + return $datastructure + } +} +namespace eval tomlish::dict::path { + #access tomlish dict structure + namespace export {[a-z]*}; # Convention: export all lowercase + + #access with path such as: @@k @@k 0 @@k end where dict keys marked with @@ and plain values are list indices into in {type ARRAY value } + #leaf elements returned as structured {type value } + proc get {dictval {path {}}} { + if {$path eq ""} { + return $dictval + } + ::set data $dictval + ::set pathsofar [list] + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set data [dict get $data [string range $p 2 end]] + } else { + if {![tomlish::dict::is_typeval $data]} { + error "tomlish::dict::path::get error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::get error bad path $path. Subpath $pathsofar is not an array." + } + ::set arrdata [dict get $data value] + ::set data [lindex $arrdata $p] + } + } + return $data + } + proc exists {dictval path} { + ::set data $dictval + ::set pathsofar [list] + ::set exists 1 + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] + if {![dict exists $data $k]} { + return 0 + } + ::set data [dict get $data $k] + } else { + if {![tomlish::dict::is_typeval $data]} { + return 0 + } + if {[dict get $data type] ne "ARRAY"} { + return 0 + } + ::set arrdata [dict get $data value] + ::set intp [tomlish::system::lindex_resolve_basic $arrdata $p] ;#handle index math (end-1 etc) + if {$intp == -1} { + #out of bounds + return 0 + } + ::set data [lindex $arrdata $p] + } + } + return $exists + } + + #a restricted analogy of 'dictn set' + #set 'endpoints' - don't create intermediate paths + # can replace an existing dict with another dict + # can create a key when key at tail end of path is a key (ie @@keyname, not index) + # can replace an existing {type value value } + # with added restriction that if is ARRAY the new must also be ARRAY + proc set_endpoint {dictvariable path value} { + upvar $dictvariable dict_being_edited + ::set data $dict_being_edited + ::set pathsofar [list] + if {!([tomlish::dict::is_typeval $value] || [tomlish::dict::is_typeval_dict $value 0])} { + #failed check of supplied value as basic type, or a sub-dict structure (not checking arrays) + error "tomlish::dict::path::set_endpoint error - value must already be in the tomlish form {type value } or be a dict with such forms as leaves" + } + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] + + #if {![dict exists $data $k]} { + # error "tomlish::dict:path::set error bad path $path. Attempt to access nonexistent element at subpath $pathsofar." + #} + ::set varname v[incr v] + + if {$pathsofar eq $path} { + #see if endpoint of the path given already exists + if {[dict exists $data $k]} { + ::set endpoint [dict get $data $k] + if {[tomlish::dict::is_typeval $endpoint]} { + set existing_tp [dict get $endpoint type] + if {![tomlish::dict::is_typeval $value]} { + error "tomlish::dict::path::set_endpoint error Unable to overwrite subpath '$pathsofar' which is of type $existing_tp with sub-dict. Supplied value not {type value val } with sub-dict: $value" + } + switch -- [dict get $endpoint type] { + ARRAY { + #disallow overwriting array - unless given value is an ARRAY? REVIEW + if {[dict get $value type] ne "ARRAY"} { + error "tomlish::dict::path::set_endpoint error bad path '$path'. Cannot overwrite array with non-array: $value" + } + } + default { + # + } + } + } else { + #endpoint is a typeval dict not a plain typeval - only allow overwrite with a typeval dict + if {![tomlish::dict::is_typeval_dict $value 0]} { + error "tomlish::dict::path::set_endpoint error path '$path'. Cannot overwrite sub-dict (size: [dict size $endpoint]) with non sub-dict: $value" + } + } + ::set $varname $value + dict set vdict $pathsofar $varname + break + } else { + ::set arrdata [dict get $data value] + set idx [tomlish::system::lindex_resolve_basic $arrdata $p] + if {$idx == -1} { + error "tomlish::dict::path::set_endpoint error bad path '$path'. No existing element at $p" + } + ::set data [lindex $arrdata $p] + ::set $varname $data + dict set vdict $pathsofar $varname + } + } + } + #dict for {path varname} $vdict { + # puts "$path $varname\n" + # puts " '[::set $varname]'\n" + # puts "" + #} + ::set i 0 + ::set reverse [lreverse $vdict] + foreach {varname path} $reverse { + set newval [::set $varname] + if {$i+2 == [llength $reverse]} { + ::set k [lindex $path end] + ::set k [string range $k 2 end] ;#first key is always @@something + dict set dict_being_edited $k $newval + #puts "--result $dict_being_edited" + break + } + ::set nextvarname [lindex $reverse $i+2] + ::set nextval [::set $nextvarname] + ::set k [lindex $path end] + if {[string match @@* $k]} { + #dict key + #dict set $nextvarname $k $newval + set_endpoint $nextvarname [list $k] $newval + } else { + #list index + ::set nextarr [dict get $nextval value] + ::lset nextarr $k $newval + dict set $nextvarname value $nextarr + } + ::incr i 2 + } + + return $dict_being_edited + + } + #path must be to a {type ARRAY value } + #REVIEW - how to lappend to deep mixed dict/array structure without rewriting whole datastructure? + proc lappend {dictvariable path args} { + upvar $dictvariable dict_being_edited + ::set data $dict_being_edited + ::set pathsofar [list] + #::set newlist [list] + ::set v 0 + ::set vdict [dict create] + foreach a $args { + if {![string is dict $a]} { + error "tomlish::dict::path::lappend error - lappended arguments must already be in the tomlish form {type value } or be a dict with such forms as leaves" + } + } + foreach p $path { + ::lappend pathsofar $p + if {[string range $p 0 1] eq "@@"} { + ::set k [string range $p 2 end] + if {![dict exists $data $k]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access nonexistent element at subpath $pathsofar." + } + ::set varname v[incr v] + + if {$pathsofar eq $path} { + #see if endpoint of the path given is an ARRAY + ::set endpoint [dict get $data $k] + if {![tomlish::dict::is_typeval $endpoint]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $endpoint type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar is not an array." + } + ::set arrdata [dict get $endpoint value] + ::lappend arrdata {*}$args + dict set endpoint value $arrdata + ::set newlist $endpoint + ::set $varname $newlist + dict set vdict $pathsofar $varname + break + } + ::set data [dict get $data $k] + ::set $varname $data + dict set vdict $pathsofar $varname + } else { + if {![tomlish::dict::is_typeval $data]} { + error "tomlish::dict::path::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar." + } + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar is not an array." + } + ::set varname v[incr v] + if {$pathsofar eq $path} { + if {[dict get $data type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Parent path is not an array." + } + ::set parentarray [dict get $data value] + ::set idx [tomlish::system::lindex_resolve_basic $parentarray $p] + if {$idx == -1} { + error "tomlish::dict::path::lappend error bad path $path. Index $p does not exist." + } + ::set endpoint [lindex $parentarray $p] + if {[dict get $endpoint type] ne "ARRAY"} { + error "tomlish::dict::path::lappend error bad path $path. Not an array." + } + + ::set arrdata [dict get $endpoint value] + ::lappend arrdata {*}$args + dict set endpoint value $arrdata + ::set newlist $endpoint + #::lset parentarray $p $newlist + #set parentarray $newlist + ::set $varname $newlist + dict set vdict $pathsofar $varname + break + } else { + ::set arrdata [dict get $data value] + set idx [tomlish::system::lindex_resolve_basic $arrdata $p] + if {$idx == -1} { + error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar, index $p does not exist." + } + ::set data [lindex $arrdata $p] + ::set $varname $data + dict set vdict $pathsofar $varname + } + } + } + #todo tomlish::log::debug ? + #dict for {path varname} $vdict { + # puts "$path $varname\n" + # puts " [::set $varname]\n" + # puts "" + #} + ::set i 0 + ::set reverse [lreverse $vdict] + foreach {varname path} $reverse { + set newval [::set $varname] + if {$i+2 == [llength $reverse]} { + ::set k [lindex $path end] + ::set k [string range $k 2 end] ;#first key is always @@something + dict set dict_being_edited $k $newval + #puts "--result $dict_being_edited" + break + } + ::set nextvarname [lindex $reverse $i+2] + ::set nextval [::set $nextvarname] + ::set k [lindex $path end] + if {[string match @@* $k]} { + #dict key + set k [string range $k 2 end] + dict set $nextvarname $k $newval + } else { + #list index + ::set nextarr [dict get $nextval value] + ::lset nextarr $k $newval + dict set $nextvarname value $nextarr + } + ::incr i 2 + } + return $dict_being_edited + } +} +tcl::namespace::eval tomlish::to_dict { + + + proc @@path {dictkeys} { + lmap v $dictkeys {string cat @@ $v} + } + +} + + +tcl::namespace::eval tomlish::app { + #*** !doctools + #[subsection {Namespace tomlish::app}] + #[para] + #[list_begin definitions] + + tcl::namespace::eval argdoc { + proc test_suites {} { + if {[package provide test::tomlish] eq ""} { + return [list] + } + return [test::tomlish::SUITES] + } + } + + package require punk::args + punk::args::define { + @id -id ::tomlish::app::decoder + @cmd -name tomlish::app::decoder -help\ + "Read toml on stdin until EOF + on error - returns non-zero exit code and writes error to + the errorchannel. + on success - returns zero exit code and writes JSON encoding + of the data to the outputchannel. + This decoder is intended to be compatble with toml-test." + @leaders -min 0 -max 0 + @opts + -help -type none -help\ + "Display this usage message" + -inputchannel -default stdin + -inputencoding -default "iso8859-1" -choicerestricted 0 -choices {utf-8 utf-16 iso8859-1} -help\ + "configure encoding on input channel + iso8859-1 is equivalent to binary encoding" + -outputchannel -default stdout + -errorchannel -default stderr + @values -min 0 -max 0 + } + proc decoder {args} { + set argd [punk::args::parse $args withid ::tomlish::app::decoder] + set ch_input [dict get $argd opts -inputchannel] + set ch_input_enc [dict get $argd opts -inputencoding] + set ch_output [dict get $argd opts -outputchannel] + set ch_error [dict get $argd opts -errorchannel] + if {[dict exists $argd received -help]} { + return [punk::args::usage -scheme info ::tomlish::app::decoder] + } + + chan configure $ch_input -encoding $ch_input_enc + #translation? + chan configure $ch_input -translation lf ;# toml-test invalid/control tests we need to see raw CRs to reject them properly - auto translation won't do. + + #Just slurp it all - presumably we are not handling massive amounts of data on stdin. + # - even if the input is large, we probably don't gain much (aside from possible memory savings?) by attempting to process input as it arrives. + if {[catch { + set inputdata [read $ch_input] + if {$ch_input_enc eq "iso8859-1"} { + set toml [tomlish::toml::from_binary $inputdata] + } else { + set toml $inputdata + } + } errM]} { + puts stderr "read-input error: $errM" + #toml-tests expect exit code 1 + #e.g invalid/encoding/utf16-bom + exit 1 ;#read error + } + try { + set j [::tomlish::toml_to_typedjson $toml] + } on error {em} { + puts $ch_error "decoding failed: '$em'" + exit 1 + } + puts -nonewline $ch_output $j + exit 0 + } + + package require punk::args + punk::args::define { + @id -id ::tomlish::app::encoder + @cmd -name tomlish::app::encoder -help\ + "Read JSON on input until EOF + return non-zero exitcode if JSON data cannot be converted to + a valid TOML representation. + return zero exitcode and TOML data on output if JSON data can + be converted. + This encoder is intended to be compatible with toml-test." + @leaders -min 0 -max 0 + @opts + -help -type none -help \ + "Display this usage message" + -restrict_barekeys -default 0 -help\ + "If true, keys containing unicode will be quoted. + If false, an extended range of barekeys will be used + in unquoted form." + -inputchannel -default stdin + -inputencoding -default "" -choicerestricted 0 -choices {utf-8 utf-16 iso8859-1} -help\ + "configure encoding on input channel + If not supplied, leave at Tcl default" + -outputchannel -default stdout + -errorchannel -default stderr + @values -min 0 -max 0 + } + proc encoder {args} { + set argd [punk::args::parse $args withid ::tomlish::app::encoder] + set restrict_barekeys [dict get $argd opts -restrict_barekeys] + set ch_input [dict get $argd opts -inputchannel] + set ch_input_enc [dict get $argd opts -inputencoding] + set ch_output [dict get $argd opts -outputchannel] + set ch_error [dict get $argd opts -errorchannel] + if {[dict exists $argd received -help]} { + return [punk::args::usage -scheme info ::tomlish::app::encoder] + } + #review + if {$ch_input_enc ne ""} { + chan configure $ch_input -encoding $ch_input_enc + } + #review + chan configure $ch_input -translation lf + + if {[catch { + set json [read $ch_input] + }]} { + exit 2 ;#read error + } + try { + #tomlish::typedjson_to_toml + set toml [::tomlish::toml::from_tomlish_from_dict_from_typedjson $json] + } trap {} {e eopts} { + puts $ch_error "encoding failed: '$e'" + puts $ch_error "$::errorInfo" + exit 1 + } + puts -nonewline $ch_output $toml + exit 0 + } + + punk::args::define { + @dynamic + @id -id ::tomlish::app::test + @cmd -name tomlish::app::test + @leaders + @opts -any 1 + -help -type none -help\ + "Display this usage message + or further info if more args." + -suite -default tests -choices {${[::tomlish::app::argdoc::test_suites]}} + @values -min 0 -max -1 + } + proc test {args} { + package require test::tomlish + set argd [punk::args::parse $args withid ::tomlish::app::test] + set opts [dict get $argd opts] + set values [dict get $argd values] + set received [dict get $argd received] + set solos [dict get $argd solos] + set opt_suite [dict get $opts -suite] + if {[dict exists $received -help] && ![dict exists $received -suite]} { + return [punk::args::usage -scheme info ::tomlish::app::test] + } + + test::tomlish::SUITE $opt_suite + #if {[catch {test::tomlish::SUITE $opt_suite} errM]} { + # puts stderr "Unknown test suite '$opt_suite'. Available suites: [test::tomlish::SUITES]" + # exit 1 + #} + set run_opts [dict remove $opts -suite] + set run_opts [dict remove $run_opts {*}$solos] + set result [test::tomlish::RUN {*}$run_opts {*}$solos {*}$values] + return $result + } + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::app ---}] +} + +proc ::tomlish::appnames {} { + set applist [list] + foreach cmd [info commands ::tomlish::app::*] { + lappend applist [namespace tail $cmd] + } + return $applist +} + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# Secondary API namespace +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval tomlish::lib { + namespace export {[a-z]*}; # Convention: export all lowercase + namespace path [namespace parent] + #*** !doctools + #[subsection {Namespace tomlish::lib}] + #[para] Secondary functions that are part of the API + #[list_begin definitions] + + #proc utility1 {p1 args} { + # #*** !doctools + # #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]] + # #[para]Description of utility1 + # return 1 + #} + + + + #*** !doctools + #[list_end] [comment {--- end definitions namespace tomlish::lib ---}] +} +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval tomlish::system { + + #taken from punk::lib + proc lindex_resolve_basic {list index} { + #*** !doctools + #[call [fun lindex_resolve_basic] [arg list] [arg index]] + #[para] Accepts index of the forms accepted by Tcl's list commands. (e.g compound indices such as 3+1 end-2) + #[para] returns -1 for out of range at either end, or a valid integer index + #[para] Unlike lindex_resolve; lindex_resolve_basic can't determine if an out of range index was out of range at the lower or upper bound + #[para] This is only likely to be faster than average over lindex_resolve for Tcl which has the builtin lseq command + #[para] The performance advantage is more likely to be present when using compound indexes such as $x+1 or end-1 + #[para] For pure integer indices the performance should be equivalent + + set index [tcl::string::map {_ {}} $index] ;#forward compatibility with integers such as 1_000 + if {[string is integer -strict $index]} { + #can match +i -i + #avoid even the lseq overhead when the index is simple + if {$index < 0 || ($index >= [llength $list])} { + #even though in this case we could return -2 or -3 like lindex_resolve; for consistency we don't, as it's not always determinable for compound indices using the lseq method. + return -1 + } else { + #integer may still have + sign - normalize with expr + return [expr {$index}] + } + } + if {[llength $list]} { + set indices [tomlish::system::range 0 [expr {[llength $list]-1}]] ;# uses lseq if available, has fallback. + #if lseq was available - $indices is an 'arithseries' - theoretically not taking up ram(?) + } else { + set indices [list] + } + set idx [lindex $indices $index] + if {$idx eq ""} { + #we have no way to determine if out of bounds is at lower vs upper end + return -1 + } else { + return $idx + } + } + + if {[info commands ::lseq] ne ""} { + #tcl 8.7+ lseq significantly faster, especially for larger ranges + #The internal rep can be an 'arithseries' with no string representation + #support minimal set from to + proc range {from to} { + lseq $from $to + } + } else { + #lseq accepts basic expressions e.g 4-2 for both arguments + #e.g we can do lseq 0 [llength $list]-1 + #if range is to be consistent with the lseq version above - it should support that, even though we don't support most lseq functionality in either wrapper. + proc range {from to} { + set to [offset_expr $to] + set from [offset_expr $from] + if {$to > $from} { + set count [expr {($to -$from) + 1}] + if {$from == 0} { + return [lsearch -all [lrepeat $count 0] *] + } else { + incr from -1 + return [lmap v [lrepeat $count 0] {incr from}] + } + #slower methods. + #2) + #set i -1 + #set L [lrepeat $count 0] + #lmap v $L {lset L [incr i] [incr from];lindex {}} + #return $L + #3) + #set L {} + #for {set i 0} {$i < $count} {incr i} { + # lappend L [incr from] + #} + #return $L + } elseif {$from > $to} { + set count [expr {$from - $to} + 1] + #1) + if {$to == 0} { + return [lreverse [lsearch -all [lrepeat $count 0] *]] + } else { + incr from + return [lmap v [lrepeat $count 0] {incr from -1}] + } + + #2) + #set i -1 + #set L [lrepeat $count 0] + #lmap v $L {lset L [incr i] [incr from -1];lindex {}} + #return $L + #3) + #set L {} + #for {set i 0} {$i < $count} {incr i} { + # lappend L [incr from -1] + #} + #return $L + } else { + return [list $from] + } + } + } + +} + +if {[info exists ::argc] && $::argc > 0} { + #puts stderr "argc: $::argc args: $::argv" + set arglist $::argv + # -------------- + #make sure any dependant packages that are sourced don't get any commandline args + set ::argv {} + set ::argc 0 + # -------------- + package require punk::args + punk::args::define { + @dynamic + @id -id tomlish::cmdline + @cmd -name tomlish -help\ + "toml encoder/decoder written in Tcl" + @opts -any 1 + -help -type none -help\ + "Display this usage message or more specific + help if further arguments provided." + -app -choices {${[tomlish::appnames]}} + } + try { + set argd [punk::args::parse $arglist withid tomlish::cmdline] + } trap {PUNKARGS VALIDATION} {msg erroropts} { + puts stderr $msg + exit 1 + } + + + lassign [dict values $argd] leaders opts values received solos + if {[dict exists $received -help] && ![dict exists $received -app]} { + #only emit cmdline help if -app not supplied as well - otherwise app function can act on -help for more specific help + #puts stdout "Usage: -app where appname one of:[tomlish::appnames]" + puts stdout [punk::args::usage -scheme info tomlish::cmdline] + exit 0 + } + if {![dict exists $received -app]} { + puts stderr [punk::args::usage -scheme error tomlish::cmdline] + exit 1 + } + + set app [dict get $opts -app] + set appnames [tomlish::appnames] + set app_opts [dict remove $opts -app {*}$solos] + try { + set result [tomlish::app::$app {*}$app_opts {*}$solos {*}$values] + } trap {PUNKARGS VALIDATION} {msg erroropts} { + #The validation error should fully describe the issue + #no need for errortrace - keep the output cleaner + puts stderr $msg + exit 1 + } trap {} {msg erroropts} { + #unexpected error - uncaught throw will produce error trace + #todo - a support msg? Otherwise we may as well just leave off this trap. + throw [dict get $erroropts -errorcode] [dict get $erroropts -errorinfo] + } + if {"-help" in $solos} { + puts stderr $result + exit 1 + } else { + if {$result ne ""} { + puts stdout $result + exit 0 + } + } +} + +## Ready +package provide tomlish [namespace eval tomlish { + variable pkg tomlish + variable version + set version 1.1.6 +}] +return + +#*** !doctools +#[manpage_end] +