diff --git a/src/bootsupport/modules/dictn-0.1.1.tm b/src/bootsupport/modules/dictn-0.1.1.tm
new file mode 100644
index 00000000..c9ef87f2
--- /dev/null
+++ b/src/bootsupport/modules/dictn-0.1.1.tm
@@ -0,0 +1,366 @@
+# -*- tcl -*-
+# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt
+#
+# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem.
+# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository.
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+# (C) 2023
+#
+# @@ Meta Begin
+# Application dictn 0.1.1
+# Meta platform tcl
+# Meta license
+# @@ Meta End
+
+
+
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+## Requirements
+##e.g package require frobz
+
+
+
+
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+# dictn: nested-path variants of the 'dict' subcommands.
+# A 'path' is a Tcl list of keys, e.g the path {a b} addresses key b of the dict stored under key a.
+# This namespace defines commands named set, append, lappend, incr, info, unset etc,
+# so the procs below use fully qualified ::set, ::append, ::lappend, ::incr, ::info to reach the Tcl builtins.
+namespace eval dictn {
+    namespace export {[a-z]*}
+    namespace ensemble create
+}
+
+
+## ::dictn::append
+# dictn append dictVariable path ?value?
+# Appends value (as a string) to the element at path and returns the updated dict.
+# A missing element is treated as the empty string, as 'dict append' does for a missing key.
+#
+#This can of course 'ruin' a nested dict if applied to the wrong element
+# - i.e using the string op 'append' on an element that is itself a nested dict is analogous to the standard Tcl:
+#    %set list {a b {c d}}
+#    %append list x
+#    a b {c d}x
+#  IOW - don't do that unless you really know that's what you want.
+#
+proc ::dictn::append {dictvar path {value {}}} {
+    if {[llength $path] == 1} {
+        #expand the path so a single key containing whitespace isn't passed in its list-quoted form
+        return [uplevel 1 [list dict append $dictvar {*}$path $value]]
+    }
+    upvar 1 $dictvar dvar
+    ::set str ""
+    if {[::info exists dvar] && [dict exists $dvar {*}$path]} {
+        ::set str [dict get $dvar {*}$path]
+    }
+    #must be ::append - an unqualified 'append' here resolves to ::dictn::append
+    ::append str $value
+    return [dict set dvar {*}$path $str]
+}
+
+# dictn create ?path value ...?
+# Returns a new dict with each value set at its path, processed left to right.
+proc ::dictn::create {args} {
+    ::set data {}
+    foreach {path val} $args {
+        dict set data {*}$path $val
+    }
+    return $data
+}
+
+# dictn exists dictionaryValue path
+proc ::dictn::exists {dictval path} {
+    return [dict exists $dictval {*}$path]
+}
+
+# dictn filter dictionaryValue path filterType arg ?arg ...?
+# Applies 'dict filter' to the sub-dict at path.
+# Run in the caller's scope so that a 'script' filter body can see the caller's variables.
+proc ::dictn::filter {dictval path filterType args} {
+    ::set sub [dict get $dictval {*}$path]
+    return [uplevel 1 [list dict filter $sub $filterType {*}$args]]
+}
+
+# dictn for {keyVar valueVar} dictionaryValue path body
+# Iterates over the sub-dict at path.
+# The loop variables and body belong to the caller, so the loop is run in the caller's scope rather than in this proc's frame.
+proc ::dictn::for {keyvalvars dictval path body} {
+    ::set sub [dict get $dictval {*}$path]
+    return [uplevel 1 [list dict for $keyvalvars $sub $body]]
+}
+
+# dictn get dictionaryValue ?path?
+# An empty path returns the whole dictionary value.
+proc ::dictn::get {dictval {path {}}} {
+    return [dict get $dictval {*}$path]
+}
+
+# dictn getdef dictionaryValue path default
+# Returns the element at path, or default if there is no such element.
+# 'dict getdef' is not present in every Tcl version, so fall back to exists/get when it is missing.
+if {[::info commands ::tcl::dict::getdef] ne ""} {
+    proc ::dictn::getdef {dictval path default} {
+        return [dict getdef $dictval {*}$path $default]
+    }
+} else {
+    proc ::dictn::getdef {dictval path default} {
+        if {[dict exists $dictval {*}$path]} {
+            return [dict get $dictval {*}$path]
+        }
+        return $default
+    }
+}
+
+# dictn getwithdefault dictionaryValue path default
+# Alias of 'dictn getdef'.
+proc ::dictn::getwithdefault {dictval path default} {
+    return [::dictn::getdef $dictval $path $default]
+}
+
+# dictn incr dictVariable path ?increment?
+# Adds increment (default 1) to the value at path and returns the updated dict.
+# A missing element is treated as 0.
+proc ::dictn::incr {dictvar path {increment {}}} {
+    if {$increment eq ""} {
+        ::set increment 1
+    }
+    if {[llength $path] == 1} {
+        return [uplevel 1 [list dict incr $dictvar {*}$path $increment]]
+    }
+    upvar 1 $dictvar dvar
+    if {![::info exists dvar]} {
+        dict set dvar {*}$path $increment
+    } else {
+        #::dictn::getdef copes with Tcl versions lacking 'dict getdef'
+        ::set newval [expr {[::dictn::getdef $dvar $path 0] + $increment}]
+        dict set dvar {*}$path $newval
+    }
+    return $dvar
+}
+
+# dictn info dictionaryValue ?path?
+proc ::dictn::info {dictval {path {}}} {
+    if {![string length $path]} {
+        return [dict info $dictval]
+    }
+    ::set sub [dict get $dictval {*}$path]
+    return [dict info $sub]
+}
+
+# dictn keys dictionaryValue ?path? ?globPattern?
+# An empty globPattern means no filtering.
+proc ::dictn::keys {dictval {path {}} {glob {}}} {
+    ::set sub [dict get $dictval {*}$path]
+    if {[string length $glob]} {
+        return [dict keys $sub $glob]
+    }
+    return [dict keys $sub]
+}
+
+# dictn lappend dictVariable path ?value ...?
+# Appends the values as list elements to the element at path and returns the updated dict.
+# A missing element is treated as an empty list, as 'dict lappend' does for a missing key.
+proc ::dictn::lappend {dictvar path args} {
+    if {[llength $path] == 1} {
+        return [uplevel 1 [list dict lappend $dictvar {*}$path {*}$args]]
+    }
+    upvar 1 $dictvar dvar
+    ::set list [list]
+    if {[::info exists dvar] && [dict exists $dvar {*}$path]} {
+        ::set list [dict get $dvar {*}$path]
+    }
+    ::lappend list {*}$args
+    return [dict set dvar {*}$path $list]
+}
+
+proc ::dictn::merge {args} {
+    error "nested merge not yet supported"
+}
+
+#dictn remove dictionaryValue ?path ...?
+# Returns a copy of the dict with the element at each path removed.
+# Paths that are not present are ignored, as 'dict remove' ignores missing keys.
+proc ::dictn::remove {dictval args} {
+    ::set basic [list] ;#buffer basic (1element path) removals to do in a single call.
+
+    foreach path $args {
+        if {[llength $path] == 1} {
+            ::lappend basic {*}$path
+        } elseif {[dict exists $dictval {*}[lrange $path 0 end-1]]} {
+            #dictval is our own copy - so modifying it in place is equivalent to extract,modify,replace
+            dict unset dictval {*}$path
+        }
+    }
+
+    if {[llength $basic]} {
+        return [dict remove $dictval {*}$basic]
+    }
+    return $dictval
+}
+
+#dictn replace dictionaryValue ?path value ...?
+# Returns a copy of the dict with each value set at its path. Missing intermediate dicts are created.
+# Nested paths are applied first; single-key paths are applied afterwards in one 'dict replace' call.
+proc ::dictn::replace {dictval args} {
+    ::set basic [list] ;#buffer basic (1element path) replacements to do in a single call.
+
+    foreach {path val} $args {
+        if {[llength $path] == 1} {
+            ::lappend basic {*}$path $val
+        } else {
+            #dictval is our own copy - so modifying it in place is equivalent to extract,modify,replace
+            dict set dictval {*}$path $val
+        }
+    }
+
+    if {[llength $basic]} {
+        return [dict replace $dictval {*}$basic]
+    }
+    return $dictval
+}
+
+# dictn set dictVariable path newval
+proc ::dictn::set {dictvar path newval} {
+    upvar 1 $dictvar dvar
+    return [dict set dvar {*}$path $newval]
+}
+
+# dictn size dictionaryValue ?path?
+proc ::dictn::size {dictval {path {}}} {
+    return [dict size [dict get $dictval {*}$path]]
+}
+
+# dictn unset dictVariable path
+proc ::dictn::unset {dictvar path} {
+    upvar 1 $dictvar dvar
+    return [dict unset dvar {*}$path]
+}
+
+# dictn update dictVariable path varName ?path varName ...? body
+# For each path that exists, sets the caller's varName to the element at that path, then runs body in the caller's scope.
+# Afterwards, for each path that still exists, the element is set from varName - or removed if varName was unset.
+# The dict is written back even if body raises an error or returns early, and that outcome is then passed on to the caller.
+proc ::dictn::update {dictvar args} {
+    ::set body [lindex $args end]
+    ::set maplist [lrange $args 0 end-1]
+
+    upvar 1 $dictvar dvar
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path]} {
+            uplevel 1 [list ::set $var [dict get $dvar {*}$path]]
+        }
+    }
+
+    catch {uplevel 1 $body} result opts
+
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path]} {
+            #query the caller's variable via uplevel - linking it into this frame under the same name could clobber our own locals
+            if {[uplevel 1 [list ::info exists $var]]} {
+                dict set dvar {*}$path [uplevel 1 [list ::set $var]]
+            } else {
+                dict unset dvar {*}$path
+            }
+        }
+    }
+    #re-raise whatever the body did (ok, error, return, break, continue) from the caller's point of view
+    dict incr opts -level
+    return -options $opts $result
+}
+
+#an experiment.
+# Not exported (the export pattern only matches lowercase names).
+# Like update, but runs body inside an 'apply' lambda rather than in the caller's scope.
+# NOTE: the lambda links each varName into this proc's frame, so a varName matching one of the locals here will collide.
+proc ::dictn::Applyupdate {dictvar args} {
+    ::set body [lindex $args end]
+    ::set maplist [lrange $args 0 end-1]
+
+    upvar 1 $dictvar dvar
+
+    ::set headscript ""
+    ::set vallist [list] ;#must exist even when no path matches - it is expanded in the apply call below
+    ::set i 0
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path]} {
+            ::lappend vallist [dict get $dvar {*}$path]
+            ::append headscript [string map [list %i% $i %v% $var] {upvar 1 %v% %v%; set %v% [lindex $args %i%]} ]
+            ::append headscript \n
+            ::incr i
+        }
+    }
+
+    ::set body $headscript\r\n$body
+
+    puts stderr "BODY: $body"
+
+    #set result [apply [list args $body] {*}$vallist]
+    catch {apply [list args $body] {*}$vallist} result
+
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path] && [::info exists $var]} {
+            dict set dvar {*}$path [::set $var]
+        }
+    }
+    return $result
+}
+
+# dictn values dictionaryValue ?path? ?globPattern?
+# An empty globPattern means no filtering.
+proc ::dictn::values {dictval {path {}} {glob {}}} {
+    ::set sub [dict get $dictval {*}$path]
+    if {[string length $glob]} {
+        return [dict values $sub $glob]
+    }
+    return [dict values $sub]
+}
+
+# Standard form:
+#'dictn with dictVariable path body'
+#
+# Extended form:
+#'dictn with dictVariable path arrayVariable body'
+# The sub-dict at path is loaded into the caller's array, body is run in the caller's scope,
+# then the array elements are written back to the dict and keys whose elements were unset are removed.
+#
+proc ::dictn::with {dictvar path args} {
+    if {[llength $args] == 1} {
+        ::set body [lindex $args 0]
+        return [uplevel 1 [list dict with $dictvar {*}$path $body]]
+    }
+    if {[llength $args] != 2} {
+        error "wrong # args: should be \"dictn with dictVariable path ?arrayVariable? body\""
+    }
+    upvar 1 $dictvar dvar
+    ::lassign $args arrayname body
+
+    upvar 1 $arrayname arr
+    array set arr [dict get $dvar {*}$path]
+    ::set prevkeys [array names arr]
+
+    catch {uplevel 1 $body} result opts
+
+    #dict unset/dict set take the NAME of the dict variable (dvar) - not its value
+    foreach k $prevkeys {
+        if {![::info exists arr($k)]} {
+            dict unset dvar {*}$path $k
+        }
+    }
+    foreach k [array names arr] {
+        dict set dvar {*}$path $k $arr($k)
+    }
+
+    #pass the body's outcome (including any error) on to the caller
+    dict incr opts -level
+    return -options $opts $result
+}
+
+
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+## Ready
+package provide dictn [namespace eval dictn {
+    variable version
+    ::set version 0.1.1
+}]
+return
\ No newline at end of file
diff --git
a/src/bootsupport/modules/include_modules.config b/src/bootsupport/modules/include_modules.config index 247371ee..afd1e8f2 100644 --- a/src/bootsupport/modules/include_modules.config +++ b/src/bootsupport/modules/include_modules.config @@ -27,6 +27,7 @@ set bootsupport_modules [list\ src/vendormodules sha1\ src/vendormodules tomlish\ src/vendormodules test::tomlish\ + src/vendormodules dictn\ src/vendormodules textutil::adjust\ src/vendormodules textutil::repeat\ src/vendormodules textutil::split\ diff --git a/src/bootsupport/modules/test/tomlish-1.1.3.tm b/src/bootsupport/modules/test/tomlish-1.1.3.tm index ed5044a7..8afb43d9 100644 Binary files a/src/bootsupport/modules/test/tomlish-1.1.3.tm and b/src/bootsupport/modules/test/tomlish-1.1.3.tm differ diff --git a/src/bootsupport/modules/tomlish-1.1.4.tm b/src/bootsupport/modules/tomlish-1.1.4.tm index 7a6d5205..33d5b912 100644 --- a/src/bootsupport/modules/tomlish-1.1.4.tm +++ b/src/bootsupport/modules/tomlish-1.1.4.tm @@ -153,15 +153,10 @@ namespace eval tomlish { } #review - if {[uplevel 1 [list info exists tablenames_seen]]} { - upvar tablenames_seen tablenames_seen + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info } else { - set tablenames_seen [list] ;#list of lists - } - if {[uplevel 1 [list info exists tablenames_closed]]} { - upvar tablenames_closed tablenames_closed - } else { - set tablenames_closed [list] ;#list of lists + set tablenames_info [dict create] ;#keys are lists {parenttable subtable etc} corresponding to parenttable.subtable.etc } foreach sub [lrange $keyval_element 2 end] { @@ -207,13 +202,10 @@ namespace eval tomlish { ARRAY { #we need to recurse to get the corresponding dict for the contained item(s) #pass in the whole $found_sub - not just the $value! 
- set prev_tablenames_seen $tablenames_seen - set prev_tablenames_closed $tablenames_closed - set tablenames_seen [list] - set tablenames_closed [list] + set prev_tablenames_info $tablenames_info + set tablenames_info [dict create] set result [list type $type value [::tomlish::to_dict [list $found_sub]]] - set tablenames_seen $prev_tablenames_seen - set tablenames_closed $prev_tablenames_closed + set tablenames_info $prev_tablenames_info } MULTISTRING - MULTILITERAL { #review - mapping these to STRING might make some conversions harder? @@ -295,23 +287,66 @@ namespace eval tomlish { #[Data] #temps = [{cpu = 79.5, case = 72.0}] proc to_dict {tomlish} { + package require dictn #keep track of which tablenames have already been directly defined, # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. #we don't error out just because a previous tablename segment has already appeared. 
- ##variable tablenames_seen [list] - if {[uplevel 1 [list info exists tablenames_seen]]} { - upvar tablenames_seen tablenames_seen - } else { - set tablenames_seen [list] ;#list of lists - } - if {[uplevel 1 [list info exists tablenames_closed]]} { - upvar tablenames_closed tablenames_closed + + #Declaring, Creating, and Defining Tables + #https://github.com/toml-lang/toml/issues/795 + #(update - only Creating and Defining are relevant terminology) + + #review + #tablenames_info keys created, defined, createdby, definedby, closedby + + #consider the following 2 which are legal: + #[table] #'table' created, defined=open definedby={header table} + #x.y = 3 + #[table.x.z] #'table' defined=closed closedby={header table.x.z}, 'table.x' created, 'table.x.z' created defined=open definedby={header table.x.z} + #k= 22 + # #'table.x.z' defined=closed closedby={eof eof} + + #equivalent datastructure + + #[table] #'table' created, defined=open definedby={header table} + #[table.x] #'table' defined=closed closedby={header table.x}, 'table.x' created defined=open definedby={header table.x} + #y = 3 + #[table.x.z] #'table.x' defined=closed closedby={header table.x.z}, 'table.x.z' created defined=open definedby={header table.x.z} + #k=22 + + #illegal + #[table] #'table' created and defined=open + #x.y = 3 #'table.x' created first keyval pair defined=open definedby={keyval x.y = 3} + #[table.x.y.z] #'table' defined=closed, 'table.x' closed because parent 'table' closed?, 'table.x.y' cannot be created + #k = 22 + # + ## - we would fail on encountering table.x.y because only table and table.x are effectively tables - but that table.x is closed should be detected (?) + + #illegal + #[table] + #x.y = {p=3} + #[table.x.y.z] + #k = 22 + ## we should fail because y is an inline table which is closed to further entries + + #note: it is not safe to compare normalized tablenames using join! 
+ # e.g a.'b.c'.d is not the same as a.b.c.d + # instead compare {a b.c d} with {a b c d} + # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. + #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' + #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} + + + + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info } else { - set tablenames_closed [list] ;#list of lists + set tablenames_info [dict create] ;#keyed on tablepath each of which is a list such as {config subgroup etc} (corresponding to config.subgroup.etc) } + log::info "---> to_dict processing '$tomlish'<<<" set items $tomlish @@ -354,7 +389,7 @@ namespace eval tomlish { #a.b.c = 1 #table_key_hierarchy -> a b - #leafkey -> c + #tleaf -> c if {[llength $dotted_key_hierarchy] == 0} { #empty?? probably invalid. review #This is different to '' = 1 or ''.'' = 1 which have lengths 1 and 2 respectively @@ -362,10 +397,10 @@ namespace eval tomlish { } elseif {[llength $dotted_key_hierarchy] == 1} { #dottedkey is only a key - no table component set table_hierarchy [list] - set leafkey [lindex $dotted_key_hierarchy 0] + set tleaf [lindex $dotted_key_hierarchy 0] } else { set table_hierarchy [lrange $dotted_key_hierarchy 0 end-1] - set leafkey [lindex $dotted_key_hierarchy end] + set tleaf [lindex $dotted_key_hierarchy end] } #ensure empty tables are still represented in the datastructure @@ -380,143 +415,101 @@ namespace eval tomlish { } } #review? - if {[dict exists $datastructure {*}$table_hierarchy $leafkey]} { - error "Duplicate key '$table_hierarchy $leafkey'. The key already exists at this level in the toml data. The toml data is not valid." + if {[dict exists $datastructure {*}$table_hierarchy $tleaf]} { + error "Duplicate key '$table_hierarchy $tleaf'. The key already exists at this level in the toml data. The toml data is not valid." 
} #JMN test 2025 if {[llength $table_hierarchy]} { - lappend tablenames_seen $table_hierarchy + dictn incr tablenames_info [list $table_hierarchy seencount] } set keyval_dict [_get_keyval_value $item] if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - lappend tablenames_seen [list {*}$table_hierarchy $leafkey] - lappend tablenames_closed [list {*}$table_hierarchy $leafkey] + set t [list {*}$table_hierarchy $tleaf] + dictn incr tablenames_info [list $t seencount] + dictn set tablenames_info [list $t closed] 1 #review - item is an ITABLE - we recurse here without datastructure context :/ #overwriting keys? todo ? - dict set datastructure {*}$table_hierarchy $leafkey $keyval_dict + dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict } else { - dict set datastructure {*}$table_hierarchy $leafkey $keyval_dict + dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict } + } + TABLEARRAY { + set tablename [lindex $item 1] + log::debug "---> to_dict processing item TABLENAME (name: $tablename): $item" + set norm_segments [::tomlish::utils::tablename_split $tablename true] ;#true to normalize + #we expect repeated tablearray entries - each adding a sub-object to the value, which is an array/list. + } TABLE { set tablename [lindex $item 1] + log::debug "---> to_dict processing item TABLE (name: $tablename): $item" #set tablename [::tomlish::utils::tablename_trim $tablename] set norm_segments [::tomlish::utils::tablename_split $tablename true] ;#true to normalize - if {$norm_segments in $tablenames_seen} { - error "Table name '$tablename' has already been directly defined in the toml data. Invalid." - } - log::debug "---> to_dict processing item $tag (name: $tablename): $item" - set name_segments [::tomlish::utils::tablename_split $tablename] ;#unnormalized - set last_seg "" - #toml spec rule - all segments mst be non-empty - #note that the results of tablename_split are 'raw' - ie some segments may be enclosed in single or double quotes. 
- - set table_key_sublist [list] - - foreach normseg $norm_segments { - lappend table_key_sublist $normseg - if {[dict exists $datastructure {*}$table_key_sublist]} { - #It's ok for this key to already exist *if* it was defined by a previous tablename or equivalent - #and if this key is longer - - #consider the following 2 which are legal: - #[table] - #x.y = 3 - #[table.x.z] - #k= 22 - - #equivalent - - #[table] - #[table.x] - #y = 3 - #[table.x.z] - #k=22 - - #illegal - #[table] - #x.y = 3 - #[table.x.y.z] - #k = 22 - ## - we should fail on encountering table.x.y because only table and table.x are effectively tables - - #illegal - #[table] - #x.y = {p=3} - #[table.x.y.z] - #k = 22 - ## we should fail because y is an inline table which is closed to further entries - - - #note: it is not safe to compare normalized tablenames using join! - # e.g a.'b.c'.d is not the same as a.b.c.d - # instead compare {a b.c d} with {a b c d} - # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. - #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' - #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} - - set sublist_length [llength $table_key_sublist] - set found_testkey 0 - if {$table_key_sublist in $tablenames_seen} { - set found_testkey 1 - } else { - #see if it was defined by a longer entry - foreach seen_table_segments $tablenames_seen { - if {[llength $seen_table_segments] <= $sublist_length} { - continue - } - #each tablenames_seen entry is already a list of normalized segments - - #we could have [a.b.c.d] early on - # followed by [a.b] - which was still defined by the earlier one. + set T_DEFINED [dictn getdef $tablenames_info [list $norm_segments defined] NULL] + if {$T_DEFINED ne "NULL"} { + #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path + set msg "Table name $tablename has already been directly defined in the toml data. 
Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + error $msg + } - set seen_longer [lrange $seen_segments 0 [expr {$sublist_length -1}]] - puts stderr "testkey:'$table_key_sublist' vs seen_match:'$seen_longer'" - if {$table_key_sublist eq $seen_longer} { - set found_testkey 1 - } - } - } - if {$found_testkey == 0} { - #the raw unnormalized tablename might be ok to display in the error message, although it's not the actual dict keyset - set msg "key $table_key_sublist already exists in datastructure, but wasn't defined by a supertable." - append msg \n "tablenames_seen:" \n - foreach ts $tablenames_seen { - append msg " " $ts \n - } + set name_segments [::tomlish::utils::tablename_split $tablename 0] ;#unnormalized e.g ['a'."b".c.d] -> 'a' "b" c d + #results of tablename_split 0 are 'raw' - ie some segments may be enclosed in single or double quotes. + + + set supertable [list] + ############## + # [a.b.c.d] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + if {![dictn exists $tablenames_info [list $supertable type]]} { + #supertable with this path doesn't yet exist + if {[dict exists $datastructure {*}$supertable]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] 
of table name $tablename already has data - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] error $msg + } else { + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + dictn set tablenames_info [list $supertable type] header + #ensure empty tables are still represented in the datastructure + dict set datastructure {*}$supertable [list] } - } - - } - - #ensure empty tables are still represented in the datastructure - set key_sublist [list] - foreach k $norm_segments { - lappend key_sublist $k - if {![dict exists $datastructure {*}$key_sublist]} { - dict set datastructure {*}$key_sublist [list] } else { - tomlish::log::notice "to_dict datastructure at (TABLE) subkey $key_sublist already had data: [dict get $datastructure {*}$key_sublist]" + #supertable has already been created - and maybe defined - but even if defined we can add subtables } } + #table [a.b.c.d] hasn't been defined - but may have been 'created' already by a longer tablename + # - or may have existing data from a keyval + if {![dictn exists $tablenames_info [list $norm_segments type]]} { + if {[dict exists $datastructure {*}$norm_segments]} { + set msg "Table name $tablename already has data - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + error $msg + } + #no data or previously created table + dictn set tablenames_info [list $norm_segments type] header - #We must do this after the key-collision test above! 
- lappend tablenames_seen $norm_segments - - + #We are 'defining' this table's keys and values here (even if empty) + dict set datastructure {*}$norm_segments [list] ;#ensure table still represented in datastructure even if we add no keyvals here + } + dictn set tablenames_info [list $norm_segments defined] open log::debug ">>> to_dict >>>>>>>>>>>>>>>>> normalized table key hierarchy : $norm_segments" #now add the contained elements foreach element [lrange $item 2 end] { set type [lindex $element 0] - log::debug "----> tododict processing $tag subitem $type processing contained element $element" + log::debug "----> todict processing $tag subitem $type processing contained element $element" switch -exact -- $type { DOTTEDKEY { set dkey_info [_get_dottedkey_info $element] @@ -547,14 +540,19 @@ namespace eval tomlish { puts stdout "to_dict>>> $keyval_dict" dict set datastructure {*}$norm_segments {*}$dkeys $leaf_key $keyval_dict #JMN 2025 - lappend tablenames_seen [list {*}$norm_segments {*}$dkeys] + #lappend tablenames_info [list {*}$norm_segments {*}$dkeys] + set tkey [list {*}$norm_segments {*}$dkeys] + dictn incr tablenames_info [list $tkey seencount] if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys # inner structure will contain {type value } if all leaves are not empty ITABLES - lappend tablenames_seen [list {*}$norm_segments {*}$dkeys $leaf_key] + set tkey [list {*}$norm_segments {*}$dkeys $leaf_key] + #lappend tablenames_info [list {*}$norm_segments {*}$dkeys $leaf_key] + dictn incr tablenames_info [list $tkey seencount] #if the keyval_dict is not a simple type x value y - then it's an inline table ? #if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. 
+ dictn set tablenames_info [list $tkey closed] 1 } } @@ -562,7 +560,7 @@ namespace eval tomlish { #ignore } default { - error "Sub element of type '$type' not understood in table context. Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" + error "Sub element of type '$type' not understood in table context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" } } } @@ -1316,7 +1314,12 @@ namespace eval tomlish::encode { #NOTE - this DELIBERATELY does not validate the data, or process escapes etc #It encodes the tomlish records as they are. #ie it only produces toml shaped data from a tomlish list. + # #It is part of the roundtripability of data from toml to tomlish + #!! ie - it is not the place to do formatting of inline vs multiline !! + # That needs to be encoded in the tomlish data that is being passed in + # (e.g from_dict could make formatting decisions in the tomlish it produces) + # #e.g duplicate keys etc can exist in the toml output. #The to_dict from_dict (or any equivalent processor pair) is responsible for validation and conversion #back and forth of escape sequences where appropriate. @@ -1646,17 +1649,27 @@ namespace eval tomlish::decode { #pop_trigger_tokens: newline tablename endarray endinlinetable #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. 
set parentlevel [expr {$nest -1}] - set do_append_to_parent 1 ;#most tokens will leave this alone - but some like squote_seq need to do their own append + set do_append_to_parent 1 ;#most tokens will leave this alone - but some like tentative_accum_squote need to do their own append switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote { + #should only apply within a multiliteral #### set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed #Without this - we would get extraneous empty list entries in the parent # - as the xxx-squote-space isn't a space level from the toml perspective # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-squote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-squote-space { + } + default { + error "--- unexpected popped due to tentative_accum_squote but came from state '$prevstate' should have been trailing-squote-space" + } + } switch -- $tok { ' { - tomlish::parse::set_token_waiting type startsquote value $tok complete 1 startindex [expr {$i -1}] + tomlish::parse::set_token_waiting type single_squote value $tok complete 1 startindex [expr {$i -1}] } '' { #review - we should perhaps return double_squote instead? 
@@ -1669,74 +1682,51 @@ namespace eval tomlish::decode { tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] } '''' { - switch -exact -- $prevstate { - leading-squote-space { - error "---- 4 squotes from leading-squote-space - shouldn't get here" - #we should have emitted the triple and left the last for next loop + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left squote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]'" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata } - trailing-squote-space { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] - #todo integrate left squote with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]'" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "'"] - } - MULTILITERAL { - #empty - lappend v($parentlevel) [list LITERALPART "'"] - } - default { - error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" - } - } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "'"] + } + MULTILITERAL { + #empty + lappend v($parentlevel) [list LITERALPART "'"] } default { - error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" + error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" } } } ''''' { - switch -exact -- $prevstate { - leading-squote-space { - error "---- 5 squotes from leading-squote-space - 
shouldn't get here" - #we should have emitted the triple and left the following squotes for next loop + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 squotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]''" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata } - trailing-squote-space { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] - #todo integrate left 2 squotes with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]''" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "''"] - } - MULTILITERAL { - lappend v($parentlevel) [list LITERALPART "''"] - } - default { - error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" - } - } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "''"] + } + MULTILITERAL { + lappend v($parentlevel) [list LITERALPART "''"] } default { - error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" + error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" } } } } - puts stderr "tomlish::decode::toml ---- HERE squote_seq pop <$tok>" } triple_squote { #presumably popping multiliteral-space @@ -1763,7 +1753,119 @@ namespace eval tomlish::decode { lappend merged $part } default { - error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($next)" + error "---- triple_squote unhandled part type [lindex $part 0] unable 
to merge leveldata: $v($nest)" + } + } + set lasttype [lindex $part 0] + } + set v($nest) $merged + } + tentative_accum_dquote { + #should only apply within a multistring + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the trailing-dquote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-dquote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-dquote-space { + } + default { + error "--- unexpected popped due to tentative_accum_dquote but came from state '$prevstate' should have been trailing-dquote-space" + } + } + switch -- $tok { + {"} { + tomlish::parse::set_token_waiting type single_dquote value $tok complete 1 startindex [expr {$i -1}] + } + {""} { + #review - we should perhaps return double_dquote instead? 
+ #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_dquote value "" complete 1 startindex [expr {$i - 2}] + } + {"""} { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 3}] + } + {""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left dquote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {"}] + } + MULTISTRING { + #empty + lappend v($parentlevel) [list STRINGPART {"}] + } + default { + error "--- don't know how to integrate extra trailing dquote with data $v($parentlevel)" + } + } + } + {"""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 dquotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {""}] + } + MULTISTRING { + lappend v($parentlevel) [list STRINGPART {""}] + } + default { + error "--- don't know how to integrate extra trailing 2 dquotes with data $v($parentlevel)" + } + } + } + } + } + triple_dquote { + #presumably popping multistring-space + ::tomlish::log::debug "---- triple_dquote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + 
switch -exact -- [lindex $part 0] { + MULTISTRING { + lappend merged $part + } + STRINGPART { + if {$lasttype eq "STRINGPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + CONT - WS { + lappend merged $part + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_dquote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" } } set lasttype [lindex $part 0] @@ -1809,15 +1911,12 @@ namespace eval tomlish::decode { endinlinetable { ::tomlish::log::debug "---- endinlinetable for last_space_action pop" } - endmultiquote { - ::tomlish::log::debug "---- endmultiquote for last_space_action 'pop'" - } default { error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" } } if {$do_append_to_parent} { - #e.g squote_seq does it's own appends as necessary - so won't get here + #e.g tentative_accum_squote does it's own appends as necessary - so won't get here lappend v($parentlevel) [set v($nest)] } @@ -1831,8 +1930,8 @@ namespace eval tomlish::decode { switch -exact -- $tokenType { - squote_seq_begin { - #### + tentative_trigger_squote - tentative_trigger_dquote { + #### this startok will always be tentative_accum_squote/tentative_accum_dquote starting with one accumulated squote/dquote if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { lassign [dict get $transition_info starttok] starttok_type starttok_val set next_tokenType_known 1 @@ -1840,6 +1939,16 @@ namespace eval tomlish::decode { set tok $starttok_val } } + single_squote { + #JMN - REVIEW + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + triple_squote { + 
::tomlish::log::debug "---- push trigger tokenType triple_squote" + set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERALPART + } squotedkey { switch -exact -- $prevstate { table-space - itable-space { @@ -1849,6 +1958,9 @@ namespace eval tomlish::decode { #todo - check not something already waiting? tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } + triple_dquote { + set v($nest) [list MULTISTRING] ;#container for NEWLINE,STRINGPART,CONT + } dquotedkey { switch -exact -- $prevstate { table-space - itable-space { @@ -1858,7 +1970,7 @@ namespace eval tomlish::decode { #todo - check not something already waiting? tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } - XXXdquotedkey - XXXitablequotedkey { + XXXdquotedkey { #todo set v($nest) [list DQKEY $tok] ;#$tok is the keyname } @@ -1878,34 +1990,29 @@ namespace eval tomlish::decode { tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } } - startsquote { - #JMN - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } tablename { #note: we do not use the output of tomlish::tablename_trim to produce a tablename for storage in the tomlish list! #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish # back to toml file will be identical. #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, - # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names. 
+ # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names from + # a structural perspective. #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the # tomlish list? - set test_only [::tomlish::utils::tablename_trim $tok] - ::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$test_only'" + #set trimtable [::tomlish::utils::tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$trimtable'" set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name #note also that equivalent tablenames may have different toml representations even after being trimmed! #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. } tablearrayname { - set test_only [::tomlish::utils::tablename_trim $tok] - puts stdout "trimmed (but not normalized) tablearrayname: '$test_only'" + #set trimtable [::tomlish::utils::tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablearrayname: '$trimtable'" set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name } startarray { @@ -1914,14 +2021,6 @@ namespace eval tomlish::decode { startinlinetable { set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. 
} - startmultiquote { - ::tomlish::log::debug "---- push trigger tokenType startmultiquote" - set v($nest) [list MULTISTRING] ;#container for STRINGPART, WS, CONT, NEWLINE - } - triple_squote { - ::tomlish::log::debug "---- push trigger tokenType triple_squote" - set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERAL - } default { error "---- push trigger tokenType '$tokenType' not yet implemented" } @@ -1931,11 +2030,11 @@ namespace eval tomlish::decode { #no space level change switch -exact -- $tokenType { squotedkey { - puts "---- squotedkey in state $prevstate (no space level change)" + #puts "---- squotedkey in state $prevstate (no space level change)" lappend v($nest) [list SQKEY $tok] } dquotedkey { - puts "---- dquotedkey in state $prevstate (no space level change)" + #puts "---- dquotedkey in state $prevstate (no space level change)" lappend v($nest) [list DQKEY $tok] } barekey { @@ -1960,29 +2059,46 @@ namespace eval tomlish::decode { startinlinetable { puts stderr "---- decode::toml error. 
did not expect startinlinetable without space level change (no space level change)" } - startquote { + single_dquote { switch -exact -- $newstate { string-state { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "string" set tok "" } - quoted-key { + dquoted-key { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "dquotedkey" set tok "" } - XXXitable-quoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "itablequotedkey" - set tok "" + multistring-space { + lappend v($nest) [list STRINGPART {"}] + #may need to be joined on pop if there are neighbouring STRINGPARTS + } + default { + error "---- single_dquote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_dquote { + #leading extra quotes - test: toml_multistring_startquote2 + switch -exact -- $prevstate { + itable-keyval-value-expected - keyval-value-expected { + puts stderr "tomlish::decode::toml double_dquote TEST" + #empty string + lappend v($nest) [list STRINGPART ""] + } + multistring-space { + #multistring-space to multistring-space + lappend v($nest) [list STRINGPART {""}] } default { - error "---- startquote switch case not implemented for nextstate: $newstate (no space level change)" + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" } } + } - startsquote { + single_squote { switch -exact -- $newstate { literal-state { set next_tokenType_known 1 @@ -1995,41 +2111,17 @@ namespace eval tomlish::decode { set tok "" } multiliteral-space { - #false alarm squote returned from squote_seq pop + #false alarm squote returned from tentative_accum_squote pop ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" #(single squote - not terminating space) lappend v($nest) [list LITERALPART '] #may need to be joined on pop if there are neighbouring LITERALPARTs } default { 
- error "---- startsquote switch case not implemented for nextstate: $newstate (no space level change)" + error "---- single_squote switch case not implemented for nextstate: $newstate (no space level change)" } } } - startmultiquote { - #review - puts stderr "---- got startmultiquote in state $prevstate (no space level change)" - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "stringpart" - set tok "" - } - endquote { - #nothing to do? - set tok "" - } - endsquote { - set tok "" - } - endmultiquote { - #JMN!! - set tok "" - } - string { - lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes - } - literal { - lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes - } double_squote { switch -exact -- $prevstate { keyval-value-expected { @@ -2044,6 +2136,19 @@ namespace eval tomlish::decode { } } } + enddquote { + #nothing to do? + set tok "" + } + endsquote { + set tok "" + } + string { + lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes + } + literal { + lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes + } multistring { #review lappend v($nest) [list MULTISTRING $tok] @@ -2056,11 +2161,9 @@ namespace eval tomlish::decode { } literalpart { lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly - } - itablequotedkey { - } untyped_value { + #would be better termed unclassified_value #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. 
if {$tok in {true false}} { set tag BOOL @@ -2238,7 +2341,7 @@ namespace eval tomlish::utils { #eg {dog."tater.man"} set sLen [tcl::string::length $tablename] set segments [list] - set mode "unknown" ;#5 modes: unknown, quoted,litquoted, unquoted, syntax + set mode "preval" ;#5 modes: preval, quoted,litquoted, unquoted, postval #quoted is for double-quotes, litquoted is for single-quotes (string literal) set seg "" for {set i 0} {$i < $sLen} {incr i} { @@ -2249,139 +2352,166 @@ namespace eval tomlish::utils { set lastChar "" } + #todo - track\count backslashes properly + set c [tcl::string::index $tablename $i] + if {$c eq "\""} { + if {($lastChar eq "\\")} { + #not strictly correct - we could have had an even number prior-backslash sequence + #the toml spec would have us error out immediately on bsl in bad location - but we're + #trying to parse to unvalidated tomlish + set ctest escq + } else { + set ctest dq + } + } else { + set ctest [string map [list " " sp \t tab] $c] + } - if {$c eq "."} { - switch -exact -- $mode { - unquoted { - #dot marks end of segment. - lappend segments $seg - set seg "" - set mode "unknown" - } - quoted { - append seg $c - } - unknown { - lappend segments $seg - set seg "" - } - litquoted { - append seg $c - } - default { - #mode: syntax - #we got our dot. - the syntax mode is now satisfied. - set mode "unknown" + switch -- $ctest { + . { + switch -exact -- $mode { + preval { + error "tablename_split. dot not allowed - expecting a value" + } + unquoted { + #dot marks end of segment. + #if {![is_barekey $seg]} { + # error "tablename_split. 
dot not allowed - expecting a value" + #} + lappend segments $seg + set seg "" + set mode "preval" + } + quoted { + append seg $c + } + litquoted { + append seg $c + } + postval { + #got dot in an expected location + set mode "preval" + } } } - } elseif {($c eq "\"") && ($lastChar ne "\\")} { - if {$mode eq "unknown"} { - if {[tcl::string::trim $seg] ne ""} { - #we don't allow a quote in the middle of a bare key - error "tablename_split. character '\"' invalid at this point in tablename. tablename: '$tablename'" - } - set mode "quoted" - set seg "\"" - } elseif {$mode eq "unquoted"} { - append seg $c - } elseif {$mode eq "quoted"} { - append seg $c - - if {$normalize} { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - } else { - lappend segments $seg + dq { + #unescaped dquote + switch -- $mode { + preval { + set mode "quoted" + set seg "\"" + } + unquoted { + #invalid in barekey - but we are after structure only + append seg $c + } + quoted { + append seg $c + if {$normalize} { + lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" ;#make sure we only accept a dot or end-of-data now. + } + litquoted { + append seg $c + } + postval { + error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" + } } - - set seg "" - set mode "syntax" ;#make sure we only accept a dot or end-of-data now. - } elseif {$mode eq "litquoted"} { - append seg $c - } elseif {$mode eq "syntax"} { - error "tablename_split. expected whitespace or dot, got double quote. 
tablename: '$tablename'" - } - } elseif {($c eq "\'")} { - if {$mode eq "unknown"} { - append seg $c - set mode "litquoted" - } elseif {$mode eq "unquoted"} { - #single quote inside e.g o'neill - append seg $c - } elseif {$mode eq "quoted"} { - append seg $c - - } elseif {$mode eq "litquoted"} { - append seg $c - #no normalization to do - lappend segments $seg - set seg "" - set mode "syntax" - } elseif {$mode eq "syntax"} { - error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" } - - } elseif {$c in [list " " \t]} { - if {$mode eq "syntax"} { - #ignore - } else { - append seg $c + ' { + switch -- $mode { + preval { + append seg $c + set mode "litquoted" + } + unquoted { + #single quote inside e.g o'neill - ultimately invalid - but we pass through here. + append seg $c + } + quoted { + append seg $c + } + litquoted { + append seg $c + #no normalization to do aside from stripping squotes + if {$normalize} { + lappend segments [tcl::string::range $seg 1 end-1] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" + } + postval { + error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" + } + } } - } else { - if {$mode eq "syntax"} { - error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" + sp - tab { + switch -- $mode { + preval - postval { + #ignore + } + unquoted { + #terminates a barekey + lappend segments $seg + set seg "" + set mode "postval" + } + default { + #append to quoted or litquoted + append seg $c + } + } } - if {$mode eq "unknown"} { - set mode "unquoted" + default { + switch -- $mode { + preval { + set mode unquoted + append seg $c + } + postval { + error "tablename_split. Expected a dot separator. got '$c'. 
tablename: '$tablename'" + } + default { + append seg $c + } + } } - append seg $c } + if {$i == $sLen-1} { #end of data ::tomlish::log::debug "End of data: mode='$mode'" - #REVIEW - we can only end up in unquoted or syntax here? are other branches reachable? switch -exact -- $mode { - quoted { - if {$c ne "\""} { - error "tablename_split. missing closing double-quote in a segment. tablename: '$tablename'" - } - if {$normalize} { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - #lappend segments [subst -nocommands -novariables [::string range $seg 1 end-1]] ;#wrong - } else { - lappend segments $seg - } + preval { + error "tablename_split. Expected a value after last dot separator. tablename: '$tablename'" } - litquoted { - set trimmed_seg [tcl::string::trim $seg] - if {[tcl::string::index $trimmed_seg end] ne "\'"} { - error "tablename_split. missing closing single-quote in a segment. tablename: '$tablename'" - } + unquoted { lappend segments $seg } - unquoted - unknown { - lappend segments $seg + quoted { + error "tablename_split. Expected a trailing double quote. tablename: '$tablename'" } - syntax { - #ok - segment already lappended + litquoted { + error "tablename_split. Expected a trailing single quote. tablename: '$tablename'" } - default { - lappend segments $seg + postval { + #ok - segment already lappended } } } } - foreach seg $segments { - set trimmed [tcl::string::trim $seg " \t"] - #note - we explicitly allow 'empty' quoted strings '' & "" - # (these are 'discouraged' but valid toml keys) - #if {$trimmed in [list "''" "\"\""]} { - # puts stderr "tablename_split. warning - Empty quoted string as tablename segment" - #} - if {$trimmed eq "" } { - error "tablename_split. Empty segment found. 
tablename: '$tablename' segments [llength $segments] ($segments)" - } - } + + #note - we must allow 'empty' quoted strings '' & "" + # (these are 'discouraged' but valid toml keys) + return $segments } @@ -2432,26 +2562,34 @@ namespace eval tomlish::utils { #- escape_string and unescape_string would not be reliably roundtrippable inverses anyway. #REVIEW - provide it anyway? When would it be desirable to use? - variable Bstring_control_map [list\ - \b {\b}\ - \n {\n}\ - \r {\r}\ - \" {\"}\ - \x1b {\e}\ - \\ "\\\\"\ - ] + variable Bstring_control_map [dict create] + dict set Bstring_control_map \b {\b} + dict set Bstring_control_map \n {\n} + dict set Bstring_control_map \r {\r} + dict set Bstring_control_map \" {\"} + #dict set Bstring_control_map \x1b {\e} ;#should presumably only be a convenience for decode - going the other way we get \u001B + dict set Bstring_control_map \\ "\\\\" + #\e for \x1b seems like it might be included - v1.1?? hard to find current state of where toml is going :/ #for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped. - for {set cdec 0} {$cdec <= 8} {incr cdec} { + #8 = \b - already in list. 
+ #built the remainder whilst checking for entries already hardcoded above -in case more are added to the hardcoded list + for {set cdec 0} {$cdec <= 7} {incr cdec} { set hhhh [format %.4X $cdec] - lappend Bstring_control_map [format %c $cdec] \\u$hhhh + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } } for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { set hhhh [format %.4X $cdec] - lappend Bstring_control_map [format %c $cdec] \\u$hhhh + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } } # \u007F = 127 - lappend Bstring_control_map [format %c 127] \\u007F + dict set Bstring_control_map [format %c 127] \\u007F #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) @@ -2474,6 +2612,7 @@ namespace eval tomlish::utils { # it recognizes other escapes which aren't approprite e.g \xhh and octal \nnn # it replaces \ with a single whitespace (trailing backslash) #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh + #plus \e for \x1b? 
set buffer "" set buffer4 "" ;#buffer for 4 hex characters following a \u @@ -2558,12 +2697,13 @@ namespace eval tomlish::utils { set ctest [tcl::string::map {{"} dq} $c] switch -exact -- $ctest { dq { - set e "\\\"" - append buffer [subst -nocommand -novariable $e] + append buffer {"} } b - t - n - f - r { - set e "\\$c" - append buffer [subst -nocommand -novariable $e] + append buffer [subst -nocommand -novariable "\\$c"] + } + e { + append buffer \x1b } u { set unicode4_active 1 @@ -2578,8 +2718,7 @@ namespace eval tomlish::utils { #review - toml spec says all other escapes are reserved #and if they are used TOML should produce an error. #we leave detecting this for caller for now - REVIEW - append buffer "\\" - append buffer $c + append buffer "\\$c" } } } else { @@ -3003,7 +3142,7 @@ namespace eval tomlish::parse { # states: # table-space, itable-space, array-space # array-value-expected,keyval-value-expected,itable-keyval-value-expected, keyval-syntax, - # quoted-key, squoted-key + # dquoted-key, squoted-key # string-state, literal-state, multistring... # # notes: @@ -3039,6 +3178,12 @@ namespace eval tomlish::parse { variable stateMatrix set stateMatrix [dict create] + #--------------------------------------------------------- + #WARNING + #The stateMatrix implementation here is currently messy. + #The code is a mixture of declarative via the stateMatrix and imperative via switch statements during PUSH/POP/SAMESPACE transitions. + #This means the state behaviour has to be reasoned about by looking at both in conjunction. 
+ #--------------------------------------------------------- #xxx-space vs xxx-syntax inadequately documented - TODO @@ -3060,35 +3205,19 @@ namespace eval tomlish::parse { barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ dquotedkey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ - XXXstartquote "quoted-key"\ - XXXstartsquote "squoted-key"\ + XXXsingle_dquote "quoted-key"\ + XXXsingle_squote "squoted-key"\ comment "table-space"\ starttablename "tablename-state"\ starttablearrayname "tablearrayname-state"\ - startmultiquote "err-state"\ - endquote "err-state"\ + enddquote "err-state"\ + endsquote "err-state"\ comma "err-state"\ eof "end-state"\ equal "err-state"\ cr "err-lonecr"\ } - #itable-space/ curly-syntax : itables - dict set stateMatrix\ - itable-space {\ - whitespace "itable-space"\ - newline "itable-space"\ - barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - endinlinetable "POPSPACE"\ - XXXstartquote "quoted-key"\ - XXXstartsquote {TOSTATE "squoted-key" comment "jn-testing"}\ - comma "err-state"\ - comment "itable-space"\ - eof "err-state"\ - } - #squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-space starttok {squote_seq "'"}} dict set stateMatrix\ @@ -3113,26 +3242,19 @@ namespace eval tomlish::parse { dict set stateMatrix\ keyval-value-expected {\ whitespace "keyval-value-expected"\ - untyped_value {TOSTATE "keyval-tail" note ""}\ - startquote {TOSTATE "string-state" returnstate keyval-tail}\ - startmultiquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-value-expected starttok {squote_seq "'"}}\ - startsquote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is 
triggered"}\ - double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"}\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ - startarray {PUSHSPACE array-space returnstate keyval-tail}\ - } - #squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-process-leading-squotes starttok {squote_seq "'"}} - dict set stateMatrix\ - leading-squote-space {\ - squote_seq "POPSPACE"\ + untyped_value {TOSTATE "keyval-tail" note ""}\ + literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ + string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ + single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ + single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ + startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ + startarray {PUSHSPACE array-space returnstate keyval-tail}\ } - #dict set stateMatrix\ - # keyval-process-leading-squotes {\ - # startsquote "literal-state"\ - # triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - # } + #double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"} + + #2025 - no leading-squote-space - only trailing-squote-space. 
dict set stateMatrix\ keyval-tail {\ @@ -3142,81 +3264,106 @@ namespace eval tomlish::parse { eof "end-state"\ } + + #itable-space/ curly-syntax : itables + # x={y=1,} + dict set stateMatrix\ + itable-space {\ + whitespace "itable-space"\ + newline "itable-space"\ + barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + endinlinetable "POPSPACE"\ + comma "err-state"\ + comment "itable-space"\ + eof "err-state"\ + } + #we don't get single_squote etc here - instead we get the resulting squotedkey token + + + # ??? review - something like this + # + # x={y =1,} dict set stateMatrix\ itable-keyval-syntax {\ - whitespace "itable-keyval-syntax"\ - barekey {PUSHSPACE "dottedkey-space"}\ - squotedkey {PUSHSPACE "dottedkey-space"}\ - dquotedkey {PUSHSPACE "dottedkey-space"}\ - equal "itable-keyval-value-expected"\ + whitespace {TOSTATE "itable-keyval-syntax"}\ + barekey {PUSHSPACE "dottedkey-space"}\ + squotedkey {PUSHSPACE "dottedkey-space"}\ + dquotedkey {PUSHSPACE "dottedkey-space"}\ + equal {TOSTATE "itable-keyval-value-expected"}\ newline "err-state"\ eof "err-state"\ } + + # x={y=1} + dict set stateMatrix\ + itable-keyval-space {\ + whitespace "itable-keyval-syntax"\ + equal {TOSTATE "itable-keyval-value-expected" note "required"}\ + } + dict set stateMatrix\ itable-keyval-value-expected {\ whitespace "itable-keyval-value-expected"\ untyped_value {TOSTATE "itable-val-tail" note ""}\ - startquote {TOSTATE "string-state" returnstate itable-val-tail}\ - startmultiquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-keyval-value-expected starttok {squote_seq "'"}}\ - startsquote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ - double_squote {TOSTATE "itable-val-tail" 
note "empty literal received when double squote occurs"}\ + single_dquote {TOSTATE "string-state" returnstate itable-val-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ + single_squote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ } - dict set stateMatrix\ - itable-keyval-space {\ - whitespace "itable-keyval-syntax"\ - equal {TOSTATE "itable-keyval-value-expected" note "required"}\ - } + #double_squote not currently generated by _start_squote_sequence - '' processed as single_squote to literal-state just like 'xxx' + # review + # double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"} + + + # x={y=1,z="x"} + #POPSPACE is transition from itable-keyval-space to parent itable-space dict set stateMatrix\ itable-val-tail {\ whitespace "itable-val-tail"\ endinlinetable "POPSPACE"\ comma "POPSPACE"\ - XXXnewline {TOSTATE "itable-val-tail" note "itable-space ??"}\ - newline "POPSPACE"\ + newline {TOSTATE "itable-val-tail" note "itable-space ??"}\ comment "itable-val-tail"\ eof "err-state"\ } - #dict set stateMatrix\ - # itable-quoted-key {\ - # whitespace "NA"\ - # itablequotedkey {PUSHSPACE "itable-keyval-space"}\ - # newline "err-state"\ - # endquote "itable-keyval-syntax"\ - # } - #dict set stateMatrix\ - # itable-squoted-key {\ - # whitespace "NA"\ - # itablesquotedkey {PUSHSPACE "itable-keyval-space"}\ - # newline "err-state"\ - # endsquote "itable-keyval-syntax"\ - # } + # XXXnewline "POPSPACE" + # We shouldn't popspace on newline - as if there was no comma we need to stay in itable-val-tail + # This means the newline and subsequent whitespace, comments etc become part of the preceding dottedkey record + #e.g + # x = { + # j=1 + # 
#comment within dottedkey j record + # , # comment unattached + # #comment unattached + # k=2 , #comment unattached + # l=3 #comment within l record + # , m=4 + # #comment associated with m record + # + # #still associated with m record + # } + ## - This doesn't quite correspond to what a user might expect - but seems like a consistent mechanism. + #The awkwardness is because there is no way to put in a comment that doesn't consume a trailing comma + #so we cant do: j= 1 #comment for j1 , + # and have the trailing comma recognised. + # + # To associate: j= 1, #comment for j1 + # we would need some extra processing . (not popping until next key ? extra state itable-sep-tail?) REVIEW - worth doing? + # + # The same issue occurs with multiline arrays. The most natural assumption is that a comment on same line after a comma + # is 'associated' with the previous entry. + # + # These comment issues are independent of the data dictionary being generated for conversion to json etc - as the comments don't carry through anyway, + # but are a potential oddity for manipulating the intermediate tomlish structure whilst attempting to preserve 'associated' comments + # (e.g reordering records within an itable) + #The user's intention for 'associated' isn't always clear and the specs don't really guide on this. - - - #array-value-expected ? - dict set stateMatrix\ - XXXvalue-expected {\ - whitespace "value-expected"\ - untyped_value {"SAMESPACE" "" replay untyped_value}\ - startquote "string-state"\ - startsquote "literal-state"\ - triple_squote {PUSHSPACE "multiliteral-space"}\ - startmultiquote {PUSHSPACE "multistring-space"}\ - startinlinetable {PUSHSPACE itable-space}\ - startarray {PUSHSPACE array-space}\ - comment "err-state-value-expected-got-comment"\ - comma "err-state"\ - newline "err-state"\ - eof "err-state"\ - } - #note comment token should never be delivered to array-value-expected state? 
- #dottedkey-space is not (currently) used within [tablename] or [[tablearrayname]] #it is for keyval ie x.y.z = value @@ -3245,6 +3392,8 @@ namespace eval tomlish::parse { whitespace "dottedkey-space-tail" dotsep "dottedkey-space" equal "POPSPACE"\ + eof "err-state"\ + newline "err-state"\ } #-------------------------------------------------------------------------- @@ -3262,22 +3411,10 @@ namespace eval tomlish::parse { #toml spec looks like heading towards allowing newlines within inline tables #https://github.com/toml-lang/toml/issues/781 - #2025 - appears to be valid for 1.1 - which we are targeting. + #2025 - multiline itables appear to be valid for 1.1 - which we are targeting. #https://github.com/toml-lang/toml/blob/main/toml.md#inline-table #JMN2025 - #dict set stateMatrix\ - # curly-syntax {\ - # whitespace "curly-syntax"\ - # newline "curly-syntax"\ - # barekey {PUSHSPACE "itable-keyval-space"}\ - # itablequotedkey "itable-keyval-space"\ - # endinlinetable "POPSPACE"\ - # startquote "itable-quoted-key"\ - # comma "itable-space"\ - # comment "itable-space"\ - # eof "err-state"\ - # } #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES #We currently allow multiline ITABLES (also with comments) in the tokenizer. #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? 
@@ -3291,10 +3428,9 @@ namespace eval tomlish::parse { # untyped_value "SAMESPACE"\ # startarray {PUSHSPACE "array-space"}\ # endarray "POPSPACE"\ - # startmultiquote {PUSHSPACE multistring-space}\ # startinlinetable {PUSHSPACE itable-space}\ - # startquote "string-state"\ - # startsquote "literal-state"\ + # single_dquote "string-state"\ + # single_squote "literal-state"\ # triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ # comma "array-space"\ # comment "array-space"\ @@ -3305,15 +3441,16 @@ namespace eval tomlish::parse { set aspace [dict create] dict set aspace whitespace "array-space" dict set aspace newline "array-space" - dict set aspace untyped_value "SAMESPACE" + #dict set aspace untyped_value "SAMESPACE" + dict set aspace untyped_value "array-syntax" dict set aspace startarray {PUSHSPACE "array-space"} dict set aspace endarray "POPSPACE" - dict set aspace startmultiquote {PUSHSPACE multistring-space} + dict set aspace single_dquote {TOSTATE "string-state" returnstate array-syntax} + dict set aspace triple_dquote {PUSHSPACE "multistring-space" returnstate array-syntax} + dict set aspace single_squote {TOSTATE "literal-state" returnstate array-syntax} + dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax} dict set aspace startinlinetable {PUSHSPACE itable-space} - dict set aspace startquote "string-state" - dict set aspace startsquote "literal-state" - dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"} - dict set aspace comma "array-space" + #dict set aspace comma "array-space" dict set aspace comment "array-space" dict set aspace eof "err-state-array-space-got-eof" dict set stateMatrix array-space $aspace @@ -3329,26 +3466,16 @@ namespace eval tomlish::parse { #dict set asyntax untyped_value "SAMESPACE" #dict set asyntax startarray {PUSHSPACE array-space} dict set asyntax endarray "POPSPACE" - #dict set asyntax 
startmultiquote {PUSHSPACE multistring-space} - #dict set asyntax startquote "string-state" - #dict set asyntax startsquote "literal-state" + #dict set asyntax single_dquote "string-state" + #dict set asyntax single_squote "literal-state" dict set asyntax comma "array-space" dict set asyntax comment "array-syntax" dict set stateMatrix array-syntax $asyntax - #quoted-key & squoted-key need to PUSHSPACE from own token to keyval-space - dict set stateMatrix\ - quoted-key {\ - whitespace "NA"\ - dquotedkey {PUSHSPACE "keyval-space"}\ - newline "err-state"\ - endquote "keyval-syntax"\ - } - - #review + #dquotedkey is a token - dquoted-key is a state dict set stateMatrix\ dquoted-key {\ whitespace "NA"\ @@ -3367,7 +3494,7 @@ namespace eval tomlish::parse { string-state {\ whitespace "NA"\ string "string-state"\ - endquote "SAMESPACE"\ + enddquote "SAMESPACE"\ newline "err-state"\ eof "err-state"\ } @@ -3381,20 +3508,21 @@ namespace eval tomlish::parse { } - #dict set stateMatrix\ - # stringpart {\ - # continuation "SAMESPACE"\ - # endmultiquote "POPSPACE"\ - # eof "err-state"\ - # } dict set stateMatrix\ multistring-space {\ - whitespace "multistring-space"\ - continuation "multistring-space"\ - stringpart "multistring-space"\ - newline "multistring-space"\ - endmultiquote "POPSPACE"\ - eof "err-state"\ + whitespace "multistring-space"\ + continuation "multistring-space"\ + stringpart "multistring-space"\ + newline "multistring-space"\ + tentative_trigger_dquote {PUSHSPACE "trailing-dquote-space" returnstate multistring-space starttok {tentative_accum_dquote {"}}}\ + single_dquote {TOSTATE multistring-space}\ + double_dquote {TOSTATE multistring-space}\ + triple_dquote {POPSPACE}\ + eof "err-state"\ + } + dict set stateMatrix\ + trailing-dquote-space { + tentative_accum_dquote "POPSPACE" } @@ -3402,19 +3530,19 @@ namespace eval tomlish::parse { #todo - treat sole cr as part of literalpart but crlf and lf as newline dict set stateMatrix\ multiliteral-space {\ - 
literalpart "multiliteral-space"\ - newline "multiliteral-space"\ - squote_seq_begin {PUSHSPACE "trailing-squote-space" returnstate multiliteral-space starttok {squote_seq "'"}}\ - triple_squote {POPSPACE note "on popping - we do any necessary concatenation of LITERALPART items due to squote processing"}\ - double_squote {TOSTATE multiliteral-space note "short squote_seq: can occur anywhere in the space e.g emitted at end when 5 squotes occur"}\ - startsquote {TOSTATE multiliteral-space note "short squote_seq: same as double_squote - false alarm"}\ - eof "err-premature-eof-in-multiliteral-space"\ + literalpart "multiliteral-space"\ + newline "multiliteral-space"\ + tentative_trigger_squote {PUSHSPACE "trailing-squote-space" returnstate multiliteral-space starttok {tentative_accum_squote "'"}}\ + single_squote {TOSTATE multiliteral-space note "short tentative_accum_squote: false alarm this squote is part of data"}\ + double_squote {TOSTATE multiliteral-space note "short tentative_accum_squote: can occur anywhere in the space e.g emitted at end when 5 squotes occur"}\ + triple_squote {POPSPACE note "on popping - we do any necessary concatenation of LITERALPART items due to squote processing"}\ + eof "err-premature-eof-in-multiliteral-space"\ } #trailing because we are looking for possible terminating ''' - but must accept '''' or ''''' and re-integrate the 1st one or 2 extra squotes dict set stateMatrix\ - trailing-squote-space {\ - squote_seq "POPSPACE"\ + trailing-squote-space { + tentative_accum_squote "POPSPACE" } @@ -3499,7 +3627,7 @@ namespace eval tomlish::parse { - + dict set stateMatrix\ end-state {} @@ -3557,14 +3685,13 @@ namespace eval tomlish::parse { dict set spacePushTransitions itable-keyval-space itable-keyval-syntax dict set spacePushTransitions array-space array-space dict set spacePushTransitions table-space tablename-state - dict set spacePushTransitions #itable-space itable-space + #dict set spacePushTransitions #itable-space itable-space #Pop 
to, next variable spacePopTransitions [dict create] dict set spacePopTransitions array-space array-syntax - #itable-space curly-syntax #itable-keyval-space itable-val-tail #review #we pop to keyval-space from dottedkey-space or from keyval-value-expected? we don't always want to go to keyval-tail @@ -3575,7 +3702,6 @@ namespace eval tomlish::parse { #JMN test #dict set spaceSameTransitions array-space array-syntax - #itable-space curly-syntax #itable-keyval-space itable-val-tail @@ -3611,6 +3737,8 @@ namespace eval tomlish::parse { ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" switch -exact -- [lindex $transition_to 0] { POPSPACE { + set popfromspace_info [spacestack peek] + set popfromspace_state [dict get $popfromspace_info state] spacestack pop set parent_info [spacestack peek] set type [dict get $parent_info type] @@ -3625,17 +3753,17 @@ namespace eval tomlish::parse { set existing [spacestack pop] dict unset existing returnstate spacestack push $existing ;#re-push modification - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected to stored returnstate $next <<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected to stored returnstate $next <<---" } else { ### #review - do away with spacePopTransitions - which although useful to provide a default.. 
# - involve error-prone configurations distant to the main state transition configuration in stateMatrix if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" } else { set next $parentspace - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace<<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace<<---" } } set result $next @@ -3805,22 +3933,6 @@ namespace eval tomlish::parse { return $tokenType } - proc _shortcircuit_startquotesequence {} { - variable tok - variable i - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - set_tokenType "startquote" - incr i -1 - return -level 2 1 - } elseif {$toklen == 2} { - puts stderr "_shortcircuit_startquotesequence toklen 2" - set_tokenType "startquote" - set tok "\"" - incr i -2 - return -level 2 1 - } - } proc get_token_waiting {} { variable token_waiting @@ -3940,7 +4052,6 @@ namespace eval tomlish::parse { set slash_active 0 set quote 0 set c "" - set multi_dquote "" for {} {$i < $sLen} {} { if {$i > 0} { set lastChar [tcl::string::index $s [expr {$i - 1}]] @@ -3957,8 +4068,6 @@ namespace eval tomlish::parse { switch -exact -- $ctest { # { - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 @@ -3966,16 +4075,20 @@ namespace eval tomlish::parse { if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { + #for multiliteral, multistring - data and/or end incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } 
_start_squote_sequence { + #pseudo token beginning with underscore - never returned to state machine - review incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] ;#rewind (negative step) as for _start_squote_sequence + set_tokenType "single_dquote" return 1 } barekey { @@ -4003,7 +4116,7 @@ namespace eval tomlish::parse { append tok $c } default { - #dquotedkey, itablequotedkey, string,literal, multistring + #dquotedkey, string,literal, multistring append tok $c } } @@ -4015,7 +4128,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes#" + append tok "#" } multiliteral-space { set_tokenType "literalpart" @@ -4031,23 +4144,23 @@ namespace eval tomlish::parse { } lc { #left curly brace - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] ;#rewind (negative step) as for _start_squote_sequence + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { @@ -4059,7 +4172,7 @@ namespace eval tomlish::parse { } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename - starttablearrayname { #*bare* tablename can only contain letters,digits underscores @@ -4105,7 +4218,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\{" + append tok "\{" } multiliteral-space { set_tokenType "literalpart" @@ -4120,37 +4233,35 @@ namespace eval tomlish::parse { } rc { #right curly brace - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set
slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey - comment { + string - dquotedkey - comment { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename - tablename { if {$had_slash} {append tok "\\"} @@ -4221,7 +4332,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\}" + append tok "\}" } multiliteral-space { set_tokenType "literalpart" ; #review @@ -4237,35 +4348,35 @@ namespace eval tomlish::parse { } lb { #left square bracket - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename { #change the tokenType @@ -4332,7 
+4443,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\[" + append tok "\[" } multiliteral-space { set_tokenType "literalpart" @@ -4350,37 +4461,35 @@ namespace eval tomlish::parse { } rb { #right square bracket - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } comment { if {$had_slash} {append tok "\\"} @@ -4428,16 +4537,6 @@ namespace eval tomlish::parse { } } } - XXXtablearraynames { - puts "rb @ tablearraynames ??" - #switch? - - #todo? - if {$had_slash} {append tok "\\"} - #invalid! - but leave for datastructure loading stage to catch - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex - return 1 - } default { incr i -1 return 1 @@ -4485,7 +4584,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\]" + append tok "\]" } multiliteral-space { set_tokenType "literalpart" @@ -4498,21 +4597,21 @@ namespace eval tomlish::parse { } } bsl { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
#backslash if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } whitespace { @@ -4529,9 +4628,7 @@ namespace eval tomlish::parse { append tok "\\" set slash_active 0 } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey - comment { + string - dquotedkey - comment { if {$slash_active} { set slash_active 0 append tok "\\\\" @@ -4545,7 +4642,6 @@ namespace eval tomlish::parse { set slash_active 0 append tok "\\\\" } else { - append tok $dquotes set slash_active 1 } } @@ -4575,10 +4671,6 @@ namespace eval tomlish::parse { set tok "\\\\" set slash_active 0 } else { - if {$dquotes ne ""} { - set_tokenType "stringpart" - set tok $dquotes - } set slash_active 1 } } @@ -4599,58 +4691,56 @@ namespace eval tomlish::parse { set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { - #short squote_seq tokens are returned if active during any other character + tentative_accum_squote { + #for within multiliteral + #short tentative_accum_squote tokens are returned if active upon receipt of any other character #longest allowable for leading/trailing are returned here #### set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote - switch -- $state { - leading-squote-space { - append tok $c - if {$existingtoklen > 2} { - error "tomlish tok error: squote_seq unexpected length $existingtoklen when another received" - } elseif {$existingtoklen == 2} { - return 1 ;#return tok ''' - } - } - trailing-squote-space { - append tok $c - if {$existingtoklen == 4} { - #maxlen to be an squote_seq is multisquote + 
2 = 5 - #return tok ''''' - return 1 - } - } - default { - error "tomlish tok error: squote_seq in unexpected state '$state' - expected leading-squote-space or trailing-squote-space" - } + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_squote is multisquote + 2 = 5 + #return tok with value ''''' + return 1 } } - whitespace { - #end whitespace - incr i -1 ;#reprocess sq + tentative_accum_dquote { + incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { - #temp token creatable only during value-expected or array-space + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space switch -- [tcl::string::length $tok] { 1 { + #no conclusion can yet be reached append tok $c } 2 { + #enter multiliteral #switch? append tok $c set_tokenType triple_squote return 1 } default { + #if there are more than 3 leading squotes we also enter multiliteral space and the subsequent ones are handled + #by the tentative_accum_squote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 squotes as data. 
error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" } } } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + #end whitespace + incr i -1 ;#reprocess sq + return 1 + } literal { #slash_active always false #terminate the literal @@ -4663,7 +4753,7 @@ namespace eval tomlish::parse { # idea: end this literalpart (possibly 'temporarily') # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) - incr i -1 ;#throw the "'" back to loop - will be added to an squote_seq token for later processing + incr i -1 ;#throw the "'" back to loop - will be added to a tentative_accum_squote token for later processing return 1 } XXXitablesquotedkey { @@ -4684,7 +4774,11 @@ namespace eval tomlish::parse { append tok $c } barekey { - #not clear why o'shennanigan shouldn't be a legal barekey - but it seems not to be. + #barekeys now support all sorts of unicode letter/number chars for other cultures + #but not punctuation - not even for those of Irish heritage who don't object + #to the anglicised form of some names. + # o'shenanigan seems to not be a legal barekey + #The Irish will have to use an earlier form Ó - which apparently many may prefer anyway. error "tomlish Unexpected single quote during barekey. 
[tomlish::parse::report_line]" } default { @@ -4693,63 +4787,69 @@ namespace eval tomlish::parse { } } else { switch -exact -- $state { - array-space { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading squote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_squote token or triple_squote token + #It currently doesn't trigger double_squote token + #(handle '' same as 'x' ie produce a single_squote and go into processing literal) + #review - producing double_squote for empty literal may be slightly more efficient. + #This token is not used to handle squote sequences *within* a multiliteral set_tokenType "_start_squote_sequence" set tok "'" } - itable-keyval-value-expected - keyval-value-expected { - set_tokenType "squote_seq_begin" + multiliteral-space { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_squote" ;#trigger tentative_accum_squote set tok "'" return 1 } - table-space { - #tests: squotedkey.test - set_tokenType "squotedkey" - set tok "" - } - itable-space { - #tests: squotedkey_itable.test + table-space - itable-space { + #tests: squotedkey.test squotedkey_itable.test set_tokenType "squotedkey" set tok "" } - XXXitable-space { - #future - could there be multiline keys? - #this would allow arbitrary tcl dicts to be stored in toml + XXXtable-space - XXXitable-space { + #future - could there be multiline keys? MLLKEY, MLBKEY ? 
+ #this would (almost) allow arbitrary tcl dicts to be stored in toml (aside from escaping issues) #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files - set_tokenType "squote_seq_begin" + #@2025 ABNF for toml mentions key, simple-key, unquoted-key, quoted-key and dotted-key + #where key is simple-key or dotted-key - no MLL or MLB components + #the spec states solution for arbitrary binary data is application specific involving encodings + #such as hex, base64 + set_tokenType "_start_squote_sequence" set tok "'" return 1 } tablename-state { #first char in tablename-state/tablearrayname-state - set_tokenType tablename + set_tokenType "tablename" append tok "'" } tablearrayname-state { - set_tokenType tablearrayname + set_tokenType "tablearrayname" append tok "'" } literal-state { + #shouldn't get here? review tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" - set_tokenType literal + set_tokenType "literal" incr -1 return 1 } multistring-space { - error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" - } - multiliteral-space { - #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row - #we are building up an squote_seq to determine if - #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines - #b) it is exactly ''' and we can terminate the whole multiliteral - #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space - set_tokenType "squote_seq_begin" - set tok "'" - return 1 + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "'" ;#the squote itself is the data that starts this stringpart + #error "tomlish unimplemented - squote during state '$state'. 
[tomlish::parse::report_line]" } dottedkey-space { - set_tokenType squotedkey + set_tokenType "squotedkey" } default { error "tomlish unhandled squote during state '$state'. [tomlish::parse::report_line]" @@ -4765,44 +4865,50 @@ namespace eval tomlish::parse { if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote { incr i -1 return 1 } - startquotesequence { - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - append tok $c - } elseif {$toklen == 2} { - append tok $c - #switch vs set? - set_tokenType "startmultiquote" - return 1 - } else { - error "tomlish unexpected token length $toklen in 'startquotesequence'" - } - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" return 1 - - #set toklen [tcl::string::length $tok] - #switch -- $toklen { - # 1 { - # set_tokenType "startsquote" - # incr i -1 - # return 1 - # } - # 2 { - # set_tokenType "startsquote" - # incr i -2 - # return 1 - # } - # default { - # error "tomlish unexpected _start_squote_sequence length $toklen" - # } - #} + } + tentative_accum_dquote { + #within multistring + #short tentative_accum_dquote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this dquote + #assert state = trailing-dquote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_dquote is multidquote + 2 = 5 + #return tok with value """"" + return 1 + } + } + _start_dquote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multistring + #switch? 
+ append tok $c + set_tokenType triple_dquote + return 1 + } + default { + #if there are more than 3 leading dquotes we also enter multistring space and the subsequent ones are handled + #by the tentative_accum_dquote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 dquotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_dquote_sequence'" + } + } } literal - literalpart { append tok $c @@ -4811,8 +4917,8 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" $c } else { - #unescaped quote always terminates a string? - set_token_waiting type endquote value "\"" complete 1 startindex $cindex + #unescaped quote always terminates a string + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex return 1 } } @@ -4821,77 +4927,31 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" $c } else { - #incr i -1 - - if {$multi_dquote eq "\"\""} { - set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex -2}] - set multi_dquote "" - return 1 - } else { - append multi_dquote "\"" - } + incr i -1 ;#throw the {"} back to loop - will be added to a tentative_accum_dquote token for later processing + return 1 } } whitespace { - switch -exact -- $state { - multistring-space { - #REVIEW - if {$had_slash} { - incr i -2 - return 1 - } else { - switch -- [tcl::string::length $multi_dquote] { - 2 { - set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex-2}] - set multi_dquote "" - return 1 - } - 1 { - incr i -2 - return 1 - } - 0 { - incr i -1 - return 1 - } - } - } - } - keyval-value-expected { - #end whitespace token and reprocess - incr i -1 - return 1 - - #if {$multi_dquote eq "\"\""} { - # set_token_waiting type startmultiquote value "\"\"\"" complete 1 - # set multi_dquote "" - # return 1 - #} else { - # #end whitespace token and reprocess - # incr i -1 - # return 1 - #} - } - table-space - 
itable-space { - incr i -1 - return 1 - } - default { - set_token_waiting type startquote value "\"" complete 1 startindex $cindex - return 1 - } + #assert: had_slash will only ever be true in multistring-space + if {$had_slash} { + incr i -2 + return 1 + } else { + #end whitespace token - throw dq back for reprocessing + incr i -1 + return 1 } } comment { if {$had_slash} {append tok "\\"} append tok $c } - XXXdquotedkey - XXXitablequotedkey { + XXXdquotedkey { if {$had_slash} { append tok "\\" append tok $c } else { - set_token_waiting type endquote value "\"" complete 1 startindex $cindex + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex return 1 } } @@ -4901,7 +4961,7 @@ namespace eval tomlish::parse { append tok "\\" append tok $c } else { - #set_token_waiting type endsquote value "'" complete 1 + #set_token_waiting type enddquote value {"} complete 1 return 1 } } @@ -4924,64 +4984,40 @@ namespace eval tomlish::parse { #$slash_active not relevant when no tokenType #token is string only if we're expecting a value at this point switch -exact -- $state { - array-space { - #!? start looking for possible multistartquote - #set_tokenType startquote - #set tok $c - #return 1 - set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote - set tok $c - } - keyval-value-expected - itable-keyval-value-expected { - set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote - set tok $c + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading dquote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_dquote token or triple_dquote token + #It currently doesn't trigger double_dquote token + #(handle "" same as "x" ie produce a single_dquote and go into processing string) + #review - producing double_dquote for empty string may be slightly more efficient. 
+ #This token is not used to handle dquote sequences once *within* a multistring + set_tokenType "_start_dquote_sequence" + set tok {"} } multistring-space { - #TODO - had_slash!!! - #REVIEW if {$had_slash} { set_tokenType "stringpart" set tok "\\\"" - set multi_dquote "" } else { - if {$multi_dquote eq "\"\""} { - tomlish::log::debug "- tokloop char dq ---> endmultiquote" - set_tokenType "endmultiquote" - set tok "\"\"\"" - return 1 - #set_token_waiting type endmultiquote value "\"\"\"" complete 1 - #set multi_dquote "" - #return 1 - } else { - append multi_dquote "\"" - } + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_dquote" ;#trigger tentative_accum_dquote + set tok {"} + return 1 } } multiliteral-space { set_tokenType "literalpart" set tok "\"" } - XXXtable-space { - set_tokenType "startquote" - set tok $c - return 1 - } - XXXitable-space { - set_tokenType "startquote" - set tok $c - } table-space - itable-space { set_tokenType "dquotedkey" set tok "" } - tablename-state { - set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } dottedkey-space { set_tokenType dquotedkey set tok "" @@ -4990,49 +5026,56 @@ namespace eval tomlish::parse { #set_tokenType dquote_seq_begin #set tok $c } + tablename-state { + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } default { - error "tomlish Unexpected quote during state '$state' [tomlish::parse::report_line]" + error "tomlish Unexpected dquote during 
state '$state' [tomlish::parse::report_line]" } } } } = { - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { - #assertion had_slash 0, multi_dquote "" + #assertion had_slash 0 append tok $c } - string - comment - dquotedkey - itablequotedkey { + string - comment - dquotedkey { #for these tokenTypes an = is just data. if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } whitespace { if {$state eq "multistring-space"} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen + incr i -1 return 1 } else { set_token_waiting type equal value = complete 1 startindex $cindex @@ -5063,7 +5106,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok ${dquotes}= + append tok = } multiliteral-space { set_tokenType "literalpart" @@ -5084,8 +5127,6 @@ namespace eval tomlish::parse { } cr { #REVIEW! - set dquotes $multi_dquote - set multi_dquote "" ;#!! # \r carriage return if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. 
set slash_active 0 @@ -5098,16 +5139,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal { @@ -5159,8 +5202,6 @@ namespace eval tomlish::parse { } lf { # \n newline - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5171,16 +5212,19 @@ namespace eval tomlish::parse { append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { + #multiliteral or multistring incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal { @@ -5196,20 +5240,14 @@ namespace eval tomlish::parse { return 1 } stringpart { - if {$dquotes ne ""} { - append tok $dquotes + if {$had_slash} { + #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) + set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] incr i -1 return 1 } else { - if {$had_slash} { - #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) - set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] - incr i -1 - return 1 - } else { - set_token_waiting type newline value lf complete 1 
startindex $cindex - return 1 - } + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 } } starttablename - tablename - tablearrayname - starttablearrayname { @@ -5236,20 +5274,13 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - #e.g one or 2 quotes just before nl - set_tokenType "stringpart" - set tok $dquotes - incr i -1 - return 1 - } set_tokenType "newline" set tok lf return 1 } } multiliteral-space { - #assert had_slash 0, multi_dquote "" + #assert had_slash 0 set_tokenType "newline" set tok "lf" return 1 @@ -5275,8 +5306,6 @@ namespace eval tomlish::parse { } } , { - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5287,39 +5316,40 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } comment - tablename - tablearrayname { if {$had_slash} {append tok "\\"} append tok , } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { #stringpart can have up to 2 quotes too if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } literal - literalpart - squotedkey { - #assert had_slash always 0, multi_dquote "" + #assert had_slash always 0 append tok $c } whitespace { if {$state eq "multistring-space"} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen + incr i -1 return 1 } else { set_token_waiting type comma value "," complete 1 startindex $cindex @@ -5338,10 +5368,10 @@ namespace eval tomlish::parse { 
set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} - append tok "$dquotes," + append tok "," } multiliteral-space { - #assert had_slash 0, multi_dquote "" + #assert had_slash 0 set_tokenType "literalpart" set tok "," } @@ -5354,8 +5384,6 @@ namespace eval tomlish::parse { } } . { - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5366,42 +5394,45 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } comment - untyped_value { if {$had_slash} {append tok "\\"} append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } literal - literalpart - squotedkey { - #assert had_slash always 0, multi_dquote "" + #assert had_slash always 0 append tok $c } whitespace { switch -exact -- $state { multistring-space { - set backchars [expr {[tcl::string::length $dquotes] + 1}] + #review if {$had_slash} { - incr backchars 1 + incr i -2 + } else { + incr i -1 } - incr i -$backchars return 1 } xxxdottedkey-space { @@ -5444,7 +5475,7 @@ namespace eval tomlish::parse { set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} - append tok "$dquotes." + append tok "." } multiliteral-space { set_tokenType "literalpart" @@ -5471,8 +5502,6 @@ namespace eval tomlish::parse { } " " { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
if {[tcl::string::length $tokenType]} { set had_slash $slash_active set slash_active 0 @@ -5483,16 +5512,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } barekey { @@ -5512,9 +5543,9 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok $dquotes$c + append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} { append tok "\\" } append tok $c } @@ -5526,8 +5557,7 @@ namespace eval tomlish::parse { incr i -2 return 1 } else { - #split into STRINGPART aaa WS " " - append tok $dquotes + #split into STRINGPART xxx WS " " incr i -1 return 1 } @@ -5537,15 +5567,7 @@ namespace eval tomlish::parse { } whitespace { if {$state eq "multistring-space"} { - if {$dquotes ne ""} { - #end whitespace token - #go back by the number of quotes plus this space char - set backchars [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backchars - return 1 - } else { - append tok $c - } + append tok $c } else { append tok $c } @@ -5588,12 +5610,6 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - set_tokenType "stringpart" - set tok $dquotes - incr i -1 - return 1 - } set_tokenType "whitespace" append tok $c } @@ -5613,9 +5629,6 @@ namespace eval tomlish::parse { } } tab { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
- if {[tcl::string::length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) set slash_active 0 @@ -5626,12 +5639,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } barekey { @@ -5662,7 +5681,6 @@ namespace eval tomlish::parse { return 1 } else { #split into STRINGPART aaa WS " " - append tok $dquotes incr i -1 return 1 } @@ -5706,15 +5724,8 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - set_tokenType stringpart - set tok $dquotes - incr i -1 - return 1 - } else { - set_tokenType whitespace - append tok $c - } + set_tokenType whitespace + append tok $c } } multiliteral-space { @@ -5732,16 +5743,31 @@ namespace eval tomlish::parse { #BOM (Byte Order Mark) - ignored by token consumer if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } _start_squote_sequence { #assert - tok will be one or two squotes only + #A toml literal probably isn't allowed to contain this + #but we will parse and let the validator sort it out. 
incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart { append tok $c } + string - stringpart { + append tok $c + } default { + #state machine will generally not have entry to accept bom - let it crash set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex return 1 } @@ -5752,6 +5778,10 @@ namespace eval tomlish::parse { set_tokenType "literalpart" set tok $c } + multistring-space { + set_tokenType "stringpart" + set tok $c + } default { set_tokenType "bom" set tok "\uFEFF" @@ -5761,8 +5791,6 @@ namespace eval tomlish::parse { } } default { - set dquotes $multi_dquote - set multi_dquote "" ;#!! if {[tcl::string::length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. @@ -5774,28 +5802,24 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } whitespace { if {$state eq "multistring-space"} { - if {$dquotes ne ""} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen - return 1 - } else { - incr i -1 - return 1 - } + incr i -1 + return 1 } else { #review incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. 
@@ -5815,7 +5839,7 @@ namespace eval tomlish::parse { return 1 } stringpart { - append tok $dquotes$c + append tok $c } default { #e.g comment/string/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname @@ -5835,22 +5859,12 @@ namespace eval tomlish::parse { error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" } } - XXXcurly-syntax { - puts stderr "curly-syntax - review" - if {[tomlish::utils::is_barekey $c]} { - set_tokenType "barekey" - append tok $c - } else { - error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" - } - } multistring-space { set_tokenType "stringpart" if {$had_slash} { - #assert - we don't get had_slash and dquotes at same time set tok \\$c } else { - set tok $dquotes$c + set tok $c } } multiliteral-space { @@ -5890,21 +5904,6 @@ namespace eval tomlish::parse { # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" #} switch -exact -- $tokenType { - startquotesequence { - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - #invalid - #eof with open string - error "tomlish eof reached without closing quote for string. [tomlish::parse::report_line]" - } elseif {$toklen == 2} { - #valid - #we ended in a double quote, not actually a startquoteseqence - effectively an empty string - switch_tokenType "startquote" - incr i -1 - #set_token_waiting type string value "" complete 1 - return 1 - } - } _start_squote_sequence { set toklen [tcl::string::length $tok] switch -- $toklen { @@ -5913,11 +5912,29 @@ namespace eval tomlish::parse { error "tomlish eof reached without closing single quote for string literal. 
[tomlish::parse::report_line]" } 2 { - #review - set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] set_tokenType "literal" set tok "" return 1 + + ##review + #set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] + #set_tokenType "literal" + #set tok "" + #return 1 + } + } + } + _start_dquote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open string + error "tomlish eof reached without closing double quote for string. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "string" + set tok "" + return 1 } } } @@ -6011,6 +6028,16 @@ namespace eval tomlish::dict { return $name } + proc _show_tablenames {tablenames_info} { + append msg \n "tablenames_info:" \n + dict for {tkey tinfo} $tablenames_info { + append msg " " "table: $tkey" \n + dict for {field finfo} $tinfo { + append msg " " "$field $finfo" \n + } + } + return $msg + } } tcl::namespace::eval tomlish::app { diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/dictn-0.1.1.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/dictn-0.1.1.tm new file mode 100644 index 00000000..c9ef87f2 --- /dev/null +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/dictn-0.1.1.tm @@ -0,0 +1,349 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. 
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+# (C) 2023
+#
+# @@ Meta Begin
+# Application dictn 0.1.1
+# Meta platform     tcl
+# Meta license
+# @@ Meta End
+
+
+
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+## Requirements
+##e.g package require frobz
+
+
+
+
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+#dictn - 'dict'-like ensemble in which every key argument is a *path*: a list of keys
+# leading into a nested dict.
+#NOTE: procs below run in namespace ::dictn, where names such as set, append, incr, info, lappend
+# and unset resolve to the dictn procs first. The Tcl builtins must therefore be called
+# fully qualified (::set, ::append, ::info ...) inside these bodies.
+namespace eval dictn {
+    namespace export {[a-z]*}
+    namespace ensemble create
+}
+
+
+## ::dictn::append
+#String-append 'value' to the element at 'path' within the dict held in variable 'dictvar'.
+# dictvar - name of the dict variable in the caller's scope
+# path    - list of keys identifying the element
+# value   - string to append (defaults to the empty string)
+#Returns the updated dict.
+#A multi-key path must already exist (it is read with dict get).
+#
+#This can of course 'ruin' a nested dict if applied to the wrong element
+# - i.e using the string op 'append' on an element that is itself a nested dict is analogous to the standard Tcl:
+#    %set list {a b {c d}}
+#    %append list x
+#    a b {c d}x
+#  IOW - don't do that unless you really know that's what you want.
+#
+proc ::dictn::append {dictvar path {value {}}} {
+    if {[llength $path] == 1} {
+        #expand the path so the key itself (not its list representation) is passed
+        uplevel 1 [list dict append $dictvar {*}$path $value]
+    } else {
+        upvar 1 $dictvar dvar
+
+        ::set str [dict get $dvar {*}$path]
+        #must be ::append - a bare 'append' here would re-enter ::dictn::append
+        ::append str $value
+        dict set dvar {*}$path $str
+    }
+}
+
+#Build a nested dict from alternating 'path value' arguments.
+#Returns the new dict value.
+proc ::dictn::create {args} {
+    ::set data {}
+    foreach {path val} $args {
+        dict set data {*}$path $val
+    }
+    return $data
+}
+
+#Return 1 if 'path' exists within dict value 'dictval', else 0.
+proc ::dictn::exists {dictval path} {
+    return [dict exists $dictval {*}$path]
+}
+
+#Apply 'dict filter' to the sub-dict found at 'path'.
+# filterType/args are passed through unchanged (key, value or script forms).
+#Evaluated in the caller's scope so that a 'script' filter can see the caller's variables.
+proc ::dictn::filter {dictval path filterType args} {
+    ::set sub [dict get $dictval {*}$path]
+    return [uplevel 1 [list dict filter $sub $filterType {*}$args]]
+}
+
+#Iterate over the sub-dict found at 'path' as per 'dict for'.
+# keyvalvars - 2 element list of variable names {keyVar valueVar}
+#The body is evaluated in the caller's scope (as it would be for the builtin dict for).
+proc ::dictn::for {keyvalvars dictval path body} {
+    ::set sub [dict get $dictval {*}$path]
+    return [uplevel 1 [list dict for $keyvalvars $sub $body]]
+}
+
+#Return the value at 'path'. An empty path returns the whole dict.
+proc ::dictn::get {dictval {path {}}} {
+    return [dict get $dictval {*}$path]
+}
+
+#Return the value at 'path', or 'default' if the path is absent.
+#Uses the builtin 'dict getdef' when the interpreter provides it, otherwise an exists/get fallback
+# (same availability test as used elsewhere in this module).
+if {[info commands ::tcl::dict::getdef] ne ""} {
+    proc ::dictn::getdef {dictval path default} {
+        return [dict getdef $dictval {*}$path $default]
+    }
+} else {
+    proc ::dictn::getdef {dictval path default} {
+        if {[dict exists $dictval {*}$path]} {
+            return [dict get $dictval {*}$path]
+        }
+        return $default
+    }
+}
+
+#Alias of getdef.
+proc ::dictn::getwithdefault {dictval path default} {
+    return [::dictn::getdef $dictval $path $default]
+}
+
+#Add 'increment' (default 1) to the integer at 'path' in the dict variable 'dictvar'.
+#A missing element is treated as 0; a missing variable is created.
+#Returns the updated dict.
+proc ::dictn::incr {dictvar path {increment {}} } {
+    if {$increment eq ""} {
+        ::set increment 1
+    }
+    if {[llength $path] == 1} {
+        return [uplevel 1 [list dict incr $dictvar {*}$path $increment]]
+    }
+    upvar 1 $dictvar dvar
+    if {![::info exists dvar]} {
+        dict set dvar {*}$path $increment
+    } else {
+        ::set newval [expr {[::dictn::getdef $dvar $path 0] + $increment}]
+        dict set dvar {*}$path $newval
+    }
+    return $dvar
+}
+
+#Return 'dict info' for the dict (or for the sub-dict at 'path' if a path is given).
+proc ::dictn::info {dictval {path {}}} {
+    if {![string length $path]} {
+        return [dict info $dictval]
+    } else {
+        ::set sub [dict get $dictval {*}$path]
+        return [dict info $sub]
+    }
+}
+
+#Return the keys of the sub-dict at 'path', optionally restricted by a glob pattern.
+proc ::dictn::keys {dictval {path {}} {glob {}}} {
+    ::set sub [dict get $dictval {*}$path]
+    if {[string length $glob]} {
+        return [dict keys $sub $glob]
+    } else {
+        return [dict keys $sub]
+    }
+}
+
+#List-append 'args' to the element at 'path' in the dict variable 'dictvar'.
+#A multi-key path must already exist (it is read with dict get).
+#Returns the updated dict.
+proc ::dictn::lappend {dictvar path args} {
+    if {[llength $path] == 1} {
+        uplevel 1 [list dict lappend $dictvar {*}$path {*}$args]
+    } else {
+        upvar 1 $dictvar dvar
+
+        ::set list [dict get $dvar {*}$path]
+        ::lappend list {*}$args
+        dict set dvar {*}$path $list
+    }
+}
+
+#Placeholder - always raises an error.
+proc ::dictn::merge {args} {
+    error "nested merge not yet supported"
+}
+
+#dictn remove dictionaryValue ?path ...?
+#Return a copy of 'dictval' with the element at each given path removed.
+#The parent of a multi-key path must exist.
+proc ::dictn::remove {dictval args} {
+    ::set basic [list] ;#buffer basic (1element path) removals to do in a single call.
+
+    foreach path $args {
+        if {[llength $path] == 1} {
+            ::lappend basic {*}$path
+        } else {
+            #extract,modify,replace
+            ::set subpath [lrange $path 0 end-1]
+
+            ::set sub [dict get $dictval {*}$subpath]
+            ::set sub [dict remove $sub [lindex $path end]]
+
+            dict set dictval {*}$subpath $sub
+        }
+    }
+
+    if {[llength $basic]} {
+        return [dict remove $dictval {*}$basic]
+    } else {
+        return $dictval
+    }
+}
+
+
+#dictn replace dictionaryValue ?path value ...?
+#Return a copy of 'dictval' with each path set to the corresponding value.
+#The parent of a multi-key path must exist.
+proc ::dictn::replace {dictval args} {
+    ::set basic [list] ;#buffer basic (1element path) replacements to do in a single call.
+
+    foreach {path val} $args {
+        if {[llength $path] == 1} {
+            ::lappend basic {*}$path $val
+        } else {
+            #extract,modify,replace
+            ::set subpath [lrange $path 0 end-1]
+
+            ::set sub [dict get $dictval {*}$subpath]
+            ::set sub [dict replace $sub [lindex $path end] $val]
+
+            dict set dictval {*}$subpath $sub
+        }
+    }
+
+
+    if {[llength $basic]} {
+        return [dict replace $dictval {*}$basic]
+    } else {
+        return $dictval
+    }
+}
+
+
+#Set the element at 'path' in the dict variable 'dictvar' to 'newval'. Returns the updated dict.
+proc ::dictn::set {dictvar path newval} {
+    upvar 1 $dictvar dvar
+    return [dict set dvar {*}$path $newval]
+}
+
+#Return the number of entries in the sub-dict at 'path'.
+proc ::dictn::size {dictval {path {}}} {
+    return [dict size [dict get $dictval {*}$path]]
+}
+
+#Remove the element at 'path' from the dict variable 'dictvar'. Returns the updated dict.
+proc ::dictn::unset {dictvar path} {
+    upvar 1 $dictvar dvar
+    return [dict unset dvar {*}$path]
+}
+
+#dictn update dictVariable path varName ?path varName ...? body
+#For each path that exists, set the caller's variable varName to the value at that path,
+# evaluate body in the caller's scope, then write each variable back to its path
+# (or unset the path if the variable no longer exists).
+#Only paths that exist at write-back time are updated.
+#An error raised by body is caught; its message is returned as the result.
+#NOTE(review): each varName is linked into this proc's frame under its own name, so a varName
+# matching one of this proc's locals (body, maplist, path, var, result, dvar) will misbehave.
+proc ::dictn::update {dictvar args} {
+    ::set body [lindex $args end]
+    ::set maplist [lrange $args 0 end-1]
+
+    upvar 1 $dictvar dvar
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path]} {
+            uplevel 1 [list set $var [dict get $dvar {*}$path]]
+        }
+    }
+
+    catch {uplevel 1 $body} result
+
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path]} {
+            upvar 1 $var $var
+            if {![::info exists $var]} {
+                uplevel 1 [list dict unset $dictvar {*}$path]
+            } else {
+                uplevel 1 [list dict set $dictvar {*}$path [::set $var]]
+            }
+        }
+    }
+    return $result
+}
+
+#an experiment.
+#dictn Applyupdate dictVariable path varName ?path varName ...? body
+#Experimental variant of 'dictn update' which runs body via 'apply' instead of uplevel.
+#Each existing path's value is passed to the lambda as an argument; a generated header script
+# links each varName (upvar 1) and assigns the matching argument before body runs.
+#After body runs, each path that still exists and whose variable still exists is written back.
+#An error raised by body is caught; its message is returned as the result.
+#The assembled script is written to stderr (debug output).
+#Not exported (name is not lowercase).
+proc ::dictn::Applyupdate {dictvar args} {
+    ::set body [lindex $args end]
+    ::set maplist [lrange $args 0 end-1]
+
+    upvar 1 $dictvar dvar
+
+    ::set headscript ""
+    ::set vallist [list] ;#must exist even when no path is present, else the apply call below cannot run
+    ::set i 0
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path]} {
+            #uplevel 1 [list set $var [dict get $dvar $path]]
+            ::lappend vallist [dict get $dvar {*}$path]
+            ::append headscript [string map [list %i% $i %v% $var] {upvar 1 %v% %v%; set %v% [lindex $args %i%]} ]
+            ::append headscript \n
+            ::incr i
+        }
+    }
+
+    ::set body $headscript\r\n$body
+
+    puts stderr "BODY: $body"
+
+    #set result [apply [list args $body] {*}$vallist]
+    catch {apply [list args $body] {*}$vallist} result
+
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path] && [::info exists $var]} {
+            dict set dvar {*}$path [::set $var]
+        }
+    }
+    return $result
+}
+
+#Return the values of the sub-dict at 'path', optionally restricted by a glob pattern.
+proc ::dictn::values {dictval {path {}} {glob {}}} {
+    ::set sub [dict get $dictval {*}$path]
+    if {[string length $glob]} {
+        return [dict values $sub $glob]
+    } else {
+        return [dict values $sub]
+    }
+}
+
+# Standard form:
+#'dictn with dictVariable path body'
+# - delegates to 'dict with' in the caller's scope.
+#
+# Extended form:
+#'dictn with dictVariable path arrayVariable body'
+# - loads the sub-dict at path into the caller's array arrayVariable, evaluates body in the
+#   caller's scope, then writes the array back into the dict variable:
+#   keys removed from the array are unset in the dict, all remaining array elements are set.
+# - an error raised by body is caught; its message is returned as the result.
+#
+proc ::dictn::with {dictvar path args} {
+    if {[llength $args] == 1} {
+        ::set body [lindex $args 0]
+        return [uplevel 1 [list dict with $dictvar {*}$path $body]]
+    } else {
+        upvar 1 $dictvar dvar
+        ::lassign $args arrayname body
+
+        upvar 1 $arrayname arr
+        array set arr [dict get $dvar {*}$path]
+        ::set prevkeys [array names arr]
+
+        catch {uplevel 1 $body} result
+
+
+        #dict set/unset take the variable *name* (dvar) - not its value
+        foreach k $prevkeys {
+            if {![::info exists arr($k)]} {
+                dict unset dvar {*}$path $k
+            }
+        }
+        foreach k [array names arr] {
+            dict set dvar {*}$path $k $arr($k)
+        }
+
+        return $result
+    }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+## Ready
+package provide dictn [namespace eval dictn {
+    variable version
+    ::set version 0.1.1
+}]
+return
\ No newline at end of file
diff --git
a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/include_modules.config b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/include_modules.config index 247371ee..afd1e8f2 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/include_modules.config +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/include_modules.config @@ -27,6 +27,7 @@ set bootsupport_modules [list\ src/vendormodules sha1\ src/vendormodules tomlish\ src/vendormodules test::tomlish\ + src/vendormodules dictn\ src/vendormodules textutil::adjust\ src/vendormodules textutil::repeat\ src/vendormodules textutil::split\ diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/test/tomlish-1.1.3.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/test/tomlish-1.1.3.tm index ed5044a7..8afb43d9 100644 Binary files a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/test/tomlish-1.1.3.tm and b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/test/tomlish-1.1.3.tm differ diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/tomlish-1.1.4.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/tomlish-1.1.4.tm index 7a6d5205..33d5b912 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/tomlish-1.1.4.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/tomlish-1.1.4.tm @@ -153,15 +153,10 @@ namespace eval tomlish { } #review - if {[uplevel 1 [list info exists tablenames_seen]]} { - upvar tablenames_seen tablenames_seen + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info } else { - set tablenames_seen [list] ;#list of lists - } - if {[uplevel 1 [list info exists tablenames_closed]]} { - upvar 
tablenames_closed tablenames_closed - } else { - set tablenames_closed [list] ;#list of lists + set tablenames_info [dict create] ;#keys are lists {parenttable subtable etc} corresponding to parenttable.subtable.etc } foreach sub [lrange $keyval_element 2 end] { @@ -207,13 +202,10 @@ namespace eval tomlish { ARRAY { #we need to recurse to get the corresponding dict for the contained item(s) #pass in the whole $found_sub - not just the $value! - set prev_tablenames_seen $tablenames_seen - set prev_tablenames_closed $tablenames_closed - set tablenames_seen [list] - set tablenames_closed [list] + set prev_tablenames_info $tablenames_info + set tablenames_info [dict create] set result [list type $type value [::tomlish::to_dict [list $found_sub]]] - set tablenames_seen $prev_tablenames_seen - set tablenames_closed $prev_tablenames_closed + set tablenames_info $prev_tablenames_info } MULTISTRING - MULTILITERAL { #review - mapping these to STRING might make some conversions harder? @@ -295,23 +287,66 @@ namespace eval tomlish { #[Data] #temps = [{cpu = 79.5, case = 72.0}] proc to_dict {tomlish} { + package require dictn #keep track of which tablenames have already been directly defined, # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. #we don't error out just because a previous tablename segment has already appeared. 
- ##variable tablenames_seen [list] - if {[uplevel 1 [list info exists tablenames_seen]]} { - upvar tablenames_seen tablenames_seen - } else { - set tablenames_seen [list] ;#list of lists - } - if {[uplevel 1 [list info exists tablenames_closed]]} { - upvar tablenames_closed tablenames_closed + + #Declaring, Creating, and Defining Tables + #https://github.com/toml-lang/toml/issues/795 + #(update - only Creating and Defining are relevant terminology) + + #review + #tablenames_info keys created, defined, createdby, definedby, closedby + + #consider the following 2 which are legal: + #[table] #'table' created, defined=open definedby={header table} + #x.y = 3 + #[table.x.z] #'table' defined=closed closedby={header table.x.z}, 'table.x' created, 'table.x.z' created defined=open definedby={header table.x.z} + #k= 22 + # #'table.x.z' defined=closed closedby={eof eof} + + #equivalent datastructure + + #[table] #'table' created, defined=open definedby={header table} + #[table.x] #'table' defined=closed closedby={header table.x}, 'table.x' created defined=open definedby={header table.x} + #y = 3 + #[table.x.z] #'table.x' defined=closed closedby={header table.x.z}, 'table.x.z' created defined=open definedby={header table.x.z} + #k=22 + + #illegal + #[table] #'table' created and defined=open + #x.y = 3 #'table.x' created first keyval pair defined=open definedby={keyval x.y = 3} + #[table.x.y.z] #'table' defined=closed, 'table.x' closed because parent 'table' closed?, 'table.x.y' cannot be created + #k = 22 + # + ## - we would fail on encountering table.x.y because only table and table.x are effectively tables - but that table.x is closed should be detected (?) + + #illegal + #[table] + #x.y = {p=3} + #[table.x.y.z] + #k = 22 + ## we should fail because y is an inline table which is closed to further entries + + #note: it is not safe to compare normalized tablenames using join! 
+ # e.g a.'b.c'.d is not the same as a.b.c.d + # instead compare {a b.c d} with {a b c d} + # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. + #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' + #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} + + + + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info } else { - set tablenames_closed [list] ;#list of lists + set tablenames_info [dict create] ;#keyed on tablepath each of which is a list such as {config subgroup etc} (corresponding to config.subgroup.etc) } + log::info "---> to_dict processing '$tomlish'<<<" set items $tomlish @@ -354,7 +389,7 @@ namespace eval tomlish { #a.b.c = 1 #table_key_hierarchy -> a b - #leafkey -> c + #tleaf -> c if {[llength $dotted_key_hierarchy] == 0} { #empty?? probably invalid. review #This is different to '' = 1 or ''.'' = 1 which have lengths 1 and 2 respectively @@ -362,10 +397,10 @@ namespace eval tomlish { } elseif {[llength $dotted_key_hierarchy] == 1} { #dottedkey is only a key - no table component set table_hierarchy [list] - set leafkey [lindex $dotted_key_hierarchy 0] + set tleaf [lindex $dotted_key_hierarchy 0] } else { set table_hierarchy [lrange $dotted_key_hierarchy 0 end-1] - set leafkey [lindex $dotted_key_hierarchy end] + set tleaf [lindex $dotted_key_hierarchy end] } #ensure empty tables are still represented in the datastructure @@ -380,143 +415,101 @@ namespace eval tomlish { } } #review? - if {[dict exists $datastructure {*}$table_hierarchy $leafkey]} { - error "Duplicate key '$table_hierarchy $leafkey'. The key already exists at this level in the toml data. The toml data is not valid." + if {[dict exists $datastructure {*}$table_hierarchy $tleaf]} { + error "Duplicate key '$table_hierarchy $tleaf'. The key already exists at this level in the toml data. The toml data is not valid." 
} #JMN test 2025 if {[llength $table_hierarchy]} { - lappend tablenames_seen $table_hierarchy + dictn incr tablenames_info [list $table_hierarchy seencount] } set keyval_dict [_get_keyval_value $item] if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - lappend tablenames_seen [list {*}$table_hierarchy $leafkey] - lappend tablenames_closed [list {*}$table_hierarchy $leafkey] + set t [list {*}$table_hierarchy $tleaf] + dictn incr tablenames_info [list $t seencount] + dictn set tablenames_info [list $t closed] 1 #review - item is an ITABLE - we recurse here without datastructure context :/ #overwriting keys? todo ? - dict set datastructure {*}$table_hierarchy $leafkey $keyval_dict + dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict } else { - dict set datastructure {*}$table_hierarchy $leafkey $keyval_dict + dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict } + } + TABLEARRAY { + set tablename [lindex $item 1] + log::debug "---> to_dict processing item TABLENAME (name: $tablename): $item" + set norm_segments [::tomlish::utils::tablename_split $tablename true] ;#true to normalize + #we expect repeated tablearray entries - each adding a sub-object to the value, which is an array/list. + } TABLE { set tablename [lindex $item 1] + log::debug "---> to_dict processing item TABLE (name: $tablename): $item" #set tablename [::tomlish::utils::tablename_trim $tablename] set norm_segments [::tomlish::utils::tablename_split $tablename true] ;#true to normalize - if {$norm_segments in $tablenames_seen} { - error "Table name '$tablename' has already been directly defined in the toml data. Invalid." - } - log::debug "---> to_dict processing item $tag (name: $tablename): $item" - set name_segments [::tomlish::utils::tablename_split $tablename] ;#unnormalized - set last_seg "" - #toml spec rule - all segments mst be non-empty - #note that the results of tablename_split are 'raw' - ie some segments may be enclosed in single or double quotes. 
- - set table_key_sublist [list] - - foreach normseg $norm_segments { - lappend table_key_sublist $normseg - if {[dict exists $datastructure {*}$table_key_sublist]} { - #It's ok for this key to already exist *if* it was defined by a previous tablename or equivalent - #and if this key is longer - - #consider the following 2 which are legal: - #[table] - #x.y = 3 - #[table.x.z] - #k= 22 - - #equivalent - - #[table] - #[table.x] - #y = 3 - #[table.x.z] - #k=22 - - #illegal - #[table] - #x.y = 3 - #[table.x.y.z] - #k = 22 - ## - we should fail on encountering table.x.y because only table and table.x are effectively tables - - #illegal - #[table] - #x.y = {p=3} - #[table.x.y.z] - #k = 22 - ## we should fail because y is an inline table which is closed to further entries - - - #note: it is not safe to compare normalized tablenames using join! - # e.g a.'b.c'.d is not the same as a.b.c.d - # instead compare {a b.c d} with {a b c d} - # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. - #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' - #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} - - set sublist_length [llength $table_key_sublist] - set found_testkey 0 - if {$table_key_sublist in $tablenames_seen} { - set found_testkey 1 - } else { - #see if it was defined by a longer entry - foreach seen_table_segments $tablenames_seen { - if {[llength $seen_table_segments] <= $sublist_length} { - continue - } - #each tablenames_seen entry is already a list of normalized segments - - #we could have [a.b.c.d] early on - # followed by [a.b] - which was still defined by the earlier one. + set T_DEFINED [dictn getdef $tablenames_info [list $norm_segments defined] NULL] + if {$T_DEFINED ne "NULL"} { + #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path + set msg "Table name $tablename has already been directly defined in the toml data. 
Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + error $msg + } - set seen_longer [lrange $seen_segments 0 [expr {$sublist_length -1}]] - puts stderr "testkey:'$table_key_sublist' vs seen_match:'$seen_longer'" - if {$table_key_sublist eq $seen_longer} { - set found_testkey 1 - } - } - } - if {$found_testkey == 0} { - #the raw unnormalized tablename might be ok to display in the error message, although it's not the actual dict keyset - set msg "key $table_key_sublist already exists in datastructure, but wasn't defined by a supertable." - append msg \n "tablenames_seen:" \n - foreach ts $tablenames_seen { - append msg " " $ts \n - } + set name_segments [::tomlish::utils::tablename_split $tablename 0] ;#unnormalized e.g ['a'."b".c.d] -> 'a' "b" c d + #results of tablename_split 0 are 'raw' - ie some segments may be enclosed in single or double quotes. + + + set supertable [list] + ############## + # [a.b.c.d] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + if {![dictn exists $tablenames_info [list $supertable type]]} { + #supertable with this path doesn't yet exist + if {[dict exists $datastructure {*}$supertable]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] 
of table name $tablename already has data - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] error $msg + } else { + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + dictn set tablenames_info [list $supertable type] header + #ensure empty tables are still represented in the datastructure + dict set datastructure {*}$supertable [list] } - } - - } - - #ensure empty tables are still represented in the datastructure - set key_sublist [list] - foreach k $norm_segments { - lappend key_sublist $k - if {![dict exists $datastructure {*}$key_sublist]} { - dict set datastructure {*}$key_sublist [list] } else { - tomlish::log::notice "to_dict datastructure at (TABLE) subkey $key_sublist already had data: [dict get $datastructure {*}$key_sublist]" + #supertable has already been created - and maybe defined - but even if defined we can add subtables } } + #table [a.b.c.d] hasn't been defined - but may have been 'created' already by a longer tablename + # - or may have existing data from a keyval + if {![dictn exists $tablenames_info [list $norm_segments type]]} { + if {[dict exists $datastructure {*}$norm_segments]} { + set msg "Table name $tablename already has data - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + error $msg + } + #no data or previously created table + dictn set tablenames_info [list $norm_segments type] header - #We must do this after the key-collision test above! 
- lappend tablenames_seen $norm_segments - - + #We are 'defining' this table's keys and values here (even if empty) + dict set datastructure {*}$norm_segments [list] ;#ensure table still represented in datastructure even if we add no keyvals here + } + dictn set tablenames_info [list $norm_segments defined] open log::debug ">>> to_dict >>>>>>>>>>>>>>>>> normalized table key hierarchy : $norm_segments" #now add the contained elements foreach element [lrange $item 2 end] { set type [lindex $element 0] - log::debug "----> tododict processing $tag subitem $type processing contained element $element" + log::debug "----> todict processing $tag subitem $type processing contained element $element" switch -exact -- $type { DOTTEDKEY { set dkey_info [_get_dottedkey_info $element] @@ -547,14 +540,19 @@ namespace eval tomlish { puts stdout "to_dict>>> $keyval_dict" dict set datastructure {*}$norm_segments {*}$dkeys $leaf_key $keyval_dict #JMN 2025 - lappend tablenames_seen [list {*}$norm_segments {*}$dkeys] + #lappend tablenames_info [list {*}$norm_segments {*}$dkeys] + set tkey [list {*}$norm_segments {*}$dkeys] + dictn incr tablenames_info [list $tkey seencount] if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys # inner structure will contain {type value } if all leaves are not empty ITABLES - lappend tablenames_seen [list {*}$norm_segments {*}$dkeys $leaf_key] + set tkey [list {*}$norm_segments {*}$dkeys $leaf_key] + #lappend tablenames_info [list {*}$norm_segments {*}$dkeys $leaf_key] + dictn incr tablenames_info [list $tkey seencount] #if the keyval_dict is not a simple type x value y - then it's an inline table ? #if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. 
+ dictn set tablenames_info [list $tkey closed] 1 } } @@ -562,7 +560,7 @@ namespace eval tomlish { #ignore } default { - error "Sub element of type '$type' not understood in table context. Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" + error "Sub element of type '$type' not understood in table context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" } } } @@ -1316,7 +1314,12 @@ namespace eval tomlish::encode { #NOTE - this DELIBERATELY does not validate the data, or process escapes etc #It encodes the tomlish records as they are. #ie it only produces toml shaped data from a tomlish list. + # #It is part of the roundtripability of data from toml to tomlish + #!! ie - it is not the place to do formatting of inline vs multiline !! + # That needs to be encoded in the tomlish data that is being passed in + # (e.g from_dict could make formatting decisions in the tomlish it produces) + # #e.g duplicate keys etc can exist in the toml output. #The to_dict from_dict (or any equivalent processor pair) is responsible for validation and conversion #back and forth of escape sequences where appropriate. @@ -1646,17 +1649,27 @@ namespace eval tomlish::decode { #pop_trigger_tokens: newline tablename endarray endinlinetable #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. 
set parentlevel [expr {$nest -1}] - set do_append_to_parent 1 ;#most tokens will leave this alone - but some like squote_seq need to do their own append + set do_append_to_parent 1 ;#most tokens will leave this alone - but some like tentative_accum_squote need to do their own append switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote { + #should only apply within a multiliteral #### set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed #Without this - we would get extraneous empty list entries in the parent # - as the xxx-squote-space isn't a space level from the toml perspective # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-squote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-squote-space { + } + default { + error "--- unexpected popped due to tentative_accum_squote but came from state '$prevstate' should have been trailing-squote-space" + } + } switch -- $tok { ' { - tomlish::parse::set_token_waiting type startsquote value $tok complete 1 startindex [expr {$i -1}] + tomlish::parse::set_token_waiting type single_squote value $tok complete 1 startindex [expr {$i -1}] } '' { #review - we should perhaps return double_squote instead? 
@@ -1669,74 +1682,51 @@ namespace eval tomlish::decode { tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] } '''' { - switch -exact -- $prevstate { - leading-squote-space { - error "---- 4 squotes from leading-squote-space - shouldn't get here" - #we should have emitted the triple and left the last for next loop + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left squote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]'" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata } - trailing-squote-space { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] - #todo integrate left squote with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]'" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "'"] - } - MULTILITERAL { - #empty - lappend v($parentlevel) [list LITERALPART "'"] - } - default { - error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" - } - } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "'"] + } + MULTILITERAL { + #empty + lappend v($parentlevel) [list LITERALPART "'"] } default { - error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" + error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" } } } ''''' { - switch -exact -- $prevstate { - leading-squote-space { - error "---- 5 squotes from leading-squote-space - 
shouldn't get here" - #we should have emitted the triple and left the following squotes for next loop + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 squotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]''" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata } - trailing-squote-space { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] - #todo integrate left 2 squotes with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]''" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "''"] - } - MULTILITERAL { - lappend v($parentlevel) [list LITERALPART "''"] - } - default { - error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" - } - } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "''"] + } + MULTILITERAL { + lappend v($parentlevel) [list LITERALPART "''"] } default { - error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" + error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" } } } } - puts stderr "tomlish::decode::toml ---- HERE squote_seq pop <$tok>" } triple_squote { #presumably popping multiliteral-space @@ -1763,7 +1753,119 @@ namespace eval tomlish::decode { lappend merged $part } default { - error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($next)" + error "---- triple_squote unhandled part type [lindex $part 0] unable 
to merge leveldata: $v($nest)" + } + } + set lasttype [lindex $part 0] + } + set v($nest) $merged + } + tentative_accum_dquote { + #should only apply within a multistring + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the trailing-dquote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-dquote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-dquote-space { + } + default { + error "--- unexpected popped due to tentative_accum_dquote but came from state '$prevstate' should have been trailing-dquote-space" + } + } + switch -- $tok { + {"} { + tomlish::parse::set_token_waiting type single_dquote value $tok complete 1 startindex [expr {$i -1}] + } + {""} { + #review - we should perhaps return double_dquote instead? 
+ #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_dquote value "" complete 1 startindex [expr {$i - 2}] + } + {"""} { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 3}] + } + {""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left dquote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {"}] + } + MULTISTRING { + #empty + lappend v($parentlevel) [list STRINGPART {"}] + } + default { + error "--- don't know how to integrate extra trailing dquote with data $v($parentlevel)" + } + } + } + {"""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 dquotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {""}] + } + MULTISTRING { + lappend v($parentlevel) [list STRINGPART {""}] + } + default { + error "--- don't know how to integrate extra trailing 2 dquotes with data $v($parentlevel)" + } + } + } + } + } + triple_dquote { + #presumably popping multistring-space + ::tomlish::log::debug "---- triple_dquote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + 
switch -exact -- [lindex $part 0] { + MULTISTRING { + lappend merged $part + } + STRINGPART { + if {$lasttype eq "STRINGPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + CONT - WS { + lappend merged $part + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_dquote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" } } set lasttype [lindex $part 0] @@ -1809,15 +1911,12 @@ namespace eval tomlish::decode { endinlinetable { ::tomlish::log::debug "---- endinlinetable for last_space_action pop" } - endmultiquote { - ::tomlish::log::debug "---- endmultiquote for last_space_action 'pop'" - } default { error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" } } if {$do_append_to_parent} { - #e.g squote_seq does it's own appends as necessary - so won't get here + #e.g tentative_accum_squote does it's own appends as necessary - so won't get here lappend v($parentlevel) [set v($nest)] } @@ -1831,8 +1930,8 @@ namespace eval tomlish::decode { switch -exact -- $tokenType { - squote_seq_begin { - #### + tentative_trigger_squote - tentative_trigger_dquote { + #### this startok will always be tentative_accum_squote/tentative_accum_dquote starting with one accumulated squote/dquote if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { lassign [dict get $transition_info starttok] starttok_type starttok_val set next_tokenType_known 1 @@ -1840,6 +1939,16 @@ namespace eval tomlish::decode { set tok $starttok_val } } + single_squote { + #JMN - REVIEW + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + triple_squote { + 
::tomlish::log::debug "---- push trigger tokenType triple_squote" + set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERALPART + } squotedkey { switch -exact -- $prevstate { table-space - itable-space { @@ -1849,6 +1958,9 @@ namespace eval tomlish::decode { #todo - check not something already waiting? tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } + triple_dquote { + set v($nest) [list MULTISTRING] ;#container for NEWLINE,STRINGPART,CONT + } dquotedkey { switch -exact -- $prevstate { table-space - itable-space { @@ -1858,7 +1970,7 @@ namespace eval tomlish::decode { #todo - check not something already waiting? tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } - XXXdquotedkey - XXXitablequotedkey { + XXXdquotedkey { #todo set v($nest) [list DQKEY $tok] ;#$tok is the keyname } @@ -1878,34 +1990,29 @@ namespace eval tomlish::decode { tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } } - startsquote { - #JMN - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } tablename { #note: we do not use the output of tomlish::tablename_trim to produce a tablename for storage in the tomlish list! #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish # back to toml file will be identical. #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, - # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names. 
+ # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names from + # a structural perspective. #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the # tomlish list? - set test_only [::tomlish::utils::tablename_trim $tok] - ::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$test_only'" + #set trimtable [::tomlish::utils::tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$trimtable'" set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name #note also that equivalent tablenames may have different toml representations even after being trimmed! #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. } tablearrayname { - set test_only [::tomlish::utils::tablename_trim $tok] - puts stdout "trimmed (but not normalized) tablearrayname: '$test_only'" + #set trimtable [::tomlish::utils::tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablearrayname: '$trimtable'" set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name } startarray { @@ -1914,14 +2021,6 @@ namespace eval tomlish::decode { startinlinetable { set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. 
} - startmultiquote { - ::tomlish::log::debug "---- push trigger tokenType startmultiquote" - set v($nest) [list MULTISTRING] ;#container for STRINGPART, WS, CONT, NEWLINE - } - triple_squote { - ::tomlish::log::debug "---- push trigger tokenType triple_squote" - set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERAL - } default { error "---- push trigger tokenType '$tokenType' not yet implemented" } @@ -1931,11 +2030,11 @@ namespace eval tomlish::decode { #no space level change switch -exact -- $tokenType { squotedkey { - puts "---- squotedkey in state $prevstate (no space level change)" + #puts "---- squotedkey in state $prevstate (no space level change)" lappend v($nest) [list SQKEY $tok] } dquotedkey { - puts "---- dquotedkey in state $prevstate (no space level change)" + #puts "---- dquotedkey in state $prevstate (no space level change)" lappend v($nest) [list DQKEY $tok] } barekey { @@ -1960,29 +2059,46 @@ namespace eval tomlish::decode { startinlinetable { puts stderr "---- decode::toml error. 
did not expect startinlinetable without space level change (no space level change)" } - startquote { + single_dquote { switch -exact -- $newstate { string-state { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "string" set tok "" } - quoted-key { + dquoted-key { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "dquotedkey" set tok "" } - XXXitable-quoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "itablequotedkey" - set tok "" + multistring-space { + lappend v($nest) [list STRINGPART {"}] + #may need to be joined on pop if there are neighbouring STRINGPARTS + } + default { + error "---- single_dquote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_dquote { + #leading extra quotes - test: toml_multistring_startquote2 + switch -exact -- $prevstate { + itable-keyval-value-expected - keyval-value-expected { + puts stderr "tomlish::decode::toml double_dquote TEST" + #empty string + lappend v($nest) [list STRINGPART ""] + } + multistring-space { + #multistring-space to multistring-space + lappend v($nest) [list STRINGPART {""}] } default { - error "---- startquote switch case not implemented for nextstate: $newstate (no space level change)" + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" } } + } - startsquote { + single_squote { switch -exact -- $newstate { literal-state { set next_tokenType_known 1 @@ -1995,41 +2111,17 @@ namespace eval tomlish::decode { set tok "" } multiliteral-space { - #false alarm squote returned from squote_seq pop + #false alarm squote returned from tentative_accum_squote pop ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" #(single squote - not terminating space) lappend v($nest) [list LITERALPART '] #may need to be joined on pop if there are neighbouring LITERALPARTs } default { 
- error "---- startsquote switch case not implemented for nextstate: $newstate (no space level change)" + error "---- single_squote switch case not implemented for nextstate: $newstate (no space level change)" } } } - startmultiquote { - #review - puts stderr "---- got startmultiquote in state $prevstate (no space level change)" - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "stringpart" - set tok "" - } - endquote { - #nothing to do? - set tok "" - } - endsquote { - set tok "" - } - endmultiquote { - #JMN!! - set tok "" - } - string { - lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes - } - literal { - lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes - } double_squote { switch -exact -- $prevstate { keyval-value-expected { @@ -2044,6 +2136,19 @@ namespace eval tomlish::decode { } } } + enddquote { + #nothing to do? + set tok "" + } + endsquote { + set tok "" + } + string { + lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes + } + literal { + lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes + } multistring { #review lappend v($nest) [list MULTISTRING $tok] @@ -2056,11 +2161,9 @@ namespace eval tomlish::decode { } literalpart { lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly - } - itablequotedkey { - } untyped_value { + #would be better termed unclassified_value #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. 
if {$tok in {true false}} { set tag BOOL @@ -2238,7 +2341,7 @@ namespace eval tomlish::utils { #eg {dog."tater.man"} set sLen [tcl::string::length $tablename] set segments [list] - set mode "unknown" ;#5 modes: unknown, quoted,litquoted, unquoted, syntax + set mode "preval" ;#5 modes: preval, quoted,litquoted, unquoted, postval #quoted is for double-quotes, litquoted is for single-quotes (string literal) set seg "" for {set i 0} {$i < $sLen} {incr i} { @@ -2249,139 +2352,166 @@ namespace eval tomlish::utils { set lastChar "" } + #todo - track\count backslashes properly + set c [tcl::string::index $tablename $i] + if {$c eq "\""} { + if {($lastChar eq "\\")} { + #not strictly correct - we could have had an even number prior-backslash sequence + #the toml spec would have us error out immediately on bsl in bad location - but we're + #trying to parse to unvalidated tomlish + set ctest escq + } else { + set ctest dq + } + } else { + set ctest [string map [list " " sp \t tab] $c] + } - if {$c eq "."} { - switch -exact -- $mode { - unquoted { - #dot marks end of segment. - lappend segments $seg - set seg "" - set mode "unknown" - } - quoted { - append seg $c - } - unknown { - lappend segments $seg - set seg "" - } - litquoted { - append seg $c - } - default { - #mode: syntax - #we got our dot. - the syntax mode is now satisfied. - set mode "unknown" + switch -- $ctest { + . { + switch -exact -- $mode { + preval { + error "tablename_split. dot not allowed - expecting a value" + } + unquoted { + #dot marks end of segment. + #if {![is_barekey $seg]} { + # error "tablename_split. 
dot not allowed - expecting a value" + #} + lappend segments $seg + set seg "" + set mode "preval" + } + quoted { + append seg $c + } + litquoted { + append seg $c + } + postval { + #got dot in an expected location + set mode "preval" + } } } - } elseif {($c eq "\"") && ($lastChar ne "\\")} { - if {$mode eq "unknown"} { - if {[tcl::string::trim $seg] ne ""} { - #we don't allow a quote in the middle of a bare key - error "tablename_split. character '\"' invalid at this point in tablename. tablename: '$tablename'" - } - set mode "quoted" - set seg "\"" - } elseif {$mode eq "unquoted"} { - append seg $c - } elseif {$mode eq "quoted"} { - append seg $c - - if {$normalize} { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - } else { - lappend segments $seg + dq { + #unescaped dquote + switch -- $mode { + preval { + set mode "quoted" + set seg "\"" + } + unquoted { + #invalid in barekey - but we are after structure only + append seg $c + } + quoted { + append seg $c + if {$normalize} { + lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" ;#make sure we only accept a dot or end-of-data now. + } + litquoted { + append seg $c + } + postval { + error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" + } } - - set seg "" - set mode "syntax" ;#make sure we only accept a dot or end-of-data now. - } elseif {$mode eq "litquoted"} { - append seg $c - } elseif {$mode eq "syntax"} { - error "tablename_split. expected whitespace or dot, got double quote. 
tablename: '$tablename'" - } - } elseif {($c eq "\'")} { - if {$mode eq "unknown"} { - append seg $c - set mode "litquoted" - } elseif {$mode eq "unquoted"} { - #single quote inside e.g o'neill - append seg $c - } elseif {$mode eq "quoted"} { - append seg $c - - } elseif {$mode eq "litquoted"} { - append seg $c - #no normalization to do - lappend segments $seg - set seg "" - set mode "syntax" - } elseif {$mode eq "syntax"} { - error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" } - - } elseif {$c in [list " " \t]} { - if {$mode eq "syntax"} { - #ignore - } else { - append seg $c + ' { + switch -- $mode { + preval { + append seg $c + set mode "litquoted" + } + unquoted { + #single quote inside e.g o'neill - ultimately invalid - but we pass through here. + append seg $c + } + quoted { + append seg $c + } + litquoted { + append seg $c + #no normalization to do aside from stripping squotes + if {$normalize} { + lappend segments [tcl::string::range $seg 1 end-1] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" + } + postval { + error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" + } + } } - } else { - if {$mode eq "syntax"} { - error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" + sp - tab { + switch -- $mode { + preval - postval { + #ignore + } + unquoted { + #terminates a barekey + lappend segments $seg + set seg "" + set mode "postval" + } + default { + #append to quoted or litquoted + append seg $c + } + } } - if {$mode eq "unknown"} { - set mode "unquoted" + default { + switch -- $mode { + preval { + set mode unquoted + append seg $c + } + postval { + error "tablename_split. Expected a dot separator. got '$c'. 
tablename: '$tablename'" + } + default { + append seg $c + } + } } - append seg $c } + if {$i == $sLen-1} { #end of data ::tomlish::log::debug "End of data: mode='$mode'" - #REVIEW - we can only end up in unquoted or syntax here? are other branches reachable? switch -exact -- $mode { - quoted { - if {$c ne "\""} { - error "tablename_split. missing closing double-quote in a segment. tablename: '$tablename'" - } - if {$normalize} { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - #lappend segments [subst -nocommands -novariables [::string range $seg 1 end-1]] ;#wrong - } else { - lappend segments $seg - } + preval { + error "tablename_split. Expected a value after last dot separator. tablename: '$tablename'" } - litquoted { - set trimmed_seg [tcl::string::trim $seg] - if {[tcl::string::index $trimmed_seg end] ne "\'"} { - error "tablename_split. missing closing single-quote in a segment. tablename: '$tablename'" - } + unquoted { lappend segments $seg } - unquoted - unknown { - lappend segments $seg + quoted { + error "tablename_split. Expected a trailing double quote. tablename: '$tablename'" } - syntax { - #ok - segment already lappended + litquoted { + error "tablename_split. Expected a trailing single quote. tablename: '$tablename'" } - default { - lappend segments $seg + postval { + #ok - segment already lappended } } } } - foreach seg $segments { - set trimmed [tcl::string::trim $seg " \t"] - #note - we explicitly allow 'empty' quoted strings '' & "" - # (these are 'discouraged' but valid toml keys) - #if {$trimmed in [list "''" "\"\""]} { - # puts stderr "tablename_split. warning - Empty quoted string as tablename segment" - #} - if {$trimmed eq "" } { - error "tablename_split. Empty segment found. 
tablename: '$tablename' segments [llength $segments] ($segments)" - } - } + + #note - we must allow 'empty' quoted strings '' & "" + # (these are 'discouraged' but valid toml keys) + return $segments } @@ -2432,26 +2562,34 @@ namespace eval tomlish::utils { #- escape_string and unescape_string would not be reliably roundtrippable inverses anyway. #REVIEW - provide it anyway? When would it be desirable to use? - variable Bstring_control_map [list\ - \b {\b}\ - \n {\n}\ - \r {\r}\ - \" {\"}\ - \x1b {\e}\ - \\ "\\\\"\ - ] + variable Bstring_control_map [dict create] + dict set Bstring_control_map \b {\b} + dict set Bstring_control_map \n {\n} + dict set Bstring_control_map \r {\r} + dict set Bstring_control_map \" {\"} + #dict set Bstring_control_map \x1b {\e} ;#should presumably be only be a convenience for decode - going the other way we get \u001B + dict set Bstring_control_map \\ "\\\\" + #\e for \x1b seems like it might be included - v1.1?? hard to find current state of where toml is going :/ #for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped. - for {set cdec 0} {$cdec <= 8} {incr cdec} { + #8 = \b - already in list. 
+ #built the remainder whilst checking for entries already hardcoded above -in case more are added to the hardcoded list + for {set cdec 0} {$cdec <= 7} {incr cdec} { set hhhh [format %.4X $cdec] - lappend Bstring_control_map [format %c $cdec] \\u$hhhh + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } } for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { set hhhh [format %.4X $cdec] - lappend Bstring_control_map [format %c $cdec] \\u$hhhh + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } } # \u007F = 127 - lappend Bstring_control_map [format %c 127] \\u007F + dict set Bstring_control_map [format %c 127] \\u007F #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) @@ -2474,6 +2612,7 @@ namespace eval tomlish::utils { # it recognizes other escapes which aren't approprite e.g \xhh and octal \nnn # it replaces \ with a single whitespace (trailing backslash) #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh + #plus \e for \x1b? 
set buffer "" set buffer4 "" ;#buffer for 4 hex characters following a \u @@ -2558,12 +2697,13 @@ namespace eval tomlish::utils { set ctest [tcl::string::map {{"} dq} $c] switch -exact -- $ctest { dq { - set e "\\\"" - append buffer [subst -nocommand -novariable $e] + append buffer {"} } b - t - n - f - r { - set e "\\$c" - append buffer [subst -nocommand -novariable $e] + append buffer [subst -nocommand -novariable "\\$c"] + } + e { + append buffer \x1b } u { set unicode4_active 1 @@ -2578,8 +2718,7 @@ namespace eval tomlish::utils { #review - toml spec says all other escapes are reserved #and if they are used TOML should produce an error. #we leave detecting this for caller for now - REVIEW - append buffer "\\" - append buffer $c + append buffer "\\$c" } } } else { @@ -3003,7 +3142,7 @@ namespace eval tomlish::parse { # states: # table-space, itable-space, array-space # array-value-expected,keyval-value-expected,itable-keyval-value-expected, keyval-syntax, - # quoted-key, squoted-key + # dquoted-key, squoted-key # string-state, literal-state, multistring... # # notes: @@ -3039,6 +3178,12 @@ namespace eval tomlish::parse { variable stateMatrix set stateMatrix [dict create] + #--------------------------------------------------------- + #WARNING + #The stateMatrix implementation here is currently messy. + #The code is a mixture of declarative via the stateMatrix and imperative via switch statements during PUSH/POP/SAMESPACE transitions. + #This means the state behaviour has to be reasoned about by looking at both in conjunction. 
+ #--------------------------------------------------------- #xxx-space vs xxx-syntax inadequately documented - TODO @@ -3060,35 +3205,19 @@ namespace eval tomlish::parse { barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ dquotedkey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ - XXXstartquote "quoted-key"\ - XXXstartsquote "squoted-key"\ + XXXsingle_dquote "quoted-key"\ + XXXsingle_squote "squoted-key"\ comment "table-space"\ starttablename "tablename-state"\ starttablearrayname "tablearrayname-state"\ - startmultiquote "err-state"\ - endquote "err-state"\ + enddquote "err-state"\ + endsquote "err-state"\ comma "err-state"\ eof "end-state"\ equal "err-state"\ cr "err-lonecr"\ } - #itable-space/ curly-syntax : itables - dict set stateMatrix\ - itable-space {\ - whitespace "itable-space"\ - newline "itable-space"\ - barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - endinlinetable "POPSPACE"\ - XXXstartquote "quoted-key"\ - XXXstartsquote {TOSTATE "squoted-key" comment "jn-testing"}\ - comma "err-state"\ - comment "itable-space"\ - eof "err-state"\ - } - #squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-space starttok {squote_seq "'"}} dict set stateMatrix\ @@ -3113,26 +3242,19 @@ namespace eval tomlish::parse { dict set stateMatrix\ keyval-value-expected {\ whitespace "keyval-value-expected"\ - untyped_value {TOSTATE "keyval-tail" note ""}\ - startquote {TOSTATE "string-state" returnstate keyval-tail}\ - startmultiquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-value-expected starttok {squote_seq "'"}}\ - startsquote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is 
triggered"}\ - double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"}\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ - startarray {PUSHSPACE array-space returnstate keyval-tail}\ - } - #squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-process-leading-squotes starttok {squote_seq "'"}} - dict set stateMatrix\ - leading-squote-space {\ - squote_seq "POPSPACE"\ + untyped_value {TOSTATE "keyval-tail" note ""}\ + literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ + string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ + single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ + single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ + startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ + startarray {PUSHSPACE array-space returnstate keyval-tail}\ } - #dict set stateMatrix\ - # keyval-process-leading-squotes {\ - # startsquote "literal-state"\ - # triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - # } + #double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"} + + #2025 - no leading-squote-space - only trailing-squote-space. 
dict set stateMatrix\ keyval-tail {\ @@ -3142,81 +3264,106 @@ namespace eval tomlish::parse { eof "end-state"\ } + + #itable-space/ curly-syntax : itables + # x={y=1,} + dict set stateMatrix\ + itable-space {\ + whitespace "itable-space"\ + newline "itable-space"\ + barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + endinlinetable "POPSPACE"\ + comma "err-state"\ + comment "itable-space"\ + eof "err-state"\ + } + #we don't get single_squote etc here - instead we get the resulting squotedkey token + + + # ??? review - something like this + # + # x={y =1,} dict set stateMatrix\ itable-keyval-syntax {\ - whitespace "itable-keyval-syntax"\ - barekey {PUSHSPACE "dottedkey-space"}\ - squotedkey {PUSHSPACE "dottedkey-space"}\ - dquotedkey {PUSHSPACE "dottedkey-space"}\ - equal "itable-keyval-value-expected"\ + whitespace {TOSTATE "itable-keyval-syntax"}\ + barekey {PUSHSPACE "dottedkey-space"}\ + squotedkey {PUSHSPACE "dottedkey-space"}\ + dquotedkey {PUSHSPACE "dottedkey-space"}\ + equal {TOSTATE "itable-keyval-value-expected"}\ newline "err-state"\ eof "err-state"\ } + + # x={y=1} + dict set stateMatrix\ + itable-keyval-space {\ + whitespace "itable-keyval-syntax"\ + equal {TOSTATE "itable-keyval-value-expected" note "required"}\ + } + dict set stateMatrix\ itable-keyval-value-expected {\ whitespace "itable-keyval-value-expected"\ untyped_value {TOSTATE "itable-val-tail" note ""}\ - startquote {TOSTATE "string-state" returnstate itable-val-tail}\ - startmultiquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-keyval-value-expected starttok {squote_seq "'"}}\ - startsquote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ - double_squote {TOSTATE "itable-val-tail" 
note "empty literal received when double squote occurs"}\ + single_dquote {TOSTATE "string-state" returnstate itable-val-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ + single_squote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ } - dict set stateMatrix\ - itable-keyval-space {\ - whitespace "itable-keyval-syntax"\ - equal {TOSTATE "itable-keyval-value-expected" note "required"}\ - } + #double_squote not currently generated by _start_squote_sequence - '' processed as single_squote to literal-state just like 'xxx' + # review + # double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"} + + + # x={y=1,z="x"} + #POPSPACE is transition from itable-keyval-space to parent itable-space dict set stateMatrix\ itable-val-tail {\ whitespace "itable-val-tail"\ endinlinetable "POPSPACE"\ comma "POPSPACE"\ - XXXnewline {TOSTATE "itable-val-tail" note "itable-space ??"}\ - newline "POPSPACE"\ + newline {TOSTATE "itable-val-tail" note "itable-space ??"}\ comment "itable-val-tail"\ eof "err-state"\ } - #dict set stateMatrix\ - # itable-quoted-key {\ - # whitespace "NA"\ - # itablequotedkey {PUSHSPACE "itable-keyval-space"}\ - # newline "err-state"\ - # endquote "itable-keyval-syntax"\ - # } - #dict set stateMatrix\ - # itable-squoted-key {\ - # whitespace "NA"\ - # itablesquotedkey {PUSHSPACE "itable-keyval-space"}\ - # newline "err-state"\ - # endsquote "itable-keyval-syntax"\ - # } + # XXXnewline "POPSPACE" + # We shouldn't popspace on newline - as if there was no comma we need to stay in itable-val-tail + # This means the newline and subsequent whitespace, comments etc become part of the preceding dottedkey record + #e.g + # x = { + # j=1 + # 
#comment within dottedkey j record + # , # comment unattached + # #comment unattached + # k=2 , #comment unattached + # l=3 #comment within l record + # , m=4 + # #comment associated with m record + # + # #still associated with m record + # } + ## - This doesn't quite correspond to what a user might expect - but seems like a consistent mechanism. + #The awkwardness is because there is no way to put in a comment that doesn't consume a trailing comma + #so we cant do: j= 1 #comment for j1 , + # and have the trailing comma recognised. + # + # To associate: j= 1, #comment for j1 + # we would need some extra processing . (not popping until next key ? extra state itable-sep-tail?) REVIEW - worth doing? + # + # The same issue occurs with multiline arrays. The most natural assumption is that a comment on same line after a comma + # is 'associated' with the previous entry. + # + # These comment issues are independent of the data dictionary being generated for conversion to json etc - as the comments don't carry through anyway, + # but are a potential oddity for manipulating the intermediate tomlish structure whilst attempting to preserve 'associated' comments + # (e.g reordering records within an itable) + #The user's intention for 'associated' isn't always clear and the specs don't really guide on this. - - - #array-value-expected ? - dict set stateMatrix\ - XXXvalue-expected {\ - whitespace "value-expected"\ - untyped_value {"SAMESPACE" "" replay untyped_value}\ - startquote "string-state"\ - startsquote "literal-state"\ - triple_squote {PUSHSPACE "multiliteral-space"}\ - startmultiquote {PUSHSPACE "multistring-space"}\ - startinlinetable {PUSHSPACE itable-space}\ - startarray {PUSHSPACE array-space}\ - comment "err-state-value-expected-got-comment"\ - comma "err-state"\ - newline "err-state"\ - eof "err-state"\ - } - #note comment token should never be delivered to array-value-expected state? 
- #dottedkey-space is not (currently) used within [tablename] or [[tablearrayname]] #it is for keyval ie x.y.z = value @@ -3245,6 +3392,8 @@ namespace eval tomlish::parse { whitespace "dottedkey-space-tail" dotsep "dottedkey-space" equal "POPSPACE"\ + eof "err-state"\ + newline "err-state"\ } #-------------------------------------------------------------------------- @@ -3262,22 +3411,10 @@ namespace eval tomlish::parse { #toml spec looks like heading towards allowing newlines within inline tables #https://github.com/toml-lang/toml/issues/781 - #2025 - appears to be valid for 1.1 - which we are targeting. + #2025 - multiline itables appear to be valid for 1.1 - which we are targeting. #https://github.com/toml-lang/toml/blob/main/toml.md#inline-table #JMN2025 - #dict set stateMatrix\ - # curly-syntax {\ - # whitespace "curly-syntax"\ - # newline "curly-syntax"\ - # barekey {PUSHSPACE "itable-keyval-space"}\ - # itablequotedkey "itable-keyval-space"\ - # endinlinetable "POPSPACE"\ - # startquote "itable-quoted-key"\ - # comma "itable-space"\ - # comment "itable-space"\ - # eof "err-state"\ - # } #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES #We currently allow multiline ITABLES (also with comments) in the tokenizer. #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? 
@@ -3291,10 +3428,9 @@ namespace eval tomlish::parse { # untyped_value "SAMESPACE"\ # startarray {PUSHSPACE "array-space"}\ # endarray "POPSPACE"\ - # startmultiquote {PUSHSPACE multistring-space}\ # startinlinetable {PUSHSPACE itable-space}\ - # startquote "string-state"\ - # startsquote "literal-state"\ + # single_dquote "string-state"\ + # single_squote "literal-state"\ # triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ # comma "array-space"\ # comment "array-space"\ @@ -3305,15 +3441,16 @@ namespace eval tomlish::parse { set aspace [dict create] dict set aspace whitespace "array-space" dict set aspace newline "array-space" - dict set aspace untyped_value "SAMESPACE" + #dict set aspace untyped_value "SAMESPACE" + dict set aspace untyped_value "array-syntax" dict set aspace startarray {PUSHSPACE "array-space"} dict set aspace endarray "POPSPACE" - dict set aspace startmultiquote {PUSHSPACE multistring-space} + dict set aspace single_dquote {TOSTATE "string-state" returnstate array-syntax} + dict set aspace triple_dquote {PUSHSPACE "multistring-space" returnstate array-syntax} + dict set aspace single_squote {TOSTATE "literal-state" returnstate array-syntax} + dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax} dict set aspace startinlinetable {PUSHSPACE itable-space} - dict set aspace startquote "string-state" - dict set aspace startsquote "literal-state" - dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"} - dict set aspace comma "array-space" + #dict set aspace comma "array-space" dict set aspace comment "array-space" dict set aspace eof "err-state-array-space-got-eof" dict set stateMatrix array-space $aspace @@ -3329,26 +3466,16 @@ namespace eval tomlish::parse { #dict set asyntax untyped_value "SAMESPACE" #dict set asyntax startarray {PUSHSPACE array-space} dict set asyntax endarray "POPSPACE" - #dict set asyntax 
startmultiquote {PUSHSPACE multistring-space} - #dict set asyntax startquote "string-state" - #dict set asyntax startsquote "literal-state" + #dict set asyntax single_dquote "string-state" + #dict set asyntax single_squote "literal-state" dict set asyntax comma "array-space" dict set asyntax comment "array-syntax" dict set stateMatrix array-syntax $asyntax - #quoted-key & squoted-key need to PUSHSPACE from own token to keyval-space - dict set stateMatrix\ - quoted-key {\ - whitespace "NA"\ - dquotedkey {PUSHSPACE "keyval-space"}\ - newline "err-state"\ - endquote "keyval-syntax"\ - } - - #review + #dquotedkey is a token - dquoted-key is a state dict set stateMatrix\ dquoted-key {\ whitespace "NA"\ @@ -3367,7 +3494,7 @@ namespace eval tomlish::parse { string-state {\ whitespace "NA"\ string "string-state"\ - endquote "SAMESPACE"\ + enddquote "SAMESPACE"\ newline "err-state"\ eof "err-state"\ } @@ -3381,20 +3508,21 @@ namespace eval tomlish::parse { } - #dict set stateMatrix\ - # stringpart {\ - # continuation "SAMESPACE"\ - # endmultiquote "POPSPACE"\ - # eof "err-state"\ - # } dict set stateMatrix\ multistring-space {\ - whitespace "multistring-space"\ - continuation "multistring-space"\ - stringpart "multistring-space"\ - newline "multistring-space"\ - endmultiquote "POPSPACE"\ - eof "err-state"\ + whitespace "multistring-space"\ + continuation "multistring-space"\ + stringpart "multistring-space"\ + newline "multistring-space"\ + tentative_trigger_dquote {PUSHSPACE "trailing-dquote-space" returnstate multistring-space starttok {tentative_accum_dquote {"}}}\ + single_dquote {TOSTATE multistring-space}\ + double_dquote {TOSTATE multistring-space}\ + triple_dquote {POPSPACE}\ + eof "err-state"\ + } + dict set stateMatrix\ + trailing-dquote-space { + tentative_accum_dquote "POPSPACE" } @@ -3402,19 +3530,19 @@ namespace eval tomlish::parse { #todo - treat sole cr as part of literalpart but crlf and lf as newline dict set stateMatrix\ multiliteral-space {\ - 
literalpart "multiliteral-space"\ - newline "multiliteral-space"\ - squote_seq_begin {PUSHSPACE "trailing-squote-space" returnstate multiliteral-space starttok {squote_seq "'"}}\ - triple_squote {POPSPACE note "on popping - we do any necessary concatenation of LITERALPART items due to squote processing"}\ - double_squote {TOSTATE multiliteral-space note "short squote_seq: can occur anywhere in the space e.g emitted at end when 5 squotes occur"}\ - startsquote {TOSTATE multiliteral-space note "short squote_seq: same as double_squote - false alarm"}\ - eof "err-premature-eof-in-multiliteral-space"\ + literalpart "multiliteral-space"\ + newline "multiliteral-space"\ + tentative_trigger_squote {PUSHSPACE "trailing-squote-space" returnstate multiliteral-space starttok {tentative_accum_squote "'"}}\ + single_squote {TOSTATE multiliteral-space note "short tentative_accum_squote: false alarm this squote is part of data"}\ + double_squote {TOSTATE multiliteral-space note "short tentative_accum_squote: can occur anywhere in the space e.g emitted at end when 5 squotes occur"}\ + triple_squote {POPSPACE note "on popping - we do any necessary concatenation of LITERALPART items due to squote processing"}\ + eof "err-premature-eof-in-multiliteral-space"\ } #trailing because we are looking for possible terminating ''' - but must accept '''' or ''''' and re-integrate the 1st one or 2 extra squotes dict set stateMatrix\ - trailing-squote-space {\ - squote_seq "POPSPACE"\ + trailing-squote-space { + tentative_accum_squote "POPSPACE" } @@ -3499,7 +3627,7 @@ namespace eval tomlish::parse { - + dict set stateMatrix\ end-state {} @@ -3557,14 +3685,13 @@ namespace eval tomlish::parse { dict set spacePushTransitions itable-keyval-space itable-keyval-syntax dict set spacePushTransitions array-space array-space dict set spacePushTransitions table-space tablename-state - dict set spacePushTransitions #itable-space itable-space + #dict set spacePushTransitions #itable-space itable-space #Pop 
to, next variable spacePopTransitions [dict create] dict set spacePopTransitions array-space array-syntax - #itable-space curly-syntax #itable-keyval-space itable-val-tail #review #we pop to keyval-space from dottedkey-space or from keyval-value-expected? we don't always want to go to keyval-tail @@ -3575,7 +3702,6 @@ namespace eval tomlish::parse { #JMN test #dict set spaceSameTransitions array-space array-syntax - #itable-space curly-syntax #itable-keyval-space itable-val-tail @@ -3611,6 +3737,8 @@ namespace eval tomlish::parse { ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" switch -exact -- [lindex $transition_to 0] { POPSPACE { + set popfromspace_info [spacestack peek] + set popfromspace_state [dict get $popfromspace_info state] spacestack pop set parent_info [spacestack peek] set type [dict get $parent_info type] @@ -3625,17 +3753,17 @@ namespace eval tomlish::parse { set existing [spacestack pop] dict unset existing returnstate spacestack push $existing ;#re-push modification - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected to stored returnstate $next <<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected to stored returnstate $next <<---" } else { ### #review - do away with spacePopTransitions - which although useful to provide a default.. 
# - involve error-prone configurations distant to the main state transition configuration in stateMatrix if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" } else { set next $parentspace - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace<<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace<<---" } } set result $next @@ -3805,22 +3933,6 @@ namespace eval tomlish::parse { return $tokenType } - proc _shortcircuit_startquotesequence {} { - variable tok - variable i - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - set_tokenType "startquote" - incr i -1 - return -level 2 1 - } elseif {$toklen == 2} { - puts stderr "_shortcircuit_startquotesequence toklen 2" - set_tokenType "startquote" - set tok "\"" - incr i -2 - return -level 2 1 - } - } proc get_token_waiting {} { variable token_waiting @@ -3940,7 +4052,6 @@ namespace eval tomlish::parse { set slash_active 0 set quote 0 set c "" - set multi_dquote "" for {} {$i < $sLen} {} { if {$i > 0} { set lastChar [tcl::string::index $s [expr {$i - 1}]] @@ -3957,8 +4068,6 @@ namespace eval tomlish::parse { switch -exact -- $ctest { # { - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 @@ -3966,16 +4075,20 @@ namespace eval tomlish::parse { if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { + #for multiliteral, multistring - data and/or end incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } 
_start_squote_sequence { + #pseudo token beginning with underscore - never returned to state machine - review incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } barekey { @@ -4003,7 +4116,7 @@ namespace eval tomlish::parse { append tok $c } default { - #dquotedkey, itablequotedkey, string,literal, multistring + #dquotedkey, string,literal, multistring append tok $c } } @@ -4015,7 +4128,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes#" + append tok "#" } multiliteral-space { set_tokenType "literalpart" @@ -4031,23 +4144,23 @@ namespace eval tomlish::parse { } lc { #left curly brace - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { @@ -4059,7 +4172,7 @@ namespace eval tomlish::parse { } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename - starttablearrayname { #*bare* tablename can only contain letters,digits underscores @@ -4105,7 +4218,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\{" + append tok "\{" } multiliteral-space { set_tokenType "literalpart" @@ -4120,37 +4233,35 @@ namespace eval tomlish::parse { } rc { #right curly brace - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set 
slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey - comment { + string - dquotedkey - comment { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename - tablename { if {$had_slash} {append tok "\\"} @@ -4221,7 +4332,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\}" + append tok "\}" } multiliteral-space { set_tokenType "literalpart" ; #review @@ -4237,35 +4348,35 @@ namespace eval tomlish::parse { } lb { #left square bracket - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename { #change the tokenType @@ -4332,7 
+4443,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\[" + append tok "\[" } multiliteral-space { set_tokenType "literalpart" @@ -4350,37 +4461,35 @@ namespace eval tomlish::parse { } rb { #right square bracket - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } comment { if {$had_slash} {append tok "\\"} @@ -4428,16 +4537,6 @@ namespace eval tomlish::parse { } } } - XXXtablearraynames { - puts "rb @ tablearraynames ??" - #switch? - - #todo? - if {$had_slash} {append tok "\\"} - #invalid! - but leave for datastructure loading stage to catch - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex - return 1 - } default { incr i -1 return 1 @@ -4485,7 +4584,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\]" + append tok "\]" } multiliteral-space { set_tokenType "literalpart" @@ -4498,21 +4597,21 @@ namespace eval tomlish::parse { } } bsl { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
#backslash if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } whitespace { @@ -4529,9 +4628,7 @@ namespace eval tomlish::parse { append tok "\\" set slash_active 0 } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey - comment { + string - dquotedkey - comment { if {$slash_active} { set slash_active 0 append tok "\\\\" @@ -4545,7 +4642,6 @@ namespace eval tomlish::parse { set slash_active 0 append tok "\\\\" } else { - append tok $dquotes set slash_active 1 } } @@ -4575,10 +4671,6 @@ namespace eval tomlish::parse { set tok "\\\\" set slash_active 0 } else { - if {$dquotes ne ""} { - set_tokenType "stringpart" - set tok $dquotes - } set slash_active 1 } } @@ -4599,58 +4691,56 @@ namespace eval tomlish::parse { set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { - #short squote_seq tokens are returned if active during any other character + tentative_accum_squote { + #for within multiliteral + #short tentative_accum_squote tokens are returned if active upon receipt of any other character #longest allowable for leading/trailing are returned here #### set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote - switch -- $state { - leading-squote-space { - append tok $c - if {$existingtoklen > 2} { - error "tomlish tok error: squote_seq unexpected length $existingtoklen when another received" - } elseif {$existingtoklen == 2} { - return 1 ;#return tok ''' - } - } - trailing-squote-space { - append tok $c - if {$existingtoklen == 4} { - #maxlen to be an squote_seq is multisquote + 
2 = 5 - #return tok ''''' - return 1 - } - } - default { - error "tomlish tok error: squote_seq in unexpected state '$state' - expected leading-squote-space or trailing-squote-space" - } + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_squote is multisquote + 2 = 5 + #return tok with value ''''' + return 1 } } - whitespace { - #end whitespace - incr i -1 ;#reprocess sq + tentative_accum_dquote { + incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { - #temp token creatable only during value-expected or array-space + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space switch -- [tcl::string::length $tok] { 1 { + #no conclusion can yet be reached append tok $c } 2 { + #enter multiliteral #switch? append tok $c set_tokenType triple_squote return 1 } default { + #if there are more than 3 leading squotes we also enter multiliteral space and the subsequent ones are handled + #by the tentative_accum_squote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 squotes as data. 
error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" } } } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + #end whitespace + incr i -1 ;#reprocess sq + return 1 + } literal { #slash_active always false #terminate the literal @@ -4663,7 +4753,7 @@ namespace eval tomlish::parse { # idea: end this literalpart (possibly 'temporarily') # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) - incr i -1 ;#throw the "'" back to loop - will be added to an squote_seq token for later processing + incr i -1 ;#throw the "'" back to loop - will be added to a tentative_accum_squote token for later processing return 1 } XXXitablesquotedkey { @@ -4684,7 +4774,11 @@ namespace eval tomlish::parse { append tok $c } barekey { - #not clear why o'shennanigan shouldn't be a legal barekey - but it seems not to be. + #barekeys now support all sorts of unicode letter/number chars for other cultures + #but not punctuation - not even for those of Irish heritage who don't object + #to the anglicised form of some names. + # o'shenanigan seems to not be a legal barekey + #The Irish will have to use an earlier form Ó - which apparently many may prefer anyway. error "tomlish Unexpected single quote during barekey. 
[tomlish::parse::report_line]" } default { @@ -4693,63 +4787,69 @@ namespace eval tomlish::parse { } } else { switch -exact -- $state { - array-space { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading squote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_squote token or triple_squote token + #It currently doesn't trigger double_squote token + #(handle '' same as 'x' ie produce a single_squote and go into processing literal) + #review - producing double_squote for empty literal may be slightly more efficient. + #This token is not used to handle squote sequences *within* a multiliteral set_tokenType "_start_squote_sequence" set tok "'" } - itable-keyval-value-expected - keyval-value-expected { - set_tokenType "squote_seq_begin" + multiliteral-space { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_squote" ;#trigger tentative_accum_squote set tok "'" return 1 } - table-space { - #tests: squotedkey.test - set_tokenType "squotedkey" - set tok "" - } - itable-space { - #tests: squotedkey_itable.test + table-space - itable-space { + #tests: squotedkey.test squotedkey_itable.test set_tokenType "squotedkey" set tok "" } - XXXitable-space { - #future - could there be multiline keys? - #this would allow arbitrary tcl dicts to be stored in toml + XXXtable-space - XXXitable-space { + #future - could there be multiline keys? MLLKEY, MLBKEY ? 
+ #this would (almost) allow arbitrary tcl dicts to be stored in toml (aside from escaping issues) #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files - set_tokenType "squote_seq_begin" + #@2025 ABNF for toml mentions key, simple-key, unquoted-key, quoted-key and dotted-key + #where key is simple-key or dotted-key - no MLL or MLB components + #the spec states solution for arbitrary binary data is application specific involving encodings + #such as hex, base64 + set_tokenType "_start_squote_sequence" set tok "'" return 1 } tablename-state { #first char in tablename-state/tablearrayname-state - set_tokenType tablename + set_tokenType "tablename" append tok "'" } tablearrayname-state { - set_tokenType tablearrayname + set_tokenType "tablearrayname" append tok "'" } literal-state { + #shouldn't get here? review tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" - set_tokenType literal + set_tokenType "literal" incr -1 return 1 } multistring-space { - error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" - } - multiliteral-space { - #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row - #we are building up an squote_seq to determine if - #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines - #b) it is exactly ''' and we can terminate the whole multiliteral - #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space - set_tokenType "squote_seq_begin" - set tok "'" - return 1 + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "," + #error "tomlish unimplemented - squote during state '$state'. 
[tomlish::parse::report_line]" } dottedkey-space { - set_tokenType squotedkey + set_tokenType "squotedkey" } default { error "tomlish unhandled squote during state '$state'. [tomlish::parse::report_line]" @@ -4765,44 +4865,50 @@ namespace eval tomlish::parse { if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote { incr i -1 return 1 } - startquotesequence { - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - append tok $c - } elseif {$toklen == 2} { - append tok $c - #switch vs set? - set_tokenType "startmultiquote" - return 1 - } else { - error "tomlish unexpected token length $toklen in 'startquotesequence'" - } - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" return 1 - - #set toklen [tcl::string::length $tok] - #switch -- $toklen { - # 1 { - # set_tokenType "startsquote" - # incr i -1 - # return 1 - # } - # 2 { - # set_tokenType "startsquote" - # incr i -2 - # return 1 - # } - # default { - # error "tomlish unexpected _start_squote_sequence length $toklen" - # } - #} + } + tentative_accum_dquote { + #within multistring + #short tentative_accum_dquote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_dquote is multidquote + 2 = 5 + #return tok with value """"" + return 1 + } + } + _start_dquote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multistring + #switch? 
+ append tok $c + set_tokenType triple_dquote + return 1 + } + default { + #if there are more than 3 leading dquotes we also enter multistring space and the subsequent ones are handled + #by the tentative_accum_dquote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 dquotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_dquote_sequence'" + } + } } literal - literalpart { append tok $c @@ -4811,8 +4917,8 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" $c } else { - #unescaped quote always terminates a string? - set_token_waiting type endquote value "\"" complete 1 startindex $cindex + #unescaped quote always terminates a string + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex return 1 } } @@ -4821,77 +4927,31 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" $c } else { - #incr i -1 - - if {$multi_dquote eq "\"\""} { - set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex -2}] - set multi_dquote "" - return 1 - } else { - append multi_dquote "\"" - } + incr i -1 ;#throw the {"} back to loop - will be added to a tentative_accum_dquote token for later processing + return 1 } } whitespace { - switch -exact -- $state { - multistring-space { - #REVIEW - if {$had_slash} { - incr i -2 - return 1 - } else { - switch -- [tcl::string::length $multi_dquote] { - 2 { - set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex-2}] - set multi_dquote "" - return 1 - } - 1 { - incr i -2 - return 1 - } - 0 { - incr i -1 - return 1 - } - } - } - } - keyval-value-expected { - #end whitespace token and reprocess - incr i -1 - return 1 - - #if {$multi_dquote eq "\"\""} { - # set_token_waiting type startmultiquote value "\"\"\"" complete 1 - # set multi_dquote "" - # return 1 - #} else { - # #end whitespace token and reprocess - # incr i -1 - # return 1 - #} - } - table-space - 
itable-space { - incr i -1 - return 1 - } - default { - set_token_waiting type startquote value "\"" complete 1 startindex $cindex - return 1 - } + #assert: had_slash will only ever be true in multistring-space + if {$had_slash} { + incr i -2 + return 1 + } else { + #end whitespace token - throw dq back for reprocessing + incr i -1 + return 1 } } comment { if {$had_slash} {append tok "\\"} append tok $c } - XXXdquotedkey - XXXitablequotedkey { + XXXdquotedkey { if {$had_slash} { append tok "\\" append tok $c } else { - set_token_waiting type endquote value "\"" complete 1 startindex $cindex + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex return 1 } } @@ -4901,7 +4961,7 @@ namespace eval tomlish::parse { append tok "\\" append tok $c } else { - #set_token_waiting type endsquote value "'" complete 1 + #set_token_waiting type enddquote value {"} complete 1 return 1 } } @@ -4924,64 +4984,40 @@ namespace eval tomlish::parse { #$slash_active not relevant when no tokenType #token is string only if we're expecting a value at this point switch -exact -- $state { - array-space { - #!? start looking for possible multistartquote - #set_tokenType startquote - #set tok $c - #return 1 - set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote - set tok $c - } - keyval-value-expected - itable-keyval-value-expected { - set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote - set tok $c + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading dquote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_dquote token or triple_dquote token + #It currently doesn't trigger double_dquote token + #(handle "" same as "x" ie produce a single_dquote and go into processing string) + #review - producing double_dquote for empty string may be slightly more efficient. 
+ #This token is not used to handle dquote sequences once *within* a multistring + set_tokenType "_start_dquote_sequence" + set tok {"} } multistring-space { - #TODO - had_slash!!! - #REVIEW if {$had_slash} { set_tokenType "stringpart" set tok "\\\"" - set multi_dquote "" } else { - if {$multi_dquote eq "\"\""} { - tomlish::log::debug "- tokloop char dq ---> endmultiquote" - set_tokenType "endmultiquote" - set tok "\"\"\"" - return 1 - #set_token_waiting type endmultiquote value "\"\"\"" complete 1 - #set multi_dquote "" - #return 1 - } else { - append multi_dquote "\"" - } + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_dquote" ;#trigger tentative_accum_dquote + set tok {"} + return 1 } } multiliteral-space { set_tokenType "literalpart" set tok "\"" } - XXXtable-space { - set_tokenType "startquote" - set tok $c - return 1 - } - XXXitable-space { - set_tokenType "startquote" - set tok $c - } table-space - itable-space { set_tokenType "dquotedkey" set tok "" } - tablename-state { - set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } dottedkey-space { set_tokenType dquotedkey set tok "" @@ -4990,49 +5026,56 @@ namespace eval tomlish::parse { #set_tokenType dquote_seq_begin #set tok $c } + tablename-state { + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } default { - error "tomlish Unexpected quote during state '$state' [tomlish::parse::report_line]" + error "tomlish Unexpected dquote during 
state '$state' [tomlish::parse::report_line]" } } } } = { - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { - #assertion had_slash 0, multi_dquote "" + #assertion had_slash 0 append tok $c } - string - comment - dquotedkey - itablequotedkey { + string - comment - dquotedkey { #for these tokenTypes an = is just data. if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } whitespace { if {$state eq "multistring-space"} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen + incr i -1 return 1 } else { set_token_waiting type equal value = complete 1 startindex $cindex @@ -5063,7 +5106,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok ${dquotes}= + append tok = } multiliteral-space { set_tokenType "literalpart" @@ -5084,8 +5127,6 @@ namespace eval tomlish::parse { } cr { #REVIEW! - set dquotes $multi_dquote - set multi_dquote "" ;#!! # \r carriage return if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. 
set slash_active 0 @@ -5098,16 +5139,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal { @@ -5159,8 +5202,6 @@ namespace eval tomlish::parse { } lf { # \n newline - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5171,16 +5212,19 @@ namespace eval tomlish::parse { append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { + #multiliteral or multistring incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal { @@ -5196,20 +5240,14 @@ namespace eval tomlish::parse { return 1 } stringpart { - if {$dquotes ne ""} { - append tok $dquotes + if {$had_slash} { + #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) + set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] incr i -1 return 1 } else { - if {$had_slash} { - #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) - set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] - incr i -1 - return 1 - } else { - set_token_waiting type newline value lf complete 1 
startindex $cindex - return 1 - } + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 } } starttablename - tablename - tablearrayname - starttablearrayname { @@ -5236,20 +5274,13 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - #e.g one or 2 quotes just before nl - set_tokenType "stringpart" - set tok $dquotes - incr i -1 - return 1 - } set_tokenType "newline" set tok lf return 1 } } multiliteral-space { - #assert had_slash 0, multi_dquote "" + #assert had_slash 0 set_tokenType "newline" set tok "lf" return 1 @@ -5275,8 +5306,6 @@ namespace eval tomlish::parse { } } , { - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5287,39 +5316,40 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } comment - tablename - tablearrayname { if {$had_slash} {append tok "\\"} append tok , } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { #stringpart can have up to 2 quotes too if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } literal - literalpart - squotedkey { - #assert had_slash always 0, multi_dquote "" + #assert had_slash always 0 append tok $c } whitespace { if {$state eq "multistring-space"} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen + incr i -1 return 1 } else { set_token_waiting type comma value "," complete 1 startindex $cindex @@ -5338,10 +5368,10 @@ namespace eval tomlish::parse { 
set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} - append tok "$dquotes," + append tok "," } multiliteral-space { - #assert had_slash 0, multi_dquote "" + #assert had_slash 0 set_tokenType "literalpart" set tok "," } @@ -5354,8 +5384,6 @@ namespace eval tomlish::parse { } } . { - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5366,42 +5394,45 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } comment - untyped_value { if {$had_slash} {append tok "\\"} append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } literal - literalpart - squotedkey { - #assert had_slash always 0, multi_dquote "" + #assert had_slash always 0 append tok $c } whitespace { switch -exact -- $state { multistring-space { - set backchars [expr {[tcl::string::length $dquotes] + 1}] + #review if {$had_slash} { - incr backchars 1 + incr i -2 + } else { + incr i -1 } - incr i -$backchars return 1 } xxxdottedkey-space { @@ -5444,7 +5475,7 @@ namespace eval tomlish::parse { set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} - append tok "$dquotes." + append tok "." } multiliteral-space { set_tokenType "literalpart" @@ -5471,8 +5502,6 @@ namespace eval tomlish::parse { } " " { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
if {[tcl::string::length $tokenType]} { set had_slash $slash_active set slash_active 0 @@ -5483,16 +5512,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } barekey { @@ -5512,9 +5543,9 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok $dquotes$c + append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} { append tok "\\" } append tok $c } @@ -5526,8 +5557,7 @@ namespace eval tomlish::parse { incr i -2 return 1 } else { - #split into STRINGPART aaa WS " " - append tok $dquotes + #split into STRINGPART xxx WS " " incr i -1 return 1 } @@ -5537,15 +5567,7 @@ namespace eval tomlish::parse { } whitespace { if {$state eq "multistring-space"} { - if {$dquotes ne ""} { - #end whitespace token - #go back by the number of quotes plus this space char - set backchars [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backchars - return 1 - } else { - append tok $c - } + append tok $c } else { append tok $c } @@ -5588,12 +5610,6 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - set_tokenType "stringpart" - set tok $dquotes - incr i -1 - return 1 - } set_tokenType "whitespace" append tok $c } @@ -5613,9 +5629,6 @@ namespace eval tomlish::parse { } } tab { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
- if {[tcl::string::length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) set slash_active 0 @@ -5626,12 +5639,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } barekey { @@ -5662,7 +5681,6 @@ namespace eval tomlish::parse { return 1 } else { #split into STRINGPART aaa WS " " - append tok $dquotes incr i -1 return 1 } @@ -5706,15 +5724,8 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - set_tokenType stringpart - set tok $dquotes - incr i -1 - return 1 - } else { - set_tokenType whitespace - append tok $c - } + set_tokenType whitespace + append tok $c } } multiliteral-space { @@ -5732,16 +5743,31 @@ namespace eval tomlish::parse { #BOM (Byte Order Mark) - ignored by token consumer if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } _start_squote_sequence { #assert - tok will be one or two squotes only + #A toml literal probably isn't allowed to contain this + #but we will parse and let the validator sort it out. 
incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart { append tok $c } + string - stringpart { + append tok $c + } default { + #state machine will generally not have entry to accept bom - let it crash set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex return 1 } @@ -5752,6 +5778,10 @@ namespace eval tomlish::parse { set_tokenType "literalpart" set tok $c } + multistring-space { + set_tokenType "stringpart" + set tok $c + } default { set_tokenType "bom" set tok "\uFEFF" @@ -5761,8 +5791,6 @@ namespace eval tomlish::parse { } } default { - set dquotes $multi_dquote - set multi_dquote "" ;#!! if {[tcl::string::length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. @@ -5774,28 +5802,24 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } whitespace { if {$state eq "multistring-space"} { - if {$dquotes ne ""} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen - return 1 - } else { - incr i -1 - return 1 - } + incr i -1 + return 1 } else { #review incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. 
@@ -5815,7 +5839,7 @@ namespace eval tomlish::parse { return 1 } stringpart { - append tok $dquotes$c + append tok $c } default { #e.g comment/string/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname @@ -5835,22 +5859,12 @@ namespace eval tomlish::parse { error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" } } - XXXcurly-syntax { - puts stderr "curly-syntax - review" - if {[tomlish::utils::is_barekey $c]} { - set_tokenType "barekey" - append tok $c - } else { - error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" - } - } multistring-space { set_tokenType "stringpart" if {$had_slash} { - #assert - we don't get had_slash and dquotes at same time set tok \\$c } else { - set tok $dquotes$c + set tok $c } } multiliteral-space { @@ -5890,21 +5904,6 @@ namespace eval tomlish::parse { # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" #} switch -exact -- $tokenType { - startquotesequence { - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - #invalid - #eof with open string - error "tomlish eof reached without closing quote for string. [tomlish::parse::report_line]" - } elseif {$toklen == 2} { - #valid - #we ended in a double quote, not actually a startquoteseqence - effectively an empty string - switch_tokenType "startquote" - incr i -1 - #set_token_waiting type string value "" complete 1 - return 1 - } - } _start_squote_sequence { set toklen [tcl::string::length $tok] switch -- $toklen { @@ -5913,11 +5912,29 @@ namespace eval tomlish::parse { error "tomlish eof reached without closing single quote for string literal. 
[tomlish::parse::report_line]" } 2 { - #review - set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] set_tokenType "literal" set tok "" return 1 + + ##review + #set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] + #set_tokenType "literal" + #set tok "" + #return 1 + } + } + } + _start_dquote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open string + error "tomlish eof reached without closing double quote for string. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "string" + set tok "" + return 1 } } } @@ -6011,6 +6028,16 @@ namespace eval tomlish::dict { return $name } + proc _show_tablenames {tablenames_info} { + append msg \n "tablenames_info:" \n + dict for {tkey tinfo} $tablenames_info { + append msg " " "table: $tkey" \n + dict for {field finfo} $tinfo { + append msg " " "$field $finfo" \n + } + } + return $msg + } } tcl::namespace::eval tomlish::app { diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/dictn-0.1.1.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/dictn-0.1.1.tm new file mode 100644 index 00000000..c9ef87f2 --- /dev/null +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/dictn-0.1.1.tm @@ -0,0 +1,349 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. 
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +# (C) 2023 +# +# @@ Meta Begin +# Application dictn 0.1.1 +# Meta platform tcl +# Meta license +# @@ Meta End + + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Requirements +##e.g package require frobz + + + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +namespace eval dictn { + namespace export {[a-z]*} + namespace ensemble create +} + + +## ::dictn::append +#This can of course 'ruin' a nested dict if applied to the wrong element +# - i.e using the string op 'append' on an element that is itself a nested dict is analogous to the standard Tcl: +# %set list {a b {c d}} +# %append list x +# a b {c d}x +# IOW - don't do that unless you really know that's what you want. +# +proc ::dictn::append {dictvar path {value {}}} { + if {[llength $path] == 1} { + uplevel 1 [list dict append $dictvar $path $value] + } else { + upvar 1 $dictvar dvar + + ::set str [dict get $dvar {*}$path] + append str $val + dict set dvar {*}$path $str + } +} + +proc ::dictn::create {args} { + ::set data {} + foreach {path val} $args { + dict set data {*}$path $val + } + return $data +} + +proc ::dictn::exists {dictval path} { + return [dict exists $dictval {*}$path] +} + +proc ::dictn::filter {dictval path filterType args} { + ::set sub [dict get $dictval {*}$path] + dict filter $sub $filterType {*}$args +} + +proc ::dictn::for {keyvalvars dictval path body} { + ::set sub [dict get $dictval {*}$path] + dict for $keyvalvars $sub $body +} + +proc ::dictn::get {dictval {path {}}} { + return [dict get $dictval {*}$path] +} + +proc ::dictn::getdef {dictval path default} { + return [dict getdef $dictval {*}$path $default] +} + +proc ::dictn::getwithdefault {dictval path default} { + return [dict getdef $dictval {*}$path $default] +} + +if {[info commands ::tcl::dict::getdef] ne ""} { + proc ::dictn::incr {dictvar path {increment {}} } { + if {$increment eq ""} { + ::set increment 1 + } + if {[llength $path] == 1} { + uplevel 1 [list dict incr 
$dictvar $path $increment] + } else { + upvar 1 $dictvar dvar + if {![::info exists dvar]} { + dict set dvar {*}$path $increment + } else { + ::set newval [expr {[dict getdef $dvar {*}$path 0] + $increment}] + dict set dvar {*}$path $newval + } + return $dvar + } + } +} else { + proc ::dictn::incr {dictvar path {increment {}} } { + if {$increment eq ""} { + ::set increment 1 + } + if {[llength $path] == 1} { + uplevel 1 [list dict incr $dictvar $path $increment] + } else { + upvar 1 $dictvar dvar + if {![::info exists dvar]} { + dict set dvar {*}$path $increment + } else { + if {![dict exists $dvar {*}$path]} { + ::set val 0 + } else { + ::set val [dict get $dvar {*}$path] + } + ::set newval [expr {$val + $increment}] + dict set dvar {*}$path $newval + } + return $dvar + } + } +} + +proc ::dictn::info {dictval {path {}}} { + if {![string length $path]} { + return [dict info $dictval] + } else { + ::set sub [dict get $dictval {*}$path] + return [dict info $sub] + } +} + +proc ::dictn::keys {dictval {path {}} {glob {}}} { + ::set sub [dict get $dictval {*}$path] + if {[string length $glob]} { + return [dict keys $sub $glob] + } else { + return [dict keys $sub] + } +} + +proc ::dictn::lappend {dictvar path args} { + if {[llength $path] == 1} { + uplevel 1 [list dict lappend $dictvar $path {*}$args] + } else { + upvar 1 $dictvar dvar + + ::set list [dict get $dvar {*}$path] + ::lappend list {*}$args + dict set dvar {*}$path $list + } +} + +proc ::dictn::merge {args} { + error "nested merge not yet supported" +} + +#dictn remove dictionaryValue ?path ...? +proc ::dictn::remove {dictval args} { + ::set basic [list] ;#buffer basic (1element path) removals to do in a single call. 
+ + foreach path $args { + if {[llength $path] == 1} { + ::lappend basic $path + } else { + #extract,modify,replace + ::set subpath [lrange $path 0 end-1] + + ::set sub [dict get $dictval {*}$subpath] + ::set sub [dict remove $sub [lindex $path end]] + + dict set dictval {*}$subpath $sub + } + } + + if {[llength $basic]} { + return [dict remove $dictval {*}$basic] + } else { + return $dictval + } +} + + +proc ::dictn::replace {dictval args} { + ::set basic [list] ;#buffer basic (1element path) replacements to do in a single call. + + foreach {path val} $args { + if {[llength $path] == 1} { + ::lappend basic $path $val + } else { + #extract,modify,replace + ::set subpath [lrange $path 0 end-1] + + ::set sub [dict get $dictval {*}$subpath] + ::set sub [dict replace $sub [lindex $path end] $val] + + dict set dictval {*}$subpath $sub + } + } + + + if {[llength $basic]} { + return [dict replace $dictval {*}$basic] + } else { + return $dictval + } +} + + +proc ::dictn::set {dictvar path newval} { + upvar 1 $dictvar dvar + return [dict set dvar {*}$path $newval] +} + +proc ::dictn::size {dictval {path {}}} { + return [dict size [dict get $dictval {*}$path]] +} + +proc ::dictn::unset {dictvar path} { + upvar 1 $dictvar dvar + return [dict unset dvar {*}$path +} + +proc ::dictn::update {dictvar args} { + ::set body [lindex $args end] + ::set maplist [lrange $args 0 end-1] + + upvar 1 $dictvar dvar + foreach {path var} $maplist { + if {[dict exists $dvar {*}$path]} { + uplevel 1 [list set $var [dict get $dvar $path]] + } + } + + catch {uplevel 1 $body} result + + foreach {path var} $maplist { + if {[dict exists $dvar {*}$path]} { + upvar 1 $var $var + if {![::info exists $var]} { + uplevel 1 [list dict unset $dictvar {*}$path] + } else { + uplevel 1 [list dict set $dictvar {*}$path [::set $var]] + } + } + } + return $result +} + +#an experiment. 
+proc ::dictn::Applyupdate {dictvar args} { + ::set body [lindex $args end] + ::set maplist [lrange $args 0 end-1] + + upvar 1 $dictvar dvar + + ::set headscript "" + ::set i 0 + foreach {path var} $maplist { + if {[dict exists $dvar {*}$path]} { + #uplevel 1 [list set $var [dict get $dvar $path]] + ::lappend arglist $var + ::lappend vallist [dict get $dvar {*}$path] + ::append headscript [string map [list %i% $i %v% $var] {upvar 1 %v% %v%; set %v% [lindex $args %i%]} ] + ::append headscript \n + ::incr i + } + } + + ::set body $headscript\r\n$body + + puts stderr "BODY: $body" + + #set result [apply [list args $body] {*}$vallist] + catch {apply [list args $body] {*}$vallist} result + + foreach {path var} $maplist { + if {[dict exists $dvar {*}$path] && [::info exists $var]} { + dict set dvar {*}$path [::set $var] + } + } + return $result +} + +proc ::dictn::values {dictval {path {}} {glob {}}} { + ::set sub [dict get $dictval {*}$path] + if {[string length $glob]} { + return [dict values $sub $glob] + } else { + return [dict values $sub] + } +} + +# Standard form: +#'dictn with dictVariable path body' +# +# Extended form: +#'dictn with dictVariable path arrayVariable body' +# +proc ::dictn::with {dictvar path args} { + if {[llength $args] == 1} { + ::set body [lindex $args 0] + return [uplevel 1 [list dict with $dictvar {*}$path $body]] + } else { + upvar 1 $dictvar dvar + ::lassign $args arrayname body + + upvar 1 $arrayname arr + array set arr [dict get $dvar {*}$path] + ::set prevkeys [array names arr] + + catch {uplevel 1 $body} result + + + foreach k $prevkeys { + if {![::info exists arr($k)]} { + dict unset $dvar {*}$path $k + } + } + foreach k [array names arr] { + dict set $dvar {*}$path $k $arr($k) + } + + return $result + } +} + + + + + + + + + + + + +# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +## Ready +package provide dictn [namespace eval dictn { + variable version + ::set version 0.1.1 +}] +return \ No newline at end of file diff --git 
a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/include_modules.config b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/include_modules.config index 247371ee..afd1e8f2 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/include_modules.config +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/include_modules.config @@ -27,6 +27,7 @@ set bootsupport_modules [list\ src/vendormodules sha1\ src/vendormodules tomlish\ src/vendormodules test::tomlish\ + src/vendormodules dictn\ src/vendormodules textutil::adjust\ src/vendormodules textutil::repeat\ src/vendormodules textutil::split\ diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/test/tomlish-1.1.3.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/test/tomlish-1.1.3.tm index ed5044a7..8afb43d9 100644 Binary files a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/test/tomlish-1.1.3.tm and b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/test/tomlish-1.1.3.tm differ diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/tomlish-1.1.4.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/tomlish-1.1.4.tm index 7a6d5205..33d5b912 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/tomlish-1.1.4.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/tomlish-1.1.4.tm @@ -153,15 +153,10 @@ namespace eval tomlish { } #review - if {[uplevel 1 [list info exists tablenames_seen]]} { - upvar tablenames_seen tablenames_seen + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info } else { - set tablenames_seen [list] ;#list of lists - } - if {[uplevel 1 [list info exists tablenames_closed]]} { - upvar tablenames_closed 
tablenames_closed - } else { - set tablenames_closed [list] ;#list of lists + set tablenames_info [dict create] ;#keys are lists {parenttable subtable etc} corresponding to parenttable.subtable.etc } foreach sub [lrange $keyval_element 2 end] { @@ -207,13 +202,10 @@ namespace eval tomlish { ARRAY { #we need to recurse to get the corresponding dict for the contained item(s) #pass in the whole $found_sub - not just the $value! - set prev_tablenames_seen $tablenames_seen - set prev_tablenames_closed $tablenames_closed - set tablenames_seen [list] - set tablenames_closed [list] + set prev_tablenames_info $tablenames_info + set tablenames_info [dict create] set result [list type $type value [::tomlish::to_dict [list $found_sub]]] - set tablenames_seen $prev_tablenames_seen - set tablenames_closed $prev_tablenames_closed + set tablenames_info $prev_tablenames_info } MULTISTRING - MULTILITERAL { #review - mapping these to STRING might make some conversions harder? @@ -295,23 +287,66 @@ namespace eval tomlish { #[Data] #temps = [{cpu = 79.5, case = 72.0}] proc to_dict {tomlish} { + package require dictn #keep track of which tablenames have already been directly defined, # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. #we don't error out just because a previous tablename segment has already appeared. 
- ##variable tablenames_seen [list] - if {[uplevel 1 [list info exists tablenames_seen]]} { - upvar tablenames_seen tablenames_seen - } else { - set tablenames_seen [list] ;#list of lists - } - if {[uplevel 1 [list info exists tablenames_closed]]} { - upvar tablenames_closed tablenames_closed + + #Declaring, Creating, and Defining Tables + #https://github.com/toml-lang/toml/issues/795 + #(update - only Creating and Defining are relevant terminology) + + #review + #tablenames_info keys created, defined, createdby, definedby, closedby + + #consider the following 2 which are legal: + #[table] #'table' created, defined=open definedby={header table} + #x.y = 3 + #[table.x.z] #'table' defined=closed closedby={header table.x.z}, 'table.x' created, 'table.x.z' created defined=open definedby={header table.x.z} + #k= 22 + # #'table.x.z' defined=closed closedby={eof eof} + + #equivalent datastructure + + #[table] #'table' created, defined=open definedby={header table} + #[table.x] #'table' defined=closed closedby={header table.x}, 'table.x' created defined=open definedby={header table.x} + #y = 3 + #[table.x.z] #'table.x' defined=closed closedby={header table.x.z}, 'table.x.z' created defined=open definedby={header table.x.z} + #k=22 + + #illegal + #[table] #'table' created and defined=open + #x.y = 3 #'table.x' created first keyval pair defined=open definedby={keyval x.y = 3} + #[table.x.y.z] #'table' defined=closed, 'table.x' closed because parent 'table' closed?, 'table.x.y' cannot be created + #k = 22 + # + ## - we would fail on encountering table.x.y because only table and table.x are effectively tables - but that table.x is closed should be detected (?) + + #illegal + #[table] + #x.y = {p=3} + #[table.x.y.z] + #k = 22 + ## we should fail because y is an inline table which is closed to further entries + + #note: it is not safe to compare normalized tablenames using join! 
+ # e.g a.'b.c'.d is not the same as a.b.c.d + # instead compare {a b.c d} with {a b c d} + # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. + #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' + #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} + + + + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info } else { - set tablenames_closed [list] ;#list of lists + set tablenames_info [dict create] ;#keyed on tablepath each of which is a list such as {config subgroup etc} (corresponding to config.subgroup.etc) } + log::info "---> to_dict processing '$tomlish'<<<" set items $tomlish @@ -354,7 +389,7 @@ namespace eval tomlish { #a.b.c = 1 #table_key_hierarchy -> a b - #leafkey -> c + #tleaf -> c if {[llength $dotted_key_hierarchy] == 0} { #empty?? probably invalid. review #This is different to '' = 1 or ''.'' = 1 which have lengths 1 and 2 respectively @@ -362,10 +397,10 @@ namespace eval tomlish { } elseif {[llength $dotted_key_hierarchy] == 1} { #dottedkey is only a key - no table component set table_hierarchy [list] - set leafkey [lindex $dotted_key_hierarchy 0] + set tleaf [lindex $dotted_key_hierarchy 0] } else { set table_hierarchy [lrange $dotted_key_hierarchy 0 end-1] - set leafkey [lindex $dotted_key_hierarchy end] + set tleaf [lindex $dotted_key_hierarchy end] } #ensure empty tables are still represented in the datastructure @@ -380,143 +415,101 @@ namespace eval tomlish { } } #review? - if {[dict exists $datastructure {*}$table_hierarchy $leafkey]} { - error "Duplicate key '$table_hierarchy $leafkey'. The key already exists at this level in the toml data. The toml data is not valid." + if {[dict exists $datastructure {*}$table_hierarchy $tleaf]} { + error "Duplicate key '$table_hierarchy $tleaf'. The key already exists at this level in the toml data. The toml data is not valid." 
} #JMN test 2025 if {[llength $table_hierarchy]} { - lappend tablenames_seen $table_hierarchy + dictn incr tablenames_info [list $table_hierarchy seencount] } set keyval_dict [_get_keyval_value $item] if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - lappend tablenames_seen [list {*}$table_hierarchy $leafkey] - lappend tablenames_closed [list {*}$table_hierarchy $leafkey] + set t [list {*}$table_hierarchy $tleaf] + dictn incr tablenames_info [list $t seencount] + dictn set tablenames_info [list $t closed] 1 #review - item is an ITABLE - we recurse here without datastructure context :/ #overwriting keys? todo ? - dict set datastructure {*}$table_hierarchy $leafkey $keyval_dict + dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict } else { - dict set datastructure {*}$table_hierarchy $leafkey $keyval_dict + dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict } + } + TABLEARRAY { + set tablename [lindex $item 1] + log::debug "---> to_dict processing item TABLENAME (name: $tablename): $item" + set norm_segments [::tomlish::utils::tablename_split $tablename true] ;#true to normalize + #we expect repeated tablearray entries - each adding a sub-object to the value, which is an array/list. + } TABLE { set tablename [lindex $item 1] + log::debug "---> to_dict processing item TABLE (name: $tablename): $item" #set tablename [::tomlish::utils::tablename_trim $tablename] set norm_segments [::tomlish::utils::tablename_split $tablename true] ;#true to normalize - if {$norm_segments in $tablenames_seen} { - error "Table name '$tablename' has already been directly defined in the toml data. Invalid." - } - log::debug "---> to_dict processing item $tag (name: $tablename): $item" - set name_segments [::tomlish::utils::tablename_split $tablename] ;#unnormalized - set last_seg "" - #toml spec rule - all segments mst be non-empty - #note that the results of tablename_split are 'raw' - ie some segments may be enclosed in single or double quotes. 
- - set table_key_sublist [list] - - foreach normseg $norm_segments { - lappend table_key_sublist $normseg - if {[dict exists $datastructure {*}$table_key_sublist]} { - #It's ok for this key to already exist *if* it was defined by a previous tablename or equivalent - #and if this key is longer - - #consider the following 2 which are legal: - #[table] - #x.y = 3 - #[table.x.z] - #k= 22 - - #equivalent - - #[table] - #[table.x] - #y = 3 - #[table.x.z] - #k=22 - - #illegal - #[table] - #x.y = 3 - #[table.x.y.z] - #k = 22 - ## - we should fail on encountering table.x.y because only table and table.x are effectively tables - - #illegal - #[table] - #x.y = {p=3} - #[table.x.y.z] - #k = 22 - ## we should fail because y is an inline table which is closed to further entries - - - #note: it is not safe to compare normalized tablenames using join! - # e.g a.'b.c'.d is not the same as a.b.c.d - # instead compare {a b.c d} with {a b c d} - # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. - #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' - #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} - - set sublist_length [llength $table_key_sublist] - set found_testkey 0 - if {$table_key_sublist in $tablenames_seen} { - set found_testkey 1 - } else { - #see if it was defined by a longer entry - foreach seen_table_segments $tablenames_seen { - if {[llength $seen_table_segments] <= $sublist_length} { - continue - } - #each tablenames_seen entry is already a list of normalized segments - - #we could have [a.b.c.d] early on - # followed by [a.b] - which was still defined by the earlier one. + set T_DEFINED [dictn getdef $tablenames_info [list $norm_segments defined] NULL] + if {$T_DEFINED ne "NULL"} { + #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path + set msg "Table name $tablename has already been directly defined in the toml data. 
Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + error $msg + } - set seen_longer [lrange $seen_segments 0 [expr {$sublist_length -1}]] - puts stderr "testkey:'$table_key_sublist' vs seen_match:'$seen_longer'" - if {$table_key_sublist eq $seen_longer} { - set found_testkey 1 - } - } - } - if {$found_testkey == 0} { - #the raw unnormalized tablename might be ok to display in the error message, although it's not the actual dict keyset - set msg "key $table_key_sublist already exists in datastructure, but wasn't defined by a supertable." - append msg \n "tablenames_seen:" \n - foreach ts $tablenames_seen { - append msg " " $ts \n - } + set name_segments [::tomlish::utils::tablename_split $tablename 0] ;#unnormalized e.g ['a'."b".c.d] -> 'a' "b" c d + #results of tablename_split 0 are 'raw' - ie some segments may be enclosed in single or double quotes. + + + set supertable [list] + ############## + # [a.b.c.d] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + if {![dictn exists $tablenames_info [list $supertable type]]} { + #supertable with this path doesn't yet exist + if {[dict exists $datastructure {*}$supertable]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] 
of table name $tablename already has data - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] error $msg + } else { + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + dictn set tablenames_info [list $supertable type] header + #ensure empty tables are still represented in the datastructure + dict set datastructure {*}$supertable [list] } - } - - } - - #ensure empty tables are still represented in the datastructure - set key_sublist [list] - foreach k $norm_segments { - lappend key_sublist $k - if {![dict exists $datastructure {*}$key_sublist]} { - dict set datastructure {*}$key_sublist [list] } else { - tomlish::log::notice "to_dict datastructure at (TABLE) subkey $key_sublist already had data: [dict get $datastructure {*}$key_sublist]" + #supertable has already been created - and maybe defined - but even if defined we can add subtables } } + #table [a.b.c.d] hasn't been defined - but may have been 'created' already by a longer tablename + # - or may have existing data from a keyval + if {![dictn exists $tablenames_info [list $norm_segments type]]} { + if {[dict exists $datastructure {*}$norm_segments]} { + set msg "Table name $tablename already has data - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + error $msg + } + #no data or previously created table + dictn set tablenames_info [list $norm_segments type] header - #We must do this after the key-collision test above! 
- lappend tablenames_seen $norm_segments - - + #We are 'defining' this table's keys and values here (even if empty) + dict set datastructure {*}$norm_segments [list] ;#ensure table still represented in datastructure even if we add no keyvals here + } + dictn set tablenames_info [list $norm_segments defined] open log::debug ">>> to_dict >>>>>>>>>>>>>>>>> normalized table key hierarchy : $norm_segments" #now add the contained elements foreach element [lrange $item 2 end] { set type [lindex $element 0] - log::debug "----> tododict processing $tag subitem $type processing contained element $element" + log::debug "----> todict processing $tag subitem $type processing contained element $element" switch -exact -- $type { DOTTEDKEY { set dkey_info [_get_dottedkey_info $element] @@ -547,14 +540,19 @@ namespace eval tomlish { puts stdout "to_dict>>> $keyval_dict" dict set datastructure {*}$norm_segments {*}$dkeys $leaf_key $keyval_dict #JMN 2025 - lappend tablenames_seen [list {*}$norm_segments {*}$dkeys] + #lappend tablenames_info [list {*}$norm_segments {*}$dkeys] + set tkey [list {*}$norm_segments {*}$dkeys] + dictn incr tablenames_info [list $tkey seencount] if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys # inner structure will contain {type value } if all leaves are not empty ITABLES - lappend tablenames_seen [list {*}$norm_segments {*}$dkeys $leaf_key] + set tkey [list {*}$norm_segments {*}$dkeys $leaf_key] + #lappend tablenames_info [list {*}$norm_segments {*}$dkeys $leaf_key] + dictn incr tablenames_info [list $tkey seencount] #if the keyval_dict is not a simple type x value y - then it's an inline table ? #if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. 
+ dictn set tablenames_info [list $tkey closed] 1 } } @@ -562,7 +560,7 @@ namespace eval tomlish { #ignore } default { - error "Sub element of type '$type' not understood in table context. Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" + error "Sub element of type '$type' not understood in table context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" } } } @@ -1316,7 +1314,12 @@ namespace eval tomlish::encode { #NOTE - this DELIBERATELY does not validate the data, or process escapes etc #It encodes the tomlish records as they are. #ie it only produces toml shaped data from a tomlish list. + # #It is part of the roundtripability of data from toml to tomlish + #!! ie - it is not the place to do formatting of inline vs multiline !! + # That needs to be encoded in the tomlish data that is being passed in + # (e.g from_dict could make formatting decisions in the tomlish it produces) + # #e.g duplicate keys etc can exist in the toml output. #The to_dict from_dict (or any equivalent processor pair) is responsible for validation and conversion #back and forth of escape sequences where appropriate. @@ -1646,17 +1649,27 @@ namespace eval tomlish::decode { #pop_trigger_tokens: newline tablename endarray endinlinetable #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. 
set parentlevel [expr {$nest -1}] - set do_append_to_parent 1 ;#most tokens will leave this alone - but some like squote_seq need to do their own append + set do_append_to_parent 1 ;#most tokens will leave this alone - but some like tentative_accum_squote need to do their own append switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote { + #should only apply within a multiliteral #### set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed #Without this - we would get extraneous empty list entries in the parent # - as the xxx-squote-space isn't a space level from the toml perspective # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-squote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-squote-space { + } + default { + error "--- unexpected popped due to tentative_accum_squote but came from state '$prevstate' should have been trailing-squote-space" + } + } switch -- $tok { ' { - tomlish::parse::set_token_waiting type startsquote value $tok complete 1 startindex [expr {$i -1}] + tomlish::parse::set_token_waiting type single_squote value $tok complete 1 startindex [expr {$i -1}] } '' { #review - we should perhaps return double_squote instead? 
@@ -1669,74 +1682,51 @@ namespace eval tomlish::decode { tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] } '''' { - switch -exact -- $prevstate { - leading-squote-space { - error "---- 4 squotes from leading-squote-space - shouldn't get here" - #we should have emitted the triple and left the last for next loop + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left squote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]'" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata } - trailing-squote-space { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] - #todo integrate left squote with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]'" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "'"] - } - MULTILITERAL { - #empty - lappend v($parentlevel) [list LITERALPART "'"] - } - default { - error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" - } - } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "'"] + } + MULTILITERAL { + #empty + lappend v($parentlevel) [list LITERALPART "'"] } default { - error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" + error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" } } } ''''' { - switch -exact -- $prevstate { - leading-squote-space { - error "---- 5 squotes from leading-squote-space - 
shouldn't get here" - #we should have emitted the triple and left the following squotes for next loop + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 squotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]''" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata } - trailing-squote-space { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] - #todo integrate left 2 squotes with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]''" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "''"] - } - MULTILITERAL { - lappend v($parentlevel) [list LITERALPART "''"] - } - default { - error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" - } - } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "''"] + } + MULTILITERAL { + lappend v($parentlevel) [list LITERALPART "''"] } default { - error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" + error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" } } } } - puts stderr "tomlish::decode::toml ---- HERE squote_seq pop <$tok>" } triple_squote { #presumably popping multiliteral-space @@ -1763,7 +1753,119 @@ namespace eval tomlish::decode { lappend merged $part } default { - error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($next)" + error "---- triple_squote unhandled part type [lindex $part 0] unable 
to merge leveldata: $v($nest)" + } + } + set lasttype [lindex $part 0] + } + set v($nest) $merged + } + tentative_accum_dquote { + #should only apply within a multistring + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the trailing-dquote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-dquote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-dquote-space { + } + default { + error "--- unexpected popped due to tentative_accum_dquote but came from state '$prevstate' should have been trailing-dquote-space" + } + } + switch -- $tok { + {"} { + tomlish::parse::set_token_waiting type single_dquote value $tok complete 1 startindex [expr {$i -1}] + } + {""} { + #review - we should perhaps return double_dquote instead? 
+ #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_dquote value "" complete 1 startindex [expr {$i - 2}] + } + {"""} { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 3}] + } + {""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left dquote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {"}] + } + MULTISTRING { + #empty + lappend v($parentlevel) [list STRINGPART {"}] + } + default { + error "--- don't know how to integrate extra trailing dquote with data $v($parentlevel)" + } + } + } + {"""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 dquotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {""}] + } + MULTISTRING { + lappend v($parentlevel) [list STRINGPART {""}] + } + default { + error "--- don't know how to integrate extra trailing 2 dquotes with data $v($parentlevel)" + } + } + } + } + } + triple_dquote { + #presumably popping multistring-space + ::tomlish::log::debug "---- triple_dquote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + 
switch -exact -- [lindex $part 0] { + MULTISTRING { + lappend merged $part + } + STRINGPART { + if {$lasttype eq "STRINGPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + CONT - WS { + lappend merged $part + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_dquote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" } } set lasttype [lindex $part 0] @@ -1809,15 +1911,12 @@ namespace eval tomlish::decode { endinlinetable { ::tomlish::log::debug "---- endinlinetable for last_space_action pop" } - endmultiquote { - ::tomlish::log::debug "---- endmultiquote for last_space_action 'pop'" - } default { error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" } } if {$do_append_to_parent} { - #e.g squote_seq does it's own appends as necessary - so won't get here + #e.g tentative_accum_squote does it's own appends as necessary - so won't get here lappend v($parentlevel) [set v($nest)] } @@ -1831,8 +1930,8 @@ namespace eval tomlish::decode { switch -exact -- $tokenType { - squote_seq_begin { - #### + tentative_trigger_squote - tentative_trigger_dquote { + #### this startok will always be tentative_accum_squote/tentative_accum_dquote starting with one accumulated squote/dquote if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { lassign [dict get $transition_info starttok] starttok_type starttok_val set next_tokenType_known 1 @@ -1840,6 +1939,16 @@ namespace eval tomlish::decode { set tok $starttok_val } } + single_squote { + #JMN - REVIEW + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + triple_squote { + 
::tomlish::log::debug "---- push trigger tokenType triple_squote" + set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERALPART + } squotedkey { switch -exact -- $prevstate { table-space - itable-space { @@ -1849,6 +1958,9 @@ namespace eval tomlish::decode { #todo - check not something already waiting? tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } + triple_dquote { + set v($nest) [list MULTISTRING] ;#container for NEWLINE,STRINGPART,CONT + } dquotedkey { switch -exact -- $prevstate { table-space - itable-space { @@ -1858,7 +1970,7 @@ namespace eval tomlish::decode { #todo - check not something already waiting? tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } - XXXdquotedkey - XXXitablequotedkey { + XXXdquotedkey { #todo set v($nest) [list DQKEY $tok] ;#$tok is the keyname } @@ -1878,34 +1990,29 @@ namespace eval tomlish::decode { tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } } - startsquote { - #JMN - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } tablename { #note: we do not use the output of tomlish::tablename_trim to produce a tablename for storage in the tomlish list! #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish # back to toml file will be identical. #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, - # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names. 
+ # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names from + # a structural perspective. #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the # tomlish list? - set test_only [::tomlish::utils::tablename_trim $tok] - ::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$test_only'" + #set trimtable [::tomlish::utils::tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$trimtable'" set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name #note also that equivalent tablenames may have different toml representations even after being trimmed! #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. } tablearrayname { - set test_only [::tomlish::utils::tablename_trim $tok] - puts stdout "trimmed (but not normalized) tablearrayname: '$test_only'" + #set trimtable [::tomlish::utils::tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablearrayname: '$trimtable'" set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name } startarray { @@ -1914,14 +2021,6 @@ namespace eval tomlish::decode { startinlinetable { set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. 
} - startmultiquote { - ::tomlish::log::debug "---- push trigger tokenType startmultiquote" - set v($nest) [list MULTISTRING] ;#container for STRINGPART, WS, CONT, NEWLINE - } - triple_squote { - ::tomlish::log::debug "---- push trigger tokenType triple_squote" - set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERAL - } default { error "---- push trigger tokenType '$tokenType' not yet implemented" } @@ -1931,11 +2030,11 @@ namespace eval tomlish::decode { #no space level change switch -exact -- $tokenType { squotedkey { - puts "---- squotedkey in state $prevstate (no space level change)" + #puts "---- squotedkey in state $prevstate (no space level change)" lappend v($nest) [list SQKEY $tok] } dquotedkey { - puts "---- dquotedkey in state $prevstate (no space level change)" + #puts "---- dquotedkey in state $prevstate (no space level change)" lappend v($nest) [list DQKEY $tok] } barekey { @@ -1960,29 +2059,46 @@ namespace eval tomlish::decode { startinlinetable { puts stderr "---- decode::toml error. 
did not expect startinlinetable without space level change (no space level change)" } - startquote { + single_dquote { switch -exact -- $newstate { string-state { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "string" set tok "" } - quoted-key { + dquoted-key { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "dquotedkey" set tok "" } - XXXitable-quoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "itablequotedkey" - set tok "" + multistring-space { + lappend v($nest) [list STRINGPART {"}] + #may need to be joined on pop if there are neighbouring STRINGPARTS + } + default { + error "---- single_dquote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_dquote { + #leading extra quotes - test: toml_multistring_startquote2 + switch -exact -- $prevstate { + itable-keyval-value-expected - keyval-value-expected { + puts stderr "tomlish::decode::toml double_dquote TEST" + #empty string + lappend v($nest) [list STRINGPART ""] + } + multistring-space { + #multistring-space to multistring-space + lappend v($nest) [list STRINGPART {""}] } default { - error "---- startquote switch case not implemented for nextstate: $newstate (no space level change)" + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" } } + } - startsquote { + single_squote { switch -exact -- $newstate { literal-state { set next_tokenType_known 1 @@ -1995,41 +2111,17 @@ namespace eval tomlish::decode { set tok "" } multiliteral-space { - #false alarm squote returned from squote_seq pop + #false alarm squote returned from tentative_accum_squote pop ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" #(single squote - not terminating space) lappend v($nest) [list LITERALPART '] #may need to be joined on pop if there are neighbouring LITERALPARTs } default { 
- error "---- startsquote switch case not implemented for nextstate: $newstate (no space level change)" + error "---- single_squote switch case not implemented for nextstate: $newstate (no space level change)" } } } - startmultiquote { - #review - puts stderr "---- got startmultiquote in state $prevstate (no space level change)" - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "stringpart" - set tok "" - } - endquote { - #nothing to do? - set tok "" - } - endsquote { - set tok "" - } - endmultiquote { - #JMN!! - set tok "" - } - string { - lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes - } - literal { - lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes - } double_squote { switch -exact -- $prevstate { keyval-value-expected { @@ -2044,6 +2136,19 @@ namespace eval tomlish::decode { } } } + enddquote { + #nothing to do? + set tok "" + } + endsquote { + set tok "" + } + string { + lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes + } + literal { + lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes + } multistring { #review lappend v($nest) [list MULTISTRING $tok] @@ -2056,11 +2161,9 @@ namespace eval tomlish::decode { } literalpart { lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly - } - itablequotedkey { - } untyped_value { + #would be better termed unclassified_value #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. 
if {$tok in {true false}} { set tag BOOL @@ -2238,7 +2341,7 @@ namespace eval tomlish::utils { #eg {dog."tater.man"} set sLen [tcl::string::length $tablename] set segments [list] - set mode "unknown" ;#5 modes: unknown, quoted,litquoted, unquoted, syntax + set mode "preval" ;#5 modes: preval, quoted,litquoted, unquoted, postval #quoted is for double-quotes, litquoted is for single-quotes (string literal) set seg "" for {set i 0} {$i < $sLen} {incr i} { @@ -2249,139 +2352,166 @@ namespace eval tomlish::utils { set lastChar "" } + #todo - track\count backslashes properly + set c [tcl::string::index $tablename $i] + if {$c eq "\""} { + if {($lastChar eq "\\")} { + #not strictly correct - we could have had an even number prior-backslash sequence + #the toml spec would have us error out immediately on bsl in bad location - but we're + #trying to parse to unvalidated tomlish + set ctest escq + } else { + set ctest dq + } + } else { + set ctest [string map [list " " sp \t tab] $c] + } - if {$c eq "."} { - switch -exact -- $mode { - unquoted { - #dot marks end of segment. - lappend segments $seg - set seg "" - set mode "unknown" - } - quoted { - append seg $c - } - unknown { - lappend segments $seg - set seg "" - } - litquoted { - append seg $c - } - default { - #mode: syntax - #we got our dot. - the syntax mode is now satisfied. - set mode "unknown" + switch -- $ctest { + . { + switch -exact -- $mode { + preval { + error "tablename_split. dot not allowed - expecting a value" + } + unquoted { + #dot marks end of segment. + #if {![is_barekey $seg]} { + # error "tablename_split. 
dot not allowed - expecting a value" + #} + lappend segments $seg + set seg "" + set mode "preval" + } + quoted { + append seg $c + } + litquoted { + append seg $c + } + postval { + #got dot in an expected location + set mode "preval" + } } } - } elseif {($c eq "\"") && ($lastChar ne "\\")} { - if {$mode eq "unknown"} { - if {[tcl::string::trim $seg] ne ""} { - #we don't allow a quote in the middle of a bare key - error "tablename_split. character '\"' invalid at this point in tablename. tablename: '$tablename'" - } - set mode "quoted" - set seg "\"" - } elseif {$mode eq "unquoted"} { - append seg $c - } elseif {$mode eq "quoted"} { - append seg $c - - if {$normalize} { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - } else { - lappend segments $seg + dq { + #unescaped dquote + switch -- $mode { + preval { + set mode "quoted" + set seg "\"" + } + unquoted { + #invalid in barekey - but we are after structure only + append seg $c + } + quoted { + append seg $c + if {$normalize} { + lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" ;#make sure we only accept a dot or end-of-data now. + } + litquoted { + append seg $c + } + postval { + error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" + } } - - set seg "" - set mode "syntax" ;#make sure we only accept a dot or end-of-data now. - } elseif {$mode eq "litquoted"} { - append seg $c - } elseif {$mode eq "syntax"} { - error "tablename_split. expected whitespace or dot, got double quote. 
tablename: '$tablename'" - } - } elseif {($c eq "\'")} { - if {$mode eq "unknown"} { - append seg $c - set mode "litquoted" - } elseif {$mode eq "unquoted"} { - #single quote inside e.g o'neill - append seg $c - } elseif {$mode eq "quoted"} { - append seg $c - - } elseif {$mode eq "litquoted"} { - append seg $c - #no normalization to do - lappend segments $seg - set seg "" - set mode "syntax" - } elseif {$mode eq "syntax"} { - error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" } - - } elseif {$c in [list " " \t]} { - if {$mode eq "syntax"} { - #ignore - } else { - append seg $c + ' { + switch -- $mode { + preval { + append seg $c + set mode "litquoted" + } + unquoted { + #single quote inside e.g o'neill - ultimately invalid - but we pass through here. + append seg $c + } + quoted { + append seg $c + } + litquoted { + append seg $c + #no normalization to do aside from stripping squotes + if {$normalize} { + lappend segments [tcl::string::range $seg 1 end-1] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" + } + postval { + error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" + } + } } - } else { - if {$mode eq "syntax"} { - error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" + sp - tab { + switch -- $mode { + preval - postval { + #ignore + } + unquoted { + #terminates a barekey + lappend segments $seg + set seg "" + set mode "postval" + } + default { + #append to quoted or litquoted + append seg $c + } + } } - if {$mode eq "unknown"} { - set mode "unquoted" + default { + switch -- $mode { + preval { + set mode unquoted + append seg $c + } + postval { + error "tablename_split. Expected a dot separator. got '$c'. 
tablename: '$tablename'" + } + default { + append seg $c + } + } } - append seg $c } + if {$i == $sLen-1} { #end of data ::tomlish::log::debug "End of data: mode='$mode'" - #REVIEW - we can only end up in unquoted or syntax here? are other branches reachable? switch -exact -- $mode { - quoted { - if {$c ne "\""} { - error "tablename_split. missing closing double-quote in a segment. tablename: '$tablename'" - } - if {$normalize} { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - #lappend segments [subst -nocommands -novariables [::string range $seg 1 end-1]] ;#wrong - } else { - lappend segments $seg - } + preval { + error "tablename_split. Expected a value after last dot separator. tablename: '$tablename'" } - litquoted { - set trimmed_seg [tcl::string::trim $seg] - if {[tcl::string::index $trimmed_seg end] ne "\'"} { - error "tablename_split. missing closing single-quote in a segment. tablename: '$tablename'" - } + unquoted { lappend segments $seg } - unquoted - unknown { - lappend segments $seg + quoted { + error "tablename_split. Expected a trailing double quote. tablename: '$tablename'" } - syntax { - #ok - segment already lappended + litquoted { + error "tablename_split. Expected a trailing single quote. tablename: '$tablename'" } - default { - lappend segments $seg + postval { + #ok - segment already lappended } } } } - foreach seg $segments { - set trimmed [tcl::string::trim $seg " \t"] - #note - we explicitly allow 'empty' quoted strings '' & "" - # (these are 'discouraged' but valid toml keys) - #if {$trimmed in [list "''" "\"\""]} { - # puts stderr "tablename_split. warning - Empty quoted string as tablename segment" - #} - if {$trimmed eq "" } { - error "tablename_split. Empty segment found. 
tablename: '$tablename' segments [llength $segments] ($segments)" - } - } + + #note - we must allow 'empty' quoted strings '' & "" + # (these are 'discouraged' but valid toml keys) + return $segments } @@ -2432,26 +2562,34 @@ namespace eval tomlish::utils { #- escape_string and unescape_string would not be reliably roundtrippable inverses anyway. #REVIEW - provide it anyway? When would it be desirable to use? - variable Bstring_control_map [list\ - \b {\b}\ - \n {\n}\ - \r {\r}\ - \" {\"}\ - \x1b {\e}\ - \\ "\\\\"\ - ] + variable Bstring_control_map [dict create] + dict set Bstring_control_map \b {\b} + dict set Bstring_control_map \n {\n} + dict set Bstring_control_map \r {\r} + dict set Bstring_control_map \" {\"} + #dict set Bstring_control_map \x1b {\e} ;#should presumably be only be a convenience for decode - going the other way we get \u001B + dict set Bstring_control_map \\ "\\\\" + #\e for \x1b seems like it might be included - v1.1?? hard to find current state of where toml is going :/ #for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped. - for {set cdec 0} {$cdec <= 8} {incr cdec} { + #8 = \b - already in list. 
+ #built the remainder whilst checking for entries already hardcoded above -in case more are added to the hardcoded list + for {set cdec 0} {$cdec <= 7} {incr cdec} { set hhhh [format %.4X $cdec] - lappend Bstring_control_map [format %c $cdec] \\u$hhhh + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } } for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { set hhhh [format %.4X $cdec] - lappend Bstring_control_map [format %c $cdec] \\u$hhhh + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } } # \u007F = 127 - lappend Bstring_control_map [format %c 127] \\u007F + dict set Bstring_control_map [format %c 127] \\u007F #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) @@ -2474,6 +2612,7 @@ namespace eval tomlish::utils { # it recognizes other escapes which aren't approprite e.g \xhh and octal \nnn # it replaces \ with a single whitespace (trailing backslash) #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh + #plus \e for \x1b? 
set buffer "" set buffer4 "" ;#buffer for 4 hex characters following a \u @@ -2558,12 +2697,13 @@ namespace eval tomlish::utils { set ctest [tcl::string::map {{"} dq} $c] switch -exact -- $ctest { dq { - set e "\\\"" - append buffer [subst -nocommand -novariable $e] + append buffer {"} } b - t - n - f - r { - set e "\\$c" - append buffer [subst -nocommand -novariable $e] + append buffer [subst -nocommand -novariable "\\$c"] + } + e { + append buffer \x1b } u { set unicode4_active 1 @@ -2578,8 +2718,7 @@ namespace eval tomlish::utils { #review - toml spec says all other escapes are reserved #and if they are used TOML should produce an error. #we leave detecting this for caller for now - REVIEW - append buffer "\\" - append buffer $c + append buffer "\\$c" } } } else { @@ -3003,7 +3142,7 @@ namespace eval tomlish::parse { # states: # table-space, itable-space, array-space # array-value-expected,keyval-value-expected,itable-keyval-value-expected, keyval-syntax, - # quoted-key, squoted-key + # dquoted-key, squoted-key # string-state, literal-state, multistring... # # notes: @@ -3039,6 +3178,12 @@ namespace eval tomlish::parse { variable stateMatrix set stateMatrix [dict create] + #--------------------------------------------------------- + #WARNING + #The stateMatrix implementation here is currently messy. + #The code is a mixture of declarative via the stateMatrix and imperative via switch statements during PUSH/POP/SAMESPACE transitions. + #This means the state behaviour has to be reasoned about by looking at both in conjuction. 
+ #--------------------------------------------------------- #xxx-space vs xxx-syntax inadequately documented - TODO @@ -3060,35 +3205,19 @@ namespace eval tomlish::parse { barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ dquotedkey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ - XXXstartquote "quoted-key"\ - XXXstartsquote "squoted-key"\ + XXXsingle_dquote "quoted-key"\ + XXXsingle_squote "squoted-key"\ comment "table-space"\ starttablename "tablename-state"\ starttablearrayname "tablearrayname-state"\ - startmultiquote "err-state"\ - endquote "err-state"\ + enddquote "err-state"\ + endsquote "err-state"\ comma "err-state"\ eof "end-state"\ equal "err-state"\ cr "err-lonecr"\ } - #itable-space/ curly-syntax : itables - dict set stateMatrix\ - itable-space {\ - whitespace "itable-space"\ - newline "itable-space"\ - barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - endinlinetable "POPSPACE"\ - XXXstartquote "quoted-key"\ - XXXstartsquote {TOSTATE "squoted-key" comment "jn-testing"}\ - comma "err-state"\ - comment "itable-space"\ - eof "err-state"\ - } - #squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-space starttok {squote_seq "'"}} dict set stateMatrix\ @@ -3113,26 +3242,19 @@ namespace eval tomlish::parse { dict set stateMatrix\ keyval-value-expected {\ whitespace "keyval-value-expected"\ - untyped_value {TOSTATE "keyval-tail" note ""}\ - startquote {TOSTATE "string-state" returnstate keyval-tail}\ - startmultiquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-value-expected starttok {squote_seq "'"}}\ - startsquote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is 
triggered"}\ - double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"}\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ - startarray {PUSHSPACE array-space returnstate keyval-tail}\ - } - #squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-process-leading-squotes starttok {squote_seq "'"}} - dict set stateMatrix\ - leading-squote-space {\ - squote_seq "POPSPACE"\ + untyped_value {TOSTATE "keyval-tail" note ""}\ + literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ + string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ + single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ + single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ + startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ + startarray {PUSHSPACE array-space returnstate keyval-tail}\ } - #dict set stateMatrix\ - # keyval-process-leading-squotes {\ - # startsquote "literal-state"\ - # triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - # } + #double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"} + + #2025 - no leading-squote-space - only trailing-squote-space. 
dict set stateMatrix\ keyval-tail {\ @@ -3142,81 +3264,106 @@ namespace eval tomlish::parse { eof "end-state"\ } + + #itable-space/ curly-syntax : itables + # x={y=1,} + dict set stateMatrix\ + itable-space {\ + whitespace "itable-space"\ + newline "itable-space"\ + barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + endinlinetable "POPSPACE"\ + comma "err-state"\ + comment "itable-space"\ + eof "err-state"\ + } + #we don't get single_squote etc here - instead we get the resulting squotedkey token + + + # ??? review - something like this + # + # x={y =1,} dict set stateMatrix\ itable-keyval-syntax {\ - whitespace "itable-keyval-syntax"\ - barekey {PUSHSPACE "dottedkey-space"}\ - squotedkey {PUSHSPACE "dottedkey-space"}\ - dquotedkey {PUSHSPACE "dottedkey-space"}\ - equal "itable-keyval-value-expected"\ + whitespace {TOSTATE "itable-keyval-syntax"}\ + barekey {PUSHSPACE "dottedkey-space"}\ + squotedkey {PUSHSPACE "dottedkey-space"}\ + dquotedkey {PUSHSPACE "dottedkey-space"}\ + equal {TOSTATE "itable-keyval-value-expected"}\ newline "err-state"\ eof "err-state"\ } + + # x={y=1} + dict set stateMatrix\ + itable-keyval-space {\ + whitespace "itable-keyval-syntax"\ + equal {TOSTATE "itable-keyval-value-expected" note "required"}\ + } + dict set stateMatrix\ itable-keyval-value-expected {\ whitespace "itable-keyval-value-expected"\ untyped_value {TOSTATE "itable-val-tail" note ""}\ - startquote {TOSTATE "string-state" returnstate itable-val-tail}\ - startmultiquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-keyval-value-expected starttok {squote_seq "'"}}\ - startsquote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ - double_squote {TOSTATE "itable-val-tail" 
note "empty literal received when double squote occurs"}\ + single_dquote {TOSTATE "string-state" returnstate itable-val-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ + single_squote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ } - dict set stateMatrix\ - itable-keyval-space {\ - whitespace "itable-keyval-syntax"\ - equal {TOSTATE "itable-keyval-value-expected" note "required"}\ - } + #double_squote not currently generated by _start_squote_sequence - '' processed as single_squote to literal-state just like 'xxx' + # review + # double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"} + + + # x={y=1,z="x"} + #POPSPACE is transition from itable-keyval-space to parent itable-space dict set stateMatrix\ itable-val-tail {\ whitespace "itable-val-tail"\ endinlinetable "POPSPACE"\ comma "POPSPACE"\ - XXXnewline {TOSTATE "itable-val-tail" note "itable-space ??"}\ - newline "POPSPACE"\ + newline {TOSTATE "itable-val-tail" note "itable-space ??"}\ comment "itable-val-tail"\ eof "err-state"\ } - #dict set stateMatrix\ - # itable-quoted-key {\ - # whitespace "NA"\ - # itablequotedkey {PUSHSPACE "itable-keyval-space"}\ - # newline "err-state"\ - # endquote "itable-keyval-syntax"\ - # } - #dict set stateMatrix\ - # itable-squoted-key {\ - # whitespace "NA"\ - # itablesquotedkey {PUSHSPACE "itable-keyval-space"}\ - # newline "err-state"\ - # endsquote "itable-keyval-syntax"\ - # } + # XXXnewline "POPSPACE" + # We shouldn't popspace on newline - as if there was no comma we need to stay in itable-val-tail + # This means the newline and subsequent whitespace, comments etc become part of the preceeding dottedkey record + #e.g + # x = { + # j=1 + # 
#comment within dottedkey j record + # , # comment unattached + # #comment unattached + # k=2 , #comment unattached + # l=3 #comment within l record + # , m=4 + # #comment associated with m record + # + # #still associated with m record + # } + ## - This doesn't quite correspond to what a user might expect - but seems like a consistent mechanism. + #The awkwardness is because there is no way to put in a comment that doesn't consume a trailing comma + #so we cant do: j= 1 #comment for j1 , + # and have the trailing comma recognised. + # + # To associate: j= 1, #comment for j1 + # we would need some extra processing . (not popping until next key ? extra state itable-sep-tail?) REVIEW - worth doing? + # + # The same issue occurs with multiline arrays. The most natural assumption is that a comment on same line after a comma + # is 'associated' with the previous entry. + # + # These comment issues are independent of the data dictionary being generated for conversion to json etc - as the comments don't carry through anyway, + # but are a potential oddity for manipulating the intermediate tomlish structure whilst attempting to preserve 'associated' comments + # (e.g reordering records within an itable) + #The user's intention for 'associated' isn't always clear and the specs don't really guide on this. - - - #array-value-expected ? - dict set stateMatrix\ - XXXvalue-expected {\ - whitespace "value-expected"\ - untyped_value {"SAMESPACE" "" replay untyped_value}\ - startquote "string-state"\ - startsquote "literal-state"\ - triple_squote {PUSHSPACE "multiliteral-space"}\ - startmultiquote {PUSHSPACE "multistring-space"}\ - startinlinetable {PUSHSPACE itable-space}\ - startarray {PUSHSPACE array-space}\ - comment "err-state-value-expected-got-comment"\ - comma "err-state"\ - newline "err-state"\ - eof "err-state"\ - } - #note comment token should never be delivered to array-value-expected state? 
- #dottedkey-space is not (currently) used within [tablename] or [[tablearrayname]] #it is for keyval ie x.y.z = value @@ -3245,6 +3392,8 @@ namespace eval tomlish::parse { whitespace "dottedkey-space-tail" dotsep "dottedkey-space" equal "POPSPACE"\ + eof "err-state"\ + newline "err-state"\ } #-------------------------------------------------------------------------- @@ -3262,22 +3411,10 @@ namespace eval tomlish::parse { #toml spec looks like heading towards allowing newlines within inline tables #https://github.com/toml-lang/toml/issues/781 - #2025 - appears to be valid for 1.1 - which we are targeting. + #2025 - multiline itables appear to be valid for 1.1 - which we are targeting. #https://github.com/toml-lang/toml/blob/main/toml.md#inline-table #JMN2025 - #dict set stateMatrix\ - # curly-syntax {\ - # whitespace "curly-syntax"\ - # newline "curly-syntax"\ - # barekey {PUSHSPACE "itable-keyval-space"}\ - # itablequotedkey "itable-keyval-space"\ - # endinlinetable "POPSPACE"\ - # startquote "itable-quoted-key"\ - # comma "itable-space"\ - # comment "itable-space"\ - # eof "err-state"\ - # } #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES #We currently allow multiline ITABLES (also with comments) in the tokenizer. #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? 
@@ -3291,10 +3428,9 @@ namespace eval tomlish::parse { # untyped_value "SAMESPACE"\ # startarray {PUSHSPACE "array-space"}\ # endarray "POPSPACE"\ - # startmultiquote {PUSHSPACE multistring-space}\ # startinlinetable {PUSHSPACE itable-space}\ - # startquote "string-state"\ - # startsquote "literal-state"\ + # single_dquote "string-state"\ + # single_squote "literal-state"\ # triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ # comma "array-space"\ # comment "array-space"\ @@ -3305,15 +3441,16 @@ namespace eval tomlish::parse { set aspace [dict create] dict set aspace whitespace "array-space" dict set aspace newline "array-space" - dict set aspace untyped_value "SAMESPACE" + #dict set aspace untyped_value "SAMESPACE" + dict set aspace untyped_value "array-syntax" dict set aspace startarray {PUSHSPACE "array-space"} dict set aspace endarray "POPSPACE" - dict set aspace startmultiquote {PUSHSPACE multistring-space} + dict set aspace single_dquote {TOSTATE "string-state" returnstate array-syntax} + dict set aspace triple_dquote {PUSHSPACE "multistring-space" returnstate array-syntax} + dict set aspace single_squote {TOSTATE "literal-state" returnstate array-syntax} + dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax} dict set aspace startinlinetable {PUSHSPACE itable-space} - dict set aspace startquote "string-state" - dict set aspace startsquote "literal-state" - dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"} - dict set aspace comma "array-space" + #dict set aspace comma "array-space" dict set aspace comment "array-space" dict set aspace eof "err-state-array-space-got-eof" dict set stateMatrix array-space $aspace @@ -3329,26 +3466,16 @@ namespace eval tomlish::parse { #dict set asyntax untyped_value "SAMESPACE" #dict set asyntax startarray {PUSHSPACE array-space} dict set asyntax endarray "POPSPACE" - #dict set asyntax 
startmultiquote {PUSHSPACE multistring-space} - #dict set asyntax startquote "string-state" - #dict set asyntax startsquote "literal-state" + #dict set asyntax single_dquote "string-state" + #dict set asyntax single_squote "literal-state" dict set asyntax comma "array-space" dict set asyntax comment "array-syntax" dict set stateMatrix array-syntax $asyntax - #quoted-key & squoted-key need to PUSHSPACE from own token to keyval-space - dict set stateMatrix\ - quoted-key {\ - whitespace "NA"\ - dquotedkey {PUSHSPACE "keyval-space"}\ - newline "err-state"\ - endquote "keyval-syntax"\ - } - - #review + #dquotedkey is a token - dquoted-key is a state dict set stateMatrix\ dquoted-key {\ whitespace "NA"\ @@ -3367,7 +3494,7 @@ namespace eval tomlish::parse { string-state {\ whitespace "NA"\ string "string-state"\ - endquote "SAMESPACE"\ + enddquote "SAMESPACE"\ newline "err-state"\ eof "err-state"\ } @@ -3381,20 +3508,21 @@ namespace eval tomlish::parse { } - #dict set stateMatrix\ - # stringpart {\ - # continuation "SAMESPACE"\ - # endmultiquote "POPSPACE"\ - # eof "err-state"\ - # } dict set stateMatrix\ multistring-space {\ - whitespace "multistring-space"\ - continuation "multistring-space"\ - stringpart "multistring-space"\ - newline "multistring-space"\ - endmultiquote "POPSPACE"\ - eof "err-state"\ + whitespace "multistring-space"\ + continuation "multistring-space"\ + stringpart "multistring-space"\ + newline "multistring-space"\ + tentative_trigger_dquote {PUSHSPACE "trailing-dquote-space" returnstate multistring-space starttok {tentative_accum_dquote {"}}}\ + single_dquote {TOSTATE multistring-space}\ + double_dquote {TOSTATE multistring-space}\ + triple_dquote {POPSPACE}\ + eof "err-state"\ + } + dict set stateMatrix\ + trailing-dquote-space { + tentative_accum_dquote "POPSPACE" } @@ -3402,19 +3530,19 @@ namespace eval tomlish::parse { #todo - treat sole cr as part of literalpart but crlf and lf as newline dict set stateMatrix\ multiliteral-space {\ - 
literalpart "multiliteral-space"\ - newline "multiliteral-space"\ - squote_seq_begin {PUSHSPACE "trailing-squote-space" returnstate multiliteral-space starttok {squote_seq "'"}}\ - triple_squote {POPSPACE note "on popping - we do any necessary concatenation of LITERALPART items due to squote processing"}\ - double_squote {TOSTATE multiliteral-space note "short squote_seq: can occur anywhere in the space e.g emitted at end when 5 squotes occur"}\ - startsquote {TOSTATE multiliteral-space note "short squote_seq: same as double_squote - false alarm"}\ - eof "err-premature-eof-in-multiliteral-space"\ + literalpart "multiliteral-space"\ + newline "multiliteral-space"\ + tentative_trigger_squote {PUSHSPACE "trailing-squote-space" returnstate multiliteral-space starttok {tentative_accum_squote "'"}}\ + single_squote {TOSTATE multiliteral-space note "short tentative_accum_squote: false alarm this squote is part of data"}\ + double_squote {TOSTATE multiliteral-space note "short tentative_accum_squote: can occur anywhere in the space e.g emitted at end when 5 squotes occur"}\ + triple_squote {POPSPACE note "on popping - we do any necessary concatenation of LITERALPART items due to squote processing"}\ + eof "err-premature-eof-in-multiliteral-space"\ } #trailing because we are looking for possible terminating ''' - but must accept '''' or ''''' and re-integrate the 1st one or 2 extra squotes dict set stateMatrix\ - trailing-squote-space {\ - squote_seq "POPSPACE"\ + trailing-squote-space { + tentative_accum_squote "POPSPACE" } @@ -3499,7 +3627,7 @@ namespace eval tomlish::parse { - + dict set stateMatrix\ end-state {} @@ -3557,14 +3685,13 @@ namespace eval tomlish::parse { dict set spacePushTransitions itable-keyval-space itable-keyval-syntax dict set spacePushTransitions array-space array-space dict set spacePushTransitions table-space tablename-state - dict set spacePushTransitions #itable-space itable-space + #dict set spacePushTransitions #itable-space itable-space #Pop 
to, next variable spacePopTransitions [dict create] dict set spacePopTransitions array-space array-syntax - #itable-space curly-syntax #itable-keyval-space itable-val-tail #review #we pop to keyval-space from dottedkey-space or from keyval-value-expected? we don't always want to go to keyval-tail @@ -3575,7 +3702,6 @@ namespace eval tomlish::parse { #JMN test #dict set spaceSameTransitions array-space array-syntax - #itable-space curly-syntax #itable-keyval-space itable-val-tail @@ -3611,6 +3737,8 @@ namespace eval tomlish::parse { ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" switch -exact -- [lindex $transition_to 0] { POPSPACE { + set popfromspace_info [spacestack peek] + set popfromspace_state [dict get $popfromspace_info state] spacestack pop set parent_info [spacestack peek] set type [dict get $parent_info type] @@ -3625,17 +3753,17 @@ namespace eval tomlish::parse { set existing [spacestack pop] dict unset existing returnstate spacestack push $existing ;#re-push modification - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected to stored returnstate $next <<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected to stored returnstate $next <<---" } else { ### #review - do away with spacePopTransitions - which although useful to provide a default.. 
# - involve error-prone configurations distant to the main state transition configuration in stateMatrix if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" } else { set next $parentspace - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace<<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace<<---" } } set result $next @@ -3805,22 +3933,6 @@ namespace eval tomlish::parse { return $tokenType } - proc _shortcircuit_startquotesequence {} { - variable tok - variable i - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - set_tokenType "startquote" - incr i -1 - return -level 2 1 - } elseif {$toklen == 2} { - puts stderr "_shortcircuit_startquotesequence toklen 2" - set_tokenType "startquote" - set tok "\"" - incr i -2 - return -level 2 1 - } - } proc get_token_waiting {} { variable token_waiting @@ -3940,7 +4052,6 @@ namespace eval tomlish::parse { set slash_active 0 set quote 0 set c "" - set multi_dquote "" for {} {$i < $sLen} {} { if {$i > 0} { set lastChar [tcl::string::index $s [expr {$i - 1}]] @@ -3957,8 +4068,6 @@ namespace eval tomlish::parse { switch -exact -- $ctest { # { - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 @@ -3966,16 +4075,20 @@ namespace eval tomlish::parse { if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { + #for multiliteral, multistring - data and/or end incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } 
_start_squote_sequence { + #pseudo token beginning with underscore - never returned to state machine - review incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] ;#step back by the pseudo-token length, as the _start_squote_sequence arm does + set_tokenType "single_dquote" return 1 } barekey { @@ -4003,7 +4116,7 @@ namespace eval tomlish::parse { append tok $c } default { - #dquotedkey, itablequotedkey, string,literal, multistring + #dquotedkey, string,literal, multistring append tok $c } } @@ -4015,7 +4128,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes#" + append tok "#" } multiliteral-space { set_tokenType "literalpart" @@ -4031,23 +4144,23 @@ namespace eval tomlish::parse { } lc { #left curly brace - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] ;#step back by the pseudo-token length, as the _start_squote_sequence arm does + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { @@ -4059,7 +4172,7 @@ namespace eval tomlish::parse { } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename - starttablearrayname { #*bare* tablename can only contain letters,digits underscores @@ -4105,7 +4218,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\{" + append tok "\{" } multiliteral-space { set_tokenType "literalpart" @@ -4120,37 +4233,35 @@ namespace eval tomlish::parse { } rc { #right curly brace - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set
slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey - comment { + string - dquotedkey - comment { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename - tablename { if {$had_slash} {append tok "\\"} @@ -4221,7 +4332,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\}" + append tok "\}" } multiliteral-space { set_tokenType "literalpart" ; #review @@ -4237,35 +4348,35 @@ namespace eval tomlish::parse { } lb { #left square bracket - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename { #change the tokenType @@ -4332,7 
+4443,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\[" + append tok "\[" } multiliteral-space { set_tokenType "literalpart" @@ -4350,37 +4461,35 @@ namespace eval tomlish::parse { } rb { #right square bracket - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } comment { if {$had_slash} {append tok "\\"} @@ -4428,16 +4537,6 @@ namespace eval tomlish::parse { } } } - XXXtablearraynames { - puts "rb @ tablearraynames ??" - #switch? - - #todo? - if {$had_slash} {append tok "\\"} - #invalid! - but leave for datastructure loading stage to catch - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex - return 1 - } default { incr i -1 return 1 @@ -4485,7 +4584,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\]" + append tok "\]" } multiliteral-space { set_tokenType "literalpart" @@ -4498,21 +4597,21 @@ namespace eval tomlish::parse { } } bsl { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
#backslash if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } whitespace { @@ -4529,9 +4628,7 @@ namespace eval tomlish::parse { append tok "\\" set slash_active 0 } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey - comment { + string - dquotedkey - comment { if {$slash_active} { set slash_active 0 append tok "\\\\" @@ -4545,7 +4642,6 @@ namespace eval tomlish::parse { set slash_active 0 append tok "\\\\" } else { - append tok $dquotes set slash_active 1 } } @@ -4575,10 +4671,6 @@ namespace eval tomlish::parse { set tok "\\\\" set slash_active 0 } else { - if {$dquotes ne ""} { - set_tokenType "stringpart" - set tok $dquotes - } set slash_active 1 } } @@ -4599,58 +4691,56 @@ namespace eval tomlish::parse { set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { - #short squote_seq tokens are returned if active during any other character + tentative_accum_squote { + #for within multiliteral + #short tentative_accum_squote tokens are returned if active upon receipt of any other character #longest allowable for leading/trailing are returned here #### set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote - switch -- $state { - leading-squote-space { - append tok $c - if {$existingtoklen > 2} { - error "tomlish tok error: squote_seq unexpected length $existingtoklen when another received" - } elseif {$existingtoklen == 2} { - return 1 ;#return tok ''' - } - } - trailing-squote-space { - append tok $c - if {$existingtoklen == 4} { - #maxlen to be an squote_seq is multisquote + 
2 = 5 - #return tok ''''' - return 1 - } - } - default { - error "tomlish tok error: squote_seq in unexpected state '$state' - expected leading-squote-space or trailing-squote-space" - } + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_squote is multisquote + 2 = 5 + #return tok with value ''''' + return 1 } } - whitespace { - #end whitespace - incr i -1 ;#reprocess sq + tentative_accum_dquote { + incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { - #temp token creatable only during value-expected or array-space + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space switch -- [tcl::string::length $tok] { 1 { + #no conclusion can yet be reached append tok $c } 2 { + #enter multiliteral #switch? append tok $c set_tokenType triple_squote return 1 } default { + #if there are more than 3 leading squotes we also enter multiliteral space and the subsequent ones are handled + #by the tentative_accum_squote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 squotes as data. 
error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" } } } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + #end whitespace + incr i -1 ;#reprocess sq + return 1 + } literal { #slash_active always false #terminate the literal @@ -4663,7 +4753,7 @@ namespace eval tomlish::parse { # idea: end this literalpart (possibly 'temporarily') # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) - incr i -1 ;#throw the "'" back to loop - will be added to an squote_seq token for later processing + incr i -1 ;#throw the "'" back to loop - will be added to a tentative_accum_squote token for later processing return 1 } XXXitablesquotedkey { @@ -4684,7 +4774,11 @@ namespace eval tomlish::parse { append tok $c } barekey { - #not clear why o'shennanigan shouldn't be a legal barekey - but it seems not to be. + #barekeys now support all sorts of unicode letter/number chars for other cultures + #but not punctuation - not even for those of Irish heritage who don't object + #to the anglicised form of some names. + # o'shenanigan seems to not be a legal barekey + #The Irish will have to use an earlier form Ó - which apparently many may prefer anyway. error "tomlish Unexpected single quote during barekey. 
[tomlish::parse::report_line]" } default { @@ -4693,63 +4787,69 @@ namespace eval tomlish::parse { } } else { switch -exact -- $state { - array-space { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading squote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_squote token or triple_squote token + #It currently doesn't trigger double_squote token + #(handle '' same as 'x' ie produce a single_squote and go into processing literal) + #review - producing double_squote for empty literal may be slightly more efficient. + #This token is not used to handle squote sequences *within* a multiliteral set_tokenType "_start_squote_sequence" set tok "'" } - itable-keyval-value-expected - keyval-value-expected { - set_tokenType "squote_seq_begin" + multiliteral-space { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_squote" ;#trigger tentative_accum_squote set tok "'" return 1 } - table-space { - #tests: squotedkey.test - set_tokenType "squotedkey" - set tok "" - } - itable-space { - #tests: squotedkey_itable.test + table-space - itable-space { + #tests: squotedkey.test squotedkey_itable.test set_tokenType "squotedkey" set tok "" } - XXXitable-space { - #future - could there be multiline keys? - #this would allow arbitrary tcl dicts to be stored in toml + XXXtable-space - XXXitable-space { + #future - could there be multiline keys? MLLKEY, MLBKEY ? 
+ #this would (almost) allow arbitrary tcl dicts to be stored in toml (aside from escaping issues) #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files - set_tokenType "squote_seq_begin" + #@2025 ABNF for toml mentions key, simple-key, unquoted-key, quoted-key and dotted-key + #where key is simple-key or dotted-key - no MLL or MLB components + #the spec states solution for arbitrary binary data is application specific involving encodings + #such as hex, base64 + set_tokenType "_start_squote_sequence" set tok "'" return 1 } tablename-state { #first char in tablename-state/tablearrayname-state - set_tokenType tablename + set_tokenType "tablename" append tok "'" } tablearrayname-state { - set_tokenType tablearrayname + set_tokenType "tablearrayname" append tok "'" } literal-state { + #shouldn't get here? review tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" - set_tokenType literal + set_tokenType "literal" incr -1 return 1 } multistring-space { - error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" - } - multiliteral-space { - #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row - #we are building up an squote_seq to determine if - #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines - #b) it is exactly ''' and we can terminate the whole multiliteral - #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space - set_tokenType "squote_seq_begin" - set tok "'" - return 1 + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "'" ;#the squote is plain data in a multistring - keep it in the stringpart + #error "tomlish unimplemented - squote during state '$state'.
[tomlish::parse::report_line]" } dottedkey-space { - set_tokenType squotedkey + set_tokenType "squotedkey" } default { error "tomlish unhandled squote during state '$state'. [tomlish::parse::report_line]" @@ -4765,44 +4865,50 @@ namespace eval tomlish::parse { if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote { incr i -1 return 1 } - startquotesequence { - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - append tok $c - } elseif {$toklen == 2} { - append tok $c - #switch vs set? - set_tokenType "startmultiquote" - return 1 - } else { - error "tomlish unexpected token length $toklen in 'startquotesequence'" - } - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" return 1 - - #set toklen [tcl::string::length $tok] - #switch -- $toklen { - # 1 { - # set_tokenType "startsquote" - # incr i -1 - # return 1 - # } - # 2 { - # set_tokenType "startsquote" - # incr i -2 - # return 1 - # } - # default { - # error "tomlish unexpected _start_squote_sequence length $toklen" - # } - #} + } + tentative_accum_dquote { + #within multistring + #short tentative_accum_dquote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_dquote is multidquote + 2 = 5 + #return tok with value """"" + return 1 + } + } + _start_dquote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multistring + #switch? 
+ append tok $c + set_tokenType triple_dquote + return 1 + } + default { + #if there are more than 3 leading dquotes we also enter multistring space and the subsequent ones are handled + #by the tentative_accum_dquote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 dquotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_dquote_sequence'" + } + } } literal - literalpart { append tok $c @@ -4811,8 +4917,8 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" $c } else { - #unescaped quote always terminates a string? - set_token_waiting type endquote value "\"" complete 1 startindex $cindex + #unescaped quote always terminates a string + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex return 1 } } @@ -4821,77 +4927,31 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" $c } else { - #incr i -1 - - if {$multi_dquote eq "\"\""} { - set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex -2}] - set multi_dquote "" - return 1 - } else { - append multi_dquote "\"" - } + incr i -1 ;#throw the {"} back to loop - will be added to a tentative_accum_dquote token for later processing + return 1 } } whitespace { - switch -exact -- $state { - multistring-space { - #REVIEW - if {$had_slash} { - incr i -2 - return 1 - } else { - switch -- [tcl::string::length $multi_dquote] { - 2 { - set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex-2}] - set multi_dquote "" - return 1 - } - 1 { - incr i -2 - return 1 - } - 0 { - incr i -1 - return 1 - } - } - } - } - keyval-value-expected { - #end whitespace token and reprocess - incr i -1 - return 1 - - #if {$multi_dquote eq "\"\""} { - # set_token_waiting type startmultiquote value "\"\"\"" complete 1 - # set multi_dquote "" - # return 1 - #} else { - # #end whitespace token and reprocess - # incr i -1 - # return 1 - #} - } - table-space - 
itable-space { - incr i -1 - return 1 - } - default { - set_token_waiting type startquote value "\"" complete 1 startindex $cindex - return 1 - } + #assert: had_slash will only ever be true in multistring-space + if {$had_slash} { + incr i -2 + return 1 + } else { + #end whitespace token - throw dq back for reprocessing + incr i -1 + return 1 } } comment { if {$had_slash} {append tok "\\"} append tok $c } - XXXdquotedkey - XXXitablequotedkey { + XXXdquotedkey { if {$had_slash} { append tok "\\" append tok $c } else { - set_token_waiting type endquote value "\"" complete 1 startindex $cindex + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex return 1 } } @@ -4901,7 +4961,7 @@ namespace eval tomlish::parse { append tok "\\" append tok $c } else { - #set_token_waiting type endsquote value "'" complete 1 + #set_token_waiting type enddquote value {"} complete 1 return 1 } } @@ -4924,64 +4984,40 @@ namespace eval tomlish::parse { #$slash_active not relevant when no tokenType #token is string only if we're expecting a value at this point switch -exact -- $state { - array-space { - #!? start looking for possible multistartquote - #set_tokenType startquote - #set tok $c - #return 1 - set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote - set tok $c - } - keyval-value-expected - itable-keyval-value-expected { - set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote - set tok $c + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading dquote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_dquote token or triple_dquote token + #It currently doesn't trigger double_dquote token + #(handle "" same as "x" ie produce a single_dquote and go into processing string) + #review - producing double_dquote for empty string may be slightly more efficient. 
+ #This token is not used to handle dquote sequences once *within* a multistring + set_tokenType "_start_dquote_sequence" + set tok {"} } multistring-space { - #TODO - had_slash!!! - #REVIEW if {$had_slash} { set_tokenType "stringpart" set tok "\\\"" - set multi_dquote "" } else { - if {$multi_dquote eq "\"\""} { - tomlish::log::debug "- tokloop char dq ---> endmultiquote" - set_tokenType "endmultiquote" - set tok "\"\"\"" - return 1 - #set_token_waiting type endmultiquote value "\"\"\"" complete 1 - #set multi_dquote "" - #return 1 - } else { - append multi_dquote "\"" - } + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_dquote" ;#trigger tentative_accum_dquote + set tok {"} + return 1 } } multiliteral-space { set_tokenType "literalpart" set tok "\"" } - XXXtable-space { - set_tokenType "startquote" - set tok $c - return 1 - } - XXXitable-space { - set_tokenType "startquote" - set tok $c - } table-space - itable-space { set_tokenType "dquotedkey" set tok "" } - tablename-state { - set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } dottedkey-space { set_tokenType dquotedkey set tok "" @@ -4990,49 +5026,56 @@ namespace eval tomlish::parse { #set_tokenType dquote_seq_begin #set tok $c } + tablename-state { + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } default { - error "tomlish Unexpected quote during state '$state' [tomlish::parse::report_line]" + error "tomlish Unexpected dquote during 
state '$state' [tomlish::parse::report_line]" } } } } = { - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { - #assertion had_slash 0, multi_dquote "" + #assertion had_slash 0 append tok $c } - string - comment - dquotedkey - itablequotedkey { + string - comment - dquotedkey { #for these tokenTypes an = is just data. if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } whitespace { if {$state eq "multistring-space"} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen + incr i -1 return 1 } else { set_token_waiting type equal value = complete 1 startindex $cindex @@ -5063,7 +5106,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok ${dquotes}= + append tok = } multiliteral-space { set_tokenType "literalpart" @@ -5084,8 +5127,6 @@ namespace eval tomlish::parse { } cr { #REVIEW! - set dquotes $multi_dquote - set multi_dquote "" ;#!! # \r carriage return if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. 
set slash_active 0 @@ -5098,16 +5139,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal { @@ -5159,8 +5202,6 @@ namespace eval tomlish::parse { } lf { # \n newline - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5171,16 +5212,19 @@ namespace eval tomlish::parse { append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { + #multiliteral or multistring incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal { @@ -5196,20 +5240,14 @@ namespace eval tomlish::parse { return 1 } stringpart { - if {$dquotes ne ""} { - append tok $dquotes + if {$had_slash} { + #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) + set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] incr i -1 return 1 } else { - if {$had_slash} { - #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) - set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] - incr i -1 - return 1 - } else { - set_token_waiting type newline value lf complete 1 
startindex $cindex - return 1 - } + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 } } starttablename - tablename - tablearrayname - starttablearrayname { @@ -5236,20 +5274,13 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - #e.g one or 2 quotes just before nl - set_tokenType "stringpart" - set tok $dquotes - incr i -1 - return 1 - } set_tokenType "newline" set tok lf return 1 } } multiliteral-space { - #assert had_slash 0, multi_dquote "" + #assert had_slash 0 set_tokenType "newline" set tok "lf" return 1 @@ -5275,8 +5306,6 @@ namespace eval tomlish::parse { } } , { - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5287,39 +5316,40 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } comment - tablename - tablearrayname { if {$had_slash} {append tok "\\"} append tok , } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { #stringpart can have up to 2 quotes too if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } literal - literalpart - squotedkey { - #assert had_slash always 0, multi_dquote "" + #assert had_slash always 0 append tok $c } whitespace { if {$state eq "multistring-space"} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen + incr i -1 return 1 } else { set_token_waiting type comma value "," complete 1 startindex $cindex @@ -5338,10 +5368,10 @@ namespace eval tomlish::parse { 
set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} - append tok "$dquotes," + append tok "," } multiliteral-space { - #assert had_slash 0, multi_dquote "" + #assert had_slash 0 set_tokenType "literalpart" set tok "," } @@ -5354,8 +5384,6 @@ namespace eval tomlish::parse { } } . { - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5366,42 +5394,45 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } comment - untyped_value { if {$had_slash} {append tok "\\"} append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } literal - literalpart - squotedkey { - #assert had_slash always 0, multi_dquote "" + #assert had_slash always 0 append tok $c } whitespace { switch -exact -- $state { multistring-space { - set backchars [expr {[tcl::string::length $dquotes] + 1}] + #review if {$had_slash} { - incr backchars 1 + incr i -2 + } else { + incr i -1 } - incr i -$backchars return 1 } xxxdottedkey-space { @@ -5444,7 +5475,7 @@ namespace eval tomlish::parse { set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} - append tok "$dquotes." + append tok "." } multiliteral-space { set_tokenType "literalpart" @@ -5471,8 +5502,6 @@ namespace eval tomlish::parse { } " " { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
if {[tcl::string::length $tokenType]} { set had_slash $slash_active set slash_active 0 @@ -5483,16 +5512,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } barekey { @@ -5512,9 +5543,9 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok $dquotes$c + append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} { append tok "\\" } append tok $c } @@ -5526,8 +5557,7 @@ namespace eval tomlish::parse { incr i -2 return 1 } else { - #split into STRINGPART aaa WS " " - append tok $dquotes + #split into STRINGPART xxx WS " " incr i -1 return 1 } @@ -5537,15 +5567,7 @@ namespace eval tomlish::parse { } whitespace { if {$state eq "multistring-space"} { - if {$dquotes ne ""} { - #end whitespace token - #go back by the number of quotes plus this space char - set backchars [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backchars - return 1 - } else { - append tok $c - } + append tok $c } else { append tok $c } @@ -5588,12 +5610,6 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - set_tokenType "stringpart" - set tok $dquotes - incr i -1 - return 1 - } set_tokenType "whitespace" append tok $c } @@ -5613,9 +5629,6 @@ namespace eval tomlish::parse { } } tab { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
- if {[tcl::string::length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) set slash_active 0 @@ -5626,12 +5639,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } barekey { @@ -5662,7 +5681,6 @@ namespace eval tomlish::parse { return 1 } else { #split into STRINGPART aaa WS " " - append tok $dquotes incr i -1 return 1 } @@ -5706,15 +5724,8 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - set_tokenType stringpart - set tok $dquotes - incr i -1 - return 1 - } else { - set_tokenType whitespace - append tok $c - } + set_tokenType whitespace + append tok $c } } multiliteral-space { @@ -5732,16 +5743,31 @@ namespace eval tomlish::parse { #BOM (Byte Order Mark) - ignored by token consumer if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } _start_squote_sequence { #assert - tok will be one or two squotes only + #A toml literal probably isn't allowed to contain this + #but we will parse and let the validator sort it out. 
incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart { append tok $c } + string - stringpart { + append tok $c + } default { + #state machine will generally not have entry to accept bom - let it crash set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex return 1 } @@ -5752,6 +5778,10 @@ namespace eval tomlish::parse { set_tokenType "literalpart" set tok $c } + multistring-space { + set_tokenType "stringpart" + set tok $c + } default { set_tokenType "bom" set tok "\uFEFF" @@ -5761,8 +5791,6 @@ namespace eval tomlish::parse { } } default { - set dquotes $multi_dquote - set multi_dquote "" ;#!! if {[tcl::string::length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. @@ -5774,28 +5802,24 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } whitespace { if {$state eq "multistring-space"} { - if {$dquotes ne ""} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen - return 1 - } else { - incr i -1 - return 1 - } + incr i -1 + return 1 } else { #review incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. 
@@ -5815,7 +5839,7 @@ namespace eval tomlish::parse { return 1 } stringpart { - append tok $dquotes$c + append tok $c } default { #e.g comment/string/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname @@ -5835,22 +5859,12 @@ namespace eval tomlish::parse { error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" } } - XXXcurly-syntax { - puts stderr "curly-syntax - review" - if {[tomlish::utils::is_barekey $c]} { - set_tokenType "barekey" - append tok $c - } else { - error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" - } - } multistring-space { set_tokenType "stringpart" if {$had_slash} { - #assert - we don't get had_slash and dquotes at same time set tok \\$c } else { - set tok $dquotes$c + set tok $c } } multiliteral-space { @@ -5890,21 +5904,6 @@ namespace eval tomlish::parse { # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" #} switch -exact -- $tokenType { - startquotesequence { - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - #invalid - #eof with open string - error "tomlish eof reached without closing quote for string. [tomlish::parse::report_line]" - } elseif {$toklen == 2} { - #valid - #we ended in a double quote, not actually a startquoteseqence - effectively an empty string - switch_tokenType "startquote" - incr i -1 - #set_token_waiting type string value "" complete 1 - return 1 - } - } _start_squote_sequence { set toklen [tcl::string::length $tok] switch -- $toklen { @@ -5913,11 +5912,29 @@ namespace eval tomlish::parse { error "tomlish eof reached without closing single quote for string literal. 
[tomlish::parse::report_line]" } 2 { - #review - set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] set_tokenType "literal" set tok "" return 1 + + ##review + #set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] + #set_tokenType "literal" + #set tok "" + #return 1 + } + } + } + _start_dquote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open string + error "tomlish eof reached without closing double quote for string. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "string" + set tok "" + return 1 } } } @@ -6011,6 +6028,16 @@ namespace eval tomlish::dict { return $name } + proc _show_tablenames {tablenames_info} { + append msg \n "tablenames_info:" \n + dict for {tkey tinfo} $tablenames_info { + append msg " " "table: $tkey" \n + dict for {field finfo} $tinfo { + append msg " " "$field $finfo" \n + } + } + return $msg + } } tcl::namespace::eval tomlish::app { diff --git a/src/vendormodules/dictn-0.1.1.tm b/src/vendormodules/dictn-0.1.1.tm new file mode 100644 index 00000000..c9ef87f2 --- /dev/null +++ b/src/vendormodules/dictn-0.1.1.tm @@ -0,0 +1,349 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. 
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+# (C) 2023
+#
+# @@ Meta Begin
+# Application dictn 0.1.1
+# Meta platform tcl
+# Meta license
+# @@ Meta End
+
+
+
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+## Requirements
+##e.g package require frobz
+
+
+
+
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+# dictn - nested-path variants of the 'dict' subcommands, exposed as the ensemble 'dictn'.
+# Every 'path' argument is a list of keys which is expanded into the underlying dict call.
+# NOTE: many procs in this namespace share a name with a Tcl builtin (set, info, append, lappend, incr, for, unset ...).
+#       Inside ::dictn a bare name resolves to the dictn proc first, so the builtins are called with a leading ::
+namespace eval dictn {
+    namespace export {[a-z]*}
+    namespace ensemble create
+}
+
+
+## ::dictn::append
+# Append the string 'value' to the element at 'path' within the dict variable named 'dictvar'.
+# Returns the updated dictionary value.
+# In the nested form the path must already exist (dict get raises an error otherwise).
+#This can of course 'ruin' a nested dict if applied to the wrong element
+# - i.e using the string op 'append' on an element that is itself a nested dict is analogous to the standard Tcl:
+#    %set list {a b {c d}}
+#    %append list x
+#    a b {c d}x
+#  IOW - don't do that unless you really know that's what you want.
+#
+proc ::dictn::append {dictvar path {value {}}} {
+    if {[llength $path] == 1} {
+        #expand the one-element path so the key itself (not its list-quoted form) is used
+        uplevel 1 [list dict append $dictvar {*}$path $value]
+    } else {
+        upvar 1 $dictvar dvar
+
+        ::set str [dict get $dvar {*}$path]
+        #must be ::append - a bare 'append' here would call ::dictn::append
+        ::append str $value
+        dict set dvar {*}$path $str
+    }
+}
+
+# Build a dictionary from alternating path/value arguments. Each path is a list of keys.
+proc ::dictn::create {args} {
+    ::set data {}
+    foreach {path val} $args {
+        dict set data {*}$path $val
+    }
+    return $data
+}
+
+# Return 1 if the key path exists in dictval, 0 otherwise.
+proc ::dictn::exists {dictval path} {
+    return [dict exists $dictval {*}$path]
+}
+
+# Apply 'dict filter' to the sub-dictionary found at 'path'.
+# Evaluated in the caller's frame so that a 'script' filter sees the caller's variables.
+proc ::dictn::filter {dictval path filterType args} {
+    ::set sub [dict get $dictval {*}$path]
+    uplevel 1 [list dict filter $sub $filterType {*}$args]
+}
+
+# Iterate over the sub-dictionary found at 'path'.
+# The body is evaluated in the caller's frame (as 'dict for' itself would do) rather than inside this proc.
+proc ::dictn::for {keyvalvars dictval path body} {
+    ::set sub [dict get $dictval {*}$path]
+    uplevel 1 [list dict for $keyvalvars $sub $body]
+}
+
+# Return the value at 'path'. With an empty path the whole dictionary is returned.
+proc ::dictn::get {dictval {path {}}} {
+    return [dict get $dictval {*}$path]
+}
+
+# ::dictn::getdef
+# Return the value at 'path', or 'default' if the path does not exist.
+# 'dict getdef' is not available in every Tcl version (detected via ::tcl::dict::getdef),
+# so an exists/get fallback is defined when it is missing.
+if {[info commands ::tcl::dict::getdef] ne ""} {
+    proc ::dictn::getdef {dictval path default} {
+        return [dict getdef $dictval {*}$path $default]
+    }
+} else {
+    proc ::dictn::getdef {dictval path default} {
+        if {[dict exists $dictval {*}$path]} {
+            return [dict get $dictval {*}$path]
+        }
+        return $default
+    }
+}
+
+# Synonym for getdef.
+proc ::dictn::getwithdefault {dictval path default} {
+    return [::dictn::getdef $dictval $path $default]
+}
+
+# Add 'increment' (default 1) to the integer at 'path' in the dict variable named 'dictvar'.
+# A missing path (or a missing variable) is treated as starting from 0.
+# Returns the updated dictionary value.
+proc ::dictn::incr {dictvar path {increment {}} } {
+    if {$increment eq ""} {
+        ::set increment 1
+    }
+    if {[llength $path] == 1} {
+        uplevel 1 [list dict incr $dictvar {*}$path $increment]
+    } else {
+        upvar 1 $dictvar dvar
+        if {![::info exists dvar]} {
+            dict set dvar {*}$path $increment
+        } else {
+            ::set newval [expr {[::dictn::getdef $dvar $path 0] + $increment}]
+            dict set dvar {*}$path $newval
+        }
+        return $dvar
+    }
+}
+
+# Return 'dict info' for the whole dictionary (empty path) or for the sub-dictionary at 'path'.
+proc ::dictn::info {dictval {path {}}} {
+    if {![string length $path]} {
+        return [dict info $dictval]
+    } else {
+        ::set sub [dict get $dictval {*}$path]
+        return [dict info $sub]
+    }
+}
+
+# Return the keys of the sub-dictionary at 'path', optionally restricted to those matching 'glob'.
+proc ::dictn::keys {dictval {path {}} {glob {}}} {
+    ::set sub [dict get $dictval {*}$path]
+    if {[string length $glob]} {
+        return [dict keys $sub $glob]
+    } else {
+        return [dict keys $sub]
+    }
+}
+
+# Append the given values as list elements to the element at 'path' in the dict variable named 'dictvar'.
+# In the nested form the path must already exist (dict get raises an error otherwise).
+proc ::dictn::lappend {dictvar path args} {
+    if {[llength $path] == 1} {
+        uplevel 1 [list dict lappend $dictvar {*}$path {*}$args]
+    } else {
+        upvar 1 $dictvar dvar
+
+        ::set list [dict get $dvar {*}$path]
+        ::lappend list {*}$args
+        dict set dvar {*}$path $list
+    }
+}
+
+proc ::dictn::merge {args} {
+    error "nested merge not yet supported"
+}
+
+#dictn remove dictionaryValue ?path ...?
+# Return a copy of dictval with the element at each path removed.
+proc ::dictn::remove {dictval args} {
+    ::set basic [list] ;#buffer basic (1element path) removals to do in a single call.
+
+    foreach path $args {
+        if {[llength $path] == 1} {
+            ::lappend basic {*}$path
+        } else {
+            #extract,modify,replace
+            ::set subpath [lrange $path 0 end-1]
+
+            ::set sub [dict get $dictval {*}$subpath]
+            ::set sub [dict remove $sub [lindex $path end]]
+
+            dict set dictval {*}$subpath $sub
+        }
+    }
+
+    if {[llength $basic]} {
+        return [dict remove $dictval {*}$basic]
+    } else {
+        return $dictval
+    }
+}
+
+
+#dictn replace dictionaryValue ?path value ...?
+# Return a copy of dictval with each path set to the paired value.
+proc ::dictn::replace {dictval args} {
+    ::set basic [list] ;#buffer basic (1element path) replacements to do in a single call.
+
+    foreach {path val} $args {
+        if {[llength $path] == 1} {
+            ::lappend basic {*}$path $val
+        } else {
+            #extract,modify,replace
+            ::set subpath [lrange $path 0 end-1]
+
+            ::set sub [dict get $dictval {*}$subpath]
+            ::set sub [dict replace $sub [lindex $path end] $val]
+
+            dict set dictval {*}$subpath $sub
+        }
+    }
+
+
+    if {[llength $basic]} {
+        return [dict replace $dictval {*}$basic]
+    } else {
+        return $dictval
+    }
+}
+
+
+# Set the element at 'path' in the dict variable named 'dictvar'. Returns the updated dictionary value.
+proc ::dictn::set {dictvar path newval} {
+    upvar 1 $dictvar dvar
+    return [dict set dvar {*}$path $newval]
+}
+
+# Return the number of key/value pairs in the sub-dictionary at 'path'.
+proc ::dictn::size {dictval {path {}}} {
+    return [dict size [dict get $dictval {*}$path]]
+}
+
+# Remove the element at 'path' from the dict variable named 'dictvar'. Returns the updated dictionary value.
+proc ::dictn::unset {dictvar path} {
+    upvar 1 $dictvar dvar
+    return [dict unset dvar {*}$path]
+}
+
+#dictn update dictVariable path varName ?path varName ...? body
+# For each path that exists, the caller's variable varName is set to the value at that path before body runs.
+# After body, paths that still exist are written back from varName (or unset if varName no longer exists).
+# NOTE: an error raised by body is caught; the catch result (value or error message) is returned.
+proc ::dictn::update {dictvar args} {
+    ::set body [lindex $args end]
+    ::set maplist [lrange $args 0 end-1]
+
+    upvar 1 $dictvar dvar
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path]} {
+            #path must be expanded here just as it is for the exists test
+            uplevel 1 [list set $var [dict get $dvar {*}$path]]
+        }
+    }
+
+    catch {uplevel 1 $body} result
+
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path]} {
+            upvar 1 $var $var
+            if {![::info exists $var]} {
+                uplevel 1 [list dict unset $dictvar {*}$path]
+            } else {
+                uplevel 1 [list dict set $dictvar {*}$path [::set $var]]
+            }
+        }
+    }
+    return $result
+}
+
+#an experiment.
+# Experimental variant of update: the body is run through 'apply' with a generated prologue
+# that links each mapped variable back into this proc's frame and seeds it with the current value.
+# The generated body is echoed to stderr. Errors from the body are caught and returned as the result.
+# NOTE(review): mapped variable names share this proc's frame with its own locals (body, path, var ...) - confirm callers avoid those names.
+proc ::dictn::Applyupdate {dictvar args} {
+    ::set body [lindex $args end]
+    ::set maplist [lrange $args 0 end-1]
+
+    upvar 1 $dictvar dvar
+
+    ::set headscript ""
+    ::set i 0
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path]} {
+            #uplevel 1 [list set $var [dict get $dvar $path]]
+            ::lappend arglist $var
+            ::lappend vallist [dict get $dvar {*}$path]
+            ::append headscript [string map [list %i% $i %v% $var] {upvar 1 %v% %v%; set %v% [lindex $args %i%]} ]
+            ::append headscript \n
+            ::incr i
+        }
+    }
+
+    ::set body $headscript\r\n$body
+
+    puts stderr "BODY: $body"
+
+    #set result [apply [list args $body] {*}$vallist]
+    catch {apply [list args $body] {*}$vallist} result
+
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path] && [::info exists $var]} {
+            dict set dvar {*}$path [::set $var]
+        }
+    }
+    return $result
+}
+
+# Return the values of the sub-dictionary at 'path', optionally restricted to those matching 'glob'.
+proc ::dictn::values {dictval {path {}} {glob {}}} {
+    ::set sub [dict get $dictval {*}$path]
+    if {[string length $glob]} {
+        return [dict values $sub $glob]
+    } else {
+        return [dict values $sub]
+    }
+}
+
+# Standard form:
+#'dictn with dictVariable path body'
+#  - delegates to 'dict with' in the caller's frame.
+#
+# Extended form:
+#'dictn with dictVariable path arrayVariable body'
+#  - the sub-dictionary at path is loaded into the caller's array arrayVariable, body is run in the caller's frame,
+#    then array elements are written back to the dictionary; elements that were unset by body are removed from it.
+#  - an error raised by body is caught; the catch result (value or error message) is returned.
+#
+proc ::dictn::with {dictvar path args} {
+    if {[llength $args] == 1} {
+        ::set body [lindex $args 0]
+        return [uplevel 1 [list dict with $dictvar {*}$path $body]]
+    } else {
+        upvar 1 $dictvar dvar
+        ::lassign $args arrayname body
+
+        upvar 1 $arrayname arr
+        array set arr [dict get $dvar {*}$path]
+        ::set prevkeys [array names arr]
+
+        catch {uplevel 1 $body} result
+
+
+        #dict unset/dict set take the NAME of the dict variable (dvar is linked to the caller's variable)
+        foreach k $prevkeys {
+            if {![::info exists arr($k)]} {
+                dict unset dvar {*}$path $k
+            }
+        }
+        foreach k [array names arr] {
+            dict set dvar {*}$path $k $arr($k)
+        }
+
+        return $result
+    }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+## Ready
+package provide dictn [namespace eval dictn {
+    variable version
+    ::set version 0.1.1
+}]
+return
\ No newline at end of file
diff --git
a/src/vendormodules/include_modules.config b/src/vendormodules/include_modules.config index 895bda28..a9c143af 100644 --- a/src/vendormodules/include_modules.config +++ b/src/vendormodules/include_modules.config @@ -12,6 +12,7 @@ set local_modules [list\ c:/repo/jn/tclmodules/tablelist/modules tablelist_tile\ c:/repo/jn/tclmodules/tomlish/modules tomlish\ c:/repo/jn/tclmodules/tomlish/modules test::tomlish\ + c:/repo/jn/tclmodules/dictn/modules dictn\ ] set fossil_modules [dict create\ diff --git a/src/vendormodules/test/tomlish-1.1.3.tm b/src/vendormodules/test/tomlish-1.1.3.tm index ed5044a7..8afb43d9 100644 Binary files a/src/vendormodules/test/tomlish-1.1.3.tm and b/src/vendormodules/test/tomlish-1.1.3.tm differ diff --git a/src/vendormodules/tomlish-1.1.4.tm b/src/vendormodules/tomlish-1.1.4.tm index 7a6d5205..33d5b912 100644 --- a/src/vendormodules/tomlish-1.1.4.tm +++ b/src/vendormodules/tomlish-1.1.4.tm @@ -153,15 +153,10 @@ namespace eval tomlish { } #review - if {[uplevel 1 [list info exists tablenames_seen]]} { - upvar tablenames_seen tablenames_seen + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info } else { - set tablenames_seen [list] ;#list of lists - } - if {[uplevel 1 [list info exists tablenames_closed]]} { - upvar tablenames_closed tablenames_closed - } else { - set tablenames_closed [list] ;#list of lists + set tablenames_info [dict create] ;#keys are lists {parenttable subtable etc} corresponding to parenttable.subtable.etc } foreach sub [lrange $keyval_element 2 end] { @@ -207,13 +202,10 @@ namespace eval tomlish { ARRAY { #we need to recurse to get the corresponding dict for the contained item(s) #pass in the whole $found_sub - not just the $value! 
- set prev_tablenames_seen $tablenames_seen - set prev_tablenames_closed $tablenames_closed - set tablenames_seen [list] - set tablenames_closed [list] + set prev_tablenames_info $tablenames_info + set tablenames_info [dict create] set result [list type $type value [::tomlish::to_dict [list $found_sub]]] - set tablenames_seen $prev_tablenames_seen - set tablenames_closed $prev_tablenames_closed + set tablenames_info $prev_tablenames_info } MULTISTRING - MULTILITERAL { #review - mapping these to STRING might make some conversions harder? @@ -295,23 +287,66 @@ namespace eval tomlish { #[Data] #temps = [{cpu = 79.5, case = 72.0}] proc to_dict {tomlish} { + package require dictn #keep track of which tablenames have already been directly defined, # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. #we don't error out just because a previous tablename segment has already appeared. 
- ##variable tablenames_seen [list] - if {[uplevel 1 [list info exists tablenames_seen]]} { - upvar tablenames_seen tablenames_seen - } else { - set tablenames_seen [list] ;#list of lists - } - if {[uplevel 1 [list info exists tablenames_closed]]} { - upvar tablenames_closed tablenames_closed + + #Declaring, Creating, and Defining Tables + #https://github.com/toml-lang/toml/issues/795 + #(update - only Creating and Defining are relevant terminology) + + #review + #tablenames_info keys created, defined, createdby, definedby, closedby + + #consider the following 2 which are legal: + #[table] #'table' created, defined=open definedby={header table} + #x.y = 3 + #[table.x.z] #'table' defined=closed closedby={header table.x.z}, 'table.x' created, 'table.x.z' created defined=open definedby={header table.x.z} + #k= 22 + # #'table.x.z' defined=closed closedby={eof eof} + + #equivalent datastructure + + #[table] #'table' created, defined=open definedby={header table} + #[table.x] #'table' defined=closed closedby={header table.x}, 'table.x' created defined=open definedby={header table.x} + #y = 3 + #[table.x.z] #'table.x' defined=closed closedby={header table.x.z}, 'table.x.z' created defined=open definedby={header table.x.z} + #k=22 + + #illegal + #[table] #'table' created and defined=open + #x.y = 3 #'table.x' created first keyval pair defined=open definedby={keyval x.y = 3} + #[table.x.y.z] #'table' defined=closed, 'table.x' closed because parent 'table' closed?, 'table.x.y' cannot be created + #k = 22 + # + ## - we would fail on encountering table.x.y because only table and table.x are effectively tables - but that table.x is closed should be detected (?) + + #illegal + #[table] + #x.y = {p=3} + #[table.x.y.z] + #k = 22 + ## we should fail because y is an inline table which is closed to further entries + + #note: it is not safe to compare normalized tablenames using join! 
+ # e.g a.'b.c'.d is not the same as a.b.c.d + # instead compare {a b.c d} with {a b c d} + # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. + #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' + #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} + + + + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info } else { - set tablenames_closed [list] ;#list of lists + set tablenames_info [dict create] ;#keyed on tablepath each of which is a list such as {config subgroup etc} (corresponding to config.subgroup.etc) } + log::info "---> to_dict processing '$tomlish'<<<" set items $tomlish @@ -354,7 +389,7 @@ namespace eval tomlish { #a.b.c = 1 #table_key_hierarchy -> a b - #leafkey -> c + #tleaf -> c if {[llength $dotted_key_hierarchy] == 0} { #empty?? probably invalid. review #This is different to '' = 1 or ''.'' = 1 which have lengths 1 and 2 respectively @@ -362,10 +397,10 @@ namespace eval tomlish { } elseif {[llength $dotted_key_hierarchy] == 1} { #dottedkey is only a key - no table component set table_hierarchy [list] - set leafkey [lindex $dotted_key_hierarchy 0] + set tleaf [lindex $dotted_key_hierarchy 0] } else { set table_hierarchy [lrange $dotted_key_hierarchy 0 end-1] - set leafkey [lindex $dotted_key_hierarchy end] + set tleaf [lindex $dotted_key_hierarchy end] } #ensure empty tables are still represented in the datastructure @@ -380,143 +415,101 @@ namespace eval tomlish { } } #review? - if {[dict exists $datastructure {*}$table_hierarchy $leafkey]} { - error "Duplicate key '$table_hierarchy $leafkey'. The key already exists at this level in the toml data. The toml data is not valid." + if {[dict exists $datastructure {*}$table_hierarchy $tleaf]} { + error "Duplicate key '$table_hierarchy $tleaf'. The key already exists at this level in the toml data. The toml data is not valid." 
} #JMN test 2025 if {[llength $table_hierarchy]} { - lappend tablenames_seen $table_hierarchy + dictn incr tablenames_info [list $table_hierarchy seencount] } set keyval_dict [_get_keyval_value $item] if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - lappend tablenames_seen [list {*}$table_hierarchy $leafkey] - lappend tablenames_closed [list {*}$table_hierarchy $leafkey] + set t [list {*}$table_hierarchy $tleaf] + dictn incr tablenames_info [list $t seencount] + dictn set tablenames_info [list $t closed] 1 #review - item is an ITABLE - we recurse here without datastructure context :/ #overwriting keys? todo ? - dict set datastructure {*}$table_hierarchy $leafkey $keyval_dict + dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict } else { - dict set datastructure {*}$table_hierarchy $leafkey $keyval_dict + dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict } + } + TABLEARRAY { + set tablename [lindex $item 1] + log::debug "---> to_dict processing item TABLENAME (name: $tablename): $item" + set norm_segments [::tomlish::utils::tablename_split $tablename true] ;#true to normalize + #we expect repeated tablearray entries - each adding a sub-object to the value, which is an array/list. + } TABLE { set tablename [lindex $item 1] + log::debug "---> to_dict processing item TABLE (name: $tablename): $item" #set tablename [::tomlish::utils::tablename_trim $tablename] set norm_segments [::tomlish::utils::tablename_split $tablename true] ;#true to normalize - if {$norm_segments in $tablenames_seen} { - error "Table name '$tablename' has already been directly defined in the toml data. Invalid." - } - log::debug "---> to_dict processing item $tag (name: $tablename): $item" - set name_segments [::tomlish::utils::tablename_split $tablename] ;#unnormalized - set last_seg "" - #toml spec rule - all segments mst be non-empty - #note that the results of tablename_split are 'raw' - ie some segments may be enclosed in single or double quotes. 
- - set table_key_sublist [list] - - foreach normseg $norm_segments { - lappend table_key_sublist $normseg - if {[dict exists $datastructure {*}$table_key_sublist]} { - #It's ok for this key to already exist *if* it was defined by a previous tablename or equivalent - #and if this key is longer - - #consider the following 2 which are legal: - #[table] - #x.y = 3 - #[table.x.z] - #k= 22 - - #equivalent - - #[table] - #[table.x] - #y = 3 - #[table.x.z] - #k=22 - - #illegal - #[table] - #x.y = 3 - #[table.x.y.z] - #k = 22 - ## - we should fail on encountering table.x.y because only table and table.x are effectively tables - - #illegal - #[table] - #x.y = {p=3} - #[table.x.y.z] - #k = 22 - ## we should fail because y is an inline table which is closed to further entries - - - #note: it is not safe to compare normalized tablenames using join! - # e.g a.'b.c'.d is not the same as a.b.c.d - # instead compare {a b.c d} with {a b c d} - # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. - #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' - #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} - - set sublist_length [llength $table_key_sublist] - set found_testkey 0 - if {$table_key_sublist in $tablenames_seen} { - set found_testkey 1 - } else { - #see if it was defined by a longer entry - foreach seen_table_segments $tablenames_seen { - if {[llength $seen_table_segments] <= $sublist_length} { - continue - } - #each tablenames_seen entry is already a list of normalized segments - - #we could have [a.b.c.d] early on - # followed by [a.b] - which was still defined by the earlier one. + set T_DEFINED [dictn getdef $tablenames_info [list $norm_segments defined] NULL] + if {$T_DEFINED ne "NULL"} { + #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path + set msg "Table name $tablename has already been directly defined in the toml data. 
Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + error $msg + } - set seen_longer [lrange $seen_segments 0 [expr {$sublist_length -1}]] - puts stderr "testkey:'$table_key_sublist' vs seen_match:'$seen_longer'" - if {$table_key_sublist eq $seen_longer} { - set found_testkey 1 - } - } - } - if {$found_testkey == 0} { - #the raw unnormalized tablename might be ok to display in the error message, although it's not the actual dict keyset - set msg "key $table_key_sublist already exists in datastructure, but wasn't defined by a supertable." - append msg \n "tablenames_seen:" \n - foreach ts $tablenames_seen { - append msg " " $ts \n - } + set name_segments [::tomlish::utils::tablename_split $tablename 0] ;#unnormalized e.g ['a'."b".c.d] -> 'a' "b" c d + #results of tablename_split 0 are 'raw' - ie some segments may be enclosed in single or double quotes. + + + set supertable [list] + ############## + # [a.b.c.d] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + if {![dictn exists $tablenames_info [list $supertable type]]} { + #supertable with this path doesn't yet exist + if {[dict exists $datastructure {*}$supertable]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] 
of table name $tablename already has data - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] error $msg + } else { + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + dictn set tablenames_info [list $supertable type] header + #ensure empty tables are still represented in the datastructure + dict set datastructure {*}$supertable [list] } - } - - } - - #ensure empty tables are still represented in the datastructure - set key_sublist [list] - foreach k $norm_segments { - lappend key_sublist $k - if {![dict exists $datastructure {*}$key_sublist]} { - dict set datastructure {*}$key_sublist [list] } else { - tomlish::log::notice "to_dict datastructure at (TABLE) subkey $key_sublist already had data: [dict get $datastructure {*}$key_sublist]" + #supertable has already been created - and maybe defined - but even if defined we can add subtables } } + #table [a.b.c.d] hasn't been defined - but may have been 'created' already by a longer tablename + # - or may have existing data from a keyval + if {![dictn exists $tablenames_info [list $norm_segments type]]} { + if {[dict exists $datastructure {*}$norm_segments]} { + set msg "Table name $tablename already has data - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + error $msg + } + #no data or previously created table + dictn set tablenames_info [list $norm_segments type] header - #We must do this after the key-collision test above! 
- lappend tablenames_seen $norm_segments - - + #We are 'defining' this table's keys and values here (even if empty) + dict set datastructure {*}$norm_segments [list] ;#ensure table still represented in datastructure even if we add no keyvals here + } + dictn set tablenames_info [list $norm_segments defined] open log::debug ">>> to_dict >>>>>>>>>>>>>>>>> normalized table key hierarchy : $norm_segments" #now add the contained elements foreach element [lrange $item 2 end] { set type [lindex $element 0] - log::debug "----> tododict processing $tag subitem $type processing contained element $element" + log::debug "----> todict processing $tag subitem $type processing contained element $element" switch -exact -- $type { DOTTEDKEY { set dkey_info [_get_dottedkey_info $element] @@ -547,14 +540,19 @@ namespace eval tomlish { puts stdout "to_dict>>> $keyval_dict" dict set datastructure {*}$norm_segments {*}$dkeys $leaf_key $keyval_dict #JMN 2025 - lappend tablenames_seen [list {*}$norm_segments {*}$dkeys] + #lappend tablenames_info [list {*}$norm_segments {*}$dkeys] + set tkey [list {*}$norm_segments {*}$dkeys] + dictn incr tablenames_info [list $tkey seencount] if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys # inner structure will contain {type value } if all leaves are not empty ITABLES - lappend tablenames_seen [list {*}$norm_segments {*}$dkeys $leaf_key] + set tkey [list {*}$norm_segments {*}$dkeys $leaf_key] + #lappend tablenames_info [list {*}$norm_segments {*}$dkeys $leaf_key] + dictn incr tablenames_info [list $tkey seencount] #if the keyval_dict is not a simple type x value y - then it's an inline table ? #if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. 
+ dictn set tablenames_info [list $tkey closed] 1 } } @@ -562,7 +560,7 @@ namespace eval tomlish { #ignore } default { - error "Sub element of type '$type' not understood in table context. Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" + error "Sub element of type '$type' not understood in table context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" } } } @@ -1316,7 +1314,12 @@ namespace eval tomlish::encode { #NOTE - this DELIBERATELY does not validate the data, or process escapes etc #It encodes the tomlish records as they are. #ie it only produces toml shaped data from a tomlish list. + # #It is part of the roundtripability of data from toml to tomlish + #!! ie - it is not the place to do formatting of inline vs multiline !! + # That needs to be encoded in the tomlish data that is being passed in + # (e.g from_dict could make formatting decisions in the tomlish it produces) + # #e.g duplicate keys etc can exist in the toml output. #The to_dict from_dict (or any equivalent processor pair) is responsible for validation and conversion #back and forth of escape sequences where appropriate. @@ -1646,17 +1649,27 @@ namespace eval tomlish::decode { #pop_trigger_tokens: newline tablename endarray endinlinetable #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. 
set parentlevel [expr {$nest -1}] - set do_append_to_parent 1 ;#most tokens will leave this alone - but some like squote_seq need to do their own append + set do_append_to_parent 1 ;#most tokens will leave this alone - but some like tentative_accum_squote need to do their own append switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote { + #should only apply within a multiliteral #### set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed #Without this - we would get extraneous empty list entries in the parent # - as the xxx-squote-space isn't a space level from the toml perspective # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-squote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-squote-space { + } + default { + error "--- unexpected popped due to tentative_accum_squote but came from state '$prevstate' should have been trailing-squote-space" + } + } switch -- $tok { ' { - tomlish::parse::set_token_waiting type startsquote value $tok complete 1 startindex [expr {$i -1}] + tomlish::parse::set_token_waiting type single_squote value $tok complete 1 startindex [expr {$i -1}] } '' { #review - we should perhaps return double_squote instead? 
@@ -1669,74 +1682,51 @@ namespace eval tomlish::decode { tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] } '''' { - switch -exact -- $prevstate { - leading-squote-space { - error "---- 4 squotes from leading-squote-space - shouldn't get here" - #we should have emitted the triple and left the last for next loop + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left squote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]'" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata } - trailing-squote-space { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] - #todo integrate left squote with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]'" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "'"] - } - MULTILITERAL { - #empty - lappend v($parentlevel) [list LITERALPART "'"] - } - default { - error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" - } - } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "'"] + } + MULTILITERAL { + #empty + lappend v($parentlevel) [list LITERALPART "'"] } default { - error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" + error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" } } } ''''' { - switch -exact -- $prevstate { - leading-squote-space { - error "---- 5 squotes from leading-squote-space - 
shouldn't get here" - #we should have emitted the triple and left the following squotes for next loop + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 squotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]''" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata } - trailing-squote-space { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] - #todo integrate left 2 squotes with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]''" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "''"] - } - MULTILITERAL { - lappend v($parentlevel) [list LITERALPART "''"] - } - default { - error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" - } - } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "''"] + } + MULTILITERAL { + lappend v($parentlevel) [list LITERALPART "''"] } default { - error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" + error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" } } } } - puts stderr "tomlish::decode::toml ---- HERE squote_seq pop <$tok>" } triple_squote { #presumably popping multiliteral-space @@ -1763,7 +1753,119 @@ namespace eval tomlish::decode { lappend merged $part } default { - error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($next)" + error "---- triple_squote unhandled part type [lindex $part 0] unable 
to merge leveldata: $v($nest)" + } + } + set lasttype [lindex $part 0] + } + set v($nest) $merged + } + tentative_accum_dquote { + #should only apply within a multistring + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the trailing-dquote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-dquote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-dquote-space { + } + default { + error "--- unexpected popped due to tentative_accum_dquote but came from state '$prevstate' should have been trailing-dquote-space" + } + } + switch -- $tok { + {"} { + tomlish::parse::set_token_waiting type single_dquote value $tok complete 1 startindex [expr {$i -1}] + } + {""} { + #review - we should perhaps return double_dquote instead? 
+ #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_dquote value "" complete 1 startindex [expr {$i - 2}] + } + {"""} { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 3}] + } + {""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left dquote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {"}] + } + MULTISTRING { + #empty + lappend v($parentlevel) [list STRINGPART {"}] + } + default { + error "--- don't know how to integrate extra trailing dquote with data $v($parentlevel)" + } + } + } + {"""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 dquotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {""}] + } + MULTISTRING { + lappend v($parentlevel) [list STRINGPART {""}] + } + default { + error "--- don't know how to integrate extra trailing 2 dquotes with data $v($parentlevel)" + } + } + } + } + } + triple_dquote { + #presumably popping multistring-space + ::tomlish::log::debug "---- triple_dquote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + 
switch -exact -- [lindex $part 0] { + MULTISTRING { + lappend merged $part + } + STRINGPART { + if {$lasttype eq "STRINGPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + CONT - WS { + lappend merged $part + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_dquote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" } } set lasttype [lindex $part 0] @@ -1809,15 +1911,12 @@ namespace eval tomlish::decode { endinlinetable { ::tomlish::log::debug "---- endinlinetable for last_space_action pop" } - endmultiquote { - ::tomlish::log::debug "---- endmultiquote for last_space_action 'pop'" - } default { error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" } } if {$do_append_to_parent} { - #e.g squote_seq does it's own appends as necessary - so won't get here + #e.g tentative_accum_squote does it's own appends as necessary - so won't get here lappend v($parentlevel) [set v($nest)] } @@ -1831,8 +1930,8 @@ namespace eval tomlish::decode { switch -exact -- $tokenType { - squote_seq_begin { - #### + tentative_trigger_squote - tentative_trigger_dquote { + #### this startok will always be tentative_accum_squote/tentative_accum_dquote starting with one accumulated squote/dquote if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { lassign [dict get $transition_info starttok] starttok_type starttok_val set next_tokenType_known 1 @@ -1840,6 +1939,16 @@ namespace eval tomlish::decode { set tok $starttok_val } } + single_squote { + #JMN - REVIEW + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + triple_squote { + 
::tomlish::log::debug "---- push trigger tokenType triple_squote" + set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERALPART + } squotedkey { switch -exact -- $prevstate { table-space - itable-space { @@ -1849,6 +1958,9 @@ namespace eval tomlish::decode { #todo - check not something already waiting? tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } + triple_dquote { + set v($nest) [list MULTISTRING] ;#container for NEWLINE,STRINGPART,CONT + } dquotedkey { switch -exact -- $prevstate { table-space - itable-space { @@ -1858,7 +1970,7 @@ namespace eval tomlish::decode { #todo - check not something already waiting? tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } - XXXdquotedkey - XXXitablequotedkey { + XXXdquotedkey { #todo set v($nest) [list DQKEY $tok] ;#$tok is the keyname } @@ -1878,34 +1990,29 @@ namespace eval tomlish::decode { tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } } - startsquote { - #JMN - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } tablename { #note: we do not use the output of tomlish::tablename_trim to produce a tablename for storage in the tomlish list! #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish # back to toml file will be identical. #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, - # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names. 
+ # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names from + # a structural perspective. #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the # tomlish list? - set test_only [::tomlish::utils::tablename_trim $tok] - ::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$test_only'" + #set trimtable [::tomlish::utils::tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$trimtable'" set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name #note also that equivalent tablenames may have different toml representations even after being trimmed! #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. } tablearrayname { - set test_only [::tomlish::utils::tablename_trim $tok] - puts stdout "trimmed (but not normalized) tablearrayname: '$test_only'" + #set trimtable [::tomlish::utils::tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablearrayname: '$trimtable'" set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name } startarray { @@ -1914,14 +2021,6 @@ namespace eval tomlish::decode { startinlinetable { set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. 
} - startmultiquote { - ::tomlish::log::debug "---- push trigger tokenType startmultiquote" - set v($nest) [list MULTISTRING] ;#container for STRINGPART, WS, CONT, NEWLINE - } - triple_squote { - ::tomlish::log::debug "---- push trigger tokenType triple_squote" - set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERAL - } default { error "---- push trigger tokenType '$tokenType' not yet implemented" } @@ -1931,11 +2030,11 @@ namespace eval tomlish::decode { #no space level change switch -exact -- $tokenType { squotedkey { - puts "---- squotedkey in state $prevstate (no space level change)" + #puts "---- squotedkey in state $prevstate (no space level change)" lappend v($nest) [list SQKEY $tok] } dquotedkey { - puts "---- dquotedkey in state $prevstate (no space level change)" + #puts "---- dquotedkey in state $prevstate (no space level change)" lappend v($nest) [list DQKEY $tok] } barekey { @@ -1960,29 +2059,46 @@ namespace eval tomlish::decode { startinlinetable { puts stderr "---- decode::toml error. 
did not expect startinlinetable without space level change (no space level change)" } - startquote { + single_dquote { switch -exact -- $newstate { string-state { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "string" set tok "" } - quoted-key { + dquoted-key { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "dquotedkey" set tok "" } - XXXitable-quoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "itablequotedkey" - set tok "" + multistring-space { + lappend v($nest) [list STRINGPART {"}] + #may need to be joined on pop if there are neighbouring STRINGPARTS + } + default { + error "---- single_dquote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_dquote { + #leading extra quotes - test: toml_multistring_startquote2 + switch -exact -- $prevstate { + itable-keyval-value-expected - keyval-value-expected { + puts stderr "tomlish::decode::toml double_dquote TEST" + #empty string + lappend v($nest) [list STRINGPART ""] + } + multistring-space { + #multistring-space to multistring-space + lappend v($nest) [list STRINGPART {""}] } default { - error "---- startquote switch case not implemented for nextstate: $newstate (no space level change)" + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" } } + } - startsquote { + single_squote { switch -exact -- $newstate { literal-state { set next_tokenType_known 1 @@ -1995,41 +2111,17 @@ namespace eval tomlish::decode { set tok "" } multiliteral-space { - #false alarm squote returned from squote_seq pop + #false alarm squote returned from tentative_accum_squote pop ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" #(single squote - not terminating space) lappend v($nest) [list LITERALPART '] #may need to be joined on pop if there are neighbouring LITERALPARTs } default { 
- error "---- startsquote switch case not implemented for nextstate: $newstate (no space level change)" + error "---- single_squote switch case not implemented for nextstate: $newstate (no space level change)" } } } - startmultiquote { - #review - puts stderr "---- got startmultiquote in state $prevstate (no space level change)" - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "stringpart" - set tok "" - } - endquote { - #nothing to do? - set tok "" - } - endsquote { - set tok "" - } - endmultiquote { - #JMN!! - set tok "" - } - string { - lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes - } - literal { - lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes - } double_squote { switch -exact -- $prevstate { keyval-value-expected { @@ -2044,6 +2136,19 @@ namespace eval tomlish::decode { } } } + enddquote { + #nothing to do? + set tok "" + } + endsquote { + set tok "" + } + string { + lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes + } + literal { + lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes + } multistring { #review lappend v($nest) [list MULTISTRING $tok] @@ -2056,11 +2161,9 @@ namespace eval tomlish::decode { } literalpart { lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly - } - itablequotedkey { - } untyped_value { + #would be better termed unclassified_value #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. 
if {$tok in {true false}} { set tag BOOL @@ -2238,7 +2341,7 @@ namespace eval tomlish::utils { #eg {dog."tater.man"} set sLen [tcl::string::length $tablename] set segments [list] - set mode "unknown" ;#5 modes: unknown, quoted,litquoted, unquoted, syntax + set mode "preval" ;#5 modes: preval, quoted,litquoted, unquoted, postval #quoted is for double-quotes, litquoted is for single-quotes (string literal) set seg "" for {set i 0} {$i < $sLen} {incr i} { @@ -2249,139 +2352,166 @@ namespace eval tomlish::utils { set lastChar "" } + #todo - track\count backslashes properly + set c [tcl::string::index $tablename $i] + if {$c eq "\""} { + if {($lastChar eq "\\")} { + #not strictly correct - we could have had an even number prior-backslash sequence + #the toml spec would have us error out immediately on bsl in bad location - but we're + #trying to parse to unvalidated tomlish + set ctest escq + } else { + set ctest dq + } + } else { + set ctest [string map [list " " sp \t tab] $c] + } - if {$c eq "."} { - switch -exact -- $mode { - unquoted { - #dot marks end of segment. - lappend segments $seg - set seg "" - set mode "unknown" - } - quoted { - append seg $c - } - unknown { - lappend segments $seg - set seg "" - } - litquoted { - append seg $c - } - default { - #mode: syntax - #we got our dot. - the syntax mode is now satisfied. - set mode "unknown" + switch -- $ctest { + . { + switch -exact -- $mode { + preval { + error "tablename_split. dot not allowed - expecting a value" + } + unquoted { + #dot marks end of segment. + #if {![is_barekey $seg]} { + # error "tablename_split. 
dot not allowed - expecting a value" + #} + lappend segments $seg + set seg "" + set mode "preval" + } + quoted { + append seg $c + } + litquoted { + append seg $c + } + postval { + #got dot in an expected location + set mode "preval" + } } } - } elseif {($c eq "\"") && ($lastChar ne "\\")} { - if {$mode eq "unknown"} { - if {[tcl::string::trim $seg] ne ""} { - #we don't allow a quote in the middle of a bare key - error "tablename_split. character '\"' invalid at this point in tablename. tablename: '$tablename'" - } - set mode "quoted" - set seg "\"" - } elseif {$mode eq "unquoted"} { - append seg $c - } elseif {$mode eq "quoted"} { - append seg $c - - if {$normalize} { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - } else { - lappend segments $seg + dq { + #unescaped dquote + switch -- $mode { + preval { + set mode "quoted" + set seg "\"" + } + unquoted { + #invalid in barekey - but we are after structure only + append seg $c + } + quoted { + append seg $c + if {$normalize} { + lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" ;#make sure we only accept a dot or end-of-data now. + } + litquoted { + append seg $c + } + postval { + error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" + } } - - set seg "" - set mode "syntax" ;#make sure we only accept a dot or end-of-data now. - } elseif {$mode eq "litquoted"} { - append seg $c - } elseif {$mode eq "syntax"} { - error "tablename_split. expected whitespace or dot, got double quote. 
tablename: '$tablename'" - } - } elseif {($c eq "\'")} { - if {$mode eq "unknown"} { - append seg $c - set mode "litquoted" - } elseif {$mode eq "unquoted"} { - #single quote inside e.g o'neill - append seg $c - } elseif {$mode eq "quoted"} { - append seg $c - - } elseif {$mode eq "litquoted"} { - append seg $c - #no normalization to do - lappend segments $seg - set seg "" - set mode "syntax" - } elseif {$mode eq "syntax"} { - error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" } - - } elseif {$c in [list " " \t]} { - if {$mode eq "syntax"} { - #ignore - } else { - append seg $c + ' { + switch -- $mode { + preval { + append seg $c + set mode "litquoted" + } + unquoted { + #single quote inside e.g o'neill - ultimately invalid - but we pass through here. + append seg $c + } + quoted { + append seg $c + } + litquoted { + append seg $c + #no normalization to do aside from stripping squotes + if {$normalize} { + lappend segments [tcl::string::range $seg 1 end-1] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" + } + postval { + error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" + } + } } - } else { - if {$mode eq "syntax"} { - error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" + sp - tab { + switch -- $mode { + preval - postval { + #ignore + } + unquoted { + #terminates a barekey + lappend segments $seg + set seg "" + set mode "postval" + } + default { + #append to quoted or litquoted + append seg $c + } + } } - if {$mode eq "unknown"} { - set mode "unquoted" + default { + switch -- $mode { + preval { + set mode unquoted + append seg $c + } + postval { + error "tablename_split. Expected a dot separator. got '$c'. 
tablename: '$tablename'" + } + default { + append seg $c + } + } } - append seg $c } + if {$i == $sLen-1} { #end of data ::tomlish::log::debug "End of data: mode='$mode'" - #REVIEW - we can only end up in unquoted or syntax here? are other branches reachable? switch -exact -- $mode { - quoted { - if {$c ne "\""} { - error "tablename_split. missing closing double-quote in a segment. tablename: '$tablename'" - } - if {$normalize} { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - #lappend segments [subst -nocommands -novariables [::string range $seg 1 end-1]] ;#wrong - } else { - lappend segments $seg - } + preval { + error "tablename_split. Expected a value after last dot separator. tablename: '$tablename'" } - litquoted { - set trimmed_seg [tcl::string::trim $seg] - if {[tcl::string::index $trimmed_seg end] ne "\'"} { - error "tablename_split. missing closing single-quote in a segment. tablename: '$tablename'" - } + unquoted { lappend segments $seg } - unquoted - unknown { - lappend segments $seg + quoted { + error "tablename_split. Expected a trailing double quote. tablename: '$tablename'" } - syntax { - #ok - segment already lappended + litquoted { + error "tablename_split. Expected a trailing single quote. tablename: '$tablename'" } - default { - lappend segments $seg + postval { + #ok - segment already lappended } } } } - foreach seg $segments { - set trimmed [tcl::string::trim $seg " \t"] - #note - we explicitly allow 'empty' quoted strings '' & "" - # (these are 'discouraged' but valid toml keys) - #if {$trimmed in [list "''" "\"\""]} { - # puts stderr "tablename_split. warning - Empty quoted string as tablename segment" - #} - if {$trimmed eq "" } { - error "tablename_split. Empty segment found. 
tablename: '$tablename' segments [llength $segments] ($segments)" - } - } + + #note - we must allow 'empty' quoted strings '' & "" + # (these are 'discouraged' but valid toml keys) + return $segments } @@ -2432,26 +2562,34 @@ namespace eval tomlish::utils { #- escape_string and unescape_string would not be reliably roundtrippable inverses anyway. #REVIEW - provide it anyway? When would it be desirable to use? - variable Bstring_control_map [list\ - \b {\b}\ - \n {\n}\ - \r {\r}\ - \" {\"}\ - \x1b {\e}\ - \\ "\\\\"\ - ] + variable Bstring_control_map [dict create] + dict set Bstring_control_map \b {\b} + dict set Bstring_control_map \n {\n} + dict set Bstring_control_map \r {\r} + dict set Bstring_control_map \" {\"} + #dict set Bstring_control_map \x1b {\e} ;#should presumably be only be a convenience for decode - going the other way we get \u001B + dict set Bstring_control_map \\ "\\\\" + #\e for \x1b seems like it might be included - v1.1?? hard to find current state of where toml is going :/ #for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped. - for {set cdec 0} {$cdec <= 8} {incr cdec} { + #8 = \b - already in list. 
+ #built the remainder whilst checking for entries already hardcoded above -in case more are added to the hardcoded list + for {set cdec 0} {$cdec <= 7} {incr cdec} { set hhhh [format %.4X $cdec] - lappend Bstring_control_map [format %c $cdec] \\u$hhhh + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } } for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { set hhhh [format %.4X $cdec] - lappend Bstring_control_map [format %c $cdec] \\u$hhhh + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } } # \u007F = 127 - lappend Bstring_control_map [format %c 127] \\u007F + dict set Bstring_control_map [format %c 127] \\u007F #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) @@ -2474,6 +2612,7 @@ namespace eval tomlish::utils { # it recognizes other escapes which aren't approprite e.g \xhh and octal \nnn # it replaces \ with a single whitespace (trailing backslash) #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh + #plus \e for \x1b? 
set buffer "" set buffer4 "" ;#buffer for 4 hex characters following a \u @@ -2558,12 +2697,13 @@ namespace eval tomlish::utils { set ctest [tcl::string::map {{"} dq} $c] switch -exact -- $ctest { dq { - set e "\\\"" - append buffer [subst -nocommand -novariable $e] + append buffer {"} } b - t - n - f - r { - set e "\\$c" - append buffer [subst -nocommand -novariable $e] + append buffer [subst -nocommand -novariable "\\$c"] + } + e { + append buffer \x1b } u { set unicode4_active 1 @@ -2578,8 +2718,7 @@ namespace eval tomlish::utils { #review - toml spec says all other escapes are reserved #and if they are used TOML should produce an error. #we leave detecting this for caller for now - REVIEW - append buffer "\\" - append buffer $c + append buffer "\\$c" } } } else { @@ -3003,7 +3142,7 @@ namespace eval tomlish::parse { # states: # table-space, itable-space, array-space # array-value-expected,keyval-value-expected,itable-keyval-value-expected, keyval-syntax, - # quoted-key, squoted-key + # dquoted-key, squoted-key # string-state, literal-state, multistring... # # notes: @@ -3039,6 +3178,12 @@ namespace eval tomlish::parse { variable stateMatrix set stateMatrix [dict create] + #--------------------------------------------------------- + #WARNING + #The stateMatrix implementation here is currently messy. + #The code is a mixture of declarative via the stateMatrix and imperative via switch statements during PUSH/POP/SAMESPACE transitions. + #This means the state behaviour has to be reasoned about by looking at both in conjuction. 
+ #--------------------------------------------------------- #xxx-space vs xxx-syntax inadequately documented - TODO @@ -3060,35 +3205,19 @@ namespace eval tomlish::parse { barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ dquotedkey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ - XXXstartquote "quoted-key"\ - XXXstartsquote "squoted-key"\ + XXXsingle_dquote "quoted-key"\ + XXXsingle_squote "squoted-key"\ comment "table-space"\ starttablename "tablename-state"\ starttablearrayname "tablearrayname-state"\ - startmultiquote "err-state"\ - endquote "err-state"\ + enddquote "err-state"\ + endsquote "err-state"\ comma "err-state"\ eof "end-state"\ equal "err-state"\ cr "err-lonecr"\ } - #itable-space/ curly-syntax : itables - dict set stateMatrix\ - itable-space {\ - whitespace "itable-space"\ - newline "itable-space"\ - barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - endinlinetable "POPSPACE"\ - XXXstartquote "quoted-key"\ - XXXstartsquote {TOSTATE "squoted-key" comment "jn-testing"}\ - comma "err-state"\ - comment "itable-space"\ - eof "err-state"\ - } - #squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-space starttok {squote_seq "'"}} dict set stateMatrix\ @@ -3113,26 +3242,19 @@ namespace eval tomlish::parse { dict set stateMatrix\ keyval-value-expected {\ whitespace "keyval-value-expected"\ - untyped_value {TOSTATE "keyval-tail" note ""}\ - startquote {TOSTATE "string-state" returnstate keyval-tail}\ - startmultiquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-value-expected starttok {squote_seq "'"}}\ - startsquote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is 
triggered"}\ - double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"}\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ - startarray {PUSHSPACE array-space returnstate keyval-tail}\ - } - #squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-process-leading-squotes starttok {squote_seq "'"}} - dict set stateMatrix\ - leading-squote-space {\ - squote_seq "POPSPACE"\ + untyped_value {TOSTATE "keyval-tail" note ""}\ + literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ + string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ + single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ + single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ + startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ + startarray {PUSHSPACE array-space returnstate keyval-tail}\ } - #dict set stateMatrix\ - # keyval-process-leading-squotes {\ - # startsquote "literal-state"\ - # triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - # } + #double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"} + + #2025 - no leading-squote-space - only trailing-squote-space. 
dict set stateMatrix\ keyval-tail {\ @@ -3142,81 +3264,106 @@ namespace eval tomlish::parse { eof "end-state"\ } + + #itable-space/ curly-syntax : itables + # x={y=1,} + dict set stateMatrix\ + itable-space {\ + whitespace "itable-space"\ + newline "itable-space"\ + barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + endinlinetable "POPSPACE"\ + comma "err-state"\ + comment "itable-space"\ + eof "err-state"\ + } + #we don't get single_squote etc here - instead we get the resulting squotedkey token + + + # ??? review - something like this + # + # x={y =1,} dict set stateMatrix\ itable-keyval-syntax {\ - whitespace "itable-keyval-syntax"\ - barekey {PUSHSPACE "dottedkey-space"}\ - squotedkey {PUSHSPACE "dottedkey-space"}\ - dquotedkey {PUSHSPACE "dottedkey-space"}\ - equal "itable-keyval-value-expected"\ + whitespace {TOSTATE "itable-keyval-syntax"}\ + barekey {PUSHSPACE "dottedkey-space"}\ + squotedkey {PUSHSPACE "dottedkey-space"}\ + dquotedkey {PUSHSPACE "dottedkey-space"}\ + equal {TOSTATE "itable-keyval-value-expected"}\ newline "err-state"\ eof "err-state"\ } + + # x={y=1} + dict set stateMatrix\ + itable-keyval-space {\ + whitespace "itable-keyval-syntax"\ + equal {TOSTATE "itable-keyval-value-expected" note "required"}\ + } + dict set stateMatrix\ itable-keyval-value-expected {\ whitespace "itable-keyval-value-expected"\ untyped_value {TOSTATE "itable-val-tail" note ""}\ - startquote {TOSTATE "string-state" returnstate itable-val-tail}\ - startmultiquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-keyval-value-expected starttok {squote_seq "'"}}\ - startsquote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ - double_squote {TOSTATE "itable-val-tail" 
note "empty literal received when double squote occurs"}\ + single_dquote {TOSTATE "string-state" returnstate itable-val-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ + single_squote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ } - dict set stateMatrix\ - itable-keyval-space {\ - whitespace "itable-keyval-syntax"\ - equal {TOSTATE "itable-keyval-value-expected" note "required"}\ - } + #double_squote not currently generated by _start_squote_sequence - '' processed as single_squote to literal-state just like 'xxx' + # review + # double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"} + + + # x={y=1,z="x"} + #POPSPACE is transition from itable-keyval-space to parent itable-space dict set stateMatrix\ itable-val-tail {\ whitespace "itable-val-tail"\ endinlinetable "POPSPACE"\ comma "POPSPACE"\ - XXXnewline {TOSTATE "itable-val-tail" note "itable-space ??"}\ - newline "POPSPACE"\ + newline {TOSTATE "itable-val-tail" note "itable-space ??"}\ comment "itable-val-tail"\ eof "err-state"\ } - #dict set stateMatrix\ - # itable-quoted-key {\ - # whitespace "NA"\ - # itablequotedkey {PUSHSPACE "itable-keyval-space"}\ - # newline "err-state"\ - # endquote "itable-keyval-syntax"\ - # } - #dict set stateMatrix\ - # itable-squoted-key {\ - # whitespace "NA"\ - # itablesquotedkey {PUSHSPACE "itable-keyval-space"}\ - # newline "err-state"\ - # endsquote "itable-keyval-syntax"\ - # } + # XXXnewline "POPSPACE" + # We shouldn't popspace on newline - as if there was no comma we need to stay in itable-val-tail + # This means the newline and subsequent whitespace, comments etc become part of the preceeding dottedkey record + #e.g + # x = { + # j=1 + # 
#comment within dottedkey j record + # , # comment unattached + # #comment unattached + # k=2 , #comment unattached + # l=3 #comment within l record + # , m=4 + # #comment associated with m record + # + # #still associated with m record + # } + ## - This doesn't quite correspond to what a user might expect - but seems like a consistent mechanism. + #The awkwardness is because there is no way to put in a comment that doesn't consume a trailing comma + #so we cant do: j= 1 #comment for j1 , + # and have the trailing comma recognised. + # + # To associate: j= 1, #comment for j1 + # we would need some extra processing . (not popping until next key ? extra state itable-sep-tail?) REVIEW - worth doing? + # + # The same issue occurs with multiline arrays. The most natural assumption is that a comment on same line after a comma + # is 'associated' with the previous entry. + # + # These comment issues are independent of the data dictionary being generated for conversion to json etc - as the comments don't carry through anyway, + # but are a potential oddity for manipulating the intermediate tomlish structure whilst attempting to preserve 'associated' comments + # (e.g reordering records within an itable) + #The user's intention for 'associated' isn't always clear and the specs don't really guide on this. - - - #array-value-expected ? - dict set stateMatrix\ - XXXvalue-expected {\ - whitespace "value-expected"\ - untyped_value {"SAMESPACE" "" replay untyped_value}\ - startquote "string-state"\ - startsquote "literal-state"\ - triple_squote {PUSHSPACE "multiliteral-space"}\ - startmultiquote {PUSHSPACE "multistring-space"}\ - startinlinetable {PUSHSPACE itable-space}\ - startarray {PUSHSPACE array-space}\ - comment "err-state-value-expected-got-comment"\ - comma "err-state"\ - newline "err-state"\ - eof "err-state"\ - } - #note comment token should never be delivered to array-value-expected state? 
- #dottedkey-space is not (currently) used within [tablename] or [[tablearrayname]] #it is for keyval ie x.y.z = value @@ -3245,6 +3392,8 @@ namespace eval tomlish::parse { whitespace "dottedkey-space-tail" dotsep "dottedkey-space" equal "POPSPACE"\ + eof "err-state"\ + newline "err-state"\ } #-------------------------------------------------------------------------- @@ -3262,22 +3411,10 @@ namespace eval tomlish::parse { #toml spec looks like heading towards allowing newlines within inline tables #https://github.com/toml-lang/toml/issues/781 - #2025 - appears to be valid for 1.1 - which we are targeting. + #2025 - multiline itables appear to be valid for 1.1 - which we are targeting. #https://github.com/toml-lang/toml/blob/main/toml.md#inline-table #JMN2025 - #dict set stateMatrix\ - # curly-syntax {\ - # whitespace "curly-syntax"\ - # newline "curly-syntax"\ - # barekey {PUSHSPACE "itable-keyval-space"}\ - # itablequotedkey "itable-keyval-space"\ - # endinlinetable "POPSPACE"\ - # startquote "itable-quoted-key"\ - # comma "itable-space"\ - # comment "itable-space"\ - # eof "err-state"\ - # } #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES #We currently allow multiline ITABLES (also with comments) in the tokenizer. #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? 
@@ -3291,10 +3428,9 @@ namespace eval tomlish::parse { # untyped_value "SAMESPACE"\ # startarray {PUSHSPACE "array-space"}\ # endarray "POPSPACE"\ - # startmultiquote {PUSHSPACE multistring-space}\ # startinlinetable {PUSHSPACE itable-space}\ - # startquote "string-state"\ - # startsquote "literal-state"\ + # single_dquote "string-state"\ + # single_squote "literal-state"\ # triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ # comma "array-space"\ # comment "array-space"\ @@ -3305,15 +3441,16 @@ namespace eval tomlish::parse { set aspace [dict create] dict set aspace whitespace "array-space" dict set aspace newline "array-space" - dict set aspace untyped_value "SAMESPACE" + #dict set aspace untyped_value "SAMESPACE" + dict set aspace untyped_value "array-syntax" dict set aspace startarray {PUSHSPACE "array-space"} dict set aspace endarray "POPSPACE" - dict set aspace startmultiquote {PUSHSPACE multistring-space} + dict set aspace single_dquote {TOSTATE "string-state" returnstate array-syntax} + dict set aspace triple_dquote {PUSHSPACE "multistring-space" returnstate array-syntax} + dict set aspace single_squote {TOSTATE "literal-state" returnstate array-syntax} + dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax} dict set aspace startinlinetable {PUSHSPACE itable-space} - dict set aspace startquote "string-state" - dict set aspace startsquote "literal-state" - dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"} - dict set aspace comma "array-space" + #dict set aspace comma "array-space" dict set aspace comment "array-space" dict set aspace eof "err-state-array-space-got-eof" dict set stateMatrix array-space $aspace @@ -3329,26 +3466,16 @@ namespace eval tomlish::parse { #dict set asyntax untyped_value "SAMESPACE" #dict set asyntax startarray {PUSHSPACE array-space} dict set asyntax endarray "POPSPACE" - #dict set asyntax 
startmultiquote {PUSHSPACE multistring-space} - #dict set asyntax startquote "string-state" - #dict set asyntax startsquote "literal-state" + #dict set asyntax single_dquote "string-state" + #dict set asyntax single_squote "literal-state" dict set asyntax comma "array-space" dict set asyntax comment "array-syntax" dict set stateMatrix array-syntax $asyntax - #quoted-key & squoted-key need to PUSHSPACE from own token to keyval-space - dict set stateMatrix\ - quoted-key {\ - whitespace "NA"\ - dquotedkey {PUSHSPACE "keyval-space"}\ - newline "err-state"\ - endquote "keyval-syntax"\ - } - - #review + #dquotedkey is a token - dquoted-key is a state dict set stateMatrix\ dquoted-key {\ whitespace "NA"\ @@ -3367,7 +3494,7 @@ namespace eval tomlish::parse { string-state {\ whitespace "NA"\ string "string-state"\ - endquote "SAMESPACE"\ + enddquote "SAMESPACE"\ newline "err-state"\ eof "err-state"\ } @@ -3381,20 +3508,21 @@ namespace eval tomlish::parse { } - #dict set stateMatrix\ - # stringpart {\ - # continuation "SAMESPACE"\ - # endmultiquote "POPSPACE"\ - # eof "err-state"\ - # } dict set stateMatrix\ multistring-space {\ - whitespace "multistring-space"\ - continuation "multistring-space"\ - stringpart "multistring-space"\ - newline "multistring-space"\ - endmultiquote "POPSPACE"\ - eof "err-state"\ + whitespace "multistring-space"\ + continuation "multistring-space"\ + stringpart "multistring-space"\ + newline "multistring-space"\ + tentative_trigger_dquote {PUSHSPACE "trailing-dquote-space" returnstate multistring-space starttok {tentative_accum_dquote {"}}}\ + single_dquote {TOSTATE multistring-space}\ + double_dquote {TOSTATE multistring-space}\ + triple_dquote {POPSPACE}\ + eof "err-state"\ + } + dict set stateMatrix\ + trailing-dquote-space { + tentative_accum_dquote "POPSPACE" } @@ -3402,19 +3530,19 @@ namespace eval tomlish::parse { #todo - treat sole cr as part of literalpart but crlf and lf as newline dict set stateMatrix\ multiliteral-space {\ - 
literalpart "multiliteral-space"\ - newline "multiliteral-space"\ - squote_seq_begin {PUSHSPACE "trailing-squote-space" returnstate multiliteral-space starttok {squote_seq "'"}}\ - triple_squote {POPSPACE note "on popping - we do any necessary concatenation of LITERALPART items due to squote processing"}\ - double_squote {TOSTATE multiliteral-space note "short squote_seq: can occur anywhere in the space e.g emitted at end when 5 squotes occur"}\ - startsquote {TOSTATE multiliteral-space note "short squote_seq: same as double_squote - false alarm"}\ - eof "err-premature-eof-in-multiliteral-space"\ + literalpart "multiliteral-space"\ + newline "multiliteral-space"\ + tentative_trigger_squote {PUSHSPACE "trailing-squote-space" returnstate multiliteral-space starttok {tentative_accum_squote "'"}}\ + single_squote {TOSTATE multiliteral-space note "short tentative_accum_squote: false alarm this squote is part of data"}\ + double_squote {TOSTATE multiliteral-space note "short tentative_accum_squote: can occur anywhere in the space e.g emitted at end when 5 squotes occur"}\ + triple_squote {POPSPACE note "on popping - we do any necessary concatenation of LITERALPART items due to squote processing"}\ + eof "err-premature-eof-in-multiliteral-space"\ } #trailing because we are looking for possible terminating ''' - but must accept '''' or ''''' and re-integrate the 1st one or 2 extra squotes dict set stateMatrix\ - trailing-squote-space {\ - squote_seq "POPSPACE"\ + trailing-squote-space { + tentative_accum_squote "POPSPACE" } @@ -3499,7 +3627,7 @@ namespace eval tomlish::parse { - + dict set stateMatrix\ end-state {} @@ -3557,14 +3685,13 @@ namespace eval tomlish::parse { dict set spacePushTransitions itable-keyval-space itable-keyval-syntax dict set spacePushTransitions array-space array-space dict set spacePushTransitions table-space tablename-state - dict set spacePushTransitions #itable-space itable-space + #dict set spacePushTransitions #itable-space itable-space #Pop 
to, next variable spacePopTransitions [dict create] dict set spacePopTransitions array-space array-syntax - #itable-space curly-syntax #itable-keyval-space itable-val-tail #review #we pop to keyval-space from dottedkey-space or from keyval-value-expected? we don't always want to go to keyval-tail @@ -3575,7 +3702,6 @@ namespace eval tomlish::parse { #JMN test #dict set spaceSameTransitions array-space array-syntax - #itable-space curly-syntax #itable-keyval-space itable-val-tail @@ -3611,6 +3737,8 @@ namespace eval tomlish::parse { ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" switch -exact -- [lindex $transition_to 0] { POPSPACE { + set popfromspace_info [spacestack peek] + set popfromspace_state [dict get $popfromspace_info state] spacestack pop set parent_info [spacestack peek] set type [dict get $parent_info type] @@ -3625,17 +3753,17 @@ namespace eval tomlish::parse { set existing [spacestack pop] dict unset existing returnstate spacestack push $existing ;#re-push modification - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected to stored returnstate $next <<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected to stored returnstate $next <<---" } else { ### #review - do away with spacePopTransitions - which although useful to provide a default.. 
# - involve error-prone configurations distant to the main state transition configuration in stateMatrix if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" } else { set next $parentspace - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace<<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace<<---" } } set result $next @@ -3805,22 +3933,6 @@ namespace eval tomlish::parse { return $tokenType } - proc _shortcircuit_startquotesequence {} { - variable tok - variable i - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - set_tokenType "startquote" - incr i -1 - return -level 2 1 - } elseif {$toklen == 2} { - puts stderr "_shortcircuit_startquotesequence toklen 2" - set_tokenType "startquote" - set tok "\"" - incr i -2 - return -level 2 1 - } - } proc get_token_waiting {} { variable token_waiting @@ -3940,7 +4052,6 @@ namespace eval tomlish::parse { set slash_active 0 set quote 0 set c "" - set multi_dquote "" for {} {$i < $sLen} {} { if {$i > 0} { set lastChar [tcl::string::index $s [expr {$i - 1}]] @@ -3957,8 +4068,6 @@ namespace eval tomlish::parse { switch -exact -- $ctest { # { - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 @@ -3966,16 +4075,20 @@ namespace eval tomlish::parse { if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { + #for multiliteral, multistring - data and/or end incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } 
_start_squote_sequence { + #pseudo token beginning with underscore - never returned to state machine - review incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] ;#step index back by the length of tok - same as the _start_squote_sequence arm + set_tokenType "single_dquote" return 1 } barekey { @@ -4003,7 +4116,7 @@ namespace eval tomlish::parse { append tok $c } default { - #dquotedkey, itablequotedkey, string,literal, multistring + #dquotedkey, string,literal, multistring append tok $c } } @@ -4015,7 +4128,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes#" + append tok "#" } multiliteral-space { set_tokenType "literalpart" @@ -4031,23 +4144,23 @@ namespace eval tomlish::parse { } lc { #left curly brace - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] ;#step index back by the length of tok - same as the _start_squote_sequence arm + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { @@ -4059,7 +4172,7 @@ namespace eval tomlish::parse { } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename - starttablearrayname { #*bare* tablename can only contain letters,digits underscores @@ -4105,7 +4218,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\{" + append tok "\{" } multiliteral-space { set_tokenType "literalpart" @@ -4120,37 +4233,35 @@ namespace eval tomlish::parse { } rc { #right curly brace - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set 
slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey - comment { + string - dquotedkey - comment { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename - tablename { if {$had_slash} {append tok "\\"} @@ -4221,7 +4332,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\}" + append tok "\}" } multiliteral-space { set_tokenType "literalpart" ; #review @@ -4237,35 +4348,35 @@ namespace eval tomlish::parse { } lb { #left square bracket - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename { #change the tokenType @@ -4332,7 
+4443,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\[" + append tok "\[" } multiliteral-space { set_tokenType "literalpart" @@ -4350,37 +4461,35 @@ namespace eval tomlish::parse { } rb { #right square bracket - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } comment { if {$had_slash} {append tok "\\"} @@ -4428,16 +4537,6 @@ namespace eval tomlish::parse { } } } - XXXtablearraynames { - puts "rb @ tablearraynames ??" - #switch? - - #todo? - if {$had_slash} {append tok "\\"} - #invalid! - but leave for datastructure loading stage to catch - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex - return 1 - } default { incr i -1 return 1 @@ -4485,7 +4584,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\]" + append tok "\]" } multiliteral-space { set_tokenType "literalpart" @@ -4498,21 +4597,21 @@ namespace eval tomlish::parse { } } bsl { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
#backslash if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } whitespace { @@ -4529,9 +4628,7 @@ namespace eval tomlish::parse { append tok "\\" set slash_active 0 } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey - comment { + string - dquotedkey - comment { if {$slash_active} { set slash_active 0 append tok "\\\\" @@ -4545,7 +4642,6 @@ namespace eval tomlish::parse { set slash_active 0 append tok "\\\\" } else { - append tok $dquotes set slash_active 1 } } @@ -4575,10 +4671,6 @@ namespace eval tomlish::parse { set tok "\\\\" set slash_active 0 } else { - if {$dquotes ne ""} { - set_tokenType "stringpart" - set tok $dquotes - } set slash_active 1 } } @@ -4599,58 +4691,56 @@ namespace eval tomlish::parse { set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { - #short squote_seq tokens are returned if active during any other character + tentative_accum_squote { + #for within multiliteral + #short tentative_accum_squote tokens are returned if active upon receipt of any other character #longest allowable for leading/trailing are returned here #### set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote - switch -- $state { - leading-squote-space { - append tok $c - if {$existingtoklen > 2} { - error "tomlish tok error: squote_seq unexpected length $existingtoklen when another received" - } elseif {$existingtoklen == 2} { - return 1 ;#return tok ''' - } - } - trailing-squote-space { - append tok $c - if {$existingtoklen == 4} { - #maxlen to be an squote_seq is multisquote + 
2 = 5 - #return tok ''''' - return 1 - } - } - default { - error "tomlish tok error: squote_seq in unexpected state '$state' - expected leading-squote-space or trailing-squote-space" - } + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_squote is multisquote + 2 = 5 + #return tok with value ''''' + return 1 } } - whitespace { - #end whitespace - incr i -1 ;#reprocess sq + tentative_accum_dquote { + incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { - #temp token creatable only during value-expected or array-space + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space switch -- [tcl::string::length $tok] { 1 { + #no conclusion can yet be reached append tok $c } 2 { + #enter multiliteral #switch? append tok $c set_tokenType triple_squote return 1 } default { + #if there are more than 3 leading squotes we also enter multiliteral space and the subsequent ones are handled + #by the tentative_accum_squote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 squotes as data. 
error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" } } } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + #end whitespace + incr i -1 ;#reprocess sq + return 1 + } literal { #slash_active always false #terminate the literal @@ -4663,7 +4753,7 @@ namespace eval tomlish::parse { # idea: end this literalpart (possibly 'temporarily') # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) - incr i -1 ;#throw the "'" back to loop - will be added to an squote_seq token for later processing + incr i -1 ;#throw the "'" back to loop - will be added to a tentative_accum_squote token for later processing return 1 } XXXitablesquotedkey { @@ -4684,7 +4774,11 @@ namespace eval tomlish::parse { append tok $c } barekey { - #not clear why o'shennanigan shouldn't be a legal barekey - but it seems not to be. + #barekeys now support all sorts of unicode letter/number chars for other cultures + #but not punctuation - not even for those of Irish heritage who don't object + #to the anglicised form of some names. + # o'shenanigan seems to not be a legal barekey + #The Irish will have to use an earlier form Ó - which apparently many may prefer anyway. error "tomlish Unexpected single quote during barekey. 
[tomlish::parse::report_line]" } default { @@ -4693,63 +4787,69 @@ namespace eval tomlish::parse { } } else { switch -exact -- $state { - array-space { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading squote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_squote token or triple_squote token + #It currently doesn't trigger double_squote token + #(handle '' same as 'x' ie produce a single_squote and go into processing literal) + #review - producing double_squote for empty literal may be slightly more efficient. + #This token is not used to handle squote sequences *within* a multiliteral set_tokenType "_start_squote_sequence" set tok "'" } - itable-keyval-value-expected - keyval-value-expected { - set_tokenType "squote_seq_begin" + multiliteral-space { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_squote" ;#trigger tentative_accum_squote set tok "'" return 1 } - table-space { - #tests: squotedkey.test - set_tokenType "squotedkey" - set tok "" - } - itable-space { - #tests: squotedkey_itable.test + table-space - itable-space { + #tests: squotedkey.test squotedkey_itable.test set_tokenType "squotedkey" set tok "" } - XXXitable-space { - #future - could there be multiline keys? - #this would allow arbitrary tcl dicts to be stored in toml + XXXtable-space - XXXitable-space { + #future - could there be multiline keys? MLLKEY, MLBKEY ? 
+ #this would (almost) allow arbitrary tcl dicts to be stored in toml (aside from escaping issues) #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files - set_tokenType "squote_seq_begin" + #@2025 ABNF for toml mentions key, simple-key, unquoted-key, quoted-key and dotted-key + #where key is simple-key or dotted-key - no MLL or MLB components + #the spec states solution for arbitrary binary data is application specific involving encodings + #such as hex, base64 + set_tokenType "_start_squote_sequence" set tok "'" return 1 } tablename-state { #first char in tablename-state/tablearrayname-state - set_tokenType tablename + set_tokenType "tablename" append tok "'" } tablearrayname-state { - set_tokenType tablearrayname + set_tokenType "tablearrayname" append tok "'" } literal-state { + #shouldn't get here? review tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" - set_tokenType literal + set_tokenType "literal" incr -1 return 1 } multistring-space { - error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" - } - multiliteral-space { - #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row - #we are building up an squote_seq to determine if - #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines - #b) it is exactly ''' and we can terminate the whole multiliteral - #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space - set_tokenType "squote_seq_begin" - set tok "'" - return 1 + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "'" ;#a squote is ordinary data inside a multistring - start the stringpart with the quote itself + #error "tomlish unimplemented - squote during state '$state'. 
[tomlish::parse::report_line]" } dottedkey-space { - set_tokenType squotedkey + set_tokenType "squotedkey" } default { error "tomlish unhandled squote during state '$state'. [tomlish::parse::report_line]" @@ -4765,44 +4865,50 @@ namespace eval tomlish::parse { if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote { incr i -1 return 1 } - startquotesequence { - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - append tok $c - } elseif {$toklen == 2} { - append tok $c - #switch vs set? - set_tokenType "startmultiquote" - return 1 - } else { - error "tomlish unexpected token length $toklen in 'startquotesequence'" - } - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" return 1 - - #set toklen [tcl::string::length $tok] - #switch -- $toklen { - # 1 { - # set_tokenType "startsquote" - # incr i -1 - # return 1 - # } - # 2 { - # set_tokenType "startsquote" - # incr i -2 - # return 1 - # } - # default { - # error "tomlish unexpected _start_squote_sequence length $toklen" - # } - #} + } + tentative_accum_dquote { + #within multistring + #short tentative_accum_dquote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_dquote is multidquote + 2 = 5 + #return tok with value """"" + return 1 + } + } + _start_dquote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multistring + #switch? 
+ append tok $c + set_tokenType triple_dquote + return 1 + } + default { + #if there are more than 3 leading dquotes we also enter multistring space and the subsequent ones are handled + #by the tentative_accum_dquote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 dquotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_dquote_sequence'" + } + } } literal - literalpart { append tok $c @@ -4811,8 +4917,8 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" $c } else { - #unescaped quote always terminates a string? - set_token_waiting type endquote value "\"" complete 1 startindex $cindex + #unescaped quote always terminates a string + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex return 1 } } @@ -4821,77 +4927,31 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" $c } else { - #incr i -1 - - if {$multi_dquote eq "\"\""} { - set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex -2}] - set multi_dquote "" - return 1 - } else { - append multi_dquote "\"" - } + incr i -1 ;#throw the {"} back to loop - will be added to a tentative_accum_dquote token for later processing + return 1 } } whitespace { - switch -exact -- $state { - multistring-space { - #REVIEW - if {$had_slash} { - incr i -2 - return 1 - } else { - switch -- [tcl::string::length $multi_dquote] { - 2 { - set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex-2}] - set multi_dquote "" - return 1 - } - 1 { - incr i -2 - return 1 - } - 0 { - incr i -1 - return 1 - } - } - } - } - keyval-value-expected { - #end whitespace token and reprocess - incr i -1 - return 1 - - #if {$multi_dquote eq "\"\""} { - # set_token_waiting type startmultiquote value "\"\"\"" complete 1 - # set multi_dquote "" - # return 1 - #} else { - # #end whitespace token and reprocess - # incr i -1 - # return 1 - #} - } - table-space - 
itable-space { - incr i -1 - return 1 - } - default { - set_token_waiting type startquote value "\"" complete 1 startindex $cindex - return 1 - } + #assert: had_slash will only ever be true in multistring-space + if {$had_slash} { + incr i -2 + return 1 + } else { + #end whitespace token - throw dq back for reprocessing + incr i -1 + return 1 } } comment { if {$had_slash} {append tok "\\"} append tok $c } - XXXdquotedkey - XXXitablequotedkey { + XXXdquotedkey { if {$had_slash} { append tok "\\" append tok $c } else { - set_token_waiting type endquote value "\"" complete 1 startindex $cindex + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex return 1 } } @@ -4901,7 +4961,7 @@ namespace eval tomlish::parse { append tok "\\" append tok $c } else { - #set_token_waiting type endsquote value "'" complete 1 + #set_token_waiting type enddquote value {"} complete 1 return 1 } } @@ -4924,64 +4984,40 @@ namespace eval tomlish::parse { #$slash_active not relevant when no tokenType #token is string only if we're expecting a value at this point switch -exact -- $state { - array-space { - #!? start looking for possible multistartquote - #set_tokenType startquote - #set tok $c - #return 1 - set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote - set tok $c - } - keyval-value-expected - itable-keyval-value-expected { - set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote - set tok $c + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading dquote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_dquote token or triple_dquote token + #It currently doesn't trigger double_dquote token + #(handle "" same as "x" ie produce a single_dquote and go into processing string) + #review - producing double_dquote for empty string may be slightly more efficient. 
+ #This token is not used to handle dquote sequences once *within* a multistring + set_tokenType "_start_dquote_sequence" + set tok {"} } multistring-space { - #TODO - had_slash!!! - #REVIEW if {$had_slash} { set_tokenType "stringpart" set tok "\\\"" - set multi_dquote "" } else { - if {$multi_dquote eq "\"\""} { - tomlish::log::debug "- tokloop char dq ---> endmultiquote" - set_tokenType "endmultiquote" - set tok "\"\"\"" - return 1 - #set_token_waiting type endmultiquote value "\"\"\"" complete 1 - #set multi_dquote "" - #return 1 - } else { - append multi_dquote "\"" - } + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_dquote" ;#trigger tentative_accum_dquote + set tok {"} + return 1 } } multiliteral-space { set_tokenType "literalpart" set tok "\"" } - XXXtable-space { - set_tokenType "startquote" - set tok $c - return 1 - } - XXXitable-space { - set_tokenType "startquote" - set tok $c - } table-space - itable-space { set_tokenType "dquotedkey" set tok "" } - tablename-state { - set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } dottedkey-space { set_tokenType dquotedkey set tok "" @@ -4990,49 +5026,56 @@ namespace eval tomlish::parse { #set_tokenType dquote_seq_begin #set tok $c } + tablename-state { + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } default { - error "tomlish Unexpected quote during state '$state' [tomlish::parse::report_line]" + error "tomlish Unexpected dquote during 
state '$state' [tomlish::parse::report_line]" } } } } = { - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { - #assertion had_slash 0, multi_dquote "" + #assertion had_slash 0 append tok $c } - string - comment - dquotedkey - itablequotedkey { + string - comment - dquotedkey { #for these tokenTypes an = is just data. if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } whitespace { if {$state eq "multistring-space"} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen + incr i -1 return 1 } else { set_token_waiting type equal value = complete 1 startindex $cindex @@ -5063,7 +5106,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok ${dquotes}= + append tok = } multiliteral-space { set_tokenType "literalpart" @@ -5084,8 +5127,6 @@ namespace eval tomlish::parse { } cr { #REVIEW! - set dquotes $multi_dquote - set multi_dquote "" ;#!! # \r carriage return if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. 
set slash_active 0 @@ -5098,16 +5139,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal { @@ -5159,8 +5202,6 @@ namespace eval tomlish::parse { } lf { # \n newline - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5171,16 +5212,19 @@ namespace eval tomlish::parse { append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { + #multiliteral or multistring incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal { @@ -5196,20 +5240,14 @@ namespace eval tomlish::parse { return 1 } stringpart { - if {$dquotes ne ""} { - append tok $dquotes + if {$had_slash} { + #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) + set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] incr i -1 return 1 } else { - if {$had_slash} { - #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) - set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] - incr i -1 - return 1 - } else { - set_token_waiting type newline value lf complete 1 
startindex $cindex - return 1 - } + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 } } starttablename - tablename - tablearrayname - starttablearrayname { @@ -5236,20 +5274,13 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - #e.g one or 2 quotes just before nl - set_tokenType "stringpart" - set tok $dquotes - incr i -1 - return 1 - } set_tokenType "newline" set tok lf return 1 } } multiliteral-space { - #assert had_slash 0, multi_dquote "" + #assert had_slash 0 set_tokenType "newline" set tok "lf" return 1 @@ -5275,8 +5306,6 @@ namespace eval tomlish::parse { } } , { - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5287,39 +5316,40 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } comment - tablename - tablearrayname { if {$had_slash} {append tok "\\"} append tok , } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { #stringpart can have up to 2 quotes too if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } literal - literalpart - squotedkey { - #assert had_slash always 0, multi_dquote "" + #assert had_slash always 0 append tok $c } whitespace { if {$state eq "multistring-space"} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen + incr i -1 return 1 } else { set_token_waiting type comma value "," complete 1 startindex $cindex @@ -5338,10 +5368,10 @@ namespace eval tomlish::parse { 
set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} - append tok "$dquotes," + append tok "," } multiliteral-space { - #assert had_slash 0, multi_dquote "" + #assert had_slash 0 set_tokenType "literalpart" set tok "," } @@ -5354,8 +5384,6 @@ namespace eval tomlish::parse { } } . { - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5366,42 +5394,45 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } comment - untyped_value { if {$had_slash} {append tok "\\"} append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } literal - literalpart - squotedkey { - #assert had_slash always 0, multi_dquote "" + #assert had_slash always 0 append tok $c } whitespace { switch -exact -- $state { multistring-space { - set backchars [expr {[tcl::string::length $dquotes] + 1}] + #review if {$had_slash} { - incr backchars 1 + incr i -2 + } else { + incr i -1 } - incr i -$backchars return 1 } xxxdottedkey-space { @@ -5444,7 +5475,7 @@ namespace eval tomlish::parse { set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} - append tok "$dquotes." + append tok "." } multiliteral-space { set_tokenType "literalpart" @@ -5471,8 +5502,6 @@ namespace eval tomlish::parse { } " " { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
if {[tcl::string::length $tokenType]} { set had_slash $slash_active set slash_active 0 @@ -5483,16 +5512,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } barekey { @@ -5512,9 +5543,9 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok $dquotes$c + append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} { append tok "\\" } append tok $c } @@ -5526,8 +5557,7 @@ namespace eval tomlish::parse { incr i -2 return 1 } else { - #split into STRINGPART aaa WS " " - append tok $dquotes + #split into STRINGPART xxx WS " " incr i -1 return 1 } @@ -5537,15 +5567,7 @@ namespace eval tomlish::parse { } whitespace { if {$state eq "multistring-space"} { - if {$dquotes ne ""} { - #end whitespace token - #go back by the number of quotes plus this space char - set backchars [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backchars - return 1 - } else { - append tok $c - } + append tok $c } else { append tok $c } @@ -5588,12 +5610,6 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - set_tokenType "stringpart" - set tok $dquotes - incr i -1 - return 1 - } set_tokenType "whitespace" append tok $c } @@ -5613,9 +5629,6 @@ namespace eval tomlish::parse { } } tab { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
- if {[tcl::string::length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) set slash_active 0 @@ -5626,12 +5639,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } barekey { @@ -5662,7 +5681,6 @@ namespace eval tomlish::parse { return 1 } else { #split into STRINGPART aaa WS " " - append tok $dquotes incr i -1 return 1 } @@ -5706,15 +5724,8 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - set_tokenType stringpart - set tok $dquotes - incr i -1 - return 1 - } else { - set_tokenType whitespace - append tok $c - } + set_tokenType whitespace + append tok $c } } multiliteral-space { @@ -5732,16 +5743,31 @@ namespace eval tomlish::parse { #BOM (Byte Order Mark) - ignored by token consumer if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } _start_squote_sequence { #assert - tok will be one or two squotes only + #A toml literal probably isn't allowed to contain this + #but we will parse and let the validator sort it out. 
incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart { append tok $c } + string - stringpart { + append tok $c + } default { + #state machine will generally not have entry to accept bom - let it crash set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex return 1 } @@ -5752,6 +5778,10 @@ namespace eval tomlish::parse { set_tokenType "literalpart" set tok $c } + multistring-space { + set_tokenType "stringpart" + set tok $c + } default { set_tokenType "bom" set tok "\uFEFF" @@ -5761,8 +5791,6 @@ namespace eval tomlish::parse { } } default { - set dquotes $multi_dquote - set multi_dquote "" ;#!! if {[tcl::string::length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. @@ -5774,28 +5802,24 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } whitespace { if {$state eq "multistring-space"} { - if {$dquotes ne ""} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen - return 1 - } else { - incr i -1 - return 1 - } + incr i -1 + return 1 } else { #review incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. 
@@ -5815,7 +5839,7 @@ namespace eval tomlish::parse { return 1 } stringpart { - append tok $dquotes$c + append tok $c } default { #e.g comment/string/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname @@ -5835,22 +5859,12 @@ namespace eval tomlish::parse { error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" } } - XXXcurly-syntax { - puts stderr "curly-syntax - review" - if {[tomlish::utils::is_barekey $c]} { - set_tokenType "barekey" - append tok $c - } else { - error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" - } - } multistring-space { set_tokenType "stringpart" if {$had_slash} { - #assert - we don't get had_slash and dquotes at same time set tok \\$c } else { - set tok $dquotes$c + set tok $c } } multiliteral-space { @@ -5890,21 +5904,6 @@ namespace eval tomlish::parse { # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" #} switch -exact -- $tokenType { - startquotesequence { - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - #invalid - #eof with open string - error "tomlish eof reached without closing quote for string. [tomlish::parse::report_line]" - } elseif {$toklen == 2} { - #valid - #we ended in a double quote, not actually a startquoteseqence - effectively an empty string - switch_tokenType "startquote" - incr i -1 - #set_token_waiting type string value "" complete 1 - return 1 - } - } _start_squote_sequence { set toklen [tcl::string::length $tok] switch -- $toklen { @@ -5913,11 +5912,29 @@ namespace eval tomlish::parse { error "tomlish eof reached without closing single quote for string literal. 
[tomlish::parse::report_line]" } 2 { - #review - set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] set_tokenType "literal" set tok "" return 1 + + ##review + #set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] + #set_tokenType "literal" + #set tok "" + #return 1 + } + } + } + _start_dquote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open string + error "tomlish eof reached without closing double quote for string. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "string" + set tok "" + return 1 } } } @@ -6011,6 +6028,16 @@ namespace eval tomlish::dict { return $name } + proc _show_tablenames {tablenames_info} { + append msg \n "tablenames_info:" \n + dict for {tkey tinfo} $tablenames_info { + append msg " " "table: $tkey" \n + dict for {field finfo} $tinfo { + append msg " " "$field $finfo" \n + } + } + return $msg + } } tcl::namespace::eval tomlish::app { diff --git a/src/vfs/_vfscommon.vfs/modules/dictn-0.1.1.tm b/src/vfs/_vfscommon.vfs/modules/dictn-0.1.1.tm new file mode 100644 index 00000000..c9ef87f2 --- /dev/null +++ b/src/vfs/_vfscommon.vfs/modules/dictn-0.1.1.tm @@ -0,0 +1,349 @@ +# -*- tcl -*- +# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt +# +# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. +# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. 
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+# (C) 2023
+#
+# @@ Meta Begin
+# Application dictn 0.1.1
+# Meta platform tcl
+# Meta license
+# @@ Meta End
+
+
+
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+## Requirements
+##e.g package require frobz
+
+
+
+
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+#dictn - nested-path variants of the 'dict' subcommands.
+#Each 'path' argument is a Tcl list of keys leading to the target element.
+#
+#NOTE: this namespace defines procs named append, for, incr, info, lappend, set, unset and update.
+# Inside ::dictn an unqualified call to one of those names resolves to the dictn proc, not the Tcl builtin.
+# Always use the fully qualified builtin (::set, ::append, ::info, ::lappend ...) within these proc bodies.
+namespace eval dictn {
+    namespace export {[a-z]*}
+    namespace ensemble create
+}
+
+
+## ::dictn::append
+#This can of course 'ruin' a nested dict if applied to the wrong element
+# - i.e using the string op 'append' on an element that is itself a nested dict is analogous to the standard Tcl:
+#     %set list {a b {c d}}
+#     %append list x
+#     a b {c d}x
+# IOW - don't do that unless you really know that's what you want.
+#
+#Appends 'value' to the string stored at 'path' in the dict variable named by 'dictvar'.
+#A missing nested element is treated as an empty string (as 'dict append' does for a missing key).
+#Returns the updated dictionary value.
+proc ::dictn::append {dictvar path {value {}}} {
+    if {[llength $path] == 1} {
+        #expand the path so that the key itself (not its list representation) is used
+        uplevel 1 [list dict append $dictvar {*}$path $value]
+    } else {
+        upvar 1 $dictvar dvar
+
+        if {[::info exists dvar] && [dict exists $dvar {*}$path]} {
+            ::set str [dict get $dvar {*}$path]
+        } else {
+            ::set str ""
+        }
+        #must be ::append - plain 'append' here would call ::dictn::append
+        ::append str $value
+        dict set dvar {*}$path $str
+    }
+}
+
+#Builds a dict from alternating path/value arguments. Returns the new dictionary value.
+proc ::dictn::create {args} {
+    ::set data {}
+    foreach {path val} $args {
+        dict set data {*}$path $val
+    }
+    return $data
+}
+
+#Returns 1 if the element at 'path' exists in dictval, 0 otherwise.
+proc ::dictn::exists {dictval path} {
+    return [dict exists $dictval {*}$path]
+}
+
+#Applies 'dict filter' to the sub-dictionary found at 'path'.
+#Evaluated in the caller's frame so that a 'script' filter can see the caller's variables.
+proc ::dictn::filter {dictval path filterType args} {
+    ::set sub [dict get $dictval {*}$path]
+    return [uplevel 1 [list dict filter $sub $filterType {*}$args]]
+}
+
+#Iterates 'dict for' over the sub-dictionary found at 'path'.
+#The body is evaluated in the caller's frame, so it can read and write the caller's variables.
+proc ::dictn::for {keyvalvars dictval path body} {
+    ::set sub [dict get $dictval {*}$path]
+    uplevel 1 [list dict for $keyvalvars $sub $body]
+}
+
+#Returns the value at 'path' (the whole dictval when path is empty).
+proc ::dictn::get {dictval {path {}}} {
+    return [dict get $dictval {*}$path]
+}
+
+#Returns the value at 'path', or 'default' when that element does not exist.
+#Written with dict exists/dict get so it does not require the 'dict getdef' subcommand.
+proc ::dictn::getdef {dictval path default} {
+    if {[dict exists $dictval {*}$path]} {
+        return [dict get $dictval {*}$path]
+    }
+    return $default
+}
+
+#Alias of getdef (same arguments, same result).
+proc ::dictn::getwithdefault {dictval path default} {
+    if {[dict exists $dictval {*}$path]} {
+        return [dict get $dictval {*}$path]
+    }
+    return $default
+}
+
+#Adds 'increment' (default 1) to the integer at 'path' in the dict variable named by 'dictvar'.
+#A missing variable or missing element is treated as 0.
+#Returns the updated dictionary value.
+#(single definition - works whether or not the interpreter provides 'dict getdef')
+proc ::dictn::incr {dictvar path {increment {}} } {
+    if {$increment eq ""} {
+        ::set increment 1
+    }
+    if {[llength $path] == 1} {
+        uplevel 1 [list dict incr $dictvar {*}$path $increment]
+    } else {
+        upvar 1 $dictvar dvar
+        if {![::info exists dvar]} {
+            dict set dvar {*}$path $increment
+        } else {
+            if {![dict exists $dvar {*}$path]} {
+                ::set val 0
+            } else {
+                ::set val [dict get $dvar {*}$path]
+            }
+            ::set newval [expr {$val + $increment}]
+            dict set dvar {*}$path $newval
+        }
+        return $dvar
+    }
+}
+
+#Returns 'dict info' for dictval, or for the sub-dictionary at 'path' when a path is given.
+proc ::dictn::info {dictval {path {}}} {
+    if {![string length $path]} {
+        return [dict info $dictval]
+    } else {
+        ::set sub [dict get $dictval {*}$path]
+        return [dict info $sub]
+    }
+}
+
+#Returns the keys of the sub-dictionary at 'path', optionally restricted by a glob pattern.
+proc ::dictn::keys {dictval {path {}} {glob {}}} {
+    ::set sub [dict get $dictval {*}$path]
+    if {[string length $glob]} {
+        return [dict keys $sub $glob]
+    } else {
+        return [dict keys $sub]
+    }
+}
+
+#Appends the given items to the list stored at 'path' in the dict variable named by 'dictvar'.
+#A missing nested element is treated as an empty list (as 'dict lappend' does for a missing key).
+#Returns the updated dictionary value.
+proc ::dictn::lappend {dictvar path args} {
+    if {[llength $path] == 1} {
+        uplevel 1 [list dict lappend $dictvar {*}$path {*}$args]
+    } else {
+        upvar 1 $dictvar dvar
+
+        if {[::info exists dvar] && [dict exists $dvar {*}$path]} {
+            ::set list [dict get $dvar {*}$path]
+        } else {
+            ::set list [list]
+        }
+        ::lappend list {*}$args
+        dict set dvar {*}$path $list
+    }
+}
+
+#Placeholder - always raises an error.
+proc ::dictn::merge {args} {
+    error "nested merge not yet supported"
+}
+
+#dictn remove dictionaryValue ?path ...?
+#Returns a copy of dictval with the element at each path removed.
+proc ::dictn::remove {dictval args} {
+    ::set basic [list] ;#buffer basic (1element path) removals to do in a single call.
+
+    foreach path $args {
+        if {[llength $path] == 1} {
+            #store the key itself rather than the 1-element list form
+            ::lappend basic [lindex $path 0]
+        } else {
+            #extract,modify,replace
+            ::set subpath [lrange $path 0 end-1]
+
+            ::set sub [dict get $dictval {*}$subpath]
+            ::set sub [dict remove $sub [lindex $path end]]
+
+            dict set dictval {*}$subpath $sub
+        }
+    }
+
+    if {[llength $basic]} {
+        return [dict remove $dictval {*}$basic]
+    } else {
+        return $dictval
+    }
+}
+
+
+#dictn replace dictionaryValue ?path value ...?
+#Returns a copy of dictval with each path set to the corresponding value.
+proc ::dictn::replace {dictval args} {
+    ::set basic [list] ;#buffer basic (1element path) replacements to do in a single call.
+
+    foreach {path val} $args {
+        if {[llength $path] == 1} {
+            ::lappend basic [lindex $path 0] $val
+        } else {
+            #extract,modify,replace
+            ::set subpath [lrange $path 0 end-1]
+
+            ::set sub [dict get $dictval {*}$subpath]
+            ::set sub [dict replace $sub [lindex $path end] $val]
+
+            dict set dictval {*}$subpath $sub
+        }
+    }
+
+
+    if {[llength $basic]} {
+        return [dict replace $dictval {*}$basic]
+    } else {
+        return $dictval
+    }
+}
+
+
+#Sets the element at 'path' in the dict variable named by 'dictvar'. Returns the updated dictionary value.
+proc ::dictn::set {dictvar path newval} {
+    upvar 1 $dictvar dvar
+    return [dict set dvar {*}$path $newval]
+}
+
+#Returns the number of key/value pairs in the sub-dictionary at 'path'.
+proc ::dictn::size {dictval {path {}}} {
+    return [dict size [dict get $dictval {*}$path]]
+}
+
+#Removes the element at 'path' from the dict variable named by 'dictvar'. Returns the updated dictionary value.
+proc ::dictn::unset {dictvar path} {
+    upvar 1 $dictvar dvar
+    return [dict unset dvar {*}$path]
+}
+
+#dictn update dictVariable path varName ?path varName ...? body
+#For each path that exists, the caller's variable varName is set to the element's value before body runs.
+#After body runs, each such variable is written back to its path (or the path is unset if the variable was unset).
+#NOTE: an error raised by body is caught; its message is returned as the normal result of this command.
+proc ::dictn::update {dictvar args} {
+    ::set body [lindex $args end]
+    ::set maplist [lrange $args 0 end-1]
+
+    upvar 1 $dictvar dvar
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path]} {
+            #path must be expanded - it is a list of keys, not a single key
+            uplevel 1 [list ::set $var [dict get $dvar {*}$path]]
+        }
+    }
+
+    catch {uplevel 1 $body} result
+
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path]} {
+            upvar 1 $var $var
+            if {![::info exists $var]} {
+                uplevel 1 [list dict unset $dictvar {*}$path]
+            } else {
+                uplevel 1 [list dict set $dictvar {*}$path [::set $var]]
+            }
+        }
+    }
+    return $result
+}
+
+#an experiment.
+#Experimental variant of update that runs body via 'apply' instead of in the caller's frame.
+#For each path that exists, the value is passed to the lambda and bound (via upvar) to a variable
+# named varName in this proc's frame; after the body runs, those variables are written back to the dict.
+#NOTE: an error raised by body is caught; its message is returned as the normal result.
+#NOTE: the assembled body is written to stderr (debug output of the experiment).
+proc ::dictn::Applyupdate {dictvar args} {
+    ::set body [lindex $args end]
+    ::set maplist [lrange $args 0 end-1]
+
+    upvar 1 $dictvar dvar
+
+    ::set headscript ""
+    ::set i 0
+    #initialise so that {*}$vallist below is valid even when no mapped path exists
+    ::set arglist [list]
+    ::set vallist [list]
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path]} {
+            #uplevel 1 [list set $var [dict get $dvar $path]]
+            ::lappend arglist $var
+            ::lappend vallist [dict get $dvar {*}$path]
+            ::append headscript [string map [list %i% $i %v% $var] {upvar 1 %v% %v%; set %v% [lindex $args %i%]} ]
+            ::append headscript \n
+            ::incr i
+        }
+    }
+
+    ::set body $headscript\r\n$body
+
+    puts stderr "BODY: $body"
+
+    #set result [apply [list args $body] {*}$vallist]
+    catch {apply [list args $body] {*}$vallist} result
+
+    foreach {path var} $maplist {
+        if {[dict exists $dvar {*}$path] && [::info exists $var]} {
+            dict set dvar {*}$path [::set $var]
+        }
+    }
+    return $result
+}
+
+#Returns the values of the sub-dictionary at 'path', optionally restricted by a glob pattern.
+proc ::dictn::values {dictval {path {}} {glob {}}} {
+    ::set sub [dict get $dictval {*}$path]
+    if {[string length $glob]} {
+        return [dict values $sub $glob]
+    } else {
+        return [dict values $sub]
+    }
+}
+
+# Standard form:
+#'dictn with dictVariable path body'
+#  - delegates to 'dict with' in the caller's frame.
+#
+# Extended form:
+#'dictn with dictVariable path arrayVariable body'
+#  - loads the sub-dictionary at path into the caller's array, runs body in the caller's frame,
+#    then writes the array contents back to the dict variable:
+#    keys removed from the array are unset at path, remaining/new keys are set at path.
+#  - an error raised by body is caught; its message is returned as the normal result.
+#
+proc ::dictn::with {dictvar path args} {
+    if {[llength $args] == 1} {
+        ::set body [lindex $args 0]
+        return [uplevel 1 [list dict with $dictvar {*}$path $body]]
+    } else {
+        upvar 1 $dictvar dvar
+        ::lassign $args arrayname body
+
+        upvar 1 $arrayname arr
+        array set arr [dict get $dvar {*}$path]
+        ::set prevkeys [array names arr]
+
+        catch {uplevel 1 $body} result
+
+
+        #dict unset/dict set take the variable NAME (dvar) - not its value
+        foreach k $prevkeys {
+            if {![::info exists arr($k)]} {
+                dict unset dvar {*}$path $k
+            }
+        }
+        foreach k [array names arr] {
+            dict set dvar {*}$path $k $arr($k)
+        }
+
+        return $result
+    }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
+## Ready
+package provide dictn [namespace eval dictn {
+    variable version
+    ::set version 0.1.1
+}]
+return
\ No newline at end of file
diff --git
a/src/vfs/_vfscommon.vfs/modules/test/tomlish-1.1.3.tm b/src/vfs/_vfscommon.vfs/modules/test/tomlish-1.1.3.tm index ed5044a7..8afb43d9 100644 Binary files a/src/vfs/_vfscommon.vfs/modules/test/tomlish-1.1.3.tm and b/src/vfs/_vfscommon.vfs/modules/test/tomlish-1.1.3.tm differ diff --git a/src/vfs/_vfscommon.vfs/modules/tomlish-1.1.4.tm b/src/vfs/_vfscommon.vfs/modules/tomlish-1.1.4.tm index 7a6d5205..33d5b912 100644 --- a/src/vfs/_vfscommon.vfs/modules/tomlish-1.1.4.tm +++ b/src/vfs/_vfscommon.vfs/modules/tomlish-1.1.4.tm @@ -153,15 +153,10 @@ namespace eval tomlish { } #review - if {[uplevel 1 [list info exists tablenames_seen]]} { - upvar tablenames_seen tablenames_seen + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info } else { - set tablenames_seen [list] ;#list of lists - } - if {[uplevel 1 [list info exists tablenames_closed]]} { - upvar tablenames_closed tablenames_closed - } else { - set tablenames_closed [list] ;#list of lists + set tablenames_info [dict create] ;#keys are lists {parenttable subtable etc} corresponding to parenttable.subtable.etc } foreach sub [lrange $keyval_element 2 end] { @@ -207,13 +202,10 @@ namespace eval tomlish { ARRAY { #we need to recurse to get the corresponding dict for the contained item(s) #pass in the whole $found_sub - not just the $value! - set prev_tablenames_seen $tablenames_seen - set prev_tablenames_closed $tablenames_closed - set tablenames_seen [list] - set tablenames_closed [list] + set prev_tablenames_info $tablenames_info + set tablenames_info [dict create] set result [list type $type value [::tomlish::to_dict [list $found_sub]]] - set tablenames_seen $prev_tablenames_seen - set tablenames_closed $prev_tablenames_closed + set tablenames_info $prev_tablenames_info } MULTISTRING - MULTILITERAL { #review - mapping these to STRING might make some conversions harder? 
@@ -295,23 +287,66 @@ namespace eval tomlish { #[Data] #temps = [{cpu = 79.5, case = 72.0}] proc to_dict {tomlish} { + package require dictn #keep track of which tablenames have already been directly defined, # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. #we don't error out just because a previous tablename segment has already appeared. - ##variable tablenames_seen [list] - if {[uplevel 1 [list info exists tablenames_seen]]} { - upvar tablenames_seen tablenames_seen - } else { - set tablenames_seen [list] ;#list of lists - } - if {[uplevel 1 [list info exists tablenames_closed]]} { - upvar tablenames_closed tablenames_closed + + #Declaring, Creating, and Defining Tables + #https://github.com/toml-lang/toml/issues/795 + #(update - only Creating and Defining are relevant terminology) + + #review + #tablenames_info keys created, defined, createdby, definedby, closedby + + #consider the following 2 which are legal: + #[table] #'table' created, defined=open definedby={header table} + #x.y = 3 + #[table.x.z] #'table' defined=closed closedby={header table.x.z}, 'table.x' created, 'table.x.z' created defined=open definedby={header table.x.z} + #k= 22 + # #'table.x.z' defined=closed closedby={eof eof} + + #equivalent datastructure + + #[table] #'table' created, defined=open definedby={header table} + #[table.x] #'table' defined=closed closedby={header table.x}, 'table.x' created defined=open definedby={header table.x} + #y = 3 + #[table.x.z] #'table.x' defined=closed closedby={header table.x.z}, 'table.x.z' created defined=open definedby={header table.x.z} + #k=22 + + #illegal + #[table] #'table' created and defined=open + #x.y = 3 #'table.x' created first keyval pair defined=open definedby={keyval x.y = 3} + #[table.x.y.z] #'table' defined=closed, 'table.x' closed because 
parent 'table' closed?, 'table.x.y' cannot be created + #k = 22 + # + ## - we would fail on encountering table.x.y because only table and table.x are effectively tables - but that table.x is closed should be detected (?) + + #illegal + #[table] + #x.y = {p=3} + #[table.x.y.z] + #k = 22 + ## we should fail because y is an inline table which is closed to further entries + + #note: it is not safe to compare normalized tablenames using join! + # e.g a.'b.c'.d is not the same as a.b.c.d + # instead compare {a b.c d} with {a b c d} + # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. + #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' + #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} + + + + if {[uplevel 1 [list info exists tablenames_info]]} { + upvar tablenames_info tablenames_info } else { - set tablenames_closed [list] ;#list of lists + set tablenames_info [dict create] ;#keyed on tablepath each of which is a list such as {config subgroup etc} (corresponding to config.subgroup.etc) } + log::info "---> to_dict processing '$tomlish'<<<" set items $tomlish @@ -354,7 +389,7 @@ namespace eval tomlish { #a.b.c = 1 #table_key_hierarchy -> a b - #leafkey -> c + #tleaf -> c if {[llength $dotted_key_hierarchy] == 0} { #empty?? probably invalid. review #This is different to '' = 1 or ''.'' = 1 which have lengths 1 and 2 respectively @@ -362,10 +397,10 @@ namespace eval tomlish { } elseif {[llength $dotted_key_hierarchy] == 1} { #dottedkey is only a key - no table component set table_hierarchy [list] - set leafkey [lindex $dotted_key_hierarchy 0] + set tleaf [lindex $dotted_key_hierarchy 0] } else { set table_hierarchy [lrange $dotted_key_hierarchy 0 end-1] - set leafkey [lindex $dotted_key_hierarchy end] + set tleaf [lindex $dotted_key_hierarchy end] } #ensure empty tables are still represented in the datastructure @@ -380,143 +415,101 @@ namespace eval tomlish { } } #review? 
- if {[dict exists $datastructure {*}$table_hierarchy $leafkey]} { - error "Duplicate key '$table_hierarchy $leafkey'. The key already exists at this level in the toml data. The toml data is not valid." + if {[dict exists $datastructure {*}$table_hierarchy $tleaf]} { + error "Duplicate key '$table_hierarchy $tleaf'. The key already exists at this level in the toml data. The toml data is not valid." } #JMN test 2025 if {[llength $table_hierarchy]} { - lappend tablenames_seen $table_hierarchy + dictn incr tablenames_info [list $table_hierarchy seencount] } set keyval_dict [_get_keyval_value $item] if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - lappend tablenames_seen [list {*}$table_hierarchy $leafkey] - lappend tablenames_closed [list {*}$table_hierarchy $leafkey] + set t [list {*}$table_hierarchy $tleaf] + dictn incr tablenames_info [list $t seencount] + dictn set tablenames_info [list $t closed] 1 #review - item is an ITABLE - we recurse here without datastructure context :/ #overwriting keys? todo ? - dict set datastructure {*}$table_hierarchy $leafkey $keyval_dict + dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict } else { - dict set datastructure {*}$table_hierarchy $leafkey $keyval_dict + dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict } + } + TABLEARRAY { + set tablename [lindex $item 1] + log::debug "---> to_dict processing item TABLENAME (name: $tablename): $item" + set norm_segments [::tomlish::utils::tablename_split $tablename true] ;#true to normalize + #we expect repeated tablearray entries - each adding a sub-object to the value, which is an array/list. 
+ } TABLE { set tablename [lindex $item 1] + log::debug "---> to_dict processing item TABLE (name: $tablename): $item" #set tablename [::tomlish::utils::tablename_trim $tablename] set norm_segments [::tomlish::utils::tablename_split $tablename true] ;#true to normalize - if {$norm_segments in $tablenames_seen} { - error "Table name '$tablename' has already been directly defined in the toml data. Invalid." - } - log::debug "---> to_dict processing item $tag (name: $tablename): $item" - set name_segments [::tomlish::utils::tablename_split $tablename] ;#unnormalized - set last_seg "" - #toml spec rule - all segments mst be non-empty - #note that the results of tablename_split are 'raw' - ie some segments may be enclosed in single or double quotes. - - set table_key_sublist [list] - - foreach normseg $norm_segments { - lappend table_key_sublist $normseg - if {[dict exists $datastructure {*}$table_key_sublist]} { - #It's ok for this key to already exist *if* it was defined by a previous tablename or equivalent - #and if this key is longer - - #consider the following 2 which are legal: - #[table] - #x.y = 3 - #[table.x.z] - #k= 22 - - #equivalent - - #[table] - #[table.x] - #y = 3 - #[table.x.z] - #k=22 - - #illegal - #[table] - #x.y = 3 - #[table.x.y.z] - #k = 22 - ## - we should fail on encountering table.x.y because only table and table.x are effectively tables - - #illegal - #[table] - #x.y = {p=3} - #[table.x.y.z] - #k = 22 - ## we should fail because y is an inline table which is closed to further entries - - - #note: it is not safe to compare normalized tablenames using join! - # e.g a.'b.c'.d is not the same as a.b.c.d - # instead compare {a b.c d} with {a b c d} - # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. 
- #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' - #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} - - set sublist_length [llength $table_key_sublist] - set found_testkey 0 - if {$table_key_sublist in $tablenames_seen} { - set found_testkey 1 - } else { - #see if it was defined by a longer entry - foreach seen_table_segments $tablenames_seen { - if {[llength $seen_table_segments] <= $sublist_length} { - continue - } - #each tablenames_seen entry is already a list of normalized segments - - #we could have [a.b.c.d] early on - # followed by [a.b] - which was still defined by the earlier one. + set T_DEFINED [dictn getdef $tablenames_info [list $norm_segments defined] NULL] + if {$T_DEFINED ne "NULL"} { + #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path + set msg "Table name $tablename has already been directly defined in the toml data. Invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + error $msg + } - set seen_longer [lrange $seen_segments 0 [expr {$sublist_length -1}]] - puts stderr "testkey:'$table_key_sublist' vs seen_match:'$seen_longer'" - if {$table_key_sublist eq $seen_longer} { - set found_testkey 1 - } - } - } - if {$found_testkey == 0} { - #the raw unnormalized tablename might be ok to display in the error message, although it's not the actual dict keyset - set msg "key $table_key_sublist already exists in datastructure, but wasn't defined by a supertable." - append msg \n "tablenames_seen:" \n - foreach ts $tablenames_seen { - append msg " " $ts \n - } + set name_segments [::tomlish::utils::tablename_split $tablename 0] ;#unnormalized e.g ['a'."b".c.d] -> 'a' "b" c d + #results of tablename_split 0 are 'raw' - ie some segments may be enclosed in single or double quotes. 
+ + + set supertable [list] + ############## + # [a.b.c.d] + # norm_segments = {a b c d} + #check a {a b} {a b c} <---- supertables of a.b.c.d + ############## + foreach normseg [lrange $norm_segments 0 end-1] { + lappend supertable $normseg + if {![dictn exists $tablenames_info [list $supertable type]]} { + #supertable with this path doesn't yet exist + if {[dict exists $datastructure {*}$supertable]} { + #There is data though - so it must have been created as a keyval + set msg "Supertable [join $supertable .] of table name $tablename already has data - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] error $msg + } else { + #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here + dictn set tablenames_info [list $supertable type] header + #ensure empty tables are still represented in the datastructure + dict set datastructure {*}$supertable [list] } - } - - } - - #ensure empty tables are still represented in the datastructure - set key_sublist [list] - foreach k $norm_segments { - lappend key_sublist $k - if {![dict exists $datastructure {*}$key_sublist]} { - dict set datastructure {*}$key_sublist [list] } else { - tomlish::log::notice "to_dict datastructure at (TABLE) subkey $key_sublist already had data: [dict get $datastructure {*}$key_sublist]" + #supertable has already been created - and maybe defined - but even if defined we can add subtables } } + #table [a.b.c.d] hasn't been defined - but may have been 'created' already by a longer tablename + # - or may have existing data from a keyval + if {![dictn exists $tablenames_info [list $norm_segments type]]} { + if {[dict exists $datastructure {*}$norm_segments]} { + set msg "Table name $tablename already has data - invalid" + append msg \n [tomlish::dict::_show_tablenames $tablenames_info] + error $msg + } + #no data or previously created table + dictn set tablenames_info [list $norm_segments type] header - #We must do this after the key-collision 
test above! - lappend tablenames_seen $norm_segments - - + #We are 'defining' this table's keys and values here (even if empty) + dict set datastructure {*}$norm_segments [list] ;#ensure table still represented in datastructure even if we add no keyvals here + } + dictn set tablenames_info [list $norm_segments defined] open log::debug ">>> to_dict >>>>>>>>>>>>>>>>> normalized table key hierarchy : $norm_segments" #now add the contained elements foreach element [lrange $item 2 end] { set type [lindex $element 0] - log::debug "----> tododict processing $tag subitem $type processing contained element $element" + log::debug "----> todict processing $tag subitem $type processing contained element $element" switch -exact -- $type { DOTTEDKEY { set dkey_info [_get_dottedkey_info $element] @@ -547,14 +540,19 @@ namespace eval tomlish { puts stdout "to_dict>>> $keyval_dict" dict set datastructure {*}$norm_segments {*}$dkeys $leaf_key $keyval_dict #JMN 2025 - lappend tablenames_seen [list {*}$norm_segments {*}$dkeys] + #lappend tablenames_info [list {*}$norm_segments {*}$dkeys] + set tkey [list {*}$norm_segments {*}$dkeys] + dictn incr tablenames_info [list $tkey seencount] if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys # inner structure will contain {type value } if all leaves are not empty ITABLES - lappend tablenames_seen [list {*}$norm_segments {*}$dkeys $leaf_key] + set tkey [list {*}$norm_segments {*}$dkeys $leaf_key] + #lappend tablenames_info [list {*}$norm_segments {*}$dkeys $leaf_key] + dictn incr tablenames_info [list $tkey seencount] #if the keyval_dict is not a simple type x value y - then it's an inline table ? #if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. 
+ dictn set tablenames_info [list $tkey closed] 1 } } @@ -562,7 +560,7 @@ namespace eval tomlish { #ignore } default { - error "Sub element of type '$type' not understood in table context. Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" + error "Sub element of type '$type' not understood in table context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" } } } @@ -1316,7 +1314,12 @@ namespace eval tomlish::encode { #NOTE - this DELIBERATELY does not validate the data, or process escapes etc #It encodes the tomlish records as they are. #ie it only produces toml shaped data from a tomlish list. + # #It is part of the roundtripability of data from toml to tomlish + #!! ie - it is not the place to do formatting of inline vs multiline !! + # That needs to be encoded in the tomlish data that is being passed in + # (e.g from_dict could make formatting decisions in the tomlish it produces) + # #e.g duplicate keys etc can exist in the toml output. #The to_dict from_dict (or any equivalent processor pair) is responsible for validation and conversion #back and forth of escape sequences where appropriate. @@ -1646,17 +1649,27 @@ namespace eval tomlish::decode { #pop_trigger_tokens: newline tablename endarray endinlinetable #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. 
set parentlevel [expr {$nest -1}] - set do_append_to_parent 1 ;#most tokens will leave this alone - but some like squote_seq need to do their own append + set do_append_to_parent 1 ;#most tokens will leave this alone - but some like tentative_accum_squote need to do their own append switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote { + #should only apply within a multiliteral #### set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed #Without this - we would get extraneous empty list entries in the parent # - as the xxx-squote-space isn't a space level from the toml perspective # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-squote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-squote-space { + } + default { + error "--- unexpected popped due to tentative_accum_squote but came from state '$prevstate' should have been trailing-squote-space" + } + } switch -- $tok { ' { - tomlish::parse::set_token_waiting type startsquote value $tok complete 1 startindex [expr {$i -1}] + tomlish::parse::set_token_waiting type single_squote value $tok complete 1 startindex [expr {$i -1}] } '' { #review - we should perhaps return double_squote instead? 
@@ -1669,74 +1682,51 @@ namespace eval tomlish::decode { tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] } '''' { - switch -exact -- $prevstate { - leading-squote-space { - error "---- 4 squotes from leading-squote-space - shouldn't get here" - #we should have emitted the triple and left the last for next loop + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left squote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]'" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata } - trailing-squote-space { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] - #todo integrate left squote with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]'" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "'"] - } - MULTILITERAL { - #empty - lappend v($parentlevel) [list LITERALPART "'"] - } - default { - error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" - } - } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "'"] + } + MULTILITERAL { + #empty + lappend v($parentlevel) [list LITERALPART "'"] } default { - error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" + error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" } } } ''''' { - switch -exact -- $prevstate { - leading-squote-space { - error "---- 5 squotes from leading-squote-space - 
shouldn't get here" - #we should have emitted the triple and left the following squotes for next loop + tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 squotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + LITERALPART { + set newval "[lindex $lastpart 1]''" + set parentdata $v($parentlevel) + lset parentdata end [list LITERALPART $newval] + set v($parentlevel) $parentdata } - trailing-squote-space { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] - #todo integrate left 2 squotes with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]''" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "''"] - } - MULTILITERAL { - lappend v($parentlevel) [list LITERALPART "''"] - } - default { - error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" - } - } + NEWLINE { + lappend v($parentlevel) [list LITERALPART "''"] + } + MULTILITERAL { + lappend v($parentlevel) [list LITERALPART "''"] } default { - error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" + error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" } } } } - puts stderr "tomlish::decode::toml ---- HERE squote_seq pop <$tok>" } triple_squote { #presumably popping multiliteral-space @@ -1763,7 +1753,119 @@ namespace eval tomlish::decode { lappend merged $part } default { - error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($next)" + error "---- triple_squote unhandled part type [lindex $part 0] unable 
to merge leveldata: $v($nest)" + } + } + set lasttype [lindex $part 0] + } + set v($nest) $merged + } + tentative_accum_dquote { + #should only apply within a multistring + #### + set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed + #Without this - we would get extraneous empty list entries in the parent + # - as the trailing-dquote-space isn't a space level from the toml perspective + # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop + #assert prevstate always trailing-dquote-space + #dev guardrail - remove? assertion lib? + switch -exact -- $prevstate { + trailing-dquote-space { + } + default { + error "--- unexpected popped due to tentative_accum_dquote but came from state '$prevstate' should have been trailing-dquote-space" + } + } + switch -- $tok { + {"} { + tomlish::parse::set_token_waiting type single_dquote value $tok complete 1 startindex [expr {$i -1}] + } + {""} { + #review - we should perhaps return double_dquote instead? 
+ #tomlish::parse::set_token_waiting type literal value "" complete 1 + tomlish::parse::set_token_waiting type double_dquote value "" complete 1 startindex [expr {$i - 2}] + } + {"""} { + #### + #if already an eof in token_waiting - set_token_waiting will insert before it + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 3}] + } + {""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 4}] + #todo integrate left dquote with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {"}] + } + MULTISTRING { + #empty + lappend v($parentlevel) [list STRINGPART {"}] + } + default { + error "--- don't know how to integrate extra trailing dquote with data $v($parentlevel)" + } + } + } + {"""""} { + tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i-5}] + #todo integrate left 2 dquotes with nest data at this level + set lastpart [lindex $v($parentlevel) end] + switch -- [lindex $lastpart 0] { + STRINGPART { + set newval "[lindex $lastpart 1]\"\"" + set parentdata $v($parentlevel) + lset parentdata end [list STRINGPART $newval] + set v($parentlevel) $parentdata + } + NEWLINE - CONT - WS { + lappend v($parentlevel) [list STRINGPART {""}] + } + MULTISTRING { + lappend v($parentlevel) [list STRINGPART {""}] + } + default { + error "--- don't know how to integrate extra trailing 2 dquotes with data $v($parentlevel)" + } + } + } + } + } + triple_dquote { + #presumably popping multistring-space + ::tomlish::log::debug "---- triple_dquote for last_space_action pop leveldata: $v($nest)" + set merged [list] + set lasttype "" + foreach part $v($nest) { + 
switch -exact -- [lindex $part 0] { + MULTISTRING { + lappend merged $part + } + STRINGPART { + if {$lasttype eq "STRINGPART"} { + set prevpart [lindex $merged end] + lset prevpart 1 [lindex $prevpart 1][lindex $part 1] + lset merged end $prevpart + } else { + lappend merged $part + } + } + CONT - WS { + lappend merged $part + } + NEWLINE { + #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here + #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. + lappend merged $part + } + default { + error "---- triple_dquote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" } } set lasttype [lindex $part 0] @@ -1809,15 +1911,12 @@ namespace eval tomlish::decode { endinlinetable { ::tomlish::log::debug "---- endinlinetable for last_space_action pop" } - endmultiquote { - ::tomlish::log::debug "---- endmultiquote for last_space_action 'pop'" - } default { error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" } } if {$do_append_to_parent} { - #e.g squote_seq does it's own appends as necessary - so won't get here + #e.g tentative_accum_squote does it's own appends as necessary - so won't get here lappend v($parentlevel) [set v($nest)] } @@ -1831,8 +1930,8 @@ namespace eval tomlish::decode { switch -exact -- $tokenType { - squote_seq_begin { - #### + tentative_trigger_squote - tentative_trigger_dquote { + #### this startok will always be tentative_accum_squote/tentative_accum_dquote starting with one accumulated squote/dquote if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { lassign [dict get $transition_info starttok] starttok_type starttok_val set next_tokenType_known 1 @@ -1840,6 +1939,16 @@ namespace eval tomlish::decode { set tok $starttok_val } } + single_squote { + #JMN - REVIEW + set next_tokenType_known 1 + ::tomlish::parse::set_tokenType "squotedkey" + set tok "" + } + triple_squote { + 
::tomlish::log::debug "---- push trigger tokenType triple_squote" + set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERALPART + } squotedkey { switch -exact -- $prevstate { table-space - itable-space { @@ -1849,6 +1958,9 @@ namespace eval tomlish::decode { #todo - check not something already waiting? tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } + triple_dquote { + set v($nest) [list MULTISTRING] ;#container for NEWLINE,STRINGPART,CONT + } dquotedkey { switch -exact -- $prevstate { table-space - itable-space { @@ -1858,7 +1970,7 @@ namespace eval tomlish::decode { #todo - check not something already waiting? tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } - XXXdquotedkey - XXXitablequotedkey { + XXXdquotedkey { #todo set v($nest) [list DQKEY $tok] ;#$tok is the keyname } @@ -1878,34 +1990,29 @@ namespace eval tomlish::decode { tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } } - startsquote { - #JMN - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } tablename { #note: we do not use the output of tomlish::tablename_trim to produce a tablename for storage in the tomlish list! #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish # back to toml file will be identical. #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, - # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names. 
+ # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names from + # a structural perspective. #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the # tomlish list? - set test_only [::tomlish::utils::tablename_trim $tok] - ::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$test_only'" + #set trimtable [::tomlish::utils::tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$trimtable'" set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name #note also that equivalent tablenames may have different toml representations even after being trimmed! #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. } tablearrayname { - set test_only [::tomlish::utils::tablename_trim $tok] - puts stdout "trimmed (but not normalized) tablearrayname: '$test_only'" + #set trimtable [::tomlish::utils::tablename_trim $tok] + #::tomlish::log::debug "---- trimmed (but not normalized) tablearrayname: '$trimtable'" set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name } startarray { @@ -1914,14 +2021,6 @@ namespace eval tomlish::decode { startinlinetable { set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. 
} - startmultiquote { - ::tomlish::log::debug "---- push trigger tokenType startmultiquote" - set v($nest) [list MULTISTRING] ;#container for STRINGPART, WS, CONT, NEWLINE - } - triple_squote { - ::tomlish::log::debug "---- push trigger tokenType triple_squote" - set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERAL - } default { error "---- push trigger tokenType '$tokenType' not yet implemented" } @@ -1931,11 +2030,11 @@ namespace eval tomlish::decode { #no space level change switch -exact -- $tokenType { squotedkey { - puts "---- squotedkey in state $prevstate (no space level change)" + #puts "---- squotedkey in state $prevstate (no space level change)" lappend v($nest) [list SQKEY $tok] } dquotedkey { - puts "---- dquotedkey in state $prevstate (no space level change)" + #puts "---- dquotedkey in state $prevstate (no space level change)" lappend v($nest) [list DQKEY $tok] } barekey { @@ -1960,29 +2059,46 @@ namespace eval tomlish::decode { startinlinetable { puts stderr "---- decode::toml error. 
did not expect startinlinetable without space level change (no space level change)" } - startquote { + single_dquote { switch -exact -- $newstate { string-state { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "string" set tok "" } - quoted-key { + dquoted-key { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "dquotedkey" set tok "" } - XXXitable-quoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "itablequotedkey" - set tok "" + multistring-space { + lappend v($nest) [list STRINGPART {"}] + #may need to be joined on pop if there are neighbouring STRINGPARTS + } + default { + error "---- single_dquote switch case not implemented for nextstate: $newstate (no space level change)" + } + } + } + double_dquote { + #leading extra quotes - test: toml_multistring_startquote2 + switch -exact -- $prevstate { + itable-keyval-value-expected - keyval-value-expected { + puts stderr "tomlish::decode::toml double_dquote TEST" + #empty string + lappend v($nest) [list STRINGPART ""] + } + multistring-space { + #multistring-space to multistring-space + lappend v($nest) [list STRINGPART {""}] } default { - error "---- startquote switch case not implemented for nextstate: $newstate (no space level change)" + error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" } } + } - startsquote { + single_squote { switch -exact -- $newstate { literal-state { set next_tokenType_known 1 @@ -1995,41 +2111,17 @@ namespace eval tomlish::decode { set tok "" } multiliteral-space { - #false alarm squote returned from squote_seq pop + #false alarm squote returned from tentative_accum_squote pop ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" #(single squote - not terminating space) lappend v($nest) [list LITERALPART '] #may need to be joined on pop if there are neighbouring LITERALPARTs } default { 
- error "---- startsquote switch case not implemented for nextstate: $newstate (no space level change)" + error "---- single_squote switch case not implemented for nextstate: $newstate (no space level change)" } } } - startmultiquote { - #review - puts stderr "---- got startmultiquote in state $prevstate (no space level change)" - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "stringpart" - set tok "" - } - endquote { - #nothing to do? - set tok "" - } - endsquote { - set tok "" - } - endmultiquote { - #JMN!! - set tok "" - } - string { - lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes - } - literal { - lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes - } double_squote { switch -exact -- $prevstate { keyval-value-expected { @@ -2044,6 +2136,19 @@ namespace eval tomlish::decode { } } } + enddquote { + #nothing to do? + set tok "" + } + endsquote { + set tok "" + } + string { + lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes + } + literal { + lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes + } multistring { #review lappend v($nest) [list MULTISTRING $tok] @@ -2056,11 +2161,9 @@ namespace eval tomlish::decode { } literalpart { lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly - } - itablequotedkey { - } untyped_value { + #would be better termed unclassified_value #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. 
if {$tok in {true false}} { set tag BOOL @@ -2238,7 +2341,7 @@ namespace eval tomlish::utils { #eg {dog."tater.man"} set sLen [tcl::string::length $tablename] set segments [list] - set mode "unknown" ;#5 modes: unknown, quoted,litquoted, unquoted, syntax + set mode "preval" ;#5 modes: preval, quoted,litquoted, unquoted, postval #quoted is for double-quotes, litquoted is for single-quotes (string literal) set seg "" for {set i 0} {$i < $sLen} {incr i} { @@ -2249,139 +2352,166 @@ namespace eval tomlish::utils { set lastChar "" } + #todo - track\count backslashes properly + set c [tcl::string::index $tablename $i] + if {$c eq "\""} { + if {($lastChar eq "\\")} { + #not strictly correct - we could have had an even number prior-backslash sequence + #the toml spec would have us error out immediately on bsl in bad location - but we're + #trying to parse to unvalidated tomlish + set ctest escq + } else { + set ctest dq + } + } else { + set ctest [string map [list " " sp \t tab] $c] + } - if {$c eq "."} { - switch -exact -- $mode { - unquoted { - #dot marks end of segment. - lappend segments $seg - set seg "" - set mode "unknown" - } - quoted { - append seg $c - } - unknown { - lappend segments $seg - set seg "" - } - litquoted { - append seg $c - } - default { - #mode: syntax - #we got our dot. - the syntax mode is now satisfied. - set mode "unknown" + switch -- $ctest { + . { + switch -exact -- $mode { + preval { + error "tablename_split. dot not allowed - expecting a value" + } + unquoted { + #dot marks end of segment. + #if {![is_barekey $seg]} { + # error "tablename_split. 
dot not allowed - expecting a value" + #} + lappend segments $seg + set seg "" + set mode "preval" + } + quoted { + append seg $c + } + litquoted { + append seg $c + } + postval { + #got dot in an expected location + set mode "preval" + } } } - } elseif {($c eq "\"") && ($lastChar ne "\\")} { - if {$mode eq "unknown"} { - if {[tcl::string::trim $seg] ne ""} { - #we don't allow a quote in the middle of a bare key - error "tablename_split. character '\"' invalid at this point in tablename. tablename: '$tablename'" - } - set mode "quoted" - set seg "\"" - } elseif {$mode eq "unquoted"} { - append seg $c - } elseif {$mode eq "quoted"} { - append seg $c - - if {$normalize} { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - } else { - lappend segments $seg + dq { + #unescaped dquote + switch -- $mode { + preval { + set mode "quoted" + set seg "\"" + } + unquoted { + #invalid in barekey - but we are after structure only + append seg $c + } + quoted { + append seg $c + if {$normalize} { + lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" ;#make sure we only accept a dot or end-of-data now. + } + litquoted { + append seg $c + } + postval { + error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" + } } - - set seg "" - set mode "syntax" ;#make sure we only accept a dot or end-of-data now. - } elseif {$mode eq "litquoted"} { - append seg $c - } elseif {$mode eq "syntax"} { - error "tablename_split. expected whitespace or dot, got double quote. 
tablename: '$tablename'" - } - } elseif {($c eq "\'")} { - if {$mode eq "unknown"} { - append seg $c - set mode "litquoted" - } elseif {$mode eq "unquoted"} { - #single quote inside e.g o'neill - append seg $c - } elseif {$mode eq "quoted"} { - append seg $c - - } elseif {$mode eq "litquoted"} { - append seg $c - #no normalization to do - lappend segments $seg - set seg "" - set mode "syntax" - } elseif {$mode eq "syntax"} { - error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" } - - } elseif {$c in [list " " \t]} { - if {$mode eq "syntax"} { - #ignore - } else { - append seg $c + ' { + switch -- $mode { + preval { + append seg $c + set mode "litquoted" + } + unquoted { + #single quote inside e.g o'neill - ultimately invalid - but we pass through here. + append seg $c + } + quoted { + append seg $c + } + litquoted { + append seg $c + #no normalization to do aside from stripping squotes + if {$normalize} { + lappend segments [tcl::string::range $seg 1 end-1] + } else { + lappend segments $seg + } + set seg "" + set mode "postval" + } + postval { + error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" + } + } } - } else { - if {$mode eq "syntax"} { - error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" + sp - tab { + switch -- $mode { + preval - postval { + #ignore + } + unquoted { + #terminates a barekey + lappend segments $seg + set seg "" + set mode "postval" + } + default { + #append to quoted or litquoted + append seg $c + } + } } - if {$mode eq "unknown"} { - set mode "unquoted" + default { + switch -- $mode { + preval { + set mode unquoted + append seg $c + } + postval { + error "tablename_split. Expected a dot separator. got '$c'. 
tablename: '$tablename'" + } + default { + append seg $c + } + } } - append seg $c } + if {$i == $sLen-1} { #end of data ::tomlish::log::debug "End of data: mode='$mode'" - #REVIEW - we can only end up in unquoted or syntax here? are other branches reachable? switch -exact -- $mode { - quoted { - if {$c ne "\""} { - error "tablename_split. missing closing double-quote in a segment. tablename: '$tablename'" - } - if {$normalize} { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - #lappend segments [subst -nocommands -novariables [::string range $seg 1 end-1]] ;#wrong - } else { - lappend segments $seg - } + preval { + error "tablename_split. Expected a value after last dot separator. tablename: '$tablename'" } - litquoted { - set trimmed_seg [tcl::string::trim $seg] - if {[tcl::string::index $trimmed_seg end] ne "\'"} { - error "tablename_split. missing closing single-quote in a segment. tablename: '$tablename'" - } + unquoted { lappend segments $seg } - unquoted - unknown { - lappend segments $seg + quoted { + error "tablename_split. Expected a trailing double quote. tablename: '$tablename'" } - syntax { - #ok - segment already lappended + litquoted { + error "tablename_split. Expected a trailing single quote. tablename: '$tablename'" } - default { - lappend segments $seg + postval { + #ok - segment already lappended } } } } - foreach seg $segments { - set trimmed [tcl::string::trim $seg " \t"] - #note - we explicitly allow 'empty' quoted strings '' & "" - # (these are 'discouraged' but valid toml keys) - #if {$trimmed in [list "''" "\"\""]} { - # puts stderr "tablename_split. warning - Empty quoted string as tablename segment" - #} - if {$trimmed eq "" } { - error "tablename_split. Empty segment found. 
tablename: '$tablename' segments [llength $segments] ($segments)" - } - } + + #note - we must allow 'empty' quoted strings '' & "" + # (these are 'discouraged' but valid toml keys) + return $segments } @@ -2432,26 +2562,34 @@ namespace eval tomlish::utils { #- escape_string and unescape_string would not be reliably roundtrippable inverses anyway. #REVIEW - provide it anyway? When would it be desirable to use? - variable Bstring_control_map [list\ - \b {\b}\ - \n {\n}\ - \r {\r}\ - \" {\"}\ - \x1b {\e}\ - \\ "\\\\"\ - ] + variable Bstring_control_map [dict create] + dict set Bstring_control_map \b {\b} + dict set Bstring_control_map \n {\n} + dict set Bstring_control_map \r {\r} + dict set Bstring_control_map \" {\"} + #dict set Bstring_control_map \x1b {\e} ;#should presumably be only be a convenience for decode - going the other way we get \u001B + dict set Bstring_control_map \\ "\\\\" + #\e for \x1b seems like it might be included - v1.1?? hard to find current state of where toml is going :/ #for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped. - for {set cdec 0} {$cdec <= 8} {incr cdec} { + #8 = \b - already in list. 
+ #built the remainder whilst checking for entries already hardcoded above -in case more are added to the hardcoded list + for {set cdec 0} {$cdec <= 7} {incr cdec} { set hhhh [format %.4X $cdec] - lappend Bstring_control_map [format %c $cdec] \\u$hhhh + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } } for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { set hhhh [format %.4X $cdec] - lappend Bstring_control_map [format %c $cdec] \\u$hhhh + set char [format %c $cdec] + if {![dict exists $Bstring_control_map $char]} { + dict set Bstring_control_map $char \\u$hhhh + } } # \u007F = 127 - lappend Bstring_control_map [format %c 127] \\u007F + dict set Bstring_control_map [format %c 127] \\u007F #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) @@ -2474,6 +2612,7 @@ namespace eval tomlish::utils { # it recognizes other escapes which aren't approprite e.g \xhh and octal \nnn # it replaces \ with a single whitespace (trailing backslash) #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh + #plus \e for \x1b? 
set buffer "" set buffer4 "" ;#buffer for 4 hex characters following a \u @@ -2558,12 +2697,13 @@ namespace eval tomlish::utils { set ctest [tcl::string::map {{"} dq} $c] switch -exact -- $ctest { dq { - set e "\\\"" - append buffer [subst -nocommand -novariable $e] + append buffer {"} } b - t - n - f - r { - set e "\\$c" - append buffer [subst -nocommand -novariable $e] + append buffer [subst -nocommand -novariable "\\$c"] + } + e { + append buffer \x1b } u { set unicode4_active 1 @@ -2578,8 +2718,7 @@ namespace eval tomlish::utils { #review - toml spec says all other escapes are reserved #and if they are used TOML should produce an error. #we leave detecting this for caller for now - REVIEW - append buffer "\\" - append buffer $c + append buffer "\\$c" } } } else { @@ -3003,7 +3142,7 @@ namespace eval tomlish::parse { # states: # table-space, itable-space, array-space # array-value-expected,keyval-value-expected,itable-keyval-value-expected, keyval-syntax, - # quoted-key, squoted-key + # dquoted-key, squoted-key # string-state, literal-state, multistring... # # notes: @@ -3039,6 +3178,12 @@ namespace eval tomlish::parse { variable stateMatrix set stateMatrix [dict create] + #--------------------------------------------------------- + #WARNING + #The stateMatrix implementation here is currently messy. + #The code is a mixture of declarative via the stateMatrix and imperative via switch statements during PUSH/POP/SAMESPACE transitions. + #This means the state behaviour has to be reasoned about by looking at both in conjuction. 
+ #--------------------------------------------------------- #xxx-space vs xxx-syntax inadequately documented - TODO @@ -3060,35 +3205,19 @@ namespace eval tomlish::parse { barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ dquotedkey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ - XXXstartquote "quoted-key"\ - XXXstartsquote "squoted-key"\ + XXXsingle_dquote "quoted-key"\ + XXXsingle_squote "squoted-key"\ comment "table-space"\ starttablename "tablename-state"\ starttablearrayname "tablearrayname-state"\ - startmultiquote "err-state"\ - endquote "err-state"\ + enddquote "err-state"\ + endsquote "err-state"\ comma "err-state"\ eof "end-state"\ equal "err-state"\ cr "err-lonecr"\ } - #itable-space/ curly-syntax : itables - dict set stateMatrix\ - itable-space {\ - whitespace "itable-space"\ - newline "itable-space"\ - barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - endinlinetable "POPSPACE"\ - XXXstartquote "quoted-key"\ - XXXstartsquote {TOSTATE "squoted-key" comment "jn-testing"}\ - comma "err-state"\ - comment "itable-space"\ - eof "err-state"\ - } - #squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-space starttok {squote_seq "'"}} dict set stateMatrix\ @@ -3113,26 +3242,19 @@ namespace eval tomlish::parse { dict set stateMatrix\ keyval-value-expected {\ whitespace "keyval-value-expected"\ - untyped_value {TOSTATE "keyval-tail" note ""}\ - startquote {TOSTATE "string-state" returnstate keyval-tail}\ - startmultiquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-value-expected starttok {squote_seq "'"}}\ - startsquote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is 
triggered"}\ - double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"}\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ - startarray {PUSHSPACE array-space returnstate keyval-tail}\ - } - #squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-process-leading-squotes starttok {squote_seq "'"}} - dict set stateMatrix\ - leading-squote-space {\ - squote_seq "POPSPACE"\ + untyped_value {TOSTATE "keyval-tail" note ""}\ + literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ + string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ + single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ + single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ + triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ + startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ + startarray {PUSHSPACE array-space returnstate keyval-tail}\ } - #dict set stateMatrix\ - # keyval-process-leading-squotes {\ - # startsquote "literal-state"\ - # triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - # } + #double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"} + + #2025 - no leading-squote-space - only trailing-squote-space. 
dict set stateMatrix\ keyval-tail {\ @@ -3142,81 +3264,106 @@ namespace eval tomlish::parse { eof "end-state"\ } + + #itable-space/ curly-syntax : itables + # x={y=1,} + dict set stateMatrix\ + itable-space {\ + whitespace "itable-space"\ + newline "itable-space"\ + barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ + endinlinetable "POPSPACE"\ + comma "err-state"\ + comment "itable-space"\ + eof "err-state"\ + } + #we don't get single_squote etc here - instead we get the resulting squotedkey token + + + # ??? review - something like this + # + # x={y =1,} dict set stateMatrix\ itable-keyval-syntax {\ - whitespace "itable-keyval-syntax"\ - barekey {PUSHSPACE "dottedkey-space"}\ - squotedkey {PUSHSPACE "dottedkey-space"}\ - dquotedkey {PUSHSPACE "dottedkey-space"}\ - equal "itable-keyval-value-expected"\ + whitespace {TOSTATE "itable-keyval-syntax"}\ + barekey {PUSHSPACE "dottedkey-space"}\ + squotedkey {PUSHSPACE "dottedkey-space"}\ + dquotedkey {PUSHSPACE "dottedkey-space"}\ + equal {TOSTATE "itable-keyval-value-expected"}\ newline "err-state"\ eof "err-state"\ } + + # x={y=1} + dict set stateMatrix\ + itable-keyval-space {\ + whitespace "itable-keyval-syntax"\ + equal {TOSTATE "itable-keyval-value-expected" note "required"}\ + } + dict set stateMatrix\ itable-keyval-value-expected {\ whitespace "itable-keyval-value-expected"\ untyped_value {TOSTATE "itable-val-tail" note ""}\ - startquote {TOSTATE "string-state" returnstate itable-val-tail}\ - startmultiquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-keyval-value-expected starttok {squote_seq "'"}}\ - startsquote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ - double_squote {TOSTATE "itable-val-tail" 
note "empty literal received when double squote occurs"}\ + single_dquote {TOSTATE "string-state" returnstate itable-val-tail}\ + triple_dquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ + single_squote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ } - dict set stateMatrix\ - itable-keyval-space {\ - whitespace "itable-keyval-syntax"\ - equal {TOSTATE "itable-keyval-value-expected" note "required"}\ - } + #double_squote not currently generated by _start_squote_sequence - '' processed as single_squote to literal-state just like 'xxx' + # review + # double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"} + + + # x={y=1,z="x"} + #POPSPACE is transition from itable-keyval-space to parent itable-space dict set stateMatrix\ itable-val-tail {\ whitespace "itable-val-tail"\ endinlinetable "POPSPACE"\ comma "POPSPACE"\ - XXXnewline {TOSTATE "itable-val-tail" note "itable-space ??"}\ - newline "POPSPACE"\ + newline {TOSTATE "itable-val-tail" note "itable-space ??"}\ comment "itable-val-tail"\ eof "err-state"\ } - #dict set stateMatrix\ - # itable-quoted-key {\ - # whitespace "NA"\ - # itablequotedkey {PUSHSPACE "itable-keyval-space"}\ - # newline "err-state"\ - # endquote "itable-keyval-syntax"\ - # } - #dict set stateMatrix\ - # itable-squoted-key {\ - # whitespace "NA"\ - # itablesquotedkey {PUSHSPACE "itable-keyval-space"}\ - # newline "err-state"\ - # endsquote "itable-keyval-syntax"\ - # } + # XXXnewline "POPSPACE" + # We shouldn't popspace on newline - as if there was no comma we need to stay in itable-val-tail + # This means the newline and subsequent whitespace, comments etc become part of the preceeding dottedkey record + #e.g + # x = { + # j=1 + # 
#comment within dottedkey j record + # , # comment unattached + # #comment unattached + # k=2 , #comment unattached + # l=3 #comment within l record + # , m=4 + # #comment associated with m record + # + # #still associated with m record + # } + ## - This doesn't quite correspond to what a user might expect - but seems like a consistent mechanism. + #The awkwardness is because there is no way to put in a comment that doesn't consume a trailing comma + #so we cant do: j= 1 #comment for j1 , + # and have the trailing comma recognised. + # + # To associate: j= 1, #comment for j1 + # we would need some extra processing . (not popping until next key ? extra state itable-sep-tail?) REVIEW - worth doing? + # + # The same issue occurs with multiline arrays. The most natural assumption is that a comment on same line after a comma + # is 'associated' with the previous entry. + # + # These comment issues are independent of the data dictionary being generated for conversion to json etc - as the comments don't carry through anyway, + # but are a potential oddity for manipulating the intermediate tomlish structure whilst attempting to preserve 'associated' comments + # (e.g reordering records within an itable) + #The user's intention for 'associated' isn't always clear and the specs don't really guide on this. - - - #array-value-expected ? - dict set stateMatrix\ - XXXvalue-expected {\ - whitespace "value-expected"\ - untyped_value {"SAMESPACE" "" replay untyped_value}\ - startquote "string-state"\ - startsquote "literal-state"\ - triple_squote {PUSHSPACE "multiliteral-space"}\ - startmultiquote {PUSHSPACE "multistring-space"}\ - startinlinetable {PUSHSPACE itable-space}\ - startarray {PUSHSPACE array-space}\ - comment "err-state-value-expected-got-comment"\ - comma "err-state"\ - newline "err-state"\ - eof "err-state"\ - } - #note comment token should never be delivered to array-value-expected state? 
- #dottedkey-space is not (currently) used within [tablename] or [[tablearrayname]] #it is for keyval ie x.y.z = value @@ -3245,6 +3392,8 @@ namespace eval tomlish::parse { whitespace "dottedkey-space-tail" dotsep "dottedkey-space" equal "POPSPACE"\ + eof "err-state"\ + newline "err-state"\ } #-------------------------------------------------------------------------- @@ -3262,22 +3411,10 @@ namespace eval tomlish::parse { #toml spec looks like heading towards allowing newlines within inline tables #https://github.com/toml-lang/toml/issues/781 - #2025 - appears to be valid for 1.1 - which we are targeting. + #2025 - multiline itables appear to be valid for 1.1 - which we are targeting. #https://github.com/toml-lang/toml/blob/main/toml.md#inline-table #JMN2025 - #dict set stateMatrix\ - # curly-syntax {\ - # whitespace "curly-syntax"\ - # newline "curly-syntax"\ - # barekey {PUSHSPACE "itable-keyval-space"}\ - # itablequotedkey "itable-keyval-space"\ - # endinlinetable "POPSPACE"\ - # startquote "itable-quoted-key"\ - # comma "itable-space"\ - # comment "itable-space"\ - # eof "err-state"\ - # } #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES #We currently allow multiline ITABLES (also with comments) in the tokenizer. #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? 
@@ -3291,10 +3428,9 @@ namespace eval tomlish::parse { # untyped_value "SAMESPACE"\ # startarray {PUSHSPACE "array-space"}\ # endarray "POPSPACE"\ - # startmultiquote {PUSHSPACE multistring-space}\ # startinlinetable {PUSHSPACE itable-space}\ - # startquote "string-state"\ - # startsquote "literal-state"\ + # single_dquote "string-state"\ + # single_squote "literal-state"\ # triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ # comma "array-space"\ # comment "array-space"\ @@ -3305,15 +3441,16 @@ namespace eval tomlish::parse { set aspace [dict create] dict set aspace whitespace "array-space" dict set aspace newline "array-space" - dict set aspace untyped_value "SAMESPACE" + #dict set aspace untyped_value "SAMESPACE" + dict set aspace untyped_value "array-syntax" dict set aspace startarray {PUSHSPACE "array-space"} dict set aspace endarray "POPSPACE" - dict set aspace startmultiquote {PUSHSPACE multistring-space} + dict set aspace single_dquote {TOSTATE "string-state" returnstate array-syntax} + dict set aspace triple_dquote {PUSHSPACE "multistring-space" returnstate array-syntax} + dict set aspace single_squote {TOSTATE "literal-state" returnstate array-syntax} + dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax} dict set aspace startinlinetable {PUSHSPACE itable-space} - dict set aspace startquote "string-state" - dict set aspace startsquote "literal-state" - dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"} - dict set aspace comma "array-space" + #dict set aspace comma "array-space" dict set aspace comment "array-space" dict set aspace eof "err-state-array-space-got-eof" dict set stateMatrix array-space $aspace @@ -3329,26 +3466,16 @@ namespace eval tomlish::parse { #dict set asyntax untyped_value "SAMESPACE" #dict set asyntax startarray {PUSHSPACE array-space} dict set asyntax endarray "POPSPACE" - #dict set asyntax 
startmultiquote {PUSHSPACE multistring-space} - #dict set asyntax startquote "string-state" - #dict set asyntax startsquote "literal-state" + #dict set asyntax single_dquote "string-state" + #dict set asyntax single_squote "literal-state" dict set asyntax comma "array-space" dict set asyntax comment "array-syntax" dict set stateMatrix array-syntax $asyntax - #quoted-key & squoted-key need to PUSHSPACE from own token to keyval-space - dict set stateMatrix\ - quoted-key {\ - whitespace "NA"\ - dquotedkey {PUSHSPACE "keyval-space"}\ - newline "err-state"\ - endquote "keyval-syntax"\ - } - - #review + #dquotedkey is a token - dquoted-key is a state dict set stateMatrix\ dquoted-key {\ whitespace "NA"\ @@ -3367,7 +3494,7 @@ namespace eval tomlish::parse { string-state {\ whitespace "NA"\ string "string-state"\ - endquote "SAMESPACE"\ + enddquote "SAMESPACE"\ newline "err-state"\ eof "err-state"\ } @@ -3381,20 +3508,21 @@ namespace eval tomlish::parse { } - #dict set stateMatrix\ - # stringpart {\ - # continuation "SAMESPACE"\ - # endmultiquote "POPSPACE"\ - # eof "err-state"\ - # } dict set stateMatrix\ multistring-space {\ - whitespace "multistring-space"\ - continuation "multistring-space"\ - stringpart "multistring-space"\ - newline "multistring-space"\ - endmultiquote "POPSPACE"\ - eof "err-state"\ + whitespace "multistring-space"\ + continuation "multistring-space"\ + stringpart "multistring-space"\ + newline "multistring-space"\ + tentative_trigger_dquote {PUSHSPACE "trailing-dquote-space" returnstate multistring-space starttok {tentative_accum_dquote {"}}}\ + single_dquote {TOSTATE multistring-space}\ + double_dquote {TOSTATE multistring-space}\ + triple_dquote {POPSPACE}\ + eof "err-state"\ + } + dict set stateMatrix\ + trailing-dquote-space { + tentative_accum_dquote "POPSPACE" } @@ -3402,19 +3530,19 @@ namespace eval tomlish::parse { #todo - treat sole cr as part of literalpart but crlf and lf as newline dict set stateMatrix\ multiliteral-space {\ - 
literalpart "multiliteral-space"\ - newline "multiliteral-space"\ - squote_seq_begin {PUSHSPACE "trailing-squote-space" returnstate multiliteral-space starttok {squote_seq "'"}}\ - triple_squote {POPSPACE note "on popping - we do any necessary concatenation of LITERALPART items due to squote processing"}\ - double_squote {TOSTATE multiliteral-space note "short squote_seq: can occur anywhere in the space e.g emitted at end when 5 squotes occur"}\ - startsquote {TOSTATE multiliteral-space note "short squote_seq: same as double_squote - false alarm"}\ - eof "err-premature-eof-in-multiliteral-space"\ + literalpart "multiliteral-space"\ + newline "multiliteral-space"\ + tentative_trigger_squote {PUSHSPACE "trailing-squote-space" returnstate multiliteral-space starttok {tentative_accum_squote "'"}}\ + single_squote {TOSTATE multiliteral-space note "short tentative_accum_squote: false alarm this squote is part of data"}\ + double_squote {TOSTATE multiliteral-space note "short tentative_accum_squote: can occur anywhere in the space e.g emitted at end when 5 squotes occur"}\ + triple_squote {POPSPACE note "on popping - we do any necessary concatenation of LITERALPART items due to squote processing"}\ + eof "err-premature-eof-in-multiliteral-space"\ } #trailing because we are looking for possible terminating ''' - but must accept '''' or ''''' and re-integrate the 1st one or 2 extra squotes dict set stateMatrix\ - trailing-squote-space {\ - squote_seq "POPSPACE"\ + trailing-squote-space { + tentative_accum_squote "POPSPACE" } @@ -3499,7 +3627,7 @@ namespace eval tomlish::parse { - + dict set stateMatrix\ end-state {} @@ -3557,14 +3685,13 @@ namespace eval tomlish::parse { dict set spacePushTransitions itable-keyval-space itable-keyval-syntax dict set spacePushTransitions array-space array-space dict set spacePushTransitions table-space tablename-state - dict set spacePushTransitions #itable-space itable-space + #dict set spacePushTransitions #itable-space itable-space #Pop 
to, next variable spacePopTransitions [dict create] dict set spacePopTransitions array-space array-syntax - #itable-space curly-syntax #itable-keyval-space itable-val-tail #review #we pop to keyval-space from dottedkey-space or from keyval-value-expected? we don't always want to go to keyval-tail @@ -3575,7 +3702,6 @@ namespace eval tomlish::parse { #JMN test #dict set spaceSameTransitions array-space array-syntax - #itable-space curly-syntax #itable-keyval-space itable-val-tail @@ -3611,6 +3737,8 @@ namespace eval tomlish::parse { ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" switch -exact -- [lindex $transition_to 0] { POPSPACE { + set popfromspace_info [spacestack peek] + set popfromspace_state [dict get $popfromspace_info state] spacestack pop set parent_info [spacestack peek] set type [dict get $parent_info type] @@ -3625,17 +3753,17 @@ namespace eval tomlish::parse { set existing [spacestack pop] dict unset existing returnstate spacestack push $existing ;#re-push modification - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected to stored returnstate $next <<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected to stored returnstate $next <<---" } else { ### #review - do away with spacePopTransitions - which although useful to provide a default.. 
# - involve error-prone configurations distant to the main state transition configuration in stateMatrix if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" } else { set next $parentspace - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace<<---" + ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace<<---" } } set result $next @@ -3805,22 +3933,6 @@ namespace eval tomlish::parse { return $tokenType } - proc _shortcircuit_startquotesequence {} { - variable tok - variable i - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - set_tokenType "startquote" - incr i -1 - return -level 2 1 - } elseif {$toklen == 2} { - puts stderr "_shortcircuit_startquotesequence toklen 2" - set_tokenType "startquote" - set tok "\"" - incr i -2 - return -level 2 1 - } - } proc get_token_waiting {} { variable token_waiting @@ -3940,7 +4052,6 @@ namespace eval tomlish::parse { set slash_active 0 set quote 0 set c "" - set multi_dquote "" for {} {$i < $sLen} {} { if {$i > 0} { set lastChar [tcl::string::index $s [expr {$i - 1}]] @@ -3957,8 +4068,6 @@ namespace eval tomlish::parse { switch -exact -- $ctest { # { - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 @@ -3966,16 +4075,20 @@ namespace eval tomlish::parse { if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { + #for multiliteral, multistring - data and/or end incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } 
_start_squote_sequence { + #pseudo token beginning with underscore - never returned to state machine - review incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] ;#rewind by the buffered dquote count so they are reprocessed - same as _start_squote_sequence + set_tokenType "single_dquote" return 1 } barekey { @@ -4003,7 +4116,7 @@ namespace eval tomlish::parse { append tok $c } default { - #dquotedkey, itablequotedkey, string,literal, multistring + #dquotedkey, string,literal, multistring append tok $c } } @@ -4015,7 +4128,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes#" + append tok "#" } multiliteral-space { set_tokenType "literalpart" @@ -4031,23 +4144,23 @@ namespace eval tomlish::parse { } lc { #left curly brace - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] ;#rewind by the buffered dquote count so they are reprocessed - same as _start_squote_sequence + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { @@ -4059,7 +4172,7 @@ namespace eval tomlish::parse { } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename - starttablearrayname { #*bare* tablename can only contain letters,digits underscores @@ -4105,7 +4218,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\{" + append tok "\{" } multiliteral-space { set_tokenType "literalpart" @@ -4120,37 +4233,35 @@ namespace eval tomlish::parse { } rc { #right curly brace - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set 
slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey - comment { + string - dquotedkey - comment { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename - tablename { if {$had_slash} {append tok "\\"} @@ -4221,7 +4332,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\}" + append tok "\}" } multiliteral-space { set_tokenType "literalpart" ; #review @@ -4237,35 +4348,35 @@ namespace eval tomlish::parse { } lb { #left square bracket - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } starttablename { #change the tokenType @@ -4332,7 
+4443,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\[" + append tok "\[" } multiliteral-space { set_tokenType "literalpart" @@ -4350,37 +4461,35 @@ namespace eval tomlish::parse { } rb { #right square bracket - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } comment { if {$had_slash} {append tok "\\"} @@ -4428,16 +4537,6 @@ namespace eval tomlish::parse { } } } - XXXtablearraynames { - puts "rb @ tablearraynames ??" - #switch? - - #todo? - if {$had_slash} {append tok "\\"} - #invalid! - but leave for datastructure loading stage to catch - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex - return 1 - } default { incr i -1 return 1 @@ -4485,7 +4584,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok "$dquotes\]" + append tok "\]" } multiliteral-space { set_tokenType "literalpart" @@ -4498,21 +4597,21 @@ namespace eval tomlish::parse { } } bsl { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
#backslash if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } whitespace { @@ -4529,9 +4628,7 @@ namespace eval tomlish::parse { append tok "\\" set slash_active 0 } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey - comment { + string - dquotedkey - comment { if {$slash_active} { set slash_active 0 append tok "\\\\" @@ -4545,7 +4642,6 @@ namespace eval tomlish::parse { set slash_active 0 append tok "\\\\" } else { - append tok $dquotes set slash_active 1 } } @@ -4575,10 +4671,6 @@ namespace eval tomlish::parse { set tok "\\\\" set slash_active 0 } else { - if {$dquotes ne ""} { - set_tokenType "stringpart" - set tok $dquotes - } set slash_active 1 } } @@ -4599,58 +4691,56 @@ namespace eval tomlish::parse { set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { - #short squote_seq tokens are returned if active during any other character + tentative_accum_squote { + #for within multiliteral + #short tentative_accum_squote tokens are returned if active upon receipt of any other character #longest allowable for leading/trailing are returned here #### set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote - switch -- $state { - leading-squote-space { - append tok $c - if {$existingtoklen > 2} { - error "tomlish tok error: squote_seq unexpected length $existingtoklen when another received" - } elseif {$existingtoklen == 2} { - return 1 ;#return tok ''' - } - } - trailing-squote-space { - append tok $c - if {$existingtoklen == 4} { - #maxlen to be an squote_seq is multisquote + 
2 = 5 - #return tok ''''' - return 1 - } - } - default { - error "tomlish tok error: squote_seq in unexpected state '$state' - expected leading-squote-space or trailing-squote-space" - } + #assert state = trailing-squote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_squote is multisquote + 2 = 5 + #return tok with value ''''' + return 1 } } - whitespace { - #end whitespace - incr i -1 ;#reprocess sq + tentative_accum_dquote { + incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { - #temp token creatable only during value-expected or array-space + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space switch -- [tcl::string::length $tok] { 1 { + #no conclusion can yet be reached append tok $c } 2 { + #enter multiliteral #switch? append tok $c set_tokenType triple_squote return 1 } default { + #if there are more than 3 leading squotes we also enter multiliteral space and the subsequent ones are handled + #by the tentative_accum_squote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 squotes as data. 
error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" } } } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" + return 1 + } + whitespace { + #end whitespace + incr i -1 ;#reprocess sq + return 1 + } literal { #slash_active always false #terminate the literal @@ -4663,7 +4753,7 @@ namespace eval tomlish::parse { # idea: end this literalpart (possibly 'temporarily') # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) - incr i -1 ;#throw the "'" back to loop - will be added to an squote_seq token for later processing + incr i -1 ;#throw the "'" back to loop - will be added to a tentative_accum_squote token for later processing return 1 } XXXitablesquotedkey { @@ -4684,7 +4774,11 @@ namespace eval tomlish::parse { append tok $c } barekey { - #not clear why o'shennanigan shouldn't be a legal barekey - but it seems not to be. + #barekeys now support all sorts of unicode letter/number chars for other cultures + #but not punctuation - not even for those of Irish heritage who don't object + #to the anglicised form of some names. + # o'shenanigan seems to not be a legal barekey + #The Irish will have to use an earlier form Ó - which apparently many may prefer anyway. error "tomlish Unexpected single quote during barekey. 
[tomlish::parse::report_line]" } default { @@ -4693,63 +4787,69 @@ namespace eval tomlish::parse { } } else { switch -exact -- $state { - array-space { + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading squote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_squote token or triple_squote token + #It currently doesn't trigger double_squote token + #(handle '' same as 'x' ie produce a single_squote and go into processing literal) + #review - producing double_squote for empty literal may be slightly more efficient. + #This token is not used to handle squote sequences *within* a multiliteral set_tokenType "_start_squote_sequence" set tok "'" } - itable-keyval-value-expected - keyval-value-expected { - set_tokenType "squote_seq_begin" + multiliteral-space { + #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row + #we are building up a tentative_accum_squote to determine if + #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines + #b) it is exactly ''' and we can terminate the whole multiliteral + #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_squote" ;#trigger tentative_accum_squote set tok "'" return 1 } - table-space { - #tests: squotedkey.test - set_tokenType "squotedkey" - set tok "" - } - itable-space { - #tests: squotedkey_itable.test + table-space - itable-space { + #tests: squotedkey.test squotedkey_itable.test set_tokenType "squotedkey" set tok "" } - XXXitable-space { - #future - could there be multiline keys? - #this would allow arbitrary tcl dicts to be stored in toml + XXXtable-space - XXXitable-space { + #future - could there be multiline keys? MLLKEY, MLBKEY ? 
+ #this would (almost) allow arbitrary tcl dicts to be stored in toml (aside from escaping issues) #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files - set_tokenType "squote_seq_begin" + #@2025 ABNF for toml mentions key, simple-key, unquoted-key, quoted-key and dotted-key + #where key is simple-key or dotted-key - no MLL or MLB components + #the spec states solution for arbitrary binary data is application specific involving encodings + #such as hex, base64 + set_tokenType "_start_squote_sequence" set tok "'" return 1 } tablename-state { #first char in tablename-state/tablearrayname-state - set_tokenType tablename + set_tokenType "tablename" append tok "'" } tablearrayname-state { - set_tokenType tablearrayname + set_tokenType "tablearrayname" append tok "'" } literal-state { + #shouldn't get here? review tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" - set_tokenType literal + set_tokenType "literal" incr -1 return 1 } multistring-space { - error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" - } - multiliteral-space { - #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row - #we are building up an squote_seq to determine if - #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines - #b) it is exactly ''' and we can terminate the whole multiliteral - #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space - set_tokenType "squote_seq_begin" - set tok "'" - return 1 + set_tokenType "stringpart" + set tok "" + if {$had_slash} {append tok "\\"} + append tok "'" ;#the squote is plain data inside a multistring - start the new stringpart with it + #error "tomlish unimplemented - squote during state '$state'. 
[tomlish::parse::report_line]" } dottedkey-space { - set_tokenType squotedkey + set_tokenType "squotedkey" } default { error "tomlish unhandled squote during state '$state'. [tomlish::parse::report_line]" @@ -4765,44 +4865,50 @@ namespace eval tomlish::parse { if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote { incr i -1 return 1 } - startquotesequence { - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - append tok $c - } elseif {$toklen == 2} { - append tok $c - #switch vs set? - set_tokenType "startmultiquote" - return 1 - } else { - error "tomlish unexpected token length $toklen in 'startquotesequence'" - } - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" return 1 - - #set toklen [tcl::string::length $tok] - #switch -- $toklen { - # 1 { - # set_tokenType "startsquote" - # incr i -1 - # return 1 - # } - # 2 { - # set_tokenType "startsquote" - # incr i -2 - # return 1 - # } - # default { - # error "tomlish unexpected _start_squote_sequence length $toklen" - # } - #} + } + tentative_accum_dquote { + #within multistring + #short tentative_accum_dquote tokens are returned if active upon receipt of any other character + #longest allowable for leading/trailing are returned here + #### + set existingtoklen [tcl::string::length $tok] ;#toklen prior to this dquote + #assert state = trailing-dquote-space + append tok $c + if {$existingtoklen == 4} { + #maxlen to be a tentative_accum_dquote is multidquote + 2 = 5 + #return tok with value """"" + return 1 + } + } + _start_dquote_sequence { + #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space + switch -- [tcl::string::length $tok] { + 1 { + #no conclusion can yet be reached + append tok $c + } + 2 { + #enter multistring + #switch? 
+ append tok $c + set_tokenType triple_dquote + return 1 + } + default { + #if there are more than 3 leading dquotes we also enter multistring space and the subsequent ones are handled + #by the tentative_accum_dquote check for ending sequence which can accept up to 5 and reintegrate the + #extra 1 or 2 dquotes as data. + error "tomlish unexpected token length [tcl::string::length $tok] in '_start_dquote_sequence'" + } + } } literal - literalpart { append tok $c @@ -4811,8 +4917,8 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" $c } else { - #unescaped quote always terminates a string? - set_token_waiting type endquote value "\"" complete 1 startindex $cindex + #unescaped quote always terminates a string + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex return 1 } } @@ -4821,77 +4927,31 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" $c } else { - #incr i -1 - - if {$multi_dquote eq "\"\""} { - set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex -2}] - set multi_dquote "" - return 1 - } else { - append multi_dquote "\"" - } + incr i -1 ;#throw the {"} back to loop - will be added to a tentative_accum_dquote token for later processing + return 1 } } whitespace { - switch -exact -- $state { - multistring-space { - #REVIEW - if {$had_slash} { - incr i -2 - return 1 - } else { - switch -- [tcl::string::length $multi_dquote] { - 2 { - set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex-2}] - set multi_dquote "" - return 1 - } - 1 { - incr i -2 - return 1 - } - 0 { - incr i -1 - return 1 - } - } - } - } - keyval-value-expected { - #end whitespace token and reprocess - incr i -1 - return 1 - - #if {$multi_dquote eq "\"\""} { - # set_token_waiting type startmultiquote value "\"\"\"" complete 1 - # set multi_dquote "" - # return 1 - #} else { - # #end whitespace token and reprocess - # incr i -1 - # return 1 - #} - } - table-space - 
itable-space { - incr i -1 - return 1 - } - default { - set_token_waiting type startquote value "\"" complete 1 startindex $cindex - return 1 - } + #assert: had_slash will only ever be true in multistring-space + if {$had_slash} { + incr i -2 + return 1 + } else { + #end whitespace token - throw dq back for reprocessing + incr i -1 + return 1 } } comment { if {$had_slash} {append tok "\\"} append tok $c } - XXXdquotedkey - XXXitablequotedkey { + XXXdquotedkey { if {$had_slash} { append tok "\\" append tok $c } else { - set_token_waiting type endquote value "\"" complete 1 startindex $cindex + set_token_waiting type enddquote value "\"" complete 1 startindex $cindex return 1 } } @@ -4901,7 +4961,7 @@ namespace eval tomlish::parse { append tok "\\" append tok $c } else { - #set_token_waiting type endsquote value "'" complete 1 + #set_token_waiting type enddquote value {"} complete 1 return 1 } } @@ -4924,64 +4984,40 @@ namespace eval tomlish::parse { #$slash_active not relevant when no tokenType #token is string only if we're expecting a value at this point switch -exact -- $state { - array-space { - #!? start looking for possible multistartquote - #set_tokenType startquote - #set tok $c - #return 1 - set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote - set tok $c - } - keyval-value-expected - itable-keyval-value-expected { - set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote - set tok $c + array-space - keyval-value-expected - itable-keyval-value-expected { + #leading dquote + #pseudo-token _start_squote_sequence ss not received by state machine + #This pseudotoken will trigger production of single_dquote token or triple_dquote token + #It currently doesn't trigger double_dquote token + #(handle "" same as "x" ie produce a single_dquote and go into processing string) + #review - producing double_dquote for empty string may be slightly more efficient. 
+ #This token is not used to handle dquote sequences once *within* a multistring + set_tokenType "_start_dquote_sequence" + set tok {"} } multistring-space { - #TODO - had_slash!!! - #REVIEW if {$had_slash} { set_tokenType "stringpart" set tok "\\\"" - set multi_dquote "" } else { - if {$multi_dquote eq "\"\""} { - tomlish::log::debug "- tokloop char dq ---> endmultiquote" - set_tokenType "endmultiquote" - set tok "\"\"\"" - return 1 - #set_token_waiting type endmultiquote value "\"\"\"" complete 1 - #set multi_dquote "" - #return 1 - } else { - append multi_dquote "\"" - } + #each stringpart is not necessarily started/ended with dquotes - but may contain up to 2 in a row + #we are building up a tentative_accum_dquote to determine if + #a) it is shorter than """ so belongs in a stringpart (either previous, subsequent or its own stringpart between newlines) + #b) it is exactly """ and we can terminate the whole multistring + #c) it is 4 or 5 dquotes where the first 1 or 2 belong in a stringpart and the trailing 3 terminate the space + set_tokenType "tentative_trigger_dquote" ;#trigger tentative_accum_dquote + set tok {"} + return 1 } } multiliteral-space { set_tokenType "literalpart" set tok "\"" } - XXXtable-space { - set_tokenType "startquote" - set tok $c - return 1 - } - XXXitable-space { - set_tokenType "startquote" - set tok $c - } table-space - itable-space { set_tokenType "dquotedkey" set tok "" } - tablename-state { - set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } dottedkey-space { set_tokenType dquotedkey set tok "" @@ -4990,49 +5026,56 @@ namespace eval tomlish::parse { #set_tokenType dquote_seq_begin #set tok $c } + tablename-state { + set_tokenType tablename + set tok $c + } + tablearrayname-state { + set_tokenType tablearrayname + set tok $c + } default { - error "tomlish Unexpected quote during state '$state' [tomlish::parse::report_line]" + error "tomlish Unexpected dquote during 
state '$state' [tomlish::parse::report_line]" } } } } = { - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { - #assertion had_slash 0, multi_dquote "" + #assertion had_slash 0 append tok $c } - string - comment - dquotedkey - itablequotedkey { + string - comment - dquotedkey { #for these tokenTypes an = is just data. if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } whitespace { if {$state eq "multistring-space"} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen + incr i -1 return 1 } else { set_token_waiting type equal value = complete 1 startindex $cindex @@ -5063,7 +5106,7 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok ${dquotes}= + append tok = } multiliteral-space { set_tokenType "literalpart" @@ -5084,8 +5127,6 @@ namespace eval tomlish::parse { } cr { #REVIEW! - set dquotes $multi_dquote - set multi_dquote "" ;#!! # \r carriage return if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. 
set slash_active 0 @@ -5098,16 +5139,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal { @@ -5159,8 +5202,6 @@ namespace eval tomlish::parse { } lf { # \n newline - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5171,16 +5212,19 @@ namespace eval tomlish::parse { append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { + #multiliteral or multistring incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal { @@ -5196,20 +5240,14 @@ namespace eval tomlish::parse { return 1 } stringpart { - if {$dquotes ne ""} { - append tok $dquotes + if {$had_slash} { + #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) + set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] incr i -1 return 1 } else { - if {$had_slash} { - #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) - set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] - incr i -1 - return 1 - } else { - set_token_waiting type newline value lf complete 1 
startindex $cindex - return 1 - } + set_token_waiting type newline value lf complete 1 startindex $cindex + return 1 } } starttablename - tablename - tablearrayname - starttablearrayname { @@ -5236,20 +5274,13 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - #e.g one or 2 quotes just before nl - set_tokenType "stringpart" - set tok $dquotes - incr i -1 - return 1 - } set_tokenType "newline" set tok lf return 1 } } multiliteral-space { - #assert had_slash 0, multi_dquote "" + #assert had_slash 0 set_tokenType "newline" set tok "lf" return 1 @@ -5275,8 +5306,6 @@ namespace eval tomlish::parse { } } , { - set dquotes $multi_dquote - set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5287,39 +5316,40 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } comment - tablename - tablearrayname { if {$had_slash} {append tok "\\"} append tok , } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { #stringpart can have up to 2 quotes too if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } literal - literalpart - squotedkey { - #assert had_slash always 0, multi_dquote "" + #assert had_slash always 0 append tok $c } whitespace { if {$state eq "multistring-space"} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen + incr i -1 return 1 } else { set_token_waiting type comma value "," complete 1 startindex $cindex @@ -5338,10 +5368,10 @@ namespace eval tomlish::parse { 
set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} - append tok "$dquotes," + append tok "," } multiliteral-space { - #assert had_slash 0, multi_dquote "" + #assert had_slash 0 set_tokenType "literalpart" set tok "," } @@ -5354,8 +5384,6 @@ namespace eval tomlish::parse { } } . { - set dquotes $multi_dquote - set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { @@ -5366,42 +5394,45 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } comment - untyped_value { if {$had_slash} {append tok "\\"} append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} - append tok $dquotes$c + append tok $c } literal - literalpart - squotedkey { - #assert had_slash always 0, multi_dquote "" + #assert had_slash always 0 append tok $c } whitespace { switch -exact -- $state { multistring-space { - set backchars [expr {[tcl::string::length $dquotes] + 1}] + #review if {$had_slash} { - incr backchars 1 + incr i -2 + } else { + incr i -1 } - incr i -$backchars return 1 } xxxdottedkey-space { @@ -5444,7 +5475,7 @@ namespace eval tomlish::parse { set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} - append tok "$dquotes." + append tok "." } multiliteral-space { set_tokenType "literalpart" @@ -5471,8 +5502,6 @@ namespace eval tomlish::parse { } " " { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
if {[tcl::string::length $tokenType]} { set had_slash $slash_active set slash_active 0 @@ -5483,16 +5512,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } barekey { @@ -5512,9 +5543,9 @@ namespace eval tomlish::parse { if {$had_slash} { append tok "\\" } - append tok $dquotes$c + append tok $c } - string - dquotedkey - itablequotedkey { + string - dquotedkey { if {$had_slash} { append tok "\\" } append tok $c } @@ -5526,8 +5557,7 @@ namespace eval tomlish::parse { incr i -2 return 1 } else { - #split into STRINGPART aaa WS " " - append tok $dquotes + #split into STRINGPART xxx WS " " incr i -1 return 1 } @@ -5537,15 +5567,7 @@ namespace eval tomlish::parse { } whitespace { if {$state eq "multistring-space"} { - if {$dquotes ne ""} { - #end whitespace token - #go back by the number of quotes plus this space char - set backchars [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backchars - return 1 - } else { - append tok $c - } + append tok $c } else { append tok $c } @@ -5588,12 +5610,6 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - set_tokenType "stringpart" - set tok $dquotes - incr i -1 - return 1 - } set_tokenType "whitespace" append tok $c } @@ -5613,9 +5629,6 @@ namespace eval tomlish::parse { } } tab { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
- if {[tcl::string::length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) set slash_active 0 @@ -5626,12 +5639,18 @@ namespace eval tomlish::parse { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } barekey { @@ -5662,7 +5681,6 @@ namespace eval tomlish::parse { return 1 } else { #split into STRINGPART aaa WS " " - append tok $dquotes incr i -1 return 1 } @@ -5706,15 +5724,8 @@ namespace eval tomlish::parse { incr i -1 return 1 } else { - if {$dquotes ne ""} { - set_tokenType stringpart - set tok $dquotes - incr i -1 - return 1 - } else { - set_tokenType whitespace - append tok $c - } + set_tokenType whitespace + append tok $c } } multiliteral-space { @@ -5732,16 +5743,31 @@ namespace eval tomlish::parse { #BOM (Byte Order Mark) - ignored by token consumer if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { + tentative_accum_squote - tentative_accum_dquote { + incr i -1 + return 1 + } _start_squote_sequence { #assert - tok will be one or two squotes only + #A toml literal probably isn't allowed to contain this + #but we will parse and let the validator sort it out. 
incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } literal - literalpart { append tok $c } + string - stringpart { + append tok $c + } default { + #state machine will generally not have entry to accept bom - let it crash set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex return 1 } @@ -5752,6 +5778,10 @@ namespace eval tomlish::parse { set_tokenType "literalpart" set tok $c } + multistring-space { + set_tokenType "stringpart" + set tok $c + } default { set_tokenType "bom" set tok "\uFEFF" @@ -5761,8 +5791,6 @@ namespace eval tomlish::parse { } } default { - set dquotes $multi_dquote - set multi_dquote "" ;#!! if {[tcl::string::length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. @@ -5774,28 +5802,24 @@ namespace eval tomlish::parse { incr i -1 return 1 } - squote_seq { + tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } - startquotesequence { - _shortcircuit_startquotesequence - } _start_squote_sequence { incr i -[tcl::string::length $tok] - set_tokenType "startsquote" + set_tokenType "single_squote" + return 1 + } + _start_dquote_sequence { + incr i -[tcl::string::length $tok] + set_tokenType "single_dquote" return 1 } whitespace { if {$state eq "multistring-space"} { - if {$dquotes ne ""} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen - return 1 - } else { - incr i -1 - return 1 - } + incr i -1 + return 1 } else { #review incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. 
@@ -5815,7 +5839,7 @@ namespace eval tomlish::parse { return 1 } stringpart { - append tok $dquotes$c + append tok $c } default { #e.g comment/string/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname @@ -5835,22 +5859,12 @@ namespace eval tomlish::parse { error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" } } - XXXcurly-syntax { - puts stderr "curly-syntax - review" - if {[tomlish::utils::is_barekey $c]} { - set_tokenType "barekey" - append tok $c - } else { - error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" - } - } multistring-space { set_tokenType "stringpart" if {$had_slash} { - #assert - we don't get had_slash and dquotes at same time set tok \\$c } else { - set tok $dquotes$c + set tok $c } } multiliteral-space { @@ -5890,21 +5904,6 @@ namespace eval tomlish::parse { # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" #} switch -exact -- $tokenType { - startquotesequence { - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - #invalid - #eof with open string - error "tomlish eof reached without closing quote for string. [tomlish::parse::report_line]" - } elseif {$toklen == 2} { - #valid - #we ended in a double quote, not actually a startquoteseqence - effectively an empty string - switch_tokenType "startquote" - incr i -1 - #set_token_waiting type string value "" complete 1 - return 1 - } - } _start_squote_sequence { set toklen [tcl::string::length $tok] switch -- $toklen { @@ -5913,11 +5912,29 @@ namespace eval tomlish::parse { error "tomlish eof reached without closing single quote for string literal. 
[tomlish::parse::report_line]" } 2 { - #review - set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] set_tokenType "literal" set tok "" return 1 + + ##review + #set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] + #set_tokenType "literal" + #set tok "" + #return 1 + } + } + } + _start_dquote_sequence { + set toklen [tcl::string::length $tok] + switch -- $toklen { + 1 { + #invalid eof with open string + error "tomlish eof reached without closing double quote for string. [tomlish::parse::report_line]" + } + 2 { + set_tokenType "string" + set tok "" + return 1 } } } @@ -6011,6 +6028,16 @@ namespace eval tomlish::dict { return $name } + proc _show_tablenames {tablenames_info} { + append msg \n "tablenames_info:" \n + dict for {tkey tinfo} $tablenames_info { + append msg " " "table: $tkey" \n + dict for {field finfo} $tinfo { + append msg " " "$field $finfo" \n + } + } + return $msg + } } tcl::namespace::eval tomlish::app {