# -*- tcl -*- # Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt # # Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. # Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # (C) 2024 # # @@ Meta Begin # Application tomlish 1.1.1 # Meta platform tcl # Meta license # @@ Meta End # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # doctools header # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ #*** !doctools #[manpage_begin tomlish_module_tomlish 0 1.1.1] #[copyright "2024"] #[titledesc {tomlish toml parser}] [comment {-- Name section and table of contents description --}] #[moddesc {tomlish}] [comment {-- Description at end of page heading --}] #[require tomlish] #[keywords module parsing toml configuration] #[description] #[para] tomlish is an intermediate representation of toml data in a tree structure (tagged lists representing type information) #[para] The design goals are for tomlish to be whitespace and comment preserving ie byte-for byte preservation during roundtrips from toml to tomlish and back to toml #[para] The tomlish representation can then be converted to a Tcl dict structure or to other formats such as json, #[para] although these other formats are generally unlikely to retain whitespace or comments #[para] A further goal is to allow at least a useful subset of in-place editing operations which also preserve whitespace and comments. #[para] e.g leaf key value editing, and table reordering/sorting, key-renaming at any level, key insertions/deletions #[para] The API for editing (tomldoc object?) may require explicit setting of type if accessing an existing key #[para] e.g setting a key that already exists and is a different type (especially if nested structure such as a table or array) #[para] will need a -type option (-force ?) to force overriding with another type such as an int. # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ #*** !doctools #[section Overview] #[para] overview of tomlish #[subsection Concepts] #[para] - # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ ## Requirements # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ #*** !doctools #[subsection dependencies] #[para] packages used by tomlish #[list_begin itemized] package require Tcl 8.6- package require struct::stack package require logger #*** !doctools #[item] [package {Tcl 8.6-}] #[item] [package {struct::stack}] #limit ourselves to clear, destroy, peek, pop, push, rotate, or size (e.g v 1.3 does not implement 'get') #*** !doctools #[list_end] # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ #*** !doctools #[section API] # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # Base namespace # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ namespace eval tomlish { namespace export {[a-z]*}; # Convention: export all lowercase variable types #IDEAS: # since get_toml produces tomlish with whitespace/comments intact: # tomldoc object - allow (at least basic?) editing of toml whilst preserving comments & whitespace # - setKey (set leaf only to value) how to specify type? -type option? - whole array vs index into arrays and further nested objects? - option for raw toml additions? # - separate addKey?? # - deleteKey (delete leaf) # - deleteTable (delete table - if only has leaves? - option to delete with child tables?) # - set/add Table? - position in doc based on existing tables/subtables? #The tomlish intermediate representation allows things such as sorting the toml document by table name or other re-ordering of tables - # because the tables include subkeys, comments and newlines within their structure - those elements all come along with it nicely during reordering. #The same goes for the first newline following a keyval e.g x=1 \ny=2\n\n #The newline is part of the keyval structure so makes reordering easier #example from_toml "a=1\nb=2\n\n\n" # 0 = TOMLISH # 1 = KEY a = {INT 1} {NEWLINE lf} # 2 = NEWLINE lf # 3 = KEY b = {INT 2} {NEWLINE lf} # 4 = NEWLINE lf # 5 = NEWLINE lf #ARRAY is analogous to a Tcl list #TABLE is analogous to a Tcl dict #WS = inline whitespace #KEY = bare key and value #QKEY = double quoted key and value #SQKEY = single quoted key and value #ITABLE = inline table (*can* be anonymous table) # inline table values immediately create a table with the opening brace # inline tables are fully defined between their braces, as are dotted-key subtables defined within # No additional subtables or arrays of tables may be defined within an inline table after the ending brace - they must be entirely self-contained set tags [list TOMLISH ARRAY TABLE ITABLE ANONTABLE WS NEWLINE COMMENT DOTTEDKEY KEY QKEY SQKEY STRING STRINGPART MULTISTRING LITERAL LITERALPART MULTILITERAL INT FLOAT BOOL DATETIME] #tomlish v1.0 should accept arbitrary 64-bit signed ints (from -2^63 to 2^63-1) #we will restrict to this range for compatibility for now - although Tcl can handle larger (arbitrarily so?) set min_int -9223372036854775808 ;#-2^63 set max_int +9223372036854775807 ;#2^63-1 proc Dolog {lvl txt} { #return "$lvl -- $txt" set msg "[clock format [clock seconds] -format "%Y-%m-%dT%H:%M:%S"] tomlish '$txt'" puts stderr $msg } logger::initNamespace ::tomlish foreach lvl [logger::levels] { interp alias {} tomlish_log_$lvl {} ::tomlish::Dolog $lvl log::logproc $lvl tomlish_log_$lvl } #*** !doctools #[subsection {Namespace tomlish}] #[para] Core API functions for tomlish #[list_begin definitions] proc tags {} { return $::tomlish::tags } #helper function for get_dict proc _get_keyval_value {keyval_element} { log::notice ">>> _get_keyval_value from '$keyval_element'<<<" set found_value 0 #find the value # 3 is the earliest index at which the value could occur (depending on whitespace) set found_sub [list] if {[lindex $keyval_element 2] ne "="} { error ">>>_get_keyval_value doesn't seem to be a properly structured { = } list" } foreach sub [lrange $keyval_element 2 end] { #note that a barekey/quotedkey won't occur directly inside a barekey/quotedkey switch -exact -- [lindex $sub 0] { STRING - LITERAL - MULTISTRING - MULTILITERAL - INT - FLOAT - BOOL - DATETIME - TABLE - ARRAY - ITABLE { set type [lindex $sub 0] set value [lindex $sub 1] set found_sub $sub incr found_value 1 } default {} } } if {!$found_value} { error "Failed to find value element in KEY. '$keyval_element'" } if {$found_value > 1} { error "Found multiple value elements in KEY, expected exactly one. '$keyval_element'" } switch -exact -- $type { INT - FLOAT - BOOL - DATETIME { #simple (non-container, no-substitution) datatype set result [list type $type value $value] } STRING - STRINGPART { set result [list type $type value [::tomlish::utils::unescape_string $value]] } LITERAL - LITERALPART { #REVIEW set result [list type $type value $value] } TABLE { #invalid? error "_get_keyval_value invalid to have type TABLE on rhs of =" } ITABLE { set result [::tomlish::get_dict [list $found_sub]] } ARRAY { #we need to recurse to get the corresponding dict for the contained item(s) #pass in the whole $found_sub - not just the $value! set result [list type $type value [::tomlish::get_dict [list $found_sub]]] } MULTISTRING - MULTILITERAL { #review - mapping these to STRING might make some conversions harder? #if we keep the MULTI - we know we have to look for newlines for example when converting to json #without specific types we'd have to check every STRING - and lose info about how best to map chars within it set result [list type $type value [::tomlish::get_dict [list $found_sub]]] } default { error "Unexpected value type '$type' found in keyval '$keyval_element'" } } return $result } proc _get_dottedkey_info {dottedkeyrecord} { set key_hierarchy [list] set key_hierarchy_raw [list] if {[lindex $dottedkeyrecord 0] ne "DOTTEDKEY"} { error "_get_dottedkey_info error. Supplied list doesn't appear to be a DOTTEDKEY (tag: [lindex $dottedkeyrecord 0])" } set compoundkeylist [lindex $dottedkeyrecord 1] set expect_sep 0 foreach part $compoundkeylist { set parttag [lindex $part 0] if {$parttag eq "WS"} { continue } if {$expect_sep} { if {$parttag ne "DOTSEP"} { error "DOTTEDKEY missing dot separator between parts. '$dottedkeyrecord'" } set expect_sep 0 } else { set val [lindex $part 1] switch -exact -- $parttag { KEY { lappend key_hierarchy $val lappend key_hierarchy_raw $val } QKEY { lappend key_hierarchy [::tomlish::utils::unescape_string $val] lappend key_hierarchy_raw \"$val\" } SQKEY { lappend key_hierarchy $val lappend key_hierarchy_raw "'$val'" } default { error "DOTTED key unexpected part '$parttag' - ensure dot separator is between key parts. '$item'" } } set expect_sep 1 } } return [dict create keys $key_hierarchy keys_raw $key_hierarchy_raw] } #get_dict is a *basic* programmatic datastructure for accessing the data. # produce a dictionary of keys and values from a tomlish tagged list. # get_dict is primarily for reading toml data. #Extraneous (not within quoted sections) whitespace and comments are not preserved in this structure, # so a roundtrip from toml to this datastructure and back to toml will lose whitespace formatting and comments. # creating/changing toml values can be done directly on a tomlish list if preserving (or adding) formatting/comments is desired. #A separate package 'tomlish::object' may be needed to allow easier programmatic creating/updating/deleting of data elements whilst preserving (or adding or selectively deleting/editing) such formatting. proc get_dict {tomlish} { #keep track of which tablenames have already been directly defined, # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. #we don't error out just because a previous tablename segment has already appeared. variable tablenames_seen [list] log::info ">>> processing '$tomlish'<<<" set items $tomlish foreach lst $items { if {[lindex $lst 0] ni $::tomlish::tags} { error "supplied string does not appear to be toml parsed into a tomlish tagged list. Run tomlish::decode::toml on the raw toml data to produce a tomlish list" } } if {[lindex $tomlish 0] eq "TOMLISH"} { #ignore TOMLISH tag at beginning set items [lrange $tomlish 1 end] } set datastructure [dict create] foreach item $items { set tag [lindex $item 0] #puts "...> item:'$item' tag:'$tag'" switch -exact -- $tag { KEY - QKEY - SQKEY { log::debug "--> processing $tag: $item" set key [lindex $item 1] if {$tag eq "QKEY"} { set key [::tomlish::utils::unescape_string $key] } #!todo - normalize key. (may be quoted/doublequoted) if {[dict exists $datastructure $key]} { error "Duplicate key '$key'. The key already exists at this level in the toml data. The toml data is not valid." } #lassign [_get_keyval_value $item] type val set keyval_dict [_get_keyval_value $item] dict set datastructure $key $keyval_dict } DOTTEDKEY { log::debug "--> processing $tag: $item" set dkey_info [_get_dottedkey_info $item] set dotted_key_hierarchy [dict get $dkey_info keys] set dotted_key_hierarchy_raw [dict get $dkey_info keys_raw] #a.b.c = 1 #table_key_hierarchy -> a b #leafkey -> c if {[llength $dotted_key_hierarchy] == 0} { #empty?? probably invalid. review #This is different to '' = 1 or ''.'' = 1 which have lengths 1 and 2 respectively error "DOTTED key has no parts - invalid? '$item'" } elseif {[llength $dotted_key_hierarchy] == 1} { #dottedkey is only a key - no table component set table_hierarchy [list] set leafkey [lindex $dotted_key_hierarchy 0] } else { set table_hierarchy [lrange $dotted_key_hierarchy 0 end-1] set table_hierarchy_raw [lrange $dotted_key_hierarchy_raw 0 end-1] set leafkey [lindex $dotted_key_hierarchy end] } #ensure empty tables are still represented in the datastructure set pathkeys [list] foreach k $table_hierarchy { lappend pathkeys $k if {![dict exists $datastructure {*}$pathkeys]} { dict set datastructure {*}$pathkeys [list] } else { tomlish::log::notice "get_dict datastructure at key path $pathkeys already had data: [dict get $datastructure {*}$pathkeys]" } } set keyval_dict [_get_keyval_value $item] dict set datastructure {*}$pathkeys $leafkey $keyval_dict } TABLE { set tablename [lindex $item 1] set tablename [::tomlish::utils::tablename_trim $tablename] if {$tablename in $tablenames_seen} { error "Table name '$tablename' has already been directly defined in the toml data. Invalid." } log::debug "--> processing $tag (name: $tablename): $item" set name_segments [::tomlish::utils::tablename_split $tablename] set last_seg "" #toml spec rule - all segments mst be non-empty #note that the results of tablename_split are 'raw' - ie some segments may be enclosed in single or double quotes. set table_key_hierarchy [list] set table_key_hierarchy_raw [list] foreach rawseg $name_segments { set seg [::tomlish::utils::normalize_key $rawseg] ;#strips one level of enclosing quotes, and substitutes only toml-specified escapes set c1 [tcl::string::index $rawseg 0] set c2 [tcl::string::index $rawseg end] if {($c1 eq "'") && ($c2 eq "'")} { #single quoted segment. No escapes are processed within it. set seg [tcl::string::range $rawseg 1 end-1] } elseif {($c1 eq "\"") && ($c2 eq "\"")} { #double quoted segment. Apply escapes. set seg [::tomlish::utils::unescape_string [tcl::string::range $rawseg 1 end-1]] } else { set seg $rawseg } #no need to check for empty segments here - we've already called tablename_split which would have raised an error for empty segments. #if {$rawseg eq ""} { # error "Table name '[lindex $item 1]' is not valid. All segments (parts between dots) must be non-empty" #} lappend table_key_hierarchy $seg lappend table_key_hierarchy_raw $rawseg if {[dict exists $datastructure {*}$table_key_hierarchy]} { #It's ok for this key to already exist *if* it was defined by a previous tablename, # but not if it was defined as a key/qkey/skey ? set testkey [join $table_key_hierarchy_raw .] set testkey_length [llength $table_key_hierarchy_raw] set found_testkey 0 if {$testkey in $tablenames_seen} { set found_testkey 1 } else { #see if it was defined by a longer entry foreach seen $tablenames_seen { set seen_segments [::tomlish::utils::tablename_split $seen] #these are raw unnormalized tablename segments. Need to normalize the double-quoted ones, # and strip the quotes from both single-quoted and double-quoted entries. #note: it is not safe to compare normalized tablenames using join! # e.g a.'b.c'.d is not the same as a.b.c.d # instead compare {a b.c d} with {a b c d} # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' #VVV the test below is wrong VVV! #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} set seen_match [join [lrange $seen_segments 0 [expr {$testkey_length -1}]] .] puts stderr "testkey:'$testkey' vs seen_match:'$seen_match'" if {$testkey eq $seen_match} { set found_testkey 1 } } } if {$found_testkey == 0} { #the raw table_key_hierarchy is better to display in the error message, although it's not the actual dict keyset set msg "key [join $table_key_hierarchy_raw .] already exists in datastructure, but wasn't defined by a supertable." append msg "tablenames_seen:" foreach ts $tablenames_seen { append msg " " $ts \n } error $msg } } } #ensure empty tables are still represented in the datastructure set table_keys [list] foreach k $table_key_hierarchy { lappend table_keys $k if {![dict exists $datastructure {*}$table_keys]} { dict set datastructure {*}$table_keys [list] } else { tomlish::log::notice "get_dict datastructure at (TABLE) subkey $table_keys already had data: [dict get $datastructure {*}$table_keys]" } } #We must do this after the key-collision test above! lappend tablenames_seen $tablename log::debug ">>>>>>>>>>>>>>>>>>>>table_key_hierarchy : $table_key_hierarchy" log::debug ">>>>>>>>>>>>>>>>>>>>table_key_hierarchy_raw: $table_key_hierarchy_raw" #now add the contained elements foreach element [lrange $item 2 end] { set type [lindex $element 0] switch -exact -- $type { DOTTEDKEY { set dkey_info [_get_dottedkey_info $element] set dotted_key_hierarchy [dict get $dkey_info keys] set dotted_key_hierarchy_raw [dict get $dkey_info keys_raw] set leaf_key [lindex $dotted_key_hierarchy end] set dkeys [lrange $dotted_key_hierarchy 0 end-1] #ensure empty keys are still represented in the datastructure set test_keys $table_keys foreach k $dkeys { lappend test_keys $k if {![dict exists $datastructure {*}$test_keys]} { dict set datastructure {*}$test_keys [list] } else { tomlish::log::notice "get_dict datastructure at (DOTTEDKEY) subkey $test_keys already had data: [dict get $datastructure {*}$test_keys]" } } if {[dict exists $datastructure {*}$table_keys {*}$dkeys $leaf_key]} { error "Duplicate key '$table_keys $dkeys $leaf_key'. The key already exists at this level in the toml data. The toml data is not valid." } set keyval_dict [_get_keyval_value $element] dict set datastructure {*}$table_keys {*}$dkeys $leaf_key $keyval_dict } KEY - QKEY - SQKEY { #obsolete ? set keyval_key [lindex $element 1] if {$type eq "QKEY"} { set keyval_key [::tomlish::utils::unescape_string $keyval_key] } if {[dict exists $datastructure {*}$dotted_key_hierarchy $keyval_key]} { error "Duplicate key '$dotted_key_hierarchy $key'. The key already exists at this level in the toml data. The toml data is not valid." } set keyval_dict [_get_keyval_value $element] dict set datastructure {*}$dotted_key_hierarchy $keyval_key $keyval_dict } NEWLINE - COMMENT - WS { #ignore } default { error "Sub element of type '$type' not understood in table context. Expected only KEY,QKEY,SQKEY,NEWLINE,COMMENT,WS" } } } #now make sure we add an empty value if there were no contained elements! #!todo. } ITABLE { #SEP??? set datastructure [list] foreach element [lrange $item 1 end] { set type [lindex $element 0] switch -exact -- $type { DOTTEDKEY { set dkey_info [_get_dottedkey_info $element] set dotted_key_hierarchy [dict get $dkey_info keys] set dotted_key_hierarchy_raw [dict get $dkey_info keys_raw] set leaf_key [lindex $dotted_key_hierarchy end] set dkeys [lrange $dotted_key_hierarchy 0 end-1] #ensure empty keys are still represented in the datastructure set table_keys [list] ;#We don't know the context - next level up will have to check for key collisions? set test_keys $table_keys foreach k $dkeys { lappend test_keys $k if {![dict exists $datastructure {*}$test_keys]} { dict set datastructure {*}$test_keys [list] } else { tomlish::log::notice "get_dict datastructure at (DOTTEDKEY) subkey $test_keys already had data: [dict get $datastructure {*}$test_keys]" } } if {[dict exists $datastructure {*}$table_keys {*}$dkeys $leaf_key]} { error "Duplicate key '$table_keys $dkeys $leaf_key'. The key already exists at this level in the toml data. The toml data is not valid." } set keyval_dict [_get_keyval_value $element] dict set datastructure {*}$table_keys {*}$dkeys $leaf_key $keyval_dict } NEWLINE - COMMENT - WS { #ignore } default { error "Sub element of type '$type' not understood in ITABLE context. Expected only KEY,QKEY,SQKEY,NEWLINE,COMMENT,WS" } } } } ARRAY { #arrays in toml are allowed to contain mixtures of types set datastructure [list] log::debug "--> processing array: $item" foreach element [lrange $item 1 end] { set type [lindex $element 0] switch -exact -- $type { INT - FLOAT - BOOL - DATETIME { set value [lindex $element 1] lappend datastructure [list type $type value $value] } STRING { set value [lindex $element 1] lappend datastructure [list type $type value [::tomlish::utils::unescape_string $value]] } LITERAL { set value [lindex $element 1] lappend datastructure [list type $type value $value] } ITABLE - TABLE - ARRAY - MULTISTRING - MULTILITERAL { set value [lindex $element 1] lappend datastructure [list type $type value [::tomlish::get_dict [list $element]]] } WS - SEP - NEWLINE - COMMENT { #ignore whitespace, commas, newlines and comments } default { error "Unexpected value type '$type' found in array" } } } } MULTILITERAL { #triple squoted string #first newline stripped only if it is the very first element #(ie *immediately* following the opening delims) #All whitespace other than newlines is within LITERALPARTS # ------------------------------------------------------------------------- #todo - consider extension to toml to allow indent-aware multiline literals # how - propose as issue in toml github? Use different delim? e.g ^^^ ? #e.g # xxx=?'''abc # def # etc # ''' # - we would like to trimleft each line to the column following the opening delim # ------------------------------------------------------------------------- log::debug "--> processing multiliteral: $item" set parts [lrange $item 1 end] if {[lindex $parts 0 0] eq "NEWLINE"} { set parts [lrange $parts 1 end] ;#skip it } for {set idx 0} {$idx < [llength $parts]} {incr idx} { set element [lindex $parts $idx] set type [lindex $element 0] switch -exact -- $type { LITERALPART { append stringvalue [lindex $element 1] } NEWLINE { set val [lindex $element 1] if {$val eq "nl"} { append stringvalue \n } else { append stringvalue \r\n } } default { error "Unexpected value type '$type' found in multistring" } } } set datastructure $stringvalue } MULTISTRING { #triple dquoted string log::debug "--> processing multistring: $item" set stringvalue "" set idx 0 set parts [lrange $item 1 end] for {set idx 0} {$idx < [llength $parts]} {incr idx} { set element [lindex $parts $idx] set type [lindex $element 0] #We use STRINGPART in the tomlish representation as a distinct element to STRING - which would imply wrapping quotes to be reinserted switch -exact -- $type { STRING { #todo - do away with STRING ? #we don't build MULTISTRINGS containing STRING - but should we accept it? tomlish::log::warn "doulbe quoting a STRING found in MULTISTRING - should be STRINGPART?" append stringvalue "\"[::tomlish::utils::unescape_string [lindex $element 1]]\"" } STRINGPART { append stringvalue [::tomlish::utils::unescape_string [lindex $element 1]] } CONT { #When the last non-whitespace character on a line is an unescaped backslash, #it will be trimmed along with all whitespace (including newlines) up to the next non-whitespace character or closing delimiter # review - we allow some whitespace in stringpart elements - can a stringpart ever be all whitespace? set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] if {$next_nl == -1} { #last line set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] if {$non_ws >= 0} { append stringvalue "\\" ;#add the sep } else { #skip over ws without emitting set idx [llength $parts] } } else { set parts_til_nl [lrange $parts 0 $next_nl-1] set non_ws [lsearch -index 0 -start $idx+1 -not $parts_til_nl WS] if {$non_ws >= 0} { append stringvalue "\\" } else { #skip over ws on this line set idx $next_nl #then have to check each subsequent line until we get to first non-whitespace set trimming 1 while {$trimming && $idx < [llength $parts]} { set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] if {$next_nl == -1} { #last line set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] if {$non_ws >= 0} { set idx [expr {$non_ws -1}] } else { set idx [llength $parts] } set trimming 0 } else { set non_ws [lsearch -index 0 -start $idx+1 -not [lrange $parts 0 $next_nl-1] WS] if {$non_ws >= 0} { set idx [expr {$non_ws -1}] set trimming 0 } else { set idx $next_nl #keep trimming } } } } } } NEWLINE { #if newline is first element - it is not part of the data of a multistring if {$idx > 0} { set val [lindex $element 1] if {$val eq "nl"} { append stringvalue \n } else { append stringvalue \r\n } } } WS { append stringvalue [lindex $element 1] } default { error "Unexpected value type '$type' found in multistring" } } } set datastructure $stringvalue } WS - COMMENT - NEWLINE { #ignore } default { error "Unexpected tag '$tag' in Tomlish list '$tomlish'" } } } return $datastructure } proc json_to_toml {json} { #*** !doctools #[call [fun json_to_toml] [arg json]] #[para] set tomlish [::tomlish::from_json $json] set toml [::tomlish::to_toml $tomlish] } #TODO use huddle? proc from_json {json} { set jstruct [::tomlish::json_struct $json] return [::tomlish::from_json_struct $jstruct] } proc from_json_struct {jstruct} { package require fish::json_toml return [fish::json_toml::jsonstruct2tomlish $jstruct] } proc toml_to_json {toml} { set tomlish [::tomlish::from_toml $toml] return [::tomlish::get_json $tomlish] } proc get_json {tomlish} { package require fish::json set d [::tomlish::get_dict $tomlish] #return [::tomlish::dict_to_json $d] return [fish::json::from "struct" $d] } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish ---}] } # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ namespace eval tomlish::encode { #*** !doctools #[subsection {Namespace tomlish::encode}] #[para] #[list_begin definitions] #STRING,INT,FLOAT,BOOL, DATETIME - simple wrappers for completeness # take a value of the appropriate type and wrap as a tomlish tagged item proc string {s} { return [list STRING $s] } proc int {i} { #whole numbers, may be prefixed with a + or - #Leading zeros are not allowed #Hex,octal binary forms are allowed (toml 1.0) #We will error out on encountering commas, as commas are interpreted differently depending on locale (and don't seem to be supported in the toml spec anyway) #!todo - Tcl can handle bignums - bigger than a 64bit signed long as specified in toml. # - We should probably raise an error for number larger than this and suggest the user supply it as a string? if {[tcl::string::last , $i] > -1} { error "Unable to interpret '$i' as an integer. Use underscores if you need a thousands separator [::tomlish::parse::report_line]" } if {![::tomlish::utils::int_validchars $i]} { error "Unable to interpret '$i' as an integer. Only 0-9 + 1 _ characters are acceptable. [::tomlish::parse::report_line]" } if {[::tomlish::utils::is_int $i]} { return [list INT $i] } else { error "'$i' is not a valid integer as per the Toml spec. [::tomlish::parse::report_line]" } } proc float {f} { #convert any non-lower case variants of special values to lowercase for Toml if {[::tcl::string::tolower $f] in {nan +nan -nan inf +inf -inf}} { return [list FLOAT [tcl::string::tolower $f]] } if {[::tomlish::utils::is_float $f]} { return [list FLOAT $f] } else { error "Unable to interpret '$f' as Toml float. Check your input, or check that tomlish is able to handle all Toml floats properly [::tomlish::parse::report_line]" } } proc datetime {str} { if {[::tomlish::utils::is_datetime $str]} { return [list DATETIME $str] } else { error "Unable to interpret '$str' as Toml datetime. Check your input, or check that tomlish is able to handle all Toml datetimes properly [::tomlish::parse::report_line]" } } proc boolean {b} { #convert any Tcl-acceptable boolean to boolean as accepted by toml - lower case true/false if {![tcl::string::is boolean -strict $b]} { error "Unable to convert '$b' to Toml boolean true|false. [::tomlish::parse::report_line]" } else { if {[expr {$b && 1}]} { return [::list BOOL true] } else { return [::list BOOL false] } } } #TODO #Take tablename followed by # a) *tomlish* name-value pairs e.g table mydata [list KEY item11 = [list STRING "test"]] {KEY item2 = [list INT 1]} # (accept also key value {STRING }) # b) simple 2-element tcl lists being name & *simple* value pairs for which basic heuristics will be used to determine types proc table {name args} { set pairs [list] foreach t $args { if {[llength $t] == 4} { if {[tcl::string::tolower [lindex $t 0]] ne "key" || [tcl::string::tolower [lindex $t 2]] ni "= value"} { error "Only items tagged as KEY = currently accepted as name-value pairs for table command" } lassign $t _k keystr _eq valuepart if {[llength $valuepart] != 2} { error "supplied value must be typed. e.g {INT 1} or {STRING test}" } lappend pairs [list KEY $keystr = $valuepart] } elseif {[llength $t] == 2} { #!todo - type heuristics lassign $t n v lappend pairs [list KEY $n = [list STRING $v]] } else { error "'KEY = { toml but # the first newline is not part of the data. # we elect instead to maintain a basic LITERALPART that must not contain newlines.. # and to compose MULTILITERAL of multiple NEWLINE LITERALPART parts, #with the datastructure representation dropping the first newline (if immediately following opening delim) when building the value. set literal "" foreach part [lrange $item 1 end] { append literal [::tomlish::encode::tomlish [list $part] $nextcontext] } append toml '''$literal''' } INT - BOOL - FLOAT - DATETIME { append toml [lindex $item 1] } INCOMPLETE { error "cannot process tomlish term tagged as INCOMPLETE" } COMMENT { append toml "#[lindex $item 1]" } BOM { #Byte Order Mark may appear at beginning of a file. Needs to be preserved. append toml "\uFEFF" } default { error "Not a properly formed 'tomlish' taggedlist.\n '$list'\n Unknown tag '[lindex $item 0]'. See output of \[tomlish::tags\] command." } } } return $toml } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::encode ---}] } #fish toml from tomlish #(encode tomlish as toml) interp alias {} tomlish::to_toml {} tomlish::encode::tomlish # namespace eval tomlish::decode { #*** !doctools #[subsection {Namespace tomlish::decode}] #[para] #[list_begin definitions] #return a Tcl list of tomlish tokens #i.e get a standard list of all the toml terms in string $s #where each element of the list is a *tomlish* term.. i.e a specially 'tagged' Tcl list. #(simliar to a tcl 'Huddle' - but also supporting whitespace preservation) #Note that we deliberately don't check certain things such as duplicate table declarations here. #Part of the justification for this is that as long as the syntax is toml shaped - we can load files which violate certain rules and allow programmatic manipulation. # (e.g perhaps a toml editor to highlight violations for fixing) # A further stage is then necessary to load the tomlish tagged list into a data structure more suitable for efficient query/reading. # e.g dicts or an object oriented structure #Note also - *no* escapes in quoted strings are processed. This is up to the datastructure stage #e.g to_dict will substitute \r \n \uHHHH \UHHHHHHH etc #This is important for tomlish to maintain the ability to perform competely lossless round-trips from toml to tomlish and back to toml. # (which is handy for testing as well as editing some part of the structure with absolutely no effect on other parts of the document) #If we were to unescape a tab character for example # - we have no way of knowing if it was originally specified as \t \u0009 or \U00000009 or directly as a tab character. # For this reason, we also do absolutely no line-ending transformations based on platform. # All line-endings are maintained as is, and even a file with mixed cr crlf line-endings will be correctly interpreted and can be 'roundtripped' proc toml {args} { #*** !doctools #[call [fun toml] [arg arg...]] #[para] return a Tcl list of tomlish tokens set s [join $args \n] namespace upvar ::tomlish::parse is_parsing is_parsing set is_parsing 1 if {[info command ::tomlish::parse::spacestack] eq "::tomlish::parse::spacestack"} { tomlish::parse::spacestack destroy } struct::stack ::tomlish::parse::spacestack namespace upvar ::tomlish::parse last_space_action last_space_action namespace upvar ::tomlish::parse last_space_type last_space_type namespace upvar ::tomlish::parse tok tok set tok "" namespace upvar ::tomlish::parse type type namespace upvar ::tomlish::parse tokenType tokenType ::tomlish::parse::set_tokenType "" namespace upvar ::tomlish::parse tokenType_list tokenType_list set tokenType [list] ;#Flat (un-nested) list of tokentypes found namespace upvar ::tomlish::parse lastChar lastChar set lastChar "" set result "" namespace upvar ::tomlish::parse nest nest set nest 0 namespace upvar ::tomlish::parse v v ;#array keyed on nest level set v(0) {TOMLISH} array set s0 [list] ;#whitespace data to go in {SPACE {}} element. set parentlevel 0 namespace upvar ::tomlish::parse i i set i 0 namespace upvar ::tomlish::parse state state namespace upvar ::tomlish::parse braceCount braceCount set barceCount 0 namespace upvar ::tomlish::parse bracketCount bracketCount set bracketCount 0 set sep 0 set r 1 namespace upvar ::tomlish::parse token_waiting token_waiting set token_waiting [dict create] ;#if ::tok finds a *complete* second token during a run, it will put the 2nd one here to be returned by the next call. set state "table-space" ::tomlish::parse::spacestack push {type space state table-space} namespace upvar ::tomlish::parse linenum linenum;#'line number' of input data. (incremented for each literal linefeed - but not escaped ones in data) set linenum 1 set ::tomlish::parse::state_list [list] try { while {$r} { set r [::tomlish::parse::tok $s] #puts stdout "got tok: '$tok' while parsing string '$s' " set next_tokenType_known 0 ;#whether we begin a new token here based on what terminated the token result of 'tok' #puts "got token: '$tok' tokenType='$tokenType'. while v($nest) = [set v($nest)]" #puts "-->tok: $tok tokenType='$tokenType'" set prevstate $state set transition_info [::tomlish::parse::goNextState $tokenType $tok $state] #review goNextState could perform more than one space_action set space_action [dict get $transition_info space_action] set newstate [dict get $transition_info newstate] ;#use of 'newstate' vs 'state' makes code clearer below if {[tcl::string::match "err-*" $state]} { ::tomlish::log::warn "---- State error in state $prevstate for tokenType: $tokenType token value: $tok. $state aborting parse. [tomlish::parse::report_line]" lappend v(0) [list ERROR tokentype $tokenType state $prevstate to $state leveldata [set v($nest)]] return $v(0) } # --------------------------------------------------------- #NOTE there may already be a token_waiting at this point #set_token_waiting can raise an error here, # in which case the space_action branch needs to be rewritten to handle the existing token_waiting # --------------------------------------------------------- if {$space_action eq "pop"} { #pop_trigger_tokens: newline tablename endarray endinlinetable #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. set parentlevel [expr {$nest -1}] set do_append_to_parent 1 ;#most tokens will leave this alone - but some like squote_seq need to do their own append switch -exact -- $tokenType { squote_seq { #### set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed #Without this - we would get extraneous empty list entries in the parent # - as the xxx-squote-space isn't a space level from the toml perspective # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop switch -- $tok { ' { tomlish::parse::set_token_waiting type startsquote value $tok complete 1 startindex [expr {$i -1}] } '' { #review - we should perhaps return double_squote instead? #tomlish::parse::set_token_waiting type literal value "" complete 1 tomlish::parse::set_token_waiting type double_squote value "" complete 1 startindex [expr {$i - 2}] } ''' { #### #if already an eof in token_waiting - set_token_waiting will insert before it tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] } '''' { switch -exact -- $prevstate { leading-squote-space { error "---- 4 squotes from leading-squote-space - shouldn't get here" #we should have emitted the triple and left the last for next loop } trailing-squote-space { tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] #todo integrate left squote with nest data at this level set lastpart [lindex $v($parentlevel) end] switch -- [lindex $lastpart 0] { LITERALPART { set newval "[lindex $lastpart 1]'" set parentdata $v($parentlevel) lset parentdata end [list LITERALPART $newval] set v($parentlevel) $parentdata } NEWLINE { lappend v($parentlevel) [list LITERALPART "'"] } default { error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" } } } default { error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" } } } ''''' { switch -exact -- $prevstate { leading-squote-space { error "---- 5 squotes from leading-squote-space - shouldn't get here" #we should have emitted the triple and left the following squotes for next loop } trailing-squote-space { tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] #todo integrate left 2 squotes with nest data at this level set lastpart [lindex $v($parentlevel) end] switch -- [lindex $lastpart 0] { LITERALPART { set newval "[lindex $lastpart 1]''" set parentdata $v($parentlevel) lset parentdata end [list LITERALPART $newval] set v($parentlevel) $parentdata } NEWLINE { lappend v($parentlevel) [LITERALPART "''"] } default { error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" } } } default { error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" } } } } puts "---- HERE squote_seq pop <$tok>" } triple_squote { #presumably popping multiliteral-space ::tomlish::log::debug "---- triple_squote for last_space_action pop leveldata: $v($nest)" set merged [list] set lasttype "" foreach part $v($nest) { switch -exact -- [lindex $part 0] { MULTILITERAL { lappend merged $part } LITERALPART { if {$lasttype eq "LITERALPART"} { set prevpart [lindex $merged end] lset prevpart 1 [lindex $prevpart 1][lindex $part 1] lset merged end $prevpart } else { lappend merged $part } } NEWLINE { #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. lappend merged $part } default { error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($next)" } } set lasttype [lindex $part 0] } set v($nest) $merged } equal { if {$prevstate eq "dottedkey-space"} { tomlish::log::debug "---- equal ending dottedkey-space for last_space_action pop" #re-emit for parent space tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] } } newline { incr linenum lappend v($nest) [list NEWLINE $tok] } tablename { #note: a tablename only 'pops' if we are greater than zero error "---- tablename pop should already have been handled as special case zeropoppushspace in goNextState" } tablearrayname { #!review - tablearrayname different to tablename regarding push/pop? #note: a tablename only 'pops' if we are greater than zero error "---- tablearrayname pop should already have been handled as special case zeropoppushspace in goNextState" } endarray { #nothing to do here. } comma { #comma for inline table will pop the keyvalue space lappend v($nest) "SEP" } endinlinetable { ::tomlish::log::debug "---- endinlinetable for last_space_action pop" } endmultiquote { ::tomlish::log::debug "---- endmultiquote for last_space_action 'pop'" } default { error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" } } if {$do_append_to_parent} { #e.g squote_seq does it's own appends as necessary - so won't get here lappend v($parentlevel) [set v($nest)] } incr nest -1 } elseif {$last_space_action eq "push"} { set prevnest $nest incr nest 1 set v($nest) [list] # push_trigger_tokens: barekey quotedkey startinlinetable startarray tablename tablearrayname switch -exact -- $tokenType { squote_seq_begin { #### if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { lassign [dict get $transition_info starttok] starttok_type starttok_val set next_tokenType_known 1 ::tomlish::parse::set_tokenType $starttok_type set tok $starttok_val } } squotedkey { switch -exact -- $prevstate { table-space - itable-space { set v($nest) [list DOTTEDKEY] } } #todo - check not something already waiting? tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } barekey { switch -exact -- $prevstate { table-space - itable-space { set v($nest) [list DOTTEDKEY] } } #todo - check not something already waiting? set waiting [tomlish::parse::get_token_waiting] if {[llength $waiting]} { set i [dict get $waiting startindex] tomlish::parse::clear_token_waiting tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } else { tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } } startsquote { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "squotedkey" set tok "" } quotedkey - itablequotedkey { set v($nest) [list QKEY $tok] ;#$tok is the keyname } itablesquotedkey { set v($nest) [list SQKEY $tok] ;#$tok is the keyname } tablename { #note: we do not use the output of tomlish::tablename_trim to produce a tablename for storage in the tomlish list! #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish # back to toml file will be identical. #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names. #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the # tomlish list? set test_only [::tomlish::utils::tablename_trim $tok] ::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$test_only'" set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name #note also that equivalent tablenames may have different toml representations even after being trimmed! #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. } tablearrayname { set test_only [::tomlish::utils::tablename_trim $tok] puts stdout "trimmed (but not normalized) tablearrayname: '$test_only'" set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name } startarray { set v($nest) [list ARRAY] ;#$tok is just the opening bracket - don't output. } startinlinetable { set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. } startmultiquote { ::tomlish::log::debug "---- push trigger tokenType startmultiquote" set v($nest) [list MULTISTRING] ;#container for STRINGPART, WS, CONT, NEWLINE } triple_squote { ::tomlish::log::debug "---- push trigger tokenType triple_squote" set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERAL } default { error "---- push trigger tokenType '$tokenType' not yet implemented" } } } else { #no space level change switch -exact -- $tokenType { squotedkey { puts "---- squotedkey in state $prevstate (no space level change)" lappend v($nest) [list SQKEY $tok] } barekey { lappend v($nest) [list KEY $tok] } dotsep { lappend v($nest) [list DOTSEP] } starttablename { #$tok is triggered by the opening bracket and sends nothing to output } starttablearrayname { #$tok is triggered by the double opening brackets and sends nothing to output } tablename - tablenamearray { error "---- did not expect 'tablename/tablearrayname' without space level change (no space level change)" #set v($nest) [list TABLE $tok] } endtablename - endtablearrayname { #no output into the tomlish list for this token } startinlinetable { puts stderr "---- decode::toml error. did not expect startinlinetable without space level change (no space level change)" } startquote { switch -exact -- $newstate { string-state { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "string" set tok "" } quoted-key { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "quotedkey" set tok "" } itable-quoted-key { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "itablequotedkey" set tok "" } default { error "---- startquote switch case not implemented for nextstate: $newstate (no space level change)" } } } startsquote { switch -exact -- $newstate { literal-state { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "literal" set tok "" } squoted-key { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "squotedkey" set tok "" } itable-squoted-key { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "itablesquotedkey" set tok "" } multiliteral-space { #false alarm squote returned from squote_seq pop ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" #(single squote - not terminating space) lappend v($nest) [list LITERALPART '] #may need to be joined on pop if there are neighbouring LITERALPARTs } default { error "---- startsquote switch case not implemented for nextstate: $newstate (no space level change)" } } } startmultiquote { #review puts stderr "---- got startmultiquote in state $prevstate (no space level change)" set next_tokenType_known 1 ::tomlish::parse::set_tokenType "stringpart" set tok "" } endquote { #nothing to do? set tok "" } endsquote { set tok "" } endmultiquote { #JMN!! set tok "" } string { lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes } literal { lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes } double_squote { switch -exact -- $prevstate { keyval-value-expected { lappend v($nest) [list LITERAL ""] } multiliteral-space { #multiliteral-space to multiliteral-space lappend v($nest) [list LITERALPART ''] } default { error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" } } } multistring { #review lappend v($nest) [list MULTISTRING $tok] } stringpart { lappend v($nest) [list STRINGPART $tok] ;#will not get wrapped in dquotes directly } multiliteral { lappend v($nest) [LIST MULTILITERAL $tok] } literalpart { lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly } quotedkey { #lappend v($nest) [list QKEY $tok] ;#TEST } itablequotedkey { } untyped_value { #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. if {$tok in {true false}} { set tag BOOL } elseif {[::tomlish::utils::is_int $tok]} { set tag INT } elseif {[::tomlish::utils::is_float $tok]} { set tag FLOAT } elseif {[::tomlish::utils::is_datetime $tok]} { set tag DATETIME } else { error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. [tomlish::parse::report_line] (no space level change)" } lappend v($nest) [list $tag $tok] } comment { #puts stdout "----- comment token returned '$tok'------" lappend v($nest) [list COMMENT "$tok"] } equal { #we append '=' to the nest so that any surrounding whitespace is retained. lappend v($nest) = } comma { lappend v($nest) SEP } newline { incr linenum lappend v($nest) [list NEWLINE $tok] } whitespace { lappend v($nest) [list WS $tok] } continuation { lappend v($nest) CONT } bom { lappend v($nest) BOM } eof { #ok - nothing more to add to the tomlish list. #!todo - check previous tokens are complete/valid? } default { error "--- unknown tokenType '$tokenType' during state $prevstate [::tomlish::parse::report_line] (no space level change)" } } } if {!$next_tokenType_known} { ::tomlish::log::notice "---- tomlish::decode::toml - current tokenType:$tokenType Next token type not known" ::tomlish::parse::set_tokenType "" set tok "" } if {$state eq "end-state"} { break } } #while {$nest > 0} { # lappend v([expr {$nest -1}]) [set v($nest)] # incr nest -1 #} while {[::tomlish::parse::spacestack size] > 1} { ::tomlish::parse::spacestack pop lappend v([expr {$nest -1}]) [set v($nest)] incr nest -1 #set parent [spacestack peek] ;#the level being appended to #lassign $parent type state #if {$type eq "space"} { # #} elseif {$type eq "buffer"} { # lappend v([expr {$nest -1}]) {*}[set v($nest)] #} else { # error "invalid spacestack item: $parent" #} } } finally { set is_parsing 0 } return $v(0) } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::decode ---}] } #decode toml to tomlish interp alias {} tomlish::from_toml {} tomlish::decode::toml namespace eval tomlish::utils { #*** !doctools #[subsection {Namespace tomlish::utils}] #[para] #[list_begin definitions] #tablenames (& tablearraynames) may contain irrelevant leading, trailing and interspersed whitespace # tablenames can be made up of segments delimited by dots. .eg [ a.b . c ] #trimmed, the tablename becomes {a.b.c} # A segment may contain whitespace if it is quoted e.g [a . b . "c etc " ] #ie whitespace is only irrelevant if it's outside a quoted segment #trimmed, the tablename becomes {a.b."c etc "} proc tablename_trim {tablename} { set segments [tablename_split $tablename false] set trimmed_segments [list] foreach seg $segments { lappend trimmed_segments [::string trim $seg " \t"] } return [join $trimmed_segments .] } #basic generic quote matching for single and double quotes #note for example that {[o'malley]} will return sq - as the single quote is not closed or wrapped in double quotes proc tok_in_quotedpart {tok} { set sLen [tcl::string::length $tok] set quote_type "" set had_slash 0 for {set i 0} {$i < $sLen} {incr i} { set c [tcl::string::index $tok $i] if {$quote_type eq ""} { if {$had_slash} { #don't enter quote mode #leave slash_mode because even if current char is slash - it is escaped set had_slash 0 } else { set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] switch -- $ctype { dq { set quote_type dq } sq { set quote_type sq } bsl { set had_slash 1 } } } } else { if {$had_slash} { #don't leave quoted mode #leave slash_mode because even if current char is slash - it is escaped set had_slash 0 } else { set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] switch -- $ctype { dq { if {$quote_type eq "dq"} { set quote_type "" } } sq { if {$quote_type eq "sq"} { set quote_type "" } } bsl { set had_slash 1 } } } } } return $quote_type ;#dq | sq } #utils::tablename_split proc tablename_split {tablename {normalize false}} { #we can't just split on . because we have to handle quoted segments which may contain a dot. #eg {dog."tater.man"} set sLen [tcl::string::length $tablename] set segments [list] set mode "unknown" ;#5 modes: unknown, quoted,litquoted, unquoted, syntax #quoted is for double-quotes, litquoted is for single-quotes (string literal) set seg "" for {set i 0} {$i < $sLen} {incr i} { if {$i > 0} { set lastChar [tcl::string::index $tablename [expr {$i - 1}]] } else { set lastChar "" } set c [tcl::string::index $tablename $i] if {$c eq "."} { switch -exact -- $mode { unquoted { #dot marks end of segment. lappend segments $seg set seg "" set mode "unknown" } quoted { append seg $c } unknown { lappend segments $seg set seg "" } litquoted { append seg $c } default { #mode: syntax #we got our dot. - the syntax mode is now satisfied. set mode "unknown" } } } elseif {($c eq "\"") && ($lastChar ne "\\")} { if {$mode eq "unknown"} { if {[tcl::string::trim $seg] ne ""} { #we don't allow a quote in the middle of a bare key error "tablename_split. character '\"' invalid at this point in tablename. tablename: '$tablename'" } set mode "quoted" set seg "\"" } elseif {$mode eq "unquoted"} { append seg $c } elseif {$mode eq "quoted"} { append seg $c lappend segments $seg set seg "" set mode "syntax" ;#make sure we only accept a dot or end-of-data now. } elseif {$mode eq "litquoted"} { append seg $c } elseif {$mode eq "syntax"} { error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" } } elseif {($c eq "\'")} { if {$mode eq "unknown"} { append seg $c set mode "litquoted" } elseif {$mode eq "unquoted"} { #single quote inside e.g o'neill append seg $c } elseif {$mode eq "quoted"} { append seg $c } elseif {$mode eq "litquoted"} { append seg $c lappend segments $seg set seg "" set mode "syntax" } elseif {$mode eq "syntax"} { error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" } } elseif {$c in [list " " \t]} { if {$mode eq "syntax"} { #ignore } else { append seg $c } } else { if {$mode eq "syntax"} { error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" } if {$mode eq "unknown"} { set mode "unquoted" } append seg $c } if {$i == $sLen-1} { #end of data ::tomlish::log::debug "End of data: mode='$mode'" switch -exact -- $mode { quoted { if {$c ne "\""} { error "tablename_split. missing closing double-quote in a segment. tablename: '$tablename'" } if {$normalize} { lappend segments $seg } else { lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] #lappend segments [subst -nocommands -novariables [::string range $seg 1 end-1]] ;#wrong } } litquoted { set trimmed_seg [tcl::string::trim $seg] if {[tcl::string::index $trimmed_seg end] ne "\'"} { error "tablename_split. missing closing single-quote in a segment. tablename: '$tablename'" } lappend segments $seg } unquoted - unknown { lappend segments $seg } syntax { #ok - segment already lappended } default { lappend segments $seg } } } } foreach seg $segments { set trimmed [tcl::string::trim $seg " \t"] #note - we explicitly allow 'empty' quoted strings '' & "" # (these are 'discouraged' but valid toml keys) #if {$trimmed in [list "''" "\"\""]} { # puts stderr "tablename_split. warning - Empty quoted string as tablename segment" #} if {$trimmed eq "" } { error "tablename_split. Empty segment found. tablename: '$tablename' segments [llength $segments] ($segments)" } } return $segments } proc unicode_escape_info {slashu} { #!todo # validate that slashu is either a \uxxxx or \Uxxxxxxxx value of the correct length and # is a valid 'unicode scalar value' # ie integers in the range 0 to D7FF16 and E00016 to 10FFFF16 inclusive #expr {(($x >= 0) && ($x <= 0xD7FF16)) || (($x >= 0xE00016) && ($x <= 0x10FFFF16))} if {[tcl::string::match {\\u*} $slashu]} { set exp {^\\u([0-9a-fA-F]{4}$)} if {[regexp $exp $slashu match hex]} { if {[scan $hex %4x dec] != 1} { #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? return [list err [list reason "Failed to convert '$hex' to decimal"]] } else { return [list ok [list char [subst -nocommand -novariable $slashu]]] } } else { return [list err [list reason "Supplied string not of the form \\uHHHH where H in \[0-9a-fA-F\]"]] } } elseif {[tcl::string::match {\\U*} $slashu]} { set exp {^\\U([0-9a-fA-F]{8}$)} if {[regexp $exp $slashu match hex]} { if {[scan $hex %8x dec] != 1} { #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? return [list err [list reason "Failed to convert '$hex' to decimal"]] } else { if {(($dec >= 0) && ($dec <= 0xD7FF16)) || (($dec >= 0xE00016) && ($dec <= 0x10FFFF16))} { return [list ok [list char [subst -nocommand -novariable $slashu]]] } else { return [list err [list reason "$slashu is not within the 'unicode scalar value' ranges 0 to 0xD7FF16 or 0xE00016 to 0x10FFFF16"]] } } } else { return [list err [list reason "Supplied string not of the form \\UHHHHHHHH where H in \[0-9a-fA-F\]"]] } } else { return [list err [list reason "Supplied string did not start with \\u or \\U" ]] } } proc unescape_string {str} { #note we can't just use Tcl subst because: # it also transforms \a (audible bell) and \v (vertical tab) which are not in the toml spec. # it would strip out backslashes inappropriately: e.g "\j" becomes just j # it recognizes other escapes which aren't approprite e.g \xhh and octal \nnn # it replaces\ with a single whitespace #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh set buffer "" set buffer4 "" ;#buffer for 4 hex characters following a \u set buffer8 "" ;#buffer for 8 hex characters following a \u set sLen [tcl::string::length $str] #we need to handle arbitrarily long sequences of backslashes. \\\\\ etc set slash_active 0 set unicode4_active 0 set unicode8_active 0 #!todo - check for invalid data in the form of a raw carriage return (decimal 13) without following linefeed? set i 0 for {} {$i < $sLen} {} { if {$i > 0} { set lastChar [tcl::string::index $str [expr {$i - 1}]] } else { set lastChar "" } set c [tcl::string::index $str $i] ::tomlish::log::debug "unescape_string. got char $c" scan $c %c n if {($n <= 31) && ($n != 9) && ($n != 10) && ($n != 13)} { #we don't expect unescaped unicode characters from 0000 to 001F - #*except* for raw tab (which is whitespace) and newlines error "unescape_string. Invalid data for a toml string. Unescaped control character (decimal $n) [::tomlish::utils::string_to_slashu $c]" } incr i ;#must incr here because we do'returns'inside the loop if {$c eq "\\"} { if {$slash_active} { append buffer "\\" set slash_active 0 } elseif {$unicode4_active} { error "unescape_string. unexpected case slash during unicode4 not yet handled" } elseif {$unicode8_active} { error "unescape_string. unexpected case slash during unicode8 not yet handled" } else { # don't output anything (yet) set slash_active 1 } } else { if {$unicode4_active} { if {[tcl::string::length $buffer4] < 4} { append buffer4 $c } if {[tcl::string::length $buffer4] == 4} { #we have a \uHHHH to test set unicode4_active 0 set result [tomlish::utils::unicode_escape_info "\\u$buffer4"] if {[lindex $result 0] eq "ok"} { append buffer [dict get $result ok char] } else { error "unescape_string error: [lindex $result 1]" } } } elseif {$unicode8_active} { if {[tcl::string::length $buffer8] < 8} { append buffer8 $c } if {[tcl::string::length $buffer8] == 8} { #we have a \UHHHHHHHH to test set unicode8_active 0 set result [tomlish::utils::unicode_escape_info "\\U$buffer8"] if {[lindex $result 0] eq "ok"} { append buffer [dict get $result ok char] } else { error "unescape_string error: [lindex $result 1]" } } } elseif {$slash_active} { set slash_active 0 set ctest [tcl::string::map {{"} dq} $c] switch -exact -- $ctest { dq { set e "\\\"" append buffer [subst -nocommand -novariable $e] } b - t - n - f - r { set e "\\$c" append buffer [subst -nocommand -novariable $e] } u { set unicode4_active 1 set buffer4 "" } U { set unicode8_active 1 set buffer8 "" } default { set slash_active 0 append buffer "\\" append buffer $c } } } else { append buffer $c } } } #puts stdout "EOF 4:$unicode4_active 8:$unicode8_active slash:$slash_active" if {$unicode4_active} { error "End of string reached before complete unicode escape sequence \uHHHH" } if {$unicode8_active} { error "End of string reached before complete unicode escape sequence \UHHHHHHHH" } if {$slash_active} { append buffer "\\" } return $buffer } proc normalize_key {rawkey} { set c1 [tcl::string::index $rawkey 0] set c2 [tcl::string::index $rawkey end] if {($c1 eq "'") && ($c2 eq "'")} { #single quoted segment. No escapes allowed within it. set key [tcl::string::range $rawkey 1 end-1] } elseif {($c1 eq "\"") && ($c2 eq "\"")} { #double quoted segment. Apply escapes. # set keydata [tcl::string::range $rawkey 1 end-1] ;#strip outer quotes only set key [::tomlish::utils::unescape_string $keydata] #set key [subst -nocommands -novariables $keydata] ;#wrong. Todo - create a string escape substitution function. } else { set key $rawkey } return $key } proc string_to_slashu {string} { set rv {} foreach c [split $string {}] { scan $c %c c append rv {\u} append rv [format %.4X $c] } return $rv } #'nonprintable' is conservative here because some systems (e.g windows console) are very limited in what they can display. proc nonprintable_to_slashu {s} { set res "" foreach i [split $s ""] { scan $i %c c set printable 0 if {($c>31) && ($c<127)} { set printable 1 } if {$printable} {append res $i} else {append res \\u[format %.4X $c]} } set res } ;#RS #check if str is valid for use as a toml bare key proc is_barekey {str} { if {[tcl::string::length $str] == 0} { return 0 } else { set matches [regexp -all {[a-zA-Z0-9\_\-]} $str] if {[tcl::string::length $str] == $matches} { #all characters match the regexp return 1 } else { return 0 } } } #test only that the characters in str are valid for the toml specified type 'integer'. proc int_validchars1 {str} { set numchars [tcl::string::length $str] if {[regexp -all {[0-9\_\-\+]} $str] == $numchars} { return 1 } else { return 0 } } #add support for hex,octal,binary 0x.. 0o.. 0b... proc int_validchars {str} { set numchars [tcl::string::length $str] if {[regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] == $numchars} { return 1 } else { return 0 } } proc is_int {str} { set matches [regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] if {[tcl::string::length $str] == $matches} { #all characters in legal range # --------------------------------------- #check for leading zeroes in non 0x 0b 0o #first strip any +, - or _ (just for this test) set check [tcl::string::map {+ "" - "" _ ""} $str] if {([tcl::string::length $check] > 1) && ([tcl::string::index $check 0] eq "0") && ([tcl::string::index $check 1] ni {o x b})} { return 0 } # --------------------------------------- #check +,- only occur in the first position. if {[tcl::string::last - $str] > 0} { return 0 } if {[tcl::string::last + $str] > 0} { return 0 } set numeric_value [tcl::string::map {_ ""} $str] ;#allow some earlier tcl versions which don't support underscores #use Tcl's integer check to ensure we don't let things like 3e4 through - which is a float (would need to be 0x3e4 for hex) if {![tcl::string::is integer -strict $numeric_value]} { return 0 } #!todo - check bounds only based on some config value #even though Tcl can handle bignums, we won't accept anything outside of toml 1.0 minimum requirements. #presumably very large numbers would have to be supplied in a toml file as strings. #Review - toml 1.0 only says that it must handle up to 2^63 - not that this is a max if {$numeric_value > $::tomlish::max_int} { return 0 } if {$numeric_value < $::tomlish::min_int} { return 0 } } else { return 0 } #Got this far - didn't find anything wrong with it. return 1 } #test only that the characters in str are valid for the toml specified type 'float'. proc float_validchars {str} { set numchars [tcl::string::length $str] if {[regexp -all {[eE0-9\_\-\+\.]} $str] == $numchars} { return 1 } else { #only allow lower case for these special values - as per Toml 1.0 spec if {$str ni {inf +inf -inf nan +nan -nan}} { return 0 } else { return 1 } } } proc is_float {str} { set matches [regexp -all {[eE0-9\_\-\+\.]} $str] #don't test for case variations - as Toml doesn't allow (whereas Tcl allows Inf NaN etc) if {$str in {inf +inf -inf nan +nan -nan}} { return 1 } if {[tcl::string::length $str] == $matches} { #all characters in legal range #A leading zero is ok, but we should disallow multiple leading zeroes (same rules as toml ints) #Toml spec also disallows leading zeros in the exponent part #... but this seems less interoperable (some libraries generate leading zeroes in exponents) #for now we will allow leading zeros in exponents #!todo - configure 'strict' option to disallow? #first strip any +, - or _ (just for this test) set check [tcl::string::map {+ "" - "" _ ""} $str] set r {([0-9])*} regexp $r $check intpart ;#intpart holds all numerals before the first .,e or E set z {([0])*} regexp $z $intpart leadingzeros if {[tcl::string::length $leadingzeros] > 1} { return 0 } #for floats, +,- may occur in multiple places #e.g -2E-22 +3e34 #!todo - check bounds ? #strip underscores for tcl double check set check [tcl::string::map {_ ""} $str] #string is double accepts inf nan +NaN etc. if {![tcl::string::is double $check]} { return 0 } } else { return 0 } #Got this far - didn't find anything wrong with it. return 1 } #test only that the characters in str are valid for the toml specified type 'datetime'. proc datetime_validchars {str} { set numchars [tcl::string::length $str] if {[regexp -all {[zZtT0-9\-\+\.:]} $str] == $numchars} { return 1 } else { return 0 } } #review - we proc is_datetime {str} { #e.g 1979-05-27 #e.g 1979-05-27T00:32:00Z #e.g 1979-05-27 00:32:00-07:00 #e.g 1979-05-27 00:32:00+10:00 #e.g 1979-05-27 00:32:00.999999-07:00 #review #minimal datetimes? # 2024 ok - shortest valid 4 digit year? # 02:00 ok # 05-17 ok if {[string length $str] < 4} { return 0 } set matches [regexp -all {[zZtT0-9\-\+\.:]} $str] if {[tcl::string::length $str] == $matches} { #all characters in legal range #!todo - use full RFC 3339 parser? lassign [split $str T] datepart timepart #!todo - what if the value is 'time only'? #Tcl's free-form clock scan (no -format option) is deprecated # #if {[catch {clock scan $datepart} err]} { # puts stderr "tcl clock scan failed err:'$err'" # return 0 #} #!todo - verify time part is reasonable } else { return 0 } return 1 } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::utils ---}] } namespace eval tomlish::parse { #*** !doctools #[subsection {Namespace tomlish::parse}] #[para] #[list_begin definitions] #This is a somewhat curly mix of a statemachine and toml-nesting-stack littered with special cases. #The code is a pig's-nest - but it should be noted that for example trailing single double quotes in multiline strings are perhaps not so trivial to parse using more standard methods either: # - e.g some kind of backtracking required if using an ABNF parser? #I don't know the precise technical name for this sort of parser; probably something like "Dog's Breakfast" #More seriously, we don't have distinct lex/parse steps - so it is basically a 'fused lexer' or 'scannerless parser' #It is also desirable for this system to be useful in 'interactive' use. review - would a separate lexer make this easier or harder? #A possible alternative more structured approach might be to use a PEG (Parsing Expression Grammar) variable is_parsing 0 ;#whether we are in the middle of parsing tomlish text variable state # states: # table-space, itable-space, array-space # value-expected, keyval-syntax, # quoted-key, squoted-key # string-state, literal-state, multistring... # # notes: # only the -space states are also 'spaces' ie a container which is pushed/popped on the spacestack # # value-expected - we also allow for leading whitespace in this state, but once a value is returned we jump to a state based on the containing space. e.g keyval-tail or array-syntax # #stateMatrix defines for each state, actions to take for each possible token. #single-element actions are the name of the next state into which to transition, or a 'POPSPACE' instruction to pop a level off the spacestack and add the data to the parent container. #dual-element actions are a push instruction and the name of the space to push on the stack. # - PUSHSPACE is a simple push onto the spacestack, zeropoppushspace also pushes, but will first do a pop *if* the current space level is greater than zero (ie if only if not already in root table-space) # -- --- --- --- --- --- #token/state naming guide # -- --- --- --- --- --- #tokens : underscore separated or bare name e.g newline, start_quote, start_squote #private tokens: always have a leading underscore (These are private 'temporary state' tokens that are never returned as actual tokens e.g _start_squote_sequence #states : always contain at least one dash e.g err-state, table-space #instructions # -- --- --- --- --- --- #stateMatrix dict of elements mapping current state to next state based on returned tokens # current-state {token-encountered next-state ... } # where next-state can be a 1 or 2 element list. #If 2 element - the first item is an instruction (ucase) #If 1 element - it is either a lowercase dashed state name or an ucase instruction #e.g {PUSHSPACE } or POPSPACE or SAMESPACE #SAMESPACE - got to same space as parent without popping a level, but has it's own autotransition lookup - strange concept - review usecases variable stateMatrix set stateMatrix [dict create] #xxx-space vs xxx-syntax inadequately documented - TODO # --------------------------------------------------------------------------------------------------------------# # incomplete example of some state starting at table-space # --------------------------------------------------------------------------------------------------------------# # ( = -> value-expected) # keyval-syntax (popped -> keyval-space -> keyval-tail) (autotransition on pop) # keyval-space (autotransition on push ^) # table-space (barekey^) (startquote -> quoted-key ^) # --------------------------------------------------------------------------------------------------------------# dict set stateMatrix\ table-space { bom "table-space"\ whitespace "table-space"\ newline "table-space"\ barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ startquote "quoted-key"\ XXXstartsquote "squoted-key"\ comment "table-space"\ starttablename "tablename-state"\ starttablearrayname "tablearrayname-state"\ startmultiquote "err-state"\ endquote "err-state"\ comma "err-state"\ eof "end-state"\ equal "err-state"\ } #itable-space/ curly-syntax : itables dict set stateMatrix\ itable-space {\ whitespace "itable-space"\ newline "itable-space"\ squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-space starttok {squote_seq "'"}}\ barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ endinlinetable "POPSPACE"\ startquote "quoted-key"\ startsquote {TOSTATE "squoted-key" comment "jn-testing"}\ comma "itable-space"\ comment "err-state"\ eof "err-state"\ } dict set stateMatrix\ keyval-space {\ whitespace "keyval-syntax"\ equal "keyval-value-expected"\ } # ' = ' portion of keyval dict set stateMatrix\ keyval-syntax {\ whitespace "keyval-syntax"\ squotedkey {PUSHSPACE "dottedkey-space"}\ barekey {PUSHSPACE "dottedkey-space"}\ equal "keyval-value-expected"\ comma "err-state"\ newline "err-state"\ eof "err-state"\ } #### dict set stateMatrix\ keyval-value-expected {\ whitespace "keyval-value-expected"\ untyped_value {TOSTATE "keyval-tail" note ""}\ squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-value-expected starttok {squote_seq "'"}}\ startquote {TOSTATE "string-state" returnstate keyval-tail}\ startmultiquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ startsquote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"}\ triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ startarray {PUSHSPACE array-space returnstate keyval-tail}\ } #squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-process-leading-squotes starttok {squote_seq "'"}} dict set stateMatrix\ leading-squote-space {\ squote_seq "POPSPACE"\ } #dict set stateMatrix\ # keyval-process-leading-squotes {\ # startsquote "literal-state"\ # triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ # } dict set stateMatrix\ keyval-tail {\ whitespace "keyval-tail"\ newline "POPSPACE"\ comment "keyval-tail"\ eof "end-state"\ } dict set stateMatrix\ itable-keyval-syntax {\ whitespace "itable-keyval-syntax"\ squotedkey {PUSHSPACE "dottedkey-space"}\ barekey {PUSHSPACE "dottedkey-space"}\ equal "itable-keyval-value-expected"\ newline "err-state"\ eof "err-state"\ } dict set stateMatrix\ itable-keyval-value-expected {\ whitespace "itable-keyval-value-expected"\ untyped_value {TOSTATE "itable-val-tail" note ""}\ squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-keyval-value-expected starttok {squote_seq "'"}}\ startquote {TOSTATE "string-state" returnstate itable-val-tail}\ startmultiquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ startsquote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"}\ triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ } dict set stateMatrix\ itable-keyval-space {\ whitespace "itable-keyval-syntax"\ equal {TOSTATE "itable-keyval-value-expected" note "required"}\ } dict set stateMatrix\ itable-val-tail {\ whitespace "itable-val-tail"\ endinlinetable "POPSPACE"\ comma "POPSPACE"\ Xnewline {TOSTATE "itable-val-tail" note "itable-space ??"}\ newline "err-state"\ comment "itable-val-tail"\ eof "err-state"\ } #dict set stateMatrix\ # itable-quoted-key {\ # whitespace "NA"\ # itablequotedkey {PUSHSPACE "itable-keyval-space"}\ # newline "err-state"\ # endquote "itable-keyval-syntax"\ # } #dict set stateMatrix\ # itable-squoted-key {\ # whitespace "NA"\ # itablesquotedkey {PUSHSPACE "itable-keyval-space"}\ # newline "err-state"\ # endsquote "itable-keyval-syntax"\ # } dict set stateMatrix\ value-expected {\ whitespace "value-expected"\ untyped_value {"SAMESPACE" "" replay untyped_value}\ startquote "string-state"\ startsquote "literal-state"\ startmultiquote {PUSHSPACE "multistring-space"}\ triple_squote {PUSHSPACE "multiliteral-space"}\ startinlinetable {PUSHSPACE itable-space}\ startarray {PUSHSPACE array-space}\ comment "err-state-value-expected-got-comment"\ comma "err-state"\ newline "err-state"\ eof "err-state"\ } #dottedkey-space is not used within [tablename] or [[tablearrayname]] #it is for keyval ie x.y.z = value dict set stateMatrix\ dottedkey-space {\ whitespace "dottedkey-space"\ dotsep "dottedkey-space"\ barekey "dottedkey-space"\ squotedkey "dottedkey-space"\ quotedkey "dottedkey-space"\ equal "POPSPACE"\ newline "err-state"\ comma "err-state"\ comment "err-state"\ } #dottedkeyend "POPSPACE" #REVIEW #toml spec looks like heading towards allowing newlines within inline tables #https://github.com/toml-lang/toml/issues/781 dict set stateMatrix\ curly-syntax {\ whitespace "curly-syntax"\ newline "curly-syntax"\ barekey {PUSHSPACE "itable-keyval-space"}\ itablequotedkey "itable-keyval-space"\ endinlinetable "POPSPACE"\ startquote "itable-quoted-key"\ comma "itable-space"\ comment "itable-space"\ eof "err-state"\ } #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES #We currently allow multiline ITABLES (also with comments) in the tokenizer. #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? #JMN REVIEW dict set stateMatrix\ array-space {\ whitespace "array-space"\ newline "array-space"\ untyped_value "SAMESPACE"\ startarray {PUSHSPACE "array-space"}\ endarray "POPSPACE"\ startmultiquote {PUSHSPACE multistring-space}\ startinlinetable {PUSHSPACE itable-space}\ startquote "string-state"\ startsquote "literal-state"\ triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ comma "array-space"\ comment "array-space"\ eof "err-state-array-space-got-eof"\ } dict set stateMatrix\ array-syntax {\ whitespace "array-syntax"\ newline "array-syntax"\ untyped_value "SAMESPACE"\ startarray {PUSHSPACE array-space}\ endarray "POPSPACE"\ startmultiquote {PUSHSPACE multistring-space}\ startquote "string-state"\ startsquote "literal-state"\ comma "array-space"\ comment "err-state"\ } #quoted-key & squoted-key need to PUSHSPACE from own token to keyval-space dict set stateMatrix\ quoted-key {\ whitespace "NA"\ quotedkey {PUSHSPACE "keyval-space"}\ newline "err-state"\ endquote "keyval-syntax"\ } dict set stateMatrix\ squoted-key {\ whitespace "NA"\ squotedkey "squoted-key"\ newline "err-state"\ } # endsquote {PUSHSPACE "keyval-space"} dict set stateMatrix\ string-state {\ whitespace "NA"\ string "string-state"\ endquote "SAMESPACE"\ newline "err-state"\ eof "err-state"\ } dict set stateMatrix\ literal-state {\ whitespace "NA"\ literal "literal-state"\ endsquote "SAMESPACE"\ newline "err-state"\ eof "err-state"\ } #dict set stateMatrix\ # stringpart {\ # continuation "SAMESPACE"\ # endmultiquote "POPSPACE"\ # eof "err-state"\ # } dict set stateMatrix\ multistring-space {\ whitespace "multistring-space"\ continuation "multistring-space"\ stringpart "multistring-space"\ newline "multistring-space"\ endmultiquote "POPSPACE"\ eof "err-state"\ } #only valid subparts are literalpart and newline. other whitespace etc is within literalpart #todo - treat sole cr as part of literalpart but crlf and lf as newline dict set stateMatrix\ multiliteral-space {\ literalpart "multiliteral-space"\ newline "multiliteral-space"\ squote_seq_begin {PUSHSPACE "trailing-squote-space" returnstate multiliteral-space starttok {squote_seq "'"}}\ triple_squote {POPSPACE note "on popping - we do any necessary concatenation of LITERALPART items due to squote processing"}\ double_squote {TOSTATE multiliteral-space note "short squote_seq: can occur anywhere in the space e.g emitted at end when 5 squotes occur"}\ startsquote {TOSTATE multiliteral-space note "short squote_seq: same as double_squote - false alarm"}\ eof "err-premature-eof-in-multiliteral-space"\ } #trailing because we are looking for possible terminating ''' - but must accept '''' or ''''' and re-integrate the 1st one or 2 extra squotes dict set stateMatrix\ trailing-squote-space {\ squote_seq "POPSPACE"\ } dict set stateMatrix\ tablename-state {\ whitespace "NA"\ tablename {zeropoppushspace table-space}\ tablename2 {PUSHSPACE table-space}\ endtablename "tablename-tail"\ comma "err-state"\ newline "err-state"\ } dict set stateMatrix\ tablearrayname-state {\ whitespace "NA"\ tablearrayname {zeropoppushspace table-space}\ tablearrayname2 {PUSHSPACE table-space}\ endtablearray "tablearrayname-tail"\ comma "err-state"\ newline "err-state"\ } dict set stateMatrix\ tablename-tail {\ whitespace "tablename-tail"\ newline "table-space"\ comment "tablename-tail"\ eof "end-state"\ } dict set stateMatrix\ tablearrayname-tail {\ whitespace "tablearrayname-tail"\ newline "table-space"\ comment "tablearrayname-tail"\ eof "end-state"\ } dict set stateMatrix\ end-state {} set knowntokens [list] set knownstates [list] dict for {state transitions} $stateMatrix { if {$state ni $knownstates} {lappend knownstates $state} dict for {tok instructions} $transitions { if {$tok ni $knowntokens} {lappend knowntokens $tok} } } dict set stateMatrix nostate {} foreach tok $knowntokens { dict set stateMatrix nostate $tok "err-nostate-received-token-$tok" } # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- #purpose - debugging? remove? # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- #build a list of 'push triggers' from the stateMatrix # ie tokens which can push a new space onto spacestack set push_trigger_tokens [list] tcl::dict::for {s transitions} $stateMatrix { tcl::dict::for {token transition_to} $transitions { set instruction [lindex $transition_to 0] switch -exact -- $instruction { PUSHSPACE - zeropoppushspace { if {$token ni $push_trigger_tokens} { lappend push_trigger_tokens $token } } } } } ::tomlish::log::debug "push_trigger_tokens: $push_trigger_tokens" # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- #This seems hacky... (deprecate in favour of explicit arguments to the instructions in stateMatrix?) #spacePopTransitions, spacePushTransitions, spaceSameTransitions below for auto state redirections on POPSPACE,PUSHSPACE,SAMESPACE #mainly for the -space states: #redirect to another state $c based on a state transition from $whatever to $b # e.g "string {array-space array-syntax}" means when transitioning from string to array-space, jump to array-syntax instead. #this is useful as we often don't know state $b. e.g when it is decided by 'POPSPACE' #Push to, next #default first states when we push to these spaces variable spacePushTransitions { keyval-space keyval-syntax itable-keyval-space itable-keyval-syntax array-space array-space table-space tablename-state } #itable-space itable-space #Pop to, next variable spacePopTransitions { array-space array-syntax } #itable-space curly-syntax #itable-keyval-space itable-val-tail #review #we pop to keyval-space from dottedkey-space or from value-expected? we don't always want to go to keyval-tail #leave it out and make the POPSPACE caller explicitly specify it #keyval-space keyval-tail variable spaceSameTransitions { array-space array-syntax } #itable-space curly-syntax #itable-keyval-space itable-val-tail variable state_list ;#reset every tomlish::decode::toml namespace export tomlish toml namespace ensemble create #goNextState has various side-effects e.g pushes and pops spacestack #REVIEW - setting nest and v elements here is ugly #todo - make neater, more single-purpose? proc goNextState {tokentype tok currentstate} { variable state variable nest variable v set prevstate $currentstate variable spacePopTransitions variable spacePushTransitions variable spaceSameTransitions variable last_space_action "none" variable last_space_type "none" variable state_list set result "" set starttok "" if {[dict exists $::tomlish::parse::stateMatrix $currentstate $tokentype]} { set transition_to [dict get $::tomlish::parse::stateMatrix $currentstate $tokentype] ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" switch -exact -- [lindex $transition_to 0] { POPSPACE { spacestack pop set parent_info [spacestack peek] set type [dict get $parent_info type] set parentspace [dict get $parent_info state] set last_space_action "pop" set last_space_type $type if {[dict exists $parent_info returnstate]} { set next [dict get $parent_info returnstate] #clear the returnstate on current level set existing [spacestack pop] dict unset existing returnstate spacestack push $existing ;#re-push modification ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected to stored returnstate $next <<---" } else { ### #review - do away with spacePopTransitions - which although useful to provide a default.. # - involve error-prone configurations distant to the main state transition configuration in stateMatrix if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" } else { set next $parentspace ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace<<---" } } set result $next } SAMESPACE { set currentspace_info [spacestack peek] ::tomlish::log::debug "--->> SAMESPACE got current space entry: $currentspace_info <<<<<" set type [dict get $currentspace_info type] set currentspace [dict get $currentspace_info state] if {[dict exists $currentspace_info returnstate]} { set next [dict get $currentspace_info returnstate] #clear the returnstate on current level set existing [spacestack pop] dict unset existing returnstate spacestack push $existing ;#re-push modification ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected to stored returnstate $next" } else { if {[dict exists $::tomlish::parse::spaceSameTransitions $currentspace]} { set next [dict get $::tomlish::parse::spaceSameTransitions $currentspace] ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected state to $next (spaceSameTransitions)" } else { set next $currentspace ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace" } } set result $next } zeropoppushspace { if {$nest > 0} { #pop back down to the root level (table-space) spacestack pop set parentinfo [spacestack peek] set type [dict get $parentinfo type] set target [dict get $parentinfo state] set last_space_action "pop" set last_space_type $type #----- #standard pop set parentlevel [expr {$nest -1}] lappend v($parentlevel) [set v($nest)] incr nest -1 #----- } #re-entrancy #set next [list PUSHSPACE [lindex $transition_to 1]] set nexttokentype ${tokentype}2 ;#fake token type e.g tablename2 or tablearrayname2 #::tomlish::log::notice "goNextState REENTRANCY. calling goNextState $nexttokentype $tokentype" #set result [::tomlish::parse::goNextState $nexttokentype $tokentype] ::tomlish::log::debug "--->> zeropoppushspace goNextState REENTRANCY. calling goNextState $nexttokentype $currentstate" set transition_info [::tomlish::parse::goNextState $nexttokentype $tok $currentstate] set result [dict get $transition_info newstate] } PUSHSPACE { set original_target [dict get $transition_to PUSHSPACE] if {[dict exists $transition_to returnstate]} { #adjust the existing space record on the stack. #struct::stack doesn't really support that - so we have to pop and re-push #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack set currentspace [spacestack pop] dict set currentspace returnstate [dict get $transition_to returnstate] spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. } if {[dict exists $transition_to starttok]} { set starttok [dict get $transition_to starttok] } spacestack push [dict create type space state $original_target] set last_space_action "push" set last_space_type "space" if {[dict exists $transition_to state]} { #an explicit state in the pushed space was requested in the stateMatrix - override the spacePushTransition (spacePushTransitions can be deprecated if we require explicitness?) set next [dict get $transition_to state] ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next by explicit 'state' entry" } else { #puts $::tomlish::parse::spacePushTransitions if {[dict exists $::tomlish::parse::spacePushTransitions $original_target]} { set next [dict get $::tomlish::parse::spacePushTransitions $original_target] ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next (spacePushTransitions) " } else { set next $original_target ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target" } } set result $next } TOSTATE { if {[dict exists $transition_to returnstate]} { #adjust the existing space record on the stack. #struct::stack doesn't really support that - so we have to pop and re-push #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack set currentspace [spacestack pop] dict set currentspace returnstate [dict get $transition_to returnstate] spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. } set result [dict get $transition_to TOSTATE] } default { #simplified version of TOSTATE set result [lindex $transition_to 0] ;#ignore everything but first word } } } else { ::tomlish::log::error "--->> No state transition defined from state $currentstate when tokentype $tokentype received" set result "nostate" } lappend state_list [list tokentype $tokentype from $currentstate to $result] set state $result ::tomlish::log::notice "--->> STATE TRANSITION tokenType: '$tokentype' tok:$tok triggering '$currentstate' -> '$result' last_space_action:$last_space_action " return [dict create prevstate $prevstate newstate $result space_action $last_space_action starttok $starttok] } proc report_line {{line ""}} { variable linenum variable is_parsing if {$is_parsing} { if {$line eq ""} { set line $linenum } return "Line Number: $line" } else { #not in the middle of parsing tomlish text - return nothing. return "" } } #produce a *slightly* more readable string rep of the nest for puts etc. proc nest_pretty1 {list} { set prettier "{" foreach el $list { if { [lindex $el 0] eq "NEWLINE"} { append prettier "[list $el]\n" } elseif {([llength $el] > 1) && ([lindex $el 0] in {KEY QKEY SQKEY TABLE ARRAY})} { append prettier [nest_pretty1 $el] } else { append prettier "[list $el] " } } append prettier "}" return $prettier } proc set_tokenType {t} { variable tokenType variable tokenType_list if {![info exists tokenType]} { set tokenType "" } lappend tokenType_list $t set tokenType $t } proc switch_tokenType {t} { variable tokenType variable tokenType_list lset tokenType_list end $t set tokenType $t } proc get_tokenType {} { variable tokenType return $tokenType } proc _shortcircuit_startquotesequence {} { variable tok variable i set toklen [tcl::string::length $tok] if {$toklen == 1} { set_tokenType "startquote" incr i -1 return -level 2 1 } elseif {$toklen == 2} { puts stderr "_shortcircuit_startquotesequence toklen 2" set_tokenType "startquote" set tok "\"" incr i -2 return -level 2 1 } } proc get_token_waiting {} { variable token_waiting return [lindex $token_waiting 0] } proc clear_token_waiting {} { variable token_waiting set token_waiting [list] } #token_waiting is a list - but our standard case is to have only one #in certain circumstances such as near eof we may have 2 #the set_token_waiting function only allows setting when there is not already one waiting. #we want to catch cases of inadvertently trying to set multiple # - the reason being that the state transition triggered by the previous token may have invalidated the assumptions made when a token was added as waiting. proc set_token_waiting {args} { if {[llength $args] %2 != 0} { error "set_token_waiting must have args of form: type value complete 0|1" } variable token_waiting if {[llength $token_waiting] && [dict get [lindex $token_waiting end] type] ne "eof"} { #tokloop already set a token_waiting - but something (post tokloop processing?) is trying to set another #we may need to remove the existing token_waiting and reset the tokloop index to the previous char so it's reprocessed in the possibly new context #rather than attempt to make the right decision here - we raise an error and require the caller to check/handle it set err "set_token_waiting already has token_waiting: [lindex $token_waiting 0]" append err \n " - cannot add token_waiting: $args" error $err #set tomlish::parse::i [expr {[dict get $token_waiting startindex] -1}] #set token_waiting [list] } set waiting [dict create] dict for {k v} $args { switch -exact $k { type - complete { dict set waiting $k $v } value { dict set waiting tok $v } startindex { dict set waiting startindex $v } default { error "set_token_waiting error - unrecognised key $k. known keys: [dict keys $args]" } } } if {![tcl::string::is boolean -strict [dict get $waiting complete]]} { error "set_token_waiting error - 'complete' must be a boolean. got [dict get $waiting complete]" } if {![llength $token_waiting]} { set token_waiting [list $waiting] } else { #an extra sanity-check that we don't have more than just the eof.. if {[llength $token_waiting] > 1} { set err "Unexpected. Existing token_waiting count > 1.\n" foreach tw $token_waiting { append err " $tw" \n } append err " - cannot add token_waiting: $waiting" error $err } #last entry must be a waiting eof set token_waiting [list $waiting [lindex $token_waiting end]] } return } #returns 0 or 1 #tomlish::parse::tok #we attempt to do this without lookahead (potential use in streaming toml? for what benefit?) todo -final flag # - the possible benefit is being able to more easily process in arbitrarily split chunks (although we would still have to watch crlf splitting ?) # - interactive use? proc tok {s} { variable nest variable v variable i variable tok variable type ;#character type variable state ;#FSM variable tokenType variable tokenType_list variable endToken variable lastChar variable braceCount variable bracketCount #------------------------------ #Previous run found another (presumably single-char) token #The normal case is for there to be only one dict in the list #multiple is an exception - primarily for eof variable token_waiting if {[llength $token_waiting]} { set waiting [lindex $token_waiting 0] set tokenType [dict get $waiting type] set tok [dict get $waiting tok] #todo: dict get $token_waiting complete set token_waiting [lrange $token_waiting 1 end] return 1 } #------------------------------ set resultlist [list] set sLen [tcl::string::length $s] set slash_active 0 set quote 0 set c "" set multi_dquote "" for {} {$i < $sLen} {} { if {$i > 0} { set lastChar [tcl::string::index $s [expr {$i - 1}]] } else { set lastChar "" } set c [tcl::string::index $s $i] set cindex $i tomlish::log::debug "- tokloop char <$c> index $i tokenType:$tokenType tok:<$tok>" #puts "got char $c during tokenType '$tokenType'" incr i ;#must incr here because we do returns inside the loop set ctest [tcl::string::map {\{ lc \} rc \[ lb \] rb \" dq ' sq \\ bsl \r cr \n lf \t tab \uFEFF bom} $c] switch -exact -- $ctest { # { set dquotes $multi_dquote set multi_dquote "" set had_slash $slash_active set slash_active 0 if {$had_slash} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { squote_seq { incr i -1 return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } barekey { error "Unexpected character '$c' during bare key. Only \[a-zA-Z_-\] allowed. [tomlish::parse::report_line]" } whitespace { # hash marks end of whitespace token #do a return for the whitespace, set token_waiting #set_token_waiting type comment value "" complete 1 incr i -1 ;#leave comment for next run return 1 } untyped_value { #REVIEW! the spec isn't clear.. is whitespace after an int,bool etc required before comment? #we will accept a comment marker as an immediate terminator of the untyped_value. incr i -1 return 1 } starttablename - starttablearrayname { #fix! error "Character '#' is invalid first character for $tokenType. [tomlish::parse::report_line]" } tablename - tablearrayname { #invalid in bare parts - but allowed in quoted parts - let tablename parser sort it out append tok $c } default { #quotedkey, itablequotedkey, string,literal, multistring append tok $c } } } else { switch -- $state { multistring-space { set_tokenType stringpart set tok "" if {$had_slash} { append tok "\\" } append tok "$dquotes#" } multiliteral-space { set_tokenType "literalpart" set tok "#" } default { #start of token if we're not in a token set_tokenType comment set tok "" ;#The hash is not part of the comment data } } } } lc { #left curly brace set dquotes $multi_dquote set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { squote_seq { incr i -1 return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } literal - literalpart - squotedkey - itablesquotedkey { append tok $c } string - quotedkey - itablequotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} append tok $dquotes$c } starttablename - starttablearrayname { #*bare* tablename can only contain letters,digits underscores error "Invalid tablename first character \{ [tomlish::parse::report_line]" } tablename - tablearrayname { #valid in quoted parts append tok $c } comment { if {$had_slash} {append tok "\\"} append tok "\[" } default { #end any other token. incr i -1 return 1 } } } else { switch -exact -- $state { itable-keyval-value-expected - keyval-value-expected - value-expected { #switch last key to tablename?? set_tokenType "startinlinetable" set tok "\{" return 1 } array-space - array-syntax { #nested anonymous inline table set_tokenType "startinlinetable" set tok "\{" return 1 } table-space { #invalid - but allow parser statemachine to report it. ? set_tokenType "startinlinetable" set tok "\{" return 1 } multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} { append tok "\\" } append tok "$dquotes\{" } multiliteral-space { set_tokenType "literalpart" set tok "\{" } default { error "state: '$state'. left brace case not implemented [tomlish::parse::report_line]" } } } } rc { #right curly brace set dquotes $multi_dquote set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { squote_seq { incr i -1 return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } literal - literalpart - squotedkey - itablesquotedkey { append tok $c } string - quotedkey - itablequotedkey - comment { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} append tok $dquotes$c } starttablename - tablename { if {$had_slash} {append tok "\\"} #invalid! - but leave for datastructure loading stage to catch set_token_waiting type endinlinetable value "" complete 1 startindex $cindex return 1 } starttablearrayname - tablearrayname { if {$had_slash} {append tok "\\"} #invalid! - but leave for datastructure loading stage to catch set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex return 1 } itable-val-tail { #review error "right-curly in itable-val-tail" } default { #end any other token incr i -1 return 1 } } } else { #$slash_active not relevant when no tokenType switch -exact -- $state { value-expected { #invalid - but allow parser statemachine to report it. set_tokenType "endinlinetable" set tok "\}" return 1 } table-space { #invalid - but allow parser statemachine to report it. ? set_tokenType "endinlinetable" set tok "\}" return 1 } itable-space { set_tokenType "endinlinetable" set tok "\}" return 1 } tablename-state { #e.g [] - empty tablename - allowed or not? #empty tablename/tablearrayname ? #error "unexpected tablename problem" set_tokenType "endinlinetable" set tok "" ;#no output into the tomlish list for this token return 1 } tablearrayname-state { error "unexpected tablearrayname-state problem" set_tokenType "endinlinetable" set tok "" ;#no output into the tomlish list for this token return 1 } array-syntax - array-space { #invalid set_tokenType "endinlinetable" set tok "\}" return 1 } curly-syntax { set_tokenType "endinlinetable" set tok "\}" return 1 } itable-val-tail { set_tokenType "endinlinetable" set tok "" #we need to pop the keyval - and then reprocess to pop the inlinetable - so we incr -1 incr i -1 return 1 } itable-keyval-syntax { error "endinlinetable unexpected at this point. Expecting key=val syntax [tomlish::parse::report_line]" } multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} { append tok "\\" } append tok "$dquotes\}" } multiliteral-space { set_tokenType "literalpart" ; #review set tok "\}" } default { #JMN2024b keyval-tail? error "state '$state'. endinlinetable case not implemented [tomlish::parse::report_line]" } } } } lb { #left square bracket set dquotes $multi_dquote set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { squote_seq { incr i -1 return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } literal - literalpart - squotedkey - itablesquotedkey { append tok $c } string - quotedkey - itablequotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} append tok $dquotes$c } starttablename { #change the tokenType switch_tokenType "starttablearrayname" set tok "" ;#no output into the tomlish list for this token #any following whitespace is part of the tablearrayname, so return now return 1 } tablename { #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token if {$had_slash} { #resultant tablename may be invalid - but leave for datastructure loading stage to catch append tok "\\[" } else { if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { #invalid at this point - state machine should disallow table -> starttablearrayname set_token_waiting type starttablearrayname value "" complete 1 startindex $cindex return 1 } else { #we appear to still be in single or double quoted section append tok "\[" } } } comment { if {$had_slash} {append tok "\\"} append tok "\[" } default { #end any other token. incr i -1 return 1 } } } else { #$slash_active not relevant when no tokenType switch -exact -- $state { keyval-value-expected - itable-keyval-value-expected - value-expected { set_tokenType "startarray" set tok "\[" return 1 } table-space { #table name #assume it's a single bracket - but we need to wait for non-bracket to confirm it's not a tablearray #note that a starttablearrayname token may contain whitespace between the brackets # e.g \[ \[ set_tokenType "starttablename" set tok "" ;#there is no output into the tomlish list for this token } array-space - array-syntax { #nested array? set_tokenType "startarray" set tok "\[" return 1 #error "state: array-space. startarray case not implemented [tomlish::parse::report_line]" } multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} { append tok "\\" } append tok "$dquotes\[" } multiliteral-space { set_tokenType "literalpart" set tok "\[" } default { error "state: '$state'. startarray case not implemented [tomlish::parse::report_line]" } } } } rb { #right square bracket set dquotes $multi_dquote set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { squote_seq { incr i -1 return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } literal - literalpart - squotedkey - itablesquotedkey { append tok $c } string - quotedkey - itablequotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} append tok $dquotes$c } comment { if {$had_slash} {append tok "\\"} append tok $c } whitespace { if {$state eq "multistring-space"} { #???? incr i -1 if {$had_slash} {incr i -1} ;#reprocess return 1 } else { incr i -1 if {$had_slash} {incr i -1} ;#reprocess return 1 } } tablename { #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token if {$had_slash} { #resultant tablename may be invalid - but leave for datastructure loading stage to catch append tok "\\]" } else { if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { set_token_waiting type endtablename value "" complete 1 startindex $cindex return 1 } else { #we appear to still be in single or double quoted section append tok "]" } } } tablearraynames { #todo? if {$had_slash} {append tok "\\"} #invalid! - but leave for datastructure loading stage to catch set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex return 1 } default { incr i -1 return 1 } } } else { #$slash_active not relevant when no tokenType switch -exact -- $state { value-expected { #invalid - but allow parser statemachine to report it. set_tokenType "endarray" set tok "\]" return 1 } table-space { #invalid - but allow parser statemachine to report it. ? set_tokenType "endarray" set tok "\]" return 1 } tablename-state { #e.g [] - empty tablename - allowed or not? #empty tablename/tablearrayname ? #error "unexpected tablename problem" set_tokenType "endtablename" set tok "" ;#no output into the tomlish list for this token return 1 } tablearrayname-state { error "unexpected tablearrayname problem" set_tokenType "endtablearray" set tok "" ;#no output into the tomlish list for this token return 1 } array-syntax - array-space { set_tokenType "endarray" set tok "\]" return 1 } multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} { append tok "\\" } append tok "$dquotes\]" } multiliteral-space { set_tokenType "literalpart" set tok "\]" } default { error "state '$state'. endarray case not implemented [tomlish::parse::report_line]" } } } } bsl { set dquotes $multi_dquote set multi_dquote "" ;#!! #backslash if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { squote_seq { incr i -1 return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } whitespace { if {$state eq "multistring-space"} { #end whitespace token incr i -1 ;#reprocess bsl in next run return 1 } else { error "Unexpected backslash during whitespace. [tomlish::parse::report_line]" } } literal - literalpart - squotedkey - itablesquotedkey { #never need to set slash_active true when in single quoted tokens append tok "\\" set slash_active 0 } string - quotedkey - itablequotedkey - comment { if {$slash_active} { set slash_active 0 append tok "\\\\" } else { set slash_active 1 } } stringpart { if {$slash_active} { #assert - quotes empty - or we wouldn't have slash_active set slash_active 0 append tok "\\\\" } else { append tok $dquotes set slash_active 1 } } starttablename - starttablearrayname { error "backslash is invalid as first character of $tokenType [tomlish::parse::report_line]" } tablename - tablearrayname { if {$slash_active} { set slash_active 0 append tok "\\\\" } else { set slash_active 1 } } barekey { error "Unexpected backslash during barekey. [tomlish::parse::report_line]" } default { error "Backslash unexpected during tokentype: '$tokenType'. [tomlish::parse::report_line]" } } } else { switch -exact -- $state { multistring-space { if {$slash_active} { set_tokenType "stringpart" set tok "\\\\" set slash_active 0 } else { if {$dquotes ne ""} { set_tokenType "stringpart" set tok $dquotes } set slash_active 1 } } multiliteral-space { #nothing can be escaped in multiliteral-space - not even squotes (?) review set_tokenType "literalpart" set tok "\\" } default { error "tok error: Unexpected backslash when no token is active. [tomlish::parse::report_line]" } } } } sq { #single quote set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { squote_seq { #short squote_seq tokens are returned if active during any other character #longest allowable for leading/trailing are returned here #### set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote switch -- $state { leading-squote-space { append tok $c if {$existingtoklen > 2} { error "tok error: squote_seq unexpected length $existingtoklen when another received" } elseif {$existingtoklen == 2} { return 1 ;#return tok ''' } } trailing-squote-space { append tok $c if {$existingtoklen == 4} { #maxlen to be an squote_seq is multisquote + 2 = 5 #return tok ''''' return 1 } } default { error "tok error: squote_seq in unexpected state '$state' - expected leading-squote-space or trailing-squote-space" } } } whitespace { #end whitespace incr i -1 ;#reprocess sq return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { #temp token creatable only during value-expected or array-space switch -- [tcl::string::length $tok] { 1 { append tok $c } 2 { #switch? append tok $c set_tokenType triple_squote return 1 } default { error "unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" } } } literal { #slash_active always false #terminate the literal set_token_waiting type endsquote value "'" complete 1 startindex $cindex return 1 } literalpart { #ended by ''' - but final could be '''' or ''''' (up to 2 squotes allowed directly before ending triple squote sequence) #todo # idea: end this literalpart (possibly 'temporarily') # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) incr i -1 ;#throw the "'" back to loop - will be added to an squote_seq token for later processing return 1 } itablesquotedkey { set_token_waiting type endsquote value "'" complete 1 startindex $cindex return 1 } squotedkey { ### #set_token_waiting type endsquote value "'" complete 1 return 1 } starttablename - starttablearrayname { #!!! incr i -1 return 1 } tablename - tablearrayname { append tok $c } default { append tok $c } } } else { switch -exact -- $state { value-expected - array-space { set_tokenType "_start_squote_sequence" set tok "'" } itable-keyval-value-expected - keyval-value-expected { set_tokenType "squote_seq_begin" set tok "'" return 1 } table-space { ### set_tokenType "squotedkey" set tok "" } itable-space { set_tokenType "squote_seq_begin" set tok "'" return 1 } tablename-state { #first char in tablename-state/tablearrayname-state set_tokenType tablename append tok "'" } tablearrayname-state { set_tokenType tablearrayname append tok "'" } literal-state { tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" set_tokenType literal incr -1 return 1 } multistring-space { error "unimplemented - squote during state '$state'. [tomlish::parse::report_line]" } multiliteral-space { #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row #we are building up an squote_seq to determine if #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines #b) it is exactly ''' and we can terminate the whole multiliteral #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space set_tokenType "squote_seq_begin" set tok "'" return 1 } dottedkey-space { set_tokenType squotedkey } default { error "unhandled squote during state '$state'. [tomlish::parse::report_line]" } } } } dq { #double quote set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { squote_seq { incr i -1 return 1 } startquotesequence { set toklen [tcl::string::length $tok] if {$toklen == 1} { append tok $c } elseif {$toklen == 2} { append tok $c #switch vs set? set_tokenType "startmultiquote" return 1 } else { error "unexpected token length $toklen in 'startquotesequence'" } } _start_squote_sequence { set toklen [tcl::string::length $tok] switch -- $toklen { 1 { set_tokenType "startsquote" incr i -1 return 1 } 2 { set_tokenType "startsquote" incr i -2 return 1 } default { error "unexpected _start_squote_sequence length $toklen" } } } literal - literalpart { append tok $c } string { if {$had_slash} { append tok "\\" $c } else { #unescaped quote always terminates a string? set_token_waiting type endquote value "\"" complete 1 startindex $cindex return 1 } } stringpart { #sub element of multistring if {$had_slash} { append tok "\\" $c } else { #incr i -1 if {$multi_dquote eq "\"\""} { set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex -2}] set multi_dquote "" return 1 } else { append multi_dquote "\"" } } } whitespace { switch -exact -- $state { multistring-space { #REVIEW if {$had_slash} { incr i -2 return 1 } else { switch -- [tcl::string::length $multi_dquote] { 2 { set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex-2}] set multi_dquote "" return 1 } 1 { incr i -2 return 1 } 0 { incr i -1 return 1 } } } } keyval-value-expected - value-expected { #end whitespace token and reprocess incr i -1 return 1 #if {$multi_dquote eq "\"\""} { # set_token_waiting type startmultiquote value "\"\"\"" complete 1 # set multi_dquote "" # return 1 #} else { # #end whitespace token and reprocess # incr i -1 # return 1 #} } default { set_token_waiting type startquote value "\"" complete 1 startindex $cindex return 1 } } } comment { if {$had_slash} {append tok "\\"} append tok $c } quotedkey - itablequotedkey { if {$had_slash} { append tok "\\" append tok $c } else { set_token_waiting type endquote value "\"" complete 1 startindex $cindex return 1 } } squotedkey - itablesquotedkey { append tok $c } tablename - tablearrayname { if {$had_slash} {append tok "\\"} append tok $c } starttablename - starttablearrayname { incr i -1 ;## return 1 } default { error "got quote during tokenType '$tokenType' [tomlish::parse::report_line]" } } } else { #$slash_active not relevant when no tokenType #token is string only if we're expecting a value at this point switch -exact -- $state { keyval-value-expected - value-expected - array-space { #!? start looking for possible multistartquote #set_tokenType startquote #set tok $c #return 1 set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote set tok $c } multistring-space { #TODO - had_slash!!! #REVIEW if {$had_slash} { set_tokenType "stringpart" set tok "\\\"" set multi_dquote "" } else { if {$multi_dquote eq "\"\""} { tomlish::log::debug "- tokloop char dq ---> endmultiquote" set_tokenType "endmultiquote" set tok "\"\"\"" return 1 #set_token_waiting type endmultiquote value "\"\"\"" complete 1 #set multi_dquote "" #return 1 } else { append multi_dquote "\"" } } } multiliteral-space { set_tokenType "literalpart" set tok "\"" } table-space { set_tokenType "startquote" set tok $c return 1 } itable-space { set_tokenType "startquote" set tok $c return 1 } tablename-state { set_tokenType tablename set tok $c } tablearrayname-state { set_tokenType tablearrayname set tok $c } dottedkey-space { set_tokenType dquote_seq_begin set tok $c } default { error "Unexpected quote during state '$state' [tomlish::parse::report_line]" } } } } = { set dquotes $multi_dquote set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { squote_seq { incr i -1 return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } literal - literalpart - squotedkey { #assertion had_slash 0, multi_dquote "" append tok $c } string - comment - quotedkey - itablequotedkey { #for these tokenTypes an = is just data. if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} append tok $dquotes$c } whitespace { if {$state eq "multistring-space"} { set backlen [expr {[tcl::string::length $dquotes] + 1}] incr i -$backlen return 1 } else { set_token_waiting type equal value = complete 1 startindex $cindex return 1 } } barekey { #set_token_waiting type equal value = complete 1 incr i -1 return 1 } starttablename - starttablearrayname { error "Character '=' is invalid first character for $tokenType. [tomlish::parse::report_line]" } tablename - tablearrayname { #invalid in bare name - but valid in quoted parts - leave for tablename parser to sort out append tok $c } default { error "unexpected = character during tokentype $tokenType. case not implemented. [tomlish::parse::report_line]" } } } else { switch -exact -- $state { multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} { append tok "\\" } append tok ${dquotes}= } multiliteral-space { set_tokenType "literalpart" set tok "=" } dottedkey-space { set_tokenType "equal" set tok "=" return 1 } default { set_tokenType "equal" set tok = return 1 } } } } cr { #REVIEW! set dquotes $multi_dquote set multi_dquote "" ;#!! # \r carriage return if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { squote_seq { incr i -1 return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } literal { append tok $c } literalpart { #we need to split out crlf as a separate NEWLINE to be consistent ::tomlish::log::warning "literalpart ended by cr - needs testing" #return literalpart temporarily - allow cr to be reprocessed from multiliteral-space incr i -1 return 1 } stringpart { append tok $dquotes$c } starttablename - starttablearrayname { error "Character is invalid first character for $tokenType. [tomlish::parse::report_line]" } tablename - tablearrayname { #could in theory be valid in quoted part of name #review - might be better just to disallow here append tok $c } default { #!todo - error out if cr inappropriate for tokenType append tok $c } } } else { #lf may be appended if next #review - lone cr as newline? - this is uncommon - but so is lone cr in a string(?) set_tokenType "newline" set tok cr } } lf { # \n newline set dquotes $multi_dquote set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { squote_seq { incr i -1 return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } literal { #nl is not allowed *within* a literal - require multiliteral syntax for any literal containing a newline ''' ''' #even though we terminate the literal without the closing quote here - the token_waiting newline should trigger a state error set_token_waiting type newline value lf complete 1 startindex $cindex return 1 } literalpart { #we allow newlines - but store them within the multiliteral as their own element #This is a legitimate end to the literalpart - but not the whole multiliteral set_token_waiting type newline value lf complete 1 startindex $cindex return 1 } newline { #review #this lf is the trailing part of a crlf append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok return 1 } stringpart { if {$dquotes ne ""} { append tok $dquotes incr i -1 return 1 } else { if {$had_slash} { #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] incr i -1 return 1 } else { set_token_waiting type newline value lf complete 1 startindex $cindex return 1 } } } starttablename - tablename - tablearrayname - starttablearrayname { error "Character is invalid in $tokenType. [tomlish::parse::report_line]" } default { #newline ends all other tokens. #note for string: we don't add (raw unescaped) newline to simple string. (must use multi-string for this) #note for whitespace: # we will use the convention that \n terminates the current whitespace even if whitespace follows # ie whitespace is split into separate whitespace tokens at each newline #puts "-------------- newline lf during tokenType $tokenType" set_token_waiting type newline value lf complete 1 startindex $cindex return 1 } } } else { switch -exact -- $state { multistring-space { if {$had_slash} { set_tokenType "continuation" set tok "\\" incr i -1 return 1 } else { if {$dquotes ne ""} { #e.g one or 2 quotes just before nl set_tokenType "stringpart" set tok $dquotes incr i -1 return 1 } set_tokenType "newline" set tok lf return 1 } } multiliteral-space { #assert had_slash 0, multi_dquote "" set_tokenType "newline" set tok "lf" return 1 } default { #ignore slash? error? set_tokenType "newline" set tok lf return 1 } } #if {$had_slash} { # #CONT directly before newline - allows strings_5_byteequivalent test to pass # set_tokenType "continuation" # set tok "\\" # incr i -1 # return 1 #} else { # set_tokenType newline # set tok lf # return 1 #} } } , { set dquotes $multi_dquote set multi_dquote "" set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { newline { #incomplete newline set_tokenType "cr" incr i -1 return 1 } squote_seq { incr i -1 return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } comment - tablename - tablearrayname { if {$had_slash} {append tok "\\"} append tok , } string - quotedkey - itablequotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { #stringpart can have up to 2 quotes too if {$had_slash} {append tok "\\"} append tok $dquotes$c } literal - literalpart - squotedkey - itablesquotedkey { #assert had_slash always 0, multi_dquote "" append tok $c } whitespace { if {$state eq "multistring-space"} { set backlen [expr {[tcl::string::length $dquotes] + 1}] incr i -$backlen return 1 } else { set_token_waiting type comma value "," complete 1 startindex $cindex return 1 } } default { set_token_waiting type comma value "," complete 1 startindex $cindex if {$had_slash} {append tok "\\"} return 1 } } } else { switch -exact -- $state { multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} append tok "$dquotes," } multiliteral-space { #assert had_slash 0, multi_dquote "" set_tokenType "literalpart" set tok "," } default { set_tokenType "comma" set tok "," return 1 } } } } . { set dquotes $multi_dquote set multi_dquote "" ;#!! set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { newline { #incomplete newline set_tokenType "cr" incr i -1 return 1 } squote_seq { incr i -1 return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } comment - untyped_value { if {$had_slash} {append tok "\\"} append tok $c } string - quotedkey - itablequotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} append tok $dquotes$c } literal - literalpart - squotedkey - itablesquotedkey { #assert had_slash always 0, multi_dquote "" append tok $c } whitespace { switch -exact -- $state { multistring-space { set backchars [expr {[tcl::string::length $dquotes] + 1}] if {$had_slash} { incr backchars 1 } incr i -$backchars return 1 } dottedkey-space { incr i -1 return 1 } default { error "Received period during tokenType 'whitespace' [tomlish::parse::report_line]" } } } starttablename - starttablearrayname { #This would correspond to an empty table name error "Character '.' is not allowed as first character ($tokenType). [tomlish::parse::report_line]" } tablename - tablearrayname { #subtable - split later - review append tok $c } barekey { #e.g x.y = 1 #we need to transition the barekey to become a structured table name ??? review #x is the tablename y is the key set_token_waiting type dotsep value "." complete 1 startindex $cindex return 1 } default { error "Received period during tokenType '$tokenType' [tomlish::parse::report_line]" #set_token_waiting type period value . complete 1 #return 1 } } } else { switch -exact -- $state { multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} append tok "$dquotes." } multiliteral-space { set_tokenType "literalpart" set tok "." } dottedkey-space { ### set_tokenType "dotsep" set tok "." return 1 } default { set_tokenType "untyped_value" set tok "." } } } } " " { set dquotes $multi_dquote set multi_dquote "" ;#!! if {[tcl::string::length $tokenType]} { set had_slash $slash_active set slash_active 0 switch -exact -- $tokenType { newline { #incomplete newline set_tokenType "cr" incr i -1 return 1 } squote_seq { incr i -1 return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } barekey { #todo had_slash - emit token or error #whitespace is a terminator for bare keys #set_token_waiting type whitespace value $c complete 1 incr i -1 return 1 } untyped_value { #unquoted values (int,date,float etc) are terminated by whitespace #set_token_waiting type whitespace value $c complete 1 incr i -1 return 1 } comment { if {$had_slash} { append tok "\\" } append tok $dquotes$c } string - quotedkey - itablequotedkey { if {$had_slash} { append tok "\\" } append tok $c } stringpart { #for stringpart we store WS separately for ease of processing continuations (CONT stripping) if {$had_slash} { #REVIEW #emit the stringpart - go back to the slash incr i -2 return 1 } else { #split into STRINGPART aaa WS " " append tok $dquotes incr i -1 return 1 } } literal - literalpart - squotedkey - itablesquotedkey { append tok $c } whitespace { if {$state eq "multistring-space"} { if {$dquotes ne ""} { #end whitespace token #go back by the number of quotes plus this space char set backchars [expr {[tcl::string::length $dquotes] + 1}] incr i -$backchars return 1 } else { append tok $c } } else { append tok $c } } starttablename - starttablearrayname { incr i -1 return 1 } tablename - tablearrayname { #include whitespace in the tablename/tablearrayname #Will need to be normalized upon interpreting the tomlish as a datastructure append tok $c } default { error "Received whitespace space during tokenType '$tokenType' [tomlish::parse::report_line]" } } } else { set had_slash $slash_active set slash_active 0 switch -exact -- $state { tablename-state { #tablename can have leading,trailing and interspersed whitespace! #These will not be treated as whitespace tokens, instead forming part of the name. set_tokenType tablename set tok "" if {$had_slash} {append tok "\\"} append tok $c } tablearrayname-state { set_tokenType tablearrayname set tok "" if {$had_slash} {append tok "\\"} append tok $c } multistring-space { if {$had_slash} { set_tokenType "continuation" set tok "\\" incr i -1 return 1 } else { if {$dquotes ne ""} { set_tokenType "stringpart" set tok $dquotes incr i -1 return 1 } set_tokenType "whitespace" append tok $c } } multiliteral-space { set_tokenType "literalpart" set tok $c } default { if {$had_slash} { error "unexpected backslash [tomlish::parse::report_line]" } set_tokenType "whitespace" append tok $c } } } } tab { set dquotes $multi_dquote set multi_dquote "" ;#!! if {[tcl::string::length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) set slash_active 0 switch -exact -- $tokenType { newline { #incomplete newline set_tokenType "cr" incr i -1 return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } barekey { #whitespace is a terminator for bare keys incr i -1 #set_token_waiting type whitespace value $c complete 1 return 1 } untyped_value { #unquoted values (int,date,float etc) are terminated by whitespace #set_token_waiting type whitespace value $c complete 1 incr i -1 return 1 } quotedkey - itablequotedkey - squotedkey - itablesquotedkey { append tok $c } string - comment - whitespace { append tok $c } stringpart { #for stringpart we store WS separately for ease of processing continuations (CONT stripping) if {$had_slash} { #REVIEW #emit the stringpart - go back to the slash incr i -2 return 1 } else { #split into STRINGPART aaa WS " " append tok $dquotes incr i -1 return 1 } } literal - literalpart { append tok $c } starttablename - starttablearrayname { incr i -1 return 1 } tablename - tablearraynames { #include whitespace in the tablename/tablearrayname #Will need to be normalized upon interpreting the tomlish as a datastructure append tok $c } default { error "Received whitespace tab during tokenType '$tokenType' [tomlish::parse::report_line]" } } } else { set had_slash $slash_active if {$slash_active} { set slash_active 0 } switch -exact -- $state { tablename-state { #tablename can have leading,trailing and interspersed whitespace! #These will not be treated as whitespace tokens, instead forming part of the name. set_tokenType tablename set tok $c } tablearrayname-state { set_tokenType tablearrayname set tok $c } multistring-space { if {$had_slash} { set_tokenType "continuation" set tok "\\" incr i -1 return 1 } else { if {$dquotes ne ""} { set_tokenType stringpart set tok $dquotes incr i -1 return 1 } else { set_tokenType whitespace append tok $c } } } multiliteral-space { set_tokenType "literalpart" set tok $c } default { set_tokenType "whitespace" append tok $c } } } } bom { #BOM (Byte Order Mark) - ignored by token consumer if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { _start_squote_sequence { #assert - tok will be one or two squotes only incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } literal - literalpart { append tok $c } default { set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex return 1 } } } else { switch -exact -- $state { multiliteral-space { set_tokenType "literalpart" set tok $c } default { set_tokenType "bom" set tok "\uFEFF" return 1 } } } } default { set dquotes $multi_dquote set multi_dquote "" ;#!! if {[tcl::string::length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. set slash_active 0 switch -exact -- $tokenType { newline { #incomplete newline set_tokenType "cr" incr i -1 return 1 } squote_seq { incr i -1 return 1 } startquotesequence { _shortcircuit_startquotesequence } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "startsquote" return 1 } whitespace { if {$state eq "multistring-space"} { if {$dquotes ne ""} { set backlen [expr {[tcl::string::length $dquotes] + 1}] incr i -$backlen return 1 } else { incr i -1 return 1 } } else { #review incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. return 1 } } barekey { if {[tomlish::utils::is_barekey $c]} { append tok $c } else { error "Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] allowed. [tomlish::parse::report_line]" } } starttablename - starttablearrayname { incr i -1 #allow statemachine to set context for subsequent chars return 1 } stringpart { append tok $dquotes$c } default { #e.g comment/string/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname append tok $c } } } else { set had_slash $slash_active set slash_active 0 switch -exact -- $state { table-space - itable-space { #if no currently active token - assume another key value pair if {[tomlish::utils::is_barekey $c]} { set_tokenType "barekey" append tok $c } else { error "Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" } } curly-syntax { puts stderr "curly-syntax - review" if {[tomlish::utils::is_barekey $c]} { set_tokenType "barekey" append tok $c } else { error "Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" } } multistring-space { set_tokenType "stringpart" if {$had_slash} { #assert - we don't get had_slash and dquotes at same time set tok \\$c } else { set tok $dquotes$c } } multiliteral-space { set_tokenType "literalpart" set tok $c } tablename-state { set_tokenType "tablename" set tok $c } tablearrayname-state { set_tokenType "tablearrayname" set tok $c } dottedkey-space { set_tokenType barekey set tok $c } default { tomlish::log::debug "- tokloop char '$c' setting to untyped_value while state:$state" set_tokenType "untyped_value" set tok $c } } } } } } #run out of characters (eof) if {[tcl::string::length $tokenType]} { #check for invalid ending tokens #if {$state eq "err-state"} { # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" #} switch -exact -- $tokenType { startquotesequence { set toklen [tcl::string::length $tok] if {$toklen == 1} { #invalid #eof with open string error "eof reached without closing quote for string. [tomlish::parse::report_line]" } elseif {$toklen == 2} { #valid #we ended in a double quote, not actually a startquoteseqence - effectively an empty string switch_tokenType "startquote" incr i -1 #set_token_waiting type string value "" complete 1 return 1 } } _start_squote_sequence { set toklen [tcl::string::length $tok] switch -- $toklen { 1 { #invalid eof with open literal error "eof reached without closing single quote for string literal. [tomlish::parse::report_line]" } 2 { #review set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] set_tokenType "literal" set tok "" return 1 } } } } set_token_waiting type eof value eof complete 1 startindex $i ;#review return 1 } else { ::tomlish::log::debug "- No current tokenType, ran out of characters, setting tokenType to 'eof' [tomlish::parse::report_line]" set tokenType "eof" set tok "eof" } return 0 } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::parse ---}] } tcl::namespace::eval tomlish::app { variable applist [list encoder decoder test] #*** !doctools #[subsection {Namespace tomlish::app}] #[para] #[list_begin definitions] proc decoder {args} { #*** !doctools #[call app::[fun decoder] [arg args]] #[para] read toml on stdin until EOF #[para] on error - returns non-zero exit code and writes error on stderr #[para] on success - returns zero exit code and writes JSON encoding of the data on stdout #[para] This decoder is intended to be compatible with toml-test set opts [dict merge [dict create] $args] #fconfigure stdin -encoding utf-8 fconfigure stdin -translation binary #Just slurp it all - presumably we are not handling massive amounts of data on stdin. # - even if the input is large, we probably don't gain much (aside from possible memory savings?) by attempting to process input as it arrives. if {[catch { set toml [read stdin] }]} { exit 2 ;#read error } try { set j [::tomlish::toml_to_json $toml] } on error {em} { puts stderr "decoding failed: '$em'" exit 1 } puts -nonewline stdout $j exit 0 } proc encoder {args} { #*** !doctools #[call app::[fun encoder] [arg args]] #[para] read JSON on stdin until EOF #[para] return non-zero exitcode if JSON data cannot be converted to a valid TOML representation #[para] return zero exitcode and TOML data on stdout if JSON data can be converted. #[para] This encoder is intended to be compatible with toml-test set opts [dict merge [dict create] $args] fconfigure stdin -translation binary if {[catch { set json [read stdin] }]} { exit 2 ;#read error } try { set toml [::tomlish::json_to_toml $json] } on error {em} { puts stderr "encoding failed: '$em'" exit 1 } puts -nonewline stdout $toml exit 0 } proc test {args} { set opts [dict merge [dict create] $args] package require test::tomlish if {[dict exists $opts -suite]} { test::tomlish::suite [dict get $opts -suite] } test::tomlish::run } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::app ---}] } proc ::tomlish::appnames {} { set applist [list] foreach cmd [info commands ::tomlish::app::*] { lappend applist [namespace tail $cmd] } return $applist } # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # Secondary API namespace # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ namespace eval tomlish::lib { namespace export {[a-z]*}; # Convention: export all lowercase namespace path [namespace parent] #*** !doctools #[subsection {Namespace tomlish::lib}] #[para] Secondary functions that are part of the API #[list_begin definitions] #proc utility1 {p1 args} { # #*** !doctools # #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]] # #[para]Description of utility1 # return 1 #} #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::lib ---}] } # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ if {$argc > 0} { puts stderr "argc: $argc args: $argv" if {($argc == 1)} { if {[tcl::string::tolower $argv] in {help -help h -h}} { puts stdout "Usage: -app where appname one of:[tomlish::appnames]" exit 0 } else { puts stderr "Argument '$argv' not understood. Try -help" exit 1 } } set opts [dict create] set opts [dict merge $opts $argv] set opts_understood [list -app ] if {"-app" in [dict keys $opts]} { #Don't vet the remaining opts - as they are interpreted by each app } else { foreach key [dict keys $opts] { if {$key ni $opts_understood} { puts stderr "Option '$key' not understood" exit 1 } } } if {[dict exists $opts -app]} { set app [dict get $opts -app] if {$app ni [tomlish::appnames]} { puts stderr "app '[dict get $opts -app]' not found" exit 1 } tomlish::app::$app {*}$opts } } ## Ready package provide tomlish [namespace eval tomlish { variable pkg tomlish variable version set version 1.1.1 }] return #*** !doctools #[manpage_end]