# -*- tcl -*- # Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt # # Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. # Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # (C) 2024 # # @@ Meta Begin # Application tomlish 1.1.1 # Meta platform tcl # Meta license # @@ Meta End # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # doctools header # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ #*** !doctools #[manpage_begin tomlish_module_tomlish 0 1.1.1] #[copyright "2024"] #[titledesc {Module API}] [comment {-- Name section and table of contents description --}] #[moddesc {-}] [comment {-- Description at end of page heading --}] #[require tomlish] #[keywords module] #[description] #[para] - # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ #*** !doctools #[section Overview] #[para] overview of tomlish #[subsection Concepts] #[para] - # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ ## Requirements # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ #*** !doctools #[subsection dependencies] #[para] packages used by tomlish #[list_begin itemized] package require Tcl 8.6- package require struct::stack package require logger #*** !doctools #[item] [package {Tcl 8.6-}] #[item] [package {struct::stack}] #limit ourselves to clear, destroy, peek, pop, push, rotate, or size (e.g v 1.3 does not implement 'get') #*** !doctools #[list_end] # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ #*** !doctools #[section API] # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # Base namespace # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ namespace eval tomlish { namespace export {[a-z]*}; # Convention: export all lowercase variable types #ARRAY is analogous to a Tcl list #TABLE is analogous to a Tcl dict #WS = inline whitespace #KEYVAL = bare key and value #QKEYVAL = quoted key and value #ITABLE = inline table (*can* be anonymous table) # inline table values immediately create a table with the opening brace # inline tables are fully defined between their braces, as are dotted-key subtables defined within # No additional subtables or arrays of tables may be defined within an inline table after the ending brace - they must be entirely self-contained set tags [list TOMLISH ARRAY TABLE ITABLE ANONTABLE WS NEWLINE COMMENT KEYVAL QKEYVAL STRING MULTISTRING LITSTRING MULTILITSTRING INT FLOAT BOOL DATETIME] #tomlish v1.0 should accept arbitrary 64-bit signed ints (from -2^63 to 2^63-1) #we will restrict to this range for compatibility for now - although Tcl can handle larger (arbitrarily so?) set min_int -9223372036854775808 ;#-2^63 set max_int +9223372036854775807 ;#2^63-1 proc Dolog {lvl txt} { #return "$lvl -- $txt" set msg "[clock format [clock seconds] -format "%Y-%m-%dT%H:%M:%S"] tomlish '$txt'" puts stderr $msg } logger::initNamespace ::tomlish foreach lvl [logger::levels] { interp alias {} tomlish_log_$lvl {} ::tomlish::Dolog $lvl log::logproc $lvl tomlish_log_$lvl } #*** !doctools #[subsection {Namespace tomlish}] #[para] Core API functions for tomlish #[list_begin definitions] proc tags {} { return $::tomlish::tags } #helper function for get_dict proc _get_keyval_value {keyval_element} { log::notice ">>> _get_keyval_value from '$keyval_element'<<<" set found_value 0 #find the value # 3 is the earliest index at which the value could occur (depending on whitespace) set found_sub [list] foreach sub [lrange $keyval_element 2 end] { #note that a barekey/quotedkey won't occur directly inside a barekey/quotedkey switch -exact -- [lindex $sub 0] { STRING - MULTISTRING - INT - FLOAT - BOOL - DATETIME - TABLE - ARRAY - ITABLE { set type [lindex $sub 0] set value [lindex $sub 1] set found_sub $sub incr found_value 1 } default {} } } if {!$found_value} { error "Failed to find value element in KEYVAL. '$keyval_element'" } if {$found_value > 1} { error "Found multiple value elements in KEYVAL, expected exactly one. '$keyval_element'" } switch -exact -- $type { INT - FLOAT - BOOL - DATETIME { #simple (non-container, no-substitution) datatype set result [list type $type value $value] } STRING - STRINGPART { set result [list type $type value [::tomlish::utils::unescape_string $value]] } LITSTRING { #REVIEW set result [list type $type value $value] } TABLE - ITABLE - ARRAY - MULTISTRING { #jmn2024 - added ITABLE - review #we need to recurse to get the corresponding dict for the contained item(s) #pass in the whole $found_sub - not just the $value! set result [list type $type value [::tomlish::get_dict [list $found_sub]]] } default { error "Unexpected value type '$type' found in keyval '$keyval_element'" } } return $result } #get_dict is a *basic* programmatic datastructure for accessing the data. # produce a dictionary of keys and values from a tomlish tagged list. # get_dict is primarily for reading toml data. #Extraneous (not within quoted sections) whitespace and comments are not preserved in this structure, # so a roundtrip from toml to this datastructure and back to toml will lose whitespace formatting and comments. # creating/changing toml values can be done directly on a tomlish list if preserving (or adding) formatting/comments is desired. #A separate package 'tomlish::object' may be needed to allow easier programmatic creating/updating/deleting of data elements whilst preserving (or adding or selectively deleting/editing) such formatting. proc get_dict {tomlish} { #keep track of which tablenames have already been directly defined, # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. #we don't error out just because a previous tablename segment has already appeared. variable tablenames_seen [list] log::info ">>> processing '$tomlish'<<<" set items $tomlish foreach lst $items { if {[lindex $lst 0] ni $::tomlish::tags} { error "supplied string does not appear to be toml parsed into a tomlish tagged list. Run tomlish::decode::toml on the raw toml data to produce a tomlish list" } } if {[lindex $tomlish 0] eq "TOMLISH"} { #ignore TOMLISH tag at beginning set items [lrange $tomlish 1 end] } set datastructure [dict create] foreach item $items { set tag [lindex $item 0] #puts "...> item:'$item' tag:'$tag'" switch -exact -- $tag { KEYVAL - QKEYVAL { log::debug "--> processing $tag: $item" set key [lindex $item 1] #!todo - normalize key. (may be quoted/doublequoted) if {[dict exists $datastructure $key]} { error "Duplicate key '$key'. The key already exists at this level in the toml data. The toml data is not valid." } #lassign [_get_keyval_value $item] type val set keyval_dict [_get_keyval_value $item] dict set datastructure $key $keyval_dict } TABLE { set tablename [lindex $item 1] set tablename [::tomlish::utils::tablename_trim $tablename] if {$tablename in $tablenames_seen} { error "Table name '$tablename' has already been directly defined in the toml data. Invalid." } log::debug "--> processing $tag (name: $tablename): $item" set name_segments [::tomlish::utils::tablename_split $tablename] set last_seg "" #toml spec rule - all segments mst be non-empty #note that the results of tablename_split are 'raw' - ie some segments may be enclosed in single or double quotes. set key_hierarchy [list] set key_hierarchy_raw [list] foreach rawseg $name_segments { set seg [::tomlish::utils::normalize_key $rawseg] ;#strips one level of enclosing quotes, and substitutes only toml-specified escapes set c1 [::string index $rawseg 0] set c2 [::string index $rawseg end] if {($c1 eq "'") && ($c2 eq "'")} { #single quoted segment. No escapes are processed within it. set seg [::string range $rawseg 1 end-1] } elseif {($c1 eq "\"") && ($c2 eq "\"")} { #double quoted segment. Apply escapes. set seg [::tomlish::utils::unescape_string [::string range $rawseg 1 end-1]] #set seg [subst -nocommands -novariables [::string range $rawseg 1 end-1]] } else { set seg $rawseg } #no need to check for empty segments here - we've already called tablename_split which would have raised an error for empty segments. #if {$rawseg eq ""} { # error "Table name '[lindex $item 1]' is not valid. All segments (parts between dots) must be non-empty" #} lappend key_hierarchy $seg lappend key_hierarchy_raw $rawseg if {[dict exists $datastructure {*}$key_hierarchy]} { #It's ok for this key to already exist *if* it was defined by a previous tablename, # but not if it was defined as a keyval/qkeyval set testkey [join $key_hierarchy_raw .] set testkey_length [llength $key_hierarchy_raw] set found_testkey 0 if {$testkey in $tablenames_seen} { set found_testkey 1 } else { #see if it was defined by a longer entry foreach seen $tablenames_seen { set seen_segments [::tomlish::utils::tablename_split $seen] #these are raw unnormalized tablename segments. Need to normalize the double-quoted ones, # and strip the quotes from both single-quoted and double-quoted entries. #note: it is not safe to compare normalized tablenames using join! # e.g a.'b.c'.d is not the same as a.b.c.d # instead compare {a b.c d} with {a b c d} # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' #dots within table segments might seem like an 'edge case' # - but perhaps there is legitimate need to put for example version numbers as tablenames or parts of tablenames. #VVV the test below is wrong VVV! set seen_match [join [lrange $seen_segments 0 [expr {$testkey_length -1}]] .] if {$testkey eq $seen_match} { set found_testkey 1 } } } if {$found_testkey == 0} { #the raw key_hierarchy is better to display in the error message, although it's not the actual dict keyset error "key [join $key_hierarchy_raw .] already exists, but wasn't defined by a supertable." } } } #We must do this after the key-collision test above! lappend tablenames_seen $tablename log::debug ">>>>>>>>>>>>>>>>>>>>key_hierarchy : $key_hierarchy" log::debug ">>>>>>>>>>>>>>>>>>>>key_hierarchy_raw: $key_hierarchy_raw" #now add the contained elements foreach element [lrange $item 2 end] { set type [lindex $element 0] switch -exact -- $type { KEYVAL - QKEYVAL { set keyval_key [lindex $element 1] set keyval_dict [_get_keyval_value $element] dict set datastructure {*}$key_hierarchy $keyval_key $keyval_dict } NEWLINE - COMMENT - WS { #ignore } default { error "Sub element of type '$type' not understood in table context. Expected only KEYVAL,QKEYVAL,NEWLINE,COMMENT,WS" } } } #now make sure we add an empty value if there were no contained elements! #!todo. } ITABLE { #SEP??? set datastructure [list] foreach element [lrange $item 1 end] { set type [lindex $element 0] switch -exact -- $type { KEYVAL - QKEYVAL { set keyval_key [lindex $element 1] set keyval_dict [_get_keyval_value $element] dict set datastructure $keyval_key $keyval_dict } NEWLINE - COMMENT - WS { #ignore } default { error "Sub element of type '$type' not understood in ITABLE context. Expected only KEYVAL,QKEYVAL,NEWLINE,COMMENT,WS" } } } } ARRAY { #arrays in toml are allowed to contain mixtures of types set datastructure [list] log::debug "--> processing array: $item" foreach element [lrange $item 1 end] { set type [lindex $element 0] switch -exact -- $type { INT - FLOAT - BOOL - DATETIME { set value [lindex $element 1] lappend datastructure [list type $type value $value] } STRING { set value [lindex $element 1] lappend datastructure [list type $type value [::tomlish::utils::unescape_string $value]] } TABLE - ARRAY - MULTISTRING { set value [lindex $element 1] lappend datastructure [list type $type value [::tomlish::get_dict [list $element]]] } WS - SEP { #ignore whitespace and commas } default { error "Unexpected value type '$type' found in array" } } } } MULTISTRING { #triple dquoted string log::debug "--> processing multistring: $item" set stringvalue "" set idx 0 set parts [lrange $item 1 end] for {set idx 0} {$idx < [llength $parts]} {incr idx} { set element [lindex $parts $idx] set type [lindex $element 0] switch -exact -- $type { STRINGPART { append stringvalue [::tomlish::utils::unescape_string [lindex $element 1]] } CONT { #When the last non-whitespace character on a line is an unescaped backslash, #it will be trimmed along with all whitespace (including newlines) up to the next non-whitespace character or closing delimiter # review - we allow some whitespace in stringpart elements - can a stringpart ever be all whitespace? set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] if {$next_nl == -1} { #last line set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] if {$non_ws >= 0} { append stringvalue "\\" ;#add the sep } else { #skip over ws without emitting set idx [llength $parts] } } else { set parts_til_nl [lrange $parts 0 $next_nl-1] set non_ws [lsearch -index 0 -start $idx+1 -not $parts_til_nl WS] if {$non_ws >= 0} { append stringvalue "\\" } else { #skip over ws on this line set idx $next_nl #then have to check each subsequent line until we get to first non-whitespace set trimming 1 while {$trimming && $idx < [llength $parts]} { set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] if {$next_nl == -1} { #last line set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] if {$non_ws >= 0} { set idx [expr {$non_ws -1}] } else { set idx [llength $parts] } set trimming 0 } else { set non_ws [lsearch -index 0 -start $idx+1 -not [lrange $parts 0 $next_nl-1] WS] if {$non_ws >= 0} { set idx [expr {$non_ws -1}] set trimming 0 } else { set idx $next_nl #keep trimming } } } } } } NEWLINE { #if newline is first element - it is not part of the data of a multistring if {$idx > 0} { set val [lindex $element 1] if {$val eq "nl"} { append stringvalue \n } else { append stringvalue \r\n } } } WS { append stringvalue [lindex $element 1] } default { error "Unexpected value type '$type' found in multistring" } } } set datastructure $stringvalue } WS - COMMENT - NEWLINE { #ignore } default { error "Unexpected tag '$tag' in Tomlish list '$tomlish'" } } } return $datastructure } proc json_to_toml {json} { #*** !doctools #[call [fun json_to_toml] [arg json]] #[para] set tomlish [::tomlish::from_json $json] set toml [::tomlish::to_toml $tomlish] } proc from_json {json} { set jstruct [::tomlish::json_struct $json] return [::tomlish::from_json_struct $jstruct] } proc from_json_struct {jstruct} { package require fish::json_toml return [fish::json_toml::jsonstruct2tomlish $jstruct] } proc toml_to_json {toml} { set tomlish [::tomlish::from_toml $toml] return [::tomlish::get_json $tomlish] } proc get_json {tomlish} { package require fish::json set d [::tomlish::get_dict $tomlish] #return [::tomlish::dict_to_json $d] return [fish::json::from "struct" $d] } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish ---}] } # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ namespace eval tomlish::encode { #*** !doctools #[subsection {Namespace tomlish::encode}] #[para] #[list_begin definitions] #STRING,INT,FLOAT,BOOL, DATETIME - simple wrappers for completeness # take a value of the appropriate type and wrap as a tomlish tagged item proc string {s} { return [list STRING $s] } proc int {i} { #whole numbers, may be prefixed with a + or - #Leading zeros are not allowed #Hex,octal binary forms are allowed (toml 1.0) #We will error out on encountering commas, as commas are interpreted differently depending on locale (and don't seem to be supported in the toml spec anyway) #!todo - Tcl can handle bignums - bigger than a 64bit signed long as specified in toml. # - We should probably raise an error for number larger than this and suggest the user supply it as a string? if {[tcl::string::last , $i] > -1} { error "Unable to interpret '$i' as an integer. Use underscores if you need a thousands separator [::tomlish::parse::report_line]" } if {![::tomlish::utils::int_validchars $i]} { error "Unable to interpret '$i' as an integer. Only 0-9 + 1 _ characters are acceptable. [::tomlish::parse::report_line]" } if {[::tomlish::utils::is_int $i]} { return [list INT $i] } else { error "'$i' is not a valid integer as per the Toml spec. [::tomlish::parse::report_line]" } } proc float {f} { #convert any non-lower case variants of special values to lowercase for Toml if {[string tolower $f] in {nan +nan -nan inf +inf -inf}} { return [list FLOAT [string tolower $f]] } if {[::tomlish::utils::is_float $f]} { return [list FLOAT $f] } else { error "Unable to interpret '$f' as Toml float. Check your input, or check that tomlish is able to handle all Toml floats properly [::tomlish::parse::report_line]" } } proc datetime {str} { if {[::tomlish::utils::is_datetime $str]} { return [list DATETIME $str] } else { error "Unable to interpret '$str' as Toml datetime. Check your input, or check that tomlish is able to handle all Toml datetimes properly [::tomlish::parse::report_line]" } } proc boolean {b} { #convert any Tcl-acceptable boolean to boolean as accepted by toml - lower case true/false if {![string is boolean -strict $b]} { error "Unable to convert '$b' to Toml boolean true|false. [::tomlish::parse::report_line]" } else { if {[expr {$b && 1}]} { return [list BOOL true] } else { return [list BOOL false] } } } #Take tablename followed by # a) *tomlish* name-value pairs e.g table mydata [list KEYVAL item11 [list STRING "test"]] {KEYVAL item2 [list INT 1]} # b) simple 2-element tcl lists being name & *simple* value pairs for which basic heuristics will be used to determine types proc table {name args} { set pairs [list] foreach t $args { if {[llength $t] == 3} { if {[lindex $t 0] ne "KEYVAL"} { error "Only items tagged as KEYVAL currently accepted as name-value pairs for table command" } lappend pairs $t } elseif {[llength $t] == 2} { #!todo - type heuristics lassign $t n v lappend pairs [list KEYVAL $n [list STRING $v]] } else { error "erlish 'KEYVAL' or simple name-value pairs expected. Unable to handle [llength $t] element list as argument" } } return [list TABLE $name $pairs] } #the tomlish root is basically a nameless table representing the root of the document proc root {args} { set table [::tomlish::encode::table TOMLISH {*}$args] set result [lindex $table 2] ;#Take only the key-value pair list } #WS = whitepace, US = underscore proc tomlish {list {context ""}} { if {![tcl::string::is list $list]} { error "Supplied 'tomlish' is not a valid Tcl list. Expected a tagged list (parsed Toml)" } set toml "" ;#result string foreach item $list { set tag [lindex $item 0] #puts "tomlish::encode::tomlish processing item '$item', tag '$tag'" #during recursion, some tags require different error checking in different contexts. set nextcontext $tag ; #Handle invalid tag nestings switch -- $context { QKEYVAL - KEYVAL { if {$tag in {KEYVAL QKEYVAL}} { error "Invalid tag '$tag' encountered within '$context'" } } MULTISTRING { #explicitly list the valid child tags if {$tag ni {STRING STRINGPART WS NEWLINE CONT}} { error "Invalid tag '$tag' encountered within a MULTISTRING" } } default { #no context, or no defined nesting error for this context } } switch -- $tag { TOMLISH { #optional root tag. Ignore. } QKEYVAL - KEYVAL { if {$tag eq "KEYVAL"} { append toml [lindex $item 1] ;#Key } else { append toml \"[lindex $item 1]\" ;#Quoted Key } foreach part [lrange $item 2 end] { if {$part eq "="} { append toml "=" } else { append toml [::tomlish::encode::tomlish [list $part] $nextcontext] } } } TABLE { append toml "\[[lindex $item 1]\]" ;#table name foreach part [lrange $item 2 end] { append toml [::tomlish::encode::tomlish [list $part] $nextcontext] } } ITABLE { #inline table - e.g within array or on RHS of keyval/qkeyval set data "" foreach part [lrange $item 1 end] { append data [::tomlish::encode::tomlish [list $part] $nextcontext] } append toml "\{$data\}" } ARRAY { set arraystr "" foreach part [lrange $item 1 end] { append arraystr [::tomlish::encode::tomlish [list $part] $nextcontext] } append toml "\[$arraystr\]" } WS { append toml [lindex $item 1] } SEP { append toml "," } NEWLINE { set chartype [lindex $item 1] if {$chartype eq "lf"} { append toml \n } elseif {$chartype eq "crlf"} { append toml \r\n } else { error "Unrecognized newline type '$chartype'" } } CONT { #line continuation character "\" append toml "\\" } STRING { #simple double quoted strings only # return \"[lindex $item 1]\" } STRINGPART { return [lindex $item 1] } MULTISTRING { #Tripple quoted string which is a container for newlines,whitespace and multiple strings/stringparts set multistring "" ;#variable to build up the string foreach part [lrange $item 1 end] { append multistring [::tomlish::encode::tomlish [list $part] $nextcontext] } append toml "\"\"\"$multistring\"\"\"" } LITSTRING { #Single Quoted string(literal string) append toml '[lindex $item 1]' } MULTILITSTRING { #review - multilitstring can be handled as a single string? set litstring "" foreach part [lrange $item 1 end] { append litstring [::tomlish::encode::tomlish [list $part] $nextcontext] } append toml '''$litstring''' } INT - BOOL - FLOAT - DATETIME { append toml [lindex $item 1] } INCOMPLETE { error "cannot process tomlish term tagged as INCOMPLETE" } COMMENT { append toml "#[lindex $item 1]" } BOM { #Byte Order Mark may appear at beginning of a file. Needs to be preserved. append toml "\uFEFF" } default { error "Not a properly formed 'tomlish' taggedlist.\n '$list'\n Unknown tag '[lindex $item 0]'. See output of \[tomlish::tags\] command." } } } return $toml } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::encode ---}] } #fish toml from tomlish #(encode tomlish as toml) interp alias {} tomlish::to_toml {} tomlish::encode::tomlish # namespace eval tomlish::decode { #*** !doctools #[subsection {Namespace tomlish::decode}] #[para] #[list_begin definitions] #return a Tcl list of tomlish tokens #i.e get a standard list of all the toml terms in string $s #where each element of the list is a *tomlish* term.. i.e a specially 'tagged' Tcl list. #(simliar to a tcl 'Huddle' - but also supporting whitespace preservation) #Note that we deliberately don't check certain things such as duplicate table declarations here. #Part of the justification for this is that as long as the syntax is toml shaped - we can load files which violate certain rules and allow programmatic manipulation. # (e.g perhaps a toml editor to highlight violations for fixing) # A further stage is then necessary to load the tomlish tagged list into a data structure more suitable for efficient query/reading. # e.g dicts or an object oriented structure #Note also - *no* escapes in quoted strings are processed. This is up to the datastructure stage #e.g to_dict will substitute \r \n \uHHHH \UHHHHHHH etc #This is important for tomlish to maintain the ability to perform competely lossless round-trips from toml to tomlish and back to toml. # (which is handy for testing as well as editing some part of the structure with absolutely no effect on other parts of the document) #If we were to unescape a tab character for example # - we have no way of knowing if it was originally specified as \t \u0009 or \U00000009 or directly as a tab character. # For this reason, we also do absolutely no line-ending transformations based on platform. # All line-endings are maintained as is, and even a file with mixed cr crlf line-endings will be correctly interpreted and can be 'roundtripped' proc toml {s} { #*** !doctools #[call [fun toml] [arg s]] #[para] return a Tcl list of tomlish tokens namespace upvar ::tomlish::parse is_parsing is_parsing set is_parsing 1 if {[info command ::tomlish::parse::spacestack] eq "::tomlish::parse::spacestack"} { tomlish::parse::spacestack destroy } struct::stack ::tomlish::parse::spacestack namespace upvar ::tomlish::parse last_space_action last_space_action namespace upvar ::tomlish::parse last_space_type last_space_type namespace upvar ::tomlish::parse tok tok set tok "" namespace upvar ::tomlish::parse type type namespace upvar ::tomlish::parse tokenType tokenType ::tomlish::parse::set_tokenType "" namespace upvar ::tomlish::parse tokenType_list tokenType_list set tokenType [list] ;#Flat (un-nested) list of tokentypes found namespace upvar ::tomlish::parse lastChar lastChar set lastChar "" set result "" namespace upvar ::tomlish::parse nest nest set nest 0 namespace upvar ::tomlish::parse v v ;#array keyed on nest level set v(0) {TOMLISH} array set s0 [list] ;#whitespace data to go in {SPACE {}} element. set parentlevel 0 namespace upvar ::tomlish::parse i i set i 0 namespace upvar ::tomlish::parse state state namespace upvar ::tomlish::parse braceCount braceCount set barceCount 0 namespace upvar ::tomlish::parse bracketCount bracketCount set bracketCount 0 set sep 0 set r 1 namespace upvar ::tomlish::parse token_waiting token_waiting set token_waiting [dict create] ;#if ::tok finds a *complete* second token during a run, it will put the 2nd one here to be returned by the next call. set state "key-space" ::tomlish::parse::spacestack push {space key-space} namespace upvar ::tomlish::parse linenum linenum;#'line number' of input data. (incremented for each literal linefeed - but not escaped ones in data) set linenum 1 try { while {$r} { set r [::tomlish::parse::tok $s] #puts stdout "got tok: '$tok' while parsing string '$s' " set next_tokenType_known 0 ;#whether we begin a new token here based on what terminated the token result of 'tok' #puts "got token: '$tok' tokenType='$tokenType'. while v($nest) = [set v($nest)]" #puts "-->tok: $tok tokenType='$tokenType'" set prevstate $state ##### set nextstate [::tomlish::parse::getNextState $tokenType $prevstate] ::tomlish::log::info "tok: $tok STATE TRANSITION tokenType: '$tokenType' triggering '$state' -> '$nextstate' last_space_action:$last_space_action" set state $nextstate if {$state eq "err"} { error "State error - aborting parse. [tomlish::parse::report_line]" } if {$last_space_action eq "pop"} { #pop_trigger_tokens: newline tablename endarray endinlinetable #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. switch -exact -- $tokenType { newline { incr linenum lappend v($nest) [list NEWLINE $tok] } tablename { #note: a tablename only 'pops' if we are greater than zero error "tablename pop should already have been handled as special case zeropoppushspace in getNextState" } tablearrayname { #!review - tablearrayname different to tablename regarding push/pop? #note: a tablename only 'pops' if we are greater than zero error "tablearrayname pop should already have been handled as special case zeropoppushspace in getNextState" } endarray { #nothing to do here. } comma { #comma for inline table will pop the keyvalue space lappend v($nest) "SEP" } endinlinetable { puts stderr "endinlinetable" } endmultiquote { puts stderr "endmultiquote for last_space_action 'pop'" } default { error "unexpected tokenType '$tokenType' for last_space_action 'pop'" } } set parentlevel [expr {$nest -1}] lappend v($parentlevel) [set v($nest)] incr nest -1 } elseif {$last_space_action eq "push"} { incr nest 1 set v($nest) [list] # push_trigger_tokens: barekey quotedkey startinlinetable startarray tablename tablearrayname switch -exact -- $tokenType { barekey { set v($nest) [list KEYVAL $tok] ;#$tok is the keyname } quotedkey - itablequotedkey { set v($nest) [list QKEYVAL $tok] ;#$tok is the keyname } tablename { #note: we do not use the output of tomlish::tablename_trim to produce a tablename for storage in the tomlish list! #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish # back to toml file will be identical. #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names. #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the # tomlish list? set test_only [::tomlish::utils::tablename_trim $tok] ::tomlish::log::debug "trimmed (but not normalized) tablename: '$test_only'" set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name #note also that equivalent tablenames may have different toml representations even after being trimmed! #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. } tablearrayname { set test_only [::tomlish::utils::tablename_trim $tok] puts stdout "trimmed (but not normalized) tablearrayname: '$test_only'" set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name } startarray { set v($nest) [list ARRAY] ;#$tok is just the opening bracket - don't output. } startinlinetable { set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. } startmultiquote { puts stderr "push trigger tokenType startmultiquote (todo)" set v($nest) [list MULTISTRING] ;#container for STRINGPART, NEWLINE #JMN ??? #set next_tokenType_known 1 #::tomlish::parse::set_tokenType "multistring" #set tok "" } default { error "push trigger tokenType '$tokenType' not yet implemented" } } } else { #no space level change switch -exact -- $tokenType { starttablename { #$tok is triggered by the opening bracket and sends nothing to output } starttablearrayname { #$tok is triggered by the double opening brackets and sends nothing to output } tablename - tablenamearray { error "did not expect 'tablename/tablearrayname' without space level change" #set v($nest) [list TABLE $tok] } endtablename - endtablearrayname { #no output into the tomlish list for this token } startinlinetable { puts stderr "decode::toml error. did not expect startlinetable without space level change" } startquote { switch -exact -- $nextstate { string { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "string" set tok "" } quotedkey { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "quotedkey" set tok "" } itablequotedkey { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "itablequotedkey" set tok "" } default { error "startquote switch case not implemented for nextstate: $nextstate" } } } startmultiquote { #review puts stderr "no space level change - got startmultiquote" set next_tokenType_known 1 ::tomlish::parse::set_tokenType "stringpart" set tok "" } endquote { #nothing to do? set tok "" } endmultiquote { #JMN!! set tok "" } string { lappend v($nest) [list STRING $tok] } stringpart { lappend v($nest) [list STRINGPART $tok] } multistring { #review lappend v($nest) [list MULTISTRING $tok] } quotedkey { #lappend v($nest) [list QKEY $tok] ;#TEST } itablequotedkey { } untyped-value { #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. if {$tok in {true false}} { set tag BOOL } elseif {[::tomlish::utils::is_int $tok]} { set tag INT } elseif {[::tomlish::utils::is_float $tok]} { set tag FLOAT } elseif {[::tomlish::utils::is_datetime $tok]} { set tag DATETIME } else { error "Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. [tomlish::parse::report_line]" } lappend v($nest) [list $tag $tok] } comment { #puts stdout "----- comment token returned '$tok'------" lappend v($nest) [list COMMENT "$tok"] } equal { #we append '=' to the nest so that any surrounding whitespace is retained. lappend v($nest) = } comma { lappend v($nest) SEP } newline { incr linenum lappend v($nest) [list NEWLINE $tok] } whitespace { lappend v($nest) [list WS $tok] } continuation { lappend v($nest) CONT } bom { lappend v($nest) BOM } eof { #ok - nothing more to add to the tomlish list. #!todo - check previous tokens are complete/valid? } default { error "unknown tokenType '$tokenType' [::tomlish::parse::report_line]" } } } if {!$next_tokenType_known} { ::tomlish::parse::set_tokenType "" set tok "" } if {$state eq "end"} { break } } #while {$nest > 0} { # lappend v([expr {$nest -1}]) [set v($nest)] # incr nest -1 #} while {[::tomlish::parse::spacestack size] > 1} { ::tomlish::parse::spacestack pop lappend v([expr {$nest -1}]) [set v($nest)] incr nest -1 #set parent [spacestack peek] ;#the level being appended to #lassign $parent type state #if {$type eq "space"} { # #} elseif {$type eq "buffer"} { # lappend v([expr {$nest -1}]) {*}[set v($nest)] #} else { # error "invalid spacestack item: $parent" #} } } finally { set is_parsing 0 } return $v(0) } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::decode ---}] } #decode toml to tomlish interp alias {} tomlish::from_toml {} tomlish::decode::toml namespace eval tomlish::utils { #*** !doctools #[subsection {Namespace tomlish::utils}] #[para] #[list_begin definitions] #tablenames (& tablearraynames) may contain irrelevant leading, trailing and interspersed whitespace # tablenames can be made up of segments delimited by dots. .eg [ a.b . c ] #trimmed, the tablename becomes {a.b.c} # A segment may contain whitespace if it is quoted e.g [a . b . "c etc " ] #ie whitespace is only irrelevant if it's outside a quoted segment #trimmed, the tablename becomes {a.b."c etc "} proc tablename_trim {tablename} { set segments [tablename_split $tablename false] set trimmed_segments [list] foreach seg $segments { lappend trimmed_segments [::string trim $seg [list " " \t]] } return [join $trimmed_segments .] } #utils::tablename_split proc tablename_split {tablename {normalize false}} { #we can't just split on . because we have to handle quoted segments which may contain a dot. #eg {dog."tater.man"} set i 0 set sLen [::string length $tablename] set segments [list] set mode "unknown" ;#5 modes: unknown, quoted,litquoted, unquoted, syntax #quoted is for double-quotes, litquoted is for single-quotes (string literal) set seg "" for {} {$i < $sLen} {} { if {$i > 0} { set lastChar [::string index $tablename [expr {$i - 1}]] } else { set lastChar "" } set c [::string index $tablename $i] incr i if {$c eq "."} { switch -exact -- $mode { unquoted { #dot marks end of segment. lappend segments $seg set seg "" set mode "unknown" } quoted { append seg $c } unknown { lappend segments $seg set seg "" } litquoted { append seg $c } default { #mode: syntax #we got our dot. - the syntax mode is now satisfied. set mode "unknown" } } } elseif {($c eq "\"") && ($lastChar ne "\\")} { if {$mode eq "unknown"} { if {[::string trim $seg] ne ""} { #we don't allow a quote in the middle of a bare key error "tablename_split. character '\"' invalid at this point in tablename. tablename: '$tablename'" } set mode "quoted" set seg "\"" } elseif {$mode eq "unquoted"} { append seg $c } elseif {$mode eq "quoted"} { append seg $c lappend segments $seg set seg "" set mode "syntax" ;#make sure we only accept a dot or end-of-data now. } elseif {$mode eq "litquoted"} { append seg $c } elseif {$mode eq "syntax"} { error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" } } elseif {($c eq "\'")} { if {$mode eq "unknown"} { append seg $c set mode "litquoted" } elseif {$mode eq "unquoted"} { #single quote inside e.g o'neill append seg $c } elseif {$mode eq "quoted"} { append seg $c } elseif {$mode eq "litquoted"} { append seg $c lappend segments $seg set seg "" set mode "syntax" } elseif {$mode eq "syntax"} { error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" } } elseif {$c in [list " " \t]} { if {$mode eq "syntax"} { #ignore } else { append seg $c } } else { if {$mode eq "syntax"} { error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" } if {$mode eq "unknown"} { set mode "unquoted" } append seg $c } if {$i == $sLen} { #end of data ::tomlish::log::debug "End of data: mode='$mode'" switch -exact -- $mode { quoted { if {$c ne "\""} { error "tablename_split. missing closing double-quote in a segment. tablename: '$tablename'" } if {$normalize} { lappend segments $seg } else { lappend segments [::tomlish::utils::unescape_string [::string range $seg 1 end-1]] #lappend segments [subst -nocommands -novariables [::string range $seg 1 end-1]] ;#wrong } } litquoted { set trimmed_seg [::string trim $seg] if {[::string index $trimmed_seg end] ne "\'"} { error "tablename_split. missing closing single-quote in a segment. tablename: '$tablename'" } lappend segments $seg } unquoted - unknown { lappend segments $seg } syntax { #ok - segment already lappended } default { lappend segments $seg } } } } foreach seg $segments { set trimmed [::string trim $seg [list " " \t]] #note - we explicitly allow 'empty' quoted strings '' & "" # (these are 'discouraged' but valid toml keys) #if {$trimmed in [list "''" "\"\""]} { # puts stderr "tablename_split. warning - Empty quoted string as tablename segment" #} if {$trimmed eq "" } { error "tablename_split. Empty segment found. tablename: '$tablename'" } } return $segments } proc unicode_escape_info {slashu} { #!todo # validate that slashu is either a \uxxxx or \Uxxxxxxxx value of the correct length and # is a valid 'unicode scalar value' # ie integers in the range 0 to D7FF16 and E00016 to 10FFFF16 inclusive #expr {(($x >= 0) && ($x <= 0xD7FF16)) || (($x >= 0xE00016) && ($x <= 0x10FFFF16))} if {[::string match {\\u*} $slashu]} { set exp {^\\u([0-9a-fA-F]{4}$)} if {[regexp $exp $slashu match hex]} { if {[scan $hex %4x dec] != 1} { #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? return [list err [list reason "Failed to convert '$hex' to decimal"]] } else { return [list ok [list char [subst -nocommand -novariable $slashu]]] } } else { return [list err [list reason "Supplied string not of the form \\uHHHH where H in \[0-9a-fA-F\]"]] } } elseif {[::string match {\\U*} $slashu]} { set exp {^\\U([0-9a-fA-F]{8}$)} if {[regexp $exp $slashu match hex]} { if {[scan $hex %8x dec] != 1} { #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? return [list err [list reason "Failed to convert '$hex' to decimal"]] } else { if {(($dec >= 0) && ($dec <= 0xD7FF16)) || (($dec >= 0xE00016) && ($dec <= 0x10FFFF16))} { return [list ok [list char [subst -nocommand -novariable $slashu]]] } else { return [list err [list reason "$slashu is not within the 'unicode scalar value' ranges 0 to 0xD7FF16 or 0xE00016 to 0x10FFFF16"]] } } } else { return [list err [list reason "Supplied string not of the form \\UHHHHHHHH where H in \[0-9a-fA-F\]"]] } } else { return [list err [list reason "Supplied string did not start with \\u or \\U" ]] } } proc unescape_string {str} { #note we can't just use Tcl subst because: # it also transforms \a (audible bell) and \v (vertical tab) which are not in the toml spec. # it would strip out backslashes inappropriately: e.g "\j" becomes just j # it recognizes other escapes which aren't approprite e.g \xhh and octal \nnn # it replaces\ with a single whitespace #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh set buffer "" set buffer4 "" ;#buffer for 4 hex characters following a \u set buffer8 "" ;#buffer for 8 hex characters following a \u set sLen [::string length $str] #we need to handle arbitrarily long sequences of backslashes. \\\\\ etc set slash_active 0 set unicode4_active 0 set unicode8_active 0 #!todo - check for invalid data in the form of a raw carriage return (decimal 13) without following linefeed? set i 0 for {} {$i < $sLen} {} { if {$i > 0} { set lastChar [::string index $str [expr {$i - 1}]] } else { set lastChar "" } set c [::string index $str $i] ::tomlish::log::debug "unescape_string. got char $c" scan $c %c n if {($n <= 31) && ($n != 9) && ($n != 10) && ($n != 13)} { #we don't expect unescaped unicode characters from 0000 to 001F - #*except* for raw tab (which is whitespace) and newlines error "unescape_string. Invalid data for a toml string. Unescaped control character (decimal $n) [::tomlish::utils::string_to_slashu $c]" } incr i ;#must incr here because we do'returns'inside the loop if {$c eq "\\"} { if {$slash_active} { append buffer "\\" set slash_active 0 } elseif {$unicode4_active} { error "unescape_string. unexpected case slash during unicode4 not yet handled" } elseif {$unicode8_active} { error "unescape_string. unexpected case slash during unicode8 not yet handled" } else { # don't output anything (yet) set slash_active 1 } } else { if {$unicode4_active} { if {[::string length $buffer4] < 4} { append buffer4 $c } if {[::string length $buffer4] == 4} { #we have a \uHHHH to test set unicode4_active 0 set result [tomlish::utils::unicode_escape_info "\\u$buffer4"] if {[lindex $result 0] eq "ok"} { append buffer [dict get $result ok char] } else { error "unescape_string error: [lindex $result 1]" } } } elseif {$unicode8_active} { if {[::string length $buffer8] < 8} { append buffer8 $c } if {[::string length $buffer8] == 8} { #we have a \UHHHHHHHH to test set unicode8_active 0 set result [tomlish::utils::unicode_escape_info "\\U$buffer8"] if {[lindex $result 0] eq "ok"} { append buffer [dict get $result ok char] } else { error "unescape_string error: [lindex $result 1]" } } } elseif {$slash_active} { set slash_active 0 set ctest [string map {{"} dq} $c] switch -exact -- $ctest { dq { set e "\\\"" append buffer [subst -nocommand -novariable $e] } b - t - n - f - r { set e "\\$c" append buffer [subst -nocommand -novariable $e] } u { set unicode4_active 1 set buffer4 "" } U { set unicode8_active 1 set buffer8 "" } default { set slash_active 0 append buffer "\\" append buffer $c } } } else { append buffer $c } } } #puts stdout "EOF 4:$unicode4_active 8:$unicode8_active slash:$slash_active" if {$unicode4_active} { error "End of string reached before complete unicode escape sequence \uHHHH" } if {$unicode8_active} { error "End of string reached before complete unicode escape sequence \UHHHHHHHH" } if {$slash_active} { append buffer "\\" } return $buffer } proc normalize_key {rawkey} { set c1 [::string index $rawkey 0] set c2 [::string index $rawkey end] if {($c1 eq "'") && ($c2 eq "'")} { #single quoted segment. No escapes allowed within it. set key [::string range $rawkey 1 end-1] } elseif {($c1 eq "\"") && ($c2 eq "\"")} { #double quoted segment. Apply escapes. # set keydata [::string range $rawkey 1 end-1] ;#strip outer quotes only set key [::tomlish::utils::unescape_string $keydata] #set key [subst -nocommands -novariables $keydata] ;#wrong. Todo - create a string escape substitution function. } else { set key $rawkey } return $key } proc string_to_slashu {string} { set rv {} foreach c [split $string {}] { scan $c %c c append rv {\u} append rv [format %.4X $c] } return $rv } #'nonprintable' is conservative here because some systems (e.g windows console) are very limited in what they can display. proc nonprintable_to_slashu {s} { set res "" foreach i [split $s ""] { scan $i %c c set printable 0 if {($c>31) && ($c<127)} { set printable 1 } if {$printable} {append res $i} else {append res \\u[format %.4X $c]} } set res } ;#RS #check if str is valid for use as a toml bare key proc is_barekey {str} { if {[::string length $str] == 0} { return 0 } else { set matches [regexp -all {[a-zA-Z0-9\_\-]} $str] if {[::string length $str] == $matches} { #all characters match the regexp return 1 } else { return 0 } } } #test only that the characters in str are valid for the toml specified type 'integer'. proc int_validchars1 {str} { set numchars [::string length $str] if {[regexp -all {[0-9\_\-\+]} $str] == $numchars} { return 1 } else { return 0 } } #add support for hex,octal,binary 0x.. 0o.. 0b... proc int_validchars {str} { set numchars [::string length $str] if {[regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] == $numchars} { return 1 } else { return 0 } } proc is_int {str} { set matches [regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] if {[tcl::string::length $str] == $matches} { #all characters in legal range # --------------------------------------- #check for leading zeroes in non 0x 0b 0o #first strip any +, - or _ (just for this test) set check [::string map {+ "" - "" _ ""} $str] if {([tcl::string::length $check] > 1) && ([tcl::string::index $check 0] eq "0") && ([tcl::string::index $check 1] ni {o x b})} { return 0 } # --------------------------------------- #check +,- only occur in the first position. if {[::string last - $str] > 0} { return 0 } if {[::string last + $str] > 0} { return 0 } set numeric_value [::string map {_ ""} $str] ;#allow some earlier tcl versions which don't support underscores #use Tcl's integer check to ensure we don't let things like 3e4 through - which is a float (would need to be 0x3e4 for hex) if {![string is integer -strict $numeric_value]} { return 0 } #!todo - check bounds only based on some config value #even though Tcl can handle bignums, we won't accept anything outside of toml 1.0 minimum requirements. #presumably very large numbers would have to be supplied in a toml file as strings. #Review - toml 1.0 only says that it must handle up to 2^63 - not that this is a max if {$numeric_value > $::tomlish::max_int} { return 0 } if {$numeric_value < $::tomlish::min_int} { return 0 } } else { return 0 } #Got this far - didn't find anything wrong with it. return 1 } #test only that the characters in str are valid for the toml specified type 'float'. proc float_validchars {str} { set numchars [::string length $str] if {[regexp -all {[eE0-9\_\-\+\.]} $str] == $numchars} { return 1 } else { #only allow lower case for these special values - as per Toml 1.0 spec if {$str ni {inf +inf -inf nan +nan -nan}} { return 0 } else { return 1 } } } proc is_float {str} { set matches [regexp -all {[eE0-9\_\-\+\.]} $str] #don't test for case variations - as Toml doesn't allow (whereas Tcl allows Inf NaN etc) if {$str in {inf +inf -inf nan +nan -nan}} { return 1 } if {[::string length $str] == $matches} { #all characters in legal range #A leading zero is ok, but we should disallow multiple leading zeroes (same rules as toml ints) #Toml spec also disallows leading zeros in the exponent part #... but this seems less interoperable (some libraries generate leading zeroes in exponents) #for now we will allow leading zeros in exponents #!todo - configure 'strict' option to disallow? #first strip any +, - or _ (just for this test) set check [::string map {+ "" - "" _ ""} $str] set r {([0-9])*} regexp $r $check intpart ;#intpart holds all numerals before the first .,e or E set z {([0])*} regexp $z $intpart leadingzeros if {[::string length $leadingzeros] > 1} { return 0 } #for floats, +,- may occur in multiple places #e.g -2E-22 +3e34 #!todo - check bounds ? #strip underscores for tcl double check set check [::string map {_ ""} $str] #string is double accepts inf nan +NaN etc. if {![::string is double $check]} { return 0 } } else { return 0 } #Got this far - didn't find anything wrong with it. return 1 } #test only that the characters in str are valid for the toml specified type 'datetime'. proc datetime_validchars {str} { set numchars [::string length $str] if {[regexp -all {[zZtT0-9\-\+\.:]} $str] == $numchars} { return 1 } else { return 0 } } proc is_datetime {str} { #e.g 1979-05-27T00:32:00-07:00 set matches [regexp -all {[zZtT0-9\-\+\.:]} $str] if {[::string length $str] == $matches} { #all characters in legal range #!todo - use full RFC 3339 parser? lassign [split $str T] datepart timepart #!todo - what if the value is 'time only'? if {[catch {clock scan $datepart} err]} { puts stderr "tcl clock scan failed err:'$err'" return 0 } #!todo - verify time part is reasonable } else { return 0 } return 1 } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::utils ---}] } namespace eval tomlish::parse { #*** !doctools #[subsection {Namespace tomlish::parse}] #[para] #[list_begin definitions] variable is_parsing 0 ;#whether we are in the middle of parsing tomlish text variable state # states: # key-space, curly-space, array-space # value-expected, keyval-syntax, doublequoted, singlequoted, quotedkey, string, multistring # # notes: # key-space i # only the -space states are also 'spaces' ie a container which is pushed/popped on the spacestack # value-expected - we also allow for leading whitespace in this state, but once a value is returned we jump to a state based on the containing space. e.g keytail or array-syntax # #stateMatrix defines for each state, actions to take for each possible token. #single-element actions are the name of the next state into which to transition, or a 'popspace' command to pop a level off the spacestack and add the data to the parent container. #dual-element actions are a push command and the name of the space to push on the stack. # - pushspace is a simple push onto the spacestack, zeropoppushspace also pushes, but will first do a pop *if* the current space level is greater than zero (ie if only if not already in root key-space) #test variable stateMatrix set stateMatrix [dict create] dict set stateMatrix\ key-space {whitespace "key-space" newline "key-space" bom "key-space" barekey {pushspace "keyval-space"} eof "end" startquote "quotedkey" startmultiquote "err" endquote "err" comment "key-space" comma "err" starttablename "tablename" starttablearrayname "tablearrayname"} dict set stateMatrix\ curly-space {\ whitespace "curly-space"\ newline "curly-space"\ barekey {pushspace "itablekeyval-space"}\ itablequotedkey "itablekeyval-space"\ endinlinetable "popspace"\ startquote "itablequotedkey"\ comma "curly-space"\ eof "err"\ comment "err"\ } #REVIEW #toml spec looks like heading towards allowing newlines within inline tables #https://github.com/toml-lang/toml/issues/781 dict set stateMatrix\ curly-syntax {\ whitespace "curly-syntax"\ newline "curly-syntax"\ barekey {pushspace "itablekeyval-space"}\ itablequotedkey "itablekeyval-space"\ endinlinetable "popspace"\ startquote "itablequotedkey"\ comma "curly-space"\ eof "err"\ comment "err"\ } dict set stateMatrix\ value-expected {\ whitespace "value-expected"\ newline "err"\ eof "err"\ untyped-value "samespace"\ startquote "string"\ startmultiquote {pushspace "multistring-space"}\ startinlinetable {pushspace curly-space}\ comment "err"\ comma "err"\ startarray {pushspace array-space}\ } dict set stateMatrix\ array-space {\ whitespace "array-space"\ newline "array-space"\ eof "err"\ untyped-value "samespace"\ startarray {pushspace "array-space"}\ endarray "popspace"\ startquote "string"\ startmultiquote "multistring"\ comma "array-space"\ comment "array-space"\ } dict set stateMatrix\ array-syntax {\ whitespace "array-syntax"\ newline "array-syntax"\ untyped-value "samespace"\ startarray {pushspace array-space}\ endarray "popspace"\ startquote "string"\ startmultiquote "multistring"\ comma "array-space"\ comment "err"\ } dict set stateMatrix\ itablekeyval-syntax {whitespace "itablekeyval-syntax" endquote "itablekeyval-syntax" newline "err" equal "value-expected" eof "err"} #dict set stateMatrix\ # itablekeytail {whitespace "itablekeytail" endinlinetable "popspace" comma "popspace" newline "err" comment "err" eof "err"} dict set stateMatrix\ itablevaltail {whitespace "itablevaltail" endinlinetable "popspace" comma "popspace" newline "err" comment "err" eof "err"} dict set stateMatrix\ itablekeyval-space {} dict set stateMatrix\ itablequotedkey {whitespace "NA" itablequotedkey {pushspace "itablekeyval-space"} newline "err" endquote "itablekeyval-syntax"} dict set stateMatrix\ keyval-syntax {whitespace "keyval-syntax" endquote "keyval-syntax" comma "err" newline "err" equal "value-expected" eof "err"} dict set stateMatrix\ keytail {whitespace "keytail" newline "popspace" comment "keytail" eof "end"} dict set stateMatrix\ keyval-space {} dict set stateMatrix\ quotedkey {whitespace "NA" quotedkey {pushspace "keyval-space"} newline "err" endquote "keyval-syntax"} dict set stateMatrix\ string {whitespace "NA" newline "err" string "string" endquote "samespace" eof "err"} dict set stateMatrix\ stringpart {eof "err" continuation "samespace" endmultiquote "popspace"} dict set stateMatrix\ multistring {whitespace "NA" newline "NA" multistring "multistring" endmultiquote "samespace" endquote "err" eof "err"} dict set stateMatrix\ multistring-space {whitespace "multistring-space" continuation "multistring-space" stringpart "multistring-space" multistring "multistring-space" newline "multistring-space" eof "err" endmultiquote "popspace"} dict set stateMatrix\ tablename {whitespace "NA" tablename {zeropoppushspace key-space} tablename2 {pushspace key-space} newline "err" endtablename "tablenametail"} dict set stateMatrix\ baretablename {whitespace "NA" newline "err" equal "value-expected"} dict set stateMatrix\ tablenametail {whitespace "tablenametail" newline "key-space" comment "tablenametail" eof "end"} dict set stateMatrix\ tablearrayname {whitespace "NA" tablearrayname {zeropoppushspace key-space} tablearrayname2 {pushspace key-space} newline "err" endtablearray "tablearraynametail"} dict set stateMatrix\ tablearraynametail {whitespace "tablearraynametail" newline "key-space" comment "tablearraynametail" eof "end"} dict set stateMatrix\ end {} #see also spacePopTransitions and spacePushTransitions below for state redirections on pop/push variable stateMatrix_orig { key-space {whitespace "key-space" newline "key-space" bom "key-space" barekey {pushspace "keyval-space"} eof "end" startquote "quotedkey" startmultiquote "err" endquote "err" comment "key-space" comma "err" starttablename "tablename" starttablearrayname "tablearrayname"} curly-space {whitespace "curly-space" newline "curly-space" barekey {pushspace "keyval-space"} quotedkey {pushspace "keyval-space"} startcurly {pushspace curly-space} endcurly "popspace" eof "err"} value-expected {whitespace "value-expected" newline "err" eof "err" untyped-value "samespace" startquote "string" startmultiquote {pushspace "multistring-space"} comment "err" comma "err" startarray {pushspace array-space}} array-space {whitespace "array-space" newline "array-space" eof "err" untyped-value "samespace" startarray {pushspace "array-space"} endarray "popspace" startquote "string" startmultiquote "multistring" comma "array-space" comment "array-space"} array-syntax {whitespace "array-syntax" newline "array-syntax" comma "array-space" untyped-value "samespace" startquote "string" startmultiquote "multistring" startarray {pushspace array-space} comment "err" endarray "popspace"} keyval-syntax {whitespace "keyval-syntax" endquote "keyval-syntax" newline "err" equal "value-expected" eof "err"} keytail {whitespace "keytail" newline "popspace" comment "keytail" eof "end"} keyval-space {} quotedkey {whitespace "NA" quotedkey {pushspace "keyval-space"} newline "err" endquote "keyval-syntax"} string {whitespace "NA" newline "err" string "string" endquote "samespace" eof "err"} stringpart {eof "err" continuation "samespace" endmultiquote "popspace"} multistring {whitespace "NA" newline "NA" multistring "multistring" endmultiquote "samespace" endquote "err" eof "err"} multistring-space {whitespace "multistring-space" continuation "multistring-space" stringpart "multistring-space" newline "multistring-space" eof "err" endmultiquote ""} tablename {whitespace "NA" tablename {zeropoppushspace key-space} tablename2 {pushspace key-space} newline "err" endtablename "tablenametail"} tablenametail {whitespace "tablenametail" newline "key-space" comment "tablenametail" eof "end"} tablearrayname {whitespace "NA" tablearrayname {zeropoppushspace key-space} tablearrayname2 {pushspace key-space} newline "err" endtablearray "tablearraynametail"} tablearraynametail {whitespace "tablearraynametail" newline "key-space" comment "tablearraynametail" eof "end"} end {} } #build a list of 'push triggers' from the stateMatrix # ie tokens which can push a new space onto spacestack set push_trigger_tokens [list] tcl::dict::for {s transitions} $stateMatrix { tcl::dict::for {token transition_to} $transitions { set action [lindex $transition_to 0] switch -exact -- $action { pushspace - zeropoppushspace { if {$token ni $push_trigger_tokens} { lappend push_trigger_tokens $token } } } } } puts stdout "push_trigger_tokens: $push_trigger_tokens" #!todo - hard code once stateMatrix finalised? #mainly for the -space states: #redirect to another state $c based on a state transition from $whatever to $b # e.g "string {array-space array-syntax}" means when transitioning from string to array-space, jump to array-syntax instead. #this is useful as we often don't know state $b. e.g when it is decided by 'popspace' variable spacePopTransitions { array-space array-syntax curly-space curly-syntax keyval-space keytail itablekeyval-space itablevaltail } variable spacePushTransitions { keyval-space keyval-syntax itablekeyval-space itablekeyval-syntax array-space array-space curly-space curly-space key-space tablename } variable state_list namespace export tomlish toml namespace ensemble create proc getNextState {tokentype currentstate} { variable nest variable v variable spacePopTransitions variable spacePushTransitions variable last_space_action "none" variable last_space_type "none" variable state_list set result "" if {[dict exists $::tomlish::parse::stateMatrix $currentstate $tokentype]} { set transition_to [dict get $::tomlish::parse::stateMatrix $currentstate $tokentype] ::tomlish::log::info "getNextState tokentype:$tokentype currentstate:$currentstate : transition_to = $transition_to" switch -exact -- [lindex $transition_to 0] { popspace { spacestack pop set parent [spacestack peek] lassign $parent type target set last_space_action "pop" set last_space_type $type if {[dict exists $::tomlish::parse::spacePopTransitions $target]} { set next [dict get $::tomlish::parse::spacePopTransitions $target] ::tomlish::log::debug "--->> pop transition to space $target redirected state to $next <<---" } else { set next $target } set result $next } samespace { #note the same data as popspace (spacePopTransitions) is used here. set parent [spacestack peek] ::tomlish::log::debug ">>>>>>>>> got parent $parent <<<<<" lassign $parent type target if {[dict exists $::tomlish::parse::spacePopTransitions $target]} { set next [dict get $::tomlish::parse::spacePopTransitions $target] ::tomlish::log::debug "--->> samespace transition to space $target redirected state to $next <<---" } else { set next $target } set result $next } zeropoppushspace { if {$nest > 0} { #pop back down to the root level (key-space) spacestack pop set parent [spacestack peek] lassign $parent type target set last_space_action "pop" set last_space_type $type #----- #standard pop set parentlevel [expr {$nest -1}] lappend v($parentlevel) [set v($nest)] incr nest -1 #----- } #re-entrancy #set next [list pushspace [lindex $transition_to 1]] set nexttokentype ${tokentype}2 ;#fake token type e.g tablename2 or tablearrayname2 ::tomlish::log::notice "REENTRANCY!!! calling getNextState $nexttokentype $tokentype" set result [::tomlish::parse::getNextState $nexttokentype $tokentype] } pushspace { set target [lindex $transition_to 1] spacestack push [list space $target] set last_space_action "push" set last_space_type "space" #puts $::tomlish::parse::spacePushTransitions if {[dict exists $::tomlish::parse::spacePushTransitions $target]} { set next [dict get $::tomlish::parse::spacePushTransitions $target] ::tomlish::log::info "--->> push transition to space $target redirected state to $next <<---" } else { set next $target } set result $next } default { set result $transition_to } } } else { set result "nostate-err" } lappend state_list $result return $result } proc report_line {{line ""}} { variable linenum variable is_parsing if {$is_parsing} { if {$line eq ""} { set line $linenum } return "Line Number: $line" } else { #not in the middle of parsing tomlish text - return nothing. return "" } } #produce a *slightly* more readable string rep of the nest for puts etc. proc nest_pretty1 {list} { set prettier "{" foreach el $list { if { [lindex $el 0] eq "NEWLINE"} { append prettier "[list $el]\n" } elseif {([llength $el] > 1) && ([lindex $el 0] in {KEYVAL QKEYVAL TABLE ARRAY})} { append prettier [nest_pretty1 $el] } else { append prettier "[list $el] " } } append prettier "}" return $prettier } proc set_tokenType {t} { variable tokenType variable tokenType_list if {![info exists tokenType]} { set tokenType "" } lappend tokenType_list $t set tokenType $t } proc switch_tokenType {t} { variable tokenType variable tokenType_list lset tokenType_list end $t set tokenType $t } proc get_tokenType {} { variable tokenType return $tokenType } proc _shortcircuit_startquotesequence {} { variable tok variable i set toklen [::string length $tok] if {$toklen == 1} { set_tokenType "startquote" incr i -1 return -level 2 1 } elseif {$toklen == 2} { set_tokenType "startquote" set tok "\"" incr i -2 return -level 2 1 } } #return a list of 0 1 or 2 tokens #tomlish::parse::tok proc tok {s} { variable nest variable v variable i variable tok variable type ;#character type variable state ;#FSM set resultlist [list] variable tokenType variable tokenType_list variable endToken set sLen [::string length $s] variable lastChar variable braceCount variable bracketCount #------------------------------ #Previous run found another (presumably single-char) token variable token_waiting if {[dict size $token_waiting]} { set tokenType [dict get $token_waiting type] set tok [dict get $token_waiting tok] dict unset token_waiting type dict unset token_waiting tok return 1 } #------------------------------ set slash_active 0 set quote 0 set c "" set multi_dquote "" for {} {$i < $sLen} {} { if {$i > 0} { set lastChar [string index $s [expr {$i - 1}]] } else { set lastChar "" } set c [string index $s $i] #puts "got char $c during tokenType '$tokenType'" incr i ;#must incr here because we do'returns'inside the loop set ctest [string map {\{ lc \} rc \[ lb \] rb \" dq \\ bsl \r cr \n lf \t tab \uFEFF bom} $c] switch -exact -- $ctest { # { set dquotes $multi_dquote set multi_dquote "" ;#!! if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. set slash_active 0 if {[::string length $tokenType]} { switch -exact -- $tokenType { startquotesequence { _shortcircuit_startquotesequence } barekey { error "Unexpected character '$c' during bare key. Only \[a-zA-Z_-\] allowed. [tomlish::parse::report_line]" } whitespace { # hash marks end of whitespace token #do a return for the whitespace, set token_waiting #dict set token_waiting type comment #dict set token_waiting tok "" incr i -1 ;#leave comment for next run return 1 } untyped-value { #REVIEW! the spec isn't clear.. is whitespace after an int,bool etc required before comment? #we will accept a comment marker as an immediate terminator of the untyped-value. incr i -1 return 1 } default { #quotedkey, string, multistring append tok $c } } } else { #$slash_active not relevant when no tokenType #start of token if we're not in a token set_tokenType comment set tok "" ;#The hash is not part of the comment data } } lc { set multi_dquote "" ;#!! #test jmn2024 #left curly brace try { if {[::string length $tokenType]} { switch -exact -- $tokenType { startquotesequence { _shortcircuit_startquotesequence } string - stringpart { if {$slash_active} {append tok "\\"} append tok $c } starttablename { error "unexpected tablename problem" #$slash_active not relevant to this tokentype #change the tokenType switch_tokenType "starttablearrayname" set tok "" ;#no output into the tomlish list for this token #any following whitespace is part of the tablearrayname, so return now return 1 } comment { if {$slash_active} {append tok "\\"} append tok "\[" } default { #end any other token. incr i -1 return 1 } } } else { switch -exact -- $state { value-expected { #switch last key to tablename?? set_tokenType "startinlinetable" set tok "\{" return 1 } multistring-space { set_tokenType "stringpart" if {$slash_active} { set tok "\\\{" } else { set tok "\{" } } key-space { #invalid - but allow parser statemachine to report it. ? set_tokenType "startinlinetable" set tok "\{" return 1 } array-space - array-syntax { #nested anonymous inline table set_tokenType "startinlinetable" set tok "\{" return 1 } default { error "state: '$state'. left brace case not implemented [tomlish::parse::report_line]" } } } } on error {em} { error $em } finally { set slash_active 0 } } rc { set multi_dquote "" ;#!! #right curly brace try { if {[string length $tokenType]} { switch -exact -- $tokenType { startquotesequence { _shortcircuit_startquotesequence } string - stringpart - comment { if {$slash_active} {append tok "\\"} append tok $c } tablename { if {$slash_active} {append tok "\\"} #invalid! - but leave for datastructure loading stage to catch dict set token_waiting type endinlinetable dict set token_waiting tok "" return 1 } tablearrayname { if {$slash_active} {append tok "\\"} #invalid! - but leave for datastructure loading stage to catch dict set token_waiting type endtablearrayname dict set token_waiting tok "" return 1 } itablevaltail { } default { #end any other token incr i -1 return 1 } } } else { #$slash_active not relevant when no tokenType switch -exact -- $state { value-expected { #invalid - but allow parser statemachine to report it. set_tokenType "endinlinetable" set tok "\}" return 1 } key-space { #invalid - but allow parser statemachine to report it. ? set_tokenType "endinlinetable" set tok "\}" return 1 } tablename { #e.g [] - empty tablename - allowed or not? #empty tablename/tablearrayname ? error "unexpected tablename problem" set_tokenType "endinlinetable" set tok "" ;#no output into the tomlish list for this token return 1 } tablearrayname { error "unexpected tablearrayname problem" set_tokenType "endinlinetable" set tok "" ;#no output into the tomlish list for this token return 1 } curly-syntax - curly-space { set_tokenType "endinlinetable" set tok "\}" return 1 } array-syntax - array-space { #invalid set_tokenType "endinlinetable" set tok "\}" return 1 } itablevaltail { set_tokenType "endinlinetable" set tok "" #we need to pop the keyval - and then reprocess to pop the inlinetable - so we incr -1 incr i -1 return 1 } itablekeyval-syntax { error "endinlinetable unexpected at this point. Expecting key=val syntax [tomlish::parse::report_line]" } default { #JMN2024b keytail? error "state '$state'. endinlinetable case not implemented [tomlish::parse::report_line]" } } } } on error {em} { error $em } finally { set slash_active 0 } } lb { set multi_dquote "" ;#!! #left square bracket try { if {[::string length $tokenType]} { switch -exact -- $tokenType { startquotesequence { _shortcircuit_startquotesequence } string - stringpart { if {$slash_active} {append tok "\\"} append tok $c } starttablename { #$slash_active not relevant to this tokentype #change the tokenType switch_tokenType "starttablearrayname" set tok "" ;#no output into the tomlish list for this token #any following whitespace is part of the tablearrayname, so return now return 1 } comment { if {$slash_active} {append tok "\\"} append tok "\[" } default { #end any other token. incr i -1 return 1 } } } else { #$slash_active not relevant when no tokenType switch -exact -- $state { value-expected { set_tokenType "startarray" set tok "\[" return 1 } key-space { #table name #assume it's a single bracket - but we need to wait for non-bracket to confirm it's not a tablearray #note that a starttablearrayname token may contain whitespace between the brackets # e.g \[ \[ set_tokenType "starttablename" set tok "" ;#there is no output into the tomlish list for this token } array-space - array-syntax { #nested array? set_tokenType "startarray" set tok "\[" return 1 #error "state: array-space. startarray case not implemented [tomlish::parse::report_line]" } default { error "state: '$state'. startarray case not implemented [tomlish::parse::report_line]" } } } } on error {em} { error $em } finally { set slash_active 0 } } rb { set multi_dquote "" ;#!! #right square bracket try { if {[string length $tokenType]} { switch -exact -- $tokenType { startquotesequence { _shortcircuit_startquotesequence } string - stringpart - comment { if {$slash_active} {append tok "\\"} append tok $c } tablename { if {$slash_active} {append tok "\\"} #invalid! - but leave for datastructure loading stage to catch dict set token_waiting type endtablename dict set token_waiting tok "" return 1 } tablearraynames { if {$slash_active} {append tok "\\"} #invalid! - but leave for datastructure loading stage to catch dict set token_waiting type endtablearrayname dict set token_waiting tok "" return 1 } default { incr i -1 return 1 } } } else { #$slash_active not relevant when no tokenType switch -exact -- $state { value-expected { #invalid - but allow parser statemachine to report it. set_tokenType "endarray" set tok "\]" return 1 } key-space { #invalid - but allow parser statemachine to report it. ? set_tokenType "endarray" set tok "\]" return 1 } tablename { #e.g [] - empty tablename - allowed or not? #empty tablename/tablearrayname ? error "unexpected tablename problem" set_tokenType "endtablename" set tok "" ;#no output into the tomlish list for this token return 1 } tablearrayname { error "unexpected tablearrayname problem" set_tokenType "endtablearray" set tok "" ;#no output into the tomlish list for this token return 1 } array-syntax - array-space { set_tokenType "endarray" set tok "\]" return 1 } default { error "state '$state'. endarray case not implemented [tomlish::parse::report_line]" } } } } on error {em} { error $em } finally { set slash_active 0 } } bsl { set dquotes $multi_dquote set multi_dquote "" ;#!! #backslash if {[::string length $tokenType]} { switch -exact -- $tokenType { startquotesequence { _shortcircuit_startquotesequence } string - litstring - multilitstring - comment - tablename - tablearrayname { if {$slash_active} { set slash_active 0 append tok "\\\\" } else { set slash_active 1 } } stringpart { if {$slash_active} { #assert - quotes empty - or we wouldn't have slash_active set slash_active 0 append tok "\\\\" } else { append tok $dquotes set slash_active 1 } } whitespace { if {$state eq "multistring-space"} { #end whitespace token incr i -1 return 1 } else { error "Unexpected backslash during whitespace. [tomlish::parse::report_line]" } } barekey { error "Unexpected backslash during barekey. [tomlish::parse::report_line]" } default { error "Backslash unexpected during tokentype: '$tokenType'. [tomlish::parse::report_line]" } } } else { if {$state eq "multistring-space"} { set slash_active 1 } else { error "Unexpected backslash when no token is active. [tomlish::parse::report_line]" } } } dq { #double quote try { if {[::string length $tokenType]} { switch -exact -- $tokenType { startquotesequence { set toklen [::string length $tok] if {$toklen == 1} { append tok $c } elseif {$toklen == 2} { append tok $c set_tokenType "startmultiquote" return 1 } else { error "unexpected token length in 'startquotesequence'" } } endquotesequence { set toklen [::string length $tok] if {$toklen == 1} { append tok $c } elseif {$toklen == 2} { append tok $c set_tokenType "endmultiquote" return 1 } else { error "unexpected token length in 'endquotesequence'" } } string { if {$slash_active} { append tok "\\" append tok $c } else { #unescaped quote always terminates a string? dict set token_waiting type endquote dict set token_waiting tok "\"" return 1 } } stringpart { #sub element of multistring if {$slash_active} { append tok "\\" append tok $c } else { #incr i -1 if {$multi_dquote eq "\"\""} { dict set token_waiting type endmultiquote dict set token_waiting tok "\"\"\"" set multi_dquote "" return 1 } else { append multi_dquote "\"" } } } whitespace { switch -exact -- $state { multistring-space { #REVIEW if {$multi_dquote eq "\"\""} { dict set token_waiting type endmultiquote dict set token_waiting tok "\"\"\"" set multi_dquote "" return 1 } else { append multi_dquote "\"" } } value-expected { if {$multi_dquote eq "\"\""} { dict set token_waiting type startmultiquote dict set token_waiting tok "\"\"\"" set multi_dquote "" return 1 } else { #end whitespace token and reprocess incr i -1 return 1 #append multi_dquote "\"" } } default { dict set token_waiting type startquote dict set token_waiting tok "\"" return 1 } } } comment { if {$slash_active} {append tok "\\"} append tok $c } quotedkey - itablequotedkey { if {$slash_active} { append tok "\\" append tok $c } else { dict set token_waiting type endquote dict set token_waiting tok "\"" return 1 } } tablename - tablearrayname { if {$slash_active} {append tok "\\"} append tok $c } starttablename - starttablearrayname { incr i -1 ;## return 1 } default { error "got quote during tokenType '$tokenType' [tomlish::parse::report_line]" } } } else { #$slash_active not relevant when no tokenType #token is string only if we're expecting a value at this point switch -exact -- $state { value-expected - array-space { #!? start looking for possible multistartquote #set_tokenType startquote #set tok $c #return 1 set_tokenType startquotesequence ;#one or more quotes in a row - either startquote or multistartquote set tok $c } multistring-space { #REVIEW if {$multi_dquote eq "\"\""} { dict set token_waiting type endmultiquote dict set token_waiting tok "\"\"\"" set multi_dquote "" return 1 } else { append multi_dquote "\"" } } key-space { set tokenType startquote set tok $c return 1 } curly-space { set tokenType startquote set tok $c return 1 } tablename - tablearrayname { set_tokenType $state set tok $c } default { error "Unexpected quote during state '$state' [tomlish::parse::report_line]" } } } } on error {em} { error $em } finally { set slash_active 0 } } = { set dquotes $multi_dquote set multi_dquote "" ;#!! if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. set slash_active 0 if {[::string length $tokenType]} { switch -exact -- $tokenType { startquotesequence { _shortcircuit_startquotesequence } string - comment - quotedkey { #for these tokenTypes an = is just data. append tok $c } stringpart { append tok $dquotes$c } whitespace { dict set token_waiting type equal dict set token_waiting tok = return 1 } barekey { dict set token_waiting type equal dict set token_waiting tok = return 1 } default { error "unexpected = character during tokentype $tokenType. case not implemented. [tomlish::parse::report_line]" } } } else { switch -exact -- $state { multistring-space { set_tokenType stringpart set tok ${dquotes}= } default { set_tokenType equal set tok = return 1 } } } } cr { set dquotes $multi_dquote set multi_dquote "" ;#!! # \r carriage return if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. set slash_active 0 if {[::string length $tokenType]} { switch -exact -- $tokenType { startquotesequence { _shortcircuit_startquotesequence } stringpart { append tok $dquotes$c } default { #!todo - error out if cr inappropriate for tokenType append tok $c } } } else { #lf may be appended if next #review - lone cr as newline? - this is uncommon - but so is lone cr in a string(?) set_tokenType newline set tok cr } } lf { set dquotes $multi_dquote set multi_dquote "" ;#!! # \n newline if {[::string length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. set slash_active 0 switch -exact -- $tokenType { startquotesequence { _shortcircuit_startquotesequence } newline { #this lf is the trailing part of a crlf append tok lf return 1 } stringpart { if {$dquotes ne ""} { append tok $dquotes incr i -1 return 1 } else { dict set token_waiting type newline dict set token_waiting tok lf return 1 } } default { #newline ends all other tokens. #note for string: we don't add (raw unescaped) newline to simple string. (must use multi-string for this) #note for whitespace: # we will use the convention that \n terminates the current whitespace even if whitespace follows # ie whitespace is split into separate whitespace tokens at each newline #puts "-------------- newline lf during tokenType $tokenType" dict set token_waiting type newline dict set token_waiting tok lf return 1 } } } else { set had_slash $slash_active set slash_active 0 if {$had_slash} { set_tokenType "continuation" set tok "\\" incr i -1 return 1 } else { set_tokenType newline set tok lf return 1 } } } , { set dquotes $multi_dquote set multi_dquote "" ;#!! if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. set slash_active 0 if {[::string length $tokenType]} { switch -exact -- $tokenType { startquotesequence { _shortcircuit_startquotesequence } string - comment - quotedkey - tablename - tablearrayname { append tok $c } stringpart { append tok $dquotes$c } default { dict set token_waiting type comma dict set token_waiting tok "," return 1 } } } else { switch -exact -- $state { multistring-space { set_tokenType stringpart set tok "," } multiliteral-space { set_tokenType literalpart set tok "," } default { set_tokenType comma set tok "," return 1 } } } } . { set multi_dquote "" ;#!! if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. set slash_active 0 if {[::string length $tokenType]} { switch -exact -- $tokenType { startquotesequence { _shortcircuit_startquotesequence } string - stringpart - comment - quotedkey - untyped-value { append tok $c } baretablename - tablename - tablearrayname { #subtable - split later - review append tok $c } barekey { #we need to transition the barekey to become a structured table name ??? review switch_tokenType tablename incr i -1 #error "barekey period unimplemented" } default { error "Received period during tokenType '$tokenType' [tomlish::parse::report_line]" #dict set token_waiting type period #dict set token_waiting tok "." #return 1 } } } else { switch -exact -- $state { multistring-space { set_tokenType stringpart set tok "." } multiliteral-space { set_tokenType literalpart set tok "." } default { set_tokenType untyped-value set tok "." } } } } " " { set dquotes $multi_dquote set multi_dquote "" ;#!! if {[::string length $tokenType]} { set had_slash $slash_active set slash_active 0 switch -exact -- $tokenType { startquotesequence { _shortcircuit_startquotesequence } barekey { #whitespace is a terminator for bare keys #dict set token_waiting type whitespace #dict set token_waiting tok $c incr i -1 return 1 } untyped-value { #unquoted values (int,date,float etc) are terminated by whitespace #dict set token_waiting type whitespace #dict set token_waiting tok $c incr i -1 return 1 } comment { if {$had_slash} { append tok "\\" } append tok $c } quotedkey - string { if {$had_slash} { append tok "\\" } #if {$dquotes eq "\""} { #} append tok $c } whitespace { append tok $c } stringpart { if {$had_slash} { #REVIEW incr i -2 return 1 } else { #split into STRINGPART aaa WS " " #keeping WS separate allows easier processing of CONT stripping append tok $dquotes incr i -1 return 1 } } starttablename { incr i -1 return 1 } starttablearrayname { incr i -1 return 1 } tablename - tablearrayname { #include whitespace in the tablename/tablearrayname #Will need to be normalized upon interpreting the tomlish as a datastructure append tok $c } default { error "Received whitespace space during tokenType '$tokenType' [tomlish::parse::report_line]" } } } else { set had_slash $slash_active if {$slash_active} { set slash_active 0 } switch -exact -- $state { tablename - tablearrayname { #tablename can have leading,trailing and interspersed whitespace! #These will not be treated as whitespace tokens, instead forming part of the name. set_tokenType $state if {$had_slash} { set tok "\\$c" } else { set tok $c } } multistring-space { if {$had_slash} { set_tokenType "continuation" set tok "\\" incr i -1 return 1 } else { if {$dquotes ne ""} { set_tokenType "stringpart" set tok $dquotes incr i -1 return } set_tokenType "whitespace" append tok $c } } default { if {$had_slash} { error "unexpected backslash [tomlish::parse::report_line]" } set_tokenType "whitespace" append tok $c } } } } tab { set dquotes $multi_dquote set multi_dquote "" ;#!! if {[::string length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. set slash_active 0 switch -exact -- $tokenType { startquotesequence { _shortcircuit_startquotesequence } barekey { #whitespace is a terminator for bare keys incr i -1 #set token_waiting type whitespace #set token_waiting tok $c return 1 } untyped_value { #unquoted values (int,date,float etc) are terminated by whitespace #dict set token_waiting type whitespace #dict set token_waiting tok $c incr i -1 return 1 } quotedkey { append tok $c } string - comment - whitespace { append tok $c } stringpart { append tok $dquotes$c } starttablename - starttablearrayname { incr i -1 return 1 } tablename - tablearraynames { #include whitespace in the tablename/tablearrayname #Will need to be normalized upon interpreting the tomlish as a datastructure append tok $c } default { error "Received whitespace tab during tokenType '$tokenType' [tomlish::parse::report_line]" } } } else { set had_slash $slash_active if {$slash_active} { set slash_active 0 } switch -exact -- $state { tablename - tablearrayname { #tablename can have leading,trailing and interspersed whitespace! #These will not be treated as whitespace tokens, instead forming part of the name. set_tokenType $state set tok $c } multistring-space { if {$had_slash} { set_tokenType "continuation" set tok "\\" incr i -1 return 1 } else { if {$dquotes ne ""} { set_tokenType stringpart set tok $dquotes incr i -1 return 1 } else { set_tokenType whitespace append tok $c } } } default { set_tokenType "whitespace" append tok $c } } } } bom { #BOM (Byte Order Mark) - ignored by token consumer set_tokenType "bom" set tok "\uFEFF" return 1 } default { set dquotes $multi_dquote set multi_dquote "" ;#!! if {[::string length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. set slash_active 0 switch -exact -- $tokenType { startquotesequence { _shortcircuit_startquotesequence } endquotesequence { puts stderr "endquotesequence: $tok" } whitespace { incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. return 1 } barekey { if {[tomlish::utils::is_barekey $c]} { append tok $c } else { error "Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] allowed. [tomlish::parse::report_line]" } } starttablename - starttablearrayname { incr i -1 #allow statemachine to set context for subsequent chars return 1 } stringpart { append tok $dquotes$c } default { #e.g comment/string/untyped-value/starttablename/starttablearrayname/tablename/tablearrayname append tok $c } } } else { set had_slash $slash_active set slash_active 0 switch -exact -- $state { key-space - curly-space - curly-syntax { #if no currently active token - assume another key value pair if {[tomlish::utils::is_barekey $c]} { set_tokenType "barekey" append tok $c } else { error "Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" } } multistring-space { set_tokenType "stringpart" if {$had_slash} { #assert - we don't get had_slash and dquotes at same time set tok \\$c } else { set tok $dquotes$c } } tablename { set_tokenType "tablename" set tok $c } tablearrayname { set_tokenType "tablearrayname" set tok $c } default { set_tokenType "untyped-value" set tok $c } } } } } } #run out of characters (eof) if {[::string length $tokenType]} { #check for invalid ending tokens #if {$state eq "err"} { # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" #} if {$tokenType eq "startquotesequence"} { set toklen [::string length $tok] if {$toklen == 1} { #invalid #eof with open string eror "eof reached without closing quote for string. [tomlish::parse::report_line]" } elseif {$toklen == 2} { #valid #we ended in a double quote, not actually a startquoteseqence - effectively an empty string switch_tokenType "startquote" incr i -1 #dict set token_waiting type "string" #dict set token_waiting tok "" return 1 } } dict set token_waiting type "eof" dict set token_waiting tok "eof" return 1 } else { ::tomlish::log::debug "No current tokenType, ran out of characters, setting tokenType to 'eof' [tomlish::parse::report_line]" set tokenType "eof" set tok "eof" } return 0 } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::parse ---}] } tcl::namespace::eval tomlish::app { variable applist [list encoder decoder test] #*** !doctools #[subsection {Namespace tomlish::app}] #[para] #[list_begin definitions] proc decoder {args} { #*** !doctools #[call app::[fun decoder] [arg args]] #[para] read toml on stdin until EOF #[para] on error - returns non-zero exit code and writes error on stderr #[para] on success - returns zero exit code and writes JSON encoding of the data on stdout #[para] This decoder is intended to be compatible with toml-test set opts [dict merge [dict create] $args] #fconfigure stdin -encoding utf-8 fconfigure stdin -translation binary #Just slurp it all - presumably we are not handling massive amounts of data on stdin. # - even if the input is large, we probably don't gain much (aside from possible memory savings?) by attempting to process input as it arrives. if {[catch { set toml [read stdin] }]} { exit 2 ;#read error } try { set j [::tomlish::toml_to_json $toml] } on error {em} { puts stderr "decoding failed: '$em'" exit 1 } puts -nonewline stdout $j exit 0 } proc encoder {args} { #*** !doctools #[call app::[fun encoder] [arg args]] #[para] read JSON on stdin until EOF #[para] return non-zero exitcode if JSON data cannot be converted to a valid TOML representation #[para] return zero exitcode and TOML data on stdout if JSON data can be converted. #[para] This encoder is intended to be compatible with toml-test set opts [dict merge [dict create] $args] fconfigure stdin -translation binary if {[catch { set json [read stdin] }]} { exit 2 ;#read error } try { set toml [::tomlish::json_to_toml $json] } on error {em} { puts stderr "encoding failed: '$em'" exit 1 } puts -nonewline stdout $toml exit 0 } proc test {args} { set opts [dict merge [dict create] $args] package require test::tomlish if {[dict exists $opts -suite]} { test::tomlish::suite [dict get $opts -suite] } test::tomlish::run } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::app ---}] } proc ::tomlish::appnames {} { set applist [list] foreach cmd [info commands ::tomlish::app::*] { lappend applist [namespace tail $cmd] } return $applist } # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # Secondary API namespace # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ namespace eval tomlish::lib { namespace export {[a-z]*}; # Convention: export all lowercase namespace path [namespace parent] #*** !doctools #[subsection {Namespace tomlish::lib}] #[para] Secondary functions that are part of the API #[list_begin definitions] #proc utility1 {p1 args} { # #*** !doctools # #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]] # #[para]Description of utility1 # return 1 #} #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::lib ---}] } # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ if {$argc > 0} { puts stderr "argc: $argc args: $argv" if {($argc == 1)} { if {[::string tolower $argv] in {help -help h -h}} { puts stdout "Usage: -app where appname one of:[tomlish::appnames]" exit 0 } else { puts stderr "Argument '$argv' not understood. Try -help" exit 1 } } set opts [dict create] set opts [dict merge $opts $argv] set opts_understood [list -app ] if {"-app" in [dict keys $opts]} { #Don't vet the remaining opts - as they are interpreted by each app } else { foreach key [dict keys $opts] { if {$key ni $opts_understood} { puts stderr "Option '$key' not understood" exit 1 } } } if {[dict exists $opts -app]} { set app [dict get $opts -app] if {$app ni [tomlish::appnames]} { puts stderr "app '[dict get $opts -app]' not found" exit 1 } tomlish::app::$app {*}$opts } } ## Ready package provide tomlish [namespace eval tomlish { variable pkg tomlish variable version set version 1.1.1 }] return #*** !doctools #[manpage_end]