# -*- tcl -*-
# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt
#
# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem.
# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository.
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# (C) 2024
#
# @@ Meta Begin
# Application tomlish 1.1.6
# Meta platform    tcl
# Meta license
# @@ Meta End
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# doctools header
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[manpage_begin tomlish_module_tomlish 0 1.1.6]
#[copyright "2024"]
#[titledesc {tomlish toml parser}] [comment {-- Name section and table of contents description --}]
#[moddesc {tomlish}] [comment {-- Description at end of page heading --}]
#[require tomlish]
#[keywords module parsing toml configuration]
#[description]
#[para] tomlish is an intermediate representation of toml data in a tree structure (tagged lists representing type information).
#[para] The design goals are for tomlish to be whitespace and comment preserving, i.e. byte-for-byte preservation during roundtrips from toml to tomlish and back to toml.
#[para] The tomlish representation can then be converted to a Tcl dict structure or to other formats such as json,
#[para] although these other formats are generally unlikely to retain whitespace or comments.
#[para] The other formats also won't preserve roundtripability, e.g. an escaped tab (\t) and a literal tab coming from a toml file will be indistinguishable.
#[para] A further goal is to allow at least a useful subset of in-place editing operations which also preserve whitespace and comments,
#[para] e.g. leaf key value editing, table reordering/sorting, key renaming at any level, and key insertions/deletions.
#[para] The API for editing (tomldoc object?) may require explicit setting of type when accessing an existing key,
#[para] e.g. setting a key that already exists and is a different type (especially if it is a nested structure such as a table or array)
#[para] will need a -type option (-force ?) to force overriding it with another type such as an int.
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section Overview]
#[para] overview of tomlish
#[subsection Concepts]
#[para] -

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
## Requirements
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++

#*** !doctools
#[subsection dependencies]
#[para] packages used by tomlish
#[list_begin itemized]

package require Tcl 8.6-
package require struct::stack
package require logger

#*** !doctools
#[item] [package {Tcl 8.6-}]
#[item] [package {struct::stack}]
#limit ourselves to clear, destroy, peek, pop, push, rotate, or size (e.g. v 1.3 does not implement 'get')

#*** !doctools
#[list_end]

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section API]
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# Base namespace
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
namespace eval tomlish {
    namespace export {[a-z]*}; # Convention: export all lowercase variable types
    #*** !doctools
    #[subsection {Namespace tomlish}]
    #[para] Core API functions for tomlish
    #[list_begin definitions]

    #default interp recursionlimit of 1000 is insufficient to pass 1000-deep nested structures as in certain toml tests.
    #e.g. https://github.com/iarna/toml-spec-tests/tree/latest/values
    #1000 seems deep for a 'configuration' format - but toml is sometimes used for other serialisation purposes.
#todo - review set existing_recursionlimit [interp recursionlimit {}] if {$existing_recursionlimit < 5000} { interp recursionlimit {} 5000 } #IDEAS: # since get_toml produces tomlish with whitespace/comments intact: # tomldoc object - allow (at least basic?) editing of toml whilst preserving comments & whitespace # - setKey (set leaf only to value) how to specify type? -type option? - whole array vs index into arrays and further nested objects? - option for raw toml additions? # - separate addKey?? # - deleteKey (delete leaf) # - deleteTable (delete table - if only has leaves? - option to delete with child tables?) # - set/add Table? - position in doc based on existing tables/subtables? #The tomlish intermediate representation allows things such as sorting the toml document by table name or other re-ordering of tables - # because the tables include subkeys, comments and newlines within their structure - those elements all come along with it nicely during reordering. #The same goes for the first newline following a keyval e.g x=1 \ny=2\n\n #The newline is part of the keyval structure so makes reordering easier #example from_toml "a=1\nb=2\n\n\n" # 0 = TOMLISH # 1 = KEY a = {INT 1} {NEWLINE lf} # 2 = NEWLINE lf # 3 = KEY b = {INT 2} {NEWLINE lf} # 4 = NEWLINE lf # 5 = NEWLINE lf #This reordering idea is complicated by the nature of tablearrays - especially as a table header references last tablearrayname, # and duplicate table headers are allowed in that context. #e.g #[[fruits]] # name="apple" # [fruits.metadata] # id=1 # #[unrelated1] # #[[fruits]] # name="pear" # #[unrelated2] # silly="ordering" # #[fruits.metadata] #id=2 #The TABLEARRAY record can't be completely selfcontained on the default parsing mechanism - because it is legal (though not recommended) to have unrelated tables in between. #If we were to 'insert' later related records (such as the 2nd [fruits.metadata] above) into the TABLEARRAY structure - then, even though it might produce 'nicer' toml, # we would lose roundtripability toml->tomlish->toml # ----------------------------------------------------- #REVIEW #todo - some sort of 'normalize'/'grouping' function on tomlish that at least makes records self-contained, and perhaps then (optionally) reorders resulting records sensibly. #such a function on the tomlish may work - although it would be unwise to duplicate the validation aspects of dict::from_tomlish #The most practical way might be to use dict::from_tomlish followed by from_dict - but that would lose comment info and formatting. #In the above example - The decision by the toml author to put [unrelated1] between related tablearrays should be respected, #but the positioning of [unrelated2] between a tablearray and one of its contained tables is suspect. #Both [fruits.metadata] table records should theoretically be added as children to their corresponding [[fruits]] tablearray record in the tomlish. 
(just as their name keys are) # ----------------------------------------------------- #ARRAY is analogous to a Tcl list #TABLE is analogous to a Tcl dict #WS = inline whitespace #KEY = bare key and value #DQKEY = double quoted key and value #SQKEY = single quoted key and value #ITABLE = inline table (*can* be anonymous table) # inline table values immediately create a table with the opening brace # inline tables are fully defined between their braces, as are dotted-key subtables defined within # No additional subtables or arrays of tables may be defined within an inline table after the ending brace - they must be entirely self-contained set tags [list TOMLISH BOM ARRAY TABLE ITABLE TABLEARRAY WS NEWLINE COMMENT DOTTEDKEY KEY DQKEY SQKEY STRING STRINGPART MULTISTRING LITERAL LITERALPART MULTILITERAL INT FLOAT BOOL] #DDDD lappend tags {*}[list\ DATETIME\ DATETIME-LOCAL\ DATE-LOCAL\ TIME-LOCAL\ ] #removed - ANONTABLE #tomlish v1.0 should accept arbitrary 64-bit signed ints (from -2^63 to 2^63-1) #we will restrict to this range for compatibility for now - although Tcl can handle larger (arbitrarily so?) #todo - configurable - allow empty string for 'unlimited' set min_int -9223372036854775808 ;#-2^63 set max_int +9223372036854775807 ;#2^63-1 proc Dolog {lvl txt} { #return "$lvl -- $txt" set msg "[clock format [clock seconds] -format "%Y-%m-%dT%H:%M:%S"] tomlish '$txt'" puts stderr $msg } logger::initNamespace ::tomlish foreach lvl [logger::levels] { interp alias {} tomlish_log_$lvl {} ::tomlish::Dolog $lvl log::logproc $lvl tomlish_log_$lvl } proc tags {} { return $::tomlish::tags } proc get_dottedkey_info {dottedkeyrecord} { set key_hierarchy [list] set key_hierarchy_raw [list] if {[lindex $dottedkeyrecord 0] ne "DOTTEDKEY"} { error "tomlish::get_dottedkey_info error. Supplied list doesn't appear to be a DOTTEDKEY (tag: [lindex $dottedkeyrecord 0])" } set compoundkeylist [lindex $dottedkeyrecord 1] set expect_sep 0 foreach part $compoundkeylist { set parttag [lindex $part 0] if {$parttag eq "WS"} { continue } if {$expect_sep} { if {$parttag ne "DOTSEP"} { error "DOTTEDKEY missing dot separator between parts. '$dottedkeyrecord'" } set expect_sep 0 } else { set val [lindex $part 1] switch -exact -- $parttag { KEY { lappend key_hierarchy $val lappend key_hierarchy_raw $val } DQKEY { #REVIEW unescape or not? #JJJJ lappend key_hierarchy [::tomlish::utils::unescape_string $val] lappend key_hierarchy_raw \"$val\" } SQKEY { lappend key_hierarchy $val lappend key_hierarchy_raw "'$val'" } default { error "tomlish::get_dottedkey_info DOTTED key unexpected part '$parttag' - ensure dot separator is between key parts. 
'$compoundkeylist'" } } set expect_sep 1 } } return [dict create keys $key_hierarchy keys_raw $key_hierarchy_raw] } #helper function for tomlish::dict::from_tomlish proc _get_keyval_value {keyval_element} { #e.g #DOTTEDKEY {{KEY a} {WS { }}} = {WS { }} {ARRAY {INT 1} SEP {ITABLE {DOTTEDKEY {{KEY x}} = {INT 1} SEP} {DOTTEDKEY {{KEY y}} = {INT 2}}}} log::notice ">>> _get_keyval_value from '$keyval_element'<<<" #find the value (or 2 values if space separated datetime - and stitch back into one) # 3 is the earliest index at which the value could occur (depending on whitespace) if {[lindex $keyval_element 2] ne "="} { error "tomlish _get_keyval_value keyval_element doesn't seem to be a properly structured { = } list\n $keyval_element" } #review if {[uplevel 1 [list info exists tablenames_info]]} { upvar tablenames_info tablenames_info } else { set tablenames_info [dict create] ;#keys are @@ paths {@@parenttable @@arrayable @@etc} corresponding to parenttable.arraytable[].etc #value is a dict with keys such as ttype, tdefined } set sublist [lrange $keyval_element 3 end] ;# rhs of = set values [list] set value_posns [list] set posn 0 foreach sub $sublist { #note that a barekey/dquotedkey won't occur directly inside a barekey/dquotedkey #DDDD switch -exact -- [lindex $sub 0] { STRING - LITERAL - MULTISTRING - MULTILITERAL - INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - TABLE - ARRAY - ITABLE { lappend values $sub lappend value_posns $posn } DOTTEDKEY { #we should never see DOTTEDKEY as a toplevel element on RHS #sanity check in case manually manipulated tomlish - or something went very wrong set msg "tomlish::_get_keyval_value Unexpected toplevel value element DOTTEDKEY after =" return -code error -errorcode {TOMLISH SYNTAX UNEXPECTEDDOTTEDKEYRHS} $msg } WS - NEWLINE - COMMENT {} SEP {} default { set msg "tomlish::_get_keyval_value Unexpected toplevel value element [lindex $sub 0] after =" return -code error -errorcode {TOMLISH SYNTAX UNEXPECTED} $msg } } incr posn } switch -- [llength $values] { 0 { error "tomlish Failed to find value element in KEY. '$keyval_element'" } 1 { lassign [lindex $values 0] type value } 2 { #we generally expect a single 'value' item on RHS of = #(ignoring WS,NEWLINE,SEP #(either a simple type, or a container which has multiple values inside) #exception for space separated datetime which is two toplevel values #validate than exactly single space was between the two values lassign $value_posns p1 p2 if {$p2 != $p1 +2} { #sanity check #can probably only get here through manual manipulation of the tomlish list to an unprocessable form error "tomlish KEY appears to have more than one part - but not separated by whitespace - invalid '$keyval_element'" } set between_token [lindex $sublist $p1+1] if {[lindex $between_token 1] ne " "} { error "tomlish KEY in 2 parts is not separated by a single space - cannot consider for datetime '$keyval_element'" } lassign [lindex $values 0] type_d1 value_d1 lassign [lindex $values 1] type_d2 value_d2 #DDDD if {$type_d1 ne "DATE-LOCAL" || $type_d2 ni {DATETIME TIME-LOCAL}} { #we reuse DATETIME tag for standalone time with tz offset (or zZ) error "tomlish KEY in 2 parts does not appear to be datetime '$keyval_element'" } if {$type_d2 eq "TIME-LOCAL"} { set type DATETIME-LOCAL } else { #extra check that 2nd part is actually a time if {![tomlish::utils::is_timepart $value_d2]} { error "tomlish KEY in 2 parts does not appear to be datetime. 
(part 2 not a time value) '$keyval_element'" } set type DATETIME } set value "${value_d1}T${value_d2}" } default { error "tomlish Found multiple value elements in KEY, expected one. (or 2 for space-separated datetime) '$keyval_element'" } } set sub_tablenames_info [dict create] switch -exact -- $type { INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { #DDDD #simple (non-container, no-substitution) datatype set result [list type $type value $value] } STRING - STRINGPART { #JJJ #!!! review #set result [list type $type value [::tomlish::utils::unescape_string $value]] set result [list type $type value $value] } LITERAL - LITERALPART { #REVIEW set result [list type $type value $value] } TABLE { #invalid? error "tomlish _get_keyval_value invalid to have type TABLE on rhs of =" } ITABLE { #This one should not be returned as a type value structure! # set prev_tablenames_info $tablenames_info set tablenames_info [dict create] set result [::tomlish::dict::from_tomlish [ list [lindex $values 0] ]] set sub_tablenames_info $tablenames_info set tablenames_info $prev_tablenames_info } ARRAY { #we need to recurse to get the corresponding dict for the contained item(s) #pass in the whole [lindex $values 0] (type val) - not just the $value! set prev_tablenames_info $tablenames_info set tablenames_info [dict create] set result [list type $type value [ ::tomlish::dict::from_tomlish [ list [lindex $values 0] ] ]] set sub_tablenames_info $tablenames_info set tablenames_info $prev_tablenames_info } MULTISTRING - MULTILITERAL { #review - mapping these to STRING might make some conversions harder? #if we keep the MULTI - we know we have to look for newlines for example when converting to json #without specific types we'd have to check every STRING - and lose info about how best to map chars within it set result [list type $type value [ ::tomlish::dict::from_tomlish [ list [lindex $values 0] ] ]] } default { error "tomlish Unexpected value type '$type' found in keyval '$keyval_element'" } } return [dict create result $result tablenames_info $sub_tablenames_info] } proc to_dict {tomlish} { tomlish::dict::from_tomlish $tomlish } proc _from_dictval_tomltype {parents tablestack keys typeval} { set type [dict get $typeval type] set val [dict get $typeval value] #These are the restricted sets of typed used in the tomlish::dict representation #They are a subset of the types in tomlish: data types plus ARRAY, arranged in a dictionary form. #The container types: ITABLE, TABLE, TABLEARRAY are not used as they are represented as dictionary keys and ARRAY items. #The WS, COMMENT, and NEWLINE elements are also unrepresented in the dict structure. switch -- $type { ARRAY { set subitems [list] foreach item $val { lappend subitems [_from_dictval [list {*}$parents ARRAY] $tablestack $keys $item] SEP } if {[lindex $subitems end] eq "SEP"} { set subitems [lrange $subitems 0 end-1] } return [list ARRAY {*}$subitems] } ITABLE { error "not applicable" if {$val eq ""} { return ITABLE } else { return [_from_dictval [list {*}$parents ITABLE] $tablestack $keys $val] } } STRING { #JSJS #if our dict came from json - we have already decided what type of STRING/LITERAL etc to use when building the dict #do not validate like this - important that eg json val\\ue -> dict val\ue -> tomlish/toml val\\ue #see toml-tests #if {![tomlish::utils::rawstring_is_valid_tomlstring $val]} { # #todo? 
# return -code error -errorcode {TOML SYNTAX INVALIDSTRING} "Unescaped controls in string or invalid escapes" #} return [list STRING [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $val]] } MULTISTRING { #value is a raw string that isn't encoded as tomlish #create a valid toml snippet with the raw value and decode it to the proper tomlish MULTISTRING format #We need to convert controls in $val to escape sequences - except for newlines # #consider an *option* to reformat for long lines? (perhaps overcomplex - byte equiv - but may fold in ugly places) #we could use a line-length limit to decide when to put in a "line ending backslash" #and even format it with a reasonable indent so that proper CONT and WS entries are made (?) REVIEW # #TODO set tomlpart "x=\"\"\"\\\n" append tomlpart [tomlish::utils::rawstring_to_MultiBstring_with_escaped_controls $val] append tomlpart "\"\"\"" set tomlish [tomlish::from_toml $tomlpart] #e.g if val = " etc\nblah" #TOMLISH {DOTTEDKEY {{KEY x}} = {MULTISTRING CONT {NEWLINE LF} {WS { }} {STRINGPART etc} {NEWLINE lf} {STRINGPART blah} } } #lindex 1 3 is the MULTISTRING tomlish list return [lindex $tomlish 1 3] } MULTILITERAL { #MLL string can contain newlines - but still no control chars #todo - validate - e.g val can't contain more than 2 squotes in a row if {[string first ''' $val] >=0} { set msg "_from_dictval_tomltype error: more than 2 single quotes in a row found in MULTILITERAL - cannot encode dict to TOML-VALID TOMLISH" return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg } #rawstring_is_valid_multiliteral - allow newlines as lf or crlf - but not bare cr if {![tomlish::utils::rawstring_is_valid_multiliteral $val]} { return -code error -errorcode {TOML SYNTAX INVALIDMULTILITERAL} "Controls other than tab or newlines found in multiliteral" } set tomlpart "x='''\n" append tomlpart $val ''' set tomlish [tomlish::from_toml $tomlpart] return [lindex $tomlish 1 3] } LITERAL { #from v1.0 spec - "Control characters other than tab are not permitted in a literal string" #(This rules out raw ANSI SGR - which is somewhat restrictive - but perhaps justified for a config format # as copy-pasting ansi to a config value is probably not always wise, and it's not something that can be # easily input via a text editor. ANSI can go in Basic strings using the \e escape if that's accepted v1.1?) #we could choose to change the type to another format here when encountering invalid chars - but that seems #like too much magic. We elect to error out and require the dict to have valid data for the types it specifies. if {[string first ' $val] >=0} { set msg "_from_dictval_tomltype error: single quote found in LITERAL - cannot encode dict to TOML-VALID TOMLISH" return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg } #JJJJ if {![tomlish::utils::rawstring_is_valid_literal $val]} { #has controls other than tab #todo - squote? 
return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} "Controls other than tab found in literal" } return [list LITERAL $val] } INT { if {![::tomlish::utils::is_int $val]} { error "_from_dictval_tomltype error: bad INT value '$val' - cannot encode dict to TOML-VALID TOMLISH" } return [list INT $val] } FLOAT { if {![::tomlish::utils::is_float $val]} { error "_from_dictval_tomltype error: bad FLOAT value '$val' - cannot encode dict to TOML-VALID TOMLISH" } return [list FLOAT $val] } default { if {$type ni [::tomlish::tags]} { error "_from_dictval_tomltype error: Unrecognised typename '$type' in {type value } - cannot encode dict to TOML-VALID TOMLISH" } return [list $type $val] } } } proc _from_dictval {parents tablestack keys vinfo} { set k [lindex $keys end] set K_PART [tomlish::dict::classify_rawkey $k] ;#get [list SQKEY ] #puts stderr "---parents:'$parents' keys:'$keys' vinfo: $vinfo---" #puts stderr "---tablestack: $tablestack---" set result [list] set lastparent [lindex $parents end] if {$lastparent in [list "" do_inline]} { if {[tomlish::dict::is_typeval $vinfo]} { set type [dict get $vinfo type] #treat ITABLE differently? set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] lappend result DOTTEDKEY [list $K_PART {WS { }}] = {WS { }} $sublist {NEWLINE lf} } else { if {$vinfo ne ""} { #set result [list DOTTEDKEY [list [list KEY $k]] = ] #set records [list ITABLE] set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] if {$lastparent eq "do_inline"} { set result [list DOTTEDKEY [list $K_PART] =] set records [list ITABLE] } else { set tname [tomlish::dict::join_and_quote_rawkey_list [list $k]] set result [list TABLE $tname {NEWLINE lf}] set tablestack [list {*}$tablestack [list T $k]] set records [list] } set lastidx [expr {[dict size $vinfo] -1}] set dictidx 0 dict for {vk vv} $vinfo { set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] #(SQKEY & DQKEY do not have the enclosing quotes in their returned val) #if {[regexp {\s} $vk] || [string first . $vk] >= 0} { # set VK_PART [list SQKEY $vk] #} else { # set VK_PART [list KEY $vk] #} if {[tomlish::dict::is_typeval $vv]} { #type x value y #REVIEW - we could detect if value is an array of objects, #and depending on parent context - emit a series of TABLEARRAY records instead of a DOTTEDKEY record containing an ARRAY of objects set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] set record [list DOTTEDKEY [list $VK_PART {WS { }}] = {WS { }} $sublist] } else { if {$vv eq ""} { #experimental if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { ::tomlish::log::notice "_from_dictval could uninline KEY $vk (tablestack:$tablestack)" #set tname [tomlish::dict::name_from_tablestack [list {*}$tablestack [list T $vk]]] #we can't just join normalized keys - need keys with appropriate quotes and escapes #set tname [join [list {*}$keys $vk] .] ;#WRONG set tq [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] ##wrong? results in TABLE within TABLE record?? todo pop? #set record [list TABLE $tq {NEWLINE lf}] #set tablestack [list {*}$tablestack [list T $vk]] #REVIEW!!! set record [list DOTTEDKEY [list $VK_PART] = ITABLE] set tablestack [list {*}$tablestack [list I $vk]] } else { set record [list DOTTEDKEY [list $VK_PART] = ITABLE] set tablestack [list {*}$tablestack [list I $vk]] } } else { if { 0 } { #experiment.. sort of getting there. 
if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { ::tomlish::log::notice "_from_dictval could uninline2 KEYS [list {*}$keys $vk] (tablestack:$tablestack)" set tq [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] set record [list TABLE $tq {NEWLINE lf}] set tablestack [list {*}$tablestack [list T $vk]] #review - todo? set dottedkey_value [_from_dictval [list {*}$parents TABLE] $tablestack [list {*}$keys $vk] $vv] lappend record {*}$dottedkey_value } else { set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] } } else { set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] } } } if {$dictidx != $lastidx} { #lappend record SEP if {$lastparent eq "do_inline"} { lappend record SEP } else { lappend record {NEWLINE lf} } } if {[llength $record]} { lappend records $record } incr dictidx } if {$lastparent eq "do_inline"} { lappend result $records {NEWLINE lf} } else { lappend result {*}$records {NEWLINE lf} } } else { if {$lastparent eq "do_inline"} { lappend result DOTTEDKEY [list $K_PART] = ITABLE {NEWLINE lf} } else { set tname [tomlish::dict::join_and_quote_rawkey_list [list $k]] #REVIEW lappend result TABLE $tname {NEWLINE lf} } } } } else { #lastparent is not toplevel "" or "do_inline" if {[tomlish::dict::is_typeval $vinfo]} { #type x value y set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] lappend result {*}$sublist } else { if {$lastparent eq "TABLE"} { #review dict for {vk vv} $vinfo { set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] set dottedkey_value [_from_dictval [list {*}$parents DOTTEDKEY] $tablestack [list {*}$keys $vk] $vv] lappend result [list DOTTEDKEY [list $VK_PART] = $dottedkey_value {NEWLINE lf}] } } else { if {$vinfo ne ""} { set lastidx [expr {[dict size $vinfo] -1}] set dictidx 0 set sub [list] #REVIEW #set result $lastparent ;#e.g sets ITABLE set result ITABLE set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] dict for {vk vv} $vinfo { set VK_PART [tomlish::dict::classify_rawkey $vk] ;#get [list SQKEY ] if {[tomlish::dict::is_typeval $vv]} { #type x value y set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] set record [list DOTTEDKEY [list $VK_PART] = $sublist] } else { if {$vv eq ""} { #can't just uninline at this level #we need a better method to query main dict for uninlinability at each level # (including what's been inlined already) #if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { # puts stderr "_from_dictval uninline2 KEY $keys" # set tname [tomlish::dict::join_and_quote_rawkey_list [list {*}$keys $vk]] # set record [list TABLE $tname {NEWLINE lf}] # set tablestack [list {*}$tablestack [list T $vk]] #} else { set record [list DOTTEDKEY [list $VK_PART] = ITABLE] #} } else { #set sub [_from_dictval ITABLE $vk $vv] set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] #set record [list DOTTEDKEY [list $VK_PART] = ITABLE $dottedkey_value] set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] } } if {$dictidx != $lastidx} { lappend record SEP } lappend result $record incr dictidx } } else { #e.g x=[{}] log::debug "---> _from_dictval empty ITABLE x-1" #lappend result DOTTEDKEY [list $K_PART] = ITABLE ;#wrong lappend result ITABLE } } } } return $result } proc from_dict {d} { #consider: 
        # t1={a=1,b=2}
        # x = 1
        # from_dict gives us: t1 {a {type INT value 1} b {type INT value 2}} x {type INT value 1}
        #If we represent t1 as an expanded table we get
        # [t1]
        # a=1
        # b=2
        # x=1
        # --- which is incorrect - as x was a toplevel key like t1!
        #This issue doesn't occur if x is itself an inline table
        # t1={a=1,b=2}
        # x= {no="problem"}
        #
        # (or if we were to reorder x to come before t1)
        #i.e. the order of the dict elements influences how the toml can be represented.
        #As the dictionary form doesn't distinguish the structure used to create tables {[table1]\nk=v} vs inline {table1={k=v}},
        #from_dict would, without a solution, have to always produce the inline form for toplevel tables unless we allowed re-ordering,
        #which is not preferred here.
        #A possible solution:
        #scan the top level to see if all (trailing) elements are themselves dicts
        # (i.e. not of form {type XXX value yyy})
        #
        # A further point is that if all root level values are at the 'top' - we can treat lower table-like structures as {[table]} elements
        #i.e. we don't need to force do_inline if all the 'simple' keys are before any compound keys

        #set root_has_values 0
        #approach 1) - the naive approach - forces inline when not always necessary
        #dict for {k v} $d {
        #    if {[llength $v] == 4 && [lindex $v 0] eq "type"} {
        #        set root_has_values 1
        #        break
        #    }
        #}
        #approach 2) - track the position of last {type x value y} in the dictionary built by dict::from_tomlish
        # - still not perfect. Inlines dotted tables unnecessarily.
        #This means from_dict doesn't produce output optimal for human editing.
        set last_simple [tomlish::dict::last_tomltype_posn $d]

        ## set parent "do_inline" ;#a value used in _from_dictval to distinguish from "" or other context based parent values
        #Any keys that are themselves tables - will need to be represented inline
        #to avoid reordering, or incorrect assignment of plain values to the wrong table.

        ## set parent ""
        #all toplevel keys in the dict structure can represent subtables.
        #we are free to use {[tablename]\n} syntax for toplevel elements.
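        # Illustrative sketch of the ordering issue above (comments only - nothing here is executed).
        # It assumes the round trip via tomlish::from_dict and tomlish::to_toml defined in this file,
        # and uses the same dict-value convention {type <TAG> value <v>} shown in the comments above.
        #   set d [dict create x {type INT value 1} t1 {a {type INT value 1} b {type INT value 2}}]
        #   tomlish::to_toml [tomlish::from_dict $d]
        # Because the simple key x precedes the table-like key t1, the output can use a table header,
        # along the lines of:
        #   x = 1
        #   [t1]
        #   a = 1
        #   b = 2
        # With the keys reordered (t1 before x) the toplevel table must instead be emitted inline,
        # roughly:
        #   t1 = {a = 1, b = 2}
        #   x = 1
        # so that x isn't misread as a member of [t1].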
set tomlish [list TOMLISH] set dictposn 0 set tablestack [list [list T root]] ;#todo dict for {t tinfo} $d { if {$last_simple > $dictposn} { set parents [list do_inline] } else { set parents [list ""] } set keys [list $t] #review - where to make decision on # DOTTEDKEY containing array of objs #vs # list of TABLEARRAY records #At least for the top set trecord [_from_dictval $parents $tablestack $keys $tinfo] lappend tomlish $trecord incr dictposn } return $tomlish } proc typedjson_to_toml {json} { #*** !doctools #[call [fun typedjson_to_toml] [arg json]] #[para] set tomlish [::tomlish::from_dict_from_typedjson $json] lappend tomlish [list NEWLINE lf] set toml [::tomlish::to_toml $tomlish] } set json1 {{ "a": {"type": "integer", "value": "42"}}} set json2 {{ "a": {"type": "integer", "value": "42"}, "b": {"type": "string", "value": "test"} }} set json3 { { "best-day-ever": {"type": "datetime", "value": "1987-07-05T17:45:00Z"}, "numtheory": { "boring": {"type": "bool", "value": "false"}, "perfection": [ {"type": "integer", "value": "6"}, {"type": "integer", "value": "28"}, {"type": "integer", "value": "496"} ] } } } set json4 { { "best-day-ever": {"type": "datetime", "value": "1987-07-05T17:45:00Z"}, "numtheory": { "boring": {"type": "bool", "value": "false"}, "perfection": [ {"type": "integer", "value": "6"}, {"type": "integer", "value": "28"}, {"type": "integer", "value": "496"} ] }, "emptyobj": {}, "emptyarray": [] } } set json5 { { "a": { " x ": {}, "b.c": {}, "d.e": {}, "b": { "c": {} } } } } #surrogate pair face emoji set json6 { { "surrogatepair": {"type": "string", "value": "\uD83D\uDE10"} } } set json7 { { "escapes": {"type": "string", "value": "val\\ue"} } } proc from_dict_from_typedjson {json} { set d [tomlish::dict::from_typedjson $json] tomlish::from_dict $d ;#return tomlish } proc toml_to_typedjson {toml} { set tomlish [::tomlish::from_toml $toml] set d [tomlish::dict::from_tomlish $tomlish] #full validation only occurs by re-encoding dict to tomlish set test [tomlish::from_dict $d] set h [tomlish::typedhuddle::from_dict $d] #huddle jsondump $h tomlish::huddle::jsondumpraw $h } #proc get_json {tomlish} { # package require fish::json # set d [::tomlish::dict::from_tomlish $tomlish] # #return [::tomlish::dict_to_json $d] # return [fish::json::from "struct" $d] #} #return a Tcl list of tomlish tokens #i.e get a standard list of all the toml terms in string $s #where each element of the list is a *tomlish* term.. i.e a specially 'tagged' Tcl list. #(simliar to a tcl 'Huddle' - but also supporting whitespace preservation) # ---------------------------------------------------------------------------------------------- # NOTE: the production of tomlish from toml source doesn't indicate the toml source was valid!!! # e.g we deliberately don't check certain things such as duplicate table declarations here. # ---------------------------------------------------------------------------------------------- #Part of the justification for this is that as long as the syntax is toml shaped - we can load files which violate certain rules and allow programmatic manipulation. # (e.g perhaps a toml editor to highlight violations for fixing) # A further stage is then necessary to load the tomlish tagged list into a data structure more suitable for efficient query/reading. # e.g dicts or an object oriented structure #Note also - *no* escapes in quoted strings are processed. 
This is up to the datastructure stage #e.g dict::from_tomlish will substitute \r \n \uHHHH \UHHHHHHH etc #This is important for tomlish to maintain the ability to perform competely lossless round-trips from toml to tomlish and back to toml. # (which is handy for testing as well as editing some part of the structure with absolutely no effect on other parts of the document) #If we were to unescape a tab character for example # - we have no way of knowing if it was originally specified as \t \u0009 or \U00000009 or directly as a tab character. # For this reason, we also do absolutely no line-ending transformations based on platform. # All line-endings are maintained as is, and even a file with mixed lf crlf line-endings will be correctly interpreted and can be 'roundtripped' proc from_toml {args} { namespace upvar ::tomlish::parse s s set s [join $args \n] namespace upvar ::tomlish::parse i i set i 0 ;#index into s namespace upvar ::tomlish::parse is_parsing is_parsing set is_parsing 1 if {[info command ::tomlish::parse::spacestack] eq "::tomlish::parse::spacestack"} { tomlish::parse::spacestack destroy } struct::stack ::tomlish::parse::spacestack namespace upvar ::tomlish::parse last_space_action last_space_action namespace upvar ::tomlish::parse last_space_type last_space_type namespace upvar ::tomlish::parse tok tok set tok "" namespace upvar ::tomlish::parse type type namespace upvar ::tomlish::parse tokenType tokenType ::tomlish::parse::set_tokenType "" namespace upvar ::tomlish::parse tokenType_list tokenType_list set tokenType [list] ;#Flat (un-nested) list of tokentypes found namespace upvar ::tomlish::parse lastChar lastChar set lastChar "" set result "" namespace upvar ::tomlish::parse nest nest set nest 0 namespace upvar ::tomlish::parse v v ;#array keyed on nest level set v(0) {TOMLISH} array set s0 [list] ;#whitespace data to go in {SPACE {}} element. set parentlevel 0 namespace upvar ::tomlish::parse state state namespace upvar ::tomlish::parse braceCount braceCount set barceCount 0 namespace upvar ::tomlish::parse bracketCount bracketCount set bracketCount 0 set sep 0 set r 1 namespace upvar ::tomlish::parse token_waiting token_waiting set token_waiting [dict create] ;#if ::tok finds a *complete* second token during a run, it will put the 2nd one here to be returned by the next call. set state "table-space" ::tomlish::parse::spacestack push {type space state table-space} namespace upvar ::tomlish::parse linenum linenum;#'line number' of input data. (incremented for each literal linefeed - but not escaped ones in data) set linenum 1 set ::tomlish::parse::state_list [list] try { while {$r} { set r [::tomlish::parse::tok] #puts stdout "got tok: '$tok' while parsing string '$s' " set next_tokenType_known 0 ;#whether we begin a new token here based on what terminated the token result of 'tok' #puts "got token: '$tok' tokenType='$tokenType'. while v($nest) = [set v($nest)]" #puts "-->tok: $tok tokenType='$tokenType'" set prevstate $state set transition_info [::tomlish::parse::goNextState $tokenType $tok $state] #review goNextState could perform more than one space_action set space_action [dict get $transition_info space_action] set newstate [dict get $transition_info newstate] ;#use of 'newstate' vs 'state' makes code clearer below if {[tcl::string::match "err-*" $state]} { ::tomlish::log::warn "---- State error in state $prevstate for tokenType: $tokenType token value: $tok. $state aborting parse. 
[tomlish::parse::report_line]" lappend v(0) [list ERROR tokentype $tokenType state $prevstate to $state leveldata [set v($nest)]] return $v(0) } # --------------------------------------------------------- #NOTE there may already be a token_waiting at this point #set_token_waiting can raise an error here, # in which case the space_action branch needs to be rewritten to handle the existing token_waiting # --------------------------------------------------------- if {$space_action eq "pop"} { #pop_trigger_tokens: newline tablename endarray endinlinetable #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. set parentlevel [expr {$nest -1}] set do_append_to_parent 1 ;#most tokens will leave this alone - but some like tentative_accum_squote need to do their own append switch -exact -- $tokenType { tentative_accum_squote { #should only apply within a multiliteral #### set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed #Without this - we would get extraneous empty list entries in the parent # - as the xxx-squote-space isn't a space level from the toml perspective # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop #assert prevstate always trailing-squote-space #dev guardrail - remove? assertion lib? switch -exact -- $prevstate { trailing-squote-space { } default { error "--- unexpected popped due to tentative_accum_squote but came from state '$prevstate' should have been trailing-squote-space" } } switch -- $tok { ' { tomlish::parse::set_token_waiting type single_squote value $tok complete 1 startindex [expr {$i -1}] } '' { #review - we should perhaps return double_squote instead? #tomlish::parse::set_token_waiting type literal value "" complete 1 tomlish::parse::set_token_waiting type double_squote value "" complete 1 startindex [expr {$i - 2}] } ''' { #### #if already an eof in token_waiting - set_token_waiting will insert before it tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] } '''' { tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] #todo integrate left squote with nest data at this level set lastpart [lindex $v($parentlevel) end] switch -- [lindex $lastpart 0] { LITERALPART { set newval "[lindex $lastpart 1]'" set parentdata $v($parentlevel) lset parentdata end [list LITERALPART $newval] set v($parentlevel) $parentdata } NEWLINE { lappend v($parentlevel) [list LITERALPART "'"] } MULTILITERAL { #empty lappend v($parentlevel) [list LITERALPART "'"] } default { error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" } } } ''''' { tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] #todo integrate left 2 squotes with nest data at this level set lastpart [lindex $v($parentlevel) end] switch -- [lindex $lastpart 0] { LITERALPART { set newval "[lindex $lastpart 1]''" set parentdata $v($parentlevel) lset parentdata end [list LITERALPART $newval] set v($parentlevel) $parentdata } NEWLINE { lappend v($parentlevel) [list LITERALPART "''"] } MULTILITERAL { lappend v($parentlevel) [list LITERALPART "''"] } default { error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" } } } } } triple_squote { #presumably popping multiliteral-space ::tomlish::log::debug "---- triple_squote for last_space_action pop leveldata: $v($nest)" set 
merged [list] set lasttype "" foreach part $v($nest) { switch -exact -- [lindex $part 0] { MULTILITERAL { lappend merged $part } LITERALPART { if {$lasttype eq "LITERALPART"} { set prevpart [lindex $merged end] lset prevpart 1 [lindex $prevpart 1][lindex $part 1] lset merged end $prevpart } else { lappend merged $part } } NEWLINE { #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. lappend merged $part } default { error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" } } set lasttype [lindex $part 0] } set v($nest) $merged } tentative_accum_dquote { #should only apply within a multistring #### set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed #Without this - we would get extraneous empty list entries in the parent # - as the trailing-dquote-space isn't a space level from the toml perspective # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop #assert prevstate always trailing-dquote-space #dev guardrail - remove? assertion lib? switch -exact -- $prevstate { trailing-dquote-space { } default { error "--- unexpected popped due to tentative_accum_dquote but came from state '$prevstate' should have been trailing-dquote-space" } } switch -- $tok { {"} { tomlish::parse::set_token_waiting type single_dquote value $tok complete 1 startindex [expr {$i -1}] } {""} { #review - we should perhaps return double_dquote instead? #tomlish::parse::set_token_waiting type literal value "" complete 1 tomlish::parse::set_token_waiting type double_dquote value "" complete 1 startindex [expr {$i - 2}] } {"""} { #### #if already an eof in token_waiting - set_token_waiting will insert before it tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 3}] } {""""} { tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 4}] #todo integrate left dquote with nest data at this level set lastpart [lindex $v($parentlevel) end] switch -- [lindex $lastpart 0] { STRINGPART { set newval "[lindex $lastpart 1]\"" set parentdata $v($parentlevel) lset parentdata end [list STRINGPART $newval] set v($parentlevel) $parentdata } NEWLINE - CONT - WS { lappend v($parentlevel) [list STRINGPART {"}] } MULTISTRING { #empty lappend v($parentlevel) [list STRINGPART {"}] } default { error "--- don't know how to integrate extra trailing dquote with data $v($parentlevel)" } } } {"""""} { tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i-5}] #todo integrate left 2 dquotes with nest data at this level set lastpart [lindex $v($parentlevel) end] switch -- [lindex $lastpart 0] { STRINGPART { set newval "[lindex $lastpart 1]\"\"" set parentdata $v($parentlevel) lset parentdata end [list STRINGPART $newval] set v($parentlevel) $parentdata } NEWLINE - CONT - WS { lappend v($parentlevel) [list STRINGPART {""}] } MULTISTRING { lappend v($parentlevel) [list STRINGPART {""}] } default { error "--- don't know how to integrate extra trailing 2 dquotes with data $v($parentlevel)" } } } } } triple_dquote { #presumably popping multistring-space ::tomlish::log::debug "---- triple_dquote for last_space_action pop leveldata: $v($nest)" set merged [list] set lasttype "" foreach part $v($nest) { switch -exact -- [lindex $part 0] { 
MULTISTRING { lappend merged $part } STRINGPART { if {$lasttype eq "STRINGPART"} { set prevpart [lindex $merged end] lset prevpart 1 [lindex $prevpart 1][lindex $part 1] lset merged end $prevpart } else { lappend merged $part } } CONT - WS { lappend merged $part } NEWLINE { #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. lappend merged $part } default { error "---- triple_dquote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" } } set lasttype [lindex $part 0] } set v($nest) $merged } equal { #pop caused by = switch -exact -- $prevstate { dottedkey-space { tomlish::log::debug "---- equal ending dottedkey-space for last_space_action pop" #re-emit for parent space tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] } dottedkey-space-tail { #experiment? tomlish::log::debug "---- equal ending dottedkey-space-tail for last_space_action pop" #re-emit for parent space tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] } } } newline { incr linenum lappend v($nest) [list NEWLINE $tok] } tablename { #note: a tablename only 'pops' if we are greater than zero error "---- tablename pop should already have been handled as special case zeropoppushspace in goNextState" } tablearrayname { #!review - tablearrayname different to tablename regarding push/pop? #note: a tablename only 'pops' if we are greater than zero error "---- tablearrayname pop should already have been handled as special case zeropoppushspace in goNextState" } endarray { #nothing to do here. } comma { #comma for inline table will pop the keyvalue space lappend v($nest) "SEP" } endinlinetable { ::tomlish::log::debug "---- endinlinetable for last_space_action pop" } default { error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" } } if {$do_append_to_parent} { #e.g tentative_accum_squote does it's own appends as necessary - so won't get here lappend v($parentlevel) [set v($nest)] } incr nest -1 } elseif {$last_space_action eq "push"} { set prevnest $nest incr nest 1 set v($nest) [list] # push_trigger_tokens: barekey dquotedkey startinlinetable startarray tablename tablearrayname switch -exact -- $tokenType { tentative_trigger_squote - tentative_trigger_dquote { #### this startok will always be tentative_accum_squote/tentative_accum_dquote starting with one accumulated squote/dquote if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { lassign [dict get $transition_info starttok] starttok_type starttok_val set next_tokenType_known 1 ::tomlish::parse::set_tokenType $starttok_type set tok $starttok_val } } single_squote { #JMN - REVIEW set next_tokenType_known 1 ::tomlish::parse::set_tokenType "squotedkey" set tok "" } triple_squote { ::tomlish::log::debug "---- push trigger tokenType triple_squote" set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERALPART } squotedkey { switch -exact -- $prevstate { table-space - itable-space { set v($nest) [list DOTTEDKEY] } } #todo - check not something already waiting? 
tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } triple_dquote { set v($nest) [list MULTISTRING] ;#container for NEWLINE,STRINGPART,CONT } dquotedkey { switch -exact -- $prevstate { table-space - itable-space { set v($nest) [list DOTTEDKEY] } } #todo - check not something already waiting? tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } barekey { switch -exact -- $prevstate { table-space - itable-space { set v($nest) [list DOTTEDKEY] } } #todo - check not something already waiting? set waiting [tomlish::parse::get_token_waiting] if {[llength $waiting]} { set i [dict get $waiting startindex] tomlish::parse::clear_token_waiting tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } else { tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space } } tablename { #note: we do not use the output of tablename_trim to produce a tablename for storage in the tomlish list! #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish # back to toml file will be identical. #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names from # a structural perspective. #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the # tomlish list? #set trimtable [tablename_trim $tok] #::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$trimtable'" set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name #note also that equivalent tablenames may have different toml representations even after being trimmed! #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. } tablearrayname { #set trimtable [tablename_trim $tok] #::tomlish::log::debug "---- trimmed (but not normalized) tablearrayname: '$trimtable'" set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name } startarray { set v($nest) [list ARRAY] ;#$tok is just the opening bracket - don't output. } startinlinetable { set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. 
} default { error "---- push trigger tokenType '$tokenType' not yet implemented" } } } else { #no space level change switch -exact -- $tokenType { squotedkey { #puts "---- squotedkey in state $prevstate (no space level change)" lappend v($nest) [list SQKEY $tok] } dquotedkey { #puts "---- dquotedkey in state $prevstate (no space level change)" lappend v($nest) [list DQKEY $tok] } barekey { lappend v($nest) [list KEY $tok] } dotsep { lappend v($nest) [list DOTSEP] } starttablename { #$tok is triggered by the opening bracket and sends nothing to output } starttablearrayname { #$tok is triggered by the double opening brackets and sends nothing to output } tablename - tablenamearray { error "---- did not expect 'tablename/tablearrayname' without space level change (no space level change)" #set v($nest) [list TABLE $tok] } endtablename - endtablearrayname { #no output into the tomlish list for this token } startinlinetable { puts stderr "---- decode::toml error. did not expect startinlinetable without space level change (no space level change)" } single_dquote { switch -exact -- $newstate { string-state { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "string" set tok "" } dquoted-key { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "dquotedkey" set tok "" } multistring-space { lappend v($nest) [list STRINGPART {"}] #may need to be joined on pop if there are neighbouring STRINGPARTS } default { error "---- single_dquote switch case not implemented for nextstate: $newstate (no space level change)" } } } double_dquote { #leading extra quotes - test: toml_multistring_startquote2 switch -exact -- $prevstate { itable-keyval-value-expected - keyval-value-expected { puts stderr "tomlish::decode::toml double_dquote TEST" #empty string lappend v($nest) [list STRINGPART ""] } multistring-space { #multistring-space to multistring-space lappend v($nest) [list STRINGPART {""}] } default { error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" } } } single_squote { switch -exact -- $newstate { literal-state { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "literal" set tok "" } squoted-key { set next_tokenType_known 1 ::tomlish::parse::set_tokenType "squotedkey" set tok "" } multiliteral-space { #false alarm squote returned from tentative_accum_squote pop ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" #(single squote - not terminating space) lappend v($nest) [list LITERALPART '] #may need to be joined on pop if there are neighbouring LITERALPARTs } default { error "---- single_squote switch case not implemented for nextstate: $newstate (no space level change)" } } } double_squote { switch -exact -- $prevstate { keyval-value-expected { lappend v($nest) [list LITERAL ""] } multiliteral-space { #multiliteral-space to multiliteral-space lappend v($nest) [list LITERALPART ''] } default { error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" } } } enddquote { #nothing to do? set tok "" } endsquote { set tok "" } string { #JJJJ set tok [tomlish::from_Bstring $tok] lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes } literal { lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes } multistring { #review #JJJJ ? 
lappend v($nest) [list MULTISTRING $tok] } stringpart { #JJJJ set tok [tomlish::from_Bstring $tok] lappend v($nest) [list STRINGPART $tok] ;#will not get wrapped in dquotes directly } multiliteral { lappend v($nest) [LIST MULTILITERAL $tok] } literalpart { lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly } untyped_value { #would be better termed unclassified_value #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. unset -nocomplain tag if {$tok in {true false}} { set tag BOOL } else { if {[::tomlish::utils::is_int $tok]} { set tag INT } else { if {[string is integer -strict $tok]} { #didn't qualify as a toml int - but still an int #probably means is_int is limiting size and not accepting bigints (configurable?) #or it didn't qualify due to more than 1 leading zero #or other integer format issue such as repeated underscores error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. (looks close to being an int. Formatting or range issue?) [tomlish::parse::report_line] (no space level change)" } else { #DDDD if {[::tomlish::utils::is_float $tok]} { set tag FLOAT } elseif {[::tomlish::utils::is_localtime $tok]} { set tag TIME-LOCAL } elseif {[::tomlish::utils::is_timepart $tok]} { #Note we must allow lone timepart here (not just is_localtime which doesn't allow tz offsets) in case it followed a previous localdate set tag DATETIME ;#?? review standalone time with tz - no specific tag - only allowed as followup value from DATETIME-LOCAL } elseif {[::tomlish::utils::is_datepart $tok]} { set tag DATE-LOCAL } elseif {[::tomlish::utils::is_datetime $tok]} { #not just a date or just a time #could be either local or have tz offset #DDDD JJJ set norm [string map {" " T} $tok];#prob unneeded - we won't get here if there was a space - would arrive as 2 separate tokens review. lassign [split $norm T] dp tp if {[::tomlish::utils::is_localtime $tp]} { set tag DATETIME-LOCAL } else { set tag DATETIME } } elseif {[::tomlish::utils::is_datetime X$tok] || [::tomlish::utils::is_timepart X$tok]} { # obsolete #Note we must allow lone timepart here (not just is_localtime which doesn't allow tz offsets) in case it followed a previous localdate #e.g x= 2025-01-01 02:34Z #The dict::from_tomlish validation will catch an invalid standaline timepart, or combine with leading date if applicable. set tag DATETIME } else { error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. [tomlish::parse::report_line] (no space level change)" } } } } #assert either tag is set, or we errored out. lappend v($nest) [list $tag $tok] } comment { #puts stdout "----- comment token returned '$tok'------" #JJJJ set tok [tomlish::from_comment $tok] lappend v($nest) [list COMMENT "$tok"] } equal { #we append '=' to the nest so that any surrounding whitespace is retained. lappend v($nest) = } comma { lappend v($nest) SEP } newline { incr linenum lappend v($nest) [list NEWLINE $tok] } whitespace { lappend v($nest) [list WS $tok] } continuation { lappend v($nest) CONT } bom { lappend v($nest) BOM } eof { #ok - nothing more to add to the tomlish list. #!todo - check previous tokens are complete/valid? 
} default { error "--- unknown tokenType '$tokenType' during state $prevstate [::tomlish::parse::report_line] (no space level change)" } } } if {!$next_tokenType_known} { ::tomlish::log::notice "---- tomlish::decode::toml - current tokenType:$tokenType Next token type not known" ::tomlish::parse::set_tokenType "" set tok "" } if {$state eq "end-state"} { break } } #while {$nest > 0} { # lappend v([expr {$nest -1}]) [set v($nest)] # incr nest -1 #} while {[::tomlish::parse::spacestack size] > 1} { ::tomlish::parse::spacestack pop lappend v([expr {$nest -1}]) [set v($nest)] incr nest -1 #set parent [spacestack peek] ;#the level being appended to #lassign $parent type state #if {$type eq "space"} { # #} elseif {$type eq "buffer"} { # lappend v([expr {$nest -1}]) {*}[set v($nest)] #} else { # error "invalid spacestack item: $parent" #} } } finally { set is_parsing 0 } return $v(0) } #toml dquoted string to tomlish STRING # - only allow specified escape sequences # - allow any unicode except those that must be escaped: dquote, bsl, and control chars(except tab) proc from_Bstring {bstr} { #JJJJ if {[catch { tomlish::utils::unescape_string $bstr } errM]} { return -code error -errorcode {TOML SYNTAX INVALIDESCAPE} "tomlish::from_Bstring toml Bstring contains invalid escape sequence\n$errM" ;#review } #assert: all escapes are now valid if {[regexp {[\u0000-\u0008\u000A-\u001F\u007f]} $bstr]} { set msg "tomlish::from_Bstring toml Bstring contains controls that must be escaped" return -code error -errorcode {TOML SYNTAX BSTRINGUNESCAPEDCONTROLS} $msg ;#review } return $bstr } #validate toml comment # - disallow controls that must be escaped #from spec: # "Control characters other than tab (U+0000 to U+0008, U+000A to U+001F, U+007F) are not permitted in comments." proc from_comment {comment} { if {[regexp {[\u0000-\u0008\u000A-\u001F\u007f]} $comment]} { set msg "tomlish::from_comment toml comment contains controls that must be escaped" return -code error -errorcode {TOML SYNTAX COMMENTUNESCAPEDCONTROLS} $msg ;#review } return $comment } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish ---}] } # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ namespace eval tomlish::build { #STRING,INT,FLOAT,BOOL, DATETIME - simple wrappers for completeness # take a value of the appropriate type and wrap as a tomlish tagged item proc STRING {s} { return [list STRING [::tomlish::utils::rawstring_to_Bstring_with_escaped_controls $s]] } proc LITERAL {litstring} { error todo } proc INT {i} { #whole numbers, may be prefixed with a + or - #Leading zeros are not allowed #Hex,octal binary forms are allowed (toml 1.0) #We will error out on encountering commas, as commas are interpreted differently depending on locale (and don't seem to be supported in the toml spec anyway) #!todo - Tcl can handle bignums - bigger than a 64bit signed long as specified in toml. # - We should probably raise an error for number larger than this and suggest the user supply it as a string? if {[tcl::string::last , $i] > -1} { error "Unable to interpret '$i' as an integer. Use underscores if you need a thousands separator [::tomlish::parse::report_line]" } if {![::tomlish::utils::int_validchars $i]} { error "Unable to interpret '$i' as an integer. Only 0-9 + 1 _ characters are acceptable. [::tomlish::parse::report_line]" } if {[::tomlish::utils::is_int $i]} { return [list INT $i] } else { error "'$i' is not a valid integer as per the Toml spec. 
[::tomlish::parse::report_line]" } } proc FLOAT {f} { #convert any non-lower case variants of special values to lowercase for Toml if {[::tcl::string::tolower $f] in {nan +nan -nan inf +inf -inf}} { return [list FLOAT [tcl::string::tolower $f]] } if {[::tomlish::utils::is_float $f]} { return [list FLOAT $f] } else { error "Unable to interpret '$f' as Toml float. Check your input, or check that tomlish is able to handle all Toml floats properly [::tomlish::parse::report_line]" } } proc DATETIME {str} { if {[::tomlish::utils::is_datetime $str]} { return [list DATETIME $str] } else { error "Unable to interpret '$str' as Toml datetime. Check your input, or check that tomlish is able to handle all Toml datetimes properly [::tomlish::parse::report_line]" } } proc DATETIME-LOCAL {str} { error "build::DATETIME-LOCAL todo" } proc BOOLEAN {b} { #convert any Tcl-acceptable boolean to boolean as accepted by toml - lower case true/false if {![tcl::string::is boolean -strict $b]} { error "Unable to convert '$b' to Toml boolean true|false. [::tomlish::parse::report_line]" } else { if {$b && 1} { return [::list BOOL true] } else { return [::list BOOL false] } } } #REVIEW #Take tablename followed by # a) *tomlish* name-value pairs e.g table mydata [list KEY item11 = [list STRING "test"]] {KEY item2 = [list INT 1]} # (accept also key value {STRING }) # b) simple 2-element tcl lists being name & *simple* value pairs for which basic heuristics will be used to determine types proc _table {name args} { set pairs [list] foreach t $args { if {[llength $t] == 4} { if {[tcl::string::tolower [lindex $t 0]] ne "key" || [tcl::string::tolower [lindex $t 2]] ni "= value"} { error "Only items tagged as KEY = currently accepted as name-value pairs for table command" } lassign $t _k keystr _eq valuepart if {[llength $valuepart] != 2} { error "supplied value must be typed. e.g {INT 1} or {STRING test}" } lappend pairs [list KEY $keystr = $valuepart] } elseif {[llength $t] == 2} { #!todo - type heuristics lassign $t n v lappend pairs [list KEY $n = [list STRING $v]] } else { error "'KEY = { toml but # the first newline is not part of the data. # we elect instead to maintain a basic LITERALPART that must not contain newlines.. # and to compose MULTILITERAL of multiple NEWLINE LITERALPART parts, #with the datastructure representation dropping the first newline (if immediately following opening delim) when building the value. set literal "" foreach part [lrange $item 1 end] { append literal [::tomlish::encode::tomlish [list $part] $nextcontext] } append toml '''$literal''' } INT - BOOL - FLOAT - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { #DDDD append toml [lindex $item 1] } INCOMPLETE { error "cannot process tomlish term tagged as INCOMPLETE" } COMMENT { append toml "#[lindex $item 1]" } BOM { #Byte Order Mark may appear at beginning of a file. Needs to be preserved. append toml "\uFEFF" } default { error "Not a properly formed 'tomlish' taggedlist.\n '$list'\n Unknown tag '[lindex $item 0]'. See output of \[tomlish::tags\] command." 
} } } return $toml } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::encode ---}] } #fish toml from tomlish #(encode tomlish as toml) interp alias {} tomlish::to_toml {} tomlish::encode::tomlish # namespace eval tomlish::decode { #*** !doctools #[subsection {Namespace tomlish::decode}] #[para] #[list_begin definitions] #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::decode ---}] } #decode toml to tomlish #interp alias {} tomlish::from_toml {} tomlish::decode::toml namespace eval tomlish::utils { #*** !doctools #[subsection {Namespace tomlish::utils}] #[para] #[list_begin definitions] #basic generic quote matching for single and double quotes #note for example that {[o'malley]} will return sq - as the single quote is not closed or wrapped in double quotes proc tok_in_quotedpart {tok} { set sLen [tcl::string::length $tok] set quote_type "" set had_slash 0 for {set i 0} {$i < $sLen} {incr i} { set c [tcl::string::index $tok $i] if {$quote_type eq ""} { if {$had_slash} { #don't enter quote mode #leave slash_mode because even if current char is slash - it is escaped set had_slash 0 } else { set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] switch -- $ctype { dq { set quote_type dq } sq { set quote_type sq } bsl { set had_slash 1 } } } } else { if {$had_slash} { #don't leave quoted mode #leave slash_mode because even if current char is slash - it is escaped set had_slash 0 } else { set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] switch -- $ctype { dq { if {$quote_type eq "dq"} { set quote_type "" } } sq { if {$quote_type eq "sq"} { set quote_type "" } } bsl { set had_slash 1 } } } } } return $quote_type ;#dq | sq } proc hex_escape_info {slashx} { set exp {^\\x([0-9a-fA-F]{2}$)} if {[regexp $exp $slashx match hex]} { return [list ok [list char [subst -nocommand -novariable $slashx]]] } else { return [list err [list reason "Supplied string not of the form \\xHH where H in \[0-9a-fA-F\]"]] } } proc unicode_escape_info {slashu} { #!todo # validate that slashu is either a \uxxxx or \Uxxxxxxxx value of the correct length and # is a valid 'unicode scalar value' (any Unicode code point except high-surrogate and low-surrogate code points) # ie integers in the range 0 to D7FF16 and E00016 to 10FFFF16 inclusive #expr {(($x >= 0) && ($x <= 0xD7FF16)) || (($x >= 0xE00016) && ($x <= 0x10FFFF16))} if {[tcl::string::match {\\u*} $slashu]} { set exp {^\\u([0-9a-fA-F]{4}$)} if {[regexp $exp $slashu match hex]} { if {[scan $hex %4x dec] != 1} { #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? return [list err [list reason "Failed to convert '$hex' to decimal"]] } else { return [list ok [list char [subst -nocommand -novariable $slashu]]] } } else { return [list err [list reason "Supplied string not of the form \\uHHHH where H in \[0-9a-fA-F\]"]] } } elseif {[tcl::string::match {\\U*} $slashu]} { set exp {^\\U([0-9a-fA-F]{8}$)} if {[regexp $exp $slashu match hex]} { if {[scan $hex %8x dec] != 1} { #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? 
return [list err [list reason "Failed to convert '$hex' to decimal"]]
} else {
    if {(($dec >= 0) && ($dec <= 0xD7FF)) || (($dec >= 0xE000) && ($dec <= 0x10FFFF))} {
        return [list ok [list char [subst -nocommand -novariable $slashu]]]
    } else {
        return [list err [list reason "$slashu is not within the 'unicode scalar value' ranges 0 to 0xD7FF or 0xE000 to 0x10FFFF"]]
    }
}
} else {
    return [list err [list reason "Supplied string not of the form \\UHHHHHHHH where H in \[0-9a-fA-F\]"]]
}
} else {
    return [list err [list reason "Supplied string did not start with \\u or \\U" ]]
}
}

#Note that unicode characters don't *have* to be escaped.
#So if we provide a function named 'escape_string', the name implies the inverse of unescape_string which unescapes unicode \u \U values.
#- an inverse of unescape_string would encode all unicode chars unnecessarily.
#- as toml accepts a compact escape sequence for common chars such as tab,backspace,linefeed etc but also allows the full form \u0009 etc
#- escape_string and unescape_string would not be reliably roundtrippable inverses anyway.
#REVIEW - provide it anyway? When would it be desirable to use?

variable Bstring_control_map [dict create]
dict set Bstring_control_map \b {\b}
dict set Bstring_control_map \n {\n}
dict set Bstring_control_map \r {\r}
dict set Bstring_control_map \" {\"}
dict set Bstring_control_map \x1b {\e} ;#In spec it's included in the list of 'must be escaped', as well as the 'convenience' escapes - so we make it go both ways.
dict set Bstring_control_map \\ "\\\\"
#\e for \x1b seems like it might be included - v1.1?? hard to find current state of where toml is going :/
#for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped.
#8 = \b - already in list.
#build the remainder whilst checking for entries already hardcoded above - in case more are added to the hardcoded list
for {set cdec 0} {$cdec <= 7} {incr cdec} {
    set hhhh [format %.4X $cdec]
    set char [format %c $cdec]
    if {![dict exists $Bstring_control_map $char]} {
        dict set Bstring_control_map $char \\u$hhhh
    }
}
for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} {
    set hhhh [format %.4X $cdec]
    set char [format %c $cdec]
    if {![dict exists $Bstring_control_map $char]} {
        dict set Bstring_control_map $char \\u$hhhh
    }
}
# \u007F = 127
dict set Bstring_control_map [format %c 127] \\u007F
# ------------------------------------------------------------------
variable Literal_control_map [dict create]
#controls other than tab
for {set cdec 0} {$cdec <= 8} {incr cdec} {
    set hhhh [format %.4X $cdec]
    set char [format %c $cdec]
    if {![dict exists $Literal_control_map $char]} {
        dict set Literal_control_map $char \\u$hhhh
    }
}
for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} {
    set hhhh [format %.4X $cdec]
    set char [format %c $cdec]
    if {![dict exists $Literal_control_map $char]} {
        dict set Literal_control_map $char \\u$hhhh
    }
}
# \u007F = 127
dict set Literal_control_map [format %c 127] \\u007F
# ------------------------------------------------------------------
variable Multiliteral_control_map
set Multiliteral_control_map [dict remove $Literal_control_map \n]
variable String_control_map
set String_control_map [dict remove $Literal_control_map \\]
variable MultiBstring_totoml_map
#'minimally' escaped sequences of double quotes.
#e.g {""\"""\"} vs {\"\"\"\"\"}
#This produces easier to read toml - and in many cases may be more likely to match original format when roundtripped from dict datastructure
# REVIEW - should this be configurable?
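#(illustrative note, added for clarity - based on the maps built here)
#with Bstring_control_map: a raw linefeed maps to the two chars \n, ESC (0x1B) maps to \e, backslash to \\,
#and other controls such as 0x01 map to \u0001 - while tab is deliberately left unmapped.
#the MultiBstring_totoml_map variant below additionally leaves raw CR, LF and lone double quotes untouched,
#and only escapes the third quote of a run of three (as ""\") so multiline strings stay readable.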
set MultiBstring_totoml_map [dict remove $Bstring_control_map {"} \r \n] dict set MultiBstring_totoml_map {"""} {""\"} ;#" editor hack: commented quote for dumb syntax highlighers #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) #for example - can be used by from_dict to produce valid Bstring data for a tomlish record proc rawstring_to_Bstring_with_escaped_controls {str} { #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. #we'll use a string map with an explicit list rather than algorithmic at runtime # - the string map is probably more performant than splitting a string, especially if it's large upvar ::tomlish::utils::Bstring_control_map map return [string map $map $str] } proc rawstring_to_MultiBstring_with_escaped_controls {str} { #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. #we'll use a string map with an explicit list rather than algorithmic at runtime # - the string map is probably more performant than splitting a string, especially if it's large upvar ::tomlish::utils::MultiBstring_totoml_map map return [string map $map $str] } proc rawstring_is_valid_tomlstring {str} { #controls are allowed in this direction dict -> toml (they get quoted) #check any existing escapes are valid if {[catch { unescape_string $str } errM]} { return 0 } return 1 } proc rawstring_is_valid_literal {str} { #detect control chars other than tab variable Literal_control_map set testval [string map $Literal_control_map $str] return [expr {$testval eq $str}] } proc rawstring_is_valid_multiliteral {str} { #detect control chars other than tab variable Multiliteral_control_map set teststr [string map [list \r\n ok] $str] set testval [string map $Multiliteral_control_map $teststr] return [expr {$testval eq $teststr}] } #review - unescape what string? Bstring vs MLBstring? #we should be specific in the function naming here #used by dict::from_tomlish - so part of validation? - REVIEW proc unescape_string {str} { #note we can't just use Tcl subst because: # it also transforms \a (audible bell) and \v (vertical tab) which are not in the toml spec. # it would strip out backslashes inappropriately: e.g "\j" becomes just j # it recognizes other escapes which aren't approprite e.g octal \nnn # it replaces \ with a single whitespace (trailing backslash) #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh #plus \e for \x1b? set buffer "" set buffer2 "" ;#buffer for 2 hex characters following a \x set buffer4 "" ;#buffer for 4 hex characters following a \u set buffer8 "" ;#buffer for 8 hex characters following a \u set sLen [tcl::string::length $str] #we need to handle arbitrarily long sequences of backslashes. \\\\\ etc set slash_active 0 set unicode2_active 0 set unicode4_active 0 set unicode8_active 0 ::tomlish::log::debug "unescape_string. got len [string length str] str $str" #!todo - check for invalid data in the form of a raw carriage return (decimal 13) without following linefeed? 
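#(illustrative sketch, added for clarity) e.g {tab:\t eacute:\u00E9 quote:\"} unescapes to a literal tab, an e-acute and a double quote,
#while an unrecognised escape such as \j raises an error with -errorcode {TOMLISH SYNTAX INVALIDESCAPE}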
set i 0 for {} {$i < $sLen} {} { if {$i > 0} { set lastChar [tcl::string::index $str [expr {$i - 1}]] } else { set lastChar "" } set c [tcl::string::index $str $i] #::tomlish::log::debug "unescape_string. got char $c" ;#too much? ##---------------------- ##as we are 'unescaping' - should we really be testing here for existing values that should have been escaped? ##The answer is probably no - keep this function to a single purpose - test elsewhere for raw controls. ##this test looks incomplete anyway REVIEW #scan $c %c n #if {($n <= 31) && ($n != 9) && ($n != 10) && ($n != 13)} { # #we don't expect unescaped unicode characters from 0000 to 001F - # #*except* for raw tab (which is whitespace) and newlines # error "unescape_string. Invalid data for a toml string. Unescaped control character (decimal $n) [::tomlish::utils::string_to_slashu $c]" #} ##---------------------- incr i ;#must incr here because we do'returns'inside the loop if {$c eq "\\"} { if {$slash_active} { append buffer "\\" set slash_active 0 } elseif {$unicode2_active} { error "unescape_string. unexpected case slash during unicode2 not yet handled" } elseif {$unicode4_active} { error "unescape_string. unexpected case slash during unicode4 not yet handled" } elseif {$unicode8_active} { error "unescape_string. unexpected case slash during unicode8 not yet handled" } else { # don't output anything (yet) set slash_active 1 } } else { if {$unicode2_active} { if {[tcl::string::length $buffer2] < 2} { append buffer2 $c } if {[tcl::string::length $buffer2] == 2} { #we have a \xHH to test set unicode2_active 0 set result [tomlish::utils::hex_escape_info "\\x$buffer2"] if {[lindex $result 0] eq "ok"} { append buffer [dict get $result ok char] } else { error "unescape_string error: [lindex $result 1]" } } } elseif {$unicode4_active} { if {[tcl::string::length $buffer4] < 4} { append buffer4 $c } if {[tcl::string::length $buffer4] == 4} { #we have a \uHHHH to test set unicode4_active 0 set result [tomlish::utils::unicode_escape_info "\\u$buffer4"] if {[lindex $result 0] eq "ok"} { append buffer [dict get $result ok char] } else { error "unescape_string error: [lindex $result 1]" } } } elseif {$unicode8_active} { if {[tcl::string::length $buffer8] < 8} { append buffer8 $c } if {[tcl::string::length $buffer8] == 8} { #we have a \UHHHHHHHH to test set unicode8_active 0 set result [tomlish::utils::unicode_escape_info "\\U$buffer8"] if {[lindex $result 0] eq "ok"} { append buffer [dict get $result ok char] } else { error "unescape_string error: [lindex $result 1]" } } } elseif {$slash_active} { set slash_active 0 set ctest [tcl::string::map {{"} dq} $c] switch -exact -- $ctest { dq { append buffer {"} } b - t - n - f - r { append buffer [subst -nocommand -novariable "\\$c"] } e { append buffer \x1b } x { #introduced in 1.1.0 \xHH set unicode2_active 1 set buffer2 "" } u { set unicode4_active 1 set buffer4 "" } U { set unicode8_active 1 set buffer8 "" } default { set slash_active 0 #review - toml spec says all other escapes are reserved #and if they are used TOML should produce an error. 
#append buffer "\\$c" set msg "Invalid escape sequence \\ followed by '$c'" return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} $msg } } } else { append buffer $c } } } #puts stdout "EOF 4:$unicode4_active 8:$unicode8_active slash:$slash_active" if {$unicode2_active} { error "End of string reached before complete hex escape sequence \xHH" } if {$unicode4_active} { error "End of string reached before complete unicode escape sequence \uHHHH" } if {$unicode8_active} { error "End of string reached before complete unicode escape sequence \UHHHHHHHH" } if {$slash_active} { append buffer "\\" } try { encoding convertto utf-8 $buffer } trap {} {emsg eopts} { return -code error -errorcode {TOMLISH SYNTAX ENCODINGERROR} $emsg } return $buffer } #This does not have to do with unicode normal forms - which it seems toml has decided against regarding use in keys (review/references?) #This is meant for internal use regarding ensuring we match equivalent keys which may have just been specified with different string mechanisms, #e.g squoted vs dquoted vs barekey. proc normalize_key {rawkey} { set c1 [tcl::string::index $rawkey 0] set c2 [tcl::string::index $rawkey end] if {($c1 eq "'") && ($c2 eq "'")} { #single quoted segment. No escapes allowed within it. set key [tcl::string::range $rawkey 1 end-1] } elseif {($c1 eq "\"") && ($c2 eq "\"")} { #double quoted segment. Unapply escapes. # set keydata [tcl::string::range $rawkey 1 end-1] ;#strip outer quotes only #e.g key could have mix of \UXXXXXXXX escapes and unicode chars #or mix of \t and literal tabs. #unescape to convert all to literal versions for comparison set key [::tomlish::utils::unescape_string $keydata] #set key [subst -nocommands -novariables $keydata] ;#wrong. Todo - create a string escape substitution function. } else { set key $rawkey } return $key } proc string_to_slashu {string} { set rv {} foreach c [split $string {}] { scan $c %c cdec if {$cdec > 65535} { append rv {\U} [format %.8X $cdec] } else { append rv {\u} [format %.4X $cdec] } } return $rv } #'nonprintable' is conservative here because some systems (e.g windows console) are very limited in what they can display. #This is used for display purposes only (error msgs) proc nonprintable_to_slashu {s} { set res "" foreach i [split $s ""] { scan $i %c cdec set printable 0 if {($cdec>31) && ($cdec<127)} { set printable 1 } if {$printable} { append res $i } else { if {$cdec > 65535} { append res \\U[format %.8X $cdec] } else { append res \\u[format %.4X $cdec] } } } set res } ;# initial version from tcl wiki RS proc rawstring_to_jsonstring {s} { #like nonprintable_to_slashu # - also escape every dquote # - escape newlines set res "" foreach i [split $s ""] { scan $i %c cdec switch -- $cdec { 34 { #double quote append res \\\" } 13 { #carriage return append res \\r } 8 { append res \\b } 9 { append res \\t } 10 { #linefeed append res \\n } 92 { append res \\\\ } default { set printable 0 if {($cdec>31) && ($cdec<127)} { set printable 1 } if {$printable} { append res $i } else { if {$cdec > 65535} { #append res $i #append res \\U[format %.8X $cdec] ;#wrong #append res "\\U{[format %.8x $cdec]}" ;#some variation of json? 
package require punk::cesu #e.g \U0001f610 emoticon face #surrogate pair: \uD83D\uDE10 set surrogatepair [punk::cesu::to_surrogatestring -format escape $i] append res $surrogatepair } else { append res \\u[format %.4X $cdec] } } } } } set res } #check if str is valid for use as a toml bare key #Early toml versions only allowed letters + underscore + dash proc is_basic_barekey {str} { if {[tcl::string::length $str] == 0} { return 0 } else { set matches [regexp -all {[a-zA-Z0-9\_\-]} $str] if {[tcl::string::length $str] == $matches} { #all characters match the regexp return 1 } else { return 0 } } } #from toml.abnf in github.com/toml-lang/toml #unquoted-key = 1*unquoted-key-char #unquoted-key-char = ALPHA / DIGIT / %x2D / %x5F ; a-z A-Z 0-9 - _ #unquoted-key-char =/ %xB2 / %xB3 / %xB9 / %xBC-BE ; superscript digits, fractions #unquoted-key-char =/ %xC0-D6 / %xD8-F6 / %xF8-37D ; non-symbol chars in Latin block #unquoted-key-char =/ %x37F-1FFF ; exclude GREEK QUESTION MARK, which is basically a semi-colon #unquoted-key-char =/ %x200C-200D / %x203F-2040 ; from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ #unquoted-key-char =/ %x2070-218F / %x2460-24FF ; include super-/subscripts, letterlike/numberlike forms, enclosed alphanumerics #unquoted-key-char =/ %x2C00-2FEF / %x3001-D7FF ; skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces #unquoted-key-char =/ %x2070-21FF / %x2300-24FF ; skip math operators #unquoted-key-char =/ %x25A0-268B / %x2690-2757 ; skip box drawing, block elements, and some yin-yang symbols #unquoted-key-char =/ %x2762-2767 / %x2776-27E5 ; skip some Dingbat punctuation #unquoted-key-char =/ %x2801-297F ; skip some math brackets and arrows, and braille blank #unquoted-key-char =/ %x2B00-2FFF / %x3001-D7FF ; skip various math operators and symbols, and ideographic space #unquoted-key-char =/ %xF900-FDCF / %xFDF0-FFFD ; skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) #unquoted-key-char =/ %x10000-EFFFF ; all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) variable re_barekey set ranges [list] lappend ranges {a-zA-Z0-9\_\-} lappend ranges {\u00B2} {\u00B3} {\u00B9} {\u00BC-\u00BE} ;# superscript digits, fractions lappend ranges {\u00C0-\u00D6} {\u00D8-\u00F6} {\u00F8-\u037D} ;# non-symbol chars in Latin block lappend ranges {\u037f-\u1FFF} ;# exclude GREEK QUESTION MARK, which is basically a semi-colon lappend ranges {\u200C-\u200D} {\u203F-\u2040} ;# from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ lappend ranges {\u2070-\u218f} {\u2460-\u24FF} ;# include super-subscripts, letterlike/numberlike forms, enclosed alphanumerics lappend ranges {\u2C00-\u2FEF} {\u3001-\uD7FF} ;# skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces lappend ranges {\u2070-\u21FF} {\u2300-\u24FF} ;# skip math operators lappend ranges {\u25A0-\u268B} {\u2690-\u2757} ;# skip box drawing, block elements, and some yin-yang symbols lappend ranges {\u2762-\u2767} {\u2776-\u27E5} ;# skip some Dingbat punctuation lappend ranges {\u2801-\u297F} ;# skip some math brackets and arrows, and braille blank lappend ranges {\u2B00-\u2FFF} {\u3001-\uD7FF} ;# skip various math operators and symbols, and ideographic space lappend ranges {\uF900-\uFDCF} {\uFDF0-\uFFFD} ;# skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) lappend ranges {\U10000-\UEFFFF} ;# 
all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) set re_barekey {^[} foreach r $ranges { append re_barekey $r } append re_barekey {]+$} proc is_barekey {str} { if {[tcl::string::length $str] == 0} { return 0 } variable re_barekey return [regexp $re_barekey $str] } #test only that the characters in str are valid for the toml specified type 'integer'. proc int_validchars1 {str} { set numchars [tcl::string::length $str] if {[regexp -all {[0-9\_\-\+]} $str] == $numchars} { return 1 } else { return 0 } } #add support for hex,octal,binary 0x.. 0o.. 0b... proc int_validchars {str} { set numchars [tcl::string::length $str] if {[regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] == $numchars} { return 1 } else { return 0 } } proc is_int {str} { set matches [regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] ;#0b101 etc covered by a-f if {[tcl::string::length $str] == $matches} { #all characters in legal range # --------------------------------------- #check for leading zeroes in non 0x 0b 0o #first strip any +, - or _ (just for this test) #(but still allowing 0 -0 +0) set check [tcl::string::map {+ "" - "" _ ""} $str] if {([tcl::string::length $check] > 1) && ([tcl::string::index $check 0] eq "0") && ([tcl::string::index $check 1] ni {o x b})} { return 0 } # --------------------------------------- #check +,- only occur in the first position. (excludes also +++1 etc) if {[tcl::string::last - $str] > 0} { return 0 } if {[tcl::string::last + $str] > 0} { return 0 } #------------------------------------------- #unclear if a 'digit' includes the type specifiers x b o #we assume the 0x 0b 0o are NOT counted as digits - as underscores here would seem #to be likely to cause interop issues with other systems #(e.g tcl allows 0b1_1 but not 0b_11) #Most of this structure would be unnecessary if we could rely on string::is::integer understanding underscores (9+?) #we still need to support earlier Tcl for now though. #first rule out any case with more than one underscore in a row if {[regexp {__} $str]} { return 0 } if {[string index $str 0] eq "_"} { return 0 } set utest [string trimleft $str +-] #test again for further trick like _+_0xFF if {[string index $utest 0] eq "_"} { return 0 } if {[string range $utest 0 1] in {0x 0b 0o}} { set testnum [string range $utest 2 end] #spec says *non-negative* integers may *also* be expressed in hex, octal or binary #and also explicitly states + not allowed #presumed to mean negative not allowed. 
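#e.g (illustrative) 0xDEADBEEF, 0o755 and 0b1101_0110 take this branch and are accepted,
#whereas -0xFF or +0o7 are rejected by the sign check below.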
if {[string index $str 0] in {- +}} { return 0 } } else { set testnum $utest #exclude also things like 0_x 0___b that snuck past our prefix test if {![string is digit -strict [string map {_ ""} $testnum]]} { return 0 } #assert - only digits and underscores in testnum #still may have underscores at each end } #assert testnum is now the 'digits' portion of a , 0x 0b 0o number #(+ and - already stripped) #It may still have chars unsuitable for its type - which will be caught by the string::is::integer test below if {[string length $testnum] != [string length [string trim $testnum _]]} { #had non-inner underscores in 'digit' part return 0 } #assert str only has solo inner underscores (if any) between 'digits' #------------------------------------------- set numeric_value [tcl::string::map {_ ""} $str] ;#allow some earlier tcl versions which don't support underscores #use Tcl's integer check to ensure we don't let things like 3e4 through - which is a float (would need to be 0x3e4 for hex) if {![tcl::string::is integer -strict $numeric_value]} { return 0 } #!todo - check bounds only based on some config value #even though Tcl can handle bignums, we won't accept anything outside of toml 1.0 minimum requirements by default (for now) #presumably very large numbers would have to be supplied in a toml file as strings. #Review - toml 1.0 only says that it must handle up to 2^63 - not that this is a max #some question around implementations allowed to use lower values such as 2^31 on some systems? if {$::tomlish::max_int ne "" && $numeric_value > $::tomlish::max_int} { return 0 } if {$::tomlish::min_int ne "" && $numeric_value < $::tomlish::min_int} { return 0 } } else { return 0 } #Got this far - didn't find anything wrong with it. return 1 } #test only that the characters in str are valid for the toml specified type 'float'. proc float_validchars {str} { set numchars [tcl::string::length $str] if {[regexp -all {[eE0-9\_\-\+\.]} $str] == $numchars} { return 1 } else { #only allow lower case for these special values - as per Toml 1.0 spec if {$str ni {inf +inf -inf nan +nan -nan}} { return 0 } else { return 1 } } } #note - Tcl's string is double will return true also for the subset of float values which are integers #This function is to determine whether it matches the Toml float concept - so requires a . or e or E proc is_float {str} { #vip greenlight known literals, don't test for case variations - as Toml doesn't allow (whereas Tcl allows Inf NaN etc) if {$str in {inf +inf -inf nan +nan -nan}} { return 1 } #doorcheck the basics for floatiness vs members of that rival gang - ints if {![regexp {[.eE]} $str]} { #could be an integer - which isn't specifically a float for Toml purposes. return 0 } #patdown for any contraband chars set matches [regexp -all {[eE0-9\_\-\+\.]} $str] if {[tcl::string::length $str] != $matches} { return 0 } #all characters in legal range #A leading zero is ok, but we should disallow multiple leading zeroes (same rules as toml ints) #Early Toml spec also disallowed leading zeros in the exponent part(?) #... this seems less interoperable anyway (some libraries generate leading zeroes in exponents) #we allow leading zeros in exponents here. 
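#e.g (illustrative) accepted: 3.14, -2E-22, 1_000.5, inf, -nan
# rejected: 03.14 (extra leading zero), 3.e+20 (the dot needs a digit on both sides), 7 (an int - not a toml float)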
#Check for leading zeros in main part #first strip any +, - or _ (just for this test) set check [tcl::string::map {+ "" - "" _ ""} $str] set r {([0-9])*} regexp $r $check intpart ;#intpart holds all numerals before the first .,e or E #leading zero only if exactly one zero if {$intpart ne "0" && [string match 0* $intpart]} { return 0 } #for floats, +,- may occur in multiple places #e.g -2E-22 +3e34 #!todo - check bounds ? #----------------------------------------- if {[regexp {__} $str]} { return 0 } if {[string index $str 0] eq "_" || [string index $str end] eq "_"} { return 0 } set utest [string trimleft $str +-] #test again for further trick like _+_ if {[string index $utest 0] eq "_"} { return 0 } #----------------------------------------- #decimal point, if used must be surrounded by at least one digit on each side #e.g 3.e+20 also illegal set dposn [string first . $str] if {$dposn > -1 } { set d3 [string range $str $dposn-1 $dposn+1] if {![string is integer -strict [string index $d3 0]] || ![string is integer -strict [string index $d3 2]]} { return 0 } } #we've already eliminated leading/trailing underscores #now ensure each inner underscore is surrounded by digits if {[regexp {_[^0-9]|[^0-9]_} $str]} { return 0 } #strip underscores for tcl double check so we can support < tcl 9 versions which didn't allow underscores set check [tcl::string::map {_ ""} $str] #string is double accepts inf nan +NaN etc. if {![tcl::string::is double $check]} { return 0 } #All good - seems to be a toml-approved float and not an int. return 1 } #test only that the characters in str are valid for the toml specified type 'datetime'. proc datetime_validchars {str} { set numchars [tcl::string::length $str] if {[regexp -all {[zZtT0-9\-\+\.:]} $str] == $numchars} { return 1 } else { return 0 } } proc is_datepart {str} { set matches [regexp -all {[0-9\-]} $str] if {[tcl::string::length $str] != $matches} { return 0 } #seems to require yyyy-mm-dd (e.g not allowing just yyyy-mm) if {![regexp {^([0-9]{4})-([0-9]{2})-([0-9]{2})$} $str _match y m d]} { return 0 } if {$m > 12 || $m == 0} { return 0 } switch -- [expr {$m}] { 1 - 3 - 5 - 7 - 8 - 10 - 12 { if {$d > 31 || $d == 0} { return 0 } } 2 { if {$d > 29 || $d == 0} { return 0 } if {$d == 29} { #leapyear check if {[catch {clock scan $str -format %Y-%m-%d} errM]} { return 0 } } } 4 - 6 - 9 - 11 { if {$d > 30 || $d == 0} { return 0 } } } return 1 } proc is_localdate {str} { is_datepart $str } #allow only hh:mm:ss or hh:mm (no subseconds) proc _is_hms_or_hm_time {val} { set numchars [tcl::string::length $val] if {[regexp -all {[0-9:]} $val] != $numchars} { return 0 } #assert now digits and colons only set hms_cparts [split $val :] #2 or 3 parts only are valid - check contents of each part if {[llength $hms_cparts] == 2} { lassign $hms_cparts hr min if {[string length $hr] != 2 || [string length $min] != 2} { return 0 } if {$hr > 23 || $min > 59} { return 0 } } elseif {[llength $hms_cparts] == 3} { lassign $hms_cparts hr min sec if {[string length $hr] != 2 || [string length $min] != 2 || [string length $sec] !=2} { return 0 } #possible for sec to be 60 - leap second RFC 3339 if {$hr > 23 || $min > 59 || $sec > 60} { return 0 } } else { return 0 } return 1 } proc is_timepart {str} { #validate the part after the T (or space) #we receive only that trailing part here. 
#odt1 = 1979-05-27T07:32:00Z #odt2 = 1979-05-27T00:32:00-07:00 #odt3 = 1979-05-27T00:32:00.5-07:00 #odt4 = 1979-05-27T00:32:00.999999-07:00 set numchars [tcl::string::length $str] #timepart can have negative or positive offsets so - and + must be accepted if {[regexp -all {[zZt0-9\-\+\.:]} $str] == $numchars} { #todo #basic check that we have leading 2dig hr and 2dig min separated by colon if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}[^0-9]{1}.*$} $str]} { #nn:nn or nn:nnX.* where X is non digit return 0 } set dotparts [split $str .] if {[llength $dotparts] ni {1 2}} { return 0 } if {[llength $dotparts] == 2} { lassign $dotparts hms tail #validate tail - which might have +- offset if {[string index $tail end] ni {z Z}} { #from hh:mm:??. #check for +/- something if {[regexp {(.*)[+-](.*)} $tail _match fraction offset]} { if {![string is digit -strict $fraction]} { return 0 } if {![_is_hms_or_hm_time $offset]} { return 0 } } } else { set tail [string range $tail 0 end-1] #expect tail nnnn (from hh:mm::ss.nnnnZ) #had a dot and a zZ - no other offset valid (?) if {![string is digit -strict $tail]} { return 0 } } } else { #no dot (fraction of second) if {[regexp {(.*)[+-](.*)} $str _match hms offset]} { #validate offset if {![_is_hms_or_hm_time $offset]} { return 0 } } else { set hms $str set offset "" #trim a *single* z or Z off hms if present - multiple should error later if {[string index $hms end] in {z Z}} { set hms [string range $hms 0 end-1] } } } #hms is allowed in toml to be hh:mm:ss or hh:mm #validate we have hh:mm:ss or hh:mm - exactly 2 digits each if {![_is_hms_or_hm_time $hms]} { return 0 } return 1 } else { return 0 } } proc is_localtime {str} { #time of day without any relation to a specific day or any offset or timezone set numchars [tcl::string::length $str] if {[regexp -all {[0-9\.:]} $str] == $numchars} { #todo if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}:[0-9]{2}([.][0-9]+){0,1}$} $str]} { #hh:mm or hh:mm:ss or hh:mm::ss.nnn return 0 } set dotparts [split $str .] if {[llength $dotparts] ni {1 2}} { return 0 } if {[llength $dotparts] == 2} { lassign $dotparts hms _tail #validate tail - just fractional seconds - regex has confirmed at least one digit and only digits #nothing todo? max length? } else { #no fractional seconds set hms $str } if {![_is_hms_or_hm_time $hms]} { return 0 } return 1 } else { return 0 } } #review proc is_datetime {str} { #Essentially RFC3339 formatted date-time - but: #1) allowing seconds to be omitted (:00 assumed) #2) T may be replaced with a single space character TODO - parser support for space in datetime! # (RFC 3339 allows space instead of T also - but doesn't specify it *must* be a single space) #toml-lint @2025-04 doesn't accept t for T or z for Z - but RFC3339 does #toml spec doesn't clarify - we will accept #e.g 1979-05-27 #e.g 1979-05-27T00:32:00Z #e.g 1979-05-27 00:32:00-07:00 #e.g 1979-05-27 00:32:00+10:00 #e.g 1979-05-27 00:32:00.999999-07:00 #review #minimal datetimes? # 2024 not ok - 2024T not accepted by tomlint why? # 02:00 ok # 02:00:00.5 ok # 1:00 - not ok - RFC3339 requires 2-digit hr,min,sec #toml-lint.com accepts 2025-01 if {[string length $str] < 5} { return 0 } set matches [regexp -all {[zZtT0-9\ \-\+\.:]} $str] if {[tcl::string::length $str] == $matches} { #all characters in legal range if {[regexp -all {\ } $str] > 1} { #only a single space is allowed. 
return 0 } #If we get a space - it is only valid as a convience to represent the T separator #we can normalize by converting to T here before more tests set str [string map {" " T t T} $str] #a further sanity check on T if {[regexp -all {T} $str] > 1} { return 0 } #!todo - use full RFC 3339 parser? #!todo - what if the value is 'time only'? if {[string first T $str] > -1} { lassign [split $str T] datepart timepart if {![is_datepart $datepart]} { return 0 } if {![is_timepart $timepart]} { return 0 } } else { #either a datepart or a localtime #spec: "If you include only the time portion of an RFC 3339 formatted date-time, it will represent that time of day # without any relation to a specific day or any offset or timezone." if {!([is_datepart $str] || [is_localtime $str])} { return 0 } } #Tcl's free-form clock scan (no -format option) is deprecated # #if {[catch {clock scan $datepart} err]} { # puts stderr "tcl clock scan failed err:'$err'" # return 0 #} } else { return 0 } return 1 } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::utils ---}] } namespace eval tomlish::parse { #*** !doctools #[subsection {Namespace tomlish::parse}] #[para] #[list_begin definitions] #This is a somewhat curly mix of a statemachine and toml-nesting-stack littered with special cases. #The code is a pig's-nest - but it should be noted that for example trailing single double quotes in multiline strings are perhaps not so trivial to parse using more standard methods either: # - e.g some kind of backtracking required if using an ABNF parser? #I don't know the precise technical name for this sort of parser; probably something like "Dog's Breakfast" #More seriously, we don't have distinct lex/parse steps - so it is basically a 'fused lexer' or 'scannerless parser' #It is also desirable for this system to be useful in 'interactive' use. review - would a separate lexer make this easier or harder? #A possible alternative more structured approach might be to use a PEG (Parsing Expression Grammar) variable is_parsing 0 ;#whether we are in the middle of parsing tomlish text variable state # states: # table-space, itable-space, array-space # array-value-expected,keyval-value-expected,itable-keyval-value-expected, keyval-syntax, # dquoted-key, squoted-key # string-state, literal-state, multistring... # # notes: # only the -space states are also 'spaces' ie a container which is pushed/popped on the spacestack # # xxx_value-expected - we also allow for leading whitespace in this state, but once a value is returned we jump to a state based on the containing space. e.g keyval-tail or array-syntax # #stateMatrix defines for each state, actions to take for each possible token. #single-element actions are the name of the next state into which to transition, or a 'POPSPACE' instruction to pop a level off the spacestack and add the data to the parent container. #dual-element actions are a push instruction and the name of the space to push on the stack. 
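#e.g (illustrative - see the table-space entry defined further below)
#   barekey {PUSHSPACE "keyval-space" state "keyval-syntax"} - a dual-element action: on a barekey token, push keyval-space and continue in state keyval-syntax
#   eof "end-state" - a single-element action: a plain transition to end-state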
# - PUSHSPACE is a simple push onto the spacestack, zeropoppushspace also pushes, but will first do a pop *if* the current space level is greater than zero (ie if only if not already in root table-space) # -- --- --- --- --- --- #token/state naming guide # -- --- --- --- --- --- #tokens : underscore separated or bare name e.g newline, start_quote, start_squote #private tokens: always have a leading underscore (These are private 'temporary state' tokens that are never returned as actual tokens e.g _start_squote_sequence #states : always contain at least one dash e.g err-state, table-space #instructions # -- --- --- --- --- --- #stateMatrix dict of elements mapping current state to next state based on returned tokens # current-state {token-encountered next-state ... } # where next-state can be a 1 or 2 element list. #If 2 element - the first item is an instruction (ucase) #If 1 element - it is either a lowercase dashed state name or an ucase instruction #e.g {PUSHSPACE } or POPSPACE or SAMESPACE #SAMESPACE - got to same space as parent without popping a level, but has it's own autotransition lookup - strange concept - review usecases variable stateMatrix set stateMatrix [dict create] #--------------------------------------------------------- #WARNING #The stateMatrix implementation here is currently messy. #The code is a mixture of declarative via the stateMatrix and imperative via switch statements during PUSH/POP/SAMESPACE transitions. #This means the state behaviour has to be reasoned about by looking at both in conjuction. #--------------------------------------------------------- #xxx-space vs xxx-syntax inadequately documented - TODO #review - out of date? # --------------------------------------------------------------------------------------------------------------# # incomplete example of some state starting at table-space # --------------------------------------------------------------------------------------------------------------# # ( = -> keyval-value-expected) # keyval-syntax (popped -> keyval-space -> keyval-tail) (autotransition on pop) # keyval-space (autotransition on push ^) # table-space (barekey^) (startdquote -> dquoted-key ^) # --------------------------------------------------------------------------------------------------------------# dict set stateMatrix\ table-space { bom "table-space"\ whitespace "table-space"\ newline "table-space"\ barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ dquotedkey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ XXXsingle_dquote "quoted-key"\ XXXsingle_squote "squoted-key"\ comment "table-space"\ starttablename "tablename-state"\ starttablearrayname "tablearrayname-state"\ enddquote "err-state"\ endsquote "err-state"\ comma "err-state"\ eof "end-state"\ equal "err-state"\ cr "err-lonecr"\ } dict set stateMatrix\ keyval-space {\ whitespace "keyval-syntax"\ equal "keyval-value-expected"\ } # ' = ' portion of keyval dict set stateMatrix\ keyval-syntax {\ whitespace "keyval-syntax"\ barekey {PUSHSPACE "dottedkey-space"}\ squotedkey {PUSHSPACE "dottedkey-space"}\ dquotedkey {PUSHSPACE "dottedkey-space"}\ equal "keyval-value-expected"\ comma "err-state"\ newline "err-state"\ eof "err-state"\ } #### dict set stateMatrix\ keyval-value-expected {\ whitespace "keyval-value-expected"\ untyped_value {TOSTATE "keyval-untyped-sequence" note "possible datetime datepart"}\ literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ string {TOSTATE 
"keyval-tail" note "required for empty string at EOF"}\ single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ startarray {PUSHSPACE array-space returnstate keyval-tail}\ } #double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"} #untyped_value sequences without intervening comma are allowed for datepart timepart #we will produce tomlish with missing SEPS and to_dict must validate whether 2 adjacent barekeys are valid dict set stateMatrix\ keyval-untyped-sequence {\ whitespace "keyval-untyped-sequence"\ untyped_value {TOSTATE "keyval-tail"}\ literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ startarray {PUSHSPACE array-space returnstate keyval-tail}\ newline "POPSPACE"\ comment "keyval-tail"\ eof "end-state"\ } #2025 - no leading-squote-space - only trailing-squote-space. dict set stateMatrix\ keyval-tail {\ whitespace "keyval-tail"\ newline "POPSPACE"\ comment "keyval-tail"\ eof "end-state"\ } #itable-space/ curly-syntax : itables # x={y=1,} dict set stateMatrix\ itable-space {\ whitespace "itable-space"\ newline "itable-space"\ barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ endinlinetable "POPSPACE"\ comma "err-state"\ comment "itable-space"\ eof "err-state"\ } #we don't get single_squote etc here - instead we get the resulting squotedkey token # ??? 
review - something like this # # x={y =1,} dict set stateMatrix\ itable-keyval-syntax {\ whitespace {TOSTATE "itable-keyval-syntax"}\ barekey {PUSHSPACE "dottedkey-space"}\ squotedkey {PUSHSPACE "dottedkey-space"}\ dquotedkey {PUSHSPACE "dottedkey-space"}\ equal {TOSTATE "itable-keyval-value-expected"}\ newline "err-state"\ eof "err-state"\ } # x={y=1} dict set stateMatrix\ itable-keyval-space {\ whitespace "itable-keyval-syntax"\ equal {TOSTATE "itable-keyval-value-expected" note "required"}\ } dict set stateMatrix\ itable-keyval-value-expected {\ whitespace "itable-keyval-value-expected"\ untyped_value {TOSTATE "itable-val-tail" note ""}\ single_dquote {TOSTATE "string-state" returnstate itable-val-tail}\ triple_dquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ single_squote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ } #double_squote not currently generated by _start_squote_sequence - '' processed as single_squote to literal-state just like 'xxx' # review # double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"} # x={y=1,z="x"} #POPSPACE is transition from itable-keyval-space to parent itable-space dict set stateMatrix\ itable-val-tail {\ whitespace "itable-val-tail"\ endinlinetable "POPSPACE"\ comma "POPSPACE"\ newline {TOSTATE "itable-val-tail" note "itable-space ??"}\ comment "itable-val-tail"\ eof "err-state"\ } # XXXnewline "POPSPACE" # We shouldn't popspace on newline - as if there was no comma we need to stay in itable-val-tail # This means the newline and subsequent whitespace, comments etc become part of the preceeding dottedkey record #e.g # x = { # j=1 # #comment within dottedkey j record # , # comment unattached # #comment unattached # k=2 , #comment unattached # l=3 #comment within l record # , m=4 # #comment associated with m record # # #still associated with m record # } ## - This doesn't quite correspond to what a user might expect - but seems like a consistent mechanism. #The awkwardness is because there is no way to put in a comment that doesn't consume a trailing comma #so we cant do: j= 1 #comment for j1 , # and have the trailing comma recognised. # # To associate: j= 1, #comment for j1 # we would need some extra processing . (not popping until next key ? extra state itable-sep-tail?) REVIEW - worth doing? # # The same issue occurs with multiline arrays. The most natural assumption is that a comment on same line after a comma # is 'associated' with the previous entry. # # These comment issues are independent of the data dictionary being generated for conversion to json etc - as the comments don't carry through anyway, # but are a potential oddity for manipulating the intermediate tomlish structure whilst attempting to preserve 'associated' comments # (e.g reordering records within an itable) #The user's intention for 'associated' isn't always clear and the specs don't really guide on this. 
#dottedkey-space is not (currently) used within [tablename] or [[tablearrayname]] #it is for keyval ie x.y.z = value #this is the state after dot #we are expecting a complete key token or whitespace #(initial entry to the space is by one of the keys - which will immediately go to dottedkey-space-tail) dict set stateMatrix\ dottedkey-space {\ whitespace "dottedkey-space"\ dotsep "err-state"\ barekey "dottedkey-space-tail"\ squotedkey "dottedkey-space-tail"\ dquotedkey "dottedkey-space-tail"\ newline "err-state"\ comma "err-state"\ comment "err-state"\ equal "err-state"\ } #dottedkeyend "POPSPACE" #equal "POPSPACE"\ #jmn 2025 #we have 1 or more dottedkeys so far - need dotsep to add more, whitespace to maintain, equal to pop dict set stateMatrix\ dottedkey-space-tail {\ whitespace "dottedkey-space-tail" dotsep "dottedkey-space" equal "POPSPACE"\ eof "err-state"\ newline "err-state"\ } #-------------------------------------------------------------------------- #scratch area #from_toml {x=1} # barekey tok # table-space PUSHSPACE keyval-space state keyval-syntax # #-------------------------------------------------------------------------- #REVIEW #toml spec looks like heading towards allowing newlines within inline tables #https://github.com/toml-lang/toml/issues/781 #2025 - multiline itables appear to be valid for 1.1 - which we are targeting. #https://github.com/toml-lang/toml/blob/main/toml.md#inline-table #JMN2025 #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES #We currently allow multiline ITABLES (also with comments) in the tokenizer. #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? #JMN REVIEW #dict set stateMatrix\ # array-space {\ # whitespace "array-space"\ # newline "array-space"\ # untyped_value "SAMESPACE"\ # startarray {PUSHSPACE "array-space"}\ # endarray "POPSPACE"\ # startinlinetable {PUSHSPACE itable-space}\ # single_dquote "string-state"\ # single_squote "literal-state"\ # triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ # comma "array-space"\ # comment "array-space"\ # eof "err-state-array-space-got-eof"\ # } ## array-space ## set aspace [dict create] dict set aspace whitespace "array-space" dict set aspace newline "array-space" #dict set aspace untyped_value "SAMESPACE" dict set aspace untyped_value "array-syntax" dict set aspace startarray {PUSHSPACE "array-space"} dict set aspace endarray "POPSPACE" dict set aspace single_dquote {TOSTATE "string-state" returnstate array-syntax} dict set aspace triple_dquote {PUSHSPACE "multistring-space" returnstate array-syntax} dict set aspace single_squote {TOSTATE "literal-state" returnstate array-syntax} dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax} dict set aspace startinlinetable {PUSHSPACE itable-space} #dict set aspace comma "array-space" dict set aspace comment "array-space" dict set aspace eof "err-state-array-space-got-eof" dict set stateMatrix array-space $aspace #when we pop from an inner array we get to array-syntax #e.g {x=[[]] ??? set tarntail [dict create] dict set tarntail whitespace "err-state" ;#"tablearrayname-tail" ;#spec doesn't allow whitespace here dict set tarntail newline "err-state" dict set tarntail comment "err-state" dict set tarntail eof "err-state" dict set tarntail endtablename "tablearray-tail" dict set stateMatrix tablearrayname-tail $tarntail #review - somewhat counterintuitive...? 
# [(starttablearrayname) (endtablearrayname] # [(starttablename) (endtablename)] # [[xxx]] ??? set tartail [dict create] dict set tartail whitespace "tablearray-tail" dict set tartail newline "table-space" dict set tartail comment "tablearray-tail" dict set tartail eof "end-state" dict set stateMatrix tablearray-tail $tartail dict set stateMatrix\ end-state {} set knowntokens [list] set knownstates [list] dict for {state transitions} $stateMatrix { if {$state ni $knownstates} {lappend knownstates $state} dict for {tok instructions} $transitions { if {$tok ni $knowntokens} {lappend knowntokens $tok} } } dict set stateMatrix nostate {} foreach tok $knowntokens { dict set stateMatrix nostate $tok "err-nostate-received-token-$tok" } # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- #purpose - debugging? remove? # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- #build a list of 'push triggers' from the stateMatrix # ie tokens which can push a new space onto spacestack set push_trigger_tokens [list] tcl::dict::for {s transitions} $stateMatrix { tcl::dict::for {token transition_to} $transitions { set instruction [lindex $transition_to 0] switch -exact -- $instruction { PUSHSPACE - zeropoppushspace { if {$token ni $push_trigger_tokens} { lappend push_trigger_tokens $token } } } } } ::tomlish::log::debug "push_trigger_tokens: $push_trigger_tokens" # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- #This seems hacky... (deprecate in favour of explicit arguments to the instructions in stateMatrix?) #spacePopTransitions, spacePushTransitions, spaceSameTransitions below for auto state redirections on POPSPACE,PUSHSPACE,SAMESPACE #mainly for the -space states: #redirect to another state $c based on a state transition from $whatever to $b # e.g "string {array-space array-syntax}" means when transitioning from string to array-space, jump to array-syntax instead. #this is useful as we often don't know state $b. e.g when it is decided by 'POPSPACE' #use dict set to add values so we can easily add/remove/comment lines #Push to, next #default first states when we push to these spaces variable spacePushTransitions [dict create] dict set spacePushTransitions keyval-space keyval-syntax dict set spacePushTransitions itable-keyval-space itable-keyval-syntax dict set spacePushTransitions array-space array-space dict set spacePushTransitions table-space tablename-state #dict set spacePushTransitions #itable-space itable-space #Pop to, next variable spacePopTransitions [dict create] dict set spacePopTransitions array-space array-syntax #itable-keyval-space itable-val-tail #review #we pop to keyval-space from dottedkey-space or from keyval-value-expected? we don't always want to go to keyval-tail #leave it out and make the POPSPACE caller explicitly specify it #keyval-space keyval-tail variable spaceSameTransitions [dict create] #JMN test #dict set spaceSameTransitions array-space array-syntax #itable-keyval-space itable-val-tail variable state_list ;#reset every tomlish::decode::toml namespace export tomlish toml namespace ensemble create #goNextState has various side-effects e.g pushes and pops spacestack #REVIEW - setting nest and v elements here is ugly #todo - make neater, more single-purpose? 
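#(informal sketch of the return value) goNextState returns a dict of the form:
# prevstate <state> newstate <state> space_action push|pop|none starttok <tok-or-empty>
#e.g a barekey arriving in table-space yields something like:
# prevstate table-space newstate keyval-syntax space_action push starttok {}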
proc goNextState {tokentype tok currentstate} { variable state variable nest variable v set prevstate $currentstate variable spacePopTransitions variable spacePushTransitions variable spaceSameTransitions variable last_space_action "none" variable last_space_type "none" variable state_list set result "" set starttok "" if {[dict exists $::tomlish::parse::stateMatrix $currentstate $tokentype]} { set transition_to [dict get $::tomlish::parse::stateMatrix $currentstate $tokentype] ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" switch -exact -- [lindex $transition_to 0] { POPSPACE { set popfromspace_info [spacestack peek] set popfromspace_state [dict get $popfromspace_info state] spacestack pop set parent_info [spacestack peek] set type [dict get $parent_info type] set parentspace [dict get $parent_info state] set last_space_action "pop" set last_space_type $type if {[dict exists $parent_info returnstate]} { set next [dict get $parent_info returnstate] #clear the returnstate on current level set existing [spacestack pop] dict unset existing returnstate spacestack push $existing ;#re-push modification ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected to stored returnstate $next <<---" } else { ### #review - do away with spacePopTransitions - which although useful to provide a default.. # - involve error-prone configurations distant to the main state transition configuration in stateMatrix if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" } else { set next $parentspace ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace<<---" } } set result $next } SAMESPACE { set currentspace_info [spacestack peek] ::tomlish::log::debug "--->> SAMESPACE got current space entry: $currentspace_info <<<<<" set type [dict get $currentspace_info type] set currentspace [dict get $currentspace_info state] if {[dict exists $currentspace_info returnstate]} { set next [dict get $currentspace_info returnstate] #clear the returnstate on current level set existing [spacestack pop] dict unset existing returnstate spacestack push $existing ;#re-push modification ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected to stored returnstate $next" } else { if {[dict exists $::tomlish::parse::spaceSameTransitions $currentspace]} { set next [dict get $::tomlish::parse::spaceSameTransitions $currentspace] ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected state to $next (spaceSameTransitions)" } else { set next $currentspace ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace" } } set result $next } zeropoppushspace { if {$nest > 0} { #pop back down to the root level (table-space) spacestack pop set parentinfo [spacestack peek] set type [dict get $parentinfo type] set target [dict get $parentinfo state] set last_space_action "pop" set last_space_type $type #----- #standard pop set parentlevel [expr {$nest -1}] lappend v($parentlevel) [set v($nest)] incr nest -1 #----- } #re-entrancy #set next [list PUSHSPACE [lindex $transition_to 1]] set nexttokentype ${tokentype}2 ;#fake token type e.g tablename2 or 
tablearrayname2 ::tomlish::log::debug "--->> zeropoppushspace goNextState RECURSE. calling goNextState $nexttokentype $currentstate" set transition_info [::tomlish::parse::goNextState $nexttokentype $tok $currentstate] set result [dict get $transition_info newstate] } PUSHSPACE { set original_target [dict get $transition_to PUSHSPACE] if {[dict exists $transition_to returnstate]} { #adjust the existing space record on the stack. #struct::stack doesn't really support that - so we have to pop and re-push #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack set currentspace [spacestack pop] dict set currentspace returnstate [dict get $transition_to returnstate] spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. } if {[dict exists $transition_to starttok]} { set starttok [dict get $transition_to starttok] } spacestack push [dict create type space state $original_target] set last_space_action "push" set last_space_type "space" if {[dict exists $transition_to state]} { #an explicit state in the pushed space was requested in the stateMatrix - override the spacePushTransition (spacePushTransitions can be deprecated if we require explicitness?) set next [dict get $transition_to state] ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next by explicit 'state' entry" } else { #puts $::tomlish::parse::spacePushTransitions if {[dict exists $::tomlish::parse::spacePushTransitions $original_target]} { set next [dict get $::tomlish::parse::spacePushTransitions $original_target] ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next (spacePushTransitions) " } else { set next $original_target ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target" } } set result $next } TOSTATE { if {[dict exists $transition_to returnstate]} { #adjust the existing space record on the stack. #struct::stack doesn't really support that - so we have to pop and re-push #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack set currentspace [spacestack pop] dict set currentspace returnstate [dict get $transition_to returnstate] spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. } set result [dict get $transition_to TOSTATE] } default { #simplified version of TOSTATE set result [lindex $transition_to 0] ;#ignore everything but first word } } } else { ::tomlish::log::error "--->> No state transition defined from state $currentstate when tokentype $tokentype received" set result "nostate" } lappend state_list [list tokentype $tokentype from $currentstate to $result] set state $result ::tomlish::log::notice "--->> STATE TRANSITION tokenType: '$tokentype' tok:$tok triggering '$currentstate' -> '$result' last_space_action:$last_space_action " return [dict create prevstate $prevstate newstate $result space_action $last_space_action starttok $starttok] } proc report_line {{line ""}} { variable linenum variable is_parsing if {$is_parsing} { if {$line eq ""} { set line $linenum } return "Line Number: $line" } else { #not in the middle of parsing tomlish text - return nothing. return "" } } #produce a *slightly* more readable string rep of the nest for puts etc. 
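#e.g (sketch only - element tags/values shown are placeholders in tomlish tagged-list form)
#  puts [nest_pretty1 {{KEY k} {NEWLINE lf}}]
#KEY/DQKEY/SQKEY/TABLE/ARRAY tagged sublists are recursed into, and each NEWLINE element gets a linebreak appended in the output.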
proc nest_pretty1 {list} { set prettier "{" foreach el $list { if { [lindex $el 0] eq "NEWLINE"} { append prettier "[list $el]\n" } elseif {([llength $el] > 1) && ([lindex $el 0] in {KEY DQKEY SQKEY TABLE ARRAY})} { append prettier [nest_pretty1 $el] } else { append prettier "[list $el] " } } append prettier "}" return $prettier } proc set_tokenType {t} { variable tokenType variable tokenType_list if {![info exists tokenType]} { set tokenType "" } lappend tokenType_list $t set tokenType $t } proc switch_tokenType {t} { variable tokenType variable tokenType_list lset tokenType_list end $t set tokenType $t } proc get_tokenType {} { variable tokenType return $tokenType } proc get_token_waiting {} { variable token_waiting return [lindex $token_waiting 0] } proc clear_token_waiting {} { variable token_waiting set token_waiting [list] } #token_waiting is a list - but our standard case is to have only one #in certain circumstances such as near eof we may have 2 #the set_token_waiting function only allows setting when there is not already one waiting. #we want to catch cases of inadvertently trying to set multiple # - the reason being that the state transition triggered by the previous token may have invalidated the assumptions made when a token was added as waiting. proc set_token_waiting {args} { if {[llength $args] %2 != 0} { error "tomlish set_token_waiting must have args of form: type value complete 0|1" } variable token_waiting if {[llength $token_waiting] && [dict get [lindex $token_waiting end] type] ne "eof"} { #tokloop already set a token_waiting - but something (post tokloop processing?) is trying to set another #we may need to remove the existing token_waiting and reset the tokloop index to the previous char so it's reprocessed in the possibly new context #rather than attempt to make the right decision here - we raise an error and require the caller to check/handle it set err "tomlish set_token_waiting already has token_waiting: [lindex $token_waiting 0]" append err \n " - cannot add token_waiting: $args" error $err #set tomlish::parse::i [expr {[dict get $token_waiting startindex] -1}] #set token_waiting [list] } set waiting [dict create] dict for {k v} $args { switch -exact $k { type - complete { dict set waiting $k $v } value { dict set waiting tok $v } startindex { dict set waiting startindex $v } default { error "tomlish set_token_waiting error - unrecognised key $k. known keys: [dict keys $args]" } } } if {![tcl::string::is boolean -strict [dict get $waiting complete]]} { error "tomlish set_token_waiting error - 'complete' must be a boolean. got [dict get $waiting complete]" } if {![llength $token_waiting]} { set token_waiting [list $waiting] } else { #an extra sanity-check that we don't have more than just the eof.. if {[llength $token_waiting] > 1} { set err "tomlish Unexpected. Existing token_waiting count > 1.\n" foreach tw $token_waiting { append err " $tw" \n } append err " - cannot add token_waiting: $waiting" error $err } #last entry must be a waiting eof set token_waiting [list $waiting [lindex $token_waiting end]] } return } #returns 0 or 1 #tomlish::parse::tok #we attempt to do this without lookahead (potential use in streaming toml? for what benefit?) todo -final flag # - the possible benefit is being able to more easily process in arbitrarily split chunks (although we would still have to watch crlf splitting ?) # - interactive use? 
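#Illustrative driver sketch only - the variable setup below is partial and an assumption; see tomlish::decode::toml for the real parse loop.
#  namespace eval ::tomlish::parse {
#      set s $tomldata        ;#the toml source text
#      set i 0                ;#character index into $s
#      set state table-space  ;#root parsing space
#      set tok ""; set tokenType ""; set token_waiting [list]
#      while {[tok]} {
#          #each call returning 1 leaves the scanned text in $tok and its type in $tokenType
#          goNextState $tokenType $tok $state
#          if {$tokenType eq "eof"} {break}
#      }
#  }
#When a character terminates the current token, tok either rewinds i (incr i -1) so the char is re-scanned on the
#next call, or queues an already-complete token via set_token_waiting for the next call to return first.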
proc tok {} { variable nest variable s variable v variable i variable tok variable type ;#character type variable state ;#FSM variable tokenType variable tokenType_list variable endToken variable lastChar variable braceCount variable bracketCount #------------------------------ #Previous run found another (presumably single-char) token #The normal case is for there to be only one dict in the list #multiple is an exception - primarily for eof variable token_waiting if {[llength $token_waiting]} { set waiting [lindex $token_waiting 0] set tokenType [dict get $waiting type] set tok [dict get $waiting tok] #todo: dict get $token_waiting complete set token_waiting [lrange $token_waiting 1 end] return 1 } #------------------------------ set resultlist [list] set sLen [tcl::string::length $s] set slash_active 0 set quote 0 set c "" for {} {$i < $sLen} {} { if {$i > 0} { set lastChar [tcl::string::index $s [expr {$i - 1}]] set start_of_data h } else { set lastChar "" set start_of_data 1 #bom-handling if {[tcl::string::index $s 0] eq "\uFEFF"} { #bom (could be from various encodings - now decoded as single unicode char FEFF) #incr i 1 ;#skip over initial bom? } } set c [tcl::string::index $s $i] set cindex $i set ctest [tcl::string::map {\{ lc \} rc \[ lb \] rb \" dq ' sq \\ bsl \r cr \n lf \t tab \uFEFF bom} $c] tomlish::log::debug "- tokloop char <$ctest> index $i tokenType:$tokenType tok:<$tok>" #puts "got char $c during tokenType '$tokenType'" incr i ;#must incr here because we do returns inside the loop switch -exact -- $ctest { # { set had_slash $slash_active set slash_active 0 if {$had_slash} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { newline { #incomplete newline set_tokenType "cr" incr i -1 return 1 } tentative_accum_squote - tentative_accum_dquote { #for multiliteral, multistring - data and/or end incr i -1 return 1 } _start_squote_sequence { #pseudo token beginning with underscore - never returned to state machine - review incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } barekey { error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed (see tomlish::utils::is_barekey). [tomlish::parse::report_line]" } whitespace { # hash marks end of whitespace token #do a return for the whitespace, set token_waiting #set_token_waiting type comment value "" complete 1 incr i -1 ;#leave comment for next run return 1 } untyped_value { #REVIEW! the spec isn't clear.. is whitespace after an int,bool etc required before comment? #we will accept a comment marker as an immediate terminator of the untyped_value. incr i -1 return 1 } starttablename - starttablearrayname { #fix? error "tomlish Character '#' is invalid first character for $tokenType. 
[tomlish::parse::report_line]" } tablename - tablearrayname { #invalid in bare parts - but allowed in quoted parts - let tablename parser sort it out append tok $c } default { #dquotedkey, string,literal, multistring append tok $c } } } else { switch -- $state { multistring-space { set_tokenType stringpart set tok "" if {$had_slash} { append tok "\\" } append tok "#" } multiliteral-space { set_tokenType "literalpart" set tok "#" } default { #start of token if we're not in a token set_tokenType comment set tok "" ;#The hash is not part of the comment data } } } } lc { #left curly brace set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i [tcl::string::length $tok] set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} append tok $c } starttablename - starttablearrayname { #*bare* tablename can only contain letters,digits underscores error "tomlish Invalid tablename first character \{ [tomlish::parse::report_line]" } tablename - tablearrayname { #valid in quoted parts append tok $c } comment { if {$had_slash} {append tok "\\"} append tok "\{" } default { #end any other token. incr i -1 return 1 } } } else { switch -exact -- $state { itable-keyval-value-expected - keyval-value-expected { #switch last key to tablename?? set_tokenType "startinlinetable" set tok "\{" return 1 } array-space - array-syntax { #nested anonymous inline table set_tokenType "startinlinetable" set tok "\{" return 1 } table-space { #invalid - but allow parser statemachine to report it. ? set_tokenType "startinlinetable" set tok "\{" return 1 } multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} { append tok "\\" } append tok "\{" } multiliteral-space { set_tokenType "literalpart" set tok "\{" } default { error "tomlish state: '$state'. left brace case not implemented [tomlish::parse::report_line]" } } } } rc { #right curly brace set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } string - dquotedkey - comment { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} append tok $c } starttablename - tablename { if {$had_slash} {append tok "\\"} #invalid! - but leave for datastructure loading stage to catch set_token_waiting type endinlinetable value "" complete 1 startindex $cindex return 1 } starttablearrayname - tablearrayname { if {$had_slash} {append tok "\\"} #invalid! - but leave for datastructure loading stage to catch set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex return 1 } default { #end any other token incr i -1 return 1 } } } else { #$slash_active not relevant when no tokenType switch -exact -- $state { table-space { #invalid - but allow parser statemachine to report it. ? 
set_tokenType "endinlinetable" set tok "\}" return 1 } itable-space { set_tokenType "endinlinetable" set tok "\}" return 1 } tablename-state { #e.g [] - empty tablename - allowed or not? #empty tablename/tablearrayname ? #error "unexpected tablename problem" set_tokenType "endinlinetable" set tok "" ;#no output into the tomlish list for this token return 1 } tablearrayname-state { error "tomlish unexpected tablearrayname-state problem" set_tokenType "endinlinetable" set tok "" ;#no output into the tomlish list for this token return 1 } array-syntax - array-space { #invalid set_tokenType "endinlinetable" set tok "\}" return 1 } itable-val-tail { set_tokenType "endinlinetable" set tok "" #we need to pop the keyval - and then reprocess to pop the inlinetable - so we incr -1 incr i -1 return 1 } itable-keyval-syntax { error "tomlish endinlinetable unexpected at this point. Expecting key=val syntax [tomlish::parse::report_line]" } multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} { append tok "\\" } append tok "\}" } multiliteral-space { set_tokenType "literalpart" ; #review set tok "\}" } default { #JMN2024b keyval-tail? error "tomlish state '$state'. endinlinetable case not implemented [tomlish::parse::report_line]" } } } } lb { #left square bracket set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} append tok $c } starttablename { #change the tokenType switch_tokenType "starttablearrayname" set tok "" ;#no output into the tomlish list for this token #any following whitespace is part of the tablearrayname, so return now return 1 } tablename - tablearrayname { #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token if {$had_slash} { #resultant tablename may be invalid - but leave for datastructure loading stage to catch #append tok "\\[" append tok {\[} } else { if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { #invalid at this point - state machine should disallow: # table -> starttablearrayname # tablearray -> starttablearrayname set_token_waiting type starttablearrayname value "" complete 1 startindex $cindex return 1 } else { #we appear to still be in single or double quoted section append tok "\[" } } } comment { if {$had_slash} {append tok "\\"} append tok "\[" } default { #end any other token. incr i -1 return 1 } } } else { #$slash_active not relevant when no tokenType switch -exact -- $state { keyval-value-expected - itable-keyval-value-expected { set_tokenType "startarray" set tok "\[" return 1 } array-space - array-syntax { #nested array? set_tokenType "startarray" set tok "\[" return 1 #error "state: array-space. 
startarray case not implemented [tomlish::parse::report_line]" } table-space { #table name #assume it's a single bracket - but we need to wait for non-bracket to confirm it's not a tablearray #note that a starttablearrayname token may contain whitespace between the brackets # e.g \[ \[ set_tokenType "starttablename" set tok "" ;#there is no output into the tomlish list for this token } multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} { append tok "\\" } append tok "\[" } multiliteral-space { set_tokenType "literalpart" set tok "\[" } itable-space { #handle state just to give specific error msg error "tomlish state: '$state'. Left square bracket invalid. Cannot start array in inline table without key. Use key=\[\] syntax. [tomlish::parse::report_line]" } default { error "tomlish state: '$state'. startarray case not implemented [tomlish::parse::report_line]" } } } } rb { #right square bracket set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { append tok $c } string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} append tok $c } comment { if {$had_slash} {append tok "\\"} append tok $c } whitespace { if {$state eq "multistring-space"} { #???? incr i -1 if {$had_slash} {incr i -1} ;#reprocess return 1 } else { incr i -1 if {$had_slash} {incr i -1} ;#reprocess return 1 } } starttablename { #toml-test invalid/table/empty set_token_waiting type tablename value "" complete 1 startindex $cindex incr i -1 return 1 } tablename { #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token if {$had_slash} { #resultant tablename may be invalid - but leave for datastructure loading stage to catch append tok "\\]" } else { if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { set_token_waiting type endtablename value "" complete 1 startindex $cindex return 1 } else { #we appear to still be in single or double quoted section append tok "]" } } } tablearrayname { #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token if {$had_slash} { #resultant tablename may be invalid - but leave for datastructure loading stage to catch append tok "\\]" } else { if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex return 1 } else { #we appear to still be in single or double quoted section append tok "]" } } } default { incr i -1 return 1 } } } else { #$slash_active not relevant when no tokenType switch -exact -- $state { array-syntax - array-space { #invalid - but allow parser statemachine to report it. set_tokenType "endarray" set tok "\]" return 1 } table-space { #invalid - but allow parser statemachine to report it. ? 
set_tokenType "endarray" set tok "\]" return 1 } tablename-state { #e.g [] - empty tablename #tomltest 1.1.0 invalid/table/empty #should be invalid #we parse it and let dict::from_tomlish error when it tries to split table set_tokenType "endtablename" set tok "" ;#no output into the tomlish list for this token return 1 } tablearrayname-state { error "tomlish unexpected tablearrayname problem" set_tokenType "endtablearray" set tok "" ;#no output into the tomlish list for this token return 1 } tablearrayname-tail { #[[xxx] set_tokenType "endtablename" #sequence: starttablename -> starttablearrayname -> endtablearrayname -> endtablename return 1 } multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} { append tok "\\" } append tok "\]" } multiliteral-space { set_tokenType "literalpart" set tok "\]" } default { error "tomlish state '$state'. endarray case not implemented [tomlish::parse::report_line]" } } } } bsl { #backslash if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } whitespace { if {$state eq "multistring-space"} { #end whitespace token incr i -1 ;#reprocess bsl in next run return 1 } else { error "tomlish Unexpected backslash during whitespace. [tomlish::parse::report_line]" } } literal - literalpart - squotedkey { #never need to set slash_active true when in single quoted tokens append tok "\\" set slash_active 0 } string - dquotedkey - comment { if {$slash_active} { set slash_active 0 append tok "\\\\" } else { set slash_active 1 } } stringpart { if {$slash_active} { set slash_active 0 append tok "\\\\" } else { set slash_active 1 } } starttablename - starttablearrayname { error "tomlish backslash is invalid as first character of $tokenType [tomlish::parse::report_line]" } tablename - tablearrayname { if {$slash_active} { set slash_active 0 append tok "\\\\" } else { set slash_active 1 } } barekey { error "tomlish Unexpected backslash during barekey. [tomlish::parse::report_line]" } default { error "tomlish Backslash unexpected during tokentype: '$tokenType'. [tomlish::parse::report_line]" } } } else { switch -exact -- $state { multistring-space { if {$slash_active} { set_tokenType "stringpart" set tok "\\\\" set slash_active 0 } else { set slash_active 1 } } multiliteral-space { #nothing can be escaped in multiliteral-space - not even squotes (?) review set_tokenType "literalpart" set tok "\\" } default { error "tomlish tok error: Unexpected backslash when no token is active. 
[tomlish::parse::report_line]" } } } } sq { #single quote set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { tentative_accum_squote { #for within multiliteral #short tentative_accum_squote tokens are returned if active upon receipt of any other character #longest allowable for leading/trailing are returned here #### set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote #assert state = trailing-squote-space append tok $c if {$existingtoklen == 4} { #maxlen to be a tentative_accum_squote is multisquote + 2 = 5 #return tok with value ''''' return 1 } } tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space switch -- [tcl::string::length $tok] { 1 { #no conclusion can yet be reached append tok $c } 2 { #enter multiliteral #switch? append tok $c set_tokenType triple_squote return 1 } default { #if there are more than 3 leading squotes we also enter multiliteral space and the subsequent ones are handled #by the tentative_accum_squote check for ending sequence which can accept up to 5 and reintegrate the #extra 1 or 2 squotes as data. error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" } } } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } whitespace { #end whitespace incr i -1 ;#reprocess sq return 1 } literal { #slash_active always false #terminate the literal set_token_waiting type endsquote value "'" complete 1 startindex $cindex return 1 } literalpart { #ended by ''' - but final could be '''' or ''''' (up to 2 squotes allowed directly before ending triple squote sequence) #todo # idea: end this literalpart (possibly 'temporarily') # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) incr i -1 ;#throw the "'" back to loop - will be added to a tentative_accum_squote token for later processing return 1 } XXXitablesquotedkey { set_token_waiting type endsquote value "'" complete 1 startindex $cindex return 1 } squotedkey { ### #set_token_waiting type endsquote value "'" complete 1 return 1 } starttablename - starttablearrayname { #!!! incr i -1 return 1 } tablename - tablearrayname { append tok $c } barekey { #barekeys now support all sorts of unicode letter/number chars for other cultures #but not punctuation - not even for those of Irish heritage who don't object #to the anglicised form of some names. # o'shenanigan seems to not be a legal barekey #The Irish will have to use an earlier form Ó - which apparently many may prefer anyway. error "tomlish Unexpected single quote during barekey. [tomlish::parse::report_line]" } default { append tok $c } } } else { switch -exact -- $state { array-space - keyval-value-expected - itable-keyval-value-expected { #leading squote #pseudo-token _start_squote_sequence ss not received by state machine #This pseudotoken will trigger production of single_squote token or triple_squote token #It currently doesn't trigger double_squote token #(handle '' same as 'x' ie produce a single_squote and go into processing literal) #review - producing double_squote for empty literal may be slightly more efficient. 
#This token is not used to handle squote sequences *within* a multiliteral set_tokenType "_start_squote_sequence" set tok "'" } multiliteral-space { #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row #we are building up a tentative_accum_squote to determine if #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines #b) it is exactly ''' and we can terminate the whole multiliteral #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space set_tokenType "tentative_trigger_squote" ;#trigger tentative_accum_squote set tok "'" return 1 } table-space - itable-space { #tests: squotedkey.test squotedkey_itable.test set_tokenType "squotedkey" set tok "" } XXXtable-space - XXXitable-space { #future - could there be multiline keys? MLLKEY, MLBKEY ? #this would (almost) allow arbitrary tcl dicts to be stored in toml (aside from escaping issues) #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files #@2025 ABNF for toml mentions key, simple-key, unquoted-key, quoted-key and dotted-key #where key is simple-key or dotted-key - no MLL or MLB components #the spec states solution for arbitrary binary data is application specific involving encodings #such as hex, base64 set_tokenType "_start_squote_sequence" set tok "'" return 1 } tablename-state { #first char in tablename-state/tablearrayname-state set_tokenType "tablename" append tok "'" } tablearrayname-state { set_tokenType "tablearrayname" append tok "'" } literal-state { #shouldn't get here? review tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" set_tokenType "literal" incr -1 return 1 } multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} append tok "," #error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" } dottedkey-space { set_tokenType "squotedkey" } default { error "tomlish unhandled squote during state '$state'. [tomlish::parse::report_line]" } } } } dq { #double quote set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { newline { #review #incomplete newline set_tokenType "cr" incr i -1 return 1 } tentative_accum_squote { incr i -1 return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } tentative_accum_dquote { #within multistring #short tentative_accum_dquote tokens are returned if active upon receipt of any other character #longest allowable for leading/trailing are returned here #### set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote #assert state = trailing-squote-space append tok $c if {$existingtoklen == 4} { #maxlen to be a tentative_accum_dquote is multidquote + 2 = 5 #return tok with value """"" return 1 } } _start_dquote_sequence { #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space switch -- [tcl::string::length $tok] { 1 { #no conclusion can yet be reached append tok $c } 2 { #enter multistring #switch? append tok $c set_tokenType triple_dquote return 1 } default { #if there are more than 3 leading dquotes we also enter multistring space and the subsequent ones are handled #by the tentative_accum_dquote check for ending sequence which can accept up to 5 and reintegrate the #extra 1 or 2 dquotes as data. 
error "tomlish unexpected token length [tcl::string::length $tok] in '_start_dquote_sequence'" } } } literal - literalpart { append tok $c } string { if {$had_slash} { append tok "\\" $c } else { #unescaped quote always terminates a string set_token_waiting type enddquote value "\"" complete 1 startindex $cindex return 1 } } stringpart { #sub element of multistring if {$had_slash} { append tok "\\" $c } else { incr i -1 ;#throw the {"} back to loop - will be added to a tentative_accum_dquote token for later processing return 1 } } whitespace { #assert: had_slash will only ever be true in multistring-space if {$had_slash} { incr i -2 return 1 } else { #end whitespace token - throw dq back for reprocessing incr i -1 return 1 } } comment { if {$had_slash} {append tok "\\"} append tok $c } XXXdquotedkey { if {$had_slash} { append tok "\\" append tok $c } else { set_token_waiting type enddquote value "\"" complete 1 startindex $cindex return 1 } } dquotedkey { ### if {$had_slash} { append tok "\\" append tok $c } else { #set_token_waiting type enddquote value {"} complete 1 return 1 } } squotedkey { append tok $c } tablename - tablearrayname { if {$had_slash} {append tok "\\"} append tok $c } starttablename - starttablearrayname { incr i -1 ;## return 1 } default { error "tomlish got quote during tokenType '$tokenType' [tomlish::parse::report_line]" } } } else { #$slash_active not relevant when no tokenType #token is string only if we're expecting a value at this point switch -exact -- $state { array-space - keyval-value-expected - itable-keyval-value-expected { #leading dquote #pseudo-token _start_squote_sequence ss not received by state machine #This pseudotoken will trigger production of single_dquote token or triple_dquote token #It currently doesn't trigger double_dquote token #(handle "" same as "x" ie produce a single_dquote and go into processing string) #review - producing double_dquote for empty string may be slightly more efficient. 
#This token is not used to handle dquote sequences once *within* a multistring set_tokenType "_start_dquote_sequence" set tok {"} } multistring-space { if {$had_slash} { set_tokenType "stringpart" set tok "\\\"" } else { #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row #we are building up a tentative_accum_squote to determine if #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines #b) it is exactly ''' and we can terminate the whole multiliteral #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space set_tokenType "tentative_trigger_dquote" ;#trigger tentative_accum_dquote set tok {"} return 1 } } multiliteral-space { set_tokenType "literalpart" set tok "\"" } table-space - itable-space { set_tokenType "dquotedkey" set tok "" } dottedkey-space { set_tokenType dquotedkey set tok "" #only if complex keys become a thing #set_tokenType dquote_seq_begin #set tok $c } tablename-state { set_tokenType tablename set tok $c } tablearrayname-state { set_tokenType tablearrayname set tok $c } default { error "tomlish Unexpected dquote during state '$state' [tomlish::parse::report_line]" } } } } = { set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { newline { #review #incomplete newline set_tokenType "cr" incr i -1 return 1 } tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } literal - literalpart - squotedkey { #assertion had_slash 0 append tok $c } string - comment - dquotedkey { #for these tokenTypes an = is just data. if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} append tok $c } whitespace { if {$state eq "multistring-space"} { incr i -1 return 1 } else { set_token_waiting type equal value = complete 1 startindex $cindex return 1 } } barekey { #set_token_waiting type equal value = complete 1 incr i -1 return 1 } starttablename - starttablearrayname { error "tomlish Character '=' is invalid first character for $tokenType. [tomlish::parse::report_line]" } tablename - tablearrayname { #invalid in bare name - but valid in quoted parts - leave for tablename parser to sort out append tok $c } default { error "tomlish unexpected = character during tokentype $tokenType. case not implemented. [tomlish::parse::report_line]" } } } else { switch -exact -- $state { multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} { append tok "\\" } append tok = } multiliteral-space { set_tokenType "literalpart" set tok "=" } dottedkey-space { set_tokenType "equal" set tok "=" return 1 } default { set_tokenType "equal" set tok = return 1 } } } } cr { #REVIEW! # \r carriage return if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { newline { #we have received a double cr ::tomlish::log::warn "double cr - will generate cr token. 
needs testing" set_tokenType "cr" ;#lone cr token will generally raise an error - but let state machine handle it incr i -1 return 1 } tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } literal { append tok $c } literalpart { #part of MLL string (multi-line literal string) #we need to split out crlf as a separate NEWLINE to be consistent ::tomlish::log::warn "literalpart ended by cr - needs testing" #return literalpart temporarily - allow cr to be reprocessed from multiliteral-space incr i -1 return 1 } stringpart { #stringpart is a part of MLB string (multi-line basic string) #throw back the cr - if followed by lf it will become a {NEWLINE crlf} entry within the MULTISTRING list (e.g between STRINGPART entries) incr i -1 return 1 } starttablename - starttablearrayname { error "tomlish Character is invalid first character for $tokenType. [tomlish::parse::report_line]" } tablename - tablearrayname { #could in theory be valid in quoted part of name #review - might be better just to disallow here append tok $c } whitespace { #it should technically be part of whitespace if not followed by lf #but outside of values we are also free to map it to be another NEWLINE instead? REVIEW incr i -1 return 1 } untyped_value { incr i -1 return 1 } comment { #JJJJ #review incr i -1 return 1 } default { #!todo - error out if cr inappropriate for tokenType append tok $c } } } else { #lf may be appended if next #review - lone cr as newline? - this is uncommon - but so is lone cr in a string(?) set_tokenType "newline" set tok cr } } lf { # \n newline set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { newline { #review #this lf is the trailing part of a crlf append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok return 1 } tentative_accum_squote - tentative_accum_dquote { #multiliteral or multistring incr i -1 return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } literal { #nl is not allowed *within* a literal - require multiliteral syntax for any literal containing a newline ''' ''' #even though we terminate the literal without the closing quote here - the token_waiting newline should trigger a state error set_token_waiting type newline value lf complete 1 startindex $cindex return 1 } literalpart { #we allow newlines - but store them within the multiliteral as their own element #This is a legitimate end to the literalpart - but not the whole multiliteral set_token_waiting type newline value lf complete 1 startindex $cindex return 1 } stringpart { if {$had_slash} { #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] incr i -1 return 1 } else { set_token_waiting type newline value lf complete 1 startindex $cindex return 1 } } starttablename - tablename - tablearrayname - starttablearrayname { error "tomlish Character is invalid in $tokenType. [tomlish::parse::report_line]" } default { #newline ends all other tokens. 
#note for string: we don't add (raw unescaped) newline to simple string. (must use multi-string for this) #note for whitespace: # we will use the convention that \n terminates the current whitespace even if whitespace follows # ie whitespace is split into separate whitespace tokens at each newline #puts "-------------- newline lf during tokenType $tokenType" set_token_waiting type newline value lf complete 1 startindex $cindex return 1 } } } else { switch -exact -- $state { multistring-space { if {$had_slash} { set_tokenType "continuation" set tok "\\" incr i -1 return 1 } else { set_tokenType "newline" set tok lf return 1 } } multiliteral-space { #assert had_slash 0 set_tokenType "newline" set tok "lf" return 1 } default { #ignore slash? error? set_tokenType "newline" set tok lf return 1 } } #if {$had_slash} { # #CONT directly before newline - allows strings_5_byteequivalent test to pass # set_tokenType "continuation" # set tok "\\" # incr i -1 # return 1 #} else { # set_tokenType newline # set tok lf # return 1 #} } } , { set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { newline { #incomplete newline set_tokenType "cr" incr i -1 return 1 } tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } comment - tablename - tablearrayname { if {$had_slash} {append tok "\\"} append tok , } string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { #stringpart can have up to 2 quotes too if {$had_slash} {append tok "\\"} append tok $c } literal - literalpart - squotedkey { #assert had_slash always 0 append tok $c } whitespace { if {$state eq "multistring-space"} { incr i -1 return 1 } else { set_token_waiting type comma value "," complete 1 startindex $cindex return 1 } } default { set_token_waiting type comma value "," complete 1 startindex $cindex if {$had_slash} {append tok "\\"} return 1 } } } else { switch -exact -- $state { multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} append tok "," } multiliteral-space { #assert had_slash 0 set_tokenType "literalpart" set tok "," } default { set_tokenType "comma" set tok "," return 1 } } } } . 
{ set had_slash $slash_active set slash_active 0 if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { newline { #incomplete newline set_tokenType "cr" incr i -1 return 1 } tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } comment - untyped_value { if {$had_slash} {append tok "\\"} append tok $c } string - dquotedkey { if {$had_slash} {append tok "\\"} append tok $c } stringpart { if {$had_slash} {append tok "\\"} append tok $c } literal - literalpart - squotedkey { #assert had_slash always 0 append tok $c } whitespace { switch -exact -- $state { multistring-space { #review if {$had_slash} { incr i -2 } else { incr i -1 } return 1 } xxxdottedkey-space { incr i -1 return 1 } dottedkey-space-tail { incr i -1 return 1 } default { error "tomlish Received period during tokenType 'whitespace' [tomlish::parse::report_line]" } } } starttablename - starttablearrayname { #This would correspond to an empty table name error "tomlish Character '.' is not allowed as first character ($tokenType). [tomlish::parse::report_line]" } tablename - tablearrayname { #subtable - split later - review append tok $c } barekey { #e.g x.y = 1 #we need to transition the barekey to become a structured table name ??? review #x is the tablename y is the key set_token_waiting type dotsep value "." complete 1 startindex $cindex return 1 } default { error "tomlish Received period during tokenType '$tokenType' [tomlish::parse::report_line]" #set_token_waiting type period value . complete 1 #return 1 } } } else { switch -exact -- $state { multistring-space { set_tokenType "stringpart" set tok "" if {$had_slash} {append tok "\\"} append tok "." } multiliteral-space { set_tokenType "literalpart" set tok "." } XXXdottedkey-space { ### obs? set_tokenType "dotsep" set tok "." return 1 } dottedkey-space-tail { ### set_tokenType "dotsep" set tok "." return 1 } default { set_tokenType "untyped_value" set tok "." 
} } } } " " - tab { if {[tcl::string::length $tokenType]} { set had_slash $slash_active set slash_active 0 switch -exact -- $tokenType { newline { #incomplete newline set_tokenType "cr" incr i -1 return 1 } tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } barekey { #todo had_slash - emit token or error #whitespace is a terminator for bare keys #set_token_waiting type whitespace value $c complete 1 incr i -1 return 1 } untyped_value { #unquoted values (int,date,float etc) are terminated by whitespace #set_token_waiting type whitespace value $c complete 1 incr i -1 return 1 } comment { if {$had_slash} { append tok "\\" } append tok $c } string - dquotedkey { if {$had_slash} { append tok "\\" } append tok $c } stringpart { #for stringpart we store WS separately for ease of processing continuations (CONT stripping) if {$had_slash} { #REVIEW #emit the stringpart - go back to the slash incr i -2 return 1 } else { #split into STRINGPART xxx WS " " incr i -1 return 1 } } literal - literalpart - squotedkey { append tok $c } whitespace { if {$state eq "multistring-space"} { append tok $c } else { append tok $c } } starttablename - starttablearrayname { incr i -1 return 1 } tablename - tablearrayname { #include whitespace in the tablename/tablearrayname #Will need to be normalized upon interpreting the tomlish as a datastructure append tok $c } default { error "tomlish Received whitespace space during tokenType '$tokenType' [tomlish::parse::report_line]" } } } else { set had_slash $slash_active set slash_active 0 switch -exact -- $state { tablename-state { #tablename can have leading,trailing and interspersed whitespace! #These will not be treated as whitespace tokens, instead forming part of the name. 
set_tokenType tablename set tok "" if {$had_slash} {append tok "\\"} append tok $c } tablearrayname-state { set_tokenType tablearrayname set tok "" if {$had_slash} {append tok "\\"} append tok $c } multistring-space { if {$had_slash} { set_tokenType "continuation" set tok "\\" incr i -1 return 1 } else { set_tokenType "whitespace" append tok $c } } multiliteral-space { set_tokenType "literalpart" set tok $c } default { if {$had_slash} { error "tomlish unexpected backslash [tomlish::parse::report_line]" } set_tokenType "whitespace" append tok $c } } } } tabX { if {[tcl::string::length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) set slash_active 0 switch -exact -- $tokenType { newline { #incomplete newline set_tokenType "cr" incr i -1 return 1 } tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } barekey { #whitespace is a terminator for bare keys incr i -1 #set_token_waiting type whitespace value $c complete 1 return 1 } untyped_value { #unquoted values (int,date,float etc) are terminated by whitespace #set_token_waiting type whitespace value $c complete 1 incr i -1 return 1 } squotedkey { append tok $c } dquotedkey - string - comment - whitespace { #REVIEW append tok $c } stringpart { #for stringpart we store WS separately for ease of processing continuations (CONT stripping) if {$had_slash} { #REVIEW #emit the stringpart - go back to the slash incr i -2 return 1 } else { #split into STRINGPART aaa WS " " incr i -1 return 1 } } literal - literalpart { append tok $c } starttablename - starttablearrayname { incr i -1 return 1 } tablename - tablearrayname { #include whitespace in the tablename/tablearrayname #Will need to be normalized upon interpreting the tomlish as a datastructure append tok $c } default { error "tomlish Received whitespace tab during tokenType '$tokenType' [tomlish::parse::report_line]" } } } else { set had_slash $slash_active if {$slash_active} { set slash_active 0 } switch -exact -- $state { tablename-state { #tablename can have leading,trailing and interspersed whitespace! #These will not be treated as whitespace tokens, instead forming part of the name. set_tokenType tablename set tok $c } tablearrayname-state { set_tokenType tablearrayname set tok $c } multistring-space { if {$had_slash} { set_tokenType "continuation" set tok "\\" incr i -1 return 1 } else { set_tokenType whitespace append tok $c } } multiliteral-space { set_tokenType "literalpart" set tok $c } default { set_tokenType "whitespace" append tok $c } } } } bom { #bom encoded as single unicode codepoint \uFFEF #BOM (Byte Order Mark) - ignored by token consumer if {[tcl::string::length $tokenType]} { switch -exact -- $tokenType { tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { #assert - tok will be one or two squotes only #A toml literal probably isn't allowed to contain this #but we will parse and let the validator sort it out. 
incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } literal - literalpart { append tok $c } string - stringpart { append tok $c } default { #state machine will generally not have entry to accept bom - let it crash set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex return 1 } } } else { switch -exact -- $state { multiliteral-space { set_tokenType "literalpart" set tok $c } multistring-space { set_tokenType "stringpart" set tok $c } default { set_tokenType "bom" set tok "\uFEFF" return 1 } } } } default { if {[tcl::string::length $tokenType]} { if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. set slash_active 0 switch -exact -- $tokenType { newline { #incomplete newline set_tokenType "cr" incr i -1 return 1 } tentative_accum_squote - tentative_accum_dquote { incr i -1 return 1 } _start_squote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_squote" return 1 } _start_dquote_sequence { incr i -[tcl::string::length $tok] set_tokenType "single_dquote" return 1 } whitespace { if {$state eq "multistring-space"} { incr i -1 return 1 } else { #review incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. return 1 } } barekey { if {[tomlish::utils::is_barekey $c]} { append tok $c } else { error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed. (see tomlish::utils::is_barekey) [tomlish::parse::report_line]" } } starttablename - starttablearrayname { incr i -1 #allow statemachine to set context for subsequent chars return 1 } string - stringpart { append tok $c } default { #e.g comment/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname append tok $c } } } else { set had_slash $slash_active set slash_active 0 switch -exact -- $state { table-space - itable-space { #if no currently active token - assume another key value pair if {[tomlish::utils::is_barekey $c]} { set_tokenType "barekey" append tok $c } else { error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" } } multistring-space { set_tokenType "stringpart" if {$had_slash} { set tok \\$c } else { set tok $c } } multiliteral-space { set_tokenType "literalpart" set tok $c } tablename-state { set_tokenType "tablename" set tok $c } tablearrayname-state { set_tokenType "tablearrayname" set tok $c } dottedkey-space { set_tokenType barekey set tok $c } default { #todo - something like ansistring VIEW to show control chars? set cshow [string map [list \t tab \v vt] $c] tomlish::log::debug "- tokloop char '$cshow' setting to untyped_value while state:$state [tomlish::parse::report_line]" set_tokenType "untyped_value" set tok $c } } } } } } #run out of characters (eof) if {[tcl::string::length $tokenType]} { #check for invalid ending tokens #if {$state eq "err-state"} { # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" #} switch -exact -- $tokenType { _start_squote_sequence { set toklen [tcl::string::length $tok] switch -- $toklen { 1 { #invalid eof with open literal error "tomlish eof reached without closing single quote for string literal. 
[tomlish::parse::report_line]" } 2 { set_tokenType "literal" set tok "" return 1 ##review #set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] #set_tokenType "literal" #set tok "" #return 1 } } } _start_dquote_sequence { set toklen [tcl::string::length $tok] switch -- $toklen { 1 { #invalid eof with open string error "tomlish eof reached without closing double quote for string. [tomlish::parse::report_line]" } 2 { set_tokenType "string" set tok "" return 1 } } } newline { #The only newline token that has still not been returned should have a tok value of "cr" puts "tomlish eof reached - with incomplete newline token '$tok'" if {$tok eq "cr"} { #we convert lone cr to it's own "cr" token elsewhere in the document to allow statemachine to handle it. #(which it should generally do by not handling it ie raising an error - or emitting an ERROR list in the tomlish) #if trailing char is a lone cr - we should encode it the same way as elsewhere that is outside of values # ie as it's own token. switch_tokenType "cr" return 1 } else { #should be unreachable error "tomlish eof reached - with invalid newline token. value: $tok" } } } set_token_waiting type eof value eof complete 1 startindex $i ;#review return 1 } else { ::tomlish::log::debug "- No current tokenType, ran out of characters, setting tokenType to 'eof' [tomlish::parse::report_line]" set tokenType "eof" set tok "eof" } return 0 } #*** !doctools #[list_end] [comment {--- end definitions namespace tomlish::parse ---}] } namespace eval tomlish::huddle { proc from_json {json} { package require huddle package require huddle::json #note - huddle may now contain raw surrogate pair - which cannot be emitted to stdout set h [huddle::json::json2huddle parse $json] } proc from_dict {d} { } #raw - strings must already be processed into values suitable for json e.g surrogate pair escaping proc jsondumpraw {huddle_object {offset " "} {newline "\n"} {begin ""}} { upvar ::huddle::types types set nextoff "$begin$offset" set nlof "$newline$nextoff" set sp " " if {[string equal $offset ""]} {set sp ""} set type [huddle type $huddle_object] switch -- $type { boolean - number { return [huddle get_stripped $huddle_object] } null { return null } string { set data [huddle get_stripped $huddle_object] # JSON permits only oneline string #set data [string map { # \n \\n # \t \\t # \r \\r # \b \\b # \f \\f # \\ \\\\ # \" \\\" # / \\/ # } $data #] return "\"$data\"" } list { set inner {} set len [huddle llength $huddle_object] for {set i 0} {$i < $len} {incr i} { set subobject [huddle get $huddle_object $i] lappend inner [jsondumpraw $subobject $offset $newline $nextoff] } if {[llength $inner] == 1} { return "\[[lindex $inner 0]\]" } return "\[$nlof[join $inner ,$nlof]$newline$begin\]" } dict { set inner {} foreach {key} [huddle keys $huddle_object] { lappend inner [subst {"$key":$sp[jsondumpraw [huddle get $huddle_object $key] $offset $newline $nextoff]}] } #if {[llength $inner] == 1} { # return $inner ;#wrong - breaks with quoted list representation # #FAILS: toml-test valid/comment/tricky #} return "\{$nlof[join $inner ,$nlof]$newline$begin\}" } default { set node [unwrap $huddle_object] #foreach {tag src} $node break lassign $node tag src return [$types(callback:$tag) jsondumpraw $huddle_object $offset $newline $nextoff] } } } } #typed as per toml-test types namespace eval tomlish::typedhuddle { proc from_json {json} { set plainhuddle [tomlish::huddle::from_json $json] error "tomlish::typedhuddle::from_json unimplemented" } proc 
from_dict {d} { package require huddle set h [huddle create] if {[tomlish::dict::is_typeval $d]} { set dtype [dict get $d type] switch -- $dtype { ARRAY { #error "typedhuddle::from_dict ARRAY not yet handled" set h_list [huddle list] set elements [dict get $d value] foreach el $elements { set sub [from_dict $el] huddle append h_list $sub } return $h_list } default { set tinfo [tomlish::dict::convert_typeval_to_tomltest $d] #basic non-container types set h_tdict [huddle create] huddle set h_tdict type [huddle string [dict get $tinfo type]] huddle set h_tdict value [huddle string [dict get $tinfo value]] return $h_tdict } } } else { dict for {dictkey dictval} $d { set jsonkey [tomlish::utils::rawstring_to_jsonstring $dictkey] if {[tomlish::dict::is_typeval $dictval]} { set dtype [dict get $dictval type] switch -- $dtype { ARRAY { #error "typedhuddle::from_dict ARRAY not yet handled" set h_next [huddle list] set elements [dict get $dictval value] foreach el $elements { set sub [from_dict $el] huddle append h_next $sub } } default { set tinfo [tomlish::dict::convert_typeval_to_tomltest $dictval] set tp [dict get $tinfo type] #basic non-container types set h_next [huddle create] ;#dict huddle set h_next type [huddle string [dict get $tinfo type]] huddle set h_next value [huddle string [dict get $tinfo value]] } } huddle set h $jsonkey $h_next } else { #dict set sub [from_dict $dictval] huddle set h $jsonkey $sub } } } return $h } proc is_typeval {huddled} { set htype [huddle type $huddled] if {$htype ne "dict"} { return 0 } if {[huddle keys $huddled] ne {type value}} { return 0 } set tp [huddle type $huddled type] switch -- $tp { string - integer - float - bool - datetime - datetime-local - date-local - time-local { return 1 } } return 0 } #direction from typed json towards toml proc convert_typeval_to_tomlish {huddled} { set htype [huddle get_stripped $huddled type] set hval [huddle get_stripped $huddled value] switch -- $htype { string { #we need to decide here the type of string element to use in toml/tomlish #STRING,MULTISTRING,LITERAL,MULTILITERAL #set unesc [tomlish::utils::unescape_jsonstring $hval] ;#no need - json parser unescaped when creating the huddle set unesc $hval #(huddle::json::json2huddle parse $json) #since it was unescaped any backslashes remaining represent themselves - reapply escape - REVIEW #set hval [string map [list \\ \\\ ] $hval] #JSJS if {[string first \n $unesc] >= 0} { #always use a MULTI if {[string first ' $unesc] >=0} { if {[string first ''' $unesc] >=0} { set dtype MULTISTRING } else { set dtype MULTILITERAL } } else { if {[string first \"\"\" $unesc] >=0} { set dtype MULTILITERAL } else { set dtype MULTISTRING } } } else { #use multi if needed? if {[string first '' $hval] >=0} { if {[string first ''' $unesc] >=0} { set dtype STRING } else { set dtype MULTILITERAL } } elseif {[string first ' $unesc] >= 0} { set dtype STRING } elseif {[string first \"\"\" $unesc] >= 0} { set dtype LITERAL } else { #STRING or LITERAL? 
set dtype STRING } } } datetime - bool { set dtype [string toupper $htype] } float { set dtype FLOAT if {[string is integer -strict $hval]} { #json FLOAT specified as integer - must have dot for toml set hval [expr {double($hval)}] } } integer { set dtype INT } datetime - datetime-local - date-local - time-local { #DDDD #set dtype DATETIME set dtype [string toupper $htype] } default { error "tomlish::typedhuddle::convert_typeval_to_tomlish unrecognised type $htype" } } return [list type $dtype value $hval] } } namespace eval tomlish::toml { proc from_binary {bindata} { set bom "" set b12 [tcl::string::range $bindata 0 1] set b12test [string map [list \xEF\xBB utf8_12 \xFE\xFF bom16be \xFF\xFE utf32le_12 \x00\x00 utf32be_12] $b12] switch -- $b12test { bom16be { #FEFF set bom utf-16be } utf32le_12 { #FFFE set b34 [tcl::string::range $bindata 2 3] if {$b34 eq "\x00\x00"} { set bom utf-32le } else { set bom utf-16le } } utf32be_12 { #0000 set b34 [tcl::string::range $bindata 2 3] if {$b34 eq "\xFE\xFF"} { set bom utf-32be } } utf8_12 { set b3 [tcl::string::index $bindata 2] if {$b3 eq "\xBF"} { set bom utf-8 } } } if {$bom eq ""} { #no bom - assume utf8 - but we read in as binary #if data wasn't actually utf8 we may error here depending on content - or we may just get wrongly encoded chars set tomldata [encoding convertfrom utf-8 $bindata] } elseif {$bom eq "utf-8"} { #utf-8 bom read in as binary set tomldata [encoding convertfrom utf-8 $bindata] #bom now encoded as single unicode char \uFFEF } else { return -code error -errorcode {TOML ENCODING NOTUTF8} "Input not UTF8 encoded according to BOM. Indicated encoding is '$bom' - invalid for toml" } return $tomldata } proc from_tomlish {tomlish} { return [tomlish::encode::tomlish $tomlish] } #todo - rename to taggedjson proc from_tomlish_from_dict_from_typedjson {json} { set d [tomlish::dict::from_typedjson $json] from_tomlish [tomlish::from_dict $d] ;#return tomlish } proc tablename_split {tablename {normalize false}} { #we can't just split on . because we have to handle quoted segments which may contain a dot. #eg {dog."tater.man"} if {$tablename eq ""} { error "tablename_split. No table name segments found. empty tablename" } set sLen [tcl::string::length $tablename] set segments [list] set mode "preval" ;#5 modes: preval, quoted,litquoted, unquoted, postval #quoted is for double-quotes, litquoted is for single-quotes (string literal) set seg "" for {set i 0} {$i < $sLen} {incr i} { if {$i > 0} { set lastChar [tcl::string::index $tablename [expr {$i - 1}]] } else { set lastChar "" } #todo - track\count backslashes properly set c [tcl::string::index $tablename $i] if {$c eq "\""} { if {($lastChar eq "\\")} { #not strictly correct - we could have had an even number prior-backslash sequence #the toml spec would have us error out immediately on bsl in bad location - but we're #trying to parse to unvalidated tomlish set ctest escq } else { set ctest dq } } else { set ctest [string map [list " " sp \t tab] $c] } switch -- $ctest { . { switch -exact -- $mode { preval { error "tablename_split. dot not allowed - expecting a value" } unquoted { #dot marks end of segment. if {![tomlish::utils::is_barekey $seg]} { error "tablename_split. 
unquoted key segment $seg is not a valid toml key" } lappend segments $seg set seg "" set mode "preval" } quoted { append seg $c } litquoted { append seg $c } postval { #got dot in an expected location set mode "preval" } } } dq { #unescaped dquote switch -- $mode { preval { set mode "quoted" set seg "\"" } unquoted { #invalid in barekey - but we are after structure only append seg $c } quoted { append seg $c #JJJJ if {$normalize} { lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] } else { lappend segments $seg } set seg "" set mode "postval" ;#make sure we only accept a dot or end-of-data now. } litquoted { append seg $c } postval { error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" } } } ' { switch -- $mode { preval { append seg $c set mode "litquoted" } unquoted { #single quote inside e.g o'neill - ultimately invalid - but we pass through here. append seg $c } quoted { append seg $c } litquoted { append seg $c #no normalization to do aside from stripping squotes if {$normalize} { lappend segments [tcl::string::range $seg 1 end-1] } else { lappend segments $seg } set seg "" set mode "postval" } postval { error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" } } } sp - tab { switch -- $mode { preval - postval { #ignore } unquoted { #terminates a barekey lappend segments $seg set seg "" set mode "postval" } default { #append to quoted or litquoted append seg $c } } } default { switch -- $mode { preval { set mode unquoted append seg $c } postval { error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" } default { append seg $c } } } } if {$i == $sLen-1} { #end of data ::tomlish::log::debug "End of data: mode='$mode'" switch -exact -- $mode { preval { if {[llength $segments]} { error "tablename_split. Expected a value after last dot separator. tablename: '$tablename'" } else { error "tablename_split. Whitespace only? No table name segments found. tablename: '$tablename'" } } unquoted { if {![tomlish::utils::is_barekey $seg]} { #e.g toml-test invalid/table/with-pound required to fail for invalid barekey error "tablename_split. unquoted key segment $seg is not a valid toml key" } lappend segments $seg } quoted { error "tablename_split. Expected a trailing double quote. tablename: '$tablename'" } litquoted { error "tablename_split. Expected a trailing single quote. tablename: '$tablename'" } postval { #ok - segment already lappended } } } } #note - we must allow 'empty' quoted strings '' & "" # (these are 'discouraged' but valid toml keys) return $segments } #tablenames (& tablearraynames) may contain irrelevant leading, trailing and interspersed whitespace # tablenames can be made up of segments delimited by dots. .eg [ a.b . c ] #trimmed, the tablename becomes {a.b.c} # A segment may contain whitespace if it is quoted e.g [a . b . "c etc " ] #ie whitespace is only irrelevant if it's outside a quoted segment #trimmed, the tablename becomes {a.b."c etc "} proc tablename_trim {tablename} { set segments [tomlish::toml::tablename_split $tablename false] set trimmed_segments [list] foreach seg $segments { lappend trimmed_segments [::string trim $seg " \t"] } return [join $trimmed_segments .] 
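        #A minimal usage sketch for tablename_split above (results assumed from the splitting rules, shown as Tcl lists):
        #  tomlish::toml::tablename_split {dog."tater.man"} false  ;# -> {dog {"tater.man"}}   (raw segments, quotes kept)
        #  tomlish::toml::tablename_split {dog."tater.man"} true   ;# -> {dog tater.man}       (normalized segments)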
} } namespace eval tomlish::dict { namespace export {[a-z]*}; # Convention: export all lowercase namespace path [namespace parent] #from_taggedjson proc from_typedjson {json} { package require huddle package require huddle::json set h [huddle::json::json2huddle parse $json] #json2huddle parse unescapes the basic json escapes \n \\ etc #$h could now contain raw form of surrogate pair (json2huddle parse as at 2025-014 doesn't convert the surrogates - just unescapes?) if {[catch {encoding convertto utf-8 $h} errM]} { #This test suggests we have raw surrogate pairs - REVIEW package require punk::cesu set h [punk::cesu::from_surrogatestring $h] } tomlish::dict::from_typedhuddle $h } proc from_typedhuddle {h} { set resultd [dict create] switch -- [huddle type $h] { dict { foreach k [huddle keys $h] { switch -- [huddle type $h $k] { dict { set huddle_d [huddle get $h $k] #puts stderr "huddle_d: $huddle_d" #set v [huddle get_stripped $h $k] if {[tomlish::typedhuddle::is_typeval $huddle_d]} { dict set resultd $k [tomlish::typedhuddle::convert_typeval_to_tomlish $huddle_d] } else { dict set resultd $k [from_typedhuddle $huddle_d] } } list { set items [huddle get $h $k] set numitems [huddle llength $items] if {$numitems == 0} { dict set resultd $k [list type ARRAY value {}] } else { set arritems [list] for {set i 0} {$i < $numitems} {incr i} { set item [huddle get $items $i] #puts stderr "item: $item" #set v [huddle get $item] if {[tomlish::typedhuddle::is_typeval $item]} { lappend arritems [tomlish::typedhuddle::convert_typeval_to_tomlish $item] } else { lappend arritems [from_typedhuddle $item] } } dict set resultd $k [list type ARRAY value $arritems] } } default { error "dict_from_json unexpected subtype [huddle type $h $k] in dict" } } } } list { set items [huddle get $h] set numitems [huddle llength $items] if {$numitems == 0} { return [list type ARRAY value {}] } else { set arritems [list] for {set i 0} {$i < $numitems} {incr i} { set item [huddle get $items $i] #puts stderr "item: $item" #set v [huddle get $item] if {[tomlish::typedhuddle::is_typeval $item]} { lappend arritems [tomlish::typedhuddle::convert_typeval_to_tomlish $item] } else { lappend arritems [from_typedhuddle $item] } } return [list type ARRAY value $arritems] } } } return $resultd } proc is_typeval {d} { #designed to detect {type value } e.g {type INT value 3}, {type STRING value "blah etc"} #as a sanity check we need to avoid mistaking user data that happens to match same form #consider x.y={type="spud",value="blah"} #The value of type will itself have already been converted to {type STRING value spud} ie never a single element. #check the length of the type as a quick way to see it's a tag - not something else masqerading. expr {[string is dict $d] && [dict size $d] == 2 && [dict exists $d type] && [dict exists $d value] && [llength [dict get $d type]] == 1} } #simple types only - not containers? proc convert_typeval_to_tomltest {d} { set dtype [dict get $d type] set dval [dict get $d value] switch -- $dtype { INT { set testtype integer set dval [expr {$dval}] ;#convert e.g 0xDEADBEEF to 3735928559 } DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - FLOAT - BOOL { #DDDD set testtype [string tolower $dtype] } STRING - MULTISTRING { set testtype string #JJJJ set dval [tomlish::utils::unescape_string $dval] set dval [tomlish::utils::rawstring_to_jsonstring $dval] } LITERAL - MULTILITERAL { set testtype string #don't validate on way out to json here? 
#decoder should validate by calling tomlish::from_dict #if {![tomlish::utils::rawstring_is_valid_literal $dval]} { # return -code error -errorcode {TOML SYNTAX INVALIDLITERAL} $msg #} set dval [tomlish::utils::rawstring_to_jsonstring $dval] } MULTILITERAL { #todo - escape newlines for json? set testtype string } default { error "convert_typeval_to_tomltest unhandled type $dtype" } } return [list type $testtype value $dval] } # Check that each leaf is a typeval or typeval dict #importantly: must accept empty dict leaves e.g {x {}} proc is_typeval_dict {d {checkarrays 0}} { if {![string is dict $d]} { return 0 } dict for {k v} $d { set is_d 0 if {!([is_typeval $v] || [set is_d [is_typeval_dict $v $checkarrays]])} { return 0 } if {!$is_d} { set vtype [dict get $v type] switch -- $vtype { INT - FLOAT - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - BOOL - LITERAL - STRING - MULTILITERAL - MULTISTRING {} ARRAY { if {$checkarrays} { set arrdata [dict get $v value] foreach el $arrdata { if {![is_typeval_dict $el $checkarrays]} { return 0 } } } } default { puts stderr "is_typeval_dict: Unexpected type '$vtype'" return 0 } } } } return 1 } proc last_tomltype_posn {d} { set last_simple -1 set dictposn [expr {[dict size $d] -1}] foreach k [lreverse [dict keys $d]] { set dval [dict get $d $k] if {[is_typeval $dval]} { set last_simple $dictposn break } incr dictposn -1 } return $last_simple } #review proc name_from_tablestack {tablestack} { set name "" foreach tinfo [lrange $tablestack 1 end] { lassign $tinfo type namepart switch -- $type { T { if {$name eq ""} { append name $namepart } else { append name .$namepart } } I { if {$name eq ""} { append name $namepart } else { append name .$namepart } } default { #end at first break in the leading sequence of T & I tablenames break } } } return $name } #tablenames_info is a flat dict with the key being an '@@' path proc _show_tablenames {tablenames_info} { #e.g {@l@a @@b} {ttype header_table tdefined closed} append msg \n "tablenames_info:" \n dict for {tkey tinfo} $tablenames_info { append msg " " "table: $tkey" \n dict for {field finfo} $tinfo { append msg " " "$field $finfo" \n } } return $msg } #take a raw string and classify: result is a 2 element list comprised of KEY|SQKEY|DQKEY and the value being the appropriate inner string proc classify_rawkey {rawval} { if {![::tomlish::utils::is_barekey $rawval]} { #requires quoting # #Any dot in the key would have been split by dict::from_tomlish - so if it's present here it's part of this key - not a level separator! # #we'll use a basic mechanisms for now to determine the type of quoting # - whether it has any single quotes or not. # (can't go in an SQKEY) # - whether it has any chars that require quoting when in a Bstring # (if so - then its visual representation might be unsuitable for a key in a toml text file, so escape and put in DQKEY instead of literal SQKEY) #todo - more? #REVIEW - the backslash might often be in things like a regex or windows path - which is often better expressed in a literal SQKEY # from literal examples: # 'c:\Users\nodejs\templates' # '<\i\c*\s*>' #If these are in *keys* our basic test will express these as: # "c:\\Users\\nodejs\\templates" # "<\\i\\c*\\s*>" # This still works - but a smarter test might determine when SQKEY is the better form? #when coming from external systems - can we even know if the value was already escaped? 
REVIEW #Probably when coming from json - we know it's already escaped - and so we build our dict converting keys to unescaped #TODO - clarify in documentation that keys resulting from dict::from_tomlish are in 'normalized' (unescaped) form # #For keys - we currently (2025) are only allowed barekeys,basic strings and literal strings. (no multiline forms) set k_escaped [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $rawval] if {[string length $k_escaped] != [string length $rawval]} { #escaping made a difference set has_escape_requirement 1 } else { set has_escape_requirement 0 } if {[string first ' $rawval] >=0 || $has_escape_requirement} { #basic string # (any ANSI SGR sequence will end up here in escaped form ) return [list DQKEY $k_escaped] } else { #literal string return [list SQKEY $rawval] } } else { return [list KEY $rawval] } } #the quoting implies the necessary escaping for DQKEYs proc join_and_quote_rawkey_list {rawkeylist} { set result "" foreach rk $rawkeylist { lassign [tomlish::dict::classify_rawkey $rk] type val switch -- $type { SQKEY { append result "'$val'." } DQKEY { append result "\"$val\"." } KEY { append result "$val." } } } return [string range $result 0 end-1] } proc _process_tomlish_dottedkey {element {context_refpath {}}} { upvar tablenames_info tablenames_info upvar datastructure datastructure set dottedtables_defined [list] set dkey_info [tomlish::get_dottedkey_info $element] #e.g1 keys {x.y y} keys_raw {'x.y' "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) #e.g2 keys {x.y y} keys_raw {{"x.y"} "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) #[a.b] #t1.t2.dottedtable.leafkey = "val" #we have already checked supertables a & {a b} # - in basic case, passed in context_refpath as {@@a @@b} # - our context_refpath could also include some combination of keys and array indices e.g {@@a @@b 3 @@subtablekey} #We need to check {a b t1} & {a b t2} ('creation' only) #and then 'dottedtable' is 'defined' while leafkey is an ordinary key in dottedtable #note we also get here as a 'dottedkey' with the following even though there is no dot in k #[a.b] #leafkey = "val" set all_dotted_keys [dict get $dkey_info keys] set dottedkeyname [join $all_dotted_keys .] if {[llength $all_dotted_keys] > 1} { #dottedtable.k=1 #tX.dottedtable.k=1 #etc #Wrap in a list so we can detect 'null' equivalent. #We can't use empty string as that's a valid dotted key segment set dottedtable_bag [list [lindex $all_dotted_keys end-1]] set dotparents [lrange $all_dotted_keys 0 end-2] } else { #basic case - not really a 'dotted' key #k = 1 set dottedtable_bag [list] ;#empty bag set dotparents [list] } #assert dottedtable_bag only ever holds 0 or 1 elements set leaf_key [lindex $all_dotted_keys end] #see also: https://github.com/toml-lang/toml/issues/846 "Can dotted keys insert into already-defined [tables]?" #This code was originally written with a misinterpretation of: #"Dotted keys create and define a table for each key part before the last one, provided that such tables were not previously created." # 'each key part before the last one' refers to each key in a single dotted key entry # not each 2nd-to last key in a list of dotted keys. #we've already tested the table/tablearray keys that got us here.. 
but not the dottedkey segments (if any) prior to dottedtable & leaf_key set dottedsuper_refpath $context_refpath foreach normkey $dotparents { lappend dottedsuper_refpath @@$normkey if {![dict exists $tablenames_info $dottedsuper_refpath ttype]} { #supertable with this combined path (context_path plus parts of dottedkey) not yet 'created' if {[tomlish::dict::path::exists $datastructure $dottedsuper_refpath]} { #There is data so it must have been created as a keyval set msg "Path $dottedsuper_refpath for dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] #raise a specific type of error for tests to check return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg } #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here #dict set tablenames_info $dottedsuper_refpath ttype unknown_table ;#REVIEW dict set tablenames_info $dottedsuper_refpath ttype unknown_dotted ;#REVIEW #see note above re dotted keys insert into already defined table - we need to 'define' all the dotted supers in this block lappend dottedtables_defined $dottedsuper_refpath #ensure empty tables are still represented in the datastructure tomlish::dict::path::set_endpoint datastructure $dottedsuper_refpath {} ;#set to empty subdict } else { #added for fixed assumption set ttype [dict get $tablenames_info $dottedsuper_refpath ttype] set definedstate [dictn getdef $tablenames_info [list $dottedsuper_refpath tdefined] NULL] switch -- $ttype { dottedkey_table - unknown_dotted { #'created' as dotted - but make sure it's from this header section - i.e defined not set if {$definedstate ne "NULL"} { #collision with some other dottedkey set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (table redefinition) - invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg } } itable { #itables are immediately defined set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' as itable (table redefinition) - invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg } default { #header_table, header_tablearray or unknown_header #is header_tablearray any different from header_table in this context? #we don't set tdefined for tablearray anyway - so should be ok here. if {$definedstate ne "NULL"} { set msg "Table at $dottedsuper_refpath represented by dottedkey $dottedkeyname has been 'defined' in a header (table redefinition) - invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg } } } } } #dottedtable being 2nd last segment was for original assumption - todo - tidy up? we are duplicating the logic above #review - any need/advantage to treat 2nd to last key any different from other supers? ie D in a.b.c.D.key=1 #no need for 'unknown_dotted' vs 'dottedkey_table' ?? 
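            #Illustrative refpath shapes for the checks below (hypothetical example, key names assumed):
            #  toml:                 [a.b]
            #                        x.y.z = 1
            #  context_refpath:      {@@a @@b}
            #  dotparent x:          checked/'created' above as {@@a @@b @@x}
            #  dottedtable_bag (y):  'defined' below via dottedkey_refpath {@@a @@b @@x @@y}
            #  leaf z:               stored below via fullkey_refpath {@@a @@b @@x @@y @@z}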
if {[llength $dottedtable_bag] == 1} { set dottedtable [lindex $dottedtable_bag 0] set dottedkey_refpath [list {*}$dottedsuper_refpath "@@$dottedtable"] #our dotted key is attempting to define a table if {![dict exists $tablenames_info $dottedkey_refpath ttype]} { #first one - but check datastructure for collisions if {[tomlish::dict::path::exists $datastructure $dottedkey_refpath]} { set msg "Path $dottedkey_refpath for dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] #raise a specific type of error for tests to check return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg } #'create' the table dict set tablenames_info $dottedkey_refpath ttype dottedkey_table #don't actually set 'defined' here.. use the end of TABLE record to close them off by looking at this list tomlish::dict::path::set_endpoint datastructure $dottedkey_refpath {} lappend dottedtables_defined $dottedkey_refpath # } else { #exists - but might be from another dottedkey within the current header section #the table is open for adding keys until the next 'header' section ([tablename] / [[tablearray]]) #check for 'defined' closed (or just existence) set ttype [dict get $tablenames_info $dottedkey_refpath ttype] set definedstate [dictn getdef $tablenames_info [list $dottedkey_refpath tdefined] NULL] switch -- $ttype { dottedkey_table - unknown_dotted { #'created' as dotted - but make sure it's from this header section - i.e defined not set if {$definedstate ne "NULL"} { #collision with some other dottedkey set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (table redefinition) - invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg } } itable { #itables are immediately defined set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' as itable (table redefinition) - invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg } default { #header_table, header_tablearray or unknown_header #is header_tablearray any different from header_table in this context? #we don't set tdefined for tablearray anyway - so should be ok here. if {$definedstate ne "NULL"} { set msg "Table at $dottedkey_refpath represented by dottedkey $dottedkeyname has been 'defined' in a header (table redefinition) - invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg } } } } } else { set dottedkey_refpath $dottedsuper_refpath } #assert - dottedkey represents a key val pair that can be added set fullkey_refpath [list {*}$dottedkey_refpath @@$leaf_key] if {[tomlish::dict::path::exists $datastructure $fullkey_refpath]} { set msg "Duplicate key. The key (path $fullkey_refpath) already exists at this level in the toml data. The toml data is not valid." 
append msg \n [tomlish::dict::_show_tablenames $tablenames_info] return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg } #set keyval_dict [_get_keyval_value $element] lassign [_get_keyval_value $element] _ keyval_dict _ sub_tablenames_info #keyval_dict is either a {type value } #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level #punk::dict::is_typeval can distinguish tomlish::log::debug "_process_tomlish_dottedkey>>> context:$context_refpath dottedkey $dottedkeyname kv: $keyval_dict" tomlish::dict::path::set_endpoint datastructure $fullkey_refpath $keyval_dict #remove ? #if {![tomlish::dict::is_typeval $keyval_dict]} { # #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys # # inner structure will contain {type value } if all leaves are not empty ITABLES # ##set tkey [list {*}$norm_segments {*}$all_dotted_keys] # #by not creating a tablenames_info record - we effectively make it closed anyway? # #it should be detected as a key # #is there any need to store tablenames_info for it?? # #REVIEW # ##TODO - update? # #dictn incr tablenames_info [list $tkey seencount] # ##if the keyval_dict is not a simple type x value y - then it's an inline table ? # ##if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. # #dictn set tablenames_info [list $tkey closed] 1 #} return [dict create dottedtables_defined $dottedtables_defined] } #tomlish::dict::from_tomlish is a *basic* programmatic datastructure for accessing the data. # produce a dictionary of keys and values from a tomlish tagged list. # dict::from_tomlish is primarily for read access to toml data. #Extraneous (not within quoted sections) whitespace and comments are not preserved in this structure, # so a roundtrip from toml to this datastructure and back to toml will lose whitespace formatting and comments. # creating/changing toml values can be done directly on a tomlish list if preserving (or adding) formatting/comments is desired. #A separate package 'tomlish::object' may be needed to allow easier programmatic creating/updating/deleting of data elements whilst preserving (or adding or selectively deleting/editing) such formatting. # #within an ARRAY, we store a list of items such as plain dicts (possibly empty) and {type value } for simple types #(ARRAYS can be mixed type) #This means our dict structure should have only ARRAY and simple types which need to be in {type value } form #A dict within an array encodeded as a type ITABLE value should also parse - but is the unpreferred form - REVIEW test? #Namespacing? #ie note the difference: #[Data] #temp = { cpu = 79.5, case = 72.0} # versus #[Data] #temps = [{cpu = 79.5, case = 72.0}] proc from_tomlish {tomlish} { package require dictn #keep track of which tablenames have already been directly defined, # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. #we don't error out just because a previous tablename segment has already appeared. #Declaring, Creating, and Defining Tables #https://github.com/toml-lang/toml/issues/795 #(update - only Creating and Defining are relevant terminology) #review #tablenames_info keys ttype created, tdefined, createdby, definedby, closedby ??? 
review keys # [tname] = header_table [[tname]] = header_tablearray #consider the following 2 which are legal: #[table] #'table' created, defined=open type header_table #x.y = 3 #[table.x.z] #'table' tdefined=closed closedby={header_table table.x.z}, 'table.x' created, 'table.x.z' created tdefined=open tdefinedby={header_table table.x.z} #k= 22 # #'table.x.z' tdefined=closed closedby={eof eof} #equivalent datastructure #[table] #'table' created, tdefined=open definedby={header_table table} #[table.x] #'table' tdefined=closed closedby={header_table table.x}, 'table.x' created tdefined=open definedby={header_table table.x} #y = 3 #[table.x.z] #'table.x' tdefined=closed closedby={header_table table.x.z}, 'table.x.z' created tdefined=open definedby={header_table table.x.z} #k=22 #illegal #[table] #'table' created and tdefined=open #x.y = 3 #'table.x' created first keyval pair tdefined=open definedby={keyval x.y = 3} #[table.x.y.z] #'table' tdefined=closed, 'table.x' closed because parent 'table' closed?, 'table.x.y' cannot be created #k = 22 # ## - we would fail on encountering table.x.y because only table and table.x are effectively tables - but that table.x is closed should be detected (?) #illegal #[table] #x.y = {p=3} #[table.x.y.z] #k = 22 ## we should fail because y is an inline table which is closed to further entries #note: it is not safe to compare normalized tablenames using join! # e.g a.'b.c'.d is not the same as a.b.c.d # instead compare {a b.c d} with {a b c d} # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} if {[uplevel 1 [list info exists tablenames_info]]} { upvar tablenames_info tablenames_info } else { set tablenames_info [dict create] ;#keyed on tablepath each of which is an @@path such as {@@config @@subgroup @@etc} (corresponding to config.subgroup.etc) #also has non @@ indexes which are list indexes as taken by tcl list commands (int or end-1 etc) #value is a dict with keys: ttype, tdefined } log::info "---> dict::from_tomlish processing '$tomlish'<<<" set items $tomlish foreach lst $items { if {[lindex $lst 0] ni $::tomlish::tags} { error "supplied list does not appear to be toml parsed into a tomlish tagged list. Run tomlish::decode::toml on the raw toml data to produce a tomlish list" } } if {[lindex $tomlish 0] eq "TOMLISH"} { #ignore TOMLISH tag at beginning set items [lrange $tomlish 1 end] } set datastructure [dict create] set dottedtables_defined [list] foreach item $items { set tag [lindex $item 0] #puts "...> item:'$item' tag:'$tag'" switch -exact -- $tag { KEY - DQKEY - SQKEY - INT - FLOAT - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL - STRING - LITERAL { #why would we get individual key item as opposed to DOTTEDKEY? error "tomlish::dict::from_tomlish error: invalid tag: $tag. 
At the toplevel, from_tomlish can only process WS NEWLINE COMMENT and compound elements DOTTEDKEY TABLE TABLEARRAY ITABLE MULTILITERAL MULTISTRING" } DOTTEDKEY { #toplevel dotted key set dkinfo [_process_tomlish_dottedkey $item] lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] #at any level - we don't expect any more DOTTEDKEY records in a tomlish structure after TABLE or TABLEARRAY are encountered #as those records should encapsulate their own dottedkeys } TABLEARRAY { #close off any dottedtables_defined created by dottedkeys at this level foreach dtablepath $dottedtables_defined { dict set tablenames_info $dtablepath tdefined closed } set dottedtables_defined [list] ;#for closing off at end by setting 'defined' set tablearrayname [lindex $item 1] tomlish::log::debug "---> tomlish::dict::from_tomlish processing item TABLENAME (name: $tablearrayname): $item" set norm_segments [::tomlish::toml::tablename_split $tablearrayname true] ;#true to normalize #we expect repeated tablearray entries - each adding a sub-object to the value, which is an array/list. #tablearrayname is likely to appear multiple times - so unlike a TABLE we don't check for 'defined' for the full name as an indicator of a problem set supertable [list] ############## # [[a.b.c.d]] # norm_segments = {a b c d} #check a {a b} {a b c} <---- supertables of a.b.c.d ############## set refpath [list] ;#e.g @@j1 @@j2 1 @@k1 end foreach normseg [lrange $norm_segments 0 end-1] { lappend supertable $normseg lappend refpath @@$normseg if {![dict exists $tablenames_info $refpath ttype]} { #supertable with this path doesn't yet exist if {[tomlish::dict::path::exists $datastructure $refpath]} { #There is data though - so it must have been created as a keyval set msg "Supertable [join $supertable .] of tablearray name $tablearrayname already has data but doesn't appear to be a table - invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] #raise a specific type of error for tests to check #test: datastructure_tablearray_supertable_keycollision return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg } else { #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here #review - we can't later specify as tablearray so should just set ttype to header_table even though it's being created # because of a tablearray header? #By setting ttype to something other than table_header we can provide more precise errorCode/msg ?? dict set tablenames_info $refpath ttype unknown_header #ensure empty tables are still represented in the datastructure dict set datastructure {*}$supertable [list] } } else { #REVIEW!! # what happens with from_toml {[[a.b.c]]} {[[a.b]]} ??? #presumed that a and a.b were 'created' as tables (supertables of tablearray at a.b.c) and can't now be redefined as tablearrays #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable #but if it's a tablearray - we need to point to the most 'recently defined table element of the array' #(last member of that array - need to check type? allowed to have non-table elements ie nonhomogenous??) set supertype [dict get $tablenames_info $refpath ttype] if {$supertype eq "header_tablearray"} { #exercised by toml-tests: # valid/table/array-table-array # valid/table/array-nest #puts stdout "todict!!! 
TABLEARRAY nesting required for supertable [join $supertable .]" #'refer' to the appropriate element in existing array set arrdata [tomlish::dict::path::get $datastructure [list {*}$refpath @@value]] set idx [expr {[llength $arrdata]-1}] if {$idx < 0} { #existing tablearray should have at least one entry even if empty (review) set msg "reference to empty tablearray?" return -code error -errorcode {TOMLISH STRUCTURE REFTOEMPTYTABLEARRAY} $msg } lappend refpath $idx } } } # #puts "TABLE supertable refpath $refpath" lappend refpath @@[lindex $norm_segments end] tomlish::log::debug "TABLEARRAY refpath $refpath" set tablearray_refpath $refpath if {![dict exists $tablenames_info $tablearray_refpath ttype]} { #first encounter of this tablearrayname if {[tomlish::dict::path::exists $datastructure $tablearray_refpath]} { #e.g from_toml {a=1} {[[a]]} set msg "Cannot create tablearray name $tablearrayname. Key already has data but key doesn't appear to be a table (keycollision) - invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] #raise a specific type of error for tests to check #test: datastructure_tablearray_direct_keycollision_error return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg } #no collision - we can create the tablearray and the array in the datastructure dict set tablenames_info $tablearray_refpath ttype header_tablearray #dict set datastructure {*}$norm_segments [list type ARRAY value {}] #create array along with empty array-item at position zero tomlish::dict::path::set_endpoint datastructure $tablearray_refpath [list type ARRAY value {{}}] set arrayitem_refpath [list {*}$tablearray_refpath 0] #set ARRAY_ELEMENTS [list] } else { #we have an existing tablenames_info record for this path - but is it a tablearray? set ttype [dict get $tablenames_info $tablearray_refpath ttype] if {$ttype ne "header_tablearray"} { #header_table or itable switch -- $ttype { itable {set ttypename itable} header_table {set ttypename table} dottedkey_table {set ttypename dottedkey_table} unknown_header - unknown_dotted { #table was created e.g as supertable - but not specifically a tablearray #violates ordering - return specific test error set msg "Table $tablearrayname referenced as supertable before tablearray defined (ordering)" return -code error -errorcode {TOMLISH STRUCTURE TABLEARRAYORDERING} $msg } default {error "unrecognised type $ttype - expected header_table or itable"} } set msg "tablearray name $tablearrayname already appears to be already created as '$ttypename' not tablearray - invalid?" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] #raise a specific type of error for tests to check return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg } #EXISTING tablearray #add to array #error "add_to_array not implemented" #{type ARRAY value } #set ARRAY_ELEMENTS [dict get $datastructure {*}$norm_segments value] tomlish::log::debug ">>>>pre-extend-array dict::from_tomlish datastructure: $datastructure" set existing_array [tomlish::dict::path::get $datastructure [list {*}$tablearray_refpath @@value]] set arrayitem_refpath [list {*}$tablearray_refpath [llength $existing_array]] tomlish::dict::path::lappend datastructure $tablearray_refpath {} tomlish::log::debug ">>>>post-extend-array dict::from_tomlish datastructure: $datastructure" } #set object [dict create] ;#array context equivalent of 'datastructure' #add to ARRAY_ELEMENTS and write back in to datastructure. 
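                    #Illustrative shape once the contained DOTTEDKEY elements (below) have been processed (values assumed):
                    #  toml:                [[fruit]]
                    #                       name = "apple"
                    #  datastructure:       fruit {type ARRAY value {{name {type STRING value apple}}}}
                    #  tablearray_refpath:  {@@fruit}    arrayitem_refpath: {@@fruit 0}
                    #  a second [[fruit]] header appends another (initially empty) item at index 1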
foreach element [lrange $item 2 end] { set type [lindex $element 0] tomlish::log::debug "----> todict processing $tag subitem $type processing contained element $element" switch -exact -- $type { DOTTEDKEY { set dkinfo [_process_tomlish_dottedkey $element $arrayitem_refpath] lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] } NEWLINE - COMMENT - WS { #ignore } TABLE { #we *perhaps* should be able to process tablearray subtables either as part of the tablearray record, or independently. #(or even a mixture of both, although that is somewhat an edge case, and of limited utility) #[[fruit]] #x=1 # [fruit.metadata] # [fruit.otherdata] #when processing a dict destined for the above - the tomlish generator (e.g from_dict) #should create as 1 or 3 records (but could create 2 records if there was an unrelated table in between the subtables) #choices: all in tablearray record, tablearray + 1 or 2 table records. # #We are going the other way here - so we just need to realise that the list of tables 'belonging' to this tablearray might not be complete. # #the subtable names must be prefixed with the tablearray - we should validate that for any contained TABLE records #The default mechanism is for from_dict to produce tomlish with separate TABLE records - and use the ordering to determine membership #If we were to support wrapping the TABLE records within a TABLEARRAY - we should also support TABLEARRAY within TABLEARRAY # ----------------------------------------------------------------------- #Implementing this is not critical for standard encoding/decoding of toml! #It would be an alternative form for the tomlish intermediate form - and adds complexity. # #The upside would be to provide a function for sorting/rearranging in the tomlish form if all records were fully encapsulated. #A possible downside is that unrelated tables placed before a tablearray is fully defined (within the tablearray definition area in toml) # would have to be re-positioned before or after the encapsulated tablearray record. # While unrelated tables in such a position aren't a recommended way to write toml, they appear to be valid # and preserving the author's ordering is a goal of the basic encoding/decoding operations if no explicit sorting/reordering was requested. # #Consider an 'encapsulate' method to this (tomlish -> tomlish) # ----------------------------------------------------------------------- #todo error "tomlish::dict::from_tomlish TABLE element within TABLEARRAY not handled - TABLE should be a separate tomlish record" } default { error "tomlish::dict::from_tomlish Sub element of type '$type' not understood in tablearray context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" } } } #end of TABLEARRAY record - equivalent of EOF or next header - close off the dottedtables foreach dtablepath $dottedtables_defined { dict set tablenames_info $dtablepath tdefined closed } } TABLE { #close off any dottedtables_defined created by dottedkeys at this level foreach dtablepath $dottedtables_defined { dict set tablenames_info $dtablepath tdefined closed } set tablename [lindex $item 1] set dottedtables_defined [list] ;#for closing off at end by setting 'defined' #As our TABLE record contains all it's child DOTTEDKEY records - this should be equivalent to setting them as defined at EOF or next header. 
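                    #e.g (illustrative):
                    #  [server]
                    #  limits.cpu = 1
                    # -> {@@server @@limits} is collected in dottedtables_defined while this TABLE record is processed,
                    #    then marked 'tdefined closed' at the end of the record, so a later [server.limits] header
                    #    is rejected with errorcode {TOMLISH STRUCTURE TABLEREDEFINED}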
#----------------------------------------------------------------------------------- #default assumption - our reference is to the main tablenames_info and datastructure #Will need to append keys appropriately if we have recursed #----------------------------------------------------------------------------------- log::debug "---> tomlish::dict::from_tomlish processing item TABLE (name: $tablename): $item" set norm_segments [::tomlish::toml::tablename_split $tablename true] ;#true to normalize set name_segments [::tomlish::toml::tablename_split $tablename 0] ;#unnormalized e.g ['a'."b".c.d] -> 'a' "b" c d #results of tablename_split 0 are 'raw' - ie some segments may be enclosed in single or double quotes. set supertable [list] ############## # [a.b.c.d] # norm_segments = {a b c d} #check a {a b} {a b c} <---- supertables of a.b.c.d ############## ############## #[[a]] #[a.b] #supertable a is tablearray ############## #also consider ############## # [[a.b]] # [a.b.c.d] #supertable a is a table, supertable a.b is tablearray, supertable a.b.c is elementtable ############## set refpath [list] ;#e.g @@j1 @@j2 1 @@k1 end foreach normseg [lrange $norm_segments 0 end-1] { lappend supertable $normseg lappend refpath @@$normseg if {![dict exists $tablenames_info $refpath ttype]} { #supertable with this path doesn't yet exist if {[tomlish::dict::path::exists $datastructure $refpath]} { #There is data though - so it must have been created as a keyval set msg "Supertable [join $supertable .] of table name $tablename (path $refpath) already has data but doesn't appear to be a table (keycollision) - invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] #raise a specific type of error for tests to check return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg } #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here #we also don't know whether it's a table or a dottedkey_table (not allowed to be tablearray - out of order?) dict set tablenames_info $refpath ttype unknown_header #ensure empty tables are still represented in the datastructure #dict set datastructure {*}$supertable [list] tomlish::dict::path::set_endpoint datastructure $refpath {} } else { #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable if {[dict get $tablenames_info $refpath ttype] eq "header_tablearray"} { #'refer' to the appropriate element in existing array set arrdata [tomlish::dict::path::get $datastructure [list {*}$refpath @@value]] set idx [expr {[llength $arrdata]-1}] if {$idx < 0} { #existing tablearray should have at least one entry even if empty (review) set msg "reference to empty tablearray?" return -code error -errorcode {TOMLISH STRUCTURE REFTOEMPTYTABLEARRAY} $msg } lappend refpath $idx } else { #?? if {[dictn getdef $tablenames_info [list $refpath tdefined] NULL] eq "NULL"} { } else { } } } } #puts "TABLE supertable refpath $refpath" lappend refpath @@[lindex $norm_segments end] tomlish::log::info "TABLE refpath $refpath" set table_refpath $refpath #table [a.b.c.d] hasn't been defined - but may have been 'created' already by a longer tablename # - or may have existing data from a keyval if {![dict exists $tablenames_info $table_refpath ttype]} { if {[tomlish::dict::path::exists $datastructure $table_refpath]} { #e.g from_toml {a=1} {[a]} set msg "Cannot create table name $tablename. 
Key already has data but key doesn't appear to be a table (keycollision) - invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] #raise a specific type of error for tests to check #test: datastructure_tablename_keyval_collision_error return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg } #no data or previously created table dict set tablenames_info $table_refpath ttype header_table #We are 'defining' this table's keys and values here (even if empty) #dict set datastructure {*}$norm_segments [list] ;#ensure table still represented in datastructure even if we add no keyvals here tomlish::dict::path::set_endpoint datastructure $table_refpath {} ;#ensure table still represented in datastructure even if we add no keyvals here } else { if {[dict get $tablenames_info $table_refpath ttype] eq "header_tablearray"} { #e.g tomltest invalid/table/duplicate-table-array2 #[[tbl]] #[tbl] set msg "Table name $tablename has already been created as a tablearray. Invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg } else { #any other type tdefined is a problem set T_DEFINED [dictn getdef $tablenames_info [list $table_refpath tdefined] NULL] if {$T_DEFINED ne "NULL" } { #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path set msg "Table name $tablename has already been defined in the toml data. Invalid" append msg \n [tomlish::dict::_show_tablenames $tablenames_info] #raise a specific type of error for tests to check return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg } } } dict set tablenames_info $table_refpath tdefined open #now add the contained elements foreach element [lrange $item 2 end] { set type [lindex $element 0] log::debug "----> todict processing $tag subitem $type processing contained element $element" switch -exact -- $type { DOTTEDKEY { set dkinfo [_process_tomlish_dottedkey $element $table_refpath] lappend dottedtables_defined {*}[dict get $dkinfo dottedtables_defined] } NEWLINE - COMMENT - WS { #ignore } default { error "Sub element of type '$type' not understood in table context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" } } } #end of TABLE record - equivalent of EOF or next header - close off the dottedtables foreach dtablepath $dottedtables_defined { dict set tablenames_info $dtablepath tdefined closed } } ITABLE { #As there is no other mechanism to create tables within an ITABLE than dottedkeys # and ITABLES are fully defined/enclosed - we can rely on key collision and don't need to track dottedtables_defined - REVIEW. set dottedtables_defined [list] #SEP??? #ITABLE only ever on RHS of = or inside ARRAY set datastructure [dict create] set tablenames_info [dict create] foreach element [lrange $item 1 end] { set type [lindex $element 0] log::debug "----> tododict processing $tag subitem $type processing contained element $element" switch -exact -- $type { DOTTEDKEY { set dkinfo [_process_tomlish_dottedkey $element] } NEWLINE - COMMENT - WS { #ignore } default { error "Sub element of type '$type' not understood in ITABLE context. 
Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" } } } } ARRAY { #arrays in toml are allowed to contain mixtures of types set datastructure [list] log::debug "--> processing array: $item" foreach element [lrange $item 1 end] { set type [lindex $element 0] log::debug "----> tododict processing $tag subitem $type processing contained element $element" switch -exact -- $type { INT - FLOAT - BOOL - DATETIME - DATETIME-LOCAL - DATE-LOCAL - TIME-LOCAL { set value [lindex $element 1] lappend datastructure [list type $type value $value] } STRING { #JJJJ #don't unescape string! set value [lindex $element 1] #lappend datastructure [list type $type value [::tomlish::utils::unescape_string $value]] lappend datastructure [list type $type value $value] } LITERAL { set value [lindex $element 1] lappend datastructure [list type $type value $value] } ITABLE { #anonymous table #lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] lappend datastructure [::tomlish::dict::from_tomlish [list $element]] ;#store itables within arrays as raw dicts (possibly empty) } TABLE - TABLEARRAY { #invalid? shouldn't be output from from_dict - but could manually be constructed as such? review #doesn't make sense as table needs a name? #take as synonym for ITABLE? error "tomlish::dict::from_tomlish $type within array unexpected" } ARRAY - MULTISTRING - MULTILITERAL { #set value [lindex $element 1] lappend datastructure [list type $type value [::tomlish::dict::from_tomlish [list $element]]] } WS - SEP - NEWLINE - COMMENT { #ignore whitespace, commas, newlines and comments } default { error "tomlish::dict::from_tomlish Unexpected value type '$type' found in array" } } } } MULTILITERAL { #triple squoted string #first newline stripped only if it is the very first element #(ie *immediately* following the opening delims) #All whitespace other than newlines is within LITERALPARTS # ------------------------------------------------------------------------- #todo - consider extension to toml to allow indent-aware multiline literals # how - propose as issue in toml github? Use different delim? e.g ^^^ ? #e.g # xxx=?'''abc # def # etc # ''' # - we would like to trimleft each line to the column following the opening delim # ------------------------------------------------------------------------- log::debug "---> todict processing multiliteral: $item" set parts [lrange $item 1 end] if {[lindex $parts 0 0] eq "NEWLINE"} { set parts [lrange $parts 1 end] ;#skip it } for {set idx 0} {$idx < [llength $parts]} {incr idx} { set element [lindex $parts $idx] set type [lindex $element 0] switch -exact -- $type { LITERALPART { append stringvalue [lindex $element 1] } NEWLINE { set val [lindex $element 1] if {$val eq "lf"} { append stringvalue \n } else { append stringvalue \r\n } } default { error "Unexpected value type '$type' found in multistring" } } } set datastructure $stringvalue } MULTISTRING { #triple dquoted string log::debug "---> tomlish::dict::from_tomlish processing multistring: $item" set stringvalue "" set idx 0 set parts [lrange $item 1 end] for {set idx 0} {$idx < [llength $parts]} {incr idx} { set element [lindex $parts $idx] set type [lindex $element 0] #We use STRINGPART in the tomlish representation as a distinct element to STRING - which would imply wrapping quotes to be reinserted switch -exact -- $type { STRING { #todo - do away with STRING ? #we don't build MULTISTRINGS containing STRING - but should we accept it? 
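                                #Illustrative distinction between the two element shapes handled here:
                                #  {STRINGPART abc}  contributes the bare characters:     abc
                                #  {STRING abc}      contributes re-wrapped quotes:       "abc"   (hence the warning below)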
tomlish::log::warn "double quoting a STRING found in MULTISTRING - should be STRINGPART?" #append stringvalue "\"[::tomlish::utils::unescape_string [lindex $element 1]]\"" append stringvalue "\"[lindex $element 1]\"" } STRINGPART { #JJJ #don't unescape string #append stringvalue [::tomlish::utils::unescape_string [lindex $element 1]] append stringvalue [lindex $element 1] } CONT { #When the last non-whitespace character on a line is an unescaped backslash, #it will be trimmed along with all whitespace (including newlines) up to the next non-whitespace character or closing delimiter # review - we allow some whitespace in stringpart elements - can a stringpart ever be all whitespace? set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] if {$next_nl == -1} { #last (or first and only) line return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" #set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] #if {$non_ws >= 0} { # #append stringvalue "\\" # return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" #} else { # #skip over ws without emitting # set idx [llength $parts] #} } else { set parts_til_nl [lrange $parts 0 $next_nl-1] set non_ws [lsearch -index 0 -start $idx+1 -not $parts_til_nl WS] if {$non_ws >= 0} { #This CONT is invalid. If there had been a non-whitespace char directly following it, #it wouldn't have come through as a CONT token #Now that we see it isn't the last non-whitespace backslash on the line we can reject # as an invalid escape of space or tab #append stringvalue "\\" return -code error -errorcode {TOMLISH SYNTAX INVALIDESCAPE} "Invalid whitespace escape - not a valid continuation position" } else { #skip over ws on this line set idx $next_nl #then have to check each subsequent line until we get to first non-whitespace set trimming 1 while {$trimming && $idx < [llength $parts]} { set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] if {$next_nl == -1} { #last line set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] if {$non_ws >= 0} { set idx [expr {$non_ws -1}] } else { set idx [llength $parts] } set trimming 0 } else { set non_ws [lsearch -index 0 -start $idx+1 -not [lrange $parts 0 $next_nl-1] WS] if {$non_ws >= 0} { set idx [expr {$non_ws -1}] set trimming 0 } else { set idx $next_nl #keep trimming } } } } } } NEWLINE { #if newline is first element - it is not part of the data of a multistring if {$idx > 0} { set val [lindex $element 1] if {$val eq "lf"} { append stringvalue \n } else { append stringvalue \r\n } } } WS { append stringvalue [lindex $element 1] } default { error "Unexpected value type '$type' found in multistring" } } } set datastructure $stringvalue } WS - COMMENT - NEWLINE { #ignore } BOM { #this token is the unicode single char \uFFEF #It doesn't tell us what encoding was originally used (though toml should only accept UTF-8 files) #ignore at start - what about in other positions? 
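                    #For reference, a minimal end-to-end result of dict::from_tomlish (exact dict shape assumed):
                    #  toml:    title = "TOML"
                    #           [owner]
                    #           name = "Tom"
                    #  result:  title {type STRING value TOML} owner {name {type STRING value Tom}}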
} default { error "Unexpected tag '$tag' in Tomlish list '$tomlish'" } } } return $datastructure } } namespace eval tomlish::dict::path { #access tomlish dict structure namespace export {[a-z]*}; # Convention: export all lowercase #access with path such as: @@k @@k 0 @@k end where dict keys marked with @@ and plain values are list indices into in {type ARRAY value } #leaf elements returned as structured {type value } proc get {dictval {path {}}} { if {$path eq ""} { return $dictval } ::set data $dictval ::set pathsofar [list] foreach p $path { ::lappend pathsofar $p if {[string range $p 0 1] eq "@@"} { ::set data [dict get $data [string range $p 2 end]] } else { if {![tomlish::dict::is_typeval $data]} { error "tomlish::dict::path::get error bad path $path. Attempt to access table as array at subpath $pathsofar." } if {[dict get $data type] ne "ARRAY"} { error "tomlish::dict::get error bad path $path. Subpath $pathsofar is not an array." } ::set arrdata [dict get $data value] ::set data [lindex $arrdata $p] } } return $data } proc exists {dictval path} { ::set data $dictval ::set pathsofar [list] ::set exists 1 foreach p $path { ::lappend pathsofar $p if {[string range $p 0 1] eq "@@"} { ::set k [string range $p 2 end] if {![dict exists $data $k]} { return 0 } ::set data [dict get $data $k] } else { if {![tomlish::dict::is_typeval $data]} { return 0 } if {[dict get $data type] ne "ARRAY"} { return 0 } ::set arrdata [dict get $data value] ::set intp [tomlish::system::lindex_resolve_basic $arrdata $p] ;#handle index math (end-1 etc) if {$intp == -1} { #out of bounds return 0 } ::set data [lindex $arrdata $p] } } return $exists } #a restricted analogy of 'dictn set' #set 'endpoints' - don't create intermediate paths # can replace an existing dict with another dict # can create a key when key at tail end of path is a key (ie @@keyname, not index) # can replace an existing {type value value } # with added restriction that if is ARRAY the new must also be ARRAY proc set_endpoint {dictvariable path value} { upvar $dictvariable dict_being_edited ::set data $dict_being_edited ::set pathsofar [list] if {!([tomlish::dict::is_typeval $value] || [tomlish::dict::is_typeval_dict $value 0])} { #failed check of supplied value as basic type, or a sub-dict structure (not checking arrays) error "tomlish::dict::path::set_endpoint error - value must already be in the tomlish form {type value } or be a dict with such forms as leaves" } foreach p $path { ::lappend pathsofar $p if {[string range $p 0 1] eq "@@"} { ::set k [string range $p 2 end] #if {![dict exists $data $k]} { # error "tomlish::dict:path::set error bad path $path. Attempt to access nonexistent element at subpath $pathsofar." #} ::set varname v[incr v] if {$pathsofar eq $path} { #see if endpoint of the path given already exists if {[dict exists $data $k]} { ::set endpoint [dict get $data $k] if {[tomlish::dict::is_typeval $endpoint]} { set existing_tp [dict get $endpoint type] if {![tomlish::dict::is_typeval $value]} { error "tomlish::dict::path::set_endpoint error Unable to overwrite subpath '$pathsofar' which is of type $existing_tp with sub-dict. Supplied value not {type value val } with sub-dict: $value" } switch -- [dict get $endpoint type] { ARRAY { #disallow overwriting array - unless given value is an ARRAY? REVIEW if {[dict get $value type] ne "ARRAY"} { error "tomlish::dict::path::set_endpoint error bad path '$path'. 
Cannot overwrite array with non-array: $value" } } default { # } } } else { #endpoint is a typeval dict not a plain typeval - only allow overwrite with a typeval dict if {![tomlish::dict::is_typeval_dict $value 0]} { error "tomlish::dict::path::set_endpoint error path '$path'. Cannot overwrite sub-dict (size: [dict size $endpoint]) with non sub-dict: $value" } } ::set $varname $value dict set vdict $pathsofar $varname break } else { ::set arrdata [dict get $data value] set idx [tomlish::system::lindex_resolve_basic $arrdata $p] if {$idx == -1} { error "tomlish::dict::path::set_endpoint error bad path '$path'. No existing element at $p" } ::set data [lindex $arrdata $p] ::set $varname $data dict set vdict $pathsofar $varname } } } #dict for {path varname} $vdict { # puts "$path $varname\n" # puts " '[::set $varname]'\n" # puts "" #} ::set i 0 ::set reverse [lreverse $vdict] foreach {varname path} $reverse { set newval [::set $varname] if {$i+2 == [llength $reverse]} { ::set k [lindex $path end] ::set k [string range $k 2 end] ;#first key is always @@something dict set dict_being_edited $k $newval #puts "--result $dict_being_edited" break } ::set nextvarname [lindex $reverse $i+2] ::set nextval [::set $nextvarname] ::set k [lindex $path end] if {[string match @@* $k]} { #dict key #dict set $nextvarname $k $newval set_endpoint $nextvarname [list $k] $newval } else { #list index ::set nextarr [dict get $nextval value] ::lset nextarr $k $newval dict set $nextvarname value $nextarr } ::incr i 2 } return $dict_being_edited } #path must be to a {type ARRAY value } #REVIEW - how to lappend to deep mixed dict/array structure without rewriting whole datastructure? proc lappend {dictvariable path args} { upvar $dictvariable dict_being_edited ::set data $dict_being_edited ::set pathsofar [list] #::set newlist [list] ::set v 0 ::set vdict [dict create] foreach a $args { if {![string is dict $a]} { error "tomlish::dict::path::lappend error - lappended arguments must already be in the tomlish form {type value } or be a dict with such forms as leaves" } } foreach p $path { ::lappend pathsofar $p if {[string range $p 0 1] eq "@@"} { ::set k [string range $p 2 end] if {![dict exists $data $k]} { error "tomlish::dict::path::lappend error bad path $path. Attempt to access nonexistent element at subpath $pathsofar." } ::set varname v[incr v] if {$pathsofar eq $path} { #see if endpoint of the path given is an ARRAY ::set endpoint [dict get $data $k] if {![tomlish::dict::is_typeval $endpoint]} { error "tomlish::dict::path::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar." } if {[dict get $endpoint type] ne "ARRAY"} { error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar is not an array." } ::set arrdata [dict get $endpoint value] ::lappend arrdata {*}$args dict set endpoint value $arrdata ::set newlist $endpoint ::set $varname $newlist dict set vdict $pathsofar $varname break } ::set data [dict get $data $k] ::set $varname $data dict set vdict $pathsofar $varname } else { if {![tomlish::dict::is_typeval $data]} { error "tomlish::dict::path::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar." } if {[dict get $data type] ne "ARRAY"} { error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar is not an array." } ::set varname v[incr v] if {$pathsofar eq $path} { if {[dict get $data type] ne "ARRAY"} { error "tomlish::dict::path::lappend error bad path $path. Parent path is not an array." 
    #path must be to a {type ARRAY value <list>}
    #REVIEW - how to lappend to a deep mixed dict/array structure without rewriting the whole datastructure?
    proc lappend {dictvariable path args} {
        upvar $dictvariable dict_being_edited
        ::set data $dict_being_edited
        ::set pathsofar [list]
        #::set newlist [list]
        ::set v 0
        ::set vdict [dict create]
        foreach a $args {
            if {![string is dict $a]} {
                error "tomlish::dict::path::lappend error - lappended arguments must already be in the tomlish form {type <tag> value <val>} or be a dict with such forms as leaves"
            }
        }
        foreach p $path {
            ::lappend pathsofar $p
            if {[string range $p 0 1] eq "@@"} {
                ::set k [string range $p 2 end]
                if {![dict exists $data $k]} {
                    error "tomlish::dict::path::lappend error bad path $path. Attempt to access nonexistent element at subpath $pathsofar."
                }
                ::set varname v[incr v]
                if {$pathsofar eq $path} {
                    #see if the endpoint of the path given is an ARRAY
                    ::set endpoint [dict get $data $k]
                    if {![tomlish::dict::is_typeval $endpoint]} {
                        error "tomlish::dict::path::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar."
                    }
                    if {[dict get $endpoint type] ne "ARRAY"} {
                        error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar is not an array."
                    }
                    ::set arrdata [dict get $endpoint value]
                    ::lappend arrdata {*}$args
                    dict set endpoint value $arrdata
                    ::set newlist $endpoint
                    ::set $varname $newlist
                    dict set vdict $pathsofar $varname
                    break
                }
                ::set data [dict get $data $k]
                ::set $varname $data
                dict set vdict $pathsofar $varname
            } else {
                if {![tomlish::dict::is_typeval $data]} {
                    error "tomlish::dict::path::lappend error bad path $path. Attempt to access table as array at subpath $pathsofar."
                }
                if {[dict get $data type] ne "ARRAY"} {
                    error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar is not an array."
                }
                ::set varname v[incr v]
                if {$pathsofar eq $path} {
                    if {[dict get $data type] ne "ARRAY"} {
                        error "tomlish::dict::path::lappend error bad path $path. Parent path is not an array."
                    }
                    ::set parentarray [dict get $data value]
                    ::set idx [tomlish::system::lindex_resolve_basic $parentarray $p]
                    if {$idx == -1} {
                        error "tomlish::dict::path::lappend error bad path $path. Index $p does not exist."
                    }
                    ::set endpoint [lindex $parentarray $p]
                    if {[dict get $endpoint type] ne "ARRAY"} {
                        error "tomlish::dict::path::lappend error bad path $path. Not an array."
                    }
                    ::set arrdata [dict get $endpoint value]
                    ::lappend arrdata {*}$args
                    dict set endpoint value $arrdata
                    ::set newlist $endpoint
                    #::lset parentarray $p $newlist
                    #set parentarray $newlist
                    ::set $varname $newlist
                    dict set vdict $pathsofar $varname
                    break
                } else {
                    ::set arrdata [dict get $data value]
                    set idx [tomlish::system::lindex_resolve_basic $arrdata $p]
                    if {$idx == -1} {
                        error "tomlish::dict::path::lappend error bad path $path. Subpath $pathsofar, index $p does not exist."
                    }
                    ::set data [lindex $arrdata $p]
                    ::set $varname $data
                    dict set vdict $pathsofar $varname
                }
            }
        }
        #todo tomlish::log::debug ?
        #dict for {path varname} $vdict {
        #    puts "$path $varname\n"
        #    puts "    [::set $varname]\n"
        #    puts ""
        #}
        ::set i 0
        ::set reverse [lreverse $vdict]
        foreach {varname path} $reverse {
            set newval [::set $varname]
            if {$i+2 == [llength $reverse]} {
                ::set k [lindex $path end]
                ::set k [string range $k 2 end] ;#first key is always @@something
                dict set dict_being_edited $k $newval
                #puts "--result $dict_being_edited"
                break
            }
            ::set nextvarname [lindex $reverse $i+2]
            ::set nextval [::set $nextvarname]
            ::set k [lindex $path end]
            if {[string match @@* $k]} {
                #dict key
                set k [string range $k 2 end]
                dict set $nextvarname $k $newval
            } else {
                #list index
                ::set nextarr [dict get $nextval value]
                ::lset nextarr $k $newval
                dict set $nextvarname value $nextarr
            }
            ::incr i 2
        }
        return $dict_being_edited
    }
}

tcl::namespace::eval tomlish::to_dict {
    proc @@path {dictkeys} {
        lmap v $dictkeys {string cat @@ $v}
    }
}
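# Illustrative sketch only (comments, not executed) - appending to a nested ARRAY with tomlish::dict::path::lappend
# and building @@-prefixed paths with tomlish::to_dict::@@path. The INT tag is a placeholder.
#   set d [dict create ports {type ARRAY value {{type INT value 80}}}]
#   tomlish::dict::path::lappend d {@@ports} {type INT value 443}
#   # d is now: ports {type ARRAY value {{type INT value 80} {type INT value 443}}}
#   tomlish::to_dict::@@path {server ports}   ;# -> @@server @@ports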
tcl::namespace::eval tomlish::app {
    #*** !doctools
    #[subsection {Namespace tomlish::app}]
    #[para]
    #[list_begin definitions]

    tcl::namespace::eval argdoc {
        proc test_suites {} {
            if {[package provide test::tomlish] eq ""} {
                return [list]
            }
            return [test::tomlish::SUITES]
        }
    }

    package require punk::args
    punk::args::define {
        @id -id ::tomlish::app::decoder
        @cmd -name tomlish::app::decoder -help\
            "Read toml on stdin until EOF.
            on error   - returns non-zero exit code and writes the error to the errorchannel.
            on success - returns zero exit code and writes the JSON encoding of the data to the outputchannel.
            This decoder is intended to be compatible with toml-test."
        @leaders -min 0 -max 0
        @opts
        -help -type none -help\
            "Display this usage message"
        -inputchannel -default stdin
        -inputencoding -default "iso8859-1" -choicerestricted 0 -choices {utf-8 utf-16 iso8859-1} -help\
            "configure encoding on input channel
            iso8859-1 is equivalent to binary encoding"
        -outputchannel -default stdout
        -errorchannel -default stderr
        @values -min 0 -max 0
    }
    proc decoder {args} {
        set argd [punk::args::parse $args withid ::tomlish::app::decoder]
        set ch_input     [dict get $argd opts -inputchannel]
        set ch_input_enc [dict get $argd opts -inputencoding]
        set ch_output    [dict get $argd opts -outputchannel]
        set ch_error     [dict get $argd opts -errorchannel]
        if {[dict exists $argd received -help]} {
            return [punk::args::usage -scheme info ::tomlish::app::decoder]
        }
        chan configure $ch_input -encoding $ch_input_enc
        #translation?
        chan configure $ch_input -translation lf ;# toml-test invalid/control tests - we need to see raw CRs to reject them properly; auto translation won't do.

        #Just slurp it all - presumably we are not handling massive amounts of data on stdin.
        # - even if the input is large, we probably don't gain much (aside from possible memory savings?) by attempting to process input as it arrives.
        if {[catch {
            set inputdata [read $ch_input]
            if {$ch_input_enc eq "iso8859-1"} {
                set toml [tomlish::toml::from_binary $inputdata]
            } else {
                set toml $inputdata
            }
        } errM]} {
            puts stderr "read-input error: $errM"
            #toml-tests expect exit code 1
            #e.g invalid/encoding/utf16-bom
            exit 1 ;#read error
        }
        try {
            set j [::tomlish::toml_to_typedjson $toml]
        } on error {em} {
            puts $ch_error "decoding failed: '$em'"
            exit 1
        }
        puts -nonewline $ch_output $j
        exit 0
    }

    package require punk::args
    punk::args::define {
        @id -id ::tomlish::app::encoder
        @cmd -name tomlish::app::encoder -help\
            "Read JSON on input until EOF.
            return non-zero exitcode if the JSON data cannot be converted to a valid TOML representation.
            return zero exitcode and TOML data on output if the JSON data can be converted.
            This encoder is intended to be compatible with toml-test."
        @leaders -min 0 -max 0
        @opts
        -help -type none -help\
            "Display this usage message"
        -restrict_barekeys -default 0 -help\
            "If true, keys containing unicode will be quoted.
            If false, an extended range of barekeys will be used in unquoted form."
        -inputchannel -default stdin
        -inputencoding -default "" -choicerestricted 0 -choices {utf-8 utf-16 iso8859-1} -help\
            "configure encoding on input channel
            If not supplied, leave at Tcl default"
        -outputchannel -default stdout
        -errorchannel -default stderr
        @values -min 0 -max 0
    }
    proc encoder {args} {
        set argd [punk::args::parse $args withid ::tomlish::app::encoder]
        set restrict_barekeys [dict get $argd opts -restrict_barekeys]
        set ch_input     [dict get $argd opts -inputchannel]
        set ch_input_enc [dict get $argd opts -inputencoding]
        set ch_output    [dict get $argd opts -outputchannel]
        set ch_error     [dict get $argd opts -errorchannel]
        if {[dict exists $argd received -help]} {
            return [punk::args::usage -scheme info ::tomlish::app::encoder]
        }
        #review
        if {$ch_input_enc ne ""} {
            chan configure $ch_input -encoding $ch_input_enc
        }
        #review
        chan configure $ch_input -translation lf
        if {[catch {
            set json [read $ch_input]
        }]} {
            exit 2 ;#read error
        }
        try {
            #tomlish::typedjson_to_toml
            set toml [::tomlish::toml::from_tomlish_from_dict_from_typedjson $json]
        } trap {} {e eopts} {
            puts $ch_error "encoding failed: '$e'"
            puts $ch_error "$::errorInfo"
            exit 1
        }
        puts -nonewline $ch_output $toml
        exit 0
    }

    punk::args::define {
        @dynamic
        @id -id ::tomlish::app::test
        @cmd -name tomlish::app::test
        @leaders
        @opts -any 1
        -help -type none -help\
            "Display this usage message or further info if more args."
        -suite -default tests -choices {${[::tomlish::app::argdoc::test_suites]}}
        @values -min 0 -max -1
    }
    proc test {args} {
        package require test::tomlish
        set argd [punk::args::parse $args withid ::tomlish::app::test]
        set opts     [dict get $argd opts]
        set values   [dict get $argd values]
        set received [dict get $argd received]
        set solos    [dict get $argd solos]
        set opt_suite [dict get $opts -suite]
        if {[dict exists $received -help] && ![dict exists $received -suite]} {
            return [punk::args::usage -scheme info ::tomlish::app::test]
        }
        test::tomlish::SUITE $opt_suite
        #if {[catch {test::tomlish::SUITE $opt_suite} errM]} {
        #    puts stderr "Unknown test suite '$opt_suite'. Available suites: [test::tomlish::SUITES]"
        #    exit 1
        #}
        set run_opts [dict remove $opts -suite]
        set run_opts [dict remove $run_opts {*}$solos]
        set result [test::tomlish::RUN {*}$run_opts {*}$solos {*}$values]
        return $result
    }

    #*** !doctools
    #[list_end] [comment {--- end definitions namespace tomlish::app ---}]
}

proc ::tomlish::appnames {} {
    set applist [list]
    foreach cmd [info commands ::tomlish::app::*] {
        lappend applist [namespace tail $cmd]
    }
    return $applist
}
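# Illustrative sketch only (comments, not executed) - listing the available app subcommands.
# appnames simply reflects whatever procs exist in the tomlish::app namespace at call time.
# Note that decoder and encoder call exit, so they are intended for command-line use rather than interactive use.
#   tomlish::appnames   ;# -> e.g. decoder encoder test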
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# Secondary API namespace
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
namespace eval tomlish::lib {
    namespace export {[a-z]*}; # Convention: export all lowercase
    namespace path [namespace parent]
    #*** !doctools
    #[subsection {Namespace tomlish::lib}]
    #[para] Secondary functions that are part of the API
    #[list_begin definitions]

    #proc utility1 {p1 args} {
    #    #*** !doctools
    #    #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]]
    #    #[para]Description of utility1
    #    return 1
    #}

    #*** !doctools
    #[list_end] [comment {--- end definitions namespace tomlish::lib ---}]
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++

namespace eval tomlish::system {
    #taken from punk::lib
    proc lindex_resolve_basic {list index} {
        #*** !doctools
        #[call [fun lindex_resolve_basic] [arg list] [arg index]]
        #[para] Accepts an index in any of the forms accepted by Tcl's list commands (e.g compound indices such as 3+1 end-2)
        #[para] returns -1 for out of range at either end, or a valid integer index
        #[para] Unlike lindex_resolve; lindex_resolve_basic can't determine whether an out of range index was out of range at the lower or upper bound
        #[para] This is only likely to be faster on average than lindex_resolve for Tcl versions which have the builtin lseq command
        #[para] The performance advantage is more likely to be present when using compound indices such as $x+1 or end-1
        #[para] For pure integer indices the performance should be equivalent

        set index [tcl::string::map {_ {}} $index] ;#forward compatibility with integers such as 1_000
        if {[string is integer -strict $index]} {
            #can match +i -i
            #avoid even the lseq overhead when the index is simple
            if {$index < 0 || ($index >= [llength $list])} {
                #even though in this case we could return -2 or -3 like lindex_resolve; for consistency we don't, as it's not always determinable for compound indices using the lseq method.
                return -1
            } else {
                #integer may still have + sign - normalize with expr
                return [expr {$index}]
            }
        }
        if {[llength $list]} {
            set indices [tomlish::system::range 0 [expr {[llength $list]-1}]] ;# uses lseq if available, has fallback.
            #if lseq was available - $indices is an 'arithseries' - theoretically not taking up ram(?)
        } else {
            set indices [list]
        }
        set idx [lindex $indices $index]
        if {$idx eq ""} {
            #we have no way to determine if out of bounds is at the lower vs upper end
            return -1
        } else {
            return $idx
        }
    }

    if {[info commands ::lseq] ne ""} {
        #tcl 8.7+ lseq is significantly faster, especially for larger ranges
        #The internal rep can be an 'arithseries' with no string representation
        #support a minimal set: from to
        proc range {from to} {
            lseq $from $to
        }
    } else {
        #lseq accepts basic expressions e.g 4-2 for both arguments
        #e.g we can do lseq 0 [llength $list]-1
        #if range is to be consistent with the lseq version above it should support that too, even though we don't support most other lseq functionality in either wrapper.
        proc range {from to} {
            set to   [offset_expr $to]
            set from [offset_expr $from]
            if {$to > $from} {
                set count [expr {($to - $from) + 1}]
                if {$from == 0} {
                    return [lsearch -all [lrepeat $count 0] *]
                } else {
                    incr from -1
                    return [lmap v [lrepeat $count 0] {incr from}]
                }
                #slower methods.
                #2)
                #set i -1
                #set L [lrepeat $count 0]
                #lmap v $L {lset L [incr i] [incr from];lindex {}}
                #return $L
                #3)
                #set L {}
                #for {set i 0} {$i < $count} {incr i} {
                #    lappend L [incr from]
                #}
                #return $L
            } elseif {$from > $to} {
                set count [expr {($from - $to) + 1}]
                #1)
                if {$to == 0} {
                    return [lreverse [lsearch -all [lrepeat $count 0] *]]
                } else {
                    incr from
                    return [lmap v [lrepeat $count 0] {incr from -1}]
                }
                #2)
                #set i -1
                #set L [lrepeat $count 0]
                #lmap v $L {lset L [incr i] [incr from -1];lindex {}}
                #return $L
                #3)
                #set L {}
                #for {set i 0} {$i < $count} {incr i} {
                #    lappend L [incr from -1]
                #}
                #return $L
            } else {
                return [list $from]
            }
        }
    }
}
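# Illustrative sketch only (comments, not executed) - index resolution as used by the dict::path procs above.
#   tomlish::system::lindex_resolve_basic {a b c d} end-1   ;# -> 2
#   tomlish::system::lindex_resolve_basic {a b c d} 10      ;# -> -1 (out of range)
#   tomlish::system::range 2 5                              ;# -> 2 3 4 5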
if {[info exists ::argc] && $::argc > 0} {
    #puts stderr "argc: $::argc args: $::argv"
    set arglist $::argv
    # --------------
    #make sure any dependent packages that are sourced don't get any commandline args
    set ::argv {}
    set ::argc 0
    # --------------
    package require punk::args
    punk::args::define {
        @dynamic
        @id -id tomlish::cmdline
        @cmd -name tomlish -help\
            "toml encoder/decoder written in Tcl"
        @opts -any 1
        -help -type none -help\
            "Display this usage message or more specific help if further arguments provided."
        -app -choices {${[tomlish::appnames]}}
    }
    try {
        set argd [punk::args::parse $arglist withid tomlish::cmdline]
    } trap {PUNKARGS VALIDATION} {msg erroropts} {
        puts stderr $msg
        exit 1
    }
    lassign [dict values $argd] leaders opts values received solos
    if {[dict exists $received -help] && ![dict exists $received -app]} {
        #only emit cmdline help if -app was not supplied as well - otherwise the app function can act on -help for more specific help
        #puts stdout "Usage: -app <appname> where appname is one of: [tomlish::appnames]"
        puts stdout [punk::args::usage -scheme info tomlish::cmdline]
        exit 0
    }
    if {![dict exists $received -app]} {
        puts stderr [punk::args::usage -scheme error tomlish::cmdline]
        exit 1
    }
    set app      [dict get $opts -app]
    set appnames [tomlish::appnames]
    set app_opts [dict remove $opts -app {*}$solos]
    try {
        set result [tomlish::app::$app {*}$app_opts {*}$solos {*}$values]
    } trap {PUNKARGS VALIDATION} {msg erroropts} {
        #The validation error should fully describe the issue
        #no need for an errortrace - keep the output cleaner
        puts stderr $msg
        exit 1
    } trap {} {msg erroropts} {
        #unexpected error - an uncaught throw will produce an error trace
        #todo - a support msg? Otherwise we may as well just leave off this trap.
        throw [dict get $erroropts -errorcode] [dict get $erroropts -errorinfo]
    }
    if {"-help" in $solos} {
        puts stderr $result
        exit 1
    } else {
        if {$result ne ""} {
            puts stdout $result
            exit 0
        }
    }
}

## Ready
package provide tomlish [namespace eval tomlish {
    variable pkg tomlish
    variable version
    set version 1.1.6
}]
return

#*** !doctools
#[manpage_end]
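# Illustrative command-line sketch (the module file name shown is an assumption - use whatever path this file is installed as):
#   echo 'a = 1' | tclsh tomlish-1.1.6.tm -app decoder
#   tclsh tomlish-1.1.6.tm -app encoder < typed.json
#   tclsh tomlish-1.1.6.tm -app test -suite tests
#   tclsh tomlish-1.1.6.tm -help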