You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

3357 lines
150 KiB

# -*- tcl -*-
# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from <pkg>-buildversion.txt
#
# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem.
# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository.
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# (C) 2024
#
# @@ Meta Begin
# Application tomlish 1.1.1
# Meta platform tcl
# Meta license <unspecified>
# @@ Meta End
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# doctools header
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[manpage_begin tomlish_module_tomlish 0 1.1.1]
#[copyright "2024"]
#[titledesc {Module API}] [comment {-- Name section and table of contents description --}]
#[moddesc {-}] [comment {-- Description at end of page heading --}]
#[require tomlish]
#[keywords module]
#[description]
#[para] -
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section Overview]
#[para] overview of tomlish
#[subsection Concepts]
#[para] -
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
## Requirements
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[subsection dependencies]
#[para] packages used by tomlish
#[list_begin itemized]
package require Tcl 8.6-
package require struct::stack
package require logger
#*** !doctools
#[item] [package {Tcl 8.6-}]
#[item] [package {struct::stack}]
#limit ourselves to clear, destroy, peek, pop, push, rotate, or size (e.g v 1.3 does not implement 'get')
#*** !doctools
#[list_end]
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section API]
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# Base namespace
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
namespace eval tomlish {
namespace export {[a-z]*}; # Convention: export all lowercase
variable types
#ARRAY is analogous to a Tcl list
#TABLE is analogous to a Tcl dict
#WS = inline whitespace
#KEYVAL = bare key and value
#QKEYVAL = quoted key and value
#ITABLE = inline table (*can* be anonymous table)
# inline table values immediately create a table with the opening brace
# inline tables are fully defined between their braces, as are dotted-key subtables defined within
# No additional subtables or arrays of tables may be defined within an inline table after the ending brace - they must be entirely self-contained
set tags [list TOMLISH ARRAY TABLE ITABLE ANONTABLE WS NEWLINE COMMENT KEYVAL QKEYVAL STRING MULTISTRING LITSTRING MULTILITSTRING INT FLOAT BOOL DATETIME]
#tomlish v1.0 should accept arbitrary 64-bit signed ints (from -2^63 to 2^63-1)
#we will restrict to this range for compatibility for now - although Tcl can handle larger (arbitrarily so?)
set min_int -9223372036854775808 ;#-2^63
set max_int +9223372036854775807 ;#2^63-1
proc Dolog {lvl txt} {
#return "$lvl -- $txt"
set msg "[clock format [clock seconds] -format "%Y-%m-%dT%H:%M:%S"] tomlish '$txt'"
puts stderr $msg
}
logger::initNamespace ::tomlish
foreach lvl [logger::levels] {
interp alias {} tomlish_log_$lvl {} ::tomlish::Dolog $lvl
log::logproc $lvl tomlish_log_$lvl
}
#*** !doctools
#[subsection {Namespace tomlish}]
#[para] Core API functions for tomlish
#[list_begin definitions]
proc tags {} {
return $::tomlish::tags
}
#helper function for get_dict
proc _get_keyval_value {keyval_element} {
log::notice ">>> _get_keyval_value from '$keyval_element'<<<"
set found_value 0
#find the value
# 3 is the earliest index at which the value could occur (depending on whitespace)
set found_sub [list]
foreach sub [lrange $keyval_element 2 end] {
#note that a barekey/quotedkey won't occur directly inside a barekey/quotedkey
switch -exact -- [lindex $sub 0] {
STRING - MULTISTRING - INT - FLOAT - BOOL - DATETIME - TABLE - ARRAY - ITABLE {
set type [lindex $sub 0]
set value [lindex $sub 1]
set found_sub $sub
incr found_value 1
}
default {}
}
}
if {!$found_value} {
error "Failed to find value element in KEYVAL. '$keyval_element'"
}
if {$found_value > 1} {
error "Found multiple value elements in KEYVAL, expected exactly one. '$keyval_element'"
}
switch -exact -- $type {
INT - FLOAT - BOOL - DATETIME {
#simple (non-container, no-substitution) datatype
set result [list type $type value $value]
}
STRING - STRINGPART {
set result [list type $type value [::tomlish::utils::unescape_string $value]]
}
LITSTRING {
#REVIEW
set result [list type $type value $value]
}
TABLE - ITABLE - ARRAY - MULTISTRING {
#jmn2024 - added ITABLE - review
#we need to recurse to get the corresponding dict for the contained item(s)
#pass in the whole $found_sub - not just the $value!
set result [list type $type value [::tomlish::get_dict [list $found_sub]]]
}
default {
error "Unexpected value type '$type' found in keyval '$keyval_element'"
}
}
return $result
}
#get_dict is a *basic* programmatic datastructure for accessing the data.
# produce a dictionary of keys and values from a tomlish tagged list.
# get_dict is primarily for reading toml data.
#Extraneous (not within quoted sections) whitespace and comments are not preserved in this structure,
# so a roundtrip from toml to this datastructure and back to toml will lose whitespace formatting and comments.
# creating/changing toml values can be done directly on a tomlish list if preserving (or adding) formatting/comments is desired.
#A separate package 'tomlish::object' may be needed to allow easier programmatic creating/updating/deleting of data elements whilst preserving (or adding or selectively deleting/editing) such formatting.
proc get_dict {tomlish} {
#keep track of which tablenames have already been directly defined,
# so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid'
#Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here.
#we don't error out just because a previous tablename segment has already appeared.
variable tablenames_seen [list]
log::info ">>> processing '$tomlish'<<<"
set items $tomlish
foreach lst $items {
if {[lindex $lst 0] ni $::tomlish::tags} {
error "supplied string does not appear to be toml parsed into a tomlish tagged list. Run tomlish::decode::toml on the raw toml data to produce a tomlish list"
}
}
if {[lindex $tomlish 0] eq "TOMLISH"} {
#ignore TOMLISH tag at beginning
set items [lrange $tomlish 1 end]
}
set datastructure [dict create]
foreach item $items {
set tag [lindex $item 0]
#puts "...> item:'$item' tag:'$tag'"
switch -exact -- $tag {
KEYVAL - QKEYVAL {
log::debug "--> processing $tag: $item"
set key [lindex $item 1]
#!todo - normalize key. (may be quoted/doublequoted)
if {[dict exists $datastructure $key]} {
error "Duplicate key '$key'. The key already exists at this level in the toml data. The toml data is not valid."
}
#lassign [_get_keyval_value $item] type val
set keyval_dict [_get_keyval_value $item]
dict set datastructure $key $keyval_dict
}
TABLE {
set tablename [lindex $item 1]
set tablename [::tomlish::utils::tablename_trim $tablename]
if {$tablename in $tablenames_seen} {
error "Table name '$tablename' has already been directly defined in the toml data. Invalid."
}
log::debug "--> processing $tag (name: $tablename): $item"
set name_segments [::tomlish::utils::tablename_split $tablename]
set last_seg ""
#toml spec rule - all segments mst be non-empty
#note that the results of tablename_split are 'raw' - ie some segments may be enclosed in single or double quotes.
set key_hierarchy [list]
set key_hierarchy_raw [list]
foreach rawseg $name_segments {
set seg [::tomlish::utils::normalize_key $rawseg] ;#strips one level of enclosing quotes, and substitutes only toml-specified escapes
set c1 [::string index $rawseg 0]
set c2 [::string index $rawseg end]
if {($c1 eq "'") && ($c2 eq "'")} {
#single quoted segment. No escapes are processed within it.
set seg [::string range $rawseg 1 end-1]
} elseif {($c1 eq "\"") && ($c2 eq "\"")} {
#double quoted segment. Apply escapes.
set seg [::tomlish::utils::unescape_string [::string range $rawseg 1 end-1]]
#set seg [subst -nocommands -novariables [::string range $rawseg 1 end-1]]
} else {
set seg $rawseg
}
#no need to check for empty segments here - we've already called tablename_split which would have raised an error for empty segments.
#if {$rawseg eq ""} {
# error "Table name '[lindex $item 1]' is not valid. All segments (parts between dots) must be non-empty"
#}
lappend key_hierarchy $seg
lappend key_hierarchy_raw $rawseg
if {[dict exists $datastructure {*}$key_hierarchy]} {
#It's ok for this key to already exist *if* it was defined by a previous tablename,
# but not if it was defined as a keyval/qkeyval
set testkey [join $key_hierarchy_raw .]
set testkey_length [llength $key_hierarchy_raw]
set found_testkey 0
if {$testkey in $tablenames_seen} {
set found_testkey 1
} else {
#see if it was defined by a longer entry
foreach seen $tablenames_seen {
set seen_segments [::tomlish::utils::tablename_split $seen]
#these are raw unnormalized tablename segments. Need to normalize the double-quoted ones,
# and strip the quotes from both single-quoted and double-quoted entries.
#note: it is not safe to compare normalized tablenames using join!
# e.g a.'b.c'.d is not the same as a.b.c.d
# instead compare {a b.c d} with {a b c d}
# Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string.
#'a.b'.'c.d.e' vs 'a.b.c'.'d.e'
#dots within table segments might seem like an 'edge case'
# - but perhaps there is legitimate need to put for example version numbers as tablenames or parts of tablenames.
#VVV the test below is wrong VVV!
set seen_match [join [lrange $seen_segments 0 [expr {$testkey_length -1}]] .]
if {$testkey eq $seen_match} {
set found_testkey 1
}
}
}
if {$found_testkey == 0} {
#the raw key_hierarchy is better to display in the error message, although it's not the actual dict keyset
error "key [join $key_hierarchy_raw .] already exists, but wasn't defined by a supertable."
}
}
}
#We must do this after the key-collision test above!
lappend tablenames_seen $tablename
log::debug ">>>>>>>>>>>>>>>>>>>>key_hierarchy : $key_hierarchy"
log::debug ">>>>>>>>>>>>>>>>>>>>key_hierarchy_raw: $key_hierarchy_raw"
#now add the contained elements
foreach element [lrange $item 2 end] {
set type [lindex $element 0]
switch -exact -- $type {
KEYVAL - QKEYVAL {
set keyval_key [lindex $element 1]
set keyval_dict [_get_keyval_value $element]
dict set datastructure {*}$key_hierarchy $keyval_key $keyval_dict
}
NEWLINE - COMMENT - WS {
#ignore
}
default {
error "Sub element of type '$type' not understood in table context. Expected only KEYVAL,QKEYVAL,NEWLINE,COMMENT,WS"
}
}
}
#now make sure we add an empty value if there were no contained elements!
#!todo.
}
ITABLE {
#SEP???
set datastructure [list]
foreach element [lrange $item 1 end] {
set type [lindex $element 0]
switch -exact -- $type {
KEYVAL - QKEYVAL {
set keyval_key [lindex $element 1]
set keyval_dict [_get_keyval_value $element]
dict set datastructure $keyval_key $keyval_dict
}
NEWLINE - COMMENT - WS {
#ignore
}
default {
error "Sub element of type '$type' not understood in ITABLE context. Expected only KEYVAL,QKEYVAL,NEWLINE,COMMENT,WS"
}
}
}
}
ARRAY {
#arrays in toml are allowed to contain mixtures of types
set datastructure [list]
log::debug "--> processing array: $item"
foreach element [lrange $item 1 end] {
set type [lindex $element 0]
switch -exact -- $type {
INT - FLOAT - BOOL - DATETIME {
set value [lindex $element 1]
lappend datastructure [list type $type value $value]
}
STRING {
set value [lindex $element 1]
lappend datastructure [list type $type value [::tomlish::utils::unescape_string $value]]
}
TABLE - ARRAY - MULTISTRING {
set value [lindex $element 1]
lappend datastructure [list type $type value [::tomlish::get_dict [list $element]]]
}
WS - SEP {
#ignore whitespace and commas
}
default {
error "Unexpected value type '$type' found in array"
}
}
}
}
MULTISTRING {
#triple dquoted string
log::debug "--> processing multistring: $item"
set stringvalue ""
set idx 0
set parts [lrange $item 1 end]
for {set idx 0} {$idx < [llength $parts]} {incr idx} {
set element [lindex $parts $idx]
set type [lindex $element 0]
switch -exact -- $type {
STRINGPART {
append stringvalue [::tomlish::utils::unescape_string [lindex $element 1]]
}
CONT {
#When the last non-whitespace character on a line is an unescaped backslash,
#it will be trimmed along with all whitespace (including newlines) up to the next non-whitespace character or closing delimiter
# review - we allow some whitespace in stringpart elements - can a stringpart ever be all whitespace?
set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE]
if {$next_nl == -1} {
#last line
set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS]
if {$non_ws >= 0} {
append stringvalue "\\" ;#add the sep
} else {
#skip over ws without emitting
set idx [llength $parts]
}
} else {
set parts_til_nl [lrange $parts 0 $next_nl-1]
set non_ws [lsearch -index 0 -start $idx+1 -not $parts_til_nl WS]
if {$non_ws >= 0} {
append stringvalue "\\"
} else {
#skip over ws on this line
set idx $next_nl
#then have to check each subsequent line until we get to first non-whitespace
set trimming 1
while {$trimming && $idx < [llength $parts]} {
set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE]
if {$next_nl == -1} {
#last line
set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS]
if {$non_ws >= 0} {
set idx [expr {$non_ws -1}]
} else {
set idx [llength $parts]
}
set trimming 0
} else {
set non_ws [lsearch -index 0 -start $idx+1 -not [lrange $parts 0 $next_nl-1] WS]
if {$non_ws >= 0} {
set idx [expr {$non_ws -1}]
set trimming 0
} else {
set idx $next_nl
#keep trimming
}
}
}
}
}
}
NEWLINE {
#if newline is first element - it is not part of the data of a multistring
if {$idx > 0} {
set val [lindex $element 1]
if {$val eq "nl"} {
append stringvalue \n
} else {
append stringvalue \r\n
}
}
}
WS {
append stringvalue [lindex $element 1]
}
default {
error "Unexpected value type '$type' found in multistring"
}
}
}
set datastructure $stringvalue
}
WS - COMMENT - NEWLINE {
#ignore
}
default {
error "Unexpected tag '$tag' in Tomlish list '$tomlish'"
}
}
}
return $datastructure
}
proc json_to_toml {json} {
#*** !doctools
#[call [fun json_to_toml] [arg json]]
#[para]
set tomlish [::tomlish::from_json $json]
set toml [::tomlish::to_toml $tomlish]
}
proc from_json {json} {
set jstruct [::tomlish::json_struct $json]
return [::tomlish::from_json_struct $jstruct]
}
proc from_json_struct {jstruct} {
package require fish::json_toml
return [fish::json_toml::jsonstruct2tomlish $jstruct]
}
proc toml_to_json {toml} {
set tomlish [::tomlish::from_toml $toml]
return [::tomlish::get_json $tomlish]
}
proc get_json {tomlish} {
package require fish::json
set d [::tomlish::get_dict $tomlish]
#return [::tomlish::dict_to_json $d]
return [fish::json::from "struct" $d]
}
#*** !doctools
#[list_end] [comment {--- end definitions namespace tomlish ---}]
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
namespace eval tomlish::encode {
#*** !doctools
#[subsection {Namespace tomlish::encode}]
#[para]
#[list_begin definitions]
#STRING,INT,FLOAT,BOOL, DATETIME - simple wrappers for completeness
# take a value of the appropriate type and wrap as a tomlish tagged item
proc string {s} {
return [list STRING $s]
}
proc int {i} {
#whole numbers, may be prefixed with a + or -
#Leading zeros are not allowed
#Hex,octal binary forms are allowed (toml 1.0)
#We will error out on encountering commas, as commas are interpreted differently depending on locale (and don't seem to be supported in the toml spec anyway)
#!todo - Tcl can handle bignums - bigger than a 64bit signed long as specified in toml.
# - We should probably raise an error for number larger than this and suggest the user supply it as a string?
if {[tcl::string::last , $i] > -1} {
error "Unable to interpret '$i' as an integer. Use underscores if you need a thousands separator [::tomlish::parse::report_line]"
}
if {![::tomlish::utils::int_validchars $i]} {
error "Unable to interpret '$i' as an integer. Only 0-9 + 1 _ characters are acceptable. [::tomlish::parse::report_line]"
}
if {[::tomlish::utils::is_int $i]} {
return [list INT $i]
} else {
error "'$i' is not a valid integer as per the Toml spec. [::tomlish::parse::report_line]"
}
}
proc float {f} {
#convert any non-lower case variants of special values to lowercase for Toml
if {[string tolower $f] in {nan +nan -nan inf +inf -inf}} {
return [list FLOAT [string tolower $f]]
}
if {[::tomlish::utils::is_float $f]} {
return [list FLOAT $f]
} else {
error "Unable to interpret '$f' as Toml float. Check your input, or check that tomlish is able to handle all Toml floats properly [::tomlish::parse::report_line]"
}
}
proc datetime {str} {
if {[::tomlish::utils::is_datetime $str]} {
return [list DATETIME $str]
} else {
error "Unable to interpret '$str' as Toml datetime. Check your input, or check that tomlish is able to handle all Toml datetimes properly [::tomlish::parse::report_line]"
}
}
proc boolean {b} {
#convert any Tcl-acceptable boolean to boolean as accepted by toml - lower case true/false
if {![string is boolean -strict $b]} {
error "Unable to convert '$b' to Toml boolean true|false. [::tomlish::parse::report_line]"
} else {
if {[expr {$b && 1}]} {
return [list BOOL true]
} else {
return [list BOOL false]
}
}
}
#Take tablename followed by
# a) *tomlish* name-value pairs e.g table mydata [list KEYVAL item11 [list STRING "test"]] {KEYVAL item2 [list INT 1]}
# b) simple 2-element tcl lists being name & *simple* value pairs for which basic heuristics will be used to determine types
proc table {name args} {
set pairs [list]
foreach t $args {
if {[llength $t] == 3} {
if {[lindex $t 0] ne "KEYVAL"} {
error "Only items tagged as KEYVAL currently accepted as name-value pairs for table command"
}
lappend pairs $t
} elseif {[llength $t] == 2} {
#!todo - type heuristics
lassign $t n v
lappend pairs [list KEYVAL $n [list STRING $v]]
} else {
error "erlish 'KEYVAL' or simple name-value pairs expected. Unable to handle [llength $t] element list as argument"
}
}
return [list TABLE $name $pairs]
}
#the tomlish root is basically a nameless table representing the root of the document
proc root {args} {
set table [::tomlish::encode::table TOMLISH {*}$args]
set result [lindex $table 2] ;#Take only the key-value pair list
}
#WS = whitepace, US = underscore
proc tomlish {list {context ""}} {
if {![tcl::string::is list $list]} {
error "Supplied 'tomlish' is not a valid Tcl list. Expected a tagged list (parsed Toml)"
}
set toml "" ;#result string
foreach item $list {
set tag [lindex $item 0]
#puts "tomlish::encode::tomlish processing item '$item', tag '$tag'"
#during recursion, some tags require different error checking in different contexts.
set nextcontext $tag ;
#Handle invalid tag nestings
switch -- $context {
QKEYVAL -
KEYVAL {
if {$tag in {KEYVAL QKEYVAL}} {
error "Invalid tag '$tag' encountered within '$context'"
}
}
MULTISTRING {
#explicitly list the valid child tags
if {$tag ni {STRING STRINGPART WS NEWLINE CONT}} {
error "Invalid tag '$tag' encountered within a MULTISTRING"
}
}
default {
#no context, or no defined nesting error for this context
}
}
switch -- $tag {
TOMLISH {
#optional root tag. Ignore.
}
QKEYVAL -
KEYVAL {
if {$tag eq "KEYVAL"} {
append toml [lindex $item 1] ;#Key
} else {
append toml \"[lindex $item 1]\" ;#Quoted Key
}
foreach part [lrange $item 2 end] {
if {$part eq "="} {
append toml "="
} else {
append toml [::tomlish::encode::tomlish [list $part] $nextcontext]
}
}
}
TABLE {
append toml "\[[lindex $item 1]\]" ;#table name
foreach part [lrange $item 2 end] {
append toml [::tomlish::encode::tomlish [list $part] $nextcontext]
}
}
ITABLE {
#inline table - e.g within array or on RHS of keyval/qkeyval
set data ""
foreach part [lrange $item 1 end] {
append data [::tomlish::encode::tomlish [list $part] $nextcontext]
}
append toml "\{$data\}"
}
ARRAY {
set arraystr ""
foreach part [lrange $item 1 end] {
append arraystr [::tomlish::encode::tomlish [list $part] $nextcontext]
}
append toml "\[$arraystr\]"
}
WS {
append toml [lindex $item 1]
}
SEP {
append toml ","
}
NEWLINE {
set chartype [lindex $item 1]
if {$chartype eq "lf"} {
append toml \n
} elseif {$chartype eq "crlf"} {
append toml \r\n
} else {
error "Unrecognized newline type '$chartype'"
}
}
CONT {
#line continuation character "\"
append toml "\\"
}
STRING {
#simple double quoted strings only
#
return \"[lindex $item 1]\"
}
STRINGPART {
return [lindex $item 1]
}
MULTISTRING {
#Tripple quoted string which is a container for newlines,whitespace and multiple strings/stringparts
set multistring "" ;#variable to build up the string
foreach part [lrange $item 1 end] {
append multistring [::tomlish::encode::tomlish [list $part] $nextcontext]
}
append toml "\"\"\"$multistring\"\"\""
}
LITSTRING {
#Single Quoted string(literal string)
append toml '[lindex $item 1]'
}
MULTILITSTRING {
#review - multilitstring can be handled as a single string?
set litstring ""
foreach part [lrange $item 1 end] {
append litstring [::tomlish::encode::tomlish [list $part] $nextcontext]
}
append toml '''$litstring'''
}
INT -
BOOL -
FLOAT -
DATETIME {
append toml [lindex $item 1]
}
INCOMPLETE {
error "cannot process tomlish term tagged as INCOMPLETE"
}
COMMENT {
append toml "#[lindex $item 1]"
}
BOM {
#Byte Order Mark may appear at beginning of a file. Needs to be preserved.
append toml "\uFEFF"
}
default {
error "Not a properly formed 'tomlish' taggedlist.\n '$list'\n Unknown tag '[lindex $item 0]'. See output of \[tomlish::tags\] command."
}
}
}
return $toml
}
#*** !doctools
#[list_end] [comment {--- end definitions namespace tomlish::encode ---}]
}
#fish toml from tomlish
#(encode tomlish as toml)
interp alias {} tomlish::to_toml {} tomlish::encode::tomlish
#
namespace eval tomlish::decode {
#*** !doctools
#[subsection {Namespace tomlish::decode}]
#[para]
#[list_begin definitions]
#return a Tcl list of tomlish tokens
#i.e get a standard list of all the toml terms in string $s
#where each element of the list is a *tomlish* term.. i.e a specially 'tagged' Tcl list.
#(simliar to a tcl 'Huddle' - but also supporting whitespace preservation)
#Note that we deliberately don't check certain things such as duplicate table declarations here.
#Part of the justification for this is that as long as the syntax is toml shaped - we can load files which violate certain rules and allow programmatic manipulation.
# (e.g perhaps a toml editor to highlight violations for fixing)
# A further stage is then necessary to load the tomlish tagged list into a data structure more suitable for efficient query/reading.
# e.g dicts or an object oriented structure
#Note also - *no* escapes in quoted strings are processed. This is up to the datastructure stage
#e.g to_dict will substitute \r \n \uHHHH \UHHHHHHH etc
#This is important for tomlish to maintain the ability to perform competely lossless round-trips from toml to tomlish and back to toml.
# (which is handy for testing as well as editing some part of the structure with absolutely no effect on other parts of the document)
#If we were to unescape a tab character for example
# - we have no way of knowing if it was originally specified as \t \u0009 or \U00000009 or directly as a tab character.
# For this reason, we also do absolutely no line-ending transformations based on platform.
# All line-endings are maintained as is, and even a file with mixed cr crlf line-endings will be correctly interpreted and can be 'roundtripped'
proc toml {s} {
#*** !doctools
#[call [fun toml] [arg s]]
#[para] return a Tcl list of tomlish tokens
namespace upvar ::tomlish::parse is_parsing is_parsing
set is_parsing 1
if {[info command ::tomlish::parse::spacestack] eq "::tomlish::parse::spacestack"} {
tomlish::parse::spacestack destroy
}
struct::stack ::tomlish::parse::spacestack
namespace upvar ::tomlish::parse last_space_action last_space_action
namespace upvar ::tomlish::parse last_space_type last_space_type
namespace upvar ::tomlish::parse tok tok
set tok ""
namespace upvar ::tomlish::parse type type
namespace upvar ::tomlish::parse tokenType tokenType
::tomlish::parse::set_tokenType ""
namespace upvar ::tomlish::parse tokenType_list tokenType_list
set tokenType [list] ;#Flat (un-nested) list of tokentypes found
namespace upvar ::tomlish::parse lastChar lastChar
set lastChar ""
set result ""
namespace upvar ::tomlish::parse nest nest
set nest 0
namespace upvar ::tomlish::parse v v ;#array keyed on nest level
set v(0) {TOMLISH}
array set s0 [list] ;#whitespace data to go in {SPACE {}} element.
set parentlevel 0
namespace upvar ::tomlish::parse i i
set i 0
namespace upvar ::tomlish::parse state state
namespace upvar ::tomlish::parse braceCount braceCount
set barceCount 0
namespace upvar ::tomlish::parse bracketCount bracketCount
set bracketCount 0
set sep 0
set r 1
namespace upvar ::tomlish::parse token_waiting token_waiting
set token_waiting [dict create] ;#if ::tok finds a *complete* second token during a run, it will put the 2nd one here to be returned by the next call.
set state "key-space"
::tomlish::parse::spacestack push {space key-space}
namespace upvar ::tomlish::parse linenum linenum;#'line number' of input data. (incremented for each literal linefeed - but not escaped ones in data)
set linenum 1
try {
while {$r} {
set r [::tomlish::parse::tok $s]
#puts stdout "got tok: '$tok' while parsing string '$s' "
set next_tokenType_known 0 ;#whether we begin a new token here based on what terminated the token result of 'tok'
#puts "got token: '$tok' tokenType='$tokenType'. while v($nest) = [set v($nest)]"
#puts "-->tok: $tok tokenType='$tokenType'"
set prevstate $state
#####
set nextstate [::tomlish::parse::getNextState $tokenType $prevstate]
::tomlish::log::info "tok: $tok STATE TRANSITION tokenType: '$tokenType' triggering '$state' -> '$nextstate' last_space_action:$last_space_action"
set state $nextstate
if {$state eq "err"} {
error "State error - aborting parse. [tomlish::parse::report_line]"
}
if {$last_space_action eq "pop"} {
#pop_trigger_tokens: newline tablename endarray endinlinetable
#note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger.
switch -exact -- $tokenType {
newline {
incr linenum
lappend v($nest) [list NEWLINE $tok]
}
tablename {
#note: a tablename only 'pops' if we are greater than zero
error "tablename pop should already have been handled as special case zeropoppushspace in getNextState"
}
tablearrayname {
#!review - tablearrayname different to tablename regarding push/pop?
#note: a tablename only 'pops' if we are greater than zero
error "tablearrayname pop should already have been handled as special case zeropoppushspace in getNextState"
}
endarray {
#nothing to do here.
}
comma {
#comma for inline table will pop the keyvalue space
lappend v($nest) "SEP"
}
endinlinetable {
puts stderr "endinlinetable"
}
endmultiquote {
puts stderr "endmultiquote for last_space_action 'pop'"
}
default {
error "unexpected tokenType '$tokenType' for last_space_action 'pop'"
}
}
set parentlevel [expr {$nest -1}]
lappend v($parentlevel) [set v($nest)]
incr nest -1
} elseif {$last_space_action eq "push"} {
incr nest 1
set v($nest) [list]
# push_trigger_tokens: barekey quotedkey startinlinetable startarray tablename tablearrayname
switch -exact -- $tokenType {
barekey {
set v($nest) [list KEYVAL $tok] ;#$tok is the keyname
}
quotedkey - itablequotedkey {
set v($nest) [list QKEYVAL $tok] ;#$tok is the keyname
}
tablename {
#note: we do not use the output of tomlish::tablename_trim to produce a tablename for storage in the tomlish list!
#The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish
# back to toml file will be identical.
#It is up to the datastructure stage to normalize and interpret tomlish for programmatic access.
# we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level,
# so we can raise an error at this point rather than create a tomlish list with obviously invalid table names.
#todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames,
# so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the
# tomlish list?
set test_only [::tomlish::utils::tablename_trim $tok]
::tomlish::log::debug "trimmed (but not normalized) tablename: '$test_only'"
set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name
#note also that equivalent tablenames may have different toml representations even after being trimmed!
#e.g ["x\t\t"] & ["x "] (tab escapes vs literals)
#These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage.
}
tablearrayname {
set test_only [::tomlish::utils::tablename_trim $tok]
puts stdout "trimmed (but not normalized) tablearrayname: '$test_only'"
set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name
}
startarray {
set v($nest) [list ARRAY] ;#$tok is just the opening bracket - don't output.
}
startinlinetable {
set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output.
}
startmultiquote {
puts stderr "push trigger tokenType startmultiquote (todo)"
set v($nest) [list MULTISTRING] ;#container for STRINGPART, NEWLINE
#JMN ???
#set next_tokenType_known 1
#::tomlish::parse::set_tokenType "multistring"
#set tok ""
}
default {
error "push trigger tokenType '$tokenType' not yet implemented"
}
}
} else {
#no space level change
switch -exact -- $tokenType {
starttablename {
#$tok is triggered by the opening bracket and sends nothing to output
}
starttablearrayname {
#$tok is triggered by the double opening brackets and sends nothing to output
}
tablename - tablenamearray {
error "did not expect 'tablename/tablearrayname' without space level change"
#set v($nest) [list TABLE $tok]
}
endtablename - endtablearrayname {
#no output into the tomlish list for this token
}
startinlinetable {
puts stderr "decode::toml error. did not expect startlinetable without space level change"
}
startquote {
switch -exact -- $nextstate {
string {
set next_tokenType_known 1
::tomlish::parse::set_tokenType "string"
set tok ""
}
quotedkey {
set next_tokenType_known 1
::tomlish::parse::set_tokenType "quotedkey"
set tok ""
}
itablequotedkey {
set next_tokenType_known 1
::tomlish::parse::set_tokenType "itablequotedkey"
set tok ""
}
default {
error "startquote switch case not implemented for nextstate: $nextstate"
}
}
}
startmultiquote {
#review
puts stderr "no space level change - got startmultiquote"
set next_tokenType_known 1
::tomlish::parse::set_tokenType "stringpart"
set tok ""
}
endquote {
#nothing to do?
set tok ""
}
endmultiquote {
#JMN!!
set tok ""
}
string {
lappend v($nest) [list STRING $tok]
}
stringpart {
lappend v($nest) [list STRINGPART $tok]
}
multistring {
#review
lappend v($nest) [list MULTISTRING $tok]
}
quotedkey {
#lappend v($nest) [list QKEY $tok] ;#TEST
}
itablequotedkey {
}
untyped-value {
#we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read.
if {$tok in {true false}} {
set tag BOOL
} elseif {[::tomlish::utils::is_int $tok]} {
set tag INT
} elseif {[::tomlish::utils::is_float $tok]} {
set tag FLOAT
} elseif {[::tomlish::utils::is_datetime $tok]} {
set tag DATETIME
} else {
error "Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. [tomlish::parse::report_line]"
}
lappend v($nest) [list $tag $tok]
}
comment {
#puts stdout "----- comment token returned '$tok'------"
lappend v($nest) [list COMMENT "$tok"]
}
equal {
#we append '=' to the nest so that any surrounding whitespace is retained.
lappend v($nest) =
}
comma {
lappend v($nest) SEP
}
newline {
incr linenum
lappend v($nest) [list NEWLINE $tok]
}
whitespace {
lappend v($nest) [list WS $tok]
}
continuation {
lappend v($nest) CONT
}
bom {
lappend v($nest) BOM
}
eof {
#ok - nothing more to add to the tomlish list.
#!todo - check previous tokens are complete/valid?
}
default {
error "unknown tokenType '$tokenType' [::tomlish::parse::report_line]"
}
}
}
if {!$next_tokenType_known} {
::tomlish::parse::set_tokenType ""
set tok ""
}
if {$state eq "end"} {
break
}
}
#while {$nest > 0} {
# lappend v([expr {$nest -1}]) [set v($nest)]
# incr nest -1
#}
while {[::tomlish::parse::spacestack size] > 1} {
::tomlish::parse::spacestack pop
lappend v([expr {$nest -1}]) [set v($nest)]
incr nest -1
#set parent [spacestack peek] ;#the level being appended to
#lassign $parent type state
#if {$type eq "space"} {
#
#} elseif {$type eq "buffer"} {
# lappend v([expr {$nest -1}]) {*}[set v($nest)]
#} else {
# error "invalid spacestack item: $parent"
#}
}
} finally {
set is_parsing 0
}
return $v(0)
}
#*** !doctools
#[list_end] [comment {--- end definitions namespace tomlish::decode ---}]
}
#decode toml to tomlish
interp alias {} tomlish::from_toml {} tomlish::decode::toml
namespace eval tomlish::utils {
#*** !doctools
#[subsection {Namespace tomlish::utils}]
#[para]
#[list_begin definitions]
#tablenames (& tablearraynames) may contain irrelevant leading, trailing and interspersed whitespace
# tablenames can be made up of segments delimited by dots. .eg [ a.b . c ]
#trimmed, the tablename becomes {a.b.c}
# A segment may contain whitespace if it is quoted e.g [a . b . "c etc " ]
#ie whitespace is only irrelevant if it's outside a quoted segment
#trimmed, the tablename becomes {a.b."c etc "}
proc tablename_trim {tablename} {
set segments [tablename_split $tablename false]
set trimmed_segments [list]
foreach seg $segments {
lappend trimmed_segments [::string trim $seg [list " " \t]]
}
return [join $trimmed_segments .]
}
#utils::tablename_split
proc tablename_split {tablename {normalize false}} {
#we can't just split on . because we have to handle quoted segments which may contain a dot.
#eg {dog."tater.man"}
set i 0
set sLen [::string length $tablename]
set segments [list]
set mode "unknown" ;#5 modes: unknown, quoted,litquoted, unquoted, syntax
#quoted is for double-quotes, litquoted is for single-quotes (string literal)
set seg ""
for {} {$i < $sLen} {} {
if {$i > 0} {
set lastChar [::string index $tablename [expr {$i - 1}]]
} else {
set lastChar ""
}
set c [::string index $tablename $i]
incr i
if {$c eq "."} {
switch -exact -- $mode {
unquoted {
#dot marks end of segment.
lappend segments $seg
set seg ""
set mode "unknown"
}
quoted {
append seg $c
}
unknown {
lappend segments $seg
set seg ""
}
litquoted {
append seg $c
}
default {
#mode: syntax
#we got our dot. - the syntax mode is now satisfied.
set mode "unknown"
}
}
} elseif {($c eq "\"") && ($lastChar ne "\\")} {
if {$mode eq "unknown"} {
if {[::string trim $seg] ne ""} {
#we don't allow a quote in the middle of a bare key
error "tablename_split. character '\"' invalid at this point in tablename. tablename: '$tablename'"
}
set mode "quoted"
set seg "\""
} elseif {$mode eq "unquoted"} {
append seg $c
} elseif {$mode eq "quoted"} {
append seg $c
lappend segments $seg
set seg ""
set mode "syntax" ;#make sure we only accept a dot or end-of-data now.
} elseif {$mode eq "litquoted"} {
append seg $c
} elseif {$mode eq "syntax"} {
error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'"
}
} elseif {($c eq "\'")} {
if {$mode eq "unknown"} {
append seg $c
set mode "litquoted"
} elseif {$mode eq "unquoted"} {
#single quote inside e.g o'neill
append seg $c
} elseif {$mode eq "quoted"} {
append seg $c
} elseif {$mode eq "litquoted"} {
append seg $c
lappend segments $seg
set seg ""
set mode "syntax"
} elseif {$mode eq "syntax"} {
error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'"
}
} elseif {$c in [list " " \t]} {
if {$mode eq "syntax"} {
#ignore
} else {
append seg $c
}
} else {
if {$mode eq "syntax"} {
error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'"
}
if {$mode eq "unknown"} {
set mode "unquoted"
}
append seg $c
}
if {$i == $sLen} {
#end of data
::tomlish::log::debug "End of data: mode='$mode'"
switch -exact -- $mode {
quoted {
if {$c ne "\""} {
error "tablename_split. missing closing double-quote in a segment. tablename: '$tablename'"
}
if {$normalize} {
lappend segments $seg
} else {
lappend segments [::tomlish::utils::unescape_string [::string range $seg 1 end-1]]
#lappend segments [subst -nocommands -novariables [::string range $seg 1 end-1]] ;#wrong
}
}
litquoted {
set trimmed_seg [::string trim $seg]
if {[::string index $trimmed_seg end] ne "\'"} {
error "tablename_split. missing closing single-quote in a segment. tablename: '$tablename'"
}
lappend segments $seg
}
unquoted - unknown {
lappend segments $seg
}
syntax {
#ok - segment already lappended
}
default {
lappend segments $seg
}
}
}
}
foreach seg $segments {
set trimmed [::string trim $seg [list " " \t]]
#note - we explicitly allow 'empty' quoted strings '' & ""
# (these are 'discouraged' but valid toml keys)
#if {$trimmed in [list "''" "\"\""]} {
# puts stderr "tablename_split. warning - Empty quoted string as tablename segment"
#}
if {$trimmed eq "" } {
error "tablename_split. Empty segment found. tablename: '$tablename'"
}
}
return $segments
}
proc unicode_escape_info {slashu} {
#!todo
# validate that slashu is either a \uxxxx or \Uxxxxxxxx value of the correct length and
# is a valid 'unicode scalar value'
# ie integers in the range 0 to D7FF16 and E00016 to 10FFFF16 inclusive
#expr {(($x >= 0) && ($x <= 0xD7FF16)) || (($x >= 0xE00016) && ($x <= 0x10FFFF16))}
if {[::string match {\\u*} $slashu]} {
set exp {^\\u([0-9a-fA-F]{4}$)}
if {[regexp $exp $slashu match hex]} {
if {[scan $hex %4x dec] != 1} {
#why would a scan ever fail after matching the regexp? !todo - review. unreachable branch?
return [list err [list reason "Failed to convert '$hex' to decimal"]]
} else {
return [list ok [list char [subst -nocommand -novariable $slashu]]]
}
} else {
return [list err [list reason "Supplied string not of the form \\uHHHH where H in \[0-9a-fA-F\]"]]
}
} elseif {[::string match {\\U*} $slashu]} {
set exp {^\\U([0-9a-fA-F]{8}$)}
if {[regexp $exp $slashu match hex]} {
if {[scan $hex %8x dec] != 1} {
#why would a scan ever fail after matching the regexp? !todo - review. unreachable branch?
return [list err [list reason "Failed to convert '$hex' to decimal"]]
} else {
if {(($dec >= 0) && ($dec <= 0xD7FF16)) || (($dec >= 0xE00016) && ($dec <= 0x10FFFF16))} {
return [list ok [list char [subst -nocommand -novariable $slashu]]]
} else {
return [list err [list reason "$slashu is not within the 'unicode scalar value' ranges 0 to 0xD7FF16 or 0xE00016 to 0x10FFFF16"]]
}
}
} else {
return [list err [list reason "Supplied string not of the form \\UHHHHHHHH where H in \[0-9a-fA-F\]"]]
}
} else {
return [list err [list reason "Supplied string did not start with \\u or \\U" ]]
}
}
proc unescape_string {str} {
#note we can't just use Tcl subst because:
# it also transforms \a (audible bell) and \v (vertical tab) which are not in the toml spec.
# it would strip out backslashes inappropriately: e.g "\j" becomes just j
# it recognizes other escapes which aren't approprite e.g \xhh and octal \nnn
# it replaces\<newline><whitespace> with a single whitespace
#This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh
set buffer ""
set buffer4 "" ;#buffer for 4 hex characters following a \u
set buffer8 "" ;#buffer for 8 hex characters following a \u
set sLen [::string length $str]
#we need to handle arbitrarily long sequences of backslashes. \\\\\ etc
set slash_active 0
set unicode4_active 0
set unicode8_active 0
#!todo - check for invalid data in the form of a raw carriage return (decimal 13) without following linefeed?
set i 0
for {} {$i < $sLen} {} {
if {$i > 0} {
set lastChar [::string index $str [expr {$i - 1}]]
} else {
set lastChar ""
}
set c [::string index $str $i]
::tomlish::log::debug "unescape_string. got char $c"
scan $c %c n
if {($n <= 31) && ($n != 9) && ($n != 10) && ($n != 13)} {
#we don't expect unescaped unicode characters from 0000 to 001F -
#*except* for raw tab (which is whitespace) and newlines
error "unescape_string. Invalid data for a toml string. Unescaped control character (decimal $n) [::tomlish::utils::string_to_slashu $c]"
}
incr i ;#must incr here because we do'returns'inside the loop
if {$c eq "\\"} {
if {$slash_active} {
append buffer "\\"
set slash_active 0
} elseif {$unicode4_active} {
error "unescape_string. unexpected case slash during unicode4 not yet handled"
} elseif {$unicode8_active} {
error "unescape_string. unexpected case slash during unicode8 not yet handled"
} else {
# don't output anything (yet)
set slash_active 1
}
} else {
if {$unicode4_active} {
if {[::string length $buffer4] < 4} {
append buffer4 $c
}
if {[::string length $buffer4] == 4} {
#we have a \uHHHH to test
set unicode4_active 0
set result [tomlish::utils::unicode_escape_info "\\u$buffer4"]
if {[lindex $result 0] eq "ok"} {
append buffer [dict get $result ok char]
} else {
error "unescape_string error: [lindex $result 1]"
}
}
} elseif {$unicode8_active} {
if {[::string length $buffer8] < 8} {
append buffer8 $c
}
if {[::string length $buffer8] == 8} {
#we have a \UHHHHHHHH to test
set unicode8_active 0
set result [tomlish::utils::unicode_escape_info "\\U$buffer8"]
if {[lindex $result 0] eq "ok"} {
append buffer [dict get $result ok char]
} else {
error "unescape_string error: [lindex $result 1]"
}
}
} elseif {$slash_active} {
set slash_active 0
set ctest [string map {{"} dq} $c]
switch -exact -- $ctest {
dq {
set e "\\\""
append buffer [subst -nocommand -novariable $e]
}
b - t - n - f - r {
set e "\\$c"
append buffer [subst -nocommand -novariable $e]
}
u {
set unicode4_active 1
set buffer4 ""
}
U {
set unicode8_active 1
set buffer8 ""
}
default {
set slash_active 0
append buffer "\\"
append buffer $c
}
}
} else {
append buffer $c
}
}
}
#puts stdout "EOF 4:$unicode4_active 8:$unicode8_active slash:$slash_active"
if {$unicode4_active} {
error "End of string reached before complete unicode escape sequence \uHHHH"
}
if {$unicode8_active} {
error "End of string reached before complete unicode escape sequence \UHHHHHHHH"
}
if {$slash_active} {
append buffer "\\"
}
return $buffer
}
proc normalize_key {rawkey} {
set c1 [::string index $rawkey 0]
set c2 [::string index $rawkey end]
if {($c1 eq "'") && ($c2 eq "'")} {
#single quoted segment. No escapes allowed within it.
set key [::string range $rawkey 1 end-1]
} elseif {($c1 eq "\"") && ($c2 eq "\"")} {
#double quoted segment. Apply escapes.
#
set keydata [::string range $rawkey 1 end-1] ;#strip outer quotes only
set key [::tomlish::utils::unescape_string $keydata]
#set key [subst -nocommands -novariables $keydata] ;#wrong. Todo - create a string escape substitution function.
} else {
set key $rawkey
}
return $key
}
proc string_to_slashu {string} {
set rv {}
foreach c [split $string {}] {
scan $c %c c
append rv {\u}
append rv [format %.4X $c]
}
return $rv
}
#'nonprintable' is conservative here because some systems (e.g windows console) are very limited in what they can display.
proc nonprintable_to_slashu {s} {
set res ""
foreach i [split $s ""] {
scan $i %c c
set printable 0
if {($c>31) && ($c<127)} {
set printable 1
}
if {$printable} {append res $i} else {append res \\u[format %.4X $c]}
}
set res
} ;#RS
#check if str is valid for use as a toml bare key
proc is_barekey {str} {
if {[::string length $str] == 0} {
return 0
} else {
set matches [regexp -all {[a-zA-Z0-9\_\-]} $str]
if {[::string length $str] == $matches} {
#all characters match the regexp
return 1
} else {
return 0
}
}
}
#test only that the characters in str are valid for the toml specified type 'integer'.
proc int_validchars1 {str} {
set numchars [::string length $str]
if {[regexp -all {[0-9\_\-\+]} $str] == $numchars} {
return 1
} else {
return 0
}
}
#add support for hex,octal,binary 0x.. 0o.. 0b...
proc int_validchars {str} {
set numchars [::string length $str]
if {[regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] == $numchars} {
return 1
} else {
return 0
}
}
proc is_int {str} {
set matches [regexp -all {[0-9\_xo\-\+A-Fa-f]} $str]
if {[tcl::string::length $str] == $matches} {
#all characters in legal range
# ---------------------------------------
#check for leading zeroes in non 0x 0b 0o
#first strip any +, - or _ (just for this test)
set check [::string map {+ "" - "" _ ""} $str]
if {([tcl::string::length $check] > 1) && ([tcl::string::index $check 0] eq "0") && ([tcl::string::index $check 1] ni {o x b})} {
return 0
}
# ---------------------------------------
#check +,- only occur in the first position.
if {[::string last - $str] > 0} {
return 0
}
if {[::string last + $str] > 0} {
return 0
}
set numeric_value [::string map {_ ""} $str] ;#allow some earlier tcl versions which don't support underscores
#use Tcl's integer check to ensure we don't let things like 3e4 through - which is a float (would need to be 0x3e4 for hex)
if {![string is integer -strict $numeric_value]} {
return 0
}
#!todo - check bounds only based on some config value
#even though Tcl can handle bignums, we won't accept anything outside of toml 1.0 minimum requirements.
#presumably very large numbers would have to be supplied in a toml file as strings.
#Review - toml 1.0 only says that it must handle up to 2^63 - not that this is a max
if {$numeric_value > $::tomlish::max_int} {
return 0
}
if {$numeric_value < $::tomlish::min_int} {
return 0
}
} else {
return 0
}
#Got this far - didn't find anything wrong with it.
return 1
}
#test only that the characters in str are valid for the toml specified type 'float'.
proc float_validchars {str} {
set numchars [::string length $str]
if {[regexp -all {[eE0-9\_\-\+\.]} $str] == $numchars} {
return 1
} else {
#only allow lower case for these special values - as per Toml 1.0 spec
if {$str ni {inf +inf -inf nan +nan -nan}} {
return 0
} else {
return 1
}
}
}
proc is_float {str} {
set matches [regexp -all {[eE0-9\_\-\+\.]} $str]
#don't test for case variations - as Toml doesn't allow (whereas Tcl allows Inf NaN etc)
if {$str in {inf +inf -inf nan +nan -nan}} {
return 1
}
if {[::string length $str] == $matches} {
#all characters in legal range
#A leading zero is ok, but we should disallow multiple leading zeroes (same rules as toml ints)
#Toml spec also disallows leading zeros in the exponent part
#... but this seems less interoperable (some libraries generate leading zeroes in exponents)
#for now we will allow leading zeros in exponents
#!todo - configure 'strict' option to disallow?
#first strip any +, - or _ (just for this test)
set check [::string map {+ "" - "" _ ""} $str]
set r {([0-9])*}
regexp $r $check intpart ;#intpart holds all numerals before the first .,e or E
set z {([0])*}
regexp $z $intpart leadingzeros
if {[::string length $leadingzeros] > 1} {
return 0
}
#for floats, +,- may occur in multiple places
#e.g -2E-22 +3e34
#!todo - check bounds ?
#strip underscores for tcl double check
set check [::string map {_ ""} $str]
#string is double accepts inf nan +NaN etc.
if {![::string is double $check]} {
return 0
}
} else {
return 0
}
#Got this far - didn't find anything wrong with it.
return 1
}
#test only that the characters in str are valid for the toml specified type 'datetime'.
proc datetime_validchars {str} {
set numchars [::string length $str]
if {[regexp -all {[zZtT0-9\-\+\.:]} $str] == $numchars} {
return 1
} else {
return 0
}
}
proc is_datetime {str} {
#e.g 1979-05-27T00:32:00-07:00
set matches [regexp -all {[zZtT0-9\-\+\.:]} $str]
if {[::string length $str] == $matches} {
#all characters in legal range
#!todo - use full RFC 3339 parser?
lassign [split $str T] datepart timepart
#!todo - what if the value is 'time only'?
if {[catch {clock scan $datepart} err]} {
puts stderr "tcl clock scan failed err:'$err'"
return 0
}
#!todo - verify time part is reasonable
} else {
return 0
}
return 1
}
#*** !doctools
#[list_end] [comment {--- end definitions namespace tomlish::utils ---}]
}
namespace eval tomlish::parse {
#*** !doctools
#[subsection {Namespace tomlish::parse}]
#[para]
#[list_begin definitions]
variable is_parsing 0 ;#whether we are in the middle of parsing tomlish text
variable state
# states:
# key-space, curly-space, array-space
# value-expected, keyval-syntax, doublequoted, singlequoted, quotedkey, string, multistring
#
# notes:
# key-space i
# only the -space states are also 'spaces' ie a container which is pushed/popped on the spacestack
# value-expected - we also allow for leading whitespace in this state, but once a value is returned we jump to a state based on the containing space. e.g keytail or array-syntax
#
#stateMatrix defines for each state, actions to take for each possible token.
#single-element actions are the name of the next state into which to transition, or a 'popspace' command to pop a level off the spacestack and add the data to the parent container.
#dual-element actions are a push command and the name of the space to push on the stack.
# - pushspace is a simple push onto the spacestack, zeropoppushspace also pushes, but will first do a pop *if* the current space level is greater than zero (ie if only if not already in root key-space)
#test
variable stateMatrix
set stateMatrix [dict create]
dict set stateMatrix\
key-space {whitespace "key-space" newline "key-space" bom "key-space" barekey {pushspace "keyval-space"} eof "end" startquote "quotedkey" startmultiquote "err" endquote "err" comment "key-space" comma "err" starttablename "tablename" starttablearrayname "tablearrayname"}
dict set stateMatrix\
curly-space {\
whitespace "curly-space"\
newline "curly-space"\
barekey {pushspace "itablekeyval-space"}\
itablequotedkey "itablekeyval-space"\
endinlinetable "popspace"\
startquote "itablequotedkey"\
comma "curly-space"\
eof "err"\
comment "err"\
}
#REVIEW
#toml spec looks like heading towards allowing newlines within inline tables
#https://github.com/toml-lang/toml/issues/781
dict set stateMatrix\
curly-syntax {\
whitespace "curly-syntax"\
newline "curly-syntax"\
barekey {pushspace "itablekeyval-space"}\
itablequotedkey "itablekeyval-space"\
endinlinetable "popspace"\
startquote "itablequotedkey"\
comma "curly-space"\
eof "err"\
comment "err"\
}
dict set stateMatrix\
value-expected {\
whitespace "value-expected"\
newline "err"\
eof "err"\
untyped-value "samespace"\
startquote "string"\
startmultiquote {pushspace "multistring-space"}\
startinlinetable {pushspace curly-space}\
comment "err"\
comma "err"\
startarray {pushspace array-space}\
}
dict set stateMatrix\
array-space {\
whitespace "array-space"\
newline "array-space"\
eof "err"\
untyped-value "samespace"\
startarray {pushspace "array-space"}\
endarray "popspace"\
startquote "string"\
startmultiquote "multistring"\
comma "array-space"\
comment "array-space"\
}
dict set stateMatrix\
array-syntax {\
whitespace "array-syntax"\
newline "array-syntax"\
untyped-value "samespace"\
startarray {pushspace array-space}\
endarray "popspace"\
startquote "string"\
startmultiquote "multistring"\
comma "array-space"\
comment "err"\
}
dict set stateMatrix\
itablekeyval-syntax {whitespace "itablekeyval-syntax" endquote "itablekeyval-syntax" newline "err" equal "value-expected" eof "err"}
#dict set stateMatrix\
# itablekeytail {whitespace "itablekeytail" endinlinetable "popspace" comma "popspace" newline "err" comment "err" eof "err"}
dict set stateMatrix\
itablevaltail {whitespace "itablevaltail" endinlinetable "popspace" comma "popspace" newline "err" comment "err" eof "err"}
dict set stateMatrix\
itablekeyval-space {}
dict set stateMatrix\
itablequotedkey {whitespace "NA" itablequotedkey {pushspace "itablekeyval-space"} newline "err" endquote "itablekeyval-syntax"}
dict set stateMatrix\
keyval-syntax {whitespace "keyval-syntax" endquote "keyval-syntax" comma "err" newline "err" equal "value-expected" eof "err"}
dict set stateMatrix\
keytail {whitespace "keytail" newline "popspace" comment "keytail" eof "end"}
dict set stateMatrix\
keyval-space {}
dict set stateMatrix\
quotedkey {whitespace "NA" quotedkey {pushspace "keyval-space"} newline "err" endquote "keyval-syntax"}
dict set stateMatrix\
string {whitespace "NA" newline "err" string "string" endquote "samespace" eof "err"}
dict set stateMatrix\
stringpart {eof "err" continuation "samespace" endmultiquote "popspace"}
dict set stateMatrix\
multistring {whitespace "NA" newline "NA" multistring "multistring" endmultiquote "samespace" endquote "err" eof "err"}
dict set stateMatrix\
multistring-space {whitespace "multistring-space" continuation "multistring-space" stringpart "multistring-space" multistring "multistring-space" newline "multistring-space" eof "err" endmultiquote "popspace"}
dict set stateMatrix\
tablename {whitespace "NA" tablename {zeropoppushspace key-space} tablename2 {pushspace key-space} newline "err" endtablename "tablenametail"}
dict set stateMatrix\
baretablename {whitespace "NA" newline "err" equal "value-expected"}
dict set stateMatrix\
tablenametail {whitespace "tablenametail" newline "key-space" comment "tablenametail" eof "end"}
dict set stateMatrix\
tablearrayname {whitespace "NA" tablearrayname {zeropoppushspace key-space} tablearrayname2 {pushspace key-space} newline "err" endtablearray "tablearraynametail"}
dict set stateMatrix\
tablearraynametail {whitespace "tablearraynametail" newline "key-space" comment "tablearraynametail" eof "end"}
dict set stateMatrix\
end {}
#see also spacePopTransitions and spacePushTransitions below for state redirections on pop/push
variable stateMatrix_orig {
key-space {whitespace "key-space" newline "key-space" bom "key-space" barekey {pushspace "keyval-space"} eof "end" startquote "quotedkey" startmultiquote "err" endquote "err" comment "key-space" comma "err" starttablename "tablename" starttablearrayname "tablearrayname"}
curly-space {whitespace "curly-space" newline "curly-space" barekey {pushspace "keyval-space"} quotedkey {pushspace "keyval-space"} startcurly {pushspace curly-space} endcurly "popspace" eof "err"}
value-expected {whitespace "value-expected" newline "err" eof "err" untyped-value "samespace" startquote "string" startmultiquote {pushspace "multistring-space"} comment "err" comma "err" startarray {pushspace array-space}}
array-space {whitespace "array-space" newline "array-space" eof "err" untyped-value "samespace" startarray {pushspace "array-space"} endarray "popspace" startquote "string" startmultiquote "multistring" comma "array-space" comment "array-space"}
array-syntax {whitespace "array-syntax" newline "array-syntax" comma "array-space" untyped-value "samespace" startquote "string" startmultiquote "multistring" startarray {pushspace array-space} comment "err" endarray "popspace"}
keyval-syntax {whitespace "keyval-syntax" endquote "keyval-syntax" newline "err" equal "value-expected" eof "err"}
keytail {whitespace "keytail" newline "popspace" comment "keytail" eof "end"}
keyval-space {}
quotedkey {whitespace "NA" quotedkey {pushspace "keyval-space"} newline "err" endquote "keyval-syntax"}
string {whitespace "NA" newline "err" string "string" endquote "samespace" eof "err"}
stringpart {eof "err" continuation "samespace" endmultiquote "popspace"}
multistring {whitespace "NA" newline "NA" multistring "multistring" endmultiquote "samespace" endquote "err" eof "err"}
multistring-space {whitespace "multistring-space" continuation "multistring-space" stringpart "multistring-space" newline "multistring-space" eof "err" endmultiquote ""}
tablename {whitespace "NA" tablename {zeropoppushspace key-space} tablename2 {pushspace key-space} newline "err" endtablename "tablenametail"}
tablenametail {whitespace "tablenametail" newline "key-space" comment "tablenametail" eof "end"}
tablearrayname {whitespace "NA" tablearrayname {zeropoppushspace key-space} tablearrayname2 {pushspace key-space} newline "err" endtablearray "tablearraynametail"}
tablearraynametail {whitespace "tablearraynametail" newline "key-space" comment "tablearraynametail" eof "end"}
end {}
}
#build a list of 'push triggers' from the stateMatrix
# ie tokens which can push a new space onto spacestack
set push_trigger_tokens [list]
tcl::dict::for {s transitions} $stateMatrix {
tcl::dict::for {token transition_to} $transitions {
set action [lindex $transition_to 0]
switch -exact -- $action {
pushspace - zeropoppushspace {
if {$token ni $push_trigger_tokens} {
lappend push_trigger_tokens $token
}
}
}
}
}
puts stdout "push_trigger_tokens: $push_trigger_tokens"
#!todo - hard code once stateMatrix finalised?
#mainly for the -space states:
#redirect to another state $c based on a state transition from $whatever to $b
# e.g "string {array-space array-syntax}" means when transitioning from string to array-space, jump to array-syntax instead.
#this is useful as we often don't know state $b. e.g when it is decided by 'popspace'
variable spacePopTransitions {
array-space array-syntax
curly-space curly-syntax
keyval-space keytail
itablekeyval-space itablevaltail
}
variable spacePushTransitions {
keyval-space keyval-syntax
itablekeyval-space itablekeyval-syntax
array-space array-space
curly-space curly-space
key-space tablename
}
variable state_list
namespace export tomlish toml
namespace ensemble create
proc getNextState {tokentype currentstate} {
variable nest
variable v
variable spacePopTransitions
variable spacePushTransitions
variable last_space_action "none"
variable last_space_type "none"
variable state_list
set result ""
if {[dict exists $::tomlish::parse::stateMatrix $currentstate $tokentype]} {
set transition_to [dict get $::tomlish::parse::stateMatrix $currentstate $tokentype]
::tomlish::log::info "getNextState tokentype:$tokentype currentstate:$currentstate : transition_to = $transition_to"
switch -exact -- [lindex $transition_to 0] {
popspace {
spacestack pop
set parent [spacestack peek]
lassign $parent type target
set last_space_action "pop"
set last_space_type $type
if {[dict exists $::tomlish::parse::spacePopTransitions $target]} {
set next [dict get $::tomlish::parse::spacePopTransitions $target]
::tomlish::log::debug "--->> pop transition to space $target redirected state to $next <<---"
} else {
set next $target
}
set result $next
}
samespace {
#note the same data as popspace (spacePopTransitions) is used here.
set parent [spacestack peek]
::tomlish::log::debug ">>>>>>>>> got parent $parent <<<<<"
lassign $parent type target
if {[dict exists $::tomlish::parse::spacePopTransitions $target]} {
set next [dict get $::tomlish::parse::spacePopTransitions $target]
::tomlish::log::debug "--->> samespace transition to space $target redirected state to $next <<---"
} else {
set next $target
}
set result $next
}
zeropoppushspace {
if {$nest > 0} {
#pop back down to the root level (key-space)
spacestack pop
set parent [spacestack peek]
lassign $parent type target
set last_space_action "pop"
set last_space_type $type
#-----
#standard pop
set parentlevel [expr {$nest -1}]
lappend v($parentlevel) [set v($nest)]
incr nest -1
#-----
}
#re-entrancy
#set next [list pushspace [lindex $transition_to 1]]
set nexttokentype ${tokentype}2 ;#fake token type e.g tablename2 or tablearrayname2
::tomlish::log::notice "REENTRANCY!!! calling getNextState $nexttokentype $tokentype"
set result [::tomlish::parse::getNextState $nexttokentype $tokentype]
}
pushspace {
set target [lindex $transition_to 1]
spacestack push [list space $target]
set last_space_action "push"
set last_space_type "space"
#puts $::tomlish::parse::spacePushTransitions
if {[dict exists $::tomlish::parse::spacePushTransitions $target]} {
set next [dict get $::tomlish::parse::spacePushTransitions $target]
::tomlish::log::info "--->> push transition to space $target redirected state to $next <<---"
} else {
set next $target
}
set result $next
}
default {
set result $transition_to
}
}
} else {
set result "nostate-err"
}
lappend state_list $result
return $result
}
proc report_line {{line ""}} {
variable linenum
variable is_parsing
if {$is_parsing} {
if {$line eq ""} {
set line $linenum
}
return "Line Number: $line"
} else {
#not in the middle of parsing tomlish text - return nothing.
return ""
}
}
#produce a *slightly* more readable string rep of the nest for puts etc.
proc nest_pretty1 {list} {
set prettier "{"
foreach el $list {
if { [lindex $el 0] eq "NEWLINE"} {
append prettier "[list $el]\n"
} elseif {([llength $el] > 1) && ([lindex $el 0] in {KEYVAL QKEYVAL TABLE ARRAY})} {
append prettier [nest_pretty1 $el]
} else {
append prettier "[list $el] "
}
}
append prettier "}"
return $prettier
}
proc set_tokenType {t} {
variable tokenType
variable tokenType_list
if {![info exists tokenType]} {
set tokenType ""
}
lappend tokenType_list $t
set tokenType $t
}
proc switch_tokenType {t} {
variable tokenType
variable tokenType_list
lset tokenType_list end $t
set tokenType $t
}
proc get_tokenType {} {
variable tokenType
return $tokenType
}
proc _shortcircuit_startquotesequence {} {
variable tok
variable i
set toklen [::string length $tok]
if {$toklen == 1} {
set_tokenType "startquote"
incr i -1
return -level 2 1
} elseif {$toklen == 2} {
set_tokenType "startquote"
set tok "\""
incr i -2
return -level 2 1
}
}
#return a list of 0 1 or 2 tokens
#tomlish::parse::tok
proc tok {s} {
variable nest
variable v
variable i
variable tok
variable type ;#character type
variable state ;#FSM
set resultlist [list]
variable tokenType
variable tokenType_list
variable endToken
set sLen [::string length $s]
variable lastChar
variable braceCount
variable bracketCount
#------------------------------
#Previous run found another (presumably single-char) token
variable token_waiting
if {[dict size $token_waiting]} {
set tokenType [dict get $token_waiting type]
set tok [dict get $token_waiting tok]
dict unset token_waiting type
dict unset token_waiting tok
return 1
}
#------------------------------
set slash_active 0
set quote 0
set c ""
set multi_dquote ""
for {} {$i < $sLen} {} {
if {$i > 0} {
set lastChar [string index $s [expr {$i - 1}]]
} else {
set lastChar ""
}
set c [string index $s $i]
#puts "got char $c during tokenType '$tokenType'"
incr i ;#must incr here because we do'returns'inside the loop
set ctest [string map {\{ lc \} rc \[ lb \] rb \" dq \\ bsl \r cr \n lf \t tab \uFEFF bom} $c]
switch -exact -- $ctest {
# {
set dquotes $multi_dquote
set multi_dquote "" ;#!!
if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out.
set slash_active 0
if {[::string length $tokenType]} {
switch -exact -- $tokenType {
startquotesequence {
_shortcircuit_startquotesequence
}
barekey {
error "Unexpected character '$c' during bare key. Only \[a-zA-Z_-\] allowed. [tomlish::parse::report_line]"
}
whitespace {
# hash marks end of whitespace token
#do a return for the whitespace, set token_waiting
#dict set token_waiting type comment
#dict set token_waiting tok ""
incr i -1 ;#leave comment for next run
return 1
}
untyped-value {
#REVIEW! the spec isn't clear.. is whitespace after an int,bool etc required before comment?
#we will accept a comment marker as an immediate terminator of the untyped-value.
incr i -1
return 1
}
default {
#quotedkey, string, multistring
append tok $c
}
}
} else {
#$slash_active not relevant when no tokenType
#start of token if we're not in a token
set_tokenType comment
set tok "" ;#The hash is not part of the comment data
}
}
lc {
set multi_dquote "" ;#!!
#test jmn2024
#left curly brace
try {
if {[::string length $tokenType]} {
switch -exact -- $tokenType {
startquotesequence {
_shortcircuit_startquotesequence
}
string - stringpart {
if {$slash_active} {append tok "\\"}
append tok $c
}
starttablename {
error "unexpected tablename problem"
#$slash_active not relevant to this tokentype
#change the tokenType
switch_tokenType "starttablearrayname"
set tok "" ;#no output into the tomlish list for this token
#any following whitespace is part of the tablearrayname, so return now
return 1
}
comment {
if {$slash_active} {append tok "\\"}
append tok "\["
}
default {
#end any other token.
incr i -1
return 1
}
}
} else {
switch -exact -- $state {
value-expected {
#switch last key to tablename??
set_tokenType "startinlinetable"
set tok "\{"
return 1
}
multistring-space {
set_tokenType "stringpart"
if {$slash_active} {
set tok "\\\{"
} else {
set tok "\{"
}
}
key-space {
#invalid - but allow parser statemachine to report it. ?
set_tokenType "startinlinetable"
set tok "\{"
return 1
}
array-space - array-syntax {
#nested anonymous inline table
set_tokenType "startinlinetable"
set tok "\{"
return 1
}
default {
error "state: '$state'. left brace case not implemented [tomlish::parse::report_line]"
}
}
}
} on error {em} {
error $em
} finally {
set slash_active 0
}
}
rc {
set multi_dquote "" ;#!!
#right curly brace
try {
if {[string length $tokenType]} {
switch -exact -- $tokenType {
startquotesequence {
_shortcircuit_startquotesequence
}
string - stringpart - comment {
if {$slash_active} {append tok "\\"}
append tok $c
}
tablename {
if {$slash_active} {append tok "\\"}
#invalid! - but leave for datastructure loading stage to catch
dict set token_waiting type endinlinetable
dict set token_waiting tok ""
return 1
}
tablearrayname {
if {$slash_active} {append tok "\\"}
#invalid! - but leave for datastructure loading stage to catch
dict set token_waiting type endtablearrayname
dict set token_waiting tok ""
return 1
}
itablevaltail {
}
default {
#end any other token
incr i -1
return 1
}
}
} else {
#$slash_active not relevant when no tokenType
switch -exact -- $state {
value-expected {
#invalid - but allow parser statemachine to report it.
set_tokenType "endinlinetable"
set tok "\}"
return 1
}
key-space {
#invalid - but allow parser statemachine to report it. ?
set_tokenType "endinlinetable"
set tok "\}"
return 1
}
tablename {
#e.g [] - empty tablename - allowed or not?
#empty tablename/tablearrayname ?
error "unexpected tablename problem"
set_tokenType "endinlinetable"
set tok "" ;#no output into the tomlish list for this token
return 1
}
tablearrayname {
error "unexpected tablearrayname problem"
set_tokenType "endinlinetable"
set tok "" ;#no output into the tomlish list for this token
return 1
}
curly-syntax - curly-space {
set_tokenType "endinlinetable"
set tok "\}"
return 1
}
array-syntax - array-space {
#invalid
set_tokenType "endinlinetable"
set tok "\}"
return 1
}
itablevaltail {
set_tokenType "endinlinetable"
set tok ""
#we need to pop the keyval - and then reprocess to pop the inlinetable - so we incr -1
incr i -1
return 1
}
itablekeyval-syntax {
error "endinlinetable unexpected at this point. Expecting key=val syntax [tomlish::parse::report_line]"
}
default {
#JMN2024b keytail?
error "state '$state'. endinlinetable case not implemented [tomlish::parse::report_line]"
}
}
}
} on error {em} {
error $em
} finally {
set slash_active 0
}
}
lb {
set multi_dquote "" ;#!!
#left square bracket
try {
if {[::string length $tokenType]} {
switch -exact -- $tokenType {
startquotesequence {
_shortcircuit_startquotesequence
}
string - stringpart {
if {$slash_active} {append tok "\\"}
append tok $c
}
starttablename {
#$slash_active not relevant to this tokentype
#change the tokenType
switch_tokenType "starttablearrayname"
set tok "" ;#no output into the tomlish list for this token
#any following whitespace is part of the tablearrayname, so return now
return 1
}
comment {
if {$slash_active} {append tok "\\"}
append tok "\["
}
default {
#end any other token.
incr i -1
return 1
}
}
} else {
#$slash_active not relevant when no tokenType
switch -exact -- $state {
value-expected {
set_tokenType "startarray"
set tok "\["
return 1
}
key-space {
#table name
#assume it's a single bracket - but we need to wait for non-bracket to confirm it's not a tablearray
#note that a starttablearrayname token may contain whitespace between the brackets
# e.g \[ \[
set_tokenType "starttablename"
set tok "" ;#there is no output into the tomlish list for this token
}
array-space - array-syntax {
#nested array?
set_tokenType "startarray"
set tok "\["
return 1
#error "state: array-space. startarray case not implemented [tomlish::parse::report_line]"
}
default {
error "state: '$state'. startarray case not implemented [tomlish::parse::report_line]"
}
}
}
} on error {em} {
error $em
} finally {
set slash_active 0
}
}
rb {
set multi_dquote "" ;#!!
#right square bracket
try {
if {[string length $tokenType]} {
switch -exact -- $tokenType {
startquotesequence {
_shortcircuit_startquotesequence
}
string - stringpart - comment {
if {$slash_active} {append tok "\\"}
append tok $c
}
tablename {
if {$slash_active} {append tok "\\"}
#invalid! - but leave for datastructure loading stage to catch
dict set token_waiting type endtablename
dict set token_waiting tok ""
return 1
}
tablearraynames {
if {$slash_active} {append tok "\\"}
#invalid! - but leave for datastructure loading stage to catch
dict set token_waiting type endtablearrayname
dict set token_waiting tok ""
return 1
}
default {
incr i -1
return 1
}
}
} else {
#$slash_active not relevant when no tokenType
switch -exact -- $state {
value-expected {
#invalid - but allow parser statemachine to report it.
set_tokenType "endarray"
set tok "\]"
return 1
}
key-space {
#invalid - but allow parser statemachine to report it. ?
set_tokenType "endarray"
set tok "\]"
return 1
}
tablename {
#e.g [] - empty tablename - allowed or not?
#empty tablename/tablearrayname ?
error "unexpected tablename problem"
set_tokenType "endtablename"
set tok "" ;#no output into the tomlish list for this token
return 1
}
tablearrayname {
error "unexpected tablearrayname problem"
set_tokenType "endtablearray"
set tok "" ;#no output into the tomlish list for this token
return 1
}
array-syntax - array-space {
set_tokenType "endarray"
set tok "\]"
return 1
}
default {
error "state '$state'. endarray case not implemented [tomlish::parse::report_line]"
}
}
}
} on error {em} {
error $em
} finally {
set slash_active 0
}
}
bsl {
set dquotes $multi_dquote
set multi_dquote "" ;#!!
#backslash
if {[::string length $tokenType]} {
switch -exact -- $tokenType {
startquotesequence {
_shortcircuit_startquotesequence
}
string - litstring - multilitstring - comment - tablename - tablearrayname {
if {$slash_active} {
set slash_active 0
append tok "\\\\"
} else {
set slash_active 1
}
}
stringpart {
if {$slash_active} {
#assert - quotes empty - or we wouldn't have slash_active
set slash_active 0
append tok "\\\\"
} else {
append tok $dquotes
set slash_active 1
}
}
whitespace {
if {$state eq "multistring-space"} {
#end whitespace token
incr i -1
return 1
} else {
error "Unexpected backslash during whitespace. [tomlish::parse::report_line]"
}
}
barekey {
error "Unexpected backslash during barekey. [tomlish::parse::report_line]"
}
default {
error "Backslash unexpected during tokentype: '$tokenType'. [tomlish::parse::report_line]"
}
}
} else {
if {$state eq "multistring-space"} {
set slash_active 1
} else {
error "Unexpected backslash when no token is active. [tomlish::parse::report_line]"
}
}
}
dq {
#double quote
try {
if {[::string length $tokenType]} {
switch -exact -- $tokenType {
startquotesequence {
set toklen [::string length $tok]
if {$toklen == 1} {
append tok $c
} elseif {$toklen == 2} {
append tok $c
set_tokenType "startmultiquote"
return 1
} else {
error "unexpected token length in 'startquotesequence'"
}
}
endquotesequence {
set toklen [::string length $tok]
if {$toklen == 1} {
append tok $c
} elseif {$toklen == 2} {
append tok $c
set_tokenType "endmultiquote"
return 1
} else {
error "unexpected token length in 'endquotesequence'"
}
}
string {
if {$slash_active} {
append tok "\\"
append tok $c
} else {
#unescaped quote always terminates a string?
dict set token_waiting type endquote
dict set token_waiting tok "\""
return 1
}
}
stringpart {
#sub element of multistring
if {$slash_active} {
append tok "\\"
append tok $c
} else {
#incr i -1
if {$multi_dquote eq "\"\""} {
dict set token_waiting type endmultiquote
dict set token_waiting tok "\"\"\""
set multi_dquote ""
return 1
} else {
append multi_dquote "\""
}
}
}
whitespace {
switch -exact -- $state {
multistring-space {
#REVIEW
if {$multi_dquote eq "\"\""} {
dict set token_waiting type endmultiquote
dict set token_waiting tok "\"\"\""
set multi_dquote ""
return 1
} else {
append multi_dquote "\""
}
}
value-expected {
if {$multi_dquote eq "\"\""} {
dict set token_waiting type startmultiquote
dict set token_waiting tok "\"\"\""
set multi_dquote ""
return 1
} else {
#end whitespace token and reprocess
incr i -1
return 1
#append multi_dquote "\""
}
}
default {
dict set token_waiting type startquote
dict set token_waiting tok "\""
return 1
}
}
}
comment {
if {$slash_active} {append tok "\\"}
append tok $c
}
quotedkey - itablequotedkey {
if {$slash_active} {
append tok "\\"
append tok $c
} else {
dict set token_waiting type endquote
dict set token_waiting tok "\""
return 1
}
}
tablename - tablearrayname {
if {$slash_active} {append tok "\\"}
append tok $c
}
starttablename - starttablearrayname {
incr i -1 ;##
return 1
}
default {
error "got quote during tokenType '$tokenType' [tomlish::parse::report_line]"
}
}
} else {
#$slash_active not relevant when no tokenType
#token is string only if we're expecting a value at this point
switch -exact -- $state {
value-expected - array-space {
#!? start looking for possible multistartquote
#set_tokenType startquote
#set tok $c
#return 1
set_tokenType startquotesequence ;#one or more quotes in a row - either startquote or multistartquote
set tok $c
}
multistring-space {
#REVIEW
if {$multi_dquote eq "\"\""} {
dict set token_waiting type endmultiquote
dict set token_waiting tok "\"\"\""
set multi_dquote ""
return 1
} else {
append multi_dquote "\""
}
}
key-space {
set tokenType startquote
set tok $c
return 1
}
curly-space {
set tokenType startquote
set tok $c
return 1
}
tablename - tablearrayname {
set_tokenType $state
set tok $c
}
default {
error "Unexpected quote during state '$state' [tomlish::parse::report_line]"
}
}
}
} on error {em} {
error $em
} finally {
set slash_active 0
}
}
= {
set dquotes $multi_dquote
set multi_dquote "" ;#!!
if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out.
set slash_active 0
if {[::string length $tokenType]} {
switch -exact -- $tokenType {
startquotesequence {
_shortcircuit_startquotesequence
}
string - comment - quotedkey {
#for these tokenTypes an = is just data.
append tok $c
}
stringpart {
append tok $dquotes$c
}
whitespace {
dict set token_waiting type equal
dict set token_waiting tok =
return 1
}
barekey {
dict set token_waiting type equal
dict set token_waiting tok =
return 1
}
default {
error "unexpected = character during tokentype $tokenType. case not implemented. [tomlish::parse::report_line]"
}
}
} else {
switch -exact -- $state {
multistring-space {
set_tokenType stringpart
set tok ${dquotes}=
}
default {
set_tokenType equal
set tok =
return 1
}
}
}
}
cr {
set dquotes $multi_dquote
set multi_dquote "" ;#!!
# \r carriage return
if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out.
set slash_active 0
if {[::string length $tokenType]} {
switch -exact -- $tokenType {
startquotesequence {
_shortcircuit_startquotesequence
}
stringpart {
append tok $dquotes$c
}
default {
#!todo - error out if cr inappropriate for tokenType
append tok $c
}
}
} else {
#lf may be appended if next
#review - lone cr as newline? - this is uncommon - but so is lone cr in a string(?)
set_tokenType newline
set tok cr
}
}
lf {
set dquotes $multi_dquote
set multi_dquote "" ;#!!
# \n newline
if {[::string length $tokenType]} {
if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out.
set slash_active 0
switch -exact -- $tokenType {
startquotesequence {
_shortcircuit_startquotesequence
}
newline {
#this lf is the trailing part of a crlf
append tok lf
return 1
}
stringpart {
if {$dquotes ne ""} {
append tok $dquotes
incr i -1
return 1
} else {
dict set token_waiting type newline
dict set token_waiting tok lf
return 1
}
}
default {
#newline ends all other tokens.
#note for string: we don't add (raw unescaped) newline to simple string. (must use multi-string for this)
#note for whitespace:
# we will use the convention that \n terminates the current whitespace even if whitespace follows
# ie whitespace is split into separate whitespace tokens at each newline
#puts "-------------- newline lf during tokenType $tokenType"
dict set token_waiting type newline
dict set token_waiting tok lf
return 1
}
}
} else {
set had_slash $slash_active
set slash_active 0
if {$had_slash} {
set_tokenType "continuation"
set tok "\\"
incr i -1
return 1
} else {
set_tokenType newline
set tok lf
return 1
}
}
}
, {
set dquotes $multi_dquote
set multi_dquote "" ;#!!
if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out.
set slash_active 0
if {[::string length $tokenType]} {
switch -exact -- $tokenType {
startquotesequence {
_shortcircuit_startquotesequence
}
string - comment - quotedkey - tablename - tablearrayname {
append tok $c
}
stringpart {
append tok $dquotes$c
}
default {
dict set token_waiting type comma
dict set token_waiting tok ","
return 1
}
}
} else {
switch -exact -- $state {
multistring-space {
set_tokenType stringpart
set tok ","
}
multiliteral-space {
set_tokenType literalpart
set tok ","
}
default {
set_tokenType comma
set tok ","
return 1
}
}
}
}
. {
set multi_dquote "" ;#!!
if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out.
set slash_active 0
if {[::string length $tokenType]} {
switch -exact -- $tokenType {
startquotesequence {
_shortcircuit_startquotesequence
}
string - stringpart - comment - quotedkey - untyped-value {
append tok $c
}
baretablename - tablename - tablearrayname {
#subtable - split later - review
append tok $c
}
barekey {
#we need to transition the barekey to become a structured table name ??? review
switch_tokenType tablename
incr i -1
#error "barekey period unimplemented"
}
default {
error "Received period during tokenType '$tokenType' [tomlish::parse::report_line]"
#dict set token_waiting type period
#dict set token_waiting tok "."
#return 1
}
}
} else {
switch -exact -- $state {
multistring-space {
set_tokenType stringpart
set tok "."
}
multiliteral-space {
set_tokenType literalpart
set tok "."
}
default {
set_tokenType untyped-value
set tok "."
}
}
}
}
" " {
set dquotes $multi_dquote
set multi_dquote "" ;#!!
if {[::string length $tokenType]} {
set had_slash $slash_active
set slash_active 0
switch -exact -- $tokenType {
startquotesequence {
_shortcircuit_startquotesequence
}
barekey {
#whitespace is a terminator for bare keys
#dict set token_waiting type whitespace
#dict set token_waiting tok $c
incr i -1
return 1
}
untyped-value {
#unquoted values (int,date,float etc) are terminated by whitespace
#dict set token_waiting type whitespace
#dict set token_waiting tok $c
incr i -1
return 1
}
comment {
if {$had_slash} {
append tok "\\"
}
append tok $c
}
quotedkey - string {
if {$had_slash} {
append tok "\\"
}
#if {$dquotes eq "\""} {
#}
append tok $c
}
whitespace {
append tok $c
}
stringpart {
if {$had_slash} {
#REVIEW
incr i -2
return 1
} else {
#split into STRINGPART aaa WS " "
#keeping WS separate allows easier processing of CONT stripping
append tok $dquotes
incr i -1
return 1
}
}
starttablename {
incr i -1
return 1
}
starttablearrayname {
incr i -1
return 1
}
tablename - tablearrayname {
#include whitespace in the tablename/tablearrayname
#Will need to be normalized upon interpreting the tomlish as a datastructure
append tok $c
}
default {
error "Received whitespace space during tokenType '$tokenType' [tomlish::parse::report_line]"
}
}
} else {
set had_slash $slash_active
if {$slash_active} {
set slash_active 0
}
switch -exact -- $state {
tablename - tablearrayname {
#tablename can have leading,trailing and interspersed whitespace!
#These will not be treated as whitespace tokens, instead forming part of the name.
set_tokenType $state
if {$had_slash} {
set tok "\\$c"
} else {
set tok $c
}
}
multistring-space {
if {$had_slash} {
set_tokenType "continuation"
set tok "\\"
incr i -1
return 1
} else {
if {$dquotes ne ""} {
set_tokenType "stringpart"
set tok $dquotes
incr i -1
return
}
set_tokenType "whitespace"
append tok $c
}
}
default {
if {$had_slash} {
error "unexpected backslash [tomlish::parse::report_line]"
}
set_tokenType "whitespace"
append tok $c
}
}
}
}
tab {
set dquotes $multi_dquote
set multi_dquote "" ;#!!
if {[::string length $tokenType]} {
if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out.
set slash_active 0
switch -exact -- $tokenType {
startquotesequence {
_shortcircuit_startquotesequence
}
barekey {
#whitespace is a terminator for bare keys
incr i -1
#set token_waiting type whitespace
#set token_waiting tok $c
return 1
}
untyped_value {
#unquoted values (int,date,float etc) are terminated by whitespace
#dict set token_waiting type whitespace
#dict set token_waiting tok $c
incr i -1
return 1
}
quotedkey {
append tok $c
}
string - comment - whitespace {
append tok $c
}
stringpart {
append tok $dquotes$c
}
starttablename - starttablearrayname {
incr i -1
return 1
}
tablename - tablearraynames {
#include whitespace in the tablename/tablearrayname
#Will need to be normalized upon interpreting the tomlish as a datastructure
append tok $c
}
default {
error "Received whitespace tab during tokenType '$tokenType' [tomlish::parse::report_line]"
}
}
} else {
set had_slash $slash_active
if {$slash_active} {
set slash_active 0
}
switch -exact -- $state {
tablename - tablearrayname {
#tablename can have leading,trailing and interspersed whitespace!
#These will not be treated as whitespace tokens, instead forming part of the name.
set_tokenType $state
set tok $c
}
multistring-space {
if {$had_slash} {
set_tokenType "continuation"
set tok "\\"
incr i -1
return 1
} else {
if {$dquotes ne ""} {
set_tokenType stringpart
set tok $dquotes
incr i -1
return 1
} else {
set_tokenType whitespace
append tok $c
}
}
}
default {
set_tokenType "whitespace"
append tok $c
}
}
}
}
bom {
#BOM (Byte Order Mark) - ignored by token consumer
set_tokenType "bom"
set tok "\uFEFF"
return 1
}
default {
set dquotes $multi_dquote
set multi_dquote "" ;#!!
if {[::string length $tokenType]} {
if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out.
set slash_active 0
switch -exact -- $tokenType {
startquotesequence {
_shortcircuit_startquotesequence
}
endquotesequence {
puts stderr "endquotesequence: $tok"
}
whitespace {
incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run.
return 1
}
barekey {
if {[tomlish::utils::is_barekey $c]} {
append tok $c
} else {
error "Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] allowed. [tomlish::parse::report_line]"
}
}
starttablename - starttablearrayname {
incr i -1
#allow statemachine to set context for subsequent chars
return 1
}
stringpart {
append tok $dquotes$c
}
default {
#e.g comment/string/untyped-value/starttablename/starttablearrayname/tablename/tablearrayname
append tok $c
}
}
} else {
set had_slash $slash_active
set slash_active 0
switch -exact -- $state {
key-space - curly-space - curly-syntax {
#if no currently active token - assume another key value pair
if {[tomlish::utils::is_barekey $c]} {
set_tokenType "barekey"
append tok $c
} else {
error "Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]"
}
}
multistring-space {
set_tokenType "stringpart"
if {$had_slash} {
#assert - we don't get had_slash and dquotes at same time
set tok \\$c
} else {
set tok $dquotes$c
}
}
tablename {
set_tokenType "tablename"
set tok $c
}
tablearrayname {
set_tokenType "tablearrayname"
set tok $c
}
default {
set_tokenType "untyped-value"
set tok $c
}
}
}
}
}
}
#run out of characters (eof)
if {[::string length $tokenType]} {
#check for invalid ending tokens
#if {$state eq "err"} {
# error "Reached end of data whilst tokenType = '$tokenType'. INVALID"
#}
if {$tokenType eq "startquotesequence"} {
set toklen [::string length $tok]
if {$toklen == 1} {
#invalid
#eof with open string
eror "eof reached without closing quote for string. [tomlish::parse::report_line]"
} elseif {$toklen == 2} {
#valid
#we ended in a double quote, not actually a startquoteseqence - effectively an empty string
switch_tokenType "startquote"
incr i -1
#dict set token_waiting type "string"
#dict set token_waiting tok ""
return 1
}
}
dict set token_waiting type "eof"
dict set token_waiting tok "eof"
return 1
} else {
::tomlish::log::debug "No current tokenType, ran out of characters, setting tokenType to 'eof' [tomlish::parse::report_line]"
set tokenType "eof"
set tok "eof"
}
return 0
}
#*** !doctools
#[list_end] [comment {--- end definitions namespace tomlish::parse ---}]
}
tcl::namespace::eval tomlish::app {
variable applist [list encoder decoder test]
#*** !doctools
#[subsection {Namespace tomlish::app}]
#[para]
#[list_begin definitions]
proc decoder {args} {
#*** !doctools
#[call app::[fun decoder] [arg args]]
#[para] read toml on stdin until EOF
#[para] on error - returns non-zero exit code and writes error on stderr
#[para] on success - returns zero exit code and writes JSON encoding of the data on stdout
#[para] This decoder is intended to be compatible with toml-test
set opts [dict merge [dict create] $args]
#fconfigure stdin -encoding utf-8
fconfigure stdin -translation binary
#Just slurp it all - presumably we are not handling massive amounts of data on stdin.
# - even if the input is large, we probably don't gain much (aside from possible memory savings?) by attempting to process input as it arrives.
if {[catch {
set toml [read stdin]
}]} {
exit 2 ;#read error
}
try {
set j [::tomlish::toml_to_json $toml]
} on error {em} {
puts stderr "decoding failed: '$em'"
exit 1
}
puts -nonewline stdout $j
exit 0
}
proc encoder {args} {
#*** !doctools
#[call app::[fun encoder] [arg args]]
#[para] read JSON on stdin until EOF
#[para] return non-zero exitcode if JSON data cannot be converted to a valid TOML representation
#[para] return zero exitcode and TOML data on stdout if JSON data can be converted.
#[para] This encoder is intended to be compatible with toml-test
set opts [dict merge [dict create] $args]
fconfigure stdin -translation binary
if {[catch {
set json [read stdin]
}]} {
exit 2 ;#read error
}
try {
set toml [::tomlish::json_to_toml $json]
} on error {em} {
puts stderr "encoding failed: '$em'"
exit 1
}
puts -nonewline stdout $toml
exit 0
}
proc test {args} {
set opts [dict merge [dict create] $args]
package require test::tomlish
if {[dict exists $opts -suite]} {
test::tomlish::suite [dict get $opts -suite]
}
test::tomlish::run
}
#*** !doctools
#[list_end] [comment {--- end definitions namespace tomlish::app ---}]
}
proc ::tomlish::appnames {} {
set applist [list]
foreach cmd [info commands ::tomlish::app::*] {
lappend applist [namespace tail $cmd]
}
return $applist
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# Secondary API namespace
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
namespace eval tomlish::lib {
namespace export {[a-z]*}; # Convention: export all lowercase
namespace path [namespace parent]
#*** !doctools
#[subsection {Namespace tomlish::lib}]
#[para] Secondary functions that are part of the API
#[list_begin definitions]
#proc utility1 {p1 args} {
# #*** !doctools
# #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]]
# #[para]Description of utility1
# return 1
#}
#*** !doctools
#[list_end] [comment {--- end definitions namespace tomlish::lib ---}]
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
if {$argc > 0} {
puts stderr "argc: $argc args: $argv"
if {($argc == 1)} {
if {[::string tolower $argv] in {help -help h -h}} {
puts stdout "Usage: -app <appname> where appname one of:[tomlish::appnames]"
exit 0
} else {
puts stderr "Argument '$argv' not understood. Try -help"
exit 1
}
}
set opts [dict create]
set opts [dict merge $opts $argv]
set opts_understood [list -app ]
if {"-app" in [dict keys $opts]} {
#Don't vet the remaining opts - as they are interpreted by each app
} else {
foreach key [dict keys $opts] {
if {$key ni $opts_understood} {
puts stderr "Option '$key' not understood"
exit 1
}
}
}
if {[dict exists $opts -app]} {
set app [dict get $opts -app]
if {$app ni [tomlish::appnames]} {
puts stderr "app '[dict get $opts -app]' not found"
exit 1
}
tomlish::app::$app {*}$opts
}
}
## Ready
package provide tomlish [namespace eval tomlish {
variable pkg tomlish
variable version
set version 1.1.1
}]
return
#*** !doctools
#[manpage_end]