ansi art cp437 nul char fix + ansistring work

1 year ago · a1e9865b80
2 changed files with 137 additions and 24 deletions
--- a/src/modules/punk/ansi-999999.0a1.0.tm
+++ b/src/modules/punk/ansi-999999.0a1.0.tm
@ -218,7 +218,9 @@ namespace eval punk::ansi {
    #Layout for cp437 won't be right if you don't at least set width of control-chars to 1 - but also some images specifically use these glyphs
    #most fonts don't seem to supply graphics for these control characters even when cp437 is in use - the c1 control glyphs appear to be more widely available - but we could add them here too 
    #by mapping these we can display regardless.
-    #nul char - no cp437 image. (which is good - because we use nul as a filler to mean empty column in overtype rendering)
+    #nul char - no cp437 image but commonly used as space in ansi graphics. 
    #(This is a potential conflict because we use nul as a filler to mean empty column in overtype rendering) REVIEW
    #C0 control characters map to the glyph cp437 displays at the same code point.
    #Each key must be distinct - a repeated key silently overwrites the earlier glyph.
    dict set cp437_map \u0000 " "     ;#space
    dict set cp437_map \u0001 \u263A  ;#smiley
    dict set cp437_map \u0002 \u263B  ;#smiley-filled
    dict set cp437_map \u0003 \u2665  ;#heart
@ -1597,10 +1599,23 @@ namespace eval punk::ansi {
            set codestate_initial $codestate_empty ;#keep a copy for resets.
            set did_reset 0
            #we should also handle 8bit CSI here?  mixed \x1b\[  and \x9b  ?  Which should be used in the merged result?
            #There are arguments to move to 8bit CSI for keyboard protocols (to solve keypress timing issues?) - but does this extend to SGR codes? 
            #we will output 7bit merge of the SGRs even if some or all were 8bit CSI
            #As at 2024 - 7bit are widely supported, whereas 8bit seem to be often ignored by pseudoterminals
            #auto-detecting and emitting 8bit only if any are present in our input doesn't seem like a good idea - as sgr_merge_list is only seeing a subset of the data - so any auto-decision at this level will just introduce indeterminism.
            #review - consider a higher-level option for always emitting 8bit or always 7bit
            #either way - if we get mixed CSI input - it probably makes more sense to merge their parameters than maintain the distinction and pass the mess downstream.
            #We still output any non SGR codes in the list as they came in - preserving their CSI
            foreach c $args {
-                switch -- [string index $c 1][string index $c end] {
+                #normalize 8bit to a token of the same length so our string operations on the code are the same and we can maintain a switch statement with literals rather than escapes
                #.. but preserve original c
                set cnorm [string map [list \x9b {8[} ] $c]
                switch -- [string index $cnorm 1][string index $cnorm end] {
                    {[m} {
-                        set params [string range $c 2 end-1] ;#strip leading esc lb and trailing m
+                        set params [string range $cnorm 2 end-1] ;#strip leading esc lb and trailing m
                        #some systems use colon for 256 colors or RGB or nonstandard subparameters
                        #- it is therefore probably not ok to map to semicolon within SGR codes and treat the same.
@ -2185,6 +2200,15 @@ namespace eval punk::ansi::class {
            variable o_from_ansistring o_to_ansistring
            variable o_ns_from o_ns_to  ;#some dirty encapsulation violation as a 'friend' of ansistring objects - direct record of namespaces as they are frequently accessed
            constructor {args} {
                #-- make assert available --
                # By pointing it to the assert imported into ::punk::ansi::class
                # (we could alternatively import assert *directly* from ::punk::assertion::assert - but we can't chain imports as setting active flag renames the command, breaking chained imports)
                set nspath [namespace path]
                if {"::punk::ansi::class" ni $nspath} {
                    lappend nspath ::punk::ansi::class
                }
                namespace path $nspath 
                #--                      --
                if {[llength $args] < 2} {
 		            error {usage: ?-width <int>? ?-wrap [1|0]? ?-overflow [1|0]? from_ansistring to_ansistring}
 	            }
@ -2251,14 +2275,72 @@ namespace eval punk::ansi::class {
            }
            #rendernext - render the next unrendered chunk of the source ansistring into the target ansistring.
            #A chunk is either every consecutive grapheme element sharing one split index (a plaintext block), or a single code element.
            #The chunk's text is appended to $o_to_ansistring and each consumed element is recorded in o_rendereditems.
            #Returns a dict with keys: type rendercount start_count_unrendered end_count_unrendered
            # (type is the empty string and all counts are 0 when there is nothing new to render)
            #Raises an error if o_rendereditems no longer matches the leading elements of the source ansistring.
            method rendernext {} {
                upvar ${o_ns_from}::o_ansisplits from_ansisplits
-                upvar ${o_ns_from}::o_elements elements
+                upvar ${o_ns_from}::o_elements from_elements
                upvar ${o_ns_from}::o_splitindex from_splitindex 
                #if {![llength $from_ansisplits]} {$o_from_ansistring eval_in {my MakeSplit}} ;#!!todo - a better way to keep this method semi hidden but call from a 'friend'
                if {![llength $from_ansisplits]} {
                    namespace eval $o_ns_from {my MakeSplit}
                }
                set eidx [llength $o_rendereditems]
                #compare what we've rendered so far to our source to confirm they're still in sync
                if {[lrange $o_rendereditems 0 $eidx-1] ne [lrange $from_elements 0 $eidx-1]} {
                    puts stdout "rendereditems 0->[expr {$eidx-1}]: [ansistring VIEW [lrange $o_rendereditems 0 $eidx-1]]"
                    puts stdout "from_elements 0->[expr {$eidx-1}]: [ansistring VIEW [lrange $from_elements 0 $eidx-1]]"
                    error "rendernext error - rendering state is out of sync. rendereditems list not-equal to corresponding part of ansistring $o_from_ansistring"
                }
                if {$eidx == [llength $from_elements]} {
                    #nothing new available
                    return [dict create type "" rendercount 0 start_count_unrendered 0 end_count_unrendered 0]
                }
-                if {![llength $from_ansisplits]} {$o_from_ansistring eval_in {my MakeSplit}} ;#!!todo - a better way to keep this method semi hidden but call from a 'friend'
+                set start_elements_unrendered [expr {[llength $from_elements] - [llength $o_rendereditems]}]
                #we need to render in pt code chunks - not each grapheme element individually
-                #translate from element index to ansisplits index?
+                #translate from element index to ansisplits index
                set process_splitindex [lindex $from_splitindex $eidx] ;#which from_ansisplits index the first unrendered element belongs to
                set elementinfo [lindex $from_elements $eidx]
                lassign $elementinfo type_rendered item 
                #we don't expect type to change within a chunk - it should be all graphemes (type 'g') or a single code (type 'sgr','other' etc)
                #review - we may want to store more info for graphemes e.g g0 g1 g2 for zero-wide 1-wide 2-wide ?
                #if so - we should report a list of the grapheme types that were rendered in a pt block
                #as a counterpoint however - we don't currently retrieve grapheme width during split (performance impact at wrong time?) - and width may depend on the rendering method anyway
                #e.g c0 controls are normally zero printing width - but are (often) 1-wide glyphs in a cp437 rendering operation.
                #we want to render all the elements in this splitindex - for pt this may be multiple, for code it will be a single element
                set newtext ""
                set rendercount 0
                if {$type_rendered eq "g"} {
                    set e_splitindex $process_splitindex
                    #consume elements until the split index changes or the source elements are exhausted
                    while {$e_splitindex == $process_splitindex && $eidx < [llength $from_elements]} {
                        append newtext $item
                        lappend o_rendereditems $elementinfo 
                        incr rendercount
                        incr eidx
                        set e_splitindex [lindex $from_splitindex $eidx]
                        set elementinfo [lindex $from_elements $eidx]
                        lassign $elementinfo _type item
                    }
                } else {
                    set newtext $item 
                    lappend o_rendereditems $elementinfo 
                    incr rendercount
                }
-                return [dict create count_unrendered $elements_unrendered]
+                set end_elements_unrendered [expr {[llength $from_elements] - [llength $o_rendereditems]}]
                set count_rendered [expr {$start_elements_unrendered - $end_elements_unrendered}]
                #the number of elements consumed must match the drop in the unrendered count
                assert {$rendercount == $count_rendered}
                #todo - renderline equivalent?
                $o_to_ansistring append $newtext
                return [dict create type $type_rendered rendercount $rendercount start_count_unrendered $start_elements_unrendered end_count_unrendered $end_elements_unrendered]
            }
        }
@ -2274,42 +2356,58 @@ namespace eval punk::ansi::class {
    #As this is intended for column-based terminals - it has a different notion of string length, string index etc than for a plain string.
    #oo names beginning with uppercase are private - so we can't use capitalisation as a hint to distinguish those which differ from Tcl semantics
    oo::class create class_ansistring {
-        variable o_cksum_command
+        variable o_cksum_command o_string o_count
        variable o_string
        variable o_count
        #this is the main state we keep of the split apart string
        #we use the punk::ansi::ta::split_codes_single function which produces a list with zero, or an odd number elements always beginning and ending with plaintext
-        variable o_ptlist ;#plaintext as list of elements from ansisplits  - will include empty elements from between adjacent ansi-codes 
+        variable o_ptlist               ;#plaintext as list of elements from ansisplits  - will include empty elements from between adjacent ansi-codes 
-        variable o_ansisplits ;#store our plaintext/ansi-code splits so we don't keep re-running the regexp to split
+        variable o_ansisplits           ;#store our plaintext/ansi-code splits so we don't keep re-running the regexp to split
        #State regarding output renderstring (if any)
-        variable o_renderout  ;#another class_ansistring instance
+        variable o_renderout            ;#another class_ansistring instance
-        variable o_renderer ;# punk::ansi::class::renderer::class_<rendertype> instance
+        variable o_renderer             ;# punk::ansi::class::renderer::class_<rendertype> instance
        variable o_renderwidth
        variable o_rendertype
-        variable o_elements o_sgrstacks  ;#elements contains entry for each grapheme/control + each ansi code, stacks has list of ansi sgr codes 
+        # -- per element lookups --
-        variable o_gx0states
+        # llengths should all be the same
        # we maintain 4 lookups per entry rather than a single nested list
        # it is estimated that separate lists will be more efficient for certain operations - but that is open to review/testing.
        variable o_elements             ;#elements contains entry for each grapheme/control + each ansi code
        variable o_sgrstacks            ;#list of ansi sgr codes that will be merged later. Entries deliberately repeat if no change from previous entry. Later scans look for difference between n and n-1 when deciding where to apply codes.
        variable o_gx0states            ;#0|1 for alternate graphics gx0
        variable o_splitindex           ;#entry for each element indicating the index of the split it belongs to. 
        # --                     --
        constructor {string} {
            set o_string $string
            #-- make assert available --
            # By pointing it to the assert imported into ::punk::ansi::class
            # (we could alternatively import assert *directly* from ::punk::assertion::assert - but we can't chain imports as setting active flag renames the command, breaking imports)
            set nspath [namespace path]
            if {"::punk::ansi::class" ni $nspath} {
                lappend nspath ::punk::ansi::class
            }
            namespace path $nspath 
            #--                      --
            #we choose not to generate an internal split-state for the initial string - which may potentially be large.
            #there are a few methods such as get, has_ansi, show_state,checksum that can run efficiently on the initial string without generating it.
            #The length method can use ansi::ta::detect to work quickly without updating it if it can, and other methods also update it as necessary
            set o_count "" ;#o_count first updated when string appended or a method causes MakeSplit to run (or by count method if constructor argument was empty string)
            set o_ansisplits [list] ;#we get empty pt(plaintext) between each ansi code. Codes include cursor movements, resets,alt graphics modes, terminal mode settings etc. 
            set o_ptlist [list]
            #o_ansisplits and o_ptlist should only remain empty if an empty string was passed to the constructor, or no methods have yet triggered the initial string to have its internal state built.
            set o_elements      [list]
            set o_sgrstacks     [list]
            set o_gx0states     [list]
            set o_splitindex    [list]
            set o_cksum_command [list sha1::sha1 -hex]
@ -2394,16 +2492,22 @@ namespace eval punk::ansi::class {
            set o_ptlist [list]
            set codestack [list]
            set gx0_state 0 ;#default off 
            set current_split_index 0 ;#incremented for each pt block, incremented for each code
            foreach {pt code} $o_ansisplits {
                lappend o_ptlist $pt
                foreach grapheme [punk::char::grapheme_split $pt] {
                    lappend o_elements [list g $grapheme]
                    lappend o_sgrstacks $codestack
                    lappend o_gx0states $gx0_state
                    lappend o_splitindex $current_split_index
                }
                #after handling the pt block - incr the current_split_index
                incr current_split_index  ;#increment for each pt block - whether empty string or not. Indices corresponding to empty PT blocks will therefore not be present in o_splitindex as there were no elements in that ansisplit entry
                #we will only get an empty code at the very end of ansisplits (ansisplits is length 0 or odd length - always with pt at start and pt at end)
                if {$code ne ""} {
                    lappend o_sgrstacks $codestack
                    lappend o_gx0states $gx0_state
                    lappend o_splitindex $current_split_index
                    #maintenance warning - dup in append!
                    if {[punk::ansi::codetype::is_sgr_reset $code]} {
@ -2429,12 +2533,14 @@ namespace eval punk::ansi::class {
                            lappend o_elements [list other $code]
                        }
                    }
                    #after each code (ignoring bogus empty final due to foreach with 2 vars on odd-length list) increment the current_split_index
                    incr current_split_index
                }
                #assertion every grapheme and every individual code has been added to o_elements
                #every element has an entry in o_sgrstacks 
                #every element has an entry in o_gx0states
                assert {[llength $o_elements] == [llength $o_sgrstacks] && [llength $o_elements] == [llength $o_gx0states]}
            }
            #assertion every grapheme and every individual code has been added to o_elements
            #every element has an entry in o_sgrstacks 
            #every element has an entry in o_gx0states
            assert {[llength $o_elements] == [llength $o_sgrstacks] && [llength $o_elements] == [llength $o_gx0states] && [llength $o_elements] == [llength $o_splitindex]}
        }
        method convert_altg {} {
            #do we need a method to retrieve without converting in the object?
@ -2617,10 +2723,12 @@ namespace eval punk::ansi::class {
                }
                set last_codestack [lindex $o_sgrstacks end]
                set last_gx0state  [lindex $o_gx0states end]
                set current_split_index [expr {[llength $o_ansisplits]-1}] ;#we are attaching to existing trailing pt - use its splitindex
                foreach grapheme [punk::char::grapheme_split $catstr] {
                    lappend o_elements [list g $grapheme]
                    lappend o_sgrstacks $last_codestack
                    lappend o_gx0states $last_gx0state
                    lappend o_splitindex $current_split_index
                }
                incr o_count [my DoCount $catstr]
            } else {
@ -2630,7 +2738,7 @@ namespace eval punk::ansi::class {
                    my MakeSplit
                    set combined_plaintext [join $o_ptlist ""]
                    set o_count [my DoCount $combined_plaintext]
-                    assert {[llength $o_elements] == [llength $o_sgrstacks] && [llength $o_elements] == [llength $o_gx0states]}
+                    assert {[llength $o_elements] == [llength $o_sgrstacks] && [llength $o_elements] == [llength $o_gx0states] && [llength $o_elements] == [llength $o_splitindex]}
                    return $o_string 
                } else {
                    #update each element of internal state incrementally without reprocessing what is already there. 
@ -2639,6 +2747,7 @@ namespace eval punk::ansi::class {
                    set ptnew ""
                    set codestack [lindex $o_sgrstacks end]
                    set gx0_state  [lindex $o_gx0states end]
                    set current_split_index 0
                    foreach {pt code} $newsplits {
                        lappend o_ptlist $pt
                        append ptnew $pt
@ -2646,10 +2755,13 @@ namespace eval punk::ansi::class {
                            lappend o_elements [list g $grapheme]
                            lappend o_sgrstacks $codestack
                            lappend o_gx0states $gx0_state
                            lappend o_splitindex $current_split_index
                        }
                        incr current_split_index ;#increment 1 of 2 within each loop
                        if {$code ne ""} {
                            lappend o_sgrstacks $codestack
                            lappend o_gx0states $gx0_state
                            lappend o_splitindex $current_split_index
                            #maintenance - dup in MakeSplit!
                            if {[punk::ansi::codetype::is_sgr_reset $code]} {
                                set codestack [list]
@ -2674,6 +2786,7 @@ namespace eval punk::ansi::class {
                                    lappend o_elements [list other $code]
                                }
                            }
                            incr current_split_index ;#increment 2 of 2
                        }
                    }
                    lset o_ansisplits end [string cat [lindex $o_ansisplits end] [lindex $newsplits 0]]
@ -2681,7 +2794,7 @@ namespace eval punk::ansi::class {
                    incr o_count [my DoCount $ptnew]
                }
            }
-            assert {[llength $o_elements] == [llength $o_sgrstacks] && [llength $o_elements] == [llength $o_gx0states]}
+            assert {[llength $o_elements] == [llength $o_sgrstacks] && [llength $o_elements] == [llength $o_gx0states] && [llength $o_elements] == [llength $o_splitindex]}
            return $o_string
        }
        #method append_and_render - append and render up to end of appended data at same time
@ -3556,7 +3669,7 @@ namespace eval punk::ansi::ansistring {
    #Todo - rows! Note that a 'row' doesn't represent an output row if the ANSI string we are working with contains movement/cursor restores etc.
    #The column/row concept works for an ansistring that has been 'rendered' to some defined area.
-    #row for arbitrary ANSI input only tells us which line of input we are in - e.g a single massive line of ANSI input would appear to have one row but could result in many.
+    #row for arbitrary ANSI input only tells us which line of input we are in - e.g a single massive line of ANSI input would appear to have one row but could result in many rendered output rows.
    #return pair of column extents occupied by the character index supplied.
    #single-width grapheme will return pair of integers of equal value
--- a/src/testansi/67_Calendar_2020_06_June.ans
+++ b/src/testansi/67_Calendar_2020_06_June.ans