punkshell/src/bootsupport/modules/punk/fileline-0.1.0.tm

# -*- tcl -*-
# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from <pkg>-buildversion.txt
#
# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem.
# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository.
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# (C) 2024
#
# @@ Meta Begin
# Application punk::fileline 0.1.0
# Meta platform     tcl
# Meta license      BSD
# @@ Meta End


# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# doctools header
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[manpage_begin punkshell_module_punk::fileline 0 0.1.0]
#[copyright "2024"]
#[titledesc {file line-handling utilities}]  [comment {-- Name section and table of contents description --}]
#[moddesc {punk fileline}]             [comment {-- Description at end of page heading --}]
#[require punk::fileline]
#[keywords module text parse file encoding BOM]
#[description]
#[para] -

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++

#*** !doctools
#[section Overview]
#[para]Utilities for in-memory analysis of text file data as both line data and byte/char-counted data whilst preserving the line-endings (even if mixed)
#[para]This is important for certain text files where examining the number of chars/bytes is important
#[para]For example - windows .cmd/.bat files need some byte counting to determine if labels lie on chunk boundaries and need to be moved.
#[para]This chunk-size counting will depend on the character encoding.
#[para]Despite including the word 'file', the library doesn't necessarily deal with reading/writing to the filesystem -
#[para]The raw data can be supplied as a string, or loaded from a file using punk::fileline::get_textinfo -file <filename>
#[subsection Concepts]
#[para]A chunk of textfile data (possibly representing a whole file - but usually at least a complete set of lines) is loaded into a punk::fileline::class::textinfo instance at object creation.
#[example_begin]
#    package require punk::fileline
#    package require fileutil
#    set rawdata [lb]fileutil::cat data.txt -translation binary[rb]
#    punk::fileline::class::textinfo create obj_data  $rawdata
#    puts stdout [lb]obj_data linecount[rb]
#[example_end]
#[subsection Notes]
#[para]Line records are referred to by a zero-based index instead of a one-based index as is commonly used when displaying files.
#[para]This is for programming consistency and convenience, and the module user should do their own conversion to one-based indexing for line display or messaging if desired.
#[para]No support for lone carriage-returns being interpreted as line-endings.
#[para]CR line-endings that are intended to be interpreted as such should be mapped to something else before the data is supplied to this module.

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
##  Requirements
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++

#*** !doctools
#[subsection dependencies]
#[para] packages needed by punk::fileline
#[list_begin itemized]

    package require Tcl 8.6-
    package require punk::args
    #*** !doctools
    #[item] [package {Tcl 8.6-}]
    #[item] [package {punk::args}]


    # #package require frobz
    # #*** !doctools
    # #[item] [package {frobz}]

#*** !doctools
#[list_end] [comment {- end dependencies list -}]

#*** !doctools
#[subsection {optional dependencies}]
#[para] packages that add functionality but aren't strictly required
#[list_begin itemized]

    #*** !doctools
    #[item] [package {punk::ansi}]
    #[para] - recommended for class::textinfo [method chunk_boundary_display]
    #[item] [package {punk::char}]
    #[para] - recommended for class::textinfo [method chunk_boundary_display]
    #[item] [package {overtype}]
    #[para] - recommended for class::textinfo [method chunk_boundary_display]


#*** !doctools
#[list_end] [comment {- end optional dependencies list -}]


# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++

#*** !doctools
#[section API]

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# oo::class namespace
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
namespace eval punk::fileline::class {
    namespace export *
    #*** !doctools
    #[subsection {Namespace punk::fileline::class}]
    #[para] class definitions
    if {[info commands [namespace current]::textinfo] eq ""} {
        #*** !doctools
        #[list_begin enumerated]

        # oo::class create interface_sample1 {
        #     #*** !doctools
        #     #[enum] CLASS [class interface_sample1]
        #     #[list_begin definitions]

        #     method test {arg1} {
        #         #*** !doctools
        #         #[call class::interface_sample1    [method test] [arg arg1]]
        #         #[para] test method
        #         puts "test: $arg1"
        #     }

        #     #*** !doctools
        #     #[list_end] [comment {-- end definitions interface_sample1}]
        # }


        #uses zero based indexing. Caller can add 1 for line numbers
        oo::class create [namespace current]::textinfo {
                #*** !doctools
                #[enum] CLASS [class textinfo]
                #[list_begin definitions]
                #    [para] [emph METHODS]

                variable o_chunk ;#current state
                variable o_chunkop_store
                variable o_lineop_store

                variable o_chunk_epoch
                variable o_line_epoch
                variable o_payloadlist
                variable o_linemap
                variable o_LF_C
                variable o_CRLF_C


                variable o_bom_id
                variable o_bom
                variable o_bom_map

                #review - for now we expect datachunk to be data without BOM and already encoded appropriately
                #fileline::get_textinfo has support for interpreting BOM - but we currently have no way to do that for data not coming from a file
                #refactor to allow that code to be called from here?
                constructor {datachunk args} {
                    #*** !doctools
                    #[call class::textinfo [method constructor] [arg datachunk] [opt {option value...}]]
                    #[para] Constructor for textinfo object which represents a chunk or all of a file
                    #[para] datachunk should be passed with the file data including line-endings as-is for full functionality. ie use something like:
                    #[example_begin]
                    #    fconfigure $fd -translation binary
                    #    set chunkdata [lb]read $fd[rb]]
                    #or
                    #    set chunkdata [lb]fileutil::cat <filename> -translation binary[rb]
                    #[example_end]
                    #[para] when loading the data
                    namespace eval [namespace current] {
                        set nspath [namespace path]
                        foreach p [list ::punk::fileline ::punk::fileline::ansi] {
                            if {$p ni $nspath} {
                                lappend nspath $p
                            }
                        }
                        namespace path $nspath
                    }

                    set o_bom_map [list\
                        utf-8 \u00ef\u00bb\u00bf\
                        utf-16be \u00fe\u00ff\
                        utf-16le \u00ff\u00fe\
                        utf-32be \u0000\u0000\u00fe\u00ff\
                        utf-32le \u00ff\u00fe\u0000\u0000\
                        utf-7    \u002b\u002f\u0076\
                        utf-1    \u00f7\u0064\u004c\
                        utf-ebcdic \u00dd\u0073\u0066\u0073\
                        utf-scsu \u0003\u00fe\u00ff\
                        utf-bocu-1 \u00fb\u00ee\u0028\
                        utf-gb18030 \u0084\u0031\u0095\u0033\
                    ]
                    set o_bom_id ""
                    set o_bom "" ;#review

                    set o_chunk $datachunk
                    set o_line_epoch [list]
                    set o_chunk_epoch [list "fromchunkchange-at-[clock micros]"]
                    set crlf_lf_placeholders [list \uFFFF \uFFFE] ;#defaults - if already exist in file - error out with message
                    set defaults [dict create\
                        -substitutionmap {}\
                        -crlf_lf_placeholders $crlf_lf_placeholders\
                        -userid "<pid-[pid]-tid-[thread::id]>"\
                    ]
                    set known_opts [dict keys $defaults]
                    foreach {k v} $args {
                        if {$k ni $known_opts} {
                            error "[self] constructor error: unknown option '$k'. Known options: $known_opts"
                        }
                    }
                    set opts [dict merge $defaults $args]
                    # -- --- --- --- --- --- ---
                    set opt_substitutionmap         [dict get $opts -substitutionmap] ;#review - can be done by caller - or a loadable -policy
                    set opt_crlf_lf_placeholders    [dict get $opts -crlf_lf_placeholders]
                    set opt_userid                  [dict get $opts -userid]
                    # -- --- --- --- --- --- ---

                    if {[llength $opt_crlf_lf_placeholders] != 2 || [string length [lindex $opt_crlf_lf_placeholders 0]] !=1 || [string length [lindex $opt_crlf_lf_placeholders 1]] !=1} {
                        error "textinfo::constructor error: -crlf_lf_placeholders requires a list of exactly 2 chars"
                    }
                    lassign $opt_crlf_lf_placeholders o_LF_C o_CRLF_C
                    if {[string first $o_LF_C $o_chunk] >=0} {
                        set decval [scan $o_LF_C %c]
                        if {$decval < 32 || $decval > 127} {
                            set char_desc "(decimal value $decval)"
                        } else {
                            set char_desc "'$o_LF_C' (decimal value $decval)"
                        }
                        error "textinfo::constructor error: rawfiledata already contains linefeed substitution character $char_desc specified as first element of -crlf_lf_placeholders"
                    }
                    if {[string first $o_CRLF_C $o_chunk] >=0} {
                        set decval [scan $o_CRLF_C %c]
                        if {$decval < 32 || $decval > 127} {
                            set char_desc "(decimal value $decval)"
                        } else {
                            set char_desc "'$o_CRLF_C' (decimal value $decval)"
                        }
                        error "textinfo::constructor error: rawfiledata already contains carriagereturn-linefeed substitution character $char_desc specified as second element of -crlf_lf_placeholders"
                    }
                    if {$o_LF_C eq $o_CRLF_C} {
                        puts stderr "WARNING: same substitution character used for both elements of -crlf_lf_placeholders - byte counting may be off if file contains mixed line-endings"
                    }

                    my regenerate_lines

                }

                method set_bomid {bomid} {
                    if {$bomid ni [dict keys $o_bom_map]} {
                        error "Unrecognised bom-id $bomid. Known values: [dict keys $o_bom_map]"
                    }
                    set o_bom_id $bomid
                    set o_bom [dict get $o_bom_map $bomid]
                }
                method get_bomid {} {
                    return $o_bom_id
                }
                method get_bom {} {
                    return $o_bom
                }

                method chunk {chunkstart chunkend} {
                    #*** !doctools
                    #[call class::textinfo [method chunk] [arg chunkstart] [arg chunkend]]
                    #[para]Return a range of bytes from the underlying raw chunk data.
                    #[para] e.g The following retrieves the entire chunk
                    #[para] objName chunk 0 end
                    return [string range $o_chunk $chunkstart $chunkend]
                }
                method chunklen {} {
                    #*** !doctools
                    #[call class::textinfo [method chunklen]]
                    #[para] Number of bytes/characters in the raw data of the file
                    return [string length $o_chunk]
                }
                method chunk_boundary_display {chunkstart chunkend chunksize args} {
                    #*** !doctools
                    #[call class::textinfo [method chunk_boundary_display]]
                    #[para]Returns a string displaying the boundaries at chunksize bytes between chunkstart and chunkend
                    #[para]Defaults to using ansi colour if punk::ansi module is available. Use -ansi 0 to disable colour
                    set opts [dict create\
                        -ansi $::punk::fileline::ansi::enabled\
                        -offset 0\
                        -displaybytes 200\
                        -truncatedmark "..."\
                        -completemark  "---"\
                        -moremark      " + "\
                        -continuemark  " > "\
                        -linemaxwidth 100\
                        -linebase       0\
                        -limit          -1\
                        -boundaries    {}\
                        -showconfig    0\
                        -boundaryheader {Boundary %i% at %b%}\
                    ]
                    foreach {k v} $args {
                        switch -- $k {
                            -ansi - -offset - -displaybytes - -truncatedmark - -completemark - -moremark - -continuemark - -linemaxwidth - -linebase - -limit - -boundaries - -showconfig - -boundaryheader {
                                dict set opts $k $v
                            }
                            default {
                                error "[self]::chunk_boundary error: unknown option '$k'. Known options: [dict keys $opts]"
                            }
                        }
                    }
                    # -- --- --- --- --- ---
                    set opt_ansi [dict get $opts -ansi]
                    set opt_offset [dict get $opts -offset]
                    set opt_displaybytes [dict get $opts -displaybytes]
                    set opt_tmark [dict get $opts -truncatedmark]
                    set opt_cmark [dict get $opts -completemark]
                    set opt_linemax [dict get $opts -linemaxwidth]
                    set opt_linebase [dict get $opts -linebase]
                    set opt_linebase [string map [list _ ""] $opt_linebase]
                    set opt_limit [dict get $opts -limit] ;#limit number of boundaries to display
                    set opt_boundaries [dict get $opts -boundaries] ;#use pre-calculated boundaries if supplied
                    set opt_showconfig [dict get $opts -showconfig]
                    set opt_boundaryheader [dict get $opts -boundaryheader]
                    # -- --- --- --- --- ---
                    package require overtype
                    # will require punk::char and punk::ansi

                    if {"::punk::fileline::ansi::ansistrip" ne [info commands ::punk::fileline::ansi::ansistrip]} {
                        namespace eval ::punk::fileline::ansi {
                            namespace import ::punk::ansi::*
                        }
                    }

                    #This mechanism for enabling/disabling ansi is a bit clumsy - prone to errors with regard to keeping in sync with any api changes in punk ansi
                    #It's done here to allow this to be used without the full set of punk modules and/or shell - REVIEW

                    #risk of failing to reset on error
                    set pre_ansi_enabled $::punk::fileline::ansi::enabled
                    if {$opt_ansi} {
                        set ::punk::fileline::ansi::enabled 1
                    } else {
                        set ::punk::fileline::ansi::enabled 0
                    }
                    if {"::punk::fileline::ansistrip" ne [info commands ::punk::fileline::ansistrip]} {
                        proc ::punk::fileline::a {args} {
                            if {$::punk::fileline::ansi::enabled} {
                                tailcall ::punk::fileline::ansi::a {*}$args
                            } else {
                                return ""
                            }
                        }
                        proc ::punk::fileline::a+ {args} {
                            if {$::punk::fileline::ansi::enabled} {
                                tailcall ::punk::fileline::ansi::a+ {*}$args
                            } else {
                                return ""
                            }
                        }
                        proc ::punk::fileline::ansistrip {str} {
                            if {$::punk::fileline::ansi::enabled} {
                                tailcall ::punk::fileline::ansi::ansistrip $str
                            } else {
                                return $str
                            }
                        }
                    }
                    set maxline [lindex [my chunkrange_to_linerange $chunkend $chunkend] 0]
                    set minline [lindex [my chunkrange_to_linerange $chunkstart $chunkstart] 0]

                    #suport simple end+-int  (+-)start(+-)int to set linebase to line corresponding to chunkstart or chunkend
                    #also simple int+int and int-int - nothing more complicated (similar to Tcl lrange etc in that regard)
                    #commonly this will be something like -start or -end
                    if {![string is integer -strict $opt_linebase]} {
                        set sign ""
                        set errunrecognised "unrecognised -linebase value '$opt_linebase'. Expected positive or negative integer or -start -start-int -start+int -end -end-int -end+int or -eof (where leading - is optional but probably desirable) "
                        if {[string index $opt_linebase 0] eq "-"} {
                            set sign -
                            set tail [string range $opt_linebase 1 end]
                        } else {
                            set tail [string trimleft $opt_linebase +];#ignore +
                        }
                        #todo - switch -glob -- $tail
                        if {[string match eof* $tail]} {
                            set endmath [string range $tail 3 end]
                            #todo endmath?
                            if {$tail eq "eof"} {
                                set lastline [lindex [my chunkrange_to_linerange end end] 0]
                                set linebase ${sign}$lastline
                            } else {
                                error $errunrecognised
                            }
                        } elseif {[string match end* $tail]} {
                            set endmath [string range $tail 3 end]
                            if {[string length $endmath]} {
                                set op [string index $endmath 0]
                                if {$op in {+ -}} {
                                    set operand [string range $endmath 1 end]
                                    if {[string is integer -strict $operand]} {
                                        if {$op eq "+"} {
                                            set linebase [expr {$maxline + $operand}]
                                        } else {
                                            set linebase [expr {$maxline - $operand}]
                                        }
                                    } else {
                                        error $errunrecognised
                                    }
                                } else {
                                    error $errunrecognised
                                }
                            } else {
                                set linebase $maxline
                            }
                            set linebase ${sign}$linebase
                        } elseif {[string match start* $tail]} {
                            set endmath [string range $tail 5 end]
                            if {[string length $endmath]} {
                                set op [string index $endmath 0]
                                if {$op in {+ -}} {
                                    set operand [string range $endmath 1 end]
                                    if {[string is integer -strict $operand]} {
                                        if {$op eq "+"} {
                                            set linebase [expr {$minline + $operand}]
                                        } else {
                                            set linebase [expr {$minline - $operand}]
                                        }
                                    } else {
                                        error $errunrecognised
                                    }
                                } else {
                                    error $errunrecognised
                                }
                            } else {
                                set linebase $minline
                            }
                            set linebase ${sign}$linebase
                        } elseif {[string match *-* $tail]} {
                            set extras [lassign [split $tail -] int1 int2]
                            if {[llength $extras]} {
                                error $errunrecognised
                            }
                            if {![string is integer -strict $int1] || ![string is integer -strict $int2]} {
                                error $errunrecognised
                            }
                            set linebase [expr {$int1 - $int2}]
                            set linebase ${sign}$linebase
                        } elseif {[string match *+* $tail]} {
                            set extras [lassign [split $tail +] int1 int2]
                            if {[llength $extras]} {
                                error $errunrecognised
                            }
                            if {![string is integer -strict $int1] || ![string is integer -strict $int2]} {
                                error $errunrecognised
                            }
                            set linebase [expr {$int1 + $int2}]
                            set linebase ${sign}$linebase
                        } else {
                            error $errunrecognised
                        }

                    } else {
                        set linebase $opt_linebase
                    }

                    lassign [my numeric_chunkrange $chunkstart $chunkend] chunkstart chunkend

                    if {![llength $opt_boundaries]} {
                        set binfo [lib::range_spans_chunk_boundaries $chunkstart $chunkend $chunksize -offset $opt_offset]
                        set boundaries [dict get $binfo boundaries]
                    } else {
                        set boundaries [list]
                        foreach b $opt_boundaries {
                            if {$chunkstart <= $b && $chunkend >= $b} {
                                lappend boundaries [expr {$b + $opt_offset}]
                            }
                        }
                    }


                    if {![llength $boundaries]} {
                        return "No boundaries found between $chunkstart and $chunkend for chunksize $chunksize (when offset $opt_offset)"
                    }
                    if {$opt_showconfig} {
                        set result "chunk range $chunkstart $chunkend line range $minline $maxline linebase $linebase limit $opt_limit\n"
                    } else {
                        set result ""
                    }
                    set pre_bytes [expr {$opt_displaybytes /2}]
                    set post_bytes $pre_bytes
                    set max_bytes [expr {[my chunklen] -1}]
                    if {$opt_limit > 0} {
                        set boundaries [lrange $boundaries[unset boundaries] 0 $opt_limit-1]
                    }

                    set i 0
                    foreach b $boundaries {
                        if {$opt_boundaryheader ne ""} {
                            set j [expr {$i+1}]
                            append  result [string map [list %b% $b %i% $i %j% $j] $opt_boundaryheader] \n
                        }
                        set low [expr {max(($b - $pre_bytes),0)}]
                        set high [expr {min(($b + $post_bytes),$max_bytes)}]

                        set lineinfolist [my chunkrange_to_lineinfolist $low $high -show_truncated 1]
                        set le_map [list \r\n <crlf> \r <cr> \n <lf>]
                        set result_list [list]
                        foreach lineinfo $lineinfolist {
                            set lineidx [dict get $lineinfo lineindex]

                            set linenum [expr {$lineidx + $linebase}]
                            set s [dict get $lineinfo start]
                            set e [dict get $lineinfo end]

                            set boundarymarker ""
                            set displayidx ""
                            set linenum_display $linenum
                            if {$s <= $b && $e >= $b} {
                                set idx [expr {$b - $s}] ;#index into whole position in whole line - not so useful if we're viewing a small section of a line
                                set char [string index [my line $lineidx] $idx]
                                set char_display [string map [list \r <CR> \n <LF>] $char]
                                if {[dict get $lineinfo is_truncated]} {
                                    set tside [dict get $lineinfo truncatedside]
                                    set truncated [dict get $lineinfo truncated]
                                    set tlen [string length $truncated]
                                    if {"left" in $tside} {
                                        set tleft [dict get $lineinfo truncatedleft]
                                        set tleftlen [string length $tleft]
                                        set displayidx [expr {$idx - $tleftlen}]
                                    } elseif {"right" in $tside} {
                                        set displayidx $idx
                                    }
                                } else {
                                    set displayidx $idx
                                }
                                set boundarymarker "'[a+ green bold]$char_display[a]'@$displayidx"
                                set linenum_display ${linenum_display},$idx
                            }

                            set lhs_status $opt_cmark ;#default
                            set rhs_status $opt_cmark ;#default
                            if {[dict get $lineinfo is_truncated]} {
                                set line [dict get $lineinfo truncated]
                                set tside [dict get $lineinfo truncatedside]
                                if {"left" in $tside && "right" in $tside } {
                                    set lhs_status $opt_tmark
                                    set rhs_status $opt_tmark
                                } elseif {"left" in $tside} {
                                    set lhs_status $opt_tmark
                                } elseif {"right" in $tside} {
                                    set rhs_status $opt_tmark
                                }


                            } else {
                                set line [my line $lineidx]
                            }
                            if {$displayidx ne ""} {
                                set line [string replace $line $displayidx $displayidx [a+ White green bold]$char_display[a]]
                            }
                            set displayline [string map $le_map $line]
                            lappend result_list [list $linenum_display $boundarymarker $lhs_status $displayline $rhs_status]
                        }
                        set title_linenum "LNUM"
                        set linenums [lsearch -index 0 -all -inline -subindices $result_list *]
                        set markers [lsearch -index 1 -all -inline -subindices $result_list *]
                        set lines   [lsearch -index 3 -all -inline -subindices $result_list *]
                        set title_marker ""
                        set title_line "Line"
                        #todo - use punk::char for unicode support of wide chars etc?
                        set widest_linenum [tcl::mathfunc::max {*}[lmap v [concat [list $title_linenum] $linenums] {string length $v}]]
                        set widest_marker [tcl::mathfunc::max {*}[lmap v [concat [list $title_marker] $markers] {string length [ansistrip $v]}]]
                        set widest_status [expr {max([string length $opt_cmark], [string length $opt_tmark])}]
                        set widest_line [tcl::mathfunc::max {*}[lmap v [concat [list $title_line] $lines] {string length $v}]]
                        foreach row $result_list {
                            lassign $row linenum marker lhs_status line rhs_status
                            append result [format " %-*s " $widest_linenum $linenum]
	                        append result [format " %-*s " $widest_marker $marker]
                            append result [format " %-*s " $widest_status $lhs_status]
                            append result [format " %-*s " $widest_line $line]
                            append result [format " %-*s " $widest_status $rhs_status] \n
                        }
                        incr i
                    }
                    set ::punk::fileline::ansi::enabled $pre_ansi_enabled
                    return $result
                }
                method linecount {} {
                    #*** !doctools
                    #[call class::textinfo [method linecount]]
                    #[para] Number of lines in the raw data of the file, counted as per the policy in effect
                    return [llength $o_payloadlist]
                }


                method line {lineindex} {
                    #*** !doctools
                    #[call class::textinfo [method line] [arg lineindex]]
                    #[para]Reconstructs and returns the raw line using the payload and per-line stored line-ending metadata
                    #[para]A 'line' may be returned without a line-ending if the unerlying chunk had trailing data without a line-ending (or the chunk was loaded under a non-standard -policy setting)
                    #[para]Whilst such data may not conform to definitions (e.g POSIX) of the terms 'textfile' and 'line' - it is useful here to represent it as a line with metadata le set to "none"
                    #[para]To return just the data which might more commonly be needed for dealing with lines, use the [method linepayload] method - which returns the line data minus line-ending

                    lassign [my numeric_linerange $lineindex 0] lineindex

                    set le [dict get $o_linemap $lineindex le]
                    set le_chars [dict get [dict create lf \n crlf \r\n none ""] $le]
                    return [lindex $o_payloadlist $lineindex]$le_chars
                }
                method chunk_find_glob {globsearch args} {
                    #todo - use linepayload_find_glob when -ignore_lineendings is 0 - but check truncations for 1st and last line
                    error "unimplemented"
                }
                method linepayload_find_glob {globsearch args} {
                    #*** !doctools
                    #[call class::textinfo [method linepayload_find_glob] [arg globsearch] [opt {option value...}]]
                    #[para]Return a lineinfolist (see [method lineinfo] and [method lineinfolist]) of lines where payload matches the [arg globsearch] string
                    #[para]To limit the returned results use the -limit n option - where -limit 0 means return all matches.
                    #[para]For example: [method linepayload_find_glob] "*test*" -limit 1
                    #[para]The result is always a list of lineinfo dictionaries even if one item is returned
                    #[para] -limitfrom can be start|end
                    #[para]The order of results is always the order as they occur in the data - even if -limitfrom end is specified.
                    #[para]-limitfrom end means that only the last -limit items are returned
                    #[para]Note that as glob accepts [lb]chars[rb]] to mean match any character in the set given by chars, searching for literal square brackets should be done by escaping the bracket with a backslash
                    #[para]This is true even if only a single square bracket is being searched for. e.g {*[lb]file*} will not find the word file followed by a left square-bracket - even though the search didn't close the square brackets.
                    #[para]In the above case - the literal search should be {*\[lb]file*}

                    set opts [dict create\
                        -limit 0\
                        -strategy 1\
                        -start 0\
                        -end end\
                        -limitfrom start\
                    ]
                    foreach {k v} $args {
                        switch -- $k {
                            -limit - -strategy - -start - -end - -limitfrom {
                                dict set opts $k $v
                            }
                            default {
                                error "linepayload_find_glob unknown option '$k'. Known options: [dict keys $opts]"
                            }
                        }
                    }
                    # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
                    set opt_limit       [dict get $opts -limit]
                    if {![string is integer -strict $opt_limit] || $opt_limit < 0} {
                        error "linepayload_find_glob -limit must be positive integer"
                    }
                    # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
                    set opt_strategy    [dict get $opts -strategy]
                    # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
                    set opt_start       [dict get $opts -start]
                    set opt_start [expr {$opt_start}]
                    if {$opt_start != 0} {error "-start unimplemented"}
                    # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
                    set opt_end       [dict get $opts -end]
                    set max_line_index [expr {[llength $o_payloadlist]-1}]
                    if {$opt_end eq "end"} {
                        set opt_end $max_line_index
                    }
                    #TODO
                    if {$opt_end < $max_line_index} {error "-end less than max_line_index unimplemented"}
                    # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
                    set opt_limitfrom   [dict get $opts -limitfrom]
                    #-limitfrom  start|end only
                    #TODO
                    if {$opt_limitfrom ne "start"} {error "-limitfrom unimplemented"}
                    # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---


                    set lineinfolist [list]

                    if {$opt_limit == 1} {
                        set idx [lsearch -glob [lrange $o_payloadlist $opt_start $opt_end] $globsearch]
                        if {$idx >=0} {
                            set i [expr {$opt_start + $idx}]
                            lappend lineinfolist [list lineindex $i [dict get $o_linemap $i] payload [lindex $o_payloadlist $i]]
                        }
                    } elseif {$opt_limit == 0} {
                        set indices [lsearch -all -glob [lrange $o_payloadlist $opt_start $opt_end] $globsearch]
                        foreach irel $indices {
                            set i [expr {$opt_start + $irel}]
                            lappend lineinfolist [list lineindex $i [dict get $o_linemap $i] payload  [lindex $o_payloadlist $i]]
                        }
                    } else {
                        #todo - auto-strategy based on limit vs number of lines
                        if {$opt_strategy == 0} {
                            set posn 0
                            for {set r 0} {$r < $opt_limit} {incr r} {
                                set n [lsearch [lrange $o_payloadlist $posn+$opt_start end] $globsearch]
                                if {$n >=0} {
                                    set irel [expr {$posn + $n}]
                                    set i [expr {$irel + $opt_start}]
                                    lappend lineinfolist [list lineindex $i {*}[dict get $o_linemap $i] payload [lindex $o_payloadlist $i]]
                                    set posn [expr {$irel+1}]
                                }
                            }
                        } else {
                            set indices [lsearch -all -glob [lrange $o_payloadlist $opt_start $opt_end] $globsearch]
                            set limited [lrange $indices 0 $opt_limit-1]
                            foreach irel $limited {
                                set i [expr {$opt_start + $irel}]
                                lappend lineinfolist [list lineindex $i {*}[dict get $o_linemap $i] payload [lindex $o_payloadlist $i]]
                            }
                        }
                    }
                    return $lineinfolist
                }
                method linepayload {lineindex} {
                    #*** !doctools
                    #[call class::textinfo [method linepayload] [arg lineindex]]
                    #[para]Return the text of the line indicated by the zero-based lineindex
                    #[para]The line-ending is not returned in the data - but is still stored against this lineindex
                    #[para]Line Metadata such as the line-ending for a particular line and the byte/character range it occupies within the chunk can be retrieved with the [method linemeta] method
                    #[para]To retrieve both the line text and metadata in a single call the [method lineinfo] method can be used
                    #[para]To retrieve an entire line including line-ending use the [method line] method.
                    lassign [my numeric_linerange $lineindex 0] lineindex
                    return [lindex $o_payloadlist $lineindex]
                }
                method linepayloads {startindex endindex} {
                    #*** !doctools
                    #[call class::textinfo [method linepayloads] [arg startindex] [arg endindex]]
                    #[para]Return a list of just the payloads in the specified linindex range, with no metadata.
                    return [lrange $o_payloadlist $startindex $endindex]
                }
                method linemeta {lineindex} {
                    #*** !doctools
                    #[call class::textinfo [method linemeta] [arg lineindex]]
                    #[para]Return a dict of the metadata for the line indicated by the zero-based lineindex
                    #[para]Keys returned include
                    #[list_begin itemized]
                    #[item] le
                    #[para] A string representing the type of line-ending:  crlf|lf|none
                    #[item] linelen
                    #[para] The number of characters/bytes in the whole line including line-ending if any
                    #[item] payloadlen
                    #[para] The number of character/bytes in the line excluding line-ending
                    #[item] start
                    #[para] The zero-based index into the associated raw file data indicating at which byte/character index this line begins
                    #[item] end
                    #[para] The zero-based index into the associated raw file data indicating at which byte/character index this line ends
                    #[para] This end-point corresponds to the last character of the line-ending if any - not necessarily the last character of the line's payload
                    #[list_end]
                    lassign [my numeric_linerange $lineindex 0] lineindex
                    dict get $o_linemap $lineindex
                }
                method lineinfo {lineindex} {
                    #*** !doctools
                    #[call class::textinfo [method lineinfo] [arg lineindex]]
                    #[para]Return a dict of the metadata and text for the line indicated by the zero-based lineindex
                    #[para]This returns the same info as the [method linemeta] with an added key of 'payload' which is the text of the line without line-ending.
                    #[para]The 'payload' value is the same as is returned from the [method linepayload] method.
                    lassign [my numeric_linerange $lineindex 0] lineindex ;#convert lineindex to canonical number e.g 1_000 -> 1000  end -> highest index
                    return [dict create lineindex $lineindex {*}[dict get $o_linemap $lineindex] payload [lindex $o_payloadlist $lineindex]]
                }
                method lineinfolist {startidx endidx} {
                    #*** !doctools
                    #[call class::textinfo [method lineinfolist] [arg startidx] [arg endidx]]
                    #[para]Returns list of lineinfo dicts for each line in line index range startidx to endidx
                    lassign [my numeric_linerange $startidx $endidx] startidx endidx
                    set chunkstart  [dict get $o_linemap $startidx start]
                    set chunkend    [dict get $o_linemap $endidx end]
                    set line_list [my chunkrange_to_lineinfolist $chunkstart $chunkend] ;# assertion - no need to view truncations as we've picked start and end of complete lines
                    #verify sanity
                    set l_start [lindex $line_list 0]
                    if {[set idx_start [dict get $l_start lineindex]] ne $startidx} {
                        error "lineinfolist first lineindex $idx_start doesn't match startidx $startidx"
                    }
                    set l_end [lindex $line_list end]
                    if {[set idx_end [dict get $l_end lineindex]] ne $endidx} {
                        error "lineinfolist last lineindex $idx_end doesn't match endidx $endidx"
                    }
                    return $line_list
                }

                method linerange_to_chunkrange {startidx endidx} {
                    #*** !doctools
                    #[call class::textinfo [method linerange_to_chunkrange] [arg startidx] [arg endidx]]

                    lassign [my numeric_linerange $startidx $endidx] startidx endidx
                    #inclusive range
                    return [list [dict get $o_linemap $startidx start] [dict get $o_linemap $endidx end]]
                }
                method linerange_to_chunk {startidx endidx} {
                    #*** !doctools
                    #[call class::textinfo [method linerange_to_chunk] [arg startidx] [arg endidx]]
                    set chunkrange [my linerange_to_chunkrange $startidx $endidx]
                    return [string range $o_chunk [lindex $chunkrange 0] [lindex $chunkrange 1]]
                }
                method lines {startidx endidx} {
                    #*** !doctools
                    #[call class::textinfo [method lines] [arg startidx] [arg endidx]]
                    lassign [my numeric_linerange $startidx $endidx] startidx endidx
                    set linelist [list]
                    set le_map [dict create lf \n crlf \r\n none ""]
                    for {set i $startidx} {$i <= $endidx} {incr i} {
                        lappend linelist "[lindex $o_payloadlist $i][dict get $le_map [dict get $o_linemap $i le]]"
                    }
                    return $linelist
                }
                method linepayloads {startidx endidx} {
                    #*** !doctools
                    #[call class::textinfo [method linepayloads] [arg startidx] [arg endidx]]
                    return [lrange $o_payloadlist $startidx $endidx]
                }
                method chunkrange_to_linerange {chunkstart chunkend} {
                    #*** !doctools
                    #[call class::textinfo [method chunkrange_to_linerange] [arg chunkstart] [arg chunkend]]
                    lassign [my numeric_chunkrange $chunkstart $chunkend] chunkstart chunkend

                    set linestart -1
                    for {set i 0} {$i < [llength $o_payloadlist]} {incr i} {
                        if {($chunkstart >= [dict get $o_linemap $i start]) && ($chunkstart <= [dict get $o_linemap $i end])} {
                            set linestart $i
                            break
                        }
                    }
                    if {$linestart == -1} {
                        error "Line with range in chunk spanning start index $chunkstart not found"
                    }
                    set lineend -1
                    for {set i [expr {[llength $o_payloadlist] -1}]} {$i >=0} {incr i -1} {
                        if {($chunkend >= [dict get $o_linemap $i start]) && ($chunkend <= [dict get $o_linemap $i end])} {
                            set lineend $i
                            break
                        }
                    }
                    if {$lineend == -1} {
                        error "Line with range spanning end index $chunkend not found"
                    }
                    return [list $linestart $lineend]
                }
                method chunkrange_to_lineinfolist {chunkstart chunkend args} {
                    #*** !doctools
                    #[call class::textinfo [method chunkrange_to_lineinfolist] [arg chunkstart] [arg chunkend] [opt {option value...}]]
                    #[para]Return a list of dicts each with structure like the result of the [method lineinfo] method - but possibly with extra keys for truncation information if -show_truncated 1 is supplied
                    #[para]The truncation key in a lineinfo dict may be returned for first and/or last line in the resulting list.
                    #[para]truncation shows the shortened (missing bytes on left and/or right side) part of the entire line (potentially including line-ending or even partial line-ending)
                    #[para]Note that this truncation info is only in the return value of this method - and will not be reflected in [method lineinfo] queries to the main chunk.

                    lassign [my numeric_chunkrange $chunkstart $chunkend] chunkstart chunkend
                    set defaults [dict create\
                        -show_truncated 0\
                    ]
                    set known_opts [dict keys $defaults]
                    foreach {k v} $args {
                        if {$k ni $known_opts} {
                            error "chunkrange_to_lines error: unknown option '$k'. Known options: $known_opts"
                        }
                    }
                    set opts [dict merge $defaults $args]
                    # -- --- --- --- --- --- --- ---
                    set opt_show_truncated [dict get $opts -show_truncated]
                    # -- --- --- --- --- --- --- ---

                    set infolist [list]
                    set linerange [my chunkrange_to_linerange $chunkstart $chunkend]
                    lassign $linerange start_lineindex end_lineindex

                    #if -show_truncated
                    #return extra keys for first and last items (which may be the same item if chunkrange is entirely within a line)
                    #add is_truncated 0|1 to all lines
                    #Even if the start/end line is not fully within the chunkrange ie truncated - the 'payload' key will contain the original untruncated data
                    ###########################
                    # first line may have payload tail truncated - or just linefeed, or even a split linefeed
                    ###########################
                    set first [dict create lineindex $start_lineindex {*}[dict get $o_linemap $start_lineindex] payload [lindex $o_payloadlist $start_lineindex]]
                    set start_info [dict get $o_linemap $start_lineindex]


                    if {$chunkstart > [dict get $start_info start]} {
                        dict set first is_truncated 1
                        dict set first truncatedside [list left] ;#truncatedside is a list which may have 'right' added if last line is same as first line
                    } else {
                        dict set first is_truncated 0
                    }

                    if {$opt_show_truncated} {
                        #line1
                        if {$chunkstart > [dict get $start_info start]} {
                            #there is lhs truncation
                            set payload [lindex $o_payloadlist $start_lineindex]
                            set line_start [dict get $start_info start]
                            set le_chars [dict get [dict create lf \n crlf \r\n none ""] [dict get $start_info le]]
                            set payload_and_le "${payload}${le_chars}"
                            set split [expr {$chunkstart - $line_start}]
                            set truncated [string range $payload_and_le $split end]
                            set lhs [string range $payload_and_le 0 $split-1]

                            dict set first truncated $truncated
                            dict set first truncatedleft $lhs
                        }
                    }
                    ###########################

                    ###########################
                    # middle lines if any - no truncation
                    ###########################
                    #difference in indexes of 1 would only mean 2 items to return
                    set middle_list [list]
                    if {($end_lineindex - $start_lineindex) > 1} {
                        for {set i [expr {$start_lineindex +1}]} {$i <= [expr {$end_lineindex -1}] } {incr i} {
                            #lineindex is key into main list
                            lappend middle_list [dict create lineindex $i {*}[dict get $o_linemap $i] payload [lindex $o_payloadlist $i] is_truncated 0]
                        }
                    }
                    ###########################

                    ###########################
                    # tail line  may have beginning or all of payload truncated - linefeed may be split if crlf
                    # may be same line as first line - in which case truncation at beginning as well
                    if {$end_lineindex == $start_lineindex} {
                        #same record
                        set end_info $start_info


                        if {$chunkend < [dict get $end_info end]} {
                            #there is rhs truncation
                            if {[dict get $first is_truncated]} {
                                dict set first truncatedside [list left right]
                            } else {
                                dict set first is_truncated 1
                                dict set first truncatedside [list right]
                            }
                        }

                        if {$opt_show_truncated} {
                            if {$chunkend < [dict get $end_info end]} {
                                #there is rhs truncation and we need to return the splits
                                #do rhs truncation - possibly in addition to existing lhs truncation
                                # ...
                                set payload [lindex $o_payloadlist $end_lineindex]
                                set line_start [dict get $end_info start]
                                set le_chars [dict get [dict create lf \n crlf \r\n none ""] [dict get $end_info le]]
                                set payload_and_le "${payload}${le_chars}"
                                set split [expr {$chunkend - $line_start}]
                                set truncated [string range $payload_and_le 0 $split]
                                set rhs [string range $payload_and_le $split+1 end]
                                dict set first truncatedright $rhs
                                if {"left" ni [dict get $first truncatedside]} {
                                    #rhs truncation only
                                    puts "payload_and_le: $payload_and_le"
                                    puts "LENGTH: [string length $payload_and_le]"
                                    #---
                                    #---
                                    dict set first truncated $truncated
                                    dict set first truncatedside [list right]
                                } else {
                                    #truncated on both sides
                                    set lhslen [string length [dict get $first truncatedleft]]
                                    #re-truncate the truncation to reapply the original lhs truncation
                                    set truncated [string range $truncated $lhslen end]
                                    dict set first truncated $truncated
                                }
                            }
                        }
                        #no middle or last to append
                        lappend infolist $first
                    } else {
                        set last [dict create lineindex $end_lineindex {*}[dict get $o_linemap $end_lineindex] payload [lindex $o_payloadlist $end_lineindex]]
                        set end_info [dict get $o_linemap $end_lineindex]


                        if {$chunkend < [dict get $end_info end]} {
                            dict set last is_truncated 1
                            dict set last truncatedside [list right]
                        } else {
                            dict set last is_truncated 0
                        }

                        if {$opt_show_truncated} {
                            if {$chunkend < [dict get $end_info end]} {
                                #there is rhs truncation - and last line in range is a different line to first one
                                set payload [lindex $o_payloadlist $end_lineindex]
                                set line_start [dict get $end_info start]
                                set line_end [dict get $end_info end]
                                set le [dict get $end_info le]
                                set le_size [dict get {lf 1 crlf 2 none 0} $le]
                                set le_chars [dict get [dict create lf \n crlf \r\n none ""] $le]
                                set payload_and_le "${payload}${le_chars}"

                                set split [expr {$chunkend - $line_start}]
                                set truncated [string range $payload_and_le 0 $split]
                                set rhs [string range $payload_and_le $split+1 end]

                                dict set last truncated $truncated
                                dict set last truncatedright $rhs
                                #this has the effect that truncating the rhs by 1 can result in truncated being larger than original payload for crlf lines - as payload now sees the cr
                                #this is a bit unintuitive - but probably best reflects the reality. The truncated value is the truncated 'line' rather than the truncated 'payload'
                            }
                        }


                        lappend infolist $first
                        if {[llength $middle_list]} {
                            lappend infolist {*}$middle_list
                        }
                        lappend infolist $last
                    }
                    ###########################
                    #assertion  all records have is_truncated key.
                    #assertion if is_truncated == 1 truncatedside should contain a list of either left, right or both left and right
                    #assertion If not opt_show_truncated - then truncated records will not have truncated,truncatedleft,truncatedright keys.
                    return $infolist
                }

                #need to check truncations so that any split \r\n is counted precisely  todo
                method chunk_le_counts {chunkstart chunkend} {
                    set infolines [my chunkrange_to_lineinfolist $chunkstart $chunkend -show_truncated 1]
                    set lf_count 0
                    set crlf_count 0
                    set none_count 0
                    foreach d $infolines {
                        set le [dict get $d le]
                        if {$le eq "lf"} {
                            incr lf_count
                        } elseif {$le eq "crlf"} {
                            incr crlf_count
                        } else {
                            incr none_count
                        }
                    }
                    #even without split crlf - this can overcount by counting the lf or crlf in a line which had an ending not in the chunk range specified

                    #check first and last infoline for truncations
                    #Also check if the truncation is directly between an crlf
                    #both an lhs split and an rhs split could land between cr and lf
                    #to be precise - we should presumably count the part within our chunk as either a none for cr or an lf
                    #This means a caller counting chunk by chunk using this method will sometimes get the wrong answer depending on where crlfs lie relative to their chosen chunk size
                    #This is presumably ok - as it should be a well known thing to watch out for.
                    #If we're only receiving chunk by chunk we can't reliably detect splits vs lone <cr>s in the data
                    #There are surely more efficient ways for a caller to count line-endings in the way that makes sense for them
                    #but we should makes things as easy as possible for users of this line/chunk structure anyway.

                    set first [lindex $infolines 0]
                    if {[dict get $first is_truncated]} {
                        #could be the only line - and truncated at one or both ends.
                        #both a left and a right truncation could split a crlf

                    }
                    set last  [lindex $infolines end]
                    if {[dict get $first lineindex] != [dict get $last lineindex]} {
                        #only need to process last if it is a different line
                        #if so - then split can only be left side

                    }


                    return [dict create lf $lf_count crlf $crlf_count unterminated $none_count warning line_ending_splits_unimplemented]
                }

                #todo - test last line and merge as necessary with first line from new chunk - generate line data only for appended chunk
                method append_chunk {rawchunk} {
                    error "sorry - unimplemented"
                }

                method numeric_linerange {startidx endidx} {
                    #*** !doctools
                    #[call class::textinfo [method numeric_linerange] [arg startidx] [arg endidx]]
                    #[para]A helper to return any Tcl-style end end-x values given to startidx or endidx; converted to their specific values based on the current state of the underlying line data
                    #[para]This is used internally by API functions such as [method line] to enable it to accept more expressive indices
                    return [my normalize_indices $startidx $endidx [expr {[dict size $o_linemap]-1}]]
                }
                method numeric_chunkrange {startidx endidx} {
                    #*** !doctools
                    #[call class::textinfo [method numeric_chunkrange] [arg startidx] [arg endidx]]
                    #[para]A helper to return any Tcl-style end end-x entries supplied to startidx or endidx; converted to their specific values based on the current state of the underlying chunk data
                    return [my normalize_indices $startidx $endidx [expr {[string length $o_chunk]-1}]]
                }
                method normalize_indices {startidx endidx max} {
                    #*** !doctools
                    #[call class::textinfo [method normalize_indices] [arg startidx] [arg endidx] [arg max]]
                    #[para]A utility to convert some of the of Tcl-style list-index expressions such as end, end-1 etc to valid indices in the range 0 to the supplied max
                    #[para]Basic addition and subtraction expressions such as 4-1  5+2 are accepted
                    #[para]startidx higher than endidx is allowed
                    #[para]Unlike Tcl's index expressions - we raise an error if the calculated index is out of bounds 0 to max
                    set original_startidx $startidx
                    set original_endidx $endidx
                    set startidx [string map [list _ ""] $startidx] ;#don't barf on Tcl 8.7+ underscores in numbers - we can't just use expr because it will not handle end-x
                    set endidx   [string map [list _ ""] $endidx]
                    if {![string is digit -strict "$startidx$endidx"]} {
                        foreach whichvar [list start end] {
                            upvar 0 ${whichvar}idx index
                            if {![string is digit -strict $index]} {
                                switch -glob -- $index {
                                    end {
                                        set index $max
                                    }
                                    "*-*" {
                                        #end-int  or int-int - like lrange etc we don't accept arbitrarily complex expressions
                                        lassign [split $index -] A B
                                        if {$A eq "end"} {
                                            set index [expr {$max - $B}]
                                        } else {
                                            set index [expr {$A - $B}]
                                        }
                                    }
                                    "*+*" {
                                        lassign [split $index +] A B
                                        if {$A eq "end"} {
                                            #review - this will just result in out of bounds error in final test - as desired
                                            #By calculating here - we will see the result in the error message - but it's probably not particularly useful - as we don't really need end+ support at all.
                                            set index [expr {$max + $B}]
                                        } else {
                                            set index [expr {$A + $B}]
                                        }
                                    }
                                    default {
                                        #May be something like +2 or -0  which braced expr can hanle
                                        #we would like to avoid unbraced expr here - as we're potentially dealing with ranges that may come from external sources.
                                        if {[catch {expr {$index}} index]} {
                                            #could be  end+x - but we don't want out of bounds to be valid
                                            #set it to something that the final bounds expr test can deal with
                                            set index Inf
                                        }
                                    }
                                }
                            }
                        }
                    }
                    #Unlike Tcl lrange,lindex etc - we don't want to support out of bound indices.
                    #show the supplied index and what it was mapped to in the error message.
                    if {$startidx < 0 || $startidx > $max} {
                        error "Bad start index '$original_startidx'. $startidx out of bounds 0 - $max"
                    }
                    if {$endidx < 0 || $endidx > $max} {
                        error "Bad end index '$original_endidx'.  $endidx out of bounds 0 - $max (try $max or end)"
                    }
                    return [list $startidx $endidx]
                }

                method regenerate_lines {args} {
                    #*** !doctools
                    #[call class::textinfo [method regenerate_lines]]
                    #[para]generate a list of lines from the current state of the stored raw data chunk and keep a map of line-endings indexed by lineindex
                    #[para]This is called automatically by the Constructor during object creation
                    #[para]It is exposed in the API experimentally - as chunk and line manipulation functions are considered.
                    #[para]TODO - review whether such manual control will be necessary/desirable

                    #we don't store the actual line-endings as characters (for better layout of debug/display of data) - instead we store names lf|crlf|none

                    # first split on lf - then crlf. As we've replaced with single substution chars - the order doesn't matter.
                    set o_payloadlist [list]
                    set o_linemap [dict create]
                    set crlf_replace [list \r\n $o_CRLF_C \n $o_LF_C]
                    set normalised_data [string map $crlf_replace $o_chunk]

                    set lf_lines [split $normalised_data $o_LF_C]

                    set idx 0
                    set lf_count 0
                    set crlf_count 0
                    set filedata_offset 0
                    set i 0
                    set imax [expr {[llength $lf_lines]-1}]
                    foreach lfln $lf_lines {
                        set crlf_parts [split $lfln $o_CRLF_C]
                        if {[llength $crlf_parts] <= 1} {
                            #no crlf
                            set payloadlen [string length $lfln]
                            set le_size 1
                            set le lf
                            if {$i == $imax} {
                                #no more lf segments - and no crlfs
                                if {$payloadlen > 0} {
                                    #last line in split has chars - therefore there was no trailing line-ending
                                    set le_size 0
                                    set le none
                                } else {
                                    #empty space after last line-ending
                                    #not really a line - we get here from splitting on our lf-replacement char
                                    #An editor might display this pseudo-line with a line number - but we won't treat it as one here
                                    break
                                }
                            }
                            lappend o_payloadlist $lfln
                            set linelen [expr {$payloadlen + $le_size}]
                            #we include line-ending in byte count for a line.
                            dict set o_linemap $idx [list le $le linelen $linelen payloadlen $payloadlen start $filedata_offset end [expr {$filedata_offset + $linelen -1}]]
                            incr filedata_offset $linelen
                            incr lf_count
                            incr idx
                        } else {
                            foreach crlfpart [lrange $crlf_parts 0 end-1] {
                                lappend o_payloadlist $crlfpart
                                set payloadlen [string length $crlfpart]
                                set linelen [expr {$payloadlen + 2}]
                                dict set o_linemap $idx [list le crlf linelen $linelen payloadlen $payloadlen start $filedata_offset end [expr {$filedata_offset + $linelen -1}]]
                                incr filedata_offset $linelen
                                incr crlf_count
                                incr idx
                            }
                            set lfpart [lindex $crlf_parts end]
                            set payloadlen [string length $lfpart]
                            if {$i == $imax} {
                                #no more lf segments - but we did find crlf in last (or perhaps only) lf line
                                #last element in our split  has no le
                                if {$payloadlen > 0} {
                                    set le_size 0
                                    set le none
                                } else {
                                    #set le_size 2
                                    #set le crlf
                                    break
                                }
                            }  else {
                                #more lf segments to come
                                    set le_size 1
                                    set le lf
                            }

                            lappend o_payloadlist $lfpart
                            set linelen [expr {$payloadlen + $le_size}]
                            dict set o_linemap $idx [list le $le linelen $linelen payloadlen $payloadlen start $filedata_offset end [expr {$filedata_offset + $linelen -1}]]
                            incr filedata_offset $linelen
                            incr lf_count
                            incr idx
                        }
                        incr i
                        #incr filedata_offset ;#move up 1 so start entry for next line is greater than end entry for previous line
                    }
                    set le_count [expr {$lf_count + $crlf_count}]
                    if {$le_count != [llength $o_payloadlist]} {
                        puts stderr "fileline::class::textinfo warning. regenerate_lines lf_count: $lf_count + crlf_count: $crlf_count does not equal length of lines stored: [llength $o_payloadlist]"
                    }

                }
                method regenerate_chunk {} {
                    #o_payloadlist
                    #o_linemap
                    set oldsize [string length $o_chunk]
                    set newchunk ""
                    dict for {idx lineinfo} $o_linemap {
                        set

                    }

                    return [list newsize [string length $newchunk] oldsize $oldsize]
                }


                #*** !doctools
                #[list_end]
            }
        #*** !doctools
        #[list_end] [comment {--- end class enumeration ---}]
    }
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# Base namespace
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
namespace eval punk::fileline {
    namespace export *
    #variable xyz

    #*** !doctools
    #[subsection {Namespace punk::fileline}]
    #[para] Core API functions for punk::fileline
    #[list_begin definitions]

    punk::args::define {
        @id     -id ::punk::fileline::get_textinfo
        @cmd    -name punk::fileline::get_textinfo -help\
            "return: textinfo object instance"
        -file -default {} -type existingfile
        -translation -default iso8859-1
        -encoding -default "\uFFFF"
        -includebom -default 0
        @values -min 0 -max 1
    }
    proc get_textinfo {args} {
        #*** !doctools
        #[call get_textinfo [opt {option value...}] [opt datachunk]]
        #[para]Returns textinfo object instance representing data in string datachunk or if -file filename supplied - data loaded from a file
        #[para]The encoding used is as specified in the -encoding option - or from the Byte Order Mark (bom) at the beginning of the data
        #[para]For Tcl 8.6 - encodings such as utf-16le may not be available - so the bytes are swapped appropriately depending on the platform byteOrder and encoding 'unicode' is used.
        #[para]encoding defaults to utf-8 if no -encoding specified and no BOM was found
        #[para]Whether  -encoding was specified or not - by default the BOM characters are not retained in the line-data
        #[para]If -includebom 1 is specified - the bom will be retained in the stored chunk and the data for line 1, but will undergo the same encoding transformation as the rest of the data
        #[para]The get_bomid method of the returned object will contain an identifier for any BOM encountered.
        #[para] e.g utf-8,utf-16be, utf-16le, utf-32be, utf32-le, SCSU, BOCU-1,GB18030, UTF-EBCDIC, utf-1, utf-7
        #[para]If the encoding specified in the BOM isn't recognised by Tcl - the resulting data is likely to remain as the raw bytes of whatever encoding that is.
        #[para]Currently only utf-8, utf-16* and utf-32* are properly supported even though the other BOMs are detected, reported via get_bomid, and stripped from the data.
        #[para]GB18030 falls back to cp936/gbk (unless a gb18030 encoding has been installed). Use -encoding iso8859-1 if this isn't suitable and you need to do your own processing of the bytes.

        lassign [dict values [punk::args::get_by_id ::punk::fileline::get_textinfo $args]] opts values
        # -- --- --- ---
        set opt_file    [dict get $opts -file]
        set opt_translation [dict get $opts -translation]
        set opt_encoding    [dict get $opts -encoding]
        set opt_includebom  [dict get $opts -includebom]
        # -- --- --- ---

        if {$opt_file ne ""} {
            set filename $opt_file
            set fd [open $filename r]
            fconfigure $fd -translation binary -encoding $opt_translation;#should use translation binary to get actual line-endings - but we allow caller to override
            #Always read encoding in binary - check for bom below and/or apply chosen opt_encoding
            set rawchunk [read $fd]
            close $fd
            if {[llength $values]} {
                puts stderr "Ignoring trailing argument [string length [lindex $values 0]] bytes. Not used when -file is specified"
            }
        } else {
            set rawchunk [lindex $values 0]
        }
        set rawlen [string length $rawchunk]
        #examine first 4 bytes for possible BOM
        #big-endian BOMs
        # -----------------------------------
        #EFBBBF    - utf-8         reliabletxt
        #FEFF      - utf-16be      reliabletxt
        #FFFE      - utf-16le      reliabletxt
        #0000FEFF  - utf-32be      reliabletxt
        #FFFE0000  - utf-32le
        #0000FFFE  - utf-32be(2143)  non-standard!   (not supported)
        #FEFF0000  - utf-32le(3412)  non-standard!   (not supported - will detect as utf-16be)
        #2B2F76    - utf-7          (not supported)
        #F7644C    - utf-1          (not supported)
        #DD736673  - UTF-EBCDIC     (not supported)
        #0EFEFF    - SCSU           (not supported)
        #FBEE28    - BOCU-1     Binary Ordered Compression for Unicode (mime-compatible) - (not supported - fall back to utf-8)
        #84319533  - GB18030    -  Chinese gov standard (fall back to cp936 with warning if no encoding name)
        # -----------------------------------

        set first32 [string range $rawchunk 0 3]
        #scan using capital H for big-endian order
        set first32_be [binary scan $first32 H* maybe_bom] ;#we use H* instead of H8 for 8 nibbles (4 bytes) - because our first32 may contain less than 4 bytes - in which case we won't match
        set bomid ""
        set bomenc ""
        set is_reliabletxt 0 ;#see http://reliabletxt.com - only utf-8 with bom, utf-16be, utf-16le, utf-32be supported as at 2024
        set startdata 0
        #todo switch -glob
        if {[string match "efbbbf*" $maybe_bom]} {
            set bomid utf-8
            set bomenc utf-8
            set is_reliabletxt 1
            set startdata 3
        } elseif {$maybe_bom eq "0000feff"} {
            set bomid utf-32be
            set bomenc utf-32be
            set is_reliabletxt 1
            set startdata 4
        } elseif {$maybe_bom eq "fffe0000"} {
            #Technically ambiguous - could be utf-16le bom followed by utf-16 null character (2 byte null)
            puts stderr "WARNING - ambiguous BOM fffe0000 found. Treating as utf-32le - but could be utf-16le - consider manually setting -encoding or converting data to another encoding."
            set bomid utf-32le
            set bomenc utf-32le
            set startdata 4
        } elseif {[string match "feff*" $maybe_bom]} {
            set bomid utf-16be
            set bomenc utf-16be
            set is_reliabletxt 1
            set startdata 2
        } elseif {[string match "fffe*" $maybe_bom]} {
            set bomid utf-16le
            set bomenc utf-16le
            set is_reliabletxt 1
            set startdata 2
        } elseif {$maybe_bom eq "0efeff"} {
            set bomid scsu
            set bomenc "binary"
            set startdata 3
        } elseif {$maybe_bom eq "fbee28"} {
            set bomid bocu-1
            puts stderr "WARNING - bocu-1 BOM FBEE28 found. Not supported - back to binary"
            set bomenc "binary" ;# utf-8???
            set startdata 3
        } elseif {$maybe_bom eq "84319533"} {
            if {![dict exists [punk::char::page_names_dict gb18030]]} {
                puts stderr "WARNING - no direct support for GB18030 (chinese) - falling back to cp936/gbk"
                set bomenc cp936
            } else {
                set bomenc [dict get [punk::char::page_names_dict gb18030]] ;#review - this may never exist in Tcl or may be named differently - create a handler?
            }
            set bomid gb18030
            set startdata 4
        } elseif {$maybe_bom eq "f7644c"} {
            puts stderr "WARNING utf-1 BOM F7644C found - not supported. Falling back to binary"
            set bomid utf-1
            set bomenc binary
            set startdata 3
        } elseif {[string match "2b2f76*" $maybe_bom]} {
            puts stderr "WARNING utf-7 BOM 2b2f76 found - not supported. Falling back to binary and leaving BOM in data!"
            #review - work out how to strip bom - last 2 bits of 4th byte belong to following character
            set bomid utf-7
            set bomenc binary
            set startdata 0
        }

        #todo - check xml encoding attribute / html content-type
        #todo - a separate chardet (https://chardet.readthedocs.io/ ) or mozilla like mechanism that can be manually called to autodetect character encoding
        #This should be an explicit operation - not automatially done here unless we provide a flag for it.


        if {$opt_includebom} {
            set startdata 0
        }

        if {$opt_encoding eq "\uFFFF"} {
            if {$bomenc ne "" && $bomenc ne "binary"} {
                if {[package vcompare [package provide Tcl] 8.7] < 0} {
                    #tcl 8.6 has unicode encoding but not utf-16le etc
                    if {$bomenc ni [encoding names]} {
                        if {$bomenc eq "utf-16le"} {
                            if {$::tcl_platform(byteOrder) eq "littleEndian"} {
                                set datachunk [encoding convertfrom unicode [string range $rawchunk $startdata end]]
                                set encoding_selected unicode
                            } else {
                                set datachunk [encoding convertfrom unicode [system::wordswap16 [string range $rawchunk $startdata end]]]
                                set encoding_selected "unicode (wordswapped 16)"
                            }
                        } elseif {$bomenc eq "utf-16be"} {
                            if {$::tcl_platform(byteOrder) eq "littleEndian"} {
                                set datachunk [encoding convertfrom unicode [system::wordswap16 [string range $rawchunk $startdata end]]]
                                set encoding_selected "unicode (wordswapped 16)"
                            } else {
                                set datachunk [encoding convertfrom unicode [string range $rawchunk $startdata end]]
                                set encoding_selected unicode
                            }
                        } elseif {$bomenc eq "utf-32le"} {
                            if {$::tcl_platform(byteOrder) eq "littleEndian"} {
                                set datachunk [encoding convertfrom unicode [string range $rawchunk $startdata end]]
                                set encoding_selected unicode
                            } else {
                                set datachunk [encoding convertfrom unicode [system::wordswap32 [string range $rawchunk $startdata end]]]
                                set encoding_selected "unicode (wordswapped 32)"
                            }
                        } elseif {$bomenc eq "utf-32be"} {
                            if {$::tcl_platform(byteOrder) eq "littleEndian"} {
                                set datachunk [encoding convertfrom unicode [system::wordswap32 [string range $rawchunk $startdata end]]]
                                set encoding_selected "unicode (wordswapped 32)"
                            } else {
                                set datachunk [encoding convertfrom unicode [string range $rawchunk $startdata end]]
                                set encoding_selected unicode
                            }
                        } else {
                            error "Encoding $bomenc unavailable in this version of Tcl"
                        }
                    }  else {
                        set datachunk [encoding convertfrom $bomenc [string range $rawchunk $startdata end]]
                        set encoding_selected $bomenc
                    }
                } else {
                    #tcl 8.7 plus has utf-16le etc
                    set datachunk [encoding convertfrom $bomenc [string range $rawchunk $startdata end]]
                    set encoding_selected $bomenc
                }
            } else {
                #!?
                if {$bomenc eq "binary"} {
                    set datachunk [string range $rawchunk $startdata end]
                    set encoding_selected binary
                } else {
                    set datachunk [encoding convertfrom utf-8 [string range $rawchunk $startdata end]]
                    set encoding_selected utf-8
                }
            }
        } else {
            #manually specified encoding overrides bom - but still remove bom-chars REVIEW
            #e.g we still want bom info - but specify binary encoding

            if {$opt_encoding eq "binary"} {
                set datachunk [string range $rawchunk $startdata end]
            } else {
                set datachunk [encoding convertfrom $opt_encoding [string range $rawchunk $startdata end]]
            }
            set encoding_selected $opt_encoding
        }

        set textobj [class::textinfo new $datachunk]
        if {$bomid ne ""} {
            $textobj set_bomid $bomid
        }


        set summary ""
        append summary "Bytes loaded           :  $rawlen" \n
        append summary "BOM ID                 :  $bomid" \n
        append summary "Encoding selected      :  $encoding_selected" \n
        append summary "Characters             :  [$textobj chunklen]" \n
        append summary "Lines recognised       :  [$textobj linecount]" \n
        set leinfo [$textobj chunk_le_counts 0 end]
        append summary "crlf endings (windows) :  [dict get $leinfo crlf]" \n
        append summary "lf endings   (unix)    :  [dict get $leinfo lf]" \n
        append summary "unterminated lines     :  [dict get $leinfo unterminated]" \n
        puts stdout $summary
        return $textobj
    }

    proc file_boundary_display {filename startbyte endbyte chunksize args} {
        set fd [open $filename r] ;#use default error if file not readable
        fconfigure $fd -translation binary
        set rawfiledata [read $fd]
        close $fd
        set textobj [class::textinfo new $rawfiledata]
        set result [$textobj chunk_boundary_display $startbyte $endbyte $chunksize {*}$args]
        $textobj destroy
        return $result
    }


    #*** !doctools
    #[list_end] [comment {--- end definitions namespace punk::fileline ---}]
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++


# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# Secondary API namespace
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
namespace eval punk::fileline::lib {
    namespace export *
    namespace path [namespace parent]
    #*** !doctools
    #[subsection {Namespace punk::fileline::lib}]
    #[para] Secondary functions that are part of the API
    #[list_begin definitions]


    proc range_spans_chunk_boundaries {start end chunksize args} {
        #*** !doctools
        #[call [fun lib::range_spans_chunk_boundaries] [arg start] [arg end] [arg chunksize]]
        #[para]Takes start and end offset, generally representing bytes or character indices, and computes a list of boundaries at multiples of the chunksize that are spanned by the start and end range.
        #[list_begin arguments]
        #  [arg_def integer start]
        #  [para] zero-based start index of range
        #  [arg_def integer end]
        #  [para] zero-based end index of range
        #  [arg_def integer chunksize]
        #  [para] Number of bytes/characters  in chunk - must be positive and > 0
        #[list_end]
        #[para]returns a dict with the keys is_span and boundaries
        #[para]is_span 0|1 indicates if the range specified spans a boundary of chunksize
        #[para]boundaries contains a list of the spanned boundaries - which are always multiples of the chunksize
        #[para]e.g
        #[example_begin]
        #    range_spans_chunk_boundaries 10 1750 512
        #    is_span 1 boundaries {512 1024 1536}
        #[example_end]
        #[para]The -offset <int> option
        #[example_begin]
        #    range_spans_chunk_boundaries 10 1750 512 -offset 2
        #    is_span 1 boundaries {514 1026 1538}
        #[example_end]
        #[para] This function automatically uses lseq (if Tcl >= 8.7) when number of boundaries spanned is approximately greater than 75
        if {[catch {package require Tcl 8.7-}]} {
            #only one implementation available for older Tcl
            tailcall punk::fileline::system::_range_spans_chunk_boundaries_tcl $start $end $chunksize {*}$args
        }
        if {$chunksize < 1} {
            error "range_spans_chunk_boundaries chunksize must be >= 1"
        }

        if {(abs($end - $start) / $chunksize) < 75} {
            tailcall punk::fileline::system::_range_spans_chunk_boundaries_tcl $start $end $chunksize {*}$args
        } else {
            tailcall punk::fileline::system::_range_spans_chunk_boundaries_lseq $start $end $chunksize {*}$args
        }
    }

    proc range_boundaries {start end chunksizes args} {
        set argd [punk::args::get_dict {
            -offset -default 0
        } $args]
        lassign [dict values $argd] leaders opts remainingargs

    }

    #*** !doctools
    #[list_end] [comment {--- end definitions namespace punk::fileline::lib ---}]
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++


# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section Internal]
namespace eval punk::fileline::system {
    #*** !doctools
    #[subsection {Namespace punk::fileline::system}]
    #[para] Internal functions that are not part of the API

    proc wordswap16 {data} {
        #scan in one endianness - format in the other. Whether we scan le/be first doesn't matter as long as we format using the opposite endianness
        binary scan $data s* elements ;#scan little endian
        return [binary format S* $elements] ;#format big endian
    }
    proc wordswap32 {data} {
        binary scan $data i* elements
        return [binary format I* $elements]
    }

    proc scan32bit_be {i32} {
        if {[binary scan $i32 I x]} {
            return $x
        } else {
            error "couldn't scan $i32"
        }
    }

    #for 8.7+ using lseq
    #much faster when resultant boundary size is large (at least when offset 0)
    proc _range_spans_chunk_boundaries_lseq {start end chunksize args} {
        if {$chunksize < 1} {error "chunksize must be > 0"} ;#sanitycheck in case called directly
        set opts [dict create\
            -offset 0\
        ]
        foreach {k v} $args {
            switch -- $k {
                -offset {
                    dict set opts $k $v
                }
                default {
                    error "unknown option '$k'. Known options: [dict keys $opts]"
                }
            }
        }
        # -- --- --- ---
        set opt_offset [dict get $opts -offset]
        # -- --- --- ---

        set smod [expr {$start % $chunksize}]
        if {$smod != 0} {
            set start [expr {$start + ($chunksize - $smod)}]
            if {$start > $end} {
                return [list is_span 0 boundaries {}]
            }
        }
        set boundaries [lseq $start to $end $chunksize]
        #offset can be negative
        if {$opt_offset} {
            if {$opt_offset + [lindex $boundaries end] > $end || $opt_offset + [lindex $boundaries 0] < $start} {
                set overflow 1
            } else {
                set overflow 0
            }
            set boundaries [lmap v $boundaries[unset boundaries]  {expr {$v + $opt_offset}}]
            if {$overflow} {
                #we don't know how many overflowed..
                set inrange [list]
                foreach b $boundaries {
                    if {$b >= $start && $b <= $end} {
                        lappend inrange $b
                    }
                }
                set boundaries $inrange
            }
        }
        return [list is_span [expr {[llength $boundaries]>0}] boundaries $boundaries]
    }

    #faster than lseq for small number of resultant boundaries (~< 75) (which is a common use case)
    #gets very slow (comparitively) with large resultsets
    proc _range_spans_chunk_boundaries_tcl {start end chunksize args} {
        if {$chunksize < 1} {error "chunksize must be > 0"} ;#sanitycheck in case called directly
        set opts [dict create\
            -offset 0\
        ]
        foreach {k v} $args {
            switch -- $k {
                -offset {
                    dict set opts $k $v
                }
                default {
                    error "unknown option '$k'. Known options: [dict keys $opts]"
                }
            }
        }
        # -- --- --- ---
        set opt_offset [dict get $opts -offset]
        # -- --- --- ---

        set is_span 0
        set smod [expr {$start % $chunksize}]
        if {$smod != 0} {
            set start [expr {$start + ($chunksize - $smod)}]
        }
        set boundaries [list]

        #we only need to pre-check the result-range for negative offsets - as our main loop stops before end?
        if {$opt_offset < 0} {
            #set btrack [expr {$start + $opt_offset}] ;#start back one to make sure we catch the first boundary
            set btrack $bstart
            set boff [expr {$btrack + $opt_offset}] ;#must be growing even if start and offset are negative - as chunksize is at least 1
            while {$boff < $start} {
                incr btrack $chunksize
                set boff [expr {$btrack + $opt_offset}]
            }
            set bstart $btrack
        } else {
            set bstart $start
        }
        for {set b $bstart} {[set boff [expr {$b  + $opt_offset}]] <= $end} {incr b $chunksize} {
            lappend boundaries $boff
        }

        return [list is_span [expr {[llength $boundaries]>0}] boundaries $boundaries offset $opt_offset]
    }

    proc _range_spans_chunk_boundaries_TIMEIT {start end chunksize {repeat 1}} {
        puts "main  : [time {punk::fileline::lib::range_spans_chunk_boundaries $start $end $chunksize} $repeat]"
        puts "tcl : [time {punk::fileline::system::_range_spans_chunk_boundaries_tcl $start $end $chunksize} $repeat]"
        if {![catch {package require Tcl 8.7-}]} {
            puts "lseq : [time {punk::fileline::system::_range_spans_chunk_boundaries_lseq $start $end $chunksize} $repeat]"
        }
    }
}
namespace eval punk::fileline::ansi {
    #*** !doctools
    #[subsection {Namespace punk::fileline::ansi}]
    #[para]These are ansi functions imported from punk::ansi - or no-ops if that package is unavailable
    #[para]See [package punk::ansi] for documentation
    #[list_begin definitions]
    variable enabled 1
    #*** !doctools
    #[call [fun ansi::a]]
    #[call [fun ansi::a+]]
    #[call [fun ansi::ansistrip]]

    #*** !doctools
    #[list_end] [comment {--- end definitions namespace punk::fileline::ansi ---}]
}

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
##  Ready
package provide punk::fileline [namespace eval punk::fileline {
    variable pkg punk::fileline
    variable version
    set version 0.1.0
}]
return

#*** !doctools
#[manpage_end]