You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1733 lines
93 KiB
1733 lines
93 KiB
# -*- tcl -*- |
|
# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from <pkg>-buildversion.txt |
|
# |
|
# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. |
|
# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. |
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
# (C) 2024 |
|
# |
|
# @@ Meta Begin |
|
# Application punk::fileline 0.1.0 |
|
# Meta platform tcl |
|
# Meta license BSD |
|
# @@ Meta End |
|
|
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
# doctools header |
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
#*** !doctools |
|
#[manpage_begin punkshell_module_punk::fileline 0 0.1.0] |
|
#[copyright "2024"] |
|
#[titledesc {file line-handling utilities}] [comment {-- Name section and table of contents description --}] |
|
#[moddesc {punk fileline}] [comment {-- Description at end of page heading --}] |
|
#[require punk::fileline] |
|
#[keywords module text parse file encoding BOM] |
|
#[description] |
|
#[para] - |
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
|
#*** !doctools |
|
#[section Overview] |
|
#[para]Utilities for in-memory analysis of text file data as both line data and byte/char-counted data whilst preserving the line-endings (even if mixed) |
|
#[para]This is important for certain text files where examining the number of chars/bytes is important |
|
#[para]For example - windows .cmd/.bat files need some byte counting to determine if labels lie on chunk boundaries and need to be moved. |
|
#[para]This chunk-size counting will depend on the character encoding. |
|
#[para]Despite including the word 'file', the library doesn't necessarily deal with reading/writing to the filesystem - |
|
#[para]The raw data can be supplied as a string, or loaded from a file using punk::fileline::get_textinfo -file <filename> |
|
#[subsection Concepts] |
|
#[para]A chunk of textfile data (possibly representing a whole file - but usually at least a complete set of lines) is loaded into a punk::fileline::class::textinfo instance at object creation. |
|
#[example_begin] |
|
# package require punk::fileline |
|
# package require fileutil |
|
# set rawdata [lb]fileutil::cat data.txt -translation binary[rb] |
|
# punk::fileline::class::textinfo create obj_data $rawdata |
|
# puts stdout [lb]obj_data linecount[rb] |
|
#[example_end] |
|
#[subsection Notes] |
|
#[para]Line records are referred to by a zero-based index instead of a one-based index as is commonly used when displaying files. |
|
#[para]This is for programming consistency and convenience, and the module user should do their own conversion to one-based indexing for line display or messaging if desired. |
|
#[para]No support for lone carriage-returns being interpreted as line-endings. |
|
#[para]CR line-endings that are intended to be interpreted as such should be mapped to something else before the data is supplied to this module. |
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
## Requirements |
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
|
#*** !doctools |
|
#[subsection dependencies] |
|
#[para] packages needed by punk::fileline |
|
#[list_begin itemized] |
|
|
|
package require Tcl 8.6- |
|
package require punk::args |
|
#*** !doctools |
|
#[item] [package {Tcl 8.6-}] |
|
#[item] [package {punk::args}] |
|
|
|
|
|
# #package require frobz |
|
# #*** !doctools |
|
# #[item] [package {frobz}] |
|
|
|
#*** !doctools |
|
#[list_end] [comment {- end dependencies list -}] |
|
|
|
#*** !doctools |
|
#[subsection {optional dependencies}] |
|
#[para] packages that add functionality but aren't strictly required |
|
#[list_begin itemized] |
|
|
|
#*** !doctools |
|
#[item] [package {punk::ansi}] |
|
#[para] - recommended for class::textinfo [method chunk_boundary_display] |
|
#[item] [package {punk::char}] |
|
#[para] - recommended for class::textinfo [method chunk_boundary_display] |
|
#[item] [package {overtype}] |
|
#[para] - recommended for class::textinfo [method chunk_boundary_display] |
|
|
|
|
|
#*** !doctools |
|
#[list_end] [comment {- end optional dependencies list -}] |
|
|
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
|
#*** !doctools |
|
#[section API] |
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
# oo::class namespace |
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
namespace eval punk::fileline::class { |
|
namespace export * |
|
#*** !doctools |
|
#[subsection {Namespace punk::fileline::class}] |
|
#[para] class definitions |
|
if {[info commands [namespace current]::textinfo] eq ""} { |
|
#*** !doctools |
|
#[list_begin enumerated] |
|
|
|
# oo::class create interface_sample1 { |
|
# #*** !doctools |
|
# #[enum] CLASS [class interface_sample1] |
|
# #[list_begin definitions] |
|
|
|
# method test {arg1} { |
|
# #*** !doctools |
|
# #[call class::interface_sample1 [method test] [arg arg1]] |
|
# #[para] test method |
|
# puts "test: $arg1" |
|
# } |
|
|
|
# #*** !doctools |
|
# #[list_end] [comment {-- end definitions interface_sample1}] |
|
# } |
|
|
|
|
|
#uses zero based indexing. Caller can add 1 for line numbers |
|
oo::class create [namespace current]::textinfo { |
|
#*** !doctools |
|
#[enum] CLASS [class textinfo] |
|
#[list_begin definitions] |
|
# [para] [emph METHODS] |
|
|
|
variable o_chunk ;#current state |
|
variable o_chunkop_store |
|
variable o_lineop_store |
|
|
|
variable o_chunk_epoch |
|
variable o_line_epoch |
|
variable o_payloadlist |
|
variable o_linemap |
|
variable o_LF_C |
|
variable o_CRLF_C |
|
|
|
|
|
variable o_bom_id |
|
variable o_bom |
|
variable o_bom_map |
|
|
|
#review - for now we expect datachunk to be data without BOM and already encoded appropriately |
|
#fileline::get_textinfo has support for interpreting BOM - but we currently have no way to do that for data not coming from a file |
|
#refactor to allow that code to be called from here? |
|
constructor {datachunk args} { |
|
#*** !doctools |
|
#[call class::textinfo [method constructor] [arg datachunk] [opt {option value...}]] |
|
#[para] Constructor for textinfo object which represents a chunk or all of a file |
|
#[para] datachunk should be passed with the file data including line-endings as-is for full functionality. ie use something like: |
|
#[example_begin] |
|
# fconfigure $fd -translation binary |
|
# set chunkdata [lb]read $fd[rb]] |
|
#or |
|
# set chunkdata [lb]fileutil::cat <filename> -translation binary[rb] |
|
#[example_end] |
|
#[para] when loading the data |
|
namespace eval [namespace current] { |
|
set nspath [namespace path] |
|
foreach p [list ::punk::fileline ::punk::fileline::ansi] { |
|
if {$p ni $nspath} { |
|
lappend nspath $p |
|
} |
|
} |
|
namespace path $nspath |
|
} |
|
|
|
set o_bom_map [list\ |
|
utf-8 \u00ef\u00bb\u00bf\ |
|
utf-16be \u00fe\u00ff\ |
|
utf-16le \u00ff\u00fe\ |
|
utf-32be \u0000\u0000\u00fe\u00ff\ |
|
utf-32le \u00ff\u00fe\u0000\u0000\ |
|
utf-7 \u002b\u002f\u0076\ |
|
utf-1 \u00f7\u0064\u004c\ |
|
utf-ebcdic \u00dd\u0073\u0066\u0073\ |
|
utf-scsu \u0003\u00fe\u00ff\ |
|
utf-bocu-1 \u00fb\u00ee\u0028\ |
|
utf-gb18030 \u0084\u0031\u0095\u0033\ |
|
] |
|
set o_bom_id "" |
|
set o_bom "" ;#review |
|
|
|
set o_chunk $datachunk |
|
set o_line_epoch [list] |
|
set o_chunk_epoch [list "fromchunkchange-at-[clock micros]"] |
|
set crlf_lf_placeholders [list \uFFFF \uFFFE] ;#defaults - if already exist in file - error out with message |
|
set defaults [dict create\ |
|
-substitutionmap {}\ |
|
-crlf_lf_placeholders $crlf_lf_placeholders\ |
|
-userid "<pid-[pid]-tid-[thread::id]>"\ |
|
] |
|
set known_opts [dict keys $defaults] |
|
foreach {k v} $args { |
|
if {$k ni $known_opts} { |
|
error "[self] constructor error: unknown option '$k'. Known options: $known_opts" |
|
} |
|
} |
|
set opts [dict merge $defaults $args] |
|
# -- --- --- --- --- --- --- |
|
set opt_substitutionmap [dict get $opts -substitutionmap] ;#review - can be done by caller - or a loadable -policy |
|
set opt_crlf_lf_placeholders [dict get $opts -crlf_lf_placeholders] |
|
set opt_userid [dict get $opts -userid] |
|
# -- --- --- --- --- --- --- |
|
|
|
if {[llength $opt_crlf_lf_placeholders] != 2 || [string length [lindex $opt_crlf_lf_placeholders 0]] !=1 || [string length [lindex $opt_crlf_lf_placeholders 1]] !=1} { |
|
error "textinfo::constructor error: -crlf_lf_placeholders requires a list of exactly 2 chars" |
|
} |
|
lassign $opt_crlf_lf_placeholders o_LF_C o_CRLF_C |
|
if {[string first $o_LF_C $o_chunk] >=0} { |
|
set decval [scan $o_LF_C %c] |
|
if {$decval < 32 || $decval > 127} { |
|
set char_desc "(decimal value $decval)" |
|
} else { |
|
set char_desc "'$o_LF_C' (decimal value $decval)" |
|
} |
|
error "textinfo::constructor error: rawfiledata already contains linefeed substitution character $char_desc specified as first element of -crlf_lf_placeholders" |
|
} |
|
if {[string first $o_CRLF_C $o_chunk] >=0} { |
|
set decval [scan $o_CRLF_C %c] |
|
if {$decval < 32 || $decval > 127} { |
|
set char_desc "(decimal value $decval)" |
|
} else { |
|
set char_desc "'$o_CRLF_C' (decimal value $decval)" |
|
} |
|
error "textinfo::constructor error: rawfiledata already contains carriagereturn-linefeed substitution character $char_desc specified as second element of -crlf_lf_placeholders" |
|
} |
|
if {$o_LF_C eq $o_CRLF_C} { |
|
puts stderr "WARNING: same substitution character used for both elements of -crlf_lf_placeholders - byte counting may be off if file contains mixed line-endings" |
|
} |
|
|
|
my regenerate_lines |
|
|
|
} |
|
|
|
method set_bomid {bomid} { |
|
if {$bomid ni [dict keys $o_bom_map]} { |
|
error "Unrecognised bom-id $bomid. Known values: [dict keys $o_bom_map]" |
|
} |
|
set o_bom_id $bomid |
|
set o_bom [dict get $o_bom_map $bomid] |
|
} |
|
method get_bomid {} { |
|
return $o_bom_id |
|
} |
|
method get_bom {} { |
|
return $o_bom |
|
} |
|
|
|
method chunk {chunkstart chunkend} { |
|
#*** !doctools |
|
#[call class::textinfo [method chunk] [arg chunkstart] [arg chunkend]] |
|
#[para]Return a range of bytes from the underlying raw chunk data. |
|
#[para] e.g The following retrieves the entire chunk |
|
#[para] objName chunk 0 end |
|
return [string range $o_chunk $chunkstart $chunkend] |
|
} |
|
method chunklen {} { |
|
#*** !doctools |
|
#[call class::textinfo [method chunklen]] |
|
#[para] Number of bytes/characters in the raw data of the file |
|
return [string length $o_chunk] |
|
} |
|
method chunk_boundary_display {chunkstart chunkend chunksize args} { |
|
#*** !doctools |
|
#[call class::textinfo [method chunk_boundary_display]] |
|
#[para]Returns a string displaying the boundaries at chunksize bytes between chunkstart and chunkend |
|
#[para]Defaults to using ansi colour if punk::ansi module is available. Use -ansi 0 to disable colour |
|
set opts [dict create\ |
|
-ansi $::punk::fileline::ansi::enabled\ |
|
-offset 0\ |
|
-displaybytes 200\ |
|
-truncatedmark "..."\ |
|
-completemark "---"\ |
|
-moremark " + "\ |
|
-continuemark " > "\ |
|
-linemaxwidth 100\ |
|
-linebase 0\ |
|
-limit -1\ |
|
-boundaries {}\ |
|
-showconfig 0\ |
|
-boundaryheader {Boundary %i% at %b%}\ |
|
] |
|
foreach {k v} $args { |
|
switch -- $k { |
|
-ansi - -offset - -displaybytes - -truncatedmark - -completemark - -moremark - -continuemark - -linemaxwidth - -linebase - -limit - -boundaries - -showconfig - -boundaryheader { |
|
dict set opts $k $v |
|
} |
|
default { |
|
error "[self]::chunk_boundary error: unknown option '$k'. Known options: [dict keys $opts]" |
|
} |
|
} |
|
} |
|
# -- --- --- --- --- --- |
|
set opt_ansi [dict get $opts -ansi] |
|
set opt_offset [dict get $opts -offset] |
|
set opt_displaybytes [dict get $opts -displaybytes] |
|
set opt_tmark [dict get $opts -truncatedmark] |
|
set opt_cmark [dict get $opts -completemark] |
|
set opt_linemax [dict get $opts -linemaxwidth] |
|
set opt_linebase [dict get $opts -linebase] |
|
set opt_linebase [string map [list _ ""] $opt_linebase] |
|
set opt_limit [dict get $opts -limit] ;#limit number of boundaries to display |
|
set opt_boundaries [dict get $opts -boundaries] ;#use pre-calculated boundaries if supplied |
|
set opt_showconfig [dict get $opts -showconfig] |
|
set opt_boundaryheader [dict get $opts -boundaryheader] |
|
# -- --- --- --- --- --- |
|
package require overtype |
|
# will require punk::char and punk::ansi |
|
|
|
if {"::punk::fileline::ansi::ansistrip" ne [info commands ::punk::fileline::ansi::ansistrip]} { |
|
namespace eval ::punk::fileline::ansi { |
|
namespace import ::punk::ansi::* |
|
} |
|
} |
|
|
|
#This mechanism for enabling/disabling ansi is a bit clumsy - prone to errors with regard to keeping in sync with any api changes in punk ansi |
|
#It's done here to allow this to be used without the full set of punk modules and/or shell - REVIEW |
|
|
|
#risk of failing to reset on error |
|
set pre_ansi_enabled $::punk::fileline::ansi::enabled |
|
if {$opt_ansi} { |
|
set ::punk::fileline::ansi::enabled 1 |
|
} else { |
|
set ::punk::fileline::ansi::enabled 0 |
|
} |
|
if {"::punk::fileline::ansistrip" ne [info commands ::punk::fileline::ansistrip]} { |
|
proc ::punk::fileline::a {args} { |
|
if {$::punk::fileline::ansi::enabled} { |
|
tailcall ::punk::fileline::ansi::a {*}$args |
|
} else { |
|
return "" |
|
} |
|
} |
|
proc ::punk::fileline::a+ {args} { |
|
if {$::punk::fileline::ansi::enabled} { |
|
tailcall ::punk::fileline::ansi::a+ {*}$args |
|
} else { |
|
return "" |
|
} |
|
} |
|
proc ::punk::fileline::ansistrip {str} { |
|
if {$::punk::fileline::ansi::enabled} { |
|
tailcall ::punk::fileline::ansi::ansistrip $str |
|
} else { |
|
return $str |
|
} |
|
} |
|
} |
|
set maxline [lindex [my chunkrange_to_linerange $chunkend $chunkend] 0] |
|
set minline [lindex [my chunkrange_to_linerange $chunkstart $chunkstart] 0] |
|
|
|
#suport simple end+-int (+-)start(+-)int to set linebase to line corresponding to chunkstart or chunkend |
|
#also simple int+int and int-int - nothing more complicated (similar to Tcl lrange etc in that regard) |
|
#commonly this will be something like -start or -end |
|
if {![string is integer -strict $opt_linebase]} { |
|
set sign "" |
|
set errunrecognised "unrecognised -linebase value '$opt_linebase'. Expected positive or negative integer or -start -start-int -start+int -end -end-int -end+int or -eof (where leading - is optional but probably desirable) " |
|
if {[string index $opt_linebase 0] eq "-"} { |
|
set sign - |
|
set tail [string range $opt_linebase 1 end] |
|
} else { |
|
set tail [string trimleft $opt_linebase +];#ignore + |
|
} |
|
#todo - switch -glob -- $tail |
|
if {[string match eof* $tail]} { |
|
set endmath [string range $tail 3 end] |
|
#todo endmath? |
|
if {$tail eq "eof"} { |
|
set lastline [lindex [my chunkrange_to_linerange end end] 0] |
|
set linebase ${sign}$lastline |
|
} else { |
|
error $errunrecognised |
|
} |
|
} elseif {[string match end* $tail]} { |
|
set endmath [string range $tail 3 end] |
|
if {[string length $endmath]} { |
|
set op [string index $endmath 0] |
|
if {$op in {+ -}} { |
|
set operand [string range $endmath 1 end] |
|
if {[string is integer -strict $operand]} { |
|
if {$op eq "+"} { |
|
set linebase [expr {$maxline + $operand}] |
|
} else { |
|
set linebase [expr {$maxline - $operand}] |
|
} |
|
} else { |
|
error $errunrecognised |
|
} |
|
} else { |
|
error $errunrecognised |
|
} |
|
} else { |
|
set linebase $maxline |
|
} |
|
set linebase ${sign}$linebase |
|
} elseif {[string match start* $tail]} { |
|
set endmath [string range $tail 5 end] |
|
if {[string length $endmath]} { |
|
set op [string index $endmath 0] |
|
if {$op in {+ -}} { |
|
set operand [string range $endmath 1 end] |
|
if {[string is integer -strict $operand]} { |
|
if {$op eq "+"} { |
|
set linebase [expr {$minline + $operand}] |
|
} else { |
|
set linebase [expr {$minline - $operand}] |
|
} |
|
} else { |
|
error $errunrecognised |
|
} |
|
} else { |
|
error $errunrecognised |
|
} |
|
} else { |
|
set linebase $minline |
|
} |
|
set linebase ${sign}$linebase |
|
} elseif {[string match *-* $tail]} { |
|
set extras [lassign [split $tail -] int1 int2] |
|
if {[llength $extras]} { |
|
error $errunrecognised |
|
} |
|
if {![string is integer -strict $int1] || ![string is integer -strict $int2]} { |
|
error $errunrecognised |
|
} |
|
set linebase [expr {$int1 - $int2}] |
|
set linebase ${sign}$linebase |
|
} elseif {[string match *+* $tail]} { |
|
set extras [lassign [split $tail +] int1 int2] |
|
if {[llength $extras]} { |
|
error $errunrecognised |
|
} |
|
if {![string is integer -strict $int1] || ![string is integer -strict $int2]} { |
|
error $errunrecognised |
|
} |
|
set linebase [expr {$int1 + $int2}] |
|
set linebase ${sign}$linebase |
|
} else { |
|
error $errunrecognised |
|
} |
|
|
|
} else { |
|
set linebase $opt_linebase |
|
} |
|
|
|
lassign [my numeric_chunkrange $chunkstart $chunkend] chunkstart chunkend |
|
|
|
if {![llength $opt_boundaries]} { |
|
set binfo [lib::range_spans_chunk_boundaries $chunkstart $chunkend $chunksize -offset $opt_offset] |
|
set boundaries [dict get $binfo boundaries] |
|
} else { |
|
set boundaries [list] |
|
foreach b $opt_boundaries { |
|
if {$chunkstart <= $b && $chunkend >= $b} { |
|
lappend boundaries [expr {$b + $opt_offset}] |
|
} |
|
} |
|
} |
|
|
|
|
|
if {![llength $boundaries]} { |
|
return "No boundaries found between $chunkstart and $chunkend for chunksize $chunksize (when offset $opt_offset)" |
|
} |
|
if {$opt_showconfig} { |
|
set result "chunk range $chunkstart $chunkend line range $minline $maxline linebase $linebase limit $opt_limit\n" |
|
} else { |
|
set result "" |
|
} |
|
set pre_bytes [expr {$opt_displaybytes /2}] |
|
set post_bytes $pre_bytes |
|
set max_bytes [expr {[my chunklen] -1}] |
|
if {$opt_limit > 0} { |
|
set boundaries [lrange $boundaries[unset boundaries] 0 $opt_limit-1] |
|
} |
|
|
|
set i 0 |
|
foreach b $boundaries { |
|
if {$opt_boundaryheader ne ""} { |
|
set j [expr {$i+1}] |
|
append result [string map [list %b% $b %i% $i %j% $j] $opt_boundaryheader] \n |
|
} |
|
set low [expr {max(($b - $pre_bytes),0)}] |
|
set high [expr {min(($b + $post_bytes),$max_bytes)}] |
|
|
|
set lineinfolist [my chunkrange_to_lineinfolist $low $high -show_truncated 1] |
|
set le_map [list \r\n <crlf> \r <cr> \n <lf>] |
|
set result_list [list] |
|
foreach lineinfo $lineinfolist { |
|
set lineidx [dict get $lineinfo lineindex] |
|
|
|
set linenum [expr {$lineidx + $linebase}] |
|
set s [dict get $lineinfo start] |
|
set e [dict get $lineinfo end] |
|
|
|
set boundarymarker "" |
|
set displayidx "" |
|
set linenum_display $linenum |
|
if {$s <= $b && $e >= $b} { |
|
set idx [expr {$b - $s}] ;#index into whole position in whole line - not so useful if we're viewing a small section of a line |
|
set char [string index [my line $lineidx] $idx] |
|
set char_display [string map [list \r <CR> \n <LF>] $char] |
|
if {[dict get $lineinfo is_truncated]} { |
|
set tside [dict get $lineinfo truncatedside] |
|
set truncated [dict get $lineinfo truncated] |
|
set tlen [string length $truncated] |
|
if {"left" in $tside} { |
|
set tleft [dict get $lineinfo truncatedleft] |
|
set tleftlen [string length $tleft] |
|
set displayidx [expr {$idx - $tleftlen}] |
|
} elseif {"right" in $tside} { |
|
set displayidx $idx |
|
} |
|
} else { |
|
set displayidx $idx |
|
} |
|
set boundarymarker "'[a+ green bold]$char_display[a]'@$displayidx" |
|
set linenum_display ${linenum_display},$idx |
|
} |
|
|
|
set lhs_status $opt_cmark ;#default |
|
set rhs_status $opt_cmark ;#default |
|
if {[dict get $lineinfo is_truncated]} { |
|
set line [dict get $lineinfo truncated] |
|
set tside [dict get $lineinfo truncatedside] |
|
if {"left" in $tside && "right" in $tside } { |
|
set lhs_status $opt_tmark |
|
set rhs_status $opt_tmark |
|
} elseif {"left" in $tside} { |
|
set lhs_status $opt_tmark |
|
} elseif {"right" in $tside} { |
|
set rhs_status $opt_tmark |
|
} |
|
|
|
|
|
} else { |
|
set line [my line $lineidx] |
|
} |
|
if {$displayidx ne ""} { |
|
set line [string replace $line $displayidx $displayidx [a+ White green bold]$char_display[a]] |
|
} |
|
set displayline [string map $le_map $line] |
|
lappend result_list [list $linenum_display $boundarymarker $lhs_status $displayline $rhs_status] |
|
} |
|
set title_linenum "LNUM" |
|
set linenums [lsearch -index 0 -all -inline -subindices $result_list *] |
|
set markers [lsearch -index 1 -all -inline -subindices $result_list *] |
|
set lines [lsearch -index 3 -all -inline -subindices $result_list *] |
|
set title_marker "" |
|
set title_line "Line" |
|
#todo - use punk::char for unicode support of wide chars etc? |
|
set widest_linenum [tcl::mathfunc::max {*}[lmap v [concat [list $title_linenum] $linenums] {string length $v}]] |
|
set widest_marker [tcl::mathfunc::max {*}[lmap v [concat [list $title_marker] $markers] {string length [ansistrip $v]}]] |
|
set widest_status [expr {max([string length $opt_cmark], [string length $opt_tmark])}] |
|
set widest_line [tcl::mathfunc::max {*}[lmap v [concat [list $title_line] $lines] {string length $v}]] |
|
foreach row $result_list { |
|
lassign $row linenum marker lhs_status line rhs_status |
|
append result [format " %-*s " $widest_linenum $linenum] |
|
append result [format " %-*s " $widest_marker $marker] |
|
append result [format " %-*s " $widest_status $lhs_status] |
|
append result [format " %-*s " $widest_line $line] |
|
append result [format " %-*s " $widest_status $rhs_status] \n |
|
} |
|
incr i |
|
} |
|
set ::punk::fileline::ansi::enabled $pre_ansi_enabled |
|
return $result |
|
} |
|
method linecount {} { |
|
#*** !doctools |
|
#[call class::textinfo [method linecount]] |
|
#[para] Number of lines in the raw data of the file, counted as per the policy in effect |
|
return [llength $o_payloadlist] |
|
} |
|
|
|
|
|
method line {lineindex} { |
|
#*** !doctools |
|
#[call class::textinfo [method line] [arg lineindex]] |
|
#[para]Reconstructs and returns the raw line using the payload and per-line stored line-ending metadata |
|
#[para]A 'line' may be returned without a line-ending if the unerlying chunk had trailing data without a line-ending (or the chunk was loaded under a non-standard -policy setting) |
|
#[para]Whilst such data may not conform to definitions (e.g POSIX) of the terms 'textfile' and 'line' - it is useful here to represent it as a line with metadata le set to "none" |
|
#[para]To return just the data which might more commonly be needed for dealing with lines, use the [method linepayload] method - which returns the line data minus line-ending |
|
|
|
lassign [my numeric_linerange $lineindex 0] lineindex |
|
|
|
set le [dict get $o_linemap $lineindex le] |
|
set le_chars [dict get [dict create lf \n crlf \r\n none ""] $le] |
|
return [lindex $o_payloadlist $lineindex]$le_chars |
|
} |
|
method chunk_find_glob {globsearch args} { |
|
#todo - use linepayload_find_glob when -ignore_lineendings is 0 - but check truncations for 1st and last line |
|
error "unimplemented" |
|
} |
|
method linepayload_find_glob {globsearch args} { |
|
#*** !doctools |
|
#[call class::textinfo [method linepayload_find_glob] [arg globsearch] [opt {option value...}]] |
|
#[para]Return a lineinfolist (see [method lineinfo] and [method lineinfolist]) of lines where payload matches the [arg globsearch] string |
|
#[para]To limit the returned results use the -limit n option - where -limit 0 means return all matches. |
|
#[para]For example: [method linepayload_find_glob] "*test*" -limit 1 |
|
#[para]The result is always a list of lineinfo dictionaries even if one item is returned |
|
#[para] -limitfrom can be start|end |
|
#[para]The order of results is always the order as they occur in the data - even if -limitfrom end is specified. |
|
#[para]-limitfrom end means that only the last -limit items are returned |
|
#[para]Note that as glob accepts [lb]chars[rb]] to mean match any character in the set given by chars, searching for literal square brackets should be done by escaping the bracket with a backslash |
|
#[para]This is true even if only a single square bracket is being searched for. e.g {*[lb]file*} will not find the word file followed by a left square-bracket - even though the search didn't close the square brackets. |
|
#[para]In the above case - the literal search should be {*\[lb]file*} |
|
|
|
set opts [dict create\ |
|
-limit 0\ |
|
-strategy 1\ |
|
-start 0\ |
|
-end end\ |
|
-limitfrom start\ |
|
] |
|
foreach {k v} $args { |
|
switch -- $k { |
|
-limit - -strategy - -start - -end - -limitfrom { |
|
dict set opts $k $v |
|
} |
|
default { |
|
error "linepayload_find_glob unknown option '$k'. Known options: [dict keys $opts]" |
|
} |
|
} |
|
} |
|
# -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
set opt_limit [dict get $opts -limit] |
|
if {![string is integer -strict $opt_limit] || $opt_limit < 0} { |
|
error "linepayload_find_glob -limit must be positive integer" |
|
} |
|
# -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
set opt_strategy [dict get $opts -strategy] |
|
# -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
set opt_start [dict get $opts -start] |
|
set opt_start [expr {$opt_start}] |
|
if {$opt_start != 0} {error "-start unimplemented"} |
|
# -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
set opt_end [dict get $opts -end] |
|
set max_line_index [expr {[llength $o_payloadlist]-1}] |
|
if {$opt_end eq "end"} { |
|
set opt_end $max_line_index |
|
} |
|
#TODO |
|
if {$opt_end < $max_line_index} {error "-end less than max_line_index unimplemented"} |
|
# -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
set opt_limitfrom [dict get $opts -limitfrom] |
|
#-limitfrom start|end only |
|
#TODO |
|
if {$opt_limitfrom ne "start"} {error "-limitfrom unimplemented"} |
|
# -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- |
|
|
|
|
|
set lineinfolist [list] |
|
|
|
if {$opt_limit == 1} { |
|
set idx [lsearch -glob [lrange $o_payloadlist $opt_start $opt_end] $globsearch] |
|
if {$idx >=0} { |
|
set i [expr {$opt_start + $idx}] |
|
lappend lineinfolist [list lineindex $i [dict get $o_linemap $i] payload [lindex $o_payloadlist $i]] |
|
} |
|
} elseif {$opt_limit == 0} { |
|
set indices [lsearch -all -glob [lrange $o_payloadlist $opt_start $opt_end] $globsearch] |
|
foreach irel $indices { |
|
set i [expr {$opt_start + $irel}] |
|
lappend lineinfolist [list lineindex $i [dict get $o_linemap $i] payload [lindex $o_payloadlist $i]] |
|
} |
|
} else { |
|
#todo - auto-strategy based on limit vs number of lines |
|
if {$opt_strategy == 0} { |
|
set posn 0 |
|
for {set r 0} {$r < $opt_limit} {incr r} { |
|
set n [lsearch [lrange $o_payloadlist $posn+$opt_start end] $globsearch] |
|
if {$n >=0} { |
|
set irel [expr {$posn + $n}] |
|
set i [expr {$irel + $opt_start}] |
|
lappend lineinfolist [list lineindex $i {*}[dict get $o_linemap $i] payload [lindex $o_payloadlist $i]] |
|
set posn [expr {$irel+1}] |
|
} |
|
} |
|
} else { |
|
set indices [lsearch -all -glob [lrange $o_payloadlist $opt_start $opt_end] $globsearch] |
|
set limited [lrange $indices 0 $opt_limit-1] |
|
foreach irel $limited { |
|
set i [expr {$opt_start + $irel}] |
|
lappend lineinfolist [list lineindex $i {*}[dict get $o_linemap $i] payload [lindex $o_payloadlist $i]] |
|
} |
|
} |
|
} |
|
return $lineinfolist |
|
} |
|
method linepayload {lineindex} { |
|
#*** !doctools |
|
#[call class::textinfo [method linepayload] [arg lineindex]] |
|
#[para]Return the text of the line indicated by the zero-based lineindex |
|
#[para]The line-ending is not returned in the data - but is still stored against this lineindex |
|
#[para]Line Metadata such as the line-ending for a particular line and the byte/character range it occupies within the chunk can be retrieved with the [method linemeta] method |
|
#[para]To retrieve both the line text and metadata in a single call the [method lineinfo] method can be used |
|
#[para]To retrieve an entire line including line-ending use the [method line] method. |
|
lassign [my numeric_linerange $lineindex 0] lineindex |
|
return [lindex $o_payloadlist $lineindex] |
|
} |
|
method linepayloads {startindex endindex} { |
|
#*** !doctools |
|
#[call class::textinfo [method linepayloads] [arg startindex] [arg endindex]] |
|
#[para]Return a list of just the payloads in the specified linindex range, with no metadata. |
|
return [lrange $o_payloadlist $startindex $endindex] |
|
} |
|
method linemeta {lineindex} { |
|
#*** !doctools |
|
#[call class::textinfo [method linemeta] [arg lineindex]] |
|
#[para]Return a dict of the metadata for the line indicated by the zero-based lineindex |
|
#[para]Keys returned include |
|
#[list_begin itemized] |
|
#[item] le |
|
#[para] A string representing the type of line-ending: crlf|lf|none |
|
#[item] linelen |
|
#[para] The number of characters/bytes in the whole line including line-ending if any |
|
#[item] payloadlen |
|
#[para] The number of character/bytes in the line excluding line-ending |
|
#[item] start |
|
#[para] The zero-based index into the associated raw file data indicating at which byte/character index this line begins |
|
#[item] end |
|
#[para] The zero-based index into the associated raw file data indicating at which byte/character index this line ends |
|
#[para] This end-point corresponds to the last character of the line-ending if any - not necessarily the last character of the line's payload |
|
#[list_end] |
|
lassign [my numeric_linerange $lineindex 0] lineindex |
|
dict get $o_linemap $lineindex |
|
} |
|
method lineinfo {lineindex} { |
|
#*** !doctools |
|
#[call class::textinfo [method lineinfo] [arg lineindex]] |
|
#[para]Return a dict of the metadata and text for the line indicated by the zero-based lineindex |
|
#[para]This returns the same info as the [method linemeta] with an added key of 'payload' which is the text of the line without line-ending. |
|
#[para]The 'payload' value is the same as is returned from the [method linepayload] method. |
|
lassign [my numeric_linerange $lineindex 0] lineindex ;#convert lineindex to canonical number e.g 1_000 -> 1000 end -> highest index |
|
return [dict create lineindex $lineindex {*}[dict get $o_linemap $lineindex] payload [lindex $o_payloadlist $lineindex]] |
|
} |
|
method lineinfolist {startidx endidx} { |
|
#*** !doctools |
|
#[call class::textinfo [method lineinfolist] [arg startidx] [arg endidx]] |
|
#[para]Returns list of lineinfo dicts for each line in line index range startidx to endidx |
|
lassign [my numeric_linerange $startidx $endidx] startidx endidx |
|
set chunkstart [dict get $o_linemap $startidx start] |
|
set chunkend [dict get $o_linemap $endidx end] |
|
set line_list [my chunkrange_to_lineinfolist $chunkstart $chunkend] ;# assertion - no need to view truncations as we've picked start and end of complete lines |
|
#verify sanity |
|
set l_start [lindex $line_list 0] |
|
if {[set idx_start [dict get $l_start lineindex]] ne $startidx} { |
|
error "lineinfolist first lineindex $idx_start doesn't match startidx $startidx" |
|
} |
|
set l_end [lindex $line_list end] |
|
if {[set idx_end [dict get $l_end lineindex]] ne $endidx} { |
|
error "lineinfolist last lineindex $idx_end doesn't match endidx $endidx" |
|
} |
|
return $line_list |
|
} |
|
|
|
method linerange_to_chunkrange {startidx endidx} { |
|
#*** !doctools |
|
#[call class::textinfo [method linerange_to_chunkrange] [arg startidx] [arg endidx]] |
|
|
|
lassign [my numeric_linerange $startidx $endidx] startidx endidx |
|
#inclusive range |
|
return [list [dict get $o_linemap $startidx start] [dict get $o_linemap $endidx end]] |
|
} |
|
method linerange_to_chunk {startidx endidx} { |
|
#*** !doctools |
|
#[call class::textinfo [method linerange_to_chunk] [arg startidx] [arg endidx]] |
|
set chunkrange [my linerange_to_chunkrange $startidx $endidx] |
|
return [string range $o_chunk [lindex $chunkrange 0] [lindex $chunkrange 1]] |
|
} |
|
method lines {startidx endidx} { |
|
#*** !doctools |
|
#[call class::textinfo [method lines] [arg startidx] [arg endidx]] |
|
lassign [my numeric_linerange $startidx $endidx] startidx endidx |
|
set linelist [list] |
|
set le_map [dict create lf \n crlf \r\n none ""] |
|
for {set i $startidx} {$i <= $endidx} {incr i} { |
|
lappend linelist "[lindex $o_payloadlist $i][dict get $le_map [dict get $o_linemap $i le]]" |
|
} |
|
return $linelist |
|
} |
|
method linepayloads {startidx endidx} { |
|
#*** !doctools |
|
#[call class::textinfo [method linepayloads] [arg startidx] [arg endidx]] |
|
return [lrange $o_payloadlist $startidx $endidx] |
|
} |
|
method chunkrange_to_linerange {chunkstart chunkend} { |
|
#*** !doctools |
|
#[call class::textinfo [method chunkrange_to_linerange] [arg chunkstart] [arg chunkend]] |
|
lassign [my numeric_chunkrange $chunkstart $chunkend] chunkstart chunkend |
|
|
|
set linestart -1 |
|
for {set i 0} {$i < [llength $o_payloadlist]} {incr i} { |
|
if {($chunkstart >= [dict get $o_linemap $i start]) && ($chunkstart <= [dict get $o_linemap $i end])} { |
|
set linestart $i |
|
break |
|
} |
|
} |
|
if {$linestart == -1} { |
|
error "Line with range in chunk spanning start index $chunkstart not found" |
|
} |
|
set lineend -1 |
|
for {set i [expr {[llength $o_payloadlist] -1}]} {$i >=0} {incr i -1} { |
|
if {($chunkend >= [dict get $o_linemap $i start]) && ($chunkend <= [dict get $o_linemap $i end])} { |
|
set lineend $i |
|
break |
|
} |
|
} |
|
if {$lineend == -1} { |
|
error "Line with range spanning end index $chunkend not found" |
|
} |
|
return [list $linestart $lineend] |
|
} |
|
method chunkrange_to_lineinfolist {chunkstart chunkend args} { |
|
#*** !doctools |
|
#[call class::textinfo [method chunkrange_to_lineinfolist] [arg chunkstart] [arg chunkend] [opt {option value...}]] |
|
#[para]Return a list of dicts each with structure like the result of the [method lineinfo] method - but possibly with extra keys for truncation information if -show_truncated 1 is supplied |
|
#[para]The truncation key in a lineinfo dict may be returned for first and/or last line in the resulting list. |
|
#[para]truncation shows the shortened (missing bytes on left and/or right side) part of the entire line (potentially including line-ending or even partial line-ending) |
|
#[para]Note that this truncation info is only in the return value of this method - and will not be reflected in [method lineinfo] queries to the main chunk. |
|
|
|
lassign [my numeric_chunkrange $chunkstart $chunkend] chunkstart chunkend |
|
set defaults [dict create\ |
|
-show_truncated 0\ |
|
] |
|
set known_opts [dict keys $defaults] |
|
foreach {k v} $args { |
|
if {$k ni $known_opts} { |
|
error "chunkrange_to_lines error: unknown option '$k'. Known options: $known_opts" |
|
} |
|
} |
|
set opts [dict merge $defaults $args] |
|
# -- --- --- --- --- --- --- --- |
|
set opt_show_truncated [dict get $opts -show_truncated] |
|
# -- --- --- --- --- --- --- --- |
|
|
|
set infolist [list] |
|
set linerange [my chunkrange_to_linerange $chunkstart $chunkend] |
|
lassign $linerange start_lineindex end_lineindex |
|
|
|
#if -show_truncated |
|
#return extra keys for first and last items (which may be the same item if chunkrange is entirely within a line) |
|
#add is_truncated 0|1 to all lines |
|
#Even if the start/end line is not fully within the chunkrange ie truncated - the 'payload' key will contain the original untruncated data |
|
########################### |
|
# first line may have payload tail truncated - or just linefeed, or even a split linefeed |
|
########################### |
|
set first [dict create lineindex $start_lineindex {*}[dict get $o_linemap $start_lineindex] payload [lindex $o_payloadlist $start_lineindex]] |
|
set start_info [dict get $o_linemap $start_lineindex] |
|
|
|
|
|
if {$chunkstart > [dict get $start_info start]} { |
|
dict set first is_truncated 1 |
|
dict set first truncatedside [list left] ;#truncatedside is a list which may have 'right' added if last line is same as first line |
|
} else { |
|
dict set first is_truncated 0 |
|
} |
|
|
|
if {$opt_show_truncated} { |
|
#line1 |
|
if {$chunkstart > [dict get $start_info start]} { |
|
#there is lhs truncation |
|
set payload [lindex $o_payloadlist $start_lineindex] |
|
set line_start [dict get $start_info start] |
|
set le_chars [dict get [dict create lf \n crlf \r\n none ""] [dict get $start_info le]] |
|
set payload_and_le "${payload}${le_chars}" |
|
set split [expr {$chunkstart - $line_start}] |
|
set truncated [string range $payload_and_le $split end] |
|
set lhs [string range $payload_and_le 0 $split-1] |
|
|
|
dict set first truncated $truncated |
|
dict set first truncatedleft $lhs |
|
} |
|
} |
|
########################### |
|
|
|
########################### |
|
# middle lines if any - no truncation |
|
########################### |
|
#difference in indexes of 1 would only mean 2 items to return |
|
set middle_list [list] |
|
if {($end_lineindex - $start_lineindex) > 1} { |
|
for {set i [expr {$start_lineindex +1}]} {$i <= [expr {$end_lineindex -1}] } {incr i} { |
|
#lineindex is key into main list |
|
lappend middle_list [dict create lineindex $i {*}[dict get $o_linemap $i] payload [lindex $o_payloadlist $i] is_truncated 0] |
|
} |
|
} |
|
########################### |
|
|
|
########################### |
|
# tail line may have beginning or all of payload truncated - linefeed may be split if crlf |
|
# may be same line as first line - in which case truncation at beginning as well |
|
if {$end_lineindex == $start_lineindex} { |
|
#same record |
|
set end_info $start_info |
|
|
|
|
|
if {$chunkend < [dict get $end_info end]} { |
|
#there is rhs truncation |
|
if {[dict get $first is_truncated]} { |
|
dict set first truncatedside [list left right] |
|
} else { |
|
dict set first is_truncated 1 |
|
dict set first truncatedside [list right] |
|
} |
|
} |
|
|
|
if {$opt_show_truncated} { |
|
if {$chunkend < [dict get $end_info end]} { |
|
#there is rhs truncation and we need to return the splits |
|
#do rhs truncation - possibly in addition to existing lhs truncation |
|
# ... |
|
set payload [lindex $o_payloadlist $end_lineindex] |
|
set line_start [dict get $end_info start] |
|
set le_chars [dict get [dict create lf \n crlf \r\n none ""] [dict get $end_info le]] |
|
set payload_and_le "${payload}${le_chars}" |
|
set split [expr {$chunkend - $line_start}] |
|
set truncated [string range $payload_and_le 0 $split] |
|
set rhs [string range $payload_and_le $split+1 end] |
|
dict set first truncatedright $rhs |
|
if {"left" ni [dict get $first truncatedside]} { |
|
#rhs truncation only |
|
puts "payload_and_le: $payload_and_le" |
|
puts "LENGTH: [string length $payload_and_le]" |
|
#--- |
|
#--- |
|
dict set first truncated $truncated |
|
dict set first truncatedside [list right] |
|
} else { |
|
#truncated on both sides |
|
set lhslen [string length [dict get $first truncatedleft]] |
|
#re-truncate the truncation to reapply the original lhs truncation |
|
set truncated [string range $truncated $lhslen end] |
|
dict set first truncated $truncated |
|
} |
|
} |
|
} |
|
#no middle or last to append |
|
lappend infolist $first |
|
} else { |
|
set last [dict create lineindex $end_lineindex {*}[dict get $o_linemap $end_lineindex] payload [lindex $o_payloadlist $end_lineindex]] |
|
set end_info [dict get $o_linemap $end_lineindex] |
|
|
|
|
|
if {$chunkend < [dict get $end_info end]} { |
|
dict set last is_truncated 1 |
|
dict set last truncatedside [list right] |
|
} else { |
|
dict set last is_truncated 0 |
|
} |
|
|
|
if {$opt_show_truncated} { |
|
if {$chunkend < [dict get $end_info end]} { |
|
#there is rhs truncation - and last line in range is a different line to first one |
|
set payload [lindex $o_payloadlist $end_lineindex] |
|
set line_start [dict get $end_info start] |
|
set line_end [dict get $end_info end] |
|
set le [dict get $end_info le] |
|
set le_size [dict get {lf 1 crlf 2 none 0} $le] |
|
set le_chars [dict get [dict create lf \n crlf \r\n none ""] $le] |
|
set payload_and_le "${payload}${le_chars}" |
|
|
|
set split [expr {$chunkend - $line_start}] |
|
set truncated [string range $payload_and_le 0 $split] |
|
set rhs [string range $payload_and_le $split+1 end] |
|
|
|
dict set last truncated $truncated |
|
dict set last truncatedright $rhs |
|
#this has the effect that truncating the rhs by 1 can result in truncated being larger than original payload for crlf lines - as payload now sees the cr |
|
#this is a bit unintuitive - but probably best reflects the reality. The truncated value is the truncated 'line' rather than the truncated 'payload' |
|
} |
|
} |
|
|
|
|
|
lappend infolist $first |
|
if {[llength $middle_list]} { |
|
lappend infolist {*}$middle_list |
|
} |
|
lappend infolist $last |
|
} |
|
########################### |
|
#assertion all records have is_truncated key. |
|
#assertion if is_truncated == 1 truncatedside should contain a list of either left, right or both left and right |
|
#assertion If not opt_show_truncated - then truncated records will not have truncated,truncatedleft,truncatedright keys. |
|
return $infolist |
|
} |
|
|
|
#need to check truncations so that any split \r\n is counted precisely todo |
|
method chunk_le_counts {chunkstart chunkend} { |
|
set infolines [my chunkrange_to_lineinfolist $chunkstart $chunkend -show_truncated 1] |
|
set lf_count 0 |
|
set crlf_count 0 |
|
set none_count 0 |
|
foreach d $infolines { |
|
set le [dict get $d le] |
|
if {$le eq "lf"} { |
|
incr lf_count |
|
} elseif {$le eq "crlf"} { |
|
incr crlf_count |
|
} else { |
|
incr none_count |
|
} |
|
} |
|
#even without split crlf - this can overcount by counting the lf or crlf in a line which had an ending not in the chunk range specified |
|
|
|
#check first and last infoline for truncations |
|
#Also check if the truncation is directly between an crlf |
|
#both an lhs split and an rhs split could land between cr and lf |
|
#to be precise - we should presumably count the part within our chunk as either a none for cr or an lf |
|
#This means a caller counting chunk by chunk using this method will sometimes get the wrong answer depending on where crlfs lie relative to their chosen chunk size |
|
#This is presumably ok - as it should be a well known thing to watch out for. |
|
#If we're only receiving chunk by chunk we can't reliably detect splits vs lone <cr>s in the data |
|
#There are surely more efficient ways for a caller to count line-endings in the way that makes sense for them |
|
#but we should makes things as easy as possible for users of this line/chunk structure anyway. |
|
|
|
set first [lindex $infolines 0] |
|
if {[dict get $first is_truncated]} { |
|
#could be the only line - and truncated at one or both ends. |
|
#both a left and a right truncation could split a crlf |
|
|
|
} |
|
set last [lindex $infolines end] |
|
if {[dict get $first lineindex] != [dict get $last lineindex]} { |
|
#only need to process last if it is a different line |
|
#if so - then split can only be left side |
|
|
|
} |
|
|
|
|
|
return [dict create lf $lf_count crlf $crlf_count unterminated $none_count warning line_ending_splits_unimplemented] |
|
} |
|
|
|
#todo - test last line and merge as necessary with first line from new chunk - generate line data only for appended chunk |
|
method append_chunk {rawchunk} { |
|
error "sorry - unimplemented" |
|
} |
|
|
|
method numeric_linerange {startidx endidx} { |
|
#*** !doctools |
|
#[call class::textinfo [method numeric_linerange] [arg startidx] [arg endidx]] |
|
#[para]A helper to return any Tcl-style end end-x values given to startidx or endidx; converted to their specific values based on the current state of the underlying line data |
|
#[para]This is used internally by API functions such as [method line] to enable it to accept more expressive indices |
|
return [my normalize_indices $startidx $endidx [expr {[dict size $o_linemap]-1}]] |
|
} |
|
method numeric_chunkrange {startidx endidx} { |
|
#*** !doctools |
|
#[call class::textinfo [method numeric_chunkrange] [arg startidx] [arg endidx]] |
|
#[para]A helper to return any Tcl-style end end-x entries supplied to startidx or endidx; converted to their specific values based on the current state of the underlying chunk data |
|
return [my normalize_indices $startidx $endidx [expr {[string length $o_chunk]-1}]] |
|
} |
|
method normalize_indices {startidx endidx max} { |
|
#*** !doctools |
|
#[call class::textinfo [method normalize_indices] [arg startidx] [arg endidx] [arg max]] |
|
#[para]A utility to convert some of the of Tcl-style list-index expressions such as end, end-1 etc to valid indices in the range 0 to the supplied max |
|
#[para]Basic addition and subtraction expressions such as 4-1 5+2 are accepted |
|
#[para]startidx higher than endidx is allowed |
|
#[para]Unlike Tcl's index expressions - we raise an error if the calculated index is out of bounds 0 to max |
|
set original_startidx $startidx |
|
set original_endidx $endidx |
|
set startidx [string map [list _ ""] $startidx] ;#don't barf on Tcl 8.7+ underscores in numbers - we can't just use expr because it will not handle end-x |
|
set endidx [string map [list _ ""] $endidx] |
|
if {![string is digit -strict "$startidx$endidx"]} { |
|
foreach whichvar [list start end] { |
|
upvar 0 ${whichvar}idx index |
|
if {![string is digit -strict $index]} { |
|
switch -glob -- $index { |
|
end { |
|
set index $max |
|
} |
|
"*-*" { |
|
#end-int or int-int - like lrange etc we don't accept arbitrarily complex expressions |
|
lassign [split $index -] A B |
|
if {$A eq "end"} { |
|
set index [expr {$max - $B}] |
|
} else { |
|
set index [expr {$A - $B}] |
|
} |
|
} |
|
"*+*" { |
|
lassign [split $index +] A B |
|
if {$A eq "end"} { |
|
#review - this will just result in out of bounds error in final test - as desired |
|
#By calculating here - we will see the result in the error message - but it's probably not particularly useful - as we don't really need end+ support at all. |
|
set index [expr {$max + $B}] |
|
} else { |
|
set index [expr {$A + $B}] |
|
} |
|
} |
|
default { |
|
#May be something like +2 or -0 which braced expr can hanle |
|
#we would like to avoid unbraced expr here - as we're potentially dealing with ranges that may come from external sources. |
|
if {[catch {expr {$index}} index]} { |
|
#could be end+x - but we don't want out of bounds to be valid |
|
#set it to something that the final bounds expr test can deal with |
|
set index Inf |
|
} |
|
} |
|
} |
|
} |
|
} |
|
} |
|
#Unlike Tcl lrange,lindex etc - we don't want to support out of bound indices. |
|
#show the supplied index and what it was mapped to in the error message. |
|
if {$startidx < 0 || $startidx > $max} { |
|
error "Bad start index '$original_startidx'. $startidx out of bounds 0 - $max" |
|
} |
|
if {$endidx < 0 || $endidx > $max} { |
|
error "Bad end index '$original_endidx'. $endidx out of bounds 0 - $max (try $max or end)" |
|
} |
|
return [list $startidx $endidx] |
|
} |
|
|
|
method regenerate_lines {args} { |
|
#*** !doctools |
|
#[call class::textinfo [method regenerate_lines]] |
|
#[para]generate a list of lines from the current state of the stored raw data chunk and keep a map of line-endings indexed by lineindex |
|
#[para]This is called automatically by the Constructor during object creation |
|
#[para]It is exposed in the API experimentally - as chunk and line manipulation functions are considered. |
|
#[para]TODO - review whether such manual control will be necessary/desirable |
|
|
|
#we don't store the actual line-endings as characters (for better layout of debug/display of data) - instead we store names lf|crlf|none |
|
|
|
# first split on lf - then crlf. As we've replaced with single substution chars - the order doesn't matter. |
|
set o_payloadlist [list] |
|
set o_linemap [dict create] |
|
set crlf_replace [list \r\n $o_CRLF_C \n $o_LF_C] |
|
set normalised_data [string map $crlf_replace $o_chunk] |
|
|
|
set lf_lines [split $normalised_data $o_LF_C] |
|
|
|
set idx 0 |
|
set lf_count 0 |
|
set crlf_count 0 |
|
set filedata_offset 0 |
|
set i 0 |
|
set imax [expr {[llength $lf_lines]-1}] |
|
foreach lfln $lf_lines { |
|
set crlf_parts [split $lfln $o_CRLF_C] |
|
if {[llength $crlf_parts] <= 1} { |
|
#no crlf |
|
set payloadlen [string length $lfln] |
|
set le_size 1 |
|
set le lf |
|
if {$i == $imax} { |
|
#no more lf segments - and no crlfs |
|
if {$payloadlen > 0} { |
|
#last line in split has chars - therefore there was no trailing line-ending |
|
set le_size 0 |
|
set le none |
|
} else { |
|
#empty space after last line-ending |
|
#not really a line - we get here from splitting on our lf-replacement char |
|
#An editor might display this pseudo-line with a line number - but we won't treat it as one here |
|
break |
|
} |
|
} |
|
lappend o_payloadlist $lfln |
|
set linelen [expr {$payloadlen + $le_size}] |
|
#we include line-ending in byte count for a line. |
|
dict set o_linemap $idx [list le $le linelen $linelen payloadlen $payloadlen start $filedata_offset end [expr {$filedata_offset + $linelen -1}]] |
|
incr filedata_offset $linelen |
|
incr lf_count |
|
incr idx |
|
} else { |
|
foreach crlfpart [lrange $crlf_parts 0 end-1] { |
|
lappend o_payloadlist $crlfpart |
|
set payloadlen [string length $crlfpart] |
|
set linelen [expr {$payloadlen + 2}] |
|
dict set o_linemap $idx [list le crlf linelen $linelen payloadlen $payloadlen start $filedata_offset end [expr {$filedata_offset + $linelen -1}]] |
|
incr filedata_offset $linelen |
|
incr crlf_count |
|
incr idx |
|
} |
|
set lfpart [lindex $crlf_parts end] |
|
set payloadlen [string length $lfpart] |
|
if {$i == $imax} { |
|
#no more lf segments - but we did find crlf in last (or perhaps only) lf line |
|
#last element in our split has no le |
|
if {$payloadlen > 0} { |
|
set le_size 0 |
|
set le none |
|
} else { |
|
#set le_size 2 |
|
#set le crlf |
|
break |
|
} |
|
} else { |
|
#more lf segments to come |
|
set le_size 1 |
|
set le lf |
|
} |
|
|
|
lappend o_payloadlist $lfpart |
|
set linelen [expr {$payloadlen + $le_size}] |
|
dict set o_linemap $idx [list le $le linelen $linelen payloadlen $payloadlen start $filedata_offset end [expr {$filedata_offset + $linelen -1}]] |
|
incr filedata_offset $linelen |
|
incr lf_count |
|
incr idx |
|
} |
|
incr i |
|
#incr filedata_offset ;#move up 1 so start entry for next line is greater than end entry for previous line |
|
} |
|
set le_count [expr {$lf_count + $crlf_count}] |
|
if {$le_count != [llength $o_payloadlist]} { |
|
puts stderr "fileline::class::textinfo warning. regenerate_lines lf_count: $lf_count + crlf_count: $crlf_count does not equal length of lines stored: [llength $o_payloadlist]" |
|
} |
|
|
|
} |
|
method regenerate_chunk {} { |
|
#o_payloadlist |
|
#o_linemap |
|
set oldsize [string length $o_chunk] |
|
set newchunk "" |
|
dict for {idx lineinfo} $o_linemap { |
|
set |
|
|
|
} |
|
|
|
return [list newsize [string length $newchunk] oldsize $oldsize] |
|
} |
|
|
|
|
|
#*** !doctools |
|
#[list_end] |
|
} |
|
#*** !doctools |
|
#[list_end] [comment {--- end class enumeration ---}] |
|
} |
|
} |
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
# Base namespace |
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
namespace eval punk::fileline { |
|
namespace export * |
|
#variable xyz |
|
|
|
#*** !doctools |
|
#[subsection {Namespace punk::fileline}] |
|
#[para] Core API functions for punk::fileline |
|
#[list_begin definitions] |
|
|
|
punk::args::define { |
|
@id -id ::punk::fileline::get_textinfo |
|
@cmd -name punk::fileline::get_textinfo -help\ |
|
"return: textinfo object instance" |
|
-file -default {} -type existingfile |
|
-translation -default iso8859-1 |
|
-encoding -default "\uFFFF" |
|
-includebom -default 0 |
|
@values -min 0 -max 1 |
|
} |
|
proc get_textinfo {args} { |
|
#*** !doctools |
|
#[call get_textinfo [opt {option value...}] [opt datachunk]] |
|
#[para]Returns textinfo object instance representing data in string datachunk or if -file filename supplied - data loaded from a file |
|
#[para]The encoding used is as specified in the -encoding option - or from the Byte Order Mark (bom) at the beginning of the data |
|
#[para]For Tcl 8.6 - encodings such as utf-16le may not be available - so the bytes are swapped appropriately depending on the platform byteOrder and encoding 'unicode' is used. |
|
#[para]encoding defaults to utf-8 if no -encoding specified and no BOM was found |
|
#[para]Whether -encoding was specified or not - by default the BOM characters are not retained in the line-data |
|
#[para]If -includebom 1 is specified - the bom will be retained in the stored chunk and the data for line 1, but will undergo the same encoding transformation as the rest of the data |
|
#[para]The get_bomid method of the returned object will contain an identifier for any BOM encountered. |
|
#[para] e.g utf-8,utf-16be, utf-16le, utf-32be, utf32-le, SCSU, BOCU-1,GB18030, UTF-EBCDIC, utf-1, utf-7 |
|
#[para]If the encoding specified in the BOM isn't recognised by Tcl - the resulting data is likely to remain as the raw bytes of whatever encoding that is. |
|
#[para]Currently only utf-8, utf-16* and utf-32* are properly supported even though the other BOMs are detected, reported via get_bomid, and stripped from the data. |
|
#[para]GB18030 falls back to cp936/gbk (unless a gb18030 encoding has been installed). Use -encoding iso8859-1 if this isn't suitable and you need to do your own processing of the bytes. |
|
|
|
lassign [dict values [punk::args::get_by_id ::punk::fileline::get_textinfo $args]] opts values |
|
# -- --- --- --- |
|
set opt_file [dict get $opts -file] |
|
set opt_translation [dict get $opts -translation] |
|
set opt_encoding [dict get $opts -encoding] |
|
set opt_includebom [dict get $opts -includebom] |
|
# -- --- --- --- |
|
|
|
if {$opt_file ne ""} { |
|
set filename $opt_file |
|
set fd [open $filename r] |
|
fconfigure $fd -translation binary -encoding $opt_translation;#should use translation binary to get actual line-endings - but we allow caller to override |
|
#Always read encoding in binary - check for bom below and/or apply chosen opt_encoding |
|
set rawchunk [read $fd] |
|
close $fd |
|
if {[llength $values]} { |
|
puts stderr "Ignoring trailing argument [string length [lindex $values 0]] bytes. Not used when -file is specified" |
|
} |
|
} else { |
|
set rawchunk [lindex $values 0] |
|
} |
|
set rawlen [string length $rawchunk] |
|
#examine first 4 bytes for possible BOM |
|
#big-endian BOMs |
|
# ----------------------------------- |
|
#EFBBBF - utf-8 reliabletxt |
|
#FEFF - utf-16be reliabletxt |
|
#FFFE - utf-16le reliabletxt |
|
#0000FEFF - utf-32be reliabletxt |
|
#FFFE0000 - utf-32le |
|
#0000FFFE - utf-32be(2143) non-standard! (not supported) |
|
#FEFF0000 - utf-32le(3412) non-standard! (not supported - will detect as utf-16be) |
|
#2B2F76 - utf-7 (not supported) |
|
#F7644C - utf-1 (not supported) |
|
#DD736673 - UTF-EBCDIC (not supported) |
|
#0EFEFF - SCSU (not supported) |
|
#FBEE28 - BOCU-1 Binary Ordered Compression for Unicode (mime-compatible) - (not supported - fall back to utf-8) |
|
#84319533 - GB18030 - Chinese gov standard (fall back to cp936 with warning if no encoding name) |
|
# ----------------------------------- |
|
|
|
set first32 [string range $rawchunk 0 3] |
|
#scan using capital H for big-endian order |
|
set first32_be [binary scan $first32 H* maybe_bom] ;#we use H* instead of H8 for 8 nibbles (4 bytes) - because our first32 may contain less than 4 bytes - in which case we won't match |
|
set bomid "" |
|
set bomenc "" |
|
set is_reliabletxt 0 ;#see http://reliabletxt.com - only utf-8 with bom, utf-16be, utf-16le, utf-32be supported as at 2024 |
|
set startdata 0 |
|
#todo switch -glob |
|
if {[string match "efbbbf*" $maybe_bom]} { |
|
set bomid utf-8 |
|
set bomenc utf-8 |
|
set is_reliabletxt 1 |
|
set startdata 3 |
|
} elseif {$maybe_bom eq "0000feff"} { |
|
set bomid utf-32be |
|
set bomenc utf-32be |
|
set is_reliabletxt 1 |
|
set startdata 4 |
|
} elseif {$maybe_bom eq "fffe0000"} { |
|
#Technically ambiguous - could be utf-16le bom followed by utf-16 null character (2 byte null) |
|
puts stderr "WARNING - ambiguous BOM fffe0000 found. Treating as utf-32le - but could be utf-16le - consider manually setting -encoding or converting data to another encoding." |
|
set bomid utf-32le |
|
set bomenc utf-32le |
|
set startdata 4 |
|
} elseif {[string match "feff*" $maybe_bom]} { |
|
set bomid utf-16be |
|
set bomenc utf-16be |
|
set is_reliabletxt 1 |
|
set startdata 2 |
|
} elseif {[string match "fffe*" $maybe_bom]} { |
|
set bomid utf-16le |
|
set bomenc utf-16le |
|
set is_reliabletxt 1 |
|
set startdata 2 |
|
} elseif {$maybe_bom eq "0efeff"} { |
|
set bomid scsu |
|
set bomenc "binary" |
|
set startdata 3 |
|
} elseif {$maybe_bom eq "fbee28"} { |
|
set bomid bocu-1 |
|
puts stderr "WARNING - bocu-1 BOM FBEE28 found. Not supported - back to binary" |
|
set bomenc "binary" ;# utf-8??? |
|
set startdata 3 |
|
} elseif {$maybe_bom eq "84319533"} { |
|
if {![dict exists [punk::char::page_names_dict gb18030]]} { |
|
puts stderr "WARNING - no direct support for GB18030 (chinese) - falling back to cp936/gbk" |
|
set bomenc cp936 |
|
} else { |
|
set bomenc [dict get [punk::char::page_names_dict gb18030]] ;#review - this may never exist in Tcl or may be named differently - create a handler? |
|
} |
|
set bomid gb18030 |
|
set startdata 4 |
|
} elseif {$maybe_bom eq "f7644c"} { |
|
puts stderr "WARNING utf-1 BOM F7644C found - not supported. Falling back to binary" |
|
set bomid utf-1 |
|
set bomenc binary |
|
set startdata 3 |
|
} elseif {[string match "2b2f76*" $maybe_bom]} { |
|
puts stderr "WARNING utf-7 BOM 2b2f76 found - not supported. Falling back to binary and leaving BOM in data!" |
|
#review - work out how to strip bom - last 2 bits of 4th byte belong to following character |
|
set bomid utf-7 |
|
set bomenc binary |
|
set startdata 0 |
|
} |
|
|
|
#todo - check xml encoding attribute / html content-type |
|
#todo - a separate chardet (https://chardet.readthedocs.io/ ) or mozilla like mechanism that can be manually called to autodetect character encoding |
|
#This should be an explicit operation - not automatially done here unless we provide a flag for it. |
|
|
|
|
|
if {$opt_includebom} { |
|
set startdata 0 |
|
} |
|
|
|
if {$opt_encoding eq "\uFFFF"} { |
|
if {$bomenc ne "" && $bomenc ne "binary"} { |
|
if {[package vcompare [package provide Tcl] 8.7] < 0} { |
|
#tcl 8.6 has unicode encoding but not utf-16le etc |
|
if {$bomenc ni [encoding names]} { |
|
if {$bomenc eq "utf-16le"} { |
|
if {$::tcl_platform(byteOrder) eq "littleEndian"} { |
|
set datachunk [encoding convertfrom unicode [string range $rawchunk $startdata end]] |
|
set encoding_selected unicode |
|
} else { |
|
set datachunk [encoding convertfrom unicode [system::wordswap16 [string range $rawchunk $startdata end]]] |
|
set encoding_selected "unicode (wordswapped 16)" |
|
} |
|
} elseif {$bomenc eq "utf-16be"} { |
|
if {$::tcl_platform(byteOrder) eq "littleEndian"} { |
|
set datachunk [encoding convertfrom unicode [system::wordswap16 [string range $rawchunk $startdata end]]] |
|
set encoding_selected "unicode (wordswapped 16)" |
|
} else { |
|
set datachunk [encoding convertfrom unicode [string range $rawchunk $startdata end]] |
|
set encoding_selected unicode |
|
} |
|
} elseif {$bomenc eq "utf-32le"} { |
|
if {$::tcl_platform(byteOrder) eq "littleEndian"} { |
|
set datachunk [encoding convertfrom unicode [string range $rawchunk $startdata end]] |
|
set encoding_selected unicode |
|
} else { |
|
set datachunk [encoding convertfrom unicode [system::wordswap32 [string range $rawchunk $startdata end]]] |
|
set encoding_selected "unicode (wordswapped 32)" |
|
} |
|
} elseif {$bomenc eq "utf-32be"} { |
|
if {$::tcl_platform(byteOrder) eq "littleEndian"} { |
|
set datachunk [encoding convertfrom unicode [system::wordswap32 [string range $rawchunk $startdata end]]] |
|
set encoding_selected "unicode (wordswapped 32)" |
|
} else { |
|
set datachunk [encoding convertfrom unicode [string range $rawchunk $startdata end]] |
|
set encoding_selected unicode |
|
} |
|
} else { |
|
error "Encoding $bomenc unavailable in this version of Tcl" |
|
} |
|
} else { |
|
set datachunk [encoding convertfrom $bomenc [string range $rawchunk $startdata end]] |
|
set encoding_selected $bomenc |
|
} |
|
} else { |
|
#tcl 8.7 plus has utf-16le etc |
|
set datachunk [encoding convertfrom $bomenc [string range $rawchunk $startdata end]] |
|
set encoding_selected $bomenc |
|
} |
|
} else { |
|
#!? |
|
if {$bomenc eq "binary"} { |
|
set datachunk [string range $rawchunk $startdata end] |
|
set encoding_selected binary |
|
} else { |
|
set datachunk [encoding convertfrom utf-8 [string range $rawchunk $startdata end]] |
|
set encoding_selected utf-8 |
|
} |
|
} |
|
} else { |
|
#manually specified encoding overrides bom - but still remove bom-chars REVIEW |
|
#e.g we still want bom info - but specify binary encoding |
|
|
|
if {$opt_encoding eq "binary"} { |
|
set datachunk [string range $rawchunk $startdata end] |
|
} else { |
|
set datachunk [encoding convertfrom $opt_encoding [string range $rawchunk $startdata end]] |
|
} |
|
set encoding_selected $opt_encoding |
|
} |
|
|
|
set textobj [class::textinfo new $datachunk] |
|
if {$bomid ne ""} { |
|
$textobj set_bomid $bomid |
|
} |
|
|
|
|
|
|
|
|
|
set summary "" |
|
append summary "Bytes loaded : $rawlen" \n |
|
append summary "BOM ID : $bomid" \n |
|
append summary "Encoding selected : $encoding_selected" \n |
|
append summary "Characters : [$textobj chunklen]" \n |
|
append summary "Lines recognised : [$textobj linecount]" \n |
|
set leinfo [$textobj chunk_le_counts 0 end] |
|
append summary "crlf endings (windows) : [dict get $leinfo crlf]" \n |
|
append summary "lf endings (unix) : [dict get $leinfo lf]" \n |
|
append summary "unterminated lines : [dict get $leinfo unterminated]" \n |
|
puts stdout $summary |
|
return $textobj |
|
} |
|
|
|
proc file_boundary_display {filename startbyte endbyte chunksize args} { |
|
set fd [open $filename r] ;#use default error if file not readable |
|
fconfigure $fd -translation binary |
|
set rawfiledata [read $fd] |
|
close $fd |
|
set textobj [class::textinfo new $rawfiledata] |
|
set result [$textobj chunk_boundary_display $startbyte $endbyte $chunksize {*}$args] |
|
$textobj destroy |
|
return $result |
|
} |
|
|
|
|
|
|
|
#*** !doctools |
|
#[list_end] [comment {--- end definitions namespace punk::fileline ---}] |
|
} |
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
# Secondary API namespace |
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
namespace eval punk::fileline::lib { |
|
namespace export * |
|
namespace path [namespace parent] |
|
#*** !doctools |
|
#[subsection {Namespace punk::fileline::lib}] |
|
#[para] Secondary functions that are part of the API |
|
#[list_begin definitions] |
|
|
|
|
|
proc range_spans_chunk_boundaries {start end chunksize args} { |
|
#*** !doctools |
|
#[call [fun lib::range_spans_chunk_boundaries] [arg start] [arg end] [arg chunksize]] |
|
#[para]Takes start and end offset, generally representing bytes or character indices, and computes a list of boundaries at multiples of the chunksize that are spanned by the start and end range. |
|
#[list_begin arguments] |
|
# [arg_def integer start] |
|
# [para] zero-based start index of range |
|
# [arg_def integer end] |
|
# [para] zero-based end index of range |
|
# [arg_def integer chunksize] |
|
# [para] Number of bytes/characters in chunk - must be positive and > 0 |
|
#[list_end] |
|
#[para]returns a dict with the keys is_span and boundaries |
|
#[para]is_span 0|1 indicates if the range specified spans a boundary of chunksize |
|
#[para]boundaries contains a list of the spanned boundaries - which are always multiples of the chunksize |
|
#[para]e.g |
|
#[example_begin] |
|
# range_spans_chunk_boundaries 10 1750 512 |
|
# is_span 1 boundaries {512 1024 1536} |
|
#[example_end] |
|
#[para]The -offset <int> option |
|
#[example_begin] |
|
# range_spans_chunk_boundaries 10 1750 512 -offset 2 |
|
# is_span 1 boundaries {514 1026 1538} |
|
#[example_end] |
|
#[para] This function automatically uses lseq (if Tcl >= 8.7) when number of boundaries spanned is approximately greater than 75 |
|
if {[catch {package require Tcl 8.7-}]} { |
|
#only one implementation available for older Tcl |
|
tailcall punk::fileline::system::_range_spans_chunk_boundaries_tcl $start $end $chunksize {*}$args |
|
} |
|
if {$chunksize < 1} { |
|
error "range_spans_chunk_boundaries chunksize must be >= 1" |
|
} |
|
|
|
if {(abs($end - $start) / $chunksize) < 75} { |
|
tailcall punk::fileline::system::_range_spans_chunk_boundaries_tcl $start $end $chunksize {*}$args |
|
} else { |
|
tailcall punk::fileline::system::_range_spans_chunk_boundaries_lseq $start $end $chunksize {*}$args |
|
} |
|
} |
|
|
|
proc range_boundaries {start end chunksizes args} { |
|
set argd [punk::args::get_dict { |
|
-offset -default 0 |
|
} $args] |
|
lassign [dict values $argd] leaders opts remainingargs |
|
|
|
} |
|
|
|
#*** !doctools |
|
#[list_end] [comment {--- end definitions namespace punk::fileline::lib ---}] |
|
} |
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
|
|
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
#*** !doctools |
|
#[section Internal] |
|
namespace eval punk::fileline::system { |
|
#*** !doctools |
|
#[subsection {Namespace punk::fileline::system}] |
|
#[para] Internal functions that are not part of the API |
|
|
|
proc wordswap16 {data} { |
|
#scan in one endianness - format in the other. Whether we scan le/be first doesn't matter as long as we format using the opposite endianness |
|
binary scan $data s* elements ;#scan little endian |
|
return [binary format S* $elements] ;#format big endian |
|
} |
|
proc wordswap32 {data} { |
|
binary scan $data i* elements |
|
return [binary format I* $elements] |
|
} |
|
|
|
proc scan32bit_be {i32} { |
|
if {[binary scan $i32 I x]} { |
|
return $x |
|
} else { |
|
error "couldn't scan $i32" |
|
} |
|
} |
|
|
|
#for 8.7+ using lseq |
|
#much faster when resultant boundary size is large (at least when offset 0) |
|
proc _range_spans_chunk_boundaries_lseq {start end chunksize args} { |
|
if {$chunksize < 1} {error "chunksize must be > 0"} ;#sanitycheck in case called directly |
|
set opts [dict create\ |
|
-offset 0\ |
|
] |
|
foreach {k v} $args { |
|
switch -- $k { |
|
-offset { |
|
dict set opts $k $v |
|
} |
|
default { |
|
error "unknown option '$k'. Known options: [dict keys $opts]" |
|
} |
|
} |
|
} |
|
# -- --- --- --- |
|
set opt_offset [dict get $opts -offset] |
|
# -- --- --- --- |
|
|
|
set smod [expr {$start % $chunksize}] |
|
if {$smod != 0} { |
|
set start [expr {$start + ($chunksize - $smod)}] |
|
if {$start > $end} { |
|
return [list is_span 0 boundaries {}] |
|
} |
|
} |
|
set boundaries [lseq $start to $end $chunksize] |
|
#offset can be negative |
|
if {$opt_offset} { |
|
if {$opt_offset + [lindex $boundaries end] > $end || $opt_offset + [lindex $boundaries 0] < $start} { |
|
set overflow 1 |
|
} else { |
|
set overflow 0 |
|
} |
|
set boundaries [lmap v $boundaries[unset boundaries] {expr {$v + $opt_offset}}] |
|
if {$overflow} { |
|
#we don't know how many overflowed.. |
|
set inrange [list] |
|
foreach b $boundaries { |
|
if {$b >= $start && $b <= $end} { |
|
lappend inrange $b |
|
} |
|
} |
|
set boundaries $inrange |
|
} |
|
} |
|
return [list is_span [expr {[llength $boundaries]>0}] boundaries $boundaries] |
|
} |
|
|
|
#faster than lseq for small number of resultant boundaries (~< 75) (which is a common use case) |
|
#gets very slow (comparitively) with large resultsets |
|
proc _range_spans_chunk_boundaries_tcl {start end chunksize args} { |
|
if {$chunksize < 1} {error "chunksize must be > 0"} ;#sanitycheck in case called directly |
|
set opts [dict create\ |
|
-offset 0\ |
|
] |
|
foreach {k v} $args { |
|
switch -- $k { |
|
-offset { |
|
dict set opts $k $v |
|
} |
|
default { |
|
error "unknown option '$k'. Known options: [dict keys $opts]" |
|
} |
|
} |
|
} |
|
# -- --- --- --- |
|
set opt_offset [dict get $opts -offset] |
|
# -- --- --- --- |
|
|
|
set is_span 0 |
|
set smod [expr {$start % $chunksize}] |
|
if {$smod != 0} { |
|
set start [expr {$start + ($chunksize - $smod)}] |
|
} |
|
set boundaries [list] |
|
|
|
#we only need to pre-check the result-range for negative offsets - as our main loop stops before end? |
|
if {$opt_offset < 0} { |
|
#set btrack [expr {$start + $opt_offset}] ;#start back one to make sure we catch the first boundary |
|
set btrack $bstart |
|
set boff [expr {$btrack + $opt_offset}] ;#must be growing even if start and offset are negative - as chunksize is at least 1 |
|
while {$boff < $start} { |
|
incr btrack $chunksize |
|
set boff [expr {$btrack + $opt_offset}] |
|
} |
|
set bstart $btrack |
|
} else { |
|
set bstart $start |
|
} |
|
for {set b $bstart} {[set boff [expr {$b + $opt_offset}]] <= $end} {incr b $chunksize} { |
|
lappend boundaries $boff |
|
} |
|
|
|
return [list is_span [expr {[llength $boundaries]>0}] boundaries $boundaries offset $opt_offset] |
|
} |
|
|
|
proc _range_spans_chunk_boundaries_TIMEIT {start end chunksize {repeat 1}} { |
|
puts "main : [time {punk::fileline::lib::range_spans_chunk_boundaries $start $end $chunksize} $repeat]" |
|
puts "tcl : [time {punk::fileline::system::_range_spans_chunk_boundaries_tcl $start $end $chunksize} $repeat]" |
|
if {![catch {package require Tcl 8.7-}]} { |
|
puts "lseq : [time {punk::fileline::system::_range_spans_chunk_boundaries_lseq $start $end $chunksize} $repeat]" |
|
} |
|
} |
|
} |
|
namespace eval punk::fileline::ansi { |
|
#*** !doctools |
|
#[subsection {Namespace punk::fileline::ansi}] |
|
#[para]These are ansi functions imported from punk::ansi - or no-ops if that package is unavailable |
|
#[para]See [package punk::ansi] for documentation |
|
#[list_begin definitions] |
|
variable enabled 1 |
|
#*** !doctools |
|
#[call [fun ansi::a]] |
|
#[call [fun ansi::a+]] |
|
#[call [fun ansi::ansistrip]] |
|
|
|
#*** !doctools |
|
#[list_end] [comment {--- end definitions namespace punk::fileline::ansi ---}] |
|
} |
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
## Ready |
|
package provide punk::fileline [namespace eval punk::fileline { |
|
variable pkg punk::fileline |
|
variable version |
|
set version 0.1.0 |
|
}] |
|
return |
|
|
|
#*** !doctools |
|
#[manpage_end] |
|
|
|
|