From b96ace811bfc67de9f5f184b3fd486ec4084cfed Mon Sep 17 00:00:00 2001 From: Julian Noble Date: Sat, 30 Mar 2024 06:16:22 +1100 Subject: [PATCH] telnet utf-8/cp437 auto encoding guess, plus better debug output --- src/bootsupport/modules/punk/char-0.1.0.tm | 9 +- src/bootsupport/modules/punk/lib-0.1.1.tm | 21 +++ src/modules/punk/basictelnet-999999.0a1.0.tm | 159 ++++++++++++++----- src/modules/punk/char-999999.0a1.0.tm | 9 +- src/modules/punk/lib-999999.0a1.0.tm | 21 +++ 5 files changed, 172 insertions(+), 47 deletions(-) diff --git a/src/bootsupport/modules/punk/char-0.1.0.tm b/src/bootsupport/modules/punk/char-0.1.0.tm index 700a5857..6e179905 100644 --- a/src/bootsupport/modules/punk/char-0.1.0.tm +++ b/src/bootsupport/modules/punk/char-0.1.0.tm @@ -89,7 +89,7 @@ namespace eval punk::char { variable invalid "???" ;# ideally this would be 0xFFFD - which should display as black diamond/rhombus with question mark. As at 2023 - this tends to display indistinguishably from other missing glyph boxes - so we use a longer sequence we can detect by length and to display obviously variable invalid_display_char \u25ab; #todo - change to 0xFFFD once diamond glyph more common? - #just the 7-bit ascii. use [page ascii] for the 8-bit + #just the 7-bit ascii. use [page ascii] for the 8-bit layout proc ascii {} {return { 00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL 08 BS 09 HT 0a LF 0b VT 0c FF 0d CR 0e SO 0f SI @@ -235,6 +235,10 @@ namespace eval punk::char { dict set args -cols 16 tailcall page $encname {*}$args } + + #This will not display for example, c0 glyphs for cp437 + # we could use the punk::ansi::cp437_map dict - but while that might be what some expect to see - it would probably be too much magic for this function - which is intended to align more with what Tcl's encoding convertfrom/to actually does. + # for nonprinting members of the page, 2 and 3 letter codes are used rather than unicode visualisation replacements or even unicode equivalent replacements known to be appropriate for the page proc page {encname args} { variable invalid set encname [encname $encname] @@ -467,7 +471,7 @@ namespace eval punk::char { variable charsets set encname [encname $pagename] set defaults [list\ - -range {0 256}\ + -range {0 255}\ -charset ""\ ] set opts [dict merge $defaults $args] @@ -510,6 +514,7 @@ namespace eval punk::char { .= ascii |> .=> linelist -line trimleft |> .=* concat |> {regexp -all -inline {\S+} $data} } + #review - use terminal to display actual supported DEC specials vs using dict at: punk::ansi::map_special_graphics which maps to known unicode equivalents proc asciidict2 {} { set d [dict create] dict for {k v} [basedict_display] { diff --git a/src/bootsupport/modules/punk/lib-0.1.1.tm b/src/bootsupport/modules/punk/lib-0.1.1.tm index f9c4c1c4..3c28d2dc 100644 --- a/src/bootsupport/modules/punk/lib-0.1.1.tm +++ b/src/bootsupport/modules/punk/lib-0.1.1.tm @@ -302,6 +302,27 @@ namespace eval punk::lib { #[para]see [uri https://wiki.tcl-lang.org/page/K] #[para]It is used in cases where command-substitution at the calling-point performs some desired effect. + + proc is_utf8_multibyteprefix {bytes} { + #*** !doctools + #[call [fun is_utf8_multibyteprefix] [arg str]] + #[para] Returns a boolean if str is potentially a prefix for a multibyte utf-8 character + #[para] ie - tests if it is possible that appending more data will result in a utf-8 codepoint + #[para] Will return false for an already complete utf-8 codepoint + #[para] It is assumed the incomplete sequence is at the beginning of the bytes argument + #[para] Suitable input for this might be from the unreturned tail portion of get_utf8_leading $testbytes + #[para] e.g using: set head [lb]get_utf8_leading $testbytes[rb] ; set tail [lb]string range $testbytes [lb]string length $head[rb] end[rb] + regexp {(?x) + ^ + (?: + [\xC0-\xDF] | #possible prefix for two-byte codepoint + [\xE0-\xEF] [\x80-\xBF]{0,1} | #possible prefix for three-byte codepoint + [\xF0-\xF4] [\x80-\xBF]{0,2} #possible prefix for + ) + $ + } $bytes + } + proc is_utf8_first {str} { regexp {(?x) # Expanded regexp syntax, so I can put in comments :-) ^ diff --git a/src/modules/punk/basictelnet-999999.0a1.0.tm b/src/modules/punk/basictelnet-999999.0a1.0.tm index 2f9d40a5..6d816dd2 100644 --- a/src/modules/punk/basictelnet-999999.0a1.0.tm +++ b/src/modules/punk/basictelnet-999999.0a1.0.tm @@ -105,6 +105,12 @@ namespace eval punk::basictelnet { set window_cols 80 set window_rows 25 + + #Some modern(?) telnet servers seem to just pump out utf-8 encoded graphics by default - without negotiating or confirming binary etc? review + variable encoding_guess utf-8 + #we will experimentally assume utf-8 - which should handle ascii fine - and flip to cp437 when data encountered that cannot be valid utf-8 + #todo - proper charset negotiation + variable debug set debug 0 proc debug {{on_off ""}} { @@ -137,7 +143,7 @@ namespace eval punk::basictelnet { #try: DUMB,ANSI,VT100,XTERM #see also the Mud Terminal Type Standard as an extensiont to RFC1091 Telenet Terminal-Type variable remote_terminal_type - set remote_terminal_type "" ;#emtpy until/unless reported via an option 24 send from the remote + set remote_terminal_type "" ;#empty until/unless reported via an option 24 send from the remote #*** !doctools @@ -146,13 +152,13 @@ namespace eval punk::basictelnet { #[list_begin definitions] variable optioncodes - dict set optioncodes 0 [list name "Binary Transmission"] - dict set optioncodes 1 [list name "Echo"] - dict set optioncodes 2 [list name "Reconnection"] - dict set optioncodes 3 [list name "Suppress Go Ahead"] - dict set optioncodes 4 [list name "Approx Message Size Negotiation"] - dict set optioncodes 5 [list name "Status"] ;#RFC 859 - dict set optioncodes 6 [list name "Timing Mark"] + dict set optioncodes 0 [list name "Binary Transmission" short "bin"] + dict set optioncodes 1 [list name "Echo" short "echo"] + dict set optioncodes 2 [list name "Reconnection" short "recon"] + dict set optioncodes 3 [list name "Suppress Go Ahead" short "Suppress GA"] + dict set optioncodes 4 [list name "Approx Message Size Negotiation" short "msgsize nego"] + dict set optioncodes 5 [list name "Status" short "status"] ;#RFC 859 + dict set optioncodes 6 [list name "Timing Mark" short "tmark"] dict set optioncodes 7 [list name "Remote Controlled Trans and Echo"] dict set optioncodes 8 [list name "Output Line Width"] dict set optioncodes 9 [list name "Output Page Size"] @@ -170,29 +176,29 @@ namespace eval punk::basictelnet { dict set optioncodes 21 [list name "SUPDUP"] dict set optioncodes 22 [list name "SUPDUP Output"] dict set optioncodes 23 [list name "Send Location"] - dict set optioncodes 24 [list name "Terminal Type"] + dict set optioncodes 24 [list name "Terminal Type" short "ttype"] dict set optioncodes 25 [list name "End of Record"] - dict set optioncodes 26 [list name "TACACS User Identification"] + dict set optioncodes 26 [list name "TACACS User Identification" short "tacacs"] dict set optioncodes 27 [list name "Output Marking"] dict set optioncodes 28 [list name "Terminal Location Number"] dict set optioncodes 29 [list name "Telnet 3270 Regime"] dict set optioncodes 30 [list name "X.3 PAD"] - dict set optioncodes 31 [list name "Negotiate About Window Size"] ;#RFC 1073 NAWS - dict set optioncodes 32 [list name "Terminal Speed"] ;#rfc 1079 + dict set optioncodes 31 [list name "Negotiate About Window Size" short "NAWS"] ;#RFC 1073 NAWS + dict set optioncodes 32 [list name "Terminal Speed" short "tspeed"] ;#rfc 1079 dict set optioncodes 33 [list name "Remote Flow Control"] dict set optioncodes 34 [list name "Line Mode"] dict set optioncodes 35 [list name "X Display Location"] - dict set optioncodes 36 [list name "Environment Option"] - dict set optioncodes 37 [list name "Authentication Option"] - dict set optioncodes 38 [list name "Encryption Option"] - dict set optioncodes 39 [list name "New Environment Option"] + dict set optioncodes 36 [list name "Environment Option" short "env opt"] + dict set optioncodes 37 [list name "Authentication Option" short "authent opt"] + dict set optioncodes 38 [list name "Encryption Option" short "encrypt opt"] + dict set optioncodes 39 [list name "New Environment Option" short "new env opt"] dict set optioncodes 40 [list name "TN3270E"] dict set optioncodes 41 [list name "XAUTH"] dict set optioncodes 42 [list name "CHARSET"] - dict set optioncodes 43 [list name "Telnet Remote Serial Port (RSP)"] + dict set optioncodes 43 [list name "Telnet Remote Serial Port (RSP)" short "RSP"] dict set optioncodes 44 [list name "Com Port Control Option"] - dict set optioncodes 45 [list name "Telnet Supress Local Echo"] - dict set optioncodes 46 [list name "Telnet Start TLS"] + dict set optioncodes 45 [list name "Telnet Supress Local Echo" short "no-local-echo"] + dict set optioncodes 46 [list name "Telnet Start TLS" short "starttls"] dict set optioncodes 47 [list name "KERMIT"] dict set optioncodes 48 [list name "SEND-URL"] dict set optioncodes 49 [list name "FORWARD_X"] @@ -216,6 +222,8 @@ namespace eval punk::basictelnet { variable optioncodes variable server_option_state variable client_option_state + variable encoding_guess + set encoding_guess utf-8 dict for {k _v} $optioncodes { dict set server_option_state $k 0 ;#DO from our perspective @@ -223,6 +231,36 @@ namespace eval punk::basictelnet { } } reset_option_states ;#initialise to NVT - all off + proc get_server_option_state_summary {} { + variable server_option_state + variable optioncodes + set summary "" + dict for {k v} $server_option_state { + if {$v} { + if {[dict exists $optioncodes $k short]} { + append summary "[dict get $optioncodes $k short]," + } else { + append summary "[dict get $optioncodes $k name]," + } + } + } + set summary [string trimright $summary ,] + } + proc get_client_option_state_summary {} { + variable client_option_state + variable optioncodes + set summary "" + dict for {k v} $client_option_state { + if {$v} { + if {[dict exists $optioncodes $k short]} { + append summary "[dict get $optioncodes $k short]," + } else { + append summary "[dict get $optioncodes $k name]," + } + } + } + set summary [string trimright $summary ,] + } # ----------------------------------- # A rudimentary hardcoded configuration for options/negotiation @@ -319,6 +357,8 @@ namespace eval punk::basictelnet { #experiment proc debug_frame {info inputchannel outputchannel} { + variable encoding_guess + variable terminal_type variable writing_debug_frame if {$writing_debug_frame == 1} { after 1 {punk::basictelnet::add_debug "" $readchannel $writechannel} @@ -332,13 +372,17 @@ namespace eval punk::basictelnet { incr writing_debug_frame + set server_summary "SVR:[get_server_option_state_summary]" + set client_summary "CLI:[get_client_option_state_summary]" + set info $server_summary\n$client_summary\n$info + #set existing_handler [fileevent stdin readable] set RST "\x1b\[m" set w 80 - set infoframe [textblock::frame -width $w -ansiborder [a+ green bold] -title "[a cyan]Telnet Debug$RST" $info] + set infoframe [textblock::frame -width $w -ansiborder [a+ green bold] -title "[a cyan]Telnet Debug $terminal_type (encoding guess:$encoding_guess)$RST" $info] #set w [textblock::width $infoframe] - set spacepatch [textblock::block $w 4 "$RST "] + set spacepatch "$RST[textblock::block $w 4 { }]" #puts -nonewline [punk::ansi::cursor_off] #use non cursorsave version - slower - but less likely to interfere with cursor operations in data @@ -353,9 +397,10 @@ namespace eval punk::basictelnet { if {[catch { #punk::console::move_emitblock_return 6 90 $spacepatch\n$infoframe punk::console::move_emitblock_return 6 90 $spacepatch + flush stdout punk::console::move_emitblock_return 10 90 $infoframe - #puts -nonewline stdout [punk::ansi::cursor_on] ;#Enabling cursor should take account of whether it was enabled before - we don't have that info currently! review flush stdout + #puts -nonewline stdout [punk::ansi::cursor_on] ;#Enabling cursor should take account of whether it was enabled before - we don't have that info currently! review } errM]} { puts stderr "debug_frame error: $errM" } @@ -405,6 +450,7 @@ namespace eval punk::basictelnet { #waiting data will be supplied to this handler if the other handler over-read (e.g repl handling ANSI ESC \[6n response on stdin finding other data before the ANSI response.) proc toServer {sock} { variable server_option_state + variable encoding_guess upvar ::punk::console::input_chunks_waiting input_chunks_waiting set nextwaiting "" @@ -456,8 +502,6 @@ namespace eval punk::basictelnet { set wrote_sock 0 } -#JJJ - #update idletasks if {$wrote_sock && ![eof $sock]} { ################################################################################## #Re-enable channel read handler only if no waiting chunks - must process in order @@ -486,6 +530,8 @@ namespace eval punk::basictelnet { } proc fromServer {sock} { + variable encoding_guess + variable debug variable fromserver_unprocessed fileevent $sock readable {} variable in_sb @@ -532,16 +578,30 @@ namespace eval punk::basictelnet { #mini debug buffer for each fromServer call - render using add_debug each loop set debug_info "" - append debug_info "------raw data [string length $data]---prev unprocessed:[string length $last_unprocessed]---" \n - #append debug_info [ansistring VIEW -lf 1 -vt 1 [encoding convertfrom utf-8 $data]] \n - set - append debug_info "------------------------------------------" \n + if {$debug} { + #only do this text-processing work if debug is on + append debug_info "------raw data [string length $data]---prev unprocessed:[string length $last_unprocessed]---" \n + #append debug_info [ansistring VIEW -lf 1 -vt 1 [encoding convertfrom utf-8 $data]] \n + set rawview [ansistring VIEW -lf 1 -vt 1 [encoding convertfrom $encoding_guess $data]] + set viewblock [overtype::left -wrap 1 -width 78 -height 4 "" $rawview] + set lines [split $viewblock \n] + if {[llength $lines] > 4} { + append debug_info [join [list {*}[lrange $lines 0 1] "...<[expr {[llength $lines] -4}] lines undisplayed>..." {*}[lrange $lines end-1 end]] \n] + } else { + append debug_info $viewblock + } + append debug_info "\n------------------------------------------" \n + } if {[string length $data]} { #puts "1----------------------------------" #puts [ansistring VIEW -lf 1 -vt 1 $data] - + #--------------- + #TODO - fix possible chunk boundary that gives us an incomplete IAC sequence. + #As it stands - we won't properly handle it - possible it will cause intermittent telnet protocol bugs! + #will need a mechanism within protocol function and loop to abort and throw back to next fromServer event + #--------------- while 1 { if {!$in_sb} { #\xff 255 is the IAC Data Byte (Interpret As Command) @@ -555,22 +615,24 @@ namespace eval punk::basictelnet { break } #write [string range $data 0 $idx-1] - puts -nonewline stdout [encoding convertfrom utf-8 [string range $data 0 $idx-1]] + puts -nonewline stdout [encoding convertfrom $encoding_guess [string range $data 0 $idx-1]] flush stdout set post_IAC_byte [string index $data [expr {$idx+1}]] - incr idx 2 if {$post_IAC_byte < "\xef"} { #?? #write \xf0$post_IAC_byte ;#from wiki code. purpose not understood. puts stderr "unexpected - byte less than EF following IAC" - set data [string range $data $idx-1 end] + set data [string range $data $idx+1 end] + incr idx } elseif {$post_IAC_byte == "\xff"} { #write \xf0 ;#?? This came from wiki code - intention unclear.. latin small letter Eth #RFC indicates double up of \xff is treated as literal - #this can't be part of utf-8 - so + #this can't be part of utf-8 - puts -nonewline stdout \xff - set data [string range $data $idx end] + set data [string range $data $idx+2 end] + incr idx 2 } else { + incr idx 2 set ophex "" #telnet commands are at least 2 bytes binary scan $post_IAC_byte H2 cmdhex @@ -578,7 +640,7 @@ namespace eval punk::basictelnet { fb - fc - fd - fe { #WILL, WON'T, DO, DON'T #3bytes - last is option - set opbyte [string index $data [expr {$idx}]] + set opbyte [string index $data $idx] #don't incr idx - protocol will do so #incr idx binary scan $opbyte H2 ophex @@ -630,15 +692,27 @@ namespace eval punk::basictelnet { } } ;#end inner while - #puts -nonewline stdout $data - set prefix [punk::lib::get_utf8_leading $data] - set plen [string length $prefix] - set tail [string range $data $plen end] + + if {$encoding_guess eq "utf-8"} { + set prefix [punk::lib::get_utf8_leading $data] + set plen [string length $prefix] + set tail [string range $data $plen end] + if {[string length $tail] && [string index $tail 0] ne "\xFF" && ![punk::lib::is_utf8_multibyteprefix $tail]} { + puts stderr ">>>>[ansistring VIEW -lf 1 -vt 1 $tail]<<<<" + #Our utf-8 guess has been disproven! + set encoding_guess cp437 + set prefix $data + set tail "" + } + } else { + set prefix $data + set tail "" + } set ansisplits [list] if {[string length $tail]} { set fromserver_unprocessed $tail - puts -nonewline stdout "[encoding convertfrom utf-8 $prefix]" + puts -nonewline stdout [encoding convertfrom $encoding_guess $prefix] } else { set fromserver_unprocessed "" #look for incomplete ansi sequences @@ -647,10 +721,10 @@ namespace eval punk::basictelnet { set last_pt [lindex $ansisplits end] ;#last part is supposed to be plaintext - if it looks like it contains a partial ansi - throw it to fromserver_unprocessed for next fromServer call if {[string first "\x1b" $last_pt] >= 0} { set complete [join [lrange $ansisplits 0 end-1] ""] - puts -nonewline stdout "[encoding convertfrom utf-8 $complete]" + puts -nonewline stdout [encoding convertfrom $encoding_guess $complete] set fromserver_unprocessed $last_pt } else { - puts -nonewline stdout "[encoding convertfrom utf-8 $prefix]" + puts -nonewline stdout [encoding convertfrom $encoding_guess $prefix] } } @@ -662,7 +736,6 @@ namespace eval punk::basictelnet { set debug_info "" #add_debug has potentially written to another part of the screen with different SGR colour/background etc #ie - by interrupting the telnet data with our own output, we lose SGR context when returning to normal output. - variable debug if {$debug} { if {![llength $ansisplits]} { set ansisplits [punk::ansi::ta::split_codes_single $prefix] diff --git a/src/modules/punk/char-999999.0a1.0.tm b/src/modules/punk/char-999999.0a1.0.tm index 25f729ed..2dce356f 100644 --- a/src/modules/punk/char-999999.0a1.0.tm +++ b/src/modules/punk/char-999999.0a1.0.tm @@ -89,7 +89,7 @@ namespace eval punk::char { variable invalid "???" ;# ideally this would be 0xFFFD - which should display as black diamond/rhombus with question mark. As at 2023 - this tends to display indistinguishably from other missing glyph boxes - so we use a longer sequence we can detect by length and to display obviously variable invalid_display_char \u25ab; #todo - change to 0xFFFD once diamond glyph more common? - #just the 7-bit ascii. use [page ascii] for the 8-bit + #just the 7-bit ascii. use [page ascii] for the 8-bit layout proc ascii {} {return { 00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL 08 BS 09 HT 0a LF 0b VT 0c FF 0d CR 0e SO 0f SI @@ -235,6 +235,10 @@ namespace eval punk::char { dict set args -cols 16 tailcall page $encname {*}$args } + + #This will not display for example, c0 glyphs for cp437 + # we could use the punk::ansi::cp437_map dict - but while that might be what some expect to see - it would probably be too much magic for this function - which is intended to align more with what Tcl's encoding convertfrom/to actually does. + # for nonprinting members of the page, 2 and 3 letter codes are used rather than unicode visualisation replacements or even unicode equivalent replacements known to be appropriate for the page proc page {encname args} { variable invalid set encname [encname $encname] @@ -467,7 +471,7 @@ namespace eval punk::char { variable charsets set encname [encname $pagename] set defaults [list\ - -range {0 256}\ + -range {0 255}\ -charset ""\ ] set opts [dict merge $defaults $args] @@ -510,6 +514,7 @@ namespace eval punk::char { .= ascii |> .=> linelist -line trimleft |> .=* concat |> {regexp -all -inline {\S+} $data} } + #review - use terminal to display actual supported DEC specials vs using dict at: punk::ansi::map_special_graphics which maps to known unicode equivalents proc asciidict2 {} { set d [dict create] dict for {k v} [basedict_display] { diff --git a/src/modules/punk/lib-999999.0a1.0.tm b/src/modules/punk/lib-999999.0a1.0.tm index 07630603..841edab9 100644 --- a/src/modules/punk/lib-999999.0a1.0.tm +++ b/src/modules/punk/lib-999999.0a1.0.tm @@ -302,6 +302,27 @@ namespace eval punk::lib { #[para]see [uri https://wiki.tcl-lang.org/page/K] #[para]It is used in cases where command-substitution at the calling-point performs some desired effect. + + proc is_utf8_multibyteprefix {bytes} { + #*** !doctools + #[call [fun is_utf8_multibyteprefix] [arg str]] + #[para] Returns a boolean if str is potentially a prefix for a multibyte utf-8 character + #[para] ie - tests if it is possible that appending more data will result in a utf-8 codepoint + #[para] Will return false for an already complete utf-8 codepoint + #[para] It is assumed the incomplete sequence is at the beginning of the bytes argument + #[para] Suitable input for this might be from the unreturned tail portion of get_utf8_leading $testbytes + #[para] e.g using: set head [lb]get_utf8_leading $testbytes[rb] ; set tail [lb]string range $testbytes [lb]string length $head[rb] end[rb] + regexp {(?x) + ^ + (?: + [\xC0-\xDF] | #possible prefix for two-byte codepoint + [\xE0-\xEF] [\x80-\xBF]{0,1} | #possible prefix for three-byte codepoint + [\xF0-\xF4] [\x80-\xBF]{0,2} #possible prefix for + ) + $ + } $bytes + } + proc is_utf8_first {str} { regexp {(?x) # Expanded regexp syntax, so I can put in comments :-) ^