Browse Source

telnet utf-8/cp437 auto encoding guess, plus better debug output

master
Julian Noble 8 months ago
parent
commit
b96ace811b
  1. 9
      src/bootsupport/modules/punk/char-0.1.0.tm
  2. 21
      src/bootsupport/modules/punk/lib-0.1.1.tm
  3. 159
      src/modules/punk/basictelnet-999999.0a1.0.tm
  4. 9
      src/modules/punk/char-999999.0a1.0.tm
  5. 21
      src/modules/punk/lib-999999.0a1.0.tm

9
src/bootsupport/modules/punk/char-0.1.0.tm

@ -89,7 +89,7 @@ namespace eval punk::char {
variable invalid "???" ;# ideally this would be 0xFFFD - which should display as black diamond/rhombus with question mark. As at 2023 - this tends to display indistinguishably from other missing glyph boxes - so we use a longer sequence we can detect by length and to display obviously
variable invalid_display_char \u25ab; #todo - change to 0xFFFD once diamond glyph more common?
#just the 7-bit ascii. use [page ascii] for the 8-bit
#just the 7-bit ascii. use [page ascii] for the 8-bit layout
proc ascii {} {return {
00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL
08 BS 09 HT 0a LF 0b VT 0c FF 0d CR 0e SO 0f SI
@ -235,6 +235,10 @@ namespace eval punk::char {
dict set args -cols 16
tailcall page $encname {*}$args
}
#This will not display for example, c0 glyphs for cp437
# we could use the punk::ansi::cp437_map dict - but while that might be what some expect to see - it would probably be too much magic for this function - which is intended to align more with what Tcl's encoding convertfrom/to actually does.
# for nonprinting members of the page, 2 and 3 letter codes are used rather than unicode visualisation replacements or even unicode equivalent replacements known to be appropriate for the page
proc page {encname args} {
variable invalid
set encname [encname $encname]
@ -467,7 +471,7 @@ namespace eval punk::char {
variable charsets
set encname [encname $pagename]
set defaults [list\
-range {0 256}\
-range {0 255}\
-charset ""\
]
set opts [dict merge $defaults $args]
@ -510,6 +514,7 @@ namespace eval punk::char {
.= ascii |> .=> linelist -line trimleft |> .=* concat |> {regexp -all -inline {\S+} $data}
}
#review - use terminal to display actual supported DEC specials vs using dict at: punk::ansi::map_special_graphics which maps to known unicode equivalents
proc asciidict2 {} {
set d [dict create]
dict for {k v} [basedict_display] {

21
src/bootsupport/modules/punk/lib-0.1.1.tm

@ -302,6 +302,27 @@ namespace eval punk::lib {
#[para]see [uri https://wiki.tcl-lang.org/page/K]
#[para]It is used in cases where command-substitution at the calling-point performs some desired effect.
proc is_utf8_multibyteprefix {bytes} {
#*** !doctools
#[call [fun is_utf8_multibyteprefix] [arg str]]
#[para] Returns a boolean if str is potentially a prefix for a multibyte utf-8 character
#[para] ie - tests if it is possible that appending more data will result in a utf-8 codepoint
#[para] Will return false for an already complete utf-8 codepoint
#[para] It is assumed the incomplete sequence is at the beginning of the bytes argument
#[para] Suitable input for this might be from the unreturned tail portion of get_utf8_leading $testbytes
#[para] e.g using: set head [lb]get_utf8_leading $testbytes[rb] ; set tail [lb]string range $testbytes [lb]string length $head[rb] end[rb]
regexp {(?x)
^
(?:
[\xC0-\xDF] | #possible prefix for two-byte codepoint
[\xE0-\xEF] [\x80-\xBF]{0,1} | #possible prefix for three-byte codepoint
[\xF0-\xF4] [\x80-\xBF]{0,2} #possible prefix for
)
$
} $bytes
}
proc is_utf8_first {str} {
regexp {(?x) # Expanded regexp syntax, so I can put in comments :-)
^

159
src/modules/punk/basictelnet-999999.0a1.0.tm

@ -105,6 +105,12 @@ namespace eval punk::basictelnet {
set window_cols 80
set window_rows 25
#Some modern(?) telnet servers seem to just pump out utf-8 encoded graphics by default - without negotiating or confirming binary etc? review
variable encoding_guess utf-8
#we will experimentally assume utf-8 - which should handle ascii fine - and flip to cp437 when data encountered that cannot be valid utf-8
#todo - proper charset negotiation
variable debug
set debug 0
proc debug {{on_off ""}} {
@ -137,7 +143,7 @@ namespace eval punk::basictelnet {
#try: DUMB,ANSI,VT100,XTERM
#see also the Mud Terminal Type Standard as an extensiont to RFC1091 Telenet Terminal-Type
variable remote_terminal_type
set remote_terminal_type "" ;#emtpy until/unless reported via an option 24 send from the remote
set remote_terminal_type "" ;#empty until/unless reported via an option 24 send from the remote
#*** !doctools
@ -146,13 +152,13 @@ namespace eval punk::basictelnet {
#[list_begin definitions]
variable optioncodes
dict set optioncodes 0 [list name "Binary Transmission"]
dict set optioncodes 1 [list name "Echo"]
dict set optioncodes 2 [list name "Reconnection"]
dict set optioncodes 3 [list name "Suppress Go Ahead"]
dict set optioncodes 4 [list name "Approx Message Size Negotiation"]
dict set optioncodes 5 [list name "Status"] ;#RFC 859
dict set optioncodes 6 [list name "Timing Mark"]
dict set optioncodes 0 [list name "Binary Transmission" short "bin"]
dict set optioncodes 1 [list name "Echo" short "echo"]
dict set optioncodes 2 [list name "Reconnection" short "recon"]
dict set optioncodes 3 [list name "Suppress Go Ahead" short "Suppress GA"]
dict set optioncodes 4 [list name "Approx Message Size Negotiation" short "msgsize nego"]
dict set optioncodes 5 [list name "Status" short "status"] ;#RFC 859
dict set optioncodes 6 [list name "Timing Mark" short "tmark"]
dict set optioncodes 7 [list name "Remote Controlled Trans and Echo"]
dict set optioncodes 8 [list name "Output Line Width"]
dict set optioncodes 9 [list name "Output Page Size"]
@ -170,29 +176,29 @@ namespace eval punk::basictelnet {
dict set optioncodes 21 [list name "SUPDUP"]
dict set optioncodes 22 [list name "SUPDUP Output"]
dict set optioncodes 23 [list name "Send Location"]
dict set optioncodes 24 [list name "Terminal Type"]
dict set optioncodes 24 [list name "Terminal Type" short "ttype"]
dict set optioncodes 25 [list name "End of Record"]
dict set optioncodes 26 [list name "TACACS User Identification"]
dict set optioncodes 26 [list name "TACACS User Identification" short "tacacs"]
dict set optioncodes 27 [list name "Output Marking"]
dict set optioncodes 28 [list name "Terminal Location Number"]
dict set optioncodes 29 [list name "Telnet 3270 Regime"]
dict set optioncodes 30 [list name "X.3 PAD"]
dict set optioncodes 31 [list name "Negotiate About Window Size"] ;#RFC 1073 NAWS
dict set optioncodes 32 [list name "Terminal Speed"] ;#rfc 1079
dict set optioncodes 31 [list name "Negotiate About Window Size" short "NAWS"] ;#RFC 1073 NAWS
dict set optioncodes 32 [list name "Terminal Speed" short "tspeed"] ;#rfc 1079
dict set optioncodes 33 [list name "Remote Flow Control"]
dict set optioncodes 34 [list name "Line Mode"]
dict set optioncodes 35 [list name "X Display Location"]
dict set optioncodes 36 [list name "Environment Option"]
dict set optioncodes 37 [list name "Authentication Option"]
dict set optioncodes 38 [list name "Encryption Option"]
dict set optioncodes 39 [list name "New Environment Option"]
dict set optioncodes 36 [list name "Environment Option" short "env opt"]
dict set optioncodes 37 [list name "Authentication Option" short "authent opt"]
dict set optioncodes 38 [list name "Encryption Option" short "encrypt opt"]
dict set optioncodes 39 [list name "New Environment Option" short "new env opt"]
dict set optioncodes 40 [list name "TN3270E"]
dict set optioncodes 41 [list name "XAUTH"]
dict set optioncodes 42 [list name "CHARSET"]
dict set optioncodes 43 [list name "Telnet Remote Serial Port (RSP)"]
dict set optioncodes 43 [list name "Telnet Remote Serial Port (RSP)" short "RSP"]
dict set optioncodes 44 [list name "Com Port Control Option"]
dict set optioncodes 45 [list name "Telnet Supress Local Echo"]
dict set optioncodes 46 [list name "Telnet Start TLS"]
dict set optioncodes 45 [list name "Telnet Supress Local Echo" short "no-local-echo"]
dict set optioncodes 46 [list name "Telnet Start TLS" short "starttls"]
dict set optioncodes 47 [list name "KERMIT"]
dict set optioncodes 48 [list name "SEND-URL"]
dict set optioncodes 49 [list name "FORWARD_X"]
@ -216,6 +222,8 @@ namespace eval punk::basictelnet {
variable optioncodes
variable server_option_state
variable client_option_state
variable encoding_guess
set encoding_guess utf-8
dict for {k _v} $optioncodes {
dict set server_option_state $k 0 ;#DO from our perspective
@ -223,6 +231,36 @@ namespace eval punk::basictelnet {
}
}
reset_option_states ;#initialise to NVT - all off
proc get_server_option_state_summary {} {
variable server_option_state
variable optioncodes
set summary ""
dict for {k v} $server_option_state {
if {$v} {
if {[dict exists $optioncodes $k short]} {
append summary "[dict get $optioncodes $k short],"
} else {
append summary "[dict get $optioncodes $k name],"
}
}
}
set summary [string trimright $summary ,]
}
proc get_client_option_state_summary {} {
variable client_option_state
variable optioncodes
set summary ""
dict for {k v} $client_option_state {
if {$v} {
if {[dict exists $optioncodes $k short]} {
append summary "[dict get $optioncodes $k short],"
} else {
append summary "[dict get $optioncodes $k name],"
}
}
}
set summary [string trimright $summary ,]
}
# -----------------------------------
# A rudimentary hardcoded configuration for options/negotiation
@ -319,6 +357,8 @@ namespace eval punk::basictelnet {
#experiment
proc debug_frame {info inputchannel outputchannel} {
variable encoding_guess
variable terminal_type
variable writing_debug_frame
if {$writing_debug_frame == 1} {
after 1 {punk::basictelnet::add_debug "" $readchannel $writechannel}
@ -332,13 +372,17 @@ namespace eval punk::basictelnet {
incr writing_debug_frame
set server_summary "SVR:[get_server_option_state_summary]"
set client_summary "CLI:[get_client_option_state_summary]"
set info $server_summary\n$client_summary\n$info
#set existing_handler [fileevent stdin readable]
set RST "\x1b\[m"
set w 80
set infoframe [textblock::frame -width $w -ansiborder [a+ green bold] -title "[a cyan]Telnet Debug$RST" $info]
set infoframe [textblock::frame -width $w -ansiborder [a+ green bold] -title "[a cyan]Telnet Debug $terminal_type (encoding guess:$encoding_guess)$RST" $info]
#set w [textblock::width $infoframe]
set spacepatch [textblock::block $w 4 "$RST "]
set spacepatch "$RST[textblock::block $w 4 { }]"
#puts -nonewline [punk::ansi::cursor_off]
#use non cursorsave version - slower - but less likely to interfere with cursor operations in data
@ -353,9 +397,10 @@ namespace eval punk::basictelnet {
if {[catch {
#punk::console::move_emitblock_return 6 90 $spacepatch\n$infoframe
punk::console::move_emitblock_return 6 90 $spacepatch
flush stdout
punk::console::move_emitblock_return 10 90 $infoframe
#puts -nonewline stdout [punk::ansi::cursor_on] ;#Enabling cursor should take account of whether it was enabled before - we don't have that info currently! review
flush stdout
#puts -nonewline stdout [punk::ansi::cursor_on] ;#Enabling cursor should take account of whether it was enabled before - we don't have that info currently! review
} errM]} {
puts stderr "debug_frame error: $errM"
}
@ -405,6 +450,7 @@ namespace eval punk::basictelnet {
#waiting data will be supplied to this handler if the other handler over-read (e.g repl handling ANSI ESC \[6n response on stdin finding other data before the ANSI response.)
proc toServer {sock} {
variable server_option_state
variable encoding_guess
upvar ::punk::console::input_chunks_waiting input_chunks_waiting
set nextwaiting ""
@ -456,8 +502,6 @@ namespace eval punk::basictelnet {
set wrote_sock 0
}
#JJJ
#update idletasks
if {$wrote_sock && ![eof $sock]} {
##################################################################################
#Re-enable channel read handler only if no waiting chunks - must process in order
@ -486,6 +530,8 @@ namespace eval punk::basictelnet {
}
proc fromServer {sock} {
variable encoding_guess
variable debug
variable fromserver_unprocessed
fileevent $sock readable {}
variable in_sb
@ -532,16 +578,30 @@ namespace eval punk::basictelnet {
#mini debug buffer for each fromServer call - render using add_debug each loop
set debug_info ""
append debug_info "------raw data [string length $data]---prev unprocessed:[string length $last_unprocessed]---" \n
#append debug_info [ansistring VIEW -lf 1 -vt 1 [encoding convertfrom utf-8 $data]] \n
set
append debug_info "------------------------------------------" \n
if {$debug} {
#only do this text-processing work if debug is on
append debug_info "------raw data [string length $data]---prev unprocessed:[string length $last_unprocessed]---" \n
#append debug_info [ansistring VIEW -lf 1 -vt 1 [encoding convertfrom utf-8 $data]] \n
set rawview [ansistring VIEW -lf 1 -vt 1 [encoding convertfrom $encoding_guess $data]]
set viewblock [overtype::left -wrap 1 -width 78 -height 4 "" $rawview]
set lines [split $viewblock \n]
if {[llength $lines] > 4} {
append debug_info [join [list {*}[lrange $lines 0 1] "...<[expr {[llength $lines] -4}] lines undisplayed>..." {*}[lrange $lines end-1 end]] \n]
} else {
append debug_info $viewblock
}
append debug_info "\n------------------------------------------" \n
}
if {[string length $data]} {
#puts "1----------------------------------"
#puts [ansistring VIEW -lf 1 -vt 1 $data]
#---------------
#TODO - fix possible chunk boundary that gives us an incomplete IAC sequence.
#As it stands - we won't properly handle it - possible it will cause intermittent telnet protocol bugs!
#will need a mechanism within protocol function and loop to abort and throw back to next fromServer event
#---------------
while 1 {
if {!$in_sb} {
#\xff 255 is the IAC Data Byte (Interpret As Command)
@ -555,22 +615,24 @@ namespace eval punk::basictelnet {
break
}
#write [string range $data 0 $idx-1]
puts -nonewline stdout [encoding convertfrom utf-8 [string range $data 0 $idx-1]]
puts -nonewline stdout [encoding convertfrom $encoding_guess [string range $data 0 $idx-1]]
flush stdout
set post_IAC_byte [string index $data [expr {$idx+1}]]
incr idx 2
if {$post_IAC_byte < "\xef"} {
#??
#write \xf0$post_IAC_byte ;#from wiki code. purpose not understood.
puts stderr "unexpected - byte less than EF following IAC"
set data [string range $data $idx-1 end]
set data [string range $data $idx+1 end]
incr idx
} elseif {$post_IAC_byte == "\xff"} {
#write \xf0 ;#?? This came from wiki code - intention unclear.. latin small letter Eth
#RFC indicates double up of \xff is treated as literal
#this can't be part of utf-8 - so
#this can't be part of utf-8 -
puts -nonewline stdout \xff
set data [string range $data $idx end]
set data [string range $data $idx+2 end]
incr idx 2
} else {
incr idx 2
set ophex ""
#telnet commands are at least 2 bytes
binary scan $post_IAC_byte H2 cmdhex
@ -578,7 +640,7 @@ namespace eval punk::basictelnet {
fb - fc - fd - fe {
#WILL, WON'T, DO, DON'T
#3bytes - last is option
set opbyte [string index $data [expr {$idx}]]
set opbyte [string index $data $idx]
#don't incr idx - protocol will do so
#incr idx
binary scan $opbyte H2 ophex
@ -630,15 +692,27 @@ namespace eval punk::basictelnet {
}
} ;#end inner while
#puts -nonewline stdout $data
set prefix [punk::lib::get_utf8_leading $data]
set plen [string length $prefix]
set tail [string range $data $plen end]
if {$encoding_guess eq "utf-8"} {
set prefix [punk::lib::get_utf8_leading $data]
set plen [string length $prefix]
set tail [string range $data $plen end]
if {[string length $tail] && [string index $tail 0] ne "\xFF" && ![punk::lib::is_utf8_multibyteprefix $tail]} {
puts stderr ">>>>[ansistring VIEW -lf 1 -vt 1 $tail]<<<<"
#Our utf-8 guess has been disproven!
set encoding_guess cp437
set prefix $data
set tail ""
}
} else {
set prefix $data
set tail ""
}
set ansisplits [list]
if {[string length $tail]} {
set fromserver_unprocessed $tail
puts -nonewline stdout "[encoding convertfrom utf-8 $prefix]"
puts -nonewline stdout [encoding convertfrom $encoding_guess $prefix]
} else {
set fromserver_unprocessed ""
#look for incomplete ansi sequences
@ -647,10 +721,10 @@ namespace eval punk::basictelnet {
set last_pt [lindex $ansisplits end] ;#last part is supposed to be plaintext - if it looks like it contains a partial ansi - throw it to fromserver_unprocessed for next fromServer call
if {[string first "\x1b" $last_pt] >= 0} {
set complete [join [lrange $ansisplits 0 end-1] ""]
puts -nonewline stdout "[encoding convertfrom utf-8 $complete]"
puts -nonewline stdout [encoding convertfrom $encoding_guess $complete]
set fromserver_unprocessed $last_pt
} else {
puts -nonewline stdout "[encoding convertfrom utf-8 $prefix]"
puts -nonewline stdout [encoding convertfrom $encoding_guess $prefix]
}
}
@ -662,7 +736,6 @@ namespace eval punk::basictelnet {
set debug_info ""
#add_debug has potentially written to another part of the screen with different SGR colour/background etc
#ie - by interrupting the telnet data with our own output, we lose SGR context when returning to normal output.
variable debug
if {$debug} {
if {![llength $ansisplits]} {
set ansisplits [punk::ansi::ta::split_codes_single $prefix]

9
src/modules/punk/char-999999.0a1.0.tm

@ -89,7 +89,7 @@ namespace eval punk::char {
variable invalid "???" ;# ideally this would be 0xFFFD - which should display as black diamond/rhombus with question mark. As at 2023 - this tends to display indistinguishably from other missing glyph boxes - so we use a longer sequence we can detect by length and to display obviously
variable invalid_display_char \u25ab; #todo - change to 0xFFFD once diamond glyph more common?
#just the 7-bit ascii. use [page ascii] for the 8-bit
#just the 7-bit ascii. use [page ascii] for the 8-bit layout
proc ascii {} {return {
00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL
08 BS 09 HT 0a LF 0b VT 0c FF 0d CR 0e SO 0f SI
@ -235,6 +235,10 @@ namespace eval punk::char {
dict set args -cols 16
tailcall page $encname {*}$args
}
#This will not display for example, c0 glyphs for cp437
# we could use the punk::ansi::cp437_map dict - but while that might be what some expect to see - it would probably be too much magic for this function - which is intended to align more with what Tcl's encoding convertfrom/to actually does.
# for nonprinting members of the page, 2 and 3 letter codes are used rather than unicode visualisation replacements or even unicode equivalent replacements known to be appropriate for the page
proc page {encname args} {
variable invalid
set encname [encname $encname]
@ -467,7 +471,7 @@ namespace eval punk::char {
variable charsets
set encname [encname $pagename]
set defaults [list\
-range {0 256}\
-range {0 255}\
-charset ""\
]
set opts [dict merge $defaults $args]
@ -510,6 +514,7 @@ namespace eval punk::char {
.= ascii |> .=> linelist -line trimleft |> .=* concat |> {regexp -all -inline {\S+} $data}
}
#review - use terminal to display actual supported DEC specials vs using dict at: punk::ansi::map_special_graphics which maps to known unicode equivalents
proc asciidict2 {} {
set d [dict create]
dict for {k v} [basedict_display] {

21
src/modules/punk/lib-999999.0a1.0.tm

@ -302,6 +302,27 @@ namespace eval punk::lib {
#[para]see [uri https://wiki.tcl-lang.org/page/K]
#[para]It is used in cases where command-substitution at the calling-point performs some desired effect.
proc is_utf8_multibyteprefix {bytes} {
#*** !doctools
#[call [fun is_utf8_multibyteprefix] [arg str]]
#[para] Returns a boolean if str is potentially a prefix for a multibyte utf-8 character
#[para] ie - tests if it is possible that appending more data will result in a utf-8 codepoint
#[para] Will return false for an already complete utf-8 codepoint
#[para] It is assumed the incomplete sequence is at the beginning of the bytes argument
#[para] Suitable input for this might be from the unreturned tail portion of get_utf8_leading $testbytes
#[para] e.g using: set head [lb]get_utf8_leading $testbytes[rb] ; set tail [lb]string range $testbytes [lb]string length $head[rb] end[rb]
regexp {(?x)
^
(?:
[\xC0-\xDF] | #possible prefix for two-byte codepoint
[\xE0-\xEF] [\x80-\xBF]{0,1} | #possible prefix for three-byte codepoint
[\xF0-\xF4] [\x80-\xBF]{0,2} #possible prefix for
)
$
} $bytes
}
proc is_utf8_first {str} {
regexp {(?x) # Expanded regexp syntax, so I can put in comments :-)
^

Loading…
Cancel
Save