@ -9,7 +9,7 @@
# http://paste.tclers.tk/5977
#
# @@ Meta Begin
# Application punk::cesu 0.1.0
# Application punk::cesu 0.1.1
# Meta platform tcl
# Meta license <unknown>
# @@ Meta End
@ -19,14 +19,14 @@
# doctools header
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[manpage_begin punkshell_module_punk::cesu 0 0.1.0 ]
#[manpage_begin punkshell_module_punk::cesu 0 0.1.1 ]
#[copyright "2024"]
#[titledesc {CESU compatibility ehcoding scheme for utf-16: 8-Bit (CESU-8) ??}] [comment {-- Name section and table of contents description --}]
#[moddesc {CESU experimental}] [comment {-- Description at end of page heading --}]
#[moddesc {CESU experimental}] [comment {-- Description at end of page heading --}]
#[require punk::cesu]
#[keywords module cesu encoding compatibility experimental unofficial]
#[description]
#[para] experimental
#[para] experimental
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
@ -34,8 +34,8 @@
#[section Overview]
#[para] overview of punk::cesu
#[subsection Concepts]
#[para] cesu-8 may be mistaken for utf-8 if no supplementary chars present.
#[para] see: https://www.unicode.org/reports/tr26/tr26-4.html
#[para] cesu-8 may be mistaken for utf-8 if no supplementary chars present.
#[para] see: https://www.unicode.org/reports/tr26/tr26-4.html
#[para] Particulary note discouragement of use especially in external interchange.
@ -52,9 +52,6 @@ package require Tcl 8.6-
#*** !doctools
#[item] [package {Tcl 8.6}]
# #package require frobz
# #*** !doctools
# #[item] [package {frobz}]
#*** !doctools
#[list_end]
@ -70,11 +67,11 @@ package require Tcl 8.6-
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
tcl::namespace::eval punk::cesu {
tcl::namespace::export {[a-z]*} ;# Convention: export all lowercase
variable PUNKARGS
variable PUNKARGS
#*** !doctools
#[subsection {Namespace punk::cesu}]
#[para] Core API functions for punk::cesu
#[para] Core API functions for punk::cesu
#[list_begin definitions]
@ -127,7 +124,7 @@ tcl::namespace::eval punk::cesu {
binary scan $1 c 1
binary scan $2 c 2
binary scan $3 c 3
puts [list $1 $2 $3]
# puts [list $1 $2 $3]
#binary scan $4 c 4
incr 1 ;#// Effectively adds 0x10000 to the codepoint ?
@ -155,7 +152,7 @@ tcl::namespace::eval punk::cesu {
[expr {0x80 | (($1 & 0x3) << 4) | (($2 & 0x3C) >> 2)}] \
[expr {0x80 | (($2 & 0x3) << 4) | ($3 & 0xF)}] \
$4]
} else {
puts "Invalid sequence: $char"
return $char
@ -177,26 +174,78 @@ tcl::namespace::eval punk::cesu {
#e.g from_surrogatestring "note \ud83f\udd1e etc"
#e.g from_surrogatestring "faces \ud83d\ude10 \ud83d\ude21 \ud83d\ude31"
#note: from_surrogatestring \U1f600 returns a mouse (\U1f400) instead of smiley
#note: from_surrogatestring \U1f600 returns a mouse (\U1f400) instead of smiley
# but from_surrogatestring \U1f400 returns a mouse.
# Tcl bug - fixed some time in 9.x
# surrogated_string shouldn't include non BMP chars anyway (G.I.G.O ?)
# surrogated_string theoretically shouldn't include non BMP chars anyway (but may in some contexts? mixed surrogate escapes and raw nonbmp ?)
lappend PUNKARGS [list {
@id -id ::punk::cesu::from_surrogatestring
@cmd -name punk::cesu::from_surrogatestring -help\
"Convert a string containing surrogate pairs
to string with pairs converted to unicode non-BMP
to Tcl string with pairs converted to unicode non-BMP
characters"
@values
@values
surrogated_string -help\
"May contain a mix of surrogate pairs and other
characters - only the surrogate pairs will be converted."
}]
proc from_surrogatestring {surrogated_string} {
proc from_surrogatestring {str} {
#high surrogate character rep followed by low surrogate character rep
if {[regexp {[\uD800-\uDBFF][\uDC00-\uDFFF]} $str]} {
set str [string map {\[ \\\[ \] \\\]} $str] ;#Make sure any existing commandlike structures aren't executed
return [subst -novariables -nobackslashes [regsub -all {([\uD800-\uDBFF])([\uDC00-\uDFFF])} $str {[surrogatepair_to_codepoint \1 \2]} ]]
} else {
return $str
}
}
proc surrogatepair_to_codepoint {highchar lowchar} {
if {[string length $highchar] != 1 || [string length $lowchar] !=1} {
error "surrogatepair_to_codepoint expected surrogate pair encoded as 2 characters"
}
#NOTE in tcl8 - we get oddity that 'split <surrogatepair> ""' returns a list of length 1 even though there are 2 chars
#fixed in tcl9
#lassign [split $2_surrogate_chars ""] highSurrogateChar lowSurrogateChar
scan $highchar %c highDecimal
scan $lowchar %c lowDecimal
set highDecimal [expr {$highDecimal - 0xD800}]
set lowDecimal [expr {$lowDecimal - 0xDC00}]
# Combine the values and add 0x10000 to get the original code point
set codepointDecimal [expr {($highDecimal << 10) + $lowDecimal + 0x10000}]
#puts "->codepointDecimal $codepointDecimal"
#In tcl8 - we will get \uFFFD for non BMP codepoints - todo ?
return [format %c $codepointDecimal]
}
#e.g {\ud83d\ude00}
proc escaped_surrogatepair_to_codepoint {spair} {
set spair [string map {" " ""} $spair]
if {[string length $spair] != 12} {
error "escaped_surrogatepair_to_codepoint expected input of form \\uXXXX\\uXXXX"
}
set normalised [regsub -all {\\+u} $spair ""]
set highSurrogate [string range $normalised 0 3]
set lowSurrogate [string range $normalised 4 end]
scan $highSurrogate %x highDecimal
scan $lowSurrogate %x lowDecimal
set highDecimal [expr {$highDecimal - 0xD800}]
set lowDecimal [expr {$lowDecimal - 0xDC00}]
# Combine the values and add 0x10000 to get the original code point
set codepointDecimal [expr {($highDecimal << 10) + $lowDecimal + 0x10000}]
return [format %c $codepointDecimal]
}
proc from_surrogatestring_via_cesu {surrogated_string} {
#we can do this without cesu (from_surrogatestring)
set cesu [encoding convertto cesu-8 $surrogated_string]
set x [cesu2utf $cesu]
encoding convertfrom utf-8 $x
}
proc _to_test {emoji} {
puts stderr "_to_test incomplete"
set cesu [encoding convertto cesu-8 $e]
@ -209,7 +258,7 @@ tcl::namespace::eval punk::cesu {
-format -default escape -choices {raw escape} -choicelabels {
raw\
" emit raw surrogate pairs
may not be writable to
may not be writable to
output channels"
escape\
" emit unprocessed backslash hex
@ -224,7 +273,7 @@ tcl::namespace::eval punk::cesu {
e.g
>to_surrogatestring -format escape \"mouse: \\U1f400\"
mouse: \\uD83D\\uDC00
"
"
}]
proc to_surrogatestring {args} {
set argd [punk::args::parse $args withid ::punk::cesu::to_surrogatestring]
@ -273,14 +322,14 @@ tcl::namespace::eval punk::cesu {
#set lsbinfo [punk::char::char_info_dec $lsbfinal -fields all -except testwidth] ;#don't use all/testwidth will try to emit the char and fail/show error
set esc "\\u$msbhex\\u$lsbhex"
set raw [format %c $msbfinal][format %c $lsbfinal]
return [dict create escapes $esc msbdec $msbfinal msbhex $msbhex lsbdec $lsbfinal lsbhex $lsbhex raw $raw]
return [dict create escapes $esc msbdec $msbfinal msbhex $msbhex lsbdec $lsbfinal lsbhex $lsbhex raw $raw]
}
#
#test_enc_equivalency \U1f400 \U1f600
proc test_enc_equivalency {c1 c2} {
package require punk::ansi
namespace import ::punk::ansi::a+ ::punk::ansi::a
namespace import ::punk::ansi::a+ ::punk::ansi::a
foreach enc [lsort [encoding names]] {
puts stdout "testing $enc"
if {$enc in "iso2022 iso2022-jp iso2022-kr"} {
@ -315,14 +364,14 @@ tcl::namespace::eval punk::cesu::lib {
tcl::namespace::path [tcl::namespace::parent]
#*** !doctools
#[subsection {Namespace punk::cesu::lib}]
#[para] Secondary functions that are part of the API
#[para] Secondary functions that are part of the API
#[list_begin definitions]
#proc utility1 {p1 args} {
# #*** !doctools
# #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]]
# #[para]Description of utility1
# return 1
# #[para]Description of utility1
# return 1
#}
@ -340,15 +389,15 @@ tcl::namespace::eval punk::cesu::lib {
#tcl::namespace::eval punk::cesu::system {
#*** !doctools
#[subsection {Namespace punk::cesu::system}]
#[para] Internal functions that are not part of the API
#[para] Internal functions that are not part of the API
#}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# == === === === === === === === === === === === === === ===
# == === === === === === === === === === === === === === ===
# Sample 'about' function with punk::args documentation
# == === === === === === === === === === === === === === ===
# == === === === === === === === === === === === === === ===
tcl::namespace::eval punk::cesu {
tcl::namespace::export {[a-z]*} ;# Convention: export all lowercase
variable PUNKARGS
@ -371,7 +420,7 @@ tcl::namespace::eval punk::cesu {
set about_topics [list]
foreach f $topic_funs {
set tail [namespace tail $f]
lappend about_topics [string range $tail [string length get_topic_] end]
lappend about_topics [string range $tail [string length get_topic_] end]
}
#Adjust this function or 'default_topics' if a different order is required
return [lsort $about_topics]
@ -379,12 +428,12 @@ tcl::namespace::eval punk::cesu {
proc default_topics {} {return [list Description *]}
# -------------------------------------------------------------
# get_topic_ functions add more to auto-include in about topics
# get_topic_ functions add more to auto-include in about topics
# -------------------------------------------------------------
proc get_topic_Description {} {
punk::args::lib::tstr [string trim {
punk::args::lib::tstr [string trim {
package punk::cesu
description to come..
cesu and surrogate-pair processing
} \n]
}
proc get_topic_License {} {
@ -406,7 +455,8 @@ tcl::namespace::eval punk::cesu {
}
proc get_topic_custom-topic {} {
punk::args::lib::tstr -return string {
nothing to see here
This library can be used for surrogate-pair handling.
cesu utilities are used internally in from_surrogatestring
}
}
# -------------------------------------------------------------
@ -415,9 +465,9 @@ tcl::namespace::eval punk::cesu {
# we re-use the argument definition from punk::args::standard_about and override some items
set overrides [dict create]
dict set overrides @id -id "::punk::cesu::about"
dict set overrides @cmd -name "punk::cesu::about"
dict set overrides @cmd -name "punk::cesu::about"
dict set overrides @cmd -help [string trim [punk::args::lib::tstr {
About punk::cesu
About punk::cesu
}] \n]
dict set overrides topic -choices [list {*}[punk::cesu::argdoc::about_topics] *]
dict set overrides topic -choicerestricted 1
@ -433,7 +483,7 @@ tcl::namespace::eval punk::cesu {
}
}
# end of sample 'about' function
# == === === === === === === === === === === === === === ===
# == === === === === === === === === === === === === === ===
# -----------------------------------------------------------------------------
@ -446,11 +496,11 @@ namespace eval ::punk::args::register {
lappend ::punk::args::register::NAMESPACES ::punk::cesu
}
# -----------------------------------------------------------------------------
## Ready
## Ready
package provide punk::cesu [tcl::namespace::eval punk::cesu {
variable pkg punk::cesu
variable version
set version 0.1.0
set version 0.1.1
}]
return