|
|
@ -9,7 +9,7 @@ |
|
|
|
# http://paste.tclers.tk/5977 |
|
|
|
# http://paste.tclers.tk/5977 |
|
|
|
# |
|
|
|
# |
|
|
|
# @@ Meta Begin |
|
|
|
# @@ Meta Begin |
|
|
|
# Application punk::cesu 0.1.0 |
|
|
|
# Application punk::cesu 0.1.1 |
|
|
|
# Meta platform tcl |
|
|
|
# Meta platform tcl |
|
|
|
# Meta license <unknown> |
|
|
|
# Meta license <unknown> |
|
|
|
# @@ Meta End |
|
|
|
# @@ Meta End |
|
|
@ -19,7 +19,7 @@ |
|
|
|
# doctools header |
|
|
|
# doctools header |
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
|
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ |
|
|
|
#*** !doctools |
|
|
|
#*** !doctools |
|
|
|
#[manpage_begin punkshell_module_punk::cesu 0 0.1.0] |
|
|
|
#[manpage_begin punkshell_module_punk::cesu 0 0.1.1] |
|
|
|
#[copyright "2024"] |
|
|
|
#[copyright "2024"] |
|
|
|
#[titledesc {CESU compatibility ehcoding scheme for utf-16: 8-Bit (CESU-8) ??}] [comment {-- Name section and table of contents description --}] |
|
|
|
#[titledesc {CESU compatibility ehcoding scheme for utf-16: 8-Bit (CESU-8) ??}] [comment {-- Name section and table of contents description --}] |
|
|
|
#[moddesc {CESU experimental}] [comment {-- Description at end of page heading --}] |
|
|
|
#[moddesc {CESU experimental}] [comment {-- Description at end of page heading --}] |
|
|
@ -52,9 +52,6 @@ package require Tcl 8.6- |
|
|
|
#*** !doctools |
|
|
|
#*** !doctools |
|
|
|
#[item] [package {Tcl 8.6}] |
|
|
|
#[item] [package {Tcl 8.6}] |
|
|
|
|
|
|
|
|
|
|
|
# #package require frobz |
|
|
|
|
|
|
|
# #*** !doctools |
|
|
|
|
|
|
|
# #[item] [package {frobz}] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#*** !doctools |
|
|
|
#*** !doctools |
|
|
|
#[list_end] |
|
|
|
#[list_end] |
|
|
@ -127,7 +124,7 @@ tcl::namespace::eval punk::cesu { |
|
|
|
binary scan $1 c 1 |
|
|
|
binary scan $1 c 1 |
|
|
|
binary scan $2 c 2 |
|
|
|
binary scan $2 c 2 |
|
|
|
binary scan $3 c 3 |
|
|
|
binary scan $3 c 3 |
|
|
|
puts [list $1 $2 $3] |
|
|
|
#puts [list $1 $2 $3] |
|
|
|
#binary scan $4 c 4 |
|
|
|
#binary scan $4 c 4 |
|
|
|
incr 1 ;#// Effectively adds 0x10000 to the codepoint ? |
|
|
|
incr 1 ;#// Effectively adds 0x10000 to the codepoint ? |
|
|
|
|
|
|
|
|
|
|
@ -180,23 +177,75 @@ tcl::namespace::eval punk::cesu { |
|
|
|
#note: from_surrogatestring \U1f600 returns a mouse (\U1f400) instead of smiley |
|
|
|
#note: from_surrogatestring \U1f600 returns a mouse (\U1f400) instead of smiley |
|
|
|
# but from_surrogatestring \U1f400 returns a mouse. |
|
|
|
# but from_surrogatestring \U1f400 returns a mouse. |
|
|
|
# Tcl bug - fixed some time in 9.x |
|
|
|
# Tcl bug - fixed some time in 9.x |
|
|
|
# surrogated_string shouldn't include non BMP chars anyway (G.I.G.O?) |
|
|
|
# surrogated_string theoretically shouldn't include non BMP chars anyway (but may in some contexts? mixed surrogate escapes and raw nonbmp?) |
|
|
|
lappend PUNKARGS [list { |
|
|
|
lappend PUNKARGS [list { |
|
|
|
@id -id ::punk::cesu::from_surrogatestring |
|
|
|
@id -id ::punk::cesu::from_surrogatestring |
|
|
|
@cmd -name punk::cesu::from_surrogatestring -help\ |
|
|
|
@cmd -name punk::cesu::from_surrogatestring -help\ |
|
|
|
"Convert a string containing surrogate pairs |
|
|
|
"Convert a string containing surrogate pairs |
|
|
|
to string with pairs converted to unicode non-BMP |
|
|
|
to Tcl string with pairs converted to unicode non-BMP |
|
|
|
characters" |
|
|
|
characters" |
|
|
|
@values |
|
|
|
@values |
|
|
|
surrogated_string -help\ |
|
|
|
surrogated_string -help\ |
|
|
|
"May contain a mix of surrogate pairs and other |
|
|
|
"May contain a mix of surrogate pairs and other |
|
|
|
characters - only the surrogate pairs will be converted." |
|
|
|
characters - only the surrogate pairs will be converted." |
|
|
|
}] |
|
|
|
}] |
|
|
|
proc from_surrogatestring {surrogated_string} { |
|
|
|
|
|
|
|
|
|
|
|
proc from_surrogatestring {str} { |
|
|
|
|
|
|
|
#high surrogate character rep followed by low surrogate character rep |
|
|
|
|
|
|
|
if {[regexp {[\uD800-\uDBFF][\uDC00-\uDFFF]} $str]} { |
|
|
|
|
|
|
|
set str [string map {\[ \\\[ \] \\\]} $str] ;#Make sure any existing commandlike structures aren't executed |
|
|
|
|
|
|
|
return [subst -novariables -nobackslashes [regsub -all {([\uD800-\uDBFF])([\uDC00-\uDFFF])} $str {[surrogatepair_to_codepoint \1 \2]} ]] |
|
|
|
|
|
|
|
} else { |
|
|
|
|
|
|
|
return $str |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
proc surrogatepair_to_codepoint {highchar lowchar} { |
|
|
|
|
|
|
|
if {[string length $highchar] != 1 || [string length $lowchar] !=1} { |
|
|
|
|
|
|
|
error "surrogatepair_to_codepoint expected surrogate pair encoded as 2 characters" |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#NOTE in tcl8 - we get oddity that 'split <surrogatepair> ""' returns a list of length 1 even though there are 2 chars |
|
|
|
|
|
|
|
#fixed in tcl9 |
|
|
|
|
|
|
|
#lassign [split $2_surrogate_chars ""] highSurrogateChar lowSurrogateChar |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scan $highchar %c highDecimal |
|
|
|
|
|
|
|
scan $lowchar %c lowDecimal |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
set highDecimal [expr {$highDecimal - 0xD800}] |
|
|
|
|
|
|
|
set lowDecimal [expr {$lowDecimal - 0xDC00}] |
|
|
|
|
|
|
|
# Combine the values and add 0x10000 to get the original code point |
|
|
|
|
|
|
|
set codepointDecimal [expr {($highDecimal << 10) + $lowDecimal + 0x10000}] |
|
|
|
|
|
|
|
#puts "->codepointDecimal $codepointDecimal" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#In tcl8 - we will get \uFFFD for non BMP codepoints - todo ? |
|
|
|
|
|
|
|
return [format %c $codepointDecimal] |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#e.g {\ud83d\ude00} |
|
|
|
|
|
|
|
proc escaped_surrogatepair_to_codepoint {spair} { |
|
|
|
|
|
|
|
set spair [string map {" " ""} $spair] |
|
|
|
|
|
|
|
if {[string length $spair] != 12} { |
|
|
|
|
|
|
|
error "escaped_surrogatepair_to_codepoint expected input of form \\uXXXX\\uXXXX" |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
set normalised [regsub -all {\\+u} $spair ""] |
|
|
|
|
|
|
|
set highSurrogate [string range $normalised 0 3] |
|
|
|
|
|
|
|
set lowSurrogate [string range $normalised 4 end] |
|
|
|
|
|
|
|
scan $highSurrogate %x highDecimal |
|
|
|
|
|
|
|
scan $lowSurrogate %x lowDecimal |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
set highDecimal [expr {$highDecimal - 0xD800}] |
|
|
|
|
|
|
|
set lowDecimal [expr {$lowDecimal - 0xDC00}] |
|
|
|
|
|
|
|
# Combine the values and add 0x10000 to get the original code point |
|
|
|
|
|
|
|
set codepointDecimal [expr {($highDecimal << 10) + $lowDecimal + 0x10000}] |
|
|
|
|
|
|
|
return [format %c $codepointDecimal] |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
proc from_surrogatestring_via_cesu {surrogated_string} { |
|
|
|
|
|
|
|
#we can do this without cesu (from_surrogatestring) |
|
|
|
set cesu [encoding convertto cesu-8 $surrogated_string] |
|
|
|
set cesu [encoding convertto cesu-8 $surrogated_string] |
|
|
|
set x [cesu2utf $cesu] |
|
|
|
set x [cesu2utf $cesu] |
|
|
|
encoding convertfrom utf-8 $x |
|
|
|
encoding convertfrom utf-8 $x |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
proc _to_test {emoji} { |
|
|
|
proc _to_test {emoji} { |
|
|
|
puts stderr "_to_test incomplete" |
|
|
|
puts stderr "_to_test incomplete" |
|
|
|
set cesu [encoding convertto cesu-8 $e] |
|
|
|
set cesu [encoding convertto cesu-8 $e] |
|
|
@ -384,7 +433,7 @@ tcl::namespace::eval punk::cesu { |
|
|
|
proc get_topic_Description {} { |
|
|
|
proc get_topic_Description {} { |
|
|
|
punk::args::lib::tstr [string trim { |
|
|
|
punk::args::lib::tstr [string trim { |
|
|
|
package punk::cesu |
|
|
|
package punk::cesu |
|
|
|
description to come.. |
|
|
|
cesu and surrogate-pair processing |
|
|
|
} \n] |
|
|
|
} \n] |
|
|
|
} |
|
|
|
} |
|
|
|
proc get_topic_License {} { |
|
|
|
proc get_topic_License {} { |
|
|
@ -406,7 +455,8 @@ tcl::namespace::eval punk::cesu { |
|
|
|
} |
|
|
|
} |
|
|
|
proc get_topic_custom-topic {} { |
|
|
|
proc get_topic_custom-topic {} { |
|
|
|
punk::args::lib::tstr -return string { |
|
|
|
punk::args::lib::tstr -return string { |
|
|
|
nothing to see here |
|
|
|
This library can be used for surrogate-pair handling. |
|
|
|
|
|
|
|
cesu utilities are used internally in from_surrogatestring |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
# ------------------------------------------------------------- |
|
|
|
# ------------------------------------------------------------- |
|
|
@ -450,7 +500,7 @@ namespace eval ::punk::args::register { |
|
|
|
package provide punk::cesu [tcl::namespace::eval punk::cesu { |
|
|
|
package provide punk::cesu [tcl::namespace::eval punk::cesu { |
|
|
|
variable pkg punk::cesu |
|
|
|
variable pkg punk::cesu |
|
|
|
variable version |
|
|
|
variable version |
|
|
|
set version 0.1.0 |
|
|
|
set version 0.1.1 |
|
|
|
}] |
|
|
|
}] |
|
|
|
return |
|
|
|
return |
|
|
|
|
|
|
|
|