Updated 2012-09-10 12:09:26 by dkf

Method used to encode general strings (containing arbitrary Unicode characters) as strings of characters allowed in internet domain names.

Anyone up for implementing this in Tcl?

Here is a TCL implementation. pi31415

punycode.tcl
# This is based on the following Python implementation.
# http://pydoc.org/get.cgi/usr/local/lib/python2.4/encodings/punycode.py
#
# Also done in C, Perl, PHP, and Javascript.
# http://www.ietf.org/rfc/rfc3492.txt
# http://cpansearch.perl.org/src/MIYAGAWA/IDNA-Punycode-0.02/lib/IDNA/Punycode.pm
# http://svn.php.net/viewvc/pear/packages/Net_IDNA/trunk/Net/IDNA.php?revision=300850&view=co
# https://gist.github.com/1035853

namespace eval ::puny {
    set digits "abcdefghijklmnopqrstuvwxyz0123456789"
}

proc ::puny::toChars {nums} {
    set chars [list]
    foreach {char} $nums {
        lappend chars [format "%c" $char]
    }
    return [join $chars ""]
}

# Encoding
proc ::puny::enumerate {sequence {start 0}} {
    set retval [list]
    set n $start
    foreach {elem} $sequence {
        lappend retval $n $elem
        incr n
    }
    return $retval
}

# 3.1 Basic code point segregation
proc ::puny::segregate {str} {
    set base [list]
    set extended [list]
    foreach {c} [split $str ""] {
        scan $c "%c" {char}
        if {$char < 128} {
            lappend base $char
        } else {
            lappend extended $c
        }
    }
    set extended [lsort $extended]
    return [list $base $extended]
}

# Return the length of str, considering only characters below max.
proc ::puny::selectiveLen {str max} {
    set res 0
    foreach c [split $str ""] {
        scan $c "%c" {char}
        if {$char < $max} {
            incr res
        }
    }
    return $res
}

# Return a pair (index, pos), indicating the next occurrence of
# char in str. index is the position of the character considering
# only ordinals up to and including char, and pos is the position in
# the full string. index/pos is the starting position in the full
# string.
proc ::puny::selectiveFind {str char index pos} {
    set l [string length $str]
    while {true} {
        incr pos
        if {$pos == $l} {
            return [list -1 -1]
        }
        set c [string index $str $pos]
        if {$c == $char} {
            incr index
            return [list $index $pos]
        } elseif {$c < $char} {
            incr index
        }
    }
}

# 3.2 Insertion unsort coding
proc ::puny::insertionUnsort {str extended} {
    set oldchar 128
    set result [list]
    set oldindex -1
    foreach {c} $extended {
        set index -1
        set pos -1
        scan $c "%c" {char}
        set curlen [::puny::selectiveLen $str $char]
        set delta [expr {($curlen + 1) * ($char - $oldchar)}]
        while {true} {
            lassign [::puny::selectiveFind $str $c $index $pos] {index} {pos}
            if {$index == -1} {
                break
            }
            set delta [expr {$delta + $index - $oldindex}]
            lappend result [expr {$delta - 1}]
            set oldindex $index
            set delta 0
       }
       set oldchar $char
    }
    return $result
}

# Punycode parameters: tmin = 1, tmax = 26, base = 36
proc ::puny::T {j bias} {
    set res [expr {36 * ($j + 1) - $bias}]
    if {$res < 1} {
        return 1
    }
    if {$res > 26} {
        return 26
    }
    return $res
}

# 3.3 Generalized variable-length integers
proc ::puny::generateGeneralizedInteger {N bias} {
    set result [list]
    set j 0
    while {true} {
        set t [::puny::T $j $bias]
        if {$N < $t} {
            lappend result [string index $::puny::digits $N]
            return $result
        }
        set pos [expr {$t + (($N - $t) % (36 - $t))}]
        lappend result [string index $::puny::digits $pos]
        set N [expr {int(($N - $t) / (36 - $t))}]
        incr j
    }
}

proc ::puny::adapt {delta first numchars} {
    if {$first} {
        set delta [expr {int($delta / 700)}]
    } else {
        set delta [expr {int($delta / 2)}]
    }
    incr delta [expr {int($delta / $numchars)}]
    # int((($base - $tmin) * $tmax) / 2) == 455
    set divisions 0
    while {$delta > 455} {
        # base - tmin
        set delta [expr {int($delta / 35)}]
        incr divisions 36
    }
    set bias [expr {$divisions + int(36 * $delta / ($delta + 38))}]
    return $bias
}

# 3.4 Bias adaptation
# Punycode parameters: initial bias = 72, damp = 700, skew = 38
proc ::puny::generateIntegers {baselen deltas} {
    set result [list]
    set bias 72
    foreach {points delta} [::puny::enumerate $deltas] {
        set s [::puny::generateGeneralizedInteger $delta $bias]
        set result [concat $result $s]
        set bias [::puny::adapt             \
            $delta                          \
            [expr {$points == 0}]           \
            [expr {$baselen + $points + 1}] \
        ]
    }
    return $result
}

proc ::puny::encode {text} {
    lassign [::puny::segregate $text] {base} {extended}
    set deltas [::puny::insertionUnsort $text $extended]
    set extended [::puny::generateIntegers [llength $base] $deltas]
    if {[llength $base]} {
        return [format "%s-%s" [::puny::toChars $base] [join $extended ""]]
    }
    return [join $extended ""]
}

# Decoding
proc ::puny::toNums {text} {
    set retval [list]
    foreach {c} [split $text ""] {
        lappend retval [scan $c "%c"]
    }
    return $retval
}

# 3.3 Generalized variable-length integers
proc ::puny::decodeGeneralizedNumber {extended extpos bias errors} {
    set result 0
    set w 1
    set j 0
    while {true} {
        set c [lindex $extended $extpos]
        if {[string length $c] == 0} {
            if {$errors == "strict"} {
                error "incomplete punicode string"
            }
            incr extpos
            return [list $extpos None]
        }
        scan $c "%c" {char}
        incr extpos
        if {65 <= $char && $char <= 90} {
            # A-Z
            set digit [expr {$char - 65}]
        } elseif {48 <= $char  && $char <= 57} {
            # 0x30-26
            set digit [expr {$char - 22}]
        } elseif {$errors == "strict"} {
            set pos [lindex $extended $extpos]
            error [format "Invalid extended code point '%s'" $pos]
        } else {
            return [list $extpos None]
        }
        set t [::puny::T $j $bias]
        set result [expr {$result + $digit * $w}]
        if {$digit < $t} {
            return [list $extpos $result]
        }
        set w [expr {$w * (36 - $t)}]
        incr j
    }
}

# 3.2 Insertion unsort coding
proc ::puny::insertionSort {base extended errors} {
    set char 128
    set pos -1
    set bias 72
    set extpos 0
    while {$extpos < [llength $extended]} {
        lassign [::puny::decodeGeneralizedNumber \
            $extended                              \
            $extpos                                \
            $bias                                  \
            $errors                                \
        ] {newpos} {delta}
        if {$delta == "None"} {
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return $base
        }
        set pos [expr {$pos + $delta + 1}]
        set char [expr {$char + int($pos / ([llength $base] + 1))}]
        if {$char > 1114111} {
            if {$errors == "strict"} {
                error [format "Invalid character U+%x" $char]
            }
            scan "?" "%c" {char}
        }
        set pos [expr {$pos % ([llength $base] + 1)}]
        set base [concat              \
            [lrange $base 0 "$pos-1"] \
            $char                     \
            [lrange $base $pos end]   \
        ]
        set bias [::puny::adapt   \
            $delta                \
            [expr {$extpos == 0}] \
            [llength $base]       \
        ]
        set extpos $newpos
    }
    return $base
}

proc ::puny::decode {text errors} {
    set pos [string last "-" $text]
    if {$pos == -1} {
        set base [list]
        set code [string toupper $text]
        set extended [split $code ""]
    } else {
        set base [::puny::toNums [string range $text 0 "$pos-1"]]
        set code [string toupper [string range $text "$pos+1" end]]
        set extended [split $code ""]
    }
    return [::puny::toChars [::puny::insertionSort $base $extended $errors]]
}

test.tcl
package require tcltest
source punycode.tcl

proc testme {str} {
    set code [::puny::encode $str]
    set text [::puny::decode $code ""]
    if {$str != $text} {
        error "Round-trip error"
    }
    return $code
}

::tcltest::test emptyinput "Empty input" \
    -body {testme ""} \
    -result ""

::tcltest::test singleunichar "Single unicode character" \
    -body {testme "ü"} \
    -result "tda"

::tcltest::test nounichar "No unicode character" \
    -body {testme "Goethe"} \
    -result "Goethe-"

::tcltest::test midleunichar "Unicode character in the middle" \
    -body {testme "Bücher"} \
    -result "Bcher-kva"

::tcltest::test indashes "Text within dashes" \
    -body {testme {-> $1.00 <-}} \
    -result {-> $1.00 <--}

DKF: Here's a working punycode encoder, converted from the code in the RFC:
namespace eval punycode {
    variable digits [split "abcdefghijklmnopqrstuvwxyz0123456789" ""]
    # Bootstring parameters for Punycode
    variable base 36
    variable tmin 1
    variable tmax 26
    variable skew 38
    variable damp 700
    variable initial_bias 72
    variable initial_n 0x80

    proc adapt {delta first numchars} {
        variable base
        variable tmin
        variable tmax
        variable damp
        variable skew

        set delta [expr {$delta / ($first ? $damp : 2)}]
        incr delta [expr {$delta / $numchars}]
        set k 0
        while {$delta > ($base - $tmin) * $tmax / 2} {
            set delta [expr {$delta / ($base-$tmin)}]
            incr k $base
        }
        return [expr {$k + ($base-$tmin+1) * $delta / ($delta+$skew)}]
    }

    # Main encode function
    proc encode {input {case ""}} {
        variable digits
        variable tmin
        variable tmax
        variable base
        variable initial_n
        variable initial_bias

        set in [split $input ""]
        set output {}

        # Initialize the state:
        set n $initial_n
        set delta 0
        set bias $initial_bias

        # Handle the basic code points:
        foreach ch $in {
            if {$ch < "\u0080"} {
                if {$case ne ""} {
                    if {$case} {
                        append output [string toupper $ch]
                    } else {
                        append output [string tolower $ch]
                    }
                } else {
                    append output $ch
                }
            }
        }

        set h [set b [string length $output]]

        # h is the number of code points that have been handled, b is the
        # number of basic code points.

        if {$b} {
            append output "-"
        }

        # Main encoding loop:

        while {$h < [llength $in]} {
            # All non-basic code points < n have been handled already.  Find
            # the next larger one:

            for {set m inf; set j 0} {$j < [llength $in]} {incr j} {
                scan [lindex $in $j] "%c" ch
                if {$ch >= $n && $ch < $m} {
                    set m $ch
                }
            }

            # Increase delta enough to advance the decoder's <n,i> state to
            # <m,0>, but guard against overflow:

            if {$m-$n > (0xffffffff-$delta)/($h+1)} {
                error "overflow in delta computation"
            }
            incr delta [expr {($m-$n) * ($h+1)}]
            set n $m

            for {set j 0} {$j < [llength $in]} {incr j} {
                scan [lindex $in $j] "%c" ch
                if {$ch < $n && ([incr delta] & 0xffffffff) == 0} {
                    error "overflow in delta computation"
                }

                if {$ch == $n} {
                    # Represent delta as a generalized variable-length
                    # integer:

                    for {set q $delta; set k $base} true {incr k $base} {
                        set t [expr {min(max($k-$bias,$tmin),$tmax)}]
                        if {$q < $t} break
                        append output \
                            [lindex $digits [expr {$t + ($q-$t)%($base-$t)}]]
                        set q [expr {($q-$t) / ($base-$t)}]
                    }

                    append output [lindex $digits $q]
                    set bias [adapt $delta [expr {$h==$b}] [expr {$h+1}]]
                    set delta 0
                    incr h
                }
            }

            incr delta
            incr n
        }

        return $output
    }
}