- revert writing direction to right-to-left
- don't do this on numbers, though
- depending on context (preceding/succeeding letter), for each "abstract" letter select one of two or four shape variants (glyphs) which are happily also in the Unicode, page FE)
- substitute punctuations (parentheses go the other way round!)
- optionally substitute "Euro-arabic" digits with "Indo-arabics"
- line-breaks need much more treatment, otherwise Arabic text seems to run bottom-up!
- optionally (if the font allows), use multi-letter ligatures
proc buckwalter2uc {s} { # The Buckwalter transliteration is used by Xerox, ArabTex, and documented at # http://www.cis.upenn.edu/~cis639/arabic/info/buckwalter-about.html array set tbl { ' \u0621 | \u0622 > \u0623 & \u0624 < \u0625 \} \u0626 A \u0627 b \u0628 p \u0629 t \u062A v \u062B j \u062C H \u062D x \u062E d \u062F * \u0630 r \u0631 z \u0632 s \u0633 $ \u0634 S \u0635 D \u0636 T \u0637 Z \u0638 E \u0639 g \u063A _ \u0640 f \u0641 q \u0642 k \u0643 l \u0644 m \u0645 n \u0646 h \u0647 w \u0648 Y \u0649 y \u064A F \u064B N \u064C K \u064D a \u064e u \u064F i \u0650 ~ \u0651 o \u0652 ` \u0670 \{ \u0671 c \u0634 V \u0630 } ;# Abdullah Al-Zaid: c, V set res "" foreach i [split $s ""] { if [info exists tbl($i)] {append res $tbl($i)} else {append res $i} } set res } proc string:revert s { set res "" foreach i [split $s ""] { if {$i==")"} {set i (} elseif {$i=="("} {set i )} set res $i$res } set res }This proc does the real job, especially the glyph selection. This is implemented as as series of regsubs. An abstract character offers its connectivities by commas to left and right. Sequences of two commas after substitution denote the fact that the two surrounding characters are connected. Glyphs are substituted in the order ,,x,, ,,x x,, x. Finally, the commas are removed.
proc uc:arabchar2glyph {s {arnum ""}} { set s2 [list] foreach i [eval list [string:revert $s]] { if [regexp {[0-9\u0660-\u0669][- .,0-9\u0660-\u0669]*[0-9\u0660-\u0669]} $i] { lappend s2 [string:revert $i] } else { lappend s2 $i } } ;# modified: handle more than one number per string correctly if [string length $arnum] { foreach {i j} { 0 \u0660 1 \u0661 2 \u0662 3 \u0663 4 \u0664 5 \u0665 6 \u0666 7 \u0667 8 \u0668 9 \u0669 } { regsub -all "\[$i\]" $s2 $j s2 } ;# optional: indo-arabic digits } foreach {i j} { , \u066C ? \u061F} { regsub -all "\[$i\]" $s2 $j s2 } ;# special characters foreach i { \u0622 \u0623 \u0624 \u0625 \u0627 \u0629 \u062F \u0630 \u0631 \u0632 \u0648 \u0649 } { regsub -all $i $s2 $i, s2 } ;# joining right only foreach i { \u0626 \u0628 \u062A \u062B \u062C \u062D \u062E \u0633 \u0634 \u0635 \u0636 \u0637 \u0638 \u0639 \u063A \u0640 \u0641 \u0642 \u0643 \u0644 \u0645 \u0646 \u0647 \u064A \u064B \u064C \u064D \u064E \u064F \u0650 \u0651 \u0652 \u0670 \u0671 } { regsub -all $i $s2 ,$i, s2 } ;# joining both sides foreach {i j} { \u0622,,\u0644,, \uFEF6,, \u0622,,\u0644 \uFEF5 \u0622,, \uFE82 \u0622 \uFE81 \u0623,,\u0644,, \uFEF8,, \u0623,,\u0644 \uFEF7 \u0623,, \uFE84 \u0623 \uFE83 \u0624,, \uFE86 \u0624 \uFE85 \u0625,,\u0644,, \uFEFA,, \u0625,,\u0644 \uFEF9 \u0625,, \uFE88 \u0625 \uFE87 ,,\u0626,, ,,\uFE8C,, \u0626,, \uFE8A,, ,,\u0626 ,,\uFE8B \u0626 \uFE89 \u0627,,\u0644,, \uFEFC,, \u0627,,\u0644 \uFEFB \u0627,, \uFE8E,, \u0627 \uFE8D ,,\u0628,, ,,\uFE92,, \u0628,, \uFE90,, ,,\u0628 ,,\uFE91 \u0628 \uFE8F \u0629,, \uFE94,, \u0629 \uFE93 ,,\u062A,, ,,\uFE98,, \u062A,, \uFE96,, ,,\u062A ,,\uFE97 \u062A \uFE95 ,,\u062B,, ,,\uFE9C,, \u062B,, \uFE9A,, ,,\u062B ,,\uFE9B \u062B \uFE99 ,,\u062C,, ,,\uFEA0,, \u062C,, \uFE9E,, ,,\u062C ,,\uFE9F \u062C \uFE9D ,,\u062D,, ,,\uFEA4,, \u062D,, \uFEA2,, ,,\u062D ,,\uFEA3 \u062D \uFEA1 ,,\u062E,, ,,\uFEA8,, \u062E,, \uFEA6,, ,,\u062E ,,\uFEA7 \u062E \uFEA5 \u062F,, \uFEAA,, \u062F \uFEA9 \u0630,, \uFEAC,, \u0630 \uFEAB \u0631,, \uFEAE,, \u0631 \uFEAD \u0632,, \uFEB0,, \u0632 \uFEAF ,,\u0633,, ,,\uFEB4,, \u0633,, \uFEB2,, ,,\u0633 ,,\uFEB3 \u0633 \uFEB1 ,,\u0634,, ,,\uFEB8,, \u0634,, \uFEB6,, ,,\u0634 ,,\uFEB7 \u0634 \uFEB5 ,,\u0635,, ,,\uFEBC,, \u0635,, \uFEBA,, ,,\u0635 ,,\uFEBB \u0635 \uFEB9 ,,\u0636,, ,,\uFEC0,, \u0636,, \uFEBE,, ,,\u0636 ,,\uFEBF \u0636 \uFEBD ,,\u0637,, ,,\uFEC4,, \u0637,, \uFEC2,, ,,\u0637 ,,\uFEC3 \u0637 \uFEC1 ,,\u0638,, ,,\uFEC8,, \u0638,, \uFEC6,, ,,\u0638 ,,\uFEC7 \u0638 \uFEC5 ,,\u0639,, ,,\uFECC,, \u0639,, \uFECA,, ,,\u0639 ,,\uFECB \u0639 \uFEC9 ,,\u063A,, ,,\uFED0,, \u063A,, \uFECE,, ,,\u063A ,,\uFECF \u063A \uFECD ,,\u0641,, ,,\uFED4,, \u0641,, \uFED2,, ,,\u0641 ,,\uFED3 \u0641 \uFED1 ,,\u0642,, ,,\uFED8,, \u0642,, \uFED6,, ,,\u0642 ,,\uFED7 \u0642 \uFED5 ,,\u0643,, ,,\uFEDC,, \u0643,, \uFEDA,, ,,\u0643 ,,\uFEDB \u0643 \uFED9 ,,\u0644,, ,,\uFEE0,, \u0644,, \uFEDE,, ,,\u0644 ,,\uFEDF \u0644 \uFEDD ,,\u0645,, ,,\uFEE4,, \u0645,, \uFEE2,, ,,\u0645 ,,\uFEE3 \u0645 \uFEE1 ,,\u0646,, ,,\uFEE8,, \u0646,, \uFEE6,, ,,\u0646 ,,\uFEE7 \u0646 \uFEE5 ,,\u0647,, ,,\uFEEC,, \u0647,, \uFEEA,, ,,\u0647 ,,\uFEEB \u0647 \uFEE9 \u0648,, \uFEEE,, \u0648 \uFEED \u0649,, \uFEF0,, \u0649 \uFEEF ,,\u064A,, ,,\uFEF4,, \u064A,, \uFEF2,, ,,\u064A ,,\uFEF3 \u064A \uFEF1 } { if [regsub -all $i $s2 $j s2] { #text:add $s2\n } } regsub -all , $s2 "" res set res } proc ar:ligatures s { # input: a rendered Arab Unicode string (context forms) # applies those optional ligatures contained in Bitstream Cyberbit foreach {from to} { \uFEA4\uFEE4\uFEDF \uFD88 \uFEEA\uFEE0\uFEDF \uFDF2 \uFEE2\uFE91 \uFC08 \uFEE2\uFE97 \uFC0E \uFEF2\uFED3 \uFC32 \uFE9E\uFEDF \uFC3F \uFEA2\uFEDF \uFC40 \uFEA6\uFEDF \uFC41 \uFEE2\uFEDF \uFC42 \uFEF0\uFEDF \uFC43 \uFEF2\uFEDF \uFC44 \uFEE2\uFEE7 \uFC4E \uFEAE\uFE92 \uFC6A \uFEE6\uFE92 \uFC6D \uFEF2\uFE92 \uFC6F \uFEAE\uFE98 \uFC70 \uFEE6\uFE98 \uFC73 \uFEF2\uFE98 \uFC75 \uFEF2\uFEE8 \uFC8F \uFEAE\uFEF4 \uFC91 \uFEE6\uFEF4 \uFC94 \uFEA0\uFE91 \uFC9C \uFEA4\uFE91 \uFC9D \uFEA8\uFE91 \uFC9E \uFEE4\uFE91 \uFC9F \uFEA0\uFE97 \uFCA1 \uFEA4\uFE97 \uFCA2 \uFEA8\uFE97 \uFCA3 \uFEE4\uFE97 \uFCA4 \uFEE4\uFE9B \uFCA6 \uFEE4\uFE9F \uFCA8 \uFEE4\uFEA3 \uFCAA \uFEE4\uFEA7 \uFCAC \uFEE4\uFEB3 \uFCB0 \uFEA0\uFEDF \uFCC9 \uFEA4\uFEDF \uFCCA \uFEA8\uFEDF \uFCCB \uFEE4\uFEDF \uFCCC \uFEEC\uFEDF \uFCCD \uFEA0\uFEE3 \uFCCE \uFEA4\uFEE3 \uFCCF \uFEA8\uFEE3 \uFCD0 \uFEE4\uFEE3 \uFCD1 \uFEA0\uFEE7 \uFCD2 \uFEA4\uFEE7 \uFCD3 \uFEA8\uFEE7 \uFCD4 \uFEE4\uFEE7 \uFCD5 \uFEA0\uFEF3 \uFCDA \uFEA4\uFEF3 \uFCDB \uFEA8\uFEF3 \uFCDC \uFEE4\uFEF3 \uFCDD \uFEE4\uFEB7 \uFD30 } { regsub -all $from $s $to s } set s }And finally, a nice wrapper that supplies a default test text, and handles the -digits switch for Indo-arabic numbers, and -lig for ligatures:
proc arblish args { set convertdigits "" set ligatures 0 if {$args==""} {set args "bsm Allh AlrHmn AlrHym"} if [regsub -- -dig(its)? $args "" args] {set convertdigits -digits} if [regsub -- -lig $args "" args] {incr ligatures} set res [uc:arabchar2glyph [buckwalter2uc $args] $convertdigits] if $ligatures {set res [ar:ligatures $res]} set res }... and an even briefer wrapper around that wrapper:
proc ar args {eval arblish $args}Now you're set for an Arblish example. Read that file into a string, subst it, and voila!
54293 % arblish "bsm Allh AlrHmn AlrHym" ﺊﻤﻴﺣﺮﻟﺍ ﻦﻤﺣﺮﻟﺍ ﻪﻠﻟﺍ ﻢﺴﺒ
This is the Tcl Wiki, but I have to point out that Roman Czyborra has a nice renderer in Perl at http://czyborra.com/arabjoin/arabjoinVK 23-mar-2005 I very doubt this conforms to Unicode Standard Annex #9 - The Bidirectional Algorithm at [1], I very doubt it does things right. - RS: It's an approximation. The multiple nesting discussed in the Bidi algorithm isn't provided. But concrete bug reports are always welcome :)See also: