Updated 2013-09-01 23:20:50 by RLE

-- Text to HTML (quick & dirty)

Sometimes, I receive some Word documents the text of which must be published in HTML. What a mess. So, I've made this little tool where you interactively can paste the text from Word and convert it to HTML.

  • Converts bullets to <ul>...</ul>
  • Converts numbers for <ol>...</ol>
  • Converts tabs to <table>...</table>

No nested lists or tables. Nothing sophisticated. Just quick'n'easy.
 package require Tk
 
 proc lines2html lines {
     set result ""
     foreach line [split [string trim $lines] \n] {
         if {[set line [string trim $line]] ne ""} then {
             append result \n <p> $line </p> \n
         }
     }
     string trim $result
 }
 
 proc getListIndices txt {
     regexp -inline -indices -all\
         [subst -nocommand -novariable\
              {(?:\n *[\u2022*-][\t ]+[^\n]*){2,}}]\
         $txt
 }
 
 proc list2html lst {
     set result <ul>
     foreach line [split [string trim $lst] \n] {
         append result \n <li>\
             [string trim [string range [string trim $line] 1 end]]\
             </li>
     }
     append result \n </ul>
 }
 
 proc getNumberingIndices txt {
     regexp -inline -indices -all\
         {(?:\n *[0-9]+[[:punct:]]?[\t ]+[^\t\n]*){2,}}\
         $txt
 }
 
 proc numbering2html nrs {
     set result <ol>
     foreach line [split [string trim $nrs] \n] {
         if {[catch {
             set nr [lindex $line 0]
             set l [string length $nr]
         }]} then {
             set l 0
         }
         append result \n <li>\
             [string trim [string range [string trim $line] $l end]]\
             </li>
     }
     append result \n </ol>
 }
 
 proc getTableIndices txt {
     regexp -inline -indices -all {(?:\n(?:[^\n\t]*\t)+[^\n\t]*)+} $txt
 }
 
 proc tab2html tab {
     set rows {}
     foreach row [split [string trim $tab] \n] {
         set cols {}
         foreach td [split $row \t] {
             lappend cols $td
         }
         lappend rows $cols
     }
     set maxCols 0
     foreach row $rows {
         if {[set l [llength $row]] > $maxCols} then {
             set maxCols $l
         }
     }
     set table {}
     foreach row $rows {
         while {[llength $row] < $maxCols} {
             lappend row {}
         }
         lappend table $row
     }
     set result <table>
     append result \n <tbody>
     foreach row $table {
         append result \n <tr>
         foreach td $row {
             append result \n <td> $td </td>
         }
         append result \n </tr>
     }
     append result \n </tbody> \n </table>
 }
 
 proc struct2list {txt args} {
     array set opt [concat {
         -indexfunc getNumberingIndices
         -transformfunc numbering2html
     } $args]
     set indices {-1}
     eval eval lappend indices [$opt(-indexfunc) $txt]
     lappend indices [string length $txt]
     set raw {}
     set lists {}
     foreach {from to} $indices {
         incr from
         lappend raw [string range $txt $from $to]
     }
     foreach {from to} [lrange $indices 1 end-1] {
         lappend lists [$opt(-transformfunc) [string range $txt $from $to]]
     }
     set result {}
     foreach r $raw l $lists {
         lappend result $r $l
     }
     lrange $result 0 end-1
 }
 
 proc txt2html txt {
     set result ""
     foreach {raw1 ol} [struct2list $txt\
                           -indexfunc getNumberingIndices\
                           -transformfunc numbering2html] {
         foreach {raw2 ul} [struct2list $raw1\
                                -indexfunc getListIndices\
                                -transformfunc list2html] {
             foreach {raw3 table} [struct2list $raw2\
                                       -indexfunc getTableIndices\
                                       -transformfunc tab2html] {
                 append result \n\
                     [lines2html $raw3]\
                     \n \n $table \n
             }
             append result \n $ul \n
         }
         append result \n $ol \n
     }
     string trim $result
 }
 
 proc widget2html textwidget {
     $textwidget configure -autoseparator 0
     $textwidget edit separator
     if {[$textwidget tag ranges sel] eq {}} then {
         set result [txt2html [$textwidget get 1.0 end]]
         $textwidget delete 1.0 end
         $textwidget insert 1.0 $result
     } else {
         set from [lindex [$textwidget tag ranges sel] 0]
         set to [lindex [$textwidget tag ranges sel] end]
         set result [txt2html [$textwidget get $from $to]]
         $textwidget delete $from $to
         $textwidget mark set insert $from
         $textwidget insert insert $result
         $textwidget tag add sel $from insert
         $textwidget mark set insert $from
         $textwidget see insert
     }
     $textwidget configure -autoseparator 1
 }
 
 grid\
     [text .t -wrap none\
          -yscrollcommand [list .v set]\
          -xscrollcommand [list .h set]\
          -undo yes]\
     [scrollbar .v\
          -command [list .t yview]]\
     -sticky news
 
 grid\
     [scrollbar .h\
          -orient horizontal\
          -command [list .t xview]]\
     -sticky news
 
 grid rowconfigure . 0 -weight 1
 grid columnconfigure . 0 -weight 1
 
 . configure -menu [menu .m]
 
 .m add cascade\
     -label File\
     -menu [menu .m.file]
 
 .m.file add command\
     -label Exit\
     -command exit
 
 .m add cascade\
     -label Edit\
     -menu [menu .m.edit]
 
 .m add cascade\
     -label View\
     -menu [menu .m.view]
 
 .m.view add command\
     -label {No wrap}\
     -command [list .t configure -wrap none]
 
 .m.view add command\
     -label {Word wrap}\
     -command [list .t configure -wrap word]
 
 .m.view add command\
     -label {Char wrap}\
     -command [list .t configure -wrap char]
 
 .m add cascade\
     -label ?\
     -menu [menu .m.help]
 
 .m.help add command\
     -label Index\
     -command [list tk_messageBox -icon info -message {text2html
 When pasted from Word into this widget,
 tables are made with tabs, and
 lists are made with leading dashes.
 This widget recognises these characters
 and replaces the text by appropriate HTML source.}]
 
 if {$tcl_platform(platform) eq "windows"} then {
     .m add cascade\
         -label System\
         -menu [menu .m.system -tearoff no]
     .m.system add command\
        -label Console\
        -command [list console show]
 }
 
 
 bind .t <3> [list tk_popup .m.edit %X %Y]
 
 .m.edit add command\
     -label {Select all}\
     -command [list .t tag add sel 1.0 end-1chars]
 .m.edit add separator
 .m.edit add command\
     -label cut\
     -command [list event generate .t <<Cut>>]
 .m.edit add command\
     -label Copy\
     -command [list event generate .t <<Copy>>]
 .m.edit add command\
     -label Paste\
     -command [list event generate .t <<Paste>>]
 .m.edit add command\
     -label Undo\
     -command [list event generate .t <<Undo>>]
 .m.edit add command\
     -label Redo\
     -command [list event generate .t <<Redo>>]
 .m.edit add separator
 .m.edit add command\
     -label {Transform text to HTML}\
     -command [list widget2html .t]