Updated 2012-01-19 03:01:57 by RLE

George Peter Staplin: Jan 29, 2005 - As an experiment I decided to try to build a prototype for a Tcl VM that uses threaded code. Threaded code is an interesting concept that is fairly well documented on the web. Threaded code has been used in Ken Thompson's original B language, and Charles Moore's Forth.

This prototype converts a series of bytecodes into addresses. The addresses are then used to jump around in the threaded code.

Note: this is not related to POSIX threads, or concurrent programming techniques.

Tcl_Types.h
 #define TCL_TYPE_STRING 2
 #define TCL_TYPE_INT 4
 #define TCL_TYPE_DOUBLE 8

main.c
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 #include "Tcl_Types.h"

 /* We define an operator table that is used with the symbols for bytecodes to lookup code addresses.
  */
 struct {
  void *code;
 } ops[3];

 /* These are our bytecodes: */
 enum {
  TCL_ADD,
  TCL_PUSH,
  TCL_RETURN
 };

 /* These must match the .equ in vm.S */
 typedef struct {
  int type;
  int ref; /* 4 */
  int i; /* 8 */
  char *s; /* 12 */
 } Tcl_Obj;

 /* This is our execution token.  It stores a linear series of addresses. */
 typedef struct {
  caddr_t *start;
  /* Our cursor is the current position after the last address was appended. */
  caddr_t *cursor;
  size_t remaining;
  size_t total;
 } Tcl_Xt;

 extern void Tcl_InitOperators ();
 extern Tcl_Obj *Tcl_Run ();
 extern void Tcl_SetStringObj (Tcl_Obj *obj, char *str);

 /* bytecode to address */
 #define BCTOA(bc) ops[bc].code

 /* append code to xt */
 #define ACTOXT(xt,sym) \
  *(xt->cursor) = sym; \
  xt->cursor += 1

 void *Tcl_Alloc (size_t s) {
  void *r = malloc (s);

  if (NULL == r) {
   perror ("malloc");
   exit (EXIT_FAILURE);
  }
  return r;
 }

 void Tcl_Free (void *p) {
  free (p);
 }

 void *Tcl_Realloc (void *old, size_t s) {
  void *new;

  new = realloc (old, s);

  if (NULL == new) {
   perror ("realloc");
   exit (EXIT_FAILURE);
  }

  return new;
 }

 void Tcl_EnlargeXt (Tcl_Xt *xt) {
  size_t new_size;
  size_t used;
  void *p;

  new_size = (xt->total * 2);

  used = (xt->remaining - xt->total);

  p = Tcl_Realloc (xt->start, new_size);

  xt->remaining = (new_size - used);
  xt->total = new_size;
  xt->cursor = (p + (xt->cursor - xt->start));
  xt->start = p;
 }

 void Tcl_CompileExampleAdd (Tcl_Obj *objv[], Tcl_Xt *xt) {

  if ((sizeof (void *) * 7) < xt->remaining) {
   Tcl_EnlargeXt (xt);
  }

  ACTOXT (xt, BCTOA (TCL_PUSH));
  ACTOXT (xt, (caddr_t)objv[0]);
  ACTOXT (xt, BCTOA (TCL_PUSH));
  ACTOXT (xt, (caddr_t)objv[1]);
  ACTOXT (xt, BCTOA (TCL_ADD));
  ACTOXT (xt, BCTOA (TCL_RETURN));

  xt->remaining -= (sizeof (void *) * 7);
 }

 Tcl_Obj *Tcl_NewObj (void) {
  Tcl_Obj *obj = Tcl_Alloc (sizeof (Tcl_Obj));

  obj->type = TCL_TYPE_STRING;
  obj->ref = 0;

  return obj;
 }

 Tcl_Xt *Tcl_NewXt (void) {
  Tcl_Xt *xt = Tcl_Alloc (sizeof (Tcl_Xt));

  xt->total = xt->remaining = 2000;
  xt->cursor = xt->start = Tcl_Alloc (xt->remaining);

  return xt;
 }

 void Tcl_SetStringObj (Tcl_Obj *obj, char *str) {
  obj->type = TCL_TYPE_STRING;
  obj->s = str;
 }

 extern void test_convert (Tcl_Obj *obj);

 int main () {
  Tcl_Obj **objv;
  Tcl_Obj *result;
  Tcl_Xt *xt;

  Tcl_InitOperators (
   &(ops[TCL_ADD].code),
   &(ops[TCL_PUSH].code),
   &(ops[TCL_RETURN].code));

  printf ("add_op code is %p\n", ops[TCL_ADD].code);
  printf ("push_op code is %p\n", ops[TCL_PUSH].code);
  printf ("return code is %p\n", ops[TCL_RETURN].code);

  objv = Tcl_Alloc (sizeof (Tcl_Obj *) * 2);
  objv[0] = Tcl_NewObj ();
  objv[1] = Tcl_NewObj ();

  Tcl_SetStringObj (objv[0], "255");
  Tcl_SetStringObj (objv[1], "456");

  xt = Tcl_NewXt ();

  Tcl_CompileExampleAdd (objv, xt);

 #if 0
  printf ("type before %d\n", objv[0]->type);
  test_convert (objv[1]);
  printf ("type after %d i %d\n", objv[0]->type, objv[0]->i);

  test_convert (objv[1]);
  return 0;
 #endif

  result = Tcl_Run (xt->start);

  printf ("RESULT is %d\n", result->i);

  printf ("after return from Tcl_Run\n");

  return EXIT_SUCCESS;
 }

registers.h
 /* We may need to change the register allocation in the future, so we use these macros.
  */

 /* OPERAND STACK */
 #define ops esi

 /* TOP OF STACK */
 #define tos edi

 /* VIRTUAL INSTRUCTION POINTER */
 #define vip edx

vm.S
 #include "registers.h"
 #include "Tcl_Types.h"

 /* We cache the top-of-stack (TOS) in a register.
  * This is a technique that can result in better performance.
  */

 .macro NEXT
  movl (%vip),%eax
  jmp *%eax
 .endm

 .macro POP_INTO reg
  movl %tos,\reg
  movl (%ops),%tos
  addl $4,%ops
 .endm

 .macro PUSH reg
  subl $4,%ops
  movl %tos,(%ops)
  movl \reg,%tos
 .endm

 .macro RESTORE_TOS
  movl (%ops),%tos
  addl $4,%ops
 .endm

 .macro SAVE_TOS
  subl $4,%ops
  movl %tos,(%ops)
 .endm

 .macro VM_WORD word
  \word:
  addl $4,%vip
 .endm

 /* our Tcl_Obj structure layout */
 .equ type_offset, 0
 .equ ref_offset, 4
 .equ int_offset, 8
 .equ string_offset, 12

 /**** EXPORTED SYMBOLS ****/
 .global Tcl_InitOperators
 .global Tcl_Run

 /**** SETUP VARIOUS SECTIONS ****/

 .data
  .comm operand_stack,2000
 .text

 .section .rodata
  emit_hex_fmt: .string "HEX 0x%x\n"
  emit_invalid_integer: .string "This isn't an integer string.\n"
  emit_string_fmt: .string "DEBUG STRING %s\n"
  emit_ptr_fmt: .string "ptr %p\n"
  emit_internal_error: .string "internal error\n"
 .text

 /**** UTILITY CODE ****/

 /* convert the integer in the Tcl_Obj stored in %eax
  (saving any registers that could be clobbered)

  We use:

   %eax result
   %cl low bits for the character
   %esi our offset into the string
   %edi our Tcl_Obj (after we move it from %eax)
   %edx for multiplication
  */

 convert_to_int:
  pushl %edi
  pushl %esi
  pushl %ecx
  pushl %edx

  movl %eax,%edi
  movl string_offset(%eax),%esi

  /* our initial value */
  movl $0,%eax

  repeat:
   /* clear our temporary register's high bits */
   movl $0,%ecx
   movb (%esi),%cl

   cmpb $0,%cl
   je got_zero_byte

   /* %eax * 10 */
   movl $10,%edx
   mull %edx

   cmpb $'0',%cl
   jl not_integer
   cmpb $'9',%cl
   jg not_integer

   subb $'0',%cl
   addl %ecx,%eax

   /* advance to the next char */
   addl $1,%esi
   jmp repeat

  got_zero_byte:

   /* save the result in our Tcl_Obj's memory */
   movl $TCL_TYPE_INT,type_offset(%edi)
   movl %eax,int_offset(%edi)
   movl %edi,%eax

   /* now restore the registers in reverse order */
   popl %edx
   popl %ecx
   popl %esi
   popl %edi
   ret

  not_integer:
   pushl $emit_invalid_integer
   call printf
   addl $4,%esp
   /* XXX lovely error handling.  I think in the final version we will use a longjmp. */
   call abort

 /**** ENTRY POINTS ****/

 Tcl_InitOperators:
  movl 4(%esp),%eax
  movl $add_op,(%eax)

  movl 8(%esp),%eax
  movl $push_op,(%eax)

  movl 12(%esp),%eax
  movl $exit_Tcl_Run,(%eax)

  ret

 /* Tcl_Run (start)
  */
 Tcl_Run:
  movl 4(%esp),%vip
  movl $operand_stack,%ops
  /* our stack grows downward */
  addl $2000,%ops

  NEXT

  exit_Tcl_Run:

   /* return our Tcl_Obj result in %eax */
   POP_INTO %eax
   ret

 .global test_convert
 test_convert:
  movl 4(%esp),%eax
  call convert_to_int

  pushl int_offset(%eax)
  pushl $emit_hex_fmt
  call printf
  addl $8,%esp

  ret

 emit_debug:
  pusha
  pushl %eax
  pushl $emit_hex_fmt
  call printf
  addl $8,%esp
  popa
  ret

 /**** OPERATORS ****/

 /* The idea is that we have various VM words that get threaded together.
  * Each word ends with a NEXT macro that expands to jump to the next address.
  */

 /* add $a $b */
 VM_WORD add_op
  /* convert object $b to int if needed */
  POP_INTO %eax
  cmpl $TCL_TYPE_INT,type_offset(%eax)
  je 1f
   call convert_to_int
  1:

  /* save the Tcl_Obj for $b */
  pushl %eax

  POP_INTO %eax
  cmpl $TCL_TYPE_INT,type_offset(%eax)
  je 1f
   call convert_to_int
  1:

  movl int_offset(%eax),%ecx
  popl %eax
  addl int_offset(%eax),%ecx

  /* Now we save the critical registers that the C code will possibly clobber. */
  pushl %ecx
  pushl %tos
  pushl %vip
  pushl %ops

  call Tcl_NewObj

  popl %ops
  popl %vip
  popl %tos
  popl %ecx

  movl %ecx,int_offset(%eax)
  movl $TCL_TYPE_INT,type_offset(%eax)

  PUSH %eax

  NEXT

 VM_WORD push_op
  SAVE_TOS
  /* set TOS to the Tcl_Obj */
  movl (%vip),%tos
  addl $4,%vip

  NEXT

 /*END*/

1. Ken Thompson's B language (the precursor to C): http://cm.bell-labs.com/cm/cs/who/dmr/kbman.html

2. Threaded Code Definition at FOLDOC: http://wombat.doc.ic.ac.uk/foldoc/foldoc.cgi?query=threaded+code

3. My revisions of this code (tcltcvm): http://www.xmission.com/~georgeps/engineering/prototype/

PWQ 31 Jan 05, If the bytecode interpreter was implemented with call threading then it would be easier to add new byte codes. From C it is not easy to implement direct or indirect threading, I wonder how many takers there will be to reimplement the interpreter for new CPU's in assembly.

George Peter Staplin: Jan 30, 2005 - I've examined several implementations of high performance threaded code that are mainly written in C. Some rely on allocating a global register as a virtual instruction pointer (which may not always work with every library, unless the libraries are compiled for this, and the compiler supports this). Others use the gcc feature of labels as values (&& on a label to get a void * for goto *ptr;).

I've been pondering doing an experiment with the Tcl sources. It would consist of implementing enough of the VM instructions in threaded code (written in assembly) to run small test programs. I would like to retain the bytecode code branch for platforms that don't have the assembly.

PWQ, Ideally the functions would call NEXT rather than returning to a main loop, gcc supports function attributes such as naked and noreturn but they are not supported on all targets. The scoping rules for C don't allow functions direct access to their caller variables even if called via goto *ptr (no uplevel equiv). This means passing around a structure pointer for all the state information.

Still this technique may be more efficient than the massive switch statement approach. Plus it can be extended by adding more function pointers to the jump table.

atp: According to the papers by M. Anton Ertl and David Gregg, e.g., "The Structure and Performance of EFFICIENT Interpreters", "switch threading" - aka, the "massive switch statement approach" - is pretty much always slower than direct or indirect threading.

The advantage of the switch statment approach is that it is portable across non-gcc compilers, while doing direct or indirect threading typically means either using gcc's "C Labels as Values" extension to Standard C, or some lower level technique, like assembler.

http://www.complang.tuwien.ac.at/forth/threaded-code.html http://www.complang.tuwien.ac.at/projects/interpreters.html

The Tcl Core Team appears to be aware of this, as this 2002 paper does discuss threaded code as a future possibility for Tcl:

"Tcl bytecode optimization: some experiences" by Kevin Kenny, Miguel Sofer, Jeffrey Hobbs http://aspn.activestate.com/ASPN/Tcl/TclConferencePapers2002/Tcl2002papers/kenny-bytecode/paperKBK.html