Reply by Jyrki Holopainen, December 9, 2005
Hello world,

A couple of days ago I browsed through the IAR 78K0 C 3.34B libraries and was
somewhat disappointed to notice that the 32-bit multiplication was a
'traditional' one, i.e. one that does not use the microcontroller's own
multiply instruction. I ported a routine that I originally wrote for the
Intel 805X and the Dunfield DDS C compiler, and here is the result.

My version is 3-5 times faster than the library multiplication. The real
life speed improvement factor is probably around 3.7. The C library
routine can make ~2600 - 4300 multiplications/sec (8 MHz system), while
this one handles ~13000 muls/sec. The numbers are not the absolute
truth, but give an estimate of what these routines can do.

The function prototype is: uint32 uint32mul(uint32 left, uint32 right);
(typedef unsigned long uint32;) I think that it should be possible to
override the library function with this one with a linker-file definition.

I have not tested this with the new IAR C/C++ -compiler, but it 
should(?) work.

File mul.s26:
;-----------------------------------------------------------------------------
;
; COPYRIGHT (c) 2005 Jyrki Holopainen, all rights reserved
;
; The copyright to the computer program(s) herein is the property of
; Jyrki Holopainen. The code may be used, copied or edited freely as
; long as the original copyright message is retained.
;
;-----------------------------------------------------------------------------
;
;  Multiplies two unsigned 32-bit operands and sets status.
;  For NEC 78K0 microcontrollers with MULU-instruction.
;  Compiler: IAR 3.34B C-compiler/assembler
;
; Revision history:
;  Original:       07.12.2005 Jyrki Holopainen jyrki_mul32 AT halo.pp.fi
;  Last modified:  08.12.2005 Jyrki Holopainen
;
;-----------------------------------------------------------------------------
;
;       Input:  AX, BC          Operand 1
;               [SP+0:1]        Return address
;               [SP+2:5]        Operand 2
;
;       Output: AX, BC          Op1 mul Op2
;               Z-flag ?
;
;
; Function prototype:
;  uint32 uint32mul(uint32 left, uint32 right);
;
;-----------------------------------------------------------------------------
;
; Compare compiler 32 * 32 -> 32 multiplication with this one
; (IAR 3.34, NEC 78K0 8 Mhz):
;
;              lib*    this    Ratio   Operation
;  -------------------------------------------------------
;   ops/sec    4286    13100   3.1     0 * 0                 (var * var)
;              3673    13057   3.6     999 * 999             (var * var)
;              3546    13036   3.7     99999 * 99999         (var * var)
;              3313    13015   3.9     9999999 * 9999999     (var * var)
;              2932    12909   4.4     999999999 * 999999999 (var * var)
;
;              3211    13057   4.1     0xffff * 0xffff       (var * var)
;              2599    12960   5.0     0x7fffffff * 0x7fff...(var * var)
;
;------------------------------------------------------------------------------
         MODULE  LONG_MUL_L03_Fast

         PUBLIC  uint32mul

         EXTERN ?L_F_DEALLOC_L06

         RSEG    RCODE

;-----------------------------------------------------------------------------
;
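; Before starting the multiplication, the stack is set up the following way
; (byte offsets are relative to byte 0 of the result):
; 14-17         Operand 2 ('right')
; 12-13         Return address
; 8 -11         Saved working registers
; 4 - 7         Operand 1 ('left')
; 0 - 3         Result
; -2 - -1       Saved base pointer (reloaded into hl on every outer pass)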
;
; At the beginning the hl pointer points to byte 0 of the result.
; The pointer is incremented (byte 0 -> 3) in the outer loop as the
; multiplication proceeds.
;
; On the outer loop the 'result' and 'right' bytes are processed
; at the same rate, so the hl pointer and pointer + fixed offset
; can be used to access a byte from both of the variables.
;
; On the inner loop the 'result' and 'left' bytes are processed
; at the same rate, so the hl pointer and pointer + variable offset
; can be used to access a byte from both of the variables.
;
; Outer multiply loop registers
; Work:
;  hl:   base pointer to Nth byte of result
;  b:    Offset between base and 'left'. Used also as a loop counter.
;
;
; Inner multiply loop registers
; In:
;  hl:  Base pointer to Nth byte of result
;  b:   Distance between the base pointer and the left operand
;  e:   Current byte from the 'right'
;  c:   Number of bytes to process, used as loop counter
;
; Work:
;  ax:  Calc accumulator
;  d:   Carry byte
;
;
; The high byte on the add sequence on the inner loop cannot overflow:
;
; Worst case:
; 1) 0xff * 0xff =      0xfe01          temp = *left * rightByte;
; 2) 0xfe01 + 0xff =    0xff00          temp += mulCarry;
; 3) 0xff00 + 0xff =    0xffff          temp += *sum
;
;

;
; Distance of 'result' and 'right' on the stack.
;
#define RESULT_TO_RIGHT_OFFSET  14

;-----------------------------------------------------------------------------
; uint32 uint32mul(uint32 left, uint32 right);
;
; Computes the low 32 bits of left * right.  Each outer pass takes one byte
; of 'right' and multiply-accumulates it (8 x 8 -> 16 bit MULU) against the
; bytes of 'left' into a 4-byte result buffer built on the stack.
;
;   In:    AX, BC       'left'  (operand 1)
;          [SP+2:5]     'right' (operand 2), just above the return address
;   Out:   Result words are handed to ?L_F_DEALLOC_L06 in BC (result bytes
;          0-1) and DE (result bytes 2-3).  That routine is external to this
;          file; per the module header it deallocates the parameters and
;          sets the status -- its exact register contract should be
;          confirmed against the IAR library.
;   Stack: 12 bytes below the return address (2 saved register pairs,
;          'left', result, base pointer).
;
; RESULT_TO_RIGHT_OFFSET is the byte distance from result byte N to 'right'
; byte N.  (It was previously called BASE_OP1_OFFSET, which was misleading:
; 'right' is operand 2.  The distance to operand 1 is held in register B.)
;-----------------------------------------------------------------------------
uint32mul:
         push    de              ; Save working registers
         push    hl

         push    bc              ; Copy operand 1 (left) to the stack so it
         push    ax              ; can be read byte by byte through [hl + b]

         movw    ax, #0          ; Allocate 4 bytes for the result, cleared
         push    ax
         push    ax

         movw    ax, sp          ; ax -> result byte 0
         decw    ax              ; Start one byte below result byte 0; the
                                 ; incw at the top of the outer loop steps
                                 ; onto byte 0 on the first pass
         push    ax              ; Store the base pointer

         mov     b,#4            ; b: distance from result byte 0 to left
                                 ; byte 0, and also the outer loop counter

         ;
         ; Outer loop: one pass per byte of 'right' (b = 4, 3, 2, 1).
         ;
?mulOutLoop:
         mov     a, b            ; c: bytes of 'left' to process this pass.
         mov     c, a            ; Shrinks with b, so partial products that
                                 ; would land above result byte 3 are never
                                 ; computed

         pop     hl              ; Get the base pointer
         incw    hl              ; Step to the next result byte
         push    hl              ; ...and keep it for the next pass

         mov     a, [hl + RESULT_TO_RIGHT_OFFSET] ; right byte for this pass
         mov     e, a            ; e: rightByte


         mov     d, #0                   ; d: mulCarry = 0;

         ;
         ; Inner loop: *sum += *left * rightByte + mulCarry, byte by byte.
         ; hl walks the result; [hl + b] is the matching byte of 'left'.
         ;
?mulInLoop:
         mov     a, e                    ; x = rightByte (x is overwritten by
         mov     x, a                    ; every multiplication, so reload)
         mov     a, [hl + b]             ; a = *left

         mulu    x                       ; temp = *left * rightByte;

         xch     a,x                     ; a = low byte, x = high byte

         add     a, d                    ; temp += mulCarry;
         bnc     ?skipMulCarryAdd1
         inc     x                       ; propagate carry into the high byte
?skipMulCarryAdd1:

         add     a, [hl]                 ; temp += *sum
         bnc     ?skipMulCarryAdd2
         inc     x                       ; propagate carry into the high byte
?skipMulCarryAdd2:

         mov     [hl], a                 ; *sum = (uint8)temp;

         mov     a, x
         mov     d, a                    ; mulCarry = temp >> 8;

         incw    hl                      ; sum++, left++

         dbnz    c, ?mulInLoop           ; while (--bytes);
                                         ; The final mulCarry is dropped: it
                                         ; belongs above result byte 3


         dbnz    b, ?mulOutLoop  ; The base pointer moved up one byte, so the
                                 ; distance to 'left' shrinks by one; quit
                                 ; when all 4 bytes of 'right' are done

         ;
         ; Get the result & clean up
         ;
         pop     hl              ; Drop the base pointer

         pop     bc              ; Result bytes 0-1
         pop     de              ; Result bytes 2-3

         pop     hl              ; Drop left (two words)
         pop     hl

         ; The saved hl, saved de, return address and operand 2 are still
         ; on the stack at this point, in that order from the top.
         br      ?L_F_DEALLOC_L06  ; Deallocate params and set status

;-----------------------------------------------------------------------------

#if 0

/*
** The above multiplication written in C
*/

typedef unsigned char uint8;
typedef unsigned int  uint16;
typedef unsigned long uint32;

/*
** Multiply-accumulate helper (one inner-loop pass of the assembly routine):
**
**    sum[0 .. bytes-1] += left[0 .. bytes-1] * rightByte
**
** Index 0 is treated as the least significant byte; the carry moves
** towards higher indexes.  The carry out of the last byte is discarded.
**
**    sum        in/out: running sum, updated in place
**    left       multiplicand bytes
**    rightByte  one byte of the multiplier
**    bytes      number of bytes to process; must be >= 1 (do/while loop)
*/
void
mul1(uint8*     sum,
      uint8*     left,
      uint8      rightByte,
      uint8      bytes)
{
    uint8        mulCarry;

    mulCarry = 0;

    do
    {
       uint16 temp;

       /*
       ** Multiply as unsigned 16-bit values.  Without the casts both
       ** uint8 operands are promoted to (signed) int, and where int is
       ** 16 bits wide 0xff * 0xff = 0xfe01 overflows it.
       */
       temp = (uint16)*left * (uint16)rightByte;  /* 8 * 8 -> 16 */
       temp += *sum;                     /* 16 + 8 -> 16 */
       temp += mulCarry;                 /* 16 + 8 -> 16, max 0xffff */
       *sum = (uint8)temp;               /* low byte */
       mulCarry = (uint8)(temp >> 8);    /* high byte */

       left++;
       sum++;
    }
    while (--bytes);
}


uint32
longmulC(uint32 left, uint32 right)
{
    /*
    ** 32 * 32 -> 32 multiplication built from byte-wide passes.
    ** Pass i multiplies 'left' by byte i of 'right' and accumulates the
    ** partial product into the result starting at byte i.  Only 4 - i
    ** bytes are processed, so nothing is written past byte 3.
    ** Byte 0 (lowest address) is treated as the least significant byte.
    */
    uint32 product;
    uint8* productBytes;
    uint8* leftBytes;
    uint8* rightBytes;
    uint8  i;

    product = 0;
    productBytes = (uint8*)&product;
    leftBytes = (uint8*)&left;
    rightBytes = (uint8*)&right;

    for (i = 0; i < 4; i++)
    {
        mul1(&productBytes[i], leftBytes, rightBytes[i], (uint8)(4 - i));
    }

    return product;
}


#endif

;-----------------------------------------------------------------------------

	END