Reply by Jyrki Holopainen●December 9, 20052005-12-09
Hello world,
Browsed through the IAR 78K0 C 3.34B libraries a couple of days ago and was
somewhat disappointed to notice that the 32-bit multiplication was a
'traditional' one, i.e. one that does not use the microcontroller's own
multiply instruction. I ported a routine that I originally wrote for
Intel 805X and the Dunfield DDS C compiler, and here is the result.
My version is 3-5 times faster than the library multiplication. The real
life speed improvement factor is probably around 3.7. The C library
routine can make ~2600 - 4300 multiplications/sec (8Mhz system), while
this one handles ~13000 muls/sec. The numbers are not the absolute
truth, but give an estimate what these routines can do.
The function prototype is: uint32 uint32mul(uint32 left, uint32 right);
(typedef unsigned long uint32;) I think that it should be possible to
override the library function with this one with a linker-file definition.
I have not tested this with the new IAR C/C++ -compiler, but it
should(?) work.
File mul.s26:
;-----------------------------------------------------------------------------
;
; COPYRIGHT (c) 2005 Jyrki Holopainen, all rights reserved
;
; The copyright to the computer program(s) herein is the property of
; Jyrki Holopainen. The code may be used, copied or edited freely as
; long as the original copyright message is retained.
;
;-----------------------------------------------------------------------------
;
; Multiplies two unsigned 32-bit operands and sets status.
; For NEC 78K0 microcontrollers with MULU-instruction.
; Compiler: IAR 3.34B C-compiler/assembler
;
; Revision history:
; Original: 07.12.2005 Jyrki Holopainen jyrki_mul32 AT halo.pp.fi
; Last modified: 08.12.2005 Jyrki Holopainen
;
;-----------------------------------------------------------------------------
;
; Input: AX, BC Operand 1
; [SP+0:1] Return address
; [SP+2:5] Operand 2
;
; Output: AX, BC Op1 mul Op2
; Z-flag ?
;
;
; Function prototype:
; uint32 uint32mul(uint32 left, uint32 right);
;
;-----------------------------------------------------------------------------
;
; Compare compiler 32 * 32 -> 32 multiplication with this one
; (IAR 3.34, NEC 78K0 8 Mhz):
;
; lib* this Ratio Operation
; -------------------------------------------------------
; ops/sec 4286 13100 3.1 0 * 0 (var * var)
; 3673 13057 3.6 999 * 999 (var * var)
; 3546 13036 3.7 99999 * 99999 (var * var)
; 3313 13015 3.9 9999999 * 9999999 (var * var)
; 2932 12909 4.4 999999999 * 999999999 (var * var)
;
; 3211 13057 4.1 0xffff * 0xffff (var * var)
; 2599 12960 5.0 0x7fffffff * 0x7fff...(var * var)
;
;------------------------------------------------------------------------------
; Name of this module
MODULE LONG_MUL_L03_Fast
; Make the entry point visible to other modules
PUBLIC uint32mul
; Library helper defined elsewhere; uint32mul branches to it when finished
EXTERN ?L_F_DEALLOC_L06
; The code that follows goes into the relocatable segment RCODE
RSEG RCODE
;-----------------------------------------------------------------------------
;
; Before starting the multiplication, the stack is set up following way:
; 14-17 Operand 2 ('right')
; 12-13 Return address
; 8 -11 Saved working registers
; 4 - 7 Operand 1 ('left')
; 0 - 3 Result
; -2..-1 Saved hl (base pointer into the result)
;
; At the beginning the hl pointer points to byte 0 of the result.
; The pointer is incremented (byte 0 -> 3) in the outer loop as the
; multiplication proceeds.
;
; On the outer loop the 'result' and 'right' bytes are processed
; at the same rate, so the hl pointer and pointer + fixed offset
; can be used to access a byte from both of the variables.
;
; On the inner loop the 'result' and 'left' bytes are processed
; at the same rate, so the hl pointer and pointer + variable offset
; can be used to access a byte from both of the variables.
;
; Outer multiply loop registers
; Work:
; hl: base pointer to Nth byte of result
; b: Offset between base and 'left'. Used also as a loop counter.
;
;
; Inner multiply loop registers
; In:
; hl: Base pointer to Nth byte of result
; b: Distance between the base pointer and the left operand
; e: Current byte from the 'right'
; c: Number of bytes to process, used as loop counter
;
; Work:
; ax: Calc accumulator
; d: Carry byte
;
;
; The high byte on the add sequence on the inner loop cannot overflow:
;
; Worst case:
; 1) 0xff * 0xff = 0xfe01 temp = *left * rightByte;
; 2) 0xfe01 + 0xff = 0xff00 temp += mulCarry;
; 3) 0xff00 + 0xff = 0xffff temp += *sum
;
;
;
; Distance of 'result' and 'right' on the stack.
;
#define BASE_OP2_OFFSET 14
;-----------------------------------------------------------------------------
; uint32 uint32mul(uint32 left, uint32 right)
;
; 32 x 32 -> 32 bit unsigned multiplication built from 8 x 8 -> 16 bit MULU
; steps (schoolbook multiplication, one byte of 'right' per outer pass).
;
; In:     ax, bc     operand 1 ('left')
;         [sp+0:1]   return address
;         [sp+2:5]   operand 2 ('right')
;
; BASE_OP2_OFFSET is the distance from byte N of the result to byte N of
; operand 2 once the local stack frame below has been built
; (4 result + 4 left + 4 saved registers + 2 return address = 14).
;
; Exit:   branches to ?L_F_DEALLOC_L06 with the saved de/hl, the return
;         address and operand 2 still on the stack.
;         NOTE(review): the file header documents the result in AX, BC; here
;         it is left in bc/de, so the helper presumably moves it and restores
;         de/hl - confirm against the IAR library sources.
;-----------------------------------------------------------------------------
uint32mul:
        push    de                      ; Save working registers
        push    hl
        push    bc                      ; Copy operand 1 ('left') to the stack
        push    ax
        movw    ax, #0                  ; Allocate 4 bytes for the result, zeroed
        push    ax
        push    ax
        movw    ax, sp                  ; ax = address of result byte 0
        decw    ax                      ; Start one byte below the result; the
                                        ;   outer loop increments before use
        push    ax                      ; Store the base pointer
        mov     b, #4                   ; Distance from result byte 0 to 'left'
                                        ;   byte 0; doubles as outer loop count
?mulOutLoop:
        mov     a, b                    ; bytes = b (4, 3, 2, 1): bytes of the
        mov     c, a                    ;   result still inside 32 bits
        pop     hl                      ; Get the base pointer
        incw    hl                      ; Step to the next result byte
        push    hl                      ; Keep it for the next outer pass
        mov     a, [hl + BASE_OP2_OFFSET] ; Matching byte of 'right'
        mov     e, a                    ; e: rightByte
        mov     d, #0                   ; d: mulCarry = 0;
?mulInLoop:
        mov     a, e                    ; x = rightByte
        mov     x, a
        mov     a, [hl + b]             ; a = *left
        mulu    x                       ; temp = *left * rightByte;
        xch     a, x                    ; a = low byte of temp, x = high byte
        add     a, d                    ; temp += mulCarry;
        bnc     ?skipMulCarryAdd1
        inc     x                       ; Propagate carry into the high byte
?skipMulCarryAdd1:
        add     a, [hl]                 ; temp += *sum
        bnc     ?skipMulCarryAdd2
        inc     x                       ; Propagate carry into the high byte
?skipMulCarryAdd2:
        mov     [hl], a                 ; *sum = (uint8)temp;
        mov     a, x
        mov     d, a                    ; mulCarry = temp >> 8;
        incw    hl                      ; sum++, left++
        dbnz    c, ?mulInLoop           ; while (--bytes);
        dbnz    b, ?mulOutLoop          ; Next byte of 'right', or done
;
; Get the result & clean up
;
        pop     hl                      ; Drop the base pointer
        pop     bc                      ; bc = result bytes 0-1
        pop     de                      ; de = result bytes 2-3
        pop     hl                      ; Drop the stack copy of 'left'
        pop     hl
        br      ?L_F_DEALLOC_L06        ; Deallocate params and set status
;-----------------------------------------------------------------------------
#if 0
/*
** The above multiplication written in C
*/
typedef unsigned char uint8;   /* one byte of an operand or of the result */
typedef unsigned int uint16;   /* holds one 8 x 8 partial product (>= 16 bits) */
typedef unsigned long uint32;  /* full operand / result (>= 32 bits) */
/*
** Multiply-accumulate step: sum[0..bytes-1] += left[0..bytes-1] * rightByte,
** with both byte strings stored least significant byte first. The carry out
** of the last byte is discarded.
**
** sum        bytes of the running result, updated in place
** left       bytes of the multiplicand
** rightByte  one byte of the multiplier
** bytes      number of bytes to process; must be >= 1 (do/while loop)
*/
void
mul1(uint8* sum,
     uint8* left,
     uint8 rightByte,
     uint8 bytes)
{
    uint8 mulCarry;
    mulCarry = 0;
    do
    {
        uint16 temp;
        /*
        ** Force the multiplication to be unsigned: both uint8 operands are
        ** otherwise promoted to (signed) int, and with a 16-bit int
        ** 0xff * 0xff = 65025 would overflow it.
        */
        temp = (uint16)*left * rightByte; /* 8 * 8 -> 16 */
        /* Worst case 0xfe01 + 0xff + 0xff = 0xffff, so temp cannot overflow */
        temp += *sum;                     /* 16 + 8 -> 16 */
        temp += mulCarry;                 /* 16 + 8 -> 16 */
        *sum = (uint8)temp;               /* low byte */
        mulCarry = (uint8)(temp >> 8);    /* high byte */
        left++;
        sum++;
    }
    while (--bytes);
}
/*
** 32 x 32 -> 32 bit multiplication built from mul1() steps: pass i adds
** left * (byte i of right) into the product starting at product byte i.
** Only the 4 - i bytes that still land inside the 4-byte product are
** processed. The operands are accessed through byte pointers, with byte 0
** treated as the least significant byte.
*/
uint32
longmulC(uint32 left, uint32 right)
{
    uint32 product = 0;
    uint8* productBytes = (uint8*)&product;
    uint8* rightBytes = (uint8*)&right;
    uint8 i;

    for (i = 0; i < 4; i++)
    {
        mul1(&productBytes[i], (uint8*)&left, rightBytes[i], (uint8)(4 - i));
    }
    return product;
}
#endif
;-----------------------------------------------------------------------------
; End of the assembly source
END