Hello world,
A couple of days ago I browsed through the IAR 78K0 C 3.34B libraries and was somewhat disappointed to notice that the 32-bit multiplication was a 'traditional' one, i.e. one that does not use the microcontroller's own multiply instruction. I ported a routine that I originally wrote for the Intel 805X and the Dunfield DDS C compiler, and here is the result.
My version is 3-5 times faster than the library multiplication. The real-life speed improvement factor is probably around 3.7. The C library routine can make ~2600 - 4300 multiplications/sec (8 MHz system), while this one handles ~13000 muls/sec. The numbers are not the absolute truth, but they give an estimate of what these routines can do.
The function prototype is: uint32 uint32mul(uint32 left, uint32 right); (typedef unsigned long uint32;) I think that it should be possible to override the library function with this one via a linker-file definition.
I have not tested this with the new IAR C/C++ compiler, but it should(?) work.
File mul.s26:
;-----------------------------------------------------------------------------
;
; COPYRIGHT (c) 2005 Jyrki Holopainen, all rights reserved
;
; The copyright to the computer program(s) herein is the property of
; Jyrki Holopainen. The code may be used, copied or edited freely as
; long as the original copyright message is retained.
;
;-----------------------------------------------------------------------------
;
; Multiplies two unsigned 32-bit operands and sets status.
; For NEC 78K0 microcontrollers with MULU-instruction.
; Compiler: IAR 3.34B C-compiler/assembler
;
; Revision history:
; Original:      07.12.2005 Jyrki Holopainen jyrki_mul32 AT halo.pp.fi
; Last modified: 08.12.2005 Jyrki Holopainen
;
;-----------------------------------------------------------------------------
;
; Input:    AX, BC      Operand 1
;           [SP+0:1]    Return address
;           [SP+2:5]    Operand 2
;
; Output:   AX, BC      Op1 mul Op2
;           Z-flag      ?
;
;
; Function prototype:
;   uint32 uint32mul(uint32 left, uint32 right);
;
;-----------------------------------------------------------------------------
;
; Compare compiler 32 * 32 -> 32 multiplication with this one
; (IAR 3.34, NEC 78K0 8 MHz):
;
;            lib*    this    Ratio   Operation
;   -------------------------------------------------------
;   ops/sec  4286    13100   3.1     0 * 0 (var * var)
;            3673    13057   3.6     999 * 999 (var * var)
;            3546    13036   3.7     99999 * 99999 (var * var)
;            3313    13015   3.9     9999999 * 9999999 (var * var)
;            2932    12909   4.4     999999999 * 999999999 (var * var)
;
;            3211    13057   4.1     0xffff * 0xffff (var * var)
;            2599    12960   5.0     0x7fffffff * 0x7fff...(var * var)
;
;------------------------------------------------------------------------------
        MODULE  LONG_MUL_L03_Fast
        PUBLIC  uint32mul
        EXTERN  ?L_F_DEALLOC_L06
        RSEG    RCODE
;-----------------------------------------------------------------------------
;
; uint32mul: 32 x 32 -> 32 bit unsigned multiplication built from 8 x 8 -> 16
; bit MULU steps (schoolbook multiplication, one byte of 'right' per outer
; pass, bytes above the 32-bit result are never computed).
;
; Before starting the multiplication, the stack is set up following way:
;       14-17   Operand 2 ('right')
;       12-13   Return address
;       8 -11   Saved working registers (de, hl)
;       4 - 7   Operand 1 ('left')
;       0 - 3   Result (cleared to zero)
;       . . .   Saved base pointer (popped into hl by the outer loop)
;
; At the beginning [hl] pointer points to the byte 0 of the result.
; The pointer is incremented (byte 0 -> 3) at the outer loop as the
; multiplication proceeds.
;
; On the outer loop the 'result' and 'right' bytes are processed
; at the same rate, so the hl pointer and pointer + fixed offset
; can be used to access a byte from both of the variables.
;
; On the inner loop the 'result' and 'left' bytes are processed
; at the same rate, so the hl pointer and pointer + variable offset
; can be used to access a byte from both of the variables.
;
; Outer multiply loop registers
;   Work:
;       hl: base pointer to Nth byte of result
;       b:  Offset between base and 'left'. Used also as a loop counter
;           (4, 3, 2, 1).
;
; Inner multiply loop registers
;   In:
;       hl: Base pointer to Nth byte of result
;       b:  Distance between the base pointer and the left operand
;       e:  Current byte from the 'right'
;       c:  Number of bytes to process, used as loop counter
;
;   Work:
;       ax: Calc accumulator
;       d:  Carry byte
;
; The high byte on the add sequence on the inner loop cannot overflow:
;
;   Worst case:
;       1) 0xff * 0xff   = 0xfe01      temp = *left * rightByte;
;       2) 0xfe01 + 0xff = 0xff00      temp += mulCarry;
;       3) 0xff00 + 0xff = 0xffff      temp += *sum
;
; Exit: the routine leaves the result in bc (result bytes 0-1) and
; de (result bytes 2-3), with the saved hl/de still on the stack, and
; branches to the external library routine ?L_F_DEALLOC_L06.
; NOTE(review): that routine is not visible in this file; presumably it
; moves the result to AX/BC, restores hl/de, removes the 4 parameter
; bytes and sets the status flags -- confirm against the IAR library.
;
;-----------------------------------------------------------------------------

;
; Distance of 'result' and 'right' (operand 2) on the stack.
;
#define BASE_OP2_OFFSET 14

uint32mul:
        push    de                      ; Save working registers
        push    hl

        push    bc                      ; Push operand 1 (left) to stack
        push    ax

        movw    ax, #0                  ; Allocate stack for the result & clear it
        push    ax
        push    ax

        movw    ax, sp
        decw    ax                      ; Point one byte below result byte 0
                                        ; (the first incw in the outer loop
                                        ; corrects this)
        push    ax                      ; Store the base pointer

        mov     b, #4                   ; Difference between left (operand 1) and
                                        ; base pointer in memory

?mulOutLoop:
        mov     a, b                    ; bytes = number of result bytes still
        mov     c, a                    ; affected by this byte of 'right'

        pop     hl                      ; Get the base pointer
        incw    hl                      ; Step to next result byte
        push    hl                      ; Keep it for the next outer pass

        mov     a, [hl + BASE_OP2_OFFSET] ; right
        mov     e, a                    ; e: rightByte

        mov     d, #0                   ; d: mulCarry = 0;

?mulInLoop:
        mov     a, e                    ; right (reloaded: mulu overwrites x)
        mov     x, a
        mov     a, [hl + b]             ; left

        mulu    x                       ; temp = *left * rightByte;

        xch     a, x                    ; a = low byte, x = high byte of temp

        add     a, d                    ; temp += mulCarry;
        bnc     ?skipMulCarryAdd1
        inc     x                       ; propagate carry into the high byte
?skipMulCarryAdd1:

        add     a, [hl]                 ; temp += *sum
        bnc     ?skipMulCarryAdd2
        inc     x                       ; propagate carry into the high byte
?skipMulCarryAdd2:

        mov     [hl], a                 ; *sum = (uint8)temp;

        mov     a, x
        mov     d, a                    ; mulCarry = temp >> 8;

        incw    hl                      ; sum++, left++

        dbnz    c, ?mulInLoop           ; while (--bytes);

        dbnz    b, ?mulOutLoop          ; Decrement diff or quit

;
; Get the result & clean up
;
        pop     hl                      ; Drop the base pointer

        pop     bc                      ; Get the result (bytes 0-1)
        pop     de                      ; Get the result (bytes 2-3)

        pop     hl                      ; Drop left
        pop     hl

        br      ?L_F_DEALLOC_L06        ; Deallocate params and set status
;-----------------------------------------------------------------------------
#if 0
/*
** The above multiplication written in C
*/
typedef unsigned char uint8;
typedef unsigned int uint16;
typedef unsigned long uint32;

/*
** Multiply-accumulate helper (one inner-loop pass of the assembly routine).
**
** For i = 0 .. bytes-1:  sum[i] += left[i] * rightByte, with the carry of
** each byte propagated into the next one. The carry out of the last byte
** is discarded.
**
** sum       - accumulator bytes, updated in place (byte 0 is processed first)
** left      - multiplicand bytes (byte 0 is processed first)
** rightByte - the single multiplier byte
** bytes     - number of bytes to process; must be at least 1, because the
**             loop body runs before the counter is tested
*/
void mul1(uint8* sum, uint8* left, uint8 rightByte, uint8 bytes)
{
    uint8 mulCarry;

    mulCarry = 0;

    do
    {
        uint16 temp;

        /*
        ** Convert to uint16 BEFORE multiplying: otherwise both operands are
        ** promoted to signed int, and where int is 16 bits wide
        ** 0xff * 0xff = 65025 does not fit (signed overflow).
        */
        temp = (uint16)*left * rightByte;   /* 8 * 8 -> 16 */
        temp += *sum;                       /* 16 + 8 -> 16 */
        temp += mulCarry;                   /* 16 + 8 -> 16 */
        *sum = (uint8)temp;                 /* low byte */
        mulCarry = (uint8)(temp >> 8);      /* high byte */

        left++;
        sum++;
    } while (--bytes);
}
/*
** C reference for the 32 x 32 -> 32 bit multiplication.
**
** The product is built bytewise: pass i multiplies 'left' by byte i of
** 'right' and accumulates into product bytes i..3, so each pass handles
** one byte fewer than the previous one (4, 3, 2, 1). Byte 0 of each
** variable is treated as the least significant byte, i.e. the in-memory
** layout is taken to be little-endian.
*/
uint32 longmulC(uint32 left, uint32 right)
{
    uint32 product = 0;
    uint8* productBytes = (uint8*)&product;
    uint8* rightBytes = (uint8*)&right;
    uint8 pass;

    for (pass = 0; pass < 4; pass++)
    {
        mul1(&productBytes[pass], (uint8*)&left, rightBytes[pass], (uint8)(4 - pass));
    }

    return product;
}
#endif
;-----------------------------------------------------------------------------
END