Faster 32-bit mul for IAR 78K0 3.34B

Hello world,

A couple of days ago I browsed through the IAR 78K0 C 3.34B libraries and was somewhat disappointed to notice that the 32-bit multiplication was a 'traditional' one, i.e. one that does not use the microcontroller's own multiply instruction. I ported a routine that I originally wrote for the Intel 805X and the Dunfield DDS C compiler, and here is the result.

My version is 3-5 times faster than the library multiplication. The real-life speed improvement factor is probably around 3.7. The C library routine can make ~2600 - 4300 multiplications/sec (8 MHz system), while this one handles ~13000 muls/sec. The numbers are not the absolute truth, but they give an estimate of what these routines can do.

The function prototype is: uint32 uint32mul(uint32 left, uint32 right); (typedef unsigned long uint32;) I think that it should be possible to override the library function with this one with a linker-file definition.

I have not tested this with the new IAR C/C++ compiler, but it should(?) work.

File mul.s26:

;-----------------------------------------------------------------------------
;
; COPYRIGHT (c) 2005 Jyrki Holopainen, all rights reserved
;
; The copyright to the computer program(s) herein is the property of
; Jyrki Holopainen. The code may be used, copied or edited freely as
; long as the original copyright message is retained.
;
;-----------------------------------------------------------------------------
;
; Multiplies two unsigned 32-bit operands and sets status.
; For NEC 78K0 microcontrollers with MULU-instruction.
; Compiler: IAR 3.34B C-compiler/assembler
;
; Revision history:
;   Original:       07.12.2005 Jyrki Holopainen jyrki_mul32 AT halo.pp.fi
;   Last modified:  08.12.2005 Jyrki Holopainen
;
;-----------------------------------------------------------------------------
;
; Input:    AX, BC      Operand 1
;           [SP+0:1]    Return address
;           [SP+2:5]    Operand 2
;
; Output:   AX, BC      Op1 mul Op2
;           Z-flag      ?
;
;
; Function prototype:
;   uint32 uint32mul(uint32 left, uint32 right);
;
;-----------------------------------------------------------------------------
;
; Compare compiler 32 * 32 -> 32 multiplication with this one
; (IAR 3.34, NEC 78K0 8 Mhz):
;
;           lib*    this    Ratio   Operation
;           -------------------------------------------------------
; ops/sec   4286    13100   3.1     0 * 0 (var * var)
;           3673    13057   3.6     999 * 999 (var * var)
;           3546    13036   3.7     99999 * 99999 (var * var)
;           3313    13015   3.9     9999999 * 9999999 (var * var)
;           2932    12909   4.4     999999999 * 999999999 (var * var)
;
;           3211    13057   4.1     0xffff * 0xffff (var * var)
;           2599    12960   5.0     0x7fffffff * 0x7fff...(var * var)
;
;------------------------------------------------------------------------------

        MODULE  LONG_MUL_L03_Fast

PUBLIC uint32mul

EXTERN ?L_F_DEALLOC_L06

RSEG RCODE

;-----------------------------------------------------------------------------
;
; Before starting the multiplication, the stack is set up the following way
; (offsets are relative to byte 0 of the result):
;   14-17   Operand 2 ('right')
;   12-13   Return address
;   8 -11   Saved working registers
;   4 - 7   Operand 1 ('left')
;   0 - 3   Result
;   . . .   Saved base pointer (popped into hl by the outer loop)
;
; At the beginning [hl] pointer points to the byte 0 of the result.
; The pointer is incremented (byte 0 -> 3) at the outer loop as the
; multiplication proceeds.
;
; On the outer loop the 'result' and 'right' bytes are processed
; at the same rate, so the hl pointer and pointer + fixed offset
; can be used to access a byte from both of the variables.
;
; On the inner loop the 'result' and 'left' bytes are processed
; at the same rate, so the hl pointer and pointer + variable offset
; can be used to access a byte from both of the variables.
;
; Outer multiply loop registers
;   Work:
;     hl: base pointer to Nth byte of result
;     b:  Offset between base and 'left'. Used also as a loop counter.
;
;
; Inner multiply loop registers
;   In:
;     hl: Base pointer to Nth byte of result
;     b:  Distance between the base pointer and the left operand
;     e:  Current byte from the 'right'
;     c:  Number of bytes to process, used as loop counter
;
;   Work:
;     ax: Calc accumulator
;     d:  Carry byte
;
;
; The high byte on the add sequence on the inner loop cannot overflow:
;
;   Worst case:
;     1) 0xff * 0xff   = 0xfe01     temp  = *left * rightByte;
;     2) 0xfe01 + 0xff = 0xff00     temp += mulCarry;
;     3) 0xff00 + 0xff = 0xffff     temp += *sum
;
;

;
; Distance from a byte of 'result' to the byte of 'right' (operand 2) with
; the same index: 4 (result) + 4 (left) + 4 (saved de, hl) + 2 (return
; address) = 14.
;
#define BASE_RIGHT_OFFSET   14

;-----------------------------------------------------------------------------
; uint32 uint32mul(uint32 left, uint32 right);
;
; In:     ax        'left'  bytes 0-1 (least significant)
;         bc        'left'  bytes 2-3
;         stack     'right', 4 bytes directly above the return address
;
; Out:    At the jump to ?L_F_DEALLOC_L06:
;         bc        product bytes 0-1
;         de        product bytes 2-3
;         stack     saved hl, saved de, return address, 'right'
;         NOTE(review): the final register assignment (AX, BC per the file
;         header), the restore of de/hl, the removal of 'right' and the
;         status flags are left to the external ?L_F_DEALLOC_L06 - confirm
;         against the library source.
;
; Stack:  14 bytes below the return address (4 saved registers, 4 'left',
;         4 result, 2 base pointer).
;
; Method: schoolbook byte-wise multiply truncated to 32 bits. For every
;         byte n of 'right' the inner loop adds left[0..3-n] * right[n]
;         into result[n..3] using the 8 x 8 -> 16 bit MULU instruction.
;-----------------------------------------------------------------------------
uint32mul:
        push    de                      ; Save working registers
        push    hl

        push    bc                      ; Copy 'left' to the stack:
        push    ax                      ;   bc = bytes 2-3, ax = bytes 0-1

        movw    ax, #0                  ; Allocate the 4-byte result
        push    ax                      ;   and clear it
        push    ax

        movw    ax, sp                  ; ax -> result byte 0
        decw    ax                      ; Start one byte below the result; the
                                        ;   outer loop increments before use
        push    ax                      ; Keep the base pointer on the stack

        mov     b, #4                   ; Offset from the base pointer to
                                        ;   left[0]; also the number of bytes
                                        ;   the inner loop has to process

?mulOutLoop:
        mov     a, b                    ; c: bytes = b
        mov     c, a

        pop     hl                      ; Get the base pointer,
        incw    hl                      ;   step to the next result byte
        push    hl                      ;   and store it for the next round

        mov     a, [hl + BASE_RIGHT_OFFSET]
        mov     e, a                    ; e: rightByte = right[n]

        mov     d, #0                   ; d: mulCarry = 0;

?mulInLoop:
        mov     a, e
        mov     x, a                    ; x = rightByte
        mov     a, [hl + b]             ; a = *left

        mulu    x                       ; temp = *left * rightByte;

        xch     a, x                    ; a = low byte of temp, x = high byte

        add     a, d                    ; temp += mulCarry;
        bnc     ?skipMulCarryAdd1
        inc     x                       ; Propagate the carry to the high byte
?skipMulCarryAdd1:

        add     a, [hl]                 ; temp += *sum
        bnc     ?skipMulCarryAdd2
        inc     x                       ; Propagate the carry to the high byte
?skipMulCarryAdd2:

        mov     [hl], a                 ; *sum = (uint8)temp;

        mov     a, x
        mov     d, a                    ; mulCarry = temp >> 8;

        incw    hl                      ; sum++, left++ (b stays fixed, so
                                        ;   [hl + b] follows 'left')

        dbnz    c, ?mulInLoop           ; while (--bytes);

        dbnz    b, ?mulOutLoop          ; Next byte of 'right', or quit
                                        ;   after the fourth one

;
; Get the result & clean up
;
        pop     hl                      ; Drop the base pointer

        pop     bc                      ; Get the result: bc = bytes 0-1,
        pop     de                      ;   de = bytes 2-3

        pop     hl                      ; Drop 'left'
        pop     hl

        br      ?L_F_DEALLOC_L06        ; Deallocate params and set status

;-----------------------------------------------------------------------------

#if 0

/*

** The above multiplication written in C */

typedef unsigned char uint8;
typedef unsigned int  uint16;
typedef unsigned long uint32;

/*
** Adds left[0..bytes-1] * rightByte into sum[0..bytes-1], least significant
** byte first. The carry out of the last byte is discarded.
**
**   sum        Running sum, updated in place
**   left       Multiplicand bytes
**   rightByte  Multiplier byte
**   bytes      Number of bytes to process, must be >= 1 (0 wraps around
**              in the do-while and processes 256 bytes)
*/
void mul1(uint8* sum, uint8* left, uint8 rightByte, uint8 bytes)
{
  uint8 mulCarry = 0;

  do
  {
    uint16 temp;

    /*
    ** The cast keeps the product unsigned. Without it both operands are
    ** promoted to signed int, and with a 16-bit int 0xff * 0xff = 65025
    ** does not fit.
    */
    temp  = (uint16)*left * rightByte;  /* 8 * 8 -> 16 */
    temp += *sum;                       /* 16 + 8 -> 16 */
    temp += mulCarry;                   /* 16 + 8 -> 16, max 0xffff */

    *sum     = (uint8)temp;             /* low byte */
    mulCarry = (uint8)(temp >> 8);      /* high byte */

    left++;
    sum++;
  } while (--bytes);
}

/*
** 32 * 32 -> 32 bit multiplication built from byte-wise partial products.
** Byte n of 'right' is multiplied with the lowest (4 - n) bytes of 'left'
** and accumulated into the product starting at byte n.
**
** Assumes byte 0 of a uint32 in memory is the least significant byte.
*/
uint32 longmulC(uint32 left, uint32 right)
{
  uint32 product;
  uint8* productBytes;
  uint8* rightBytes;
  uint8  n;

  product      = 0;
  productBytes = (uint8*)&product;
  rightBytes   = (uint8*)&right;

  for (n = 0; n < 4; n++)
  {
    mul1(&productBytes[n], (uint8*)&left, rightBytes[n], (uint8)(4 - n));
  }

  return product;
}

#endif

;-----------------------------------------------------------------------------

END

Reply to
Jyrki Holopainen
Loading thread data ...

ElectronDepot website is not affiliated with any of the manufacturers or service providers discussed here. All logos and trade names are the property of their respective owners.