On Mon, 23 Apr 2012 20:36:48 +0200, David T. Ashley wrote:
Note that I did not perform exhaustive testing on the below code.
#include <stdint.h>
/* 96-bit unsigned integer held as three 32-bit words; w0 is the least
 * significant word. */
typedef struct uint96_s {
    uint32_t w0;
    uint32_t w1;
    uint32_t w2;
} uint96_t;

/* 128-bit unsigned integer held as four 32-bit words; w0 is the least
 * significant word. */
typedef struct uint128_s {
    uint32_t w0;
    uint32_t w1;
    uint32_t w2;
    uint32_t w3;
} uint128_t;

/*
 * umul_96x32c - portable C multiply of a 96-bit value by a 32-bit value.
 *
 * a: multiplicand (three 32-bit words, w0 least significant).
 * b: multiplier.
 * r: receives the full 128-bit product (four words, w0 least significant).
 *
 * Schoolbook multiplication, one 32x32->64 partial product per input word.
 * A 64-bit accumulator carries the not-yet-stored upper part from one
 * output word to the next, so no carry is lost.
 */
void umul_96x32c(uint96_t *a, uint32_t b, uint128_t *r)
{
    const uint64_t multiplier = b;
    uint64_t partial;
    uint64_t acc;

    /* Word 0: the low half is final, the high half feeds word 1. */
    partial = (uint64_t)a->w0 * multiplier;
    r->w0 = (uint32_t)partial;
    acc = partial >> 32;

    /* Word 1: add the low half of the second partial product. */
    partial = (uint64_t)a->w1 * multiplier;
    acc += (uint32_t)partial;
    r->w1 = (uint32_t)acc;

    /* Carry out of word 1 plus the high half of the second product. */
    acc = (acc >> 32) + (partial >> 32);

    /* Word 2: add the low half of the third partial product. */
    partial = (uint64_t)a->w2 * multiplier;
    acc += (uint32_t)partial;
    r->w2 = (uint32_t)acc;

    /* Word 3: high half of the third product plus the carry out of word 2. */
    r->w3 = (uint32_t)((partial >> 32) + (acc >> 32));
}
The obvious thing to do is to add manual ADC instructions.
#pragma inline=never void umul_96x32i(uint96_t *a, uint32_t b, uint128_t *r) { uint64_t p0, p1, p2; uint32_t r0, r1a, r1b, r2a, r2b, r3; uint32_t r1, r2;
p0 = (uint64_t)a->w0 * (uint64_t)b; r0 = p0; r1a = p0 >> 32; r->w0 = r0; p1 = (uint64_t)a->w1 * (uint64_t)b; r1b = p1; r2a = p1 >> 32; r1 = r1a + r1b; r->w1 = r1; p2 = (uint64_t)a->w2 * (uint64_t)b; r2b = p2; r3 = p2 >> 32; __asm("adcs %[Rd],%[Rn],%[Rm]" : [Rd]"=r"(r2) : [Rn]"r" (r2a), [Rm]"r" (r2b) : "cc"); r->w2 = r2; __asm("adc %[Rd],%[Rn],#0" : [Rd]"=r"(r3) : [Rn]"r" (r3)); r->w3 = r3; }
The only further possible optimization was register allocation.
; umul_96x32 - hand-allocated 96x32 -> 128-bit unsigned multiply.
;
; Register use, as read from the instructions below:
;   R0  base address of the three 32-bit input words (offsets 0, 4, 8)
;   R1  32-bit multiplier
;   R2  base address of the four 32-bit result words (offsets 0, 4, 8, 12)
; NOTE(review): presumably these are the a, b and r arguments of the C
; versions in this file - confirm against the calling convention in use.
;
; UMULL (no S suffix), LDR and STR leave the flags alone, so the carry set
; by ADDS/ADCS is still valid when the next ADCS/ADC consumes it.
umul_96x32:
        PUSH {R4}               ; R4 is used as scratch below; restored by POP
        LDR R3,[R0, #+0]        ; input word 0
        UMULL R3,R12,R1,R3      ; R12:R3 = multiplier * word 0
        STR R3,[R2, #+0]        ; result word 0 = low half
        LDR R3,[R0, #+4]        ; input word 1
        UMULL R3,R4,R1,R3       ; R4:R3 = multiplier * word 1
        ADDS R12,R3,R12         ; low(product 1) + high(product 0), sets carry
        STR R12,[R2, #+4]       ; result word 1
        LDR R0,[R0, #+8]        ; input word 2 (input base no longer needed)
        UMULL R0,R1,R1,R0       ; R1:R0 = multiplier * word 2
        ADCS R3,R4,R0           ; high(product 1) + low(product 2) + carry
        STR R3,[R2, #+8]        ; result word 2
        ADC R0,R1,#0            ; high(product 2) + carry
        STR R0,[R2, #+12]       ; result word 3
        POP {R4}
        BX LR                   ;; return