*******************************************************************************
*
*  64X32 BIT MULTIPLY (UNSIGNED)
*  01/30/07 (dkc)
*
*  This C64 subroutine does 64x32 bit multiplication.  The calling sequence
*  of the subroutine is:
*
*     multiplicand (d[0], d[1]) => a4, b4
*     multiplier (m[0]) => a6
*     address of product => b6
*
*  Result (three 32-bit words written through b6):
*
*     *b6      = P2   (most significant word)
*     *+b6[1]  = P1
*     *+b6[2]  = P0   (least significant word)
*
*  Notation used in the comments below (all quantities unsigned):
*
*     a4 = d3:d2   upper word of the multiplicand, as two 16-bit halves
*     b4 = d1:d0   lower word of the multiplicand, as two 16-bit halves
*     a6 = m1:m0   multiplier, as two 16-bit halves
*
*     D*m0 = p2 :p1 :p0     partial product with the low multiplier half
*     D*m1 = p2':p1':p0'    partial product with the high multiplier half
*     P2:P1:P0 = D*m0 + (D*m1 << 16)
*
*  Register usage:
*
*     a10, a11, b10, b11 are pushed on the stack (b15) on entry and reloaded
*     before the return.
*     a0-a3, a8, a9, b0-b3, b7-b9 are overwritten.
*     The return address is copied from b3 to b7 because b3 is reused as a
*     temporary; the routine returns through b7.
*
*  Each group of lines joined by "||" is one execute packet.  The schedule
*  depends on the multiply, load and branch delay slots, so instructions must
*  not be moved between packets.
*
*******************************************************************************
	.def _mul64_32
	.text
_mul64_32:
* Packet 1: start the m0 cross products; build a second stack pointer so
* two registers can be pushed per packet (one through each D unit).
	mpyhlu.m1 a4, a6, a0	      ; a0 = d3 * m0
||	mpyhlu.m2x b4, a6, b0	      ; b0 = d1 * m0
||	sub.s1x b15, 4, a3	      ; a3 = sp - 4 (one word below sp)

* Packet 2: low-half products with m0; push a10 at sp, b10 at sp-4.
	mpyu.m1 a4, a6, a8	      ; a8 = d2 * m0
||	mpyu.m2x b4, a6, b8	      ; b8 = d0 * m0
||	stw.d2 a10, *b15--[2]	      ; save a10, sp -= 8
||	stw.d1 b10, *a3--[2]	      ; save b10, a3 -= 8

* Packet 3: align the d3*m0 and d1*m0 terms, clear the carry halves of the
* 40-bit pairs, start the m1 products; push a11 at sp-8, b11 at sp-12.
	shl.s1 a0, 16, a1	      ; a1 = d3*m0 << 16
||	shl.s2 b0, 16, b1	      ; b1 = d1*m0 << 16
||	zero.l1 a9		      ; zero odd register of pair a9:a8
||	zero.l2 b9		      ; zero odd register of pair b9:b8
||	mpyhu.m1 a4, a6, a2	      ; a2 = d3 * m1
||	mpyhu.m2x b4, a6, b2	      ; b2 = d1 * m1
||	stw.d2 a11, *b15--[2]	      ; save a11, sp -= 8
||	stw.d1 b11, *a3--[2]	      ; save b11

* Packet 4: form the low 32 bits (plus carry in the odd register) of each
* word-by-m0 product.
	addu.l1 a9:a8, a1, a9:a8      ; a9:a8 = d3*m0<<16 + d2*m0
||	addu.l2 b9:b8, b1, b9:b8      ; p0 = d1*m0<<16 + d0*m0 (carry in b9)
||	shru.s1 a0, 16, a0	      ; p2 = d3*m0 >> 16
||	shru.s2 b0, 16, b0	      ; b0 = d1*m0 >> 16
||	subab.d2 b1, b1, b1	      ; b1 = 0 (high half of pair b1:b0)
||	mpylhu.m1 a4, a6, a10	      ; a10 = d2 * m1
||	mpylhu.m2x b4, a6, b10	      ; b10 = d0 * m1

* Packet 5: propagate the carries.  The addu reads the old b9 (carry out
* of p0) in the same packet in which mv overwrites it.
	addab.d1 a0, a9, a0	      ; p2 = p2 + carry from a9:a8
||	addu.l2 b1:b0, b9, b1:b0      ; b1:b0 = d1*m0>>16 + carry out of p0
||	mv.s2x a8, b9		      ; b9 = low word of d3*m0<<16 + d2*m0
||	shl.s1 a2, 16, a3	      ; a3 = d3*m1 << 16
||	mpy.m2 b11, 0, b11	      ; zero odd register of pair b11:b10
||	zero.l1 a11		      ; zero odd register of pair a11:a10

* Packet 6: finish p1; b3 is about to be reused, so keep the return address.
	addu.l2 b1:b0, b9, b1:b0      ; p1 = d1*m0>>16+carry+d3*m0<<16+d2*m0
||	shl.s2 b2, 16, b3	      ; b3 = d1*m1 << 16
||	shru.s1 a2, 16, a2	      ; p2' = d3*m1 >> 16
||	addab.d2 b3, 0, b7	      ; save return address (old b3) in b7

* Packet 7: same steps as packet 4, now for the m1 products.
	add.s1x a0, b1, a0	      ; p2 = p2 + carry out of p1
||	addu.l1 a11:a10, a3, a11:a10  ; a11:a10 = d3*m1<<16 + d2*m1
||	addu.l2 b11:b10, b3, b11:b10  ; p0' = d1*m1<<16 + d0*m1 (carry in b11)
||	shru.s2 b2, 16, b2	      ; b2 = d1*m1 >> 16
||	subab.d2 b3, b3, b3	      ; b3 = 0 (high half of pair b3:b2)

* Packet 8: same steps as packet 5, now for the m1 products.
	addab.d1 a2, a11, a2	      ; p2' = p2' + carry from a11:a10
||	addu.l2 b3:b2, b11, b3:b2     ; b3:b2 = d1*m1>>16 + carry out of p0'
||	mv.s2x a10, b11 	      ; b11 = low word of d3*m1<<16 + d2*m1
||	mpy.m2 b9, 0, b9	      ; b9 = 0 (needed two packets later)

* Packet 9: finish p1'; start shifting the m1 partial product left by 16.
	addu.l2 b3:b2, b11, b3:b2     ; p1' = d1*m1>>16+carry+d3*m1<<16+d2*m1
||	shl.s2 b10, 16, b11	      ; b11 = p0' << 16
||	mpy.m2 b1, 0, b1	      ; b1 = 0 (needed two packets later)

* Packet 10: least significant result word; begin reloading saved registers.
	add.s1x a2, b3, a2	      ; p2' = p2' + carry out of p1'
||	addu.l2 b9:b8, b11, b9:b8     ; P0 = p0 + p0'<<16 (carry in b9)
||	shru.s2 b10, 16, b10	      ; b10 = p0' >> 16
||	ldw.d2 *++b15[1], b11	      ; restore b11

* Packets 11-13: accumulate the middle word P1 in b1:b0 and the high word
* P2 in a0.
	addu.l2 b1:b0, b9, b1:b0      ; b1:b0 = p1 + carry out of P0
||	shl.s2 b2, 16, b9	      ; b9 = p1' << 16
||	shl.s1 a2, 16, a2	      ; a2 = p2' << 16

	addu.l2 b1:b0, b10, b1:b0     ; b1:b0 = p1 + carry + p0'>>16
||	shru.s2 b2, 16, b2	      ; b2 = p1' >> 16
||	add.l1 a0, a2, a0	      ; a0 = p2 + p2'<<16
||	ldw.d2 *++b15[1], a11	      ; restore a11

	addu.l2 b1:b0, b9, b1:b0      ; P1 = p1'<<16 + p1 + carry + p0'>>16
||	add.l1x a0, b2, a0	      ; P2 = p2 + p2'<<16 + p1'>>16
||	b.s2 b7 		      ; return (the five packets below
				      ; fill the branch delay slots)
||	ldw.d2 *++b15[1], b10	      ; restore b10

* Branch delay slots: last restore and the three result stores.
	ldw.d2 *++b15[1], a10	      ; restore a10, sp back to entry value

	stw.d2 b8, *+b6[2]	      ; store P0 (least significant word)

	add.l1x a0, b1, a0	      ; P2 = P2 + carry out of P1
||	stw.d2 b0, *+b6[1]	      ; store P1

	stw.d2 a0, *b6		      ; store P2 (most significant word)

	nop			      ; last branch delay slot
	.end