/*
 * Copyright 2014 Martin Peres <martin.peres@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
*
 * Authors: Martin Peres
 */

/******************************************************************************
 * arith data segment
 *****************************************************************************/
#ifdef INCLUDE_PROC
#endif

#ifdef INCLUDE_DATA
#endif

/******************************************************************************
 * arith code segment
 *****************************************************************************/
#ifdef INCLUDE_CODE

// mulu32_32_64 - unsigned 32x32 -> 64 multiplication
//
// The 64-bit product is assembled from the four partial products of the
// 16-bit halves of A and B:
//
// A * B = A_lo * B_lo
//         + ( A_hi * B_lo ) << 16
//         + ( A_lo * B_hi ) << 16
//         + ( A_hi * B_hi ) << 32
//
// NOTE(review): A ($r14) and B ($r13) are passed to mulu unmasked wherever
// A_lo / B_lo is wanted, so this presumably relies on mulu multiplying only
// the low 16 bits of each source operand - confirm against the ISA reference.
//
// $r15 - current (not referenced by this routine)
// $r14 - A (input, only read)
// $r13 - B (input, only read)
// $r12 - mul_lo (return): low 32 bits of the product
// $r11 - mul_hi (return): high 32 bits of the product
// $r0  - zero (not referenced by this routine)
//
// $r1-$r4 are used as scratch; they are pushed on entry and popped in
// reverse order before ret.
mulu32_32_64:
	push $r1 // A_hi
	push $r2 // B_hi
	push $r3 // tmp0
	push $r4 // tmp1

	shr b32 $r1 $r14 16 // A_hi = A >> 16
	shr b32 $r2 $r13 16 // B_hi = B >> 16

	// start from a zeroed 64-bit accumulator (mul_hi:mul_lo)
	// NOTE(review): $r12 is written again by the first mulu below, so only
	// the $r11 clear looks strictly required - confirm before touching
	clear b32 $r12
	clear b32 $r11

	// A_lo * B_lo
	mulu $r12 $r14 $r13 // mul_lo = A_lo * B_lo

	// ( A_hi * B_lo ) << 16
	// the shifted partial product straddles both result words, so it is
	// split into a low piece for mul_lo and a high piece for mul_hi
	mulu $r3 $r1 $r13 // tmp0 = A_hi * B_lo
	mov b32 $r4 $r3 // tmp1 = copy of the partial product
	// NOTE(review): the shl by 16 below drops the upper 16 bits anyway, so
	// this mask looks redundant - confirm before touching
	and $r3 0xffff // tmp0 = tmp0_lo
	shl b32 $r3 16 // tmp0 = tmp0_lo << 16 (piece that lands in mul_lo)
	shr b32 $r4 16 // tmp1 = tmp0_hi (piece that lands in mul_hi)
	add b32 $r12 $r3 // mul_lo += tmp0
	adc b32 $r11 $r4 // mul_hi += tmp1, plus the carry out of the add above

	// ( A_lo * B_hi ) << 16
	// same split as for the previous partial product
	mulu $r3 $r14 $r2 // tmp0 = A_lo * B_hi
	mov b32 $r4 $r3 // tmp1 = copy of the partial product
	and $r3 0xffff // tmp0 = tmp0_lo
	shl b32 $r3 16 // tmp0 = tmp0_lo << 16 (piece that lands in mul_lo)
	shr b32 $r4 16 // tmp1 = tmp0_hi (piece that lands in mul_hi)
	add b32 $r12 $r3 // mul_lo += tmp0
	adc b32 $r11 $r4 // mul_hi += tmp1, plus the carry out of the add above

	// ( A_hi * B_hi ) << 32
	// shifted by a full word, so this term only contributes to mul_hi
	mulu $r3 $r1 $r2 // tmp0 = A_hi * B_hi
	add b32 $r11 $r3 // mul_hi += tmp0

	// restore the scratch registers in reverse push order
	pop $r4
	pop $r3
	pop $r2
	pop $r1
	ret
#endif