mMathAMD.cpp
Engine/source/math/mMathAMD.cpp
Public Functions
Detailed Description
Public Functions
mInstall_AMD_Math()
1 2//----------------------------------------------------------------------------- 3// Copyright (c) 2012 GarageGames, LLC 4// 5// Permission is hereby granted, free of charge, to any person obtaining a copy 6// of this software and associated documentation files (the "Software"), to 7// deal in the Software without restriction, including without limitation the 8// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 9// sell copies of the Software, and to permit persons to whom the Software is 10// furnished to do so, subject to the following conditions: 11// 12// The above copyright notice and this permission notice shall be included in 13// all copies or substantial portions of the Software. 14// 15// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21// IN THE SOFTWARE. 22//----------------------------------------------------------------------------- 23 24#include "math/mMathFn.h" 25#include "math/mPlane.h" 26#include "math/mMatrix.h" 27 28 29// extern void (*m_matF_x_point3F)(const F32 *m, const F32 *p, F32 *presult); 30// extern void (*m_matF_x_vectorF)(const F32 *m, const F32 *v, F32 *vresult); 31 32/* not currently implemented. 33void Athlon_MatrixF_x_Point3F(const F32 *m, const F32 *p, F32 *presult) 34{ 35 m; 36 p; 37 presult; 38} 39*/ 40 41//============================================================ 42// Here's the C code for MatF_x_MatF: 43// note that the code below does it in a different order (optimal asm, after all!) 
//
// r[0] = a[0]*b[0] + a[1]*b[4] + a[2]*b[8]  + a[3]*b[12];
// r[1] = a[0]*b[1] + a[1]*b[5] + a[2]*b[9]  + a[3]*b[13];
// r[2] = a[0]*b[2] + a[1]*b[6] + a[2]*b[10] + a[3]*b[14];
// r[3] = a[0]*b[3] + a[1]*b[7] + a[2]*b[11] + a[3]*b[15];
//
// r[4] = a[4]*b[0] + a[5]*b[4] + a[6]*b[8]  + a[7]*b[12];
// r[5] = a[4]*b[1] + a[5]*b[5] + a[6]*b[9]  + a[7]*b[13];
// r[6] = a[4]*b[2] + a[5]*b[6] + a[6]*b[10] + a[7]*b[14];
// r[7] = a[4]*b[3] + a[5]*b[7] + a[6]*b[11] + a[7]*b[15];
//
// r[8] = a[8]*b[0] + a[9]*b[4] + a[10]*b[8] + a[11]*b[12];
// r[9] = a[8]*b[1] + a[9]*b[5] + a[10]*b[9] + a[11]*b[13];
// r[10]= a[8]*b[2] + a[9]*b[6] + a[10]*b[10]+ a[11]*b[14];
// r[11]= a[8]*b[3] + a[9]*b[7] + a[10]*b[11]+ a[11]*b[15];
//
// r[12]= a[12]*b[0]+ a[13]*b[4]+ a[14]*b[8] + a[15]*b[12];
// r[13]= a[12]*b[1]+ a[13]*b[5]+ a[14]*b[9] + a[15]*b[13];
// r[14]= a[12]*b[2]+ a[13]*b[6]+ a[14]*b[10]+ a[15]*b[14];
// r[15]= a[12]*b[3]+ a[13]*b[7]+ a[14]*b[11]+ a[15]*b[15];
//============================================================

#if defined(TORQUE_SUPPORTS_VC_INLINE_X86_ASM)
#define ADD_3DNOW_FUNCS
// inlined version here.

/// Multiplies two 4x4 matrices of F32 using 3DNow! instructions.
///
/// Computes the same sixteen sums as the reference C code in the comment
/// above (a = matA, b = matB, r = result), one row of the result per
/// unrolled section, two result elements per MMX register.
///
/// @param matA   Left operand; 16 consecutive F32 values are read.
/// @param matB   Right operand; 16 consecutive F32 values are read, and the
///               whole matrix is re-read for every row of the result.
/// @param result Receives 16 consecutive F32 values.
///
/// Notation used in the instruction comments: "hi | lo" shows the upper and
/// lower dword of a 64-bit MMX register.  aXY / bXY / rXY name the element in
/// column X, row Y (1-based), so a21 is a[1] and b12 is b[4].
///
/// The byte offsets (+8, +16, ... +56) assume F32 is a 4-byte float.
///
/// NOTE(review): each row of result is stored before matB is read again for
/// the next row, so result presumably must not overlap matB.  Each row of
/// matA is loaded before the previous result row is stored, which suggests
/// result == matA is tolerated -- confirm against callers before relying on
/// either.
void Athlon_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result)
{
   __asm
   {
      // Enter MMX/3DNow! state (fast variant of emms).
      femms

      mov         ecx, matA
      mov         edx, matB
      mov         eax, result

      prefetch    [ecx+32]       ;// These may help -
      prefetch    [edx+32]       ;// and probably don't hurt

      // ---- Row 1 of the result: r11..r41 (r[0]..r[3]) ----
      movq        mm0,[ecx]      ;// a21 | a11
      movq        mm1,[ecx+8]    ;// a41 | a31
      movq        mm4,[edx]      ;// b21 | b11
      punpckhdq   mm2,mm0        ;// a21 | XXX  (low half is stale, replaced by the next unpack of mm2)
      movq        mm5,[edx+16]   ;// b22 | b12
      punpckhdq   mm3,mm1        ;// a41 | XXX
      movq        mm6,[edx+32]   ;// b23 | b13
      punpckldq   mm0,mm0        ;// a11 | a11
      punpckldq   mm1,mm1        ;// a31 | a31
      pfmul       mm4,mm0        ;// a11*b21 | a11*b11
      punpckhdq   mm2,mm2        ;// a21 | a21
      pfmul       mm0,[edx+8]    ;// a11*b41 | a11*b31
      movq        mm7,[edx+48]   ;// b24 | b14
      pfmul       mm5,mm2        ;// a21*b22 | a21*b12
      punpckhdq   mm3,mm3        ;// a41 | a41
      pfmul       mm2,[edx+24]   ;// a21*b42 | a21*b32
      pfmul       mm6,mm1        ;// a31*b23 | a31*b13
      pfadd       mm5,mm4        ;// a21*b22 + a11*b21 | a21*b12 + a11*b11
      pfmul       mm1,[edx+40]   ;// a31*b43 | a31*b33
      pfadd       mm2,mm0        ;// a21*b42 + a11*b41 | a21*b32 + a11*b31
      pfmul       mm7,mm3        ;// a41*b24 | a41*b14
      pfadd       mm6,mm5        ;// a21*b22 + a11*b21 + a31*b23 | a21*b12 + a11*b11 + a31*b13
      pfmul       mm3,[edx+56]   ;// a41*b44 | a41*b34
      pfadd       mm2,mm1        ;// a21*b42 + a11*b41 + a31*b43 | a21*b32 + a11*b31 + a31*b33
      pfadd       mm7,mm6        ;// a41*b24 + a21*b22 + a11*b21 + a31*b23 | a41*b14 + a21*b12 + a11*b11 + a31*b13
      movq        mm0,[ecx+16]   ;// a22 | a12  (start loading the next row of A early)
      pfadd       mm3,mm2        ;// a41*b44 + a21*b42 + a11*b41 + a31*b43 | a41*b34 + a21*b32 + a11*b31 + a31*b33
      movq        mm1,[ecx+24]   ;// a42 | a32
      movq        [eax],mm7      ;// r21 | r11
      movq        mm4,[edx]      ;// b21 | b11
      movq        [eax+8],mm3    ;// r41 | r31

      // ---- Row 2 of the result: r12..r42 (r[4]..r[7]) ----
      punpckhdq   mm2,mm0        ;// a22 | XXX
      movq        mm5,[edx+16]   ;// b22 | b12
      punpckhdq   mm3,mm1        ;// a42 | XXX
      movq        mm6,[edx+32]   ;// b23 | b13
      punpckldq   mm0,mm0        ;// a12 | a12
      punpckldq   mm1,mm1        ;// a32 | a32
      pfmul       mm4,mm0        ;// a12*b21 | a12*b11
      punpckhdq   mm2,mm2        ;// a22 | a22
      pfmul       mm0,[edx+8]    ;// a12*b41 | a12*b31
      movq        mm7,[edx+48]   ;// b24 | b14
      pfmul       mm5,mm2        ;// a22*b22 | a22*b12
      punpckhdq   mm3,mm3        ;// a42 | a42
      pfmul       mm2,[edx+24]   ;// a22*b42 | a22*b32
      pfmul       mm6,mm1        ;// a32*b23 | a32*b13
      pfadd       mm5,mm4        ;// a12*b21 + a22*b22 | a12*b11 + a22*b12
      pfmul       mm1,[edx+40]   ;// a32*b43 | a32*b33
      pfadd       mm2,mm0        ;// a12*b41 + a22*b42 | a12*b31 + a22*b32
      pfmul       mm7,mm3        ;// a42*b24 | a42*b14
      pfadd       mm6,mm5        ;// a32*b23 + a12*b21 + a22*b22 | a32*b13 + a12*b11 + a22*b12
      pfmul       mm3,[edx+56]   ;// a42*b44 | a42*b34
      pfadd       mm2,mm1        ;// a32*b43 + a12*b41 + a22*b42 | a32*b33 + a12*b31 + a22*b32
      pfadd       mm7,mm6        ;// a42*b24 + a32*b23 + a12*b21 + a22*b22 | a42*b14 + a32*b13 + a12*b11 + a22*b12
      movq        mm0,[ecx+32]   ;// a23 | a13
      pfadd       mm3,mm2        ;// a42*b44 + a32*b43 + a12*b41 + a22*b42 | a42*b34 + a32*b33 + a12*b31 + a22*b32
      movq        mm1,[ecx+40]   ;// a43 | a33
      movq        [eax+16],mm7   ;// r22 | r12
      movq        mm4,[edx]      ;// b21 | b11
      movq        [eax+24],mm3   ;// r42 | r32

      // ---- Row 3 of the result: r13..r43 (r[8]..r[11]) ----
      punpckhdq   mm2,mm0        ;// a23 | XXX
      movq        mm5,[edx+16]   ;// b22 | b12
      punpckhdq   mm3,mm1        ;// a43 | XXX
      movq        mm6,[edx+32]   ;// b23 | b13
      punpckldq   mm0,mm0        ;// a13 | a13
      punpckldq   mm1,mm1        ;// a33 | a33
      pfmul       mm4,mm0        ;// a13*b21 | a13*b11
      punpckhdq   mm2,mm2        ;// a23 | a23
      pfmul       mm0,[edx+8]    ;// a13*b41 | a13*b31
      movq        mm7,[edx+48]   ;// b24 | b14
      pfmul       mm5,mm2        ;// a23*b22 | a23*b12
      punpckhdq   mm3,mm3        ;// a43 | a43
      pfmul       mm2,[edx+24]   ;// a23*b42 | a23*b32
      pfmul       mm6,mm1        ;// a33*b23 | a33*b13
      pfadd       mm5,mm4        ;// a23*b22 + a13*b21 | a23*b12 + a13*b11
      pfmul       mm1,[edx+40]   ;// a33*b43 | a33*b33
      pfadd       mm2,mm0        ;// a13*b41 + a23*b42 | a13*b31 + a23*b32
      pfmul       mm7,mm3        ;// a43*b24 | a43*b14
      pfadd       mm6,mm5        ;// a33*b23 + a23*b22 + a13*b21 | a33*b13 + a23*b12 + a13*b11
      pfmul       mm3,[edx+56]   ;// a43*b44 | a43*b34
      pfadd       mm2,mm1        ;// a33*b43 + a13*b41 + a23*b42 | a33*b33 + a13*b31 + a23*b32
      pfadd       mm7,mm6        ;// a43*b24 + a33*b23 + a23*b22 + a13*b21 | a43*b14 + a33*b13 + a23*b12 + a13*b11
      movq        mm0,[ecx+48]   ;// a24 | a14
      pfadd       mm3,mm2        ;// a43*b44 + a33*b43 + a13*b41 + a23*b42 | a43*b34 + a33*b33 + a13*b31 + a23*b32
      movq        mm1,[ecx+56]   ;// a44 | a34
      movq        [eax+32],mm7   ;// r23 | r13
      movq        mm4,[edx]      ;// b21 | b11
      movq        [eax+40],mm3   ;// r43 | r33

      // ---- Row 4 of the result: r14..r44 (r[12]..r[15]) ----
      punpckhdq   mm2,mm0        ;// a24 | XXX
      movq        mm5,[edx+16]   ;// b22 | b12
      punpckhdq   mm3,mm1        ;// a44 | XXX
      movq        mm6,[edx+32]   ;// b23 | b13
      punpckldq   mm0,mm0        ;// a14 | a14
      punpckldq   mm1,mm1        ;// a34 | a34
      pfmul       mm4,mm0        ;// a14*b21 | a14*b11
      punpckhdq   mm2,mm2        ;// a24 | a24
      pfmul       mm0,[edx+8]    ;// a14*b41 | a14*b31
      movq        mm7,[edx+48]   ;// b24 | b14
      pfmul       mm5,mm2        ;// a24*b22 | a24*b12
      punpckhdq   mm3,mm3        ;// a44 | a44
      pfmul       mm2,[edx+24]   ;// a24*b42 | a24*b32
      pfmul       mm6,mm1        ;// a34*b23 | a34*b13
      pfadd       mm5,mm4        ;// a14*b21 + a24*b22 | a14*b11 + a24*b12
      pfmul       mm1,[edx+40]   ;// a34*b43 | a34*b33
      pfadd       mm2,mm0        ;// a14*b41 + a24*b42 | a14*b31 + a24*b32
      pfmul       mm7,mm3        ;// a44*b24 | a44*b14
      pfadd       mm6,mm5        ;// a34*b23 + a14*b21 + a24*b22 | a34*b13 + a14*b11 + a24*b12
      pfmul       mm3,[edx+56]   ;// a44*b44 | a44*b34
      pfadd       mm2,mm1        ;// a34*b43 + a14*b41 + a24*b42 | a34*b33 + a14*b31 + a24*b32
      pfadd       mm7,mm6        ;// a44*b24 + a34*b23 + a14*b21 + a24*b22 | a44*b14 + a34*b13 + a14*b11 + a24*b12
      pfadd       mm3,mm2        ;// a44*b44 + a34*b43 + a14*b41 + a24*b42 | a44*b34 + a34*b33 + a14*b31 + a24*b32
      movq        [eax+48],mm7   ;// r24 | r14
      movq        [eax+56],mm3   ;// r44 | r34

      // Leave MMX/3DNow! state so later x87 floating-point code sees a clean register stack.
      femms
   }
}
#elif defined(TORQUE_SUPPORTS_NASM)
#define ADD_3DNOW_FUNCS
// No inline assembler on this toolchain: only declare the routine here with
// C linkage.  Its definition is not in this file (presumably a NASM-assembled
// object -- confirm in the build setup).
extern "C"
{
   void Athlon_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result);
}

#endif

/// Installs the AMD-specific math routines.
///
/// When ADD_3DNOW_FUNCS is defined (i.e. one of the two assembler paths above
/// was selected) this points m_matF_x_matF at Athlon_MatrixF_x_MatrixF;
/// otherwise it does nothing.
///
/// NOTE(review): no CPU capability check is visible here; presumably the
/// caller only invokes this after detecting 3DNow! support -- confirm.
void mInstall_AMD_Math()
{
#if defined(ADD_3DNOW_FUNCS)
   m_matF_x_matF = Athlon_MatrixF_x_MatrixF;
#endif
   // The point and vector variants are not implemented, so they are not installed:
   // m_matF_x_point3F = Athlon_MatrixF_x_Point3F;
   // m_matF_x_vectorF = Athlon_MatrixF_x_VectorF;
}
