mMathAMD.cpp

Engine/source/math/mMathAMD.cpp

More...

Public Functions

Detailed Description

Public Functions

mInstall_AMD_Math()

  1
  2//-----------------------------------------------------------------------------
  3// Copyright (c) 2012 GarageGames, LLC
  4//
  5// Permission is hereby granted, free of charge, to any person obtaining a copy
  6// of this software and associated documentation files (the "Software"), to
  7// deal in the Software without restriction, including without limitation the
  8// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  9// sell copies of the Software, and to permit persons to whom the Software is
 10// furnished to do so, subject to the following conditions:
 11//
 12// The above copyright notice and this permission notice shall be included in
 13// all copies or substantial portions of the Software.
 14//
 15// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 21// IN THE SOFTWARE.
 22//-----------------------------------------------------------------------------
 23
 24#include "math/mMathFn.h"
 25#include "math/mPlane.h"
 26#include "math/mMatrix.h"
 27
 28
 29// extern void (*m_matF_x_point3F)(const F32 *m, const F32 *p, F32 *presult);
 30// extern void (*m_matF_x_vectorF)(const F32 *m, const F32 *v, F32 *vresult);
 31
 32/* not currently implemented.
 33void Athlon_MatrixF_x_Point3F(const F32 *m, const F32 *p, F32 *presult)
 34{
 35   m;
 36   p;
 37   presult;
 38}
 39*/
 40
 41//============================================================
 42//  Here's the C code for MatF_x_MatF:
 43//  note that the code below does it in a different order (optimal asm, after all!)
 44//
 45// r[0] = a[0]*b[0] + a[1]*b[4] + a[2]*b[8]  + a[3]*b[12];
 46// r[1] = a[0]*b[1] + a[1]*b[5] + a[2]*b[9]  + a[3]*b[13];
 47// r[2] = a[0]*b[2] + a[1]*b[6] + a[2]*b[10] + a[3]*b[14];
 48// r[3] = a[0]*b[3] + a[1]*b[7] + a[2]*b[11] + a[3]*b[15];
 49//
 50// r[4] = a[4]*b[0] + a[5]*b[4] + a[6]*b[8]  + a[7]*b[12];
 51// r[5] = a[4]*b[1] + a[5]*b[5] + a[6]*b[9]  + a[7]*b[13];
 52// r[6] = a[4]*b[2] + a[5]*b[6] + a[6]*b[10] + a[7]*b[14];
 53// r[7] = a[4]*b[3] + a[5]*b[7] + a[6]*b[11] + a[7]*b[15];
 54//
 55// r[8] = a[8]*b[0] + a[9]*b[4] + a[10]*b[8] + a[11]*b[12];
 56// r[9] = a[8]*b[1] + a[9]*b[5] + a[10]*b[9] + a[11]*b[13];
 57// r[10]= a[8]*b[2] + a[9]*b[6] + a[10]*b[10]+ a[11]*b[14];
 58// r[11]= a[8]*b[3] + a[9]*b[7] + a[10]*b[11]+ a[11]*b[15];
 59//
 60// r[12]= a[12]*b[0]+ a[13]*b[4]+ a[14]*b[8] + a[15]*b[12];
 61// r[13]= a[12]*b[1]+ a[13]*b[5]+ a[14]*b[9] + a[15]*b[13];
 62// r[14]= a[12]*b[2]+ a[13]*b[6]+ a[14]*b[10]+ a[15]*b[14];
 63// r[15]= a[12]*b[3]+ a[13]*b[7]+ a[14]*b[11]+ a[15]*b[15];
 64//============================================================
 65
 66#if defined(TORQUE_SUPPORTS_VC_INLINE_X86_ASM)
 67#define ADD_3DNOW_FUNCS
 68// Inlined MSVC x86 (__asm) implementation built on 3DNow!/MMX instructions.
   //
   // Computes the 4x4 single-precision product given by the C reference earlier
   // in this file:
   //    result[4*i + j] = sum over k of matA[4*i + k] * matB[4*k + j]
   //
   // The routine is four unrolled passes. Pass i reads the four floats at
   // matA + 16*i bytes, multiplies them against all sixteen floats of matB and
   // stores four floats at result + 16*i bytes. Loads for the next pass are
   // interleaved with the tail of the current one.
   //
   // Notation used in the per-instruction comments:
   //    "hi | lo"  - upper and lower 32-bit lanes of the 64-bit MMX register
   //    xIJ        - element x[4*(J-1) + (I-1)], e.g. a21 is matA[1], b12 is matB[4]
   //    XXX        - lane holds leftover data that is overwritten before use
   //
   // NOTE(review): result rows are stored while matB is still being re-read by
   // later passes, so result presumably must not overlap matB - confirm callers.
 69void Athlon_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result)
 70{
 71   __asm
 72   {
 73      femms                         // 3DNow! fast EMMS: reset MMX/x87 state before using mm0-mm7
 74      // ecx = matA, edx = matB, eax = result for the rest of the routine
 75      mov         ecx, matA
 76      mov         edx, matB
 77      mov         eax, result
 78
 79      prefetch    [ecx+32]       ;// These may help -
 80      prefetch    [edx+32]       ;//    and probably don't hurt
 81      // ---- Pass 1: matA[0..3] x matB -> result[0..3] ----
 82      movq        mm0,[ecx]      ;// a21   | a11
 83      movq        mm1,[ecx+8]      ;// a41   | a31
 84      movq        mm4,[edx]      ;// b21   | b11
 85      punpckhdq   mm2,mm0         ;// a21   | XXX
 86      movq        mm5,[edx+16]   ;// b22   | b12
 87      punpckhdq   mm3,mm1         ;// a41   | XXX
 88      movq        mm6,[edx+32]   ;// b23   | b13
 89      punpckldq   mm0,mm0         ;// a11   | a11
 90      punpckldq   mm1,mm1         ;// a31   | a31
 91      pfmul       mm4,mm0         ;// a11*b21 | a11*b11
 92      punpckhdq   mm2,mm2         ;// a21   | a21
 93      pfmul       mm0,[edx+8]      ;// a11*b41 | a11*b31
 94      movq        mm7,[edx+48]   ;// b24   | b14
 95      pfmul       mm5,mm2         ;// a21*b22 | a21*b12
 96      punpckhdq   mm3,mm3         ;// a41   | a41
 97      pfmul       mm2,[edx+24]   ;// a21*b42 | a21*b32
 98      pfmul       mm6,mm1         ;// a31*b23 | a31*b13
 99      pfadd       mm5,mm4         ;// a21*b22 + a11*b21 | a21*b12 + a11*b11
100      pfmul       mm1,[edx+40]   ;// a31*b43 | a31*b33
101      pfadd       mm2,mm0         ;// a21*b42 + a11*b41 | a21*b32 + a11*b31
102      pfmul       mm7,mm3         ;// a41*b24 | a41*b14
103      pfadd       mm6,mm5         ;// a21*b22 + a11*b21 + a31*b23 | a21*b12 + a11*b11 + a31*b13
104      pfmul       mm3,[edx+56]   ;// a41*b44 | a41*b34
105      pfadd       mm2,mm1         ;// a21*b42 + a11*b41 + a31*b43 | a21*b32 + a11*b31 + a31*b33
106      pfadd       mm7,mm6         ;// a41*b24 + a21*b22 + a11*b21 + a31*b23 |  a41*b14 + a21*b12 + a11*b11 + a31*b13
107      movq        mm0,[ecx+16]   ;// a22   | a12   (next pass's matA load, issued early)
108      pfadd       mm3,mm2         ;// a41*b44 + a21*b42 + a11*b41 + a31*b43 | a41*b34 + a21*b32 + a11*b31 + a31*b33
109      movq        mm1,[ecx+24]   ;// a42   | a32
110      movq        [eax],mm7      ;// r21   | r11
111      movq        mm4,[edx]      ;// b21   | b11
112      movq        [eax+8],mm3      ;// r41   | r31
113      // ---- Pass 2: matA[4..7] x matB -> result[4..7] ----
114      punpckhdq   mm2,mm0         ;// a22   | XXX
115      movq        mm5,[edx+16]   ;// b22   | b12
116      punpckhdq   mm3,mm1         ;// a42   | XXX
117      movq        mm6,[edx+32]   ;// b23   | b13
118      punpckldq   mm0,mm0         ;// a12   | a12
119      punpckldq   mm1,mm1         ;// a32   | a32
120      pfmul       mm4,mm0         ;// a12*b21 | a12*b11
121      punpckhdq   mm2,mm2         ;// a22   | a22
122      pfmul       mm0,[edx+8]      ;// a12*b41 | a12*b31
123      movq        mm7,[edx+48]   ;// b24   | b14
124      pfmul       mm5,mm2         ;// a22*b22 | a22*b12
125      punpckhdq   mm3,mm3         ;// a42   | a42
126      pfmul       mm2,[edx+24]   ;// a22*b42 | a22*b32
127      pfmul       mm6,mm1         ;// a32*b23 | a32*b13
128      pfadd       mm5,mm4         ;// a12*b21 + a22*b22 | a12*b11 + a22*b12
129      pfmul       mm1,[edx+40]   ;// a32*b43 | a32*b33
130      pfadd       mm2,mm0         ;// a12*b41 + a22*b42 | a12*b31 + a22*b32
131      pfmul       mm7,mm3         ;// a42*b24 | a42*b14
132      pfadd       mm6,mm5         ;// a32*b23 + a12*b21 + a22*b22 | a32*b13 + a12*b11 + a22*b12
133      pfmul       mm3,[edx+56]   ;// a42*b44 | a42*b34
134      pfadd       mm2,mm1         ;// a32*b43 + a12*b41 + a22*b42 | a32*b33 + a12*b31 + a22*b32
135      pfadd       mm7,mm6         ;// a42*b24 + a32*b23 + a12*b21 + a22*b22 | a42*b14 + a32*b13 + a12*b11 + a22*b12
136      movq        mm0,[ecx+32]   ;// a23 | a13   (next pass's matA load, issued early)
137      pfadd       mm3,mm2         ;// a42*b44 + a32*b43 + a12*b41 + a22*b42 | a42*b34 + a32*b33 + a12*b31 + a22*b32
138      movq        mm1,[ecx+40]   ;// a43 | a33
139      movq        [eax+16],mm7   ;// r22 | r12
140      movq        mm4,[edx]      ;// b21   | b11
141      movq        [eax+24],mm3   ;// r42 | r32
142      // ---- Pass 3: matA[8..11] x matB -> result[8..11] ----
143      punpckhdq   mm2,mm0         ;// a23 | XXX
144      movq        mm5,[edx+16]   ;// b22 | b12
145      punpckhdq   mm3,mm1         ;// a43 | XXX
146      movq        mm6,[edx+32]   ;// b23 | b13
147      punpckldq   mm0,mm0         ;// a13 | a13
148      punpckldq   mm1,mm1         ;// a33 | a33
149      pfmul       mm4,mm0         ;// a13*b21 | a13*b11
150      punpckhdq   mm2,mm2         ;// a23 | a23
151      pfmul       mm0,[edx+8]      ;// a13*b41 | a13*b31
152      movq        mm7,[edx+48]   ;// b24 | b14
153      pfmul       mm5,mm2         ;// a23*b22 | a23*b12
154      punpckhdq   mm3,mm3         ;// a43 | a43
155      pfmul       mm2,[edx+24]   ;// a23*b42 | a23*b32
156      pfmul       mm6,mm1         ;// a33*b23 | a33*b13
157      pfadd       mm5,mm4         ;// a23*b22 + a13*b21 | a23*b12 + a13*b11
158      pfmul       mm1,[edx+40]   ;// a33*b43 | a33*b33
159      pfadd       mm2,mm0         ;// a13*b41 + a23*b42 | a13*b31 + a23*b32
160      pfmul       mm7,mm3         ;// a43*b24 | a43*b14
161      pfadd       mm6,mm5         ;// a33*b23 + a23*b22 + a13*b21 | a33*b13 + a23*b12 + a13*b11
162      pfmul       mm3,[edx+56]   ;// a43*b44 | a43*b34
163      pfadd       mm2,mm1         ;// a33*b43 + a13*b41 + a23*b42 | a33*b33 + a13*b31 + a23*b32
164      pfadd       mm7,mm6         ;// a43*b24 + a33*b23 + a23*b22 + a13*b21 | a43*b14 + a33*b13 + a23*b12 + a13*b11
165      movq        mm0,[ecx+48]   ;// a24 | a14   (next pass's matA load, issued early)
166      pfadd       mm3,mm2         ;// a43*b44 + a33*b43 + a13*b41 + a23*b42 | a43*b34 + a33*b33 + a13*b31 + a23*b32
167      movq        mm1,[ecx+56]   ;// a44 | a34
168      movq        [eax+32],mm7   ;// r23 | r13
169      movq        mm4,[edx]      ;// b21 | b11
170      movq        [eax+40],mm3   ;// r43 | r33
171      // ---- Pass 4: matA[12..15] x matB -> result[12..15] ----
172      punpckhdq   mm2,mm0         ;// a24 | XXX
173      movq        mm5,[edx+16]   ;// b22 | b12
174      punpckhdq   mm3,mm1         ;// a44 | XXX
175      movq        mm6,[edx+32]   ;// b23 | b13
176      punpckldq   mm0,mm0         ;// a14 | a14
177      punpckldq   mm1,mm1         ;// a34 | a34
178      pfmul       mm4,mm0         ;// a14*b21 | a14*b11
179      punpckhdq   mm2,mm2         ;// a24 | a24
180      pfmul       mm0,[edx+8]      ;// a14*b41 | a14*b31
181      movq        mm7,[edx+48]   ;// b24 | b14
182      pfmul       mm5,mm2         ;// a24*b22 | a24*b12
183      punpckhdq   mm3,mm3         ;// a44 | a44
184      pfmul       mm2,[edx+24]   ;// a24*b42 | a24*b32
185      pfmul       mm6,mm1         ;// a34*b23 | a34*b13
186      pfadd       mm5,mm4         ;// a14*b21 + a24*b22 | a14*b11 + a24*b12
187      pfmul       mm1,[edx+40]   ;// a34*b43 | a34*b33
188      pfadd       mm2,mm0         ;// a14*b41 + a24*b42 | a14*b31 + a24*b32
189      pfmul       mm7,mm3         ;// a44*b24 | a44*b14
190      pfadd       mm6,mm5         ;// a34*b23 + a14*b21 + a24*b22 | a34*b13 + a14*b11 + a24*b12
191      pfmul       mm3,[edx+56]   ;// a44*b44 | a44*b34
192      pfadd       mm2,mm1         ;// a34*b43 + a14*b41 + a24*b42 | a34*b33 + a14*b31 + a24*b32
193      pfadd       mm7,mm6         ;// a44*b24 + a34*b23 + a14*b21 + a24*b22 | a44*b14 + a34*b13 + a14*b11 + a24*b12
194      pfadd       mm3,mm2         ;// a44*b44 + a34*b43 + a14*b41 + a24*b42 | a44*b34 + a34*b33 + a14*b31 + a24*b32
195      movq        [eax+48],mm7   ;// r24 | r14
196      movq        [eax+56],mm3   ;// r44 | r34
197      femms                         // leave MMX state clean so later x87 code can run
198   }
199}
200#elif defined(TORQUE_SUPPORTS_NASM)
201#define ADD_3DNOW_FUNCS
202extern "C"   // C linkage for the routine declared below
203{
      // Declaration only: this preprocessor branch provides no body in this file.
      // NOTE(review): presumably the definition comes from a NASM-assembled
      // object, as the TORQUE_SUPPORTS_NASM guard suggests - confirm in the build.
      // Same signature as the inline-asm version in the other branch.
204   void Athlon_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result);
205}
206
207#endif
208
209void mInstall_AMD_Math()
210{
      // Point the matrix-multiply function pointer at the Athlon routine, but
      // only in builds where ADD_3DNOW_FUNCS was defined (i.e. the routine was
      // defined or declared earlier in this file). Otherwise nothing is installed.
211#ifdef ADD_3DNOW_FUNCS
212   m_matF_x_matF = &Athlon_MatrixF_x_MatrixF;
213#endif
      // The hooks below stay disabled; Athlon_MatrixF_x_Point3F is only a
      // commented-out stub in this file and no VectorF routine appears here.
214   // m_matF_x_point3F = Athlon_MatrixF_x_Point3F;
215   // m_matF_x_vectorF = Athlon_MatrixF_x_VectorF;
216}
217
218