mMathSSE.cpp

Public Functions

void

mInstall_Library_SSE()

Detailed Description

Public Functions

mInstall_Library_SSE()

  1
//-----------------------------------------------------------------------------
// Copyright (c) 2012 GarageGames, LLC
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
//-----------------------------------------------------------------------------
 23
 24#include "math/mMathFn.h"
 25#include "math/mPlane.h"
 26#include "math/mMatrix.h"
 27
 28
 29#if defined(TORQUE_SUPPORTS_VC_INLINE_X86_ASM)
 30#define ADD_SSE_FN
 31// inlined version here.
 32void SSE_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result)
 33{
 34   __asm
 35   {
 36      mov         edx, matA
 37      mov         ecx, matB
 38      mov         eax, result
 39
 40      movss       xmm0, [edx]
 41      movups      xmm1, [ecx]
 42      shufps      xmm0, xmm0, 0
 43      movss       xmm2, [edx+4]
 44      mulps       xmm0, xmm1
 45      shufps      xmm2, xmm2, 0
 46      movups      xmm3, [ecx+10h]
 47      movss       xmm7, [edx+8]
 48      mulps       xmm2, xmm3
 49      shufps      xmm7, xmm7, 0
 50      addps       xmm0, xmm2
 51      movups      xmm4, [ecx+20h]
 52      movss       xmm2, [edx+0Ch]
 53      mulps       xmm7, xmm4
 54      shufps      xmm2, xmm2, 0
 55      addps       xmm0, xmm7
 56      movups      xmm5, [ecx+30h]
 57      movss       xmm6, [edx+10h]
 58      mulps       xmm2, xmm5
 59      movss       xmm7, [edx+14h]
 60      shufps      xmm6, xmm6, 0
 61      addps       xmm0, xmm2
 62      shufps      xmm7, xmm7, 0
 63      movlps      [eax], xmm0
 64      movhps      [eax+8], xmm0
 65      mulps       xmm7, xmm3
 66      movss       xmm0, [edx+18h]
 67      mulps       xmm6, xmm1
 68      shufps      xmm0, xmm0, 0
 69      addps       xmm6, xmm7
 70      mulps       xmm0, xmm4
 71      movss       xmm2, [edx+24h]
 72      addps       xmm6, xmm0
 73      movss       xmm0, [edx+1Ch]
 74      movss       xmm7, [edx+20h]
 75      shufps      xmm0, xmm0, 0
 76      shufps      xmm7, xmm7, 0
 77      mulps       xmm0, xmm5
 78      mulps       xmm7, xmm1
 79      addps       xmm6, xmm0
 80      shufps      xmm2, xmm2, 0
 81      movlps      [eax+10h], xmm6
 82      movhps      [eax+18h], xmm6
 83      mulps       xmm2, xmm3
 84      movss       xmm6, [edx+28h]
 85      addps       xmm7, xmm2
 86      shufps      xmm6, xmm6, 0
 87      movss       xmm2, [edx+2Ch]
 88      mulps       xmm6, xmm4
 89      shufps      xmm2, xmm2, 0
 90      addps       xmm7, xmm6
 91      mulps       xmm2, xmm5
 92      movss       xmm0, [edx+34h]
 93      addps       xmm7, xmm2
 94      shufps      xmm0, xmm0, 0
 95      movlps      [eax+20h], xmm7
 96      movss       xmm2, [edx+30h]
 97      movhps      [eax+28h], xmm7
 98      mulps       xmm0, xmm3
 99      shufps      xmm2, xmm2, 0
100      movss       xmm6, [edx+38h]
101      mulps       xmm2, xmm1
102      shufps      xmm6, xmm6, 0
103      addps       xmm2, xmm0
104      mulps       xmm6, xmm4
105      movss       xmm7, [edx+3Ch]
106      shufps      xmm7, xmm7, 0
107      addps       xmm2, xmm6
108      mulps       xmm7, xmm5
109      addps       xmm2, xmm7
110      movups      [eax+30h], xmm2
111   }
112}
113void SSE_MatrixF_x_MatrixF_Aligned(const F32 *matA, const F32 *matB, F32 *result)
114{
115   __asm
116   {
117      mov         edx, matA
118      mov         ecx, matB
119      mov         eax, result
120
121      movss       xmm0, [edx]
122      movaps      xmm1, [ecx]
123      shufps      xmm0, xmm0, 0
124      movss       xmm2, [edx+4]
125      mulps       xmm0, xmm1
126      shufps      xmm2, xmm2, 0
127      movaps      xmm3, [ecx+10h]
128      movss       xmm7, [edx+8]
129      mulps       xmm2, xmm3
130      shufps      xmm7, xmm7, 0
131      addps       xmm0, xmm2
132      movaps      xmm4, [ecx+20h]
133      movss       xmm2, [edx+0Ch]
134      mulps       xmm7, xmm4
135      shufps      xmm2, xmm2, 0
136      addps       xmm0, xmm7
137      movaps      xmm5, [ecx+30h]
138      movss       xmm6, [edx+10h]
139      mulps       xmm2, xmm5
140      movss       xmm7, [edx+14h]
141      shufps      xmm6, xmm6, 0
142      addps       xmm0, xmm2
143      shufps      xmm7, xmm7, 0
144      movlps      [eax], xmm0
145      movhps      [eax+8], xmm0
146      mulps       xmm7, xmm3
147      movss       xmm0, [edx+18h]
148      mulps       xmm6, xmm1
149      shufps      xmm0, xmm0, 0
150      addps       xmm6, xmm7
151      mulps       xmm0, xmm4
152      movss       xmm2, [edx+24h]
153      addps       xmm6, xmm0
154      movss       xmm0, [edx+1Ch]
155      movss       xmm7, [edx+20h]
156      shufps      xmm0, xmm0, 0
157      shufps      xmm7, xmm7, 0
158      mulps       xmm0, xmm5
159      mulps       xmm7, xmm1
160      addps       xmm6, xmm0
161      shufps      xmm2, xmm2, 0
162      movlps      [eax+10h], xmm6
163      movhps      [eax+18h], xmm6
164      mulps       xmm2, xmm3
165      movss       xmm6, [edx+28h]
166      addps       xmm7, xmm2
167      shufps      xmm6, xmm6, 0
168      movss       xmm2, [edx+2Ch]
169      mulps       xmm6, xmm4
170      shufps      xmm2, xmm2, 0
171      addps       xmm7, xmm6
172      mulps       xmm2, xmm5
173      movss       xmm0, [edx+34h]
174      addps       xmm7, xmm2
175      shufps      xmm0, xmm0, 0
176      movlps      [eax+20h], xmm7
177      movss       xmm2, [edx+30h]
178      movhps      [eax+28h], xmm7
179      mulps       xmm0, xmm3
180      shufps      xmm2, xmm2, 0
181      movss       xmm6, [edx+38h]
182      mulps       xmm2, xmm1
183      shufps      xmm6, xmm6, 0
184      addps       xmm2, xmm0
185      mulps       xmm6, xmm4
186      movss       xmm7, [edx+3Ch]
187      shufps      xmm7, xmm7, 0
188      addps       xmm2, xmm6
189      mulps       xmm7, xmm5
190      addps       xmm2, xmm7
191      movaps      [eax+30h], xmm2
192   }
193}
// If we set our flag, we always try to build the inlined asm,
// EXCEPT if we're in an old version of Codewarrior that can't handle SSE code.
// TODO: the NASM implementation of SSE_MatrixF_x_MatrixF_Aligned is missing,
// so we temporarily disable this until fixed (needed for linux dedicated build)
198//#elif defined(TORQUE_SUPPORTS_NASM)
199#elif 0
200#define ADD_SSE_FN
201extern "C"
202{
203   void SSE_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result);
204   void SSE_MatrixF_x_MatrixF_Aligned(const F32 *matA, const F32 *matB, F32 *result);
205}
206
207#elif defined( TORQUE_COMPILER_GCC ) && (defined( TORQUE_CPU_X86 ) || defined( TORQUE_CPU_X64 ))
208#define ADD_SSE_FN
209
210void SSE_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result)
211{
212   asm
213   (
214      "movss      (%%edx),%%xmm0\n"
215      "movups     (%%ecx),%%xmm1\n"
216      "shufps     $0,%%xmm0,%%xmm0\n"      
217      "movss      4(%%edx),%%xmm2\n"
218      "mulps      %%xmm1,%%xmm0\n"
219      "shufps     $0,%%xmm2,%%xmm2\n"
220      "movups     0x10(%%ecx),%%xmm3\n"
221      "movss      8(%%edx),%%xmm7\n"
222      "mulps      %%xmm3,%%xmm2\n"
223      "shufps     $0,%%xmm7,%%xmm7\n"
224      "addps      %%xmm2,%%xmm0\n"
225      "movups     0x20(%%ecx),%%xmm4\n"
226      "movss      0x0c(%%edx),%%xmm2\n"
227      "mulps      %%xmm4,%%xmm7\n"
228      "shufps     $0,%%xmm2,%%xmm2\n"
229      "addps      %%xmm7,%%xmm0\n"
230      "movups     0x30(%%ecx),%%xmm5\n"
231      "movss      0x10(%%edx),%%xmm6\n"
232      "mulps      %%xmm5,%%xmm2\n"
233      "movss      0x14(%%edx),%%xmm7\n"
234      "shufps     $0,%%xmm6,%%xmm6\n"
235      "addps      %%xmm2,%%xmm0\n"
236      "shufps     $0,%%xmm7,%%xmm7\n"
237      "movlps     %%xmm0,(%%eax)\n"
238      "movhps     %%xmm0,8(%%eax)\n"
239      "mulps      %%xmm3,%%xmm7\n"
240      "movss      0x18(%%edx),%%xmm0\n"
241      "mulps      %%xmm1,%%xmm6\n"
242      "shufps     $0,%%xmm0,%%xmm0\n"
243      "addps      %%xmm7,%%xmm6\n"
244      "mulps      %%xmm4,%%xmm0\n"
245      "movss      0x24(%%edx),%%xmm2\n"
246      "addps      %%xmm0,%%xmm6\n"
247      "movss      0x1c(%%edx),%%xmm0\n"
248      "movss      0x20(%%edx),%%xmm7\n"
249      "shufps     $0,%%xmm0,%%xmm0\n"
250      "shufps     $0,%%xmm7,%%xmm7\n"
251      "mulps      %%xmm5,%%xmm0\n"
252      "mulps      %%xmm1,%%xmm7\n"
253      "addps      %%xmm0,%%xmm6\n"
254      "shufps     $0,%%xmm2,%%xmm2\n"
255      "movlps     %%xmm6,0x10(%%eax)\n"
256      "movhps     %%xmm6,0x18(%%eax)\n"
257      "mulps      %%xmm3,%%xmm2\n"
258      "movss      0x28(%%edx),%%xmm6\n"
259      "addps      %%xmm2,%%xmm7\n"
260      "shufps     $0,%%xmm6,%%xmm6\n"
261      "movss      0x2c(%%edx),%%xmm2\n"
262      "mulps      %%xmm4,%%xmm6\n"
263      "shufps     $0,%%xmm2,%%xmm2\n"
264      "addps      %%xmm6,%%xmm7\n"
265      "mulps      %%xmm5,%%xmm2\n"
266      "movss      0x34(%%edx),%%xmm0\n"
267      "addps      %%xmm2,%%xmm7\n"
268      "shufps     $0,%%xmm0,%%xmm0\n"
269      "movlps     %%xmm7,0x20(%%eax)\n"
270      "movss      0x30(%%edx),%%xmm2\n"
271      "movhps     %%xmm7,0x28(%%eax)\n"
272      "mulps      %%xmm3,%%xmm0\n"
273      "shufps     $0,%%xmm2,%%xmm2\n"
274      "movss      0x38(%%edx),%%xmm6\n"
275      "mulps      %%xmm1,%%xmm2\n"
276      "shufps     $0,%%xmm6,%%xmm6\n"
277      "addps      %%xmm0,%%xmm2\n"
278      "mulps      %%xmm4,%%xmm6\n"
279      "movss      0x3c(%%edx),%%xmm7\n"
280      "shufps     $0,%%xmm7,%%xmm7\n"
281      "addps      %%xmm6,%%xmm2\n"
282      "mulps      %%xmm5,%%xmm7\n"
283      "addps      %%xmm7,%%xmm2\n"
284      "movups     %%xmm2,0x30(%%eax)\n"
285      
286      :
287      : "d" ( matA ),
288        "c" ( matB ),
289        "a" ( result )
290   );
291}
292
293void SSE_MatrixF_x_MatrixF_Aligned(const F32 *matA, const F32 *matB, F32 *result)
294{
295   asm
296      (
297      "movss      (%%edx),%%xmm0\n"
298      "movaps     (%%ecx),%%xmm1\n"
299      "shufps     $0,%%xmm0,%%xmm0\n"      
300      "movss      4(%%edx),%%xmm2\n"
301      "mulps      %%xmm1,%%xmm0\n"
302      "shufps     $0,%%xmm2,%%xmm2\n"
303      "movaps     0x10(%%ecx),%%xmm3\n"
304      "movss      8(%%edx),%%xmm7\n"
305      "mulps      %%xmm3,%%xmm2\n"
306      "shufps     $0,%%xmm7,%%xmm7\n"
307      "addps      %%xmm2,%%xmm0\n"
308      "movaps     0x20(%%ecx),%%xmm4\n"
309      "movss      0x0c(%%edx),%%xmm2\n"
310      "mulps      %%xmm4,%%xmm7\n"
311      "shufps     $0,%%xmm2,%%xmm2\n"
312      "addps      %%xmm7,%%xmm0\n"
313      "movaps     0x30(%%ecx),%%xmm5\n"
314      "movss      0x10(%%edx),%%xmm6\n"
315      "mulps      %%xmm5,%%xmm2\n"
316      "movss      0x14(%%edx),%%xmm7\n"
317      "shufps     $0,%%xmm6,%%xmm6\n"
318      "addps      %%xmm2,%%xmm0\n"
319      "shufps     $0,%%xmm7,%%xmm7\n"
320      "movlps     %%xmm0,(%%eax)\n"
321      "movhps     %%xmm0,8(%%eax)\n"
322      "mulps      %%xmm3,%%xmm7\n"
323      "movss      0x18(%%edx),%%xmm0\n"
324      "mulps      %%xmm1,%%xmm6\n"
325      "shufps     $0,%%xmm0,%%xmm0\n"
326      "addps      %%xmm7,%%xmm6\n"
327      "mulps      %%xmm4,%%xmm0\n"
328      "movss      0x24(%%edx),%%xmm2\n"
329      "addps      %%xmm0,%%xmm6\n"
330      "movss      0x1c(%%edx),%%xmm0\n"
331      "movss      0x20(%%edx),%%xmm7\n"
332      "shufps     $0,%%xmm0,%%xmm0\n"
333      "shufps     $0,%%xmm7,%%xmm7\n"
334      "mulps      %%xmm5,%%xmm0\n"
335      "mulps      %%xmm1,%%xmm7\n"
336      "addps      %%xmm0,%%xmm6\n"
337      "shufps     $0,%%xmm2,%%xmm2\n"
338      "movlps     %%xmm6,0x10(%%eax)\n"
339      "movhps     %%xmm6,0x18(%%eax)\n"
340      "mulps      %%xmm3,%%xmm2\n"
341      "movss      0x28(%%edx),%%xmm6\n"
342      "addps      %%xmm2,%%xmm7\n"
343      "shufps     $0,%%xmm6,%%xmm6\n"
344      "movss      0x2c(%%edx),%%xmm2\n"
345      "mulps      %%xmm4,%%xmm6\n"
346      "shufps     $0,%%xmm2,%%xmm2\n"
347      "addps      %%xmm6,%%xmm7\n"
348      "mulps      %%xmm5,%%xmm2\n"
349      "movss      0x34(%%edx),%%xmm0\n"
350      "addps      %%xmm2,%%xmm7\n"
351      "shufps     $0,%%xmm0,%%xmm0\n"
352      "movlps     %%xmm7,0x20(%%eax)\n"
353      "movss      0x30(%%edx),%%xmm2\n"
354      "movhps     %%xmm7,0x28(%%eax)\n"
355      "mulps      %%xmm3,%%xmm0\n"
356      "shufps     $0,%%xmm2,%%xmm2\n"
357      "movss      0x38(%%edx),%%xmm6\n"
358      "mulps      %%xmm1,%%xmm2\n"
359      "shufps     $0,%%xmm6,%%xmm6\n"
360      "addps      %%xmm0,%%xmm2\n"
361      "mulps      %%xmm4,%%xmm6\n"
362      "movss      0x3c(%%edx),%%xmm7\n"
363      "shufps     $0,%%xmm7,%%xmm7\n"
364      "addps      %%xmm6,%%xmm2\n"
365      "mulps      %%xmm5,%%xmm7\n"
366      "addps      %%xmm7,%%xmm2\n"
367      "movaps     %%xmm2,0x30(%%eax)\n"
368
369      :
370   : "d" ( matA ),
371      "c" ( matB ),
372      "a" ( result )
373      );
374}
375
376#endif
377
// Installs the SSE matrix-multiply routines by assigning them to
// m_matF_x_matF and m_matF_x_matF_aligned.
// When no SSE implementation was compiled in this file (ADD_SSE_FN is not
// defined) the function body is empty and nothing is changed.
void mInstall_Library_SSE()
{
#ifdef ADD_SSE_FN
   m_matF_x_matF_aligned = &SSE_MatrixF_x_MatrixF_Aligned;
   m_matF_x_matF         = &SSE_MatrixF_x_MatrixF;

   // Left disabled, as in the original:
   //    m_matF_x_point3F = Athlon_MatrixF_x_Point3F;
   //    m_matF_x_vectorF = Athlon_MatrixF_x_VectorF;
#endif
}
387