mMathSSE.cpp
Engine/source/math/mMathSSE.cpp
Public Functions
Detailed Description
Public Functions
mInstall_Library_SSE()
1 2//----------------------------------------------------------------------------- 3// Copyright (c) 2012 GarageGames, LLC 4// 5// Permission is hereby granted, free of charge, to any person obtaining a copy 6// of this software and associated documentation files (the "Software"), to 7// deal in the Software without restriction, including without limitation the 8// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 9// sell copies of the Software, and to permit persons to whom the Software is 10// furnished to do so, subject to the following conditions: 11// 12// The above copyright notice and this permission notice shall be included in 13// all copies or substantial portions of the Software. 14// 15// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21// IN THE SOFTWARE. 22//----------------------------------------------------------------------------- 23 24#include "math/mMathFn.h" 25#include "math/mPlane.h" 26#include "math/mMatrix.h" 27 28 29#if defined(TORQUE_SUPPORTS_VC_INLINE_X86_ASM) 30#define ADD_SSE_FN 31// inlined version here. 32void SSE_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result) 33{ 34 __asm 35 { 36 mov edx, matA 37 mov ecx, matB 38 mov eax, result 39 40 movss xmm0, [edx] 41 movups xmm1, [ecx] 42 shufps xmm0, xmm0, 0 43 movss xmm2, [edx+4] 44 mulps xmm0, xmm1 45 shufps xmm2, xmm2, 0 46 movups xmm3, [ecx+10h] 47 movss xmm7, [edx+8] 48 mulps xmm2, xmm3 49 shufps xmm7, xmm7, 0 50 addps xmm0, xmm2 51 movups xmm4, [ecx+20h] 52 movss xmm2, [edx+0Ch] 53 mulps xmm7, xmm4 54 shufps xmm2, xmm2, 0 55 addps xmm0, xmm7 56 movups xmm5, [ecx+30h] 57 movss xmm6, [edx+10h] 58 mulps xmm2, xmm5 59 movss xmm7, [edx+14h] 60 shufps xmm6, xmm6, 0 61 addps xmm0, xmm2 62 shufps xmm7, xmm7, 0 63 movlps [eax], xmm0 64 movhps [eax+8], xmm0 65 mulps xmm7, xmm3 66 movss xmm0, [edx+18h] 67 mulps xmm6, xmm1 68 shufps xmm0, xmm0, 0 69 addps xmm6, xmm7 70 mulps xmm0, xmm4 71 movss xmm2, [edx+24h] 72 addps xmm6, xmm0 73 movss xmm0, [edx+1Ch] 74 movss xmm7, [edx+20h] 75 shufps xmm0, xmm0, 0 76 shufps xmm7, xmm7, 0 77 mulps xmm0, xmm5 78 mulps xmm7, xmm1 79 addps xmm6, xmm0 80 shufps xmm2, xmm2, 0 81 movlps [eax+10h], xmm6 82 movhps [eax+18h], xmm6 83 mulps xmm2, xmm3 84 movss xmm6, [edx+28h] 85 addps xmm7, xmm2 86 shufps xmm6, xmm6, 0 87 movss xmm2, [edx+2Ch] 88 mulps xmm6, xmm4 89 shufps xmm2, xmm2, 0 90 addps xmm7, xmm6 91 mulps xmm2, xmm5 92 movss xmm0, [edx+34h] 93 addps xmm7, xmm2 94 shufps xmm0, xmm0, 0 95 movlps [eax+20h], xmm7 96 movss xmm2, [edx+30h] 97 movhps [eax+28h], xmm7 98 mulps xmm0, xmm3 99 shufps xmm2, xmm2, 0 100 movss xmm6, [edx+38h] 101 mulps xmm2, xmm1 102 shufps xmm6, xmm6, 0 103 addps xmm2, xmm0 104 mulps xmm6, xmm4 105 movss xmm7, [edx+3Ch] 106 shufps xmm7, xmm7, 0 107 addps xmm2, xmm6 108 mulps xmm7, xmm5 109 addps xmm2, xmm7 110 movups [eax+30h], xmm2 111 } 112} 113void SSE_MatrixF_x_MatrixF_Aligned(const F32 *matA, const F32 *matB, F32 *result) 114{ 115 __asm 116 { 117 mov edx, matA 118 mov ecx, matB 119 mov eax, result 120 121 movss xmm0, [edx] 122 movaps xmm1, [ecx] 123 shufps xmm0, xmm0, 0 124 movss xmm2, [edx+4] 125 mulps xmm0, xmm1 126 shufps xmm2, xmm2, 0 127 movaps xmm3, [ecx+10h] 128 movss xmm7, [edx+8] 129 mulps xmm2, xmm3 130 shufps xmm7, xmm7, 0 131 addps xmm0, xmm2 132 movaps xmm4, [ecx+20h] 133 movss xmm2, [edx+0Ch] 134 mulps xmm7, xmm4 135 shufps xmm2, xmm2, 0 136 addps xmm0, xmm7 137 movaps xmm5, [ecx+30h] 138 movss xmm6, [edx+10h] 139 mulps xmm2, xmm5 140 movss xmm7, [edx+14h] 141 shufps xmm6, xmm6, 0 142 addps xmm0, xmm2 143 shufps xmm7, xmm7, 0 144 movlps [eax], xmm0 145 movhps [eax+8], xmm0 146 mulps xmm7, xmm3 147 movss xmm0, [edx+18h] 148 mulps xmm6, xmm1 149 shufps xmm0, xmm0, 0 150 addps xmm6, xmm7 151 mulps xmm0, xmm4 152 movss xmm2, [edx+24h] 153 addps xmm6, xmm0 154 movss xmm0, [edx+1Ch] 155 movss xmm7, [edx+20h] 156 shufps xmm0, xmm0, 0 157 shufps xmm7, xmm7, 0 158 mulps xmm0, xmm5 159 mulps xmm7, xmm1 160 addps xmm6, xmm0 161 shufps xmm2, xmm2, 0 162 movlps [eax+10h], xmm6 163 movhps [eax+18h], xmm6 164 mulps xmm2, xmm3 165 movss xmm6, [edx+28h] 166 addps xmm7, xmm2 167 shufps xmm6, xmm6, 0 168 movss xmm2, [edx+2Ch] 169 mulps xmm6, xmm4 170 shufps xmm2, xmm2, 0 171 addps xmm7, xmm6 172 mulps xmm2, xmm5 173 movss xmm0, [edx+34h] 174 addps xmm7, xmm2 175 shufps xmm0, xmm0, 0 176 movlps [eax+20h], xmm7 177 movss xmm2, [edx+30h] 178 movhps [eax+28h], xmm7 179 mulps xmm0, xmm3 180 shufps xmm2, xmm2, 0 181 movss xmm6, [edx+38h] 182 mulps xmm2, xmm1 183 shufps xmm6, xmm6, 0 184 addps xmm2, xmm0 185 mulps xmm6, xmm4 186 movss xmm7, [edx+3Ch] 187 shufps xmm7, xmm7, 0 188 addps xmm2, xmm6 189 mulps xmm7, xmm5 190 addps xmm2, xmm7 191 movaps [eax+30h], xmm2 192 } 193} 194// if we set our flag, we always try to build the inlined asm. 195// EXCEPT if we're in an old version of Codewarrior that can't handle SSE code. 196// TODO: the NASM implementation of SSE_MatrixF_x_MatrixF_Aligned is missing, 197// so we temporary disable this until fixed (needed for linux dedicated build) 198//#elif defined(TORQUE_SUPPORTS_NASM) 199#elif 0 200#define ADD_SSE_FN 201extern "C" 202{ 203 void SSE_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result); 204 void SSE_MatrixF_x_MatrixF_Aligned(const F32 *matA, const F32 *matB, F32 *result); 205} 206 207#elif defined( TORQUE_COMPILER_GCC ) && (defined( TORQUE_CPU_X86 ) || defined( TORQUE_CPU_X64 )) 208#define ADD_SSE_FN 209 210void SSE_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result) 211{ 212 asm 213 ( 214 "movss (%%edx),%%xmm0\n" 215 "movups (%%ecx),%%xmm1\n" 216 "shufps $0,%%xmm0,%%xmm0\n" 217 "movss 4(%%edx),%%xmm2\n" 218 "mulps %%xmm1,%%xmm0\n" 219 "shufps $0,%%xmm2,%%xmm2\n" 220 "movups 0x10(%%ecx),%%xmm3\n" 221 "movss 8(%%edx),%%xmm7\n" 222 "mulps %%xmm3,%%xmm2\n" 223 "shufps $0,%%xmm7,%%xmm7\n" 224 "addps %%xmm2,%%xmm0\n" 225 "movups 0x20(%%ecx),%%xmm4\n" 226 "movss 0x0c(%%edx),%%xmm2\n" 227 "mulps %%xmm4,%%xmm7\n" 228 "shufps $0,%%xmm2,%%xmm2\n" 229 "addps %%xmm7,%%xmm0\n" 230 "movups 0x30(%%ecx),%%xmm5\n" 231 "movss 0x10(%%edx),%%xmm6\n" 232 "mulps %%xmm5,%%xmm2\n" 233 "movss 0x14(%%edx),%%xmm7\n" 234 "shufps $0,%%xmm6,%%xmm6\n" 235 "addps %%xmm2,%%xmm0\n" 236 "shufps $0,%%xmm7,%%xmm7\n" 237 "movlps %%xmm0,(%%eax)\n" 238 "movhps %%xmm0,8(%%eax)\n" 239 "mulps %%xmm3,%%xmm7\n" 240 "movss 0x18(%%edx),%%xmm0\n" 241 "mulps %%xmm1,%%xmm6\n" 242 "shufps $0,%%xmm0,%%xmm0\n" 243 "addps %%xmm7,%%xmm6\n" 244 "mulps %%xmm4,%%xmm0\n" 245 "movss 0x24(%%edx),%%xmm2\n" 246 "addps %%xmm0,%%xmm6\n" 247 "movss 0x1c(%%edx),%%xmm0\n" 248 "movss 0x20(%%edx),%%xmm7\n" 249 "shufps $0,%%xmm0,%%xmm0\n" 250 "shufps $0,%%xmm7,%%xmm7\n" 251 "mulps %%xmm5,%%xmm0\n" 252 "mulps %%xmm1,%%xmm7\n" 253 "addps %%xmm0,%%xmm6\n" 254 "shufps $0,%%xmm2,%%xmm2\n" 255 "movlps %%xmm6,0x10(%%eax)\n" 256 "movhps %%xmm6,0x18(%%eax)\n" 257 "mulps %%xmm3,%%xmm2\n" 258 "movss 0x28(%%edx),%%xmm6\n" 259 "addps %%xmm2,%%xmm7\n" 260 "shufps $0,%%xmm6,%%xmm6\n" 261 "movss 0x2c(%%edx),%%xmm2\n" 262 "mulps %%xmm4,%%xmm6\n" 263 "shufps $0,%%xmm2,%%xmm2\n" 264 "addps %%xmm6,%%xmm7\n" 265 "mulps %%xmm5,%%xmm2\n" 266 "movss 0x34(%%edx),%%xmm0\n" 267 "addps %%xmm2,%%xmm7\n" 268 "shufps $0,%%xmm0,%%xmm0\n" 269 "movlps %%xmm7,0x20(%%eax)\n" 270 "movss 0x30(%%edx),%%xmm2\n" 271 "movhps %%xmm7,0x28(%%eax)\n" 272 "mulps %%xmm3,%%xmm0\n" 273 "shufps $0,%%xmm2,%%xmm2\n" 274 "movss 0x38(%%edx),%%xmm6\n" 275 "mulps %%xmm1,%%xmm2\n" 276 "shufps $0,%%xmm6,%%xmm6\n" 277 "addps %%xmm0,%%xmm2\n" 278 "mulps %%xmm4,%%xmm6\n" 279 "movss 0x3c(%%edx),%%xmm7\n" 280 "shufps $0,%%xmm7,%%xmm7\n" 281 "addps %%xmm6,%%xmm2\n" 282 "mulps %%xmm5,%%xmm7\n" 283 "addps %%xmm7,%%xmm2\n" 284 "movups %%xmm2,0x30(%%eax)\n" 285 286 : 287 : "d" ( matA ), 288 "c" ( matB ), 289 "a" ( result ) 290 ); 291} 292 293void SSE_MatrixF_x_MatrixF_Aligned(const F32 *matA, const F32 *matB, F32 *result) 294{ 295 asm 296 ( 297 "movss (%%edx),%%xmm0\n" 298 "movaps (%%ecx),%%xmm1\n" 299 "shufps $0,%%xmm0,%%xmm0\n" 300 "movss 4(%%edx),%%xmm2\n" 301 "mulps %%xmm1,%%xmm0\n" 302 "shufps $0,%%xmm2,%%xmm2\n" 303 "movaps 0x10(%%ecx),%%xmm3\n" 304 "movss 8(%%edx),%%xmm7\n" 305 "mulps %%xmm3,%%xmm2\n" 306 "shufps $0,%%xmm7,%%xmm7\n" 307 "addps %%xmm2,%%xmm0\n" 308 "movaps 0x20(%%ecx),%%xmm4\n" 309 "movss 0x0c(%%edx),%%xmm2\n" 310 "mulps %%xmm4,%%xmm7\n" 311 "shufps $0,%%xmm2,%%xmm2\n" 312 "addps %%xmm7,%%xmm0\n" 313 "movaps 0x30(%%ecx),%%xmm5\n" 314 "movss 0x10(%%edx),%%xmm6\n" 315 "mulps %%xmm5,%%xmm2\n" 316 "movss 0x14(%%edx),%%xmm7\n" 317 "shufps $0,%%xmm6,%%xmm6\n" 318 "addps %%xmm2,%%xmm0\n" 319 "shufps $0,%%xmm7,%%xmm7\n" 320 "movlps %%xmm0,(%%eax)\n" 321 "movhps %%xmm0,8(%%eax)\n" 322 "mulps %%xmm3,%%xmm7\n" 323 "movss 0x18(%%edx),%%xmm0\n" 324 "mulps %%xmm1,%%xmm6\n" 325 "shufps $0,%%xmm0,%%xmm0\n" 326 "addps %%xmm7,%%xmm6\n" 327 "mulps %%xmm4,%%xmm0\n" 328 "movss 0x24(%%edx),%%xmm2\n" 329 "addps %%xmm0,%%xmm6\n" 330 "movss 0x1c(%%edx),%%xmm0\n" 331 "movss 0x20(%%edx),%%xmm7\n" 332 "shufps $0,%%xmm0,%%xmm0\n" 333 "shufps $0,%%xmm7,%%xmm7\n" 334 "mulps %%xmm5,%%xmm0\n" 335 "mulps %%xmm1,%%xmm7\n" 336 "addps %%xmm0,%%xmm6\n" 337 "shufps $0,%%xmm2,%%xmm2\n" 338 "movlps %%xmm6,0x10(%%eax)\n" 339 "movhps %%xmm6,0x18(%%eax)\n" 340 "mulps %%xmm3,%%xmm2\n" 341 "movss 0x28(%%edx),%%xmm6\n" 342 "addps %%xmm2,%%xmm7\n" 343 "shufps $0,%%xmm6,%%xmm6\n" 344 "movss 0x2c(%%edx),%%xmm2\n" 345 "mulps %%xmm4,%%xmm6\n" 346 "shufps $0,%%xmm2,%%xmm2\n" 347 "addps %%xmm6,%%xmm7\n" 348 "mulps %%xmm5,%%xmm2\n" 349 "movss 0x34(%%edx),%%xmm0\n" 350 "addps %%xmm2,%%xmm7\n" 351 "shufps $0,%%xmm0,%%xmm0\n" 352 "movlps %%xmm7,0x20(%%eax)\n" 353 "movss 0x30(%%edx),%%xmm2\n" 354 "movhps %%xmm7,0x28(%%eax)\n" 355 "mulps %%xmm3,%%xmm0\n" 356 "shufps $0,%%xmm2,%%xmm2\n" 357 "movss 0x38(%%edx),%%xmm6\n" 358 "mulps %%xmm1,%%xmm2\n" 359 "shufps $0,%%xmm6,%%xmm6\n" 360 "addps %%xmm0,%%xmm2\n" 361 "mulps %%xmm4,%%xmm6\n" 362 "movss 0x3c(%%edx),%%xmm7\n" 363 "shufps $0,%%xmm7,%%xmm7\n" 364 "addps %%xmm6,%%xmm2\n" 365 "mulps %%xmm5,%%xmm7\n" 366 "addps %%xmm7,%%xmm2\n" 367 "movaps %%xmm2,0x30(%%eax)\n" 368 369 : 370 : "d" ( matA ), 371 "c" ( matB ), 372 "a" ( result ) 373 ); 374} 375 376#endif 377 378void mInstall_Library_SSE() 379{ 380#if defined(ADD_SSE_FN) 381 m_matF_x_matF = SSE_MatrixF_x_MatrixF; 382 m_matF_x_matF_aligned = SSE_MatrixF_x_MatrixF_Aligned; 383 // m_matF_x_point3F = Athlon_MatrixF_x_Point3F; 384 // m_matF_x_vectorF = Athlon_MatrixF_x_VectorF; 385#endif 386} 387