oggTheoraDecoder.cpp
Engine/source/core/ogg/oggTheoraDecoder.cpp
Public Defines
define
ycbcrToRGB(rgb, pY, pCb, pCr, G) { \ GFXPackPixel( \ mPacketFormat.mFormat, \ rgb, \ sClamp[ sRGBY[ *pY ][ 2 ] + sRGBCr[ *pCr ][ 2 ] ], \ sClamp[ sRGBY[ *pY ][ 1 ] + G ], \ sClamp[ sRGBY[ *pY ][ 0 ] + sRGBCb[ *pCb ][ 0 ] ], \ 255 \ ); \ }
Public Variables
sClampBuff [1024]
Detailed Description
Public Defines
ycbcrToRGB(rgb, pY, pCb, pCr, G) { \ GFXPackPixel( \ mPacketFormat.mFormat, \ rgb, \ sClamp[ sRGBY[ *pY ][ 2 ] + sRGBCr[ *pCr ][ 2 ] ], \ sClamp[ sRGBY[ *pY ][ 1 ] + G ], \ sClamp[ sRGBY[ *pY ][ 0 ] + sRGBCb[ *pCb ][ 0 ] ], \ 255 \ ); \ }
Public Variables
U8 * sClamp
U8 sClampBuff [1024]
Public Functions
dALIGN(static S32 sRGBY)
initLookupTables()
sampleG(U8 * pCb, U8 * pCr)
1 2//----------------------------------------------------------------------------- 3// Copyright (c) 2012 GarageGames, LLC 4// 5// Permission is hereby granted, free of charge, to any person obtaining a copy 6// of this software and associated documentation files (the "Software"), to 7// deal in the Software without restriction, including without limitation the 8// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 9// sell copies of the Software, and to permit persons to whom the Software is 10// furnished to do so, subject to the following conditions: 11// 12// The above copyright notice and this permission notice shall be included in 13// all copies or substantial portions of the Software. 14// 15// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21// IN THE SOFTWARE. 22//----------------------------------------------------------------------------- 23 24#include "platform/platform.h" 25#include "core/ogg/oggTheoraDecoder.h" 26 27#include "gfx/gfxFormatUtils.h" 28#include "math/mMathFn.h" 29#include "console/console.h" 30 31 32//#define DEBUG_SPEW 33 34 35//----------------------------------------------------------------------------- 36 37// Lookup tables for the transcoders. 38// 39// The Y, Cb, and Cr tables are used by both the SSE2 and the generic transcoder. 40// For the SSE2 code, the data must be 16 byte aligned. 41// 42// The clamping table is only used by the generic transcoder. The SSE2 transcoder 43// uses instructions to implicitly clamp out-of-range values. 
44 45dALIGN( static S32 sRGBY[ 256 ][ 4 ] ); 46dALIGN( static S32 sRGBCb[ 256 ][ 4 ] ); 47dALIGN( static S32 sRGBCr[ 256 ][ 4 ] ); 48 49static U8 sClampBuff[ 1024 ]; 50static U8* sClamp = sClampBuff + 384; 51 52static void initLookupTables() 53{ 54 static bool sGenerated = false; 55 if( !sGenerated ) 56 { 57 for( S32 i = 0; i < 256; ++ i ) 58 { 59 // Y. 60 61 sRGBY[ i ][ 0 ] = ( 298 * ( i - 16 ) ) >> 8; // B 62 sRGBY[ i ][ 1 ] = ( 298 * ( i - 16 ) ) >> 8; // G 63 sRGBY[ i ][ 2 ] = ( 298 * ( i - 16 ) ) >> 8; // R 64 sRGBY[ i ][ 3 ] = 0xff; // A 65 66 // Cb. 67 68 sRGBCb[ i ][ 0 ] = ( 516 * ( i - 128 ) + 128 ) >> 8; // B 69 sRGBCb[ i ][ 1 ] = - ( ( 100 * ( i - 128 ) + 128 ) >> 8 ); // G 70 71 // Cr. 72 73 sRGBCr[ i ][ 1 ] = - ( ( 208 * ( i - 128 ) + 128 ) >> 8 ); // B 74 sRGBCr[ i ][ 2 ] = ( 409 * ( i - 128 ) + 128 ) >> 8; // R 75 } 76 77 // Setup clamping table for generic transcoder. 78 79 for( S32 i = -384; i < 640; ++ i ) 80 sClamp[ i ] = mClamp( i, 0, 0xFF ); 81 82 sGenerated = true; 83 } 84} 85 86static inline S32 sampleG( U8* pCb, U8* pCr ) 87{ 88 return sRGBCr[ *pCr ][ 1 ] + sRGBCr[ *pCb ][ 1 ]; 89} 90 91//============================================================================= 92// OggTheoraDecoder. 93//============================================================================= 94 95//----------------------------------------------------------------------------- 96 97OggTheoraDecoder::OggTheoraDecoder( const ThreadSafeRef< OggInputStream>& stream ) 98 : Parent( stream ), 99 mTheoraSetup( NULL ), 100 mTheoraDecoder( NULL ), 101 mTranscoder( TRANSCODER_Auto ) 102#ifdef TORQUE_DEBUG 103 ,mLock( 0 ) 104#endif 105{ 106 // Initialize. 107 108 th_info_init( &mTheoraInfo ); 109 th_comment_init( &mTheoraComment ); 110 111 initLookupTables(); 112} 113 114//----------------------------------------------------------------------------- 115 116OggTheoraDecoder::~OggTheoraDecoder() 117{ 118 // Free packets on the freelist. 
119 120 OggTheoraFrame* packet; 121 while( mFreePackets.tryPopFront( packet ) ) 122 destructSingle( packet ); 123 124 // Clean up libtheora structures. 125 126 if( mTheoraDecoder ) 127 th_decode_free( mTheoraDecoder ); 128 if( mTheoraSetup ) 129 th_setup_free( mTheoraSetup ); 130 131 th_comment_clear( &mTheoraComment ); 132 th_info_clear( &mTheoraInfo ); 133} 134 135//----------------------------------------------------------------------------- 136 137bool OggTheoraDecoder::_detect( ogg_page* startPage ) 138{ 139 _setStartPage( startPage ); 140 141 // Read first header packet. 142 143 ogg_packet nextPacket; 144 if( !_readNextPacket( &nextPacket ) 145 || th_decode_headerin( &mTheoraInfo, &mTheoraComment, &mTheoraSetup, &nextPacket ) < 0 ) 146 { 147 th_comment_clear( &mTheoraComment ); 148 th_info_clear( &mTheoraInfo ); 149 150 return false; 151 } 152 153 return true; 154} 155 156//----------------------------------------------------------------------------- 157 158bool OggTheoraDecoder::_init() 159{ 160 ogg_packet nextPacket; 161 162 // Read header packets. 163 164 bool haveTheoraHeader = true; 165 while( 1 ) 166 { 167 if( !_readNextPacket( &nextPacket ) ) 168 { 169 haveTheoraHeader = false; 170 break; 171 } 172 173 S32 result = th_decode_headerin( &mTheoraInfo, &mTheoraComment, &mTheoraSetup, &nextPacket ); 174 if( result < 0 ) 175 { 176 haveTheoraHeader = false; 177 break; 178 } 179 else if( result == 0 ) 180 break; 181 } 182 183 // Fail if we have no valid and complete Theora header. 184 185 if( !haveTheoraHeader ) 186 { 187 th_comment_clear( &mTheoraComment ); 188 th_info_clear( &mTheoraInfo ); 189 190 Con::errorf( "OggTheoraDecoder::_init() - incorrect or corrupt Theora headers" ); 191 192 return false; 193 } 194 195 // Init the decoder. 196 197 mTheoraDecoder = th_decode_alloc( &mTheoraInfo, mTheoraSetup ); 198 199 // Feed the first video packet to the decoder. 
200 201 ogg_int64_t granulePos; 202 th_decode_packetin( mTheoraDecoder, &nextPacket, &granulePos ); 203 204 mCurrentFrameTime = th_granule_time( mTheoraDecoder, granulePos ); 205 mCurrentFrameNumber = 0; 206 mFrameDuration = 1.f / getFramesPerSecond(); 207 208 // Make sure we have a valid pitch. 209 210 if( !mPacketFormat.mPitch ) 211 mPacketFormat.mPitch = getFrameWidth() * GFXFormatInfo( mPacketFormat.mFormat ).getBytesPerPixel(); 212 213 return true; 214} 215 216//----------------------------------------------------------------------------- 217 218bool OggTheoraDecoder::_packetin( ogg_packet* packet ) 219{ 220 ogg_int64_t granulePos; 221 222 if( th_decode_packetin( mTheoraDecoder, packet, &granulePos ) != 0 ) 223 return false; 224 225 // See if we should drop this frame. 226 //RDTODO: if we have fallen too far behind, start skipping pages 227 228 F32 granuleTime = th_granule_time( mTheoraDecoder, granulePos ); 229 mCurrentFrameTime = granuleTime; 230 mCurrentFrameNumber ++; 231 232 bool dropThisFrame = false; 233 TimeSourceRef timeSource = mTimeSource; 234 if( timeSource ) 235 { 236 F32 currentTick = F32( timeSource->getPosition() ) / 1000.f; 237 238 if( currentTick >= ( mCurrentFrameTime + mFrameDuration ) ) 239 dropThisFrame = true; 240 } 241 242#ifdef DEBUG_SPEW 243 Platform::outputDebugString( "[OggTheoraDecoder] new frame %i at %f sec%s", 244 U32( th_granule_frame( mTheoraDecoder, granulePos ) ), 245 granuleTime, 246 dropThisFrame ? " !! DROPPED !!" : "" ); 247#endif 248 249 return !dropThisFrame; 250} 251 252//----------------------------------------------------------------------------- 253 254U32 OggTheoraDecoder::read( OggTheoraFrame** buffer, U32 num ) 255{ 256 #ifdef TORQUE_DEBUG 257 AssertFatal( dCompareAndSwap( mLock, 0, 1 ), "OggTheoraDecoder::read() - simultaneous reads not thread-safe" ); 258 #endif 259 260 U32 numRead = 0; 261 262 for( U32 i = 0; i < num; ++ i ) 263 { 264 // Read and decode a packet. 
265 266 if( !_nextPacket() ) 267 return numRead; // End of stream. 268 269 // Decode the frame to Y'CbCr. 270 271 th_ycbcr_buffer ycbcr; 272 th_decode_ycbcr_out( mTheoraDecoder, ycbcr ); 273 274 // Allocate a packet. 275 276 const U32 width = getFrameWidth(); 277 const U32 height = getFrameHeight(); 278 279 OggTheoraFrame* packet; 280 if( !mFreePackets.tryPopFront( packet ) ) 281 packet = constructSingle< OggTheoraFrame* >( mPacketFormat.mPitch * height ); 282 283 packet->mFrameNumber = mCurrentFrameNumber; 284 packet->mFrameTime = mCurrentFrameTime; 285 packet->mFrameDuration = mFrameDuration; 286 287 // Transcode the packet. 288 289 #if ( defined( TORQUE_COMPILER_GCC ) || defined( TORQUE_COMPILER_VISUALC ) ) && (defined( TORQUE_CPU_X86 ) ) 290 if( ( mTranscoder == TRANSCODER_Auto || mTranscoder == TRANSCODER_SSE2420RGBA ) && 291 getDecoderPixelFormat() == PIXEL_FORMAT_420 && 292 Platform::SystemInfo.processor.properties & CPU_PROP_SSE2 && 293 mPacketFormat.mFormat == GFXFormatR8G8B8A8 && 294 mTheoraInfo.pic_x == 0 && 295 mTheoraInfo.pic_y == 0 ) 296 { 297 _transcode420toRGBA_SSE2( ycbcr, ( U8* ) packet->data, width, height, mPacketFormat.mPitch ); 298 } 299 else 300 301 #endif 302 303 { 304 // Use generic transcoder. 
305 306 _transcode( ycbcr, ( U8* ) packet->data, width, height ); 307 } 308 309 buffer[ i ] = packet; 310 ++ numRead; 311 } 312 313 #ifdef TORQUE_DEBUG 314 AssertFatal( dCompareAndSwap( mLock, 1, 0 ), "" ); 315 #endif 316 317 return numRead; 318} 319 320//----------------------------------------------------------------------------- 321 322void OggTheoraDecoder::_transcode( th_ycbcr_buffer ycbcr, U8* buffer, const U32 width, const U32 height ) 323{ 324 #define ycbcrToRGB( rgb, pY, pCb, pCr, G ) \ 325 { \ 326 GFXPackPixel( \ 327 mPacketFormat.mFormat, \ 328 rgb, \ 329 sClamp[ sRGBY[ *pY ][ 2 ] + sRGBCr[ *pCr ][ 2 ] ], \ 330 sClamp[ sRGBY[ *pY ][ 1 ] + G ], \ 331 sClamp[ sRGBY[ *pY ][ 0 ] + sRGBCb[ *pCb ][ 0 ] ], \ 332 255 \ 333 ); \ 334 } 335 336 // Determine number of chroma samples per 4-pixel luma block. 337 338 U32 numChromaSamples = 4; 339 EPixelFormat pixelFormat = getDecoderPixelFormat(); 340 if( pixelFormat == PIXEL_FORMAT_422 ) 341 numChromaSamples = 2; 342 else if( pixelFormat == OggTheoraDecoder::PIXEL_FORMAT_420 ) 343 numChromaSamples = 1; 344 345 // Convert and copy the pixels. Deal with all three 346 // possible plane configurations. 347 348 const U32 pictOffsetY = _getPictureOffset( ycbcr, 0 ); 349 const U32 pictOffsetU = _getPictureOffset( ycbcr, 1 ); 350 const U32 pictOffsetV = _getPictureOffset( ycbcr, 2 ); 351 352 for( U32 y = 0; y < height; y += 2 ) 353 { 354 U8* dst0 = buffer + y * mPacketFormat.mPitch; 355 U8* dst1 = dst0 + mPacketFormat.mPitch; 356 357 U8* pY0 = _getPixelPtr( ycbcr, 0, pictOffsetY, 0, y ); 358 U8* pY1 = _getPixelPtr( ycbcr, 0, pictOffsetY, 0, y + 1 ); 359 U8* pU0 = _getPixelPtr( ycbcr, 1, pictOffsetU, 0, y ); 360 U8* pU1 = _getPixelPtr( ycbcr, 1, pictOffsetU, 0, y + 1 ); 361 U8* pV0 = _getPixelPtr( ycbcr, 2, pictOffsetV, 0, y ); 362 U8* pV1 = _getPixelPtr( ycbcr, 2, pictOffsetV, 0, y + 1 ); 363 364 for( U32 x = 0; x < width; x += 2 ) 365 { 366 // Pixel 0x0. 
367 368 S32 G = sampleG( pU0, pV0 ); 369 370 ycbcrToRGB( dst0, pY0, pU0, pV0, G ); 371 372 ++ pY0; 373 374 if( numChromaSamples == 4 ) 375 { 376 ++ pU0; 377 ++ pV0; 378 } 379 380 // Pixel 0x1. 381 382 if( numChromaSamples == 4 ) 383 G = sampleG( pU0, pV0 ); 384 385 ycbcrToRGB( dst0, pY0, pU0, pV0, G ); 386 387 ++ pY0; 388 ++ pU0; 389 ++ pV0; 390 391 // Pixel 1x0. 392 393 if( numChromaSamples != 1 ) 394 G = sampleG( pU1, pV1 ); 395 396 ycbcrToRGB( dst1, pY1, pU1, pV1, G ); 397 398 ++ pY1; 399 400 if( numChromaSamples == 4 ) 401 { 402 ++ pU1; 403 ++ pV1; 404 } 405 406 // Pixel 1x1. 407 408 if( numChromaSamples == 4 ) 409 G = sampleG( pU1, pV1 ); 410 411 ycbcrToRGB( dst1, pY1, pU1, pV1, G ); 412 413 ++ pY1; 414 ++ pU1; 415 ++ pV1; 416 } 417 } 418 419 #undef ycbcrToRGB 420} 421 422//----------------------------------------------------------------------------- 423#if defined( TORQUE_CPU_X86 ) 424void OggTheoraDecoder::_transcode420toRGBA_SSE2( th_ycbcr_buffer ycbcr, U8* buffer, U32 width, U32 height, U32 pitch ) 425{ 426 AssertFatal( width % 2 == 0, "OggTheoraDecoder::_transcode420toRGBA_SSE2() - width must be multiple of 2" ); 427 AssertFatal( height % 2 == 0, "OggTheoraDecoder::_transcode420toRGBA_SSE2() - height must be multiple of 2" ); 428 429 unsigned char* ydata = ycbcr[ 0 ].data; 430 unsigned char* udata = ycbcr[ 1 ].data; 431 unsigned char* vdata = ycbcr[ 2 ].data; 432 433 S32* ycoeff = ( S32* ) sRGBY; 434 S32* ucoeff = ( S32* ) sRGBCb; 435 S32* vcoeff = ( S32* ) sRGBCr; 436 437 // At the end of a line loop, we need to jump over the padding resulting from the difference 438 // between pitch and width plus jump a whole scanline as we always operate two scanlines 439 // at a time. 440 const U32 stride = pitch - width * 4 + pitch; 441 442 // Same thing for the Y channel. 
443 const U32 ystrideDelta = ycbcr[ 0 ].stride - width + ycbcr[ 0 ].stride; 444 const U32 ypitch = ycbcr[ 0 ].stride; 445 446 // U and V only jump a single scanline so we only need to advance by the padding on the 447 // right. Both planes are half-size. 448 const U32 ustrideDelta = ycbcr[ 1 ].stride - width / 2; 449 const U32 vstrideDelta = ycbcr[ 2 ].stride - width / 2; 450 451 #if defined( TORQUE_COMPILER_VISUALC ) && defined( TORQUE_CPU_X86 ) 452 453 __asm 454 { 455 mov ecx,height 456 457 hloop: 458 459 push ecx 460 mov ecx,width 461 462 wloop: 463 464 push ecx 465 xor eax,eax 466 467 // Load and accumulate coefficients for U and V in XMM0. 468 469 mov esi,udata 470 mov ebx,ucoeff 471 mov edx,ydata 472 mov al,[esi] 473 xor ecx,ecx 474 mov edi,vdata 475 shl eax,4 476 movdqa xmm0,[ebx+eax] 477 478 mov ebx,vcoeff 479 mov cl,[edi] 480 mov esi,ycoeff 481 shl ecx,4 482 paddd xmm0,[ebx+ecx] 483 xor eax,eax 484 xor ebx,ebx 485 486 // Load coefficients for Y of the four pixels into XMM1-XMM4. 487 488 mov ecx,ypitch 489 mov al,[edx] 490 mov bl,[edx+1] 491 shl eax,4 492 shl ebx,4 493 movdqa xmm1,[esi+eax] 494 movdqa xmm2,[esi+ebx] 495 xor eax,eax 496 xor ebx,ebx 497 498 mov al,[edx+ecx] 499 mov bl,[edx+ecx+1] 500 shl eax,4 501 shl ebx,4 502 movdqa xmm3,[esi+eax] 503 movdqa xmm4,[esi+ebx] 504 505 mov edi,buffer 506 mov ecx,pitch 507 508 // Add Cb and Cr on top of Y. 509 510 paddd xmm1,xmm0 511 paddd xmm2,xmm0 512 paddd xmm3,xmm0 513 paddd xmm4,xmm0 514 515 // Pack pixels together. We need to pack twice per pixel 516 // to go from 32bits via 16bits to 8bits. 517 // 518 // Right now we're simply packing two garbage pixels for the 519 // second packing operation. An alternative would be to pack the 520 // four pixels into one XMM register and then do a packed shuffle 521 // to split out the lower two pixels before the move. 522 523 packssdw xmm1,xmm2 524 packssdw xmm3,xmm4 525 packuswb xmm1,xmm6 526 packuswb xmm3,xmm7 527 528 // Store pixels. 
529 530 movq qword ptr [edi],xmm1 531 movq qword ptr [edi+ecx],xmm3 532 533 // Loop width. 534 535 pop ecx 536 537 add ydata,2 538 inc udata 539 inc vdata 540 add buffer,8 541 542 sub ecx,2 543 jnz wloop 544 545 // Loop height. 546 547 pop ecx 548 549 mov ebx,stride 550 mov eax,ystrideDelta 551 mov edi,ustrideDelta 552 mov esi,vstrideDelta 553 554 add buffer,ebx 555 add ydata,eax 556 add udata,edi 557 add vdata,esi 558 559 sub ecx,2 560 jnz hloop 561 }; 562 563 #elif defined( TORQUE_COMPILER_GCC ) && defined( TORQUE_CPU_X86 ) 564 565 asm( "pushal\n" // Save all general-purpose registers. 566 567 "movl %0,%%ecx\n" // Load height into ECX. 568 569 ".hloop_sse:\n" 570 571 "pushl %%ecx\n" // Save counter. 572 "movl %1,%%ecx\n" // Load width into ECX. 573 574 ".wloop_sse:\n" 575 576 "pushl %%ecx\n" // Save counter. 577 "xorl %%eax,%%eax\n" // Zero out eax for later use. 578 579 // Load and accumulate coefficients for U and V in XMM0. 580 581 "movl %3,%%esi\n" // Load U pointer into ESI. 582 "movl %8,%%ebx\n" // Load U coefficient table into EBX. 583 "movl %2,%%edx\n" // Load Y pointer into EDX. 584 "movb (%%esi),%%al\n" // Load U into AL. 585 "xorl %%ecx,%%ecx\n" // Clear ECX. 586 "movl %4,%%edi\n" // Load V pointer into EDI. 587 "shll $4,%%eax\n" // Multiply EAX by 16 to index into table. 588 "movdqa (%%ebx,%%eax),%%xmm0\n" // Load Cb coefficient into XMM0. 589 590 "movl %9,%%ebx\n" // Load V coefficients table into EBX. 591 "movb (%%edi),%%cl\n" // Load V into CL. 592 "movl %7,%%esi\n" // Load Y coefficients table into ESI. 593 "shll $4,%%ecx\n" // Multiply ECX by 16 to index into table. 594 "paddd (%%ebx,%%ecx),%%xmm0\n" // Add Cr coefficient to Cb coefficient. 595 "xorl %%eax,%%eax\n" // Clear EAX. 596 "xorl %%ebx,%%ebx\n" // Clear EBX. 597 598 // Load coefficients for Y of the four pixels into XMM1-XMM4. 599 600 "movl %14,%%ecx\n" // Load Y pitch into ECX (needed later for lower two pixels). 601 "movb (%%edx),%%al\n" // Load upper-left pixel Y into AL. 
602 "movb 1(%%edx),%%bl\n" // Load upper-right pixel Y into BL. 603 "shll $4,%%eax\n" // Multiply EAX by 16 to index into table. 604 "shll $4,%%ebx\n" // Multiply EBX by 16 to index into table. 605 "movdqa (%%esi,%%eax),%%xmm1\n" // Load coefficient for upper-left pixel Y into XMM1. 606 "movdqa (%%esi,%%ebx),%%xmm2\n" // Load coefficient for upper-right pixel Y into XMM2. 607 "xorl %%eax,%%eax\n" // Clear EAX. 608 "xorl %%ebx,%%ebx\n" // Clear EBX. 609 610 "movb (%%edx,%%ecx),%%al\n" // Load lower-left pixel Y into AL. 611 "movb 1(%%edx,%%ecx),%%bl\n" // Load lower-right pixel Y into AL. 612 "shll $4,%%eax\n" // Multiply EAX by 16 to index into table. 613 "shll $4,%%ebx\n" // Multiply EBX by 16 to index into table. 614 "movdqa (%%esi,%%eax),%%xmm3\n" // Load coefficient for lower-left pixel Y into XMM3. 615 "movdqa (%%esi,%%ebx),%%xmm4\n" // Load coefficient for lower-right pixel Y into XMM4. 616 617 "movl %5,%%edi\n" // Load buffer pointer into EDI (for later use). 618 "movl %6,%%ecx\n" // Load pitch into ECX (for later use). 619 620 // Add Cb and Cr on top of Y. 621 622 "paddd %%xmm0,%%xmm1\n" // Add chroma channels to upper-left pixel. 623 "paddd %%xmm0,%%xmm2\n" // Add chroma channels to upper-right pixel. 624 "paddd %%xmm0,%%xmm3\n" // Add chroma channels to lower-left pixel. 625 "paddd %%xmm0,%%xmm4\n" // Add chroma channels to lower-right pixel. 626 627 // Pack pixels together. We need to pack twice per pixel 628 // to go from 32bits via 16bits to 8bits. 629 // 630 // Right now we're simply packing two garbage pixels for the 631 // second packing operation. An alternative would be to pack the 632 // four pixels into one XMM register and then do a packed shuffle 633 // to split out the lower two pixels before the move. 634 635 "packssdw %%xmm2,%%xmm1\n" // Pack 32bit channels together into 16bit channels on upper two pixels. 636 "packssdw %%xmm4,%%xmm3\n" // Pack 32bit channels together into 16bit channels on lower two pixels. 
637 "packuswb %%xmm6,%%xmm1\n" // Pack 16bit channels together into 8bit channels on upper two pixels (plus two garbage pixels). 638 "packuswb %%xmm7,%%xmm3\n" // Pack 16bit channels together into 8bit channels on lower two pixels (plus two garbage pixels). 639 640 // Store pixels. 641 642 "movq %%xmm1,(%%edi)\n" // Store upper two pixels. 643 "movq %%xmm3,(%%edi,%%ecx)\n" // Store lower two pixels. 644 645 // Loop width. 646 647 "popl %%ecx\n" // Restore width counter. 648 649 "addl $2,%2\n" // Bump Y pointer by two pixels (1 bpp). 650 "incl %3\n" // Bump U pointer by one pixel (1 bpp). 651 "incl %4\n" // Bump V pointer by one pixel (1 bpp). 652 "addl $8,%5\n" // Bump buffer pointer by two pixels (4 bpp). 653 654 "subl $2,%%ecx\n" 655 "jnz .wloop_sse\n" 656 657 // Loop height. 658 659 "popl %%ecx\n" // Restore height counter. 660 661 "movl %10,%%ebx\n" // Load buffer stride into EBX. 662 "movl %11,%%eax\n" // Load Y stride delta into EAX. 663 "movl %12,%%edi\n" // Load U stride delta into EDI. 664 "movl %13,%%esi\n" // Load V stride delta into ESI. 665 666 "addl %%ebx,%5\n" // Bump buffer pointer by stride delta. 667 "addl %%eax,%2\n" // Bump Y pointer by stride delta. 668 "addl %%edi,%3\n" // Bump U pointer by stride delta. 669 "addl %%esi,%4\n" // Bump V pointer by stride delta. 670 671 "subl $2,%%ecx\n" 672 "jnz .hloop_sse\n" 673 674 "popal\n" 675 : 676 : "m" ( height ), // 0 677 "m" ( width ), // 1 678 "m" ( ydata ), // 2 679 "m" ( udata ), // 3 680 "m" ( vdata ), // 4 681 "m" ( buffer ), // 5 682 "m" ( pitch ), // 6 683 "m" ( ycoeff ), // 7 684 "m" ( ucoeff ), // 8 685 "m" ( vcoeff ), // 9 686 "m" ( stride ), // 10 687 "m" ( ystrideDelta ), // 11 688 "m" ( ustrideDelta ), // 12 689 "m" ( vstrideDelta ), // 13 690 "m" ( ypitch ) // 14 691 ); 692 693 #endif 694} 695#endif 696