oggTheoraDecoder.cpp

Engine/source/core/ogg/oggTheoraDecoder.cpp

Public Defines

define

ycbcrToRGB(rgb, pY, pCb, pCr, G)    {                                                                 \
      GFXPackPixel(                                                  \
         mPacketFormat.mFormat,                                      \
         rgb,                                                        \
         sClamp[ sRGBY[ *pY ][ 2 ] + sRGBCr[ *pCr ][ 2 ] ],          \
         sClamp[ sRGBY[ *pY ][ 1 ] + G ],                            \
         sClamp[ sRGBY[ *pY ][ 0 ] + sRGBCb[ *pCb ][ 0 ] ],          \
         255                                                         \
      );                                                             \
   }

Public Variables

U8 *

sClamp

U8

sClampBuff [1024]

Public Functions

dALIGN(static S32 sRGBY)

void

initLookupTables()

S32

sampleG(U8 * pCb, U8 * pCr)

Detailed Description

Public Defines

ycbcrToRGB(rgb, pY, pCb, pCr, G)    {                                                                 \
      GFXPackPixel(                                                  \
         mPacketFormat.mFormat,                                      \
         rgb,                                                        \
         sClamp[ sRGBY[ *pY ][ 2 ] + sRGBCr[ *pCr ][ 2 ] ],          \
         sClamp[ sRGBY[ *pY ][ 1 ] + G ],                            \
         sClamp[ sRGBY[ *pY ][ 0 ] + sRGBCb[ *pCb ][ 0 ] ],          \
         255                                                         \
      );                                                             \
   }

Public Variables

U8 * sClamp

U8 sClampBuff [1024]

Public Functions

dALIGN(static S32 sRGBY)

initLookupTables()

sampleG(U8 * pCb, U8 * pCr)

  1
  2//-----------------------------------------------------------------------------
  3// Copyright (c) 2012 GarageGames, LLC
  4//
  5// Permission is hereby granted, free of charge, to any person obtaining a copy
  6// of this software and associated documentation files (the "Software"), to
  7// deal in the Software without restriction, including without limitation the
  8// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  9// sell copies of the Software, and to permit persons to whom the Software is
 10// furnished to do so, subject to the following conditions:
 11//
 12// The above copyright notice and this permission notice shall be included in
 13// all copies or substantial portions of the Software.
 14//
 15// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 21// IN THE SOFTWARE.
 22//-----------------------------------------------------------------------------
 23
 24#include "platform/platform.h"
 25#include "core/ogg/oggTheoraDecoder.h"
 26
 27#include "gfx/gfxFormatUtils.h"
 28#include "math/mMathFn.h"
 29#include "console/console.h"
 30
 31
 32//#define DEBUG_SPEW
 33
 34
 35//-----------------------------------------------------------------------------
 36
 37// Lookup tables for the transcoders.
 38//
 39// The Y, Cb, and Cr tables are used by both the SSE2 and the generic transcoder.
 40// For the SSE2 code, the data must be 16 byte aligned.
 41//
 42// The clamping table is only used by the generic transcoder.  The SSE2 transcoder
 43// uses instructions to implicitly clamp out-of-range values.
 44
 45dALIGN( static S32 sRGBY[ 256 ][ 4 ] );
 46dALIGN( static S32 sRGBCb[ 256 ][ 4 ] );
 47dALIGN( static S32 sRGBCr[ 256 ][ 4 ] );
 48
 49static U8  sClampBuff[ 1024 ];
 50static U8* sClamp = sClampBuff + 384;
 51
 52static void initLookupTables()
 53{
 54   static bool sGenerated = false;
 55   if( !sGenerated )
 56   {
 57      for( S32 i = 0; i < 256; ++ i )
 58      {
 59         // Y.
 60         
 61         sRGBY[ i ][ 0 ]   = ( 298 * ( i - 16 ) ) >> 8; // B
 62         sRGBY[ i ][ 1 ]   = ( 298 * ( i - 16 ) ) >> 8; // G
 63         sRGBY[ i ][ 2 ]   = ( 298 * ( i - 16 ) ) >> 8; // R
 64         sRGBY[ i ][ 3 ]   = 0xff;                      // A
 65         
 66         // Cb.
 67         
 68         sRGBCb[ i ][ 0 ]  = ( 516 * ( i - 128 ) + 128 ) >> 8;       // B
 69         sRGBCb[ i ][ 1 ]  = - ( ( 100 * ( i - 128 ) + 128 ) >> 8 ); // G 
 70         
 71         // Cr.
 72         
 73         sRGBCr[ i ][ 1 ]  = - ( ( 208 * ( i - 128 ) + 128 ) >> 8 ); // B
 74         sRGBCr[ i ][ 2 ]  = ( 409 * ( i - 128 ) + 128 ) >> 8;       // R
 75      }
 76
 77      // Setup clamping table for generic transcoder.
 78      
 79      for( S32 i = -384; i < 640; ++ i )
 80         sClamp[ i ] = mClamp( i, 0, 0xFF );
 81      
 82      sGenerated = true;
 83   }
 84}
 85
 86static inline S32 sampleG( U8* pCb, U8* pCr )
 87{
 88   return sRGBCr[ *pCr ][ 1 ] + sRGBCr[ *pCb ][ 1 ];
 89}
 90
 91//=============================================================================
 92//    OggTheoraDecoder.
 93//=============================================================================
 94
 95//-----------------------------------------------------------------------------
 96
 97OggTheoraDecoder::OggTheoraDecoder( const ThreadSafeRef< OggInputStream>& stream )
 98   : Parent( stream ),
 99     mTheoraSetup( NULL ),
100     mTheoraDecoder( NULL ),
101     mTranscoder( TRANSCODER_Auto )
102#ifdef TORQUE_DEBUG
103     ,mLock( 0 )
104#endif
105{
106   // Initialize.
107      
108   th_info_init( &mTheoraInfo );
109   th_comment_init( &mTheoraComment );
110      
111   initLookupTables();
112}
113
114//-----------------------------------------------------------------------------
115
116OggTheoraDecoder::~OggTheoraDecoder()
117{
118   // Free packets on the freelist.
119   
120   OggTheoraFrame* packet;
121   while( mFreePackets.tryPopFront( packet ) )
122      destructSingle( packet );
123      
124   // Clean up libtheora structures.
125      
126   if( mTheoraDecoder )
127      th_decode_free( mTheoraDecoder );
128   if( mTheoraSetup )
129      th_setup_free( mTheoraSetup );
130      
131   th_comment_clear( &mTheoraComment );
132   th_info_clear( &mTheoraInfo );
133}
134
135//-----------------------------------------------------------------------------
136
137bool OggTheoraDecoder::_detect( ogg_page* startPage )
138{
139   _setStartPage( startPage );
140
141   // Read first header packet.
142
143   ogg_packet nextPacket;
144   if( !_readNextPacket( &nextPacket )
145       || th_decode_headerin( &mTheoraInfo, &mTheoraComment, &mTheoraSetup, &nextPacket ) < 0 )
146   {
147      th_comment_clear( &mTheoraComment );
148      th_info_clear( &mTheoraInfo );
149      
150      return false;
151   }
152   
153   return true;
154}
155
156//-----------------------------------------------------------------------------
157
158bool OggTheoraDecoder::_init()
159{
160   ogg_packet nextPacket;
161   
162   // Read header packets.
163   
164   bool haveTheoraHeader = true;
165   while( 1 )
166   {
167      if( !_readNextPacket( &nextPacket ) )
168      {
169         haveTheoraHeader = false;
170         break;
171      }
172      
173      S32 result = th_decode_headerin( &mTheoraInfo, &mTheoraComment, &mTheoraSetup, &nextPacket );
174      if( result < 0 )
175      {
176         haveTheoraHeader = false;
177         break;
178      }
179      else if( result == 0 )
180         break;
181   }
182   
183   // Fail if we have no valid and complete Theora header.
184      
185   if( !haveTheoraHeader )
186   {
187      th_comment_clear( &mTheoraComment );
188      th_info_clear( &mTheoraInfo );
189      
190      Con::errorf( "OggTheoraDecoder::_init() - incorrect or corrupt Theora headers" );
191      
192      return false;
193   }
194
195   // Init the decoder.
196   
197   mTheoraDecoder = th_decode_alloc( &mTheoraInfo, mTheoraSetup );
198   
199   // Feed the first video packet to the decoder.
200   
201   ogg_int64_t granulePos;
202   th_decode_packetin( mTheoraDecoder, &nextPacket, &granulePos );
203   
204   mCurrentFrameTime = th_granule_time( mTheoraDecoder, granulePos );
205   mCurrentFrameNumber = 0;
206   mFrameDuration = 1.f / getFramesPerSecond();
207   
208   // Make sure we have a valid pitch.
209   
210   if( !mPacketFormat.mPitch )
211      mPacketFormat.mPitch = getFrameWidth() * GFXFormatInfo( mPacketFormat.mFormat ).getBytesPerPixel();
212      
213   return true;
214}
215
216//-----------------------------------------------------------------------------
217
218bool OggTheoraDecoder::_packetin( ogg_packet* packet )
219{
220   ogg_int64_t granulePos;
221
222   if( th_decode_packetin( mTheoraDecoder, packet, &granulePos ) != 0 )
223      return false;
224
225   // See if we should drop this frame.
226   //RDTODO: if we have fallen too far behind, start skipping pages
227
228   F32 granuleTime = th_granule_time( mTheoraDecoder, granulePos );
229   mCurrentFrameTime = granuleTime;
230   mCurrentFrameNumber ++;
231
232   bool dropThisFrame = false;
233   TimeSourceRef timeSource = mTimeSource;
234   if( timeSource )
235   {
236      F32 currentTick = F32( timeSource->getPosition() ) / 1000.f;
237
238      if( currentTick >= ( mCurrentFrameTime + mFrameDuration ) )
239         dropThisFrame = true;
240   }
241
242#ifdef DEBUG_SPEW
243   Platform::outputDebugString( "[OggTheoraDecoder] new frame %i at %f sec%s",
244      U32( th_granule_frame( mTheoraDecoder, granulePos ) ),
245      granuleTime,
246      dropThisFrame ? " !! DROPPED !!" : "" );
247#endif
248
249   return !dropThisFrame;
250}
251
252//-----------------------------------------------------------------------------
253
254U32 OggTheoraDecoder::read( OggTheoraFrame** buffer, U32 num )
255{
256   #ifdef TORQUE_DEBUG
257   AssertFatal( dCompareAndSwap( mLock, 0, 1 ), "OggTheoraDecoder::read() - simultaneous reads not thread-safe" );
258   #endif
259   
260   U32 numRead = 0;
261   
262   for( U32 i = 0; i < num; ++ i )
263   {
264      // Read and decode a packet.
265      
266      if( !_nextPacket() )
267         return numRead; // End of stream.
268      
269      // Decode the frame to Y'CbCr.
270      
271      th_ycbcr_buffer ycbcr;
272      th_decode_ycbcr_out( mTheoraDecoder, ycbcr );
273      
274      // Allocate a packet.
275      
276      const U32 width = getFrameWidth();
277      const U32 height = getFrameHeight();
278      
279      OggTheoraFrame* packet;
280      if( !mFreePackets.tryPopFront( packet ) )
281         packet = constructSingle< OggTheoraFrame* >( mPacketFormat.mPitch * height );
282         
283      packet->mFrameNumber = mCurrentFrameNumber;
284      packet->mFrameTime = mCurrentFrameTime;
285      packet->mFrameDuration = mFrameDuration;
286      
287      // Transcode the packet.
288      
289      #if ( defined( TORQUE_COMPILER_GCC ) || defined( TORQUE_COMPILER_VISUALC ) ) && (defined( TORQUE_CPU_X86 ) )      
290      if(      ( mTranscoder == TRANSCODER_Auto || mTranscoder == TRANSCODER_SSE2420RGBA ) &&
291               getDecoderPixelFormat() == PIXEL_FORMAT_420 &&
292               Platform::SystemInfo.processor.properties & CPU_PROP_SSE2 &&
293               mPacketFormat.mFormat == GFXFormatR8G8B8A8 &&
294               mTheoraInfo.pic_x == 0 &&
295               mTheoraInfo.pic_y == 0 )
296      {
297         _transcode420toRGBA_SSE2( ycbcr, ( U8* ) packet->data, width, height, mPacketFormat.mPitch );
298      }
299      else
300      
301      #endif
302      
303      {
304         // Use generic transcoder.
305         
306         _transcode( ycbcr, ( U8* ) packet->data, width, height );
307      }
308                  
309      buffer[ i ] = packet;
310      ++ numRead;
311   }
312   
313   #ifdef TORQUE_DEBUG
314   AssertFatal( dCompareAndSwap( mLock, 1, 0 ), "" );
315   #endif
316   
317   return numRead;
318}
319
320//-----------------------------------------------------------------------------
321
322void OggTheoraDecoder::_transcode( th_ycbcr_buffer ycbcr, U8* buffer, const U32 width, const U32 height )
323{
324   #define ycbcrToRGB( rgb, pY, pCb, pCr, G )                        \
325   {                                                                 \
326      GFXPackPixel(                                                  \
327         mPacketFormat.mFormat,                                      \
328         rgb,                                                        \
329         sClamp[ sRGBY[ *pY ][ 2 ] + sRGBCr[ *pCr ][ 2 ] ],          \
330         sClamp[ sRGBY[ *pY ][ 1 ] + G ],                            \
331         sClamp[ sRGBY[ *pY ][ 0 ] + sRGBCb[ *pCb ][ 0 ] ],          \
332         255                                                         \
333      );                                                             \
334   }
335
336   // Determine number of chroma samples per 4-pixel luma block.
337   
338   U32 numChromaSamples = 4;
339   EPixelFormat pixelFormat = getDecoderPixelFormat();
340   if( pixelFormat == PIXEL_FORMAT_422 )
341      numChromaSamples = 2;
342   else if( pixelFormat == OggTheoraDecoder::PIXEL_FORMAT_420 )
343      numChromaSamples = 1;
344
345   // Convert and copy the pixels.  Deal with all three
346   // possible plane configurations.
347               
348   const U32 pictOffsetY = _getPictureOffset( ycbcr, 0 );
349   const U32 pictOffsetU = _getPictureOffset( ycbcr, 1 );
350   const U32 pictOffsetV = _getPictureOffset( ycbcr, 2 );
351
352   for( U32 y = 0; y < height; y += 2 )
353   {
354      U8* dst0 = buffer + y * mPacketFormat.mPitch;
355      U8* dst1 = dst0 + mPacketFormat.mPitch;
356   
357      U8* pY0 = _getPixelPtr( ycbcr, 0, pictOffsetY, 0, y );
358      U8* pY1 = _getPixelPtr( ycbcr, 0, pictOffsetY, 0, y + 1 );
359      U8* pU0 = _getPixelPtr( ycbcr, 1, pictOffsetU, 0, y );
360      U8* pU1 = _getPixelPtr( ycbcr, 1, pictOffsetU, 0, y + 1 );
361      U8* pV0 = _getPixelPtr( ycbcr, 2, pictOffsetV, 0, y );
362      U8* pV1 = _getPixelPtr( ycbcr, 2, pictOffsetV, 0, y + 1 );
363      
364      for( U32 x = 0; x < width; x += 2 )
365      {
366         // Pixel 0x0.
367         
368         S32 G = sampleG( pU0, pV0 );
369         
370         ycbcrToRGB( dst0, pY0, pU0, pV0, G );
371         
372         ++ pY0;
373         
374         if( numChromaSamples == 4 )
375         {
376            ++ pU0;
377            ++ pV0;
378         }
379         
380         // Pixel 0x1.
381         
382         if( numChromaSamples == 4 )
383            G = sampleG( pU0, pV0 );
384            
385         ycbcrToRGB( dst0, pY0, pU0, pV0, G );
386         
387         ++ pY0;
388         ++ pU0;
389         ++ pV0;
390         
391         // Pixel 1x0.
392         
393         if( numChromaSamples != 1 )
394            G = sampleG( pU1, pV1 );
395         
396         ycbcrToRGB( dst1, pY1, pU1, pV1, G );
397         
398         ++ pY1;
399         
400         if( numChromaSamples == 4 )
401         {
402            ++ pU1;
403            ++ pV1;
404         }
405         
406         // Pixel 1x1.
407         
408         if( numChromaSamples == 4 )
409            G = sampleG( pU1, pV1 );
410            
411         ycbcrToRGB( dst1, pY1, pU1, pV1, G );
412         
413         ++ pY1;
414         ++ pU1;
415         ++ pV1;
416      }
417   }
418   
419   #undef ycbcrToRGB
420}
421
422//-----------------------------------------------------------------------------
#if defined( TORQUE_CPU_X86 )
/// SSE2 transcoder from 4:2:0 Y'CbCr to four-byte-per-pixel output.
///
/// Processes the picture in blocks of 2x2 pixels.  For each block one U and
/// one V sample are looked up in the chroma tables and summed, the sum is
/// added to the luma table rows of the four pixels, and the results are
/// packed down to 8 bits per channel with saturation.  Two pixels (8 bytes)
/// are written to the upper and two to the lower output row per block.
///
/// The planes are read starting at their data pointers; no picture offset
/// is applied here.
///
/// @param ycbcr   Decoded planes as produced by the Theora decoder.
/// @param buffer  Destination pixel buffer.
/// @param width   Picture width in pixels; must be a non-zero multiple of 2.
/// @param height  Picture height in pixels; must be a non-zero multiple of 2.
/// @param pitch   Distance in bytes between two output rows.
void OggTheoraDecoder::_transcode420toRGBA_SSE2( th_ycbcr_buffer ycbcr, U8* buffer, U32 width, U32 height, U32 pitch )
{
   AssertFatal( width % 2 == 0, "OggTheoraDecoder::_transcode420toRGBA_SSE2() - width must be multiple of 2" );
   AssertFatal( height % 2 == 0, "OggTheoraDecoder::_transcode420toRGBA_SSE2() - height must be multiple of 2" );
   
   // The loop counters below are decremented by two and the loops only
   // terminate when a counter hits exactly zero.  A zero or odd dimension
   // would therefore never terminate the loops within the buffers when the
   // asserts above are compiled out, so bail out instead.
   if( width == 0 || height == 0 || ( ( width | height ) & 1 ) != 0 )
      return;
      
   unsigned char* ydata = ycbcr[ 0 ].data;
   unsigned char* udata = ycbcr[ 1 ].data;
   unsigned char* vdata = ycbcr[ 2 ].data;
   
   S32* ycoeff = ( S32* ) sRGBY;
   S32* ucoeff = ( S32* ) sRGBCb;
   S32* vcoeff = ( S32* ) sRGBCr;
      
   // At the end of a line loop, we need to jump over the padding resulting from the difference
   // between pitch and width plus jump a whole scanline as we always operate two scanlines
   // at a time.
   const U32 stride = pitch - width * 4 + pitch;
   
   // Same thing for the Y channel.
   const U32 ystrideDelta = ycbcr[ 0 ].stride - width + ycbcr[ 0 ].stride;
   const U32 ypitch = ycbcr[ 0 ].stride;
   
   // U and V only jump a single scanline so we only need to advance by the padding on the
   // right.  Both planes are half-size.
   const U32 ustrideDelta = ycbcr[ 1 ].stride - width / 2;
   const U32 vstrideDelta = ycbcr[ 2 ].stride - width / 2;
         
   #if defined( TORQUE_COMPILER_VISUALC ) && defined( TORQUE_CPU_X86 )

   __asm
   {
         mov ecx,height

      hloop:

         push ecx
         mov ecx,width

      wloop:

         push ecx
         xor eax,eax
         
         // Load and accumulate coefficients for U and V in XMM0.
         
         mov esi,udata
         mov ebx,ucoeff
         mov edx,ydata
         mov al,[esi]
         xor ecx,ecx
         mov edi,vdata
         shl eax,4
         movdqa xmm0,[ebx+eax]

         mov ebx,vcoeff
         mov cl,[edi]
         mov esi,ycoeff
         shl ecx,4
         paddd xmm0,[ebx+ecx]
         xor eax,eax
         xor ebx,ebx
         
         // Load coefficients for Y of the four pixels into XMM1-XMM4.
         
         mov ecx,ypitch
         mov al,[edx]
         mov bl,[edx+1]
         shl eax,4
         shl ebx,4
         movdqa xmm1,[esi+eax]
         movdqa xmm2,[esi+ebx]
         xor eax,eax
         xor ebx,ebx
         
         mov al,[edx+ecx]
         mov bl,[edx+ecx+1]
         shl eax,4
         shl ebx,4
         movdqa xmm3,[esi+eax]
         movdqa xmm4,[esi+ebx]

         mov edi,buffer
         mov ecx,pitch
         
         // Add Cb and Cr on top of Y.
         
         paddd xmm1,xmm0
         paddd xmm2,xmm0
         paddd xmm3,xmm0
         paddd xmm4,xmm0
                  
         // Pack pixels together.  We need to pack twice per pixel
         // to go from 32bits via 16bits to 8bits.
         //
         // Right now we're simply packing two garbage pixels for the
         // second packing operation.  An alternative would be to pack the
         // four pixels into one XMM register and then do a packed shuffle
         // to split out the lower two pixels before the move.
         
         packssdw xmm1,xmm2
         packssdw xmm3,xmm4
         packuswb xmm1,xmm6
         packuswb xmm3,xmm7
         
         // Store pixels.
   
         movq qword ptr [edi],xmm1
         movq qword ptr [edi+ecx],xmm3
         
         // Loop width.
         
         pop ecx

         add ydata,2
         inc udata
         inc vdata
         add buffer,8

         sub ecx,2
         jnz wloop
         
         // Loop height.
     
         pop ecx

         mov ebx,stride
         mov eax,ystrideDelta
         mov edi,ustrideDelta
         mov esi,vstrideDelta

         add buffer,ebx
         add ydata,eax
         add udata,edi
         add vdata,esi
  
         sub ecx,2
         jnz hloop
   };
   
   #elif defined( TORQUE_COMPILER_GCC ) && defined( TORQUE_CPU_X86 ) 

   // NOTE(review): this statement modifies operands 2-5 even though they are
   // declared as inputs, and it declares no clobbers.  Confirm that this is
   // safe with the compilers and optimization settings in use.

   asm(  "pushal\n"                                // Save all general-purpose registers.
         
         "movl %0,%%ecx\n"                         // Load height into ECX.
         
      ".hloop_sse:\n"
      
         "pushl %%ecx\n"                           // Save counter.
         "movl %1,%%ecx\n"                         // Load width into ECX.
         
      ".wloop_sse:\n"
      
         "pushl %%ecx\n"                           // Save counter.
         "xorl %%eax,%%eax\n"                      // Zero out eax for later use.
         
         // Load and accumulate coefficients for U and V in XMM0.
         
         "movl %3,%%esi\n"                         // Load U pointer into ESI.
         "movl %8,%%ebx\n"                         // Load U coefficient table into EBX.
         "movl %2,%%edx\n"                         // Load Y pointer into EDX.
         "movb (%%esi),%%al\n"                     // Load U into AL.
         "xorl %%ecx,%%ecx\n"                      // Clear ECX.
         "movl %4,%%edi\n"                         // Load V pointer into EDI.
         "shll $4,%%eax\n"                         // Multiply EAX by 16 to index into table.
         "movdqa (%%ebx,%%eax),%%xmm0\n"           // Load Cb coefficient into XMM0.
         
         "movl %9,%%ebx\n"                         // Load V coefficients table into EBX.
         "movb (%%edi),%%cl\n"                     // Load V into CL.
         "movl %7,%%esi\n"                         // Load Y coefficients table into ESI.
         "shll $4,%%ecx\n"                         // Multiply ECX by 16 to index into table.
         "paddd (%%ebx,%%ecx),%%xmm0\n"            // Add Cr coefficient to Cb coefficient.
         "xorl %%eax,%%eax\n"                      // Clear EAX.
         "xorl %%ebx,%%ebx\n"                      // Clear EBX.
         
         // Load coefficients for Y of the four pixels into XMM1-XMM4.
         
         "movl %14,%%ecx\n"                        // Load Y pitch into ECX (needed later for lower two pixels).
         "movb (%%edx),%%al\n"                     // Load upper-left pixel Y into AL.
         "movb 1(%%edx),%%bl\n"                    // Load upper-right pixel Y into BL.
         "shll $4,%%eax\n"                         // Multiply EAX by 16 to index into table.
         "shll $4,%%ebx\n"                         // Multiply EBX by 16 to index into table.
         "movdqa (%%esi,%%eax),%%xmm1\n"           // Load coefficient for upper-left pixel Y into XMM1.
         "movdqa (%%esi,%%ebx),%%xmm2\n"           // Load coefficient for upper-right pixel Y into XMM2.
         "xorl %%eax,%%eax\n"                      // Clear EAX.
         "xorl %%ebx,%%ebx\n"                      // Clear EBX.
         
         "movb (%%edx,%%ecx),%%al\n"               // Load lower-left pixel Y into AL.
         "movb 1(%%edx,%%ecx),%%bl\n"              // Load lower-right pixel Y into BL.
         "shll $4,%%eax\n"                         // Multiply EAX by 16 to index into table.
         "shll $4,%%ebx\n"                         // Multiply EBX by 16 to index into table.
         "movdqa (%%esi,%%eax),%%xmm3\n"           // Load coefficient for lower-left pixel Y into XMM3.
         "movdqa (%%esi,%%ebx),%%xmm4\n"           // Load coefficient for lower-right pixel Y into XMM4.
         
         "movl %5,%%edi\n"                         // Load buffer pointer into EDI (for later use).
         "movl %6,%%ecx\n"                         // Load pitch into ECX (for later use).
         
         // Add Cb and Cr on top of Y.
         
         "paddd %%xmm0,%%xmm1\n"                   // Add chroma channels to upper-left pixel.
         "paddd %%xmm0,%%xmm2\n"                   // Add chroma channels to upper-right pixel.
         "paddd %%xmm0,%%xmm3\n"                   // Add chroma channels to lower-left pixel.
         "paddd %%xmm0,%%xmm4\n"                   // Add chroma channels to lower-right pixel.
                  
         // Pack pixels together.  We need to pack twice per pixel
         // to go from 32bits via 16bits to 8bits.
         //
         // Right now we're simply packing two garbage pixels for the
         // second packing operation.  An alternative would be to pack the
         // four pixels into one XMM register and then do a packed shuffle
         // to split out the lower two pixels before the move.
         
         "packssdw %%xmm2,%%xmm1\n"                // Pack 32bit channels together into 16bit channels on upper two pixels.
         "packssdw %%xmm4,%%xmm3\n"                // Pack 32bit channels together into 16bit channels on lower two pixels.
         "packuswb %%xmm6,%%xmm1\n"                // Pack 16bit channels together into 8bit channels on upper two pixels (plus two garbage pixels).
         "packuswb %%xmm7,%%xmm3\n"                // Pack 16bit channels together into 8bit channels on lower two pixels (plus two garbage pixels).
         
         // Store pixels.
         
         "movq %%xmm1,(%%edi)\n"                    // Store upper two pixels.
         "movq %%xmm3,(%%edi,%%ecx)\n"              // Store lower two pixels.
         
         // Loop width.
         
         "popl %%ecx\n"                            // Restore width counter.
         
         "addl $2,%2\n"                            // Bump Y pointer by two pixels (1 bpp).
         "incl %3\n"                               // Bump U pointer by one pixel (1 bpp).
         "incl %4\n"                               // Bump V pointer by one pixel (1 bpp).
         "addl $8,%5\n"                            // Bump buffer pointer by two pixels (4 bpp).

         "subl $2,%%ecx\n"
         "jnz .wloop_sse\n"
         
         // Loop height.
         
         "popl %%ecx\n"                            // Restore height counter.

         "movl %10,%%ebx\n"                        // Load buffer stride into EBX.
         "movl %11,%%eax\n"                        // Load Y stride delta into EAX.
         "movl %12,%%edi\n"                        // Load U stride delta into EDI.
         "movl %13,%%esi\n"                        // Load V stride delta into ESI.
         
         "addl %%ebx,%5\n"                         // Bump buffer pointer by stride delta.
         "addl %%eax,%2\n"                         // Bump Y pointer by stride delta.
         "addl %%edi,%3\n"                         // Bump U pointer by stride delta.
         "addl %%esi,%4\n"                         // Bump V pointer by stride delta.
         
         "subl $2,%%ecx\n"
         "jnz .hloop_sse\n"
         
         "popal\n"
      :
      : "m" ( height ),                                        // 0
        "m" ( width ),                                         // 1
        "m" ( ydata ),                                         // 2
        "m" ( udata ),                                         // 3
        "m" ( vdata ),                                         // 4
        "m" ( buffer ),                                        // 5
        "m" ( pitch ),                                         // 6
        "m" ( ycoeff ),                                        // 7
        "m" ( ucoeff ),                                        // 8
        "m" ( vcoeff ),                                        // 9
        "m" ( stride ),                                        // 10
        "m" ( ystrideDelta ),                                  // 11
        "m" ( ustrideDelta ),                                  // 12
        "m" ( vstrideDelta ),                                  // 13
        "m" ( ypitch )                                         // 14
   );
   
   #endif
}
#endif
696