Torque3D Documentation / _generateds / platformCPUCount.cpp

platformCPUCount.cpp

Engine/source/platform/platformCPUCount.cpp

More...

Detailed Description

  1
  2// Original code is:
  3// Copyright (c) 2005 Intel Corporation 
  4// All Rights Reserved
  5//
  6// CPUCount.cpp : Detects three forms of hardware multi-threading support across IA-32 platform
  7//             The three forms of HW multithreading are: Multi-processor, Multi-core, and 
  8//             HyperThreading Technology.
  9//             This application enumerates all the logical processors enabled by OS and BIOS,
 10//             determine the HW topology of these enabled logical processors in the system 
 11//             using information provided by CPUID instruction.
 12//             A multi-processing system can support any combination of the three forms of HW
 13//             multi-threading support. The relevant topology can be identified using a 
 14//             three level decomposition of the "initial APIC ID" into 
 15//             Package_id, core_id, and SMT_id. Such decomposition provides a three-level map of 
 16//             the topology of hardware resources and
 17//             allow multi-threaded software to manage shared hardware resources in 
 18//             the platform to reduce resource contention
 19
 20//             Multicore detection algorithm for processor and cache topology requires
 21//             all leaf functions of CPUID instructions be available. System administrator
 22//             must ensure BIOS settings are not configured to restrict CPUID functionality.
 23//-------------------------------------------------------------------------------------------------
 24
 25#if defined(TORQUE_OS_LINUX) || defined(LINUX)
 26
 27// TODO: GCC code doesn't compile in Release with optimizations; move this code to the platform layer
 28
 29#else
 30
 31#include "platform/platform.h"
 32#include "platform/platformCPUCount.h"
 33
 34#if defined(TORQUE_OS_LINUX) || defined(TORQUE_OS_OSX)
 35
 36#ifdef TORQUE_OS_LINUX
 37//    The Linux source code listing can be compiled using Linux kernel version 2.6 
 38// or higher (e.g. RH 4AS-2.8 using GCC 3.4.4). 
 39// Due to syntax variances of Linux affinity APIs with earlier kernel versions 
 40// and dependence on glibc library versions, compilation on Linux environment 
 41// with older kernels and compilers may require kernel patches or compiler upgrades.
 42
 43#include <stdlib.h>
 44#include <unistd.h>
 45#include <string.h>
 46#include <sched.h>
 47#define DWORD unsigned long
 48#elif defined( TORQUE_OS_WIN )
 49#include <windows.h>
 50#elif defined( TORQUE_OS_MAC )
 51#  include <sys/types.h>
 52#  include <sys/sysctl.h>
 53#else
 54#error Not implemented on platform.
 55#endif
 56#include <stdio.h>
 57#include <assert.h>
 58
 59namespace CPUInfo {
 60
 61#define HWD_MT_BIT         0x10000000     // EDX[28]  Bit 28 is set if HT or multi-core is supported
 62#define NUM_LOGICAL_BITS   0x00FF0000     // EBX[23:16] Bits 16-23 in ebx contain the number of logical
 63      // processors per physical processor when cpuid is executed with 
 64      // eax set to 1
 65#define NUM_CORE_BITS      0xFC000000     // EAX[31:26] Bits 26-31 in eax contain the number of cores minus one
 66      // per physical processor when cpuid is executed with 
 67      // eax set to 4. 
 68
 69
 70#define INITIAL_APIC_ID_BITS  0xFF000000  // EBX[31:24] Bits 24-31 (8 bits) return the 8-bit unique 
 71      // initial APIC ID for the processor this code is running on.
 72
 73
         // Forward declarations of the CPUID helpers defined below. They are compiled out
         // on Mac, where CPUCount() queries sysctl instead of executing CPUID.
 74      #ifndef TORQUE_OS_MAC
 75      static U32  CpuIDSupported(void);      
 76      static U32  find_maskwidth(unsigned int);
 77      static U32  HWD_MTSupported(void);
 78      static U32  MaxLogicalProcPerPhysicalProc(void);
 79      static U32  MaxCorePerPhysicalProc(void);
 80      static U8 GetAPIC_ID(void);
 81      static U8 GetNzbSubID(U8, U8, U8);
 82      #endif
 83
         // Text buffer that CPUCount() clears on entry and, on the non-Mac path, appends
         // one formatted line to for every logical processor it enumerates.
 84      static char g_s3Levels[2048];
 85
 86#ifndef TORQUE_OS_MAC
 87
 88      //
 89      // CpuIDSupported will return 0 if CPUID instruction is unavailable. Otherwise, it will return 
 90      // the maximum supported standard function.
         //
         // The returned value is whatever CPUID leaf 0 (eax = 0) leaves in eax; the local
         // starts at 0, so 0 is also what comes back when a catch handler runs.
         // NOTE(review): a C++ catch (...) presumably does not intercept an invalid-opcode
         // fault raised by inline assembly unless the compiler maps hardware faults to C++
         // exceptions -- confirm before relying on the "unavailable" path.
 91      //
 92      static U32 CpuIDSupported(void)
 93      {
 94         U32 maxInputValue = 0;
 95         // If CPUID instruction is supported
 96#ifdef TORQUE_COMPILER_GCC
 97         try    
 98         {     
 99            // call cpuid with eax = 0
100            asm
101               (
               // ebx is saved and restored by hand around cpuid; it is not in the clobber list.
102               "pushl %%ebx\n\t"
103               "xorl %%eax,%%eax\n\t"
104               "cpuid\n\t"
105               "popl %%ebx\n\t"
106               : "=a" (maxInputValue)
107               : 
108               : "%ecx", "%edx"
109               );    
110         }
111         catch (...)
112         {
113            return(0);                   // cpuid instruction is unavailable
114         }
115#elif defined( TORQUE_COMPILER_VISUALC )
116         try
117         {
118            // call cpuid with eax = 0
119            __asm
120            {
121               xor eax, eax
122                  cpuid
123                  mov maxInputValue, eax
124            }
125         }
126         catch (...)
127         {
128            // cpuid instruction is unavailable
               // (falls through; maxInputValue is still 0 here)
129         }
130#else
131#  error Not implemented.
132#endif
133
134         return maxInputValue;
135      }
136
137
138
139      //
140      // Function returns the maximum cores per physical package. Note that the number of 
141      // AVAILABLE cores per physical to be used by an application might be less than this
142      // maximum value.
         //
         // Returns 1 immediately when HWD_MTSupported() reports 0. Otherwise it executes
         // CPUID leaf 4 (eax = 4, ecx = 0) and returns EAX[31:26] + 1. Regeax starts at 0,
         // so the Visual C++ path returns 1 when leaf 4 is not supported.
         // NOTE(review): in the GCC path "jl .single_core" jumps past "popl %ebx", so the
         // pushed ebx appears never to be popped on that path; the jump also skips the
         // statement that stores eax into Regeax. The three asm statements additionally
         // rely on eax/ecx surviving between them. Confirm before enabling this path.
143      //
144
145      static U32 MaxCorePerPhysicalProc(void)
146      {
147
148         U32 Regeax        = 0;
149
150         if (!HWD_MTSupported()) return (U32) 1;  // Single core
151#ifdef TORQUE_COMPILER_GCC
152         {
153            asm
154               (
155               "pushl %ebx\n\t"
156               "xorl %eax, %eax\n\t"
157               "cpuid\n\t"
158               "cmpl $4, %eax\n\t"        // check if cpuid supports leaf 4
159               "jl .single_core\n\t"      // Single core
160               "movl $4, %eax\n\t"     
161               "movl $0, %ecx\n\t"        // start with index = 0; Leaf 4 reports
162               "popl %ebx\n\t"
163               );                      // at least one valid cache level
164            asm
165               (
166               "cpuid"
167               : "=a" (Regeax)
168               :
169               : "%ecx", "%edx"
170               );    
171            asm
172               (
173               "jmp .multi_core\n"
174               ".single_core:\n\t"
175               "xor %eax, %eax\n"
176               ".multi_core:"
177               );    
178         }
179#elif defined( TORQUE_COMPILER_VISUALC )
180         __asm
181         {
182            xor eax, eax
183               cpuid
184               cmp eax, 4        // check if cpuid supports leaf 4
185               jl single_core    // Single core
186               mov eax, 4        
187               mov ecx, 0        // start with index = 0; Leaf 4 reports
188               cpuid          // at least one valid cache level
189               mov Regeax, eax
190               jmp multi_core
191
192single_core:
193            xor eax, eax      
194
195multi_core:
196
197         }
198#else
199#  error Not implemented.
200#endif
            // NUM_CORE_BITS selects EAX[31:26], which holds "cores minus one", hence the +1.
201         return (U32)((Regeax & NUM_CORE_BITS) >> 26)+1;
202
203      }
204
205
206
207      //
208      // The function returns 0 when the hardware multi-threaded bit is not set.
         //
         // When the bit is set the return value is the masked bit itself (HWD_MT_BIT,
         // 0x10000000), not 1, so callers should only test it for zero / non-zero.
         // CPUID leaf 1 is executed only if CpuIDSupported() reports at least 1; otherwise
         // Regedx keeps its initial 0 and the function returns 0.
209      //
210      static U32 HWD_MTSupported(void)
211      {
212
213
214         U32 Regedx      = 0;
215
216
217         if ((CpuIDSupported() >= 1))
218         {
219#ifdef TORQUE_COMPILER_GCC
220            asm 
221               (
               // ebx is saved and restored by hand around cpuid; it is not in the clobber list.
222               "pushl %%ebx\n\t"
223               "movl $1,%%eax\n\t"
224               "cpuid\n\t"
225               "popl %%ebx\n\t"
226               : "=d" (Regedx)
227               :
228               : "%eax","%ecx"
229               );
230#elif defined( TORQUE_COMPILER_VISUALC )
231            __asm
232            {
233               mov eax, 1
234                  cpuid
235                  mov Regedx, edx
236            }     
237#else
238#  error Not implemented.
239#endif
240         }
241
242         return (Regedx & HWD_MT_BIT);  
243
244
245      }
246
247
248
249      //
250      // Function returns the maximum logical processors per physical package. Note that the number of 
251      // AVAILABLE logical processors per physical to be used by an application might be less than this
252      // maximum value.
         //
         // Returns 1 when HWD_MTSupported() reports 0; otherwise returns EBX[23:16] as read
         // from CPUID leaf 1 (eax = 1).
253      //
254      static U32 MaxLogicalProcPerPhysicalProc(void)
255      {
256
257         U32 Regebx = 0;
258
259         if (!HWD_MTSupported()) return (U32) 1;
260#ifdef TORQUE_COMPILER_GCC
261         asm 
262            (
263            "movl $1,%%eax\n\t"
264            "cpuid"
265            : "=b" (Regebx)
266            :
267            : "%eax","%ecx","%edx"
268            );
269#elif defined( TORQUE_COMPILER_VISUALC )
270         __asm
271         {
272            mov eax, 1
273               cpuid
274               mov Regebx, ebx
275         }
276#else
277#  error Not implemented.
278#endif
            // NUM_LOGICAL_BITS selects EBX[23:16]; shift it down to a plain count.
279         return (unsigned int) ((Regebx & NUM_LOGICAL_BITS) >> 16);
280
281      }
282
283
284      static U8 GetAPIC_ID(void)
285      {
            // Returns the 8-bit initial APIC ID, i.e. EBX[31:24] as read from CPUID leaf 1
            // (eax = 1). CPUCount() calls this right after pinning the thread to one
            // logical processor and uses the value as that processor's ID.
286
287         U32 Regebx = 0;
288#ifdef TORQUE_COMPILER_GCC
289         asm
290            (
291            "movl $1, %%eax\n\t" 
292            "cpuid"
293            : "=b" (Regebx) 
294            :
295            : "%eax","%ecx","%edx" 
296            );
297
298#elif defined( TORQUE_COMPILER_VISUALC )
299         __asm
300         {
301            mov eax, 1
302               cpuid
303               mov Regebx, ebx
304         }
305#else
306#  error Not implemented.
307#endif                                
308
            // INITIAL_APIC_ID_BITS selects EBX[31:24]; shift it down into the low byte.
309         return (unsigned char) ((Regebx & INITIAL_APIC_ID_BITS) >> 24);
310
311      }
312
//
// Determine the width of the bit field that can represent the value count_item.
//
// CountItem is the number of distinct sub-IDs that must be representable. The
// result is the number of bits needed to hold the values 0 .. CountItem - 1,
// i.e. the position of the highest set bit in (CountItem - 1) plus one, or 0
// when CountItem is 1.
//
// Only the low 16 bits of (CountItem - 1) are examined. The previous inline
// assembly used a 16-bit BSR, and that truncation is kept so results are
// unchanged (e.g. CountItem == 0 wraps to 0xFFFF and yields 16).
//
// This replaces per-compiler inline assembly. The GCC variant spread push/pop
// pairs, a conditional jump and a global label over three separate asm
// statements, and read eax in a statement that declared no inputs. A plain
// loop needs no such guarantees from the compiler and is more than fast
// enough for the handful of calls made during CPU enumeration.
//
U32 find_maskwidth(U32 CountItem)
{
   // Value whose bit width is wanted, truncated to 16 bits like the old "bsr cx, ax".
   U32 remaining = (CountItem - 1) & 0xFFFFu;
   U32 MaskWidth = 0;

   // Count how many right shifts it takes to clear the value.
   while (remaining != 0)
   {
      ++MaskWidth;
      remaining >>= 1;
   }

   return MaskWidth;
}
379
380
381      //
382      // Extract the subset of bit field from the 8-bit value FullID.  It returns the 8-bit sub ID value
383      //
384      static U8 GetNzbSubID(U8 FullID,
385         U8 MaxSubIDValue,
386         U8 ShiftCount)
387      {
388         U32 MaskWidth;
389         U8 MaskBits;
390
391         MaskWidth = find_maskwidth((U32) MaxSubIDValue);
392         MaskBits  = (0xff << ShiftCount) ^ 
393            ((U8) (0xff << (ShiftCount + MaskWidth)));
394
395         return (FullID & MaskBits);
396      }
397
398#endif
399
400
401      //
         // CPUCount: counts logical processors, cores and physical packages and classifies
         // the machine as single/multi-core with hyper-threading enabled, disabled or not
         // capable.
         //
         //   TotAvailLogical  out: logical processors counted. Written on the Mac path and
         //                    after enumeration; left untouched on the early
         //                    CONFIG_UserConfigIssue returns.
         //   TotAvailCore     out: cores counted (set to 1 on entry).
         //   PhysicalNum      out: physical packages counted (set to 1 on entry).
         //   returns          one of the EConfig values; CONFIG_UserConfigIssue when a
         //                    sysctl query fails (Mac) or the process affinity does not
         //                    cover every processor (Linux / Windows).
         //
         // Mac path: reads "hw.ncpu" and "hw.packages" through sysctlbyname and reports the
         // CPU count as both the logical and the core count. numLPEnabled and MaxLPPerCore
         // keep their initial values there, so the classification at the end always lands
         // on one of the *HTNotCapable results.
         //
         // Other paths: pin the thread to each processor in turn, read its initial APIC ID,
         // split that into SMT / core / package sub-IDs, then count the distinct core and
         // package IDs.
         //
         // NOTE(review): the ID tables hold 256 entries and dwAffinityMask is a 32-bit
         // value built with "1 << j"; presumably this assumes at most 32 processors --
         // confirm. If no processor could be pinned (numLPEnabled == 0) the counting code
         // still reads tblPkgID[0] / tblCoreID[0].
         // NOTE(review): pCoreMask and pPackageMask are filled in but never read here.
402      //
403      //
404      EConfig CPUCount(U32& TotAvailLogical, U32& TotAvailCore, U32& PhysicalNum)
405      {
406         EConfig StatusFlag = CONFIG_UserConfigIssue;
407
408         g_s3Levels[0] = 0;
409         TotAvailCore = 1;
410         PhysicalNum  = 1;
411         
412         U32 numLPEnabled = 0;
413         S32 MaxLPPerCore = 1;
414
415#ifdef TORQUE_OS_MAC
416
417         //FIXME: This isn't a proper port but more or less just some sneaky cheating
418         //  to get around having to mess with yet another crap UNIX-style API.  Seems
419         //  like there isn't a way to do this that's working across all OSX incarnations
420         //  and machine configurations anyway.
421
422         S32 numCPUs;
423         S32 numPackages;
424
425         // Get the number of CPUs.
426
427         size_t len = sizeof( numCPUs );
428         if( sysctlbyname( "hw.ncpu", &numCPUs, &len, 0, 0 ) == -1 )
429            return CONFIG_UserConfigIssue;
430
431         // Get the number of packages.
432         len = sizeof( numPackages );
433         if( sysctlbyname( "hw.packages", &numPackages, &len, 0, 0 ) == -1 )
434            return CONFIG_UserConfigIssue;
435
436         TotAvailCore = numCPUs;
437         TotAvailLogical = numCPUs;
438         PhysicalNum = numPackages;
439#else
440
441         U32 dwAffinityMask;
442         S32 j = 0;
443         U8 apicID, PackageIDMask;
444         U8 tblPkgID[256], tblCoreID[256], tblSMTID[256];
445         char  tmp[256];
446
447#ifdef TORQUE_OS_LINUX
448         //we need to make sure that this process is allowed to run on 
449         //all of the logical processors that the OS itself can run on.
450         //A process could acquire/inherit affinity settings that restricts the 
451         // current process to run on a subset of all logical processor visible to OS.
452
453         // Linux doesn't easily allow us to look at the Affinity Bitmask directly,
454         // but it does provide an API to test affinity maskbits of the current process 
455         // against each logical processor visible under OS.
456         S32 sysNumProcs = sysconf(_SC_NPROCESSORS_CONF); //This will tell us how many 
457         //CPUs are currently enabled.
458
459         //this will tell us which processors this process can run on. 
460         cpu_set_t allowedCPUs;   
461         sched_getaffinity(0, sizeof(allowedCPUs), &allowedCPUs);
462
463         for (S32 i = 0; i < sysNumProcs; i++ )
464         {
465            if ( CPU_ISSET(i, &allowedCPUs) == 0 )
466               return CONFIG_UserConfigIssue;
467         }
468#elif defined( TORQUE_OS_WIN )
469         DWORD dwProcessAffinity, dwSystemAffinity;
470         GetProcessAffinityMask(GetCurrentProcess(), 
471            &dwProcessAffinity,
472            &dwSystemAffinity);
473         if (dwProcessAffinity != dwSystemAffinity)  // not all CPUs are enabled
474            return CONFIG_UserConfigIssue;
475#else
476#  error Not implemented.
477#endif
478
479         // Assume that cores within a package have the SAME number of 
480         // logical processors.  Also, values returned by
481         // MaxLogicalProcPerPhysicalProc and MaxCorePerPhysicalProc do not have
482         // to be power of 2.
483
484         MaxLPPerCore = MaxLogicalProcPerPhysicalProc() / MaxCorePerPhysicalProc();
485         dwAffinityMask = 1;
486
            // The loop header and the opening of its "if" differ per OS; both variants
            // share the loop body that follows the #endif below.
487#ifdef TORQUE_OS_LINUX
488         cpu_set_t currentCPU;
489         while ( j < sysNumProcs )
490         {
491            CPU_ZERO(&currentCPU);
492            CPU_SET(j, &currentCPU);
493            if ( sched_setaffinity (0, sizeof(currentCPU), &currentCPU) == 0 )
494            {
495               sleep(0);  // Ensure system to switch to the right CPU
496#elif defined( TORQUE_OS_WIN )
497         while (dwAffinityMask && dwAffinityMask <= dwSystemAffinity)
498         {
499            if (SetThreadAffinityMask(GetCurrentThread(), dwAffinityMask))
500            {
501               Sleep(0);  // Ensure system to switch to the right CPU
502#else
503#  error Not implemented.
504#endif
505               apicID = GetAPIC_ID();
506
507
508               // Store SMT ID and core ID of each logical processor
509               // Shift value for SMT ID is 0
510               // Shift value for core ID is the mask width for maximum logical
511               // processors per core
512
513               tblSMTID[j]  = GetNzbSubID(apicID, MaxLPPerCore, 0);
514               U8 maxCorePPP = MaxCorePerPhysicalProc();
515               U8 maskWidth = find_maskwidth(MaxLPPerCore);
516               tblCoreID[j] = GetNzbSubID(apicID, maxCorePPP, maskWidth);
517
518               // Extract package ID, assume single cluster.
519               // Shift value is the mask width for max Logical per package
520
521               PackageIDMask = (unsigned char) (0xff << 
522                  find_maskwidth(MaxLogicalProcPerPhysicalProc()));
523
524               tblPkgID[j] = apicID & PackageIDMask;
                  // Append a human-readable line for this processor to g_s3Levels.
525               sprintf(tmp,"  AffinityMask = %d; Initial APIC = %d; Physical ID = %d, Core ID = %d,  SMT ID = %d\n",
526                  dwAffinityMask, apicID, tblPkgID[j], tblCoreID[j], tblSMTID[j]);
527               dStrcat(g_s3Levels, tmp, 2048);
528
529               numLPEnabled ++;   // Number of available logical processors in the system.
530
531            } // if
532
533            j++;  
534            dwAffinityMask = 1 << j;
535         } // while
536
537         // restore the affinity setting to its original state
538#ifdef TORQUE_OS_LINUX
539         sched_setaffinity (0, sizeof(allowedCPUs), &allowedCPUs);
540         sleep(0);
541#elif defined( TORQUE_OS_WIN )
542         SetThreadAffinityMask(GetCurrentThread(), dwProcessAffinity);
543         Sleep(0);
544#else
545#  error Not implemented.
546#endif
547         TotAvailLogical = numLPEnabled;
548
549         //
550         // Count available cores (TotAvailCore) in the system
551         //
            // A core is identified by its package ID OR-ed with its core ID; each distinct
            // value seen gets its own bucket and TotAvailCore counts the buckets.
552         U8 CoreIDBucket[256];
553         DWORD ProcessorMask, pCoreMask[256];
554         U32 i, ProcessorNum;
555
556         CoreIDBucket[0] = tblPkgID[0] | tblCoreID[0];
557         ProcessorMask = 1;
558         pCoreMask[0] = ProcessorMask;
559
560         for (ProcessorNum = 1; ProcessorNum < numLPEnabled; ProcessorNum++)
561         {
562            ProcessorMask <<= 1;
563            for (i = 0; i < TotAvailCore; i++)
564            {
565               // Comparing bit-fields of logical processors residing in different packages
566               // Assuming the bit-masks are the same on all processors in the system.
567               if ((tblPkgID[ProcessorNum] | tblCoreID[ProcessorNum]) == CoreIDBucket[i])
568               {
569                  pCoreMask[i] |= ProcessorMask;
570                  break;
571               }
572
573            }  // for i
574
575            if (i == TotAvailCore)   // did not match any bucket.  Start a new one.
576            {
577               CoreIDBucket[i] = tblPkgID[ProcessorNum] | tblCoreID[ProcessorNum];
578               pCoreMask[i] = ProcessorMask;
579
580               TotAvailCore++;   // Number of available cores in the system
581
582            }
583
584         }  // for ProcessorNum
585
586
587         //
588         // Count physical processor (PhysicalNum) in the system
589         //
            // Same bucketing as above, keyed on the package ID alone.
590         U8 PackageIDBucket[256];
591         DWORD pPackageMask[256];
592
593         PackageIDBucket[0] = tblPkgID[0];
594         ProcessorMask = 1;
595         pPackageMask[0] = ProcessorMask;
596
597         for (ProcessorNum = 1; ProcessorNum < numLPEnabled; ProcessorNum++)
598         {
599            ProcessorMask <<= 1;
600            for (i = 0; i < PhysicalNum; i++)
601            {
602               // Comparing bit-fields of logical processors residing in different packages
603               // Assuming the bit-masks are the same on all processors in the system.
604               if (tblPkgID[ProcessorNum]== PackageIDBucket[i])
605               {
606                  pPackageMask[i] |= ProcessorMask;
607                  break;
608               }
609
610            }  // for i
611
612            if (i == PhysicalNum)   // did not match any bucket.  Start a new one.
613            {
614               PackageIDBucket[i] = tblPkgID[ProcessorNum];
615               pPackageMask[i] = ProcessorMask;
616
617               PhysicalNum++; // Total number of physical processors in the system
618
619            }
620
621         }  // for ProcessorNum
622#endif
623
624         //
625         // Check to see if the system is multi-core 
626         // Check if the system is hyper-threading
627         //
            // More cores than packages means multi-core. Within either case, one logical
            // processor per core means HT is not possible, and more logical processors
            // than cores means HT is currently enabled.
628         if (TotAvailCore > PhysicalNum) 
629         {
630            // Multi-core
631            if (MaxLPPerCore == 1)
632               StatusFlag = CONFIG_MultiCoreAndHTNotCapable;
633            else if (numLPEnabled > TotAvailCore)
634               StatusFlag = CONFIG_MultiCoreAndHTEnabled;
635            else StatusFlag = CONFIG_MultiCoreAndHTDisabled;
636
637         }
638         else
639         {
640            // Single-core
641            if (MaxLPPerCore == 1)
642               StatusFlag = CONFIG_SingleCoreAndHTNotCapable;
643            else if (numLPEnabled > TotAvailCore)
644               StatusFlag = CONFIG_SingleCoreHTEnabled;
645            else StatusFlag = CONFIG_SingleCoreHTDisabled;
646
647
648         }
649
650
651
652         return StatusFlag;
653      }
654
655} // namespace CPUInfo
656#endif
657
658#endif
659