1 /**
2  * This is a quick change of core.cpuid, to be usable in programs without a runtime.
3  *
4  * Copyright: Copyright Don Clugston 2007 - 2009.
5  *            Copyright Auburn Sounds 2017.
6  * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7  * Authors:   Don Clugston, Tomas Lindquist Olsen <tomas@famolsen.dk>
8  * Source:    $(DRUNTIMESRC core/_cpuid.d)
9  */
10 module dplug.core.cpuid;
11 
12 @trusted:
13 nothrow:
14 @nogc:
15 
16 // If optimizing for a particular processor, it is generally better
17 // to identify based on features rather than model. NOTE: Normally
18 // it's only worthwhile to optimise for the latest Intel and AMD CPU,
19 // with a backup for other CPUs.
20 // Pentium    -- preferPentium1()
21 // PMMX       --   + mmx()
22 // PPro       -- default
23 // PII        --   + mmx()
24 // PIII       --   + mmx() + sse()
25 // PentiumM   --   + mmx() + sse() + sse2()
26 // Pentium4   -- preferPentium4()
27 // PentiumD   --   + isX86_64()
28 // Core2      -- default + isX86_64()
29 // AMD K5     -- preferPentium1()
30 // AMD K6     --   + mmx()
31 // AMD K6-II  --   + mmx() + 3dnow()
32 // AMD K7     -- preferAthlon()
33 // AMD K8     --   + sse2()
34 // AMD K10    --   + isX86_64()
35 // Cyrix 6x86 -- preferPentium1()
36 //    6x86MX  --   + mmx()
37 version(D_InlineAsm_X86)
38 {
39     version = InlineAsm_X86_Any;
40 }
41 else version(D_InlineAsm_X86_64)
42 {
43     version = InlineAsm_X86_Any;
44 }
45 
46 public:
47 
/// Cache size and behaviour.
///
/// Describes one level of data (or unified) cache.
struct CacheInfo
{
    /// Size of the cache, in kilobytes, per CPU.
    /// For L1 unified (data + code) caches, this size is half the physical size.
    /// (we don't halve it for larger sizes, since normally
    /// data size is much greater than code size for critical loops).
    size_t size;
    /// Number of ways of associativity, e.g.:
    /// $(UL
    /// $(LI 1 = direct mapped)
    /// $(LI 2 = 2-way set associative)
    /// $(LI 3 = 3-way set associative)
    /// $(LI ubyte.max = fully associative)
    /// )
    ubyte associativity;
    /// Number of bytes read into the cache when a cache miss occurs.
    uint lineSize;
}
67 
68 public:
69     /// $(RED Scheduled for deprecation. Please use $(D dataCaches) instead.)
70     // Note: When we deprecate it, we simply make it private.
71     __gshared CacheInfo[5] datacache;
72 
73 @property
74 {
75     /// The data caches. If there are fewer than 5 physical caches levels,
76     /// the remaining levels are set to size_t.max (== entire memory space)
77     const(CacheInfo)[5] dataCaches() { return _dataCaches; }
78 
79     /// Returns vendor string, for display purposes only.
80     /// Do NOT use this to determine features!
81     /// Note that some CPUs have programmable vendorIDs.
82     string vendor()     {return _vendor;}
83     /// Returns processor string, for display purposes only
84     string processor()  {return _processor;}
85 
86     /// Does it have an x87 FPU on-chip?
87     bool x87onChip()    {return _x87onChip;}
88     /// Is MMX supported?
89     bool mmx()          {return _mmx;}
90     /// Is SSE supported?
91     bool sse()          {return _sse;}
92     /// Is SSE2 supported?
93     bool sse2()         {return _sse2;}
94     /// Is SSE3 supported?
95     bool sse3()         {return _sse3;}
96     /// Is SSSE3 supported?
97     bool ssse3()         {return _ssse3;}
98     /// Is SSE4.1 supported?
99     bool sse41()        {return _sse41;}
100     /// Is SSE4.2 supported?
101     bool sse42()        {return _sse42;}
102     /// Is SSE4a supported?
103     bool sse4a()        {return _sse4a;}
104     /// Is AES supported
105     bool aes()          {return _aes;}
106     /// Is pclmulqdq supported
107     bool hasPclmulqdq() {return _hasPclmulqdq;}
108     /// Is rdrand supported
109     bool hasRdrand()    {return _hasRdrand;}
110     /// Is AVX supported
111     bool avx()          {return _avx;}
112     /// Is VEX-Encoded AES supported
113     bool vaes()         {return _vaes;}
114     /// Is vpclmulqdq supported
115     bool hasVpclmulqdq(){return _hasVpclmulqdq; }
116     /// Is FMA supported
117     bool fma()          {return _fma;}
118     /// Is FP16C supported
119     bool fp16c()        {return _fp16c;}
120     /// Is AVX2 supported
121     bool avx2()         {return _avx2;}
122     /// Is HLE (hardware lock elision) supported
123     bool hle()          {return _hle;}
124     /// Is RTM (restricted transactional memory) supported
125     bool rtm()          {return _rtm;}
126     /// Is rdseed supported
127     bool hasRdseed()    {return _hasRdseed;}
128     /// Is SHA supported
129     bool hasSha()       {return _hasSha;}
130     /// Is AMD 3DNOW supported?
131     bool amd3dnow()     {return _amd3dnow;}
132     /// Is AMD 3DNOW Ext supported?
133     bool amd3dnowExt()  {return _amd3dnowExt;}
134     /// Are AMD extensions to MMX supported?
135     bool amdMmx()       {return _amdMmx;}
136     /// Is fxsave/fxrstor supported?
137     bool hasFxsr()          {return _hasFxsr;}
138     /// Is cmov supported?
139     bool hasCmov()          {return _hasCmov;}
140     /// Is rdtsc supported?
141     bool hasRdtsc()         {return _hasRdtsc;}
142     /// Is cmpxchg8b supported?
143     bool hasCmpxchg8b()     {return _hasCmpxchg8b;}
    /// Is cmpxchg16b supported?
145     bool hasCmpxchg16b()    {return _hasCmpxchg16b;}
146     /// Is SYSENTER/SYSEXIT supported?
147     bool hasSysEnterSysExit() {return _hasSysEnterSysExit;}
148     /// Is 3DNow prefetch supported?
149     bool has3dnowPrefetch()   {return _has3dnowPrefetch;}
150     /// Are LAHF and SAHF supported in 64-bit mode?
151     bool hasLahfSahf()        {return _hasLahfSahf;}
152     /// Is POPCNT supported?
153     bool hasPopcnt()        {return _hasPopcnt;}
154     /// Is LZCNT supported?
155     bool hasLzcnt()         {return _hasLzcnt;}
156     /// Is this an Intel64 or AMD 64?
157     bool isX86_64()         {return _isX86_64;}
158 
159     /// Is this an IA64 (Itanium) processor?
160     bool isItanium()        { return _isItanium; }
161 
162     /// Is hyperthreading supported?
163     bool hyperThreading()   { return _hyperThreading; }
164     /// Returns number of threads per CPU
165     uint threadsPerCPU()    {return _threadsPerCPU;}
166     /// Returns number of cores in CPU
167     uint coresPerCPU()      {return _coresPerCPU;}
168 
169     /// Optimisation hints for assembly code.
170     ///
171     /// For forward compatibility, the CPU is compared against different
172     /// microarchitectures. For 32-bit x86, comparisons are made against
173     /// the Intel PPro/PII/PIII/PM family.
174     ///
175     /// The major 32-bit x86 microarchitecture 'dynasties' have been:
176     ///
177     /// $(UL
178     /// $(LI Intel P6 (PentiumPro, PII, PIII, PM, Core, Core2). )
179     /// $(LI AMD Athlon (K7, K8, K10). )
180     /// $(LI Intel NetBurst (Pentium 4, Pentium D). )
181     /// $(LI In-order Pentium (Pentium1, PMMX, Atom) )
182     /// )
183     ///
184     /// Other early CPUs (Nx586, AMD K5, K6, Centaur C3, Transmeta,
185     /// Cyrix, Rise) were mostly in-order.
186     ///
187     /// Some new processors do not fit into the existing categories:
188     ///
189     /// $(UL
190     /// $(LI Intel Atom 230/330 (family 6, model 0x1C) is an in-order core. )
191     /// $(LI Centaur Isiah = VIA Nano (family 6, model F) is an out-of-order core. )
192     /// )
193     ///
194     /// Within each dynasty, the optimisation techniques are largely
195     /// identical (eg, use instruction pairing for group 4). Major
196     /// instruction set improvements occur within each dynasty.
197 
198     /// Does this CPU perform better on AMD K7 code than PentiumPro..Core2 code?
199     bool preferAthlon() { return _preferAthlon; }
200     /// Does this CPU perform better on Pentium4 code than PentiumPro..Core2 code?
201     bool preferPentium4() { return _preferPentium4; }
202     /// Does this CPU perform better on Pentium I code than Pentium Pro code?
203     bool preferPentium1() { return _preferPentium1; }
204 }
205 
206 private __gshared 
207 {
    /* Backing storage for the query property functions above, which keep
     * the () call syntax of core.cpuid.
     * NOTE: in core.cpuid these were immutables set by the static this();
     * here they are plain __gshared variables.
     */
212     CacheInfo[5] _dataCaches;
213     string _vendor;
214     string _processor;
215     bool _x87onChip;
216     bool _mmx;
217     bool _sse;
218     bool _sse2;
219     bool _sse3;
220     bool _ssse3;
221     bool _sse41;
222     bool _sse42;
223     bool _sse4a;
224     bool _aes;
225     bool _hasPclmulqdq;
226     bool _hasRdrand;
227     bool _avx;
228     bool _vaes;
229     bool _hasVpclmulqdq;
230     bool _fma;
231     bool _fp16c;
232     bool _avx2;
233     bool _hle;
234     bool _rtm;
235     bool _hasRdseed;
236     bool _hasSha;
237     bool _amd3dnow;
238     bool _amd3dnowExt;
239     bool _amdMmx;
240     bool _hasFxsr;
241     bool _hasCmov;
242     bool _hasRdtsc;
243     bool _hasCmpxchg8b;
244     bool _hasCmpxchg16b;
245     bool _hasSysEnterSysExit;
246     bool _has3dnowPrefetch;
247     bool _hasLahfSahf;
248     bool _hasPopcnt;
249     bool _hasLzcnt;
250     bool _isX86_64;
251     bool _isItanium;
252     bool _hyperThreading;
253     uint _threadsPerCPU;
254     uint _coresPerCPU;
255     bool _preferAthlon;
256     bool _preferPentium4;
257     bool _preferPentium1;
258 }
259 
260 __gshared:
261     // All these values are set only once, and never subsequently modified.
262 public:
263     /// $(RED Warning: This field will be turned into a property in a future release.)
264     ///
265     /// Processor type (vendor-dependent).
266     /// This should be visible ONLY for display purposes.
267     uint stepping, model, family;
268     /// $(RED This field has been deprecated. Please use $(D cacheLevels) instead.)
269     uint numCacheLevels = 1;
270     /// The number of cache levels in the CPU.
271     @property uint cacheLevels() { return numCacheLevels; }
272 private:
273 
// Raw detection state. On x86 builds with inline asm it is filled in by
// cpuidX86() and its cache/topology helpers.
struct CpuFeatures
{
    bool probablyIntel; // true = _probably_ an Intel processor, might be faking
    bool probablyAMD; // true = _probably_ an AMD processor
    string processorName; // trimmed slice of processorNameBuffer, or "Unknown CPU"
    char [12] vendorID; // CPUID leaf 0: EBX, EDX, ECX in that order
    char [48] processorNameBuffer; // CPUID leaves 0x8000_0002 .. 0x8000_0004
    uint features = 0;     // mmx, sse, sse2, hyperthreading, etc
    uint miscfeatures = 0; // sse3, etc.
    uint extfeatures = 0;  // HLE, AVX2, RTM, etc.
    uint amdfeatures = 0;  // 3DNow!, mmxext, etc
    uint amdmiscfeatures = 0; // sse4a, sse5, svm, etc
    ulong xfeatures = 0;   // XFEATURES_ENABLED_MASK
    uint maxCores = 1;
    uint maxThreads = 1;
}
290 
291 __gshared CpuFeatures cpuFeatures;
292 
/* Returns a pointer to the global cpuFeatures record.
 *
 * Hide from the optimizer where cf (a register) is coming from, so that
 * cf doesn't get "optimized away". The idea is to reference
 * the global data through cf so not so many fixups are inserted
 * into the executable image. This is why inlining is disabled below.
 */
CpuFeatures* getCpuFeatures() @nogc nothrow
{
    pragma(inline, false);
    return &cpuFeatures;
}
303 
304     // Note that this may indicate multi-core rather than hyperthreading.
305     @property bool hyperThreadingBit()    { return (cpuFeatures.features&HTT_BIT)!=0;}
306 
307     // feature flags CPUID1_EDX
308     enum : uint
309     {
310         FPU_BIT = 1,
311         TIMESTAMP_BIT = 1<<4, // rdtsc
312         MDSR_BIT = 1<<5,      // RDMSR/WRMSR
313         CMPXCHG8B_BIT = 1<<8,
314         SYSENTERSYSEXIT_BIT = 1<<11,
315         CMOV_BIT = 1<<15,
316         MMX_BIT = 1<<23,
317         FXSR_BIT = 1<<24,
318         SSE_BIT = 1<<25,
319         SSE2_BIT = 1<<26,
320         HTT_BIT = 1<<28,
321         IA64_BIT = 1<<30
322     }
323     // feature flags misc CPUID1_ECX
324     enum : uint
325     {
326         SSE3_BIT = 1,
327         PCLMULQDQ_BIT = 1<<1, // from AVX
328         MWAIT_BIT = 1<<3,
329         SSSE3_BIT = 1<<9,
330         FMA_BIT = 1<<12,     // from AVX
331         CMPXCHG16B_BIT = 1<<13,
332         SSE41_BIT = 1<<19,
333         SSE42_BIT = 1<<20,
334         POPCNT_BIT = 1<<23,
335         AES_BIT = 1<<25, // AES instructions from AVX
336         OSXSAVE_BIT = 1<<27, // Used for AVX
337         AVX_BIT = 1<<28,
338         FP16C_BIT = 1<<29,
339         RDRAND_BIT = 1<<30,
340     }
341     // Feature flags for cpuid.{EAX = 7, ECX = 0}.EBX.
342     enum : uint
343     {
344         FSGSBASE_BIT = 1 << 0,
345         BMI1_BIT = 1 << 3,
346         HLE_BIT = 1 << 4,
347         AVX2_BIT = 1 << 5,
348         SMEP_BIT = 1 << 7,
349         BMI2_BIT = 1 << 8,
350         ERMS_BIT = 1 << 9,
351         INVPCID_BIT = 1 << 10,
352         RTM_BIT = 1 << 11,
353         RDSEED_BIT = 1 << 18,
354         SHA_BIT = 1 << 29,
355     }
356     // feature flags XFEATURES_ENABLED_MASK
357     enum : ulong
358     {
359         XF_FP_BIT  = 0x1,
360         XF_SSE_BIT = 0x2,
361         XF_YMM_BIT = 0x4,
362     }
363     // AMD feature flags CPUID80000001_EDX
364     enum : uint
365     {
366         AMD_MMX_BIT = 1<<22,
367 //      FXR_OR_CYRIXMMX_BIT = 1<<24, // Cyrix/NS: 6x86MMX instructions.
368         FFXSR_BIT = 1<<25,
369         PAGE1GB_BIT = 1<<26, // support for 1GB pages
370         RDTSCP_BIT = 1<<27,
371         AMD64_BIT = 1<<29,
372         AMD_3DNOW_EXT_BIT = 1<<30,
373         AMD_3DNOW_BIT = 1<<31
374     }
375     // AMD misc feature flags CPUID80000001_ECX
376     enum : uint
377     {
378         LAHFSAHF_BIT = 1,
379         LZCNT_BIT = 1<<5,
380         SSE4A_BIT = 1<<6,
381         AMD_3DNOW_PREFETCH_BIT = 1<<8,
382     }
383 
384 
385 version(InlineAsm_X86_Any) {
386 // Note that this code will also work for Itanium in x86 mode.
387 
388 __gshared uint max_cpuid, max_extended_cpuid;
389 
390 // CPUID2: "cache and tlb information"
/**
 * Fills datacache[0 .. 3] from the one-byte cache descriptors returned by
 * CPUID leaf 2 ("cache and tlb information").
 *
 * Only data/unified cache descriptors are decoded; TLB and instruction-cache
 * descriptors are ignored. The Cyrix MediaGX MMX Enhanced, which returns a
 * non-standard leaf 2, is special-cased.
 */
void getcacheinfoCPUID2()
{
    // We are only interested in the data caches
    void decipherCpuid2(ubyte x) @nogc nothrow {
        if (x==0) return;
        // Values from http://www.sandpile.org/ia32/cpuid.htm.
        // Includes Itanium and non-Intel CPUs.
        //
        static immutable ubyte [63] ids = [
            // level 1 cache (9 entries)
            0x0A, 0x0C, 0x0D, 0x2C, 0x60, 0x0E, 0x66, 0x67, 0x68,
            // level 2 cache (29 entries)
            0x41, 0x42, 0x43, 0x44, 0x45, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7F,
            0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x49, 0x4E,
            0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x48, 0x80, 0x81,
            // level 3 cache
            0x22, 0x23, 0x25, 0x29, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D,

            0xD0, 0xD1, 0xD2, 0xD6, 0xD7, 0xD8, 0xDC, 0xDD, 0xDE,
            0xE2, 0xE3, 0xE4, 0xEA, 0xEB, 0xEC
        ];
        static immutable uint [63] sizes = [
            8, 16, 16, 64, 16, 24, 8, 16, 32,
            128, 256, 512, 1024, 2048, 1024, 128, 256, 512, 1024, 2048, 512,
            256, 512, 1024, 2048, 512, 1024, 4096, 6*1024,
            128, 192, 128, 256, 384, 512, 3072, 512, 128,
            512, 1024, 2048, 4096, 4096, 8192, 6*1024, 8192, 12*1024, 16*1024,

            512, 1024, 2048, 1024, 2048, 4096, 1024+512, 3*1024, 6*1024,
            2*1024, 4*1024, 8*1024, 12*1024, 28*1024, 24*1024
        ];
    // CPUBUG: Pentium M reports 0x2C but tests show it is only 4-way associative
        static immutable ubyte [63] ways = [
            2, 4, 4, 8, 8, 6, 4, 4, 4,
            4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 2,
            8, 8, 8, 8, 4, 8, 16, 24,
            4, 6, 2, 4, 6, 4, 12, 8, 8,
            4, 8, 8, 8, 4, 8, 12, 16, 12, 16,
            4, 4, 4, 8, 8, 8, 12, 12, 12,
            16, 16, 16, 24, 24, 24
        ];
        // Index of the first L2 and of the first L3 descriptor in `ids`.
        // The table holds 9 L1 entries followed by 29 L2 entries; the previous
        // values (8 and 28+9) filed the last L1 descriptor (0x68) under L2 and
        // the last L2 descriptor (0x81) under L3.
        enum { FIRSTDATA2 = 9, FIRSTDATA3 = 29+9 }
        for (size_t i=0; i< ids.length; ++i) {
            if (x==ids[i]) {
                // level is a zero-based index into datacache: 0 = L1, 1 = L2, 2 = L3.
                int level = i< FIRSTDATA2 ? 0: i<FIRSTDATA3 ? 1 : 2;
                // Descriptor 0x49 is an L3 cache on family 0xF model 6, L2 elsewhere.
                if (x==0x49 && family==0xF && model==0x6) level=2;
                datacache[level].size=sizes[i];
                datacache[level].associativity=ways[i];
                // All L3 descriptors use 64-byte lines. This used to test
                // `level == 3`, which can never be true for a zero-based level.
                if (level == 2 || x==0x2C || x==0x0D || (x>=0x48 && x<=0x80)
                                   || x==0x86 || x==0x87
                                   || (x>=0x66 && x<=0x68) || (x>=0x39 && x<=0x3E)){
                    datacache[level].lineSize = 64;
                } else datacache[level].lineSize = 32;
            }
        }
    }

    uint[4] a;
    bool firstTime = true;
    // On a multi-core system, this could theoretically fail, but it's only used
    // for old single-core CPUs.
    uint numinfos = 1;
    do {
        asm pure nothrow @nogc {
            mov EAX, 2;
            cpuid;
            mov a, EAX;
            mov a+4, EBX;
            mov a+8, ECX;
            mov a+12, EDX;
        }
        if (firstTime) {
            if (a[0]==0x0000_7001 && a[3]==0x80 && a[1]==0 && a[2]==0) {
        // Cyrix MediaGX MMXEnhanced returns: EAX= 00007001, EDX=00000080.
        // These are NOT standard Intel values
        // (TLB = 32 entry, 4 way associative, 4K pages)
        // (L1 cache = 16K, 4way, linesize16)
                datacache[0].size=8;
                datacache[0].associativity=4;
                datacache[0].lineSize=16;
                return;
            }
            // lsb of a is how many times to loop.
            numinfos = a[0] & 0xFF;
            // A count of zero would make `--numinfos` below wrap around and
            // spin for ~4 billion iterations; treat it as a single pass.
            if (numinfos == 0) numinfos = 1;
            // and otherwise it should be ignored
            a[0] &= 0xFFFF_FF00;
            firstTime = false;
        }
        for (int c=0; c<4;++c) {
            // high bit set == no info.
            if (a[c] & 0x8000_0000) continue;
            decipherCpuid2(cast(ubyte)(a[c] & 0xFF));
            decipherCpuid2(cast(ubyte)((a[c]>>8) & 0xFF));
            decipherCpuid2(cast(ubyte)((a[c]>>16) & 0xFF));
            decipherCpuid2(cast(ubyte)((a[c]>>24) & 0xFF));
        }
    } while (--numinfos);
}
488 
489 // CPUID4: "Deterministic cache parameters" leaf
/**
 * Fills datacache[] from CPUID leaf 4 ("deterministic cache parameters"),
 * enumerating sub-leaves until one reports cache type 0.
 *
 * Also raises cpuFeatures.maxCores to the largest core count reported by any
 * sub-leaf. Only data (type 1) and unified (type 3) caches are recorded.
 */
void getcacheinfoCPUID4()
{
    int cachenum = 0;
    for(;;) {
        uint a, b, number_of_sets;
        asm pure nothrow @nogc {
            mov EAX, 4;
            mov ECX, cachenum;
            cpuid;
            mov a, EAX;
            mov b, EBX;
            mov number_of_sets, ECX;
        }
        ++cachenum;
        immutable uint cacheType = a & 0x1F;
        if (cacheType==0) break; // no more caches
        uint numthreads = ((a>>14) & 0xFFF)  + 1;
        uint numcores = ((a>>26) & 0x3F) + 1;
        if (numcores > cpuFeatures.maxCores) cpuFeatures.maxCores = numcores;
        if (cacheType!=1 && cacheType!=3) continue; // we only want data & unified caches

        ++number_of_sets; // ECX holds (sets - 1)
        // Zero-based level; a reported level of 0 wraps to 255 and is rejected below.
        ubyte level = cast(ubyte)(((a>>5)&7)-1);
        // `>=`, not `>`: level == datacache.length is already out of bounds.
        if (level >= datacache.length) continue; // ignore deep caches

        // EBX[31:22] holds (ways - 1), up to 1024 ways. Keep the full value for
        // the size computation and saturate the ubyte field instead of
        // silently truncating it (256 ways used to become 0).
        immutable uint numways = (b>>22) + 1;
        immutable bool fullyAssociative = (a & 0x200) != 0;
        datacache[level].associativity =
            (fullyAssociative || numways >= ubyte.max) ? ubyte.max : cast(ubyte)numways;
        datacache[level].lineSize = (b & 0xFFF)+ 1; // system coherency line size
        uint line_partitions = ((b >> 12)& 0x3FF) + 1;
        // Size = number of sets * associativity * cachelinesize * linepartitions
        // and must convert to Kb, also dividing by the number of hyperthreads using this cache.
        // The way count is part of the product for fully associative caches
        // too (they report a single set).
        ulong sz = cast(ulong)number_of_sets * numways;
        datacache[level].size = cast(uint)(
                (sz * datacache[level].lineSize * line_partitions ) / (numthreads *1024));
        if (level == 0 && cacheType==3) {
            // Halve the size for unified L1 caches
            datacache[level].size/=2;
        }
    }
}
528 
529 // CPUID8000_0005 & 6
/**
 * Fills datacache[0 .. 3] from the AMD-style extended leaves
 * CPUID 0x8000_0005 (L1) and, when available, 0x8000_0006 (L2/L3).
 *
 * When leaf 0x8000_0008 exists, its core count (ECX[7:0] + 1) is used to turn
 * the total L3 size into a per-core figure and to raise cpuFeatures.maxCores.
 */
void getAMDcacheinfo()
{
    uint c5, c6, d6;
    asm pure nothrow @nogc {
        mov EAX, 0x8000_0005; // L1 cache
        cpuid;
        // EAX has L1_TLB_4M.
        // EBX has L1_TLB_4K
        // EDX has L1 instruction cache
        mov c5, ECX;
    }

    datacache[0].size = ( (c5>>24) & 0xFF);
    datacache[0].associativity = cast(ubyte)( (c5 >> 16) & 0xFF);
    datacache[0].lineSize = c5 & 0xFF;

    if (max_extended_cpuid >= 0x8000_0006) {
        // AMD K6-III or K6-2+ or later.
        // Kept as uint: the field holds (cores - 1) in 8 bits, so a ubyte
        // would wrap to 0 for a value of 255 and the L3 division below would
        // then divide by zero.
        uint numcores = 1;
        if (max_extended_cpuid >=0x8000_0008) {
            uint c8;
            asm pure nothrow @nogc {
                mov EAX, 0x8000_0008;
                cpuid;
                mov c8, ECX;
            }
            numcores = (c8 & 0xFF) + 1;
            if (numcores>cpuFeatures.maxCores) cpuFeatures.maxCores = numcores;
        }
        asm pure nothrow @nogc {
            mov EAX, 0x8000_0006; // L2/L3 cache
            cpuid;
            mov c6, ECX; // L2 cache info
            mov d6, EDX; // L3 cache info
        }

        // Maps the 4-bit associativity code to a way count (0xFF = fully associative).
        static immutable ubyte [] assocmap = [ 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, 0xFF ];
        datacache[1].size = (c6>>16) & 0xFFFF;
        datacache[1].associativity = assocmap[(c6>>12)&0xF];
        datacache[1].lineSize = c6 & 0xFF;

        // The L3 cache value is TOTAL, not per core.
        datacache[2].size = ((d6>>18)*512)/numcores; // could be up to 2 * this, -1.
        datacache[2].associativity = assocmap[(d6>>12)&0xF];
        datacache[2].lineSize = d6 & 0xFF;
    }
}
576 
577 // For Intel CoreI7 and later, use function 0x0B
578 // to determine number of processors.
/**
 * Uses CPUID leaf 0x0B to set cpuFeatures.maxThreads and cpuFeatures.maxCores.
 *
 * Sub-leaf 0 supplies the threads-per-core count and sub-leaf 1 the total
 * thread count; sub-leaves are enumerated until both EAX and EBX read zero.
 */
void getCpuInfo0B()
{
    int level=0;
    int threadsPerCore;
    uint a, b, c, d;
    do {
        asm pure nothrow @nogc {
            mov EAX, 0x0B;
            mov ECX, level;
            cpuid;
            mov a, EAX;
            mov b, EBX;
            mov c, ECX;
            mov d, EDX;
        }
        if (b!=0) {
           // I'm not sure about this. The docs state that there
           // are 2 hyperthreads per core if HT is factory enabled.
            if (level==0)
                threadsPerCore = b & 0xFFFF;
            else if (level==1) {
                cpuFeatures.maxThreads = b & 0xFFFF;
                // threadsPerCore stays 0 if sub-leaf 0 reported nothing usable;
                // keep the previous maxCores rather than divide by zero.
                if (threadsPerCore > 0)
                    cpuFeatures.maxCores = cpuFeatures.maxThreads / threadsPerCore;
            }

        }
        ++level;
    } while (a!=0 || b!=0);

}
609 
/**
 * Queries the processor with CPUID (and XGETBV when OSXSAVE is set) and
 * records the raw results:
 *
 * $(UL
 * $(LI vendor ID, brand string and feature words in cpuFeatures; )
 * $(LI max_cpuid / max_extended_cpuid; )
 * $(LI stepping, model and family; )
 * $(LI datacache[] via the AMD, CPUID4 or CPUID2 helpers, with hard-coded
 *      fallbacks for CPUs that report nothing; )
 * $(LI cpuFeatures.maxCores / maxThreads. )
 * )
 *
 * Feature decisions inside this function read the raw bits in cpuFeatures
 * rather than the cached query properties (mmx, amd3dnow, ...), because
 * nothing in this function fills in those cached values.
 */
void cpuidX86()
{
    auto cf = getCpuFeatures();

    char * venptr = cf.vendorID.ptr;
    uint a, c, d, a2;
    version(D_InlineAsm_X86)
    {
        asm pure nothrow @nogc {
            mov EAX, 0;
            cpuid;
            mov a, EAX;
            mov EAX, venptr;
            mov [EAX], EBX;
            mov [EAX + 4], EDX;
            mov [EAX + 8], ECX;
        }
    }
    else version(D_InlineAsm_X86_64)
    {
        asm pure nothrow @nogc {
            mov EAX, 0;
            cpuid;
            mov a, EAX;
            mov RAX, venptr;
            mov [RAX], EBX;
            mov [RAX + 4], EDX;
            mov [RAX + 8], ECX;
        }
    }
    asm pure nothrow @nogc {
        mov EAX, 0x8000_0000;
        cpuid;
        mov a2, EAX;
    }
    max_cpuid = a;
    max_extended_cpuid = a2;


    cf.probablyIntel = cf.vendorID == "GenuineIntel";
    cf.probablyAMD = cf.vendorID == "AuthenticAMD";
    uint apic = 0; // brand index, apic id
    asm pure nothrow @nogc {
        mov EAX, 1; // model, stepping
        cpuid;
        mov a, EAX;
        mov apic, EBX;
        mov c, ECX;
        mov d, EDX;
    }
    // Keep the processor signature in its own variable: `a` is reused as
    // scratch by the XGETBV block below, which used to corrupt the
    // stepping/model/family decoding on every CPU with OSXSAVE set.
    immutable uint signature = a;
    cf.features = d;
    cf.miscfeatures = c;

    if (max_cpuid >= 7)
    {
        uint ext;

        asm pure nothrow @nogc
        {
            mov EAX, 7; // Structured extended feature leaf.
            mov ECX, 0; // Main leaf.
            cpuid;
            mov ext, EBX; // HLE, AVX2, RTM, etc.
        }

        cf.extfeatures = ext;
    }

    if (cf.miscfeatures & OSXSAVE_BIT)
    {
        asm pure nothrow @nogc {
            mov ECX, 0;
            xgetbv;
            mov d, EDX;
            mov a, EAX;
        }
        cf.xfeatures = cast(ulong)d << 32 | a;
    }
    cf.amdfeatures = 0;
    cf.amdmiscfeatures = 0;
    if (max_extended_cpuid >= 0x8000_0001) {
        asm pure nothrow @nogc {
            mov EAX, 0x8000_0001;
            cpuid;
            mov c, ECX;
            mov d, EDX;
        }
        cf.amdmiscfeatures = c;
        cf.amdfeatures = d;
    }
    // Try to detect fraudulent vendorIDs.
    // Test the raw bit: the cached amd3dnow property is not filled in by
    // this function.
    if (cf.amdfeatures & AMD_3DNOW_BIT) cf.probablyIntel = false;

    stepping = signature & 0xF;
    immutable uint fbase = (signature >> 8) & 0xF;
    immutable uint mbase = (signature >> 4) & 0xF;
    // Extended family is added only for base family 0xF (or 0).
    // Parenthesised explicitly: `+` binds tighter than `&`.
    family = ((fbase == 0xF) || (fbase == 0)) ? fbase + ((signature >> 20) & 0xFF) : fbase;
    model = ((fbase == 0xF) || (fbase == 6 && cf.probablyIntel) ) ?
         mbase + ((signature >> 12) & 0xF0) : mbase;

    if (!cf.probablyIntel && max_extended_cpuid >= 0x8000_0008) {
        // determine max number of cores for AMD
        asm pure nothrow @nogc {
            mov EAX, 0x8000_0008;
            cpuid;
            mov c, ECX;
        }
        //http://support.amd.com/TechDocs/25481.pdf pg.36
        cf.maxCores = 1;
        if (hyperThreadingBit) {
            cf.maxCores += c & 0xFF;
        }
    }

    if (max_extended_cpuid >= 0x8000_0004) {
        char *procptr = cf.processorNameBuffer.ptr;
        version(D_InlineAsm_X86)
        {
            asm pure nothrow @nogc {
                push ESI;
                mov ESI, procptr;
                mov EAX, 0x8000_0002;
                cpuid;
                mov [ESI], EAX;
                mov [ESI+4], EBX;
                mov [ESI+8], ECX;
                mov [ESI+12], EDX;
                mov EAX, 0x8000_0003;
                cpuid;
                mov [ESI+16], EAX;
                mov [ESI+20], EBX;
                mov [ESI+24], ECX;
                mov [ESI+28], EDX;
                mov EAX, 0x8000_0004;
                cpuid;
                mov [ESI+32], EAX;
                mov [ESI+36], EBX;
                mov [ESI+40], ECX;
                mov [ESI+44], EDX;
                pop ESI;
            }
        }
        else version(D_InlineAsm_X86_64)
        {
            asm pure nothrow @nogc {
                push RSI;
                mov RSI, procptr;
                mov EAX, 0x8000_0002;
                cpuid;
                mov [RSI], EAX;
                mov [RSI+4], EBX;
                mov [RSI+8], ECX;
                mov [RSI+12], EDX;
                mov EAX, 0x8000_0003;
                cpuid;
                mov [RSI+16], EAX;
                mov [RSI+20], EBX;
                mov [RSI+24], ECX;
                mov [RSI+28], EDX;
                mov EAX, 0x8000_0004;
                cpuid;
                mov [RSI+32], EAX;
                mov [RSI+36], EBX;
                mov [RSI+40], ECX;
                mov [RSI+44], EDX;
                pop RSI;
            }
        }
        // Intel P4 and PM pad at front with spaces.
        // Other CPUs pad at end with nulls.
        // Both scans are bounded so that a buffer made up entirely of spaces
        // or entirely of nulls cannot run them off either end of the buffer.
        immutable size_t len = cf.processorNameBuffer.length;
        size_t start = 0, end = 0;
        while (start < len && cf.processorNameBuffer[start] == ' ') { ++start; }
        while (end < len - start && cf.processorNameBuffer[len-end-1] == 0) { ++end; }
        if (start < len - end)
            cf.processorName = cast(string)(cf.processorNameBuffer[start..len-end]);
        else
            cf.processorName = "Unknown CPU"; // nothing but padding
    } else {
        cf.processorName = "Unknown CPU";
    }
    // Determine cache sizes

    // Intel docs specify that they return 0 for 0x8000_0005.
    // AMD docs do not specify the behaviour for 0004 and 0002.
    // Centaur/VIA and most other manufacturers use the AMD method,
    // except Cyrix MediaGX MMX Enhanced uses their OWN form of CPUID2!
    // NS Geode GX1 provides CyrixCPUID2 _and_ does the same wrong behaviour
    // for CPUID80000005. But Geode GX uses the AMD method

    // Deal with Geode GX1 - make it same as MediaGX MMX.
    if (max_extended_cpuid==0x8000_0005 && max_cpuid==2) {
        max_extended_cpuid = 0x8000_0004;
    }
    // Therefore, we try the AMD method unless it's an Intel chip.
    // If we still have no info, try the Intel methods.
    datacache[0].size = 0;
    if (max_cpuid<2 || !cf.probablyIntel) {
        if (max_extended_cpuid >= 0x8000_0005) {
            getAMDcacheinfo();
        } else if (cf.probablyAMD) {
            // According to AMDProcRecognitionAppNote, this means CPU
            // K5 model 0, or Am5x86 (model 4), or Am4x86DX4 (model 4)
            // Am5x86 has 16Kb 4-way unified data & code cache.
            datacache[0].size = 8;
            datacache[0].associativity = 4;
            datacache[0].lineSize = 32;
        } else {
            // Some obscure CPU.
            // Values for Cyrix 6x86MX (family 6, model 0)
            datacache[0].size = 64;
            datacache[0].associativity = 4;
            datacache[0].lineSize = 32;
        }
    }
    if ((datacache[0].size == 0) && max_cpuid>=4) {
        getcacheinfoCPUID4();
    }
    if ((datacache[0].size == 0) && max_cpuid>=2) {
        getcacheinfoCPUID2();
    }
    if (datacache[0].size == 0) {
        // Pentium, PMMX, late model 486, or an obscure CPU
        // Test the raw bit: the cached mmx property is not filled in by this
        // function.
        if (cf.features & MMX_BIT) { // Pentium MMX. Also has 8kB code cache.
            datacache[0].size = 16;
            datacache[0].associativity = 4;
            datacache[0].lineSize = 32;
        } else { // Pentium 1 (which also has 8kB code cache)
                 // or 486.
            // Cyrix 6x86: 16, 4way, 32 linesize
            datacache[0].size = 8;
            datacache[0].associativity = 2;
            datacache[0].lineSize = 32;
        }
    }
    if (max_cpuid >=0x0B) {
        // For Intel i7 and later, use function 0x0B to determine
        // cores and hyperthreads.
        getCpuInfo0B();
    } else {
        if (hyperThreadingBit) cf.maxThreads = (apic>>>16) & 0xFF;
        else cf.maxThreads = cf.maxCores;
    }
}
850 
// Return true if the cpuid instruction is supported.
// BUG(WONTFIX): Returns false for Cyrix 6x86 and 6x86L. They will be treated as 486 machines.
//
// 64-bit builds return true unconditionally. 32-bit builds try to toggle
// bit 21 of EFLAGS (mask 0x0020_0000) and report whether the change stuck.
bool hasCPUID()
{
    version(D_InlineAsm_X86_64)
        return true;
    else version(D_InlineAsm_X86)
    {
        uint flags;
        asm nothrow @nogc {
            pushfd;
            pop EAX;                // EAX = current EFLAGS
            mov flags, EAX;         // remember the original value
            xor EAX, 0x0020_0000;   // flip bit 21
            push EAX;
            popfd;                  // try to write the modified flags back
            pushfd;
            pop EAX;                // re-read EFLAGS
            xor flags, EAX;         // flags = bits that differ from the original
        }
        return (flags & 0x0020_0000) !=0;
    }
}
874 
875 } else { // inline asm X86
876 
877     bool hasCPUID() { return false; }
878 
879     void cpuidX86()
880     {
881             datacache[0].size = 8;
882             datacache[0].associativity = 2;
883             datacache[0].lineSize = 32;
884     }
885 }
886 
887 /*
888 // TODO: Implement this function with OS support
889 void cpuidPPC()
890 {
891     enum :int  { PPC601, PPC603, PPC603E, PPC604,
892                  PPC604E, PPC620, PPCG3, PPCG4, PPCG5 }
893 
894     // TODO:
895     // asm { mfpvr; } returns the CPU version but unfortunately it can
896     // only be used in kernel mode. So OS support is required.
897     int cputype = PPC603;
898 
899     // 601 has a 8KB combined data & code L1 cache.
900     uint sizes[] = [4, 8, 16, 16, 32, 32, 32, 32, 64];
901     ubyte ways[] = [8, 2,  4,  4,  4,  8,  8,  8,  8];
902     uint L2size[]= [0, 0,  0,  0,  0,  0,  0,  256,  512];
903     uint L3size[]= [0, 0,  0,  0,  0,  0,  0,  2048,  0];
904 
905     datacache[0].size = sizes[cputype];
906     datacache[0].associativity = ways[cputype];
907     datacache[0].lineSize = (cputype==PPCG5)? 128 :
908         (cputype == PPC620 || cputype == PPCG3)? 64 : 32;
909     datacache[1].size = L2size[cputype];
910     datacache[2].size = L3size[cputype];
911     datacache[1].lineSize = datacache[0].lineSize;
912     datacache[2].lineSize = datacache[0].lineSize;
913 }
914 
915 // TODO: Implement this function with OS support
916 void cpuidSparc()
917 {
    // UltraSparcIIi : L1 = 16,  2way. L2 = 512, 4 way.
919     // UltraSparcIII : L1 = 64,  4way. L2= 4096 or 8192.
920     // UltraSparcIIIi: L1 = 64,  4way. L2= 1024, 4 way
921     // UltraSparcIV  : L1 = 64,  4way. L2 = 16*1024.
922     // UltraSparcIV+ : L1 = 64,  4way. L2 = 2048, L3=32*1024.
923     // Sparc64V      : L1 = 128, 2way. L2 = 4096 4way.
924 }
925 */
926 
/// Set once initializeCpuid() has been entered; makes later calls return early.
__gshared bool initializedCpuid = false;

/// Module constructor: performs the CPU detection automatically at start-up
/// by delegating to initializeCpuid().
shared static this()
{
    initializeCpuid();
}
933 
/// Detects the CPU and fills in the module-level cache and feature variables.
///
/// The module's `shared static this` constructor calls this function, but it is
/// also public so that it can be called explicitly instead of relying on that
/// constructor (this module is meant to be usable in programs without a runtime).
/// Only the first call does any work; later calls return immediately.
public void initializeCpuid()
{
    if (initializedCpuid)
        return;
    initializedCpuid = true;

    auto cf = getCpuFeatures();

    // Without CPUID it's a 386 or 486, or a Cyrix 6x86
    // (probably still has an external cache): nothing to query.
    if (hasCPUID())
        cpuidX86();

    if (datacache[0].size == 0)
    {
        // Guess same as Pentium 1.
        datacache[0].size = 8;
        datacache[0].associativity = 2;
        datacache[0].lineSize = 32;
    }

    // The first level always counts. Each further level that reported a size
    // adds one; the unused ones are set equal to the full address space.
    numCacheLevels = 1;
    foreach (level; 1 .. datacache.length)
    {
        if (datacache[level].size != 0)
        {
            ++numCacheLevels;
            continue;
        }
        datacache[level].size = size_t.max/1024;
        datacache[level].associativity = 1;
        datacache[level].lineSize = datacache[level-1].lineSize;
    }

    // Set the immortals

    // -- identification and caches
    _dataCaches = datacache;
    _vendor = cast(string)cf.vendorID;
    _processor = cf.processorName;

    // -- x87 / MMX / SSE family
    _x87onChip = (cf.features & FPU_BIT) != 0;
    _mmx = (cf.features & MMX_BIT) != 0;
    _sse = (cf.features & SSE_BIT) != 0;
    _sse2 = (cf.features & SSE2_BIT) != 0;
    _sse3 = (cf.miscfeatures & SSE3_BIT) != 0;
    _ssse3 = (cf.miscfeatures & SSSE3_BIT) != 0;
    _sse41 = (cf.miscfeatures & SSE41_BIT) != 0;
    _sse42 = (cf.miscfeatures & SSE42_BIT) != 0;
    _sse4a = (cf.amdmiscfeatures & SSE4A_BIT) != 0;
    _aes = (cf.miscfeatures & AES_BIT) != 0;
    _hasPclmulqdq = (cf.miscfeatures & PCLMULQDQ_BIT) != 0;
    _hasRdrand = (cf.miscfeatures & RDRAND_BIT) != 0;

    // -- AVX: requires both the SSE and YMM bits of xfeatures plus the AVX feature bit
    enum requiredAvxState = XF_SSE_BIT | XF_YMM_BIT;
    _avx = (cf.xfeatures & requiredAvxState) == requiredAvxState
        && (cf.miscfeatures & AVX_BIT) != 0;

    // -- features that are only reported when AVX itself is available
    _vaes = avx && aes;
    _hasVpclmulqdq = avx && hasPclmulqdq;
    _fma = avx && (cf.miscfeatures & FMA_BIT) != 0;
    _fp16c = avx && (cf.miscfeatures & FP16C_BIT) != 0;
    _avx2 = avx && (cf.extfeatures & AVX2_BIT) != 0;

    // -- extended features
    _hle = (cf.extfeatures & HLE_BIT) != 0;
    _rtm = (cf.extfeatures & RTM_BIT) != 0;
    _hasRdseed = (cf.extfeatures & RDSEED_BIT) != 0;
    _hasSha = (cf.extfeatures & SHA_BIT) != 0;

    // -- AMD extensions
    _amd3dnow = (cf.amdfeatures & AMD_3DNOW_BIT) != 0;
    _amd3dnowExt = (cf.amdfeatures & AMD_3DNOW_EXT_BIT) != 0;
    _amdMmx = (cf.amdfeatures & AMD_MMX_BIT) != 0;

    // -- individual instructions
    _hasFxsr = (cf.features & FXSR_BIT) != 0;
    _hasCmov = (cf.features & CMOV_BIT) != 0;
    _hasRdtsc = (cf.features & TIMESTAMP_BIT) != 0;
    _hasCmpxchg8b = (cf.features & CMPXCHG8B_BIT) != 0;
    _hasCmpxchg16b = (cf.miscfeatures & CMPXCHG16B_BIT) != 0;

    // The SYSENTER/SYSEXIT features were buggy on Pentium Pro and early PentiumII
    // (REF: www.geoffchappell.com), so the feature bit is not trusted on those.
    const bool unreliableSysEnter = cf.probablyIntel
        && (family < 6 || (family == 6 && (model < 3 || (model == 3 && stepping < 3))));
    _hasSysEnterSysExit = !unreliableSysEnter && (cf.features & SYSENTERSYSEXIT_BIT) != 0;

    _has3dnowPrefetch = (cf.amdmiscfeatures & AMD_3DNOW_PREFETCH_BIT) != 0;
    _hasLahfSahf = (cf.amdmiscfeatures & LAHFSAHF_BIT) != 0;
    _hasPopcnt = (cf.miscfeatures & POPCNT_BIT) != 0;
    _hasLzcnt = (cf.amdmiscfeatures & LZCNT_BIT) != 0;

    // -- architecture
    _isX86_64 = (cf.amdfeatures & AMD64_BIT) != 0;
    _isItanium = (cf.features & IA64_BIT) != 0;

    // -- topology
    _hyperThreading = cf.maxThreads > cf.maxCores;
    _threadsPerCPU = cf.maxThreads;
    _coresPerCPU = cf.maxCores;

    // -- optimisation hints
    _preferAthlon = cf.probablyAMD && family >= 6;
    _preferPentium4 = cf.probablyIntel && family == 0xF;
    _preferPentium1 = family < 6 || (family == 6 && model < 0xF && !cf.probablyIntel);
}