__llwpcb

Visual Studio 2010

Visual Studio 2010 SP1 is required.

Microsoft Specific

Generates the lightweight profiling (LWP) instruction llwpcb to pass the address of a lightweight profiling control block (LWPCB) to the LWP hardware and enable or disable lightweight profiling.

void __llwpcb(
   void *pcbaddress
);

[in] pcbaddress

Zero or a pointer to an LWP control block.

Intrinsic

Architecture

__llwpcb

LWP

Header file <intrin.h>

This intrinsic generates code that enables or disables lightweight profiling. Lightweight profiling writes records for certain user-selected hardware or software events into a ring buffer in user space. The LWP control block selects the events to be recorded and describes the ring buffer. Sampling and writing event records happens very quickly, with minimal disruption to user code. When the ring buffer becomes nearly full, the hardware can generate an interrupt to activate code that processes the entries in the ring buffer. See AMD's "Lightweight Profiling Specification" (Publication Number 43724) for detailed information on lightweight profiling.

When the parameter pcbaddress points to a valid LWP control block, lightweight profiling is enabled as described by the control block. When pcbaddress is 0, the LWP internal buffers are flushed to the LWP ring buffer and LWP is disabled.

The llwpcb instruction is part of the LWP family of instructions. The LWP instructions require both hardware and operating system support. To determine hardware support for LWP, call the __cpuid intrinsic with InfoType = 0x80000001 and check bit 15 of CPUInfo[2] (ECX). This bit is 1 when LWP is supported by the hardware, and 0 otherwise. Once you know that LWP is supported by the hardware, call the __cpuid intrinsic with InfoType = 0x8000001C and check bit 0 of CPUInfo[0] (EAX). This bit is 1 if the operating system has made LWP available, 0 otherwise. If the bit is 1, there will be other important information available in CPUInfo[], as described below. Note that the values in CPUInfo[3] (EDX) represent the same LWP features as those in CPUInfo[0] (EAX), but the values in EDX represent the LWP capabilities of the hardware, while the values in EAX represent the LWP features currently enabled by the operating system.

CpuInfo[]

Bits

Field Name

Description

0 (EAX)

0

LwpAvail

LWP is supported by HW and OS

1

LwpVAL

LWPVAL instruction available

2

LwpIRE

Instructions retired event available

3

LwpBRE

Branch retired event available

4

LwpDME

DCache miss event available

5

LwpCNH

CPU clocks not halted event available

6

LwpRNH

CPU reference clocks not halted event available

30-7

Reserved

31

LwpInt

Threshold overflow interrupt available

1 (EBX)

7-0

LwpCbSize

Size of LWPCB in quadwords

15-8

LwpEventSize

Size of a ring buffer event record in bytes

23-16

LwpMaxEvents

Maximum Eventid value supported (not including 255)

31-24

LwpEventOffset

Offset in bytes of EventInterval1 field from start of LWPCB. Always a multiple of 8.

2 (ECX)

4-0

LwpLatencyMax

Number of bits in cache latency counters

5

LwpDataAddress

1 if cache miss event records report data address of missing reference

8-6

LwpLatencyRnd

Bits by which cache latency is rounded (0 to 4)

15-9

LwpVersion

Version of LWP implementation

23-16

LwpMinBufferSize

Minimum size of the LWP ring buffer, in units of 32*EventSize

#include <intrin.h>
#include <stdio.h>

#define MAX_EVENTS 6
#define LWPEVENTSIZE 32
#define BUFFERSIZE 4096

struct lwpEventRecord {
    unsigned __int64 EventId : 8;
    unsigned __int64 CoreId : 8;
    unsigned __int64 Flags : 16;
    unsigned __int64 Data1 : 32;
    unsigned __int64 InstructionAddress;
    unsigned __int64 Data2;
    unsigned __int64 Reserved;
};

struct lwpEventRecord ringBuffer[BUFFERSIZE];

struct lwpcb0 {
    unsigned __int64 Flags : 32;
    unsigned __int64 BufferSize : 28;
    unsigned __int64 Random : 4;
};

struct lwpcbEvent {
    unsigned __int64 EventInterval : 26;
    unsigned __int64 EIReserved1 : 6;
    unsigned __int64 EventCounter : 26;
    unsigned __int64 EIReserved2 : 6;

};

struct lwpcbStruct {
    unsigned __int64 Flags : 32;
    unsigned __int64 BufferSize : 28;
    unsigned __int64 Random : 4;

    unsigned __int64 BufferBase;

    unsigned __int64 BufferHeadOffset : 32;
    unsigned __int64 Reserved1 : 32;

    unsigned __int64 MissedEvents;

    unsigned __int64 Threshold : 32;
    unsigned __int64 Filters : 32;

    unsigned __int64 BaseIP;

    unsigned __int64 LimitIP;

    unsigned __int64 Reserved2;

    unsigned __int64 BufferTailOffset : 32;
    unsigned __int64 Reserved3 : 32;

    unsigned __int64 Reserved4[7];

    struct lwpcbEvent Events[MAX_EVENTS]; // event 1 == index 0
} myLWPCBStruct;

__m128d data[100];

extern void __cpuid(int *CPUInfo, int InfoType);
// Return 1 if LWP is supported by the hardware
int LwpSupported()
{
    int cpuInfo[4] = {0};
    __cpuid(cpuInfo, 0x80000001);
    if (cpuInfo[2] & (1 << 15)) return 1;
    else return 0;
}

// Return 1 if LWP is enabled by the OS
// Assumes LWP is supported by the hardware
int LwpAvailable()
{
    int cpuInfo[4] = {0};
    __cpuid(cpuInfo, 0x8000001C);
    if (cpuInfo[0] & 1) return 1;
    else return 0;
}

// Return 1 if LWPVAL instruction is supported by this hardware
// Assumes LWP is supported by the hardware
int LwpvalSupported()
{
    int cpuInfo[4] = {0};
    __cpuid(cpuInfo, 0x8000001C);
    if (cpuInfo[3] & (1 << 1)) return 1;
    else return 0;
}

// Return 1 if LWPVAL instruction is enabled by the OS
// Assumes LWPVAL is supported by the hardware
int LwpvalAvailable()
{
    int cpuInfo[4] = {0};
    __cpuid(cpuInfo, 0x8000001C);
    if (cpuInfo[3] & (1 << 1)) return 1;
    else return 0;
}

void
initializeLWPCB(struct lwpcbStruct *p)
{
    int i, lwpvalok;
    unsigned __int64 *dummy;
    p->Flags =  0; // disable HW counters & threshold interrupts
    p->BufferSize = sizeof(ringBuffer)/sizeof(struct lwpEventRecord);
    p->BufferSize *= sizeof(struct lwpEventRecord);
    p->Random = 0; // No randomness in counters
    p->BufferBase = (unsigned __int64)&ringBuffer[0];
    /// Value of BufferHeadOffset here is arbitrary
    p->BufferHeadOffset = p->BufferSize -
                          3*sizeof(struct lwpEventRecord);
    p->MissedEvents = 0;
    p->Threshold = 2*p->BufferSize; // don't want threshold interrupts
    p->Filters = 0; // shouldn't matter for this test
    p->BaseIP = 0; // shouldn't matter for this test
    p->LimitIP = 0; // shouldn't matter for this test
    p->BufferTailOffset = p->BufferHeadOffset; // ring buffer empty
    p->Reserved1 = p->Reserved2 = p->Reserved3 = 0;
    for (i = 0; i < 7; i++) p->Reserved4[i] = 0;
    for (i = 0; i < MAX_EVENTS; i++) {
        p->Events[i-1].EventInterval = 0;
        p->Events[i-1].EIReserved1 = 0;
        p->Events[i-1].EventCounter = 0;
        p->Events[i-1].EIReserved2 = 0;
    }
    if (LwpvalSupported() && LwpvalAvailable()) {
        p->Flags |= 2; // Count LWPVAL events
        p->Events[0].EventInterval = 9; // count every 10th LWPVAL
    }
}
#define LOOPSIZE 31
main()
{
    int i;
    __m128d temp;
    double sum = 0;
    struct lwpcbstruct *plwpcb;
    unsigned int tailOffset, headOffset, bufferSize, bufferCapacity;
    unsigned int headRecord, tailRecord;
    int headMinusTail;
    unsigned int recordSize = sizeof(struct lwpEventRecord);
    unsigned int numEntries;
    unsigned int lwpvalCount, lwpinsCount;

    if (!LwpSupported()) {
        printf("LWP is not supported by this hardware\n");
        exit(1);
    }
    if (!LwpAvailable()) {
        printf("OS has not made LWP available\n");
        exit(1);
    }
#if defined(_M_X64)
    printf("64-bit compiler\n");
#else
    printf("32-bit compiler\n");
#endif
    initializeLWPCB(&myLWPCBStruct);
    __llwpcb(&myLWPCBStruct);
    plwpcb = __slwpcb();
    if ((unsigned __int64)plwpcb != (unsigned __int64)&myLWPCBStruct) {
        printf("Test failed: bad return from __slwpcb()\n");
        exit(1);
    }

    if (LwpvalSupported() && LwpvalAvailable()) {
        for (i = 0; i < LOOPSIZE; i ++) {
            if (i % 7 == 0) __lwpins32(0xdeadbeef, i, 0x01234567); 
            __lwpval32(0x0badf00d, i, 0xcad00cad);
        }
#if defined(_M_X64)
        for (i = 0; i < LOOPSIZE; i ++) {
            if (i % 7 == 0) __lwpins64(0xdeadbeefdeadbeefll, i, 0x01234567); 
            __lwpval64(0x0badf00d0badf00dll, i, 0xcad00cad);
        }
#endif
    } else {
        if (!LwpvalSupported()) {
            printf("LWPVAL instruction not supported by the hardware\n");
        } else if (!LwpvalAvailable()) {
            printf("LWPVAL instruction not enabled\n");
        }
        for (i = 0; i < LOOPSIZE; i ++) {
            if (i % 7 == 0) __lwpins32(0xdeadbeef, i, 0x01234567); 
        }
#if defined(_M_X64)
        for (i = 0; i < LOOPSIZE; i ++) {
            if (i % 7 == 0) __lwpins64(0xdeadbeefdeadbeefll, i, 0x01234567); 
        }
#endif
    }

    plwpcb = __slwpcb();

    tailOffset = myLWPCBStruct.BufferTailOffset;
    headOffset = myLWPCBStruct.BufferHeadOffset;
    bufferSize = myLWPCBStruct.BufferSize;
    bufferCapacity = bufferSize / recordSize;

    headMinusTail = headOffset;
    headMinusTail -= tailOffset;
    if (tailOffset <= headOffset) numEntries = headMinusTail;
    else numEntries = headMinusTail + bufferSize;
    numEntries /= recordSize;

    tailRecord = tailOffset / recordSize;
    headRecord = headOffset / recordSize;
    printf("%d entries in ring buffer\n", numEntries);

    lwpvalCount = lwpinsCount = 0;
    for (i = tailRecord; i != headRecord; i = (i + 1)%bufferCapacity) {
        switch(ringBuffer[i].EventId) {
            case 1:
                lwpvalCount += 1;
                break;
            case 255:
                lwpinsCount += 1;
                break;
            default:
                printf("WARNING: bad EventId %d in ring buffer\n", 
                        ringBuffer[i].EventId);
                break;
        }
    }
    printf("%d LWPVAL instructions, %d LWPINS instructions\n",
            lwpvalCount, lwpinsCount);
}
32-bit compiler
9 entries in ring buffer
4 LWPVAL instructions, 5 LWPINS instructions
or
64-bit compiler
17 entries in ring buffer
7 LWPVAL instructions, 10 LWPINS instructions
or
      A message saying that LWPVAL is not available along with the appropriate output from above, but with counts of zero for LWPVAL and with the number of ring buffer entries reduced to match.
or
A message that LWP is not supported or available.

Community Additions

ADD
Show: