ARM Intrinsics

 

For the latest documentation on Visual Studio 2017, see Visual Studio 2017 Documentation.

For the latest documentation on Visual Studio 2017, see ARM Intrinsics on docs.microsoft.com. The Visual C++ compiler makes the following intrinsics available on the ARM architecture. For more information about ARM, see the ARM Architecture Reference Manuals and ARM Assembler Tools Guide on the ARM Infocenter website.

The NEON vector instruction set extensions for ARM provide Single Instruction Multiple Data (SIMD) capabilities that resemble those in the MMX and SSE vector instruction sets that are common to x86 and x64 architecture processors.

NEON intrinsics are supported, as provided in the header file arm_neon.h. The Visual C++ compiler support for NEON intrinsics resembles that of the ARM compiler, which is documented in Appendix G of the ARM Compiler toolchain, Version 4.1 Compiler Reference on the ARM Infocenter website.

The primary difference between the Visual C++ compiler and the ARM compiler is that the Visual C++ compiler adds _ex variants of the vldX and vstX vector load and store instructions. The _ex variants take an additional parameter that specifies the alignment of the pointer argument but are otherwise identical to their non-_ex counterparts.

Function NameInstructionFunction Prototype
_arm_smlalSMLAL__int64 _arm_smlal(__int64 _RdHiLo, int _Rn, int _Rm)
_arm_umlalUMLALunsigned __int64 _arm_umlal(unsigned __int64 _RdHiLo, unsigned int _Rn, unsigned int _Rm)
_arm_clzCLZunsigned int _arm_clz(unsigned int _Rm)
_arm_qaddQADDint _arm_qadd(int _Rm, int _Rn)
_arm_qdaddQDADDint _arm_qdadd(int _Rm, int _Rn)
_arm_qdsubQDSUBint _arm_qdsub(int _Rm, int _Rn)
_arm_qsubQSUBint _arm_qsub(int _Rm, int _Rn)
_arm_smlabbSMLABBint _arm_smlabb(int _Rn, int _Rm, int _Ra)
_arm_smlabtSMLABTint _arm_smlabt(int _Rn, int _Rm, int _Ra)
_arm_smlatbSMLATBint _arm_smlatb(int _Rn, int _Rm, int _Ra)
_arm_smlattSMLATTint _arm_smlatt(int _Rn, int _Rm, int _Ra)
_arm_smlalbbSMLALBB__int64 _arm_smlalbb(__int64 _RdHiLo, int _Rn, int _Rm)
_arm_smlalbtSMLALBT__int64 _arm_smlalbt(__int64 _RdHiLo, int _Rn, int _Rm)
_arm_smlaltbSMLALTB__int64 _arm_smlaltb(__int64 _RdHiLo, int _Rn, int _Rm)
_arm_smlalttSMLALTT__int64 _arm_smlaltt(__int64 _RdHiLo, int _Rn, int _Rm)
_arm_smlawbSMLAWBint _arm_smlawb(int _Rn, int _Rm, int _Ra)
_arm_smlawtSMLAWTint _arm_smlawt(int _Rn, int _Rm, int _Ra)
_arm_smulbbSMULBBint _arm_smulbb(int _Rn, int _Rm)
_arm_smulbtSMULBTint _arm_smulbt(int _Rn, int _Rm)
_arm_smultbSMULTBint _arm_smultb(int _Rn, int _Rm)
_arm_smulttSMULTTint _arm_smultt(int _Rn, int _Rm)
_arm_smulwbSMULWBint _arm_smulwb(int _Rn, int _Rm)
_arm_smulwtSMULWTint _arm_smulwt(int _Rn, int _Rm)
_arm_sadd16SADD16int _arm_sadd16(int _Rn, int _Rm)
_arm_sadd8SADD8int _arm_sadd8(int _Rn, int _Rm)
_arm_sasxSASXint _arm_sasx(int _Rn, int _Rm)
_arm_ssaxSSAXint _arm_ssax(int _Rn, int _Rm)
_arm_ssub16SSUB16int _arm_ssub16(int _Rn, int _Rm)
_arm_ssub8SSUB8int _arm_ssub8(int _Rn, int _Rm)
_arm_shadd16SHADD16int _arm_shadd16(int _Rn, int _Rm)
_arm_shadd8SHADD8int _arm_shadd8(int _Rn, int _Rm)
_arm_shasxSHASXint _arm_shasx(int _Rn, int _Rm)
_arm_shsaxSHSAXint _arm_shsax(int _Rn, int _Rm)
_arm_shsub16SHSUB16int _arm_shsub16(int _Rn, int _Rm)
_arm_shsub8SHSUB8int _arm_shsub8(int _Rn, int _Rm)
_arm_qadd16QADD16int _arm_qadd16(int _Rn, int _Rm)
_arm_qadd8QADD8int _arm_qadd8(int _Rn, int _Rm)
_arm_qasxQASXint _arm_qasx(int _Rn, int _Rm)
_arm_qsaxQSAXint _arm_qsax(int _Rn, int _Rm)
_arm_qsub16QSUB16int _arm_qsub16(int _Rn, int _Rm)
_arm_qsub8QSUB8int _arm_qsub8(int _Rn, int _Rm)
_arm_uadd16UADD16unsigned int _arm_uadd16(unsigned int _Rn, unsigned int _Rm)
_arm_uadd8UADD8unsigned int _arm_uadd8(unsigned int _Rn, unsigned int _Rm)
_arm_uasxUASXunsigned int _arm_uasx(unsigned int _Rn, unsigned int _Rm)
_arm_usaxUSAXunsigned int _arm_usax(unsigned int _Rn, unsigned int _Rm)
_arm_usub16USUB16unsigned int _arm_usub16(unsigned int _Rn, unsigned int _Rm)
_arm_usub8USUB8unsigned int _arm_usub8(unsigned int _Rn, unsigned int _Rm)
_arm_uhadd16UHADD16unsigned int _arm_uhadd16(unsigned int _Rn, unsigned int _Rm)
_arm_uhadd8UHADD8unsigned int _arm_uhadd8(unsigned int _Rn, unsigned int _Rm)
_arm_uhasxUHASXunsigned int _arm_uhasx(unsigned int _Rn, unsigned int _Rm)
_arm_uhsaxUHSAXunsigned int _arm_uhsax(unsigned int _Rn, unsigned int _Rm)
_arm_uhsub16UHSUB16unsigned int _arm_uhsub16(unsigned int _Rn, unsigned int _Rm)
_arm_uhsub8UHSUB8unsigned int _arm_uhsub8(unsigned int _Rn, unsigned int _Rm)
_arm_uqadd16UQADD16unsigned int _arm_uqadd16(unsigned int _Rn, unsigned int _Rm)
_arm_uqadd8UQADD8unsigned int _arm_uqadd8(unsigned int _Rn, unsigned int _Rm)
_arm_uqasxUQASXunsigned int _arm_uqasx(unsigned int _Rn, unsigned int _Rm)
_arm_uqsaxUQSAXunsigned int _arm_uqsax(unsigned int _Rn, unsigned int _Rm)
_arm_uqsub16UQSUB16unsigned int _arm_uqsub16(unsigned int _Rn, unsigned int _Rm)
_arm_uqsub8UQSUB8unsigned int _arm_uqsub8(unsigned int _Rn, unsigned int _Rm)
_arm_sxtabSXTABint _arm_sxtab(int _Rn, int _Rm, unsigned int _Rotation)
_arm_sxtab16SXTAB16int _arm_sxtab16(int _Rn, int _Rm, unsigned int _Rotation)
_arm_sxtahSXTAHint _arm_sxtah(int _Rn, int _Rm, unsigned int _Rotation)
_arm_uxtabUXTABunsigned int _arm_uxtab(unsigned int _Rn, unsigned int _Rm, unsigned int _Rotation)
_arm_uxtab16UXTAB16unsigned int _arm_uxtab16(unsigned int _Rn, unsigned int _Rm, unsigned int _Rotation)
_arm_uxtahUXTAHunsigned int _arm_uxtah(unsigned int _Rn, unsigned int _Rm, unsigned int _Rotation)
_arm_sxtbSXTBint _arm_sxtb(int _Rn, unsigned int _Rotation)
_arm_sxtb16SXTB16int _arm_sxtb16(int _Rn, unsigned int _Rotation)
_arm_sxthSXTHint _arm_sxth(int _Rn, unsigned int _Rotation)
_arm_uxtbUXTBunsigned int _arm_uxtb(unsigned int _Rn, unsigned int _Rotation)
_arm_uxtb16UXTB16unsigned int _arm_uxtb16(unsigned int _Rn, unsigned int _Rotation)
_arm_uxthUXTHunsigned int _arm_uxth(unsigned int _Rn, unsigned int _Rotation)
_arm_pkhbtPKHBTint _arm_pkhbt(int _Rn, int _Rm, unsigned int _Lsl_imm)
_arm_pkhtbPKHTBint _arm_pkhtb(int _Rn, int _Rm, unsigned int _Asr_imm)
_arm_usad8USAD8unsigned int _arm_usad8(unsigned int _Rn, unsigned int _Rm)
_arm_usada8USADA8unsigned int _arm_usada8(unsigned int _Rn, unsigned int _Rm, unsigned int _Ra)
_arm_ssatSSATint _arm_ssat(unsigned int _Sat_imm, int _Rn, _ARMINTR_SHIFT_T _Shift_type, unsigned int _Shift_imm)
_arm_usatUSATint _arm_usat(unsigned int _Sat_imm, int _Rn, _ARMINTR_SHIFT_T _Shift_type, unsigned int _Shift_imm)
_arm_ssat16SSAT16int _arm_ssat16(unsigned int _Sat_imm, int _Rn)
_arm_usat16USAT16int _arm_usat16(unsigned int _Sat_imm, int _Rn)
_arm_revREVunsigned int _arm_rev(unsigned int _Rm)
_arm_rev16REV16unsigned int _arm_rev16(unsigned int _Rm)
_arm_revshREVSHunsigned int _arm_revsh(unsigned int _Rm)
_arm_smladSMLADint _arm_smlad(int _Rn, int _Rm, int _Ra)
_arm_smladxSMLADXint _arm_smladx(int _Rn, int _Rm, int _Ra)
_arm_smlsdSMLSDint _arm_smlsd(int _Rn, int _Rm, int _Ra)
_arm_smlsdxSMLSDXint _arm_smlsdx(int _Rn, int _Rm, int _Ra)
_arm_smmlaSMMLAint _arm_smmla(int _Rn, int _Rm, int _Ra)
_arm_smmlarSMMLARint _arm_smmlar(int _Rn, int _Rm, int _Ra)
_arm_smmlsSMMLSint _arm_smmls(int _Rn, int _Rm, int _Ra)
_arm_smmlsrSMMLSRint _arm_smmlsr(int _Rn, int _Rm, int _Ra)
_arm_smmulSMMULint _arm_smmul(int _Rn, int _Rm)
_arm_smmulrSMMULRint _arm_smmulr(int _Rn, int _Rm)
_arm_smlaldSMLALD__int64 _arm_smlald(__int64 _RdHiLo, int _Rn, int _Rm)
_arm_smlaldxSMLALDX__int64 _arm_smlaldx(__int64 _RdHiLo, int _Rn, int _Rm)
_arm_smlsldSMLSLD__int64 _arm_smlsld(__int64 _RdHiLo, int _Rn, int _Rm)
_arm_smlsldxSMLSLDX__int64 _arm_smlsldx(__int64 _RdHiLo, int _Rn, int _Rm)
_arm_smuadSMUADint _arm_smuad(int _Rn, int _Rm)
_arm_smuadxSMUADXint _arm_smuadx(int _Rn, int _Rm)
_arm_smusdSMUSDint _arm_smusd(int _Rn, int _Rm)
_arm_smusdxSMUSDXint _arm_smusdx(int _Rn, int _Rm)
_arm_smullSMULL__int64 _arm_smull(int _Rn, int _Rm)
_arm_umullUMULLunsigned __int64 _arm_umull(unsigned int _Rn, unsigned int _Rm)
_arm_umaalUMAALunsigned __int64 _arm_umaal(unsigned int _RdLo, unsigned int _RdHi, unsigned int _Rn, unsigned int _Rm)
_arm_bfcBFCunsigned int _arm_bfc(unsigned int _Rd, unsigned int _Lsb, unsigned int _Width)
_arm_bfiBFIunsigned int _arm_bfi(unsigned int _Rd, unsigned int _Rn, unsigned int _Lsb, unsigned int _Width)
_arm_rbitRBITunsigned int _arm_rbit(unsigned int _Rm)
_arm_sbfxSBFXint _arm_sbfx(int _Rn, unsigned int _Lsb, unsigned int _Width)
_arm_ubfxUBFXunsigned int _arm_ubfx(unsigned int _Rn, unsigned int _Lsb, unsigned int _Width)
_arm_sdivSDIVint _arm_sdiv(int _Rn, int _Rm)
_arm_udivUDIVunsigned int _arm_udiv(unsigned int _Rn, unsigned int _Rm)
__cpsCPSvoid __cps(unsigned int _Ops, unsigned int _Flags, unsigned int _Mode)
__dmbDMBvoid __dmb(unsigned int _Type)

Inserts a memory barrier operation into the instruction stream. The parameter _Type specifies the kind of restriction that the barrier enforces.

For more information about the kinds of restrictions that can be enforced, see Memory Barrier Restrictions.
__dsbDSBvoid __dsb(unsigned int _Type)

Inserts a memory barrier operation into the instruction stream. The parameter _Type specifies the kind of restriction that the barrier enforces.

For more information about the kinds of restrictions that can be enforced, see Memory Barrier Restrictions.
__isbISBvoid __isb(unsigned int _Type)

Inserts a memory barrier operation into the instruction stream. The parameter _Type specifies the kind of restriction that the barrier enforces.

For more information about the kinds of restrictions that can be enforced, see Memory Barrier Restrictions.
__emitvoid __emit(unsigned __int32 opcode)

Inserts a specified instruction into the stream of instructions that is output by the compiler.

The value of opcode must be a constant expression that is known at compile time. The size of an instruction word is 16 bits and the most significant 16 bits of opcode are ignored.

The compiler makes no attempt to interpret the contents of opcode and does not guarantee a CPU or memory state before the inserted instruction is executed.

The compiler assumes that the CPU and memory states are unchanged after the inserted instruction is executed. Therefore, instructions that do change state can have a detrimental impact on normal code that is generated by the compiler.

For this reason, use __emit only to insert instructions that affect a CPU state that the compiler does not normally process—for example, coprocessor state—or to implement functions that are declared by using __declspec(naked).
__hvcHVCunsigned int __hvc(unsigned int, ...)
__iso_volatile_load16__int16 __iso_volatile_load16(const volatile __int16 *)

For more information, see __iso_volatile_load/store intrinsics.
__iso_volatile_load32__int32 __iso_volatile_load32(const volatile __int32 *)

For more information, see __iso_volatile_load/store intrinsics.
__iso_volatile_load64__int64 __iso_volatile_load64(const volatile __int64 *)

For more information, see __iso_volatile_load/store intrinsics.
__iso_volatile_load8__int8 __iso_volatile_load8(const volatile __int8 *)

For more information, see __iso_volatile_load/store intrinsics.
__iso_volatile_store16void __iso_volatile_store16(volatile __int16 *, __int16)

For more information, see __iso_volatile_load/store intrinsics.
__iso_volatile_store32void __iso_volatile_store32(volatile __int32 *, __int32)

For more information, see __iso_volatile_load/store intrinsics.
__iso_volatile_store64void __iso_volatile_store64(volatile __int64 *, __int64)

For more information, see __iso_volatile_load/store intrinsics.
__iso_volatile_store8void __iso_volatile_store8(volatile __int8 *, __int8)

For more information, see __iso_volatile_load/store intrinsics.
__ldrexdLDREXD__int64 __ldrexd(const volatile __int64 *)
__prefetchPLDvoid __cdecl __prefetch(const void *)

Provides a PLD memory hint to the system that memory at or near the specified address may be accessed soon. Some systems may choose to optimize for that memory access pattern to increase runtime performance. However, from the C++ language point of view, the function has no observable effect, and may do nothing at all.
__rdpmccntr64unsigned __int64 __rdpmccntr64(void)
__sevSEVvoid __sev(void)
__static_assertvoid __static_assert(int, const char *)
__swiSVCunsigned int __swi(unsigned int, ...)
__trapBKPTint __trap(int, ...)
__wfeWFEvoid __wfe(void)
__wfiWFIvoid __wfi(void)
_AddSatIntQADDint _AddSatInt(int, int)
_CopyDoubleFromInt64double _CopyDoubleFromInt64(__int64)
_CopyFloatFromInt32float _CopyFloatFromInt32(__int32)
_CopyInt32FromFloat__int32 _CopyInt32FromFloat(float)
_CopyInt64FromDouble__int64 _CopyInt64FromDouble(double)
_CountLeadingOnesunsigned int _CountLeadingOnes(unsigned long)
_CountLeadingOnes64unsigned int _CountLeadingOnes64(unsigned __int64)
_CountLeadingSignsunsigned int _CountLeadingSigns(long)
_CountLeadingSigns64unsigned int _CountLeadingSigns64(__int64)
_CountLeadingZerosunsigned int _CountLeadingZeros(unsigned long)
_CountLeadingZeros64unsigned int _CountLeadingZeros64(unsigned __int64)
_CountOneBitsunsigned int _CountOneBits(unsigned long)
_CountOneBits64unsigned int _CountOneBits64(unsigned __int64)
_DAddSatIntQDADDint _DAddSatInt(int, int)
_DSubSatIntQDSUBint _DSubSatInt(int, int)
_isunorderedint _isunordered(double, double)
_isunorderedfint _isunorderedf(float, float)
_MoveFromCoprocessorMRCunsigned int _MoveFromCoprocessor(unsigned int, unsigned int, unsigned int, unsigned int, unsigned int)

Reads data from an ARM coprocessor by using the coprocessor data transfer instructions. For more information see _MoveFromCoprocessor, _MoveFromCoprocessor2.
_MoveFromCoprocessor2MRC2unsigned int _MoveFromCoprocessor2(unsigned int, unsigned int, unsigned int, unsigned int, unsigned int)

Reads data from an ARM coprocessor by using the coprocessor data transfer instructions. For more information see _MoveFromCoprocessor, _MoveFromCoprocessor2.
_MoveFromCoprocessor64MRRCunsigned __int64 _MoveFromCoprocessor64(unsigned int, unsigned int, unsigned int)

Reads data from an ARM coprocessor by using the coprocessor data transfer instructions. For more information see _MoveFromCoprocessor64.
_MoveToCoprocessorMCRvoid _MoveToCoprocessor(unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int)

Writes data to an ARM coprocessor by using the coprocessor data transfer instructions. For more information, see _MoveToCoprocessor, _MoveToCoprocessor2.
_MoveToCoprocessor2MCR2void _MoveToCoprocessor2(unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int)

Writes data to an ARM coprocessor by using the coprocessor data transfer instructions. For more information, see _MoveToCoprocessor, _MoveToCoprocessor2.
_MoveToCoprocessor64MCRRvoid _MoveToCoprocessor64(unsigned __int64, unsigned int, unsigned int, unsigned int)

Writes data to an ARM coprocessor by using the coprocessor data transfer instructions. For more information, see _MoveToCoprocessor64.
_MulHighlong _MulHigh(long, long)
_MulUnsignedHighunsigned long _MulUnsignedHigh(unsigned long, unsigned long)
_ReadBankedRegMRSint _ReadBankedReg(int _Reg)
_ReadStatusRegMRSint _ReadStatusReg(int)
_SubSatIntQSUBint _SubSatInt(int, int)
_WriteBankedRegMSRvoid _WriteBankedReg(int _Value, int _Reg)
_WriteStatusRegMSRvoid _WriteStatusReg(int, int, int)

[NEON]

Memory Barrier Restrictions

The intrinsic functions __dmb (data memory barrier), __dsb (data synchronization barrier), and __isb (instruction synchronization barrier) use the following predefined values to specify the memory barrier restriction in terms of the sharing domain and kind of access that are affected by the operation.

Restriction Value | Description
_ARM_BARRIER_SY | Full system, reads and writes.
_ARM_BARRIER_ST | Full system, writes only.
_ARM_BARRIER_ISH | Inner sharable, reads and writes.
_ARM_BARRIER_ISHST | Inner sharable, writes only.
_ARM_BARRIER_NSH | Non-sharable, reads and writes.
_ARM_BARRIER_NSHST | Non-sharable, writes only.
_ARM_BARRIER_OSH | Outer sharable, reads and writes.
_ARM_BARRIER_OSHST | Outer sharable, writes only.

For the __isb intrinsic, the only restriction that is currently valid is _ARM_BARRIER_SY; all other values are reserved by the architecture.

__iso_volatile_load/store intrinsics

These intrinsic functions explicitly perform loads and stores that are not subject to compiler optimizations.

__int16 __iso_volatile_load16(const volatile __int16 * Location)  
__int32 __iso_volatile_load32(const volatile __int32 * Location)  
__int64 __iso_volatile_load64(const volatile __int64 * Location)  
__int8 __iso_volatile_load8(const volatile __int8 * Location)  
  
void __iso_volatile_store16(volatile __int16 * Location, __int16 Value)  
void __iso_volatile_store32(volatile __int32 * Location, __int32 Value)  
void __iso_volatile_store64(volatile __int64 * Location, __int64 Value)  
void __iso_volatile_store8(volatile __int8 * Location, __int8 Value)  
  

Parameters

Location
The address of a memory location to read from or write to.

Value (store intrinsics only)
The value to write to the specified memory location.

Return Value (load intrinsics only)

The value of the memory location that is specified by Location.

Remarks

You can use the __iso_volatile_load8/16/32/64 and __iso_volatile_store8/16/32/64 intrinsics to explicitly perform memory accesses that are not subject to compiler optimizations. The compiler cannot remove, synthesize, or change the relative order of these operations, but it does not generate implicit hardware memory barriers. Therefore, the hardware may still reorder the observable memory accesses across multiple threads. More precisely, these intrinsics are equivalent to the following expressions as compiled under /volatile:iso.

  
int a = __iso_volatile_load32(p);    // equivalent to: int a = *(const volatile __int32*)p;  
__iso_volatile_store32(p, a);        // equivalent to: *(volatile __int32*)p = a;  

Notice that the intrinsics take volatile pointers to accommodate volatile variables. However, there is no requirement or recommendation to use volatile pointers as arguments; the semantics of these operations are exactly the same if a regular, non-volatile type is used.

For more information about the /volatile:iso command-line argument, see /volatile (volatile Keyword Interpretation).

_MoveFromCoprocessor, _MoveFromCoprocessor2

These intrinsic functions read data from ARM coprocessors by using the coprocessor data transfer instructions.

unsigned int _MoveFromCoprocessor(  
      unsigned int coproc,  
      unsigned int opcode1,  
      unsigned int crn,  
      unsigned int crm,  
      unsigned int opcode2  
);  
  
unsigned int _MoveFromCoprocessor2(  
      unsigned int coproc,  
      unsigned int opcode1,  
      unsigned int crn,  
      unsigned int crm,  
      unsigned int opcode2  
);  
  

Parameters

coproc
Coprocessor number in the range 0 to 15.

opcode1
Coprocessor-specific opcode in the range 0 to 7.

crn
Coprocessor register number, in the range 0 to 15, that specifies the first operand to the instruction.

crm
Coprocessor register number, in the range 0 to 15, that specifies an additional source or destination operand.

opcode2
Additional coprocessor-specific opcode in the range 0 to 7.

Return Value

The value that is read from the coprocessor.

Remarks

The values of all five parameters of this intrinsic must be constant expressions that are known at compile time.

_MoveFromCoprocessor uses the MRC instruction; _MoveFromCoprocessor2 uses MRC2. The parameters correspond to bitfields that are encoded directly into the instruction word. The interpretation of the parameters is coprocessor-dependent. For more information, see the manual for the coprocessor in question.

_MoveFromCoprocessor64

Reads data from ARM coprocessors by using the coprocessor data transfer instructions.

unsigned __int64 _MoveFromCoprocessor64(  
      unsigned int coproc,  
      unsigned int opcode1,  
      unsigned int crm  
);  
  

Parameters

coproc
Coprocessor number in the range 0 to 15.

opcode1
Coprocessor-specific opcode in the range 0 to 15.

crm
Coprocessor register number, in the range 0 to 15, that specifies an additional source or destination operand.

Return Value

The value that is read from the coprocessor.

Remarks

The values of all three parameters of this intrinsic must be constant expressions that are known at compile time.

_MoveFromCoprocessor64 uses the MRRC instruction. The parameters correspond to bitfields that are encoded directly into the instruction word. The interpretation of the parameters is coprocessor-dependent. For more information, see the manual for the coprocessor in question.

_MoveToCoprocessor, _MoveToCoprocessor2

These intrinsic functions write data to ARM coprocessors by using the coprocessor data transfer instructions.

void _MoveToCoprocessor(  
      unsigned int value,  
      unsigned int coproc,  
      unsigned int opcode1,  
      unsigned int crn,  
      unsigned int crm,  
      unsigned int opcode2  
);  
  
void _MoveToCoprocessor2(  
      unsigned int value,  
      unsigned int coproc,  
      unsigned int opcode1,  
      unsigned int crn,  
      unsigned int crm,  
      unsigned int opcode2  
);  
  

Parameters

value
The value to be written to the coprocessor.

coproc
Coprocessor number in the range 0 to 15.

opcode1
Coprocessor-specific opcode in the range 0 to 7.

crn
Coprocessor register number, in the range 0 to 15, that specifies the first operand to the instruction.

crm
Coprocessor register number, in the range 0 to 15, that specifies an additional source or destination operand.

opcode2
Additional coprocessor-specific opcode in the range 0 to 7.

Return Value

None.

Remarks

The values of the coproc, opcode1, crn, crm, and opcode2 parameters of this intrinsic must be constant expressions that are known at compile time.

_MoveToCoprocessor uses the MCR instruction; _MoveToCoprocessor2 uses MCR2. The parameters correspond to bitfields that are encoded directly into the instruction word. The interpretation of the parameters is coprocessor-dependent. For more information, see the manual for the coprocessor in question.

_MoveToCoprocessor64

This intrinsic function writes data to ARM coprocessors by using the coprocessor data transfer instructions.

void _MoveToCoprocessor64(  
      unsigned __int64 value,  
      unsigned int coproc,  
      unsigned int opcode1,  
      unsigned int crm  
);  
  

Parameters

value
The value to be written to the coprocessor.

coproc
Coprocessor number in the range 0 to 15.

opcode1
Coprocessor-specific opcode in the range 0 to 15.

crm
Coprocessor register number, in the range 0 to 15, that specifies an additional source or destination operand.

Return Value

None.

Remarks

The values of the coproc, opcode1, and crm parameters of this intrinsic must be constant expressions that are known at compile time.

_MoveToCoprocessor64 uses the MCRR instruction. The parameters correspond to bitfields that are encoded directly into the instruction word. The interpretation of the parameters is coprocessor-dependent. For more information, see the manual for the coprocessor in question.

The following table lists intrinsics from other architectures that are supported on ARM platforms. Where the behavior of an intrinsic on ARM differs from its behavior on other hardware architectures, additional details are noted.

Function NameFunction Prototype
__assumevoid __assume(int)
__code_segvoid __code_seg(const char *)
__debugbreakvoid __cdecl __debugbreak(void)
__fastfail__declspec(noreturn) void __fastfail(unsigned int)
__nopvoid __nop(void) Note: On ARM platforms, this function generates a NOP instruction if one is implemented in the target architecture; otherwise, an alternative instruction that does not change the state of the program or CPU is generated—for example, MOV r8, r8. This is functionally equivalent to the __nop intrinsic for other hardware architectures. Because an instruction that has no effect on the state of the program or CPU might be ignored by the target architecture as an optimization, the instruction does not necessarily consume CPU cycles. Therefore, do not use the __nop intrinsic to manipulate the execution time of a code sequence unless you are certain about how the CPU will behave. Instead, you can use the __nop intrinsic to align the next instruction to a specific 32-bit boundary address.
__yieldvoid __yield(void) Note: On ARM platforms, this function generates the YIELD instruction, which indicates that the thread is performing a task that can be temporarily suspended from execution—for example, a spinlock—without adversely affecting the program. This enables the CPU to execute other tasks during execution cycles that would otherwise be wasted.
_AddressOfReturnAddressvoid * _AddressOfReturnAddress(void)
_BitScanForwardunsigned char _BitScanForward(unsigned long * _Index, unsigned long _Mask)
_BitScanReverseunsigned char _BitScanReverse(unsigned long * _Index, unsigned long _Mask)
_bittestunsigned char _bittest(long const *, long)
_bittestandcomplementunsigned char _bittestandcomplement(long *, long)
_bittestandresetunsigned char _bittestandreset(long *, long)
_bittestandsetunsigned char _bittestandset(long *, long)
_byteswap_uint64unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64)
_byteswap_ulongunsigned long __cdecl _byteswap_ulong(unsigned long)
_byteswap_ushortunsigned short __cdecl _byteswap_ushort(unsigned short)
_disablevoid __cdecl _disable(void) Note: On ARM platforms, this function generates the CPSID instruction; it is only available as an intrinsic.
_enablevoid __cdecl _enable(void) Note: On ARM platforms, this function generates the CPSIE instruction; it is only available as an intrinsic.
_lrotlunsigned long __cdecl _lrotl(unsigned long, int)
_lrotrunsigned long __cdecl _lrotr(unsigned long, int)
_ReadBarriervoid _ReadBarrier(void)
_ReadWriteBarriervoid _ReadWriteBarrier(void)
_ReturnAddressvoid * _ReturnAddress(void)
_rotlunsigned int __cdecl _rotl(unsigned int _Value, int _Shift)
_rotl16unsigned short _rotl16(unsigned short _Value, unsigned char _Shift)
_rotl64unsigned __int64 __cdecl _rotl64(unsigned __int64 _Value, int _Shift)
_rotl8unsigned char _rotl8(unsigned char _Value, unsigned char _Shift)
_rotrunsigned int __cdecl _rotr(unsigned int _Value, int _Shift)
_rotr16unsigned short _rotr16(unsigned short _Value, unsigned char _Shift)
_rotr64unsigned __int64 __cdecl _rotr64(unsigned __int64 _Value, int _Shift)
_rotr8unsigned char _rotr8(unsigned char _Value, unsigned char _Shift)
_setjmpexint __cdecl _setjmpex(jmp_buf)
_WriteBarriervoid _WriteBarrier(void)

[NEON]

Interlocked intrinsics are a set of intrinsics that are used to perform atomic read-modify-write operations. Some of them are common to all platforms. They are listed separately here because there are a large number of them, but because their definitions are mostly redundant, it's easier to think about them in general terms. Their names can be used to derive the exact behaviors.

The following table summarizes the ARM support of the non-bittest interlocked intrinsics. Each cell in the table corresponds to a name that is derived by appending the operation name in the left-most cell of the row and the type name in the top-most cell of the column to _Interlocked. For example, the cell at the intersection of the Xor row and the 8 column corresponds to _InterlockedXor8 and is fully supported. Most of the supported functions offer these optional suffixes: _acq, _rel, and _nf. The _acq suffix indicates an "acquire" semantic and the _rel suffix indicates a "release" semantic. The _nf or "no fence" suffix is unique to ARM and is discussed in the next section.

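Operation | 8 | 16 | 32 | 64 | P
Add | None | None | Full | Full | None
And | Full | Full | Full | Full | None
CompareExchange | Full | Full | Full | Full | Full
Decrement | None | Full | Full | Full | None
Exchange | Partial | Partial | Partial | Partial | Partial
ExchangeAdd | Full | Full | Full | Full | None
Increment | None | Full | Full | Full | None
Or | Full | Full | Full | Full | None
Xor | Full | Full | Full | Full | None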

Key:

  • Full: supports plain, _acq, _rel, and _nf forms.

  • Partial: supports plain, _acq, and _nf forms.

  • None: Not supported

_nf (no fence) Suffix

The _nf or "no fence" suffix indicates that the operation does not behave as any kind of memory barrier. This is in contrast to the other three forms (plain, _acq, and _rel), which all behave as some kind of barrier. One possible use of the _nf forms is to maintain a statistic counter that is updated by multiple threads at the same time but whose value is not otherwise used while multiple threads are executing.

List of Interlocked Intrinsics

Function NameFunction Prototype
_InterlockedAddlong _InterlockedAdd(long volatile *, long)
_InterlockedAdd64__int64 _InterlockedAdd64(__int64 volatile *, __int64)
_InterlockedAdd64_acq__int64 _InterlockedAdd64_acq(__int64 volatile *, __int64)
_InterlockedAdd64_nf__int64 _InterlockedAdd64_nf(__int64 volatile *, __int64)
_InterlockedAdd64_rel__int64 _InterlockedAdd64_rel(__int64 volatile *, __int64)
_InterlockedAdd_acqlong _InterlockedAdd_acq(long volatile *, long)
_InterlockedAdd_nflong _InterlockedAdd_nf(long volatile *, long)
_InterlockedAdd_rellong _InterlockedAdd_rel(long volatile *, long)
_InterlockedAndlong _InterlockedAnd(long volatile *, long)
_InterlockedAnd16short _InterlockedAnd16(short volatile *, short)
_InterlockedAnd16_acqshort _InterlockedAnd16_acq(short volatile *, short)
_InterlockedAnd16_nfshort _InterlockedAnd16_nf(short volatile *, short)
_InterlockedAnd16_relshort _InterlockedAnd16_rel(short volatile *, short)
_InterlockedAnd64__int64 _InterlockedAnd64(__int64 volatile *, __int64)
_InterlockedAnd64_acq__int64 _InterlockedAnd64_acq(__int64 volatile *, __int64)
_InterlockedAnd64_nf__int64 _InterlockedAnd64_nf(__int64 volatile *, __int64)
_InterlockedAnd64_rel__int64 _InterlockedAnd64_rel(__int64 volatile *, __int64)
_InterlockedAnd8char _InterlockedAnd8(char volatile *, char)
_InterlockedAnd8_acqchar _InterlockedAnd8_acq(char volatile *, char)
_InterlockedAnd8_nfchar _InterlockedAnd8_nf(char volatile *, char)
_InterlockedAnd8_relchar _InterlockedAnd8_rel(char volatile *, char)
_InterlockedAnd_acqlong _InterlockedAnd_acq(long volatile *, long)
_InterlockedAnd_nflong _InterlockedAnd_nf(long volatile *, long)
_InterlockedAnd_rellong _InterlockedAnd_rel(long volatile *, long)
_InterlockedCompareExchangelong __cdecl _InterlockedCompareExchange(long volatile *, long, long)
_InterlockedCompareExchange16short _InterlockedCompareExchange16(short volatile *, short, short)
_InterlockedCompareExchange16_acqshort _InterlockedCompareExchange16_acq(short volatile *, short, short)
_InterlockedCompareExchange16_nfshort _InterlockedCompareExchange16_nf(short volatile *, short, short)
_InterlockedCompareExchange16_relshort _InterlockedCompareExchange16_rel(short volatile *, short, short)
_InterlockedCompareExchange64__int64 _InterlockedCompareExchange64(__int64 volatile *, __int64, __int64)
_InterlockedCompareExchange64_acq__int64 _InterlockedCompareExchange64_acq(__int64 volatile *, __int64, __int64)
_InterlockedCompareExchange64_nf__int64 _InterlockedCompareExchange64_nf(__int64 volatile *, __int64, __int64)
_InterlockedCompareExchange64_rel__int64 _InterlockedCompareExchange64_rel(__int64 volatile *, __int64, __int64)
_InterlockedCompareExchange8char _InterlockedCompareExchange8(char volatile *, char, char)
_InterlockedCompareExchange8_acqchar _InterlockedCompareExchange8_acq(char volatile *, char, char)
_InterlockedCompareExchange8_nfchar _InterlockedCompareExchange8_nf(char volatile *, char, char)
_InterlockedCompareExchange8_relchar _InterlockedCompareExchange8_rel(char volatile *, char, char)
_InterlockedCompareExchangePointervoid * _InterlockedCompareExchangePointer(void * volatile *, void *, void *)
_InterlockedCompareExchangePointer_acqvoid * _InterlockedCompareExchangePointer_acq(void * volatile *, void *, void *)
_InterlockedCompareExchangePointer_nfvoid * _InterlockedCompareExchangePointer_nf(void * volatile *, void *, void *)
_InterlockedCompareExchangePointer_relvoid * _InterlockedCompareExchangePointer_rel(void * volatile *, void *, void *)
_InterlockedCompareExchange_acqlong _InterlockedCompareExchange_acq(long volatile *, long, long)
_InterlockedCompareExchange_nflong _InterlockedCompareExchange_nf(long volatile *, long, long)
_InterlockedCompareExchange_rellong _InterlockedCompareExchange_rel(long volatile *, long, long)
_InterlockedDecrementlong __cdecl _InterlockedDecrement(long volatile *)
_InterlockedDecrement16short _InterlockedDecrement16(short volatile *)
_InterlockedDecrement16_acqshort _InterlockedDecrement16_acq(short volatile *)
_InterlockedDecrement16_nfshort _InterlockedDecrement16_nf(short volatile *)
_InterlockedDecrement16_relshort _InterlockedDecrement16_rel(short volatile *)
_InterlockedDecrement64__int64 _InterlockedDecrement64(__int64 volatile *)
_InterlockedDecrement64_acq__int64 _InterlockedDecrement64_acq(__int64 volatile *)
_InterlockedDecrement64_nf__int64 _InterlockedDecrement64_nf(__int64 volatile *)
_InterlockedDecrement64_rel__int64 _InterlockedDecrement64_rel(__int64 volatile *)
_InterlockedDecrement_acqlong _InterlockedDecrement_acq(long volatile *)
_InterlockedDecrement_nflong _InterlockedDecrement_nf(long volatile *)
_InterlockedDecrement_rellong _InterlockedDecrement_rel(long volatile *)
_InterlockedExchangelong __cdecl _InterlockedExchange(long volatile * _Target, long)
_InterlockedExchange16short _InterlockedExchange16(short volatile * _Target, short)
_InterlockedExchange16_acqshort _InterlockedExchange16_acq(short volatile * _Target, short)
_InterlockedExchange16_nfshort _InterlockedExchange16_nf(short volatile * _Target, short)
_InterlockedExchange64__int64 _InterlockedExchange64(__int64 volatile * _Target, __int64)
_InterlockedExchange64_acq__int64 _InterlockedExchange64_acq(__int64 volatile * _Target, __int64)
_InterlockedExchange64_nf__int64 _InterlockedExchange64_nf(__int64 volatile * _Target, __int64)
_InterlockedExchange8char _InterlockedExchange8(char volatile * _Target, char)
_InterlockedExchange8_acqchar _InterlockedExchange8_acq(char volatile * _Target, char)
_InterlockedExchange8_nfchar _InterlockedExchange8_nf(char volatile * _Target, char)
_InterlockedExchangeAddlong __cdecl _InterlockedExchangeAdd(long volatile *, long)
_InterlockedExchangeAdd16short _InterlockedExchangeAdd16(short volatile *, short)
_InterlockedExchangeAdd16_acqshort _InterlockedExchangeAdd16_acq(short volatile *, short)
_InterlockedExchangeAdd16_nfshort _InterlockedExchangeAdd16_nf(short volatile *, short)
_InterlockedExchangeAdd16_relshort _InterlockedExchangeAdd16_rel(short volatile *, short)
_InterlockedExchangeAdd64__int64 _InterlockedExchangeAdd64(__int64 volatile *, __int64)
_InterlockedExchangeAdd64_acq__int64 _InterlockedExchangeAdd64_acq(__int64 volatile *, __int64)
_InterlockedExchangeAdd64_nf__int64 _InterlockedExchangeAdd64_nf(__int64 volatile *, __int64)
_InterlockedExchangeAdd64_rel__int64 _InterlockedExchangeAdd64_rel(__int64 volatile *, __int64)
_InterlockedExchangeAdd8char _InterlockedExchangeAdd8(char volatile *, char)
_InterlockedExchangeAdd8_acqchar _InterlockedExchangeAdd8_acq(char volatile *, char)
_InterlockedExchangeAdd8_nfchar _InterlockedExchangeAdd8_nf(char volatile *, char)
_InterlockedExchangeAdd8_relchar _InterlockedExchangeAdd8_rel(char volatile *, char)
_InterlockedExchangeAdd_acqlong _InterlockedExchangeAdd_acq(long volatile *, long)
_InterlockedExchangeAdd_nflong _InterlockedExchangeAdd_nf(long volatile *, long)
_InterlockedExchangeAdd_rellong _InterlockedExchangeAdd_rel(long volatile *, long)
_InterlockedExchangePointervoid * _InterlockedExchangePointer(void * volatile * _Target, void *)
_InterlockedExchangePointer_acqvoid * _InterlockedExchangePointer_acq(void * volatile * _Target, void *)
_InterlockedExchangePointer_nfvoid * _InterlockedExchangePointer_nf(void * volatile * _Target, void *)
_InterlockedExchange_acqlong _InterlockedExchange_acq(long volatile * _Target, long)
_InterlockedExchange_nflong _InterlockedExchange_nf(long volatile * _Target, long)
_InterlockedIncrementlong __cdecl _InterlockedIncrement(long volatile *)
_InterlockedIncrement16short _InterlockedIncrement16(short volatile *)
_InterlockedIncrement16_acqshort _InterlockedIncrement16_acq(short volatile *)
_InterlockedIncrement16_nfshort _InterlockedIncrement16_nf(short volatile *)
_InterlockedIncrement16_relshort _InterlockedIncrement16_rel(short volatile *)
_InterlockedIncrement64__int64 _InterlockedIncrement64(__int64 volatile *)
_InterlockedIncrement64_acq__int64 _InterlockedIncrement64_acq(__int64 volatile *)
_InterlockedIncrement64_nf__int64 _InterlockedIncrement64_nf(__int64 volatile *)
_InterlockedIncrement64_rel__int64 _InterlockedIncrement64_rel(__int64 volatile *)
_InterlockedIncrement_acqlong _InterlockedIncrement_acq(long volatile *)
_InterlockedIncrement_nflong _InterlockedIncrement_nf(long volatile *)
_InterlockedIncrement_rellong _InterlockedIncrement_rel(long volatile *)
_InterlockedOrlong _InterlockedOr(long volatile *, long)
_InterlockedOr16short _InterlockedOr16(short volatile *, short)
_InterlockedOr16_acqshort _InterlockedOr16_acq(short volatile *, short)
_InterlockedOr16_nfshort _InterlockedOr16_nf(short volatile *, short)
_InterlockedOr16_relshort _InterlockedOr16_rel(short volatile *, short)
_InterlockedOr64__int64 _InterlockedOr64(__int64 volatile *, __int64)
_InterlockedOr64_acq__int64 _InterlockedOr64_acq(__int64 volatile *, __int64)
_InterlockedOr64_nf__int64 _InterlockedOr64_nf(__int64 volatile *, __int64)
_InterlockedOr64_rel__int64 _InterlockedOr64_rel(__int64 volatile *, __int64)
_InterlockedOr8char _InterlockedOr8(char volatile *, char)
_InterlockedOr8_acqchar _InterlockedOr8_acq(char volatile *, char)
_InterlockedOr8_nfchar _InterlockedOr8_nf(char volatile *, char)
_InterlockedOr8_relchar _InterlockedOr8_rel(char volatile *, char)
_InterlockedOr_acqlong _InterlockedOr_acq(long volatile *, long)
_InterlockedOr_nflong _InterlockedOr_nf(long volatile *, long)
_InterlockedOr_rellong _InterlockedOr_rel(long volatile *, long)
_InterlockedXorlong _InterlockedXor(long volatile *, long)
_InterlockedXor16short _InterlockedXor16(short volatile *, short)
_InterlockedXor16_acqshort _InterlockedXor16_acq(short volatile *, short)
_InterlockedXor16_nfshort _InterlockedXor16_nf(short volatile *, short)
_InterlockedXor16_relshort _InterlockedXor16_rel(short volatile *, short)
_InterlockedXor64__int64 _InterlockedXor64(__int64 volatile *, __int64)
_InterlockedXor64_acq__int64 _InterlockedXor64_acq(__int64 volatile *, __int64)
_InterlockedXor64_nf__int64 _InterlockedXor64_nf(__int64 volatile *, __int64)
_InterlockedXor64_rel__int64 _InterlockedXor64_rel(__int64 volatile *, __int64)
_InterlockedXor8char _InterlockedXor8(char volatile *, char)
_InterlockedXor8_acqchar _InterlockedXor8_acq(char volatile *, char)
_InterlockedXor8_nfchar _InterlockedXor8_nf(char volatile *, char)
_InterlockedXor8_relchar _InterlockedXor8_rel(char volatile *, char)
_InterlockedXor_acqlong _InterlockedXor_acq(long volatile *, long)
_InterlockedXor_nflong _InterlockedXor_nf(long volatile *, long)
_InterlockedXor_rellong _InterlockedXor_rel(long volatile *, long)

[NEON]

_interlockedbittest Intrinsics

The plain interlocked bittest intrinsics are common to all platforms. ARM adds _acq, _rel, and _nf variants, which modify only the barrier semantics of an operation, as described in the "_nf (no fence) Suffix" section earlier in this article.

Function Name | Function Prototype
_interlockedbittestandreset | unsigned char _interlockedbittestandreset(long volatile *, long)
_interlockedbittestandreset_acq | unsigned char _interlockedbittestandreset_acq(long volatile *, long)
_interlockedbittestandreset_nf | unsigned char _interlockedbittestandreset_nf(long volatile *, long)
_interlockedbittestandreset_rel | unsigned char _interlockedbittestandreset_rel(long volatile *, long)
_interlockedbittestandset | unsigned char _interlockedbittestandset(long volatile *, long)
_interlockedbittestandset_acq | unsigned char _interlockedbittestandset_acq(long volatile *, long)
_interlockedbittestandset_nf | unsigned char _interlockedbittestandset_nf(long volatile *, long)
_interlockedbittestandset_rel | unsigned char _interlockedbittestandset_rel(long volatile *, long)

[NEON]

Compiler Intrinsics
ARM Assembler Reference
C++ Language Reference

Show: