diff --git a/examples/cpu_features/SSE_and_MMX_Extensions/README.md b/examples/cpu_features/SSE_and_MMX_Extensions/README.md
new file mode 100644
index 00000000000000..6ac44657e3ba8f
--- /dev/null
+++ b/examples/cpu_features/SSE_and_MMX_Extensions/README.md
@@ -0,0 +1,63 @@
+Note: For a more in-depth study, see https://en.wikibooks.org/wiki/X86_Assembly
+
+# SSE and MMX Extensions
+
+This document provides an overview of the SSE and MMX extensions used in the project.
+
+## Table of Contents
+
+- [Introduction](#introduction)
+- [SSE Extensions](#sse-extensions)
+- [MMX Extensions](#mmx-extensions)
+- [Usage](#usage)
+
+## Introduction
+
+SSE (Streaming SIMD Extensions) and MMX (MultiMedia eXtensions) are instruction sets used to
+enhance the performance of multimedia and signal processing applications.
+
+## SSE Extensions
+
+SSE extensions provide a set of instructions that can handle multiple data elements with a single
+instruction, improving the performance of applications that require heavy mathematical
+computations.
+
+from: [wikibooks](https://en.wikibooks.org/wiki/X86_Assembly/SSE#SSE_Instruction_Set)
+There are literally hundreds of SSE instructions, some of which are capable of much more than
+simple SIMD arithmetic. For more in-depth references take a look at the resources chapter of this
+book.
+
+You may notice that many floating point SSE instructions end with something like PS or SD. These
+suffixes differentiate between different versions of the operation. The first letter describes
+whether the instruction should be Packed or Scalar. Packed operations are applied to every member
+of the register, while scalar operations are applied to only the first value. For example, in
+pseudo-code, a packed add would be executed as:
+
+```
+v1[0] = v1[0] + v2[0]
+v1[1] = v1[1] + v2[1]
+v1[2] = v1[2] + v2[2]
+v1[3] = v1[3] + v2[3]
+```
+
+While a scalar add would only be:
+
+```
+v1[0] = v1[0] + v2[0]
+```
+
+The second letter refers to the data size: either Single or Double. 
This simply tells the
+processor whether to use the register as four 32-bit floats or two 64-bit doubles, respectively.
+
+## MMX Extensions
+
+MMX extensions are designed to accelerate multimedia and communication applications by providing
+instructions that can process multiple data elements in parallel.
+
+## Usage
+
+To use these extensions in your project, ensure that your compiler supports them and that you have
+enabled the appropriate flags.
+On Linux, you can run the command `lscpu` to list the instruction-set extensions your CPU supports.
+
+Note: the examples here will compile, but not run on CPU architectures != amd64, like ARM or RISC-V.
diff --git a/examples/cpu_features/SSE_and_MMX_Extensions/mmx.v b/examples/cpu_features/SSE_and_MMX_Extensions/mmx.v
new file mode 100644
index 00000000000000..9aad22a92c2f2b
--- /dev/null
+++ b/examples/cpu_features/SSE_and_MMX_Extensions/mmx.v
@@ -0,0 +1,39 @@
+// MMX Instruction Set
+// Several suffixes are used to indicate what data size the instruction operates on:
+// Byte (8 bits)
+// Word (16 bits)
+// Double word (32 bits)
+// Quad word (64 bits)
+// The signedness of the operation is also signified by the suffix: US for unsigned and S for signed.
+// For example, PSUBUSB subtracts unsigned bytes, while PSUBSD subtracts signed double words.
+// MMX defined over 40 new instructions, listed below. 
+// EMMS, MOVD, MOVQ, PACKSSDW, PACKSSWB, PACKUSWB, PADDB, PADDD, PADDSB, PADDSW, PADDUSB, PADDUSW,
+// PADDW, PAND, PANDN, PCMPEQB, PCMPEQD, PCMPEQW, PCMPGTB, PCMPGTD, PCMPGTW, PMADDWD, PMULHW, PMULLW,
+// POR, PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW, PSUBB, PSUBD, PSUBSB, PSUBSW, PSUBUSB,
+// PSUBUSW, PSUBW, PUNPCKHBW, PUNPCKHDQ, PUNPCKHWD, PUNPCKLBW, PUNPCKLDQ, PUNPCKLWD, PXOR
+
+// add_vectors_mmx adds two vectors of eight u8 values lane-by-lane (paddb) using MMX.
+// a, b and result must each point to at least 8 readable/writable bytes.
+@[if amd64 && !tinyc && !msvc]
+fn add_vectors_mmx(a &u8, b &u8, result &u8) {
+	unsafe {
+		asm volatile amd64 {
+			movq mm0, [a] // Load 8 bytes from a into MMX register mm0
+			movq mm1, [b] // Load 8 bytes from b into MMX register mm1
+			paddb mm0, mm1 // Add the two vectors using MMX instruction
+			movq [result], mm0 // Store the result back to memory
+			emms // Clear the MMX state: MMX aliases the x87 registers, so EMMS must run before any later floating-point code
+			; ; r (a)
+			  r (b)
+			  r (result)
+			; mm0
+			  mm1
+		}
+	}
+}
+
+fn main() {
+	a := [u8(1), 2, 3, 4, 5, 6, 7, 8]
+	b := [u8(8), 7, 6, 5, 4, 3, 2, 1]
+	result := []u8{len: 8}
+	add_vectors_mmx(&a[0], &b[0], &result[0])
+	println(result)
+	assert result == [u8(9), 9, 9, 9, 9, 9, 9, 9]
+}
diff --git a/examples/cpu_features/SSE_and_MMX_Extensions/sse.v b/examples/cpu_features/SSE_and_MMX_Extensions/sse.v
new file mode 100644
index 00000000000000..b5c73a94992881
--- /dev/null
+++ b/examples/cpu_features/SSE_and_MMX_Extensions/sse.v
@@ -0,0 +1,37 @@
+// SSE Instruction Set
+// SSE: Added with Pentium III
+// Floating-point Instructions:
+// ADDPS, ADDSS, CMPPS, CMPSS, COMISS, CVTPI2PS, CVTPS2PI, CVTSI2SS, CVTSS2SI, CVTTPS2PI, CVTTSS2SI,
+// DIVPS, DIVSS, LDMXCSR, MAXPS, MAXSS, MINPS, MINSS, MOVAPS, MOVHLPS, MOVHPS, MOVLHPS, MOVLPS,
+// MOVMSKPS, MOVNTPS, MOVSS, MOVUPS, MULPS, MULSS, RCPPS, RCPSS, RSQRTPS, RSQRTSS, SHUFPS, SQRTPS,
+// SQRTSS, STMXCSR, SUBPS, SUBSS, UCOMISS, UNPCKHPS, UNPCKLPS
+//
+// Integer Instructions:
+// ANDNPS, ANDPS, ORPS, PAVGB, PAVGW, PEXTRW, PINSRW, PMAXSW, PMAXUB, PMINSW, PMINUB, PMOVMSKB, PMULHUW, PSADBW, PSHUFW, XORPS
+// The ADDPS instruction adds two vectors of floats using SSE instructions. 
+
+// add_vectors_sse adds two vectors of four f32 values element-wise using SSE (addps).
+// a, b and result must each point to at least four f32 values; movups performs
+// unaligned loads/stores, so no special pointer alignment is required.
+@[if amd64 && !tinyc && !msvc]
+fn add_vectors_sse(a &f32, b &f32, result &f32) {
+	unsafe {
+		asm volatile amd64 {
+			movups xmm0, [a] // Load 4 floats from array a into SSE register xmm0
+			movups xmm1, [b] // Load 4 floats from array b into SSE register xmm1
+			addps xmm0, xmm1 // Add the two vectors using SSE instruction
+			movups [result], xmm0 // Store the result back to memory
+			; ; r (a)
+			  r (b)
+			  r (result)
+			; xmm0
+			  xmm1
+		}
+	}
+}
+
+fn main() {
+	a := [f32(1.0), 2.0, 3.0, 4.0]
+	b := [f32(4.0), 3.0, 2.0, 1.0]
+	result := []f32{len: 4}
+	add_vectors_sse(&a[0], &b[0], &result[0])
+	println(result)
+	assert result == [f32(5.0), 5.0, 5.0, 5.0]
+}
diff --git a/examples/cpu_features/SSE_and_MMX_Extensions/sse2.v b/examples/cpu_features/SSE_and_MMX_Extensions/sse2.v
new file mode 100644
index 00000000000000..f320fea3c21796
--- /dev/null
+++ b/examples/cpu_features/SSE_and_MMX_Extensions/sse2.v
@@ -0,0 +1,42 @@
+// SSE Instruction Set
+// SSE2: Added with Pentium 4
+// Floating-point Instructions:
+// ADDPD, ADDSD, ANDNPD, ANDPD, CMPPD, CMPSD*, COMISD, CVTDQ2PD, CVTDQ2PS, CVTPD2DQ, CVTPD2PI,
+// CVTPD2PS, CVTPI2PD, CVTPS2DQ, CVTPS2PD, CVTSD2SI, CVTSD2SS, CVTSI2SD, CVTSS2SD, CVTTPD2DQ,
+// CVTTPD2PI, CVTTPS2DQ, CVTTSD2SI, DIVPD, DIVSD, MAXPD, MAXSD, MINPD, MINSD, MOVAPD, MOVHPD,
+// MOVLPD, MOVMSKPD, MOVSD*, MOVUPD, MULPD, MULSD, ORPD, SHUFPD, SQRTPD, SQRTSD, SUBPD, SUBSD,
+// UCOMISD, UNPCKHPD, UNPCKLPD, XORPD
+// * CMPSD and MOVSD have the same name as the string instruction mnemonics CMPSD (CMPS) and
+// MOVSD (MOVS); however, the former refer to scalar double-precision floating-points whereas
+// the latter refer to doubleword strings.
+// Integer Instructions:
+// MOVDQ2Q, MOVDQA, MOVDQU, MOVQ2DQ, PADDQ, PSUBQ, PMULUDQ, PSHUFHW, PSHUFLW, PSHUFD, PSLLDQ, PSRLDQ, PUNPCKHQDQ, PUNPCKLQDQ
+// The MULPD instruction multiplies two vectors of doubles using SSE2 instructions. 
+
+// multiply_vectors_sse2 multiplies two vectors of two f64 values element-wise using SSE2 (mulpd).
+// a, b and result must each point to at least two f64 values; movupd performs
+// unaligned loads/stores, so no special pointer alignment is required.
+@[if amd64 && !tinyc && !msvc]
+fn multiply_vectors_sse2(a &f64, b &f64, result &f64) {
+	unsafe {
+		asm volatile amd64 {
+			movupd xmm0, [a] // Load 2 doubles from array a into SSE2 register xmm0
+			movupd xmm1, [b] // Load 2 doubles from array b into SSE2 register xmm1
+			mulpd xmm0, xmm1 // Multiply the two vectors using SSE2 instruction
+			movupd [result], xmm0 // Store the result back to memory
+			; ; r (a)
+			  r (b)
+			  r (result)
+			; xmm0
+			  xmm1
+		}
+	}
+}
+
+fn main() {
+	a := [f64(1.5), 2.5]
+	b := [f64(3.5), 4.5]
+	result := []f64{len: 2}
+	multiply_vectors_sse2(&a[0], &b[0], &result[0])
+	println(result)
+	// 5.25 = 1.5 * 3.5
+	// 11.25 = 2.5 * 4.5
+	assert result == [f64(5.25), 11.25]
+}
diff --git a/examples/cpu_features/SSE_and_MMX_Extensions/sse3.v b/examples/cpu_features/SSE_and_MMX_Extensions/sse3.v
new file mode 100644
index 00000000000000..c4bf470568b350
--- /dev/null
+++ b/examples/cpu_features/SSE_and_MMX_Extensions/sse3.v
@@ -0,0 +1,36 @@
+// SSE Instruction Set
+// SSE3: Added with later Pentium 4
+// ADDSUBPD, ADDSUBPS, HADDPD, HADDPS, HSUBPD, HSUBPS, MOVDDUP, MOVSHDUP, MOVSLDUP
+// The HADDPS instruction performs horizontal addition of two vectors of floats using SSE3
+// instructions.
+
+// horizontal_add_sse3 horizontally adds adjacent f32 pairs of a and b (haddps).
+// NOTE(review): movaps requires 16-byte-aligned addresses. &a[0] of a freshly
+// allocated V array appears to work here (default allocator alignment on amd64),
+// but this is not guaranteed for arbitrary pointers — movups would be the safe
+// choice; confirm V array data alignment.
+@[if amd64 && !tinyc && !msvc]
+fn horizontal_add_sse3(a &f32, b &f32, result &f32) {
+	unsafe {
+		asm volatile amd64 {
+			movaps xmm0, [a] // Load 4 floats from array a into SSE3 register xmm0
+			movaps xmm1, [b] // Load 4 floats from array b into SSE3 register xmm1
+			haddps xmm0, xmm1 // Perform horizontal add of xmm0 and xmm1
+			movaps [result], xmm0 // Store the result back to memory
+			; ; r (a)
+			  r (b)
+			  r (result)
+			; xmm0
+			  xmm1
+		}
+	}
+}
+
+fn main() {
+	a := [f32(1.0), 2.0, 3.0, 4.0]
+	b := [f32(5.0), 6.0, 7.0, 8.0]
+	result := []f32{len: 4}
+	horizontal_add_sse3(&a[0], &b[0], &result[0])
+	println(result)
+	// The result should be [3.0, 7.0, 11.0, 15.0] due to horizontal addition. 
+	// 1.0 + 2.0 = 3.0
+	// 3.0 + 4.0 = 7.0
+	// 5.0 + 6.0 = 11.0
+	// 7.0 + 8.0 = 15.0
+	assert result == [f32(3.0), 7.0, 11.0, 15.0]
+}
diff --git a/examples/cpu_features/SSE_and_MMX_Extensions/sse4_1.v b/examples/cpu_features/SSE_and_MMX_Extensions/sse4_1.v
new file mode 100644
index 00000000000000..eb6fd7331e573e
--- /dev/null
+++ b/examples/cpu_features/SSE_and_MMX_Extensions/sse4_1.v
@@ -0,0 +1,31 @@
+// SSE Instruction Set
+// SSE4.1: Added with later Core 2
+// MPSADBW, PHMINPOSUW, PMULLD, PMULDQ, DPPS, DPPD, BLENDPS, BLENDPD, BLENDVPS, BLENDVPD,
+// PBLENDVB, PBLENDW, PMINSB, PMAXSB, PMINUW, PMAXUW, PMINUD, PMAXUD, PMINSD, PMAXSD, ROUNDPS,
+// ROUNDSS, ROUNDPD, ROUNDSD, INSERTPS, PINSRB, PINSRD, PINSRQ, EXTRACTPS, PEXTRB, PEXTRW,
+// PEXTRD, PEXTRQ, PMOVSXBW, PMOVZXBW, PMOVSXBD, PMOVZXBD, PMOVSXBQ, PMOVZXBQ, PMOVSXWD,
+// PMOVZXWD, PMOVSXWQ, PMOVZXWQ, PMOVSXDQ, PMOVZXDQ, PTEST, PCMPEQQ, PACKUSDW, MOVNTDQA
+
+// round_floats_sse4_1 rounds four f32 values with roundps.
+// The immediate 0 selects round-to-nearest with ties-to-even, which is why the
+// assert below expects 2.5 to round DOWN to 2.0 (nearest even), not up to 3.0.
+@[if amd64 && !tinyc && !msvc]
+fn round_floats_sse4_1(a &f32, result &f32) {
+	unsafe {
+		asm volatile amd64 {
+			movups xmm0, [a] // Load 4 floats from array a into xmm0
+			roundps xmm0, xmm0, 0 // Round to nearest (ties to even), per the imm8 rounding control
+			movups [result], xmm0 // Store the result in result array
+			; ; r (a)
+			  r (result)
+			; xmm0
+		}
+	}
+}
+
+fn main() {
+	a := [f32(1.2), 2.5, 3.8, 4.4]
+	result := []f32{len: 4}
+	// Rounding mode 0 corresponds to rounding to the nearest integer (ties go to even)
+	round_floats_sse4_1(&a[0], &result[0])
+	println(result)
+	// The expected rounded result should be [1.0, 2.0, 4.0, 4.0]
+	assert result == [f32(1.0), 2.0, 4.0, 4.0]
+}
diff --git a/examples/cpu_features/SSE_and_MMX_Extensions/ssse3.v b/examples/cpu_features/SSE_and_MMX_Extensions/ssse3.v
new file mode 100644
index 00000000000000..ba75aae5e1418a
--- /dev/null
+++ b/examples/cpu_features/SSE_and_MMX_Extensions/ssse3.v
@@ -0,0 +1,31 @@
+// SSE Instruction Set
+// SSSE3: Added with Xeon 5100 and early Core 2
+// PSIGNW, PSIGND, PSIGNB, PSHUFB, PMULHRSW, PMADDUBSW, PHSUBW, PHSUBSW, PHSUBD, 
PHADDW, PHADDSW,
+// PHADDD, PALIGNR, PABSW, PABSD, PABSB
+// The PSIGNW instruction negates or leaves elements unchanged based on another vector's signs.
+
+// psignw_example applies PSIGNW: each 16-bit lane of a is negated where the matching
+// lane of b is negative, zeroed where the lane of b is zero, and kept where it is positive
+// (here b is all +1/-1, so the result is the absolute values, as the assert shows).
+// NOTE(review): movdqa requires 16-byte-aligned addresses; movdqu would be the safe
+// choice for arbitrary pointers — confirm V array data alignment.
+@[if amd64 && !tinyc && !msvc]
+fn psignw_example(a &i16, b &i16, result &i16) {
+	unsafe {
+		asm volatile amd64 {
+			movdqa xmm0, [a] // Load 8 signed 16-bit integers from array a into xmm0
+			movdqa xmm1, [b] // Load 8 signed 16-bit integers from array b into xmm1
+			psignw xmm0, xmm1 // Adjust the sign of elements in xmm0 based on xmm1
+			movdqa [result], xmm0 // Store the result back to memory
+			; ; r (a)
+			  r (b)
+			  r (result)
+			; xmm0
+			  xmm1
+		}
+	}
+}
+
+fn main() {
+	a0 := [i16(1), -2, 3, -4, 5, -6, 7, -8]
+	b0 := [i16(1), -1, 1, -1, 1, -1, 1, -1]
+	result0 := []i16{len: 8}
+	psignw_example(&a0[0], &b0[0], &result0[0])
+	dump(result0)
+	assert result0 == [i16(1), 2, 3, 4, 5, 6, 7, 8]
+}