SIMD向量化 #

一、SIMD概念 #

1.1 什么是SIMD #

SIMD(Single Instruction, Multiple Data,单指令多数据)是一种数据级并行计算技术:

  • 单条指令同时处理多个数据
  • 利用CPU向量寄存器
  • 大幅提升计算性能

1.2 SIMD优势 #

  • 数据并行处理
  • 减少循环开销
  • 充分利用硬件能力
  • 适合数值计算

二、SIMD类型 #

2.1 创建SIMD向量 #

mojo
from SIMD import SIMD, DType

def main():
    """Build one float64 and one int32 vector, four lanes each, and print both."""
    let floats = SIMD[DType.float64, 4](1.0, 2.0, 3.0, 4.0)
    let ints = SIMD[DType.int32, 4](1, 2, 3, 4)

    print(floats)
    print(ints)

main()

2.2 SIMD类型参数 #

mojo
from SIMD import SIMD, DType

def main():
    """Default-construct vectors of several element types and lane counts."""
    # Eight lanes for the 32-bit element types, four lanes for the 64-bit ones.
    let f32x8 = SIMD[DType.float32, 8]()
    let i32x8 = SIMD[DType.int32, 8]()
    let f64x4 = SIMD[DType.float64, 4]()
    let i64x4 = SIMD[DType.int64, 4]()

    print("SIMD vectors created")

main()

2.3 填充SIMD #

mojo
from SIMD import SIMD, DType

def main():
    """Create two vectors with `splat` from a single value each and print them."""
    let fives = SIMD[DType.float64, 4].splat(5.0)
    let tens = SIMD[DType.int32, 8].splat(10)

    print(fives)
    print(tens)

main()

三、SIMD运算 #

3.1 算术运算 #

mojo
from SIMD import SIMD, DType

def main():
    """Apply the four arithmetic operators to two four-lane vectors.

    All four results are printed, so no computed vector is left unused.
    """
    let a = SIMD[DType.float64, 4](1.0, 2.0, 3.0, 4.0)
    let b = SIMD[DType.float64, 4](5.0, 6.0, 7.0, 8.0)

    let sum = a + b
    let diff = a - b
    let product = a * b
    let quotient = a / b

    print(f"Sum: {sum}")
    print(f"Difference: {diff}")
    print(f"Product: {product}")
    print(f"Quotient: {quotient}")

main()

3.2 比较运算 #

mojo
from SIMD import SIMD, DType

def main():
    """Compare two four-lane vectors with `>`, `==` and `<` and print each mask.

    All three comparison results are printed, so none is left unused.
    """
    let a = SIMD[DType.float64, 4](1.0, 2.0, 3.0, 4.0)
    let b = SIMD[DType.float64, 4](3.0, 2.0, 1.0, 4.0)

    let gt = a > b
    let eq = a == b
    let lt = a < b

    print(f"a > b: {gt}")
    print(f"a == b: {eq}")
    print(f"a < b: {lt}")

main()

3.3 数学函数 #

mojo
from SIMD import SIMD, DType
from Math import sqrt, sin, cos

def main():
    """Apply `sqrt` to a four-lane vector and print the result."""
    let squares = SIMD[DType.float64, 4](1.0, 4.0, 9.0, 16.0)

    let sqrts = sqrt(squares)
    print(f"Square roots: {sqrts}")

main()

3.4 归约操作 #

mojo
from SIMD import SIMD, DType

def main():
    """Reduce a four-lane vector with add, mul, max and min, printing each result."""
    let values = SIMD[DType.float64, 4](1.0, 2.0, 3.0, 4.0)

    # Each reduction is computed right before it is reported.
    let sum = values.reduce_add()
    print(f"Sum: {sum}")

    let product = values.reduce_mul()
    print(f"Product: {product}")

    let max_val = values.reduce_max()
    print(f"Max: {max_val}")

    let min_val = values.reduce_min()
    print(f"Min: {min_val}")

main()

四、SIMD加载与存储 #

4.1 从数组加载 #

mojo
from SIMD import SIMD, DType

def main():
    """Load two four-lane vectors from a list, at offsets 0 and 4, and print them."""
    alias Vec4 = SIMD[DType.float64, 4]
    let values = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]

    let low = Vec4.load(values, 0)
    print(low)

    let high = Vec4.load(values, 4)
    print(high)

main()

4.2 存储到数组 #

mojo
from SIMD import SIMD, DType

def main():
    """Store a four-lane vector into a list at offset 0, then print each element."""
    let lanes = SIMD[DType.float64, 4](1.0, 2.0, 3.0, 4.0)
    var out = [0.0, 0.0, 0.0, 0.0]

    lanes.store(out, 0)

    for item in out:
        print(item)

main()

4.3 指针加载 #

mojo
from SIMD import SIMD, DType

def main():
    """Fill a buffer through a pointer, load it as one vector, print it, free it."""
    alias lanes = 4
    let buf = Pointer[Float64].alloc(lanes)

    # Slot k receives the value k + 1.
    for k in range(lanes):
        buf.store(k, Float64(k + 1))

    let loaded = SIMD[DType.float64, lanes].load(buf, 0)
    print(loaded)

    buf.free()

main()

五、SIMD向量运算实例 #

5.1 向量加法 #

mojo
from SIMD import SIMD, DType

fn vector_add(a: List[Float64], b: List[Float64]) -> List[Float64]:
    """Return the element-wise sum of `a` and `b`.

    Args:
        a: Left operand; its length determines the length of the result.
        b: Right operand; must hold at least `len(a)` elements.

    Returns:
        A new list whose element `i` is `a[i] + b[i]`.
    """
    alias width = 4
    let n = len(a)
    # Largest multiple of `width` not exceeding n: the part handled with SIMD.
    let simd_end = n // width * width

    # The output needs n elements before any vector can be stored into it.
    var result: List[Float64] = []
    for _ in range(n):
        result.append(0.0)

    for i in range(0, simd_end, width):
        let va = SIMD[DType.float64, width].load(a, i)
        let vb = SIMD[DType.float64, width].load(b, i)
        let vr = va + vb
        vr.store(result, i)

    # Scalar tail: the last n % width elements do not fill a whole vector.
    for i in range(simd_end, n):
        result[i] = a[i] + b[i]

    return result

def main():
    """Add two four-element lists with `vector_add` and print the result."""
    let lhs = [1.0, 2.0, 3.0, 4.0]
    let rhs = [5.0, 6.0, 7.0, 8.0]

    print(vector_add(lhs, rhs))

main()

5.2 点积计算 #

mojo
from SIMD import SIMD, DType

fn dot_product(a: List[Float64], b: List[Float64]) -> Float64:
    """Return the dot product of `a` and `b`.

    Args:
        a: First operand; its length determines how many terms are summed.
        b: Second operand; must hold at least `len(a)` elements.

    Returns:
        The sum of `a[i] * b[i]` over every index of `a`; 0.0 for empty input.
    """
    alias width = 4
    let n = len(a)
    # Largest multiple of `width` not exceeding n: the part handled with SIMD.
    let simd_end = n // width * width

    # Four running partial sums, one per lane, folded together after the loop.
    var sum_vec = SIMD[DType.float64, width].splat(0.0)
    for i in range(0, simd_end, width):
        let va = SIMD[DType.float64, width].load(a, i)
        let vb = SIMD[DType.float64, width].load(b, i)
        sum_vec = sum_vec + va * vb

    var total = sum_vec.reduce_add()

    # Scalar tail: the last n % width elements do not fill a whole vector.
    for i in range(simd_end, n):
        total += a[i] * b[i]

    return total

def main():
    """Compute the dot product of two four-element lists and print it."""
    let lhs = [1.0, 2.0, 3.0, 4.0]
    let rhs = [5.0, 6.0, 7.0, 8.0]

    let result = dot_product(lhs, rhs)
    print(f"Dot product: {result}")

main()

5.3 矩阵乘法 #

mojo
from SIMD import SIMD, DType

fn matrix_multiply(A: List[List[Float64]], B: List[List[Float64]]) -> List[List[Float64]]:
    """Multiply two matrices stored as lists of rows, using scalar arithmetic.

    Args:
        A: Left matrix; each row must hold at least `len(B)` elements.
        B: Right matrix; its column count is taken from its first row.

    Returns:
        A new matrix with `len(A)` rows, where entry `[i][j]` is the sum of
        `A[i][p] * B[p][j]` over every row index `p` of `B`. When `B` is
        empty, every row of the result is empty.
    """
    let m = len(A)
    let k = len(B)
    # An empty B has no first row to measure; the product then has no columns.
    let n = len(B[0]) if k > 0 else 0

    var C: List[List[Float64]] = []
    for i in range(m):
        var row: List[Float64] = []
        for j in range(n):
            var sum: Float64 = 0.0
            for p in range(k):
                sum += A[i][p] * B[p][j]
            row.append(sum)
        C.append(row)

    return C

def main():
    """Multiply two 2x2 matrices with `matrix_multiply` and print the product."""
    let left = [[1.0, 2.0], [3.0, 4.0]]
    let right = [[5.0, 6.0], [7.0, 8.0]]

    print(matrix_multiply(left, right))

main()

六、SIMD掩码操作 #

6.1 条件选择 #

mojo
from SIMD import SIMD, DType

def main():
    """Pick lanes from one of two vectors based on a comparison mask and print them."""
    alias Vec4 = SIMD[DType.float64, 4]
    let small = Vec4(1.0, 2.0, 3.0, 4.0)
    let large = Vec4(10.0, 20.0, 30.0, 40.0)

    # True in the lanes where `small` exceeds 2.5.
    let above = small > Vec4.splat(2.5)

    let picked = above.select(large, small)
    print(picked)

main()

6.2 掩码归约 #

mojo
from SIMD import SIMD, DType

def main():
    """Count how many lanes of a vector are greater than 2.0 and print the count."""
    let v = SIMD[DType.float64, 4](1.0, 2.0, 3.0, 4.0)
    let mask = v > SIMD[DType.float64, 4].splat(2.0)

    # Convert the boolean lanes to integers before summing: a reduction over
    # boolean lanes yields a boolean, which cannot hold a count above one.
    let count = mask.cast[DType.int32]().reduce_add()
    print(f"Count > 2.0: {count}")

main()

七、SIMD性能优化 #

7.1 数据对齐 #

mojo
from SIMD import SIMD, DType

def main():
    """Load a vector from an aligned allocation with `load_aligned` and print it."""
    # Presumably 16 elements aligned to 64 bytes - confirm the argument order.
    let aligned_ptr = Pointer[Float64].alloc_aligned(16, 64)

    # Nothing has been written to the allocation yet, so initialize the four
    # lanes that are about to be loaded instead of reading arbitrary contents.
    for i in range(4):
        aligned_ptr.store(i, Float64(i + 1))

    let v = SIMD[DType.float64, 4].load_aligned(aligned_ptr, 0)
    print(v)

    aligned_ptr.free()

main()

7.2 循环展开 #

mojo
from SIMD import SIMD, DType

fn process_array(inout data: List[Float64]):
    """Round-trip `data` through vectors, two four-lane vectors per iteration.

    Each iteration loads two adjacent vectors and stores them back unchanged,
    so the contents of `data` are the same before and after the call.

    Args:
        data: List that is read and written in place. It is declared `inout`
            (spelled `mut` in newer Mojo releases) because `fn` arguments are
            immutable by default.
    """
    alias width = 4
    alias step = 2 * width
    let n = len(data)
    # Only whole groups of `step` elements are visited, so no load or store
    # reaches past the end of the list. The trailing n % step elements are
    # left as they are, which matches the unchanged round trip above.
    let unrolled_end = n // step * step

    for i in range(0, unrolled_end, step):
        let v1 = SIMD[DType.float64, width].load(data, i)
        let v2 = SIMD[DType.float64, width].load(data, i + width)

        v1.store(data, i)
        v2.store(data, i + width)

def main():
    """Pass an eight-element list through `process_array` and print the list."""
    var samples = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]

    process_array(samples)

    print(samples)

main()

7.3 避免数据依赖 #

mojo
from SIMD import SIMD, DType

fn independent_operations(inout data: List[Float64]):
    """Replace every element `x` of `data` with `x * 2.0 + 1.0`, in place.

    Each group of four elements is computed only from its own inputs, so no
    iteration depends on the result of another.

    Args:
        data: List that is updated in place. It is declared `inout` (spelled
            `mut` in newer Mojo releases) because `fn` arguments are immutable
            by default.
    """
    alias width = 4
    let n = len(data)
    # Largest multiple of `width` not exceeding n: the part handled with SIMD.
    let simd_end = n // width * width

    for i in range(0, simd_end, width):
        let v = SIMD[DType.float64, width].load(data, i)
        let result = v * 2.0 + 1.0
        result.store(data, i)

    # Scalar tail: the last n % width elements do not fill a whole vector.
    for i in range(simd_end, n):
        data[i] = data[i] * 2.0 + 1.0

def main():
    """Run `independent_operations` on a four-element list and print the list."""
    var samples = [1.0, 2.0, 3.0, 4.0]

    independent_operations(samples)

    print(samples)

main()

八、SIMD最佳实践 #

8.1 选择合适的向量宽度 #

mojo
from SIMD import SIMD, DType

def main():
    """Default-construct float32 vectors with 4, 8 and 16 lanes."""
    # 32-bit lanes: 4, 8 and 16 of them make 128, 256 and 512 bits.
    let narrow = SIMD[DType.float32, 4]()
    let medium = SIMD[DType.float32, 8]()
    let wide = SIMD[DType.float32, 16]()

    print("Different SIMD widths created")

main()

8.2 处理剩余元素 #

mojo
from SIMD import SIMD, DType

fn process_with_remainder(inout data: List[Float64]):
    """Double every element of `data` in place, whatever its length.

    Whole groups of `simd_width` elements are doubled with vector operations;
    the leftover elements at the end are doubled one at a time.

    Args:
        data: List that is updated in place. It is declared `inout` (spelled
            `mut` in newer Mojo releases) because `fn` arguments are immutable
            by default.
    """
    # Compile-time constant so the same width sizes both the vector type and
    # the loop stride.
    alias simd_width = 4
    let n = len(data)
    # Largest multiple of `simd_width` not exceeding n.
    let simd_count = n // simd_width * simd_width

    for i in range(0, simd_count, simd_width):
        let v = SIMD[DType.float64, simd_width].load(data, i)
        let result = v * 2.0
        result.store(data, i)

    # Scalar tail for the remaining n % simd_width elements.
    for i in range(simd_count, n):
        data[i] *= 2.0

def main():
    """Run `process_with_remainder` on a seven-element list and print the list."""
    # Seven elements: one whole group of four plus three left over.
    var samples = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]

    process_with_remainder(samples)

    print(samples)

main()

8.3 使用SIMD友好算法 #

mojo
from SIMD import SIMD, DType

fn sum_array(data: List[Float64]) -> Float64:
    """Return the sum of all elements of `data`.

    Args:
        data: Values to add up; any length, including zero.

    Returns:
        The total of the elements; 0.0 for an empty list.
    """
    alias width = 4
    let n = len(data)
    # Largest multiple of `width` not exceeding n: the part handled with SIMD.
    let simd_end = n // width * width

    # Four running partial sums, one per lane, folded together after the loop.
    var sum = SIMD[DType.float64, width].splat(0.0)
    for i in range(0, simd_end, width):
        let v = SIMD[DType.float64, width].load(data, i)
        sum = sum + v

    var total = sum.reduce_add()

    # Scalar tail: the last n % width elements do not fill a whole vector.
    for i in range(simd_end, n):
        total += data[i]

    return total

def main():
    """Sum eight sample values with `sum_array` and print the total."""
    let data = [
        1.0, 2.0, 3.0, 4.0,
        5.0, 6.0, 7.0, 8.0,
    ]
    print(f"Sum: {sum_array(data)}")

main()

九、总结 #

本章学习了:

  • SIMD概念与优势
  • SIMD类型创建
  • SIMD运算操作
  • 数据加载与存储
  • 向量运算实例
  • 掩码操作
  • 性能优化技巧

下一章,我们将学习Python互操作。

最后更新:2026-03-27