SIMD向量化 #
一、SIMD概念 #
1.1 什么是SIMD #
SIMD(Single Instruction, Multiple Data,单指令多数据)是一种数据级并行计算技术:
- 单条指令同时处理多个数据
- 利用CPU向量寄存器
- 大幅提升计算性能
1.2 SIMD优势 #
- 数据并行处理
- 减少循环开销
- 充分利用硬件能力
- 适合数值计算
二、SIMD类型 #
2.1 创建SIMD向量 #
mojo
from SIMD import SIMD, DType
def main():
    """Construct two 4-lane SIMD vectors from literal lane values and print them."""
    # One vector of float64 lanes and one of int32 lanes, each given
    # its four lane values explicitly.
    let float_lanes = SIMD[DType.float64, 4](1.0, 2.0, 3.0, 4.0)
    let int_lanes = SIMD[DType.int32, 4](1, 2, 3, 4)

    print(float_lanes)
    print(int_lanes)


main()
2.2 SIMD类型参数 #
mojo
from SIMD import SIMD, DType
def main():
    """Instantiate SIMD vectors for several dtype / lane-count combinations.

    The vectors are only constructed, never read; a fixed confirmation
    message is printed once all four have been created.
    """
    # The two type parameters are the element dtype and the lane count.
    let f32x8 = SIMD[DType.float32, 8]()
    let f64x4 = SIMD[DType.float64, 4]()
    let i32x8 = SIMD[DType.int32, 8]()
    let i64x4 = SIMD[DType.int64, 4]()

    print("SIMD vectors created")


main()
2.3 填充SIMD #
mojo
from SIMD import SIMD, DType
def main():
    """Create SIMD vectors with `splat` from a single scalar and print them."""
    # `splat` takes one scalar instead of one value per lane.
    let fives = SIMD[DType.float64, 4].splat(5.0)
    let tens = SIMD[DType.int32, 8].splat(10)

    print(fives)
    print(tens)


main()
三、SIMD运算 #
3.1 算术运算 #
mojo
from SIMD import SIMD, DType
def main():
    """Apply the four arithmetic operators to two float64 SIMD vectors.

    All four results are printed, so none of the computed vectors is left
    as an unused local.
    """
    let a = SIMD[DType.float64, 4](1.0, 2.0, 3.0, 4.0)
    let b = SIMD[DType.float64, 4](5.0, 6.0, 7.0, 8.0)

    let sum = a + b
    let diff = a - b
    let product = a * b
    let quotient = a / b

    print(f"Sum: {sum}")
    print(f"Difference: {diff}")
    print(f"Product: {product}")
    print(f"Quotient: {quotient}")


main()
3.2 比较运算 #
mojo
from SIMD import SIMD, DType
def main():
    """Compare two float64 SIMD vectors with `>`, `==` and `<`.

    Each comparison result is printed, including `a < b`, which was
    previously computed but never shown.
    """
    let a = SIMD[DType.float64, 4](1.0, 2.0, 3.0, 4.0)
    let b = SIMD[DType.float64, 4](3.0, 2.0, 1.0, 4.0)

    let gt = a > b
    let eq = a == b
    let lt = a < b

    print(f"a > b: {gt}")
    print(f"a == b: {eq}")
    print(f"a < b: {lt}")


main()
3.3 数学函数 #
mojo
from SIMD import SIMD, DType
from Math import sqrt, sin, cos
def main():
    """Apply `sqrt` to a whole SIMD vector at once and print the result."""
    let squares = SIMD[DType.float64, 4](1.0, 4.0, 9.0, 16.0)
    # A single call takes the vector; no per-lane loop is written here.
    let roots = sqrt(squares)
    print(f"Square roots: {roots}")


main()
3.4 归约操作 #
mojo
from SIMD import SIMD, DType
def main():
    """Run the four reduction methods on one SIMD vector and print each result."""
    let lanes = SIMD[DType.float64, 4](1.0, 2.0, 3.0, 4.0)

    # Each reduce_* call collapses the vector into a single value.
    let total = lanes.reduce_add()
    let product = lanes.reduce_mul()
    let largest = lanes.reduce_max()
    let smallest = lanes.reduce_min()

    print(f"Sum: {total}")
    print(f"Product: {product}")
    print(f"Max: {largest}")
    print(f"Min: {smallest}")


main()
四、SIMD加载与存储 #
4.1 从数组加载 #
mojo
from SIMD import SIMD, DType
def main():
    """Load two 4-lane vectors from one eight-element array and print them."""
    let source = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]

    # First load uses offset 0.
    let first_half = SIMD[DType.float64, 4].load(source, 0)
    print(first_half)

    # Second load uses offset 4.
    let second_half = SIMD[DType.float64, 4].load(source, 4)
    print(second_half)


main()
4.2 存储到数组 #
mojo
from SIMD import SIMD, DType
def main():
    """Store a SIMD vector into a four-element array, then print each element."""
    var target = [0.0, 0.0, 0.0, 0.0]
    let lanes = SIMD[DType.float64, 4](1.0, 2.0, 3.0, 4.0)

    # Store the vector into `target` at offset 0.
    lanes.store(target, 0)

    for element in target:
        print(element)


main()
4.3 指针加载 #
mojo
from SIMD import SIMD, DType
def main():
    """Fill a manually allocated buffer and load it back as one SIMD vector.

    The buffer is released with `free()` after the vector has been printed.
    """
    let buffer = Pointer[Float64].alloc(4)

    # Write 1.0 .. 4.0 into the four slots before reading them back.
    for slot in range(4):
        buffer.store(slot, Float64(slot + 1))

    let lanes = SIMD[DType.float64, 4].load(buffer, 0)
    print(lanes)

    buffer.free()


main()
五、SIMD向量运算实例 #
5.1 向量加法 #
mojo
from SIMD import SIMD, DType
fn vector_add(a: List[Float64], b: List[Float64]) -> List[Float64]:
    """Return the element-wise sum of `a` and `b`.

    Full groups of four elements are added with SIMD; any leftover elements
    (when `len(a)` is not a multiple of 4) are added one at a time.

    Args:
        a: Left operand; its length determines the length of the result.
        b: Right operand; must contain at least `len(a)` elements.

    Returns:
        A new list of `len(a)` elements where `result[i] == a[i] + b[i]`.
    """
    let n = len(a)
    let simd_width = 4
    # Index one past the last element covered by a full SIMD chunk.
    let simd_end = n // simd_width * simd_width

    # Pre-size the output so the vector stores below write into slots that
    # already exist instead of into an empty list.
    var result: List[Float64] = []
    for _ in range(n):
        result.append(0.0)

    for i in range(0, simd_end, simd_width):
        let va = SIMD[DType.float64, 4].load(a, i)
        let vb = SIMD[DType.float64, 4].load(b, i)
        let vr = va + vb
        vr.store(result, i)

    # Scalar tail: a 4-wide load here would read past the end of the inputs.
    for i in range(simd_end, n):
        result[i] = a[i] + b[i]

    return result
def main():
    """Add two four-element lists with `vector_add` and print the sum."""
    let lhs = [1.0, 2.0, 3.0, 4.0]
    let rhs = [5.0, 6.0, 7.0, 8.0]
    let summed = vector_add(lhs, rhs)
    print(summed)


main()
5.2 点积计算 #
mojo
from SIMD import SIMD, DType
fn dot_product(a: List[Float64], b: List[Float64]) -> Float64:
    """Return the dot product of `a` and `b`.

    Full groups of four elements are multiplied and accumulated in a SIMD
    register; leftover elements (when `len(a)` is not a multiple of 4) are
    folded in with a scalar loop.

    Args:
        a: First vector; its length determines how many products are summed.
        b: Second vector; must contain at least `len(a)` elements.

    Returns:
        The sum of `a[i] * b[i]` over all indices of `a` (0.0 when empty).
    """
    let n = len(a)
    let simd_width = 4
    # Index one past the last element covered by a full SIMD chunk.
    let simd_end = n // simd_width * simd_width

    # Four independent partial sums, one per lane.
    var sum_vec = SIMD[DType.float64, 4].splat(0.0)
    for i in range(0, simd_end, simd_width):
        let va = SIMD[DType.float64, 4].load(a, i)
        let vb = SIMD[DType.float64, 4].load(b, i)
        sum_vec = sum_vec + va * vb

    var total = sum_vec.reduce_add()

    # Scalar tail: a 4-wide load here would read past the end of the inputs.
    for i in range(simd_end, n):
        total += a[i] * b[i]

    return total
def main():
    """Compute the dot product of two four-element lists and print it."""
    let lhs = [1.0, 2.0, 3.0, 4.0]
    let rhs = [5.0, 6.0, 7.0, 8.0]
    let dotted = dot_product(lhs, rhs)
    print(f"Dot product: {dotted}")


main()
5.3 矩阵乘法(标量基准实现,未使用SIMD) #
mojo
from SIMD import SIMD, DType
fn matrix_multiply(A: List[List[Float64]], B: List[List[Float64]]) -> List[List[Float64]]:
    """Multiply matrix `A` by matrix `B` with a plain scalar triple loop.

    Both matrices are given as lists of rows. The inner dimension and the
    number of output columns are taken from `B`.

    Args:
        A: Left matrix; each row is indexed up to `len(B)` elements.
        B: Right matrix; must have at least one row, because its first row
            supplies the output column count.

    Returns:
        A new list of `len(A)` rows, each holding `len(B[0])` values.
    """
    let row_count = len(A)
    let inner_dim = len(B)
    let col_count = len(B[0])

    var product: List[List[Float64]] = []
    for r in range(row_count):
        var out_row: List[Float64] = []
        for c in range(col_count):
            # Dot product of row r of A with column c of B.
            var acc: Float64 = 0.0
            for t in range(inner_dim):
                acc += A[r][t] * B[t][c]
            out_row.append(acc)
        product.append(out_row)
    return product
def main():
    """Multiply two 2x2 matrices with `matrix_multiply` and print the product."""
    let left = [[1.0, 2.0], [3.0, 4.0]]
    let right = [[5.0, 6.0], [7.0, 8.0]]
    let product = matrix_multiply(left, right)
    print(product)


main()
六、SIMD掩码操作 #
6.1 条件选择 #
mojo
from SIMD import SIMD, DType
def main():
    """Build a comparison mask and use it to choose between two vectors."""
    let small = SIMD[DType.float64, 4](1.0, 2.0, 3.0, 4.0)
    let large = SIMD[DType.float64, 4](10.0, 20.0, 30.0, 40.0)

    # Mask comes from comparing `small` against a vector of 2.5s.
    let threshold = SIMD[DType.float64, 4].splat(2.5)
    let above = small > threshold

    # NOTE(review): presumably select(x, y) takes x where the mask is set
    # and y elsewhere — confirm the argument order against the SIMD docs.
    let chosen = above.select(large, small)
    print(chosen)


main()
6.2 掩码归约 #
mojo
from SIMD import SIMD, DType
def main():
    """Reduce a comparison mask to one value and print it as a count."""
    let values = SIMD[DType.float64, 4](1.0, 2.0, 3.0, 4.0)
    let above = values > SIMD[DType.float64, 4].splat(2.0)

    # NOTE(review): this relies on reduce_add() being defined for a
    # comparison mask and yielding the number of set lanes — confirm for
    # the Mojo version in use.
    let hits = above.reduce_add()
    print(f"Count > 2.0: {hits}")


main()
七、SIMD性能优化 #
7.1 数据对齐 #
mojo
from SIMD import SIMD, DType
def main():
    """Allocate an aligned buffer, initialise it, and read it with an aligned load.

    The four slots that the load covers are written first, so the vector
    never contains uninitialised memory, and the loaded vector is printed
    so the load has an observable result.
    """
    # NOTE(review): arguments are presumably (element count, alignment in
    # bytes) — confirm against the Pointer API of the Mojo version in use.
    let aligned_ptr = Pointer[Float64].alloc_aligned(16, 64)

    # Freshly allocated memory has no defined contents; fill the lanes that
    # are about to be loaded.
    for i in range(4):
        aligned_ptr.store(i, Float64(i + 1))

    let v = SIMD[DType.float64, 4].load_aligned(aligned_ptr, 0)
    print(v)

    aligned_ptr.free()


main()
7.2 循环展开 #
mojo
from SIMD import SIMD, DType
fn process_array(inout data: List[Float64]):
    """Walk `data` in place with a loop unrolled by two SIMD vectors.

    Each iteration handles eight elements as two independent 4-lane
    load/store pairs. The values are written back unchanged; the function
    only demonstrates the unrolled loop shape.

    Args:
        data: List to process. Declared `inout` because the stores write
            into it; `fn` arguments are immutable otherwise.
    """
    let n = len(data)
    let step = 8
    # Only iterate over complete 8-element groups so that neither 4-wide
    # access can run past the end of the list.
    let unrolled_end = n // step * step

    for i in range(0, unrolled_end, step):
        let v1 = SIMD[DType.float64, 4].load(data, i)
        let v2 = SIMD[DType.float64, 4].load(data, i + 4)
        v1.store(data, i)
        v2.store(data, i + 4)

    # Elements in [unrolled_end, n) are left as they are: the operation is
    # an identity copy, so the tail needs no scalar work here.
def main():
    """Run `process_array` on an eight-element list and print the list."""
    var samples = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
    process_array(samples)
    print(samples)


main()
7.3 避免数据依赖 #
mojo
from SIMD import SIMD, DType
fn independent_operations(inout data: List[Float64]):
    """Replace every element `x` of `data` with `x * 2.0 + 1.0`, in place.

    Each 4-element chunk is computed from its own input only, so no
    iteration depends on the result of another.

    Args:
        data: List to update. Declared `inout` because results are written
            back into it; `fn` arguments are immutable otherwise.
    """
    let n = len(data)
    let simd_width = 4
    # Index one past the last element covered by a full SIMD chunk.
    let simd_end = n // simd_width * simd_width

    for i in range(0, simd_end, simd_width):
        let v = SIMD[DType.float64, 4].load(data, i)
        let result = v * 2.0 + 1.0
        result.store(data, i)

    # Scalar tail: a 4-wide access here would run past the end of the list.
    for i in range(simd_end, n):
        data[i] = data[i] * 2.0 + 1.0
def main():
    """Run `independent_operations` on a four-element list and print the list."""
    var samples = [1.0, 2.0, 3.0, 4.0]
    independent_operations(samples)
    print(samples)


main()
八、SIMD最佳实践 #
8.1 选择合适的向量宽度 #
mojo
from SIMD import SIMD, DType
def main():
    """Create float32 SIMD vectors with 4, 8 and 16 lanes.

    With 32-bit elements these lane counts correspond to 128, 256 and 512
    bits of data. The vectors are only constructed; a fixed message is
    printed afterwards.
    """
    let four_lanes = SIMD[DType.float32, 4]()
    let eight_lanes = SIMD[DType.float32, 8]()
    let sixteen_lanes = SIMD[DType.float32, 16]()

    print("Different SIMD widths created")


main()
8.2 处理剩余元素 #
mojo
from SIMD import SIMD, DType
fn process_with_remainder(inout data: List[Float64]):
    """Double every element of `data` in place, including the leftover tail.

    Complete groups of four elements are doubled with SIMD; the elements
    that do not fill a whole vector are doubled one at a time.

    Args:
        data: List to update. Declared `inout` because both loops write
            into it; `fn` arguments are immutable otherwise.
    """
    let n = len(data)
    let simd_width = 4
    # Index one past the last element covered by a full SIMD chunk
    # (n rounded down to a multiple of the vector width).
    let simd_end = n // simd_width * simd_width

    for i in range(0, simd_end, simd_width):
        let v = SIMD[DType.float64, 4].load(data, i)
        let result = v * 2.0
        result.store(data, i)

    # Scalar tail for the remaining n - simd_end elements.
    for i in range(simd_end, n):
        data[i] *= 2.0
def main():
    """Run `process_with_remainder` on a seven-element list and print the list.

    Seven is not a multiple of four, so the call exercises both the SIMD
    loop and the scalar remainder loop.
    """
    var samples = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
    process_with_remainder(samples)
    print(samples)


main()
8.3 使用SIMD友好算法 #
mojo
from SIMD import SIMD, DType
fn sum_array(data: List[Float64]) -> Float64:
    """Return the sum of all elements of `data`.

    Complete groups of four elements are accumulated lane-wise in a SIMD
    register and reduced once at the end; leftover elements (when the
    length is not a multiple of 4) are added with a scalar loop.

    Args:
        data: Values to sum.

    Returns:
        The total of every element (0.0 for an empty list).
    """
    let n = len(data)
    let simd_width = 4
    # Index one past the last element covered by a full SIMD chunk.
    let simd_end = n // simd_width * simd_width

    # Four running partial sums, one per lane.
    var partial = SIMD[DType.float64, 4].splat(0.0)
    for i in range(0, simd_end, simd_width):
        let v = SIMD[DType.float64, 4].load(data, i)
        partial = partial + v

    var total = partial.reduce_add()

    # Scalar tail: a 4-wide load here would read past the end of the list.
    for i in range(simd_end, n):
        total += data[i]

    return total
def main():
    """Sum an eight-element list with `sum_array` and print the total."""
    let samples = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
    print(f"Sum: {sum_array(samples)}")


main()
九、总结 #
本章学习了:
- SIMD概念与优势
- SIMD类型创建
- SIMD运算操作
- 数据加载与存储
- 向量运算实例
- 掩码操作
- 性能优化技巧
下一章,我们将学习Python互操作。
最后更新:2026-03-27