Matrix Operations#

%cd ../..
import set_env
/media/pc/data/4tb/lxw/home/lxw/tvm-book/doc/tutorials
import numpy as np
import tvm
from tvm import te
from tvm.ir.module import IRModule

The following illustrates a few simple matrix operations.

Matrix Transpose#

n = te.var('n')
m = te.var('m')
A = te.placeholder((n, m), name='A')
B = te.compute((m, n), lambda i, j: A[j, i], 'A^T')
te_func = te.create_prim_func([A, B])
te_func.show()
# from tvm.script import tir as T
@T.prim_func
def func(var_A: T.handle, var_A^T: T.handle):
    # function attr dict
    T.func_attr({"global_symbol": "main", "tir.noalias": True})
    m = T.var("int32")
    n = T.var("int32")
    A = T.match_buffer(var_A, [n, m], dtype="float32")
    A^T = T.match_buffer(var_A^T, [m, n], dtype="float32")
    # body
    # with T.block("root")
    for i0, i1 in T.grid(m, n):
        with T.block("A^T"):
            i, j = T.axis.remap("SS", [i0, i1])
            T.reads(A[j, i])
            T.writes(A^T[i, j])
            A^T[i, j] = A[j, i]
a_np = np.arange(12, dtype="float32").reshape((3, 4))
b_np = a_np.T # reference result
a_np, b_np
(array([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.]], dtype=float32),
 array([[ 0.,  4.,  8.],
        [ 1.,  5.,  9.],
        [ 2.,  6., 10.],
        [ 3.,  7., 11.]], dtype=float32))
a_nd = tvm.nd.array(a_np)
b_nd = tvm.nd.empty(b_np.shape, dtype="float32")
rt_lib = tvm.build(te_func, target="llvm")
rt_lib(a_nd, b_nd)
b_nd
<tvm.nd.NDArray shape=(4, 3), cpu(0)>
array([[ 0.,  4.,  8.],
       [ 1.,  5.,  9.],
       [ 2.,  6., 10.],
       [ 3.,  7., 11.]], dtype=float32)
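
To confirm the result, compare it against the NumPy baseline (a minimal check, assuming the cells above have been executed):

np.testing.assert_allclose(b_nd.numpy(), b_np)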

Element-wise Addition#

a = np.arange(16).reshape(4, 4)
b = np.arange(16, 0, -1).reshape(4, 4)
c_np = a + b # reference result
c_np
array([[16, 16, 16, 16],
       [16, 16, 16, 16],
       [16, 16, 16, 16],
       [16, 16, 16, 16]])
from tvm.script import tir as T

@tvm.script.ir_module
class MyAdd:
    @T.prim_func
    def add(A: T.Buffer[(4, 4), "int64"],
            B: T.Buffer[(4, 4), "int64"],
            C: T.Buffer[(4, 4), "int64"]):
        T.func_attr({"global_symbol": "add"})
        for i, j in T.grid(4, 4):
            with T.block("C"):
                vi, vj = T.axis.remap("SS", [i, j])
                C[vi, vj] = A[vi, vj] + B[vi, vj]

rt_lib = tvm.build(MyAdd, target="llvm")
a_tvm = tvm.nd.array(a)
b_tvm = tvm.nd.array(b)
c_tvm = tvm.nd.array(np.empty((4, 4), dtype=np.int64))
rt_lib["add"](a_tvm, b_tvm, c_tvm)
np.testing.assert_allclose(c_tvm.numpy(), c_np, rtol=1e-5)
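
For comparison, the same element-wise addition can also be written in TE and lowered to an equivalent primitive function. This is a sketch that assumes a_tvm, b_tvm, and c_np from the cells above are still in scope:

A = te.placeholder((4, 4), "int64", name="A")
B = te.placeholder((4, 4), "int64", name="B")
C = te.compute((4, 4), lambda i, j: A[i, j] + B[i, j], name="C")
te_add = te.create_prim_func([A, B, C]).with_attr({"global_symbol": "add"})
rt_lib = tvm.build(tvm.IRModule({"add": te_add}), target="llvm")
c_te = tvm.nd.array(np.empty((4, 4), dtype=np.int64))  # output buffer
rt_lib["add"](a_tvm, b_tvm, c_te)
np.testing.assert_allclose(c_te.numpy(), c_np, rtol=1e-5)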

Broadcast Addition#

Prepare the data:

a = np.arange(16).reshape(4, 4)
b = np.arange(4, 0, -1).reshape(4)
c_np = a + b # reference result
c_np
array([[ 4,  4,  4,  4],
       [ 8,  8,  8,  8],
       [12, 12, 12, 12],
       [16, 16, 16, 16]])

TIR implementation:

@tvm.script.ir_module
class MyAdd:
    @T.prim_func
    def add(A: T.Buffer[(4, 4), "int64"],
            B: T.Buffer[(4,), "int64"],
            C: T.Buffer[(4, 4), "int64"]):
        T.func_attr({"global_symbol": "add", "tir.noalias": True})
        for i, j in T.grid(4, 4):
            with T.block("C"):
                vi, vj = T.axis.remap("SS", [i, j])
                C[vi, vj] = A[vi, vj] + B[vj]

rt_lib = tvm.build(MyAdd, target="llvm")
a_tvm = tvm.nd.array(a)
b_tvm = tvm.nd.array(b)
c_tvm = tvm.nd.array(np.empty((4, 4), dtype=np.int64))
rt_lib["add"](a_tvm, b_tvm, c_tvm)
np.testing.assert_allclose(c_tvm.numpy(), c_np, rtol=1e-5)

TE implementation:

A = te.placeholder((4, 4), "int64", name="A")
B = te.placeholder((4,), "int64", name="B")
C = te.compute((4, 4), lambda i, j: A[i, j] + B[j], name="C")
te_add = te.create_prim_func([A, B, C]).with_attr({"global_symbol": "add"})

AddTE = tvm.IRModule({"add": te_add})
# inspect the generated script
AddTE.show()
# from tvm.script import tir as T
@tvm.script.ir_module
class Module:
    @T.prim_func
    def add(A: T.Buffer[(4, 4), "int64"], B: T.Buffer[4, "int64"], C: T.Buffer[(4, 4), "int64"]):
        # function attr dict
        T.func_attr({"global_symbol": "add", "tir.noalias": True})
        # body
        # with T.block("root")
        for i0, i1 in T.grid(4, 4):
            with T.block("C"):
                i, j = T.axis.remap("SS", [i0, i1])
                T.reads(A[i, j], B[j])
                T.writes(C[i, j])
                C[i, j] = A[i, j] + B[j]
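
The TE-derived module can be built and verified against the same baseline (a sketch that reuses a_tvm, b_tvm, and c_np from the broadcast cells above):

rt_lib = tvm.build(AddTE, target="llvm")
c_te = tvm.nd.array(np.empty((4, 4), dtype=np.int64))  # output buffer
rt_lib["add"](a_tvm, b_tvm, c_te)
np.testing.assert_allclose(c_te.numpy(), c_np, rtol=1e-5)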
    

Matrix Multiplication#

Matrix multiplication is one of the most widely used operations in scientific computing and deep learning, and is commonly referred to as GEneral Matrix Multiply (GEMM).

Given \(A\in\mathbb R^{n\times l}\) and \(B \in\mathbb R^{l\times m}\), if \(C=AB\), then \(C \in\mathbb R^{n\times m}\) with

\[C_{i,j} = \sum_{k=1}^l A_{i,k} B_{k,j}.\]
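
As a quick sanity check of this definition, the triple loop below reproduces NumPy's matrix product on small, arbitrarily chosen shapes (the variable names here are throwaway and independent of the TE code that follows):

rng = np.random.default_rng(0)
a = rng.random((3, 4))
b = rng.random((4, 5))
c = np.zeros((3, 5))
for i in range(3):          # rows of a
    for j in range(5):      # columns of b
        for k in range(4):  # reduction axis
            c[i, j] += a[i, k] * b[k, j]
np.testing.assert_allclose(c, a @ b)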

TE implementation:

def matmul(n, m, l):
    """Return the computing expression of matrix multiplication
    A : n x l matrix
    B : l x m matrix
    C : n x m matrix with C = A B
    """
    k = te.reduce_axis((0, l), name='k')
    A = te.placeholder((n, l), name='A')
    B = te.placeholder((l, m), name='B')
    C = te.compute((n, m),
                    lambda x, y: te.sum(A[x, k] * B[k, y], axis=k),
                    name='C')
    return A, B, C
n = 100
A, B, C = matmul(n, n, n)
te_func = te.create_prim_func([A, B, C]).with_attr({"global_symbol": "matmul"})
MMTE = tvm.IRModule({"matmul": te_func})
MMTE.show()
# from tvm.script import tir as T
@tvm.script.ir_module
class Module:
    @T.prim_func
    def matmul(A: T.Buffer[(100, 100), "float32"], B: T.Buffer[(100, 100), "float32"], C: T.Buffer[(100, 100), "float32"]):
        # function attr dict
        T.func_attr({"global_symbol": "matmul", "tir.noalias": True})
        # body
        # with T.block("root")
        for i0, i1, i2 in T.grid(100, 100, 100):
            with T.block("C"):
                x, y, k = T.axis.remap("SSR", [i0, i1, i2])
                T.reads(A[x, k], B[k, y])
                T.writes(C[x, y])
                with T.init():
                    C[x, y] = T.float32(0)
                C[x, y] = C[x, y] + A[x, k] * B[k, y]
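
To close the loop, the module can be built and checked against a NumPy baseline (a sketch; the random inputs and the rtol value are assumptions, not part of the original tutorial):

rt_lib = tvm.build(MMTE, target="llvm")
a_np = np.random.uniform(size=(n, n)).astype("float32")
b_np = np.random.uniform(size=(n, n)).astype("float32")
c_np = a_np @ b_np  # NumPy baseline
a_nd = tvm.nd.array(a_np)
b_nd = tvm.nd.array(b_np)
c_nd = tvm.nd.empty((n, n), dtype="float32")  # output buffer
rt_lib["matmul"](a_nd, b_nd, c_nd)
np.testing.assert_allclose(c_nd.numpy(), c_np, rtol=1e-5)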