测试 Roofline 模型#

import set_env
import warnings
warnings.filterwarnings("ignore", category=UserWarning) # 忽略用户警告
import csv
import json
import os
import platform
from io import StringIO

import numpy as np

import tvm.testing
import tvm.utils
from tvm import relay, rpc
from tvm.contrib import utils
from tvm.contrib.debugger import debug_executor
from tvm.relay.testing import mlp
from tvm.runtime import profiler_vm
from tvm.runtime.profiling import Report
from tvm.script import tir as T

estimate_peak_flops_cpu#

for dtype in ["float32", "int8", "int32"]:
    server = rpc.Server(key="roofline_flops_cpu")
    remote = rpc.connect("127.0.0.1", server.port, key="roofline_flops_cpu")
    target = tvm.target.Target("llvm -mattr=+fma,+avx2")
    dev = remote.device(str(target))
    # This test uses vectorized instructions so we need a target that supports them
    flops = tvm.utils.roofline.x86.estimate_peak_fma_vector_flops(target, dev, remote, dtype)
    # Assume we can achieve 1 GFLOP/s per thread, which is 1 FLOP per cycle on a 1GHz cpu.
    assert (
        flops > 10**9 and flops < 10**14
    ), f"FLOP/s should be between 10^9 and 10^14, but it is {flops}"
2024-01-19 14:00:46.799 INFO bind to 0.0.0.0:9091
2024-01-19 14:00:46.800 INFO connected from ('127.0.0.1', 37002)
2024-01-19 14:00:46.801 INFO start serving at /tmp/tmpb673zo51
2024-01-19 14:00:46.940 INFO load_module /tmp/tmpb673zo51/peak_fma_flops.tar
2024-01-19 14:00:48.501 INFO bind to 0.0.0.0:9092
2024-01-19 14:00:48.538 INFO connected from ('127.0.0.1', 54284)
2024-01-19 14:00:48.539 INFO start serving at /tmp/tmp_2qik6wl
2024-01-19 14:00:48.684 INFO load_module /tmp/tmp_2qik6wl/peak_fma_flops.tar
2024-01-19 14:00:51.826 INFO bind to 0.0.0.0:9091
2024-01-19 14:00:51.880 INFO connected from ('127.0.0.1', 43448)
2024-01-19 14:00:51.881 INFO start serving at /tmp/tmp_8y155vs
2024-01-19 14:00:52.016 INFO load_module /tmp/tmp_8y155vs/peak_fma_flops.tar

estimate_peak_flops_gpu#

from tvm_book.config.env import set_cudnn
set_cudnn() # 设置 CUDA 环境
server = rpc.Server(key="roofline_flops_gpu")
remote = rpc.connect("127.0.0.1", server.port, key="roofline_flops_gpu")
target = tvm.target.Target("cuda")
dev = remote.device(str(target))
# This test uses vectorized instructions so we need a target that supports them
flops = tvm.utils.roofline.cuda.estimate_peak_flops_tensorcore(target, dev, remote)
# should be able to hit a TFLOP/s with tensor cores
assert (
    flops > 10**12 and flops < 10**14
), f"FLOP/s should be between 10^12 and 10^14, but it is {flops}"

# this test should run on all gpus
flops = tvm.utils.roofline.cuda.estimate_peak_flops_fma(target, dev, remote, "float32")
# most gpus since 2016 should be able to hit a TFLOP/s with fma instructions
assert (
    flops > 10**12 and flops < 10**14
), f"FLOP/s should be between 10^12 and 10^14, but it is {flops}"
2024-01-19 14:00:54.378 INFO bind to 0.0.0.0:9092
2024-01-19 14:00:54.410 INFO connected from ('127.0.0.1', 55232)
2024-01-19 14:00:54.411 INFO start serving at /tmp/tmpmkby12ft
2024-01-19 14:00:56.002 INFO load_module /tmp/tmpmkby12ft/peak_mma_flops.tar
2024-01-19 14:00:56.861 INFO load_module /tmp/tmpmkby12ft/peak_fma_flops.tar

estimate_peak_bandwidth_cpu#

server = rpc.Server(key="roofline_bandwidth_cpu")
remote = rpc.connect("127.0.0.1", server.port, key="roofline_bandwidth_cpu")
target = tvm.target.Target("llvm -mattr=+fma,+avx2")
dev = remote.device(str(target))
# This test uses vectorized instructions so we need a target that supports them
bandwidth = tvm.utils.roofline.x86.estimate_peak_bandwidth_dram(target, dev, remote)
# Assume we can achieve 1 GB/s. DDR2 should transfer somewhere around 6
# GB/s, so this should leave enough wiggle room.
assert (
    bandwidth > 10**9 and bandwidth < 10**12
), f"Bandwidth should be between 10^9 and 10^12, but it is {bandwidth}"
2024-01-19 14:00:58.037 INFO bind to 0.0.0.0:9091
2024-01-19 14:00:58.094 INFO connected from ('127.0.0.1', 43462)
2024-01-19 14:00:58.095 INFO start serving at /tmp/tmpcxaxh3kt
2024-01-19 14:00:58.229 INFO load_module /tmp/tmpcxaxh3kt/peak_bandwidth.tar

estimate_peak_bandwidth_gpu#

server = rpc.Server(key="roofline_bandwidth_gpu")
remote = rpc.connect("127.0.0.1", server.port, key="roofline_bandwidth_gpu")
target = tvm.target.Target("cuda")
dev = remote.device(str(target))
# This test uses vectorized instructions so we need a target that supports them
bandwidth = tvm.utils.roofline.cuda.estimate_peak_bandwidth_global_mem(target, dev, remote)
# should be able to hit a 100 GB/s on a GPU. GTX 280 hits 140 GB/s and
# it is really old.
assert (
    bandwidth > 10**11 and bandwidth < 10**13
), f"Bandwidth should be between 10^9 and 10^12, but it is {bandwidth}"
2024-01-19 14:01:11.762 INFO bind to 0.0.0.0:9092
2024-01-19 14:01:11.815 INFO connected from ('127.0.0.1', 48560)
2024-01-19 14:01:11.816 INFO start serving at /tmp/tmpwz5b9dc6
2024-01-19 14:01:12.493 INFO load_module /tmp/tmpwz5b9dc6/peak_bandwidth.tar

roofline_analysis#

target, dev = "llvm -mattr=+fma,+avx2", "cuda"
a = relay.var("a", relay.TensorType((512, 512), "float32"))
b = relay.var("b", relay.TensorType((512, 512), "float32"))
c = relay.nn.dense(a, b)
mod = tvm.IRModule.from_expr(relay.Function([a, b], c))
params = {}

server = rpc.Server(key="roofline")
remote = rpc.connect("127.0.0.1", server.port, key="roofline")
dev = remote.device(target)

report = tvm.utils.roofline_analysis(mod, params, target, dev, remote=remote)
print(report)

assert "Bound" in report.table()
assert "Percent of Theoretical Optimal" in report.table()
for call in report.calls:
    if "Percent of Theoretical Optimal" in call:
        if target.startswith("llvm"):
            # Ideally we'd like a little tighter bound here, but it is hard to
            # know how well this dense will perform without tuning. And we
            # don't have an operator that uses a specific number of flops.
            assert call["Percent of Theoretical Optimal"].ratio >= 5.0
        elif target == "cuda":
            # The cuda gpu kernel is really poorly optimized
            assert 90 >= call["Percent of Theoretical Optimal"].ratio >= 0.01
Name                   Duration (us)  Percent  Device  Count                                          Argument Shapes  Arithmetic Intensity  Bandwidth   Bound  Estimated FLOPs   FLOP/s              Hash  Loaded Bytes  Percent of Theoretical Optimal  VM::Argument Shapes  
vm_mod_fused_nn_dense       1,911.23    95.44    cpu0      1  float32[512, 512], float32[512, 512], float32[512, 512]                    18    7.7e+09  memory      268,435,456  1.4e+11  6bf92d0ede030db0    14,696,448                              32                       
VM::AllocStorage               13.25     0.66    cpu0      1                                                                                                                                                                                                float32[512, 512]  
VM::AllocTensor                 1.98     0.10    cpu0      1                                        float32[512, 512]                                                                                                                                                          
VM::UnknownOp                   1.07     0.05    cpu0      3                                                                                                                                                                                                                   
----------                                                                                                                                                                                                                                                                     
Sum                         1,927.53    96.25              6                                                                                                        268,435,456                               14,696,448                                                       
Total                       2,002.57             cpu0      1                                                                                                                                                                                                                   

Configuration
-------------
Number of threads: 24
Estimated Peak Bandwidth (DRAM, byte/second): 2.4e+10
Executor: VM
Estimated Peak FLOP/s (float32 FMA): 2e+12
2024-01-19 14:01:28.584 INFO bind to 0.0.0.0:9091
2024-01-19 14:01:28.618 INFO connected from ('127.0.0.1', 51894)
2024-01-19 14:01:28.619 INFO start serving at /tmp/tmpmu073udv
One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.
2024-01-19 14:01:28.871 INFO load_module /tmp/tmpmu073udv/roofline_lib.tar
2024-01-19 14:01:30.796 INFO load_module /tmp/tmpmu073udv/peak_fma_flops.tar
2024-01-19 14:01:31.740 INFO load_module /tmp/tmpmu073udv/peak_bandwidth.tar