Testing automatic quantization#

import numpy as np

import tvm
from tvm import te
from tvm import relay
from tvm.relay import testing
from tvm.relay.expr import Call
from tvm.topi.utils import get_const_tuple


def quantize_and_build(out, skip_conv_layers=[]):
    """Wrap out in a Relay function, quantize the workload, and build it for LLVM."""
    f = relay.Function(relay.analysis.free_vars(out), out)
    mod, params = testing.create_workload(f)

    with relay.quantize.qconfig(skip_conv_layers=skip_conv_layers):
        qmod = relay.quantize.quantize(mod, params)

    relay.build(qmod, "llvm", params=params)
    return mod, qmod
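
A small sketch (not part of the original test): qconfig is a context manager, and relay.quantize.current_qconfig() shows the configuration that quantize() will see inside the with block.

with relay.quantize.qconfig(skip_conv_layers=[0]):
    # the configured values only apply within this scope
    print(relay.quantize.current_qconfig())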
relay.transform.FuseOps??
Signature: relay.transform.FuseOps(fuse_opt_level=-1)
Source:   
def FuseOps(fuse_opt_level=-1):
    """Fuse operators in an expr to a larger operator according to some rules.

    Parameters
    ----------
    fuse_opt_level : int
        The level of fuse optimization. -1 indicates that the level will be
        inferred from pass context.

    Returns
    -------
    ret : tvm.transform.Pass
        The registered pass for operator fusion.
    """
    return _ffi_api.FuseOps(fuse_opt_level)
File:      /media/pc/data/lxw/ai/tvm/xinetzone/__pypackages__/3.10/lib/tvm/relay/transform/transform.py
Type:      function
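
relay.build runs FuseOps as part of its standard optimization pipeline. A minimal standalone sketch (the names x, y, and fmod are illustrative) shows that with the default fuse_opt_level=-1 the fusion level is taken from the enclosing PassContext:

x = relay.var("x", shape=(1, 16, 64, 64))
y = relay.nn.relu(x + relay.const(1.0))
fmod = tvm.IRModule.from_expr(relay.Function([x], y))
fmod = relay.transform.InferType()(fmod)
with tvm.transform.PassContext(opt_level=3):
    fused = relay.transform.FuseOps()(fmod)  # fusion level inferred from the context
fused.show()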

The right operand of the multiply operator is not a constant#

data = relay.var("data", shape=(1, 16, 64, 64))
multiplier = relay.sigmoid(relay.var("data", shape=(1, 16, 1, 1)))
conv = relay.nn.conv2d(
    data, relay.var("weight"), kernel_size=(3, 3), padding=(1, 1), channels=16
)
act = relay.nn.relu(data=conv)

# Case 1: the right operand of the multiply is a separate (non-constant) input.
mod, qmod = quantize_and_build(act * multiplier)
mod.show()
qmod.show()

# Case 2: the right operand is computed from the activation itself.
pool = relay.nn.global_avg_pool2d(data=act)
mod, qmod = quantize_and_build(act * pool)
mod.show()
qmod.show()
def @main(%data: Tensor[(1, 16, 64, 64), float32] /* ty=Tensor[(1, 16, 64, 64), float32] */, %data1: Tensor[(1, 16, 1, 1), float32] /* ty=Tensor[(1, 16, 1, 1), float32] */) -> Tensor[(1, 16, 64, 64), float32] {
  %0 = nn.conv2d(%data, meta[relay.Constant][0], padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 64, 64), float32] */;
  %1 = nn.relu(%0) /* ty=Tensor[(1, 16, 64, 64), float32] */;
  %2 = sigmoid(%data1) /* ty=Tensor[(1, 16, 1, 1), float32] */;
  multiply(%1, %2) /* ty=Tensor[(1, 16, 64, 64), float32] */
}
def @main(%data: Tensor[(1, 16, 64, 64), float32] /* ty=Tensor[(1, 16, 64, 64), float32] */, %data1: Tensor[(1, 16, 1, 1), float32] /* ty=Tensor[(1, 16, 1, 1), float32] */) -> Tensor[(1, 16, 64, 64), float32] {
  %0 = multiply(%data, 16f /* ty=float32 */) /* ty=Tensor[(1, 16, 64, 64), float32] */;
  %1 = round(%0) /* ty=Tensor[(1, 16, 64, 64), float32] */;
  %2 = clip(%1, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 64, 64), float32] */;
  %3 = cast(%2, dtype="int8") /* ty=Tensor[(1, 16, 64, 64), int8] */;
  %4 = nn.conv2d(%3, meta[relay.Constant][0] /* ty=Tensor[(16, 16, 3, 3), int8] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3], out_dtype="int32") /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %5 = nn.relu(%4) /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %6 = add(%5, 256 /* ty=int32 */) /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %7 = right_shift(%6, 9 /* ty=int32 */) /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %8 = clip(%7, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %9 = cast(%8, dtype="int8") /* ty=Tensor[(1, 16, 64, 64), int8] */;
  %10 = annotation.stop_fusion(%9) /* ty=Tensor[(1, 16, 64, 64), int8] */;
  %11 = sigmoid(%data1) /* ty=Tensor[(1, 16, 1, 1), float32] */;
  %12 = multiply(%11, 16f /* ty=float32 */) /* ty=Tensor[(1, 16, 1, 1), float32] */;
  %13 = round(%12) /* ty=Tensor[(1, 16, 1, 1), float32] */;
  %14 = clip(%13, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 1, 1), float32] */;
  %15 = cast(%10, dtype="int32") /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %16 = cast(%14, dtype="int32") /* ty=Tensor[(1, 16, 1, 1), int32] */;
  %17 = multiply(%15, %16) /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %18 = add(%17, 8 /* ty=int32 */) /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %19 = right_shift(%18, 4 /* ty=int32 */) /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %20 = clip(%19, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %21 = cast(%20, dtype="int8") /* ty=Tensor[(1, 16, 64, 64), int8] */;
  %22 = annotation.stop_fusion(%21) /* ty=Tensor[(1, 16, 64, 64), int8] */;
  %23 = cast(%22, dtype="float32") /* ty=Tensor[(1, 16, 64, 64), float32] */;
  multiply(%23, 0.0625f /* ty=float32 */) /* ty=Tensor[(1, 16, 64, 64), float32] */
}
def @main(%data: Tensor[(1, 16, 64, 64), float32] /* ty=Tensor[(1, 16, 64, 64), float32] */) -> Tensor[(1, 16, 64, 64), float32] {
  %0 = nn.conv2d(%data, meta[relay.Constant][0], padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, 64, 64), float32] */;
  %1 = nn.relu(%0) /* ty=Tensor[(1, 16, 64, 64), float32] */;
  %2 = nn.global_avg_pool2d(%1) /* ty=Tensor[(1, 16, 1, 1), float32] */;
  multiply(%1, %2) /* ty=Tensor[(1, 16, 64, 64), float32] */
}
def @main(%data: Tensor[(1, 16, 64, 64), float32] /* ty=Tensor[(1, 16, 64, 64), float32] */) -> Tensor[(1, 16, 64, 64), float32] {
  %0 = multiply(%data, 16f /* ty=float32 */) /* ty=Tensor[(1, 16, 64, 64), float32] */;
  %1 = round(%0) /* ty=Tensor[(1, 16, 64, 64), float32] */;
  %2 = clip(%1, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 64, 64), float32] */;
  %3 = cast(%2, dtype="int8") /* ty=Tensor[(1, 16, 64, 64), int8] */;
  %4 = nn.conv2d(%3, meta[relay.Constant][0] /* ty=Tensor[(16, 16, 3, 3), int8] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3], out_dtype="int32") /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %5 = nn.relu(%4) /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %6 = add(%5, 256 /* ty=int32 */) /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %7 = right_shift(%6, 9 /* ty=int32 */) /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %8 = clip(%7, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %9 = cast(%8, dtype="int8") /* ty=Tensor[(1, 16, 64, 64), int8] */;
  %10 = annotation.stop_fusion(%9) /* ty=Tensor[(1, 16, 64, 64), int8] */;
  %11 = cast(%8, dtype="int8") /* ty=Tensor[(1, 16, 64, 64), int8] */;
  %12 = annotation.stop_fusion(%11) /* ty=Tensor[(1, 16, 64, 64), int8] */;
  %13 = cast(%12, dtype="int32") /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %14 = cast(%10, dtype="int32") /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %15 = nn.global_avg_pool2d(%13) /* ty=Tensor[(1, 16, 1, 1), int32] */;
  %16 = multiply(%14, %15) /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %17 = annotation.stop_fusion(%16) /* ty=Tensor[(1, 16, 64, 64), int32] */;
  %18 = cast(%17, dtype="float32") /* ty=Tensor[(1, 16, 64, 64), float32] */;
  multiply(%18, 0.00390625f /* ty=float32 */) /* ty=Tensor[(1, 16, 64, 64), float32] */
}

Skipping convolutions#

data = relay.var("data", shape=(1, 16, 64, 64))
np_weight = np.random.rand(16, 16, 3, 3)
conv0_weight = relay.Constant(tvm.nd.array(np_weight)).astype("float32")
conv1_weight = relay.Constant(tvm.nd.array(np_weight)).astype("float32")
multiplier = relay.sigmoid(relay.var("data", shape=(1, 16, 1, 1)))

conv0 = relay.nn.conv2d(data, conv0_weight, kernel_size=(3, 3), padding=(1, 1), channels=16)
act0 = relay.nn.relu(data=conv0)
conv1 = relay.nn.conv2d(act0, conv1_weight, kernel_size=(3, 3), padding=(1, 1), channels=16)
act1 = relay.nn.relu(data=conv1)

# skip_conv_layers lists the indices of conv2d layers to leave unquantized.
quantize_and_build(act1 * multiplier)                                       # quantize both convs
quantize_and_build(act1 * multiplier, skip_conv_layers=[0])                 # skip the first conv
quantize_and_build(act1 * multiplier, skip_conv_layers=[1])                 # skip the second conv
mod, qmod = quantize_and_build(act1 * multiplier, skip_conv_layers=[0, 1])  # skip both convs
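
A hedged check (not in the original test; assumes skipped conv layers stay in float32 in the quantized module): print the result dtype of each nn.conv2d call to make the effect of skip_conv_layers visible.

def _dump_conv_dtypes(node):
    # qmod is the module produced with skip_conv_layers=[0, 1] above
    if isinstance(node, Call) and node.op.name == "nn.conv2d":
        print("conv2d ->", node.checked_type.dtype)

relay.analysis.post_order_visit(qmod["main"], _dump_conv_dtypes)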

stop_quantize#

data = relay.var("data", shape=(1, 16, 64, 64))
np_weight0 = np.random.rand(16, 16, 3, 3)
conv0_weight = relay.Constant(tvm.nd.array(np_weight0)).astype("float32")
np_weight1 = np.random.rand(16, 16, 1, 1)
conv1_weight = relay.Constant(tvm.nd.array(np_weight1)).astype("float32")
multiplier = relay.sigmoid(relay.var("data", shape=(1, 16, 1, 1)))

conv0 = relay.nn.conv2d(data, conv0_weight, kernel_size=(3, 3), padding=(1, 1), channels=16)
act0 = relay.nn.relu(data=conv0)

pool = relay.nn.global_avg_pool2d(data=act0)

conv1 = relay.nn.conv2d(pool, conv1_weight, kernel_size=(1, 1), padding=(0, 0), channels=16)
act1 = relay.nn.relu(data=conv1)

mod, qmod = quantize_and_build(act1 * multiplier)
mod.show()
qmod.show()
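
A small helper sketch (not in the original test): printing each call's op name and result dtype shows which parts of the quantized module run in int8/int32 and which remain in float32.

def _dump_dtypes(node):
    if isinstance(node, Call):
        print(node.op.name, "->", node.checked_type.dtype)

relay.analysis.post_order_visit(qmod["main"], _dump_dtypes)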

batch_flatten#

data = relay.var("data", shape=(1, 16, 64, 64), dtype="float32")

out = relay.nn.conv2d(
    data, relay.var("weight"), kernel_size=(3, 3), padding=(1, 1), channels=16
)

out = relay.nn.batch_flatten(out)

mod, qmod = quantize_and_build(out)

def _check_batch_flatten(node):
    if isinstance(node, Call):
        if node.op.name == "nn.batch_flatten":
            assert node.checked_type.dtype == "int8"

# check if batch_flatten is quantized
relay.analysis.post_order_visit(qmod["main"], _check_batch_flatten)

batch_matmul#

data = relay.var("data", shape=(1, 4, 16, 16))
data2 = relay.sigmoid(relay.var("data", shape=(4, 16, 64)))
out = relay.nn.conv2d(data, relay.var("weight"), kernel_size=(3, 3), padding=(1, 1), channels=8)

out = relay.nn.batch_flatten(out)
out = relay.reshape(out, [1, 32, 64])
out = relay.nn.batch_matmul(out, data2)

mod, qmod = quantize_and_build(out)

def _check_batch_matmul(node):
    if isinstance(node, Call):
        if node.op.name in ["nn.batch_matmul", "nn.conv2d"]:
            assert node.checked_type.dtype == "int32"
        elif node.op.name == "nn.batch_flatten":
            assert node.checked_type.dtype == "int8"

# check if batch_matmul is quantized
relay.analysis.post_order_visit(qmod["main"], _check_batch_matmul)

calibration_dataset#

def get_calibration_dataset(mod, input_name):
    """Generate a few random samples matching the module's input shape."""
    dataset = []
    input_shape = [int(x) for x in mod["main"].checked_type.arg_types[0].shape]
    for _ in range(5):
        data = np.random.uniform(size=input_shape)
        dataset.append({input_name: data})
    return dataset

mod, params = testing.synthetic.get_workload()
dataset = get_calibration_dataset(mod, "data")

# KL-divergence calibration works both with and without an explicit target in scope.
create_target = True
with relay.quantize.qconfig(calibrate_mode="kl_divergence"):
    if create_target:
        with tvm.target.Target("llvm"):
            relay.quantize.quantize(mod, params, dataset)
    else:
        # no target in scope
        relay.quantize.quantize(mod, params, dataset)

calibrate_memory_bound#

import multiprocessing

mod, params = testing.synthetic.get_workload()
dataset = get_calibration_dataset(mod, "data")

# calibrate_chunk_by limits how many intermediate results are collected at once,
# bounding peak memory during KL-divergence calibration.
num_cpu = multiprocessing.cpu_count()
with relay.quantize.qconfig(calibrate_mode="kl_divergence", calibrate_chunk_by=num_cpu):
    relay.quantize.quantize(mod, params, dataset)

calibrate_percentile#

mod, params = testing.synthetic.get_workload()
dataset = get_calibration_dataset(mod, "data")
with relay.quantize.qconfig(calibrate_mode="percentile"):
    relay.quantize.quantize(mod, params, dataset)
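
A hedged end-to-end sketch (not part of the original tests; the names lib, runtime, and dev are illustrative): build the percentile-calibrated module and run one calibration sample through the graph executor.

from tvm.contrib import graph_executor

with relay.quantize.qconfig(calibrate_mode="percentile"):
    qmod = relay.quantize.quantize(mod, params, dataset)

with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(qmod, target="llvm", params=params)

dev = tvm.cpu(0)
runtime = graph_executor.GraphModule(lib["default"](dev))
# calibration samples are float64 from np.random.uniform, so cast to float32
runtime.set_input("data", dataset[0]["data"].astype("float32"))
runtime.run()
print(runtime.get_output(0).numpy().shape)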