TVM Operator Inventory (TOPI) 簡介
TOPI簡介
 這是 TVM Operator Inventory (TOPI) 的入門介紹。TOPI 提供了比 TVM 抽象層次更高的、numpy 風格的通用操作和調度。本文將展示 TOPI 如何讓我們免于在 TVM 中編寫樣板代碼。
 from __future__ import absolute_import, print_function
import tvm
 import tvm.testing
 from tvm import te
 from tvm import topi
 import numpy as np
 基本示例
 先重新審視行總和操作(相當于 B = numpy.sum(A, axis=1))。要計算二維 TVM 張量 A 各行的總和,需要同時指定符號操作及其調度。
 n = te.var("n")
 m = te.var("m")
 A = te.placeholder((n, m), name="A")
 k = te.reduce_axis((0, m), "k")
 B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
 s = te.create_schedule(B.op)
 要以人類可讀的格式檢查 IR 代碼,可以這樣做:
 print(tvm.lower(s, [A], simple_mode=True))
 輸出:
 primfn(A_1: handle) -> ()
 attr = {“from_legacy_te_schedule”: True, “global_symbol”: “main”, “tir.noalias”: True}
 buffers = {A: Buffer(A_2: Pointer(float32), float32, [n: int32, m: int32], [stride: int32, stride_1: int32], type=“auto”)}
 buffer_map = {A_1: A} {
 allocate(B: Pointer(global float32), float32, [n]), storage_scope = global;
 for (i: int32, 0, n) {
 B[i] = 0f32
 for (k: int32, 0, m) {
 B[i] = ((float32*)B[i] + (float32*)A_2[((i*stride) + (k*stride_1))])
 }
 }
 }
 可以看到,對于這樣一個常見的操作,必須自己定義 reduce 軸,并使用 te.compute 進行顯式計算。可以想象,對于更復雜的操作,需要提供多少細節。幸運的是,可以用簡單的 topi.sum(類似 numpy.sum)替換這兩行。
 C = topi.sum(A, axis=1)
 ts = te.create_schedule(C.op)
 print(tvm.lower(ts, [A], simple_mode=True))
 輸出:
 primfn(A_1: handle) -> ()
 attr = {“from_legacy_te_schedule”: True, “global_symbol”: “main”, “tir.noalias”: True}
 buffers = {A: Buffer(A_2: Pointer(float32), float32, [n: int32, m: int32], [stride: int32, stride_1: int32], type=“auto”)}
 buffer_map = {A_1: A} {
 allocate(A_red: Pointer(global float32), float32, [n]), storage_scope = global;
 for (ax0: int32, 0, n) {
 A_red[ax0] = 0f32
 for (k1: int32, 0, m) {
 A_red[ax0] = ((float32*)A_red[ax0] + (float32*)A_2[((ax0*stride) + (k1*stride_1))])
 }
 }
 }
 Numpy 風格的算子重載
 可以使用 topi.broadcast_add 將兩個 shape 正確(即彼此可廣播)的張量相加。更簡潔的是,TOPI 為此類常見操作提供了算子重載。例如:
 x, y = 100, 10
 a = te.placeholder((x, y, y), name=“a”)
 b = te.placeholder((y, y), name=“b”)
 c = a + b # same as topi.broadcast_add
 d = a * b # same as topi.broadcast_mul
 借助同樣的重載語法,TOPI 還能將標量原語(int、float)廣播到張量上,例如 d - 3.14。
 通用調度和融合操作
 到目前為止,我們已經看到 TOPI 如何讓我們免于用較低級別的 API 編寫顯式計算。但調度仍然像以前一樣手動完成。TOPI 還會根據給定的上下文,提供更高級別的調度方法。例如,對于 CUDA,只需使用 topi.generic.schedule_reduce(下面的代碼調用的是其 CUDA 版本 topi.cuda.schedule_reduce),即可調度以 topi.sum 結尾的一系列操作。
 e = topi.elemwise_sum([c, d])
 f = e / 2.0
 g = topi.sum(f)
 with tvm.target.cuda():
     sg = topi.cuda.schedule_reduce(g)
     print(tvm.lower(sg, [a, b], simple_mode=True))
 輸出:
 primfn(a_1: handle, b_1: handle) -> ()
 attr = {“from_legacy_te_schedule”: True, “global_symbol”: “main”, “tir.noalias”: True}
 buffers = {b: Buffer(b_2: Pointer(float32), float32, [10, 10], []),
 a: Buffer(a_2: Pointer(float32), float32, [100, 10, 10], [])}
 buffer_map = {a_1: a, b_1: b} {
 allocate(T_divide_red: Pointer(global float32), float32, [1]), storage_scope = global;
 attr [IterVar(threadIdx.x: int32, [0:1024], “ThreadIndex”, “threadIdx.x”)] “thread_extent” = 1024;
 allocate(T_divide_red.rf: Pointer(local float32), float32, [1]), storage_scope = local;
 allocate(reduce_temp0: Pointer(local float32), float32, [1]), storage_scope = local {
 T_divide_red.rf[0] = 0f32
 for (k0.k1.fused.k2.fused.outer: int32, 0, 10) {
 if @tir.likely((((((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x) < 10000) && (((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x) < 10000)) && (((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x) < 10000)), dtype=bool) {
 T_divide_red.rf[0] = ((float32*)T_divide_red.rf[0] + ((((float32*)a_2[((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x)] + (float32*)b_2[floormod(((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x), 100)]) + ((float32*)a_2[((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x)]*(float32*)b_2[floormod(((k0.k1.fused.k2.fused.outer*1024) + threadIdx.x), 100)]))*0.5f32))
 }
 }
 attr [meta[tir.CommReducer][0]] “reduce_scope” = @tir.reinterpret(0u64, dtype=handle);
 @tir.tvm_thread_allreduce(1u32, (float32)T_divide_red.rf[0], True, reduce_temp0, threadIdx.x, dtype=handle)
 if (threadIdx.x == 0) {
 T_divide_red[0] = (float32)reduce_temp0[0]
 }
 }
 }
 可以看到,計算的各個調度階段(stage)已經累積起來,可以通過以下方式檢查。
 print(sg.stages)
 輸出:
 [stage(a, placeholder(a, 0xd9c0fa00)), stage(b, placeholder(b, 0xe225cf70)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_elemwise_sum, compute(T_elemwise_sum, body=[(T_add[ax0, ax1, ax2] + T_multiply[ax0, ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=elemwise, attrs={})), stage(T_divide, compute(T_divide, body=[(T_elemwise_sum[ax0, ax1, ax2]/2f)], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=elemwise, attrs={})), stage(T_divide_red.rf, compute(T_divide_red.rf, body=[reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0f]), source=[T_divide[floordiv(floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer1024)), 10), 10), floormod(floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer1024)), 10), 10), floormod((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer1024)), 10)]], init=[], axis=[iter_var(k0.k1.fused.k2.fused.outer, range(min=0, ext=10))], where=tir.likely((((floordiv(floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer1024)), 10), 10) < 100) && (floordiv((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer1024)), 10) < 1000)) && ((k0.k1.fused.k2.fused.inner + (k0.k1.fused.k2.fused.outer1024)) < 10000))), value_index=0)], axis=[iter_var(k0.k1.fused.k2.fused.inner, range(min=0, ext=1024))], reduce_axis=[iter_var(k0.k1.fused.k2.fused.outer, range(min=0, 
ext=10))], tag=, attrs={})), stage(T_divide_red, compute(T_divide_red.repl, body=[reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0f]), source=[T_divide_red.rf[k0.k1.fused.k2.fused.inner.v]], init=[], axis=[iter_var(k0.k1.fused.k2.fused.inner.v, range(min=0, ext=1024))], where=(bool)1, value_index=0)], axis=[], reduce_axis=[iter_var(k0.k1.fused.k2.fused.inner.v, range(min=0, ext=1024))], tag=, attrs={}))]
 可以通過與numpy結果進行比較,測試正確性,如下所示。
 func = tvm.build(sg, [a, b, g], "cuda")
 dev = tvm.cuda(0)
 a_np = np.random.uniform(size=(x, y, y)).astype(a.dtype)
 b_np = np.random.uniform(size=(y, y)).astype(b.dtype)
 g_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0)
 a_nd = tvm.nd.array(a_np, dev)
 b_nd = tvm.nd.array(b_np, dev)
 g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), dev)
 func(a_nd, b_nd, g_nd)
 tvm.testing.assert_allclose(g_nd.numpy(), g_np, rtol=1e-5)
 TOPI 還提供常用的神經網絡操作,例如帶有優化調度的 softmax:
 tarray = te.placeholder((512, 512), name="tarray")
 softmax_topi = topi.nn.softmax(tarray)
 with tvm.target.Target("cuda"):
     sst = topi.cuda.schedule_softmax(softmax_topi)
     print(tvm.lower(sst, [tarray], simple_mode=True))
 輸出:
 primfn(tarray_1: handle) -> ()
 attr = {“from_legacy_te_schedule”: True, “global_symbol”: “main”, “tir.noalias”: True}
 buffers = {tarray: Buffer(tarray_2: Pointer(float32), float32, [512, 512], [])}
 buffer_map = {tarray_1: tarray} {
 allocate(T_softmax_norm: Pointer(global float32x4), float32x4, [65536]), storage_scope = global;
 attr [IterVar(blockIdx.x: int32, (nullptr), “ThreadIndex”, “blockIdx.x”)] “thread_extent” = 512;
 allocate(normal_reduce_temp0: Pointer(local float32), float32, [1]), storage_scope = local;
 allocate(reduce_temp0: Pointer(local float32), float32, [1]), storage_scope = local;
 allocate(T_softmax_exp: Pointer(warp float32), float32, [512]), storage_scope = warp;
 allocate(normal_reduce_temp0_1: Pointer(local float32), float32, [1]), storage_scope = local;
 allocate(reduce_temp0_1: Pointer(local float32), float32, [1]), storage_scope = local {
 attr [IterVar(threadIdx.x: int32, [0:32], “ThreadIndex”, “threadIdx.x”)] “thread_extent” = 32 {
 normal_reduce_temp0[0] = -3.40282e+38f32
 for (k.inner: int32, 0, 16) {
 normal_reduce_temp0[0] = max((float32*)normal_reduce_temp0[0], (float32*)tarray_2[(((blockIdx.x*512) + (threadIdx.x*16)) + k.inner)])
 }
 attr [meta[tir.CommReducer][0]] “reduce_scope” = @tir.reinterpret(0u64, dtype=handle);
 @tir.tvm_thread_allreduce(1u32, (float32*)normal_reduce_temp0[0], True, reduce_temp0, threadIdx.x, dtype=handle)
 for (i1.inner.outer: int32, 0, 4) {
 T_softmax_exp[ramp(((threadIdx.x*16) + (i1.inner.outer*4)), 1, 4)] = @tir.exp(((float32x4*)tarray_2[ramp((((blockIdx.x*512) + (threadIdx.x*16)) + (i1.inner.outer*4)), 1, 4)] - broadcast((float32*)reduce_temp0[0], 4)), dtype=float32x4)
 }
 }
 attr [IterVar(threadIdx.x, [0:32], “ThreadIndex”, “threadIdx.x”)] “thread_extent” = 32 {
 normal_reduce_temp0_1[0] = 0f32
 for (k.inner_1: int32, 0, 16) {
 normal_reduce_temp0_1[0] = ((float32*)normal_reduce_temp0_1[0] + (float32*)T_softmax_exp[((threadIdx.x*16) + k.inner_1)])
 }
 attr [meta[tir.CommReducer][1]] “reduce_scope” = @tir.reinterpret(0u64, dtype=handle);
 @tir.tvm_thread_allreduce(1u32, (float32*)normal_reduce_temp0_1[0], True, reduce_temp0_1, threadIdx.x, dtype=handle)
 for (i1.inner.outer_1: int32, 0, 4) {
 T_softmax_norm[ramp((((blockIdx.x*512) + (threadIdx.x*16)) + (i1.inner.outer_1*4)), 1, 4)] = ((float32x4*)T_softmax_exp[ramp(((threadIdx.x*16) + (i1.inner.outer_1*4)), 1, 4)] / broadcast((float32*)reduce_temp0_1[0], 4))
 }
 }
 }
 }
 融合卷積
 可以將 topi.nn.conv2d 和 topi.nn.relu 融合在一起。
 注意:TOPI 函數都是通用函數,針對不同的后端有不同的實現,以優化性能。對于每個后端,計算聲明和調度都必須在目標(target)作用域內調用。TVM 會根據目標信息選擇要調用的正確函數。
 data = te.placeholder((1, 3, 224, 224))
 kernel = te.placeholder((10, 3, 5, 5))
with tvm.target.Target("cuda"):
    conv = topi.cuda.conv2d_nchw(data, kernel, 1, 2, 1)
    out = topi.nn.relu(conv)
    sconv = topi.cuda.schedule_conv2d_nchw([out])
    print(tvm.lower(sconv, [data, kernel], simple_mode=True))
 輸出:
 primfn(placeholder_2: handle, placeholder_3: handle) -> ()
 attr = {“from_legacy_te_schedule”: True, “global_symbol”: “main”, “tir.noalias”: True}
 buffers = {placeholder_1: Buffer(placeholder_4: Pointer(float32), float32, [10, 3, 5, 5], []),
 placeholder: Buffer(placeholder_5: Pointer(float32), float32, [1, 3, 224, 224], [])}
 buffer_map = {placeholder_2: placeholder, placeholder_3: placeholder_1} {
 allocate(compute: Pointer(global float32), float32, [501760]), storage_scope = global;
 attr [IterVar(blockIdx.z: int32, (nullptr), “ThreadIndex”, “blockIdx.z”)] “thread_extent” = 5;
 allocate(compute_1: Pointer(local float32), float32, [14]), storage_scope = local;
 allocate(pad_temp.shared: Pointer(shared float32), float32, [112]), storage_scope = shared;
 allocate(placeholder.shared: Pointer(shared float32), float32, [2]), storage_scope = shared;
 attr [IterVar(blockIdx.y: int32, (nullptr), “ThreadIndex”, “blockIdx.y”)] “thread_extent” = 224;
 attr [IterVar(blockIdx.x: int32, (nullptr), “ThreadIndex”, “blockIdx.x”)] “thread_extent” = 2;
 attr [IterVar(threadIdx.z: int32, (nullptr), “ThreadIndex”, “threadIdx.z”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.y: int32, (nullptr), “ThreadIndex”, “threadIdx.y”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.x: int32, (nullptr), “ThreadIndex”, “threadIdx.x”)] “thread_extent” = 16 {
 compute_1[0] = 0f32
 compute_1[2] = 0f32
 compute_1[4] = 0f32
 compute_1[6] = 0f32
 compute_1[8] = 0f32
 compute_1[10] = 0f32
 compute_1[12] = 0f32
 compute_1[1] = 0f32
 compute_1[3] = 0f32
 compute_1[5] = 0f32
 compute_1[7] = 0f32
 compute_1[9] = 0f32
 compute_1[11] = 0f32
 compute_1[13] = 0f32
 for (rc.outer: int32, 0, 3) {
 for (ry.outer: int32, 0, 5) {
 attr [IterVar(threadIdx.z_1: int32, (nullptr), “ThreadIndex”, “threadIdx.z”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.y_1: int32, (nullptr), “ThreadIndex”, “threadIdx.y”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.x_1: int32, (nullptr), “ThreadIndex”, “threadIdx.x”)] “thread_extent” = 16 {
 pad_temp.shared[(threadIdx.x_17)] = @tir.if_then_else((((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)) && (2 <= ((blockIdx.x112) + (threadIdx.x_17)))), (float32)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 450)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 1)] = @tir.if_then_else((((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)) && (1 <= ((blockIdx.x112) + (threadIdx.x_17)))), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 449)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 2)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 448)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 3)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 447)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 4)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 446)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 5)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 445)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 6)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 444)], 0f32, dtype=float32)
 }
 attr [IterVar(threadIdx.z_2: int32, (nullptr), “ThreadIndex”, “threadIdx.z”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.y_2: int32, (nullptr), “ThreadIndex”, “threadIdx.y”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.x_2: int32, (nullptr), “ThreadIndex”, “threadIdx.x”)] “thread_extent” = 16;
 if @tir.likely((threadIdx.x_2 < 2), dtype=bool) {
 placeholder.shared[threadIdx.x_2] = (float32)placeholder_4[((((blockIdx.z150) + (threadIdx.x_275)) + (rc.outer25)) + (ry.outer5))]
 }
 compute_1[0] = ((float32*)compute_1[0] + ((float32*)pad_temp.shared[threadIdx.x](float32)placeholder.shared[0]))
 compute_1[2] = ((float32*)compute_1[2] + ((float32*)pad_temp.shared[(threadIdx.x + 16)](float32)placeholder.shared[0]))
 compute_1[4] = ((float32*)compute_1[4] + ((float32*)pad_temp.shared[(threadIdx.x + 32)](float32)placeholder.shared[0]))
 compute_1[6] = ((float32*)compute_1[6] + ((float32*)pad_temp.shared[(threadIdx.x + 48)](float32)placeholder.shared[0]))
 compute_1[8] = ((float32*)compute_1[8] + ((float32*)pad_temp.shared[(threadIdx.x + 64)](float32)placeholder.shared[0]))
 compute_1[10] = ((float32*)compute_1[10] + ((float32*)pad_temp.shared[(threadIdx.x + 80)](float32)placeholder.shared[0]))
 compute_1[12] = ((float32*)compute_1[12] + ((float32*)pad_temp.shared[(threadIdx.x + 96)](float32)placeholder.shared[0]))
 compute_1[1] = ((float32*)compute_1[1] + ((float32*)pad_temp.shared[threadIdx.x](float32)placeholder.shared[1]))
 compute_1[3] = ((float32*)compute_1[3] + ((float32*)pad_temp.shared[(threadIdx.x + 16)](float32)placeholder.shared[1]))
 compute_1[5] = ((float32*)compute_1[5] + ((float32*)pad_temp.shared[(threadIdx.x + 32)](float32)placeholder.shared[1]))
 compute_1[7] = ((float32*)compute_1[7] + ((float32*)pad_temp.shared[(threadIdx.x + 48)](float32)placeholder.shared[1]))
 compute_1[9] = ((float32*)compute_1[9] + ((float32*)pad_temp.shared[(threadIdx.x + 64)](float32)placeholder.shared[1]))
 compute_1[11] = ((float32*)compute_1[11] + ((float32*)pad_temp.shared[(threadIdx.x + 80)](float32)placeholder.shared[1]))
 compute_1[13] = ((float32*)compute_1[13] + ((float32*)pad_temp.shared[(threadIdx.x + 96)](float32)placeholder.shared[1]))
 attr [IterVar(threadIdx.z_1, (nullptr), “ThreadIndex”, “threadIdx.z”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.y_1, (nullptr), “ThreadIndex”, “threadIdx.y”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.x_1, (nullptr), “ThreadIndex”, “threadIdx.x”)] “thread_extent” = 16 {
 pad_temp.shared[(threadIdx.x_17)] = @tir.if_then_else((((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)) && (1 <= ((blockIdx.x112) + (threadIdx.x_17)))), (float32)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 449)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 1)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 448)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 2)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 447)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 3)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 446)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 4)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 445)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 5)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 444)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 6)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 443)], 0f32, dtype=float32)
 }
 attr [IterVar(threadIdx.z_2, (nullptr), “ThreadIndex”, “threadIdx.z”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.y_2, (nullptr), “ThreadIndex”, “threadIdx.y”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.x_2, (nullptr), “ThreadIndex”, “threadIdx.x”)] “thread_extent” = 16;
 if @tir.likely((threadIdx.x_2 < 2), dtype=bool) {
 placeholder.shared[threadIdx.x_2] = (float32)placeholder_4[(((((blockIdx.z150) + (threadIdx.x_275)) + (rc.outer25)) + (ry.outer5)) + 1)]
 }
 compute_1[0] = ((float32*)compute_1[0] + ((float32*)pad_temp.shared[threadIdx.x](float32)placeholder.shared[0]))
 compute_1[2] = ((float32*)compute_1[2] + ((float32*)pad_temp.shared[(threadIdx.x + 16)](float32)placeholder.shared[0]))
 compute_1[4] = ((float32*)compute_1[4] + ((float32*)pad_temp.shared[(threadIdx.x + 32)](float32)placeholder.shared[0]))
 compute_1[6] = ((float32*)compute_1[6] + ((float32*)pad_temp.shared[(threadIdx.x + 48)](float32)placeholder.shared[0]))
 compute_1[8] = ((float32*)compute_1[8] + ((float32*)pad_temp.shared[(threadIdx.x + 64)](float32)placeholder.shared[0]))
 compute_1[10] = ((float32*)compute_1[10] + ((float32*)pad_temp.shared[(threadIdx.x + 80)](float32)placeholder.shared[0]))
 compute_1[12] = ((float32*)compute_1[12] + ((float32*)pad_temp.shared[(threadIdx.x + 96)](float32)placeholder.shared[0]))
 compute_1[1] = ((float32*)compute_1[1] + ((float32*)pad_temp.shared[threadIdx.x](float32)placeholder.shared[1]))
 compute_1[3] = ((float32*)compute_1[3] + ((float32*)pad_temp.shared[(threadIdx.x + 16)](float32)placeholder.shared[1]))
 compute_1[5] = ((float32*)compute_1[5] + ((float32*)pad_temp.shared[(threadIdx.x + 32)](float32)placeholder.shared[1]))
 compute_1[7] = ((float32*)compute_1[7] + ((float32*)pad_temp.shared[(threadIdx.x + 48)](float32)placeholder.shared[1]))
 compute_1[9] = ((float32*)compute_1[9] + ((float32*)pad_temp.shared[(threadIdx.x + 64)](float32)placeholder.shared[1]))
 compute_1[11] = ((float32*)compute_1[11] + ((float32*)pad_temp.shared[(threadIdx.x + 80)](float32)placeholder.shared[1]))
 compute_1[13] = ((float32*)compute_1[13] + ((float32*)pad_temp.shared[(threadIdx.x + 96)](float32)placeholder.shared[1]))
 attr [IterVar(threadIdx.z_1, (nullptr), “ThreadIndex”, “threadIdx.z”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.y_1, (nullptr), “ThreadIndex”, “threadIdx.y”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.x_1, (nullptr), “ThreadIndex”, “threadIdx.x”)] “thread_extent” = 16 {
 pad_temp.shared[(threadIdx.x_17)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 448)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 1)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 447)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 2)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 446)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 3)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 445)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 4)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 444)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 5)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 443)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 6)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 442)], 0f32, dtype=float32)
 }
 attr [IterVar(threadIdx.z_2, (nullptr), “ThreadIndex”, “threadIdx.z”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.y_2, (nullptr), “ThreadIndex”, “threadIdx.y”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.x_2, (nullptr), “ThreadIndex”, “threadIdx.x”)] “thread_extent” = 16;
 if @tir.likely((threadIdx.x_2 < 2), dtype=bool) {
 placeholder.shared[threadIdx.x_2] = (float32)placeholder_4[(((((blockIdx.z150) + (threadIdx.x_275)) + (rc.outer25)) + (ry.outer5)) + 2)]
 }
 compute_1[0] = ((float32*)compute_1[0] + ((float32*)pad_temp.shared[threadIdx.x](float32)placeholder.shared[0]))
 compute_1[2] = ((float32*)compute_1[2] + ((float32*)pad_temp.shared[(threadIdx.x + 16)](float32)placeholder.shared[0]))
 compute_1[4] = ((float32*)compute_1[4] + ((float32*)pad_temp.shared[(threadIdx.x + 32)](float32)placeholder.shared[0]))
 compute_1[6] = ((float32*)compute_1[6] + ((float32*)pad_temp.shared[(threadIdx.x + 48)](float32)placeholder.shared[0]))
 compute_1[8] = ((float32*)compute_1[8] + ((float32*)pad_temp.shared[(threadIdx.x + 64)](float32)placeholder.shared[0]))
 compute_1[10] = ((float32*)compute_1[10] + ((float32*)pad_temp.shared[(threadIdx.x + 80)](float32)placeholder.shared[0]))
 compute_1[12] = ((float32*)compute_1[12] + ((float32*)pad_temp.shared[(threadIdx.x + 96)](float32)placeholder.shared[0]))
 compute_1[1] = ((float32*)compute_1[1] + ((float32*)pad_temp.shared[threadIdx.x](float32)placeholder.shared[1]))
 compute_1[3] = ((float32*)compute_1[3] + ((float32*)pad_temp.shared[(threadIdx.x + 16)](float32)placeholder.shared[1]))
 compute_1[5] = ((float32*)compute_1[5] + ((float32*)pad_temp.shared[(threadIdx.x + 32)](float32)placeholder.shared[1]))
 compute_1[7] = ((float32*)compute_1[7] + ((float32*)pad_temp.shared[(threadIdx.x + 48)](float32)placeholder.shared[1]))
 compute_1[9] = ((float32*)compute_1[9] + ((float32*)pad_temp.shared[(threadIdx.x + 64)](float32)placeholder.shared[1]))
 compute_1[11] = ((float32*)compute_1[11] + ((float32*)pad_temp.shared[(threadIdx.x + 80)](float32)placeholder.shared[1]))
 compute_1[13] = ((float32*)compute_1[13] + ((float32*)pad_temp.shared[(threadIdx.x + 96)](float32)placeholder.shared[1]))
 attr [IterVar(threadIdx.z_1, (nullptr), “ThreadIndex”, “threadIdx.z”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.y_1, (nullptr), “ThreadIndex”, “threadIdx.y”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.x_1, (nullptr), “ThreadIndex”, “threadIdx.x”)] “thread_extent” = 16 {
 pad_temp.shared[(threadIdx.x_17)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 447)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 1)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 446)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 2)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 445)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 3)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 444)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 4)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 443)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 5)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 442)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 6)] = @tir.if_then_else((((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)) && (((blockIdx.x112) + (threadIdx.x_17)) < 217)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 441)], 0f32, dtype=float32)
 }
 attr [IterVar(threadIdx.z_2, (nullptr), “ThreadIndex”, “threadIdx.z”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.y_2, (nullptr), “ThreadIndex”, “threadIdx.y”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.x_2, (nullptr), “ThreadIndex”, “threadIdx.x”)] “thread_extent” = 16;
 if @tir.likely((threadIdx.x_2 < 2), dtype=bool) {
 placeholder.shared[threadIdx.x_2] = (float32)placeholder_4[(((((blockIdx.z150) + (threadIdx.x_275)) + (rc.outer25)) + (ry.outer5)) + 3)]
 }
 compute_1[0] = ((float32*)compute_1[0] + ((float32*)pad_temp.shared[threadIdx.x](float32)placeholder.shared[0]))
 compute_1[2] = ((float32*)compute_1[2] + ((float32*)pad_temp.shared[(threadIdx.x + 16)](float32)placeholder.shared[0]))
 compute_1[4] = ((float32*)compute_1[4] + ((float32*)pad_temp.shared[(threadIdx.x + 32)](float32)placeholder.shared[0]))
 compute_1[6] = ((float32*)compute_1[6] + ((float32*)pad_temp.shared[(threadIdx.x + 48)](float32)placeholder.shared[0]))
 compute_1[8] = ((float32*)compute_1[8] + ((float32*)pad_temp.shared[(threadIdx.x + 64)](float32)placeholder.shared[0]))
 compute_1[10] = ((float32*)compute_1[10] + ((float32*)pad_temp.shared[(threadIdx.x + 80)](float32)placeholder.shared[0]))
 compute_1[12] = ((float32*)compute_1[12] + ((float32*)pad_temp.shared[(threadIdx.x + 96)](float32)placeholder.shared[0]))
 compute_1[1] = ((float32*)compute_1[1] + ((float32*)pad_temp.shared[threadIdx.x](float32)placeholder.shared[1]))
 compute_1[3] = ((float32*)compute_1[3] + ((float32*)pad_temp.shared[(threadIdx.x + 16)](float32)placeholder.shared[1]))
 compute_1[5] = ((float32*)compute_1[5] + ((float32*)pad_temp.shared[(threadIdx.x + 32)](float32)placeholder.shared[1]))
 compute_1[7] = ((float32*)compute_1[7] + ((float32*)pad_temp.shared[(threadIdx.x + 48)](float32)placeholder.shared[1]))
 compute_1[9] = ((float32*)compute_1[9] + ((float32*)pad_temp.shared[(threadIdx.x + 64)](float32)placeholder.shared[1]))
 compute_1[11] = ((float32*)compute_1[11] + ((float32*)pad_temp.shared[(threadIdx.x + 80)](float32)placeholder.shared[1]))
 compute_1[13] = ((float32*)compute_1[13] + ((float32*)pad_temp.shared[(threadIdx.x + 96)](float32)placeholder.shared[1]))
 attr [IterVar(threadIdx.z_1, (nullptr), “ThreadIndex”, “threadIdx.z”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.y_1, (nullptr), “ThreadIndex”, “threadIdx.y”)] “thread_extent” = 1;
 attr [IterVar(threadIdx.x_1, (nullptr), “ThreadIndex”, “threadIdx.x”)] “thread_extent” = 16 {
 pad_temp.shared[(threadIdx.x_17)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 446)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 1)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 445)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 2)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 444)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 3)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 443)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 4)] = @tir.if_then_else(((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 442)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 5)] = @tir.if_then_else((((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)) && (((blockIdx.x112) + (threadIdx.x_17)) < 217)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 441)], 0f32, dtype=float32)
 pad_temp.shared[((threadIdx.x_17) + 6)] = @tir.if_then_else((((2 <= (blockIdx.y + ry.outer)) && ((blockIdx.y + ry.outer) < 226)) && (((blockIdx.x112) + (threadIdx.x_17)) < 216)), (float32*)placeholder_5[((((((rc.outer50176) + (blockIdx.y224)) + (ry.outer224)) + (blockIdx.x112)) + (threadIdx.x_17)) - 440)], 0f32, dtype=float32)
 }
 attr [IterVar(threadIdx.z_2, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 1;
 attr [IterVar(threadIdx.y_2, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 1;
 attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
 if @tir.likely((threadIdx.x_2 < 2), dtype=bool) {
 placeholder.shared[threadIdx.x_2] = (float32*)placeholder_4[(((((blockIdx.z*150) + (threadIdx.x_2*75)) + (rc.outer*25)) + (ry.outer*5)) + 4)]
 }
 compute_1[0] = ((float32*)compute_1[0] + ((float32*)pad_temp.shared[threadIdx.x]*(float32*)placeholder.shared[0]))
 compute_1[2] = ((float32*)compute_1[2] + ((float32*)pad_temp.shared[(threadIdx.x + 16)]*(float32*)placeholder.shared[0]))
 compute_1[4] = ((float32*)compute_1[4] + ((float32*)pad_temp.shared[(threadIdx.x + 32)]*(float32*)placeholder.shared[0]))
 compute_1[6] = ((float32*)compute_1[6] + ((float32*)pad_temp.shared[(threadIdx.x + 48)]*(float32*)placeholder.shared[0]))
 compute_1[8] = ((float32*)compute_1[8] + ((float32*)pad_temp.shared[(threadIdx.x + 64)]*(float32*)placeholder.shared[0]))
 compute_1[10] = ((float32*)compute_1[10] + ((float32*)pad_temp.shared[(threadIdx.x + 80)]*(float32*)placeholder.shared[0]))
 compute_1[12] = ((float32*)compute_1[12] + ((float32*)pad_temp.shared[(threadIdx.x + 96)]*(float32*)placeholder.shared[0]))
 compute_1[1] = ((float32*)compute_1[1] + ((float32*)pad_temp.shared[threadIdx.x]*(float32*)placeholder.shared[1]))
 compute_1[3] = ((float32*)compute_1[3] + ((float32*)pad_temp.shared[(threadIdx.x + 16)]*(float32*)placeholder.shared[1]))
 compute_1[5] = ((float32*)compute_1[5] + ((float32*)pad_temp.shared[(threadIdx.x + 32)]*(float32*)placeholder.shared[1]))
 compute_1[7] = ((float32*)compute_1[7] + ((float32*)pad_temp.shared[(threadIdx.x + 48)]*(float32*)placeholder.shared[1]))
 compute_1[9] = ((float32*)compute_1[9] + ((float32*)pad_temp.shared[(threadIdx.x + 64)]*(float32*)placeholder.shared[1]))
 compute_1[11] = ((float32*)compute_1[11] + ((float32*)pad_temp.shared[(threadIdx.x + 80)]*(float32*)placeholder.shared[1]))
 compute_1[13] = ((float32*)compute_1[13] + ((float32*)pad_temp.shared[(threadIdx.x + 96)]*(float32*)placeholder.shared[1]))
 }
 }
 compute[((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x)] = max((float32*)compute_1[0], 0f32)
 compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 16)] = max((float32*)compute_1[2], 0f32)
 compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 32)] = max((float32*)compute_1[4], 0f32)
 compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 48)] = max((float32*)compute_1[6], 0f32)
 compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 64)] = max((float32*)compute_1[8], 0f32)
 compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 80)] = max((float32*)compute_1[10], 0f32)
 compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 96)] = max((float32*)compute_1[12], 0f32)
 compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 50176)] = max((float32*)compute_1[1], 0f32)
 compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 50192)] = max((float32*)compute_1[3], 0f32)
 compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 50208)] = max((float32*)compute_1[5], 0f32)
 compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 50224)] = max((float32*)compute_1[7], 0f32)
 compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 50240)] = max((float32*)compute_1[9], 0f32)
 compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 50256)] = max((float32*)compute_1[11], 0f32)
 compute[(((((blockIdx.z*100352) + (blockIdx.y*224)) + (blockIdx.x*112)) + threadIdx.x) + 50272)] = max((float32*)compute_1[13], 0f32)
 }
 }
概括
 本節內容
 • 如何使用 TOPI API,以 numpy 風格的算子完成常見操作。
 • TOPI 如何針對特定上下文提供通用調度和算子融合,生成優化的內核代碼。
參考鏈接:
 https://tvm.apache.org/docs/tutorials/topi/intro_topi.html#sphx-glr-tutorials-topi-intro-topi-py
總結
以上是生活随笔為你收集整理的TVM Operator Inventory (TOPI)简介的全部內容,希望文章能夠幫你解決所遇到的問題。
 
                            
                        - 上一篇: Conda安装Glossary词汇表
- 下一篇: Relay IR表示
