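# Multi-head self-attention layer: batched matmul shapes, one problem per line,
# written as lhs:rhs:result dims (batch x rows x cols).
# In each group: batch = mb * num_heads, 64 = hidden_size / num_heads (per-head
# size), and 384 = t_x = t_y.
# mb = 1, num_heads = 12, hidden_size = 768, t_x = t_y = 384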
12x384x64:12x64x384:12x384x384_n"encoder:QK_matmul:12"
12x384x384:12x384x64:12x384x64_n"encoder:WV_matmul:12"

# mb = 128, num_heads = 12, hidden_size = 768, t_x = t_y = 384
1536x384x64:1536x64x384:1536x384x384_n"encoder:QK_matmul:12"
1536x384x384:1536x384x64:1536x384x64_n"encoder:WV_matmul:12"

# mb = 128, num_heads = 16, hidden_size = 1024, t_x = t_y = 384
# (disabled: the two problem lines of this group are commented out)
#2048x384x64:2048x64x384:2048x384x384_n"encoder:QK_matmul:24"
#2048x384x384:2048x384x64:2048x384x64_n"encoder:WV_matmul:24"
