Reports

// Stages data from A into a 32x64 shared-memory tile of half values using an
// XOR-swizzled chunk index, then reads an int4 back from the tile.
// NOTE(review): the row computation appears to assume a 1D block of 256
// threads (8 warps) so that row stays below 32 — TODO confirm against the launch.
// NOTE(review): the int4 casts presumably require A to be 16-byte aligned — confirm.
// B is not referenced in the portion of the kernel shown here.
__global__ void swizzle_kernel(half *A,half* B){
    
    // Shared staging tile: 32 rows of 64 half elements.
    __shared__ half s[32][64];
    // Position of this thread within its warp, and the warp's index in the block.
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    // Each group of 4 warps covers 16 rows; the low 16 lane values select the row in that group.
    int row = warp_id /4 * 16 + lane_id % 16 ;
    // Logical chunk index is (warp_id % 4) * 2 + lane_id / 16; XOR with the low
    // three bits of row gives the swizzled chunk position used in shared memory.
    int col = ((warp_id % 4) * 2 +lane_id/ 16) ^ (row & 7);

    // Copy one int4 from A (unswizzled chunk index, row stride 64) into the
    // swizzled chunk position of the same row in the tile.
    *((int4*)(&s[row][col * 8])) = *((int4*)(A + row * 64 + ((warp_id % 4) * 2 +lane_id / 16 ) * 8));

    // Make every thread's write to s visible before any thread reads it.
    __syncthreads();
    int4 tmp;
    // Read an int4 back from the tile. The 16-row group is chosen from
    // (warp_id + 1) % 8, adjacent lane pairs share a row (lane_id / 2), and col
    // is still the swizzled index derived from this thread's own `row` above.
    tmp =  *((int4*)(&s[((warp_id+1)%8) / 4 *16 + lane_id / 2 ][col * 8]));

Although the above distribution of data and threads is theoretically free of bank conflicts, conflicts still showed up in practice; when I tidied up the layout of both the data and the threads, the bank conflicts disappeared. So my answer for the time being is to keep the thread and shared-memory layouts clean, so that the compiler may be able to optimize the code better.

79268348