Reports

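Regarding @talonmies' answer above: on newer GPUs (compute capability 7.x or higher), the legacy `__shfl_down` intrinsic is no longer available, so you have to use the `__shfl_down_sync` variant, as in the following.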

// Participant mask for the *_sync warp intrinsics: all 32 bits set, i.e. every lane of the warp is named.
#define FULL_MASK 0xffffffff
/**
 * Warp-wide arg-min reduction over (value, index) pairs using shuffle-down.
 *
 * Each calling lane supplies its own candidate in `val` / `idx`. After the
 * call, lane 0 holds the smallest value of the warp in `val` and the index
 * that accompanied it in `idx`. The other lanes only hold partial results
 * (the minimum over the lanes at or above them that they were paired with)
 * and must not be treated as the warp-wide answer.
 *
 * Ties are resolved towards the smaller index, so when several lanes hold the
 * same minimum value the reported index does not depend on which lanes they
 * happen to sit in.
 *
 * Preconditions:
 *  - Must be executed by every lane of the warp: the shuffles are issued with
 *    FULL_MASK, which names all 32 lanes as participants.
 *  - Lanes with no real data should pass a neutral candidate (a value larger
 *    than any real one) so they can never win the reduction.
 *
 * @param val  in: this lane's value; out: reduced minimum (complete in lane 0).
 * @param idx  in: index associated with `val`; out: index of the minimum
 *             (complete in lane 0).
 */
__inline__ __device__ void warpReduceMin(int& val, int& idx)
{
    // Halve the shuffle distance each round: 16, 8, 4, 2, 1 for a 32-lane warp.
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        // Read the candidate pair held by the lane `offset` positions above.
        // Both shuffles run unconditionally, before the branch, so all lanes
        // take part in them. A lane whose source would fall outside the warp
        // gets its own value back, which leaves its pair unchanged below.
        const int tmpVal = __shfl_down_sync(FULL_MASK, val, offset);
        const int tmpIdx = __shfl_down_sync(FULL_MASK, idx, offset);

        // Strictly smaller value wins; on equal values prefer the smaller
        // index so the result matches a sequential first-occurrence arg-min.
        if (tmpVal < val || (tmpVal == val && tmpIdx < idx)) {
            val = tmpVal;
            idx = tmpIdx;
        }
    }
}

79300302