To adapt @talonmies' answer above to newer GPUs (compute capability 7.x or higher), you may need to use the `_sync` variants of the warp shuffle intrinsics, as follows.
// Participation mask with all 32 bits set; passed as the first (mask) argument
// of the *_sync warp shuffle intrinsics to name every lane of the warp.
#define FULL_MASK 0xffffffff
/**
 * Warp-level "min with index" (argmin) reduction built on shuffle-down.
 *
 * Each calling lane supplies a candidate value in `val` and an associated
 * index in `idx`. At every step a lane fetches the (value, index) pair held
 * by the lane `delta` positions above it and adopts that pair only when the
 * fetched value is strictly smaller than its own, so on ties the lane keeps
 * its current pair.
 *
 * @param val  In/out: this lane's candidate value; overwritten with the
 *             running minimum seen by this lane.
 * @param idx  In/out: index paired with `val`; updated together with it.
 *
 * NOTE: the shuffles use FULL_MASK, so the function is written for the case
 * where all 32 lanes of the warp call it together. With a shuffle-down
 * reduction the complete warp-wide result is expected in the lowest lane;
 * the other lanes end up with partial results only.
 */
__inline__ __device__ void warpReduceMin(int& val, int& idx)
{
    // Halve the exchange distance each round: warpSize/2, ..., 2, 1.
    for (int delta = warpSize >> 1; delta > 0; delta >>= 1) {
        // Fetch both halves of the peer's pair before comparing, so the
        // value and index always come from the same pre-update state.
        const int peerVal = __shfl_down_sync(FULL_MASK, val, delta);
        const int peerIdx = __shfl_down_sync(FULL_MASK, idx, delta);

        // Strict comparison: equal values leave the current pair in place.
        const bool peerWins = peerVal < val;
        idx = peerWins ? peerIdx : idx;
        val = peerWins ? peerVal : val;
    }
}