Thought I would update with my current best solution based on the responses. Thanks to @Jérôme Richard for the detailed analysis; I've re-implemented it in a way that I think incorporates the most impactful suggestions:
cimport numpy as cnp
cnp.import_array()  # needed for the ndarray buffer interface

ctypedef cnp.npy_uint32 UINT32_t

from libc.math cimport exp
from libc.stdlib cimport malloc, free

cdef inline UINT32_t DEFAULT_SEED = 1

cdef enum:
    # Max value for our rand_r replacement.
    # We don't use RAND_MAX because it's different across platforms and
    # particularly tiny on Windows/MSVC.
    # It corresponds to the maximum representable value for
    # 32-bit signed integers (i.e. 2^31 - 1).
    RAND_R_MAX = 2147483647

##################################################################
cdef class FiniteSet:
    """
    Custom unordered set with O(1) add/remove methods,
    which take advantage of the fixed maximum set size.
    """
    cdef:
        int* index   # Position of each element in the value array (-1 if absent)
        int* value   # Array of contained elements
        int size     # Current number of elements

    def __cinit__(self, cnp.ndarray[int] indices, int maximum):
        self.size = len(indices)
        # Allocate arrays
        self.index = <int*>malloc(maximum * sizeof(int))
        self.value = <int*>malloc(maximum * sizeof(int))
        if not self.index or not self.value:
            raise MemoryError("Could not allocate memory")
        # Initialize index array to -1 (invalid)
        cdef int i
        for i in range(maximum):
            self.index[i] = -1
        # Set up initial values
        for i in range(self.size):
            self.value[i] = indices[i]
            self.index[indices[i]] = i

    def __dealloc__(self):
        """Cleanup C memory"""
        if self.index:
            free(self.index)
        if self.value:
            free(self.value)

    cdef inline bint contains(self, int i) nogil:
        return self.index[i] >= 0

    cdef inline void add(self, int i) nogil:
        """
        Add element to set
        """
        if self.index[i] < 0:  # i.e. not self.contains(i)
            self.value[self.size] = i
            self.index[i] = self.size
            # Increase size
            self.size += 1

    cdef inline void remove(self, int i) nogil:
        """
        Remove element from set
        """
        if self.index[i] >= 0:  # i.e. self.contains(i)
            # Move the last element into the removed slot, then shrink
            self.value[self.index[i]] = self.value[self.size - 1]
            self.index[self.value[self.size - 1]] = self.index[i]
            self.index[i] = -1
            self.size -= 1
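# Worked example of the swap-with-last removal above (illustrative only,
# assuming maximum=4 and the set currently holds {2, 0, 3}):
#   value = [2, 0, 3, _]    index = [1, -1, 0, 2]    size = 3
#   remove(2):
#     value[index[2]] <- value[size-1]   =>  value = [3, 0, 3, _]
#     index[value[size-1]] <- index[2]   =>  index[3] = 0
#     index[2] <- -1, size <- 2          =>  index = [1, -1, -1, 0]
#   The stale copy left at value[2] is never read, since only value[0:size] is used.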
cdef class XORRNG:
    """
    Custom xorshift RNG sampler that I copied from the scikit-learn source code
    """
    cdef UINT32_t state

    def __cinit__(self, UINT32_t seed=DEFAULT_SEED):
        if seed == 0:
            seed = DEFAULT_SEED
        self.state = seed

    cdef inline double sample(self) nogil:
        """Generate a pseudo-random double in [0, 1] from the uint32 xorshift state"""
        self.state ^= <UINT32_t>(self.state << 13)
        self.state ^= <UINT32_t>(self.state >> 17)
        self.state ^= <UINT32_t>(self.state << 5)
        # Use the modulo to make sure that we don't return a value greater than the
        # maximum representable value for signed 32-bit integers (i.e. 2^31 - 1).
        # Note that the parentheses are needed to avoid overflow: here
        # RAND_R_MAX is cast to UINT32_t before 1 is added.
        return <double>(self.state % ((<UINT32_t>RAND_R_MAX) + 1)) / RAND_R_MAX


def xorrng_py(int seed=1):
    cdef XORRNG prng = XORRNG(seed)
    return prng.sample()

###############################################################
cdef XORRNG prng = XORRNG()
def spgreedy(cnp.ndarray[double, ndim=2] J,
             cnp.ndarray[double, ndim=1] h,
             FiniteSet s,
             double temp,
             ):
    cdef int d = J.shape[0]
    cdef int j, k
    cdef double dot, curr, prob
    for j in range(d):
        # Remove j, then compute its local field from the remaining active elements
        s.remove(j)
        dot = h[j]
        for k in range(s.size):
            dot += J[j, s.value[k]]
        curr = dot / temp
        # Numerically safe sigmoid
        if curr < -100:
            prob = 0.0
        elif curr > 100:
            prob = 1.0
        else:
            prob = 1.0 / (1.0 + exp(-curr))
        # Re-add j with probability prob
        if prng.sample() < prob:
            s.add(j)
    return s
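For completeness, here is a rough sketch of how this can be built and called from Python. The module name (spgreedy_mod.pyx), the random problem, and the initial set below are just placeholders, and the build step assumes a setup that adds NumPy's include path:

# Illustrative usage only; assumes the Cython code above is saved as
# spgreedy_mod.pyx and compiled with NumPy's include directory available
# (e.g. via a small setup.py calling cythonize).
import numpy as np
from spgreedy_mod import FiniteSet, spgreedy

d = 1000
rng = np.random.default_rng(0)
J = rng.standard_normal((d, d))
J = (J + J.T) / 2                # symmetric couplings
np.fill_diagonal(J, 0.0)
h = rng.standard_normal(d)

# Sparse initial set of active indices (C int dtype to match cnp.ndarray[int])
init = rng.choice(d, size=50, replace=False).astype(np.intc)
s = FiniteSet(init, d)

for _ in range(100):             # sweeps
    s = spgreedy(J, h, s, temp=1.0)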
The Cython version is now better than the dense algorithm (on the problems I've tested). It also takes about as much time as numpy's matrix multiplication (just doing J @ s + h), which is both reassuring, since I assume that code is very optimized, and a bit disappointing, because I don't get much benefit from the sparsity after all. I also don't know much about Cython, so it's very possible I did something silly in this implementation as well!
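To give an idea of what that comparison involves, here is a minimal sketch of a vectorised update built around J @ s + h (illustrative only, not exactly the dense algorithm I benchmarked; it updates all indices from the same state rather than sequentially, and the names are placeholders):

import numpy as np

def dense_sweep(J, h, s_dense, temp, rng=None):
    # s_dense: 0/1 float vector standing in for the FiniteSet membership
    rng = np.random.default_rng() if rng is None else rng
    curr = (J @ s_dense + h) / temp                          # the J @ s + h step
    prob = 1.0 / (1.0 + np.exp(-np.clip(curr, -100.0, 100.0)))
    return (rng.random(h.shape[0]) < prob).astype(np.float64)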