Thought I would update with my current best solution based on the responses. Thanks to @Jérôme Richard for the detailed analysis; I've re-implemented it in a way that I think incorporates the most impactful suggestions:
cimport numpy as cnp
cnp.import_array()  # needed for the ndarray buffer interface

ctypedef cnp.npy_uint32 UINT32_t

from libc.math cimport exp
from libc.stdlib cimport malloc, free

cdef inline UINT32_t DEFAULT_SEED = 1

cdef enum:
    # Max value for our rand_r replacement.
    # We don't use RAND_MAX because it's different across platforms and
    # particularly tiny on Windows/MSVC.
    # It corresponds to the maximum representable value for
    # 32-bit signed integers (i.e. 2^31 - 1).
    RAND_R_MAX = 2147483647

##################################################################
cdef class FiniteSet:
    """
    Custom unordered set with O(1) add/remove methods,
    which take advantage of the fixed maximum set size.
    """
    cdef:
        int* index   # Position of each element in the value array (-1 if absent)
        int* value   # Array of contained elements
        int size     # Current number of elements

    def __cinit__(self, cnp.ndarray[int] indices, int maximum):
        self.size = len(indices)
        # Allocate arrays
        self.index = <int*>malloc(maximum * sizeof(int))
        self.value = <int*>malloc(maximum * sizeof(int))
        if not self.index or not self.value:
            raise MemoryError("Could not allocate memory")
        # Initialize index array to -1 (invalid)
        cdef int i
        for i in range(maximum):
            self.index[i] = -1
        # Set up initial values
        for i in range(self.size):
            self.value[i] = indices[i]
            self.index[indices[i]] = i

    def __dealloc__(self):
        """Cleanup C memory"""
        if self.index:
            free(self.index)
        if self.value:
            free(self.value)

    cdef inline bint contains(self, int i) nogil:
        return self.index[i] >= 0

    cdef inline void add(self, int i) nogil:
        """
        Add element to set
        """
        if self.index[i] < 0:  # i.e. not self.contains(i)
            self.value[self.size] = i
            self.index[i] = self.size
            # Increase size
            self.size += 1

    cdef inline void remove(self, int i) nogil:
        """
        Remove element from set
        """
        if self.index[i] >= 0:  # i.e. self.contains(i)
            # Move the last element into the removed slot, then shrink
            self.value[self.index[i]] = self.value[self.size - 1]
            self.index[self.value[self.size - 1]] = self.index[i]
            self.index[i] = -1
            self.size -= 1
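# Worked example of the swap-with-last removal above (illustrative only,
# assuming maximum=4 and the set currently holds {2, 0, 3}):
#   value = [2, 0, 3, _]    index = [1, -1, 0, 2]    size = 3
#   remove(2):
#     value[index[2]] <- value[size-1]   =>  value = [3, 0, 3, _]
#     index[value[size-1]] <- index[2]   =>  index[3] = 0
#     index[2] <- -1, size <- 2          =>  index = [1, -1, -1, 0]
#   The stale copy left at value[2] is never read, since only value[0:size] is used.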
cdef class XORRNG:
    """
    Custom xorshift RNG sampler that I copied from the scikit-learn source code
    """
    cdef UINT32_t state

    def __cinit__(self, UINT32_t seed=DEFAULT_SEED):
        if seed == 0:
            seed = DEFAULT_SEED
        self.state = seed

    cdef inline double sample(self) nogil:
        """Generate a pseudo-random double in [0, 1] from the uint32 xorshift state"""
        self.state ^= <UINT32_t>(self.state << 13)
        self.state ^= <UINT32_t>(self.state >> 17)
        self.state ^= <UINT32_t>(self.state << 5)
        # Use the modulo to make sure that we don't return a value greater than the
        # maximum representable value for signed 32-bit integers (i.e. 2^31 - 1).
        # Note that the parentheses are needed to avoid overflow: here
        # RAND_R_MAX is cast to UINT32_t before 1 is added.
        return <double>(self.state % ((<UINT32_t>RAND_R_MAX) + 1)) / RAND_R_MAX


def xorrng_py(int seed=1):
    cdef XORRNG prng = XORRNG(seed)
    return prng.sample()

###############################################################
cdef XORRNG prng = XORRNG()
def spgreedy(cnp.ndarray[double, ndim=2] J,
             cnp.ndarray[double, ndim=1] h,
             FiniteSet s,
             double temp,
             ):
    cdef int d = J.shape[0]
    cdef int j, k
    cdef double dot, curr, prob
    for j in range(d):
        # Remove j, then compute its local field from the remaining active elements
        s.remove(j)
        dot = h[j]
        for k in range(s.size):
            dot += J[j, s.value[k]]
        curr = dot / temp
        # Numerically safe sigmoid
        if curr < -100:
            prob = 0.0
        elif curr > 100:
            prob = 1.0
        else:
            prob = 1.0 / (1.0 + exp(-curr))
        # Re-add j with probability prob
        if prng.sample() < prob:
            s.add(j)
    return s
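For completeness, here is a rough sketch of how this can be built and called from Python. The module name (spgreedy_mod.pyx), the random problem, and the initial set below are just placeholders, and the build step assumes a setup that adds NumPy's include path:

# Illustrative usage only; assumes the Cython code above is saved as
# spgreedy_mod.pyx and compiled with NumPy's include directory available
# (e.g. via a small setup.py calling cythonize).
import numpy as np
from spgreedy_mod import FiniteSet, spgreedy

d = 1000
rng = np.random.default_rng(0)
J = rng.standard_normal((d, d))
J = (J + J.T) / 2                # symmetric couplings
np.fill_diagonal(J, 0.0)
h = rng.standard_normal(d)

# Sparse initial set of active indices (C int dtype to match cnp.ndarray[int])
init = rng.choice(d, size=50, replace=False).astype(np.intc)
s = FiniteSet(init, d)

for _ in range(100):             # sweeps
    s = spgreedy(J, h, s, temp=1.0)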
The Cython version is now better than the dense algorithm (on the problems I've tested). It also takes about as much time as numpy's matrix multiplication (just doing J @ s + h), which is both reassuring, since I assume that code is very optimized, and a bit disappointing, because I don't get much benefit from the sparsity after all. I also don't know much about Cython, so it's very possible I did something silly in this implementation as well!
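To give an idea of what that comparison involves, here is a minimal sketch of a vectorised update built around J @ s + h (illustrative only, not exactly the dense algorithm I benchmarked; it updates all indices from the same state rather than sequentially, and the names are placeholders):

import numpy as np

def dense_sweep(J, h, s_dense, temp, rng=None):
    # s_dense: 0/1 float vector standing in for the FiniteSet membership
    rng = np.random.default_rng() if rng is None else rng
    curr = (J @ s_dense + h) / temp                          # the J @ s + h step
    prob = 1.0 / (1.0 + np.exp(-np.clip(curr, -100.0, 100.0)))
    return (rng.random(h.shape[0]) < prob).astype(np.float64)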