Not for pythonicism, but out of curiosity, I quickly made up a benchmark and compared three variants:
My version I currently use in a web stats backend
Something I found in a C++ SO post (https://stackoverflow.com/questions/45225089)
Is #2 on steroids: Roll out bit by bit. And it's the fastest one, at least on my machine (i7-12700, Python 3.13).
from datetime import datetime as dt
# This is my version currently in use (31 is to fit cnt2)
def cntz(i):
if i == 0: return 31
z = 0
while i > 0 and i & 1 == 0:
z += 1
i >>= 1
return z
# The C++ one from https://stackoverflow.com/questions/45225089,
def cntz2(i):
z = 0
if i & 0xFFFF == 0:
z += 16
i >>= 16
if i & 0xFF == 0:
z += 8
i >>= 8
if i & 0xF == 0:
z += 4
i >>= 4
if i & 0x3 == 0:
z += 2
i >>= 2
z += (i & 1) ^ 1
return z
# My try: Unroll it bit-by-bit. Stops at 23 to fit the 10M test
def cntz3(i):
if i & 0x00000001: return 0
if i & 0x00000003: return 1
if i & 0x00000007: return 2
if i & 0x0000000f: return 3
if i & 0x0000001f: return 4
if i & 0x0000003f: return 5
if i & 0x0000007f: return 6
if i & 0x000000ff: return 7
if i & 0x000001ff: return 8
if i & 0x000003ff: return 9
if i & 0x000007ff: return 10
if i & 0x00000fff: return 11
if i & 0x00001fff: return 12
if i & 0x00003fff: return 13
if i & 0x00007fff: return 14
if i & 0x0000ffff: return 15
if i & 0x0001ffff: return 16
if i & 0x0003ffff: return 17
if i & 0x0007ffff: return 18
if i & 0x000fffff: return 19
if i & 0x001fffff: return 20
if i & 0x003fffff: return 21
if i & 0x007fffff: return 22
if i & 0x00ffffff: return 23
return 31
input = range(10000000)
for f in [cntz, cntz2, cntz3]:
t0 = dt.now()
output = list( f(_) for _ in input )
print(f'{(dt.now()-t0).total_seconds():.3f} {output[:33]}')
Output
1.797 [31, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5]
1.917 [31, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5]
0.873 [31, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5]