I'm new to polars, so unfortunately I also don't know how to use the updates from pull request 13747, but issue 10833 had a code snippet and I tried to adapt your approach as well. I tried the 3 different approaches shown below and timed each of them on a fake dataset of 25,000 sequences.
Here's the code:
#from memory_profiler import profile
import polars as pl
import numpy as np
import time

np.random.seed(0)

# Build a fake dataset of random DNA sequences with varying lengths
num_seqs = 25_000
min_seq_len = 100
max_seq_len = 1_000
seq_lens = np.random.randint(min_seq_len, max_seq_len, num_seqs)
sequences = [''.join(np.random.choice(['A', 'C', 'G', 'T'], seq_len)) for seq_len in seq_lens]
data = {'sequence': sequences, 'length': seq_lens}
df = pl.DataFrame(data)

ksize = 24

def op_approach(df):
    # Replicate each sequence once per character position, slice a k-mer
    # starting at each row index, then filter out the partial k-mers at the end
    start = time.time()
    kmer_df = df.group_by("sequence").map_groups(
        lambda group_df: group_df.with_columns(kmers=pl.col("sequence").repeat_by("length"))
        .explode("kmers")
        .with_row_index()
    ).with_columns(
        pl.col("kmers").str.slice("index", ksize)
    ).filter(pl.col("kmers").str.len_chars() == ksize)
    print(f"Took {time.time()-start:.2f} seconds for op_approach")
    return kmer_df

def kmer_index_approach(df):
    # Explode a column of k-mer start positions, then slice the original
    # sequence at each start; no grouping or filtering step needed
    start = time.time()
    kmer_df = df.with_columns(
        pl.int_ranges(0, pl.col("length").sub(ksize) + 1).alias("kmer_starts")
    ).explode("kmer_starts").with_columns(
        pl.col("sequence").str.slice("kmer_starts", ksize).alias("kmers")
    )
    print(f"Took {time.time()-start:.2f} seconds for kmer_index_approach")
    return kmer_df

def map_elements_approach(df):
    # Stolen nearly directly from https://github.com/pola-rs/polars/issues/10833#issuecomment-1703894870
    start = time.time()

    def create_cngram(message, ngram=3):
        if ngram <= 0:
            return []
        # All length-ngram substrings of message, one per start position
        return [message[i:i + ngram] for i in range(len(message) - ngram + 1)]

    kmer_df = df.with_columns(
        pl.col("sequence").map_elements(
            lambda message: create_cngram(message=message, ngram=ksize),
            return_dtype=pl.List(pl.String),
        ).alias("kmers")
    ).explode("kmers")
    print(f"Took {time.time()-start:.2f} seconds for map_elements_approach")
    return kmer_df

op_res = op_approach(df)
kmer_index_res = kmer_index_approach(df)
map_res = map_elements_approach(df)

# All three approaches should produce the same multiset of k-mers
assert op_res["kmers"].sort().equals(map_res["kmers"].sort())
assert op_res["kmers"].sort().equals(kmer_index_res["kmers"].sort())
The kmer_index_approach is inspired by your use of str.slice, which I think is cool, but it avoids having to do any grouping: it first explodes a new column of start indices, which might require less memory than replicating the entire sequence before replacing it with a k-mer. It also avoids the filtering step to remove partial k-mers. The one downside is an extra kmer_starts column, which needs to be removed afterwards.
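If you want the output tidy, that helper column can be dropped afterwards; a minimal sketch (drop is standard polars API):

# Drop the helper column once the k-mers have been extracted
kmer_index_res = kmer_index_approach(df).drop("kmer_starts")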
The map_elements_approach is based on the approach mentioned in the GitHub issue, where mmantyla uses map/apply to simply run a plain Python function over every element.
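For a concrete picture of what that helper produces, here is a tiny standalone example (same logic as the nested create_cngram above, with ngram=3 instead of 24 for readability):

def create_cngram(message, ngram=3):
    # All length-ngram substrings of message, one per start position
    return [message[i:i + ngram] for i in range(len(message) - ngram + 1)]

print(create_cngram("ACGTAC", ngram=3))  # ['ACG', 'CGT', 'GTA', 'TAC']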
I'm personally surprised that the map_elements approach is the fastest, and by a large margin, but again I don't know whether there's a different, better approach based on the pull request you shared.
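For what it's worth, a single time.time() measurement per approach can be noisy; here is a sketch (my own addition, not part of the original approaches) that uses the standard library's timeit to take the best of a few runs:

import timeit

# Best-of-3 wall-clock time for each approach
# (each call still prints its own internal timing line)
for fn in (op_approach, kmer_index_approach, map_elements_approach):
    best = min(timeit.repeat(lambda: fn(df), number=1, repeat=3))
    print(f"{fn.__name__}: best of 3 runs = {best:.2f}s")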