No, your Masking layer does not mask your loss: the LSTM(return_sequences=False) layer breaks mask propagation, because an RNN that returns only its final state emits no output mask. Everything downstream, including the built-in loss function, therefore runs unmasked and will incorrectly try to reconstruct the [0, 0, 0] padding steps.
One solution is to use a custom loss function and a custom metric that manually ignore the padded timesteps.
Q1: Is there any way to visualize that this is exactly how it is working?
You can verify that the mask is lost by checking output_mask on the LSTM(return_sequences=False) layer (model.layers[2] in the model below): it will be None, while the layers before it still report a mask tensor.
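For example, a minimal sketch, assuming the TF 2.x (Keras 2) output_mask property and run after building the model below:

```python
# Print each layer's propagated mask: expect a mask tensor up to the
# Bidirectional(LSTM(32, return_sequences=False)) layer, then None.
for i, layer in enumerate(model.layers):
    print(i, layer.name, getattr(layer, "output_mask", None))
```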
Adjacent Q2: Is there a simple way to calculate the loss later on with the same masking...
Yes: the custom loss function below is that simple way. Because the mask is rebuilt from y_true inside the loss itself (a padded timestep is one whose target vector is all zeros), model.fit() and model.evaluate() both compute the correctly masked loss automatically.
```python
import tensorflow as tf
import tensorflow.keras.backend as K

def masked_cosine_distance(y_true, y_pred):
    # A timestep is "real" if its target vector is not all zeros.
    mask = K.cast(K.greater(K.sum(K.abs(y_true), axis=-1), 1e-9), K.floatx())
    y_true_normalized = K.l2_normalize(y_true, axis=-1)
    y_pred_normalized = K.l2_normalize(y_pred, axis=-1)
    # Cosine similarity per timestep; distance = 1 - similarity.
    similarity = K.sum(y_true_normalized * y_pred_normalized, axis=-1)
    loss_per_timestep = 1 - similarity
    # Zero out padded timesteps and average over the real ones only.
    masked_loss = loss_per_timestep * mask
    sum_of_losses = K.sum(masked_loss)
    num_unmasked_steps = K.sum(mask)
    return sum_of_losses / (num_unmasked_steps + K.epsilon())

def masked_cosine_similarity(y_true, y_pred):
    # Same masking logic as the loss, reported as a similarity metric.
    mask = K.cast(K.greater(K.sum(K.abs(y_true), axis=-1), 1e-9), K.floatx())
    y_true_normalized = K.l2_normalize(y_true, axis=-1)
    y_pred_normalized = K.l2_normalize(y_pred, axis=-1)
    similarity = K.sum(y_true_normalized * y_pred_normalized, axis=-1)
    masked_similarity = similarity * mask
    sum_of_similarity = K.sum(masked_similarity)
    num_unmasked_steps = K.sum(mask)
    return sum_of_similarity / (num_unmasked_steps + K.epsilon())
```
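As a quick sanity check (a toy example with made-up numbers), a padded timestep contributes nothing to either value, even when the model predicts garbage there:

```python
import numpy as np

# One window of two timesteps; the second step is [0, 0] padding.
y_true = np.array([[[1.0, 0.0], [0.0, 0.0]]], dtype="float32")
# Perfect prediction on the real step, garbage on the padded step.
y_pred = np.array([[[1.0, 0.0], [0.3, 0.7]]], dtype="float32")

print(masked_cosine_distance(y_true, y_pred).numpy())    # ~0.0
print(masked_cosine_similarity(y_true, y_pred).numpy())  # ~1.0
```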
```python
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(window_size, line_feature_size)),
    tf.keras.layers.Masking(mask_value=0.0),  # still useful: lets the encoder LSTMs skip padded steps
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=False)),
    tf.keras.layers.RepeatVector(window_size),
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(line_feature_size))
])

model.compile(optimizer="adam",
              loss=masked_cosine_distance,
              metrics=[masked_cosine_similarity])
```
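For completeness, a hypothetical end-to-end run; window_size = 10, line_feature_size = 8, and the random data are placeholder assumptions (the two sizes must be set before building the model above):

```python
X = np.random.rand(32, window_size, line_feature_size).astype("float32")
X[:, 7:, :] = 0.0  # zero-pad the last three steps of every window

# Autoencoder setup: targets are the inputs themselves. Both the training
# loss and the evaluation metric ignore the padded steps.
model.fit(X, X, epochs=2, batch_size=8)
print(model.evaluate(X, X, verbose=0))
```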