If you have a model and stack that structurally expect a text input and need a tokenization layer inside the model graph, rather than pre-tokenizing the text before sending it through the graph, it turns out to be surprisingly hard to make this work.
Here is a concrete example:
# Model that we want to work
import tensorflow as tf

inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
# gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length)
gp2_tokenizer = NewTokenizerLayer(max_seq_length=max_seq_length,
                                  tokenizer_checkpoint=tokenizer_checkpoint)
token_ids = gp2_tokenizer(inp)
embedded = tf.keras.layers.Embedding(...)(token_ids)
flattened = tf.keras.layers.Flatten()(embedded)
base_model = tf.keras.Model(inputs=inp, outputs=flattened)
# A second model (a logistic regression head) will take base_model's
# output as its input; a sketch of that second model follows below.
# (This basic setup works with the GPT2 tokenizer in KerasNLP,
# but fails when trying to do the same basic thing with the HF tokenizer.)
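For concreteness, here is a minimal sketch of what that second model could look like; the Dense head and compile settings are my assumptions, not code from the original setup:

# Hypothetical downstream model: logistic regression on the flattened
# embeddings produced by base_model (Dense head is an assumption).
clf = tf.keras.Sequential([
    base_model,
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
clf.compile(optimizer="adam", loss="binary_crossentropy")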
# For reference, the code works and is validated when gp2_tokenizer
# is instantiated from the following class:
from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor

class TokenizerLayer(tf.keras.layers.Layer):
    def __init__(self, max_seq_length, **kwargs):
        super().__init__(**kwargs)
        self.tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en")
        self.preprocessor = GPT2Preprocessor(self.tokenizer,
                                             sequence_length=max_seq_length)
        self.max_seq_length = max_seq_length

    def call(self, inputs):
        prep = self.preprocessor([inputs])
        return prep['token_ids']

    def get_config(self):
        config = super().get_config()
        config.update({'max_seq_length': self.max_seq_length})
        return config

    @classmethod
    def from_config(cls, config):
        return cls(max_seq_length=config['max_seq_length'])
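A quick smoke test of that working layer behaves roughly like this (a sketch; max_seq_length=128 is my assumption for illustration):

# Sketch: the KerasNLP-backed layer maps a batch of strings to padded
# token ids (max_seq_length=128 assumed).
layer = TokenizerLayer(max_seq_length=128)
token_ids = layer(tf.constant(["The quick brown fox"]))
print(token_ids.shape)   # expected: (1, 128)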
# Simple case attempt:
# This fails because the Hugging Face tokenizer expects str or List[str],
# but in call() the Keras backend feeds it a symbolic tensor of strings.
from transformers import AutoTokenizer

class NewTokenizerLayer(tf.keras.layers.Layer):
    # __init__ (AutoTokenizer is my assumption) shown once; omitted below.
    def __init__(self, max_seq_length, tokenizer_checkpoint, **kwargs):
        super().__init__(**kwargs)
        self.max_seq_length = max_seq_length
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

    def call(self, inputs):
        # Fails: `inputs` is symbolic during tracing and has no .numpy().
        tokenized = self.tokenizer(inputs.numpy().astype("U").tolist(),
                                   max_length=self.max_seq_length,
                                   padding='max_length',
                                   truncation=True,
                                   return_tensors='tf',
                                   return_overflowing_tokens=False)
        return tokenized
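A minimal demonstration of what the layer actually receives (my addition, not output from the failing runs):

# During functional-model construction the layer's input is symbolic:
demo_inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
print(type(demo_inp))    # a symbolic tensor, not an EagerTensor
# demo_inp.numpy()       # AttributeError: no .numpy() on symbolic tensors
# [x for x in demo_inp]  # OperatorNotAllowedInGraphError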
# The logical next step seems to be converting to Python strings ... well, about that:
# Raises: OperatorNotAllowedInGraphError:
# Iterating over a symbolic `tf.Tensor` is not allowed
class NewTokenizerLayer(tf.keras.layers.Layer):
    # (same __init__ as above)
    def call(self, inputs):
        # Fails on the list comprehension, before the tokenizer is reached.
        inputs = [x.decode('utf-8') for x in inputs]
        tokenized = self.tokenizer(inputs.numpy().astype("U").tolist(),
                                   max_length=self.max_seq_length,
                                   padding='max_length',
                                   truncation=True,
                                   return_tensors='tf',
                                   return_overflowing_tokens=False)
        return tokenized
# Raises an AttributeError: the tensor has no attribute .numpy()
# (it is symbolic, not eager, at trace time).
class NewTokenizerLayer(tf.keras.layers.Layer):
    # (same __init__ as above)
    def call(self, inputs):
        inputs.numpy().astype('U').tolist()  # fails here first
        tokenized = self.tokenizer(inputs.numpy().astype("U").tolist(),
                                   max_length=self.max_seq_length,
                                   padding='max_length',
                                   truncation=True,
                                   return_tensors='tf',
                                   return_overflowing_tokens=False)
        return tokenized
# Raises: TypeError: Input 'input_values' of 'UnicodeEncode' Op has type
# string that does not match expected type of int32.
class NewTokenizerLayer(tf.keras.layers.Layer):
    # (same __init__ as above)
    def call(self, inputs):
        # unicode_encode goes code points -> strings, so a string tensor
        # is the wrong input type here.
        inputs = tf.strings.unicode_encode(inputs, 'UTF-8')
        tokenized = self.tokenizer(inputs.numpy().astype("U").tolist(),
                                   max_length=self.max_seq_length,
                                   padding='max_length',
                                   truncation=True,
                                   return_tensors='tf',
                                   return_overflowing_tokens=False)
        return tokenized
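For reference (my addition): tf.strings.unicode_encode maps integer Unicode code points to strings, not the other way around, which is why feeding it a string tensor produces the int32 type error above:

# tf.strings.unicode_encode expects integer Unicode code points:
codepoints = tf.ragged.constant([[72, 105], [84, 70]])  # "Hi", "TF"
print(tf.strings.unicode_encode(codepoints, "UTF-8"))   # [b'Hi' b'TF']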
... I have made several other attempts ...
If anyone knows how to get this to work, I would love to see it; otherwise it looks like I am going to have to develop a tokenizer that meets these requirements from scratch.
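One direction that may sidestep the symbolic-tensor problem (a sketch under assumptions, not something I have validated end-to-end): wrap the eager-only Hugging Face call in tf.py_function, so the strings are real eager tensors by the time the tokenizer runs. AutoTokenizer and the int32 output dtype are assumptions here.

import tensorflow as tf
from transformers import AutoTokenizer

class PyFuncTokenizerLayer(tf.keras.layers.Layer):
    # Hypothetical workaround sketch -- not validated end-to-end.
    def __init__(self, max_seq_length, tokenizer_checkpoint, **kwargs):
        super().__init__(**kwargs)
        self.max_seq_length = max_seq_length
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

    def _tokenize(self, inputs):
        # Runs eagerly inside tf.py_function, so .numpy() works here.
        texts = [t.decode("utf-8") for t in inputs.numpy()]
        tokenized = self.tokenizer(texts,
                                   max_length=self.max_seq_length,
                                   padding="max_length",
                                   truncation=True,
                                   return_tensors="tf")
        return tokenized["input_ids"]

    def call(self, inputs):
        token_ids = tf.py_function(self._tokenize, [inputs], Tout=tf.int32)
        # py_function loses static shape info; restore it for downstream layers.
        token_ids.set_shape([None, self.max_seq_length])
        return token_ids

The usual py_function caveats would apply: the op runs in the local Python runtime, is not serialized into a SavedModel graph, and will not parallelize like native TF ops, so this may only be acceptable for experimentation.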