I made progress:
import torch
import onnxruntime_extensions
import onnx
import onnxruntime as ort
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import subprocess
model_name = "spital/gpt2-small-czech-cs"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
input_text = "Téma: Umělá inteligence v moderní společnosti."
# Export the tokenizers to ONNX using gen_processing_models
onnx_tokenizer_coder_path = "results/v5/model/tokenizer_coder.onnx"
onnx_tokenizer_decoder_path = "results/v5/model/tokenizer_decoder.onnx"
# Generate the tokenizer encoder and decoder ONNX models
gen_tokenizer_coder_onnx_model = onnxruntime_extensions.gen_processing_models(tokenizer, pre_kwargs={})[0]
gen_tokenizer_decoder_onnx_model = onnxruntime_extensions.gen_processing_models(tokenizer, post_kwargs={})[1]
# Save the tokenizer ONNX models
with open(onnx_tokenizer_coder_path, "wb") as f:
    f.write(gen_tokenizer_coder_onnx_model.SerializeToString())
with open(onnx_tokenizer_decoder_path, "wb") as f:
    f.write(gen_tokenizer_decoder_onnx_model.SerializeToString())
# Export the Hugging Face model to ONNX with optimum-cli
onnx_model_path = "results/v5/model/"
command = [
"optimum-cli", "export", "onnx",
"-m", model_name,
"--opset", "18",
"--monolith",
"--task", "text-generation",
onnx_model_path
]
subprocess.run(command, check=True)
# Add position_ids as an extra output of the tokenizer coder, since the exported model expects them as an input
add_tokenizer_coder_onnx_model = onnx.load(onnx_tokenizer_coder_path)
shape_node = onnx.helper.make_node(
"Shape",
inputs=["input_ids"],
outputs=["input_shape"]
)
gather_node = onnx.helper.make_node(
"Gather",
inputs=["input_shape", "one"],
outputs=["sequence_length"],
axis=0
)
cast_node = onnx.helper.make_node(
"Cast",
inputs=["sequence_length"],
outputs=["sequence_length_int"],
to=onnx.TensorProto.INT64
)
# Create the Range node that generates the position_ids sequence
position_ids_node = onnx.helper.make_node(
"Range",
inputs=["zero", "sequence_length_int", "one"],
outputs=["shorter_position_ids"]
)
zero_const = onnx.helper.make_tensor("zero", onnx.TensorProto.INT64, [1], [0])
one_const = onnx.helper.make_tensor("one", onnx.TensorProto.INT64, [1], [1])
position_ids_output = onnx.helper.make_tensor_value_info(
"position_ids",
onnx.TensorProto.INT64,
["sequence_length"]
)
unsqueeze_axes = onnx.helper.make_tensor(
"unsqueeze_axes",
onnx.TensorProto.INT64,
dims=[1],
vals=[0]
)
expand_node = onnx.helper.make_node(
"Unsqueeze",
inputs=["shorter_position_ids", "unsqueeze_axes"],
outputs=["position_ids"]
)
expanded_position_ids_output = onnx.helper.make_tensor_value_info(
"position_ids",
onnx.TensorProto.INT64,
["batch_size", "sequence_length"]
)
# Add the new nodes, constants, and the position_ids output to the tokenizer coder graph
add_tokenizer_coder_onnx_model.graph.node.extend([shape_node, gather_node, cast_node, position_ids_node, expand_node])
add_tokenizer_coder_onnx_model.graph.output.append(expanded_position_ids_output)
add_tokenizer_coder_onnx_model.graph.initializer.extend([zero_const, one_const, unsqueeze_axes])
# Export tokenizer coder with position_ids for model
onnx.save(add_tokenizer_coder_onnx_model, onnx_tokenizer_coder_path)
# Add an ArgMax model to convert logits -> token ids
onnx_argmax_model_path = "results/v5/model/argmax.onnx"
ArgMax_node = onnx.helper.make_node(
"ArgMax",
inputs=["logits"],
outputs=["ids"],
axis=-1,
keepdims=0
)
# Creating ArgMax graph
ArgMax_graph = onnx.helper.make_graph(
[ArgMax_node],
"ArgMaxGraph",
[onnx.helper.make_tensor_value_info("logits", onnx.TensorProto.FLOAT, ["batch_size", "sequence_length", "vocab_size"])],
[onnx.helper.make_tensor_value_info("ids", onnx.TensorProto.INT64, ["batch_size", "sequence_length"])]
)
# Creating ArgMax ONNX model
gen_ArgMax_onnx_model = onnx.helper.make_model(ArgMax_graph)
# Exporting ArgMax ONNX model
onnx.save(gen_ArgMax_onnx_model, onnx_argmax_model_path)
# Adding shape for Tokenizer decoder outputs (Assuming shape with batch_size and sequence_length)
add_tokenizer_decoder_onnx_model = onnx.load(onnx_tokenizer_decoder_path)
expanded_shape = onnx.helper.make_tensor_value_info(
"str",
onnx.TensorProto.STRING,
["batch_size", "sequence_length"]
)
# Adding shape to Tokenizer decoder outputs
output_tensor = add_tokenizer_decoder_onnx_model.graph.output[0]
output_tensor.type.tensor_type.shape.dim.clear()
output_tensor.type.tensor_type.shape.dim.extend(expanded_shape.type.tensor_type.shape.dim)
# Exporting Tokenizer decoder with shape ONNX model
onnx.save(add_tokenizer_decoder_onnx_model, onnx_tokenizer_decoder_path)
# Test the tokenizer coder, model, ArgMax, and tokenizer decoder in separate inference sessions (with ONNX Runtime Extensions) before merging
# Initialize ONNX Runtime SessionOptions and load the custom ops library
sess_options = ort.SessionOptions()
sess_options.register_custom_ops_library(onnxruntime_extensions.get_library_path())
# Initialize ONNX Runtime Inference session with Extensions
coder = ort.InferenceSession(onnx_tokenizer_coder_path, sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
model = ort.InferenceSession(onnx_model_path + "model.onnx", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
ArgMax = ort.InferenceSession(onnx_argmax_model_path, sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
decoder = ort.InferenceSession(onnx_tokenizer_decoder_path, sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
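# Sanity check: the feed names used below (e.g. "input_text" and "ids") are assumptions,
# so list what the exported tokenizer graphs actually expose before running them.
print("Coder inputs:", [i.name for i in coder.get_inputs()], "| outputs:", [o.name for o in coder.get_outputs()])
print("Decoder inputs:", [i.name for i in decoder.get_inputs()], "| outputs:", [o.name for o in decoder.get_outputs()])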
# Prepare dummy input text
input_feed = {"input_text": np.asarray([input_text])} # Assuming "input_text" is the input expected by the tokenizers
# Run the tokenizer coder
tokenized = coder.run(None, input_feed)
print("Tokenized:", tokenized)
# Run the model
model_output = model.run(None, {"input_ids": tokenized[0], "attention_mask": tokenized[1], "position_ids": tokenized[2]})
print("Model output (logits):", model_output[0])
# Run the ArgMax
argmax_output = ArgMax.run(None, {"logits": model_output[0]})
print("ArgMax output (token ids):", argmax_output[0])
# Run the tokenizer decoder
detokenized = decoder.run(None, input_feed={"ids": argmax_output[0]})
print("Detokenized:", detokenized)
# Merge the tokenizer and model ONNX files into one
onnx_combined_model_path = "results/v5/model/combined_model_tokenizer.onnx"
# Load the tokenizers and model ONNX files
tokenizer_coder_onnx_model = onnx.load(onnx_tokenizer_coder_path)
model_onnx_model = onnx.load(onnx_model_path + "model.onnx")
ArgMax_onnx_model = onnx.load(onnx_argmax_model_path)
tokenizer_decoder_onnx_model = onnx.load(onnx_tokenizer_decoder_path)
# Inspect the ONNX models to find the correct input/output names
print("\nTokenizer coder Model Inputs:", [node.name for node in tokenizer_coder_onnx_model.graph.input])
print("Tokenizer coder Model Outputs:", [node.name for node in tokenizer_coder_onnx_model.graph.output])
print("Tokenizer coder Model Shape:", [node.type.tensor_type.shape for node in tokenizer_coder_onnx_model.graph.output])
print("Tokenizer coder Model Type:", [node.type.tensor_type.elem_type for node in tokenizer_coder_onnx_model.graph.output])
print("\nModel Inputs:", [node.name for node in model_onnx_model.graph.input])
print("Model Outputs:", [node.name for node in model_onnx_model.graph.output])
print("Model Shape:", [node.type.tensor_type.shape for node in model_onnx_model.graph.output])
print("Model Type:", [node.type.tensor_type.elem_type for node in model_onnx_model.graph.output])
print("\nArgMax Inputs:", [node.name for node in ArgMax_onnx_model.graph.input])
print("ArgMax Outputs:", [node.name for node in ArgMax_onnx_model.graph.output])
print("ArgMax Shape:", [node.type.tensor_type.shape for node in ArgMax_onnx_model.graph.output])
print("ArgMax Type:", [node.type.tensor_type.elem_type for node in ArgMax_onnx_model.graph.output])
print("\nTokenizer decoder Model Inputs:", [node.name for node in tokenizer_decoder_onnx_model.graph.input])
print("Tokenizer decoder Model Outputs:", [node.name for node in tokenizer_decoder_onnx_model.graph.output])
print("Tokenizer decoder Model Shape:", [node.type.tensor_type.shape for node in tokenizer_decoder_onnx_model.graph.output])
print("Tokenizer decoder Model Type:", [node.type.tensor_type.elem_type for node in tokenizer_decoder_onnx_model.graph.output])
# Merge the tokenizer coder and model ONNX files
combined_model = onnx.compose.merge_models(
tokenizer_coder_onnx_model,
model_onnx_model,
io_map=[('input_ids', 'input_ids'), ('attention_mask', 'attention_mask'), ('position_ids', 'position_ids')]
)
# Merge the model and ArgMax ONNX files
combined_model = onnx.compose.merge_models(
combined_model,
ArgMax_onnx_model,
io_map=[('logits', 'logits')]
)
# Merge the ArgMax and tokenizer decoder ONNX files
combined_model = onnx.compose.merge_models(
combined_model,
tokenizer_decoder_onnx_model,
io_map=[('ids', 'ids')]
)
# Check combined ONNX model
inferred_model = onnx.shape_inference.infer_shapes(combined_model)
onnx.checker.check_model(inferred_model)
# Save the combined model
onnx.save(combined_model, onnx_combined_model_path)
# Test the combined ONNX model using an Inference session with ONNX Runtime Extensions
# Initialize ONNX Runtime SessionOptions and load custom ops library
sess_options = ort.SessionOptions()
sess_options.register_custom_ops_library(onnxruntime_extensions.get_library_path())
# Initialize ONNX Runtime Inference session with Extensions
session = ort.InferenceSession(onnx_combined_model_path, sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
# Prepare dummy input text
input_feed = {"input_text": np.asarray([input_text])} # Assuming "input_text" is the input expected by the tokenizer
# Run the model
outputs = session.run(None, input_feed)
# Print the outputs (the pipeline ends with the tokenizer decoder, so this is decoded text, not raw logits)
print("Decoded output:", outputs)
This is an example of an all-in-one GPT-2 ONNX model (tokenizer coder + model + ArgMax + tokenizer decoder) with tests.
Now I am trying to make it generate more than one new token, because a single forward pass only predicts one next token per position and the resulting text doesn't make much sense. I checked onnx_gpt_loop, but I don't know how to implement it here. I also suspect onnx.helper has the tools for this as well (e.g. building a Loop).
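As a first step I am thinking about keeping the exported graphs single-step and driving generation from a plain Python greedy loop, merging everything only afterwards. Below is an untested sketch under the assumptions already used above (the coder outputs input_ids, attention_mask and position_ids in that order, the LM output is named "logits", the decoder input is named "ids"); max_new_tokens is an arbitrary budget and there is no KV cache or EOS check, so it only illustrates the loop:
# Hedged sketch (untested): Python-side greedy decoding around the exported pieces
import numpy as np
import onnxruntime as ort
import onnxruntime_extensions

sess_options = ort.SessionOptions()
sess_options.register_custom_ops_library(onnxruntime_extensions.get_library_path())
coder = ort.InferenceSession("results/v5/model/tokenizer_coder.onnx", sess_options=sess_options, providers=["CPUExecutionProvider"])
lm = ort.InferenceSession("results/v5/model/model.onnx", providers=["CPUExecutionProvider"])
decoder = ort.InferenceSession("results/v5/model/tokenizer_decoder.onnx", sess_options=sess_options, providers=["CPUExecutionProvider"])

input_text = "Téma: Umělá inteligence v moderní společnosti."
max_new_tokens = 20  # arbitrary budget, no EOS check in this sketch

# Tokenize once; discard the coder's position_ids, they are recomputed each step
input_ids, attention_mask, _ = coder.run(None, {"input_text": np.asarray([input_text])})
input_ids = input_ids.astype(np.int64)
attention_mask = attention_mask.astype(np.int64)

for _ in range(max_new_tokens):
    position_ids = np.arange(input_ids.shape[1], dtype=np.int64)[None, :]
    logits = lm.run(["logits"], {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "position_ids": position_ids,
    })[0]
    # Greedy pick: only the logits at the last position predict the next token
    next_id = np.argmax(logits[:, -1, :], axis=-1).astype(np.int64)[:, None]
    input_ids = np.concatenate([input_ids, next_id], axis=1)
    attention_mask = np.concatenate([attention_mask, np.ones_like(next_id)], axis=1)

# Decode the full sequence with the exported decoder
generated = decoder.run(None, {"ids": input_ids})
print("Generated:", generated)
If that behaves correctly, the same loop could in principle be moved inside the graph as an ONNX Loop built with onnx.helper (possibly what onnx_gpt_loop demonstrates), but I have not managed to implement that yet.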