Keras 3, Kaggle, CLI, Streaming #295

Draft · wants to merge 119 commits into base: main

Changes from 1 commit

Commits (119)
232a80c
feat(streaming): training, inference, gradient accumulation
nglehuy Jun 15, 2024
9ea9b87
fix: remove ds2 dropout on conv module
nglehuy Jun 18, 2024
7c21c1d
fix: add sync batch norm, remove wrong bn in ds2
nglehuy Jun 22, 2024
4e0e8f5
fix: only wrap tf.function in jit compile
nglehuy Jun 24, 2024
de38407
fix: use autograph do_not_convert for batchnorm sync to work
nglehuy Jun 24, 2024
43d6054
chore: config
nglehuy Jul 2, 2024
d55fd40
fix: update train/test step
nglehuy Jul 2, 2024
2f502bb
fix: nan to num
nglehuy Jul 2, 2024
7441c95
fix: update compute mask ds2
nglehuy Jul 2, 2024
f2241cf
fix: nan to num
nglehuy Jul 2, 2024
ebb6930
fix: ctc loss
nglehuy Jul 2, 2024
305ddab
fix: update train step
nglehuy Jul 3, 2024
8268afe
chore: config
nglehuy Jul 3, 2024
c9e4d38
chore: config
nglehuy Jul 3, 2024
9b46cbe
fix: ctc
nglehuy Jul 3, 2024
b5bfe92
fix: add custom batch norm to avoid tf.cond
nglehuy Jul 4, 2024
dc08e6a
fix: env utils
nglehuy Jul 4, 2024
cf37206
fix: log batch that cause invalid loss
nglehuy Jul 4, 2024
4596609
chore: buffer size
nglehuy Jul 4, 2024
ba9d6b2
fix: handle unknown dataset size with no metadata provided
nglehuy Jul 6, 2024
fe594ad
chore: add option use loss scale
nglehuy Jul 6, 2024
7aed458
fix: support log debug
nglehuy Jul 6, 2024
4330dec
fix: update gradient accumulation
nglehuy Jul 10, 2024
522f080
fix: ga
nglehuy Jul 14, 2024
90cabc2
feat: tf2.16 with keras 3
nglehuy Jul 14, 2024
8285f6d
fix: ga
nglehuy Jul 14, 2024
fd446d6
Merge branch 'tf2.16' into feat-streaming
nglehuy Jul 15, 2024
5037896
feat: fix layers, models to tf2.16 with keras 3
nglehuy Jul 15, 2024
a853eba
feat: update models to compatible with keras 3
nglehuy Jul 21, 2024
f8a7b91
fix: loss compute using add_loss, loss tracking
nglehuy Jul 28, 2024
03f0d60
fix: output shapes of models to log to summary
nglehuy Jul 28, 2024
8b0ed02
fix: contextnet
nglehuy Jul 28, 2024
77baaa5
fix: ds2
nglehuy Jul 28, 2024
3768878
fix: jasper
nglehuy Jul 28, 2024
f3cb239
fix: rnnt
nglehuy Jul 28, 2024
1d7e3a6
fix: transformer
nglehuy Jul 28, 2024
7be3eda
fix: update deps
nglehuy Jul 29, 2024
9b03b31
fix: requirements
nglehuy Aug 25, 2024
401180b
fix: super init
nglehuy Aug 25, 2024
786f5d4
fix: update regularizers
nglehuy Aug 25, 2024
a9d1733
fix: update regularizers
nglehuy Aug 25, 2024
689b366
fix: print shapes
nglehuy Nov 24, 2024
4e75c0f
fix: conformer ctc
nglehuy Nov 24, 2024
82d91c8
fix: add ctc tpu impl
nglehuy Nov 25, 2024
c915be3
fix: ctc tpu impl
nglehuy Nov 25, 2024
dda33b7
fix: save weights, tpu connect
nglehuy Nov 28, 2024
c667984
fix: save weights, tpu connect
nglehuy Nov 28, 2024
dc77b84
fix: update req
nglehuy Dec 3, 2024
d455ae1
fix: update req
nglehuy Dec 3, 2024
33394a2
fix: update req
nglehuy Dec 3, 2024
35160ce
fix: update req
nglehuy Dec 3, 2024
6ffb3b8
fix: update req
nglehuy Dec 3, 2024
bb732a7
fix: update req
nglehuy Dec 4, 2024
9179425
fix: strategy scope
nglehuy Dec 4, 2024
ace3887
fix: requirements
nglehuy Dec 4, 2024
67a8470
fix: update savings
nglehuy Dec 7, 2024
9824819
feat: bundle scripts inside package
nglehuy Dec 29, 2024
05b068b
feat: introduce chunk-wise masking for mha layer
nglehuy Dec 31, 2024
1edf16a
feat: introduce chunk-wise masking to conformer & transformer
nglehuy Dec 31, 2024
6338f55
chore: update install script
nglehuy Jan 1, 2025
e844f77
chore: add conformer small streaming
nglehuy Jan 1, 2025
0543c31
chore: add conformer small streaming
nglehuy Jan 1, 2025
a2e2022
fix: use history size instead of memory length
nglehuy Jan 1, 2025
4824929
chore: update logging
nglehuy Jan 1, 2025
eca664c
fix: streaming masking mha
nglehuy Jan 1, 2025
a302962
fix: conformer ctc configs
nglehuy Jan 7, 2025
ab83d87
feat: add kaggle backup and restore callback
nglehuy Jan 11, 2025
52de4c0
fix: support flash attention, update deps
nglehuy Jan 12, 2025
aaa06a5
chore: add conformer-ctc-small-streaming-kaggle
nglehuy Jan 12, 2025
2fd4f2b
fix: restore from kaggle model
nglehuy Jan 12, 2025
6e5c3b9
fix: restore from kaggle model
nglehuy Jan 12, 2025
f62fdf9
fix: ignore backup kaggle when nan loss occurs
nglehuy Jan 12, 2025
2947160
fix: only use tqdm when needed
nglehuy Jan 12, 2025
c532d5c
fix: deps
nglehuy Jan 23, 2025
0e5e826
fix: support static shape
nglehuy Jan 24, 2025
eb55a4b
fix: mha streaming mask
nglehuy Jan 24, 2025
12fbb85
fix: feature extraction mixed precision, configs
nglehuy Jan 25, 2025
2100d75
fix: expose relmha_causal, flash attention
nglehuy Jan 25, 2025
a5d1e84
fix: allow ctc to force use native tf impl
nglehuy Jan 25, 2025
5ff3163
chore: list devices
nglehuy Jan 25, 2025
3dbab33
fix: attention mask
nglehuy Feb 9, 2025
1545543
fix: general layers to show outputshape, invalid loss show outputs
nglehuy Feb 15, 2025
5f784b7
fix: models configs
nglehuy Feb 15, 2025
70ac41e
fix: config streaming
nglehuy Feb 20, 2025
eebc361
fix: configs
nglehuy Feb 23, 2025
6b0bec4
fix: configs
nglehuy Feb 23, 2025
2ac8e7f
Merge branch 'main' into feat-streaming
nglehuy Mar 9, 2025
7dcd145
fix: streaming masking mha
nglehuy Mar 9, 2025
e209040
fix: streaming masking mha
nglehuy Mar 9, 2025
5649fdd
fix: update mha attention mask
nglehuy Mar 13, 2025
26c4a5f
feat: add support for layer norm in conformer conv module
nglehuy Mar 13, 2025
4f77a52
chore: update configs
nglehuy Mar 13, 2025
c3ab865
fix: feature extraction layer dtype tf.float32 to ensure loss converg…
nglehuy Mar 17, 2025
778c1a2
fix: ctc loss tpu - case logits to float32
nglehuy Mar 17, 2025
e68ceee
fix: use auto mask
nglehuy Mar 19, 2025
f1e2a88
fix: pad logits length to label length
nglehuy Mar 20, 2025
f1a0ed6
fix: ctc loss tpu
nglehuy Mar 20, 2025
6f7f246
chore: config
nglehuy Mar 21, 2025
d538e69
fix: disable bias/activity regularizer as not needed
nglehuy Mar 21, 2025
7611ff8
chore: config
nglehuy Mar 23, 2025
0c8e7c1
chore: setup mxp
nglehuy Mar 24, 2025
56d2afa
chore: setup mxp
nglehuy Mar 24, 2025
454163c
fix: small kaggle
nglehuy Mar 25, 2025
a333bfc
chore: transformer-ctc streaming
nglehuy Mar 25, 2025
cf435a3
chore: config
nglehuy Mar 27, 2025
c35af45
fix: ctc-tpu clean label
nglehuy Mar 30, 2025
0556481
chore: configs
nglehuy Mar 30, 2025
3e88f65
chore: configs
nglehuy Mar 30, 2025
111f3ac
chore: configs
nglehuy Mar 30, 2025
8ee9813
fix: train step
nglehuy Mar 30, 2025
dde7760
fix: apply ga loss division before loss scaling
nglehuy Mar 30, 2025
aade071
fix: update train function with ga steps
nglehuy Mar 30, 2025
dc0c304
fix: update train step ga
nglehuy Mar 30, 2025
d541928
chore: configs
nglehuy Mar 30, 2025
91a39a2
chore: configs
nglehuy Mar 30, 2025
2a40da6
chore: configs
nglehuy Mar 30, 2025
8bcf0f3
chore: configs
nglehuy Mar 30, 2025
de58fed
fix: rnn kwargs
nglehuy Mar 30, 2025
a05494a
chore: update
nglehuy Mar 31, 2025
feat: update models to compatible with keras 3
nglehuy committed Jul 28, 2024
commit a853eba3bac704f58e2a383b65b4995933722459
4 changes: 2 additions & 2 deletions examples/models/ctc/conformer/char-small.yml.j2
@@ -63,7 +63,7 @@ model_config:

learning_config:
optimizer_config:
class_name: Custom>Adam
class_name: Adam
config:
learning_rate:
class_name: tensorflow_asr.optimizers.schedules>TransformerSchedule
@@ -91,7 +91,7 @@ learning_config:
config: {}
- class_name: tensorflow_asr.callbacks>ModelCheckpoint
config:
filepath: {{modeldir}}/checkpoints/{epoch:02d}.h5
filepath: {{modeldir}}/checkpoints/{epoch:02d}.weights.h5
save_best_only: False
save_weights_only: True
save_freq: epoch
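A note on the two config edits repeated across these YAML files: Keras 3 requires that a ModelCheckpoint used with save_weights_only=True point at a filepath ending in ".weights.h5", and the stock Adam optimizer is resolved by its plain class name rather than a "Custom>Adam" registration. A minimal sketch of the equivalent Python calls (the path and learning rate below are illustrative, not the project's defaults):

import keras

optimizer = keras.optimizers.get({"class_name": "Adam", "config": {"learning_rate": 1e-3}})

checkpoint = keras.callbacks.ModelCheckpoint(
    filepath="/tmp/checkpoints/{epoch:02d}.weights.h5",  # a bare ".h5" suffix raises a ValueError in Keras 3
    save_best_only=False,
    save_weights_only=True,
    save_freq="epoch",
)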
4 changes: 2 additions & 2 deletions examples/models/ctc/deepspeech2/base.yml.j2
@@ -60,7 +60,7 @@ model_config:

learning_config:
optimizer_config:
class_name: Custom>Adam
class_name: Adam
config:
learning_rate:
class_name: ExponentialDecay
@@ -81,7 +81,7 @@ learning_config:
callbacks:
- class_name: tensorflow_asr.callbacks>ModelCheckpoint
config:
filepath: {{modeldir}}/checkpoints/{epoch:02d}.h5
filepath: {{modeldir}}/checkpoints/{epoch:02d}.weights.h5
save_best_only: False
save_weights_only: True
save_freq: epoch
4 changes: 2 additions & 2 deletions examples/models/ctc/deepspeech2/uni.yml.j2
@@ -59,7 +59,7 @@ model_config:

learning_config:
optimizer_config:
class_name: Custom>Adam
class_name: Adam
config:
learning_rate:
class_name: tensorflow_asr.optimizers.schedules>TransformerSchedule
@@ -84,7 +84,7 @@ learning_config:
callbacks:
- class_name: tensorflow_asr.callbacks>ModelCheckpoint
config:
filepath: {{modeldir}}/checkpoints/{epoch:02d}.h5
filepath: {{modeldir}}/checkpoints/{epoch:02d}.weights.h5
save_best_only: False
save_weights_only: True
save_freq: epoch
4 changes: 2 additions & 2 deletions examples/models/ctc/jasper/base.yml.j2
@@ -46,7 +46,7 @@ model_config:

learning_config:
optimizer_config:
class_name: Custom>Adam
class_name: Adam
config:
learning_rate: 0.001
beta_1: 0.9
@@ -66,7 +66,7 @@ learning_config:
config: {}
- class_name: tensorflow_asr.callbacks>ModelCheckpoint
config:
filepath: {{modeldir}}/checkpoints/{epoch:02d}.h5
filepath: {{modeldir}}/checkpoints/{epoch:02d}.weights.h5
save_best_only: False
save_weights_only: True
save_freq: epoch
4 changes: 2 additions & 2 deletions examples/models/ctc/transformer/base.yml.j2
@@ -57,7 +57,7 @@ model_config:

learning_config:
optimizer_config:
class_name: Custom>Adam
class_name: Adam
config:
learning_rate:
class_name: tensorflow_asr.optimizers.schedules>TransformerSchedule
@@ -83,7 +83,7 @@ learning_config:
config: {}
- class_name: tensorflow_asr.callbacks>ModelCheckpoint
config:
filepath: {{modeldir}}/checkpoints/{epoch:02d}.h5
filepath: {{modeldir}}/checkpoints/{epoch:02d}.weights.h5
save_best_only: False
save_weights_only: True
save_freq: epoch
6 changes: 3 additions & 3 deletions examples/models/transducer/conformer/small-nfft.yml.j2
@@ -45,7 +45,7 @@ model_config:
encoder_interleave_relpe: True
encoder_use_attention_causal_mask: False
encoder_use_attention_auto_mask: True
encoder_mhsam_use_attention_bias: True
encoder_mhsam_use_attention_bias: False
encoder_kernel_size: 32
encoder_dropout: 0.1
encoder_padding: causal
@@ -78,7 +78,7 @@ model_config:

learning_config:
optimizer_config:
class_name: Custom>Adam
class_name: Adam
config:
learning_rate:
class_name: tensorflow_asr.optimizers.schedules>TransformerSchedule
@@ -108,7 +108,7 @@ learning_config:
config: {}
- class_name: tensorflow_asr.callbacks>ModelCheckpoint
config:
filepath: {{modeldir}}/checkpoints/{epoch:02d}.h5
filepath: {{modeldir}}/checkpoints/{epoch:02d}.weights.h5
save_best_only: False
save_weights_only: True
save_freq: epoch
4 changes: 2 additions & 2 deletions examples/models/transducer/conformer/small-no-decay.yml.j2
@@ -77,7 +77,7 @@ model_config:

learning_config:
optimizer_config:
class_name: Custom>Adam
class_name: Adam
config:
learning_rate:
class_name: tensorflow_asr.optimizers.schedules>TransformerSchedule
@@ -105,7 +105,7 @@ learning_config:
config: {}
- class_name: tensorflow_asr.callbacks>ModelCheckpoint
config:
filepath: {{modeldir}}/checkpoints/{epoch:02d}.h5
filepath: {{modeldir}}/checkpoints/{epoch:02d}.weights.h5
save_best_only: False
save_weights_only: True
save_freq: epoch
4 changes: 2 additions & 2 deletions examples/models/transducer/conformer/small.yml.j2
@@ -77,7 +77,7 @@ model_config:

learning_config:
optimizer_config:
class_name: Custom>Adam
class_name: Adam
config:
learning_rate:
class_name: tensorflow_asr.optimizers.schedules>TransformerSchedule
@@ -107,7 +107,7 @@ learning_config:
config: {}
- class_name: tensorflow_asr.callbacks>ModelCheckpoint
config:
filepath: {{modeldir}}/checkpoints/{epoch:02d}.h5
filepath: {{modeldir}}/checkpoints/{epoch:02d}.weights.h5
save_best_only: False
save_weights_only: True
save_freq: epoch
4 changes: 2 additions & 2 deletions examples/models/transducer/contextnet/small.yml.j2
@@ -226,7 +226,7 @@ model_config:

learning_config:
optimizer_config:
class_name: Custom>Adam
class_name: Adam
config:
learning_rate:
class_name: tensorflow_asr.optimizers.schedules>TransformerSchedule
@@ -256,7 +256,7 @@ learning_config:
config: {}
- class_name: tensorflow_asr.callbacks>ModelCheckpoint
config:
filepath: {{modeldir}}/checkpoints/{epoch:02d}.h5
filepath: {{modeldir}}/checkpoints/{epoch:02d}.weights.h5
save_best_only: False
save_weights_only: True
save_freq: epoch
4 changes: 2 additions & 2 deletions examples/models/transducer/rnnt/small.yml.j2
@@ -57,7 +57,7 @@ model_config:

learning_config:
optimizer_config:
class_name: Custom>Adam
class_name: Adam
config:
learning_rate:
class_name: tensorflow_asr.optimizers.schedules>TransformerSchedule
@@ -87,7 +87,7 @@ learning_config:
config: {}
- class_name: tensorflow_asr.callbacks>ModelCheckpoint
config:
filepath: {{modeldir}}/checkpoints/{epoch:02d}.h5
filepath: {{modeldir}}/checkpoints/{epoch:02d}.weights.h5
save_best_only: False
save_weights_only: True
save_freq: epoch
4 changes: 2 additions & 2 deletions examples/models/transducer/rnnt/tiny.yml.j2
@@ -56,7 +56,7 @@ model_config:

learning_config:
optimizer_config:
class_name: Custom>Adam
class_name: Adam
config:
learning_rate:
class_name: tensorflow_asr.optimizers.schedules>TransformerSchedule
@@ -82,7 +82,7 @@ learning_config:
config: {}
- class_name: tensorflow_asr.callbacks>ModelCheckpoint
config:
filepath: {{modeldir}}/checkpoints/{epoch:02d}.h5
filepath: {{modeldir}}/checkpoints/{epoch:02d}.weights.h5
save_best_only: False
save_weights_only: True
save_freq: epoch
2 changes: 1 addition & 1 deletion examples/models/transducer/transformer/base.yml.j2
@@ -93,7 +93,7 @@ learning_config:
batch_size: 2
num_epochs: 300
checkpoint:
filepath: {{modeldir}}/checkpoints/{epoch:02d}.h5
filepath: {{modeldir}}/checkpoints/{epoch:02d}.weights.h5
save_best_only: False
save_weights_only: True
save_freq: epoch
3 changes: 3 additions & 0 deletions tensorflow_asr/losses/rnnt_loss.py
@@ -360,6 +360,9 @@ def rnnt_loss_tf(
orig_dtype = logits.dtype
if orig_dtype in (tf.float16, tf.bfloat16):
logits = tf.cast(logits, tf.float32)
logit_length = tf.cast(logit_length, tf.int32)
labels = tf.cast(labels, tf.int32)
label_length = tf.cast(label_length, tf.int32)

args = [logits, labels, label_length, logit_length]
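The added casts make the loss inputs' dtypes explicit before the heavy math: logits are promoted to float32 when a mixed-precision policy hands in float16/bfloat16, and labels and lengths are forced to int32. A rough, self-contained sketch of that normalization step (a simplified mirror of the lines above, not the full rnnt_loss_tf function):

import tensorflow as tf

def normalize_rnnt_loss_inputs(logits, labels, logit_length, label_length):
    # Mixed precision may produce float16/bfloat16 logits; the loss is computed in float32.
    if logits.dtype in (tf.float16, tf.bfloat16):
        logits = tf.cast(logits, tf.float32)
    # Label and length tensors are cast to int32, the integer dtype the loss expects.
    logit_length = tf.cast(logit_length, tf.int32)
    labels = tf.cast(labels, tf.int32)
    label_length = tf.cast(label_length, tf.int32)
    return logits, labels, logit_length, label_length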

14 changes: 4 additions & 10 deletions tensorflow_asr/models/base_layer.py
@@ -36,13 +36,7 @@ def call(self, inputs):
outputs = math_util.merge_two_last_dims(outputs)
return outputs, outputs_length

def compute_output_shape(self, input_shape):
output_shape, output_length_shape = input_shape
output_shape = output_shape[:2] + (output_shape[2] * output_shape[3],)
return output_shape, output_length_shape


@keras.utils.register_keras_serializable(package=__name__)
class Identity(Layer):
def call(self, inputs):
return inputs
# def compute_output_shape(self, input_shape):
# output_shape, output_length_shape = input_shape
# output_shape = output_shape[:2] + (output_shape[2] * output_shape[3],)
# return output_shape, output_length_shape
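The hand-written compute_output_shape and the custom Identity layer are dropped here because Keras 3 ships a built-in keras.layers.Identity and can typically infer output shapes for layers built from standard ops. A minimal sketch of the built-in replacement (the tensor below is only for illustration):

import keras

iden = keras.layers.Identity(name="iden")
x = keras.random.normal((2, 4, 8))
y = iden(x)  # no-op: y is x unchanged, shape (2, 4, 8)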
149 changes: 92 additions & 57 deletions tensorflow_asr/models/encoders/conformer.py
@@ -16,7 +16,7 @@

from tensorflow_asr import keras, tf
from tensorflow_asr.models.activations.glu import GLU
from tensorflow_asr.models.base_layer import Identity, Layer
from tensorflow_asr.models.base_layer import Layer
from tensorflow_asr.models.layers.convolution import DepthwiseConv1D
from tensorflow_asr.models.layers.multihead_attention import MultiHeadAttention, MultiHeadRelativeAttention
from tensorflow_asr.models.layers.positional_encoding import RelativeSinusoidalPositionalEncoding, SinusoidalPositionalEncoding
@@ -61,7 +61,7 @@ def __init__(
self.pre_norm = (
keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
if norm_position == "pre"
else Identity(name="preiden" if norm_position == "none" else "iden", dtype=self.dtype)
else keras.layers.Identity(name="preiden" if norm_position == "none" else "iden", dtype=self.dtype)
)
self.ffn1 = keras.layers.Dense(
units=scale_factor * input_dim,
@@ -83,7 +83,7 @@ def __init__(
self.post_norm = (
keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
if norm_position == "post"
else Identity(name="postiden" if norm_position == "none" else "iden", dtype=self.dtype)
else keras.layers.Identity(name="postiden" if norm_position == "none" else "iden", dtype=self.dtype)
)
self.residual = Residual(factor=residual_factor, regularizer=bias_regularizer, name="residual", dtype=self.dtype)

@@ -94,11 +94,11 @@ def call(self, inputs, training=False):
outputs = self.ffn2(outputs, training=training)
outputs = self.do2(outputs, training=training)
outputs = self.post_norm(outputs, training=training)
outputs = self.residual([inputs, outputs], training=training)
outputs = self.residual((inputs, outputs), training=training)
return outputs

def compute_output_shape(self, input_shape):
return input_shape
# def compute_output_shape(self, input_shape):
# return input_shape


@keras.utils.register_keras_serializable(package=__name__)
@@ -139,7 +139,7 @@ def __init__(
self.pre_norm = (
keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
if norm_position == "pre"
else Identity(name="preiden" if norm_position == "none" else "iden", dtype=self.dtype)
else keras.layers.Identity(name="preiden" if norm_position == "none" else "iden", dtype=self.dtype)
)
if mha_type == "relmha":
self.mha = MultiHeadRelativeAttention(
@@ -169,7 +169,7 @@ def __init__(
self.post_norm = (
keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
if norm_position == "post"
else Identity(name="postiden" if norm_position == "none" else "iden", dtype=self.dtype)
else keras.layers.Identity(name="postiden" if norm_position == "none" else "iden", dtype=self.dtype)
)
self.residual = Residual(factor=residual_factor, regularizer=bias_regularizer, name="residual", dtype=self.dtype)

@@ -179,30 +179,50 @@ def get_initial_state(self, batch_size: int):
def call(
self,
inputs,
content_attention_bias=None,
positional_attention_bias=None,
initial_state=None,
training=False,
attention_mask=None,
use_causal_mask=False,
use_auto_mask=True,
return_states=False,
):
_inputs, relative_position_encoding, content_attention_bias, positional_attention_bias = inputs
_inputs, relative_position_encoding = inputs
outputs = self.pre_norm(_inputs, training=training)
outputs, states = self.mha(
[outputs, outputs, outputs, relative_position_encoding, content_attention_bias, positional_attention_bias],
outputs, *states = self.mha(
[outputs, outputs, outputs, relative_position_encoding],
content_attention_bias=content_attention_bias,
positional_attention_bias=positional_attention_bias,
initial_state=initial_state,
training=training,
attention_mask=attention_mask,
use_causal_mask=use_causal_mask,
use_auto_mask=use_auto_mask,
return_states=return_states,
)
outputs = self.do(outputs, training=training)
outputs = self.post_norm(outputs, training=training)
outputs = self.residual([_inputs, outputs], training=training)
return outputs, states

def compute_output_shape(self, input_shape):
output_shape, *_ = input_shape
return output_shape
outputs = self.residual((_inputs, outputs), training=training)
if return_states:
return [outputs] + states
return [outputs]

# def compute_output_shape(self, input_shape):
# output_shape, *_ = input_shape
# return output_shape

# def compute_output_spec(
# self,
# inputs,
# initial_state=None,
# attention_mask=None,
# use_causal_mask=False,
# use_auto_mask=True,
# ):
# return self.mha.compute_output_spec(
# inputs, attention_mask=attention_mask, use_causal_mask=use_causal_mask, use_auto_mask=use_auto_mask, initial_state=initial_state
# )


@keras.utils.register_keras_serializable(package=__name__)
@@ -247,7 +267,7 @@ def __init__(
self.pre_norm = (
keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
if norm_position == "pre"
else Identity(name="preiden" if norm_position == "none" else "iden", dtype=self.dtype)
else keras.layers.Identity(name="preiden" if norm_position == "none" else "iden", dtype=self.dtype)
)
self.pw_conv_1 = keras.layers.Conv1D(
filters=scale_factor * input_dim,
@@ -304,7 +324,7 @@ def __init__(
self.post_norm = (
keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
if norm_position == "post"
else Identity(name="postiden" if norm_position == "none" else "iden", dtype=self.dtype)
else keras.layers.Identity(name="postiden" if norm_position == "none" else "iden", dtype=self.dtype)
)
self.residual = Residual(factor=residual_factor, regularizer=bias_regularizer, name="residual", dtype=self.dtype)

@@ -318,11 +338,11 @@ def call(self, inputs, training=False):
outputs = self.pw_conv_2(outputs, training=training)
outputs = self.do(outputs, training=training)
outputs = self.post_norm(outputs, training=training)
outputs = self.residual([inputs, outputs], training=training)
outputs = self.residual((inputs, outputs), training=training)
return outputs

def compute_output_shape(self, input_shape):
return input_shape
# def compute_output_shape(self, input_shape):
# return input_shape


@keras.utils.register_keras_serializable(package=__name__)
@@ -366,7 +386,7 @@ def __init__(
self.pre_norm = (
keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
if block_norm_position == "pre"
else Identity(name="preiden" if block_norm_position == "none" else "iden", dtype=self.dtype)
else keras.layers.Identity(name="preiden" if block_norm_position == "none" else "iden", dtype=self.dtype)
)
self.ffm1 = FFModule(
input_dim=input_dim,
@@ -423,7 +443,7 @@ def __init__(
self.post_norm = (
keras.layers.LayerNormalization(name="ln", gamma_regularizer=kernel_regularizer, beta_regularizer=kernel_regularizer, dtype=self.dtype)
if block_norm_position == "post"
else Identity(name="postiden" if block_norm_position == "none" else "iden", dtype=self.dtype)
else keras.layers.Identity(name="postiden" if block_norm_position == "none" else "iden", dtype=self.dtype)
)

def get_initial_state(self, batch_size: int):
@@ -432,31 +452,39 @@ def get_initial_state(self, batch_size: int):
def call(
self,
inputs,
content_attention_bias=None,
positional_attention_bias=None,
initial_state=None,
training=False,
attention_mask=None,
use_causal_mask=False,
use_auto_mask=True,
return_states=False,
):
inputs, relative_position_encoding, content_attention_bias, positional_attention_bias = inputs
outputs = self.pre_norm(inputs, training=training)
_inputs, relative_position_encoding = inputs
outputs = self.pre_norm(_inputs, training=training)
outputs = self.ffm1(outputs, training=training)
outputs, states = self.mhsam(
[outputs, relative_position_encoding, content_attention_bias, positional_attention_bias],
outputs, *states = self.mhsam(
[outputs, relative_position_encoding],
content_attention_bias=content_attention_bias,
positional_attention_bias=positional_attention_bias,
initial_state=initial_state,
training=training,
attention_mask=attention_mask,
use_causal_mask=use_causal_mask,
use_auto_mask=use_auto_mask,
return_states=return_states,
)
outputs = self.convm(outputs, training=training)
outputs = self.ffm2(outputs, training=training)
outputs = self.post_norm(outputs, training=training)
return outputs, states
if return_states:
return [outputs] + states
return [outputs]

def compute_output_shape(self, input_shape):
output_shape, *_ = input_shape
return output_shape
# def compute_output_shape(self, input_shape):
# output_shape, *_ = input_shape
# return output_shape


@keras.utils.register_keras_serializable(package=__name__)
@@ -578,29 +606,36 @@ def __init__(
else:
self.content_attention_bias, self.positional_attention_bias = None, None

def call(self, inputs, initial_state=None, training=False):
def call(
self,
inputs,
initial_state=None,
training=False,
return_states=False,
):
outputs, outputs_length = inputs
outputs, outputs_length = self.conv_subsampling([outputs, outputs_length], training=training)
outputs, outputs_length = self.conv_subsampling((outputs, outputs_length), training=training)
outputs = self.linear(outputs, training=training)
outputs = self.do(outputs, training=training)
outputs, relative_position_encoding = self.relpe([outputs, outputs_length], training=training)
outputs, relative_position_encoding = self.relpe((outputs, outputs_length), training=training)
states = None if self._memory_length is None else []
for i, cblock in enumerate(self.conformer_blocks):
outputs, _states = cblock(
[
outputs,
relative_position_encoding,
self.content_attention_bias,
self.positional_attention_bias,
],
outputs, *_states = cblock(
(outputs, relative_position_encoding),
content_attention_bias=self.content_attention_bias,
positional_attention_bias=self.positional_attention_bias,
initial_state=None if initial_state is None else initial_state[i],
training=training,
use_causal_mask=self._use_attention_causal_mask,
use_auto_mask=self._use_attention_auto_mask,
return_states=return_states,
)
if states is not None:
states.append(_states)
return outputs, outputs_length, states
if not states:
continue
states.extend(_states)
if return_states:
return outputs, outputs_length, states
return outputs, outputs_length

def call_next(self, features, features_length, previous_encoder_states, *args, **kwargs):
"""
@@ -617,17 +652,17 @@ def call_next(self, features, features_length, previous_encoder_states, *args, *
Outputs, outputs_length, new_states
"""
with tf.name_scope(f"{self.name}_call_next"):
return self.call((features, features_length), initial_state=previous_encoder_states, training=False)
return self((features, features_length), initial_state=previous_encoder_states, training=False, return_states=True)

def compute_mask(self, inputs, mask=None):
return *self.conv_subsampling.compute_mask(inputs, mask=mask), None

def compute_output_shape(self, input_shape):
output_shape, output_length_shape = input_shape
output_shape, output_length_shape = self.conv_subsampling.compute_output_shape((output_shape, output_length_shape))
output_shape = self.linear.compute_output_shape(output_shape)
output_shape, relative_position_encoding_shape = self.relpe.compute_output_shape((output_shape, output_length_shape))
output_shape = self.do.compute_output_shape(output_shape)
for cblock in self.conformer_blocks:
output_shape = cblock.compute_output_shape((output_shape, relative_position_encoding_shape, None, None))
return output_shape, output_length_shape
return self.conv_subsampling.compute_mask(inputs, mask=mask)

# def compute_output_shape(self, input_shape):
# output_shape, output_length_shape = input_shape
# output_shape, output_length_shape = self.conv_subsampling.compute_output_shape((output_shape, output_length_shape))
# output_shape = self.linear.compute_output_shape(output_shape)
# output_shape, relative_position_encoding_shape = self.relpe.compute_output_shape((output_shape, output_length_shape))
# output_shape = self.do.compute_output_shape(output_shape)
# for cblock in self.conformer_blocks:
# output_shape = cblock.compute_output_shape((output_shape, relative_position_encoding_shape, None, None))
# return output_shape, output_length_shape
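Across these encoder changes, the attention biases and streaming states move from positional inputs to keyword arguments, and each block now returns a plain list: [outputs] in the ordinary training path and [outputs] + states only when return_states=True, which is why callers unpack with `outputs, *states = block(...)`. A toy layer sketching that calling convention (hypothetical, not the repository's Conformer block):

import keras

class ToyStreamingBlock(keras.layers.Layer):
    """Illustrates the `[outputs]` vs `[outputs] + states` return convention."""

    def call(self, inputs, initial_state=None, return_states=False):
        outputs = inputs  # a real block would transform `inputs` here
        if return_states:
            new_state = outputs[:, -1:]  # pretend the last frame is the state carried to the next chunk
            return [outputs, new_state]
        return [outputs]

# Training: `outputs, *states = block(x)` leaves `states == []`.
# Streaming inference: `outputs, *states = block(x, return_states=True)` carries the state forward.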
32 changes: 16 additions & 16 deletions tensorflow_asr/models/encoders/deepspeech2.py
@@ -13,7 +13,7 @@
# limitations under the License.

from tensorflow_asr import keras, tf
from tensorflow_asr.models.base_layer import Identity, Layer, Reshape
from tensorflow_asr.models.base_layer import Layer, Reshape
from tensorflow_asr.models.layers.convolution import DepthwiseConv1D
from tensorflow_asr.utils import layer_util, math_util

@@ -121,12 +121,12 @@ def compute_mask(self, inputs, mask=None):
mask = tf.sequence_mask(outputs_length, maxlen=maxlen, dtype=tf.bool)
return mask, None

def compute_output_shape(self, input_shape):
output_shape, output_length_shape = input_shape
output_shape = self.conv.compute_output_shape(output_shape)
output_shape = self.bn.compute_output_shape(output_shape)
output_shape = self.act.compute_output_shape(output_shape)
return output_shape, output_length_shape
# def compute_output_shape(self, input_shape):
# output_shape, output_length_shape = input_shape
# output_shape = self.conv.compute_output_shape(output_shape)
# output_shape = self.bn.compute_output_shape(output_shape)
# output_shape = self.act.compute_output_shape(output_shape)
# return output_shape, output_length_shape


@keras.utils.register_keras_serializable(package=__name__)
@@ -148,7 +148,7 @@ def __init__(
assert conv_type in ("conv1d", "conv2d")
assert len(kernels) == len(strides) == len(filters)

self.pre = Reshape(name="preprocess", dtype=self.dtype) if conv_type == "conv1d" else Identity(name="iden", dtype=self.dtype)
self.pre = Reshape(name="preprocess", dtype=self.dtype) if conv_type == "conv1d" else keras.layers.Identity(name="iden", dtype=self.dtype)

self.convs = []
self.time_reduction_factor = 1
@@ -169,7 +169,7 @@ def __init__(
self.convs.append(conv_block)
self.time_reduction_factor *= conv_block.time_reduction_factor

self.post = Reshape(name="postprocess", dtype=self.dtype) if conv_type == "conv2d" else Identity(name="iden", dtype=self.dtype)
self.post = Reshape(name="postprocess", dtype=self.dtype) if conv_type == "conv2d" else keras.layers.Identity(name="iden", dtype=self.dtype)

def call(self, inputs, training=False):
outputs = self.pre(inputs, training=training)
@@ -178,13 +178,13 @@ def call(self, inputs, training=False):
outputs = self.post(outputs, training=training)
return outputs

def compute_output_shape(self, input_shape):
output_shape = input_shape
output_shape = self.pre.compute_output_shape(output_shape)
for conv in self.convs:
output_shape = conv.compute_output_shape(output_shape)
output_shape = self.post.compute_output_shape(output_shape)
return output_shape
# def compute_output_shape(self, input_shape):
# output_shape = input_shape
# output_shape = self.pre.compute_output_shape(output_shape)
# for conv in self.convs:
# output_shape = conv.compute_output_shape(output_shape)
# output_shape = self.post.compute_output_shape(output_shape)
# return output_shape


# ------------------------------------ RNN ----------------------------------- #
6 changes: 2 additions & 4 deletions tensorflow_asr/models/layers/blurpool.py
@@ -30,10 +30,9 @@ def __init__(
trainable=True,
name="blurpool2d",
dtype=None,
dynamic=False,
**kwargs,
):
super().__init__(trainable, name, dtype, dynamic, **kwargs)
super().__init__(trainable=trainable, name=name, dtype=dtype, **kwargs)
self.filters = filters
self.kernel_size = kernel_size
self.strides = strides
@@ -88,10 +87,9 @@ def __init__(
trainable=True,
name="blurpool1d",
dtype=None,
dynamic=False,
**kwargs,
):
super().__init__(trainable, name, dtype, dynamic, **kwargs)
super().__init__(trainable=trainable, name=name, dtype=dtype, **kwargs)
self.filters = filters
self.kernel_size = kernel_size
self.strides = strides
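The BlurPool layers switch to keyword arguments in super().__init__ and drop the dynamic flag because the Keras 3 base Layer constructor is keyword-only and no longer accepts dynamic. A minimal sketch of the pattern (the class name and attribute are placeholders):

import keras

class MyPool1D(keras.layers.Layer):
    def __init__(self, filters, trainable=True, name="mypool1d", dtype=None, **kwargs):
        # Keras 3: pass base-layer options by keyword; there is no `dynamic` argument anymore.
        super().__init__(trainable=trainable, name=name, dtype=dtype, **kwargs)
        self.filters = filters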
16 changes: 8 additions & 8 deletions tensorflow_asr/models/layers/embedding.py
@@ -52,10 +52,10 @@ def compute_mask(self, inputs, mask=None):
mask = tf.sequence_mask(outputs_length, maxlen=tf.shape(outputs)[1], dtype=tf.bool)
return mask, None

def compute_output_shape(self, input_shape):
output_shape, output_length_shape = input_shape
output_shape = super().compute_output_shape(output_shape)
return output_shape, output_length_shape
# def compute_output_shape(self, input_shape):
# output_shape, output_length_shape = input_shape
# output_shape = super().compute_output_shape(output_shape)
# return output_shape, output_length_shape


@keras.utils.register_keras_serializable(package=__name__)
@@ -87,7 +87,7 @@ def compute_mask(self, inputs, mask=None):
mask = tf.sequence_mask(outputs_length, maxlen=tf.shape(outputs)[1], dtype=tf.bool)
return mask, None

def compute_output_shape(self, input_shape):
output_shape, output_length_shape = input_shape
output_shape = output_shape + (self.depth,)
return output_shape, output_length_shape
# def compute_output_shape(self, input_shape):
# output_shape, output_length_shape = input_shape
# output_shape = output_shape + (self.depth,)
# return output_shape, output_length_shape
16 changes: 8 additions & 8 deletions tensorflow_asr/models/layers/feature_extraction.py
@@ -312,11 +312,11 @@ def compute_mask(self, inputs, mask=None):
padded_nframes = self.get_nframes(tf.shape(signals, tf.int32)[1])
return tf.sequence_mask(nframes, maxlen=padded_nframes, dtype=tf.bool), None

def compute_output_shape(self, input_shape):
signal_shape, signal_length_shape = input_shape
B, nsamples = signal_shape
if nsamples is None:
output_shape = [B, None, self.num_feature_bins, 1]
else:
output_shape = [B, self.get_nframes(nsamples + self.padding), self.num_feature_bins, 1]
return tf.TensorShape(output_shape), tf.TensorShape(signal_length_shape)
# def compute_output_shape(self, input_shape):
# signal_shape, signal_length_shape = input_shape
# B, nsamples = signal_shape
# if nsamples is None:
# output_shape = [B, None, self.num_feature_bins, 1]
# else:
# output_shape = [B, self.get_nframes(nsamples + self.padding), self.num_feature_bins, 1]
# return tf.TensorShape(output_shape), tf.TensorShape(signal_length_shape)
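The same pattern repeats through the layer files below: hand-written compute_output_shape methods are commented out and left to Keras 3, which derives output shapes by tracing the layer's call on symbolic tensors. A trivial sketch of that inference (the shapes shown are illustrative):

import keras

inputs = keras.Input(shape=(None, 80))      # (batch, time, features)
outputs = keras.layers.Dense(144)(inputs)   # no compute_output_shape override needed
model = keras.Model(inputs, outputs)
model.summary()                             # output shape (None, None, 144) is inferred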
8 changes: 4 additions & 4 deletions tensorflow_asr/models/layers/memory.py
@@ -79,8 +79,8 @@ def call(self, inputs, memories=None, training=False):
new_memory._keras_mask = new_memory_mask # pylint: disable=protected-access
return new_inputs, new_memory

def compute_output_shape(self, input_shape):
return input_shape[0], self.memory_length, self.dmodel
# def compute_output_shape(self, input_shape):
# return input_shape[0], self.memory_length, self.dmodel

def compute_output_spec(self, *args, **kwargs):
return super().compute_output_spec(*args, **kwargs)
# def compute_output_spec(self, *args, **kwargs):
# return super().compute_output_spec(*args, **kwargs)
45 changes: 37 additions & 8 deletions tensorflow_asr/models/layers/multihead_attention.py
@@ -244,6 +244,8 @@ def call(
training=None,
use_causal_mask=False,
initial_state=None,
return_states=False,
**kwargs,
):
query, key, value = inputs

@@ -269,14 +271,20 @@ def call(
# `value` = [B, S, N, H]
value = self._value_dense(value)

query, key, value, states = self._with_memory(query, key, value, initial_state, training)
if return_states:
query, key, value, states = self._with_memory(query, key, value, initial_state, training)

attention_output, attention_scores = self._compute_attention(query, key, value, attention_mask, training)
attention_output = self._output_dense(attention_output)

if return_attention_scores:
return attention_output, states, attention_scores
return attention_output, states
if return_states:
return attention_output, states, attention_scores
return attention_output, attention_scores

if return_states:
return attention_output, states
return (attention_output,)

def compute_output_shape(self, input_shape):
query_shape, key_shape, value_shape, *_ = input_shape
@@ -294,11 +302,22 @@ def compute_output_spec(
training=None,
use_causal_mask=False,
initial_state=None,
return_states=False,
):
query, value, key, *_ = inputs
output_spec, *attention_score_spec = super().compute_output_spec(
query, value, key, query_mask, value_mask, key_mask, attention_mask, return_attention_scores, training, use_causal_mask
)
if not return_states:
return [output_spec] + attention_score_spec
if self._memory_length is None:
return [output_spec, None] + attention_score_spec
states_shape = (query.shape[0], self._memory_length, query.shape[-1])
states_spec = {
"key": keras.KerasTensor(states_shape, dtype=self.compute_dtype),
"value": keras.KerasTensor(states_shape, dtype=self.compute_dtype),
}
return [output_spec, states_spec] + attention_score_spec


@keras.utils.register_keras_serializable(package=__name__)
@@ -348,7 +367,7 @@ def __init__(
self._causal = causal

def build(self, input_shape):
*rest_input_shape, relpe_shape, _, _ = input_shape
*rest_input_shape, relpe_shape = input_shape
relpe_rank = len(relpe_shape)
einsum_equation, bias_axes, output_rank = mha_module._build_proj_equation(relpe_rank - 1, bound_dims=1, output_dims=2)
self._relpe_dense = keras.layers.EinsumDense(
@@ -423,6 +442,8 @@ def _compute_attention(
def call(
self,
inputs,
content_attention_bias=None,
positional_attention_bias=None,
query_mask=None,
value_mask=None,
key_mask=None,
@@ -432,8 +453,10 @@ def call(
training=None,
use_causal_mask=False,
initial_state=None,
return_states=False,
**kwargs,
):
query, key, value, relpe, content_attention_bias, positional_attention_bias = inputs
query, key, value, relpe = inputs

if use_auto_mask:
attention_mask = self._compute_attention_mask(
@@ -460,7 +483,8 @@ def call(
# `position` = [B, R, N, H]
position = self._relpe_dense(relpe)

query, key, value, states = self._with_memory(query, key, value, initial_state, training)
if return_states:
query, key, value, states = self._with_memory(query, key, value, initial_state, training)

attention_output, attention_scores = self._compute_attention(
query,
@@ -475,5 +499,10 @@ def call(
attention_output = self._output_dense(attention_output)

if return_attention_scores:
return attention_output, states, attention_scores
return attention_output, states
if return_states:
return attention_output, states, attention_scores
return attention_output, attention_scores

if return_states:
return attention_output, states
return (attention_output,)
20 changes: 10 additions & 10 deletions tensorflow_asr/models/layers/positional_encoding.py
@@ -83,9 +83,9 @@ def call(self, inputs, training=False):
outputs += pe
return outputs, pe

def compute_output_shape(self, input_shape):
output_shape, _ = input_shape
return output_shape, output_shape
# def compute_output_shape(self, input_shape):
# output_shape, _ = input_shape
# return output_shape, output_shape


@keras.utils.register_keras_serializable(package=__name__)
@@ -172,10 +172,10 @@ def call(self, inputs, training=False):
pe = self.do(pe, training=training)
return outputs, pe

def compute_output_shape(self, input_shape):
output_shape, _ = input_shape
B, T, V = output_shape
pT = 2 * T - 1 if T is not None else None
if self._memory_length > 0 and T is not None:
pT += self._memory_length
return output_shape, (B, pT, V)
# def compute_output_shape(self, input_shape):
# output_shape, _ = input_shape
# B, T, V = output_shape
# pT = 2 * T - 1 if T is not None else None
# if self._memory_length > 0 and T is not None:
# pT += self._memory_length
# return output_shape, (B, pT, V)
8 changes: 4 additions & 4 deletions tensorflow_asr/models/layers/residual.py
@@ -52,14 +52,14 @@ def build(self, input_shape):
)
else:
assert isinstance(self._factor, (int, float))
self._alpha = tf.convert_to_tensor(self._factor, dtype=self.compute_dtype)
self._alpha = self._factor
return super().build(input_shape)

def call(self, inputs):
x, residual_x = inputs
alpha = tf.cast(self._alpha, residual_x.dtype)
alpha = tf.cast(tf.convert_to_tensor(self._alpha, dtype=self.dtype), residual_x.dtype)
x = x + alpha * residual_x
return x

def compute_output_shape(self, input_shape):
return input_shape[0]
# def compute_output_shape(self, input_shape):
# return input_shape[0]
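The residual layer now keeps its scalar factor as a plain Python number at build time and converts it inside call, so the constant follows the layer's dtype and the incoming tensor's dtype under mixed precision. A self-contained sketch of that behaviour (assuming the TensorFlow backend; the class name is illustrative):

import keras
import tensorflow as tf

class ToyResidual(keras.layers.Layer):
    """x + alpha * residual_x with a scalar, non-trainable factor."""

    def __init__(self, factor=1.0, **kwargs):
        super().__init__(**kwargs)
        self._factor = factor  # stored as a plain number, not a tensor

    def call(self, inputs):
        x, residual_x = inputs
        # Convert at call time so the constant matches the residual branch's dtype
        # (which may be float16/bfloat16 under a mixed-precision policy).
        alpha = tf.cast(tf.convert_to_tensor(self._factor, dtype=self.dtype), residual_x.dtype)
        return x + alpha * residual_x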
54 changes: 27 additions & 27 deletions tensorflow_asr/models/layers/subsampling.py
@@ -45,11 +45,11 @@ def compute_mask(self, inputs, mask=None):
mask = tf.sequence_mask(outputs_length, maxlen=maxlen, dtype=tf.bool)
return mask, None

def compute_output_shape(self, input_shape):
output_shape, output_length_shape = input_shape
reduced_time = math_util.legacy_get_reduced_length(output_shape[1], self.time_reduction_factor)
output_shape = output_shape[:1] + (reduced_time,) + output_shape[2:]
return output_shape, output_length_shape
# def compute_output_shape(self, input_shape):
# output_shape, output_length_shape = input_shape
# reduced_time = math_util.legacy_get_reduced_length(output_shape[1], self.time_reduction_factor)
# output_shape = output_shape[:1] + (reduced_time,) + output_shape[2:]
# return output_shape, output_length_shape


@keras.utils.register_keras_serializable(package=__name__)
@@ -141,16 +141,16 @@ def compute_mask(self, inputs, mask=None):
mask = tf.sequence_mask(outputs_length, maxlen=maxlen, dtype=tf.bool)
return mask, None

def compute_output_shape(self, input_shape):
output_shape, output_length_shape = input_shape
outputs_shape = self.conv1.compute_output_shape(output_shape)
outputs_shape = self.conv2.compute_output_shape(outputs_shape)
outputs_shape = self.maxpool1.compute_output_shape(outputs_shape)
outputs_shape = self.conv3.compute_output_shape(outputs_shape)
outputs_shape = self.conv4.compute_output_shape(outputs_shape)
outputs_shape = self.maxpool2.compute_output_shape(outputs_shape)
outputs_shape = outputs_shape[:2] + (outputs_shape[2] * outputs_shape[3],)
return outputs_shape, output_length_shape
# def compute_output_shape(self, input_shape):
# output_shape, output_length_shape = input_shape
# outputs_shape = self.conv1.compute_output_shape(output_shape)
# outputs_shape = self.conv2.compute_output_shape(outputs_shape)
# outputs_shape = self.maxpool1.compute_output_shape(outputs_shape)
# outputs_shape = self.conv3.compute_output_shape(outputs_shape)
# outputs_shape = self.conv4.compute_output_shape(outputs_shape)
# outputs_shape = self.maxpool2.compute_output_shape(outputs_shape)
# outputs_shape = outputs_shape[:2] + (outputs_shape[2] * outputs_shape[3],)
# return outputs_shape, output_length_shape


@keras.utils.register_keras_serializable(package=__name__)
@@ -235,12 +235,12 @@ def compute_mask(self, inputs, mask=None):
mask = tf.sequence_mask(outputs_length, maxlen=maxlen, dtype=tf.bool)
return mask, None

def compute_output_shape(self, input_shape):
output_shape, output_length_shape = input_shape
for block in self.convs:
output_shape = block.layers[0].compute_output_shape(output_shape)
output_shape = output_shape[:2] + (output_shape[2] * output_shape[3],)
return output_shape, output_length_shape
# def compute_output_shape(self, input_shape):
# output_shape, output_length_shape = input_shape
# for block in self.convs:
# output_shape = block.layers[0].compute_output_shape(output_shape)
# output_shape = output_shape[:2] + (output_shape[2] * output_shape[3],)
# return output_shape, output_length_shape


@keras.utils.register_keras_serializable(package=__name__)
@@ -325,9 +325,9 @@ def compute_mask(self, inputs, mask=None):
mask = tf.sequence_mask(outputs_length, maxlen=maxlen, dtype=tf.bool)
return mask, None

def compute_output_shape(self, input_shape):
output_shape, output_length_shape = input_shape
output_shape = output_shape[:2] + (output_shape[2] * output_shape[3],)
for block in self.convs:
output_shape = block.layers[0].compute_output_shape(output_shape)
return output_shape, output_length_shape
# def compute_output_shape(self, input_shape):
# output_shape, output_length_shape = input_shape
# output_shape = output_shape[:2] + (output_shape[2] * output_shape[3],)
# for block in self.convs:
# output_shape = block.layers[0].compute_output_shape(output_shape)
# return output_shape, output_length_shape
40 changes: 20 additions & 20 deletions tensorflow_asr/models/transducer/base_transducer.py
@@ -152,16 +152,16 @@ def call_next(self, inputs, previous_decoder_states):
def compute_mask(self, inputs, mask=None):
return self.label_encoder.compute_mask(inputs, mask=mask)

def compute_output_shape(self, input_shape):
output_shape, output_length_shape = input_shape
output_shape, output_length_shape = self.label_encoder.compute_output_shape((output_shape, output_length_shape))
for i, rnn in enumerate(self.rnns):
output_shape = (
self.projections[i].compute_output_shape(output_shape)
if self.projections[i] is not None
else rnn.compute_output_shape(output_shape)[0]
)
return tuple(output_shape), tuple(output_length_shape)
# def compute_output_shape(self, input_shape):
# output_shape, output_length_shape = input_shape
# output_shape, output_length_shape = self.label_encoder.compute_output_shape((output_shape, output_length_shape))
# for i, rnn in enumerate(self.rnns):
# output_shape = (
# self.projections[i].compute_output_shape(output_shape)
# if self.projections[i] is not None
# else rnn.compute_output_shape(output_shape)[0]
# )
# return tuple(output_shape), tuple(output_length_shape)


@keras.utils.register_keras_serializable(package=__name__)
@@ -197,9 +197,9 @@ def call(self, inputs):
outputs = tf.multiply(enc_out, pred_out) # broadcast operator
return outputs # [B, T, U, V]

def compute_output_shape(self, input_shape):
enc_shape, pred_shape = input_shape
return enc_shape[0], enc_shape[1], pred_shape[1], enc_shape[-1]
# def compute_output_shape(self, input_shape):
# enc_shape, pred_shape = input_shape
# return enc_shape[0], enc_shape[1], pred_shape[1], enc_shape[-1]


@keras.utils.register_keras_serializable(package=__name__)
@@ -281,11 +281,11 @@ def call(self, inputs, training=False):
def compute_mask(self, inputs, mask=None):
return self.joint.compute_mask(inputs, mask=mask)

def compute_output_shape(self, input_shape):
encoder_shape, prediction_shape = input_shape
batch_shape = encoder_shape[0]
encoder_time_shape, prediction_time_shape = encoder_shape[1], prediction_shape[1]
return batch_shape, encoder_time_shape, prediction_time_shape, self.ffn_out.units
# def compute_output_shape(self, input_shape):
# encoder_shape, prediction_shape = input_shape
# batch_shape = encoder_shape[0]
# encoder_time_shape, prediction_time_shape = encoder_shape[1], prediction_shape[1]
# return batch_shape, encoder_time_shape, prediction_time_shape, self.ffn_out.units


class Transducer(BaseModel):
@@ -407,8 +407,8 @@ def remove_gwn(self, original_weights):

def call(self, inputs: schemas.TrainInput, training=False):
features, features_length = self.feature_extraction((inputs.inputs, inputs.inputs_length), training=training)
enc, logits_length, _ = self.encoder((features, features_length), training=training)
pred, _ = self.predict_net((inputs.predictions, inputs.predictions_length), training=training)
enc, logits_length, *_ = self.encoder((features, features_length), training=training)
pred, *_ = self.predict_net((inputs.predictions, inputs.predictions_length), training=training)
logits = self.joint_net((enc, pred), training=training)
return schemas.TrainOutput(
logits=logits,
6 changes: 4 additions & 2 deletions tensorflow_asr/utils/layer_util.py
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

from tensorflow_asr import keras, tf
from tensorflow_asr.models.layers.convolution import Conv2D

@@ -37,13 +39,13 @@ def get_conv(


def add_gwn(
trainable_weights: list,
trainable_weights: List[tf.Variable],
stddev: float = 1.0,
):
original_weights = []
for weight in trainable_weights:
noise = tf.stop_gradient(tf.random.normal(mean=0.0, stddev=stddev, shape=weight.shape, dtype=weight.dtype))
original_weights.append(weight.value())
original_weights.append(weight)
weight.assign_add(noise)
return original_weights
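For context, add_gwn implements the Gaussian weight noise trick used with the transducer's remove_gwn counterpart: snapshot the clean weights, add zero-mean noise for the forward/backward pass, then restore before the optimizer applies its update. A generic illustration of that cycle on a single variable (not the helper's exact code, which operates on a list of trainable weights; the stddev is arbitrary here):

import tensorflow as tf

v = tf.Variable([1.0, 2.0])
snapshot = tf.identity(v)  # copy of the clean weights
noise = tf.stop_gradient(tf.random.normal(shape=v.shape, stddev=0.075, dtype=v.dtype))
v.assign_add(noise)        # noisy weights used for this step's forward/backward pass
# ... compute gradients with the noisy weights ...
v.assign(snapshot)         # restore the clean weights before the update is applied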

4 changes: 2 additions & 2 deletions tensorflow_asr/utils/shape_util.py
@@ -23,8 +23,8 @@ def shape_list(x, out_type=tf.int32):


def shape_list_per_replica(x, per_replica_batch_size):
shapes = x.shape.as_list()
shapes[0] = int(per_replica_batch_size)
_, *rest_shape = x.shape
shapes = (int(per_replica_batch_size),) + tuple(rest_shape)
return shapes
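shape_list_per_replica swaps the leading (global) batch dimension for the per-replica batch size while keeping the remaining static dimensions; the rewrite simply builds the result as a tuple instead of mutating a list. A small usage sketch (the tensor shape below is illustrative):

import tensorflow as tf

def shape_list_per_replica(x, per_replica_batch_size):
    # Replace the global batch dim with the per-replica batch size; keep the rest of the static shape.
    _, *rest_shape = x.shape
    return (int(per_replica_batch_size),) + tuple(rest_shape)

features = tf.zeros([8, 100, 80])                            # global batch of 8
shape_list_per_replica(features, per_replica_batch_size=2)   # -> (2, 100, 80)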