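fix: configs (remove feature augmentation; set streaming history/chunk to 64/16 frames; use sentencepiece sp.256 for the kaggle CTC config)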

TensorSpeech · nglehuy · Jun 15, 2024 · Jun 18, 2024 · Jun 22, 2024 · Jun 24, 2024
commit eebc36165ffe5f39c4b7bea6b267c7dee5ea6a17
diff --git a/examples/models/ctc/conformer/small-streaming-kaggle.yml.j2 b/examples/models/ctc/conformer/small-streaming-kaggle.yml.j2
@@ -1,4 +1,4 @@
-{% import "examples/configs/librispeech/characters/char.yml.j2" as decoder_config with context %}
+{% import "examples/configs/librispeech/sentencepiece/sp.256.yml.j2" as decoder_config with context %}
 {{decoder_config}}
 
 model_config:
@@ -12,19 +12,6 @@ model_config:
       nfft: 512
       num_feature_bins: 80
       feature_type: log_mel_spectrogram
-      augmentation_config:
-        feature_augment:
-          time_masking:
-            prob: 0.5
-            num_masks: 5
-            mask_factor: -1 # whole utterance
-            p_upperbound: 0.05
-            mask_value: 0
-          freq_masking:
-            prob: 0.5
-            num_masks: 2
-            mask_factor: 27
-            mask_value: 0
     encoder_subsampling:
       class_name: tensorflow_asr.models.layers.subsampling>Conv2dSubsampling
       config:
@@ -50,8 +37,8 @@ model_config:
     encoder_dropout: 0.1
     encoder_padding: causal
     encoder_memory_length: null
-    encoder_history_size: 60 # frames
-    encoder_chunk_size: 24 # frames
+    encoder_history_size: 64 # frames (4 * encoder_chunk_size)
+    encoder_chunk_size: 16 # frames
     blank: 0
     vocab_size: {{decoder_config.vocabsize}}
     kernel_regularizer:

diff --git a/examples/models/ctc/conformer/small-streaming.yml.j2 b/examples/models/ctc/conformer/small-streaming.yml.j2
@@ -12,19 +12,6 @@ model_config:
       nfft: 512
       num_feature_bins: 80
       feature_type: log_mel_spectrogram
-      augmentation_config:
-        feature_augment:
-          time_masking:
-            prob: 0.5
-            num_masks: 5
-            mask_factor: -1 # whole utterance
-            p_upperbound: 0.05
-            mask_value: 0
-          freq_masking:
-            prob: 0.5
-            num_masks: 2
-            mask_factor: 27
-            mask_value: 0
     encoder_subsampling:
       class_name: tensorflow_asr.models.layers.subsampling>Conv2dSubsampling
       config:
@@ -50,8 +37,8 @@ model_config:
     encoder_dropout: 0.1
     encoder_padding: causal
     encoder_memory_length: null
-    encoder_history_size: 60 # frames
-    encoder_chunk_size: 24 # frames
+    encoder_history_size: 64 # frames (4 * encoder_chunk_size)
+    encoder_chunk_size: 16 # frames
     blank: 0
     vocab_size: {{decoder_config.vocabsize}}
     kernel_regularizer:

diff --git a/examples/models/transducer/conformer/small-streaming-kaggle.yml.j2 b/examples/models/transducer/conformer/small-streaming-kaggle.yml.j2
@@ -12,19 +12,6 @@ model_config:
       nfft: 512
       num_feature_bins: 80
       feature_type: log_mel_spectrogram
-      augmentation_config:
-        feature_augment:
-          time_masking:
-            prob: 1.0
-            num_masks: 10
-            mask_factor: -1
-            p_upperbound: 0.05
-            mask_value: 0
-          freq_masking:
-            prob: 1.0
-            num_masks: 1
-            mask_factor: 27
-            mask_value: 0
     encoder_subsampling:
       class_name: tensorflow_asr.models.layers.subsampling>Conv2dSubsampling
       config:
@@ -50,8 +37,8 @@ model_config:
     encoder_dropout: 0.1
     encoder_padding: causal
     encoder_memory_length: null
-    encoder_history_size: 60 # frames
-    encoder_chunk_size: 24 # frames lookahead
+    encoder_history_size: 64 # frames (4 * encoder_chunk_size)
+    encoder_chunk_size: 16 # frames
     prediction_label_encode_mode: embedding
     prediction_embed_dim: 320
     prediction_num_rnns: 1

diff --git a/examples/models/transducer/conformer/small-streaming.yml.j2 b/examples/models/transducer/conformer/small-streaming.yml.j2
@@ -12,19 +12,6 @@ model_config:
       nfft: 512
       num_feature_bins: 80
       feature_type: log_mel_spectrogram
-      augmentation_config:
-        feature_augment:
-          time_masking:
-            prob: 1.0
-            num_masks: 10
-            mask_factor: -1
-            p_upperbound: 0.05
-            mask_value: 0
-          freq_masking:
-            prob: 1.0
-            num_masks: 1
-            mask_factor: 27
-            mask_value: 0
     encoder_subsampling:
       class_name: tensorflow_asr.models.layers.subsampling>Conv2dSubsampling
       config:
@@ -50,8 +37,8 @@ model_config:
     encoder_dropout: 0.1
     encoder_padding: causal
     encoder_memory_length: null
-    encoder_history_size: 60 # frames
-    encoder_chunk_size: 15 # frames
+    encoder_history_size: 64 # frames (4 * encoder_chunk_size)
+    encoder_chunk_size: 16 # frames
     prediction_label_encode_mode: embedding
     prediction_embed_dim: 320
     prediction_num_rnns: 1