@@ -100,6 +100,10 @@ class TasksManager:
100
100
"masked-im" : "AutoModelForMaskedImageModeling" ,
101
101
"semantic-segmentation" : "AutoModelForSemanticSegmentation" ,
102
102
"speech2seq-lm" : "AutoModelForSpeechSeq2Seq" ,
103
+ "audio-classification" : "AutoModelForAudioClassification" ,
104
+ "audio-frame-classification" : "AutoModelForAudioFrameClassification" ,
105
+ "audio-ctc" : "AutoModelForCTC" ,
106
+ "audio-xvector" : "AutoModelForAudioXVector" ,
103
107
"stable-diffusion" : "StableDiffusionPipeline" ,
104
108
}
105
109
if is_tf_available ():
@@ -130,11 +134,20 @@ class TasksManager:
130
134
"masked-im" : "transformers" ,
131
135
"semantic-segmentation" : "transformers" ,
132
136
"speech2seq-lm" : "transformers" ,
137
+ "audio-ctc" : "transformers" ,
138
+ "audio-classification" : "transformers" ,
139
+ "audio-frame-classification" : "transformers" ,
140
+ "audio-xvector" : "transformers" ,
133
141
"stable-diffusion" : "diffusers" ,
134
142
}
135
143
136
144
# Mapping from each supported model topology to the tasks it supports and the associated factory
137
145
_SUPPORTED_MODEL_TYPE = {
146
+ "audio-spectrogram-transformer" : supported_tasks_mapping (
147
+ "default" ,
148
+ "audio-classification" ,
149
+ onnx = "ASTOnnxConfig" ,
150
+ ),
138
151
"albert" : supported_tasks_mapping (
139
152
"default" ,
140
153
"masked-lm" ,
@@ -273,6 +286,14 @@ class TasksManager:
273
286
# "semantic-segmentation",
274
287
onnx = "Data2VecVisionOnnxConfig" ,
275
288
),
289
+ "data2vec-audio" : supported_tasks_mapping (
290
+ "default" ,
291
+ "audio-ctc" ,
292
+ "audio-classification" ,
293
+ "audio-frame-classification" ,
294
+ "audio-xvector" ,
295
+ onnx = "Data2VecAudioOnnxConfig" ,
296
+ ),
276
297
"deberta" : supported_tasks_mapping (
277
298
"default" ,
278
299
"masked-lm" ,
@@ -356,6 +377,12 @@ class TasksManager:
356
377
"default" ,
357
378
onnx = "GroupViTOnnxConfig" ,
358
379
),
380
+ "hubert" : supported_tasks_mapping (
381
+ "default" ,
382
+ "audio-ctc" ,
383
+ "audio-classification" ,
384
+ onnx = "HubertOnnxConfig" ,
385
+ ),
359
386
"ibert" : supported_tasks_mapping (
360
387
"default" ,
361
388
"masked-lm" ,
@@ -423,6 +450,12 @@ class TasksManager:
423
450
"question-answering" ,
424
451
onnx = "MBartOnnxConfig" ,
425
452
),
453
+ # TODO: enable once the missing operator is supported.
454
+ # "mctct": supported_tasks_mapping(
455
+ # "default",
456
+ # "audio-ctc",
457
+ # onnx="MCTCTOnnxConfig",
458
+ # ),
426
459
"mobilebert" : supported_tasks_mapping (
427
460
"default" ,
428
461
"masked-lm" ,
@@ -521,6 +554,25 @@ class TasksManager:
521
554
"semantic-segmentation" ,
522
555
onnx = "SegformerOnnxConfig" ,
523
556
),
557
+ "sew" : supported_tasks_mapping (
558
+ "default" ,
559
+ "audio-ctc" ,
560
+ "audio-classification" ,
561
+ onnx = "SEWOnnxConfig" ,
562
+ ),
563
+ "sew-d" : supported_tasks_mapping (
564
+ "default" ,
565
+ "audio-ctc" ,
566
+ "audio-classification" ,
567
+ onnx = "SEWDOnnxConfig" ,
568
+ ),
569
+ "speech-to-text" : supported_tasks_mapping (
570
+ "default" ,
571
+ "default-with-past" ,
572
+ "speech2seq-lm" ,
573
+ "speech2seq-lm-with-past" ,
574
+ onnx = "Speech2TextOnnxConfig" ,
575
+ ),
524
576
"squeezebert" : supported_tasks_mapping (
525
577
"default" ,
526
578
"masked-lm" ,
@@ -530,6 +582,12 @@ class TasksManager:
530
582
"question-answering" ,
531
583
onnx = "SqueezeBertOnnxConfig" ,
532
584
),
585
+ "swin" : supported_tasks_mapping (
586
+ "default" ,
587
+ "image-classification" ,
588
+ "masked-im" ,
589
+ onnx = "SwinOnnxConfig" ,
590
+ ),
533
591
"t5" : supported_tasks_mapping (
534
592
"default" ,
535
593
"default-with-past" ,
@@ -541,11 +599,49 @@ class TasksManager:
541
599
"semantic-segmentation" ,
542
600
onnx = "UNetOnnxConfig" ,
543
601
),
602
+ "unispeech" : supported_tasks_mapping (
603
+ "default" ,
604
+ "audio-ctc" ,
605
+ "audio-classification" ,
606
+ onnx = "UniSpeechOnnxConfig" ,
607
+ ),
608
+ "unispeech-sat" : supported_tasks_mapping (
609
+ "default" ,
610
+ "audio-ctc" ,
611
+ "audio-classification" ,
612
+ "audio-frame-classification" ,
613
+ "audio-xvector" ,
614
+ onnx = "UniSpeechSATOnnxConfig" ,
615
+ ),
544
616
"vae" : supported_tasks_mapping (
545
617
"semantic-segmentation" ,
546
618
onnx = "VaeOnnxConfig" ,
547
619
),
548
620
"vit" : supported_tasks_mapping ("default" , "image-classification" , "masked-im" , onnx = "ViTOnnxConfig" ),
621
+ "wavlm" : supported_tasks_mapping (
622
+ "default" ,
623
+ "audio-ctc" ,
624
+ "audio-classification" ,
625
+ "audio-frame-classification" ,
626
+ "audio-xvector" ,
627
+ onnx = "WavLMOnnxConfig" ,
628
+ ),
629
+ "wav2vec2" : supported_tasks_mapping (
630
+ "default" ,
631
+ "audio-ctc" ,
632
+ "audio-classification" ,
633
+ "audio-frame-classification" ,
634
+ "audio-xvector" ,
635
+ onnx = "Wav2Vec2OnnxConfig" ,
636
+ ),
637
+ "wav2vec2-conformer" : supported_tasks_mapping (
638
+ "default" ,
639
+ "audio-ctc" ,
640
+ "audio-classification" ,
641
+ "audio-frame-classification" ,
642
+ "audio-xvector" ,
643
+ onnx = "Wav2Vec2ConformerOnnxConfig" ,
644
+ ),
549
645
"whisper" : supported_tasks_mapping (
550
646
"default" ,
551
647
"default-with-past" ,
@@ -580,12 +676,6 @@ class TasksManager:
580
676
"object-detection" ,
581
677
onnx = "YolosOnnxConfig" ,
582
678
),
583
- "swin" : supported_tasks_mapping (
584
- "default" ,
585
- "image-classification" ,
586
- "masked-im" ,
587
- onnx = "SwinOnnxConfig" ,
588
- ),
589
679
}
590
680
_UNSUPPORTED_CLI_MODEL_TYPE = {"unet" , "vae" , "clip-text-model" }
591
681
_SUPPORTED_CLI_MODEL_TYPE = set (_SUPPORTED_MODEL_TYPE .keys ()) - _UNSUPPORTED_CLI_MODEL_TYPE
0 commit comments