diff --git a/docs/source/en/package_reference/inference_types.md b/docs/source/en/package_reference/inference_types.md
index 925cc0c375..ca2f039ffe 100644
--- a/docs/source/en/package_reference/inference_types.md
+++ b/docs/source/en/package_reference/inference_types.md
@@ -325,6 +325,16 @@ This part of the lib is still under development and will be improved in future r
 
 
 
+## text_to_video
+
+[[autodoc]] huggingface_hub.TextToVideoInput
+
+[[autodoc]] huggingface_hub.TextToVideoOutput
+
+[[autodoc]] huggingface_hub.TextToVideoParameters
+
+
+
 ## token_classification
 
 [[autodoc]] huggingface_hub.TokenClassificationInput
diff --git a/docs/source/ko/package_reference/inference_types.md b/docs/source/ko/package_reference/inference_types.md
index 6ad9033b15..08063558a7 100644
--- a/docs/source/ko/package_reference/inference_types.md
+++ b/docs/source/ko/package_reference/inference_types.md
@@ -324,6 +324,16 @@ rendered properly in your Markdown viewer.
 
 
 
+## text_to_video[[huggingface_hub.TextToVideoInput]]
+
+[[autodoc]] huggingface_hub.TextToVideoInput
+
+[[autodoc]] huggingface_hub.TextToVideoOutput
+
+[[autodoc]] huggingface_hub.TextToVideoParameters
+
+
+
 ## token_classification[[huggingface_hub.TokenClassificationInput]]
 
 [[autodoc]] huggingface_hub.TokenClassificationInput
diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py
index 19a86a26e3..d1b46db2a8 100644
--- a/src/huggingface_hub/__init__.py
+++ b/src/huggingface_hub/__init__.py
@@ -398,6 +398,9 @@
         "TextToSpeechInput",
         "TextToSpeechOutput",
         "TextToSpeechParameters",
+        "TextToVideoInput",
+        "TextToVideoOutput",
+        "TextToVideoParameters",
         "TokenClassificationAggregationStrategy",
         "TokenClassificationInput",
         "TokenClassificationOutputElement",
@@ -705,6 +708,9 @@
     "TextToSpeechInput",
     "TextToSpeechOutput",
     "TextToSpeechParameters",
+    "TextToVideoInput",
+    "TextToVideoOutput",
+    "TextToVideoParameters",
     "TokenClassificationAggregationStrategy",
     "TokenClassificationInput",
     "TokenClassificationOutputElement",
@@ -1334,6 +1340,9 @@ def __dir__():
         TextToSpeechInput,  # noqa: F401
         TextToSpeechOutput,  # noqa: F401
         TextToSpeechParameters,  # noqa: F401
+        TextToVideoInput,  # noqa: F401
+        TextToVideoOutput,  # noqa: F401
+        TextToVideoParameters,  # noqa: F401
         TokenClassificationAggregationStrategy,  # noqa: F401
         TokenClassificationInput,  # noqa: F401
         TokenClassificationOutputElement,  # noqa: F401
diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py
index 4d3c902c26..2f7af43d18 100644
--- a/src/huggingface_hub/inference/_client.py
+++ b/src/huggingface_hub/inference/_client.py
@@ -1280,7 +1280,7 @@ def image_to_image(
         self,
         image: ContentT,
         prompt: Optional[str] = None,
         *,
-        negative_prompt: Optional[List[str]] = None,
+        negative_prompt: Optional[str] = None,
         num_inference_steps: Optional[int] = None,
         guidance_scale: Optional[float] = None,
         model: Optional[str] = None,
@@ -1301,8 +1301,8 @@ def image_to_image(
                 The input image for translation. It can be raw bytes, an image file, or a URL to an online image.
             prompt (`str`, *optional*):
                 The text prompt to guide the image generation.
-            negative_prompt (`List[str]`, *optional*):
-                One or several prompt to guide what NOT to include in image generation.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in image generation.
             num_inference_steps (`int`, *optional*):
                 For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
                 quality image at the expense of slower inference.
@@ -2377,7 +2377,7 @@ def text_to_image(
         self,
         prompt: str,
         *,
-        negative_prompt: Optional[List[str]] = None,
+        negative_prompt: Optional[str] = None,
         height: Optional[float] = None,
         width: Optional[float] = None,
         num_inference_steps: Optional[int] = None,
@@ -2400,8 +2400,8 @@ def text_to_image(
         Args:
             prompt (`str`):
                 The prompt to generate an image from.
-            negative_prompt (`List[str`, *optional*):
-                One or several prompt to guide what NOT to include in image generation.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in image generation.
             height (`float`, *optional*):
                 The height in pixels of the image to generate.
             width (`float`, *optional*):
diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py
index 7d0721f43b..6796c96305 100644
--- a/src/huggingface_hub/inference/_generated/_async_client.py
+++ b/src/huggingface_hub/inference/_generated/_async_client.py
@@ -1328,7 +1328,7 @@ async def image_to_image(
         self,
         image: ContentT,
         prompt: Optional[str] = None,
         *,
-        negative_prompt: Optional[List[str]] = None,
+        negative_prompt: Optional[str] = None,
         num_inference_steps: Optional[int] = None,
         guidance_scale: Optional[float] = None,
         model: Optional[str] = None,
@@ -1349,8 +1349,8 @@ async def image_to_image(
                 The input image for translation. It can be raw bytes, an image file, or a URL to an online image.
             prompt (`str`, *optional*):
                 The text prompt to guide the image generation.
-            negative_prompt (`List[str]`, *optional*):
-                One or several prompt to guide what NOT to include in image generation.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in image generation.
             num_inference_steps (`int`, *optional*):
                 For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
                 quality image at the expense of slower inference.
@@ -2436,7 +2436,7 @@ async def text_to_image(
         self,
         prompt: str,
         *,
-        negative_prompt: Optional[List[str]] = None,
+        negative_prompt: Optional[str] = None,
         height: Optional[float] = None,
         width: Optional[float] = None,
         num_inference_steps: Optional[int] = None,
@@ -2459,8 +2459,8 @@ async def text_to_image(
         Args:
             prompt (`str`):
                 The prompt to generate an image from.
-            negative_prompt (`List[str`, *optional*):
-                One or several prompt to guide what NOT to include in image generation.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in image generation.
             height (`float`, *optional*):
                 The height in pixels of the image to generate.
             width (`float`, *optional*):
diff --git a/src/huggingface_hub/inference/_generated/types/__init__.py b/src/huggingface_hub/inference/_generated/types/__init__.py
index 3de22e6023..137c3c3e23 100644
--- a/src/huggingface_hub/inference/_generated/types/__init__.py
+++ b/src/huggingface_hub/inference/_generated/types/__init__.py
@@ -149,6 +149,7 @@
     TextToSpeechOutput,
    TextToSpeechParameters,
 )
+from .text_to_video import TextToVideoInput, TextToVideoOutput, TextToVideoParameters
 from .token_classification import (
     TokenClassificationAggregationStrategy,
     TokenClassificationInput,
diff --git a/src/huggingface_hub/inference/_generated/types/image_to_image.py b/src/huggingface_hub/inference/_generated/types/image_to_image.py
index 8a37d9b856..d4b6c7ca25 100644
--- a/src/huggingface_hub/inference/_generated/types/image_to_image.py
+++ b/src/huggingface_hub/inference/_generated/types/image_to_image.py
@@ -4,7 +4,7 @@
 # - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 # - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
 from dataclasses import dataclass
-from typing import Any, List, Optional
+from typing import Any, Optional
 
 from .base import BaseInferenceType
 
@@ -25,8 +25,8 @@ class ImageToImageParameters(BaseInferenceType):
     """For diffusion models. A higher guidance scale value encourages the model to generate
     images closely linked to the text prompt at the expense of lower image quality.
     """
-    negative_prompt: Optional[List[str]] = None
-    """One or several prompt to guide what NOT to include in image generation."""
+    negative_prompt: Optional[str] = None
+    """One prompt to guide what NOT to include in image generation."""
     num_inference_steps: Optional[int] = None
     """For diffusion models. The number of denoising steps. More denoising steps usually lead
     to a higher quality image at the expense of slower inference.
diff --git a/src/huggingface_hub/inference/_generated/types/text_to_audio.py b/src/huggingface_hub/inference/_generated/types/text_to_audio.py
index 0a6d64e4d9..b57fadb86f 100644
--- a/src/huggingface_hub/inference/_generated/types/text_to_audio.py
+++ b/src/huggingface_hub/inference/_generated/types/text_to_audio.py
@@ -97,6 +97,5 @@ class TextToAudioOutput(BaseInferenceType):
 
     audio: Any
     """The generated audio waveform."""
-    sampling_rate: Any
-    text_to_audio_output_sampling_rate: Optional[float] = None
+    sampling_rate: float
     """The sampling rate of the generated audio waveform."""
diff --git a/src/huggingface_hub/inference/_generated/types/text_to_image.py b/src/huggingface_hub/inference/_generated/types/text_to_image.py
index 7e5a0de157..8d2ff187a3 100644
--- a/src/huggingface_hub/inference/_generated/types/text_to_image.py
+++ b/src/huggingface_hub/inference/_generated/types/text_to_image.py
@@ -4,7 +4,7 @@
 # - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 # - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
 from dataclasses import dataclass
-from typing import Any, List, Optional
+from typing import Any, Optional
 
 from .base import BaseInferenceType
 
@@ -25,8 +25,8 @@ class TextToImageParameters(BaseInferenceType):
     """A higher guidance scale value encourages the model to generate images closely linked to
     the text prompt, but values too high may cause saturation and other artifacts.
     """
-    negative_prompt: Optional[List[str]] = None
-    """One or several prompt to guide what NOT to include in image generation."""
+    negative_prompt: Optional[str] = None
+    """One prompt to guide what NOT to include in image generation."""
     num_inference_steps: Optional[int] = None
     """The number of denoising steps. More denoising steps usually lead to a higher quality
     image at the expense of slower inference.
diff --git a/src/huggingface_hub/inference/_generated/types/text_to_speech.py b/src/huggingface_hub/inference/_generated/types/text_to_speech.py
index 12f6b50e4d..20bcd27965 100644
--- a/src/huggingface_hub/inference/_generated/types/text_to_speech.py
+++ b/src/huggingface_hub/inference/_generated/types/text_to_speech.py
@@ -93,12 +93,9 @@ class TextToSpeechInput(BaseInferenceType):
 
 @dataclass
 class TextToSpeechOutput(BaseInferenceType):
-    """Outputs for Text to Speech inference
-    Outputs of inference for the Text To Audio task
-    """
+    """Outputs of inference for the Text To Speech task"""
 
     audio: Any
-    """The generated audio waveform."""
-    sampling_rate: Any
-    text_to_speech_output_sampling_rate: Optional[float] = None
+    """The generated audio"""
+    sampling_rate: Optional[float] = None
     """The sampling rate of the generated audio waveform."""
diff --git a/src/huggingface_hub/inference/_generated/types/text_to_video.py b/src/huggingface_hub/inference/_generated/types/text_to_video.py
new file mode 100644
index 0000000000..4adea263e3
--- /dev/null
+++ b/src/huggingface_hub/inference/_generated/types/text_to_video.py
@@ -0,0 +1,47 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+# - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+# - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from dataclasses import dataclass
+from typing import Any, List, Optional
+
+from .base import BaseInferenceType
+
+
+@dataclass
+class TextToVideoParameters(BaseInferenceType):
+    """Additional inference parameters for Text To Video"""
+
+    guidance_scale: Optional[float] = None
+    """A higher guidance scale value encourages the model to generate images closely linked to
+    the text prompt, but values too high may cause saturation and other artifacts.
+    """
+    negative_prompt: Optional[List[str]] = None
+    """One or several prompt to guide what NOT to include in image generation."""
+    num_frames: Optional[float] = None
+    """The num_frames parameter determines how many video frames are generated."""
+    num_inference_steps: Optional[int] = None
+    """The number of denoising steps. More denoising steps usually lead to a higher quality
+    image at the expense of slower inference.
+    """
+    seed: Optional[int] = None
+    """Seed for the random number generator."""
+
+
+@dataclass
+class TextToVideoInput(BaseInferenceType):
+    """Inputs for Text To Video inference"""
+
+    inputs: str
+    """The input text data (sometimes called "prompt")"""
+    parameters: Optional[TextToVideoParameters] = None
+    """Additional inference parameters for Text To Video"""
+
+
+@dataclass
+class TextToVideoOutput(BaseInferenceType):
+    """Outputs of inference for the Text To Video task"""
+
+    video: Any
+    """The generated video returned as raw bytes in the payload."""
diff --git a/src/huggingface_hub/inference/_generated/types/visual_question_answering.py b/src/huggingface_hub/inference/_generated/types/visual_question_answering.py
index eae04cb852..9001b3bd17 100644
--- a/src/huggingface_hub/inference/_generated/types/visual_question_answering.py
+++ b/src/huggingface_hub/inference/_generated/types/visual_question_answering.py
@@ -15,7 +15,7 @@ class VisualQuestionAnsweringInputData(BaseInferenceType):
 
     image: Any
     """The image."""
-    question: Any
+    question: str
     """The question to answer based on the image."""
 
 
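
Usage note (not part of the patch): a minimal sketch of how the newly generated text-to-video dataclasses could be filled in once this change ships. The prompt text and parameter values are made up for illustration; only the classes and fields added in text_to_video.py are assumed.

# Sketch: building a text-to-video request payload from the new generated types.
from dataclasses import asdict

from huggingface_hub import TextToVideoInput, TextToVideoParameters

payload = TextToVideoInput(
    inputs="A red panda riding a bicycle through a bamboo forest",
    parameters=TextToVideoParameters(
        num_frames=16,                              # how many video frames to generate
        num_inference_steps=25,                     # more steps: higher quality, slower
        guidance_scale=7.5,                         # stronger adherence to the prompt
        negative_prompt=["blurry", "low quality"],  # still List[str] in this spec
        seed=42,                                    # reproducible sampling
    ),
)

# asdict() turns the nested dataclasses into a plain dict, e.g. for a JSON request body.
print(asdict(payload))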
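
With this patch, the negative_prompt argument of InferenceClient.text_to_image and image_to_image (sync and async) takes a single string instead of a list. A minimal sketch of the new call shape; the model id is illustrative, not part of the patch:

# Sketch: text_to_image with the single-string negative_prompt.
from huggingface_hub import InferenceClient

client = InferenceClient(model="stabilityai/stable-diffusion-xl-base-1.0")  # any text-to-image model on the Hub

image = client.text_to_image(
    "An astronaut riding a horse on the moon",
    negative_prompt="blurry, low quality, extra limbs",  # was List[str] before this patch
    num_inference_steps=25,
    guidance_scale=7.0,
)
image.save("astronaut.png")  # text_to_image returns a PIL.Image.Image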
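
The output-type cleanup means TextToAudioOutput.sampling_rate and TextToSpeechOutput.sampling_rate are regular fields again, replacing the spurious *_output_sampling_rate attributes. A small sketch with dummy values standing in for a real server response:

# Sketch: the cleaned-up TextToSpeechOutput fields after this patch.
from huggingface_hub import TextToSpeechOutput

out = TextToSpeechOutput(audio=b"<raw audio bytes>", sampling_rate=16000.0)
print(out.sampling_rate)  # 16000.0 -- no more text_to_speech_output_sampling_rate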