diff --git a/docs/source/en/package_reference/inference_types.md b/docs/source/en/package_reference/inference_types.md
index 925cc0c375..ca2f039ffe 100644
--- a/docs/source/en/package_reference/inference_types.md
+++ b/docs/source/en/package_reference/inference_types.md
@@ -325,6 +325,16 @@ This part of the lib is still under development and will be improved in future r
 
 
 
+## text_to_video
+
+[[autodoc]] huggingface_hub.TextToVideoInput
+
+[[autodoc]] huggingface_hub.TextToVideoOutput
+
+[[autodoc]] huggingface_hub.TextToVideoParameters
+
+
+
 ## token_classification
 
 [[autodoc]] huggingface_hub.TokenClassificationInput
diff --git a/docs/source/ko/package_reference/inference_types.md b/docs/source/ko/package_reference/inference_types.md
index 6ad9033b15..08063558a7 100644
--- a/docs/source/ko/package_reference/inference_types.md
+++ b/docs/source/ko/package_reference/inference_types.md
@@ -324,6 +324,16 @@ rendered properly in your Markdown viewer.
 
 
 
+## text_to_video[[huggingface_hub.TextToVideoInput]]
+
+[[autodoc]] huggingface_hub.TextToVideoInput
+
+[[autodoc]] huggingface_hub.TextToVideoOutput
+
+[[autodoc]] huggingface_hub.TextToVideoParameters
+
+
+
 ## token_classification[[huggingface_hub.TokenClassificationInput]]
 
 [[autodoc]] huggingface_hub.TokenClassificationInput
diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py
index 19a86a26e3..d1b46db2a8 100644
--- a/src/huggingface_hub/__init__.py
+++ b/src/huggingface_hub/__init__.py
@@ -398,6 +398,9 @@
         "TextToSpeechInput",
         "TextToSpeechOutput",
         "TextToSpeechParameters",
+        "TextToVideoInput",
+        "TextToVideoOutput",
+        "TextToVideoParameters",
         "TokenClassificationAggregationStrategy",
         "TokenClassificationInput",
         "TokenClassificationOutputElement",
@@ -705,6 +708,9 @@
     "TextToSpeechInput",
     "TextToSpeechOutput",
     "TextToSpeechParameters",
+    "TextToVideoInput",
+    "TextToVideoOutput",
+    "TextToVideoParameters",
     "TokenClassificationAggregationStrategy",
     "TokenClassificationInput",
     "TokenClassificationOutputElement",
@@ -1334,6 +1340,9 @@ def __dir__():
         TextToSpeechInput,  # noqa: F401
         TextToSpeechOutput,  # noqa: F401
         TextToSpeechParameters,  # noqa: F401
+        TextToVideoInput,  # noqa: F401
+        TextToVideoOutput,  # noqa: F401
+        TextToVideoParameters,  # noqa: F401
         TokenClassificationAggregationStrategy,  # noqa: F401
         TokenClassificationInput,  # noqa: F401
         TokenClassificationOutputElement,  # noqa: F401
diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py
index 4d3c902c26..2f7af43d18 100644
--- a/src/huggingface_hub/inference/_client.py
+++ b/src/huggingface_hub/inference/_client.py
@@ -1280,7 +1280,7 @@ def image_to_image(
         self,
         image: ContentT,
         prompt: Optional[str] = None,
         *,
-        negative_prompt: Optional[List[str]] = None,
+        negative_prompt: Optional[str] = None,
         num_inference_steps: Optional[int] = None,
         guidance_scale: Optional[float] = None,
         model: Optional[str] = None,
@@ -1301,8 +1301,8 @@ def image_to_image(
                 The input image for translation. It can be raw bytes, an image file, or a URL to an online image.
             prompt (`str`, *optional*):
                 The text prompt to guide the image generation.
-            negative_prompt (`List[str]`, *optional*):
-                One or several prompt to guide what NOT to include in image generation.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in image generation.
             num_inference_steps (`int`, *optional*):
                 For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
                 quality image at the expense of slower inference.
@@ -2377,7 +2377,7 @@ def text_to_image(
         self,
         prompt: str,
         *,
-        negative_prompt: Optional[List[str]] = None,
+        negative_prompt: Optional[str] = None,
         height: Optional[float] = None,
         width: Optional[float] = None,
         num_inference_steps: Optional[int] = None,
@@ -2400,8 +2400,8 @@ def text_to_image(
         Args:
             prompt (`str`):
                 The prompt to generate an image from.
-            negative_prompt (`List[str`, *optional*):
-                One or several prompt to guide what NOT to include in image generation.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in image generation.
             height (`float`, *optional*):
                 The height in pixels of the image to generate.
             width (`float`, *optional*):
diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py
index 7d0721f43b..6796c96305 100644
--- a/src/huggingface_hub/inference/_generated/_async_client.py
+++ b/src/huggingface_hub/inference/_generated/_async_client.py
@@ -1328,7 +1328,7 @@ async def image_to_image(
         self,
         image: ContentT,
         prompt: Optional[str] = None,
         *,
-        negative_prompt: Optional[List[str]] = None,
+        negative_prompt: Optional[str] = None,
         num_inference_steps: Optional[int] = None,
         guidance_scale: Optional[float] = None,
         model: Optional[str] = None,
@@ -1349,8 +1349,8 @@ async def image_to_image(
                 The input image for translation. It can be raw bytes, an image file, or a URL to an online image.
             prompt (`str`, *optional*):
                 The text prompt to guide the image generation.
-            negative_prompt (`List[str]`, *optional*):
-                One or several prompt to guide what NOT to include in image generation.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in image generation.
             num_inference_steps (`int`, *optional*):
                 For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher
                 quality image at the expense of slower inference.
@@ -2436,7 +2436,7 @@ async def text_to_image(
         self,
         prompt: str,
         *,
-        negative_prompt: Optional[List[str]] = None,
+        negative_prompt: Optional[str] = None,
         height: Optional[float] = None,
         width: Optional[float] = None,
         num_inference_steps: Optional[int] = None,
@@ -2459,8 +2459,8 @@ async def text_to_image(
         Args:
             prompt (`str`):
                 The prompt to generate an image from.
-            negative_prompt (`List[str`, *optional*):
-                One or several prompt to guide what NOT to include in image generation.
+            negative_prompt (`str`, *optional*):
+                One prompt to guide what NOT to include in image generation.
             height (`float`, *optional*):
                 The height in pixels of the image to generate.
             width (`float`, *optional*):
diff --git a/src/huggingface_hub/inference/_generated/types/__init__.py b/src/huggingface_hub/inference/_generated/types/__init__.py
index 3de22e6023..137c3c3e23 100644
--- a/src/huggingface_hub/inference/_generated/types/__init__.py
+++ b/src/huggingface_hub/inference/_generated/types/__init__.py
@@ -149,6 +149,7 @@
     TextToSpeechOutput,
    TextToSpeechParameters,
 )
+from .text_to_video import TextToVideoInput, TextToVideoOutput, TextToVideoParameters
 from .token_classification import (
     TokenClassificationAggregationStrategy,
     TokenClassificationInput,
diff --git a/src/huggingface_hub/inference/_generated/types/image_to_image.py b/src/huggingface_hub/inference/_generated/types/image_to_image.py
index 8a37d9b856..d4b6c7ca25 100644
--- a/src/huggingface_hub/inference/_generated/types/image_to_image.py
+++ b/src/huggingface_hub/inference/_generated/types/image_to_image.py
@@ -4,7 +4,7 @@
 # - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 # - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
 from dataclasses import dataclass
-from typing import Any, List, Optional
+from typing import Any, Optional
 
 from .base import BaseInferenceType
 
@@ -25,8 +25,8 @@ class ImageToImageParameters(BaseInferenceType):
     """For diffusion models. A higher guidance scale value encourages the model to generate
     images closely linked to the text prompt at the expense of lower image quality.
     """
-    negative_prompt: Optional[List[str]] = None
-    """One or several prompt to guide what NOT to include in image generation."""
+    negative_prompt: Optional[str] = None
+    """One prompt to guide what NOT to include in image generation."""
     num_inference_steps: Optional[int] = None
     """For diffusion models. The number of denoising steps. More denoising steps usually lead
     to a higher quality image at the expense of slower inference.
diff --git a/src/huggingface_hub/inference/_generated/types/text_to_audio.py b/src/huggingface_hub/inference/_generated/types/text_to_audio.py
index 0a6d64e4d9..b57fadb86f 100644
--- a/src/huggingface_hub/inference/_generated/types/text_to_audio.py
+++ b/src/huggingface_hub/inference/_generated/types/text_to_audio.py
@@ -97,6 +97,5 @@ class TextToAudioOutput(BaseInferenceType):
 
     audio: Any
     """The generated audio waveform."""
-    sampling_rate: Any
-    text_to_audio_output_sampling_rate: Optional[float] = None
+    sampling_rate: float
     """The sampling rate of the generated audio waveform."""
diff --git a/src/huggingface_hub/inference/_generated/types/text_to_image.py b/src/huggingface_hub/inference/_generated/types/text_to_image.py
index 7e5a0de157..8d2ff187a3 100644
--- a/src/huggingface_hub/inference/_generated/types/text_to_image.py
+++ b/src/huggingface_hub/inference/_generated/types/text_to_image.py
@@ -4,7 +4,7 @@
 # - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 # - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
 from dataclasses import dataclass
-from typing import Any, List, Optional
+from typing import Any, Optional
 
 from .base import BaseInferenceType
 
@@ -25,8 +25,8 @@ class TextToImageParameters(BaseInferenceType):
     """A higher guidance scale value encourages the model to generate images closely linked to
     the text prompt, but values too high may cause saturation and other artifacts.
     """
-    negative_prompt: Optional[List[str]] = None
-    """One or several prompt to guide what NOT to include in image generation."""
+    negative_prompt: Optional[str] = None
+    """One prompt to guide what NOT to include in image generation."""
     num_inference_steps: Optional[int] = None
     """The number of denoising steps. More denoising steps usually lead to a higher quality
     image at the expense of slower inference.
diff --git a/src/huggingface_hub/inference/_generated/types/text_to_speech.py b/src/huggingface_hub/inference/_generated/types/text_to_speech.py
index 12f6b50e4d..20bcd27965 100644
--- a/src/huggingface_hub/inference/_generated/types/text_to_speech.py
+++ b/src/huggingface_hub/inference/_generated/types/text_to_speech.py
@@ -93,12 +93,9 @@ class TextToSpeechInput(BaseInferenceType):
 
 @dataclass
 class TextToSpeechOutput(BaseInferenceType):
-    """Outputs for Text to Speech inference
-    Outputs of inference for the Text To Audio task
-    """
+    """Outputs of inference for the Text To Speech task"""
 
     audio: Any
-    """The generated audio waveform."""
-    sampling_rate: Any
-    text_to_speech_output_sampling_rate: Optional[float] = None
+    """The generated audio"""
+    sampling_rate: Optional[float] = None
     """The sampling rate of the generated audio waveform."""
diff --git a/src/huggingface_hub/inference/_generated/types/text_to_video.py b/src/huggingface_hub/inference/_generated/types/text_to_video.py
new file mode 100644
index 0000000000..4adea263e3
--- /dev/null
+++ b/src/huggingface_hub/inference/_generated/types/text_to_video.py
@@ -0,0 +1,47 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+# - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+# - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from dataclasses import dataclass
+from typing import Any, List, Optional
+
+from .base import BaseInferenceType
+
+
+@dataclass
+class TextToVideoParameters(BaseInferenceType):
+    """Additional inference parameters for Text To Video"""
+
+    guidance_scale: Optional[float] = None
+    """A higher guidance scale value encourages the model to generate images closely linked to
+    the text prompt, but values too high may cause saturation and other artifacts.
+    """
+    negative_prompt: Optional[List[str]] = None
+    """One or several prompt to guide what NOT to include in image generation."""
+    num_frames: Optional[float] = None
+    """The num_frames parameter determines how many video frames are generated."""
+    num_inference_steps: Optional[int] = None
+    """The number of denoising steps. More denoising steps usually lead to a higher quality
+    image at the expense of slower inference.
+    """
+    seed: Optional[int] = None
+    """Seed for the random number generator."""
+
+
+@dataclass
+class TextToVideoInput(BaseInferenceType):
+    """Inputs for Text To Video inference"""
+
+    inputs: str
+    """The input text data (sometimes called "prompt")"""
+    parameters: Optional[TextToVideoParameters] = None
+    """Additional inference parameters for Text To Video"""
+
+
+@dataclass
+class TextToVideoOutput(BaseInferenceType):
+    """Outputs of inference for the Text To Video task"""
+
+    video: Any
+    """The generated video returned as raw bytes in the payload."""
diff --git a/src/huggingface_hub/inference/_generated/types/visual_question_answering.py b/src/huggingface_hub/inference/_generated/types/visual_question_answering.py
index eae04cb852..9001b3bd17 100644
--- a/src/huggingface_hub/inference/_generated/types/visual_question_answering.py
+++ b/src/huggingface_hub/inference/_generated/types/visual_question_answering.py
@@ -15,7 +15,7 @@ class VisualQuestionAnsweringInputData(BaseInferenceType):
 
     image: Any
     """The image."""
-    question: Any
+    question: str
     """The question to answer based on the image."""
 
 
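
Usage note (not part of the patch): a minimal sketch of how the newly generated text-to-video dataclasses could be filled in once this change ships. The prompt text and parameter values are made up for illustration; only the classes and fields added in text_to_video.py are assumed.

# Sketch: building a text-to-video request payload from the new generated types.
from dataclasses import asdict

from huggingface_hub import TextToVideoInput, TextToVideoParameters

payload = TextToVideoInput(
    inputs="A red panda riding a bicycle through a bamboo forest",
    parameters=TextToVideoParameters(
        num_frames=16,                              # how many video frames to generate
        num_inference_steps=25,                     # more steps: higher quality, slower
        guidance_scale=7.5,                         # stronger adherence to the prompt
        negative_prompt=["blurry", "low quality"],  # still List[str] in this spec
        seed=42,                                    # reproducible sampling
    ),
)

# asdict() turns the nested dataclasses into a plain dict, e.g. for a JSON request body.
print(asdict(payload))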
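
With this patch, the negative_prompt argument of InferenceClient.text_to_image and image_to_image (sync and async) takes a single string instead of a list. A minimal sketch of the new call shape; the model id is illustrative, not part of the patch:

# Sketch: text_to_image with the single-string negative_prompt.
from huggingface_hub import InferenceClient

client = InferenceClient(model="stabilityai/stable-diffusion-xl-base-1.0")  # any text-to-image model on the Hub

image = client.text_to_image(
    "An astronaut riding a horse on the moon",
    negative_prompt="blurry, low quality, extra limbs",  # was List[str] before this patch
    num_inference_steps=25,
    guidance_scale=7.0,
)
image.save("astronaut.png")  # text_to_image returns a PIL.Image.Image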
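
The output-type cleanup means TextToAudioOutput.sampling_rate and TextToSpeechOutput.sampling_rate are regular fields again, replacing the spurious *_output_sampling_rate attributes. A small sketch with dummy values standing in for a real server response:

# Sketch: the cleaned-up TextToSpeechOutput fields after this patch.
from huggingface_hub import TextToSpeechOutput

out = TextToSpeechOutput(audio=b"<raw audio bytes>", sampling_rate=16000.0)
print(out.sampling_rate)  # 16000.0 -- no more text_to_speech_output_sampling_rate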