
Commit 993665b

Add Gradio helpers - part 4 (openvinotoolkit#2315)
Ticket: CVS-147626

Notebooks:
1. animate-anyone/animate-anyone.ipynb
2. bark-text-to-audio/bark-text-to-audio.ipynb
3. blip-visual-language-processing/blip-visual-language-processing.ipynb
4. controlnet-stable-diffusion/controlnet-stable-diffusion.ipynb
5. ddcolor-image-colorization/ddcolor-image-colorization.ipynb
6. depth-anything/depth-anything-v2.ipynb
7. depth-anything/depth-anything.ipynb
8. dolly-2-instruction-following/dolly-2-instruction-following.ipynb
9. dynamicrafter-animating-images/dynamicrafter-animating-images.ipynb
1 parent 6f0fd17 commit 993665b

17 files changed: +658 -553 lines changed
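Every notebook touched by this commit picks up the same refactor: the Gradio UI moves into a per-notebook `gradio_helper.py` that exposes a `make_demo()` factory, and the notebook now only downloads that helper on first run, imports `make_demo`, and launches the returned demo. A minimal sketch of the download step, factored into a hypothetical `fetch_gradio_helper()` function for illustration (the notebooks themselves inline this logic, as the diffs below show):

```python
from pathlib import Path

import requests

# Base of the raw-file URLs used by the updated notebooks (taken from the diffs below).
BASE_URL = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks"


def fetch_gradio_helper(notebook_dir: str) -> None:
    """Download a notebook's gradio_helper.py next to the notebook; skip if it already exists."""
    if not Path("gradio_helper.py").exists():
        r = requests.get(url=f"{BASE_URL}/{notebook_dir}/gradio_helper.py")
        open("gradio_helper.py", "w").write(r.text)


# Example: the first notebook in this commit.
fetch_gradio_helper("animate-anyone")
```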

notebooks/animate-anyone/animate-anyone.ipynb (+23 -46)
@@ -165,9 +165,7 @@
 "from diffusers.image_processor import VaeImageProcessor\n",
 "from transformers import CLIPImageProcessor\n",
 "import torch\n",
-"import gradio as gr\n",
 "import ipywidgets as widgets\n",
-"import numpy as np\n",
 "\n",
 "from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline\n",
 "from src.utils.util import get_fps, read_frames\n",
@@ -1523,11 +1521,14 @@
 },
 {
 "cell_type": "code",
-"execution_count": 27,
-"id": "2832a501-a8cb-4a44-a249-846f8524e3d6",
+"execution_count": null,
+"id": "6a361d62",
 "metadata": {},
 "outputs": [],
 "source": [
+"import gradio as gr\n",
+"\n",
+"\n",
 "def generate(\n",
 " img,\n",
 " pose_vid,\n",
@@ -1573,48 +1574,24 @@
 " n_rows=3,\n",
 " fps=12,\n",
 " )\n",
-" return out_path\n",
-"\n",
-"\n",
-"demo = gr.Interface(\n",
-" generate,\n",
-" [\n",
-" gr.Image(label=\"Reference Image\", type=\"pil\"),\n",
-" gr.Video(label=\"Pose video\"),\n",
-" gr.Slider(\n",
-" label=\"Seed\",\n",
-" value=42,\n",
-" minimum=np.iinfo(np.int32).min,\n",
-" maximum=np.iinfo(np.int32).max,\n",
-" ),\n",
-" gr.Slider(label=\"Guidance scale\", value=3.5, minimum=1.1, maximum=10),\n",
-" gr.Slider(label=\"Number of inference steps\", value=30, minimum=15, maximum=100),\n",
-" ],\n",
-" \"video\",\n",
-" examples=[\n",
-" [\n",
-" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-2.png\",\n",
-" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-2_kps.mp4\",\n",
-" ],\n",
-" [\n",
-" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-10.png\",\n",
-" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-1_kps.mp4\",\n",
-" ],\n",
-" [\n",
-" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-11.png\",\n",
-" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-1_kps.mp4\",\n",
-" ],\n",
-" [\n",
-" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-3.png\",\n",
-" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-2_kps.mp4\",\n",
-" ],\n",
-" [\n",
-" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-5.png\",\n",
-" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-2_kps.mp4\",\n",
-" ],\n",
-" ],\n",
-" allow_flagging=\"never\",\n",
-")\n",
+" return out_path"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 27,
+"id": "2832a501-a8cb-4a44-a249-846f8524e3d6",
+"metadata": {},
+"outputs": [],
+"source": [
+"if not Path(\"gradio_helper.py\").exists():\n",
+" r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/animate-anyone/gradio_helper.py\")\n",
+" open(\"gradio_helper.py\", \"w\").write(r.text)\n",
+"\n",
+"from gradio_helper import make_demo\n",
+"\n",
+"demo = make_demo(fn=generate)\n",
+"\n",
 "try:\n",
 " demo.queue().launch(debug=True)\n",
 "except Exception:\n",
notebooks/animate-anyone/gradio_helper.py (new file, +48)
@@ -0,0 +1,48 @@
+from typing import Callable
+import gradio as gr
+import numpy as np
+
+examples = [
+    [
+        "Moore-AnimateAnyone/configs/inference/ref_images/anyone-2.png",
+        "Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-2_kps.mp4",
+    ],
+    [
+        "Moore-AnimateAnyone/configs/inference/ref_images/anyone-10.png",
+        "Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-1_kps.mp4",
+    ],
+    [
+        "Moore-AnimateAnyone/configs/inference/ref_images/anyone-11.png",
+        "Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-1_kps.mp4",
+    ],
+    [
+        "Moore-AnimateAnyone/configs/inference/ref_images/anyone-3.png",
+        "Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-2_kps.mp4",
+    ],
+    [
+        "Moore-AnimateAnyone/configs/inference/ref_images/anyone-5.png",
+        "Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-2_kps.mp4",
+    ],
+]
+
+
+def make_demo(fn: Callable):
+    demo = gr.Interface(
+        fn=fn,
+        inputs=[
+            gr.Image(label="Reference Image", type="pil"),
+            gr.Video(label="Pose video"),
+            gr.Slider(
+                label="Seed",
+                value=42,
+                minimum=np.iinfo(np.int32).min,
+                maximum=np.iinfo(np.int32).max,
+            ),
+            gr.Slider(label="Guidance scale", value=3.5, minimum=1.1, maximum=10),
+            gr.Slider(label="Number of inference steps", value=30, minimum=15, maximum=100),
+        ],
+        outputs="video",
+        examples=examples,
+        allow_flagging="never",
+    )
+    return demo
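With the helper in place, the notebook simply calls `make_demo(fn=generate)` and launches the result, as the diff above shows. To exercise the interface without loading the OpenVINO pipeline, any callable that accepts the same five inputs (image, video, and three slider values) can be passed instead; a sketch with a hypothetical stub:

```python
from gradio_helper import make_demo  # the new module added above


def stub_generate(img, pose_vid, seed, guidance_scale, num_inference_steps):
    # Hypothetical stand-in for the notebook's real `generate`: echo the pose video
    # back so the UI wiring can be checked without running the model.
    return pose_vid


demo = make_demo(fn=stub_generate)
demo.launch(debug=True)
```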

notebooks/bark-text-to-audio/bark-text-to-audio.ipynb (+24 -113)
@@ -1132,12 +1132,12 @@
 {
 "cell_type": "code",
 "execution_count": null,
-"id": "590b9db5",
+"id": "3637baad",
 "metadata": {},
 "outputs": [],
 "source": [
 "import numpy as np\n",
-"import gradio as gr\n",
+"\n",
 "from bark import SAMPLE_RATE\n",
 "from bark.generation import SUPPORTED_LANGS\n",
 "\n",
@@ -1151,124 +1151,35 @@
 "PROMPT_LOOKUP[\"Unconditional\"] = None\n",
 "PROMPT_LOOKUP[\"Announcer\"] = \"announcer\"\n",
 "\n",
-"default_text = \"Hello, my name is Suno. And, uh — and I like pizza. [laughs]\\nBut I also have other interests such as playing tic tac toe.\"\n",
-"\n",
-"title = \"# 🐶 Bark: Text-to-Speech using OpenVINO</div>\"\n",
-"\n",
-"description = \"\"\"\n",
-"Bark is a universal text-to-audio model created by [Suno](http://suno.ai). \\\n",
-"Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. \\\n",
-"The model output is not censored and the authors do not endorse the opinions in the generated content. \\\n",
-"Use at your own risk.\n",
-"\"\"\"\n",
-"\n",
-"article = \"\"\"\n",
-"\n",
-"## 🌎 Foreign Language\n",
-"\n",
-"Bark supports various languages out-of-the-box and automatically determines language from input text. \\\n",
-"When prompted with code-switched text, Bark will even attempt to employ the native accent for the respective languages in the same voice.\n",
-"\n",
-"Try the prompt:\n",
-"\n",
-"```\n",
-"Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.\n",
-"```\n",
-"\n",
-"## 🤭 Non-Speech Sounds\n",
-"\n",
-"Below is a list of some known non-speech sounds, but we are finding more every day. \\\n",
-"Please let us know if you find patterns that work particularly well on Discord!\n",
-"\n",
-"* [laughter]\n",
-"* [laughs]\n",
-"* [sighs]\n",
-"* [music]\n",
-"* [gasps]\n",
-"* [clears throat]\n",
-"* — or ... for hesitations\n",
-"* ♪ for song lyrics\n",
-"* capitalization for emphasis of a word\n",
-"* MAN/WOMAN: for bias towards speaker\n",
-"\n",
-"Try the prompt:\n",
-"\n",
-"```\n",
-"\" [clears throat] Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as... ♪ singing ♪.\"\n",
-"```\n",
-"\n",
-"## 🎶 Music\n",
-"Bark can generate all types of audio, and, in principle, doesn't see a difference between speech and music. \\\n",
-"Sometimes Bark chooses to generate text as music, but you can help it out by adding music notes around your lyrics.\n",
-"\n",
-"Try the prompt:\n",
-"\n",
-"```\n",
-"♪ In the jungle, the mighty jungle, the lion barks tonight ♪\n",
-"```\n",
-"\n",
-"## 🧬 Voice Cloning\n",
-"\n",
-"Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. \\\n",
-"The model also attempts to preserve music, ambient noise, etc. from input audio. \\\n",
-"However, to mitigate misuse of this technology, we limit the audio history prompts to a limited set of Suno-provided, fully synthetic options to choose from.\n",
-"\n",
-"## 👥 Speaker Prompts\n",
-"\n",
-"You can provide certain speaker prompts such as NARRATOR, MAN, WOMAN, etc. \\\n",
-"Please note that these are not always respected, especially if a conflicting audio history prompt is given.\n",
-"\n",
-"Try the prompt:\n",
-"\n",
-"```\n",
-"WOMAN: I would like an oatmilk latte please.\n",
-"MAN: Wow, that's expensive!\n",
-"```\n",
-"\n",
-"\"\"\"\n",
-"\n",
-"examples = [\n",
-" [\n",
-" \"Please surprise me and speak in whatever voice you enjoy. Vielen Dank und Gesundheit!\",\n",
-" \"Unconditional\",\n",
-" ],\n",
-" [\n",
-" \"Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as playing tic tac toe.\",\n",
-" \"Speaker 1 (en)\",\n",
-" ],\n",
-" [\n",
-" \"Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.\",\n",
-" \"Speaker 0 (es)\",\n",
-" ],\n",
-"]\n",
-"\n",
 "\n",
 "def gen_tts(text, history_prompt):\n",
 " history_prompt = PROMPT_LOOKUP[history_prompt]\n",
 " audio_arr = generate_audio(text, history_prompt=history_prompt)\n",
 " audio_arr = (audio_arr * 32767).astype(np.int16)\n",
-" return (SAMPLE_RATE, audio_arr)\n",
-"\n",
-"\n",
-"with gr.Blocks() as block:\n",
-" gr.Markdown(title)\n",
-" gr.Markdown(description)\n",
-" with gr.Row():\n",
-" with gr.Column():\n",
-" input_text = gr.Textbox(label=\"Input Text\", lines=2, value=default_text)\n",
-" options = gr.Dropdown(AVAILABLE_PROMPTS, value=\"Speaker 1 (en)\", label=\"Acoustic Prompt\")\n",
-" run_button = gr.Button()\n",
-" with gr.Column():\n",
-" audio_out = gr.Audio(label=\"Generated Audio\", type=\"numpy\")\n",
-" inputs = [input_text, options]\n",
-" outputs = [audio_out]\n",
-" gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs)\n",
-" gr.Markdown(article)\n",
-" run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)\n",
+" return (SAMPLE_RATE, audio_arr)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "590b9db5",
+"metadata": {},
+"outputs": [],
+"source": [
+"import requests\n",
+"\n",
+"if not Path(\"gradio_helper.py\").exists():\n",
+" r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/bark-text-to-audio/gradio_helper.py\")\n",
+" open(\"gradio_helper.py\", \"w\").write(r.text)\n",
+"\n",
+"from gradio_helper import make_demo\n",
+"\n",
+"demo = make_demo(fn=gen_tts, available_prompts=AVAILABLE_PROMPTS)\n",
+"\n",
 "try:\n",
-" block.launch(debug=True)\n",
+" demo.launch(debug=True)\n",
 "except Exception:\n",
-" block.launch(share=True, debug=True)\n",
+" demo.launch(share=True, debug=True)\n",
 "# if you are launching remotely, specify server_name and server_port\n",
 "# demo.launch(server_name='your server name', server_port='server port in int')\n",
 "# Read more in the docs: https://gradio.app/docs/"