openvinotoolkit · apaniukov · Mar 29, 2022 · Mar 29, 2022 · Mar 29, 2022 · Mar 30, 2022
diff --git a/automation/bom/image_BOM.txt b/automation/bom/image_BOM.txt
@@ -506,6 +506,8 @@ wb/main/jupyter_notebooks/cell_templates/summary_docs_cell.jinja
 wb/main/jupyter_notebooks/cell_templates/tokenize_dataset_code_cell.jinja
 wb/main/jupyter_notebooks/cell_templates/tokenize_dataset_docs_cell.jinja
 wb/main/jupyter_notebooks/cell_templates/tokenizer_parameters_code_cell.jinja
+wb/main/jupyter_notebooks/cell_templates/transformers_onnx_converter_code_cell.jinja
+wb/main/jupyter_notebooks/cell_templates/transformers_onnx_converter_docs_cell.jinja
 wb/main/jupyter_notebooks/cell_templates/validate_ir_model_code_cell.jinja
 wb/main/jupyter_notebooks/cell_templates/validate_ir_model_docs_cell.jinja
 wb/main/jupyter_notebooks/cli_tools_options.py

diff --git a/wb/main/jupyter_notebooks/cell_template_contexts.py b/wb/main/jupyter_notebooks/cell_template_contexts.py
@@ -85,6 +85,7 @@ class TokenizerParametersTemplateContext(TypedDict):
     batch: Optional[int]
     streams: Optional[int]
 
+
 class AccuracyDocsCellTemplateContext(TypedDict):
     yaml_config_path: str
 
@@ -107,3 +108,7 @@ class Int8OptimizationCodeCellTemplateContext(Int8OptimizationDocsCellTemplateCo
 
 class InstallRequirementsCodeCellTemplateContext(TypedDict):
     requirements_file: str
+
+
+class TransformersONNXCodeCellTemplateContext(TypedDict):
+    model_checkpoint: str
diff --git a/wb/main/jupyter_notebooks/cell_templates/obtain_model_docs_cell.jinja b/wb/main/jupyter_notebooks/cell_templates/obtain_model_docs_cell.jinja
@@ -13,7 +13,14 @@ In this tutorial we are working with the following model:
 
 - Model Name: **{{ project_model_name }}**
 
-- Model Source: {{ 'Open Model Zoo' if project_model_source == ModelSourceEnum.omz.value else 'User provided model' }}
+- Model Source:
+{% if project_model_source == ModelSourceEnum.huggingface.value %}
+Hugging Face Hub
+{% elif project_model_source == ModelSourceEnum.omz.value %}
+Open Model Zoo
+{% else %}
+User provided model
+{% endif %}
 
 - Model Framework: {{ SupportedFrameworksEnum.get_name(project_model_framework) }}
 
@@ -26,6 +33,8 @@ No conversion to IR required. Download the model with the Model Downloader and p
 {%      else %}
 Download the model with Model Downloader and then convert it to IR format with Model Converter.
 {%      endif %}
+{% elif project_model_source == ModelSourceEnum.huggingface.value %}
+Your original model is in PyTorch format. Use the `transformers.onnx` CLI tool to convert it to ONNX, then convert the model to the IR format with Model Optimizer.
 {% elif project_model_source == ModelSourceEnum.original.value %}
 Your original model is in one of the supported frameworks. Convert model to IR format with Model Optimizer.
 {% endif %}
diff --git a/wb/main/jupyter_notebooks/cell_templates/transformers_onnx_converter_code_cell.jinja b/wb/main/jupyter_notebooks/cell_templates/transformers_onnx_converter_code_cell.jinja
@@ -0,0 +1,7 @@
+{% include 'autogenerated_note_code.jinja' %}
+
+
+!python -m transformers.onnx \
+    --model={{ model_checkpoint }} \
+    --feature sequence-classification \
+    onnx
diff --git a/wb/main/jupyter_notebooks/cell_templates/transformers_onnx_converter_docs_cell.jinja b/wb/main/jupyter_notebooks/cell_templates/transformers_onnx_converter_docs_cell.jinja
@@ -0,0 +1,9 @@
+### Get ONNX model from Hugging Face Hub
+
+To obtain an ONNX model, use the [transformers.onnx](https://huggingface.co/docs/transformers/main/en/serialization#onnx) CLI tool from the `transformers` library.
+
+It will execute these steps:
+1. Download the model files and tokenizer files from the Hugging Face Hub
+1. Generate the dummy input with the tokenizer and pass it to the model to trace the model execution graph
+1. Use the execution graph to generate ONNX model
+1. Check that the outputs of the resulting model are close to those of the original model
diff --git a/wb/main/jupyter_notebooks/jupyter_notebook_cell.py b/wb/main/jupyter_notebooks/jupyter_notebook_cell.py
@@ -66,6 +66,8 @@ class NotebookCellIds(enum.Enum):
     tokenize_dataset_docs = 'tokenize_dataset_docs'
     tokenize_dataset_code = 'tokenize_dataset_code'
     tokenizer_parameters_code = 'tokenizer_parameters_code'
+    transformers_onnx_converter_docs = 'transformers_onnx_converter_docs'
+    transformers_onnx_converter_code = 'transformers_onnx_converter_code'
 
 
 class NotebookCellConfig:
@@ -144,6 +146,18 @@ class NotebookCells:
         cell_type=NotebookCellTypes.markdown,
         template_filename='model_converter_result_docs_cell.jinja')
 
+    transformers_onnx_converter_docs = NotebookCellConfig(
+        cell_id=NotebookCellIds.transformers_onnx_converter_docs,
+        cell_type=NotebookCellTypes.markdown,
+        template_filename='transformers_onnx_converter_docs_cell.jinja'
+    )
+
+    transformers_onnx_converter_code = NotebookCellConfig(
+        cell_id=NotebookCellIds.transformers_onnx_converter_code,
+        cell_type=NotebookCellTypes.code,
+        template_filename='transformers_onnx_converter_code_cell.jinja'
+    )
+
     model_optimizer_docs = NotebookCellConfig(
         cell_id=NotebookCellIds.model_optimizer_docs,
         cell_type=NotebookCellTypes.markdown,
@@ -179,7 +193,7 @@ class NotebookCells:
         cell_type=NotebookCellTypes.code,
         template_filename='validate_ir_model_code_cell.jinja')
 
-    tokenizer_parameters_code_code = NotebookCellConfig(
+    tokenizer_parameters_code = NotebookCellConfig(
         cell_id=NotebookCellIds.tokenizer_parameters_code,
         cell_type=NotebookCellTypes.code,
         template_filename='tokenizer_parameters_code_cell.jinja')

diff --git a/wb/main/jupyter_notebooks/notebook_template_creator.py b/wb/main/jupyter_notebooks/notebook_template_creator.py
@@ -48,7 +48,7 @@ class NotebookTemplateCreator:
         ],
         model_tokenizer=[
             NotebookCells.load_tokenizer_docs,
-            NotebookCells.tokenizer_parameters_code_code,
+            NotebookCells.tokenizer_parameters_code,
             NotebookCells.load_tokenizer_code,
             NotebookCells.tokenize_dataset_docs,
             NotebookCells.tokenize_dataset_code,
@@ -98,6 +98,16 @@ def _obtain_model_section_cells(self) -> List[NotebookCellConfig]:
                 ])
             obtain_model_cells.append(NotebookCells.obtain_model_result_docs)
             return obtain_model_cells
+        if self._original_model_source == ModelSourceEnum.huggingface:
+            return [
+                NotebookCells.obtain_model_docs,
+                NotebookCells.transformers_onnx_converter_docs,
+                NotebookCells.transformers_onnx_converter_code,
+                NotebookCells.model_optimizer_docs,
+                NotebookCells.model_optimizer_code,
+                NotebookCells.model_optimizer_result_docs,
+                NotebookCells.obtain_model_result_docs,
+            ]
         if self._original_model_source == ModelSourceEnum.original:
             return [
                 NotebookCells.obtain_model_docs,

diff --git a/wb/main/jupyter_notebooks/resources/requirements_nlp.txt b/wb/main/jupyter_notebooks/resources/requirements_nlp.txt
@@ -2,5 +2,9 @@ PyYAML==5.4.1
 numpy==1.19.5
 progress==1.6
 
-transformers==4.12.2
+transformers[onnx]==4.16.2
+
+# tokenization dependencies
 sentencepiece==0.1.96
+fugashi==1.1.2 # ja tokenizers
+ipadic==1.0.0 # ja tokenizers
diff --git a/wb/main/models/jupyter_notebook_model.py b/wb/main/models/jupyter_notebook_model.py
@@ -33,7 +33,7 @@
     ObtainModelDocsCellTemplateContext, ModelDownloaderCodeCellTemplateContext, \
     CheckModelFormatDocsCellTemplateContext, ModelConverterCodeCellTemplateContext, \
     ModelOptimizerCodeCellTemplateContext, InstallRequirementsCodeCellTemplateContext, \
-    TokenizerParametersTemplateContext
+    TokenizerParametersTemplateContext, TransformersONNXCodeCellTemplateContext
 from wb.main.jupyter_notebooks.cli_tools_options import CLIToolEnum
 from wb.main.jupyter_notebooks.config_file_dumpers import AccuracyConfigFileDumper, Int8OptimizationConfigFileDumper
 from wb.main.jupyter_notebooks.jupyter_notebook_cell import NotebookCellIds
@@ -409,6 +409,11 @@ def _tokenizer_parameters_template_context(self) -> TokenizerParametersTemplateC
             streams=streams,
         )
 
+    @property
+    def _transformers_onnx_template_context(self) -> TransformersONNXCodeCellTemplateContext:
+        model_checkpoint = self.project.topology.name
+        return TransformersONNXCodeCellTemplateContext(model_checkpoint=model_checkpoint)
+
     _job_type_to_update_cell_ids_map = {
         JobTypesEnum.profiling_type: [
             NotebookCellIds.intro_docs,
@@ -465,7 +470,8 @@ def _tokenizer_parameters_template_context(self) -> TokenizerParametersTemplateC
         NotebookCellIds.int8_optimization_code: _int8_optimization_code_cell_template_context,
         NotebookCellIds.int8_optimization_result_docs: _int8_optimization_code_cell_template_context,
         NotebookCellIds.tokenizer_parameters_code: _tokenizer_parameters_template_context,
-        NotebookCellIds.install_python_requirements_code: _install_requirements_template_context
+        NotebookCellIds.install_python_requirements_code: _install_requirements_template_context,
+        NotebookCellIds.transformers_onnx_converter_code: _transformers_onnx_template_context
     }