Skip to content
This repository was archived by the owner on Aug 28, 2023. It is now read-only.

[82639] Add HF Hub model download for autogenerated notebooks #52

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
2 changes: 2 additions & 0 deletions automation/bom/image_BOM.txt
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,8 @@ wb/main/jupyter_notebooks/cell_templates/summary_docs_cell.jinja
wb/main/jupyter_notebooks/cell_templates/tokenize_dataset_code_cell.jinja
wb/main/jupyter_notebooks/cell_templates/tokenize_dataset_docs_cell.jinja
wb/main/jupyter_notebooks/cell_templates/tokenizer_parameters_code_cell.jinja
wb/main/jupyter_notebooks/cell_templates/transformers_onnx_converter_code_cell.jinja
wb/main/jupyter_notebooks/cell_templates/transformers_onnx_converter_docs_cell.jinja
wb/main/jupyter_notebooks/cell_templates/validate_ir_model_code_cell.jinja
wb/main/jupyter_notebooks/cell_templates/validate_ir_model_docs_cell.jinja
wb/main/jupyter_notebooks/cli_tools_options.py
Expand Down
5 changes: 5 additions & 0 deletions wb/main/jupyter_notebooks/cell_template_contexts.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ class TokenizerParametersTemplateContext(TypedDict):
batch: Optional[int]
streams: Optional[int]


class AccuracyDocsCellTemplateContext(TypedDict):
yaml_config_path: str

Expand All @@ -107,3 +108,7 @@ class Int8OptimizationCodeCellTemplateContext(Int8OptimizationDocsCellTemplateCo

class InstallRequirementsCodeCellTemplateContext(TypedDict):
    """Context used to render the notebook code cell that installs requirements."""

    # Reference to the requirements file inserted into the cell
    # (presumably a file path -- confirm against the cell template).
    requirements_file: str


class TransformersONNXCodeCellTemplateContext(TypedDict):
    """Context used to render the `transformers.onnx` converter code cell."""

    # Value substituted into the `--model=` argument of the converter command.
    model_checkpoint: str
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,14 @@ In this tutorial we are working with the following model:

- Model Name: **{{ project_model_name }}**

- Model Source: {{ 'Open Model Zoo' if project_model_source == ModelSourceEnum.omz.value else 'User provided model' }}
- Model Source:
{% if project_model_source == ModelSourceEnum.huggingface.value %}
Hugging Face Hub
{% elif project_model_source == ModelSourceEnum.omz.value %}
Open Model Zoo
{% else %}
User provided model
{% endif %}

- Model Framework: {{ SupportedFrameworksEnum.get_name(project_model_framework) }}

Expand All @@ -26,6 +33,8 @@ No conversion to IR required. Download the model with the Model Downloader and p
{% else %}
Download the model with Model Downloader and then convert it to IR format with Model Converter.
{% endif %}
{% elif project_model_source == ModelSourceEnum.huggingface.value %}
Your original model is in PyTorch format. Use the `transformers.onnx` CLI tool to convert it to ONNX, then convert the model to the IR format with Model Optimizer.
{% elif project_model_source == ModelSourceEnum.original.value %}
Your original model is in one of the supported frameworks. Convert model to IR format with Model Optimizer.
{% endif %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{% include 'autogenerated_note_code.jinja' %}


!python -m transformers.onnx \
--model={{ model_checkpoint }} \
--feature sequence-classification \
onnx
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
### Get ONNX model from Hugging Face Hub

To obtain an ONNX model, use the [transformers.onnx](https://huggingface.co/docs/transformers/main/en/serialization#onnx) CLI tool from the `transformers` library.

It will execute these steps:
1. Download the model files and tokenizer files from the Hugging Face Hub
1. Generate the dummy input with the tokenizer and pass it to the model to trace the model execution graph
1. Use the execution graph to generate ONNX model
1. Check that the outputs of the resulting model are close to those of the original one
16 changes: 15 additions & 1 deletion wb/main/jupyter_notebooks/jupyter_notebook_cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ class NotebookCellIds(enum.Enum):
tokenize_dataset_docs = 'tokenize_dataset_docs'
tokenize_dataset_code = 'tokenize_dataset_code'
tokenizer_parameters_code = 'tokenizer_parameters_code'
transformers_onnx_converter_docs = 'transformers_onnx_converter_docs'
transformers_onnx_converter_code = 'transformers_onnx_converter_code'


class NotebookCellConfig:
Expand Down Expand Up @@ -144,6 +146,18 @@ class NotebookCells:
cell_type=NotebookCellTypes.markdown,
template_filename='model_converter_result_docs_cell.jinja')

transformers_onnx_converter_docs = NotebookCellConfig(
cell_id=NotebookCellIds.transformers_onnx_converter_docs,
cell_type=NotebookCellTypes.markdown,
template_filename='transformers_onnx_converter_docs_cell.jinja'
)

transformers_onnx_converter_code = NotebookCellConfig(
cell_id=NotebookCellIds.transformers_onnx_converter_code,
cell_type=NotebookCellTypes.code,
template_filename='transformers_onnx_converter_code_cell.jinja'
)

model_optimizer_docs = NotebookCellConfig(
cell_id=NotebookCellIds.model_optimizer_docs,
cell_type=NotebookCellTypes.markdown,
Expand Down Expand Up @@ -179,7 +193,7 @@ class NotebookCells:
cell_type=NotebookCellTypes.code,
template_filename='validate_ir_model_code_cell.jinja')

tokenizer_parameters_code_code = NotebookCellConfig(
tokenizer_parameters_code = NotebookCellConfig(
cell_id=NotebookCellIds.tokenizer_parameters_code,
cell_type=NotebookCellTypes.code,
template_filename='tokenizer_parameters_code_cell.jinja')
Expand Down
12 changes: 11 additions & 1 deletion wb/main/jupyter_notebooks/notebook_template_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class NotebookTemplateCreator:
],
model_tokenizer=[
NotebookCells.load_tokenizer_docs,
NotebookCells.tokenizer_parameters_code_code,
NotebookCells.tokenizer_parameters_code,
NotebookCells.load_tokenizer_code,
NotebookCells.tokenize_dataset_docs,
NotebookCells.tokenize_dataset_code,
Expand Down Expand Up @@ -98,6 +98,16 @@ def _obtain_model_section_cells(self) -> List[NotebookCellConfig]:
])
obtain_model_cells.append(NotebookCells.obtain_model_result_docs)
return obtain_model_cells
if self._original_model_source == ModelSourceEnum.huggingface:
return [
NotebookCells.obtain_model_docs,
NotebookCells.transformers_onnx_converter_docs,
NotebookCells.transformers_onnx_converter_code,
NotebookCells.model_optimizer_docs,
NotebookCells.model_optimizer_code,
NotebookCells.model_optimizer_result_docs,
NotebookCells.obtain_model_result_docs,
]
if self._original_model_source == ModelSourceEnum.original:
return [
NotebookCells.obtain_model_docs,
Expand Down
6 changes: 5 additions & 1 deletion wb/main/jupyter_notebooks/resources/requirements_nlp.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,9 @@ PyYAML==5.4.1
numpy==1.19.5
progress==1.6

transformers==4.12.2
transformers[onnx]==4.16.2

# tokenization dependencies
sentencepiece==0.1.96
fugashi==1.1.2 # ja tokenizers
ipadic==1.0.0 # ja tokenizers
10 changes: 8 additions & 2 deletions wb/main/models/jupyter_notebook_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
ObtainModelDocsCellTemplateContext, ModelDownloaderCodeCellTemplateContext, \
CheckModelFormatDocsCellTemplateContext, ModelConverterCodeCellTemplateContext, \
ModelOptimizerCodeCellTemplateContext, InstallRequirementsCodeCellTemplateContext, \
TokenizerParametersTemplateContext
TokenizerParametersTemplateContext, TransformersONNXCodeCellTemplateContext
from wb.main.jupyter_notebooks.cli_tools_options import CLIToolEnum
from wb.main.jupyter_notebooks.config_file_dumpers import AccuracyConfigFileDumper, Int8OptimizationConfigFileDumper
from wb.main.jupyter_notebooks.jupyter_notebook_cell import NotebookCellIds
Expand Down Expand Up @@ -409,6 +409,11 @@ def _tokenizer_parameters_template_context(self) -> TokenizerParametersTemplateC
streams=streams,
)

@property
def _transformers_onnx_template_context(self) -> TransformersONNXCodeCellTemplateContext:
    """Build the template context for the `transformers.onnx` converter code cell.

    The name of the project's topology is passed as the model checkpoint.
    """
    return TransformersONNXCodeCellTemplateContext(
        model_checkpoint=self.project.topology.name,
    )

_job_type_to_update_cell_ids_map = {
JobTypesEnum.profiling_type: [
NotebookCellIds.intro_docs,
Expand Down Expand Up @@ -465,7 +470,8 @@ def _tokenizer_parameters_template_context(self) -> TokenizerParametersTemplateC
NotebookCellIds.int8_optimization_code: _int8_optimization_code_cell_template_context,
NotebookCellIds.int8_optimization_result_docs: _int8_optimization_code_cell_template_context,
NotebookCellIds.tokenizer_parameters_code: _tokenizer_parameters_template_context,
NotebookCellIds.install_python_requirements_code: _install_requirements_template_context
NotebookCellIds.install_python_requirements_code: _install_requirements_template_context,
NotebookCellIds.transformers_onnx_converter_code: _transformers_onnx_template_context
}


Expand Down