[81218] [NLP] Progress tracking (#17)

apaniukov · web-flow · commit 8aec16c4444e · 2022-03-24T16:04:50.000+03:00
diff --git a/automation/bom/image_BOM.txt b/automation/bom/image_BOM.txt
@@ -153,6 +153,7 @@ wb/error/parse_env_error.py
 wb/error/request_error.py
 wb/error/sanitize_parameter_error.py
 wb/error/ssh_client_error.py
+wb/error/transformers_onnx_conversion_error_map.json
 wb/extensions_factories/__init__.py
 wb/extensions_factories/celery.py
 wb/extensions_factories/database.py
diff --git a/config/constants.py b/config/constants.py
@@ -50,6 +50,7 @@
 JUPYTER_CELL_TEMPLATES_FOLDER = os.path.join(ROOT_FOLDER, 'wb', 'main', 'jupyter_notebooks', 'cell_templates')
 UPLOAD_FOLDER_MODELS = os.path.join(ESSENTIAL_DATA_FOLDER, 'models')
 CONSOLE_TOOL_WRAPPER_FOLDER = os.path.join(ROOT_FOLDER, 'wb', 'main', 'console_tool_wrapper')
+TRANSFORMERS_ONNX_ERROR_MAP_JSON = Path(ROOT_FOLDER) / 'wb' / 'error' / 'transformers_onnx_conversion_error_map.json'
 VOC_IMAGES_FOLDER = 'JPEGImages'
 VOC_ANNOTATIONS_FOLDER = 'Annotations'
 VOC_IMAGESETS_FOLDER = 'ImageSets'
diff --git a/wb/error/code_registry.py b/wb/error/code_registry.py
@@ -46,6 +46,7 @@ class CodeRegistry:
         'DEPLOYMENT_MANAGER_ERROR': 4008,
         'DATUMARO_ERROR': 4009,
         'RESHAPE_MODEL_ERROR': 4010,
+        'TRANSFORMERS_ONNX_ERROR': 4020,
 
         # DATABASE ERRORS
         'DATABASE_ERROR': 5001,
@@ -160,3 +161,7 @@ def get_dev_cloud_remote_job_error_code(cls):
     @classmethod
     def get_reshape_model_error_code(cls):
+        # Numeric code registered in CODES under 'RESHAPE_MODEL_ERROR'
         return cls.CODES['RESHAPE_MODEL_ERROR']
+
+    @classmethod
+    def get_transformers_onnx_error_code(cls):
+        """Return the numeric code registered in CODES under 'TRANSFORMERS_ONNX_ERROR'."""
+        registered_codes = cls.CODES
+        return registered_codes['TRANSFORMERS_ONNX_ERROR']
diff --git a/wb/error/job_error.py b/wb/error/job_error.py
@@ -14,7 +14,9 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-from config.constants import CELERY_RETRY_COUNTDOWN, CELERY_RETRY_MAX_RETRY
+import json
+
+from config.constants import CELERY_RETRY_COUNTDOWN, CELERY_RETRY_MAX_RETRY, TRANSFORMERS_ONNX_ERROR_MAP_JSON
 from wb.error.general_error import GeneralError
 from wb.error.code_registry import CodeRegistry
 from wb.main.enumerates import RemoteSetupStatusMessagesEnum, RemoteSetupStatusCodeEnum
@@ -103,3 +105,21 @@ def __init__(self, error_type, job_id):
         RemoteSetupStatusMessagesEnum.PIP_VERSION_ERROR.value:
             RemoteSetupStatusCodeEnum.PIP_VERSION_ERROR.value,
     }
+
+
+class TransformersONNXConversionError(JobGeneralError):
+    """
+    Job error whose raw message is replaced with a user-friendly text.
+
+    The replacement is looked up in `message_map`: the first entry (in file order) whose key is
+    a substring of the raw message wins, so more specific keys have to precede more generic ones
+    in the JSON file. Messages that match no key are reported as 'Unexpected error: <message>'.
+    """
+    code = CodeRegistry.get_transformers_onnx_error_code()
+
+    # Substring of the raw error output -> text shown instead.
+    # Loaded once when the class is created; read_text() closes the file itself and, unlike a
+    # class-level `with open(...) as f`, does not leave a stray `f` attribute on the class.
+    message_map = json.loads(TRANSFORMERS_ONNX_ERROR_MAP_JSON.read_text(encoding='utf-8'))
+
+    def __init__(self, message: str, job_id: int):
+        """
+        :param message: raw error output; it is mapped to a user-friendly text before being stored
+        :param job_id: id of the failed job, passed to the parent error unchanged
+        """
+        message = self.replace_error_message(message)
+        super().__init__(message, job_id)
+
+    def replace_error_message(self, message: str) -> str:
+        """
+        Return the replacement text for the first known substring found in `message`.
+
+        :param message: raw error output (a missing/None message is treated as matching nothing)
+        :return: mapped text, or 'Unexpected error: <message>' when no key matches
+        """
+        # Guard against None so that building the error never fails with TypeError itself
+        searched_text = message or ''
+
+        for substring, replacement_string in self.message_map.items():
+            if substring in searched_text:
+                return replacement_string
+
+        return f'Unexpected error: {message}'
diff --git a/wb/error/transformers_onnx_conversion_error_map.json b/wb/error/transformers_onnx_conversion_error_map.json
@@ -0,0 +1,19 @@
+{
+    "RuntimeError: 0INTERNAL ASSERT FAILED": "PyTorch JiT trace error",
+    "onnxruntime.capi.onnxruntime_pybind11_state.InvalidArgument": "Wrong tokenizer type in the repository. To convert the model you could create a repository with the same model and right tokenizer type and use it instead.",
+    "Connection error, and we cannot find the requested files in the cached path": "Connection error. Check the internet connection.",
+    "TypeError: not a string": "Cannot initialize tokenizer from the repository.",
+    "Error(s) in loading state_dict": "Cannot initialize the model from the repository. Try to contact the creator of the repository.",
+    "sequence item 0: expected str instance": "[unk] and [pad] tokens for tokenizer are not set.",
+    "expected str, bytes or os.PathLike object": "Not enough files for tokenizer initialization in the repository",
+    "path should be string, bytes, os.PathLike or integer": "Not enough files for tokenizer initialization in the repository",
+    "The state dictionary of the model you are training to load is corrupted": "Cannot initialize the model from the repository. It may not have been saved properly.",
+    "No such file or directory (os error 2)": "Cannot initialize tokenizer from the repository.",
+    "Can't load tokenizer for": "Cannot initialize tokenizer from the repository.",
+    "Exporting model exceed maximum protobuf size of 2GB": "Cannot convert a large model to ONNX.",
+    "Connection error": "Connection error, check the internet connection.",
+    "Model and config inputs doesn't match": "Cannot convert model to ONNX - model and config inputs don't match. The repository might contain wrong tokenizer type.",
+    "Wrong index found for [MASK]": "Cannot initialize tokenizer from the model repository.",
+    "JSONDecodeError": "Cannot initialize tokenizer from the model repository - the json file is corrupted.",
+    "PreValidation Error": "Cannot initialize tokenizer or model from the repository."
+}
diff --git a/wb/main/console_tool_wrapper/huggingface_model_downloader/tool.py b/wb/main/console_tool_wrapper/huggingface_model_downloader/tool.py
@@ -14,11 +14,22 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
+
+import re
 from pathlib import Path
 
 from wb.main.console_tool_wrapper.console_parameter_validator import ConsoleParametersTypes
 from wb.main.console_tool_wrapper.python_console_tool import PythonModuleTool
-from wb.main.jobs.tools_runner.console_output_parser import ConsoleToolOutputParser
+from wb.main.jobs.interfaces.job_state import JobStateSubject
+from wb.main.jobs.tools_runner.console_output_parser import ConsoleToolOutputParser, skip_empty_line_decorator
+
+
+# Markers looked up in the console output of the tool by HuggingfaceModelDownloaderParser.
+# The first two are matched as line prefixes, the rest as substrings.
+DOWNLOAD_PROGRESS_STRING_START = 'Downloading:'
+# First line of the conversion stage; seeing it ends the download stage
+CONVERSION_START = 'Using framework PyTorch'
+# Seeing it during validation sets the progress to 100
+MODEL_SAVED = 'All good, model saved'
+# Seeing it during validation marks the run as failed
+TOLERANCE_CHECK_FAILED = "Outputs values doesn't match between reference model and ONNX exported model"
+# Seeing it ends the conversion stage and starts the validation stage
+VALIDATING_ONNX_MODEL = 'Validating ONNX model'
+# Seeing it during conversion marks the run as failed
+NOT_ALL_WEIGHTS_USED = 'Some weights of the model checkpoint'
 
 
 class HuggingfaceModelDownloaderTool(PythonModuleTool):
@@ -39,9 +50,84 @@ def __init__(self, python_exec: Path, model_id: str, onnx_model_path: Path):
 
 
 class HuggingfaceModelDownloaderParser(ConsoleToolOutputParser):
-    def __init__(self):
-        super().__init__()
+    """
+    Turns the console output of the Hugging Face model downloader tool into job progress.
+
+    Progress is split into three consecutive stages: download (up to `downloaded_pct`),
+    conversion (up to `converted_pct`) and validation (up to 100). The reported value never
+    leaves the range of the current stage, so it does not jump backwards on a stage change.
+    """
+
+    # Multipliers for the size suffixes of a progress line.
+    # NOTE(review): assumes decimal (1000-based) suffixes; only the current/total ratio is used,
+    # so a different divisor would change the result marginally - confirm against the tool output.
+    _SIZE_UNIT_MULTIPLIERS = {'': 1, 'k': 1e3, 'M': 1e6, 'G': 1e9, 'T': 1e12}
+
+    def __init__(self, job_state_subject: JobStateSubject):
+        super().__init__(job_state_subject=job_state_subject)
+        self.current_pct = 0
+        self.current_step = 0
+
+        # Each fully downloaded file advances the progress by an equal share of the first 60%
+        self.download_steps = 5
+        self.pct_per_step = round(60 / self.download_steps)
+
+        # Both patterns match the '<current>[suffix]/<total>[suffix]' part of a progress line:
+        # group 1 is the number, group 2 is its (possibly empty) size suffix
+        self.current = re.compile(r'(\d+\.?\d*)([kMGT]?)/\d+\.?\d*')
+        self.total = re.compile(r'\d+\.?\d*[kMGT]?/(\d+\.?\d*)([kMGT]?)')
+
+        self.downloaded = False
+        self.downloaded_pct = 60
+        self.converted = False
+        self.converted_pct = 80
 
+        # Once set, all further lines are ignored by parse()
+        self.error = False
+
+    @skip_empty_line_decorator
     def parse(self, string: str):
-        # todo: implement progress reporting
-        print(string)
+        """
+        Update the job progress from one line of the tool output.
+
+        :param string: a line printed by the tool
+        """
+        if self.error:
+            return
+
+        string = string.strip()
+
+        # skip tensorflow error message
+        if 'error' in string.lower() and 'tensorflow' not in string.lower():
+            self.error = True
+            self._job_state_subject.update_state(progress=100)
+            return
+
+        if not self.downloaded:
+            if string.startswith(DOWNLOAD_PROGRESS_STRING_START):
+                self.parse_download_stage(string)
+            elif string.startswith(CONVERSION_START):
+                self.current_pct = self.downloaded_pct
+                self.downloaded = True
+                self.parse_convert_stage(string)
+        elif not self.converted:
+            self.parse_convert_stage(string)
+        else:
+            self.parse_validation_stage(string)
+
+        self._job_state_subject.update_state(progress=self.current_pct)
+
+    def _size_from_match(self, size_match) -> float:
+        """
+        Convert a match of `self.current` / `self.total` to a plain number, applying its size suffix
+        """
+        return float(size_match.group(1)) * self._SIZE_UNIT_MULTIPLIERS[size_match.group(2)]
+
+    def parse_download_stage(self, string: str):
+        """
+        Advance the progress within the download stage from a 'Downloading:' line.
+
+        Lines without a '<current>/<total>' part are ignored.
+        """
+        current_size_match = self.current.search(string)
+        total_size_match = self.total.search(string)
+
+        if not current_size_match or not total_size_match:
+            return
+
+        # Sizes are compared in the same unit: '996k/1.2M' is ~83%, not "current > total"
+        current_size = self._size_from_match(current_size_match)
+        total_size = self._size_from_match(total_size_match)
+
+        # An empty file ('0.00/0.00') or inconsistent numbers contribute nothing instead of failing
+        if total_size <= 0 or current_size > total_size:
+            ratio = 0
+        else:
+            ratio = current_size / total_size
+
+        step_pct = self.current_step * self.pct_per_step + round(ratio * self.pct_per_step)
+        # Never go backwards, and never leave the download range even if more than
+        # `download_steps` files are downloaded
+        self.current_pct = min(max(step_pct, self.current_pct), self.downloaded_pct)
+
+        self.current_step += (current_size == total_size)
+
+    def parse_convert_stage(self, string: str) -> None:
+        """
+        Advance the progress by one point per line of the conversion stage and detect its end.
+        """
+        # Stay within the conversion range so that entering validation does not move progress back
+        self.current_pct = min(self.current_pct + 1, self.converted_pct)
+
+        if NOT_ALL_WEIGHTS_USED in string:
+            self.error = True
+        elif VALIDATING_ONNX_MODEL in string:
+            self.converted = True
+            self.current_pct = self.converted_pct
+            self.parse_validation_stage(string)
+
+    def parse_validation_stage(self, string: str) -> None:
+        """
+        Advance the progress by four points per line of the validation stage and detect its outcome.
+        """
+        self.current_pct = min(self.current_pct + 4, 100)
+
+        if TOLERANCE_CHECK_FAILED in string:
+            self.error = True
+        elif MODEL_SAVED in string:
+            self.current_pct = 100
diff --git a/wb/main/console_tool_wrapper/model_downloader/console_output_parser.py b/wb/main/console_tool_wrapper/model_downloader/console_output_parser.py
@@ -22,7 +22,7 @@
 from wb.main.jobs.tools_runner.console_output_parser import ConsoleToolOutputParser, skip_empty_line_decorator
 
 
-class DownloadingFile():
+class DownloadingFile:
     def __init__(self, name: str, size: float):
         self.name = name
         self.size = size
diff --git a/wb/main/console_tool_wrapper/model_optimizer/console_output_parser.py b/wb/main/console_tool_wrapper/model_optimizer/console_output_parser.py
@@ -25,9 +25,15 @@ class ModelOptimizerParser(ConsoleToolOutputParser):
     def __init__(self, job_state_subject: ModelOptimizerJobStateSubject):
         super().__init__(job_state_subject=job_state_subject)
         self._progress_pattern = re.compile(r'.*Progress: \[.*]\s*(?P<progress>\d+(.\d+)?)%\sdone$')
+        # Growth (in percentage points) over the last reported value that intermediate
+        # progress has to exceed before it is reported again
+        self.update_every_pct = 5
 
     def parse(self, string: str):
+        """
+        Report the progress found in a line of console output.
+
+        Intermediate values are throttled: a value is reported only when it exceeds the last
+        reported one by more than `update_every_pct`. Reaching 100% is always reported once,
+        otherwise the final value would be dropped whenever the last reported one was >= 95.
+
+        :param string: a line printed by the tool; lines without a progress entry are ignored
+        """
         progress_match = self._progress_pattern.search(string)
         if progress_match:
+            if self._job_state_subject.job_progress is None:
+                self._job_state_subject.update_state(progress=0)
+
             percent = float(progress_match.group('progress'))
-            self._job_state_subject.update_state(progress=percent)
+            reported = self._job_state_subject.job_progress
+
+            is_significant = percent - reported > self.update_every_pct
+            is_completion = reported < 100 <= percent
+            if is_significant or is_completion:
+                self._job_state_subject.update_state(progress=percent)
diff --git a/wb/main/huggingface_api/huggingface_api.py b/wb/main/huggingface_api/huggingface_api.py
@@ -89,6 +89,12 @@ def json(self):
         }
 
 
+# Model types for which FeaturesManager lists at least one feature containing "with-past"
+contains_decoder = {
+    supported_type
+    for supported_type, features in FeaturesManager._SUPPORTED_MODEL_TYPE.items()
+    if any("with-past" in feature for feature in features)
+}
+
+
 def _validate_hf_model(model: ModelInfo) -> ValidationResult:
     if not model.config:
         return ValidationResult(disabled=True, message='Model has no config')
@@ -102,6 +108,11 @@ def _validate_hf_model(model: ModelInfo) -> ValidationResult:
             disabled=True,
             message=f'Sequence classification feature is not supported for model type {model_type}'
         )
+    if model_type in contains_decoder:
+        return ValidationResult(
+            disabled=True,
+            message=f'The model type {model_type} contains transformer decoder and is not supported by DL Workbench'
+        )
     return ValidationResult(disabled=False)
 
 
diff --git a/wb/main/jobs/models/import_huggingface_model_job.py b/wb/main/jobs/models/import_huggingface_model_job.py
@@ -18,8 +18,9 @@
 from pathlib import Path
 
 from sqlalchemy.orm import Session
+from transformers import AutoTokenizer, PretrainedConfig
 
-from wb.error.job_error import ModelOptimizerError
+from wb.error.job_error import TransformersONNXConversionError
 from wb.extensions_factories.database import get_db_session_for_celery
 from wb.main.console_tool_wrapper.huggingface_model_downloader.tool import (HuggingfaceModelDownloaderTool,
                                                                             HuggingfaceModelDownloaderParser)
@@ -52,6 +53,8 @@ def run(self):
 
             topology: TopologiesModel = import_job.model
 
+            self.pre_validation(import_job.huggingface_model_id)
+
             environment: EnvironmentModel = topology.environment
             python_executable = environment.python_executable
 
@@ -61,14 +64,27 @@ def run(self):
             onnx_model_path=Path(topology.path),
         )
 
-        parser = HuggingfaceModelDownloaderParser()
+        parser = HuggingfaceModelDownloaderParser(self._job_state_subject)
         runner = LocalRunner(tool, parser)
 
         return_code, message = runner.run_console_tool(self)
 
         if return_code:
             self._job_state_subject.update_state(status=StatusEnum.error, error_message='error')
-            raise ModelOptimizerError(message, self.job_id)
+            raise TransformersONNXConversionError(message, self.job_id)
 
         self._job_state_subject.update_state(progress=100, status=StatusEnum.ready)
         self._job_state_subject.detach_all_observers()
+
+    def pre_validation(self, huggingface_model_id: str) -> None:
+        """
+        Check that the tokenizer and model config can be initialized from the repository before loading a model
+
+        :param huggingface_model_id: id of the model repository to check
+        :raises TransformersONNXConversionError: if either of them cannot be initialized;
+            the job state is switched to error first and the original exception is chained as the cause
+        """
+        try:
+            AutoTokenizer.from_pretrained(huggingface_model_id)
+            PretrainedConfig.from_pretrained(huggingface_model_id)
+        except Exception as error:
+            self._job_state_subject.update_state(status=StatusEnum.error, error_message='error')
+            # Chain explicitly: the raised error carries only a generic text, so the cause
+            # is the only place where the real reason stays available
+            raise TransformersONNXConversionError(
+                'PreValidation Error', self.job_id
+            ) from error
diff --git a/wb/main/utils/tokenizer/tokeinzer_wrapper.py b/wb/main/utils/tokenizer/tokeinzer_wrapper.py
@@ -41,7 +41,7 @@ def __init__(self, tokenizer_folder: Path, tokenizer_type: Optional[TokenizerTyp
         )
 
     @classmethod
-    def from_model(cls, tokenizer_model: TokenizerModel) -> "TokenizerWrapper":
+    def from_model(cls, tokenizer_model: TokenizerModel) -> 'TokenizerWrapper':
         return cls(
             tokenizer_folder=Path(tokenizer_model.path),
             tokenizer_type=tokenizer_model.tokenizer_type,

Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ def __init__(self, tokenizer_folder: Path, tokenizer_type: Optional[TokenizerTyp`
`41`	`41`	`)`
`42`	`42`
`43`	`43`	`@classmethod`
`44`		`- def from_model(cls, tokenizer_model: TokenizerModel) -> "TokenizerWrapper":`
	`44`	`+ def from_model(cls, tokenizer_model: TokenizerModel) -> 'TokenizerWrapper':`
`45`	`45`	`return cls(`
`46`	`46`	`tokenizer_folder=Path(tokenizer_model.path),`
`47`	`47`	`tokenizer_type=tokenizer_model.tokenizer_type,`