update magika notebook to use new magika version (#2838)

eaidova · web-flow · commit b64001d0ada5 · 2025-03-24T18:47:39.000+04:00
diff --git a/notebooks/magika-content-type-recognition/magika-content-type-recognition.ipynb b/notebooks/magika-content-type-recognition/magika-content-type-recognition.ipynb
@@ -56,14 +56,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "91f4a127-0133-4daa-9021-f62dec73625b",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
    "source": [
-    "%pip install -q magika \"openvino>=2024.1.0\" \"gradio>=4.19\""
+    "%pip install -q \"magika>=0.6.1\" \"openvino>=2024.1.0\" \"gradio>=4.19\""
    ]
   },
   {
@@ -95,7 +106,10 @@
     "\n",
     "from magika import Magika\n",
     "from magika.types import ModelFeatures, ModelOutput, MagikaResult\n",
-    "from magika.prediction_mode import PredictionMode\n",
+    "try:\n",
+    "    from magika.types.prediction_mode import PredictionMode\n",
+    "except ImportError:\n",
+    "    from magika.prediction_mode import PredictionMode\n",
     "import numpy.typing as npt\n",
     "import numpy as np\n",
     "\n",
@@ -144,25 +158,21 @@
     "        Given a list of (path, features), return a (files_num, features_size)\n",
     "        matrix encoding the predictions.\n",
     "        \"\"\"\n",
-    "\n",
-    "        dataset_format = self._model_config[\"train_dataset_info\"][\"dataset_format\"]\n",
-    "        assert dataset_format == \"int-concat/one-hot\"\n",
     "        start_time = time.time()\n",
     "        X_bytes = []\n",
     "        for _, fs in features:\n",
     "            sample_bytes = []\n",
-    "            if self._input_sizes[\"beg\"] > 0:\n",
-    "                sample_bytes.extend(fs.beg[: self._input_sizes[\"beg\"]])\n",
-    "            if self._input_sizes[\"mid\"] > 0:\n",
-    "                sample_bytes.extend(fs.mid[: self._input_sizes[\"mid\"]])\n",
-    "            if self._input_sizes[\"end\"] > 0:\n",
-    "                sample_bytes.extend(fs.end[-self._input_sizes[\"end\"] :])\n",
+    "            if self._model_config.beg_size > 0:\n",
+    "                sample_bytes.extend(fs.beg[: self._model_config.beg_size])\n",
+    "            if self._model_config.mid_size > 0:\n",
+    "                sample_bytes.extend(fs.mid[: self._model_config.mid_size])\n",
+    "            if self._model_config.end_size > 0:\n",
+    "                sample_bytes.extend(fs.end[-self._model_config.end_size :])\n",
     "            X_bytes.append(sample_bytes)\n",
-    "        X = np.array(X_bytes).astype(np.float32)\n",
-    "        elapsed_time = time.time() - start_time\n",
-    "        self._log.debug(f\"DL input prepared in {elapsed_time:.03f} seconds\")\n",
+    "        X = np.array(X_bytes, dtype=np.int32)\n",
+    "        elapsed_time = 1000 * (time.time() - start_time)\n",
+    "        self._log.debug(f\"DL input prepared in {elapsed_time:.03f} ms\")\n",
     "\n",
-    "        start_time = time.time()\n",
     "        raw_predictions_list = []\n",
     "        samples_num = X.shape[0]\n",
     "\n",
@@ -172,7 +182,9 @@
     "            batches_num += 1\n",
     "\n",
     "        for batch_idx in range(batches_num):\n",
-    "            self._log.debug(f\"Getting raw predictions for (internal) batch {batch_idx+1}/{batches_num}\")\n",
+    "            self._log.debug(\n",
+    "                f\"Getting raw predictions for (internal) batch {batch_idx+1}/{batches_num}\"\n",
+    "            )\n",
     "            start_idx = batch_idx * max_internal_batch_size\n",
     "            end_idx = min((batch_idx + 1) * max_internal_batch_size, samples_num)\n",
     "            batch_raw_predictions = self._onnx_session({\"bytes\": X[start_idx:end_idx, :]})[\"target_label\"]\n",
@@ -218,12 +230,12 @@
     "            # the user.\n",
     "            results = []\n",
     "            for out in model_output:\n",
-    "                output_ct_label = self._get_output_ct_label_from_dl_result(out.ct_label, out.score)\n",
+    "                output_ct_label, _ = self._get_output_ct_label_from_dl_result(out.ct_label, out.score)\n",
     "\n",
     "                results.append(\n",
     "                    self._get_result_from_labels_and_score(\n",
     "                        path,\n",
-    "                        dl_ct_label=out.ct_label,\n",
+    "                        dl_ct_label=output_ct_label,\n",
     "                        output_ct_label=output_ct_label,\n",
     "                        score=out.score,\n",
     "                    )\n",
@@ -267,12 +279,12 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "52c226d6276b49afbb7f17b5d5d8c27a",
+       "model_id": "0e3cc43da6ef4fba99d8a432124324bc",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO')"
+       "Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')"
       ]
      },
      "execution_count": 3,
@@ -332,13 +344,21 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Content type: markdown - 99.29%\n"
+      "Content type: markdown - 82.43%\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ea/work/py311/lib/python3.11/site-packages/magika/types/content_type_info.py:22: DeprecationWarning: `.ct_label` is deprecated and will be removed in a future version. Use `.label` instead. Consult the documentation for more information.\n",
+      "  warnings.warn(\n"
      ]
     }
    ],
    "source": [
     "result = ov_magika.identify_bytes(b\"# Example\\nThis is an example of markdown!\")\n",
-    "print(f\"Content type: {result.output.ct_label} - {result.output.score * 100:.4}%\")"
+    "print(f\"Content type: {result.output.ct_label} - {result.score * 100:.4}%\")"
    ]
   },
   {
@@ -363,7 +383,15 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Content type: markdown - 100.0%\n"
+      "Content type: markdown - 99.97%\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ea/work/py311/lib/python3.11/site-packages/magika/types/content_type_info.py:22: DeprecationWarning: `.ct_label` is deprecated and will be removed in a future version. Use `.label` instead. Consult the documentation for more information.\n",
+      "  warnings.warn(\n"
      ]
     }
    ],
@@ -376,7 +404,7 @@
     "    with open(\"README.md\", \"w\") as f:\n",
     "        f.write(r.text)\n",
     "result = ov_magika.identify_path(input_file)\n",
-    "print(f\"Content type: {result.output.ct_label} - {result.output.score * 100:.4}%\")"
+    "print(f\"Content type: {result.output.ct_label} - {result.score * 100:.4}%\")"
    ]
   },
   {
@@ -412,7 +440,7 @@
     "    \"\"\"\n",
     "    results = ov_magika.identify_bytes_topk(file_path)\n",
     "\n",
-    "    return {result.dl.ct_label: float(result.output.score) for result in results}\n",
+    "    return {result.output.ct_label: float(result.score) for result in results}\n",
     "\n",
     "\n",
     "demo = gr.Interface(\n",
@@ -450,7 +478,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.11.4"
   },
   "openvino_notebooks": {
    "imageUrl": "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/b99cb2c0-d9cb-47a7-ba17-1b4b2eed01da",
@@ -469,36 +497,34 @@
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {
     "state": {
-     "52c226d6276b49afbb7f17b5d5d8c27a": {
+     "0e3cc43da6ef4fba99d8a432124324bc": {
       "model_module": "@jupyter-widgets/controls",
       "model_module_version": "2.0.0",
       "model_name": "DropdownModel",
       "state": {
        "_options_labels": [
         "CPU",
-        "GPU.0",
-        "GPU.1",
         "AUTO"
        ],
        "description": "Device:",
-       "index": 3,
-       "layout": "IPY_MODEL_5ee2fa796f56446ea39ee28ba8d3c174",
-       "style": "IPY_MODEL_9707408129d94f1e8f0c5218a99c5170"
+       "index": 1,
+       "layout": "IPY_MODEL_d515552eb19847b3a492898a29172af8",
+       "style": "IPY_MODEL_bdae71df78e8483d9cb09cab767fde4c"
       }
      },
-     "5ee2fa796f56446ea39ee28ba8d3c174": {
-      "model_module": "@jupyter-widgets/base",
-      "model_module_version": "2.0.0",
-      "model_name": "LayoutModel",
-      "state": {}
-     },
-     "9707408129d94f1e8f0c5218a99c5170": {
+     "bdae71df78e8483d9cb09cab767fde4c": {
       "model_module": "@jupyter-widgets/controls",
       "model_module_version": "2.0.0",
       "model_name": "DescriptionStyleModel",
       "state": {
        "description_width": ""
       }
+     },
+     "d515552eb19847b3a492898a29172af8": {
+      "model_module": "@jupyter-widgets/base",
+      "model_module_version": "2.0.0",
+      "model_name": "LayoutModel",
+      "state": {}
      }
     },
     "version_major": 2,