|
56 | 56 | },
|
57 | 57 | {
|
58 | 58 | "cell_type": "code",
|
59 |
| - "execution_count": null, |
| 59 | + "execution_count": 1, |
60 | 60 | "id": "91f4a127-0133-4daa-9021-f62dec73625b",
|
61 | 61 | "metadata": {
|
62 | 62 | "tags": []
|
63 | 63 | },
|
64 |
| - "outputs": [], |
| 64 | + "outputs": [ |
| 65 | + { |
| 66 | + "name": "stdout", |
| 67 | + "output_type": "stream", |
| 68 | + "text": [ |
| 69 | + "\n", |
| 70 | + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", |
| 71 | + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", |
| 72 | + "Note: you may need to restart the kernel to use updated packages.\n" |
| 73 | + ] |
| 74 | + } |
| 75 | + ], |
65 | 76 | "source": [
|
66 |
| - "%pip install -q magika \"openvino>=2024.1.0\" \"gradio>=4.19\"" |
| 77 | + "%pip install -q \"magika>=0.6.1\" \"openvino>=2024.1.0\" \"gradio>=4.19\"" |
67 | 78 | ]
|
68 | 79 | },
|
69 | 80 | {
|
|
95 | 106 | "\n",
|
96 | 107 | "from magika import Magika\n",
|
97 | 108 | "from magika.types import ModelFeatures, ModelOutput, MagikaResult\n",
|
98 |
| - "from magika.prediction_mode import PredictionMode\n", |
| 109 | + "try:\n", |
| 110 | + " from magika.types.prediction_mode import PredictionMode\n", |
| 111 | + "except ImportError:\n", |
| 112 | + " from magika.prediction_mode import PredictionMode\n", |
99 | 113 | "import numpy.typing as npt\n",
|
100 | 114 | "import numpy as np\n",
|
101 | 115 | "\n",
|
|
144 | 158 | " Given a list of (path, features), return a (files_num, features_size)\n",
|
145 | 159 | " matrix encoding the predictions.\n",
|
146 | 160 | " \"\"\"\n",
|
147 |
| - "\n", |
148 |
| - " dataset_format = self._model_config[\"train_dataset_info\"][\"dataset_format\"]\n", |
149 |
| - " assert dataset_format == \"int-concat/one-hot\"\n", |
150 | 161 | " start_time = time.time()\n",
|
151 | 162 | " X_bytes = []\n",
|
152 | 163 | " for _, fs in features:\n",
|
153 | 164 | " sample_bytes = []\n",
|
154 |
| - " if self._input_sizes[\"beg\"] > 0:\n", |
155 |
| - " sample_bytes.extend(fs.beg[: self._input_sizes[\"beg\"]])\n", |
156 |
| - " if self._input_sizes[\"mid\"] > 0:\n", |
157 |
| - " sample_bytes.extend(fs.mid[: self._input_sizes[\"mid\"]])\n", |
158 |
| - " if self._input_sizes[\"end\"] > 0:\n", |
159 |
| - " sample_bytes.extend(fs.end[-self._input_sizes[\"end\"] :])\n", |
| 165 | + " if self._model_config.beg_size > 0:\n", |
| 166 | + " sample_bytes.extend(fs.beg[: self._model_config.beg_size])\n", |
| 167 | + " if self._model_config.mid_size > 0:\n", |
| 168 | + " sample_bytes.extend(fs.mid[: self._model_config.mid_size])\n", |
| 169 | + " if self._model_config.end_size > 0:\n", |
| 170 | + " sample_bytes.extend(fs.end[-self._model_config.end_size :])\n", |
160 | 171 | " X_bytes.append(sample_bytes)\n",
|
161 |
| - " X = np.array(X_bytes).astype(np.float32)\n", |
162 |
| - " elapsed_time = time.time() - start_time\n", |
163 |
| - " self._log.debug(f\"DL input prepared in {elapsed_time:.03f} seconds\")\n", |
| 172 | + " X = np.array(X_bytes, dtype=np.int32)\n", |
| 173 | + " elapsed_time = 1000 * (time.time() - start_time)\n", |
| 174 | + " self._log.debug(f\"DL input prepared in {elapsed_time:.03f} ms\")\n", |
164 | 175 | "\n",
|
165 |
| - " start_time = time.time()\n", |
166 | 176 | " raw_predictions_list = []\n",
|
167 | 177 | " samples_num = X.shape[0]\n",
|
168 | 178 | "\n",
|
|
172 | 182 | " batches_num += 1\n",
|
173 | 183 | "\n",
|
174 | 184 | " for batch_idx in range(batches_num):\n",
|
175 |
| - " self._log.debug(f\"Getting raw predictions for (internal) batch {batch_idx+1}/{batches_num}\")\n", |
| 185 | + " self._log.debug(\n", |
| 186 | + " f\"Getting raw predictions for (internal) batch {batch_idx+1}/{batches_num}\"\n", |
| 187 | + " )\n", |
176 | 188 | " start_idx = batch_idx * max_internal_batch_size\n",
|
177 | 189 | " end_idx = min((batch_idx + 1) * max_internal_batch_size, samples_num)\n",
|
178 | 190 | " batch_raw_predictions = self._onnx_session({\"bytes\": X[start_idx:end_idx, :]})[\"target_label\"]\n",
|
|
218 | 230 | " # the user.\n",
|
219 | 231 | " results = []\n",
|
220 | 232 | " for out in model_output:\n",
|
221 |
| - " output_ct_label = self._get_output_ct_label_from_dl_result(out.ct_label, out.score)\n", |
| 233 | + " output_ct_label, _ = self._get_output_ct_label_from_dl_result(out.ct_label, out.score)\n", |
222 | 234 | "\n",
|
223 | 235 | " results.append(\n",
|
224 | 236 | " self._get_result_from_labels_and_score(\n",
|
225 | 237 | " path,\n",
|
226 |
| - " dl_ct_label=out.ct_label,\n", |
| 238 | + " dl_ct_label=output_ct_label,\n", |
227 | 239 | " output_ct_label=output_ct_label,\n",
|
228 | 240 | " score=out.score,\n",
|
229 | 241 | " )\n",
|
|
267 | 279 | {
|
268 | 280 | "data": {
|
269 | 281 | "application/vnd.jupyter.widget-view+json": {
|
270 |
| - "model_id": "52c226d6276b49afbb7f17b5d5d8c27a", |
| 282 | + "model_id": "0e3cc43da6ef4fba99d8a432124324bc", |
271 | 283 | "version_major": 2,
|
272 | 284 | "version_minor": 0
|
273 | 285 | },
|
274 | 286 | "text/plain": [
|
275 |
| - "Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO')" |
| 287 | + "Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')" |
276 | 288 | ]
|
277 | 289 | },
|
278 | 290 | "execution_count": 3,
|
|
332 | 344 | "name": "stdout",
|
333 | 345 | "output_type": "stream",
|
334 | 346 | "text": [
|
335 |
| - "Content type: markdown - 99.29%\n" |
| 347 | + "Content type: markdown - 82.43%\n" |
| 348 | + ] |
| 349 | + }, |
| 350 | + { |
| 351 | + "name": "stderr", |
| 352 | + "output_type": "stream", |
| 353 | + "text": [ |
| 354 | + "/home/ea/work/py311/lib/python3.11/site-packages/magika/types/content_type_info.py:22: DeprecationWarning: `.ct_label` is deprecated and will be removed in a future version. Use `.label` instead. Consult the documentation for more information.\n", |
| 355 | + " warnings.warn(\n" |
336 | 356 | ]
|
337 | 357 | }
|
338 | 358 | ],
|
339 | 359 | "source": [
|
340 | 360 | "result = ov_magika.identify_bytes(b\"# Example\\nThis is an example of markdown!\")\n",
|
341 |
| - "print(f\"Content type: {result.output.ct_label} - {result.output.score * 100:.4}%\")" |
| 361 | + "print(f\"Content type: {result.output.ct_label} - {result.score * 100:.4}%\")" |
342 | 362 | ]
|
343 | 363 | },
|
344 | 364 | {
|
|
363 | 383 | "name": "stdout",
|
364 | 384 | "output_type": "stream",
|
365 | 385 | "text": [
|
366 |
| - "Content type: markdown - 100.0%\n" |
| 386 | + "Content type: markdown - 99.97%\n" |
| 387 | + ] |
| 388 | + }, |
| 389 | + { |
| 390 | + "name": "stderr", |
| 391 | + "output_type": "stream", |
| 392 | + "text": [ |
| 393 | + "/home/ea/work/py311/lib/python3.11/site-packages/magika/types/content_type_info.py:22: DeprecationWarning: `.ct_label` is deprecated and will be removed in a future version. Use `.label` instead. Consult the documentation for more information.\n", |
| 394 | + " warnings.warn(\n" |
367 | 395 | ]
|
368 | 396 | }
|
369 | 397 | ],
|
|
376 | 404 | " with open(\"README.md\", \"w\") as f:\n",
|
377 | 405 | " f.write(r.text)\n",
|
378 | 406 | "result = ov_magika.identify_path(input_file)\n",
|
379 |
| - "print(f\"Content type: {result.output.ct_label} - {result.output.score * 100:.4}%\")" |
| 407 | + "print(f\"Content type: {result.output.ct_label} - {result.score * 100:.4}%\")" |
380 | 408 | ]
|
381 | 409 | },
|
382 | 410 | {
|
|
412 | 440 | " \"\"\"\n",
|
413 | 441 | " results = ov_magika.identify_bytes_topk(file_path)\n",
|
414 | 442 | "\n",
|
415 |
| - " return {result.dl.ct_label: float(result.output.score) for result in results}\n", |
| 443 | + " return {result.output.ct_label: float(result.score) for result in results}\n", |
416 | 444 | "\n",
|
417 | 445 | "\n",
|
418 | 446 | "demo = gr.Interface(\n",
|
|
450 | 478 | "name": "python",
|
451 | 479 | "nbconvert_exporter": "python",
|
452 | 480 | "pygments_lexer": "ipython3",
|
453 |
| - "version": "3.8.10" |
| 481 | + "version": "3.11.4" |
454 | 482 | },
|
455 | 483 | "openvino_notebooks": {
|
456 | 484 | "imageUrl": "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/b99cb2c0-d9cb-47a7-ba17-1b4b2eed01da",
|
|
469 | 497 | "widgets": {
|
470 | 498 | "application/vnd.jupyter.widget-state+json": {
|
471 | 499 | "state": {
|
472 |
| - "52c226d6276b49afbb7f17b5d5d8c27a": { |
| 500 | + "0e3cc43da6ef4fba99d8a432124324bc": { |
473 | 501 | "model_module": "@jupyter-widgets/controls",
|
474 | 502 | "model_module_version": "2.0.0",
|
475 | 503 | "model_name": "DropdownModel",
|
476 | 504 | "state": {
|
477 | 505 | "_options_labels": [
|
478 | 506 | "CPU",
|
479 |
| - "GPU.0", |
480 |
| - "GPU.1", |
481 | 507 | "AUTO"
|
482 | 508 | ],
|
483 | 509 | "description": "Device:",
|
484 |
| - "index": 3, |
485 |
| - "layout": "IPY_MODEL_5ee2fa796f56446ea39ee28ba8d3c174", |
486 |
| - "style": "IPY_MODEL_9707408129d94f1e8f0c5218a99c5170" |
| 510 | + "index": 1, |
| 511 | + "layout": "IPY_MODEL_d515552eb19847b3a492898a29172af8", |
| 512 | + "style": "IPY_MODEL_bdae71df78e8483d9cb09cab767fde4c" |
487 | 513 | }
|
488 | 514 | },
|
489 |
| - "5ee2fa796f56446ea39ee28ba8d3c174": { |
490 |
| - "model_module": "@jupyter-widgets/base", |
491 |
| - "model_module_version": "2.0.0", |
492 |
| - "model_name": "LayoutModel", |
493 |
| - "state": {} |
494 |
| - }, |
495 |
| - "9707408129d94f1e8f0c5218a99c5170": { |
| 515 | + "bdae71df78e8483d9cb09cab767fde4c": { |
496 | 516 | "model_module": "@jupyter-widgets/controls",
|
497 | 517 | "model_module_version": "2.0.0",
|
498 | 518 | "model_name": "DescriptionStyleModel",
|
499 | 519 | "state": {
|
500 | 520 | "description_width": ""
|
501 | 521 | }
|
| 522 | + }, |
| 523 | + "d515552eb19847b3a492898a29172af8": { |
| 524 | + "model_module": "@jupyter-widgets/base", |
| 525 | + "model_module_version": "2.0.0", |
| 526 | + "model_name": "LayoutModel", |
| 527 | + "state": {} |
502 | 528 | }
|
503 | 529 | },
|
504 | 530 | "version_major": 2,
|
|
0 commit comments