Skip to content

Commit b64001d

Browse files
authored
update magika notebook to use new magika version (#2838)
1 parent 9ede790 commit b64001d

File tree

1 file changed

+67
-41
lines changed

1 file changed

+67
-41
lines changed

notebooks/magika-content-type-recognition/magika-content-type-recognition.ipynb

+67-41
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,25 @@
5656
},
5757
{
5858
"cell_type": "code",
59-
"execution_count": null,
59+
"execution_count": 1,
6060
"id": "91f4a127-0133-4daa-9021-f62dec73625b",
6161
"metadata": {
6262
"tags": []
6363
},
64-
"outputs": [],
64+
"outputs": [
65+
{
66+
"name": "stdout",
67+
"output_type": "stream",
68+
"text": [
69+
"\n",
70+
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n",
71+
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
72+
"Note: you may need to restart the kernel to use updated packages.\n"
73+
]
74+
}
75+
],
6576
"source": [
66-
"%pip install -q magika \"openvino>=2024.1.0\" \"gradio>=4.19\""
77+
"%pip install -q \"magika>=0.6.1\" \"openvino>=2024.1.0\" \"gradio>=4.19\""
6778
]
6879
},
6980
{
@@ -95,7 +106,10 @@
95106
"\n",
96107
"from magika import Magika\n",
97108
"from magika.types import ModelFeatures, ModelOutput, MagikaResult\n",
98-
"from magika.prediction_mode import PredictionMode\n",
109+
"try:\n",
110+
" from magika.types.prediction_mode import PredictionMode\n",
111+
"except ImportError:\n",
112+
" from magika.prediction_mode import PredictionMode\n",
99113
"import numpy.typing as npt\n",
100114
"import numpy as np\n",
101115
"\n",
@@ -144,25 +158,21 @@
144158
" Given a list of (path, features), return a (files_num, features_size)\n",
145159
" matrix encoding the predictions.\n",
146160
" \"\"\"\n",
147-
"\n",
148-
" dataset_format = self._model_config[\"train_dataset_info\"][\"dataset_format\"]\n",
149-
" assert dataset_format == \"int-concat/one-hot\"\n",
150161
" start_time = time.time()\n",
151162
" X_bytes = []\n",
152163
" for _, fs in features:\n",
153164
" sample_bytes = []\n",
154-
" if self._input_sizes[\"beg\"] > 0:\n",
155-
" sample_bytes.extend(fs.beg[: self._input_sizes[\"beg\"]])\n",
156-
" if self._input_sizes[\"mid\"] > 0:\n",
157-
" sample_bytes.extend(fs.mid[: self._input_sizes[\"mid\"]])\n",
158-
" if self._input_sizes[\"end\"] > 0:\n",
159-
" sample_bytes.extend(fs.end[-self._input_sizes[\"end\"] :])\n",
165+
" if self._model_config.beg_size > 0:\n",
166+
" sample_bytes.extend(fs.beg[: self._model_config.beg_size])\n",
167+
" if self._model_config.mid_size > 0:\n",
168+
" sample_bytes.extend(fs.mid[: self._model_config.mid_size])\n",
169+
" if self._model_config.end_size > 0:\n",
170+
" sample_bytes.extend(fs.end[-self._model_config.end_size :])\n",
160171
" X_bytes.append(sample_bytes)\n",
161-
" X = np.array(X_bytes).astype(np.float32)\n",
162-
" elapsed_time = time.time() - start_time\n",
163-
" self._log.debug(f\"DL input prepared in {elapsed_time:.03f} seconds\")\n",
172+
" X = np.array(X_bytes, dtype=np.int32)\n",
173+
" elapsed_time = 1000 * (time.time() - start_time)\n",
174+
" self._log.debug(f\"DL input prepared in {elapsed_time:.03f} ms\")\n",
164175
"\n",
165-
" start_time = time.time()\n",
166176
" raw_predictions_list = []\n",
167177
" samples_num = X.shape[0]\n",
168178
"\n",
@@ -172,7 +182,9 @@
172182
" batches_num += 1\n",
173183
"\n",
174184
" for batch_idx in range(batches_num):\n",
175-
" self._log.debug(f\"Getting raw predictions for (internal) batch {batch_idx+1}/{batches_num}\")\n",
185+
" self._log.debug(\n",
186+
" f\"Getting raw predictions for (internal) batch {batch_idx+1}/{batches_num}\"\n",
187+
" )\n",
176188
" start_idx = batch_idx * max_internal_batch_size\n",
177189
" end_idx = min((batch_idx + 1) * max_internal_batch_size, samples_num)\n",
178190
" batch_raw_predictions = self._onnx_session({\"bytes\": X[start_idx:end_idx, :]})[\"target_label\"]\n",
@@ -218,12 +230,12 @@
218230
" # the user.\n",
219231
" results = []\n",
220232
" for out in model_output:\n",
221-
" output_ct_label = self._get_output_ct_label_from_dl_result(out.ct_label, out.score)\n",
233+
" output_ct_label, _ = self._get_output_ct_label_from_dl_result(out.ct_label, out.score)\n",
222234
"\n",
223235
" results.append(\n",
224236
" self._get_result_from_labels_and_score(\n",
225237
" path,\n",
226-
" dl_ct_label=out.ct_label,\n",
238+
" dl_ct_label=output_ct_label,\n",
227239
" output_ct_label=output_ct_label,\n",
228240
" score=out.score,\n",
229241
" )\n",
@@ -267,12 +279,12 @@
267279
{
268280
"data": {
269281
"application/vnd.jupyter.widget-view+json": {
270-
"model_id": "52c226d6276b49afbb7f17b5d5d8c27a",
282+
"model_id": "0e3cc43da6ef4fba99d8a432124324bc",
271283
"version_major": 2,
272284
"version_minor": 0
273285
},
274286
"text/plain": [
275-
"Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO')"
287+
"Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')"
276288
]
277289
},
278290
"execution_count": 3,
@@ -332,13 +344,21 @@
332344
"name": "stdout",
333345
"output_type": "stream",
334346
"text": [
335-
"Content type: markdown - 99.29%\n"
347+
"Content type: markdown - 82.43%\n"
348+
]
349+
},
350+
{
351+
"name": "stderr",
352+
"output_type": "stream",
353+
"text": [
354+
"/home/ea/work/py311/lib/python3.11/site-packages/magika/types/content_type_info.py:22: DeprecationWarning: `.ct_label` is deprecated and will be removed in a future version. Use `.label` instead. Consult the documentation for more information.\n",
355+
" warnings.warn(\n"
336356
]
337357
}
338358
],
339359
"source": [
340360
"result = ov_magika.identify_bytes(b\"# Example\\nThis is an example of markdown!\")\n",
341-
"print(f\"Content type: {result.output.ct_label} - {result.output.score * 100:.4}%\")"
361+
"print(f\"Content type: {result.output.ct_label} - {result.score * 100:.4}%\")"
342362
]
343363
},
344364
{
@@ -363,7 +383,15 @@
363383
"name": "stdout",
364384
"output_type": "stream",
365385
"text": [
366-
"Content type: markdown - 100.0%\n"
386+
"Content type: markdown - 99.97%\n"
387+
]
388+
},
389+
{
390+
"name": "stderr",
391+
"output_type": "stream",
392+
"text": [
393+
"/home/ea/work/py311/lib/python3.11/site-packages/magika/types/content_type_info.py:22: DeprecationWarning: `.ct_label` is deprecated and will be removed in a future version. Use `.label` instead. Consult the documentation for more information.\n",
394+
" warnings.warn(\n"
367395
]
368396
}
369397
],
@@ -376,7 +404,7 @@
376404
" with open(\"README.md\", \"w\") as f:\n",
377405
" f.write(r.text)\n",
378406
"result = ov_magika.identify_path(input_file)\n",
379-
"print(f\"Content type: {result.output.ct_label} - {result.output.score * 100:.4}%\")"
407+
"print(f\"Content type: {result.output.ct_label} - {result.score * 100:.4}%\")"
380408
]
381409
},
382410
{
@@ -412,7 +440,7 @@
412440
" \"\"\"\n",
413441
" results = ov_magika.identify_bytes_topk(file_path)\n",
414442
"\n",
415-
" return {result.dl.ct_label: float(result.output.score) for result in results}\n",
443+
" return {result.output.ct_label: float(result.score) for result in results}\n",
416444
"\n",
417445
"\n",
418446
"demo = gr.Interface(\n",
@@ -450,7 +478,7 @@
450478
"name": "python",
451479
"nbconvert_exporter": "python",
452480
"pygments_lexer": "ipython3",
453-
"version": "3.8.10"
481+
"version": "3.11.4"
454482
},
455483
"openvino_notebooks": {
456484
"imageUrl": "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/b99cb2c0-d9cb-47a7-ba17-1b4b2eed01da",
@@ -469,36 +497,34 @@
469497
"widgets": {
470498
"application/vnd.jupyter.widget-state+json": {
471499
"state": {
472-
"52c226d6276b49afbb7f17b5d5d8c27a": {
500+
"0e3cc43da6ef4fba99d8a432124324bc": {
473501
"model_module": "@jupyter-widgets/controls",
474502
"model_module_version": "2.0.0",
475503
"model_name": "DropdownModel",
476504
"state": {
477505
"_options_labels": [
478506
"CPU",
479-
"GPU.0",
480-
"GPU.1",
481507
"AUTO"
482508
],
483509
"description": "Device:",
484-
"index": 3,
485-
"layout": "IPY_MODEL_5ee2fa796f56446ea39ee28ba8d3c174",
486-
"style": "IPY_MODEL_9707408129d94f1e8f0c5218a99c5170"
510+
"index": 1,
511+
"layout": "IPY_MODEL_d515552eb19847b3a492898a29172af8",
512+
"style": "IPY_MODEL_bdae71df78e8483d9cb09cab767fde4c"
487513
}
488514
},
489-
"5ee2fa796f56446ea39ee28ba8d3c174": {
490-
"model_module": "@jupyter-widgets/base",
491-
"model_module_version": "2.0.0",
492-
"model_name": "LayoutModel",
493-
"state": {}
494-
},
495-
"9707408129d94f1e8f0c5218a99c5170": {
515+
"bdae71df78e8483d9cb09cab767fde4c": {
496516
"model_module": "@jupyter-widgets/controls",
497517
"model_module_version": "2.0.0",
498518
"model_name": "DescriptionStyleModel",
499519
"state": {
500520
"description_width": ""
501521
}
522+
},
523+
"d515552eb19847b3a492898a29172af8": {
524+
"model_module": "@jupyter-widgets/base",
525+
"model_module_version": "2.0.0",
526+
"model_name": "LayoutModel",
527+
"state": {}
502528
}
503529
},
504530
"version_major": 2,

0 commit comments

Comments
 (0)