|
76 | 76 | "outputs": [],
|
77 | 77 | "source": [
|
78 | 78 | "import sagemaker\n",
|
| 79 | + "\n", |
79 | 80 | "sagemaker.__version__"
|
80 | 81 | ]
|
81 | 82 | },
|
|
377 | 378 | "import pytest\n",
|
378 | 379 | "from sagemaker.pytorch import PyTorch\n",
|
379 | 380 | "from sagemaker import get_execution_role\n",
|
380 | | - "from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, rule_configs" |
| 381 | + "from sagemaker.debugger import (\n", |
| 382 | + " Rule,\n", |
| 383 | + " DebuggerHookConfig,\n", |
| 384 | + " TensorBoardOutputConfig,\n", |
| 385 | + " CollectionConfig,\n", |
| 386 | + " rule_configs,\n", |
| 387 | + ")" |
381 | 388 | ]
|
382 | 389 | },
|
383 | 390 | {
|
|
393 | 400 | "metadata": {},
|
394 | 401 | "outputs": [],
|
395 | 402 | "source": [
|
396 | | - "hyperparameters={\n", |
397 | | - "    \"epochs\": \"5\",\n", |
398 | | - "    \"batch-size\": \"32\",\n", |
399 | | - "    \"test-batch-size\": \"100\",\n", |
400 | | - "    \"lr\": \"0.001\"\n", |
401 | | - "}" |
| 403 | + "hyperparameters = {\"epochs\": \"5\", \"batch-size\": \"32\", \"test-batch-size\": \"100\", \"lr\": \"0.001\"}" |
402 | 404 | ]
|
403 | 405 | },
|
404 | 406 | {
|
|
422 | 424 | " Rule.sagemaker(rule_configs.vanishing_gradient()),\n",
|
423 | 425 | " Rule.sagemaker(rule_configs.overfit()),\n",
|
424 | 426 | " Rule.sagemaker(rule_configs.overtraining()),\n",
|
425 | | - "    Rule.sagemaker(rule_configs.poor_weight_initialization())\n", |
| 427 | + " Rule.sagemaker(rule_configs.poor_weight_initialization()),\n", |
426 | 428 | "]"
|
427 | 429 | ]
|
428 | 430 | },
|
|
459 | 461 | "outputs": [],
|
460 | 462 | "source": [
|
461 | 463 | "hook_config = DebuggerHookConfig(\n",
|
462 | | - "    hook_parameters={\n", |
463 | | - "        \"train.save_interval\": \"100\",\n", |
464 | | - "        \"eval.save_interval\": \"10\"\n", |
465 | | - "    }\n", |
| 464 | + " hook_parameters={\"train.save_interval\": \"100\", \"eval.save_interval\": \"10\"}\n", |
466 | 465 | ")"
|
467 | 466 | ]
|
468 | 467 | },
|
|
480 | 479 | "outputs": [],
|
481 | 480 | "source": [
|
482 | 481 | "estimator = PyTorch(\n",
|
483 | | - "    entry_point='scripts/pytorch_mnist.py',\n", |
484 | | - "    base_job_name='smdebugger-demo-mnist-pytorch',\n", |
| 482 | + " entry_point=\"scripts/pytorch_mnist.py\",\n", |
| 483 | + " base_job_name=\"smdebugger-demo-mnist-pytorch\",\n", |
485 | 484 | " role=get_execution_role(),\n",
|
486 | 485 | " instance_count=1,\n",
|
487 | | - "    instance_type='ml.p2.xlarge',\n", |
| 486 | + " instance_type=\"ml.p2.xlarge\",\n", |
488 | 487 | " volume_size=400,\n",
|
489 | 488 | " max_run=3600,\n",
|
490 | 489 | " hyperparameters=hyperparameters,\n",
|
491 | | - "    framework_version='1.8',\n", |
492 | | - "    py_version='py36',\n", |
493 | | - "    \n", |
| 490 | + " framework_version=\"1.8\",\n", |
| 491 | + " py_version=\"py36\",\n", |
494 | 492 | " ## Debugger parameters\n",
|
495 | | - "    rules = rules,\n", |
496 | | - "    debugger_hook_config=hook_config\n", |
| 493 | + " rules=rules,\n", |
| 494 | + " debugger_hook_config=hook_config,\n", |
497 | 495 | ")"
|
498 | 496 | ]
|
499 | 497 | },
|
|
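Note on the estimator hunk above: the change is purely cosmetic (black-style quoting, spacing, and trailing commas); no parameter values change. For context, a minimal sketch of how such an estimator is typically launched and where the Debugger artifacts end up, assuming the `hyperparameters`, `rules`, and `hook_config` objects from the cells above (the actual `fit` call is not part of this diff):

```python
# Sketch under the assumptions above -- not part of this commit.
# Launch the training job without blocking; a later cell in the notebook
# polls the job and rule-evaluation status instead.
estimator.fit(wait=False)

# S3 prefix where the Debugger hook writes tensors; this is the path the
# notebook later passes to smdebug's create_trial().
print(estimator.latest_job_debugger_artifacts_path())
```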
534 | 532 | "metadata": {},
|
535 | 533 | "outputs": [],
|
536 | 534 | "source": [
|
537 | | - "job_name=estimator.latest_training_job.name\n", |
| 535 | + "job_name = estimator.latest_training_job.name\n", |
538 | 536 | "client = estimator.sagemaker_session.sagemaker_client\n",
|
539 | 537 | "description = client.describe_training_job(TrainingJobName=estimator.latest_training_job.name)"
|
540 | 538 | ]
|
|
547 | 545 | "source": [
|
548 | 546 | "import time\n",
|
549 | 547 | "from IPython import display\n",
|
| 548 | + "\n", |
550 | 549 | "%matplotlib inline\n",
|
551 | 550 | "\n",
|
552 | 551 | "while description[\"SecondaryStatus\"] not in {\"Stopped\", \"Completed\"}:\n",
|
|
557 | 556 | " print(\"TrainingJobStatus: \", primary_status, \" | SecondaryStatus: \", secondary_status)\n",
|
558 | 557 | " print(\"====================================================================\")\n",
|
559 | 558 | " for r in range(len(estimator.latest_training_job.rule_job_summary())):\n",
|
560 | | - "        rule_summary=estimator.latest_training_job.rule_job_summary()\n", |
561 | | - "        print(rule_summary[r]['RuleConfigurationName'], \": \", rule_summary[r]['RuleEvaluationStatus'])\n", |
562 | | - "        if rule_summary[r]['RuleEvaluationStatus']=='IssuesFound':\n", |
563 | | - "            print(rule_summary[r]['StatusDetails'])\n", |
| 559 | + " rule_summary = estimator.latest_training_job.rule_job_summary()\n", |
| 560 | + " print(\n", |
| 561 | + " rule_summary[r][\"RuleConfigurationName\"], \": \", rule_summary[r][\"RuleEvaluationStatus\"]\n", |
| 562 | + " )\n", |
| 563 | + " if rule_summary[r][\"RuleEvaluationStatus\"] == \"IssuesFound\":\n", |
| 564 | + " print(rule_summary[r][\"StatusDetails\"])\n", |
564 | 565 | " print(\"====================================================================\")\n",
|
565 | 566 | " print(\"Current time: \", time.asctime())\n",
|
566 | 567 | " display.clear_output(wait=True)\n",
|
|
581 | 582 | "outputs": [],
|
582 | 583 | "source": [
|
583 | 584 | "def _get_rule_job_name(training_job_name, rule_configuration_name, rule_job_arn):\n",
|
584 | | - "    \"\"\"Helper function to get the rule job name with correct casing\"\"\"\n", |
585 | | - "    return \"{}-{}-{}\".format(\n", |
586 | | - "        training_job_name[:26], rule_configuration_name[:26], rule_job_arn[-8:]\n", |
587 | | - "    )\n", |
588 | | - "    \n", |
| 585 | + " \"\"\"Helper function to get the rule job name with correct casing\"\"\"\n", |
| 586 | + " return \"{}-{}-{}\".format(\n", |
| 587 | + " training_job_name[:26], rule_configuration_name[:26], rule_job_arn[-8:]\n", |
| 588 | + " )\n", |
| 589 | + "\n", |
| 590 | + "\n", |
589 | 591 | "def _get_cw_url_for_rule_job(rule_job_name, region):\n",
|
590 | | - "    return \"https://{}.console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/ProcessingJobs;prefix={};streamFilter=typeLogStreamPrefix\".format(region, region, rule_job_name)\n", |
| 592 | + " return \"https://{}.console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/ProcessingJobs;prefix={};streamFilter=typeLogStreamPrefix\".format(\n", |
| 593 | + " region, region, rule_job_name\n", |
| 594 | + " )\n", |
591 | 595 | "\n",
|
592 | 596 | "\n",
|
593 | 597 | "def get_rule_jobs_cw_urls(estimator):\n",
|
594 | 598 | " region = boto3.Session().region_name\n",
|
595 | 599 | " training_job = estimator.latest_training_job\n",
|
596 | 600 | " training_job_name = training_job.describe()[\"TrainingJobName\"]\n",
|
597 | 601 | " rule_eval_statuses = training_job.describe()[\"DebugRuleEvaluationStatuses\"]\n",
|
598 | | - "    \n", |
599 | | - "    result={}\n", |
| 602 | + "\n", |
| 603 | + " result = {}\n", |
600 | 604 | " for status in rule_eval_statuses:\n",
|
601 | 605 | " if status.get(\"RuleEvaluationJobArn\", None) is not None:\n",
|
602 | | - "            rule_job_name = _get_rule_job_name(training_job_name, status[\"RuleConfigurationName\"], status[\"RuleEvaluationJobArn\"])\n", |
603 | | - "            result[status[\"RuleConfigurationName\"]] = _get_cw_url_for_rule_job(rule_job_name, region)\n", |
| 606 | + " rule_job_name = _get_rule_job_name(\n", |
| 607 | + " training_job_name, status[\"RuleConfigurationName\"], status[\"RuleEvaluationJobArn\"]\n", |
| 608 | + " )\n", |
| 609 | + " result[status[\"RuleConfigurationName\"]] = _get_cw_url_for_rule_job(\n", |
| 610 | + " rule_job_name, region\n", |
| 611 | + " )\n", |
604 | 612 | " return result\n",
|
605 | 613 | "\n",
|
| 614 | + "\n", |
606 | 615 | "get_rule_jobs_cw_urls(estimator)"
|
607 | 616 | ]
|
608 | 617 | },
|
|
632 | 641 | "source": [
|
633 | 642 | "from smdebug.trials import create_trial\n",
|
634 | 643 | "from smdebug.core.modes import ModeKeys\n",
|
| 644 | + "\n", |
635 | 645 | "trial = create_trial(estimator.latest_job_debugger_artifacts_path())"
|
636 | 646 | ]
|
637 | 647 | },
|
|
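The `create_trial` cell above only gains a blank line after the imports. As a reference, a small sketch of how the resulting trial object can be inspected directly with the standard smdebug trial API (the tensor names it prints depend on what the hook and collections actually saved, so treat the selection below as illustrative):

```python
from smdebug.trials import create_trial
from smdebug.core.modes import ModeKeys

# Assumes the `estimator` from the earlier cells has produced Debugger output.
trial = create_trial(estimator.latest_job_debugger_artifacts_path())

# Names of every tensor the Debugger hook saved.
print(trial.tensor_names())

# Walk one tensor's saved values over the training steps.
name = trial.tensor_names()[0]  # illustrative choice; pick any saved tensor
for step in trial.tensor(name).steps(mode=ModeKeys.TRAIN):
    print(step, trial.tensor(name).value(step, mode=ModeKeys.TRAIN))
```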
762 | 772 | "import matplotlib.pyplot as plt\n",
|
763 | 773 | "from mpl_toolkits.axes_grid1 import host_subplot\n",
|
764 | 774 | "\n",
|
| 775 | + "\n", |
765 | 776 | "def plot_tensor(trial, tensor_name):\n",
|
766 | 777 | "\n",
|
767 | 778 | " steps_train, vals_train = get_data(trial, tensor_name, mode=ModeKeys.TRAIN)\n",
|
768 | 779 | " print(\"loaded TRAIN data\")\n",
|
769 | 780 | " steps_eval, vals_eval = get_data(trial, tensor_name, mode=ModeKeys.EVAL)\n",
|
770 | 781 | " print(\"loaded EVAL data\")\n",
|
771 | 782 | "\n",
|
772 | | - "    fig = plt.figure(figsize=(10,7))\n", |
| 783 | + " fig = plt.figure(figsize=(10, 7))\n", |
773 | 784 | " host = host_subplot(111)\n",
|
774 | 785 | "\n",
|
775 | 786 | " par = host.twiny()\n",
|
776 | 787 | "\n",
|
777 | 788 | " host.set_xlabel(\"Steps (TRAIN)\")\n",
|
778 | 789 | " par.set_xlabel(\"Steps (EVAL)\")\n",
|
779 | 790 | " host.set_ylabel(tensor_name)\n",
|
780 | | - "    \n", |
781 | | - "    p1, = host.plot(steps_train, vals_train, label=tensor_name)\n", |
| 791 | + "\n", |
| 792 | + " (p1,) = host.plot(steps_train, vals_train, label=tensor_name)\n", |
782 | 793 | " print(\"completed TRAIN plot\")\n",
|
783 | | - "    p2, = par.plot(steps_eval, vals_eval, label=\"val_\"+tensor_name)\n", |
| 794 | + " (p2,) = par.plot(steps_eval, vals_eval, label=\"val_\" + tensor_name)\n", |
784 | 795 | " print(\"completed EVAL plot\")\n",
|
785 | 796 | " leg = plt.legend()\n",
|
786 | 797 | "\n",
|
|
791 | 802 | " leg.texts[1].set_color(p2.get_color())\n",
|
792 | 803 | "\n",
|
793 | 804 | " plt.ylabel(tensor_name)\n",
|
794 | | - "    \n", |
| 805 | + "\n", |
795 | 806 | " plt.show()"
|
796 | 807 | ]
|
797 | 808 | },
|
|
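`plot_tensor` above relies on a `get_data(trial, tensor_name, mode)` helper whose definition sits outside this hunk. Purely as an assumption about its shape (the notebook's own version may differ), such a helper can be built from the smdebug trial API:

```python
# Hypothetical sketch of the get_data helper used by plot_tensor above;
# the notebook's actual implementation is not shown in this diff.
def get_data(trial, tensor_name, mode):
    tensor = trial.tensor(tensor_name)
    steps = tensor.steps(mode=mode)
    values = [tensor.value(s, mode=mode) for s in steps]
    return steps, values

# Illustrative usage -- the tensor name depends on what the hook saved:
# plot_tensor(trial, "CrossEntropyLoss_output_0")
```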
15516 | 15527 | ],
|
15517 | 15528 | "source": [
|
15518 | 15529 | "import IPython\n",
|
15519 | | - "IPython.display.HTML(filename=profiler_report_name+\"/profiler-output/profiler-report.html\")" |
| 15530 | + "\n", |
| 15531 | + "IPython.display.HTML(filename=profiler_report_name + \"/profiler-output/profiler-report.html\")" |
15520 | 15532 | ]
|
15521 | 15533 | },
|
15522 | 15534 | {
|
|