Gaudi OpenShift notebook container added #364

Closed
315 changes: 315 additions & 0 deletions enterprise/redhat/openshift-ai/gaudi/demo/oneapi-sample.ipynb
@@ -0,0 +1,315 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1e973d1b-c6d0-48a5-a774-0f114101e81e",
"metadata": {},
"source": [
"# Getting started with PyTorch on Intel® Gaudi.\n",
"\n",
"This notebook is to help you get started quickly using the Intel® Gaudi accelerator in this container. A simple MNIST model is trained on the Gaudi acclerator. You can tune some of the parameters below to change configuration of the training. For more information and reference please refer to the official documentation of [Intel® Gaudi acclerator](https://docs.habana.ai/en/latest/index.html)."
]
},
{
"cell_type": "markdown",
"id": "7eaacf55-bea2-43be-bb48-163848db1a30",
"metadata": {
"tags": []
},
"source": [
"### Setup modes for training\n",
"\n",
"1. lazy_mode: Set to True(False) to enable(disable) lazy mode.\n",
"2. enable_amp: Set to True(False) to enable Automatic Mixed Precision.\n",
"3. epochs: Number of epochs for training\n",
"4. lr: Learning rate for training\n",
"5. batch_size: Number of samples in a batch\n",
"6. milestones: Milestone epochs for the stepLR scheduler."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5e7cf831-6fe6-46ed-a6fd-f2651cc226af",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"lazy_mode = False\n",
"enable_amp = False\n",
"epochs = 20\n",
"batch_size = 128\n",
"lr = 0.01\n",
"milestones = [10,15]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cee8ad90-c52d-4a50-876f-ce0762cb1b62",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"os.environ['HABANA_LOGS']='/opt/app-root/logs'\n",
"if lazy_mode:\n",
" os.environ['PT_HPU_LAZY_MODE'] = '1'\n",
"else:\n",
" os.environ['PT_HPU_LAZY_MODE'] = '0'"
]
},
{
"cell_type": "markdown",
"id": "6eac33d0-2e64-4233-8b3f-40bb7217fef8",
"metadata": {
"tags": []
},
"source": [
"### Import packages"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "06ad44ff-9744-4d6f-af90-375e64717b59",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import torch.nn.functional as F\n",
"import torchvision\n",
"import torchvision.transforms as transforms\n",
"import os\n",
"\n",
"# Import Habana Torch Library\n",
"import habana_frameworks.torch.core as htcore"
]
},
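{
"cell_type": "markdown",
"id": "hpu-check-note",
"metadata": {},
"source": [
"### Check that the Gaudi device is visible\n",
"\n",
"The next cell is a small optional sanity check. It assumes the `habana_frameworks.torch.hpu` utility module is available in this container and simply reports whether PyTorch can see an HPU device before training starts."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "hpu-check",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (assumes habana_frameworks.torch.hpu is present):\n",
"# confirm that an HPU device is visible to PyTorch before training\n",
"import habana_frameworks.torch.hpu as hthpu\n",
"\n",
"print(\"HPU available:\", hthpu.is_available())\n",
"print(\"HPU device count:\", hthpu.device_count())"
]
},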
{
"cell_type": "markdown",
"id": "062de7f3-4561-4af3-a9ed-2c4cfc918f2f",
"metadata": {},
"source": [
"### Define Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9df57abb-0b63-4e1c-9d9b-87e74964300e",
"metadata": {},
"outputs": [],
"source": [
"class SimpleModel(nn.Module):\n",
" def __init__(self):\n",
" super(SimpleModel, self).__init__()\n",
"\n",
" self.fc1 = nn.Linear(784, 256)\n",
" self.fc2 = nn.Linear(256, 64)\n",
" self.fc3 = nn.Linear(64, 10)\n",
"\n",
" def forward(self, x):\n",
"\n",
" out = x.view(-1,28*28)\n",
" out = F.relu(self.fc1(out))\n",
" out = F.relu(self.fc2(out))\n",
" out = self.fc3(out)\n",
"\n",
" return out"
]
},
{
"cell_type": "markdown",
"id": "d899885b-5b4d-4557-a90c-9d507875c2ee",
"metadata": {},
"source": [
"### Define training routine"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b17e9aa-fa11-4870-a7d4-183b803177ab",
"metadata": {},
"outputs": [],
"source": [
"def train(net,criterion,optimizer,trainloader,device):\n",
"\n",
" net.train()\n",
" if not lazy_mode:\n",
" net = torch.compile(net,backend=\"hpu_backend\")\n",
" train_loss = 0.0\n",
" correct = 0\n",
" total = 0\n",
"\n",
" for batch_idx, (data, targets) in enumerate(trainloader):\n",
"\n",
" data, targets = data.to(device), targets.to(device)\n",
"\n",
" optimizer.zero_grad()\n",
" if enable_amp:\n",
" with torch.autocast(device_type=\"hpu\", dtype=torch.bfloat16):\n",
" outputs = net(data)\n",
" loss = criterion(outputs, targets)\n",
" else:\n",
" outputs = net(data)\n",
" loss = criterion(outputs, targets)\n",
"\n",
" loss.backward()\n",
" \n",
" # API call to trigger execution\n",
" if lazy_mode:\n",
" htcore.mark_step()\n",
" \n",
" optimizer.step()\n",
"\n",
" # API call to trigger execution\n",
" if lazy_mode:\n",
" htcore.mark_step()\n",
"\n",
" train_loss += loss.item()\n",
" _, predicted = outputs.max(1)\n",
" total += targets.size(0)\n",
" correct += predicted.eq(targets).sum().item()\n",
"\n",
" train_loss = train_loss/(batch_idx+1)\n",
" train_acc = 100.0*(correct/total)\n",
" print(\"Training loss is {} and training accuracy is {}\".format(train_loss,train_acc))"
]
},
{
"cell_type": "markdown",
"id": "b7a22d69-a91f-48e1-8fac-e1cfe68590b7",
"metadata": {},
"source": [
"### Define testing routine"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9aa379b-b376-4623-9b5c-f778c3d90ce7",
"metadata": {},
"outputs": [],
"source": [
"def test(net,criterion,testloader,device):\n",
"\n",
" net.eval()\n",
" test_loss = 0\n",
" correct = 0\n",
" total = 0\n",
"\n",
" with torch.no_grad():\n",
"\n",
" for batch_idx, (data, targets) in enumerate(testloader):\n",
"\n",
" data, targets = data.to(device), targets.to(device)\n",
" \n",
" if enable_amp:\n",
" with torch.autocast(device_type=\"hpu\", dtype=torch.bfloat16):\n",
" outputs = net(data)\n",
" loss = criterion(outputs, targets)\n",
" else:\n",
" outputs = net(data)\n",
" loss = criterion(outputs, targets)\n",
"\n",
"\n",
" # API call to trigger execution\n",
" if lazy_mode:\n",
" htcore.mark_step()\n",
"\n",
" test_loss += loss.item()\n",
" _, predicted = outputs.max(1)\n",
" total += targets.size(0)\n",
" correct += predicted.eq(targets).sum().item()\n",
"\n",
" test_loss = test_loss/(batch_idx+1)\n",
" test_acc = 100.0*(correct/total)\n",
" print(\"Testing loss is {} and testing accuracy is {}\".format(test_loss,test_acc))"
]
},
{
"cell_type": "markdown",
"id": "22e76af9-e355-4299-b84d-f34c9a25e76d",
"metadata": {},
"source": [
"### Run the main routine to train and test the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c8ddfb1-d4f7-44b2-aff0-f86f1db8c971",
"metadata": {},
"outputs": [],
"source": [
"load_path = './data'\n",
"save_path = './checkpoints'\n",
"\n",
"if(not os.path.exists(save_path)):\n",
" os.makedirs(save_path)\n",
"\n",
"# Target the Gaudi HPU device\n",
"device = torch.device(\"hpu\")\n",
"\n",
"# Data\n",
"transform = transforms.Compose([\n",
" transforms.ToTensor(),\n",
"])\n",
"\n",
"trainset = torchvision.datasets.MNIST(root=load_path, train=True,\n",
" download=True, transform=transform)\n",
"trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,\n",
" shuffle=True, num_workers=2)\n",
"testset = torchvision.datasets.MNIST(root=load_path, train=False,\n",
" download=True, transform=transform)\n",
"testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,\n",
" shuffle=False, num_workers=2)\n",
"\n",
"net = SimpleModel()\n",
"net.to(device)\n",
"\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = optim.SGD(net.parameters(), lr=lr,\n",
" momentum=0.9, weight_decay=5e-4)\n",
"scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.1)\n",
"\n",
"for epoch in range(1, epochs+1):\n",
" print(\"=====================================================================\")\n",
" print(\"Epoch : {}\".format(epoch))\n",
" train(net,criterion,optimizer,trainloader,device)\n",
" test(net,criterion,testloader,device)\n",
"\n",
" torch.save(net.state_dict(), os.path.join(save_path,'epoch_{}.pth'.format(epoch)))\n",
"\n",
" scheduler.step()"
]
}
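,
{
"cell_type": "markdown",
"id": "inference-note",
"metadata": {},
"source": [
"### Optional: reload a checkpoint and run inference\n",
"\n",
"The cell below is a small optional sketch, not part of the training flow above: it reloads the checkpoint saved after the final epoch from `save_path` and classifies one batch from the test set on the HPU. It reuses `SimpleModel`, `testloader`, `device`, `save_path`, and `epochs` defined earlier; the names `ckpt_file` and `inference_net` are introduced here only for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "inference-demo",
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: reload the weights saved after the final epoch\n",
"ckpt_file = os.path.join(save_path, 'epoch_{}.pth'.format(epochs))\n",
"\n",
"inference_net = SimpleModel()\n",
"inference_net.load_state_dict(torch.load(ckpt_file, map_location='cpu'))\n",
"inference_net.to(device)\n",
"inference_net.eval()\n",
"\n",
"# Classify a single batch from the test loader on the HPU\n",
"data, targets = next(iter(testloader))\n",
"data = data.to(device)\n",
"\n",
"with torch.no_grad():\n",
"    # Copying results back to the CPU also triggers execution in lazy mode\n",
"    preds = inference_net(data).argmax(dim=1).cpu()\n",
"\n",
"print(\"Predicted:\", preds[:10].tolist())\n",
"print(\"Actual:   \", targets[:10].tolist())"
]
}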
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}