diff --git a/machine-learning-das/notebooks/3.3-dense-keras-JetTagging.ipynb b/machine-learning-das/notebooks/3.3-dense-keras-JetTagging.ipynb
index 1ddb9f5..86ecd41 100644
--- a/machine-learning-das/notebooks/3.3-dense-keras-JetTagging.ipynb
+++ b/machine-learning-das/notebooks/3.3-dense-keras-JetTagging.ipynb
@@ -1,434 +1,448 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Rf258xT0XIwV"
- },
- "source": [
- "# Training a Jet Tagging with **DNN** \n",
- "\n",
- "---\n",
- "In this notebook, we perform a Jet identification task using a multiclass classifier based on a \n",
- "Dense Neural Network (DNN), also called multi-layer perceptron (MLP). The problem consists on identifying a given jet as a quark, a gluon, a W, a Z, or a top,\n",
- "based on set of physics-motivated high-level features.\n",
- "\n",
- "For details on the physics problem, see https://arxiv.org/pdf/1804.06913.pdf \n",
- "\n",
- "For details on the dataset, see Notebook1\n",
- "\n",
- "---"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {
- "id": "4OMAZgtyXIwY"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "import h5py\n",
- "import glob\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "%matplotlib inline"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "2lbB-J3hXIwb"
- },
- "source": [
- "# Preparation of the training and validation samples\n",
- "\n",
- "---\n",
- "In order to import the dataset, we now\n",
- "- clone the dataset repository (to import the data in Colab)\n",
- "- load the h5 files in the data/ repository\n",
- "- extract the data we need: a target and jet features\n",
- "\n",
- "To type shell commands, we start the command line with !\n",
- "\n",
- "**nb, if you are running locally and you have already downloaded the datasets you can skip the cell below and, if needed, change the paths later to point to the folder with your previous download of the datasets.**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "jWjxFaRPXIwb"
- },
- "outputs": [],
- "source": [
- "! curl https://cernbox.cern.ch/s/6Ec5pGFEpFWeH6S/download -o Data-MLtutorial.tar.gz\n",
- "! tar -xvzf Data-MLtutorial.tar.gz \n",
- "! ls Data-MLtutorial/JetDataset/\n",
- "! rm Data-MLtutorial.tar.gz "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Dataset Exploration"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# let's open the file\n",
- "data_dir = 'Data-MLtutorial/JetDataset/'\n",
- "fileIN = data_dir+'jetImage_7_100p_30000_40000.h5'\n",
- "f = h5py.File(fileIN)\n",
- "# and see what it contains\n",
- "print(list(f.keys()))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "* 'jetImage' ,' jetImageECAL' and 'jetImageHCAL' contains the image representation of the jets . We will not use them today but build our point cloud from the other information.\n",
- "* 'jetConstituentList' is the list of particles cointained in the jet. For each particle, a list of relevant quantities is stored. This is the dataset we will consider in this notebook.\n",
- "* 'particleFeatureNames' is the list of the names corresponding to the quantities contained in 'jetConstituentList'\n",
- "* 'jets' is the list of jets with the high-level jet features stored. We will only use jet ID from it, indecies [-6:-1]\n",
- "* 'jetFeatureNames' is the list of the names corresponding to the quantities contained in 'jets'. These quantities are build using physics knowledge and correspond to high-level infromation and features per graph (as opposed to per node)\n",
- "\n",
- "The first 100 highest transverse momentum $p_T$ particles are considered for each jet.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {
- "id": "cCGhrKdwXIwc"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_30000_40000.h5\n",
- "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_60000_70000.h5\n",
- "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_50000_60000.h5\n",
- "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_10000_20000.h5\n",
- "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_0_10000.h5\n",
- "(50000, 5) (50000, 49)\n",
- "[b'j_tau1_b1', b'j_tau2_b1', b'j_tau3_b1', b'j_tau1_b2', b'j_tau2_b2', b'j_tau3_b2', b'j_tau32_b1', b'j_tau32_b2', b'j_zlogz', b'j_c1_b0', b'j_c1_b1', b'j_c1_b2', b'j_c2_b1', b'j_c2_b2', b'j_d2_b1', b'j_d2_b2', b'j_d2_a1_b1', b'j_d2_a1_b2', b'j_m2_b1', b'j_m2_b2', b'j_n2_b1', b'j_n2_b2', b'j_tau1_b1_mmdt', b'j_tau2_b1_mmdt', b'j_tau3_b1_mmdt', b'j_tau1_b2_mmdt', b'j_tau2_b2_mmdt', b'j_tau3_b2_mmdt', b'j_tau32_b1_mmdt', b'j_tau32_b2_mmdt', b'j_c1_b0_mmdt', b'j_c1_b1_mmdt', b'j_c1_b2_mmdt', b'j_c2_b1_mmdt', b'j_c2_b2_mmdt', b'j_d2_b1_mmdt', b'j_d2_b2_mmdt', b'j_d2_a1_b1_mmdt', b'j_d2_a1_b2_mmdt', b'j_m2_b1_mmdt', b'j_m2_b2_mmdt', b'j_n2_b1_mmdt', b'j_n2_b2_mmdt', b'j_mass_trim', b'j_mass_mmdt', b'j_mass_prun', b'j_mass_sdb2', b'j_mass_sdm1', b'j_multiplicity']\n"
- ]
- }
- ],
- "source": [
- "target = np.array([])\n",
- "AllFeatures = np.array([])\n",
- "\n",
- "# we cannot load all data on Colab. So we just take a few files\n",
- "datafiles = ['Data-MLtutorial/JetDataset/jetImage_7_100p_30000_40000.h5',\n",
- " 'Data-MLtutorial/JetDataset/jetImage_7_100p_60000_70000.h5',\n",
- " 'Data-MLtutorial/JetDataset/jetImage_7_100p_50000_60000.h5',\n",
- " 'Data-MLtutorial/JetDataset/jetImage_7_100p_10000_20000.h5',\n",
- " 'Data-MLtutorial/JetDataset/jetImage_7_100p_0_10000.h5']\n",
- "# if you are running locallt, you can use the full dataset doing\n",
- "# for fileIN in glob.glob(\"tutorials/HiggsSchool/data/*h5\"):\n",
- "for fileIN in datafiles:\n",
- " print(\"Appending %s\" %fileIN)\n",
- " f = h5py.File(fileIN)\n",
- " myAllFeatures = np.array(f.get(\"jets\")[:,4:-6])\n",
- " mytarget = np.array(f.get('jets')[0:,-6:-1])\n",
- " AllFeatures = np.concatenate([AllFeatures, myAllFeatures], axis=0) if AllFeatures.size else myAllFeatures\n",
- " target = np.concatenate([target, mytarget], axis=0) if target.size else mytarget\n",
- " f.close()\n",
- "print(target.shape, AllFeatures.shape)\n",
- "\n",
- "# let's see what we have\n",
- "f = h5py.File(datafiles[-1])\n",
- "features_list = list(f.get(\"jetFeatureNames\"))[4:-6]\n",
- "f.close()\n",
- "print(features_list)\n",
- "labels = ['gluon', 'quark', 'W', 'Z', 'top']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#data exploration\n",
- "fig = plt.figure(figsize=(10, 10))\n",
- "for i, label in enumerate(labels):\n",
- " plt.hist(AllFeatures[target[:,i]==1][:,30], bins=50, range=(0,0.5), histtype='step', label=label)\n",
- "plt.legend()\n",
- "plt.xlabel(features_list[30])\n",
- "plt.ylabel('Number of jets')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#feature selection\n",
- "features = AllFeatures[:,[8, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 48]]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "6a333RYPXIwe"
- },
- "source": [
- "The dataset consists of 50000 jets, each represented by 16 features\n",
- "\n",
- "---\n",
- "\n",
- "We now shuffle the data, splitting them into a training and a validation dataset with 2:1 ratio"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "ZBqFs1eBXIwf"
- },
- "outputs": [],
- "source": [
- "from sklearn.model_selection import train_test_split\n",
- "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)\n",
- "print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)\n",
- "del features, target"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "GkNz5UAhXIwg"
- },
- "source": [
- "# DNN model building"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "tTSDOiEHXIwh"
- },
- "outputs": [],
- "source": [
- "# keras imports\n",
- "from tensorflow.keras.models import Model\n",
- "from tensorflow.keras.layers import Dense, Input, Dropout, Flatten, Activation\n",
- "from tensorflow.keras.utils import plot_model\n",
- "from tensorflow.keras import backend as K\n",
- "from tensorflow.keras import metrics\n",
- "from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "rAl0DZTxXIwi"
- },
- "outputs": [],
- "source": [
- "input_shape = X_train.shape[1]\n",
- "dropoutRate = 0.1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "2l492G8BXIwj"
- },
- "outputs": [],
- "source": [
- "####\n",
- "inputArray = Input(shape=(input_shape,))\n",
- "#\n",
- "x = Dense(40, activation='relu')(inputArray)\n",
- "x = Dropout(dropoutRate)(x)\n",
- "#\n",
- "x = Dense(20)(x)\n",
- "x = Activation('relu')(x)\n",
- "x = Dropout(dropoutRate)(x)\n",
- "#\n",
- "x = Dense(10, activation='relu')(x)\n",
- "x = Dropout(dropoutRate)(x)\n",
- "#\n",
- "x = Dense(5, activation='relu')(x)\n",
- "#\n",
- "output = Dense(5, activation='softmax')(x)\n",
- "####\n",
- "model = Model(inputs=inputArray, outputs=output)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "xu8rRUkhXIwj"
- },
- "outputs": [],
- "source": [
- "model.compile(loss='categorical_crossentropy', optimizer='adam')\n",
- "model.summary()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "2HfKWoOtXIwk"
- },
- "source": [
- "We now train the model with these settings:\n",
- "\n",
- "- the **batch size** is a hyperparameter of gradient descent that controls the number of training samples to work through before the model internal parameters are updated\n",
- " - batch size = 1 results in fast computation but noisy training that is slow to converge\n",
- " - batch size = dataset size results in slow computation but faster convergence)\n",
- "\n",
- "- the **number of epochs** controls the number of complete passes through the full training dataset -- at each epoch gradients are computed for each of the mini batches and model internal parameters are updated.\n",
- "\n",
- "- the **callbacks** are algorithms used to optimize the training (full list [here](https://keras.io/api/callbacks/)):\n",
- " - *EarlyStopping*: stop training when a monitored metric (`monitor`) has stopped improving in the last N epochs (`patience`)\n",
- " - *ReduceLROnPlateau*: reduce learning rate when a metric (`monitor`) has stopped improving in the last N epochs (`patience`)\n",
- " - *TerminateOnNaN*: terminates training when a NaN loss is encountered"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "KzO-lyLEXIwk"
- },
- "outputs": [],
- "source": [
- "batch_size = 1024\n",
- "n_epochs = 50\n",
- "\n",
- "# train \n",
- "history = model.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, verbose = 2,\n",
- " validation_split=0.2,\n",
- " # callbacks = [\n",
- " # EarlyStopping(monitor='val_loss', patience=10, verbose=1),\n",
- " # ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1),\n",
- " # TerminateOnNaN()]\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "044bCLqVXIwl"
- },
- "outputs": [],
- "source": [
- "# plot training history\n",
- "plt.plot(history.history['loss'])\n",
- "plt.plot(history.history['val_loss'])\n",
- "plt.yscale('log')\n",
- "plt.title('Training History')\n",
- "plt.ylabel('loss')\n",
- "plt.xlabel('epoch')\n",
- "plt.legend(['training', 'validation'], loc='upper right')\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "oESSmNLxXIwm"
- },
- "source": [
- "# Building the ROC Curves"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "gjKT7EjUXIwn"
- },
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "from sklearn.metrics import roc_curve, auc\n",
- "predict_test = model.predict(X_test)\n",
- "df = pd.DataFrame()\n",
- "fpr = {}\n",
- "tpr = {}\n",
- "auc1 = {}\n",
- "\n",
- "plt.figure()\n",
- "for i, label in enumerate(labels):\n",
- " df[label] = y_test[:,i]\n",
- " df[label + '_pred'] = predict_test[:,i]\n",
- "\n",
- " fpr[label], tpr[label], threshold = roc_curve(df[label],df[label+'_pred'])\n",
- "\n",
- " auc1[label] = auc(fpr[label], tpr[label])\n",
- "\n",
- " plt.plot(tpr[label],fpr[label],label='%s tagger, auc = %.1f%%'%(label,auc1[label]*100.))\n",
- "plt.semilogy()\n",
- "plt.xlabel(\"sig. efficiency\")\n",
- "plt.ylabel(\"bkg. mistag rate\")\n",
- "plt.ylim(0.000001,1)\n",
- "plt.grid(True)\n",
- "plt.legend(loc='lower right')\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## How do you build a QCD vs Top / W / Z ROC curve?"
- ]
- },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Rf258xT0XIwV"
+ },
+ "source": [
+ "# Training a Jet Tagger with a **DNN** \n",
+ "\n",
+ "---\n",
+ "In this notebook, we perform a Jet identification task using a multiclass classifier based on a \n",
+ "Dense Neural Network (DNN), also called multi-layer perceptron (MLP). The problem consists of identifying a given jet as a quark, a gluon, a W, a Z, or a top,\n",
+ "based on a set of physics-motivated high-level features.\n",
+ "\n",
+ "For details on the physics problem, see https://arxiv.org/pdf/1804.06913.pdf \n",
+ "\n",
+ "For details on the dataset, see Notebook1\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {
+ "id": "4OMAZgtyXIwY"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import h5py\n",
+ "import glob\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2lbB-J3hXIwb"
+ },
+ "source": [
+ "# Preparation of the training and validation samples\n",
+ "\n",
+ "---\n",
+ "In order to import the dataset, we now\n",
+ "- clone the dataset repository (to import the data in Colab)\n",
+ "- load the h5 files in the data/ repository\n",
+ "- extract the data we need: a target and jet features\n",
+ "\n",
+ "To type shell commands, we start the command line with !\n",
+ "\n",
+ "**nb, if you are running locally and you have already downloaded the datasets you can skip the cell below and, if needed, change the paths later to point to the folder with your previous download of the datasets.**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "jWjxFaRPXIwb"
+ },
+ "outputs": [],
+ "source": [
+ "! curl https://cernbox.cern.ch/s/6Ec5pGFEpFWeH6S/download -o Data-MLtutorial.tar.gz\n",
+ "! tar -xvzf Data-MLtutorial.tar.gz\n",
+ "! ls Data-MLtutorial/JetDataset/\n",
+ "! rm Data-MLtutorial.tar.gz"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Dataset Exploration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# let's open the file\n",
+ "data_dir = \"Data-MLtutorial/JetDataset/\"\n",
+ "fileIN = data_dir + \"jetImage_7_100p_30000_40000.h5\"\n",
+ "f = h5py.File(fileIN)\n",
+ "# and see what it contains\n",
+ "print(list(f.keys()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* 'jetImage', 'jetImageECAL' and 'jetImageHCAL' contain the image representation of the jets. We will not use them today but build our point cloud from the other information.\n",
+ "* 'jetConstituentList' is the list of particles contained in the jet. For each particle, a list of relevant quantities is stored. This is the dataset we will consider in this notebook.\n",
+ "* 'particleFeatureNames' is the list of the names corresponding to the quantities contained in 'jetConstituentList'\n",
+ "* 'jets' is the list of jets with the high-level jet features stored. We will only use jet ID from it, indices [-6:-1]\n",
+ "* 'jetFeatureNames' is the list of the names corresponding to the quantities contained in 'jets'. These quantities are built using physics knowledge and correspond to high-level information and features per graph (as opposed to per node)\n",
+ "\n",
+ "The first 100 highest transverse momentum $p_T$ particles are considered for each jet.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {
+ "id": "cCGhrKdwXIwc"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "lzbQ-d0RKVmV"
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "colab": {
- "name": "Notebook2_JetID_DNN.ipynb",
- "provenance": []
- },
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.16"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_30000_40000.h5\n",
+ "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_60000_70000.h5\n",
+ "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_50000_60000.h5\n",
+ "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_10000_20000.h5\n",
+ "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_0_10000.h5\n",
+ "(50000, 5) (50000, 49)\n",
+ "[b'j_tau1_b1', b'j_tau2_b1', b'j_tau3_b1', b'j_tau1_b2', b'j_tau2_b2', b'j_tau3_b2', b'j_tau32_b1', b'j_tau32_b2', b'j_zlogz', b'j_c1_b0', b'j_c1_b1', b'j_c1_b2', b'j_c2_b1', b'j_c2_b2', b'j_d2_b1', b'j_d2_b2', b'j_d2_a1_b1', b'j_d2_a1_b2', b'j_m2_b1', b'j_m2_b2', b'j_n2_b1', b'j_n2_b2', b'j_tau1_b1_mmdt', b'j_tau2_b1_mmdt', b'j_tau3_b1_mmdt', b'j_tau1_b2_mmdt', b'j_tau2_b2_mmdt', b'j_tau3_b2_mmdt', b'j_tau32_b1_mmdt', b'j_tau32_b2_mmdt', b'j_c1_b0_mmdt', b'j_c1_b1_mmdt', b'j_c1_b2_mmdt', b'j_c2_b1_mmdt', b'j_c2_b2_mmdt', b'j_d2_b1_mmdt', b'j_d2_b2_mmdt', b'j_d2_a1_b1_mmdt', b'j_d2_a1_b2_mmdt', b'j_m2_b1_mmdt', b'j_m2_b2_mmdt', b'j_n2_b1_mmdt', b'j_n2_b2_mmdt', b'j_mass_trim', b'j_mass_mmdt', b'j_mass_prun', b'j_mass_sdb2', b'j_mass_sdm1', b'j_multiplicity']\n"
+ ]
}
+ ],
+ "source": [
+ "target = np.array([])\n",
+ "AllFeatures = np.array([])\n",
+ "\n",
+ "# we cannot load all data on Colab. So we just take a few files\n",
+ "datafiles = [\n",
+ " \"Data-MLtutorial/JetDataset/jetImage_7_100p_30000_40000.h5\",\n",
+ " \"Data-MLtutorial/JetDataset/jetImage_7_100p_60000_70000.h5\",\n",
+ " \"Data-MLtutorial/JetDataset/jetImage_7_100p_50000_60000.h5\",\n",
+ " \"Data-MLtutorial/JetDataset/jetImage_7_100p_10000_20000.h5\",\n",
+ " \"Data-MLtutorial/JetDataset/jetImage_7_100p_0_10000.h5\",\n",
+ "]\n",
+ "# if you are running locally, you can use the full dataset doing\n",
+ "# for fileIN in glob.glob(\"tutorials/HiggsSchool/data/*h5\"):\n",
+ "for fileIN in datafiles:\n",
+ " print(\"Appending %s\" % fileIN)\n",
+ " f = h5py.File(fileIN)\n",
+ " myAllFeatures = np.array(f.get(\"jets\")[:, 4:-6])\n",
+ " mytarget = np.array(f.get(\"jets\")[0:, -6:-1])\n",
+ " AllFeatures = (\n",
+ " np.concatenate([AllFeatures, myAllFeatures], axis=0) if AllFeatures.size else myAllFeatures\n",
+ " )\n",
+ " target = np.concatenate([target, mytarget], axis=0) if target.size else mytarget\n",
+ " f.close()\n",
+ "print(target.shape, AllFeatures.shape)\n",
+ "\n",
+ "# let's see what we have\n",
+ "f = h5py.File(datafiles[-1])\n",
+ "features_list = list(f.get(\"jetFeatureNames\"))[4:-6]\n",
+ "f.close()\n",
+ "print(features_list)\n",
+ "labels = [\"gluon\", \"quark\", \"W\", \"Z\", \"top\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# data exploration\n",
+ "fig = plt.figure(figsize=(10, 10))\n",
+ "for i, label in enumerate(labels):\n",
+ " plt.hist(\n",
+ " AllFeatures[target[:, i] == 1][:, 30], bins=50, range=(0, 0.5), histtype=\"step\", label=label\n",
+ " )\n",
+ "plt.legend()\n",
+ "plt.xlabel(features_list[30])\n",
+ "plt.ylabel(\"Number of jets\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# feature selection\n",
+ "features = AllFeatures[:, [8, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 48]]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6a333RYPXIwe"
+ },
+ "source": [
+ "The dataset consists of 50000 jets, each represented by 16 features\n",
+ "\n",
+ "---\n",
+ "\n",
+ "We now shuffle the data, splitting them into a training and a test dataset with a 4:1 ratio (`test_size=0.2`)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ZBqFs1eBXIwf"
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)\n",
+ "print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)\n",
+ "del features, target"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "GkNz5UAhXIwg"
+ },
+ "source": [
+ "# DNN model building"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "tTSDOiEHXIwh"
+ },
+ "outputs": [],
+ "source": [
+ "# keras imports\n",
+ "from tensorflow.keras.models import Model\n",
+ "from tensorflow.keras.layers import Dense, Input, Dropout, Flatten, Activation\n",
+ "from tensorflow.keras.utils import plot_model\n",
+ "from tensorflow.keras import backend as K\n",
+ "from tensorflow.keras import metrics\n",
+ "from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "rAl0DZTxXIwi"
+ },
+ "outputs": [],
+ "source": [
+ "input_shape = X_train.shape[1]\n",
+ "dropoutRate = 0.1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2l492G8BXIwj"
+ },
+ "outputs": [],
+ "source": [
+ "####\n",
+ "inputArray = Input(shape=(input_shape,))\n",
+ "#\n",
+ "x = Dense(40, activation=\"relu\")(inputArray)\n",
+ "x = Dropout(dropoutRate)(x)\n",
+ "#\n",
+ "x = Dense(20)(x)\n",
+ "x = Activation(\"relu\")(x)\n",
+ "x = Dropout(dropoutRate)(x)\n",
+ "#\n",
+ "x = Dense(10, activation=\"relu\")(x)\n",
+ "x = Dropout(dropoutRate)(x)\n",
+ "#\n",
+ "x = Dense(5, activation=\"relu\")(x)\n",
+ "#\n",
+ "output = Dense(5, activation=\"softmax\")(x)\n",
+ "####\n",
+ "model = Model(inputs=inputArray, outputs=output)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "xu8rRUkhXIwj"
+ },
+ "outputs": [],
+ "source": [
+ "model.compile(loss=\"categorical_crossentropy\", optimizer=\"adam\")\n",
+ "model.summary()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2HfKWoOtXIwk"
+ },
+ "source": [
+ "We now train the model with these settings:\n",
+ "\n",
+ "- the **batch size** is a hyperparameter of gradient descent that controls the number of training samples to work through before the model internal parameters are updated\n",
+ " - batch size = 1 results in fast computation but noisy training that is slow to converge\n",
+ " - batch size = dataset size results in slow computation but faster convergence\n",
+ "\n",
+ "- the **number of epochs** controls the number of complete passes through the full training dataset -- at each epoch gradients are computed for each of the mini batches and model internal parameters are updated.\n",
+ "\n",
+ "- the **callbacks** are algorithms used to optimize the training (full list [here](https://keras.io/api/callbacks/)):\n",
+ " - *EarlyStopping*: stop training when a monitored metric (`monitor`) has stopped improving in the last N epochs (`patience`)\n",
+ " - *ReduceLROnPlateau*: reduce learning rate when a metric (`monitor`) has stopped improving in the last N epochs (`patience`)\n",
+ " - *TerminateOnNaN*: terminates training when a NaN loss is encountered"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "KzO-lyLEXIwk"
+ },
+ "outputs": [],
+ "source": [
+ "batch_size = 1024\n",
+ "n_epochs = 50\n",
+ "\n",
+ "# train\n",
+ "history = model.fit(\n",
+ " X_train,\n",
+ " y_train,\n",
+ " epochs=n_epochs,\n",
+ " batch_size=batch_size,\n",
+ " verbose=2,\n",
+ " validation_split=0.2,\n",
+ " # callbacks = [\n",
+ " # EarlyStopping(monitor='val_loss', patience=10, verbose=1),\n",
+ " # ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1),\n",
+ " # TerminateOnNaN()]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "044bCLqVXIwl"
+ },
+ "outputs": [],
+ "source": [
+ "# plot training history\n",
+ "plt.plot(history.history[\"loss\"])\n",
+ "plt.plot(history.history[\"val_loss\"])\n",
+ "plt.yscale(\"log\")\n",
+ "plt.title(\"Training History\")\n",
+ "plt.ylabel(\"loss\")\n",
+ "plt.xlabel(\"epoch\")\n",
+ "plt.legend([\"training\", \"validation\"], loc=\"upper right\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "oESSmNLxXIwm"
+ },
+ "source": [
+ "# Building the ROC Curves"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "gjKT7EjUXIwn"
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from sklearn.metrics import roc_curve, auc\n",
+ "\n",
+ "predict_test = model.predict(X_test)\n",
+ "df = pd.DataFrame()\n",
+ "fpr = {}\n",
+ "tpr = {}\n",
+ "auc1 = {}\n",
+ "\n",
+ "plt.figure()\n",
+ "for i, label in enumerate(labels):\n",
+ " df[label] = y_test[:, i]\n",
+ " df[label + \"_pred\"] = predict_test[:, i]\n",
+ "\n",
+ " fpr[label], tpr[label], threshold = roc_curve(df[label], df[label + \"_pred\"])\n",
+ "\n",
+ " auc1[label] = auc(fpr[label], tpr[label])\n",
+ "\n",
+ " plt.plot(tpr[label], fpr[label], label=\"%s tagger, auc = %.1f%%\" % (label, auc1[label] * 100.0))\n",
+ "plt.semilogy()\n",
+ "plt.xlabel(\"sig. efficiency\")\n",
+ "plt.ylabel(\"bkg. mistag rate\")\n",
+ "plt.ylim(0.000001, 1)\n",
+ "plt.grid(True)\n",
+ "plt.legend(loc=\"lower right\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How do you build a QCD vs Top / W / Z ROC curve?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "lzbQ-d0RKVmV"
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "name": "Notebook2_JetID_DNN.ipynb",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 0
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.16"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
}
diff --git a/machine-learning-das/notebooks/3.5-dense-regression.ipynb b/machine-learning-das/notebooks/3.5-dense-regression.ipynb
index a828df2..c41ac98 100644
--- a/machine-learning-das/notebooks/3.5-dense-regression.ipynb
+++ b/machine-learning-das/notebooks/3.5-dense-regression.ipynb
@@ -1,356 +1,367 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Rf258xT0XIwV"
- },
- "source": [
- "# Training a Jet Regression with **DNN** \n",
- "\n",
- "---\n",
- "In this notebook, we perform a Jet identification task using a multiclass classifier based on a \n",
- "Dense Neural Network (DNN), also called multi-layer perceptron (MLP). The problem consists is \n",
- "regression of $tau_{32}$, given $tau_3$ and $tau_2$.\n",
- "\n",
- "For details on the physics problem, see https://arxiv.org/pdf/1804.06913.pdf \n",
- "\n",
- "For details on the dataset, see Notebook1\n",
- "\n",
- "---"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "id": "4OMAZgtyXIwY"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "import h5py\n",
- "import glob\n",
- "import numpy as np\n",
- "import matplotlib.pyplot as plt\n",
- "%matplotlib inline"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "2lbB-J3hXIwb"
- },
- "source": [
- "# Preparation of the training and validation samples\n",
- "\n",
- "---\n",
- "In order to import the dataset, we now\n",
- "- clone the dataset repository (to import the data in Colab)\n",
- "- load the h5 files in the data/ repository\n",
- "- extract the data we need: a target and jetImage \n",
- "\n",
- "To type shell commands, we start the command line with !\n",
- "\n",
- "**nb, if you are running locally and you have already downloaded the datasets you can skip the cell below and, if needed, change the paths later to point to the folder with your previous download of the datasets.**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "jWjxFaRPXIwb"
- },
- "outputs": [],
- "source": [
- "! curl https://cernbox.cern.ch/s/6Ec5pGFEpFWeH6S/download -o Data-MLtutorial.tar.gz\n",
- "! tar -xvzf Data-MLtutorial.tar.gz \n",
- "! ls Data-MLtutorial/JetDataset/\n",
- "! rm Data-MLtutorial.tar.gz "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "id": "cCGhrKdwXIwc"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_30000_40000.h5\n",
- "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_60000_70000.h5\n",
- "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_50000_60000.h5\n",
- "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_10000_20000.h5\n",
- "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_0_10000.h5\n",
- "(50000,) (50000, 2)\n"
- ]
- }
- ],
- "source": [
- "target = np.array([])\n",
- "features = np.array([])\n",
- "ptype = np.array([])\n",
- "# we cannot load all data on Colab. So we just take a few files\n",
- "datafiles = ['Data-MLtutorial/JetDataset/jetImage_7_100p_30000_40000.h5',\n",
- " 'Data-MLtutorial/JetDataset/jetImage_7_100p_60000_70000.h5',\n",
- " 'Data-MLtutorial/JetDataset/jetImage_7_100p_50000_60000.h5',\n",
- " 'Data-MLtutorial/JetDataset/jetImage_7_100p_10000_20000.h5',\n",
- " 'Data-MLtutorial/JetDataset/jetImage_7_100p_0_10000.h5']\n",
- "# if you are running locallt, you can use the full dataset doing\n",
- "# for fileIN in glob.glob(\"tutorials/HiggsSchool/data/*h5\"):\n",
- "for fileIN in datafiles:\n",
- " print(\"Appending %s\" %fileIN)\n",
- " f = h5py.File(fileIN)\n",
- " myFeatures = np.array(f.get(\"jets\")[:,[5,6]])\n",
- " myptype = np.array(f.get('jets')[0:,-6:-1])\n",
- " mytarget = np.array(f.get('jets')[0:,10])\n",
- " features = np.concatenate([features, myFeatures], axis=0) if features.size else myFeatures\n",
- " target = np.concatenate([target, mytarget], axis=0) if target.size else mytarget\n",
- " ptype = np.concatenate([ptype, myptype], axis=0) if ptype.size else myptype\n",
- " f.close()\n",
- "print(target.shape, features.shape)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "6a333RYPXIwe"
- },
- "source": [
- "The dataset consists of 50000 jets, each represented by 16 features\n",
- "\n",
- "---\n",
- "\n",
- "We now shuffle the data, splitting them into a training and a validation dataset with 2:1 ratio"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "ZBqFs1eBXIwf"
- },
- "outputs": [],
- "source": [
- "features = features[ptype[:,-1]>0]\n",
- "target = target[ptype[:,-1]>0]\n",
- "ptype = ptype[ptype[:,-1]>0]\n",
- "from sklearn.model_selection import train_test_split\n",
- "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)\n",
- "print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)\n",
- "del features, target"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "GkNz5UAhXIwg"
- },
- "source": [
- "# DNN model building"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "tTSDOiEHXIwh"
- },
- "outputs": [],
- "source": [
- "# keras imports\n",
- "from tensorflow.keras.models import Model\n",
- "from tensorflow.keras.layers import Dense, Input, Dropout, Flatten, Activation\n",
- "from tensorflow.keras.utils import plot_model\n",
- "from tensorflow.keras import backend as K\n",
- "from tensorflow.keras import metrics\n",
- "from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "rAl0DZTxXIwi"
- },
- "outputs": [],
- "source": [
- "input_shape = X_train.shape[1]\n",
- "dropoutRate = 0.0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "2l492G8BXIwj"
- },
- "outputs": [],
- "source": [
- "####\n",
- "inputArray = Input(shape=(input_shape,))\n",
- "#\n",
- "x = Dense(40, activation='relu')(inputArray)\n",
- "x = Dropout(dropoutRate)(x)\n",
- "#\n",
- "x = Dense(20)(x)\n",
- "x = Activation('relu')(x)\n",
- "x = Dropout(dropoutRate)(x)\n",
- "#\n",
- "x = Dense(10, activation='relu')(x)\n",
- "x = Dropout(dropoutRate)(x)\n",
- "#\n",
- "x = Dense(5, activation='relu')(x)\n",
- "#\n",
- "output = Dense(1)(x)\n",
- "####\n",
- "model = Model(inputs=inputArray, outputs=output)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "xu8rRUkhXIwj"
- },
- "outputs": [],
- "source": [
- "model.compile(loss='mse', optimizer='adam')\n",
- "model.summary()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "2HfKWoOtXIwk"
- },
- "source": [
- "We now train the model with these settings:\n",
- "\n",
- "- the **batch size** is a hyperparameter of gradient descent that controls the number of training samples to work through before the model internal parameters are updated\n",
- " - batch size = 1 results in fast computation but noisy training that is slow to converge\n",
- " - batch size = dataset size results in slow computation but faster convergence)\n",
- "\n",
- "- the **number of epochs** controls the number of complete passes through the full training dataset -- at each epoch gradients are computed for each of the mini batches and model internal parameters are updated.\n",
- "\n",
- "- the **callbacks** are algorithms used to optimize the training (full list [here](https://keras.io/api/callbacks/)):\n",
- " - *EarlyStopping*: stop training when a monitored metric (`monitor`) has stopped improving in the last N epochs (`patience`)\n",
- " - *ReduceLROnPlateau*: reduce learning rate when a metric (`monitor`) has stopped improving in the last N epochs (`patience`)\n",
- " - *TerminateOnNaN*: terminates training when a NaN loss is encountered"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "KzO-lyLEXIwk"
- },
- "outputs": [],
- "source": [
- "batch_size = 128\n",
- "n_epochs = 100\n",
- "\n",
- "# train \n",
- "history = model.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, verbose = 2,\n",
- " validation_split=0.2,\n",
- " callbacks = [EarlyStopping(monitor='val_loss', patience=10, verbose=1),\n",
- " ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1),\n",
- " TerminateOnNaN()]\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "044bCLqVXIwl"
- },
- "outputs": [],
- "source": [
- "# plot training history\n",
- "plt.plot(history.history['loss'])\n",
- "plt.plot(history.history['val_loss'])\n",
- "plt.yscale('log')\n",
- "plt.title('Training History')\n",
- "plt.ylabel('loss')\n",
- "plt.xlabel('epoch')\n",
- "plt.legend(['training', 'validation'], loc='upper right')\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "predict = model.predict(X_test)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.scatter(y_test,predict.flatten(),s=0.1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.hist(y_test,bins=100,range=(0,1),histtype='step',label='true')\n",
- "plt.hist(predict.flatten(),bins=100,range=(0,1),histtype='step',label='predict')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "oESSmNLxXIwm"
- },
- "source": [
- "# Plot performce with 2D histograms"
- ]
- },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Rf258xT0XIwV"
+ },
+ "source": [
+ "# Training a Jet Regression with **DNN** \n",
+ "\n",
+ "---\n",
+ "In this notebook, we perform a jet regression task using a \n",
+ "Dense Neural Network (DNN), also called multi-layer perceptron (MLP). The problem consists of the \n",
+ "regression of $\\tau_{32}$, given $\\tau_3$ and $\\tau_2$.\n",
+ "\n",
+ "For details on the physics problem, see https://arxiv.org/pdf/1804.06913.pdf \n",
+ "\n",
+ "For details on the dataset, see Notebook1\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "4OMAZgtyXIwY"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import h5py\n",
+ "import glob\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2lbB-J3hXIwb"
+ },
+ "source": [
+ "# Preparation of the training and validation samples\n",
+ "\n",
+ "---\n",
+ "In order to import the dataset, we now\n",
+ "- download and unpack the dataset archive (to import the data in Colab)\n",
+ "- load the h5 files from the Data-MLtutorial/JetDataset/ directory\n",
+ "- extract the data we need: a target and jet features\n",
+ "\n",
+ "To type shell commands, we start the command line with !\n",
+ "\n",
+ "**nb, if you are running locally and you have already downloaded the datasets you can skip the cell below and, if needed, change the paths later to point to the folder with your previous download of the datasets.**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "jWjxFaRPXIwb"
+ },
+ "outputs": [],
+ "source": [
+ "! curl https://cernbox.cern.ch/s/6Ec5pGFEpFWeH6S/download -o Data-MLtutorial.tar.gz\n",
+ "! tar -xvzf Data-MLtutorial.tar.gz\n",
+ "! ls Data-MLtutorial/JetDataset/\n",
+ "! rm Data-MLtutorial.tar.gz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "cCGhrKdwXIwc"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "lzbQ-d0RKVmV"
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "colab": {
- "name": "Notebook2_JetID_DNN.ipynb",
- "provenance": []
- },
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.16"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_30000_40000.h5\n",
+ "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_60000_70000.h5\n",
+ "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_50000_60000.h5\n",
+ "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_10000_20000.h5\n",
+ "Appending Data-MLtutorial/JetDataset/jetImage_7_100p_0_10000.h5\n",
+ "(50000,) (50000, 2)\n"
+ ]
}
+ ],
+ "source": [
+ "target = np.array([])\n",
+ "features = np.array([])\n",
+ "ptype = np.array([])\n",
+ "# we cannot load all data on Colab. So we just take a few files\n",
+ "datafiles = [\n",
+ " \"Data-MLtutorial/JetDataset/jetImage_7_100p_30000_40000.h5\",\n",
+ " \"Data-MLtutorial/JetDataset/jetImage_7_100p_60000_70000.h5\",\n",
+ " \"Data-MLtutorial/JetDataset/jetImage_7_100p_50000_60000.h5\",\n",
+ " \"Data-MLtutorial/JetDataset/jetImage_7_100p_10000_20000.h5\",\n",
+ " \"Data-MLtutorial/JetDataset/jetImage_7_100p_0_10000.h5\",\n",
+ "]\n",
+ "# if you are running locally, you can use the full dataset doing\n",
+ "# for fileIN in glob.glob(\"tutorials/HiggsSchool/data/*h5\"):\n",
+ "for fileIN in datafiles:\n",
+ " print(\"Appending %s\" % fileIN)\n",
+ " f = h5py.File(fileIN)\n",
+ " myFeatures = np.array(f.get(\"jets\")[:, [5, 6]])\n",
+ " myptype = np.array(f.get(\"jets\")[0:, -6:-1])\n",
+ " mytarget = np.array(f.get(\"jets\")[0:, 10])\n",
+ " features = np.concatenate([features, myFeatures], axis=0) if features.size else myFeatures\n",
+ " target = np.concatenate([target, mytarget], axis=0) if target.size else mytarget\n",
+ " ptype = np.concatenate([ptype, myptype], axis=0) if ptype.size else myptype\n",
+ " f.close()\n",
+ "print(target.shape, features.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6a333RYPXIwe"
+ },
+ "source": [
+ "The dataset consists of 50000 jets, each represented by 2 features\n",
+ "\n",
+ "---\n",
+ "\n",
+ "We now keep only the jets whose last particle-type flag is set, then shuffle the data, splitting them into a training and a test dataset with a 4:1 ratio"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ZBqFs1eBXIwf"
+ },
+ "outputs": [],
+ "source": [
+ "features = features[ptype[:, -1] > 0]\n",
+ "target = target[ptype[:, -1] > 0]\n",
+ "ptype = ptype[ptype[:, -1] > 0]\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)\n",
+ "print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)\n",
+ "del features, target"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "GkNz5UAhXIwg"
+ },
+ "source": [
+ "# DNN model building"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "tTSDOiEHXIwh"
+ },
+ "outputs": [],
+ "source": [
+ "# keras imports\n",
+ "from tensorflow.keras.models import Model\n",
+ "from tensorflow.keras.layers import Dense, Input, Dropout, Flatten, Activation\n",
+ "from tensorflow.keras.utils import plot_model\n",
+ "from tensorflow.keras import backend as K\n",
+ "from tensorflow.keras import metrics\n",
+ "from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "rAl0DZTxXIwi"
+ },
+ "outputs": [],
+ "source": [
+ "input_shape = X_train.shape[1]\n",
+ "dropoutRate = 0.0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2l492G8BXIwj"
+ },
+ "outputs": [],
+ "source": [
+ "####\n",
+ "inputArray = Input(shape=(input_shape,))\n",
+ "#\n",
+ "x = Dense(40, activation=\"relu\")(inputArray)\n",
+ "x = Dropout(dropoutRate)(x)\n",
+ "#\n",
+ "x = Dense(20)(x)\n",
+ "x = Activation(\"relu\")(x)\n",
+ "x = Dropout(dropoutRate)(x)\n",
+ "#\n",
+ "x = Dense(10, activation=\"relu\")(x)\n",
+ "x = Dropout(dropoutRate)(x)\n",
+ "#\n",
+ "x = Dense(5, activation=\"relu\")(x)\n",
+ "#\n",
+ "output = Dense(1)(x)\n",
+ "####\n",
+ "model = Model(inputs=inputArray, outputs=output)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "xu8rRUkhXIwj"
+ },
+ "outputs": [],
+ "source": [
+ "model.compile(loss=\"mse\", optimizer=\"adam\")\n",
+ "model.summary()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2HfKWoOtXIwk"
+ },
+ "source": [
+ "We now train the model with these settings:\n",
+ "\n",
+ "- the **batch size** is a hyperparameter of gradient descent that controls the number of training samples to work through before the model internal parameters are updated\n",
+ " - batch size = 1 results in fast computation but noisy training that is slow to converge\n",
+ "  - batch size = dataset size results in slow computation but faster convergence\n",
+ "\n",
+ "- the **number of epochs** controls the number of complete passes through the full training dataset -- at each epoch gradients are computed for each of the mini batches and model internal parameters are updated.\n",
+ "\n",
+ "- the **callbacks** are algorithms used to optimize the training (full list [here](https://keras.io/api/callbacks/)):\n",
+ " - *EarlyStopping*: stop training when a monitored metric (`monitor`) has stopped improving in the last N epochs (`patience`)\n",
+ " - *ReduceLROnPlateau*: reduce learning rate when a metric (`monitor`) has stopped improving in the last N epochs (`patience`)\n",
+ " - *TerminateOnNaN*: terminates training when a NaN loss is encountered"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "KzO-lyLEXIwk"
+ },
+ "outputs": [],
+ "source": [
+ "batch_size = 128\n",
+ "n_epochs = 100\n",
+ "\n",
+ "# train\n",
+ "history = model.fit(\n",
+ " X_train,\n",
+ " y_train,\n",
+ " epochs=n_epochs,\n",
+ " batch_size=batch_size,\n",
+ " verbose=2,\n",
+ " validation_split=0.2,\n",
+ " callbacks=[\n",
+ " EarlyStopping(monitor=\"val_loss\", patience=10, verbose=1),\n",
+ " ReduceLROnPlateau(monitor=\"val_loss\", factor=0.1, patience=2, verbose=1),\n",
+ " TerminateOnNaN(),\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "044bCLqVXIwl"
+ },
+ "outputs": [],
+ "source": [
+ "# plot training history\n",
+ "plt.plot(history.history[\"loss\"])\n",
+ "plt.plot(history.history[\"val_loss\"])\n",
+ "plt.yscale(\"log\")\n",
+ "plt.title(\"Training History\")\n",
+ "plt.ylabel(\"loss\")\n",
+ "plt.xlabel(\"epoch\")\n",
+ "plt.legend([\"training\", \"validation\"], loc=\"upper right\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predict = model.predict(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.scatter(y_test, predict.flatten(), s=0.1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.hist(y_test, bins=100, range=(0, 1), histtype=\"step\", label=\"true\")\n",
+ "plt.hist(predict.flatten(), bins=100, range=(0, 1), histtype=\"step\", label=\"predict\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "oESSmNLxXIwm"
+ },
+ "source": [
+ "# Plot performance with 2D histograms"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "lzbQ-d0RKVmV"
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "name": "Notebook2_JetID_DNN.ipynb",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 0
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.16"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
}
diff --git a/machine-learning-das/notebooks/8-SetTransformer-PointCloud.ipynb b/machine-learning-das/notebooks/8-SetTransformer-PointCloud.ipynb
index ff8db60..58716de 100644
--- a/machine-learning-das/notebooks/8-SetTransformer-PointCloud.ipynb
+++ b/machine-learning-das/notebooks/8-SetTransformer-PointCloud.ipynb
@@ -1,731 +1,750 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "view-in-github"
- },
- "source": [
- "
"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "BsonEqBekjyy"
- },
- "source": [
- "# Jet Tagging with Permutation Invariance\n",
- "Author: Abhijith Gandrakota, Jennifer Ngadiuba\n",
- "\n",
- "In this notebook we will see an implementation of the Transformer architecture for sets applied to the jet tagging task. For *sets* it is meant here a point cloud, i.e. a set of nodes without edges. We will instead use Multi-Head Attention to learn which nodes (or particles) have strong pair-wise interaction.\n",
- "\n",
- "The architecture was introduced by [J. Lee at al. (ICML 2019)](https://arxiv.org/abs/1810.00825) -- specifically designed to model interactions among elements in the input set without pre-defined edges. The model consists of an encoder and a decoder, both of which rely on attention mechanisms, as in the original Transformer implementation [by Vaswani](https://arxiv.org/abs/1706.03762). The main difference is that positional encoding is removed plus some other low level adaptions.\n",
- "\n",
- "We will use tensorflow for this implementation.\n",
- "\n",
- "Before you start, choose GPU as a hardware accelerator for this notebook. To do this first go to Edit -> Notebook Settings -> Choose GPU as a hardware accelerator."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "A7OS3w5WRSCj"
- },
- "outputs": [],
- "source": [
- "import tensorflow as tf\n",
- "from tensorflow import keras\n",
- "from tensorflow.keras import layers\n",
- "import h5py\n",
- "import numpy as np\n",
- "\n",
- "#checking if we have GPUs\n",
- "print(\"Num GPUs Available: \", len(tf.config.list_physical_devices('GPU')))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "qwekVVRzneqU"
- },
- "source": [
- "## Dataset exploration"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "BHQGiyC4Pr4R"
- },
- "outputs": [],
- "source": [
- "! curl https://cernbox.cern.ch/s/6Ec5pGFEpFWeH6S/download -o Data-MLtutorial.tar.gz\n",
- "! tar -xvzf Data-MLtutorial.tar.gz \n",
- "! ls Data-MLtutorial/JetDataset/\n",
- "! rm Data-MLtutorial.tar.gz "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "J9ZLcoKpPteG"
- },
- "outputs": [],
- "source": [
- "# let's open the file\n",
- "data_dir = 'Data-MLtutorial/JetDataset/'\n",
- "fileIN = data_dir+'jetImage_7_100p_30000_40000.h5'\n",
- "f = h5py.File(fileIN)\n",
- "# and see what it contains\n",
- "print(list(f.keys()))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Ktx1VjNoOu4c"
- },
- "source": [
- "* 'jetImage' ,' jetImageECAL' and 'jetImageHCAL' contains the image representation of the jets . We will not use them today but build our point cloud from the other information.\n",
- "* 'jetConstituentList' is the list of particles cointained in the jet. For each particle, a list of relevant quantities is stored. This is the dataset we will consider in this notebook.\n",
- "* 'particleFeatureNames' is the list of the names corresponding to the quantities contained in 'jetConstituentList'\n",
- "* 'jets' is the list of jets with the high-level jet features stored. We will only use jet ID from it, indecies [-6:-1]\n",
- "* 'jetFeatureNames' is the list of the names corresponding to the quantities contained in 'jets'. These quantities are build using physics knowledge and correspond to high-level infromation and features per graph (as opposed to per node)\n",
- "\n",
- "The first 100 highest transverse momentum $p_T$ particles are considered for each jet.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Re7oXWWmPxz9"
- },
- "outputs": [],
- "source": [
- "target_onehot = np.array([])\n",
- "jetList = np.array([])\n",
- "jetImages = np.array([])\n",
- "features_names = dict()\n",
- "datafiles = ['jetImage_7_100p_0_10000.h5',\n",
- " 'jetImage_7_100p_10000_20000.h5',\n",
- " 'jetImage_7_100p_30000_40000.h5',\n",
- " 'jetImage_7_100p_40000_50000.h5',\n",
- " 'jetImage_7_100p_50000_60000.h5'\n",
- " ]\n",
- "for i_f,fileIN in enumerate(datafiles):\n",
- " print(\"Appending %s\" %fileIN)\n",
- " f = h5py.File(data_dir + fileIN)\n",
- " jetList_file = np.array(f.get(\"jetConstituentList\"))\n",
- " target_file = np.array(f.get('jets')[0:,-6:-1])\n",
- " jetImages_file = np.array(f.get('jetImage'))\n",
- " jetList = np.concatenate([jetList, jetList_file], axis=0) if jetList.size else jetList_file\n",
- " target_onehot = np.concatenate([target_onehot, target_file], axis=0) if target_onehot.size else target_file\n",
- " jetImages = np.concatenate([jetImages, jetImages_file], axis=0) if jetImages.size else jetImages_file\n",
- " del jetList_file, target_file, jetImages_file\n",
- " #save particles/nodes features names and their indecies in a dictionary\n",
- " if i_f==0:\n",
- " for feat_idx,feat_name in enumerate(list(f['particleFeatureNames'])[:-1]):\n",
- " features_names[feat_name.decode(\"utf-8\").replace('j1_','')] = feat_idx\n",
- " f.close()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "7kQnL9vkP4rK"
- },
- "source": [
- "The ground truth is incorporated in the ['j_g', 'j_q', 'j_w', 'j_z', 'j_t] vector of boolean, taking the form\n",
- "* [1, 0, 0, 0, 0] for gluons\n",
- "* [0, 1, 0, 0, 0] for quarks\n",
- "* [0, 0, 1, 0, 0] for W\n",
- "* [0, 0, 0, 1, 0] for Z \n",
- "* [0, 0, 0, 0, 1] for top quarks\n",
- "\n",
- "This is what is called 'one-hot' encoding of a descrete label (typical of ground truth for classification problems). These labels are the 'target' for our classification tasks. Let's convert it back to single-column encoding :\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "84NSj2W7P477"
- },
- "outputs": [],
- "source": [
- "print(\"Labels for the first five entries in the dataset, one-hot encoded:\")\n",
- "for i in range(5):\n",
- " print(target_onehot[i])\n",
- "print(target_onehot.shape)\n",
- "target = np.argmax(target_onehot, axis=1)\n",
- "print(target.shape)\n",
- "print(\"Labels for the first five entries in the dataset, single column encoded:\")\n",
- "for i in range(0,5):\n",
- " print(target[i])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "mqsd_aP__RIi"
- },
- "source": [
- "Now our lables correspond to :\n",
- "* 0 for gluons\n",
- "* 1 for quarks\n",
- "* 2 for W\n",
- "* 3 for Z \n",
- "* 4 for top quarks\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "hyP15oxhP5ek"
- },
- "outputs": [],
- "source": [
- "num_classes = len(np.unique(target))\n",
- "label_names= [\"gluon\", \"quark\", \"W\", \"Z\", \"top\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Ik-6OX0LMJW7"
- },
- "source": [
- "Now let's inspect our data. Each jet is a point cloud/graph with 100 particles/nodes, each of which has 16 features. We have a double-index dataset: (jet index, particle index). The list is cut at 100 constituents per jet. If less constituents are present in the jet/point cloud, the dataset is completed filling it with 0s (zero padding). Note : zero-padding is not using during the training, it is only used to store the ragged dataset.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "YfHRopq0P8tW"
- },
- "outputs": [],
- "source": [
- "print('Jets shape : ',jetList.shape)\n",
- "print('Target/Labels shape : ',target.shape)\n",
- "print('Particles/Nodes features : ',list(features_names.keys()))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "VBYwH4t8MhHm"
- },
- "source": [
- "We are not interested in all features for now. For now we will only consider the same node features as were considered in the ParticleNet paper: "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "QWtB3vTWP_QY"
- },
- "outputs": [],
- "source": [
- "features_to_consider = 'etarel,phirel,pt,e,ptrel,erel,deltaR'.split(',')\n",
- "features_idx = [features_names[name] for name in features_to_consider]\n",
- "jetList = jetList[:,:,features_idx]\n",
- "print(jetList.shape)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "-M8uvPR4mfI7"
- },
- "source": [
- "Let's define basics hyperparamters:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "2mHCuVm6ZJaY"
- },
- "outputs": [],
- "source": [
- "batch_size=128\n",
- "learning_rate=0.0001\n",
- "epochs=20"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "unpVhZNfmotI"
- },
- "source": [
- "In the original paper, multi-head attention is also applied in the decoder step to obtain a smarter pooling operation. For this excercise we will simplify the model and use instead a `Lambda` layer to apply a custom pooling function to the input tensor. In this case, the `Lambda` layer is being used to sum over the first dimension, i.e. over the elements in the output set of the previous layer, which has shape `(batch_size, n_elements, features)`. By summing over the first dimension (`axis=1`), we obtain a tensor of shape `(batch_size, features)` that represents an aggregation of each feature over the elements in the set.\n",
- "\n",
- "Here is the full model:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "O7rzMn7wRcrP"
- },
- "outputs": [],
- "source": [
- "inputs = keras.Input(shape=(100,7), name='input')\n",
- "x = layers.TimeDistributed(layers.Dense(64))(inputs)\n",
- "x = layers.TimeDistributed(layers.LeakyReLU())(x)\n",
- "x = layers.TimeDistributed(layers.Dense(64))(x)\n",
- "x = layers.TimeDistributed(layers.LeakyReLU())(x)\n",
- "x = layers.TimeDistributed(layers.Dense(64))(x)\n",
- "x = layers.TimeDistributed(layers.LeakyReLU())(x)\n",
- "x = layers.TimeDistributed(layers.Dense(64))(x)\n",
- "x = layers.TimeDistributed(layers.LeakyReLU())(x)\n",
- "x = layers.Lambda(lambda y: tf.reduce_sum(y, axis=1))(x)\n",
- "x = layers.BatchNormalization()(x)\n",
- "x = layers.Dense(64)(x)\n",
- "x = layers.LeakyReLU()(x)\n",
- "x = layers.Dense(64)(x)\n",
- "x = layers.LeakyReLU()(x)\n",
- "x = layers.Dense(16)(x)\n",
- "x = layers.LeakyReLU()(x)\n",
- "output = layers.Dense(5, dtype='float32')(x)\n",
- "model = keras.models.Model(inputs=inputs, outputs=output)\n",
- "model.summary()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "G8NI-_bYdSAq"
- },
- "outputs": [],
- "source": [
- "model.compile(\n",
- " loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
- " optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n",
- " metrics=[keras.metrics.SparseCategoricalAccuracy()],\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "DhAKhgMMcrwa"
- },
- "outputs": [],
- "source": [
- "from sklearn.model_selection import train_test_split\n",
- "X_train, X_val, y_train, y_val, y_train_onehot, y_val_onehot = train_test_split(jetList, target, target_onehot, test_size=0.1, shuffle=True)\n",
- "print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)\n",
- "del jetList, target, target_onehot"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "lz7rfyeCdNF0"
- },
- "outputs": [],
- "source": [
- "history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Egcr8vMhp-2v"
- },
- "source": [
- "We can now plot the validation and training loss evolution over the epochs:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "sjTOMuzAqGEr"
- },
- "outputs": [],
- "source": [
- "import matplotlib.pyplot as plt\n",
- "fig,axes = plt.subplots(2)\n",
- "\n",
- "axes[0].plot(history.history[\"sparse_categorical_accuracy\"])\n",
- "axes[0].plot(history.history[\"val_sparse_categorical_accuracy\"])\n",
- "axes[0].set_title(\"Accuracy\")\n",
- "axes[0].legend([\"Training\", \"Validation\"])\n",
- "\n",
- "axes[1].plot(history.history[\"loss\"])\n",
- "axes[1].plot(history.history[\"val_loss\"])\n",
- "axes[1].legend([\"Training\", \"Validation\"])\n",
- "axes[1].set_title(\"Loss\")\n",
- "\n",
- "fig.show()\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "CwrPPStDrS4J"
- },
- "source": [
- "Now we finally evaluate the performance by plotting the ROC curves for the different classes:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "JKM0yYFfecJh"
- },
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import matplotlib.pyplot as plt\n",
- "%matplotlib inline\n",
- "from sklearn.metrics import roc_curve, auc\n",
- "predict_val = tf.nn.softmax(model.predict(X_val))\n",
- "df = pd.DataFrame()\n",
- "fpr = {}\n",
- "tpr = {}\n",
- "auc1 = {}\n",
- "\n",
- "plt.figure()\n",
- "for i, label in enumerate(label_names):\n",
- "\n",
- " df[label] = y_val_onehot[:,i]\n",
- " df[label + '_pred'] = predict_val[:,i]\n",
- "\n",
- " fpr[label], tpr[label], threshold = roc_curve(df[label],df[label+'_pred'])\n",
- "\n",
- " auc1[label] = auc(fpr[label], tpr[label])\n",
- "\n",
- " plt.plot(tpr[label],fpr[label],label='%s tagger, auc = %.1f%%'%(label,auc1[label]*100.))\n",
- "plt.semilogy()\n",
- "plt.xlabel(\"sig. efficiency\")\n",
- "plt.ylabel(\"bkg. mistag rate\")\n",
- "plt.ylim(0.000001,1)\n",
- "plt.grid(True)\n",
- "plt.legend(loc='lower right')\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "IzxPDanrrYZB"
- },
- "source": [
- "As you can see the performance are not as good for other models we have trained on the same dataset. As mentioned at the beginning of the notebook training a transformer might be tricky. You can try the optional excercise below to improve the performance and surpass the other models."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Multi Head Attention recap\n",
- "\n",
- "Assume we have $n$ query vectors (corresponding to the $n$ elements in the set) each with dimension $d_q : Q \\in \\mathbb{R}^{n\\times d_q}$. In the jet tagging dataset $n=100$ and $d_q=7$.\n",
- "\n",
- "An attention function $\\mathrm{Att}(Q,K,V)$ is a function that maps queries $Q$ to outputs using $n_v$ key-value pairs $K \\in \\mathbb{R}^{n_v \\times d_q}, V \\in \\mathbb{R}^{n_v\\times d_v}$:\n",
- "\n",
- "$$\n",
- "\\mathrm{Att}(Q,K,V;\\omega) = \\omega(QK^{T})V.\n",
- "$$\n",
- "\n",
- "The pairwise dot product $QT^\\mathrm{T} \\in \\mathbb{R}^{n\\times n_v}$ measures how similar each pair of query and key vectors is, with weights computed with an activation function $\\omega$. The output $\\omega(QK^{T})V$ is a weighted sum of $V$ where a value gets more weight if its corresponding key has larger dot product with the query.\n",
- "\n",
- "Instead of computing a single attention function, the **multi-head attention** method first projects $Q, K, V$ onto $h$ different $d^M_q,d^M_q,d^M_v$-dimensional vectors, respectively. An attention function $\\mathrm{Att}(\\cdot; \\omega_j)$ is applied to each of these $h$ projections. The output is a linear transformation of the concatenation of all attention outputs:\n",
- "\n",
- "$$\n",
- "\\mathrm{Multihead}(Q, K, V ; \\lambda, \\omega) = \\mathrm{concat}(O_1,..., O_h)W^O\n",
- "$$\n",
- "\n",
- "$$\n",
- "O_j = \\mathrm{Att}(QW^Q_j, KW^K_j, VW^V_j ; \\omega_j )\n",
- "$$\n",
- "\n",
- "In other words, the model tells you what is the score of a particle in the set knowing its interaction with the other particles in the set given all features but in a way that the features are attended separately.\n",
- "\n",
- "Note that $\\mathrm{Multihead}(\\cdot, \\cdot, \\cdot; \\lambda)$ has learnable parameters $\\lambda =$ {$W^Q_j, W^K_j, W^V_j$}$_{j=1,...,h}$ where $W^Q_j, W^K_j \\in \\mathbb{R}^{d_q\\times d^M_q}, W^V_j \\in \\mathbb{R}^{d_v\\times d^M_v}, W^O \\in \\mathbb{R}^{hd^M_v\\times d}$. A typical choice for the dimension hyperparameters is $d^M_q = d_q /h, d^M_v = d_v /h, d = d_q$. For the Set Transformer we set $d_q = d_v = d$ and $d^M_q = d^M_v = d/h$. A scaled softmax $\\omega_j (\\cdot) = \\mathrm{softmax}(\\cdot/\\sqrt{d})$ is used.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Building the Set Transformer\n",
- "\n",
- "We will implement a simplified version of the [original Set Transformer architecture](https://arxiv.org/abs/1810.00825). The reason is because Transformers are typically computationally and data hungry. As an optional excercise at the end of the notebook you can try to implement the full model and test it on a simpler problem like the MNIST dataset classification (or on a larger jet class dataset).\n",
- "\n",
- "The architecture is based on the block called `MAB` (= Multihead Attention Block) which implements the following:\n",
- "\n",
- "$$\n",
- "\\mathrm{MAB}(X, Y) = \\mathrm{LayerNorm}(H + \\mathrm{rFF}(H))\n",
- "$$\n",
- "\n",
- "$$\n",
- "H = \\mathrm{LayerNorm}(X + \\mathrm{Multihead}(X, X, X ; ω))\n",
- "$$\n",
- "\n",
- "where $X \\in \\mathbb{R}^{n\\times d}$ is the input set and $\\mathrm{rFF}$ is any feedforward layer. Since $Q=K=V=X$, the MAB takes a set and performs *self-attention* between the elements in the set, resulting in a set of equal size. Since the output of MAB contains information about pairwise interactions among the elements in the input set $X$, we can stack multiple MABs to encode higher order interactions. This stack is the *encoder* part of the transformer. \n",
- "\n",
- "The `LayerNorm` normalizes the activations of a layer across the last dimension (feature dimension) of the input tensor. Specifically, it centers and scales each feature dimension independently by subtracting the mean and dividing by the standard deviation, which are computed over the corresponding feature dimension of the input tensor. As for `BatchNormalization` it has learnable $\\gamma$ (scaling) and $\\beta$ (shifting) parameters. The difference with respect to `BatchNormalization` is that the normalization is performed indipendently per each instance in the batch. `LayerNorm` leads to improved stability when you expect instances of different sizes (or different zero padding degree as in the jet tagging case)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "RORSIHwVRPx4"
- },
- "outputs": [],
- "source": [
- "class SABTransformerBlock(tf.keras.layers.Layer):\n",
- " def __init__(self, num_heads, hidden_units, mlp_hidden_units=128, dropout_rate=0.1, **kwargs):\n",
- " super(SABTransformerBlock, self).__init__(**kwargs)\n",
- " self.num_heads = num_heads\n",
- " self.hidden_units = hidden_units\n",
- " self.mlp_hidden_units = mlp_hidden_units\n",
- " self.dropout_rate = dropout_rate\n",
- "\n",
- " def build(self, input_shape):\n",
- " self.attention = tf.keras.layers.MultiHeadAttention(num_heads=self.num_heads, \n",
- " key_dim=self.hidden_units//self.num_heads)\n",
- " self.feedforward = tf.keras.Sequential([\n",
- " layers.Dense(units=self.mlp_hidden_units, activation=\"relu\"),\n",
- " # Dropout(rate=self.dropout_rate),\n",
- " layers.Dense(units=input_shape[-1])\n",
- " ])\n",
- " self.layer_norm1 = layers.LayerNormalization(epsilon=1e-6)\n",
- " self.layer_norm2 = layers.LayerNormalization(epsilon=1e-6)\n",
- " self.dropout1 = layers.Dropout(rate=self.dropout_rate)\n",
- " self.dropout2 = layers.Dropout(rate=self.dropout_rate)\n",
- " super(SABTransformerBlock, self).build(input_shape)\n",
- " \n",
- " def call(self, inputs, mask=None):\n",
- " attention_output = self.attention(inputs, inputs, attention_mask=mask)[0]\n",
- " # attention_output = self.dropout1(attention_output)\n",
- " attention_output = self.layer_norm1(inputs + attention_output)\n",
- " feedforward_output = self.feedforward(attention_output)\n",
- " # feedforward_output = self.dropout2(feedforward_output)\n",
- " block_output = self.layer_norm2(attention_output + feedforward_output)\n",
- " return block_output\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "inputs = keras.Input(shape=(100,7), name='input')\n",
- "x = layers.TimeDistributed(layers.Dense(64))(inputs)\n",
- "x = SABTransformerBlock(num_heads=8, hidden_units=64)(x)\n",
- "x = SABTransformerBlock(num_heads=8, hidden_units=64)(x)\n",
- "x = SABTransformerBlock(num_heads=8, hidden_units=64)(x)\n",
- "x = layers.Lambda(lambda y: tf.reduce_sum(y, axis=1))(x)\n",
- "x = layers.BatchNormalization()(x)\n",
- "x = layers.Dense(64)(x)\n",
- "x = layers.LeakyReLU()(x)\n",
- "x = layers.Dense(64)(x)\n",
- "x = layers.LeakyReLU()(x)\n",
- "x = layers.Dense(16)(x)\n",
- "x = layers.LeakyReLU()(x)\n",
- "output = layers.Dense(5, dtype='float32')(x)\n",
- "model_st = keras.models.Model(inputs=inputs, outputs=output)\n",
- "model.summary()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.compile(\n",
- " loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
- " optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n",
- " metrics=[keras.metrics.SparseCategoricalAccuracy()],\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import matplotlib.pyplot as plt\n",
- "fig,axes = plt.subplots(2)\n",
- "\n",
- "axes[0].plot(history.history[\"sparse_categorical_accuracy\"])\n",
- "axes[0].plot(history.history[\"val_sparse_categorical_accuracy\"])\n",
- "axes[0].set_title(\"Accuracy\")\n",
- "axes[0].legend([\"Training\", \"Validation\"])\n",
- "\n",
- "axes[1].plot(history.history[\"loss\"])\n",
- "axes[1].plot(history.history[\"val_loss\"])\n",
- "axes[1].legend([\"Training\", \"Validation\"])\n",
- "axes[1].set_title(\"Loss\")\n",
- "\n",
- "fig.show()\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now we finally evaluate the performance by plotting the ROC curves for the different classes:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import matplotlib.pyplot as plt\n",
- "%matplotlib inline\n",
- "from sklearn.metrics import roc_curve, auc\n",
- "predict_val = tf.nn.softmax(model.predict(X_val))\n",
- "df = pd.DataFrame()\n",
- "fpr = {}\n",
- "tpr = {}\n",
- "auc1 = {}\n",
- "\n",
- "plt.figure()\n",
- "for i, label in enumerate(label_names):\n",
- "\n",
- " df[label] = y_val_onehot[:,i]\n",
- " df[label + '_pred'] = predict_val[:,i]\n",
- "\n",
- " fpr[label], tpr[label], threshold = roc_curve(df[label],df[label+'_pred'])\n",
- "\n",
- " auc1[label] = auc(fpr[label], tpr[label])\n",
- "\n",
- " plt.plot(tpr[label],fpr[label],label='%s tagger, auc = %.1f%%'%(label,auc1[label]*100.))\n",
- "plt.semilogy()\n",
- "plt.xlabel(\"sig. efficiency\")\n",
- "plt.ylabel(\"bkg. mistag rate\")\n",
- "plt.ylim(0.000001,1)\n",
- "plt.grid(True)\n",
- "plt.legend(loc='lower right')\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Optional Excercise\n",
- "\n",
- "The original paper also use MH mechanism in the decoder step (while we used a simple sum over the latent space nodes). If you would like to try it out the `Lambda` layer should be replaced with the `PoolingByMultiHeadAttention` block below.\n",
- "\n",
- "Consider also the fact that it might be hard to train a Transformer architecture of this kind over the rather small dataset used here. Check out [this other dataset](https://events.mcs.cmu.edu/us-cms-2023/) for increased statistics or [this notebook](https://github.com/DLii-Research/tf-settransformer/blob/master/examples/mnist_pointcloud.ipynb) for a simpler task.\n",
- "\n",
- "Below is the starting point for a smarter decoder:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "9qHj1_Y7ZU-R"
- },
- "outputs": [],
- "source": [
- "class PoolingByMultiHeadAttention(tf.keras.layers.Layer):\n",
- " def __init__(self, num_heads, hidden_units, mlp_hidden_units=128, num_seeds=1, **kwargs):\n",
- " super(PoolingByMultiHeadAttention, self).__init__(**kwargs)\n",
- " self.num_heads = num_heads\n",
- " self.hidden_units = hidden_units\n",
- " self.mlp_hidden_units = mlp_hidden_units\n",
- " self.num_seeds = num_seeds\n",
- " \n",
- " def build(self, input_shape):\n",
- " \n",
- " self.attention = tf.keras.layers.MultiHeadAttention(num_heads=self.num_heads, \n",
- " key_dim=self.hidden_units)\n",
- " \n",
- " self.seed_vectors = self.add_weight(\n",
- " shape=(1, self.num_seeds, self.hidden_units),\n",
- " initializer=\"random_normal\",\n",
- " trainable=True,\n",
- " name=\"Seeds\")\n",
- "\n",
- " self.feedforward = tf.keras.Sequential([\n",
- " layers.Dense(units=self.mlp_hidden_units, activation=\"relu\"),\n",
- " layers.Dense(units=self.hidden_units)\n",
- " ])\n",
- " self.layer_norm1 = layers.LayerNormalization(epsilon=1e-6)\n",
- " self.layer_norm2 = layers.LayerNormalization(epsilon=1e-6)\n",
- " super(PoolingByMultiHeadAttention, self).build(input_shape)\n",
- "\n",
- " def call(self, inputs, training=None):\n",
- " a = tf.expand_dims(self.seed_vectors, axis=0)\n",
- " seeds = tf.tile(self.seed_vectors, [tf.shape(inputs)[0], 1, 1])\n",
- " attention_output = self.attention(seeds, inputs)[0]\n",
- " attention_output = self.layer_norm1(seeds + attention_output)\n",
- " feedforward_output = self.feedforward(attention_output)\n",
- " block_output = self.layer_norm2(attention_output + feedforward_output)\n",
- " return block_output"
- ]
- }
- ],
- "metadata": {
- "accelerator": "GPU",
- "colab": {
- "authorship_tag": "ABX9TyPn4xtio5MeIQMG/e23naQt",
- "include_colab_link": true,
- "provenance": []
- },
- "gpuClass": "standard",
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.16"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "view-in-github"
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "BsonEqBekjyy"
+ },
+ "source": [
+ "# Jet Tagging with Permutation Invariance\n",
+ "Author: Abhijith Gandrakota, Jennifer Ngadiuba\n",
+ "\n",
+ "In this notebook we will see an implementation of the Transformer architecture for sets applied to the jet tagging task. By *set* we mean here a point cloud, i.e. a set of nodes without edges. Instead of relying on pre-defined edges, we will use Multi-Head Attention to learn which nodes (or particles) have strong pair-wise interactions.\n",
+ "\n",
+ "The architecture was introduced by [J. Lee et al. (ICML 2019)](https://arxiv.org/abs/1810.00825) -- specifically designed to model interactions among elements in the input set without pre-defined edges. The model consists of an encoder and a decoder, both of which rely on attention mechanisms, as in the original Transformer implementation [by Vaswani et al.](https://arxiv.org/abs/1706.03762). The main difference is that positional encoding is removed, plus some other low-level adaptations.\n",
+ "\n",
+ "We will use tensorflow for this implementation.\n",
+ "\n",
+ "Before you start, choose GPU as a hardware accelerator for this notebook. To do this first go to Edit -> Notebook Settings -> Choose GPU as a hardware accelerator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "A7OS3w5WRSCj"
+ },
+ "outputs": [],
+ "source": [
+ "import tensorflow as tf\n",
+ "from tensorflow import keras\n",
+ "from tensorflow.keras import layers\n",
+ "import h5py\n",
+ "import numpy as np\n",
+ "\n",
+ "# checking if we have GPUs\n",
+ "print(\"Num GPUs Available: \", len(tf.config.list_physical_devices(\"GPU\")))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "qwekVVRzneqU"
+ },
+ "source": [
+ "## Dataset exploration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "BHQGiyC4Pr4R"
+ },
+ "outputs": [],
+ "source": [
+ "! curl https://cernbox.cern.ch/s/6Ec5pGFEpFWeH6S/download -o Data-MLtutorial.tar.gz\n",
+ "! tar -xvzf Data-MLtutorial.tar.gz\n",
+ "! ls Data-MLtutorial/JetDataset/\n",
+ "! rm Data-MLtutorial.tar.gz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "J9ZLcoKpPteG"
+ },
+ "outputs": [],
+ "source": [
+ "# let's open the file\n",
+ "data_dir = \"Data-MLtutorial/JetDataset/\"\n",
+ "fileIN = data_dir + \"jetImage_7_100p_30000_40000.h5\"\n",
+ "f = h5py.File(fileIN)\n",
+ "# and see what it contains\n",
+ "print(list(f.keys()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Ktx1VjNoOu4c"
+ },
+ "source": [
+ "* 'jetImage', 'jetImageECAL' and 'jetImageHCAL' contain the image representation of the jets. We will not use them today but build our point cloud from the other information.\n",
+ "* 'jetConstituentList' is the list of particles contained in the jet. For each particle, a list of relevant quantities is stored. This is the dataset we will consider in this notebook.\n",
+ "* 'particleFeatureNames' is the list of the names corresponding to the quantities contained in 'jetConstituentList'\n",
+ "* 'jets' is the list of jets with the high-level jet features stored. We will only use the jet ID from it, indices [-6:-1]\n",
+ "* 'jetFeatureNames' is the list of the names corresponding to the quantities contained in 'jets'. These quantities are built using physics knowledge and correspond to high-level information and features per graph (as opposed to per node)\n",
+ "\n",
+ "The first 100 highest transverse momentum $p_T$ particles are considered for each jet.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Re7oXWWmPxz9"
+ },
+ "outputs": [],
+ "source": [
+ "target_onehot = np.array([])\n",
+ "jetList = np.array([])\n",
+ "jetImages = np.array([])\n",
+ "features_names = dict()\n",
+ "datafiles = [\n",
+ " \"jetImage_7_100p_0_10000.h5\",\n",
+ " \"jetImage_7_100p_10000_20000.h5\",\n",
+ " \"jetImage_7_100p_30000_40000.h5\",\n",
+ " \"jetImage_7_100p_40000_50000.h5\",\n",
+ " \"jetImage_7_100p_50000_60000.h5\",\n",
+ "]\n",
+ "for i_f, fileIN in enumerate(datafiles):\n",
+ " print(\"Appending %s\" % fileIN)\n",
+ " f = h5py.File(data_dir + fileIN)\n",
+ " jetList_file = np.array(f.get(\"jetConstituentList\"))\n",
+ " target_file = np.array(f.get(\"jets\")[0:, -6:-1])\n",
+ " jetImages_file = np.array(f.get(\"jetImage\"))\n",
+ " jetList = np.concatenate([jetList, jetList_file], axis=0) if jetList.size else jetList_file\n",
+ " target_onehot = (\n",
+ " np.concatenate([target_onehot, target_file], axis=0) if target_onehot.size else target_file\n",
+ " )\n",
+ " jetImages = (\n",
+ " np.concatenate([jetImages, jetImages_file], axis=0) if jetImages.size else jetImages_file\n",
+ " )\n",
+ " del jetList_file, target_file, jetImages_file\n",
+ " # save particles/nodes features names and their indecies in a dictionary\n",
+ " if i_f == 0:\n",
+ " for feat_idx, feat_name in enumerate(list(f[\"particleFeatureNames\"])[:-1]):\n",
+ " features_names[feat_name.decode(\"utf-8\").replace(\"j1_\", \"\")] = feat_idx\n",
+ " f.close()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "7kQnL9vkP4rK"
+ },
+ "source": [
+ "The ground truth is incorporated in the ['j_g', 'j_q', 'j_w', 'j_z', 'j_t'] vector of booleans, taking the form\n",
+ "* [1, 0, 0, 0, 0] for gluons\n",
+ "* [0, 1, 0, 0, 0] for quarks\n",
+ "* [0, 0, 1, 0, 0] for W\n",
+ "* [0, 0, 0, 1, 0] for Z \n",
+ "* [0, 0, 0, 0, 1] for top quarks\n",
+ "\n",
+ "This is what is called 'one-hot' encoding of a discrete label (typical of ground truth for classification problems). These labels are the 'target' for our classification tasks. Let's convert it back to single-column encoding:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "84NSj2W7P477"
+ },
+ "outputs": [],
+ "source": [
+ "print(\"Labels for the first five entries in the dataset, one-hot encoded:\")\n",
+ "for i in range(5):\n",
+ " print(target_onehot[i])\n",
+ "print(target_onehot.shape)\n",
+ "target = np.argmax(target_onehot, axis=1)\n",
+ "print(target.shape)\n",
+ "print(\"Labels for the first five entries in the dataset, single column encoded:\")\n",
+ "for i in range(0, 5):\n",
+ " print(target[i])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "mqsd_aP__RIi"
+ },
+ "source": [
+ "Now our labels correspond to:\n",
+ "* 0 for gluons\n",
+ "* 1 for quarks\n",
+ "* 2 for W\n",
+ "* 3 for Z \n",
+ "* 4 for top quarks\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "hyP15oxhP5ek"
+ },
+ "outputs": [],
+ "source": [
+ "num_classes = len(np.unique(target))\n",
+ "label_names = [\"gluon\", \"quark\", \"W\", \"Z\", \"top\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Ik-6OX0LMJW7"
+ },
+ "source": [
+ "Now let's inspect our data. Each jet is a point cloud/graph with 100 particles/nodes, each of which has 16 features. We have a double-index dataset: (jet index, particle index). The list is cut at 100 constituents per jet. If fewer constituents are present in the jet/point cloud, the dataset is completed by filling it with 0s (zero padding). Note: zero-padding is not used during the training, it is only used to store the ragged dataset.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "YfHRopq0P8tW"
+ },
+ "outputs": [],
+ "source": [
+ "print(\"Jets shape : \", jetList.shape)\n",
+ "print(\"Target/Labels shape : \", target.shape)\n",
+ "print(\"Particles/Nodes features : \", list(features_names.keys()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "VBYwH4t8MhHm"
+ },
+ "source": [
+ "We are not interested in all features for now. For now we will only consider the same node features as were considered in the ParticleNet paper: "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "QWtB3vTWP_QY"
+ },
+ "outputs": [],
+ "source": [
+ "features_to_consider = \"etarel,phirel,pt,e,ptrel,erel,deltaR\".split(\",\")\n",
+ "features_idx = [features_names[name] for name in features_to_consider]\n",
+ "jetList = jetList[:, :, features_idx]\n",
+ "print(jetList.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-M8uvPR4mfI7"
+ },
+ "source": [
+ "Let's define the basic hyperparameters:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2mHCuVm6ZJaY"
+ },
+ "outputs": [],
+ "source": [
+ "batch_size = 128\n",
+ "learning_rate = 0.0001\n",
+ "epochs = 20"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "unpVhZNfmotI"
+ },
+ "source": [
+ "In the original paper, multi-head attention is also applied in the decoder step to obtain a smarter pooling operation. For this exercise we will simplify the model and use instead a `Lambda` layer to apply a custom pooling function to the input tensor. In this case, the `Lambda` layer is being used to sum over the set dimension, i.e. over the elements in the output set of the previous layer, which has shape `(batch_size, n_elements, features)`. By summing over this dimension (`axis=1`), we obtain a tensor of shape `(batch_size, features)` that represents an aggregation of each feature over the elements in the set.\n",
+ "\n",
+ "Here is the full model:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "O7rzMn7wRcrP"
+ },
+ "outputs": [],
+ "source": [
+ "inputs = keras.Input(shape=(100, 7), name=\"input\")\n",
+ "x = layers.TimeDistributed(layers.Dense(64))(inputs)\n",
+ "x = layers.TimeDistributed(layers.LeakyReLU())(x)\n",
+ "x = layers.TimeDistributed(layers.Dense(64))(x)\n",
+ "x = layers.TimeDistributed(layers.LeakyReLU())(x)\n",
+ "x = layers.TimeDistributed(layers.Dense(64))(x)\n",
+ "x = layers.TimeDistributed(layers.LeakyReLU())(x)\n",
+ "x = layers.TimeDistributed(layers.Dense(64))(x)\n",
+ "x = layers.TimeDistributed(layers.LeakyReLU())(x)\n",
+ "x = layers.Lambda(lambda y: tf.reduce_sum(y, axis=1))(x)\n",
+ "x = layers.BatchNormalization()(x)\n",
+ "x = layers.Dense(64)(x)\n",
+ "x = layers.LeakyReLU()(x)\n",
+ "x = layers.Dense(64)(x)\n",
+ "x = layers.LeakyReLU()(x)\n",
+ "x = layers.Dense(16)(x)\n",
+ "x = layers.LeakyReLU()(x)\n",
+ "output = layers.Dense(5, dtype=\"float32\")(x)\n",
+ "model = keras.models.Model(inputs=inputs, outputs=output)\n",
+ "model.summary()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "G8NI-_bYdSAq"
+ },
+ "outputs": [],
+ "source": [
+ "model.compile(\n",
+ " loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
+ " optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n",
+ " metrics=[keras.metrics.SparseCategoricalAccuracy()],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "DhAKhgMMcrwa"
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "X_train, X_val, y_train, y_val, y_train_onehot, y_val_onehot = train_test_split(\n",
+ " jetList, target, target_onehot, test_size=0.1, shuffle=True\n",
+ ")\n",
+ "print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)\n",
+ "del jetList, target, target_onehot"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "lz7rfyeCdNF0"
+ },
+ "outputs": [],
+ "source": [
+ "history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Egcr8vMhp-2v"
+ },
+ "source": [
+ "We can now plot the validation and training loss evolution over the epochs:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "sjTOMuzAqGEr"
+ },
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "fig, axes = plt.subplots(2)\n",
+ "\n",
+ "axes[0].plot(history.history[\"sparse_categorical_accuracy\"])\n",
+ "axes[0].plot(history.history[\"val_sparse_categorical_accuracy\"])\n",
+ "axes[0].set_title(\"Accuracy\")\n",
+ "axes[0].legend([\"Training\", \"Validation\"])\n",
+ "\n",
+ "axes[1].plot(history.history[\"loss\"])\n",
+ "axes[1].plot(history.history[\"val_loss\"])\n",
+ "axes[1].legend([\"Training\", \"Validation\"])\n",
+ "axes[1].set_title(\"Loss\")\n",
+ "\n",
+ "fig.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "CwrPPStDrS4J"
+ },
+ "source": [
+ "Now we finally evaluate the performance by plotting the ROC curves for the different classes:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "JKM0yYFfecJh"
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "%matplotlib inline\n",
+ "from sklearn.metrics import roc_curve, auc\n",
+ "\n",
+ "predict_val = tf.nn.softmax(model.predict(X_val))\n",
+ "df = pd.DataFrame()\n",
+ "fpr = {}\n",
+ "tpr = {}\n",
+ "auc1 = {}\n",
+ "\n",
+ "plt.figure()\n",
+ "for i, label in enumerate(label_names):\n",
+ "\n",
+ " df[label] = y_val_onehot[:, i]\n",
+ " df[label + \"_pred\"] = predict_val[:, i]\n",
+ "\n",
+ " fpr[label], tpr[label], threshold = roc_curve(df[label], df[label + \"_pred\"])\n",
+ "\n",
+ " auc1[label] = auc(fpr[label], tpr[label])\n",
+ "\n",
+ " plt.plot(tpr[label], fpr[label], label=\"%s tagger, auc = %.1f%%\" % (label, auc1[label] * 100.0))\n",
+ "plt.semilogy()\n",
+ "plt.xlabel(\"sig. efficiency\")\n",
+ "plt.ylabel(\"bkg. mistag rate\")\n",
+ "plt.ylim(0.000001, 1)\n",
+ "plt.grid(True)\n",
+ "plt.legend(loc=\"lower right\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "IzxPDanrrYZB"
+ },
+ "source": [
+ "As you can see, the performance is not as good as that of other models we have trained on the same dataset. As mentioned at the beginning of the notebook, training a transformer might be tricky. You can try the optional exercise below to improve the performance and surpass the other models."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Multi Head Attention recap\n",
+ "\n",
+ "Assume we have $n$ query vectors (corresponding to the $n$ elements in the set) each with dimension $d_q : Q \\in \\mathbb{R}^{n\\times d_q}$. In the jet tagging dataset $n=100$ and $d_q=7$.\n",
+ "\n",
+ "An attention function $\\mathrm{Att}(Q,K,V)$ is a function that maps queries $Q$ to outputs using $n_v$ key-value pairs $K \\in \\mathbb{R}^{n_v \\times d_q}, V \\in \\mathbb{R}^{n_v\\times d_v}$:\n",
+ "\n",
+ "$$\n",
+ "\\mathrm{Att}(Q,K,V;\\omega) = \\omega(QK^{T})V.\n",
+ "$$\n",
+ "\n",
+ "The pairwise dot product $QK^\\mathrm{T} \\in \\mathbb{R}^{n\\times n_v}$ measures how similar each pair of query and key vectors is, with weights computed with an activation function $\\omega$. The output $\\omega(QK^{T})V$ is a weighted sum of $V$ where a value gets more weight if its corresponding key has a larger dot product with the query.\n",
+ "\n",
+ "Instead of computing a single attention function, the **multi-head attention** method first projects $Q, K, V$ onto $h$ different $d^M_q,d^M_q,d^M_v$-dimensional vectors, respectively. An attention function $\\mathrm{Att}(\\cdot; \\omega_j)$ is applied to each of these $h$ projections. The output is a linear transformation of the concatenation of all attention outputs:\n",
+ "\n",
+ "$$\n",
+ "\\mathrm{Multihead}(Q, K, V ; \\lambda, \\omega) = \\mathrm{concat}(O_1,..., O_h)W^O\n",
+ "$$\n",
+ "\n",
+ "$$\n",
+ "O_j = \\mathrm{Att}(QW^Q_j, KW^K_j, VW^V_j ; \\omega_j )\n",
+ "$$\n",
+ "\n",
+ "In other words, the model tells you what is the score of a particle in the set knowing its interaction with the other particles in the set given all features but in a way that the features are attended separately.\n",
+ "\n",
+ "Note that $\\mathrm{Multihead}(\\cdot, \\cdot, \\cdot; \\lambda)$ has learnable parameters $\\lambda =$ {$W^Q_j, W^K_j, W^V_j$}$_{j=1,...,h}$ where $W^Q_j, W^K_j \\in \\mathbb{R}^{d_q\\times d^M_q}, W^V_j \\in \\mathbb{R}^{d_v\\times d^M_v}, W^O \\in \\mathbb{R}^{hd^M_v\\times d}$. A typical choice for the dimension hyperparameters is $d^M_q = d_q /h, d^M_v = d_v /h, d = d_q$. For the Set Transformer we set $d_q = d_v = d$ and $d^M_q = d^M_v = d/h$. A scaled softmax $\\omega_j (\\cdot) = \\mathrm{softmax}(\\cdot/\\sqrt{d})$ is used.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Building the Set Transformer\n",
+ "\n",
+ "We will implement a simplified version of the [original Set Transformer architecture](https://arxiv.org/abs/1810.00825). The reason is that Transformers are typically computationally expensive and data hungry. As an optional exercise at the end of the notebook you can try to implement the full model and test it on a simpler problem like the MNIST dataset classification (or on a larger jet class dataset).\n",
+ "\n",
+ "The architecture is based on the block called `MAB` (= Multihead Attention Block) which implements the following:\n",
+ "\n",
+ "$$\n",
+    "\\mathrm{MAB}(X, X) = \\mathrm{LayerNorm}(H + \\mathrm{rFF}(H))\n",
+ "$$\n",
+ "\n",
+ "$$\n",
+    "H = \\mathrm{LayerNorm}(X + \\mathrm{Multihead}(X, X, X ; \\omega))\n",
+ "$$\n",
+ "\n",
+ "where $X \\in \\mathbb{R}^{n\\times d}$ is the input set and $\\mathrm{rFF}$ is any feedforward layer. Since $Q=K=V=X$, the MAB takes a set and performs *self-attention* between the elements in the set, resulting in a set of equal size. Since the output of MAB contains information about pairwise interactions among the elements in the input set $X$, we can stack multiple MABs to encode higher order interactions. This stack is the *encoder* part of the transformer. \n",
+ "\n",
+    "The `LayerNorm` normalizes the activations of a layer across the last dimension (feature dimension) of the input tensor. Specifically, for every element of every instance it subtracts the mean and divides by the standard deviation, both computed over the features of that element only. Like `BatchNormalization`, it has learnable $\\gamma$ (scaling) and $\\beta$ (shifting) parameters. The difference with respect to `BatchNormalization` is that the normalization is performed independently for each instance in the batch. `LayerNorm` leads to improved stability when you expect instances of different sizes (or different zero padding degree as in the jet tagging case)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "RORSIHwVRPx4"
+ },
+ "outputs": [],
+ "source": [
+    "class SABTransformerBlock(tf.keras.layers.Layer):\n",
+    "    \"\"\"Set Attention Block: multi-head self-attention plus a feed-forward layer.\n",
+    "\n",
+    "    Computes ``LayerNorm(H + rFF(H))`` with ``H = LayerNorm(X + Multihead(X, X))``.\n",
+    "\n",
+    "    Args:\n",
+    "        num_heads: number of attention heads.\n",
+    "        hidden_units: total attention width; each head gets ``hidden_units // num_heads``.\n",
+    "        mlp_hidden_units: width of the hidden layer of the feed-forward network.\n",
+    "        dropout_rate: rate of the dropout layers created in ``build``\n",
+    "            (they are currently not applied in ``call``).\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, num_heads, hidden_units, mlp_hidden_units=128, dropout_rate=0.1, **kwargs):\n",
+    "        super(SABTransformerBlock, self).__init__(**kwargs)\n",
+    "        self.num_heads = num_heads\n",
+    "        self.hidden_units = hidden_units\n",
+    "        self.mlp_hidden_units = mlp_hidden_units\n",
+    "        self.dropout_rate = dropout_rate\n",
+    "\n",
+    "    def build(self, input_shape):\n",
+    "        \"\"\"Create the sub-layers once the feature size ``input_shape[-1]`` is known.\"\"\"\n",
+    "        self.attention = tf.keras.layers.MultiHeadAttention(\n",
+    "            num_heads=self.num_heads, key_dim=self.hidden_units // self.num_heads\n",
+    "        )\n",
+    "        # The last Dense maps back to the input feature size so that the\n",
+    "        # residual sum in call() is well defined.\n",
+    "        self.feedforward = tf.keras.Sequential(\n",
+    "            [\n",
+    "                layers.Dense(units=self.mlp_hidden_units, activation=\"relu\"),\n",
+    "                # Dropout(rate=self.dropout_rate),\n",
+    "                layers.Dense(units=input_shape[-1]),\n",
+    "            ]\n",
+    "        )\n",
+    "        self.layer_norm1 = layers.LayerNormalization(epsilon=1e-6)\n",
+    "        self.layer_norm2 = layers.LayerNormalization(epsilon=1e-6)\n",
+    "        self.dropout1 = layers.Dropout(rate=self.dropout_rate)\n",
+    "        self.dropout2 = layers.Dropout(rate=self.dropout_rate)\n",
+    "        super(SABTransformerBlock, self).build(input_shape)\n",
+    "\n",
+    "    def call(self, inputs, mask=None):\n",
+    "        \"\"\"Apply self-attention and the feed-forward layer, each with residual + LayerNorm.\"\"\"\n",
+    "        # MultiHeadAttention returns the attention output tensor itself (scores are only\n",
+    "        # returned with return_attention_scores=True), so it must NOT be indexed with [0]:\n",
+    "        # that would pick the first jet of the batch and broadcast it to all the others.\n",
+    "        attention_output = self.attention(inputs, inputs, attention_mask=mask)\n",
+    "        # attention_output = self.dropout1(attention_output)\n",
+    "        attention_output = self.layer_norm1(inputs + attention_output)\n",
+    "        feedforward_output = self.feedforward(attention_output)\n",
+    "        # feedforward_output = self.dropout2(feedforward_output)\n",
+    "        block_output = self.layer_norm2(attention_output + feedforward_output)\n",
+    "        return block_output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Per-particle embedding -> 3 self-attention blocks -> sum pooling -> dense classifier\n",
+    "inputs = keras.Input(shape=(100, 7), name=\"input\")\n",
+    "x = layers.TimeDistributed(layers.Dense(64))(inputs)\n",
+    "x = SABTransformerBlock(num_heads=8, hidden_units=64)(x)\n",
+    "x = SABTransformerBlock(num_heads=8, hidden_units=64)(x)\n",
+    "x = SABTransformerBlock(num_heads=8, hidden_units=64)(x)\n",
+    "# Summing over axis 1 (the set elements) gives a permutation-invariant summary\n",
+    "x = layers.Lambda(lambda y: tf.reduce_sum(y, axis=1))(x)\n",
+    "x = layers.BatchNormalization()(x)\n",
+    "x = layers.Dense(64)(x)\n",
+    "x = layers.LeakyReLU()(x)\n",
+    "x = layers.Dense(64)(x)\n",
+    "x = layers.LeakyReLU()(x)\n",
+    "x = layers.Dense(16)(x)\n",
+    "x = layers.LeakyReLU()(x)\n",
+    "output = layers.Dense(5, dtype=\"float32\")(x)\n",
+    "model_st = keras.models.Model(inputs=inputs, outputs=output)\n",
+    "# Summarize the Set Transformer just built, not a model left over from an earlier cell\n",
+    "model_st.summary()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Compile the Set Transformer (model_st); its last layer outputs logits, hence from_logits=True\n",
+    "model_st.compile(\n",
+    "    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
+    "    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),\n",
+    "    metrics=[keras.metrics.SparseCategoricalAccuracy()],\n",
+    ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Train the Set Transformer (model_st), holding out 10% of the training set for validation\n",
+    "history = model_st.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Top panel: accuracy, bottom panel: loss (training vs. validation, one point per epoch)\n",
+    "fig, axes = plt.subplots(2)\n",
+    "\n",
+    "axes[0].plot(history.history[\"sparse_categorical_accuracy\"])\n",
+    "axes[0].plot(history.history[\"val_sparse_categorical_accuracy\"])\n",
+    "axes[0].set_title(\"Accuracy\")\n",
+    "axes[0].legend([\"Training\", \"Validation\"])\n",
+    "\n",
+    "axes[1].plot(history.history[\"loss\"])\n",
+    "axes[1].plot(history.history[\"val_loss\"])\n",
+    "axes[1].set_title(\"Loss\")\n",
+    "axes[1].set_xlabel(\"Epoch\")\n",
+    "axes[1].legend([\"Training\", \"Validation\"])\n",
+    "\n",
+    "# Keep the second title from overlapping the tick labels of the first panel\n",
+    "fig.tight_layout()\n",
+    "# plt.show() instead of fig.show(): the latter needs a GUI backend and can emit a\n",
+    "# warning with the inline notebook backend; this also matches the ROC cell below.\n",
+    "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now we finally evaluate the performance by plotting the ROC curves for the different classes:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "%matplotlib inline\n",
+    "from sklearn.metrics import roc_curve, auc\n",
+    "\n",
+    "# Evaluate the Set Transformer (model_st). It outputs logits, so apply a softmax to get\n",
+    "# per-class probabilities; convert to numpy before storing them in the DataFrame.\n",
+    "predict_val = tf.nn.softmax(model_st.predict(X_val)).numpy()\n",
+    "df = pd.DataFrame()\n",
+    "fpr = {}\n",
+    "tpr = {}\n",
+    "auc1 = {}\n",
+    "\n",
+    "plt.figure()\n",
+    "for i, label in enumerate(label_names):\n",
+    "\n",
+    "    df[label] = y_val_onehot[:, i]\n",
+    "    df[label + \"_pred\"] = predict_val[:, i]\n",
+    "\n",
+    "    # One-vs-rest ROC curve for this class; the thresholds are not needed\n",
+    "    fpr[label], tpr[label], _ = roc_curve(df[label], df[label + \"_pred\"])\n",
+    "\n",
+    "    auc1[label] = auc(fpr[label], tpr[label])\n",
+    "\n",
+    "    # Axes are deliberately swapped: signal efficiency on x, mistag rate on y\n",
+    "    plt.plot(tpr[label], fpr[label], label=\"%s tagger, auc = %.1f%%\" % (label, auc1[label] * 100.0))\n",
+    "plt.semilogy()\n",
+    "plt.xlabel(\"sig. efficiency\")\n",
+    "plt.ylabel(\"bkg. mistag rate\")\n",
+    "plt.ylim(0.000001, 1)\n",
+    "plt.grid(True)\n",
+    "plt.legend(loc=\"lower right\")\n",
+    "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "## Optional Exercise\n",
+ "\n",
+    "The original paper also uses a multi-head attention mechanism in the decoder step (while we used a simple sum over the latent space nodes). If you would like to try it out, the `Lambda` layer should be replaced with the `PoolingByMultiHeadAttention` block below.\n",
+ "\n",
+ "Consider also the fact that it might be hard to train a Transformer architecture of this kind over the rather small dataset used here. Check out [this other dataset](https://events.mcs.cmu.edu/us-cms-2023/) for increased statistics or [this notebook](https://github.com/DLii-Research/tf-settransformer/blob/master/examples/mnist_pointcloud.ipynb) for a simpler task.\n",
+ "\n",
+ "Below is the starting point for a smarter decoder:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "9qHj1_Y7ZU-R"
+ },
+ "outputs": [],
+ "source": [
+    "class PoolingByMultiHeadAttention(tf.keras.layers.Layer):\n",
+    "    \"\"\"Pooling by Multihead Attention: learnable seed vectors attend to the input set.\n",
+    "\n",
+    "    The seeds are used as queries and the input elements as keys/values, so the set is\n",
+    "    summarized into ``num_seeds`` vectors of size ``hidden_units``. The output keeps the\n",
+    "    seed axis, i.e. it is 3-dimensional even for ``num_seeds=1``.\n",
+    "\n",
+    "    Args:\n",
+    "        num_heads: number of attention heads.\n",
+    "        hidden_units: size of the seed vectors and of the output features;\n",
+    "            each head gets ``hidden_units // num_heads``.\n",
+    "        mlp_hidden_units: width of the hidden layer of the feed-forward network.\n",
+    "        num_seeds: number of learnable seed vectors (i.e. of pooled outputs).\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, num_heads, hidden_units, mlp_hidden_units=128, num_seeds=1, **kwargs):\n",
+    "        super(PoolingByMultiHeadAttention, self).__init__(**kwargs)\n",
+    "        self.num_heads = num_heads\n",
+    "        self.hidden_units = hidden_units\n",
+    "        self.mlp_hidden_units = mlp_hidden_units\n",
+    "        self.num_seeds = num_seeds\n",
+    "\n",
+    "    def build(self, input_shape):\n",
+    "        \"\"\"Create the attention layer, the seed vectors and the feed-forward network.\"\"\"\n",
+    "        # Per-head size d/h, consistent with SABTransformerBlock and the text above\n",
+    "        self.attention = tf.keras.layers.MultiHeadAttention(\n",
+    "            num_heads=self.num_heads, key_dim=self.hidden_units // self.num_heads\n",
+    "        )\n",
+    "\n",
+    "        # Leading axis of size 1 so the seeds can be tiled along the batch axis in call()\n",
+    "        self.seed_vectors = self.add_weight(\n",
+    "            shape=(1, self.num_seeds, self.hidden_units),\n",
+    "            initializer=\"random_normal\",\n",
+    "            trainable=True,\n",
+    "            name=\"Seeds\",\n",
+    "        )\n",
+    "\n",
+    "        self.feedforward = tf.keras.Sequential(\n",
+    "            [\n",
+    "                layers.Dense(units=self.mlp_hidden_units, activation=\"relu\"),\n",
+    "                layers.Dense(units=self.hidden_units),\n",
+    "            ]\n",
+    "        )\n",
+    "        self.layer_norm1 = layers.LayerNormalization(epsilon=1e-6)\n",
+    "        self.layer_norm2 = layers.LayerNormalization(epsilon=1e-6)\n",
+    "        super(PoolingByMultiHeadAttention, self).build(input_shape)\n",
+    "\n",
+    "    def call(self, inputs, training=None):\n",
+    "        \"\"\"Pool ``inputs`` over the set axis using the seed vectors as queries.\"\"\"\n",
+    "        # Replicate the seeds once per instance in the batch\n",
+    "        seeds = tf.tile(self.seed_vectors, [tf.shape(inputs)[0], 1, 1])\n",
+    "        # MultiHeadAttention returns the output tensor itself; indexing it with [0] would\n",
+    "        # select the first instance of the batch and broadcast it to all the others.\n",
+    "        attention_output = self.attention(seeds, inputs)\n",
+    "        attention_output = self.layer_norm1(seeds + attention_output)\n",
+    "        feedforward_output = self.feedforward(attention_output)\n",
+    "        block_output = self.layer_norm2(attention_output + feedforward_output)\n",
+    "        return block_output"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "authorship_tag": "ABX9TyPn4xtio5MeIQMG/e23naQt",
+ "include_colab_link": true,
+ "provenance": []
+ },
+ "gpuClass": "standard",
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.16"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
}