Commit 264d96c

yzh119, Ubuntu, BarclayII, and VoVAllen authored
[bugfix] Fix a bunch of examples to be compatible with dgl 0.5 (dmlc#1957)
* upd
* upd
* upd
* upd
* upd
* upd
* fix pinsage also
* upd
* upd
* upd

Co-authored-by: Ubuntu <ubuntu@ip-172-31-29-3.us-east-2.compute.internal>
Co-authored-by: Quan Gan <coin2028@hotmail.com>
Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>
1 parent dcf4641 commit 264d96c

25 files changed: +101 −118 lines changed

examples/mxnet/gcmc/model.py (+1 −1)

@@ -220,7 +220,7 @@ def forward(self, graph, ufeat, ifeat):
         for i in range(self._num_basis_functions):
             graph.nodes['user'].data['h'] = F.dot(ufeat, self.Ps[i].data())
             graph.apply_edges(fn.u_dot_v('h', 'h', 'sr'))
-            basis_out.append(graph.edata['sr'].expand_dims(1))
+            basis_out.append(graph.edata['sr'])
         out = F.concat(*basis_out, dim=1)
         out = self.rate_out(out)
         return out
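Both this MXNet decoder and its PyTorch counterpart (examples/pytorch/gcmc/model.py, below) drop the manual dimension expansion on the edge scores. A minimal sketch of the behavior this relies on, assuming DGL >= 0.5 with the PyTorch backend:

```python
# fn.u_dot_v keeps a trailing dimension of size 1 in DGL >= 0.5, so the
# edge score is already 2-D and needs no expand_dims/unsqueeze before concat.
import dgl
import dgl.function as fn
import torch

g = dgl.graph(([0, 1], [1, 2]))            # toy graph: 3 nodes, 2 edges
g.ndata['h'] = torch.randn(3, 4)
g.apply_edges(fn.u_dot_v('h', 'h', 'sr'))
print(g.edata['sr'].shape)                 # torch.Size([2, 1])
```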

examples/mxnet/gcn/train.py (+1 −1)

@@ -36,7 +36,7 @@ def main(args):
     else:
         cuda = True
         ctx = mx.gpu(args.gpu)
-        g = g.to(ctx)
+        g = g.int().to(ctx)

     features = g.ndata['feat']
     labels = mx.nd.array(g.ndata['label'], dtype="float32", ctx=ctx)
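`g.int().to(ctx)` is the recurring fix in this commit: DGL 0.5 graphs carry an index dtype (idtype), int64 by default, and these examples cast it to int32 before moving the graph onto the device. A hedged sketch of the idiom, assuming DGL >= 0.5, the MXNet backend, and an available GPU:

```python
import mxnet as mx
import dgl

g = dgl.graph(([0, 1], [1, 2]))   # idtype defaults to int64
ctx = mx.gpu(0)                   # assumption: GPU 0 exists
g = g.int().to(ctx)               # cast indices to int32, then move to ctx
print(g.idtype)                   # now int32
```

The cast touches only the graph's index storage, not its node or edge features; `g.long()` would cast back to int64.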

examples/mxnet/graphsage/main.py (+2 −2)

@@ -69,7 +69,7 @@ def main(args):
     else:
         cuda = True
         ctx = mx.gpu(args.gpu)
-        g = g.to(ctx)
+        g = g.int().to(ctx)

     features = g.ndata['feat']
     labels = mx.nd.array(g.ndata['label'], dtype="float32", ctx=ctx)
@@ -164,4 +164,4 @@ def main(args):
     args = parser.parse_args()
     print(args)

-    main(args)
+    main(args)

(The final `main(args)` line is textually identical on both sides; here, and in sgc.py below, the diff most likely reflects adding a trailing newline at end of file.)

examples/mxnet/sgc/sgc.py (+2 −2)

@@ -39,7 +39,7 @@ def main(args):
     else:
         cuda = True
         ctx = mx.gpu(args.gpu)
-        g = g.to(ctx)
+        g = g.int().to(ctx)

     features = g.ndata['feat']
     labels = mx.nd.array(g.ndata['label'], dtype="float32", ctx=ctx)
@@ -123,4 +123,4 @@ def main(args):
     args = parser.parse_args()
     print(args)

-    main(args)
+    main(args)

examples/pytorch/cluster_gcn/cluster_gcn.py (+1 −9)

@@ -11,7 +11,6 @@
 import torch.nn.functional as F
 import dgl
 from dgl.data import register_data_args
-from torch.utils.tensorboard import SummaryWriter

 from modules import GraphSAGE
 from sampler import ClusterIter
@@ -84,7 +83,7 @@ def main(args):
         torch.cuda.set_device(args.gpu)
         val_mask = val_mask.cuda()
         test_mask = test_mask.cuda()
-        g = g.to(args.gpu)
+        g = g.int().to(args.gpu)

     print('labels shape:', g.ndata['label'].shape)
     print("features shape, ", g.ndata['feat'].shape)
@@ -102,7 +101,6 @@ def main(args):

     # logger and so on
     log_dir = save_log_dir(args)
-    writer = SummaryWriter(log_dir)
     logger = Logger(os.path.join(log_dir, 'loggings'))
     logger.write(args)

@@ -148,8 +146,6 @@ def main(args):
             if j % args.log_every == 0:
                 print(f"epoch:{epoch}/{args.n_epochs}, Iteration {j}/"
                       f"{len(cluster_iterator)}:training loss", loss.item())
-                writer.add_scalar('train/loss', loss.item(),
-                                  global_step=j + epoch * len(cluster_iterator))
         print("current memory:",
               torch.cuda.memory_allocated(device=pred.device) / 1024 / 1024)

@@ -164,8 +160,6 @@ def main(args):
                 print('new best val f1:', best_f1)
                 torch.save(model.state_dict(), os.path.join(
                     log_dir, 'best_model.pkl'))
-            writer.add_scalar('val/f1-mic', val_f1_mic, global_step=epoch)
-            writer.add_scalar('val/f1-mac', val_f1_mac, global_step=epoch)

     end_time = time.time()
     print(f'training using time {start_time-end_time}')
@@ -177,8 +171,6 @@ def main(args):
     test_f1_mic, test_f1_mac = evaluate(
         model, g, labels, test_mask, multitask)
     print("Test F1-mic{:.4f}, Test F1-mac{:.4f}". format(test_f1_mic, test_f1_mac))
-    writer.add_scalar('test/f1-mic', test_f1_mic)
-    writer.add_scalar('test/f1-mac', test_f1_mac)

 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='GCN')

examples/pytorch/gat/train.py (+1 −1)

@@ -53,7 +53,7 @@ def main(args):
         cuda = False
     else:
         cuda = True
-        g = g.to(args.gpu)
+        g = g.int().to(args.gpu)

     features = g.ndata['feat']
     labels = g.ndata['label']

examples/pytorch/gat/train_ppi.py (+3 −6)

@@ -60,7 +60,7 @@ def main(args):
     g = train_dataset[0]
     n_classes = train_dataset.num_labels
     num_feats = g.ndata['feat'].shape[1]
-    g = g.to(device)
+    g = g.int().to(device)
     heads = ([args.num_heads] * args.num_layers) + [args.num_out_heads]
     # define the model
     model = GAT(g,
@@ -117,12 +117,9 @@ def main(args):
             if cur_step == patience:
                 break
     test_score_list = []
-    for batch, test_data in enumerate(test_dataloader):
-        subgraph, feats, labels = test_data
+    for batch, subgraph in enumerate(test_dataloader):
         subgraph = subgraph.to(device)
-        feats = feats.to(device)
-        labels = labels.to(device)
-        test_score_list.append(evaluate(feats, model, subgraph, labels.float(), loss_fcn)[0])
+        test_score_list.append(evaluate(subgraph.ndata['feat'], model, subgraph, subgraph.ndata['label'], loss_fcn))
     print("Test F1-Score: {:.4f}".format(np.array(test_score_list).mean()))

 if __name__ == '__main__':
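The test loop changes because, in DGL 0.5, `PPIDataset` items are DGLGraphs whose features and labels ride along in `ndata`, so a `dgl.batch` collate yields a single batched graph rather than a `(graph, feats, labels)` tuple. A hedged sketch of the dataloader this implies (the batch size is illustrative):

```python
import dgl
from dgl.data import PPIDataset
from torch.utils.data import DataLoader

test_dataset = PPIDataset(mode='test')
# DataLoader hands the sampled list of graphs to dgl.batch, which merges them
# into one graph; ndata['feat'] and ndata['label'] are batched along with it.
test_dataloader = DataLoader(test_dataset, batch_size=2, collate_fn=dgl.batch)
for batch, subgraph in enumerate(test_dataloader):
    feats = subgraph.ndata['feat']    # features travel with the graph
    labels = subgraph.ndata['label']
```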

examples/pytorch/gcmc/README.md (−17)

@@ -24,7 +24,6 @@ ml-100k, no feature
 python3 train.py --data_name=ml-100k --use_one_hot_fea --gcn_agg_accum=stack
 ```
 Results: RMSE=0.9088 (0.910 reported)
-Speed: 0.0410s/epoch (vanilla implementation: 0.1008s/epoch)

 ml-100k, with feature
 ```bash
@@ -37,7 +36,6 @@ ml-1m, no feature
 python3 train.py --data_name=ml-1m --gcn_agg_accum=sum --use_one_hot_fea
 ```
 Results: RMSE=0.8377 (0.832 reported)
-Speed: 0.0844s/epoch (vanilla implementation: 1.538s/epoch)

 ml-10m, no feature
 ```bash
@@ -46,7 +44,6 @@ python3 train.py --data_name=ml-10m --gcn_agg_accum=stack --gcn_dropout=0.3 \
 --use_one_hot_fea --gen_r_num_basis_func=4
 ```
 Results: RMSE=0.7800 (0.777 reported)
-Speed: 1.1982/epoch (vanilla implementation: OOM)
 Testbed: EC2 p3.2xlarge instance(Amazon Linux 2)

 ### Train with minibatch on a single GPU
@@ -67,8 +64,6 @@ python3 train_sampling.py --data_name=ml-100k \
 --gpu 0
 ```
 Results: RMSE=0.9380
-Speed: 1.059s/epoch (Run with 70 epoches)
-Speed: 1.046s/epoch (mix_cpu_gpu)

 ml-100k, with feature
 ```bash
@@ -97,8 +92,6 @@ python3 train_sampling.py --data_name=ml-1m \
 --gpu 0
 ```
 Results: RMSE=0.8632
-Speed: 7.852s/epoch (Run with 60 epoches)
-Speed: 7.788s/epoch (mix_cpu_gpu)

 ml-10m, no feature
 ```bash
@@ -126,8 +119,6 @@ python3 train_sampling.py --data_name=ml-10m \
 --gpu 0
 ```
 Results: RMSE=0.8050
-Speed: 394.304s/epoch (Run with 60 epoches)
-Speed: 408.749s/epoch (mix_cpu_gpu)
 Testbed: EC2 p3.2xlarge instance

 ### Train with minibatch on multi-GPU
@@ -151,8 +142,6 @@ python train_sampling.py --data_name=ml-100k \
 --gpu 0,1,2,3,4,5,6,7
 ```
 Result: RMSE=0.9397
-Speed: 1.202s/epoch (Run with only 30 epoches)
-Speed: 1.245/epoch (mix_cpu_gpu)

 ml-100k, with feature
 ```bash
@@ -162,7 +151,6 @@ python train_sampling.py --data_name=ml-100k \
 --gpu 0,1,2,3,4,5,6,7
 ```
 Result: RMSE=0.9655
-Speed: 1.265/epoch (Run with 30 epoches)

 ml-1m, no feature
 ```bash
@@ -182,8 +170,6 @@ python train_sampling.py --data_name=ml-1m \
 --gpu 0,1,2,3,4,5,6,7
 ```
 Results: RMSE=0.8621
-Speed: 11.612s/epoch (Run with 40 epoches)
-Speed: 12.483s/epoch (mix_cpu_gpu)

 ml-10m, no feature
 ```bash
@@ -211,8 +197,6 @@ python train_sampling.py --data_name=ml-10m \
 --gpu 0,1,2,3,4,5,6,7
 ```
 Results: RMSE=0.8084
-Speed: 632.868s/epoch (Run with 30 epoches)
-Speed: 633.397s/epoch (mix_cpu_gpu)
 Testbed: EC2 p3.16xlarge instance

 ### Train with minibatch on CPU
@@ -223,5 +207,4 @@ python3 train_sampling.py --data_name=ml-100k \
 --gcn_agg_accum=stack \
 --gpu -1
 ```
-Speed 1.591s/epoch
 Testbed: EC2 r5.xlarge instance

examples/pytorch/gcmc/model.py (+1 −1)

@@ -340,7 +340,7 @@ def forward(self, graph, ufeat, ifeat):
         for i in range(self._num_basis):
             graph.nodes['user'].data['h'] = ufeat @ self.Ps[i]
             graph.apply_edges(fn.u_dot_v('h', 'h', 'sr'))
-            basis_out.append(graph.edata['sr'].unsqueeze(1))
+            basis_out.append(graph.edata['sr'])
         out = th.cat(basis_out, dim=1)
         out = self.combine_basis(out)
         return out

examples/pytorch/gcmc/train.py (+5 −5)

@@ -105,12 +105,12 @@ def train(args):
     count_num = 0
     count_loss = 0

-    dataset.train_enc_graph = dataset.train_enc_graph.to(args.device)
-    dataset.train_dec_graph = dataset.train_dec_graph.to(args.device)
+    dataset.train_enc_graph = dataset.train_enc_graph.int().to(args.device)
+    dataset.train_dec_graph = dataset.train_dec_graph.int().to(args.device)
     dataset.valid_enc_graph = dataset.train_enc_graph
-    dataset.valid_dec_graph = dataset.valid_dec_graph.to(args.device)
-    dataset.test_enc_graph = dataset.test_enc_graph.to(args.device)
-    dataset.test_dec_graph = dataset.test_dec_graph.to(args.device)
+    dataset.valid_dec_graph = dataset.valid_dec_graph.int().to(args.device)
+    dataset.test_enc_graph = dataset.test_enc_graph.int().to(args.device)
+    dataset.test_dec_graph = dataset.test_dec_graph.int().to(args.device)

     print("Start training ...")
     dur = []
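The GCMC encoder/decoder graphs are heterographs, and the same `.int().to(device)` cast applies to them unchanged. A small sketch (the schema and names below are illustrative, not the example's real ones):

```python
import torch
import dgl

# Toy user-movie heterograph standing in for dataset.train_enc_graph.
enc_graph = dgl.heterograph({
    ('user', 'rates', 'movie'):    ([0, 1], [1, 0]),
    ('movie', 'rated-by', 'user'): ([1, 0], [0, 1]),
})
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
enc_graph = enc_graph.int().to(device)   # int32 indices on the target device
```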

examples/pytorch/gcn/train.py (+1 −1)

@@ -38,7 +38,7 @@ def main(args):
         cuda = False
     else:
         cuda = True
-        g = g.to(args.gpu)
+        g = g.int().to(args.gpu)

     features = g.ndata['feat']
     labels = g.ndata['label']

examples/pytorch/graphsage/train_cv.py (+4 −4)

@@ -109,7 +109,7 @@ def inference(self, g, x, batch_size, device):
             end = start + batch_size
             batch_nodes = nodes[start:end]
             block = dgl.to_block(dgl.in_subgraph(g, batch_nodes), batch_nodes)
-            block = block.to(device)
+            block = block.int().to(device)
             induced_nodes = block.srcdata[dgl.NID]

             h = x[induced_nodes].to(device)
@@ -188,7 +188,7 @@ def load_subtensor(g, labels, blocks, hist_blocks, dev_id, aggregation_on_device
         hist_block = hist_block.to(dev_id)
         hist_block.update_all(fn.copy_u('hist', 'm'), fn.mean('m', 'agg_hist'))

-        block = block.to(dev_id)
+        block = block.int().to(dev_id)
         if not aggregation_on_device:
             hist_block = hist_block.to(dev_id)
         block.dstdata['agg_hist'] = hist_block.dstdata['agg_hist']
@@ -220,8 +220,8 @@ def run(args, dev_id, data):

     # Unpack data
     train_mask, val_mask, in_feats, labels, n_classes, g = data
-    train_nid = train_mask.nonzero()[:, 0]
-    val_nid = val_mask.nonzero()[:, 0]
+    train_nid = train_mask.nonzero().squeeze()
+    val_nid = val_mask.nonzero().squeeze()

     # Create sampler
     sampler = NeighborSampler(g, [int(_) for _ in args.fan_out.split(',')])
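`mask.nonzero()[:, 0]` and `mask.nonzero().squeeze()` are equivalent ways to turn a boolean mask into a 1-D tensor of node IDs; the commit standardizes on `.squeeze()`. In plain PyTorch:

```python
import torch

train_mask = torch.tensor([True, False, True, True])
print(train_mask.nonzero())            # tensor([[0], [2], [3]]), shape (3, 1)
print(train_mask.nonzero().squeeze())  # tensor([0, 2, 3]), shape (3,)
```

One caveat: with exactly one selected node, `.squeeze()` yields a 0-d tensor whereas `[:, 0]` keeps a length-1 vector; the masks in these examples always select many nodes.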

examples/pytorch/graphsage/train_cv_multi_gpu.py (+2 −2)

@@ -262,8 +262,8 @@ def run(proc_id, n_gpus, args, devices, data):

     # Unpack data
     train_mask, val_mask, in_feats, labels, n_classes, g = data
-    train_nid = train_mask.nonzero()[:, 0]
-    val_nid = val_mask.nonzero()[:, 0]
+    train_nid = train_mask.nonzero().squeeze()
+    val_nid = val_mask.nonzero().squeeze()

     # Split train_nid
     train_nid = th.split(train_nid, math.ceil(len(train_nid) // n_gpus))[proc_id]

examples/pytorch/graphsage/train_full.py (+21 −20)

@@ -11,6 +11,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import dgl
 from dgl import DGLGraph
 from dgl.data import register_data_args, load_data
 from dgl.nn.pytorch.conv import SAGEConv
@@ -48,31 +49,27 @@ def forward(self, graph, inputs):
         return h


-def evaluate(model, graph, features, labels, mask):
+def evaluate(model, graph, features, labels, nid):
     model.eval()
     with torch.no_grad():
         logits = model(graph, features)
-        logits = logits[mask]
-        labels = labels[mask]
+        logits = logits[nid]
+        labels = labels[nid]
         _, indices = torch.max(logits, dim=1)
         correct = torch.sum(indices == labels)
         return correct.item() * 1.0 / len(labels)

 def main(args):
     # load and preprocess dataset
     data = load_data(args)
-    features = torch.FloatTensor(data.features)
-    labels = torch.LongTensor(data.labels)
-    if hasattr(torch, 'BoolTensor'):
-        train_mask = torch.BoolTensor(data.train_mask)
-        val_mask = torch.BoolTensor(data.val_mask)
-        test_mask = torch.BoolTensor(data.test_mask)
-    else:
-        train_mask = torch.ByteTensor(data.train_mask)
-        val_mask = torch.ByteTensor(data.val_mask)
-        test_mask = torch.ByteTensor(data.test_mask)
+    g = data[0]
+    features = g.ndata['feat']
+    labels = g.ndata['label']
+    train_mask = g.ndata['train_mask']
+    val_mask = g.ndata['val_mask']
+    test_mask = g.ndata['test_mask']
     in_feats = features.shape[1]
-    n_classes = data.num_labels
+    n_classes = data.num_classes
     n_edges = data.graph.number_of_edges()
     print("""----Data statistics------'
       #Edges %d
@@ -97,11 +94,15 @@ def main(args):
         test_mask = test_mask.cuda()
         print("use cuda:", args.gpu)

+    train_nid = train_mask.nonzero().squeeze()
+    val_nid = val_mask.nonzero().squeeze()
+    test_nid = test_mask.nonzero().squeeze()
+
     # graph preprocess and calculate normalization factor
-    g = data.graph
-    g.remove_edges_from(nx.selfloop_edges(g))
-    g = DGLGraph(g)
+    g = dgl.remove_self_loop(g)
     n_edges = g.number_of_edges()
+    if cuda:
+        g = g.int().to(args.gpu)

     # create GraphSAGE model
     model = GraphSAGE(in_feats,
@@ -126,7 +127,7 @@ def main(args):
         t0 = time.time()
         # forward
         logits = model(g, features)
-        loss = F.cross_entropy(logits[train_mask], labels[train_mask])
+        loss = F.cross_entropy(logits[train_nid], labels[train_nid])

         optimizer.zero_grad()
         loss.backward()
@@ -135,13 +136,13 @@ def main(args):
         if epoch >= 3:
             dur.append(time.time() - t0)

-        acc = evaluate(model, g, features, labels, val_mask)
+        acc = evaluate(model, g, features, labels, val_nid)
         print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
               "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.item(),
                                             acc, n_edges / np.mean(dur) / 1000))

     print()
-    acc = evaluate(model, g, features, labels, test_mask)
+    acc = evaluate(model, g, features, labels, test_nid)
     print("Test Accuracy {:.4f}".format(acc))

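train_full.py is the fullest illustration of the DGL 0.5 dataset API this commit migrates to: datasets index into DGLGraphs, features/labels/split masks live in `ndata`, `num_classes` replaces `num_labels`, and self-loop removal no longer round-trips through networkx. A hedged sketch, using `CoraGraphDataset` as a stand-in for whatever `load_data(args)` returns:

```python
import dgl
from dgl.data import CoraGraphDataset

data = CoraGraphDataset()                  # stand-in dataset (assumption)
g = data[0]                                # the dataset's single graph
features = g.ndata['feat']
labels = g.ndata['label']
train_mask = g.ndata['train_mask']
n_classes = data.num_classes               # was data.num_labels before 0.5
g = dgl.remove_self_loop(g)                # replaces the networkx round-trip
train_nid = train_mask.nonzero().squeeze() # boolean mask -> node IDs
```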