@@ -93,6 +93,31 @@ def __init__(
        self.eos_token_id = eos_token_id


+        # Measure FLOPS - for forward pass only
+        self.flops_per_batch = lightning.fabric.utilities.throughput.measure_flops(self.model, self._sample_forward)
+
+    def _sample_forward(self):
+        batch_size = 64  # FIXME
+        batch = {
+            "input_ids": torch.randint(0, self.vocab_size, (batch_size, self.max_length)),
+            "attention_mask": torch.ones(batch_size, self.max_length),
+        }
+
+        tokens = batch["input_ids"]
+        B, length = tokens.shape
+        tokens = tokens.view(-1)
+        attention_mask = batch["attention_mask"].view(-1)
+
+        cu_seqlens = torch.tensor([0] + [(i + 1) * length for i in range(B)], dtype=torch.int32).cuda()
+
+        return self.model(
+            tokens,
+            attention_mask=attention_mask,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=self.max_length
+        )
+
+
    def training_step(self, batch, batch_idx):
        loss = self._compute_loss(batch)
        ppl = torch.exp(loss)
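`flops_per_batch` is the attribute that Lightning's `ThroughputMonitor` callback looks up on the `LightningModule` each iteration, so measuring it once at construction time lets the callback report FLOP/s alongside samples/s. A minimal sketch of wiring that callback into a `Trainer`, assuming batches keep the same dict layout used by `_sample_forward` (the `Trainer` arguments are placeholders):

```python
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ThroughputMonitor

# The callback needs the number of samples per batch; read it off the
# "input_ids" tensor of the batch dict.
throughput = ThroughputMonitor(batch_size_fn=lambda batch: batch["input_ids"].shape[0])

trainer = Trainer(callbacks=[throughput], log_every_n_steps=10)
# trainer.fit(model, datamodule)  # flops_per_batch is picked up automatically
```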
@@ -200,33 +225,5 @@ def _mask_inputs(self, train_inputs: torch.Tensor):

        return masked_inputs

-    def setup(self, stage: str | None):
-        """Used to measure FLOPs"""
-
-        with torch.device("meta"):
-            model = FlexBERT(self.config)
-
-        def sample_forward():
-            batch_size = 64  # TODO figure out how to avoid setting this manually
-            batch = {
-                "input_ids": torch.randint(0, model.vocab_size, (batch_size, model.max_length)),
-                "attention_mask": torch.ones(batch_size, model.max_length),
-            }

-            tokens = batch["input_ids"]
-            B, length = tokens.shape
-            tokens = tokens.view(-1)
-            attention_mask = batch["attention_mask"].view(-1)
-
-            cu_seqlens = torch.tensor([0] + [(i + 1) * length for i in range(B)], dtype=torch.int32).cuda()
-
-            return model.model(
-                tokens,
-                attention_mask=attention_mask,
-                cu_seqlens=cu_seqlens,
-                max_seqlen=model.max_length
-            )
-
-        # Measure FLOPS - for forward pass only
-        self.flops_per_batch = lightning.fabric.utilities.throughput.measure_flops(model, sample_forward)

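For context on the `measure_flops` call that now lives in `__init__`: called without a `loss_fn` it counts forward-pass FLOPs only (matching the comment above), while supplying a `loss_fn` also traces the backward pass. A rough sketch with a throwaway toy module on the meta device, which is how the removed `setup()` avoided allocating real weights; none of these names come from this repo:

```python
import torch
import torch.nn as nn
from lightning.fabric.utilities.throughput import measure_flops

# Hypothetical stand-in model; the meta device avoids materializing weights.
with torch.device("meta"):
    toy = nn.Sequential(nn.Linear(512, 512), nn.GELU(), nn.Linear(512, 512))
    x = torch.randn(64, 512)

fwd_only = measure_flops(toy, lambda: toy(x))                            # forward pass only
fwd_bwd = measure_flops(toy, lambda: toy(x), loss_fn=lambda y: y.sum())  # forward + backward
print(fwd_only, fwd_bwd)
```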