Skip to content

Commit 6f93169

Browse files
committed
docstrings
1 parent 37a1d46 commit 6f93169

File tree

3 files changed

+106
-3
lines changed

3 files changed

+106
-3
lines changed

src/lobster/datasets/_multiplexed_sampling_dataset.py

+79-3
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,80 @@
77

88

99
class MultiplexedSamplingDataset(IterableDataset):
10-
"""Dataset that samples from multiple datasets according to specified weights."""
11-
1210
def __init__(
1311
self,
1412
datasets: Sequence[Dataset | IterableDataset],
1513
weights: Sequence[float | int] = None,
1614
seed: int | None = None,
1715
):
16+
"""Dataset that samples from multiple datasets according to specified weights.
17+
18+
This dataset implements a weighted sampling strategy across multiple source datasets.
19+
For each iteration, it randomly selects a source dataset according to the provided
20+
weights and yields the next item from that dataset. This allows creating custom
21+
mixing ratios of different data sources without having to physically combine them.
22+
23+
Parameters
24+
----------
25+
datasets : Sequence[Dataset | IterableDataset]
26+
A sequence of datasets to sample from. These can be either map-style
27+
datasets (implementing __getitem__ and __len__) or iterable-style
28+
datasets (implementing __iter__).
29+
30+
weights : Sequence[float | int], optional
31+
Relative sampling weights for each dataset. Can be > 1.0.
32+
If None, equal weights will be assigned to all datasets.
33+
Must have the same length as datasets.
34+
Weights will be normalized internally so they sum to 1.0.
35+
Non-positive weights are not allowed.
36+
37+
seed : int or None, optional
38+
Random seed for reproducible sampling. If None, sampling will not be
39+
reproducible across runs.
40+
41+
Raises
42+
------
43+
ValueError
44+
If the number of weights doesn't match the number of datasets,
45+
or if any weight is non-positive.
46+
47+
Notes
48+
-----
49+
- If any dataset is exhausted during iteration, the entire iteration will stop.
50+
- When using this dataset with multiple workers, each worker will sample
51+
independently with the same weights but potentially different items.
52+
- Setting a seed ensures reproducible sampling sequences.
53+
54+
55+
Examples
56+
--------
57+
from torch.utils.data import IterableDataset
58+
# Create three simple iterable datasets
59+
datasets = [
60+
IterableStringDataset(["Banana"] * 100),
61+
IterableStringDataset(["Apple"] * 500),
62+
IterableStringDataset(["Orange"] * 1000)
63+
]
64+
65+
# Equal weighting (default)
66+
equal_dataset = MultiplexedSamplingDataset(datasets, seed=42)
67+
samples = [next(iter(equal_dataset)) for _ in range(6)]
68+
# Output would be a mix of fruits with roughly equal probability
69+
# Note that it **doesn't** take the number of items in each dataset into account
70+
# ['Banana', 'Orange', 'Apple', 'Orange', 'Banana', 'Apple']
71+
72+
# Custom weighting (99% bananas)
73+
banana_heavy = MultiplexedSamplingDataset(
74+
datasets,
75+
weights=[0.99, 0.005, 0.005],
76+
seed=42
77+
)
78+
samples = [next(iter(banana_heavy)) for _ in range(6)]
79+
# Output would be mostly bananas
80+
# ['Banana', 'Banana', 'Banana', 'Banana', 'Banana', 'Banana']
81+
82+
83+
"""
1884
if weights is not None:
1985
if len(datasets) != len(weights):
2086
raise ValueError("Number of datasets and weights must match")
@@ -37,8 +103,18 @@ def __init__(
37103
self.generator = None
38104

39105
def __iter__(self):
40-
"""Iterate over samples from datasets according to weights."""
106+
"""Iterate over samples from datasets according to weights.
107+
108+
Yields
109+
------
110+
Any
111+
Items sampled from the constituent datasets according to the specified weights.
41112
113+
Notes
114+
-----
115+
The iteration stops when any of the constituent datasets is exhausted,
116+
even if other datasets still have items available.
117+
"""
42118
# Create iterators for each dataset
43119
# Assume each dataset handles worker sharding internally
44120
iterators = {dataset: iter(dataset) for dataset in self.datasets}

src/lobster/datasets/_shuffled_iterable_dataset.py

+27
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,33 @@ def __init__(
1010
buffer_size: int = 10000,
1111
seed: int | None = None,
1212
):
13+
"""
14+
A dataset wrapper that applies shuffling to an iterable dataset using a buffer.
15+
16+
This implementation maintains a buffer of items from the underlying dataset
17+
and yields a random item from this buffer each time, replacing it with a new
18+
item from the dataset. This provides approximate shuffling for iterable datasets
19+
that cannot be fully loaded into memory.
20+
21+
Parameters
22+
----------
23+
dataset : IterableDataset
24+
The underlying dataset to shuffle.
25+
buffer_size : int, optional
26+
The size of the buffer used for shuffling, by default 10000.
27+
Larger buffer sizes provide better shuffling at the cost of memory.
28+
seed : int or None, optional
29+
Random seed for reproducibility, by default None.
30+
If None, a random seed will be generated.
31+
32+
Notes
33+
-----
34+
The shuffling is approximate and depends on the buffer size. A larger buffer
35+
provides better shuffling but requires more memory.
36+
37+
This implementation also handles distributed data loading with multiple workers
38+
by ensuring each worker uses a different random seed derived from a shared base seed.
39+
"""
1340
super().__init__()
1441

1542
self.dataset = dataset

0 commit comments

Comments
 (0)