Commit a4355d6
Revert "Add --filter-rank to torchrun: allow logs filtering by rank (pytorch#118562)"
This reverts commit 73229b4. Reverted pytorch#118562 on behalf of https://github.com/xmfan because it breaks the MAST precheck due to a flag naming conflict ([comment](pytorch#118562 (comment))).
1 parent 63fd688 commit a4355d6
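
For context, a sketch of what the reverted flag did, reconstructed only from the removed code below (the flag no longer exists once this revert lands; the invocation and script name are hypothetical):

    # Hypothetical torchrun invocation under the reverted pytorch#118562:
    #   torchrun --nproc_per_node=4 --filter_local_ranks=0,3 train.py
    # Intended effect (per the removed help text): only local ranks 0 and 3 have their
    # stdout/stderr tailed to the console; log files produced via --redirects/--tee are
    # still written for every rank.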

5 files changed: +7 -51 lines changed

torch/distributed/argparse_util.py (+1 -1)

@@ -11,7 +11,7 @@
 
 class env(Action):
     """
-    Get argument values from ``PET_{dest}`` before defaulting to the given ``default`` value.
+    Get argument values from ``PET_{dest}`` before defaultingto the given ``default`` value.
 
     For flags (e.g. ``--standalone``)
     use ``check_env`` instead.

torch/distributed/elastic/agent/server/local_elastic_agent.py (+1 -4)

@@ -15,7 +15,7 @@
 from string import Template
 import tempfile
 import uuid
-from typing import Any, Dict, Optional, Tuple, Set
+from typing import Any, Dict, Optional, Tuple
 
 import torch.distributed.elastic.timer as timer
 from torch.distributed.elastic import events
@@ -140,7 +140,6 @@ def __init__(
         exit_barrier_timeout: float = 300,
         log_dir: Optional[str] = None,
         log_line_prefix_template: Optional[str] = None,
-        filter_local_ranks: Optional[Set[int]] = None,
     ):
         super().__init__(spec, exit_barrier_timeout)
         self._start_method = start_method
@@ -149,7 +148,6 @@ def __init__(
         self._rdzv_handler = spec.rdzv_handler
         self._log_dir = self._make_log_dir(log_dir, rdzv_run_id)
         self._log_line_prefix_template = log_line_prefix_template
-        self._filter_local_ranks = filter_local_ranks
         self._worker_watchdog: Optional[timer.FileTimerServer] = None
 
     def _make_log_dir(self, log_dir: Optional[str], rdzv_run_id: str):
@@ -302,7 +300,6 @@ def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]:
             start_method=self._start_method,
             redirects=spec.redirects,
             tee=spec.tee,
-            filter_local_ranks=self._filter_local_ranks,
         )
 
         return self._pcontext.pids()

torch/distributed/elastic/multiprocessing/__init__.py (+3 -19)

@@ -63,7 +63,7 @@ def trainer(a, b, c):
 """
 
 import os
-from typing import Callable, Dict, Optional, Tuple, Union, Set
+from typing import Callable, Dict, Optional, Tuple, Union
 
 from torch.distributed.elastic.multiprocessing.api import (  # noqa: F401
     _validate_full_rank,
@@ -103,7 +103,6 @@ def start_processes(
     start_method: str = "spawn",
     redirects: Union[Std, Dict[int, Std]] = Std.NONE,
     tee: Union[Std, Dict[int, Std]] = Std.NONE,
-    filter_local_ranks: Optional[Set[int]] = None,
 ) -> PContext:
     """
     Start ``n`` copies of ``entrypoint`` processes with the provided options.
@@ -195,7 +194,6 @@ def start_processes(
                      ignored for binaries
        redirects: which std streams to redirect to a log file
        tee: which std streams to redirect + print to console
-       filter_local_ranks: which ranks' logs to print to console
 
    """
    # listdir raises FileNotFound or NotADirectoryError so no need to check manually
@@ -225,9 +223,8 @@ def start_processes(
             redirect_std = redirs[local_rank]
             redirs[local_rank] = redirect_std | tee_std
 
-    SYS_STREAM = ""  # special case to indicate to output to console
-    stdouts = dict.fromkeys(range(nprocs), SYS_STREAM)
-    stderrs = dict.fromkeys(range(nprocs), SYS_STREAM)
+    stdouts = dict.fromkeys(range(nprocs), "")
+    stderrs = dict.fromkeys(range(nprocs), "")
     tee_stdouts: Dict[int, str] = {}
     tee_stderrs: Dict[int, str] = {}
     error_files = {}
@@ -254,19 +251,6 @@ def start_processes(
         if t & Std.ERR == Std.ERR:
             tee_stderrs[local_rank] = stderrs[local_rank]
 
-        if filter_local_ranks and local_rank not in filter_local_ranks:
-            # If stream is tee'd, only write to file, but don't tail
-            if local_rank in tee_stdouts:
-                tee_stdouts.pop(local_rank, None)
-            if local_rank in tee_stderrs:
-                tee_stderrs.pop(local_rank, None)
-
-            # If stream is not redirected, don't print
-            if stdouts[local_rank] == SYS_STREAM:
-                stdouts[local_rank] = os.devnull
-            if stderrs[local_rank] == SYS_STREAM:
-                stderrs[local_rank] = os.devnull
-
         error_file = os.path.join(clogdir, "error.json")
         error_files[local_rank] = error_file
         log.info("Setting worker%s reply file to: %s", local_rank, error_file)

torch/distributed/launcher/api.py (+1 -4)

@@ -8,7 +8,7 @@
 import sys
 import uuid
 from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Set
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch.distributed.elastic.rendezvous.registry as rdzv_registry
 from torch.distributed.elastic import events, metrics
@@ -63,7 +63,6 @@ class LaunchConfig:
         metrics_cfg: configuration to initialize metrics.
         local_addr: address of the local node if any. If not set, a lookup on the local
                     machine's FQDN will be performed.
-        filter_local_ranks: ranks for which to show logs in console. If not set, show from all.
     ..note:
         `rdzv_timeout` is a legacy argument that will be removed in future.
         Set the timeout via `rdzv_configs['timeout']`
@@ -88,7 +87,6 @@ class LaunchConfig:
     tee: Union[Std, Dict[int, Std]] = Std.NONE
     metrics_cfg: Dict[str, str] = field(default_factory=dict)
     local_addr: Optional[str] = None
-    filter_local_ranks: Optional[Set[int]] = None
 
     def __post_init__(self):
         default_timeout = 900
@@ -252,7 +250,6 @@ def launch_agent(
         start_method=config.start_method,
         log_dir=config.log_dir,
        log_line_prefix_template=config.log_line_prefix_template,
-        filter_local_ranks=config.filter_local_ranks,
     )
 
     shutdown_rdzv = True

torch/distributed/run.py (+1 -23)

@@ -376,7 +376,7 @@ def main():
 import sys
 import uuid
 from argparse import REMAINDER, ArgumentParser
-from typing import Callable, List, Tuple, Union, Optional, Set
+from typing import Callable, List, Tuple, Union
 
 import torch
 from torch.distributed.argparse_util import check_env, env
@@ -548,17 +548,6 @@ def get_args_parser() -> ArgumentParser:
         help="Tee std streams into a log file and also to console (see --redirects for format).",
     )
 
-    parser.add_argument(
-        "--filter-local-ranks",
-        "--filter_local_ranks",
-        action=env,
-        type=str,
-        default="",
-        help="Only show logs from specified ranks in console (e.g. [--filter-local-ranks 0 1 2] will "
-        "only show logs from rank 0, 1 and 2). This will only apply to stdout and stderr, not to"
-        "log files saved via --redirect or --tee",
-    )
-
     #
     # Backwards compatible parameters with caffe2.distributed.launch.
     #
@@ -735,16 +724,6 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
 
     rdzv_endpoint = get_rdzv_endpoint(args)
 
-    ranks: Optional[Set[int]] = None
-    if args.filter_local_ranks:
-        try:
-            ranks = set(map(int, args.filter_local_ranks.split(",")))
-            assert ranks
-        except Exception as e:
-            raise Exception(
-                "--filter_local_ranks must be a comma-separated list of integers e.g. --filter_local_ranks=0,1,2"
-            ) from e
-
     config = LaunchConfig(
         min_nodes=min_nodes,
         max_nodes=max_nodes,
@@ -762,7 +741,6 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
         log_dir=args.log_dir,
         log_line_prefix_template=log_line_prefix_template,
         local_addr=args.local_addr,
-        filter_local_ranks=ranks,
     )
 
     with_python = not args.no_python
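
The block removed from config_from_args above turned the raw flag value into a set of ranks. A minimal standalone sketch of that parsing (illustrative only; parse_rank_filter is a hypothetical helper, and the flag itself is gone after this revert):

    from typing import Optional, Set

    def parse_rank_filter(value: str) -> Optional[Set[int]]:
        # Sketch of the removed parsing: "" -> None (no filtering), "0,1,2" -> {0, 1, 2}.
        if not value:
            return None
        try:
            ranks = set(map(int, value.split(",")))
            assert ranks
            return ranks
        except Exception as e:
            raise Exception(
                "--filter_local_ranks must be a comma-separated list of integers e.g. --filter_local_ranks=0,1,2"
            ) from e

    # Example: parse_rank_filter("0,3") == {0, 3}; parse_rank_filter("") is None.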
