Commit a4355d6
Revert "Add --filter-rank to torchrun: allow logs filtering by rank (pytorch#118562)"
This reverts commit 73229b4. Reverted pytorch#118562 on behalf of https://github.com/xmfan because it breaks the MAST precheck due to a flag naming conflict ([comment](pytorch#118562 (comment))).
1 parent 63fd688 commit a4355d6
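
For context, a sketch of what the reverted flag did, reconstructed only from the removed code below (the flag no longer exists once this revert lands; the invocation and script name are hypothetical):

    # Hypothetical torchrun invocation under the reverted pytorch#118562:
    #   torchrun --nproc_per_node=4 --filter_local_ranks=0,3 train.py
    # Intended effect (per the removed help text): only local ranks 0 and 3 have their
    # stdout/stderr tailed to the console; log files produced via --redirects/--tee are
    # still written for every rank.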

5 files changed: +7 -51 lines changed

torch/distributed/argparse_util.py (+1 -1)

@@ -11,7 +11,7 @@
 
 class env(Action):
     """
-    Get argument values from ``PET_{dest}`` before defaulting to the given ``default`` value.
+    Get argument values from ``PET_{dest}`` before defaultingto the given ``default`` value.
 
     For flags (e.g. ``--standalone``)
     use ``check_env`` instead.

torch/distributed/elastic/agent/server/local_elastic_agent.py (+1 -4)

@@ -15,7 +15,7 @@
 from string import Template
 import tempfile
 import uuid
-from typing import Any, Dict, Optional, Tuple, Set
+from typing import Any, Dict, Optional, Tuple
 
 import torch.distributed.elastic.timer as timer
 from torch.distributed.elastic import events
@@ -140,7 +140,6 @@ def __init__(
         exit_barrier_timeout: float = 300,
         log_dir: Optional[str] = None,
         log_line_prefix_template: Optional[str] = None,
-        filter_local_ranks: Optional[Set[int]] = None,
     ):
         super().__init__(spec, exit_barrier_timeout)
         self._start_method = start_method
@@ -149,7 +148,6 @@ def __init__(
         self._rdzv_handler = spec.rdzv_handler
         self._log_dir = self._make_log_dir(log_dir, rdzv_run_id)
         self._log_line_prefix_template = log_line_prefix_template
-        self._filter_local_ranks = filter_local_ranks
         self._worker_watchdog: Optional[timer.FileTimerServer] = None
 
     def _make_log_dir(self, log_dir: Optional[str], rdzv_run_id: str):
@@ -302,7 +300,6 @@ def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]:
             start_method=self._start_method,
             redirects=spec.redirects,
             tee=spec.tee,
-            filter_local_ranks=self._filter_local_ranks,
         )
 
         return self._pcontext.pids()

torch/distributed/elastic/multiprocessing/__init__.py (+3 -19)

@@ -63,7 +63,7 @@ def trainer(a, b, c):
 """
 
 import os
-from typing import Callable, Dict, Optional, Tuple, Union, Set
+from typing import Callable, Dict, Optional, Tuple, Union
 
 from torch.distributed.elastic.multiprocessing.api import (  # noqa: F401
     _validate_full_rank,
@@ -103,7 +103,6 @@ def start_processes(
     start_method: str = "spawn",
     redirects: Union[Std, Dict[int, Std]] = Std.NONE,
     tee: Union[Std, Dict[int, Std]] = Std.NONE,
-    filter_local_ranks: Optional[Set[int]] = None,
 ) -> PContext:
     """
     Start ``n`` copies of ``entrypoint`` processes with the provided options.
@@ -195,7 +194,6 @@ def start_processes(
                      ignored for binaries
        redirects: which std streams to redirect to a log file
        tee: which std streams to redirect + print to console
-       filter_local_ranks: which ranks' logs to print to console
 
    """
    # listdir raises FileNotFound or NotADirectoryError so no need to check manually
@@ -225,9 +223,8 @@ def start_processes(
             redirect_std = redirs[local_rank]
             redirs[local_rank] = redirect_std | tee_std
 
-    SYS_STREAM = ""  # special case to indicate to output to console
-    stdouts = dict.fromkeys(range(nprocs), SYS_STREAM)
-    stderrs = dict.fromkeys(range(nprocs), SYS_STREAM)
+    stdouts = dict.fromkeys(range(nprocs), "")
+    stderrs = dict.fromkeys(range(nprocs), "")
     tee_stdouts: Dict[int, str] = {}
     tee_stderrs: Dict[int, str] = {}
     error_files = {}
@@ -254,19 +251,6 @@ def start_processes(
         if t & Std.ERR == Std.ERR:
             tee_stderrs[local_rank] = stderrs[local_rank]
 
-        if filter_local_ranks and local_rank not in filter_local_ranks:
-            # If stream is tee'd, only write to file, but don't tail
-            if local_rank in tee_stdouts:
-                tee_stdouts.pop(local_rank, None)
-            if local_rank in tee_stderrs:
-                tee_stderrs.pop(local_rank, None)
-
-            # If stream is not redirected, don't print
-            if stdouts[local_rank] == SYS_STREAM:
-                stdouts[local_rank] = os.devnull
-            if stderrs[local_rank] == SYS_STREAM:
-                stderrs[local_rank] = os.devnull
-
         error_file = os.path.join(clogdir, "error.json")
         error_files[local_rank] = error_file
         log.info("Setting worker%s reply file to: %s", local_rank, error_file)

torch/distributed/launcher/api.py (+1 -4)

@@ -8,7 +8,7 @@
 import sys
 import uuid
 from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Set
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch.distributed.elastic.rendezvous.registry as rdzv_registry
 from torch.distributed.elastic import events, metrics
@@ -63,7 +63,6 @@ class LaunchConfig:
         metrics_cfg: configuration to initialize metrics.
         local_addr: address of the local node if any. If not set, a lookup on the local
                     machine's FQDN will be performed.
-        filter_local_ranks: ranks for which to show logs in console. If not set, show from all.
     ..note:
         `rdzv_timeout` is a legacy argument that will be removed in future.
         Set the timeout via `rdzv_configs['timeout']`
@@ -88,7 +87,6 @@ class LaunchConfig:
     tee: Union[Std, Dict[int, Std]] = Std.NONE
     metrics_cfg: Dict[str, str] = field(default_factory=dict)
     local_addr: Optional[str] = None
-    filter_local_ranks: Optional[Set[int]] = None
 
     def __post_init__(self):
         default_timeout = 900
@@ -252,7 +250,6 @@ def launch_agent(
         start_method=config.start_method,
         log_dir=config.log_dir,
        log_line_prefix_template=config.log_line_prefix_template,
-        filter_local_ranks=config.filter_local_ranks,
     )
 
     shutdown_rdzv = True

torch/distributed/run.py (+1 -23)

@@ -376,7 +376,7 @@ def main():
 import sys
 import uuid
 from argparse import REMAINDER, ArgumentParser
-from typing import Callable, List, Tuple, Union, Optional, Set
+from typing import Callable, List, Tuple, Union
 
 import torch
 from torch.distributed.argparse_util import check_env, env
@@ -548,17 +548,6 @@ def get_args_parser() -> ArgumentParser:
         help="Tee std streams into a log file and also to console (see --redirects for format).",
     )
 
-    parser.add_argument(
-        "--filter-local-ranks",
-        "--filter_local_ranks",
-        action=env,
-        type=str,
-        default="",
-        help="Only show logs from specified ranks in console (e.g. [--filter-local-ranks 0 1 2] will "
-        "only show logs from rank 0, 1 and 2). This will only apply to stdout and stderr, not to"
-        "log files saved via --redirect or --tee",
-    )
-
     #
     # Backwards compatible parameters with caffe2.distributed.launch.
     #
@@ -735,16 +724,6 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
 
     rdzv_endpoint = get_rdzv_endpoint(args)
 
-    ranks: Optional[Set[int]] = None
-    if args.filter_local_ranks:
-        try:
-            ranks = set(map(int, args.filter_local_ranks.split(",")))
-            assert ranks
-        except Exception as e:
-            raise Exception(
-                "--filter_local_ranks must be a comma-separated list of integers e.g. --filter_local_ranks=0,1,2"
-            ) from e
-
     config = LaunchConfig(
         min_nodes=min_nodes,
         max_nodes=max_nodes,
@@ -762,7 +741,6 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
         log_dir=args.log_dir,
         log_line_prefix_template=log_line_prefix_template,
         local_addr=args.local_addr,
-        filter_local_ranks=ranks,
     )
 
     with_python = not args.no_python
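
The block removed from config_from_args above turned the raw flag value into a set of ranks. A minimal standalone sketch of that parsing (illustrative only; parse_rank_filter is a hypothetical helper, and the flag itself is gone after this revert):

    from typing import Optional, Set

    def parse_rank_filter(value: str) -> Optional[Set[int]]:
        # Sketch of the removed parsing: "" -> None (no filtering), "0,1,2" -> {0, 1, 2}.
        if not value:
            return None
        try:
            ranks = set(map(int, value.split(",")))
            assert ranks
            return ranks
        except Exception as e:
            raise Exception(
                "--filter_local_ranks must be a comma-separated list of integers e.g. --filter_local_ranks=0,1,2"
            ) from e

    # Example: parse_rank_filter("0,3") == {0, 3}; parse_rank_filter("") is None.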
