
Commit 08db735

Skylion007 authored and pytorchmergebot committed
[BE]: Update mypy to 1.13.0 (pytorch#140808)
Update mypy to 1.13.0. This should reduce linting time; mypy 1.13 also supports orjson cache serialization, which should improve mypy cache performance if orjson is installed.
Pull Request resolved: pytorch#140808
Approved by: https://github.com/ezyang, https://github.com/malfet
1 parent 34127fc commit 08db735
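Not part of the diff: a quick sanity check one might run in the lint environment to confirm the new pin and whether orjson is importable (mypy only uses its faster cache serialization when orjson is present). The snippet is illustrative only.

# Hypothetical check, not included in this commit.
import importlib.util
from importlib.metadata import version

print("mypy:", version("mypy"))  # expected to print 1.13.0 after this change
print("orjson available:", importlib.util.find_spec("orjson") is not None)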

31 files changed: +114 -71 lines changed

.ci/docker/requirements-ci.txt (+1 -1)

@@ -90,7 +90,7 @@ librosa>=0.6.2 ; python_version < "3.11"
 #Pinned versions:
 #test that import:
 
-mypy==1.11.2
+mypy==1.13.0
 # Pin MyPy version because new errors are likely to appear with each release
 #Description: linter
 #Pinned versions: 1.10.0

.lintrunner.toml (+1 -1)

@@ -144,7 +144,7 @@ init_command = [
     'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"',
     'numpy==2.1.0 ; python_version >= "3.12"',
     'expecttest==0.2.1',
-    'mypy==1.11.2',
+    'mypy==1.13.0',
     'sympy==1.13.0 ; python_version >= "3.9"',
     'types-requests==2.27.25',
     'types-PyYAML==6.0.7',

tools/flight_recorder/components/types.py (+2)

@@ -386,9 +386,11 @@ def __init__(
         }, f"{type} is not a supported operation"
         self.type = type
         if type == "send":
+            assert isinstance(meta, str)
             s, d = meta.split("->")
             self._src, self._dst = int(s), int(d)
         elif type == "recv":
+            assert isinstance(meta, str)
             d, s = meta.split("<-")
             self._dst, self._src = int(d), int(s)
         else:
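The added asserts are the usual way to narrow a Union for mypy before calling a str-only method. A minimal sketch of the pattern, not code from this commit (the function and its meta format are hypothetical):

# Illustrative sketch only.
from typing import Tuple, Union

def parse_send(meta: Union[str, int]) -> Tuple[int, int]:
    assert isinstance(meta, str)  # mypy narrows meta from Union[str, int] to str here
    src, dst = meta.split("->")
    return int(src), int(dst)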

torch/_dynamo/eval_frame.py (+1)

@@ -1503,6 +1503,7 @@ def result_capturing_wrapper(*graph_inputs):
         # NB: this is wrong if graph_captured_result has
         # data-dependent output size!
         ignore_fresh_unbacked = null_context()
+        assert ambient_fake_mode is not None
         if shape_env := ambient_fake_mode.shape_env:
             ignore_fresh_unbacked = shape_env.ignore_fresh_unbacked_symbols()

torch/_dynamo/output_graph.py (+19 -5)

@@ -11,7 +11,18 @@
 import traceback
 import weakref
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TYPE_CHECKING, Union
+from typing import (
+    Any,
+    Callable,
+    cast,
+    Dict,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    TYPE_CHECKING,
+    Union,
+)
 
 import sympy
 
@@ -621,8 +632,11 @@ def save_global_state(self, out=None):
         """
         Saves to out if it is provided. Else saves to the tracing context's global_state.
         """
-        global_state = (
-            out if out is not None else self.tracing_context.global_context.global_state
+        global_state = cast(
+            Dict[str, Tuple[Callable[..., Any], bool]],
+            out
+            if out is not None
+            else self.tracing_context.global_context.global_state,
         )
 
         # TODO - Consider having a torch level API for torch_function_state. As
@@ -645,11 +659,11 @@ def save_global_state(self, out=None):
             functools.partial(torch.set_autocast_enabled, "cpu"),
             torch.is_autocast_enabled("cpu"),
         )
-        global_state["autocast_gpu_dtype"] = (
+        global_state["autocast_gpu_dtype"] = (  # type:ignore[assignment]
            functools.partial(torch.set_autocast_dtype, "cuda"),
            torch.get_autocast_dtype("cuda"),
        )
-        global_state["autocast_cpu_dtype"] = (
+        global_state["autocast_cpu_dtype"] = (  # type:ignore[assignment]
            functools.partial(torch.set_autocast_dtype, "cpu"),
            torch.get_autocast_dtype("cpu"),
        )
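The cast in the hunk above only affects static type checking; it performs no runtime conversion. A minimal, hypothetical sketch of the pattern (not code from this commit):

# Illustrative sketch only.
from typing import Any, Callable, Dict, Tuple, cast

def as_global_state(out: object) -> Dict[str, Tuple[Callable[..., Any], bool]]:
    # cast() returns `out` unchanged at runtime; it only tells mypy which type to assume.
    return cast(Dict[str, Tuple[Callable[..., Any], bool]], out)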

torch/_dynamo/utils.py (+2 -2)

@@ -1108,7 +1108,7 @@ class ChromiumEventLogger:
     a specification of the Chromium Event JSON format.
     """
 
-    def get_stack(self):
+    def get_stack(self) -> List[str]:
         """
         The main event stack, with every chromium event.
         Logged to tlparse.
@@ -1119,7 +1119,7 @@ def get_stack(self):
         self.tls.stack = []
         return self.tls.stack
 
-    def get_top(self) -> str:
+    def get_top(self) -> Optional[str]:
         """
         Get the top event name or None if the stack is empty.
         """

torch/_export/serde/dynamic_shapes.py (+2 -2)

@@ -166,8 +166,8 @@ def _track_dim_from_dims(
     root = val.root if isinstance(val, _DerivedDim) else val  # type: ignore[attr-defined]
     if root.__name__ not in dims:
         dims[root.__name__] = {
-            "min": root.min,
-            "max": root.max,
+            "min": root.min,  # type: ignore[attr-defined,union-attr]
+            "max": root.max,  # type: ignore[attr-defined,union-attr]
             "derived": set(),
         }

torch/_export/serde/serialize.py (+1 -1)

@@ -2423,7 +2423,7 @@ def _dict_to_dataclass(cls, data):
         field_type = cls.__annotations__[_type]
         return cls.create(**{_type: _dict_to_dataclass(field_type, _value)})
     elif dataclasses.is_dataclass(cls):
-        obj = cls(**data)  # type: ignore[assignment]
+        obj = cls(**data)  # type: ignore[assignment,operator]
         type_hints = typing.get_type_hints(cls)
         for f in dataclasses.fields(cls):
             name = f.name
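mypy ignore comments can be scoped to specific error codes, so widening `# type: ignore[assignment]` to `# type: ignore[assignment,operator]` suppresses only those two diagnostics on that one line. A minimal, hypothetical sketch (the deliberate type error is what the ignore suppresses):

# Illustrative sketch only; not code from this commit.
from typing import Dict

totals: Dict[str, int] = {}
totals["count"] = "3"  # type: ignore[assignment]  # silences only the assignment error code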

torch/_inductor/codegen/cpp.py (+1 -1)

@@ -292,7 +292,7 @@ def reduction_prefix_array(
     acc_type: str,
     reduction_type: str,
     dtype: torch.dtype,
-    len: int,
+    len: Union[str, int],
     init_fn,
 ):
     """

torch/_inductor/codegen/cpp_gemm_template.py (+3 -3)

@@ -308,8 +308,8 @@ def transpose_w(
 
 
 def expand_bias(
-    B: Union[ir.IRNode, torch.Tensor], X: Union[ir.IRNode, torch.Tensor]
-) -> Union[ir.IRNode, torch.Tensor]:
+    B: Union[ir.IRNode, torch.Tensor, None], X: Union[ir.IRNode, torch.Tensor]
+) -> Optional[Union[ir.IRNode, torch.Tensor]]:
     """
     Expand Bias to the same size of X.
     """
@@ -870,7 +870,7 @@ def normalize_shapes(inputs, layout_or_out):
         W = new_inputs[1]
         B = new_inputs[2] if has_bias else None
         W = transpose_w(W, trans_w)
-        B = expand_bias(B, X)
+        B = expand_bias(B, X)  # type:ignore[arg-type]
         new_inputs[1] = W
         if B is not None:
             new_inputs[2] = B

torch/_inductor/compile_fx.py (+1 -1)

@@ -382,7 +382,7 @@ def split_const_gm(
             gm,
             node,
             (
-                const_result[const_outputs[node.name]]
+                const_result[const_outputs[node.name]]  # type:ignore[index]
                 if lifted_constant_names is None
                 else None
             ),

torch/_inductor/graph.py (+2 -2)

@@ -1883,9 +1883,9 @@ def materialize(
         # Generating random inputs based on self.example_inputs sometimes can be problematic,
         # e.g. illegal memory access. A comprehensive fix is to autotune in a separate process.
         real_inputs = [
-            materialize(x)
+            materialize(x)  # type:ignore[arg-type]
             for x in (
-                self.example_inputs
+                self.example_inputs  # type:ignore[union-attr]
                 if isinstance(V.real_inputs, NullHandler)
                 else V.real_inputs
             )

torch/_prims_common/__init__.py (+5 -3)

@@ -1612,7 +1612,7 @@ def reduction_dtypes(
 # batched_matrix_contiguous_strides and contiguous_strides
 def make_contiguous_strides_for(
     shape: ShapeType, row_major: bool = True
-) -> Tuple[int, ...]:
+) -> Tuple[Union[_IntLikeT, int], ...]:
     """
     Returns the strides of a contiguous tensor if row_major
     If row_major=True, it returns the strides of a contiguous batch of Fortran-contiguous matrices
@@ -1625,11 +1625,13 @@ def make_contiguous_strides_for(
 
     from torch.fx.experimental.symbolic_shapes import is_nested_int
 
-    multiplier = 1
+    multiplier: Union[_IntLikeT, int] = 1
     strides = []
     for l in reversed(shape):
         strides.append(multiplier)
-        multiplier *= l if is_nested_int(l) else sym_max(l, 1)
+        multiplier *= (
+            l if is_nested_int(l) else sym_max(l, 1)
+        )  # type:ignore[assignment]
 
     result = tuple(reversed(strides))

torch/_refs/__init__.py (+1 -1)

@@ -410,7 +410,7 @@ def _broadcast_shapes(*_shapes):
         assert isinstance(shape, Sequence)
 
     # Computes common shape
-    common_shape = [
+    common_shape: List[Union[int, torch.SymInt]] = [
         1,
     ] * reduce(max, (len(shape) for shape in shapes))
     for arg_idx, shape in enumerate(shapes):

torch/ao/nn/qat/modules/conv.py (+1 -1)

@@ -20,7 +20,7 @@ def __init__(
         out_channels: int,
         kernel_size: Tuple[int, ...],
         stride: Tuple[int, ...],
-        padding: Tuple[int, ...],
+        padding: Union[str, Tuple[int, ...]],
         dilation: Tuple[int, ...],
         transposed: bool,
         output_padding: Tuple[int, ...],

torch/ao/ns/fx/weight_utils.py (+2 -2)

@@ -35,7 +35,7 @@ def get_lstm_weight(mod: nn.Module) -> List[torch.Tensor]:
     res = []
     for idx, param_name in enumerate(mod._flat_weights_names):  # type: ignore[arg-type]
         if "weight_ih_l" in param_name or "weight_hh_l" in param_name:
-            param_value = mod._flat_weights[idx].detach()  # type: ignore[index]
+            param_value = mod._flat_weights[idx].detach()  # type: ignore[index,union-attr]
             res.append(param_value)
     return res
 
@@ -72,7 +72,7 @@ def get_lstm_mod_weights(mod: nn.Module) -> List[torch.Tensor]:
         res = []
         for idx, param_name in enumerate(mod._flat_weights_names):
             if "weight_ih_l" in param_name or "weight_hh_l" in param_name:
-                param_value = mod._flat_weights[idx].detach()
+                param_value = mod._flat_weights[idx].detach()  # type: ignore[index,union-attr]
                 res.append(param_value)
         return res
     else:

torch/ao/quantization/fx/prepare.py (+1 -1)

@@ -665,7 +665,7 @@ def _get_output_act_obs_or_fq(
     named_modules: Dict[str, torch.nn.Module],
     obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize],
     is_qat: bool,
-) -> ObserverOrFakeQuantize:
+) -> Optional[ObserverOrFakeQuantize]:
     """Get the constructor for observer or fake quant object for
     the argument in the original graph as the output of previous node,
     skipping inserted observers

torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py (+1 -1)

@@ -105,7 +105,7 @@ def post_localSGD_hook(
     # Run allreduce using `global_group_to_use` in the first `start_localSGD_iter` iterations.
     if state.iter < state.start_localSGD_iter:
         state.maybe_increase_iter(bucket)
-        return default._allreduce_fut(global_group_to_use, input_tensor)
+        return default._allreduce_fut(global_group_to_use, input_tensor)  # type: ignore[arg-type]
 
     # If `post_local_gradient_allreduce` is not set,
     # then no gradient synchronization after the first `start_localSGD_iter` iterations.

torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py (+7 -2)

@@ -7,6 +7,7 @@
 import torch
 import torch.distributed as dist
 from torch.distributed import distributed_c10d
+from torch.utils._typing_utils import not_none
 
 from . import default_hooks as default
 
@@ -398,7 +399,9 @@ def powerSGD_hook(
         >>> ddp_model.register_comm_hook(state, powerSGD_hook)
     """  # noqa: B950
     process_group = state.process_group
-    group_to_use = process_group if process_group is not None else dist.group.WORLD
+    group_to_use = (
+        process_group if process_group is not None else not_none(dist.group.WORLD)
+    )
     world_size = group_to_use.size()
 
     # The input tensor is a flattened 1D tensor.
@@ -707,7 +710,9 @@ def batched_powerSGD_hook(
         >>> ddp_model.register_comm_hook(state, batched_powerSGD_hook)
     """  # noqa: B950
     process_group = state.process_group
-    group_to_use = process_group if process_group is not None else dist.group.WORLD
+    group_to_use = (
+        process_group if process_group is not None else not_none(dist.group.WORLD)
+    )
     world_size = group_to_use.size()
 
     # The input tensor is a flattened 1D tensor.
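`not_none` is imported here from torch.utils._typing_utils. A minimal sketch of what such a helper typically looks like, assuming the usual pattern; this is not the actual torch implementation:

# Illustrative sketch only.
from typing import Optional, TypeVar

T = TypeVar("T")

def not_none(value: Optional[T]) -> T:
    # Narrows Optional[T] to T for the type checker and fails fast at runtime.
    if value is None:
        raise TypeError("expected a non-None value")
    return value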

torch/distributed/algorithms/model_averaging/averagers.py (+10 -5)

@@ -1,11 +1,12 @@
 # mypy: allow-untyped-defs
 import warnings
 from abc import ABC, abstractmethod
-from typing import Dict, Iterable, Union
+from typing import Dict, Iterable, Optional, Union
 
 import torch
 import torch.distributed as dist
 import torch.distributed.algorithms.model_averaging.utils as utils
+from torch.utils._typing_utils import not_none as _not_none
 
 
 __all__ = ["ModelAverager", "PeriodicModelAverager"]
@@ -21,9 +22,9 @@ class ModelAverager(ABC):
         will be used. (default: ``None``)
     """
 
-    def __init__(self, process_group=None):
+    def __init__(self, process_group: Optional[dist.ProcessGroup] = None):
         self.process_group = (
-            process_group if process_group is not None else dist.group.WORLD
+            process_group if process_group is not None else _not_none(dist.group.WORLD)
         )
         self.step = 0
 
@@ -85,7 +86,9 @@ class PeriodicModelAverager(ModelAverager):
         >>> averager.average_parameters(model.parameters())
     """
 
-    def __init__(self, period, warmup_steps=0, process_group=None):
+    def __init__(
+        self, period, warmup_steps=0, process_group: Optional[dist.ProcessGroup] = None
+    ):
         super().__init__(process_group)
         if warmup_steps < 0:
             raise ValueError("Arg ``warmup_steps`` must be a non-negative number.")
@@ -120,5 +123,7 @@ def average_parameters(
             self.step >= self.warmup_steps
             and (self.step - self.warmup_steps) % self.period == 0
         ):
-            utils.average_parameters_or_parameter_groups(params, self.process_group)
+            utils.average_parameters_or_parameter_groups(
+                params, _not_none(self.process_group)
+            )
         self.step += 1

torch/distributed/distributed_c10d.py (+11 -3)

@@ -4477,7 +4477,9 @@ def all_to_all(output_tensor_list, input_tensor_list, group=None, async_op=False
 
 
 @_exception_logger
-def barrier(group=GroupMember.WORLD, async_op=False, device_ids=None):
+def barrier(
+    group: Optional[ProcessGroup] = GroupMember.WORLD, async_op=False, device_ids=None
+):
     """
     Synchronize all processes.
 
@@ -4519,7 +4521,11 @@ def barrier(group=GroupMember.WORLD, async_op=False, device_ids=None):
         work.wait()
 
 
-def monitored_barrier(group=GroupMember.WORLD, timeout=None, wait_all_ranks=False):
+def monitored_barrier(
+    group: Optional[ProcessGroup] = GroupMember.WORLD,
+    timeout=None,
+    wait_all_ranks=False,
+):
     """
     Synchronize processes similar to ``torch.distributed.barrier``, but consider a configurable timeout.
 
@@ -4589,7 +4595,9 @@ def monitored_barrier(group=GroupMember.WORLD, timeout=None, wait_all_ranks=Fals
     _check_valid_timeout(timeout)
 
     group_to_use = _get_default_group() if group is None else group
-    return group_to_use.monitored_barrier(timeout, wait_all_ranks=wait_all_ranks)
+    return group_to_use.monitored_barrier(  # type:ignore[attr-defined]
+        timeout, wait_all_ranks=wait_all_ranks
+    )
 
 
 def _create_process_group_wrapper(

torch/distributed/fsdp/_optim_utils.py (+3 -3)

@@ -630,7 +630,7 @@ def _flatten_optim_state(
     assert state_names is not None
 
     # Flatten the state
-    flat_state: Dict[str, Any] = {}
+    flat_state: Dict[str, Optional[torch.Tensor]] = {}
     for state_name in state_names:
         state_values = [
             unflat_param_state[state_name] if unflat_param_state is not None else None
@@ -658,7 +658,7 @@ def _flatten_optim_state(
         if are_pos_dim_tensors:
             flat_tensor = _flatten_tensor_optim_state(
                 state_name,
-                state_values,
+                state_values,  # type: ignore[arg-type]
                 unflat_param_names,
                 unflat_param_shapes,
                 handle,
@@ -680,7 +680,7 @@ def _flatten_optim_state(
         elif are_zero_dim_tensors:
             flat_state[state_name] = _flatten_zero_dim_tensor_optim_state(
                 state_name,
-                state_values,
+                state_values,  # type: ignore[arg-type]
                 unflat_param_names,
             )
         else:
