diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index b1eb127b..6f175702 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -220,11 +220,6 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):
         # Hardcode max output length to 16
         self.config.max_output_len = 16
 
-        # Then override the base_seq_len if present
-        override_base_seq_len = kwargs.get("override_base_seq_len")
-        if override_base_seq_len:
-            self.config.max_seq_len = override_base_seq_len
-
         # Grab the base model's sequence length before overrides for
         # rope calculations
         base_seq_len = self.config.max_seq_len
diff --git a/colab/TabbyAPI_Colab_Example.ipynb b/colab/TabbyAPI_Colab_Example.ipynb
index 1b7e18d0..d2d8b6a1 100644
--- a/colab/TabbyAPI_Colab_Example.ipynb
+++ b/colab/TabbyAPI_Colab_Example.ipynb
@@ -92,7 +92,6 @@
         "# @markdown ---\n",
         "# @markdown Model parameters:\n",
         "ContextSize = 4096 # @param {type:\"integer\"}\n",
-        "OverrideBaseSeqLen = 4096 # @param {type:\"integer\"}\n",
         "RopeScale = 1.0 # @param {type:\"number\"}\n",
         "RopeAlpha = 1.0 # @param {type:\"number\"}\n",
         "NumExpertsPerToken = 2 # @param {type:\"integer\"}\n",
@@ -169,11 +168,6 @@
         "  # Fetched from the model's base sequence length in config.json by default\n",
         "  max_seq_len: {ContextSize}\n",
         "\n",
-        "  # Overrides base model context length (default: None)\n",
-        "  # WARNING: Don't set this unless you know what you're doing!\n",
-        "  # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral/Mixtral models)\n",
-        "  override_base_seq_len: {OverrideBaseSeqLen}\n",
-        "\n",
         "  # Automatically allocate resources to GPUs (default: True)\n",
         "  gpu_split_auto: True\n",
         "\n",
diff --git a/common/config_models.py b/common/config_models.py
index a31d6a56..40b4109a 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -176,16 +176,6 @@ class ModelConfig(BaseConfigModel):
         ),
         ge=0,
     )
-    override_base_seq_len: Optional[int] = Field(
-        None,
-        description=(
-            "Overrides base model context length (default: Empty).\n"
-            "WARNING: Don't set this unless you know what you're doing!\n"
-            "Again, do NOT use this for configuring context length, "
-            "use max_seq_len above ^"
-        ),
-        ge=0,
-    )
     tensor_parallel: Optional[bool] = Field(
         False,
         description=(
diff --git a/config_sample.yml b/config_sample.yml
index 771d7e6b..83f2fc76 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -69,11 +69,6 @@ model:
   # Fetched from the model's base sequence length in config.json by default.
   max_seq_len:
 
-  # Overrides base model context length (default: Empty).
-  # WARNING: Don't set this unless you know what you're doing!
-  # Again, do NOT use this for configuring context length, use max_seq_len above ^
-  override_base_seq_len:
-
   # Load model with tensor parallelism.
   # Falls back to autosplit if GPU split isn't provided.
   # This ignores the gpu_split_auto value.
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
index 58ab0dc4..f2817f03 100644
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -82,13 +82,6 @@ class ModelLoadRequest(BaseModel):
         default=None,
         examples=[4096],
     )
-    override_base_seq_len: Optional[int] = Field(
-        description=(
-            "Overrides the model's base sequence length. " "Leave blank if unsure"
-        ),
-        default=None,
-        examples=[4096],
-    )
     cache_size: Optional[int] = Field(
         description=("Number in tokens, must be greater than or equal to max_seq_len"),
         default=None,
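
For anyone migrating an existing config, the change amounts to deleting the `override_base_seq_len` key and configuring context length through `max_seq_len` alone. Below is a minimal sketch of the resulting `model:` block, assuming the keys from `config_sample.yml`; the `4096` value and the rope settings are illustrative, not defaults.

```yaml
model:
  # Max sequence length: the only knob for context length after this change.
  # Fetched from the model's base sequence length in config.json by default.
  max_seq_len: 4096

  # If the base length reported by the model needs stretching, adjust rope
  # scaling here (assumed to pair with the notebook's RopeScale / RopeAlpha
  # parameters) rather than overriding the base length.
  rope_scale: 1.0
  rope_alpha: 1.0
```

Model load requests drop the field the same way: send `max_seq_len` and omit `override_base_seq_len`, which has been removed from `ModelLoadRequest`.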