Commit e4c5845

danielholanda, jeremyfowers, and vgodsoe authored Mar 20, 2025
Release v6.0.3 (#295)
Co-authored-by: Jeremy Fowers <80718789+jeremyfowers@users.noreply.github.com>
Co-authored-by: Victoria Godsoe <victoria.godsoe@amd.com>
1 parent d750f4a commit e4c5845

35 files changed: +736 -410 lines
 

.github/workflows/publish-to-test-pypi.yml (+2/-1)

@@ -7,7 +7,8 @@ on:
       - v*
       - RC*
   pull_request:
-    branches: ["main", "canary", "refresh"]
+    branches:
+      - '**'

 jobs:
   build-n-publish:

.github/workflows/server_installer_windows_latest.yml (+2/-1)

@@ -6,7 +6,8 @@ on:
   tags:
     - v*
   pull_request:
-    branches: ["main"]
+    branches:
+      - '**'
   workflow_dispatch:

 jobs:

.github/workflows/test_lemonade.yml (+2/-1)

@@ -7,7 +7,8 @@ on:
   push:
     branches: ["main"]
   pull_request:
-    branches: ["main"]
+    branches:
+      - '**'

 permissions:
   contents: read

.github/workflows/test_lemonade_oga_cpu.yml (+2/-1)

@@ -7,7 +7,8 @@ on:
   push:
     branches: ["main"]
   pull_request:
-    branches: ["main"]
+    branches:
+      - '**'

 permissions:
   contents: read

.github/workflows/test_quark.yml (+2/-1)

@@ -7,7 +7,8 @@ on:
   push:
     branches: ["main"]
   pull_request:
-    branches: ["main"]
+    branches:
+      - '**'

 permissions:
   contents: read

.github/workflows/test_server.yml (+2/-1)

@@ -7,7 +7,8 @@ on:
   push:
     branches: ["main"]
   pull_request:
-    branches: ["main"]
+    branches:
+      - '**'

 permissions:
   contents: read

docs/contribute.md (+1/-2)

@@ -152,9 +152,8 @@ TurnkeyML is provided as a package on PyPI, the Python Package Index, as [turnke
 The following public APIs are available for developers. The maintainers aspire to change these as infrequently as possible, and doing so will require an update to the package's major version number.

 - From the top-level `__init__.py`:
-  - `turnkeycli`: the `main()` function of the `turnkey` CLI
-  - `evaluate_files()`: the top-level API called by the CLI
   - `turnkeyml.version`: The package version number
+  - `State` class and `load_state`: structure that holds build state between Tools; function to load `State` from disk.
 - From the `common.filesystem` module:
   - `get_available_builds()`: list the builds in a turnkey cache
   - `make_cache_dir()`: create a turnkey cache
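Taken together with the `src/turnkeyml/__init__.py` change later in this commit, the surviving top-level API is small; a minimal sketch (assuming the package is installed, with the `load_state` call shown as illustrative rather than a confirmed signature):

```python
# Minimal sketch of the post-6.0.3 top-level turnkeyml API, based on the
# imports kept in src/turnkeyml/__init__.py in this commit.
import turnkeyml
from turnkeyml import State, load_state

print(turnkeyml.__version__)  # the package version number

# State holds build state between Tools; load_state() restores a State from
# disk. The argument names below are illustrative, not a confirmed signature:
# state = load_state(cache_dir="~/.cache/turnkey", build_name="my_build")
```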

docs/lemonade/getting_started.md (+1/-1)

@@ -60,7 +60,7 @@ To install `lemonade` from source code:

 ## From Lemonade_Server_Installer.exe

-The `lemonade` server is available as a standalone tool with a one-click Windows installer `.exe`. Check out the [Lemonade_Server_Installer.exe guide](lemonade_server_exe.md) for installation instructions and the [server spec](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/server_spec.md) to learn more about the functionality.
+The Lemonade Server is available as a standalone tool with a one-click Windows installer `.exe`. Check out the [Lemonade_Server_Installer.exe guide](lemonade_server_exe.md) for installation instructions and the [server spec](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/server_spec.md) to learn more about the functionality.

 # CLI Commands

docs/lemonade/lemonade_server_exe.md (+7/-103)

@@ -1,117 +1,21 @@
 # Lemonade Server Installer

-The `lemonade` server is available as a standalone tool with a one-click Windows installer `.exe`. Check out the [server spec](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/server_spec.md) to learn more about the functionality.
+The Lemonade Server is available as a standalone tool with a one-click Windows installer `.exe`. Check out the [server spec](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/server_spec.md) to learn more about the functionality.

-## GUI Installation and Usage
+## GUI Installation

 > *Note:* you may need to give your browser or OS permission to download or install the .exe.

 1. Navigate to the [latest release](https://github.com/onnx/turnkeyml/releases/latest).
 1. Scroll to the bottom and click `Lemonade_Server_Installer.exe` to download.
 1. Double-click the `Lemonade_Server_Installer.exe` and follow the instructions.

-Now that you have the server installed, you can double click the desktop shortcut to run the server process. From there, you can connect it to applications that are compatible with the OpenAI completions API.
+## Usage

-## Silent Installation and Command Line Usage
+Now that you have the server installed, you can double click the desktop shortcut to run the server process.

-Silent installation and command line usage are useful if you want to fully integrate `lemonade` server into your own application. This guide provides fully automated steps for downloading, installing, and running `lemonade` server so that your users don't have to install `lemonade` separately.
+From there, you can connect it to applications that are compatible with the OpenAI completions API. The Lemonade Server [examples folder](https://github.com/onnx/turnkeyml/tree/main/examples/lemonade/server) has guides for how to use Lemonade Server with a collection of applications that we have tested.

-Definitions:
-- "Silent installation" refers to an automatic command for installing `lemonade` server without running any GUI or prompting the user for any questions. It does assume that the end-user fully accepts the license terms, so be sure that your own application makes this clear to the user.
-- Command line usage allows the server process to be launched programmatically, so that your application can manage starting and stopping the server process on your user's behalf.
+## Developing with Lemonade Server

-### Download
-
-Follow these instructions to download a copy of `Lemonade_Server_Installer.exe`.
-
-#### cURL Download
-
-In a `bash` terminal, such as `git bash`:
-
-Download the latest version:
-
-```bash
-curl -L -o ".\Lemonade_Server_Installer.exe" https://github.com/onnx/turnkeyml/releases/latest/download/Lemonade_Server_Installer.exe
-```
-
-Download a specific version:
-
-```bash
-curl -L -o ".\Lemonade_Server_Installer.exe" https://github.com/onnx/turnkeyml/releases/download/v6.0.0/Lemonade_Server_Installer.exe
-```
-
-#### PowerShell Download
-
-In a powershell terminal:
-
-Download the latest version:
-
-```powershell
-Invoke-WebRequest -Uri "https://github.com/onnx/turnkeyml/releases/latest/download/Lemonade_Server_Installer.exe" -OutFile "Lemonade_Server_Installer.exe"
-```
-
-Download a specific version:
-
-```powershell
-Invoke-WebRequest -Uri "https://github.com/onnx/turnkeyml/releases/download/v6.0.0/Lemonade_Server_Installer.exe" -OutFile "Lemonade_Server_Installer.exe"
-```
-
-### Silent Installation
-
-Silent installation runs `Lemonade_Server_Installer.exe` without a GUI and automatically accepts all prompts.
-
-In a `cmd.exe` terminal:
-
-Install *with* Ryzen AI hybrid support:
-
-```bash
-Lemonade_Server_Installer.exe /S /Extras=hybrid
-```
-
-Install *without* Ryzen AI hybrid support:
-
-```bash
-Lemonade_Server_Installer.exe /S
-```
-
-The install directory can also be changed from the default by using `/D` as the last argument.
-
-For example:
-
-```bash
-Lemonade_Server_Installer.exe /S /Extras=hybrid /D=C:\a\new\path`
-```
-
-### Command Line Invocation
-
-Command line invocation starts the `lemonade` server process so that your application can connect to it via REST API endpoints.
-
-#### Foreground Process
-
-These steps will open lemonade server in a terminal window that is visible to users. The user can exit the server by closing the window.
-
-In a `cmd.exe` terminal:
-
-```bash
-conda run --no-capture-output -p INSTALL_DIR\lemonade_server\lemon_env lemonade serve
-```
-
-Where `INSTALL_DIR` is the installation path of `lemonade_server`.
-
-For example, if you used the default installation directory and your username is USERNAME:
-
-```bash
-C:\Windows\System32\cmd.exe /C conda run --no-capture-output -p C:\Users\USERNAME\AppData\Local\lemonade_server\lemon_env lemonade serve
-```
-
-#### Background Process
-
-This command will open lemonade server without opening a window. Your application needs to manage terminating the process and any child processes it creates.
-
-In a powershell terminal:
-
-```powershell
-$serverProcess = Start-Process -FilePath "C:\Windows\System32\cmd.exe" -ArgumentList "/C conda run --no-capture-output -p INSTALL_DIR\lemonade_server\lemon_env lemonade serve" -RedirectStandardOutput lemonade_out.txt -RedirectStandardError lemonade_err.txt -PassThru -NoNewWindow
-```
-
-Where `INSTALL_DIR` is the installation path of `lemonade_server`.
+Interested in integrating Lemonade Server into an application you are developing? Check out the [Lemonade Server integration guide](server_integration.md) to learn more.
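As a sketch of what "compatible with the OpenAI completions API" looks like from an application's side, any OpenAI-style client can talk to the server. This assumes the server is running on its default port (8000) and that the `openai` Python package (v1+) is installed; the base path and placeholder API key mirror the Continue example added elsewhere in this commit:

```python
# Hedged sketch: connect an OpenAI-compatible client to a running Lemonade Server.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/api/v0",  # the server's OpenAI-compatible base path
    api_key="-",  # no real key is needed; the field just has to be non-empty
)

completion = client.chat.completions.create(
    model="Qwen-1.5-7B-Chat-Hybrid",  # any model reported by GET /api/v0/models
    messages=[{"role": "user", "content": "Hello!"}],
)
print(completion.choices[0].message.content)
```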

docs/lemonade/mmlu_accuracy.md (+1/-1)

@@ -99,4 +99,4 @@ Use the syntax provided in the table to run that test subject with the `accuracy
 | Sociology | Culture | sociology |
 | US Foreign Policy | Politics | us_foreign_policy |
 | Virology | Health | virology |
-| World Religions | Philosophy | world_religions |
+| World Religions | Philosophy | world_religions |

docs/lemonade/server_integration.md (+131, new file)

@@ -0,0 +1,131 @@
+# Integrating with Lemonade Server
+
+This guide provides instructions on how to integrate Lemonade Server into your application.
+
+There are two main ways in which Lemonade Server might integrate into apps:
+* User-Managed Server: User is responsible for installing and managing Lemonade Server.
+* App-Managed Server: App is responsible for installing and managing Lemonade Server on behalf of the user.
+
+The first part of this guide contains instructions that are common to both integration approaches. The second part provides advanced instructions only needed for app-managed server integrations.
+
+## General Instructions
+
+### Identifying Compatible Devices
+
+AMD Ryzen™ AI `Hybrid` models are available on Windows 11 on all AMD Ryzen™ AI 300 Series Processors. To programmatically identify supported devices, we recommend using a regular expression that checks if the CPU name contains "Ryzen AI" and a 3-digit number starting with 3, as shown below.
+
+```
+Ryzen AI.*\b3\d{2}\b
+```
+
+Explanation:
+- `Ryzen AI`: Matches the literal phrase "Ryzen AI".
+- `.*`: Allows any characters (including spaces) to appear after "Ryzen AI".
+- `\b3\d{2}\b`: Matches a three-digit number starting with 3, ensuring it's a standalone number.
+
+There are several ways to check the CPU name on a Windows computer. A reliable way of doing so is through cmd's `reg query` command, as shown below.
+
+```
+reg query "HKEY_LOCAL_MACHINE\HARDWARE\DESCRIPTION\System\CentralProcessor\0" /v ProcessorNameString
+```
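Putting the registry query and the regular expression together, a hedged Python sketch of the same check (Windows-only, standard library):

```python
# Hedged sketch: detect a Ryzen AI 300-series processor by reading the CPU name
# from the registry and applying the regular expression recommended above.
import re
import winreg  # Windows-only standard-library module

def is_ryzen_ai_300_series() -> bool:
    key = winreg.OpenKey(
        winreg.HKEY_LOCAL_MACHINE,
        r"HARDWARE\DESCRIPTION\System\CentralProcessor\0",
    )
    cpu_name, _ = winreg.QueryValueEx(key, "ProcessorNameString")
    return re.search(r"Ryzen AI.*\b3\d{2}\b", cpu_name) is not None
```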
+### Downloading Server Installer
+
+The recommended way of directing users to the server installer is pointing users to our releases page at [`https://github.com/onnx/turnkeyml/releases`](https://github.com/onnx/turnkeyml/releases). Alternatively, you may also provide the direct path to the installer itself or download the installer programmatically, as shown below:
+
+Latest version:
+
+```bash
+https://github.com/onnx/turnkeyml/releases/latest/download/Lemonade_Server_Installer.exe
+```
+
+Specific version:
+
+```bash
+https://github.com/onnx/turnkeyml/releases/download/v6.0.0/Lemonade_Server_Installer.exe
+```
+
+Please note that the Server Installer is only available on Windows. Apps that integrate with our server on a Linux machine must install Lemonade from source as described [here](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/getting_started.md#from-source-code).
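For the programmatic route, a minimal standard-library sketch that fetches the "latest version" URL above (no extra dependencies assumed):

```python
# Hedged sketch: download the installer programmatically.
import urllib.request

URL = "https://github.com/onnx/turnkeyml/releases/latest/download/Lemonade_Server_Installer.exe"
urllib.request.urlretrieve(URL, "Lemonade_Server_Installer.exe")
```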
+## Stand-Alone Server Integration
+
+Some apps might prefer to be responsible for installing and managing Lemonade Server on behalf of the user. This part of the guide includes steps for installing and running Lemonade Server so that your users don't have to install Lemonade Server separately.
+
+Definitions:
+- "Silent installation" refers to an automatic command for installing Lemonade Server without running any GUI or prompting the user for any questions. It does assume that the end-user fully accepts the license terms, so be sure that your own application makes this clear to the user.
+- Command line usage allows the server process to be launched programmatically, so that your application can manage starting and stopping the server process on your user's behalf.
+
+### Silent Installation
+
+Silent installation runs `Lemonade_Server_Installer.exe` without a GUI and automatically accepts all prompts.
+
+In a `cmd.exe` terminal:
+
+Install *with* Ryzen AI hybrid support:
+
+```bash
+Lemonade_Server_Installer.exe /S /Extras=hybrid
+```
+
+Install *without* Ryzen AI hybrid support:
+
+```bash
+Lemonade_Server_Installer.exe /S
+```
+
+The install directory can also be changed from the default by using `/D` as the last argument.
+
+For example:
+
+```bash
+Lemonade_Server_Installer.exe /S /Extras=hybrid /D=C:\a\new\path
+```
+
+Only `Qwen2.5-0.5B-Instruct-CPU` is installed by default in silent mode. If you wish to select additional models to download in silent mode, you may use the `/Models` argument.
+
+```bash
+Lemonade_Server_Installer.exe /S /Extras=hybrid /Models="Qwen2.5-0.5B-Instruct-CPU Llama-3.2-1B-Instruct-Hybrid"
+```
+
+The available models are the following:
+* `Qwen2.5-0.5B-Instruct-CPU`
+* `Llama-3.2-1B-Instruct-Hybrid`
+* `Llama-3.2-3B-Instruct-Hybrid`
+* `Phi-3-Mini-Instruct-Hybrid`
+* `Qwen-1.5-7B-Chat-Hybrid`
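If your app drives the installer itself, the silent flags above can be invoked from code. A hedged sketch, assuming the `.exe` has already been downloaded into the working directory:

```python
# Hedged sketch: silent install with hybrid support and two models preselected.
import subprocess

subprocess.run(
    [
        "Lemonade_Server_Installer.exe",
        "/S",
        "/Extras=hybrid",
        "/Models=Qwen2.5-0.5B-Instruct-CPU Llama-3.2-1B-Instruct-Hybrid",
    ],
    check=True,
)
```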
+### Command Line Invocation
+
+Command line invocation starts the Lemonade Server process so that your application can connect to it via REST API endpoints.
+
+#### Foreground Process
+
+These steps will open the Lemonade Server in a terminal window that is visible to users. The user can exit the server by closing the window.
+
+In a `cmd.exe` terminal:
+
+```bash
+conda run --no-capture-output -p INSTALL_DIR\lemonade_server\lemon_env lemonade serve
+```
+
+Where `INSTALL_DIR` is the installation path of `lemonade_server`.
+
+For example, if you used the default installation directory and your username is USERNAME:
+
+```bash
+C:\Windows\System32\cmd.exe /C conda run --no-capture-output -p C:\Users\USERNAME\AppData\Local\lemonade_server\lemon_env lemonade serve
+```
+
+#### Background Process
+
+This command will open the Lemonade Server without opening a window. Your application needs to manage terminating the process and any child processes it creates.
+
+In a powershell terminal:
+
+```powershell
+$serverProcess = Start-Process -FilePath "C:\Windows\System32\cmd.exe" -ArgumentList "/C conda run --no-capture-output -p INSTALL_DIR\lemonade_server\lemon_env lemonade serve" -RedirectStandardOutput lemonade_out.txt -RedirectStandardError lemonade_err.txt -PassThru -NoNewWindow
+```
+
+Where `INSTALL_DIR` is the installation path of `lemonade_server`.
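The same launch/stop lifecycle can also be managed from Python instead of PowerShell; a hedged sketch, with `INSTALL_DIR` remaining a placeholder for the installation path:

```python
# Hedged sketch: start the server as a background process and stop it later.
import subprocess

server = subprocess.Popen(
    r"conda run --no-capture-output -p INSTALL_DIR\lemonade_server\lemon_env lemonade serve",
    stdout=open("lemonade_out.txt", "w"),
    stderr=open("lemonade_err.txt", "w"),
    shell=True,
)

# ... interact with the REST API while the server runs ...

server.terminate()  # your app must also clean up any child processes it spawned
```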

docs/lemonade/server_spec.md (+2/-2)

@@ -9,7 +9,7 @@ We are also actively investigating and developing [additional endpoints](#additi
 ### OpenAI-Compatible Endpoints
 - POST `/api/v0/chat/completions` - Chat Completions (messages -> completion)
 - POST `/api/v0/completions` - Text Completions (prompt -> completion)
-- GET `/api/v0/models` - List available models
+- GET `/api/v0/models` - List models available locally

 ### Additional Endpoints

@@ -165,7 +165,7 @@ The following format is used for both streaming and non-streaming responses:

 ### `GET /api/v0/models` <sub>![Status](https://img.shields.io/badge/status-fully_available-green)</sub>

-Returns a list of key models available on the server in an OpenAI-compatible format. This list is curated based on what works best for Ryzen AI Hybrid. Additional models can be loaded via the `/api/v0/load` endpoint by specifying the Hugging Face checkpoint.
+Returns a list of key models available on the server in an OpenAI-compatible format. This list is curated based on what works best for Ryzen AI Hybrid. Only models available locally are shown.

 #### Parameters
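A hedged sketch of calling the updated endpoint (assumes a server running on the default port 8000 and the `requests` package installed; the response shape follows the `models()` handler in `src/lemonade/tools/serve.py` below):

```python
# Hedged sketch: list the locally available models from a running server.
import requests

resp = requests.get("http://localhost:8000/api/v0/models")
resp.raise_for_status()
for model in resp.json()["data"]:
    print(model["id"])  # e.g. "Qwen2.5-0.5B-Instruct-CPU"
```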

examples/lemonade/README.md (+9/-1)

@@ -1,6 +1,14 @@
 # Lemonade Examples

-This folder contains examples of how to use `lemonade` via the high-level APIs. These APIs make it easy to load a model, generate responses, and also show how to stream those responses.
+This folder contains examples of how to deploy `lemonade` into applications.
+
+## Server Examples
+
+The `server/` folder contains examples of how to use Lemonade Server with existing applications that support server interfaces. Learn more in `server/README.md`.
+
+## API Examples
+
+This folder has examples of using the Lemonade API to integrate LLMs into Python applications. These APIs make it easy to load a model, generate responses, and also show how to stream those responses.

 The `demos/` folder also contains some higher-level application demos of the APIs. Learn more in `demos/README.md`.

examples/lemonade/server/README.md (+8, new file)

@@ -0,0 +1,8 @@
+# Lemonade Server Examples
+
+The guides in this folder help you connect Lemonade Server to applications.
+
+| App | Guide |
+|-----|-------|
+| [Open WebUI](https://github.com/open-webui/open-webui) | [How to chat with lemonade LLMs in Open WebUI](https://ryzenai.docs.amd.com/en/latest/llm/server_interface.html#open-webui-demo) |
+| [Continue](https://www.continue.dev/) | [How to use lemonade LLMs as a coding assistant in Continue](continue.md) |

examples/lemonade/server/continue.md (+57, new file)

@@ -0,0 +1,57 @@
+# Continue Coding Assistant
+
+## Overview
+
+[Continue](https://www.continue.dev/) is a coding assistant that lives inside of a VS Code extension. It supports chatting with your codebase, making edits, and a lot more.
+
+## Expectations
+
+We have found that the `Qwen-1.5-7B-Chat-Hybrid` model is the best Hybrid model available for coding. It is good at chatting with a few files at a time in your codebase to learn more about them. It can also make simple code editing suggestions pertaining to a few lines of code at a time.
+
+However, we do not recommend using this model for analyzing large codebases at once or making large or complex file edits.
+
+## Setup
+
+### Prerequisites
+
+1. Install Lemonade Server using the [installer .exe](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/lemonade_server_exe.md#lemonade-server-installer).
+
+### Install Continue
+
+> Note: Continue provides its own instructions [here](https://marketplace.visualstudio.com/items?itemName=Continue.continue).
+
+1. Open the Extensions tab in the VS Code Activity Bar.
+1. Search "Continue - Codestral, Claude, and more" in the Extensions Marketplace search bar.
+1. Select the Continue extension and click install.
+
+This will add a Continue tab to your VS Code Activity Bar.
+
+### Add Lemonade Server to Continue
+
+> Note: The following instructions are based on instructions from Continue found [here](https://docs.continue.dev/customize/model-providers/openai#openai-compatible-servers--apis).
+
+1. Open the Continue tab in your VS Code Activity Bar.
+1. Click the gear icon at the top to open Settings.
+1. Under "Configuration", click "Open Config File".
+1. Replace the "models" key in the `config.json` with the following and save:
+
+```json
+"models": [
+  {
+    "title": "Lemonade",
+    "provider": "openai",
+    "model": "Qwen-1.5-7B-Chat-Hybrid",
+    "apiKey": "-",
+    "apiBase": "http://localhost:8000/api/v0"
+  }
+],
+```
+
+## Usage
+
+> Note: See the Continue [user guide](https://docs.continue.dev/) to learn about all of its features.
+
+To try out Continue:
+- Open the Continue tab in your VS Code Activity Bar, and in the "Ask anything" box, type a question about your code. Use the `@` symbol to specify a file or two.
+  - Example: "What's the fastest way to install lemonade in @getting_started.md?"
+- Open a file, select some code, and push Ctrl+I to start a chat about editing that code.

examples/readme.md (+1/-1)

@@ -3,5 +3,5 @@
 This directory contains examples to help you learn how to use the tools. The examples are split up into these sub-directories:
 1. `examples/lemonade`: scripts that demonstrate the `lemonade` CLI for LLMs.
 1. `examples/turnkey/cli`: a tutorial series for the `turnkey` CLI. This is the recommended starting point.
-1. `examples/turnkey/api`: scripts that demonstrate how to use the `turnkey.evaluate_files()` API.
+1. `examples/turnkey/api`: scripts that demonstrate how to use the `turnkey.files_api.evaluate_files()` API.

installer/Installer.nsi (+123/-3)

@@ -236,6 +236,54 @@ Section "Install Ryzen AI Hybrid Execution" HybridSec
   end:
 SectionEnd

+SubSection /e "Selected Models" ModelsSec
+  Section /o "Qwen2.5-0.5B-Instruct-CPU" Qwen05Sec
+    SectionIn 1
+    AddSize 999604 ;
+    StrCpy $9 "$9Qwen2.5-0.5B-Instruct-CPU "
+  SectionEnd
+
+  Section "Llama-3.2-1B-Instruct-Hybrid" Llama1BSec
+    SectionIn 1
+    AddSize 1884397 ;
+    StrCpy $9 "$9Llama-3.2-1B-Instruct-Hybrid "
+  SectionEnd
+
+  Section "Llama-3.2-3B-Instruct-Hybrid" Llama3BSec
+    SectionIn 1
+    AddSize 4268402 ;
+    StrCpy $9 "$9Llama-3.2-3B-Instruct-Hybrid "
+  SectionEnd
+
+  Section /o "Phi-3-Mini-Instruct-Hybrid" PhiSec
+    SectionIn 1
+    AddSize 4185551 ;
+    StrCpy $9 "$9Phi-3-Mini-Instruct-Hybrid "
+  SectionEnd
+
+  Section /o "Qwen-1.5-7B-Chat-Hybrid" Qwen7BSec
+    SectionIn 1
+    AddSize 8835894 ;
+    StrCpy $9 "$9Qwen-1.5-7B-Chat-Hybrid "
+  SectionEnd
+
+  Section "-Download Models" DownloadModels
+    ${If} ${Silent}
+      ${GetParameters} $CMDLINE
+      ${GetOptions} $CMDLINE "/Models=" $R0
+      ${If} $R0 != ""
+        nsExec::ExecToLog 'conda run --no-capture-output -p $INSTDIR\$LEMONADE_CONDA_ENV lemonade-install --models $R0'
+      ${Else}
+        ; Otherwise, only the default CPU model will be installed
+        nsExec::ExecToLog 'conda run --no-capture-output -p $INSTDIR\$LEMONADE_CONDA_ENV lemonade-install --models Qwen2.5-0.5B-Instruct-CPU'
+      ${EndIf}
+    ${Else}
+      nsExec::ExecToLog 'conda run --no-capture-output -p $INSTDIR\$LEMONADE_CONDA_ENV lemonade-install --models $9'
+    ${EndIf}
+  SectionEnd
+SubSectionEnd
+
 Section "-Add Desktop Shortcut" ShortcutSec
   ; Create a desktop shortcut that passes the conda environment name as a parameter
   CreateShortcut "$DESKTOP\lemonade-server.lnk" "$INSTDIR\run_server.bat" "$LEMONADE_CONDA_ENV" "$INSTDIR\img\favicon.ico"

@@ -259,12 +307,68 @@ FunctionEnd
 !define MUI_FINISHPAGE_RUN_TEXT "Run Lemonade Server"

 Function .onSelChange
+  ; Check hybrid selection status
   StrCpy $HYBRID_SELECTED "false"
   SectionGetFlags ${HybridSec} $0
   IntOp $0 $0 & ${SF_SELECTED}
-  StrCmp $0 ${SF_SELECTED} 0 +2
+  StrCmp $0 ${SF_SELECTED} 0 hybrid_disabled
   StrCpy $HYBRID_SELECTED "true"
-  ;MessageBox MB_OK "Component 2 is selected"
+
+  ; If hybrid is enabled, check if at least one hybrid model is selected
+  SectionGetFlags ${Llama1BSec} $1
+  IntOp $1 $1 & ${SF_SELECTED}
+  ${If} $1 == ${SF_SELECTED}
+    Goto end
+  ${EndIf}
+
+  SectionGetFlags ${Llama3BSec} $1
+  IntOp $1 $1 & ${SF_SELECTED}
+  ${If} $1 == ${SF_SELECTED}
+    Goto end
+  ${EndIf}
+
+  SectionGetFlags ${PhiSec} $1
+  IntOp $1 $1 & ${SF_SELECTED}
+  ${If} $1 == ${SF_SELECTED}
+    Goto end
+  ${EndIf}
+
+  SectionGetFlags ${Qwen7BSec} $1
+  IntOp $1 $1 & ${SF_SELECTED}
+  ${If} $1 == ${SF_SELECTED}
+    Goto end
+  ${EndIf}
+
+  ; If no hybrid model is selected, select Llama-1B by default
+  SectionGetFlags ${Llama1BSec} $1
+  IntOp $1 $1 | ${SF_SELECTED}
+  SectionSetFlags ${Llama1BSec} $1
+  MessageBox MB_OK "At least one hybrid model must be selected when hybrid execution is enabled. Llama-3.2-1B-Instruct-Hybrid has been automatically selected."
+  Goto end
+
+  hybrid_disabled:
+  ; When hybrid is disabled, select Qwen2.5-0.5B-Instruct-CPU and disable all other hybrid model selections
+  SectionGetFlags ${Qwen05Sec} $1
+  IntOp $1 $1 | ${SF_SELECTED}
+  SectionSetFlags ${Qwen05Sec} $1
+
+  SectionGetFlags ${Llama1BSec} $1
+  IntOp $1 $1 & ${SECTION_OFF}
+  SectionSetFlags ${Llama1BSec} $1
+
+  SectionGetFlags ${Llama3BSec} $1
+  IntOp $1 $1 & ${SECTION_OFF}
+  SectionSetFlags ${Llama3BSec} $1
+
+  SectionGetFlags ${PhiSec} $1
+  IntOp $1 $1 & ${SECTION_OFF}
+  SectionSetFlags ${PhiSec} $1
+
+  SectionGetFlags ${Qwen7BSec} $1
+  IntOp $1 $1 & ${SECTION_OFF}
+  SectionSetFlags ${Qwen7BSec} $1
+
+  end:
 FunctionEnd

 Function SkipLicense

@@ -276,6 +380,7 @@ FunctionEnd
 ; MUI Settings
 !insertmacro MUI_PAGE_WELCOME
+!define MUI_COMPONENTSPAGE_SMALLDESC
 !insertmacro MUI_PAGE_COMPONENTS

 !define MUI_PAGE_CUSTOMFUNCTION_PRE SkipLicense

@@ -307,18 +412,33 @@ LangString MUI_BUTTONTEXT_FINISH "${LANG_ENGLISH}" "Finish"
 LangString MUI_TEXT_LICENSE_TITLE ${LANG_ENGLISH} "AMD License Agreement"
 LangString MUI_TEXT_LICENSE_SUBTITLE ${LANG_ENGLISH} "Please review the license terms before installing AMD Ryzen AI Hybrid Execution Mode."
 LangString DESC_SEC01 ${LANG_ENGLISH} "The minimum set of dependencies for a lemonade server that runs LLMs on CPU."
-LangString DESC_HybridSec ${LANG_ENGLISH} "Add support for running LLMs on Ryzen AI hybrid execution mode, which uses both the NPU and iGPU for improved performance. Only available on Ryzen AI 300-series processors."
+LangString DESC_HybridSec ${LANG_ENGLISH} "Add support for running LLMs on Ryzen AI hybrid execution mode. Only available on Ryzen AI 300-series processors."
+LangString DESC_ModelsSec ${LANG_ENGLISH} "Select which models to install"
+LangString DESC_Qwen05Sec ${LANG_ENGLISH} "Small CPU-only Qwen model"
+LangString DESC_Llama1BSec ${LANG_ENGLISH} "1B parameter Llama model with hybrid execution"
+LangString DESC_Llama3BSec ${LANG_ENGLISH} "3B parameter Llama model with hybrid execution"
+LangString DESC_PhiSec ${LANG_ENGLISH} "Phi-3 Mini model with hybrid execution"
+LangString DESC_Qwen7BSec ${LANG_ENGLISH} "7B parameter Qwen model with hybrid execution"

 ; Insert the description macros
 !insertmacro MUI_FUNCTION_DESCRIPTION_BEGIN
 !insertmacro MUI_DESCRIPTION_TEXT ${SEC01} $(DESC_SEC01)
 !insertmacro MUI_DESCRIPTION_TEXT ${HybridSec} $(DESC_HybridSec)
+!insertmacro MUI_DESCRIPTION_TEXT ${ModelsSec} $(DESC_ModelsSec)
+!insertmacro MUI_DESCRIPTION_TEXT ${Qwen05Sec} $(DESC_Qwen05Sec)
+!insertmacro MUI_DESCRIPTION_TEXT ${Llama1BSec} $(DESC_Llama1BSec)
+!insertmacro MUI_DESCRIPTION_TEXT ${Llama3BSec} $(DESC_Llama3BSec)
+!insertmacro MUI_DESCRIPTION_TEXT ${PhiSec} $(DESC_PhiSec)
+!insertmacro MUI_DESCRIPTION_TEXT ${Qwen7BSec} $(DESC_Qwen7BSec)
 !insertmacro MUI_FUNCTION_DESCRIPTION_END

 Function .onInit
   StrCpy $LEMONADE_SERVER_STRING "Lemonade Server"
   StrCpy $LEMONADE_CONDA_ENV "lemon_env"
   StrCpy $HYBRID_SELECTED "true"
+
+  ; Create a variable to store selected models
+  StrCpy $9 "" ; $9 will hold our list of selected models

   ; Set the install directory, allowing /D override from CLI install
   ${If} $InstDir != ""

setup.py (+1/-1)

@@ -105,7 +105,7 @@
     classifiers=[],
     entry_points={
         "console_scripts": [
-            "turnkey=turnkeyml:turnkeycli",
+            "turnkey=turnkeyml.cli.cli:main",
             "turnkey-llm=lemonade:lemonadecli",
             "lemonade=lemonade:lemonadecli",
             "lemonade-install=lemonade_install:installcli",

src/lemonade/cli.py (+18/-23)

@@ -1,14 +1,14 @@
 import os
+from turnkeyml import __version__ as version_number
 from turnkeyml.tools import FirstTool, NiceHelpFormatter
 import turnkeyml.common.filesystem as fs
-import turnkeyml.cli.cli as cli
+import turnkeyml.common.cli_helpers as cli
 from turnkeyml.sequence import Sequence
 from turnkeyml.tools.management_tools import Cache, Version, SystemInfo
 from turnkeyml.state import State

 from lemonade.tools.huggingface_load import (
     HuggingfaceLoad,
-    AdaptHuggingface,
 )

 from lemonade.tools.huggingface_bench import HuggingfaceBench

@@ -38,7 +38,6 @@ def main():
         AccuracyHumaneval,
         AccuracyPerplexity,
         LLMPrompt,
-        AdaptHuggingface,
         HuggingfaceBench,
         OgaBench,
         QuarkQuantize,

@@ -62,49 +61,46 @@ def main():

     # Define the argument parser
     parser = cli.CustomArgumentParser(
-        description="Turnkey analysis and benchmarking of GenAI models. "
-        "This utility is a toolchain. To use it, provide a list of tools and "
-        "their arguments.",
+        description=f"""Tools for evaluating and deploying LLMs (v{version_number}).
+
+Read this to learn the command syntax:
+https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/getting_started.md""",
         formatter_class=NiceHelpFormatter,
     )

     parser.add_argument(
         "-i",
         "--input",
-        help="The input that will be evaluated by the tool sequence "
-        "(e.g., huggingface checkpoints)",
+        help="The input that will be evaluated by the starting tool "
+        "(e.g., huggingface checkpoint)",
     )

     parser.add_argument(
         "-d",
         "--cache-dir",
-        help="Cache directory where the results of each tool will "
-        f"be stored (defaults to {cache.DEFAULT_CACHE_DIR})",
+        help="Cache directory where tool results are "
+        f"stored (default: {cache.DEFAULT_CACHE_DIR})",
         required=False,
         default=cache.DEFAULT_CACHE_DIR,
     )

-    parser.add_argument(
-        "--lean-cache",
-        dest="lean_cache",
-        help="Delete all build artifacts (e.g., .onnx files) when the command completes",
-        action="store_true",
-    )
-
+    memory_tracking_default_interval = 0.25
     parser.add_argument(
         "-m",
         "--memory",
         nargs="?",
         metavar="TRACK_INTERVAL",
         type=float,
         default=None,
-        const=0.25,
-        help="Track physical memory usage during the build and generate a plot when the "
-        "command completes. Optionally, specify the tracking interval (sec), "
-        "defaults to 0.25 sec.",
+        const=memory_tracking_default_interval,
+        help="Track memory usage and plot the results. "
+        "Optionally, set the tracking interval in seconds "
+        f"(default: {memory_tracking_default_interval})",
     )

-    global_args, tool_instances, evaluation_tools = cli.parse_tools(parser, tools)
+    global_args, tool_instances, evaluation_tools = cli.parse_tools(
+        parser, tools, cli_name="lemonade"
+    )

     if len(evaluation_tools) > 0:
         if not issubclass(evaluation_tools[0], FirstTool):

@@ -128,7 +124,6 @@ def main():
         )
         sequence.launch(
             state,
-            lean_cache=global_args["lean_cache"],
             track_memory_interval=global_args["memory"],
         )
     else:

src/lemonade/tools/huggingface_bench.py (+1/-1)

@@ -122,7 +122,7 @@ def parser(parser: argparse.ArgumentParser = None, add_help: bool = True):
         # Allow inherited classes to initialize and pass in a parser, add parameters to it if so
         if parser is None:
             parser = __class__.helpful_parser(
-                short_description="Benchmark a torch.nn.Module LLM",
+                short_description="Benchmark a huggingface-style PyTorch LLM",
                 add_help=add_help,
             )

src/lemonade/tools/huggingface_load.py (+3/-78)

@@ -6,8 +6,8 @@
 from huggingface_hub import model_info
 from turnkeyml.state import State
 import turnkeyml.common.status as status
-from turnkeyml.tools import Tool, FirstTool
-from lemonade.tools.adapter import ModelAdapter, TokenizerAdapter
+from turnkeyml.tools import FirstTool
+from lemonade.tools.adapter import TokenizerAdapter
 from lemonade.cache import Keys

 # Command line interfaces for tools will use string inputs for data

@@ -110,7 +110,7 @@ def __init__(self):
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
-            short_description="Load an LLM as torch.nn.Module using huggingface from_pretrained()",
+            short_description="Load an LLM in PyTorch using huggingface transformers",
             add_help=add_help,
         )

@@ -239,78 +239,3 @@ def run(
         status.add_to_state(state=state, name=input, model=model)

         return state
-
-
-class HuggingfaceAdapter(ModelAdapter):
-    """
-    Wrapper class for Huggingface LLMs that set generate() arguments to
-    make them more accurate and pleasant to chat with:
-
-    repetition_penalty: helps the LLM avoid repeating the same short
-    phrase in the response over and over.
-    temperature: helps the LLM stay focused on the prompt.
-    do_sample: apply the temperature.
-    """
-
-    def __init__(self, model, dtype=torch.float32, device="cpu"):
-        super().__init__()
-        self.model = model
-        self.dtype = dtype
-        self.device = device
-
-    def generate(
-        self,
-        input_ids,
-        max_new_tokens=512,
-        repetition_penalty=1.2,
-        do_sample=True,
-        temperature=0.1,
-        **kwargs,
-    ):
-        amp_enabled = (
-            True
-            if (self.dtype == torch.float16 or self.dtype == torch.bfloat16)
-            else False
-        )
-
-        # Move input_ids to the same device as the model
-        input_ids = input_ids.to(self.device)
-
-        with torch.no_grad(), torch.inference_mode(), torch.cpu.amp.autocast(
-            enabled=amp_enabled, dtype=self.dtype
-        ):
-            return self.model.generate(
-                input_ids=input_ids,
-                max_new_tokens=max_new_tokens,
-                repetition_penalty=repetition_penalty,
-                do_sample=do_sample,
-                temperature=temperature,
-                **kwargs,
-            )
-
-
-class AdaptHuggingface(Tool):
-    """
-    Apply specific settings to make Huggingface LLMs
-    more accurate and pleasant to chat with.
-    """
-
-    unique_name = "adapt-huggingface"
-
-    def __init__(self):
-        super().__init__(monitor_message="Adapting Huggingface LLM")
-
-    @staticmethod
-    def parser(add_help: bool = True) -> argparse.ArgumentParser:
-        parser = __class__.helpful_parser(
-            short_description="Apply accuracy-boosting settings to huggingface LLMs",
-            add_help=add_help,
-        )
-
-        return parser
-
-    def run(self, state: State) -> State:
-
-        state.model = HuggingfaceAdapter(state.model, state.dtype, state.device)
-
-        return state

src/lemonade/tools/humaneval.py (+1/-1)

@@ -42,7 +42,7 @@ def __init__(self):
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
-            short_description="Run accuracy benchmark using HumanEval dataset",
+            short_description="Measure coding accuracy with HumanEval",
             add_help=add_help,
         )
         parser.add_argument(

src/lemonade/tools/llamacpp.py (+1/-1)

@@ -152,7 +152,7 @@ def __init__(self):
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
-            short_description="Wrap Llamacpp models with an API",
+            short_description="Wrap llama.cpp models with an API",
             add_help=add_help,
         )

src/lemonade/tools/llamacpp_bench.py (+1/-1)

@@ -24,7 +24,7 @@ def __init__(self):
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
-            short_description="Benchmark a Llamacpp model",
+            short_description="Benchmark a llama.cpp model",
             add_help=add_help,
         )

src/lemonade/tools/mmlu.py (+2/-2)

@@ -43,8 +43,8 @@ def __init__(self):
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
-            short_description="Run accuracy benchmark using Massive Multitask "
-            "Language Understanding (MMLU) test",
+            short_description="Measure accuracy with Massive Multitask "
+            "Language Understanding (MMLU)",
             add_help=add_help,
         )

src/lemonade/tools/ort_genai/oga.py (+4)

@@ -162,6 +162,10 @@ def generate(
             past_present_share_buffer=search_config.get(
                 "past_present_share_buffer", True
             ),
+            # Make sure that results do not vary across laptops;
+            # by default, random_seed=-1 causes different laptops to give
+            # different results
+            random_seed=1,
             # Not currently supported by OGA
             # diversity_penalty=search_config.get('diversity_penalty', 0.0),
             # no_repeat_ngram_size=search_config.get('no_repeat_ngram_size', 0),

src/lemonade/tools/perplexity.py (+2/-2)

@@ -12,7 +12,7 @@

 class AccuracyPerplexity(Tool):
     """
-    Measure perplexity of an LLM using the wikitext dataset.
+    Measure perplexity of an LLM using the Wikitext-2 dataset.

     Required input state:
     - state.model: instance that provides a __call__() method that returns

@@ -32,7 +32,7 @@ def __init__(self):
     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
-            short_description="Measure Perplexity score using Wikitext-2 dataset",
+            short_description="Measure perplexity score",
             add_help=add_help,
         )
         return parser

src/lemonade/tools/serve.py (+72/-35)

@@ -4,8 +4,9 @@
 import time
 from threading import Thread, Event
 import logging
+import traceback

-from fastapi import FastAPI, HTTPException, status
+from fastapi import FastAPI, HTTPException, status, Request
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel

@@ -24,10 +25,9 @@
 from turnkeyml.state import State
 from turnkeyml.tools.management_tools import ManagementTool
 from lemonade.tools.adapter import ModelAdapter
-from lemonade.tools.prompt import DEFAULT_GENERATE_PARAMS
 from lemonade.tools.huggingface_load import HuggingfaceLoad
 from lemonade.cache import DEFAULT_CACHE_DIR
-
+from lemonade_install.install import ModelManager

 # Set to a high number to allow for interesting experiences in real apps
 # Tests should use the max_new_tokens argument to set a lower value

@@ -36,6 +36,34 @@
 DEFAULT_PORT = 8000
 DEFAULT_LOG_LEVEL = "info"

+LOCAL_MODELS = ModelManager().downloaded_models_enabled
+
+
+class GeneratorThread(Thread):
+    """
+    Thread class designed for use with streaming generation within
+    an LLM server. It needs access to the streamer in order
+    to help the completions APIs escape the "for text in streamer" loop.
+    It also provides exception handling that works nicely with HTTP
+    servers by providing the stack trace and making the exception
+    information available to the main thread.
+    """
+
+    def __init__(self, streamer, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.exception = None
+        self.streamer = streamer
+
+    def run(self):
+        try:
+            if self._target:
+                self._target(*self._args, **self._kwargs)
+        except Exception as e:  # pylint: disable=broad-except
+            self.exception = e
+            logging.error(f"Exception raised in generate thread: {e}")
+            traceback.print_exc()
+            self.streamer.done()
+

 # Custom huggingface-style stopping criteria to allow
 # us to halt streaming in-progress generations

@@ -150,6 +178,7 @@ def __init__(self):
         self.input_tokens = None
         self.output_tokens = None
         self.decode_token_times = None
+        self.process_time = None

         # Store debug logging state
         self.debug_logging_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)

@@ -169,36 +198,15 @@ def __init__(self):
         self._generate_semaphore = asyncio.Semaphore(self.max_concurrent_generations)

         # Curated list of "Instruct" and "Chat" models.
-        self.builtin_models = {
-            "Qwen2.5-0.5B-Instruct-CPU": {
-                "checkpoint": "Qwen/Qwen2.5-0.5B-Instruct",
-                "device": "cpu",
-            },
-            "Llama-3.2-1B-Instruct-Hybrid": {
-                "checkpoint": "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
-                "device": "hybrid",
-            },
-            "Llama-3.2-3B-Instruct-Hybrid": {
-                "checkpoint": "amd/Llama-3.2-3B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
-                "device": "hybrid",
-            },
-            "Phi-3-Mini-Instruct-Hybrid": {
-                "checkpoint": "amd/Phi-3-mini-4k-instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
-                "device": "hybrid",
-            },
-            "Qwen-1.5-7B-Chat-Hybrid": {
-                "checkpoint": "amd/Qwen1.5-7B-Chat-awq-g128-int4-asym-fp16-onnx-hybrid",
-                "device": "hybrid",
-            },
-        }
+        self.local_models = LOCAL_MODELS

         # Add lock for load/unload operations
         self._load_lock = asyncio.Lock()

     @staticmethod
     def parser(add_help: bool = True) -> argparse.ArgumentParser:
         parser = __class__.helpful_parser(
-            short_description="Industry Standard Model Server",
+            short_description="Launch an industry-standard LLM server",
             add_help=add_help,
         )

@@ -272,6 +280,10 @@ def trace(message, *args, **kwargs):
         # Update debug logging state after setting log level
         self.debug_logging_enabled = logging.getLogger().isEnabledFor(logging.DEBUG)

+        if self.debug_logging_enabled:
+            # Print the elapsed time for each request
+            self.setup_middleware_timer()
+
         # Only load the model when starting the server if checkpoint was provided
         if checkpoint:
             config = LoadConfig(

@@ -297,6 +309,7 @@ async def _show_telemetry(self):
             ["Output tokens", self.output_tokens],
             ["TTFT (s)", f"{self.time_to_first_token:.2f}"],
             ["TPS", f"{self.tokens_per_second:.2f}"],
+            ["Total time (s)", f"{self.process_time:.2f}"],
         ]

         table = tabulate(

@@ -313,8 +326,8 @@ async def completions(self, completion_request: CompletionRequest):
         if completion_request.model:

             # Get model config
-            if completion_request.model in self.builtin_models:
-                model_config = self.builtin_models[completion_request.model]
+            if completion_request.model in self.local_models:
+                model_config = self.local_models[completion_request.model]
                 lc = LoadConfig(**model_config)
             else:
                 # If the model is not built-in, we assume it corresponds to a checkpoint

@@ -394,8 +407,8 @@ async def chat_completions(self, chat_completion_request: ChatCompletionRequest)
         """

         # Get model config
-        if chat_completion_request.model in self.builtin_models:
-            model_config = self.builtin_models[chat_completion_request.model]
+        if chat_completion_request.model in self.local_models:
+            model_config = self.local_models[chat_completion_request.model]
             lc = LoadConfig(**model_config)
         else:
             # If the model is not built-in, we assume it corresponds to a checkpoint

@@ -548,7 +561,6 @@ async def _generate_tokens(self, message: str, stop: list[str] | str | None = No
             "min_new_tokens": 1,
             "pad_token_id": tokenizer.eos_token_id,
             "stopping_criteria": stopping_criteria,
-            **DEFAULT_GENERATE_PARAMS,
         }

         # Initialize performance variables

@@ -558,7 +570,9 @@ async def _generate_tokens(self, message: str, stop: list[str] | str | None = No
         self.output_tokens = 0

         # Begin generation
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread = GeneratorThread(
+            streamer, target=model.generate, kwargs=generation_kwargs
+        )
         thread.start()

         # Acquire the generation semaphore

@@ -621,7 +635,15 @@ async def _generate_tokens(self, message: str, stop: list[str] | str | None = No
                 self.max_concurrent_generations
                 - self._generate_semaphore._value  # pylint: disable=protected-access
             )
-            logging.debug(f"Active generations: {active_generations}")
+
+            # Check if an exception occurred in the generation thread
+            # If it did, raise it as an HTTPException so that the client
+            # knows they won't be getting a completion
+            if thread.exception:
+                raise HTTPException(
+                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                    detail=f"Completion failure {thread.exception}",
+                )

             # Display telemetry if in debug mode
             await self._show_telemetry()

@@ -707,7 +729,7 @@ async def load_llm(self, config: LoadConfig):
                 input=config.checkpoint,
                 device=config.device,
                 dtype="int4",
-                force=True,
+                force=False,
             )
             self.max_new_tokens = config.max_new_tokens
             self.llm_loaded = config.checkpoint

@@ -766,7 +788,7 @@ async def models(self):
         Return a list of available models in OpenAI-compatible format.
         """
         models_list = []
-        for model in self.builtin_models:
+        for model in self.local_models:
             m = Model(
                 id=model,
                 owned_by="lemonade",

@@ -776,3 +798,18 @@ async def models(self):
             models_list.append(m)

         return {"object": "list", "data": models_list}
+
+    def setup_middleware_timer(self):
+        logging.info("Middleware set up")
+
+        @self.app.middleware("http")
+        async def save_process_time(request: Request, call_next):
+            """
+            Save the request processing time for any request, so that it can be
+            printed as telemetry.
+            """
+
+            start_time = time.perf_counter()
+            response = await call_next(request)
+            self.process_time = time.perf_counter() - start_time
+            return response
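The streaming path that `GeneratorThread` supports can be exercised end to end with any OpenAI-compatible client; a hedged sketch (default port 8000 assumed, `openai` package v1+):

```python
# Hedged sketch: stream a chat completion from a running Lemonade Server.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/api/v0", api_key="-")
stream = client.chat.completions.create(
    model="Llama-3.2-1B-Instruct-Hybrid",  # must be available locally
    messages=[{"role": "user", "content": "Tell me a joke."}],
    stream=True,
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="", flush=True)
```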

src/lemonade_install/install.py (+136/-3)

@@ -10,11 +10,12 @@
 import subprocess
 import sys
 import shutil
+import pkg_resources
 from pathlib import Path
 from typing import Optional
 import zipfile
 import requests
-
+import huggingface_hub

 lemonade_install_dir = Path(__file__).parent.parent.parent
 DEFAULT_AMD_OGA_NPU_DIR = os.path.join(

@@ -33,6 +34,123 @@
 )


+class ModelManager:
+
+    @property
+    def supported_cpu_models(self) -> dict:
+        """
+        Returns a dictionary of supported CPU models.
+        Note: Models must be downloaded before they are locally available.
+        """
+        return {
+            "Qwen2.5-0.5B-Instruct-CPU": {
+                "checkpoint": "Qwen/Qwen2.5-0.5B-Instruct",
+                "device": "cpu",
+            }
+        }
+
+    @property
+    def supported_hybrid_models(self) -> dict:
+        """
+        Returns a dictionary of supported hybrid models.
+        Note: Models must be downloaded before they are locally available.
+        """
+        return {
+            "Llama-3.2-1B-Instruct-Hybrid": {
+                "checkpoint": "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
+                "device": "hybrid",
+            },
+            "Llama-3.2-3B-Instruct-Hybrid": {
+                "checkpoint": "amd/Llama-3.2-3B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
+                "device": "hybrid",
+            },
+            "Phi-3-Mini-Instruct-Hybrid": {
+                "checkpoint": "amd/Phi-3-mini-4k-instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
+                "device": "hybrid",
+            },
+            "Qwen-1.5-7B-Chat-Hybrid": {
+                "checkpoint": "amd/Qwen1.5-7B-Chat-awq-g128-int4-asym-fp16-onnx-hybrid",
+                "device": "hybrid",
+            },
+        }
+
+    @property
+    def supported_models(self) -> dict:
+        """
+        Returns a dictionary of all supported models across all supported backends.
+        """
+        return {**self.supported_cpu_models, **self.supported_hybrid_models}
+
+    @property
+    def downloaded_hf_checkpoints(self) -> list[str]:
+        """
+        Returns a list of Hugging Face checkpoints that have been downloaded.
+        """
+        downloaded_hf_checkpoints = []
+        try:
+            hf_cache_info = huggingface_hub.scan_cache_dir()
+            downloaded_hf_checkpoints = [entry.repo_id for entry in hf_cache_info.repos]
+        except huggingface_hub.CacheNotFound:
+            pass
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            print(f"Error scanning Hugging Face cache: {e}")
+        return downloaded_hf_checkpoints
+
+    @property
+    def downloaded_cpu_models(self) -> dict:
+        """
+        Returns a dictionary of locally available CPU models.
+        """
+        downloaded_cpu_models = {}
+        for model in self.supported_cpu_models:
+            if (
+                self.supported_cpu_models[model]["checkpoint"]
+                in self.downloaded_hf_checkpoints
+            ):
+                downloaded_cpu_models[model] = self.supported_cpu_models[model]
+        return downloaded_cpu_models
+
+    @property
+    def downloaded_hybrid_models(self) -> dict:
+        """
+        Returns a dictionary of locally available hybrid models.
+        """
+        downloaded_hybrid_models = {}
+        for model in self.supported_hybrid_models:
+            if (
+                self.supported_hybrid_models[model]["checkpoint"]
+                in self.downloaded_hf_checkpoints
+            ):
+                downloaded_hybrid_models[model] = self.supported_hybrid_models[model]
+        return downloaded_hybrid_models
+
+    @property
+    def downloaded_models_enabled(self) -> dict:
+        """
+        Returns a dictionary of locally available models that are enabled by the current installation.
+        """
+        downloaded_models_enabled = self.downloaded_cpu_models.copy()
+        if (
+            "onnxruntime-vitisai" in pkg_resources.working_set.by_key
+            and "onnxruntime-genai-directml" in pkg_resources.working_set.by_key
+        ):
+            downloaded_models_enabled.update(self.downloaded_hybrid_models)
+        return downloaded_models_enabled
+
+    def download_models(self, models: list[str]):
+        """
+        Downloads the specified models from Hugging Face.
+        """
+        for model in models:
+            if model not in self.supported_models:
+                raise ValueError(
+                    f"Model {model} is not supported. Please choose from the following: {list(self.supported_models.keys())}"
+                )
+            checkpoint = self.supported_models[model]["checkpoint"]
+            print(f"Downloading {model} ({checkpoint})")
+            huggingface_hub.snapshot_download(repo_id=checkpoint)
+
+
 def download_lfs_file(token, file, output_filename):
     """Downloads a file from LFS"""
     # Set up the headers for the request

@@ -201,6 +319,14 @@ def parser() -> argparse.ArgumentParser:
         choices=["0.6.0"],
     )

+    parser.add_argument(
+        "--models",
+        help="One or more models to download",
+        type=str,
+        nargs="+",
+        choices=ModelManager().supported_models,
+    )
+
     return parser

@@ -209,11 +335,18 @@ def run(
     quark: Optional[str] = None,
     yes: bool = False,
     token: Optional[str] = None,
+    models: Optional[str] = None,
 ):
-    if ryzenai is None and quark is None:
+    if ryzenai is None and quark is None and models is None:
         raise ValueError(
-            "You must select something to install, for example `--ryzenai` and/or `--quark`"
+            "You must select something to install, for example `--ryzenai`, `--quark`, or `--models`"
         )
+
+    # Download models if needed
+    if models is not None:
+        model_manager = ModelManager()
+        model_manager.download_models(models)
+
     if ryzenai is not None:
         if ryzenai == "npu":
             file = "ryzen_ai_13_ga/npu-llm-artifacts_1.3.0.zip"
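A hedged sketch of how the new `ModelManager` surface fits together (names taken from the class above; cache contents will vary by machine):

```python
# Hedged sketch: inspect and download models with the new ModelManager.
from lemonade_install.install import ModelManager

mm = ModelManager()
print(list(mm.supported_models))           # every supported model name
print(list(mm.downloaded_models_enabled))  # models usable with this install

# Downloads go through huggingface_hub.snapshot_download(), per the code above:
mm.download_models(["Qwen2.5-0.5B-Instruct-CPU"])
```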

src/turnkeyml/__init__.py (-2)

@@ -1,5 +1,3 @@
 from turnkeyml.version import __version__

-from .files_api import evaluate_files
-from .cli.cli import main as turnkeycli
 from .state import load_state, State

‎src/turnkeyml/cli/cli.py

+3-137
@@ -1,39 +1,13 @@
-import argparse
-import sys
 import os
 from difflib import get_close_matches
-from typing import List, Dict, Tuple, Any
+from typing import List
 import turnkeyml.common.filesystem as fs
 from turnkeyml.sequence import Sequence
-from turnkeyml.tools import Tool, FirstTool, NiceHelpFormatter
+from turnkeyml.tools import FirstTool, NiceHelpFormatter
 from turnkeyml.sequence.tool_plugins import get_supported_tools
 from turnkeyml.cli.spawn import DEFAULT_TIMEOUT_SECONDS
 from turnkeyml.files_api import evaluate_files
-import turnkeyml.common.printing as printing
-from turnkeyml.tools.management_tools import ManagementTool
-
-
-class CustomArgumentParser(argparse.ArgumentParser):
-
-    def error(self, message):
-        self.print_usage()
-        printing.log_error(message)
-        self.exit(2)
-
-
-def _tool_list_help(tools: List[Tool], subclass, exclude=None) -> str:
-    help = ""
-
-    for tool_class in tools:
-        if exclude and issubclass(tool_class, exclude):
-            continue
-        if issubclass(tool_class, subclass):
-            help = (
-                help
-                + f" * {tool_class.unique_name}: {tool_class.parser().short_description}\n"
-            )
-
-    return help
+from turnkeyml.common.cli_helpers import parse_tools, CustomArgumentParser
 
 
 def _check_extension(
@@ -63,114 +37,6 @@ def _check_extension(
     return file_name
 
 
-def parse_tools(
-    parser: argparse.ArgumentParser, supported_tools: List[Tool]
-) -> Tuple[Dict[str, Any], Dict[Tool, List[str]], List[str]]:
-    """
-    Add the help for parsing tools and their args to an ArgumentParser.
-
-    Then, perform the task of parsing a full turnkey CLI command including
-    teasing apart the global arguments and separate tool invocations.
-    """
-
-    tool_parsers = {tool.unique_name: tool.parser() for tool in supported_tools}
-    tool_classes = {tool.unique_name: tool for tool in supported_tools}
-
-    # Sort tools into categories and format for the help menu
-    first_tool_choices = _tool_list_help(supported_tools, FirstTool)
-    eval_tool_choices = _tool_list_help(supported_tools, Tool, exclude=FirstTool)
-    mgmt_tool_choices = _tool_list_help(supported_tools, ManagementTool)
-
-    tools_action = parser.add_argument(
-        "tools",
-        metavar="tool --tool-args [tool --tool-args...]",
-        nargs="?",
-        help=f"""\
-Available tools that can be sequenced together to perform a build.
-
-Call `turnkey TOOL -h` to learn more about each tool.
-
-Tools that can start a sequence:
-{first_tool_choices}
-Tools that go into a sequence:
-{eval_tool_choices}
-Management tool choices:
-{mgmt_tool_choices}""",
-        choices=tool_parsers.keys(),
-    )
-
-    # run as if "-h" was passed if no parameters are passed
-    if len(sys.argv) == 1:
-        sys.argv.append("-h")
-
-    # Break sys.argv into categories based on which tools were invoked
-    # Arguments that are passed prior to invoking a tool are categorized as
-    # global arguments that should be used to initialize the state.
-    current_tool = "globals"
-    tools_invoked = {current_tool: []}
-    cmd = sys.argv[1:]
-    while len(cmd):
-        if cmd[0] in tool_parsers.keys():
-            # Make sure each tool was only called once
-            if cmd[0] in tools_invoked.keys():
-                parser.error(
-                    "A single call to turnkey can only invoke each tool once, "
-                    f"however this call invokes tool {cmd[0]} multiple times."
-                )
-            current_tool = cmd.pop(0)
-            tools_invoked[current_tool] = []
-        else:
-            tools_invoked[current_tool].append(cmd.pop(0))
-
-    # Trick argparse into thinking tools was not a positional argument
-    # this helps to avoid an error where an incorrect arg/value pair
-    # can be misinterpreted as the tools positional argument
-    tools_action.option_strings = ["--tools"]
-
-    # Do one pass of parsing to figure out if -h was used
-    global_args = vars(parser.parse_args(tools_invoked["globals"]))
-
-    # Remove "tools" from global args because it was just there
-    # as a placeholder
-    global_args.pop("tools")
-
-    # Remove globals from the list since its already been parsed
-    tools_invoked.pop("globals")
-    evaluation_tools = []
-    management_tools = []
-    for cmd, argv in tools_invoked.items():
-        tool_parsers[cmd].parse_args(argv)
-
-        # Keep track of whether the tools are ManagementTool or not,
-        # since ManagementTools are mutually exclusive with evaluation
-        # tools
-        if issubclass(tool_classes[cmd], ManagementTool):
-            management_tools.append(cmd)
-        else:
-            evaluation_tools.append(cmd)
-
-    if len(management_tools) > 0 and len(evaluation_tools) > 0:
-        parser.error(
-            "This call to turnkey invoked both management and "
-            "evaluation tools, however each call to turnkey "
-            "is only allowed to invoke one or the other. "
-            f"Management tools: {management_tools};"
-            f"Evaluation tools: {evaluation_tools}."
-        )
-
-    if len(management_tools) == 0 and len(evaluation_tools) == 0:
-        parser.error(
-            "Calls to turnkey are required to call at least "
-            "one tool or management tool."
-        )
-
-    # Convert tool names into Tool instances
-    tool_instances = {tool_classes[cmd](): argv for cmd, argv in tools_invoked.items()}
-    evaluation_tools = [tool_classes[cmd] for cmd in evaluation_tools]
-
-    return global_args, tool_instances, evaluation_tools
-
-
 def main():
 
     supported_tools = get_supported_tools()

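With the parser plumbing relocated, `cli.py` shrinks to the entry point and its `turnkey`-specific glue, and the same helpers become reusable by other CLIs. A sketch of how a downstream command-line tool might drive them (the `my-cli` name is a placeholder; the call signature comes from the new `cli_helpers` module below):

```python
from turnkeyml.common.cli_helpers import parse_tools, CustomArgumentParser
from turnkeyml.sequence.tool_plugins import get_supported_tools

# Build a parser that reports errors through turnkeyml's logger.
parser = CustomArgumentParser(
    description="Example CLI reusing turnkeyml's shared argument handling"
)

# parse_tools() registers the "tools" positional, then splits sys.argv into
# global arguments, per-tool argument lists, and the evaluation tool classes.
global_args, tool_instances, evaluation_tools = parse_tools(
    parser, get_supported_tools(), cli_name="my-cli"
)
```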
‎src/turnkeyml/common/cli_helpers.py

+135
@@ -0,0 +1,135 @@
+import argparse
+import sys
+from typing import List, Dict, Tuple, Any
+from turnkeyml.tools import Tool, FirstTool
+import turnkeyml.common.printing as printing
+from turnkeyml.tools.management_tools import ManagementTool
+
+
+class CustomArgumentParser(argparse.ArgumentParser):
+
+    def error(self, message):
+        self.print_usage()
+        printing.log_error(message)
+        self.exit(2)
+
+
+def _tool_list_help(tools: List[Tool], subclass, exclude=None) -> str:
+    help = ""
+
+    for tool_class in tools:
+        if exclude and issubclass(tool_class, exclude):
+            continue
+        if issubclass(tool_class, subclass):
+            help = (
+                help
+                + f" * {tool_class.unique_name}: {tool_class.parser().short_description}\n"
+            )
+
+    return help
+
+
+def parse_tools(
+    parser: argparse.ArgumentParser, supported_tools: List[Tool], cli_name="turnkey"
+) -> Tuple[Dict[str, Any], Dict[Tool, List[str]], List[str]]:
+    """
+    Add the help for parsing tools and their args to an ArgumentParser.
+
+    Then, perform the task of parsing a full turnkey CLI command including
+    teasing apart the global arguments and separate tool invocations.
+    """
+
+    tool_parsers = {tool.unique_name: tool.parser() for tool in supported_tools}
+    tool_classes = {tool.unique_name: tool for tool in supported_tools}
+
+    # Sort tools into categories and format for the help menu
+    first_tool_choices = _tool_list_help(supported_tools, FirstTool)
+    eval_tool_choices = _tool_list_help(supported_tools, Tool, exclude=FirstTool)
+    mgmt_tool_choices = _tool_list_help(supported_tools, ManagementTool)
+
+    tools_action = parser.add_argument(
+        "tools",
+        metavar="tool --tool-args [tool --tool-args...]",
+        nargs="?",
+        help=f"""\
+Run `{cli_name} TOOL -h` to learn more about each tool.
+
+Tools that can start a sequence:
+{first_tool_choices}
+Tools that go into a sequence:
+{eval_tool_choices}
+Management tools:
+{mgmt_tool_choices}""",
+        choices=tool_parsers.keys(),
+    )
+
+    # run as if "-h" was passed if no parameters are passed
+    if len(sys.argv) == 1:
+        sys.argv.append("-h")
+
+    # Break sys.argv into categories based on which tools were invoked
+    # Arguments that are passed prior to invoking a tool are categorized as
+    # global arguments that should be used to initialize the state.
+    current_tool = "globals"
+    tools_invoked = {current_tool: []}
+    cmd = sys.argv[1:]
+    while len(cmd):
+        if cmd[0] in tool_parsers.keys():
+            # Make sure each tool was only called once
+            if cmd[0] in tools_invoked.keys():
+                parser.error(
+                    "A single call to turnkey can only invoke each tool once, "
+                    f"however this call invokes tool {cmd[0]} multiple times."
+                )
+            current_tool = cmd.pop(0)
+            tools_invoked[current_tool] = []
+        else:
+            tools_invoked[current_tool].append(cmd.pop(0))
+
+    # Trick argparse into thinking tools was not a positional argument
+    # this helps to avoid an error where an incorrect arg/value pair
+    # can be misinterpreted as the tools positional argument
+    tools_action.option_strings = ["--tools"]
+
+    # Do one pass of parsing to figure out if -h was used
+    global_args = vars(parser.parse_args(tools_invoked["globals"]))
+
+    # Remove "tools" from global args because it was just there
+    # as a placeholder
+    global_args.pop("tools")
+
+    # Remove globals from the list since its already been parsed
+    tools_invoked.pop("globals")
+    evaluation_tools = []
+    management_tools = []
+    for cmd, argv in tools_invoked.items():
+        tool_parsers[cmd].parse_args(argv)
+
+        # Keep track of whether the tools are ManagementTool or not,
+        # since ManagementTools are mutually exclusive with evaluation
+        # tools
+        if issubclass(tool_classes[cmd], ManagementTool):
+            management_tools.append(cmd)
+        else:
+            evaluation_tools.append(cmd)
+
+    if len(management_tools) > 0 and len(evaluation_tools) > 0:
+        parser.error(
+            "This call to turnkey invoked both management and "
+            "evaluation tools, however each call to turnkey "
+            "is only allowed to invoke one or the other. "
+            f"Management tools: {management_tools};"
+            f"Evaluation tools: {evaluation_tools}."
+        )
+
+    if len(management_tools) == 0 and len(evaluation_tools) == 0:
+        parser.error(
+            "Calls to turnkey are required to call at least "
+            "one tool or management tool."
+        )
+
+    # Convert tool names into Tool instances
+    tool_instances = {tool_classes[cmd](): argv for cmd, argv in tools_invoked.items()}
+    evaluation_tools = [tool_classes[cmd] for cmd in evaluation_tools]
+
+    return global_args, tool_instances, evaluation_tools

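One behavior worth noting in the relocated `CustomArgumentParser`: instead of argparse's default stderr message, errors are routed through `printing.log_error` while still exiting with status 2. A small sketch of that contract (the `--count` option is a hypothetical example):

```python
from turnkeyml.common.cli_helpers import CustomArgumentParser

parser = CustomArgumentParser(prog="example")
parser.add_argument("--count", type=int)  # hypothetical option

# An invalid value routes through CustomArgumentParser.error(), which
# prints the usage string, logs the message via turnkeyml's printing
# module, and raises SystemExit with code 2.
try:
    parser.parse_args(["--count", "not-a-number"])
except SystemExit as exc:
    assert exc.code == 2
```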
‎src/turnkeyml/tools/management_tools.py

+1-1
@@ -122,7 +122,7 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser:
         # passed directly to the `run()` method
 
         parser = __class__.helpful_parser(
-            short_description="Manage the turnkey build cache " f"",
+            short_description="Manage the build cache " f"",
             add_help=add_help,
         )
 
‎src/turnkeyml/version.py

+1-1
@@ -1 +1 @@
-__version__ = "6.0.2"
+__version__ = "6.0.3"

‎test/lemonade/server.py

+1-1
@@ -212,7 +212,7 @@ def test_004_test_models(self):
         assert len(l.data) > 0
 
         # Check that the list contains the models we expect
-        assert any(model.id == "Llama-3.2-1B-Instruct-Hybrid" for model in l.data)
+        assert any(model.id == "Qwen2.5-0.5B-Instruct-CPU" for model in l.data)
 
     # Endpoint: /api/v0/completions
     def test_005_test_completions(self):

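The test now keys on a small CPU model rather than a hybrid NPU one, so it can pass on plain CPU runners. The same check can be reproduced against a locally running Lemonade Server with any OpenAI-compatible client (the base URL and port below are assumptions, not part of this diff):

```python
from openai import OpenAI

# Assumes a Lemonade Server instance is already running locally; adjust
# the base URL and port to match your setup.
client = OpenAI(base_url="http://localhost:8000/api/v0", api_key="unused")

models = client.models.list()
assert len(models.data) > 0
assert any(model.id == "Qwen2.5-0.5B-Instruct-CPU" for model in models.data)
```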