app.py
import torch
from transformers import pipeline
from typing import Union

# FastAPI imports
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from pydantic import BaseModel

app = FastAPI()
origins = ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
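# Note: allow_origins=["*"] combined with wildcard methods/headers makes this a
# fully permissive CORS setup; restrict the origins for production deployments.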
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"##### Device is {torch_device} #####")

# Create model pipeline
pipe = pipeline(
    "text-generation",
    model="./data/models/Meta-Llama-3-8B-Instruct",
    # model="meta-llama/Meta-Llama-3-8B-Instruct",  # to download the model from HF instead (login required)
    model_kwargs={"torch_dtype": torch.bfloat16},
    device=torch_device,
)
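# The local path above is assumed to already contain the downloaded
# Meta-Llama-3-8B-Instruct weights; otherwise switch to the commented-out
# Hugging Face model id (login required).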
# Defaults used when the request does not provide generation config values.
default_generation_config = {
    "temperature": 0.2,
    "top_p": 0.9,
    "max_new_tokens": 256,
}

# Default system prompt, used when the request does not include one.
default_system_prompt = "You are a helpful assistant called Llama-3. Keep your answers short and succinct!"
# model.eval()
# if torch.__version__ >= "2":
# model = torch.compile(model)
print("##### Model is loaded #####")
# Data model for making POST requests to /chat
class ChatRequest(BaseModel):
    messages: list
    temperature: Union[float, None] = None
    top_p: Union[float, None] = None
    max_new_tokens: Union[int, None] = None
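# Illustrative request body for POST /chat (only "messages" is required; the
# other fields fall back to default_generation_config when omitted):
# {
#     "messages": [{"role": "user", "content": "Hello, who are you?"}],
#     "temperature": 0.7,
#     "top_p": 0.9,
#     "max_new_tokens": 128
# }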
def generate(messages: list, temperature: float = None, top_p: float = None, max_new_tokens: int = None) -> str:
    """Generates a response given a list of messages (conversation history) and the generation configuration."""
    # Fall back to the defaults when a value is not provided.
    temperature = temperature if temperature is not None else default_generation_config["temperature"]
    top_p = top_p if top_p is not None else default_generation_config["top_p"]
    max_new_tokens = max_new_tokens if max_new_tokens is not None else default_generation_config["max_new_tokens"]

    prompt = pipe.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    terminators = [
        pipe.tokenizer.eos_token_id,
        pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,  # sampling must be enabled, otherwise temperature/top_p are ignored
        temperature=temperature,
        top_p=top_p,
    )

    generated_outputs = outputs[0]["generated_text"]  # full prompt + response
    text = generated_outputs[len(prompt):]  # keep just the response
    return text
def isSystemPrompt(msg):
    """Returns True if the given message is a system prompt."""
    return msg["role"] == "system"
@app.get("/home", response_class=HTMLResponse)
def home():
    """Mainly for quick testing. If the service is running, you should see a full welcome message generated by the model."""
    input_text = "Write a welcome message for the home page of a large language model chatbot"
    messages = [
        {"role": "system", "content": default_system_prompt},
        {"role": "user", "content": input_text},
    ]
    print("##### Generating welcome response #####")
    response = generate(messages)
    # Returned as HTML (response_class above) so the heading renders in the browser.
    welcome_text = "<h2>Welcome to LLM service!</h2>"
    welcome_text += response
    return welcome_text
@app.post("/chat")
def chat(chat_request: ChatRequest):
    """The main endpoint for interacting with the model.
    A list of messages is required, but the other config parameters can be left empty.
    Providing an initial system prompt in the messages is also optional."""
    messages = chat_request.messages
    temperature = chat_request.temperature
    top_p = chat_request.top_p
    max_new_tokens = chat_request.max_new_tokens

    # Check for a system prompt; prepend the default one if necessary.
    if not messages or not isSystemPrompt(messages[0]):
        msg = {"role": "system", "content": default_system_prompt}
        messages.insert(0, msg)

    print("##### Generating response... #####")
    response = generate(messages, temperature, top_p, max_new_tokens)
    return response
if __name__ == "__main__":
    # FastAPI apps are served by an ASGI server such as uvicorn (there is no
    # app.run() as in Flask). For hot reload during development, start the
    # server with: uvicorn app:app --reload
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)  # adjust host/port as needed
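# Example client calls (assuming the service is reachable on localhost:8000):
#   curl http://localhost:8000/home
#   curl -X POST http://localhost:8000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"messages": [{"role": "user", "content": "Hello!"}]}'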