diff --git a/mlflow/.env.template b/mlflow/.env.template new file mode 100644 index 000000000..7f807e2fb --- /dev/null +++ b/mlflow/.env.template @@ -0,0 +1,7 @@ +[google] +GOOGLE_APPLICATION_CREDENTIALS = + +[mlflow] +MLFLOW_TRACKING_URI = https://mlflow-dot-broad-ml4cvd.uc.r.appspot.com +MLFLOW_TRACKING_USERNAME = +MLFLOW_TRACKING_PASSWORD = diff --git a/mlflow/Dockerfile b/mlflow/Dockerfile new file mode 100644 index 000000000..29a323f59 --- /dev/null +++ b/mlflow/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.8-slim + +# displays logs immediately to the stream, they won't be part of the buffer +ENV PYTHONUNBUFFERED True +ENV GCP_PROJECT="783282864357" + +WORKDIR /app + +COPY ./requirements.txt ./requirements.txt + +RUN pip install -r requirements.txt + +COPY get_secret.py /app/get_secret.py +COPY entry-point.sh /app/entry-point.sh +COPY mlflow_auth.py /app/mlflow_auth.py + +ENTRYPOINT ["/usr/bin/env", "bash", "/app/entry-point.sh"] + +EXPOSE 8080 +EXPOSE 8000/tcp +CMD [ "--backend-store-uri", "/tmp"] diff --git a/mlflow/README.md b/mlflow/README.md new file mode 100644 index 000000000..624c12510 --- /dev/null +++ b/mlflow/README.md @@ -0,0 +1,73 @@ +# MLflow Usage and Maintenance + + +## Using MLflow on App Engine +### To see the MLflow UI +ML4H has a GAE url for MLflow + +The UI needs a username and password to access +These are stored in the GCP Secret Manager + +### To add your experiments to this MLflow instance +You can use this from your local machine or from your GCP VM + +To start, you will want to use the .env.template to make your own .env file +cp .env.template .env +and then update your python script with this path + +you will also need to log in using +gcloud auth application-default login + +note the path that your auth json is stored at and put that into your .env file as well + + +## General Architecture + +MLflow is using Cloud SQL as a backend database-- this stores all of the experiment runs that show up in the UI +MLflow is stood up on App Engine. 
The server is packaged up as a docker image and deployed. It is also running on CloudRun (though likely this will be taken down in the future) +MLflow uses a Google bucket as an artifact store. + + + +## Setup and Maintenance + +The database for this instance is in GCP Cloud SQL +To access the database, navigate to the GCP SQL dashboard, then navigate to the database +Next you want to connect by clicking OPEN CLOUD SHELL +Once there you can log in with +gcloud sql connect --user=postgres --quiet + +Use the credentials from the GCP secret manager + +Once in the database, you can use \d to get a full list of tables +Likely the tables you are interested in will be `runs` and `tags` + +In the future, to maximize security, the app can live behind an Identity-Aware Proxy. +Note that this cannot be applied to specific services, so if in the future a different service is added, a different security set-up may be necessary + + + + +### For the App Engine setup +The default service account's name is PROJECT_ID@appspot.gserviceaccount.com + +Ensure that the service account your app is using to authenticate calls to Cloud SQL has the Cloud SQL Client IAM role. + + +From the mlflow directory, run: +gcloud app deploy +gcloud sql instances describe broad-ml4cvd-staging-db + + + +(is this even necessary to include?) +gcloud app deploy --image-url=us-central1-docker.pkg.dev/gvs-internal/rc-testing-repo/mlflow-imagine:latest + + + + +Update the MLflow docker (only needed for CloudRun implementation) +docker build -t mlflow-gcp . 
+docker tag mlflow-gcp us-central1-docker.pkg.dev/gvs-internal/rc-testing-repo/mlflow-imagine:latest + +TODO add bit about proxying to be able to see this diff --git a/mlflow/app.yaml b/mlflow/app.yaml new file mode 100644 index 000000000..7f6305af6 --- /dev/null +++ b/mlflow/app.yaml @@ -0,0 +1,8 @@ +service: mlflow +runtime: custom +env: flex +automatic_scaling: + min_num_instances: 1 + max_num_instances: 2 +beta_settings: + cloud_sql_instances: broad-ml4cvd:us-central1:broad-ml4cvd-staging-db diff --git a/mlflow/check_tracking.py b/mlflow/check_tracking.py new file mode 100644 index 000000000..942e0b2e9 --- /dev/null +++ b/mlflow/check_tracking.py @@ -0,0 +1,87 @@ +import base64 +import configparser +import os +import pickle +import random +import tempfile +from pathlib import Path + +import mlflow + +PONY = """ +iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAHh0lEQVRYw61Xa2wU1xk9d96z7/Wu +12Ztrw22qbHBTsE8nAYbjFS1QagthEpR+6NRpQaaqi20aiqURkojRQpVSJsGoeRHSZUKVVGVlLQl +EgUDJsQJTVMDtsEYjI1tvGuvvZ59zOzO6/bHYtY8BBhn/t25c893vm/O+e69BF/i86Mnfl6/ZXXj +9yoCgU2yKFZTSj1qLnfjSjR67Pj5/hfeOv167M415MsIvGP9rvqNjU2/qy8vezLkdUHiC7A2ARgK +dF3u7z382bnGA5377LlruYUG37P5lR9Xhxv2EdYrXhwH+sZz8EgE9WU8zDVe8BwHb3cCZf6iBgA+ +ANNz1zMLCf6bLXtfWBZZud8te8XZd7qRRUxJoc9NIFQGQdh8iFRWu6TphnwnxrwJdHR2MQDwq2/8 +dktt2YqXRV7Kl9q2MJOegmkZMEwdgWUOSJIImspCUVWUBwJ1e7ZtHvrol+88syACbrfroy/O9V5c +sWrVfkkoJDSTmYbH6QdDGHgcPty4PA7TNKEldIg8h5xhQjdNbnGoZN8jE+jo7KopLg58PRIpr2va +XltBhMKcJMhQs+mb1bDhLXGDYRgMxBMwLQulPh/KAwEMTU6ce2QR2rZdx3M8ACBcWYLpliQmTuWD +OkQXFGYSwbogxBCDhuZ6MCyDddtWY6BvAENnL13tOXbpjURK/dMjE2AYxrYsC7puQBB4NGysQaBq +Aqm4CsHDorxmBTjubkiv34GvbFju+kvHewf+1fmhsRANfKpqmkpI3ueEECxaUoKa5kosXlaJVDKJ +6Pg4KKUFbSQS6E10w1fiLlm5avVjC3JBe2vL9HDv4J8pLfQSQgg0TcOlvh54vA50x7vwxw/3IpVO +5ecZgser2jCWGsaGjZvIgm1Y1K181eoaxsjHvRj89/+g53QMDw1h1cqV+G+0C+uXtKPCqMKhf7wD +Qgi8Xh/OnPkEnMHANE3lTrx5d0JZFOqMwQQqHA6wRMC1zwcQqglCFEWwLEGkLILKyioYkzlkdQ0i +L2FJbS3S2aTB83z8oQnsWL8r7JTcG3zuYGVO10hciV4jhHQQEJdHljE2NQWnJMGZdiJUuggAwLMW 
+uru7ceTIEailU9jKPQ0AqKiIgFLK9/ReagHwz/sS2Nm6my/xl/2+LLh4h9dZxAAABYFlGUhrSjal +6Ux5gENVKAQCYLRYhGXbyOo6Ghevxet/24u4exTf/c52cISHruswDAMZVbvGMMypB1agvHjJH6rD +9Tt5lsHSUg4hDwuBJTBsAZmcQ7oajcG0KRgCTCoK1JiETDpzsxkVYeeW3chu1uBHKagFCIIA3TDN +KZ22tre2pO6Mx84dPPvEz+oaqpoPCpxI1lYLKPGw4FgCQgCOIZAFAkkAzl7JQeJ5BD0yik0eExeG +wdUGQRgGrCVCMB3I5QxkMqo6oyjHb9yIPtvW3Hj+Xr/6tgqUFS9+UhadZJGPgVe+t0ECLjec4jjO +XM6AZRisiARQU+LDaO8o5NW1AABKKRQleTUWm6xvb23R79vc5g4ETiwFALd07+ATimKd6On5YGhS +S3scftg2xeCEib5RA+x09lZf4DgOgSJ/NSHk+w/srnMHOTOXBYCsQe/6MDqTUN4+eqKt4/zAdo5z +udKaAtO2AACjCQtJB397aTkOfr/vh/MiEJsauWDZJsZnbNh3cDh9sX9XOqt/9saJVy3D1Hvcsg9+ +VwAAkCUZhBrL7gKXRHFtR2cX99AEKOjRRCquGhbF6LRZ2AUpRVbXn/5B++PaoR37T4rN9E1hqQ3L +lYVUT9HyXAPcXvdd4CzLsACc9yNwmws+H+7KrYysCYV84XUzKkV5EQuWISCEIOR1V0eCxUxlKFTV +z85c3vBUS0vN+ggiy8OQnfI9wZOp1NDaVU2vPnQFACCuRF+aVKJjpg1ciRWqUOrzI53VoOayiF2L +faLrhm0aJnK5e4s8lUrb8fj08/PSwK+/uefb31rbdLTcr5ppTcHItAXTyovBsCxMpVLGB5/+57UN +W1vDpmlmeJ5HMpnE9HSikHUyhWhsAqNjY2+1fW3New8icEsgz7X9gtu2rvng0nDYRwhB/41xXJ20 +kTUoXCzBF4ODw38/e2719pef0oqLgxNer0eetR3DFHZZw9DhcspQ1TR9qENOQYAIBz1un00peq5f +R4nXA8M0wLN58PqK8krDNJsopS1OhyzPla5l23NGt2xYMS8CDMFYIpNRGEIQcLtwbSKDNdUuiDdv +OQLLwbapJPD8GkHIn0Yty8rfrebkmhc+wLFs47wIvHnyNevjvv5XkpqGUp8fj1UVo9RbMEnPyMiA +yHPHnU7HklvZUsCybMwe0fLWy6/hBaHy+KnOpnmJ8KeHnt978PjJbReuD3eMxONaTJnB1WjUPNHT +c/ivp8+2H+jcp3E8V17odixM07rtDOhwyMjldMiSBEEQnnloEc4+Lx5+6f0XD+P9na27WQIEKJA8 +0LkvOzuvqppFKS1kTWkWhAizyXAcB9umMAwDHMsufxCBed+OOzq7wpIk/sTpdDYwhLDJVOqY0+ls +kiVxKwAHpVAVRXk3nUm+C+DCprZW9X54/wfaeRiPgPVMkQAAAABJRU5ErkJggg== +""".strip().replace( + "\n", "" +) + + +def load_env_values() -> None: + config = configparser.ConfigParser(interpolation=None) + config.sections() + config.read("/path/to/.env") + + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = config["google"]["GOOGLE_APPLICATION_CREDENTIALS"] + + os.environ["MLFLOW_TRACKING_USERNAME"] = config["mlflow"]["MLFLOW_TRACKING_USERNAME"] + os.environ["MLFLOW_TRACKING_PASSWORD"] = 
config["mlflow"]["MLFLOW_TRACKING_PASSWORD"] + os.environ["MLFLOW_TRACKING_URI"] = config["mlflow"]["MLFLOW_TRACKING_URI"] + + +def main() -> None:  # Smoke test: logs one param, ten metric steps and two artifacts to the configured tracking server + print("STARTING") + load_env_values() + with mlflow.start_run(run_name="rc_remote_test"): + mlflow.log_param("a", random.choice([1, 2, 3, 5, 7])) + for epoch in range(10): + mlflow.log_metric("m", 2 * epoch * epoch + random.random() - 0.5, step=epoch) + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir_path = Path(temp_dir) + print("seems like we get here and then cant log the artifact") + model_file_path = temp_dir_path / "model.pickle" + with open(model_file_path, "wb") as f: + pickle.dump({"rc_remote_test": "grzybowa"}, f) + mlflow.log_artifact(model_file_path) + + image_file_path = temp_dir_path / "pony.png" + with open(image_file_path, "wb") as f: + f.write(base64.b64decode(PONY)) + mlflow.log_artifact(image_file_path) + print("DONE") + + +if __name__ == "__main__": + main() diff --git a/mlflow/entry-point.sh b/mlflow/entry-point.sh new file mode 100755 index 000000000..aa075324e --- /dev/null +++ b/mlflow/entry-point.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +set -e + +# Verify that the GCP project is set before fetching any secrets +if [[ -z "${GCP_PROJECT}" ]]; then + echo "Error: GCP_PROJECT not set" + exit 1 +fi + +# Fetch secrets from Secret Manager in GCP +export MLFLOW_TRACKING_USERNAME="$(python3 /app/get_secret.py --project="${GCP_PROJECT}" --secret=mlflow_tracking_username)" +export MLFLOW_TRACKING_PASSWORD="$(python3 /app/get_secret.py --project="${GCP_PROJECT}" --secret=mlflow_tracking_password)" +export ARTIFACT_URL="$(python3 /app/get_secret.py --project="${GCP_PROJECT}" --secret=mlflow_artifact_url)" +if [[ -z "${DATABASE_URL}" ]]; then # Allow overriding for local deployment + export DATABASE_URL="$(python3 /app/get_secret.py --project="${GCP_PROJECT}" --secret=mlflow_database_url)" +fi + +# Verify that all required variables are set +if [[ -z "${MLFLOW_TRACKING_USERNAME}" ]]; then + echo "Error: 
MLFLOW_TRACKING_USERNAME not set" + exit 1 +fi + +if [[ -z "${MLFLOW_TRACKING_PASSWORD}" ]]; then + echo "Error: MLFLOW_TRACKING_PASSWORD not set" + exit 1 +fi + +if [[ -z "${ARTIFACT_URL}" ]]; then + echo "Error: ARTIFACT_URL not set" + exit 1 +fi + +if [[ -z "${DATABASE_URL}" ]]; then + echo "Error: DATABASE_URL not set" + exit 1 +fi + +if [[ -z "${PORT}" ]]; then + export PORT=8080 +fi + +export WSGI_AUTH_CREDENTIALS="${MLFLOW_TRACKING_USERNAME}:${MLFLOW_TRACKING_PASSWORD}" +export _MLFLOW_SERVER_ARTIFACT_ROOT="${ARTIFACT_URL}" +export _MLFLOW_SERVER_FILE_STORE="${DATABASE_URL}" + +echo "hello its me im the problem its me" +echo ${_MLFLOW_SERVER_FILE_STORE} +echo "or me" +echo ${_MLFLOW_SERVER_ARTIFACT_ROOT} + +# Start MLflow and ngingx using supervisor +exec gunicorn -b "${HOST}:${PORT}" -w 4 --log-level debug --access-logfile=- --error-logfile=- --log-level=debug mlflow_auth:app diff --git a/mlflow/get_secret.py b/mlflow/get_secret.py new file mode 100644 index 000000000..f8d435725 --- /dev/null +++ b/mlflow/get_secret.py @@ -0,0 +1,19 @@ +import click +from google.cloud import secretmanager + + +@click.command() +@click.option("--version", type=str, required=True, default="latest") +@click.option("--project", type=str, required=True) +@click.option("--secret", type=str, required=True) +def main(version: str, project: str, secret: str) -> None: + client = secretmanager.SecretManagerServiceClient() + response = client.access_secret_version( + request={"name": f"projects/783282864357/secrets/{secret}/versions/{version}"} + ) + payload = response.payload.data.decode("UTF-8") + print(payload, end="") + + +if __name__ == "__main__": + main() diff --git a/mlflow/mlflow_auth.py b/mlflow/mlflow_auth.py new file mode 100644 index 000000000..2b9739c09 --- /dev/null +++ b/mlflow/mlflow_auth.py @@ -0,0 +1,4 @@ +from mlflow.server import app as mlflow_app +from wsgi_basic_auth import BasicAuth + +app = BasicAuth(mlflow_app) diff --git a/mlflow/requirements.txt 
b/mlflow/requirements.txt new file mode 100644 index 000000000..d174fbca9 --- /dev/null +++ b/mlflow/requirements.txt @@ -0,0 +1,8 @@ +google-cloud-storage==2.9.0 +google-cloud-secret-manager==2.16.1 +mlflow==2.4.1 +click==8.1.3 +pg8000==1.29.3 +wsgi-basic-auth==1.1.0 +gunicorn==20.1.0 +psycopg2-binary==2.9.6