climatepolicyradar · jesse-c · Oct 17, 2024 · Oct 17, 2024 · jesse-c · Oct 18, 2024
@@ -11,57 +11,79 @@ permissions:
   contents: write
 
 jobs:
-  test:
+  lint:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repository code
         uses: actions/checkout@v4
 
-      - name: Install poetry
-        shell: bash
-        run: pipx install poetry==1.8.2
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          enable-cache: true
+          cache-dependency-glob: "uv.lock"
 
-      - name: Install python or load from cache with dependencies
+      - name: Install Python or load from cache with dependencies
         uses: actions/setup-python@v5
         with:
-          python-version-file: .python-version
-          cache: poetry
+          python-version-file: "pyproject.toml"
 
-      - name: Install dependencies
-        run: poetry install --with dev
+      - name: Install the project
+        run: uv sync --all-extras --dev
 
       - name: Lint
         run: |
-          poetry run pre-commit install
-          poetry run pre-commit run --all-files --show-diff-on-failure
+          uv run pre-commit install
+          uv run pre-commit run --all-files --show-diff-on-failure
+
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          enable-cache: true
+          cache-dependency-glob: "uv.lock"
+
+      - name: Install Python or load from cache with dependencies
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: "pyproject.toml"
+
+      - name: Install the project
+        run: uv sync --all-extras --dev
 
       - name: Test
-        run: poetry run pytest -v
+        run: uv run pytest -v
 
   deploy:
     if: github.ref == 'refs/heads/main'
-    needs: [test]
+    needs: [lint, test]
     runs-on: ubuntu-latest
     steps:
       - name: Check out repository code
         uses: actions/checkout@v4
 
-      - name: Install poetry
-        shell: bash
-        run: pipx install poetry==1.8.2
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          enable-cache: true
+          cache-dependency-glob: "uv.lock"
 
-      - name: Install python or load from cache with dependencies
+      - name: Install Python or load from cache with dependencies
         uses: actions/setup-python@v5
         with:
-          python-version-file: .python-version
-          cache: poetry
+          python-version-file: "pyproject.toml"
 
-      - name: Install dependencies
-        run: poetry install
+      - name: Install the project
+        run: uv sync --all-extras --dev
 
       - name: Deploy docs
         working-directory: ./docs
-        run: poetry run mkdocs gh-deploy --force
+        run: uv run mkdocs gh-deploy --force
 
       - name: Deploy more
         run: echo 'this is a placeholder'
@@ -26,8 +26,3 @@ repos:
         files: ^src/
         types: [python]
         additional_dependencies: ['pyright@1.1.294']
-  - repo: https://github.com/python-poetry/poetry
-    rev: 1.8.0
-    hooks:
-    -   id: poetry-check
-
@@ -3,6 +3,7 @@
 from dataclasses import dataclass
 from io import BytesIO
 from pathlib import Path
+from typing import Optional
 
 import boto3
 from cpr_sdk.parser_models import BaseParserOutput
@@ -158,8 +159,8 @@ def determine_classifier_ids(
 
 @flow(log_prints=True)
 def classifier_inference(
-    document_ids: list[str] = None,
-    classifier_spec: list[tuple[WikibaseID, str]] = None,
+    document_ids: Optional[list[str]] = None,
+    classifier_spec: Optional[list[tuple[WikibaseID, str]]] = None,
 ):
     """
     Flow to run inference on documents within a bucket prefix

@@ -2,57 +2,57 @@ set dotenv-load
 
 # install dependencies and set up the project
 install:
-    poetry install --with dev
-    poetry run pre-commit install
-    poetry run ipython kernel install --user
+    uv sync --extra dev
+    uv run pre-commit install
+    uv run ipython kernel install --user
 
 # test the project
 test:
-    poetry run pytest
+    uv run pytest
 
 # update the snapshots for the tests
 test-snapshot-update:
-    poetry run pytest --snapshot-update
+    uv run pytest --snapshot-update
 
 # run linters and code formatters
 lint:
-    poetry run pre-commit run --all-files --show-diff-on-failure
+    uv run pre-commit run --all-files --show-diff-on-failure
 
 # build a dataset of passages
 build-dataset:
-    poetry run python scripts/build_dataset.py
+    uv run python scripts/build_dataset.py
 
 # fetch metadata and labelled passages for a specific wikibase ID
 get-concept id:
-    poetry run python scripts/get_concept.py --wikibase-id {{id}}
+    uv run python scripts/get_concept.py --wikibase-id {{id}}
 
 # train a model for a specific wikibase ID
 train id +OPTS="":
-    poetry run train --wikibase-id {{id}} {{OPTS}}
+    uv run scripts/train.py --wikibase-id {{id}} {{OPTS}}
 
 # evaluate a model for a specific wikibase ID
 evaluate id:
-    poetry run python scripts/evaluate.py --wikibase-id {{id}}
+    uv run python scripts/evaluate.py --wikibase-id {{id}}
 
 # promote a model for a specific wikibase ID
 promote id +OPTS="":
-    poetry run promote --wikibase-id {{id}} {{OPTS}}
+    uv run scripts/promote.py --wikibase-id {{id}} {{OPTS}}
 
 # run a model for a specific wikibase ID on a supplied string
 label id string:
-    poetry run python scripts/label.py --wikibase-id {{id}} --input-string {{string}}
+    uv run python scripts/label.py --wikibase-id {{id}} --input-string {{string}}
 
 # find instances of the concept in a set of passages for a specific wikibase ID
 predict id:
-    poetry run python scripts/predict.py --wikibase-id {{id}}
+    uv run python scripts/predict.py --wikibase-id {{id}}
 
 # sample a set of passages from the dataset for a specific wikibase ID
 sample id:
-    poetry run python scripts/sample.py --wikibase-id {{id}}
+    uv run python scripts/sample.py --wikibase-id {{id}}
 
 # push a sampled set of passages to argilla for a specific wikibase ID
 push-to-argilla id usernames workspace:
-    poetry run python scripts/push_to_argilla.py --wikibase-id {{id}} --usernames {{usernames}} --workspace {{workspace}}
+    uv run python scripts/push_to_argilla.py --wikibase-id {{id}} --usernames {{usernames}} --workspace {{workspace}}
 
 # run the full pipeline for a specific wikibase ID
 create-labelling-task id usernames workspace:
@@ -64,6 +64,6 @@ create-labelling-task id usernames workspace:
 
 # visualise IAA, model vs gold-standard agreement, and positive predictions on the full dataset
 visualise-labels id:
-    poetry run python scripts/visualise_labels.py --wikibase-id {{id}}
+    uv run python scripts/visualise_labels.py --wikibase-id {{id}}
 
 analyse-classifier id: (get-concept id) (train id) (predict id) (evaluate id) (visualise-labels id)
@@ -1,70 +1,46 @@
-[tool.poetry]
+[project]
 name = "knowledge-graph"
 version = "0.1.0"
-description = ""
-authors = ["CPR Data Science <dsci@climatepolicyradar.org>"]
-license = "Apache 2.0"
+description = "Add your description here"
 readme = "README.md"
-packages = [
-  { include = "src", from = "." },
-  { include = "tests", from = "." },
-  { include = "scripts", from = "." },
+requires-python = ">=3.11"
+dependencies = [
+    "argilla==1.29.1",
+    "azure-pdf-parser",
+    "boto3==1.35.31",
+    "cpr-sdk==1.9.1",
+    "griffe==0.48.0",
+    "httpx==0.26.0",
+    "ipykernel==6.29.3",
+    "more-itertools==10.3.0",
+    "neomodel==5.3.3",
+    "prefect==2.16.8",
+    "python-dotenv==1.0.1",
+    "rapidfuzz==3.10.0",
+    "rich==13.7.0",
+    "sentence-transformers==3.1.1",
+    "tqdm>=4.66.5",
+    "typer==0.9.4",
+    "wandb==0.18.3",
+    # The following is to work around the fact that PyTorch stopped supporting Intel Macs after 2.2.2
+    # See: https://github.com/pytorch/pytorch/issues/114602
+    "torch==2.2.2; sys_platform == 'darwin' and platform_machine == 'x86_64'",
+    "torch==2.4.1; sys_platform != 'darwin' or platform_machine != 'x86_64'",
 ]
 
-[tool.poetry.dependencies]
-python = ">=3.10,<3.11"
-httpx = "^0.26.0"
-rich = "^13.7.0"
-tqdm = "^4.66.2"
-ipykernel = "^6.29.3"
-azure_pdf_parser = { git = "https://github.com/climatepolicyradar/azure-pdf-parser.git", tag = "v0.4.3" }
-# navigator_document_parser = { git = "https://github.com/climatepolicyradar/navigator-document-parser.git" }
-neomodel = "^5.3.3"
-typer = "0.9.4"
-rapidfuzz = "^3.10.0"
-python-dotenv = "^1.0.1"
-hypothesis = "^6.112.2"
-more-itertools = "^10.3.0"
-argilla = "1.29.1"
-prefect = "2.16.8"
-griffe = "0.48.0"
-cpr-sdk = "^1.7.1"
-sentence-transformers = "^3.1.1"
-wandb = "^0.18.3"
-boto3 = "^1.35.31"
-moto = {extras = ["s3"], version = "^5.0.16"}
-# The following is to work around the fact that pytorch stopped supported intel-macs after 2.2.2
-# See: https://github.com/pytorch/pytorch/issues/114602
-torch = [
-    {version = "2.2.2", markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"},
-    {version = "2.4.1", markers = "sys_platform != 'darwin' and platform_machine != 'x86_64'"},
-  ]
-
-
-[[tool.poetry.source]]
-name = "pytorch"
-url = "https://download.pytorch.org/whl/cpu"
-priority = "explicit"
-
-
-[tool.poetry.scripts]
-train = "scripts.train:app"
-promote = "scripts.promote:app"
-
-[tool.poetry.group.dev]
-optional = true
-
-[tool.poetry.group.dev.dependencies]
-pytest = "^8.3.2"
-mkdocs-material = "^9.5.39"
-pre-commit = "^3.8.0"
-boto3 = "^1.35.32"
-syrupy = "^4.7.1"
-moto = {extras = ["s3"], version = "^5.0.16"}
+[project.optional-dependencies]
+dev = [
+    "hypothesis==6.112.2",
+    "mkdocs-material==9.5.39",
+    "moto[s3]==5.0.16",
+    "pre-commit>=4.0.1",
+    "pytest>=8.3.3",
+    "ruff>=0.7.0",
+    "syrupy>=4.7.2",
+]
 
-[build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
+[tool.uv.sources]
+azure-pdf-parser = { git = "https://github.com/climatepolicyradar/azure-pdf-parser.git", tag = "v0.4.3" }
 
 [tool.ruff]
 lint.select = ["E", "F", "D", "I"]

@@ -353,9 +353,10 @@ def test_copy_across_aws_envs(
         mock_from_s3.head_object.return_value = {"ContentLength": len(content)}
         mock_to_s3.head_object.return_value = {"ContentLength": len(content)}
 
-        with patch("scripts.promote.download") as mock_download, patch(
-            "scripts.promote.upload"
-        ) as mock_upload:
+        with (
+            patch("scripts.promote.download") as mock_download,
+            patch("scripts.promote.upload") as mock_upload,
+        ):
             # Call the function
             result_bucket, result_key = copy_across_aws_envs(
                 promotion,