From 5edfc3c3233607a8c28838c262466a8a2fa7f311 Mon Sep 17 00:00:00 2001
From: Simon Bauer <simon.bauer@symflower.com>
Date: Mon, 22 Apr 2024 14:17:23 +0200
Subject: [PATCH 1/3] Give every assessment category a human-readable name and
 description

---
 cmd/eval-dev-quality/cmd/evaluate.go      |  2 +-
 cmd/eval-dev-quality/cmd/evaluate_test.go |  3 +-
 evaluate/metrics/category.go              | 68 +++++++++++++++++++----
 evaluate/metrics/category_test.go         |  2 +-
 4 files changed, 62 insertions(+), 13 deletions(-)

diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go
index 50dd4411..d03a529e 100644
--- a/cmd/eval-dev-quality/cmd/evaluate.go
+++ b/cmd/eval-dev-quality/cmd/evaluate.go
@@ -250,7 +250,7 @@ func (command *Evaluate) Execute(args []string) (err error) {
 	}
 
 	_ = metrics.WalkByScore(assessments.Collapse(), func(model string, assessment metrics.Assessments, score uint) error {
-		log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore), assessment)
+		log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore).Name, assessment)
 
 		return nil
 	})
diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go
index f39d6783..b0f435c2 100644
--- a/cmd/eval-dev-quality/cmd/evaluate_test.go
+++ b/cmd/eval-dev-quality/cmd/evaluate_test.go
@@ -73,7 +73,7 @@ func TestEvaluateExecute(t *testing.T) {
 		},
 
 		ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) {
-			assert.Contains(t, output, `Evaluation score for "symflower/symbolic-execution" ("code-no-excess"): score=6, coverage-statement=1, files-executed=1, response-no-error=1, response-no-excess=1, response-not-empty=1, response-with-code=1`)
+			assert.Contains(t, output, `Evaluation score for "symflower/symbolic-execution" ("No Excess Response"): score=6, coverage-statement=1, files-executed=1, response-no-error=1, response-no-excess=1, response-not-empty=1, response-with-code=1`)
 			if !assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) {
 				t.Logf("Output: %s", output)
 			}
@@ -86,6 +86,7 @@ func TestEvaluateExecute(t *testing.T) {
 				`), data)
 			},
 			"evaluation.log": nil,
+			"report.md":      nil,
 			"symflower_symbolic-execution/golang/golang/plain.log": nil,
 		},
 	})
diff --git a/evaluate/metrics/category.go b/evaluate/metrics/category.go
index b0397eae..3fdd7edb 100644
--- a/evaluate/metrics/category.go
+++ b/evaluate/metrics/category.go
@@ -1,30 +1,78 @@
 package metrics
 
+import "fmt"
+
 // AssessmentCategory represents a categorical ranking of a model based on Assessments.
-type AssessmentCategory string
+type AssessmentCategory struct {
+	// Name holds a unique short name of the category.
+	Name string
+	// Description holds the description of a category.
+	Description string
+}
+
+// AllAssessmentCategories holds all assessment categories.
+var AllAssessmentCategories []*AssessmentCategory
+
+// registerAssessmentCategory registers a new assessment category.
+func registerAssessmentCategory(c AssessmentCategory) *AssessmentCategory {
+	for _, category := range AllAssessmentCategories {
+		if c.Name == category.Name {
+			panic(fmt.Sprintf("duplicated category name %q", c.Name))
+		}
+	}
+
+	AllAssessmentCategories = append(AllAssessmentCategories, &c)
+
+	return &c
+}
 
 var (
 	// AssessmentCategoryUnknown indicates that it is not possible to compute a model's category.
-	AssessmentCategoryUnknown = AssessmentCategory("category-unknown")
+	AssessmentCategoryUnknown = registerAssessmentCategory(AssessmentCategory{
+		Name:        "Category Unknown",
+		Description: "Models in this category could not be categorized.",
+	})
 	// AssessmentCategoryResponseError indicates that a model has encountered an error trying to produce a response.
-	AssessmentCategoryResponseError = AssessmentCategory("response-error")
+	AssessmentCategoryResponseError = registerAssessmentCategory(AssessmentCategory{
+		Name:        "Response Error",
+		Description: "Models in this category encountered an error.",
+	})
 	// AssessmentCategoryResponseEmpty indicates that a model has returned an empty response.
-	AssessmentCategoryResponseEmpty = AssessmentCategory("response-empty")
+	AssessmentCategoryResponseEmpty = registerAssessmentCategory(AssessmentCategory{
+		Name:        "Response Empty",
+		Description: "Models in this category produced an empty response.",
+	})
 	// AssessmentCategoryResponseNoCode indicates that a model's response did not contain any source code.
-	AssessmentCategoryResponseNoCode = AssessmentCategory("response-no-code")
+	AssessmentCategoryResponseNoCode = registerAssessmentCategory(AssessmentCategory{
+		Name:        "No Code",
+		Description: "Models in this category produced no code.",
+	})
 	// AssessmentCategoryCodeInvalid indicates that a model's generated code produced an error when executed.
-	AssessmentCategoryCodeInvalid = AssessmentCategory("code-invalid")
+	AssessmentCategoryCodeInvalid = registerAssessmentCategory(AssessmentCategory{
+		Name:        "Invalid Code",
+		Description: "Models in this category produced invalid code.",
+	})
 	// AssessmentCategoryCodeExecuted indicates that a model's generated code could be executed without an error.
-	AssessmentCategoryCodeExecuted = AssessmentCategory("code-executed")
+	AssessmentCategoryCodeExecuted = registerAssessmentCategory(AssessmentCategory{
+		Name:        "Executable Code",
+		Description: "Models in this category produced executable code.",
+	})
 	// AssessmentCategoryCodeCoverageStatementReached indicates that a model's generated code reached 100% statement coverage.
-	AssessmentCategoryCodeCoverageStatementReached = AssessmentCategory("code-coverage-statement")
+	AssessmentCategoryCodeCoverageStatementReached = registerAssessmentCategory(AssessmentCategory{
+		Name:        "Statement Coverage Reached",
+		Description: "Models in this category produced code that reached full statement coverage.",
+	})
 	// AssessmentCategoryCodeNoExcess indicates that a model's response did not contain more content than requested.
-	AssessmentCategoryCodeNoExcess = AssessmentCategory("code-no-excess")
+	AssessmentCategoryCodeNoExcess = registerAssessmentCategory(AssessmentCategory{
+		Name:        "No Excess Response",
+		Description: "Models in this category did not respond with more content than requested.",
+	})
 )
 
 // Category infers a categorical ranking of a model based on assessment values.
 // A models overall category corresponds to the criterion where the model was consistently able to receive "total" amount of points. I.e. if there were 3 tasks in total and a model was able to produce executing code for all tasks, but only in one case the coverage goal was reached, then the category is only "CodeExecuted" because the coverage goal was not reached consistently.
-func (a Assessments) Category(total uint) AssessmentCategory {
+// The returned category is never "nil".
+func (a Assessments) Category(total uint) *AssessmentCategory {
 	if total == 0 {
 		return AssessmentCategoryUnknown
 	}
diff --git a/evaluate/metrics/category_test.go b/evaluate/metrics/category_test.go
index b78ccf46..28f08358 100644
--- a/evaluate/metrics/category_test.go
+++ b/evaluate/metrics/category_test.go
@@ -13,7 +13,7 @@ func TestAssessmentsCategory(t *testing.T) {
 		Assessments Assessments
 		Total       uint
 
-		ExpectedAssessmentCategory AssessmentCategory
+		ExpectedAssessmentCategory *AssessmentCategory
 	}
 
 	validate := func(t *testing.T, tc *testCase) {

From 202e683fb647ca1a78e8aef9d1acda4fa32016e4 Mon Sep 17 00:00:00 2001
From: Simon Bauer <simon.bauer@symflower.com>
Date: Mon, 22 Apr 2024 15:23:44 +0200
Subject: [PATCH 2/3] Markdown export

Part of #49
---
 cmd/eval-dev-quality/cmd/evaluate.go      |  26 ++++-
 cmd/eval-dev-quality/cmd/evaluate_test.go |  14 ++-
 evaluate/report/markdown.go               | 108 ++++++++++++++++++
 evaluate/report/markdown_test.go          | 129 ++++++++++++++++++++++
 version/version.go                        |   4 +
 5 files changed, 272 insertions(+), 9 deletions(-)
 create mode 100644 evaluate/report/markdown.go
 create mode 100644 evaluate/report/markdown_test.go
 create mode 100644 version/version.go

diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go
index d03a529e..40b4eed3 100644
--- a/cmd/eval-dev-quality/cmd/evaluate.go
+++ b/cmd/eval-dev-quality/cmd/evaluate.go
@@ -21,6 +21,7 @@ import (
 	_ "github.com/symflower/eval-dev-quality/provider/openrouter" // Register provider.
 	_ "github.com/symflower/eval-dev-quality/provider/symflower"  // Register provider.
 	"github.com/symflower/eval-dev-quality/tools"
+	"github.com/symflower/eval-dev-quality/version"
 )
 
 // Evaluate holds the "evaluation" command.
@@ -58,10 +59,12 @@ const repositoryPlainName = "plain"
 
 // Execute executes the command.
 func (command *Evaluate) Execute(args []string) (err error) {
-	command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", time.Now().Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
+	evaluationTimestamp := time.Now()
+	command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", evaluationTimestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
 	command.logger.Printf("Writing results to %s", command.ResultPath)
 
-	log, logClose, err := log.WithFile(command.logger, filepath.Join(command.ResultPath, "evaluation.log"))
+	logFilePath := filepath.Join(command.ResultPath, "evaluation.log")
+	log, logClose, err := log.WithFile(command.logger, logFilePath)
 	if err != nil {
 		return err
 	}
@@ -231,7 +234,8 @@ func (command *Evaluate) Execute(args []string) (err error) {
 	if err != nil {
 		log.Fatalf("ERROR: could not create result summary: %s", err)
 	}
-	if err := os.WriteFile(filepath.Join(command.ResultPath, "evaluation.csv"), []byte(csv), 0644); err != nil {
+	csvReportPath := filepath.Join(command.ResultPath, "evaluation.csv")
+	if err := os.WriteFile(csvReportPath, []byte(csv), 0644); err != nil {
 		log.Fatalf("ERROR: could not write result summary: %s", err)
 	}
 
@@ -249,7 +253,21 @@ func (command *Evaluate) Execute(args []string) (err error) {
 		totalScore = uint(len(languagesSelected))
 	}
 
-	_ = metrics.WalkByScore(assessments.Collapse(), func(model string, assessment metrics.Assessments, score uint) error {
+	assessmentsPerModel := assessments.Collapse()
+	if err := (report.Markdown{
+		DateTime: evaluationTimestamp,
+		Version:  version.Current,
+
+		CSVPath: csvReportPath,
+		LogPath: logFilePath,
+
+		AssessmentPerModel: assessmentsPerModel,
+		TotalScore:         totalScore,
+	}).WriteToFile(filepath.Join(command.ResultPath, "report.md")); err != nil {
+		return err
+	}
+
+	_ = metrics.WalkByScore(assessmentsPerModel, func(model string, assessment metrics.Assessments, score uint) error {
 		log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore).Name, assessment)
 
 		return nil
diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go
index b0f435c2..95e10887 100644
--- a/cmd/eval-dev-quality/cmd/evaluate_test.go
+++ b/cmd/eval-dev-quality/cmd/evaluate_test.go
@@ -22,7 +22,7 @@ func TestEvaluateExecute(t *testing.T) {
 		Arguments []string
 
 		ExpectedOutputValidate func(t *testing.T, output string, resultPath string)
-		ExpectedResultFiles    map[string]func(t *testing.T, filePath string, data string)
+		ExpectedResultFiles    map[string]func(t *testing.T, resultPath string, filePath string, data string)
 	}
 
 	validate := func(t *testing.T, tc *testCase) {
@@ -54,7 +54,7 @@ func TestEvaluateExecute(t *testing.T) {
 				if validate != nil {
 					data, err := os.ReadFile(filepath.Join(temporaryPath, filePath))
 					if assert.NoError(t, err) {
-						validate(t, filePath, string(data))
+						validate(t, temporaryPath, filePath, string(data))
 					}
 				}
 			}
@@ -78,15 +78,19 @@ func TestEvaluateExecute(t *testing.T) {
 				t.Logf("Output: %s", output)
 			}
 		},
-		ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
-			"evaluation.csv": func(t *testing.T, filePath, data string) {
+		ExpectedResultFiles: map[string]func(t *testing.T, resultPath string, filePath string, data string){
+			"evaluation.csv": func(t *testing.T, resultPath string, filePath, data string) {
 				assert.Equal(t, bytesutil.StringTrimIndentations(`
 					model,language,repository,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
 					symflower/symbolic-execution,golang,golang/plain,6,1,1,1,1,1,1
 				`), data)
 			},
 			"evaluation.log": nil,
-			"report.md":      nil,
+			"report.md": func(t *testing.T, resultPath string, filePath, data string) {
+				// Ensure the report links to the CSV file and logs.
+				assert.Contains(t, data, filepath.Join(resultPath, "evaluation.csv"))
+				assert.Contains(t, data, filepath.Join(resultPath, "evaluation.log"))
+			},
 			"symflower_symbolic-execution/golang/golang/plain.log": nil,
 		},
 	})
diff --git a/evaluate/report/markdown.go b/evaluate/report/markdown.go
new file mode 100644
index 00000000..77511285
--- /dev/null
+++ b/evaluate/report/markdown.go
@@ -0,0 +1,108 @@
+package report
+
+import (
+	"io"
+	"os"
+	"path/filepath"
+	"text/template"
+	"time"
+
+	pkgerrors "github.com/pkg/errors"
+	"github.com/zimmski/osutil/bytesutil"
+
+	"github.com/symflower/eval-dev-quality/evaluate/metrics"
+)
+
+// Markdown holds the values for exporting a Markdown report.
+type Markdown struct {
+	// DateTime holds the timestamp of the evaluation.
+	DateTime time.Time
+	// Version holds the version of the evaluation tool.
+	Version string
+
+	// CSVPath holds the path of detailed CSV results.
+	CSVPath string
+	// LogPath holds the path of detailed logs.
+	LogPath string
+
+	// AssessmentPerModel holds
+	AssessmentPerModel map[string]metrics.Assessments
+	// TotalScore holds the total reachable score per task.
+	// REMARK Used for category computation.
+	TotalScore uint
+}
+
+// markdownTemplateContext holds the values that are handed to the template of a Markdown report.
+type markdownTemplateContext struct {
+	Markdown
+
+	Categories        []*metrics.AssessmentCategory
+	ModelsPerCategory map[*metrics.AssessmentCategory][]string
+}
+
+// markdownTemplate holds the template for a Markdown report.
+var markdownTemplate = template.Must(template.New("template-report").Parse(bytesutil.StringTrimIndentations(`
+	# Evaluation from {{.DateTime.Format "2006-01-02 15:04:05"}}
+
+	This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version {{.Version}}` + "`" + `.
+
+	## Results
+
+	> Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
+
+	The results of all models have been divided into the following categories:
+	{{ range $category := .Categories -}}
+	- {{ $category.Name }}: {{ $category.Description }}
+	{{ end }}
+	The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CSVPath}}).
+
+	{{ range $category := .Categories -}}
+	{{ with $modelNames := index $.ModelsPerCategory $category -}}
+	### "{{ $category.Name }}"
+
+	{{ $category.Description }}
+
+	{{ range $modelName := $modelNames -}}
+	- ` + "`" + `{{ $modelName }}` + "`" + `
+	{{ end }}
+	{{ end }}
+	{{- end -}}
+`)))
+
+// Format formats the markdown values in the template to the given writer.
+func (m Markdown) Format(writer io.Writer) error {
+	templateContext := markdownTemplateContext{
+		Markdown:   m,
+		Categories: metrics.AllAssessmentCategories,
+	}
+	templateContext.ModelsPerCategory = make(map[*metrics.AssessmentCategory][]string, len(metrics.AllAssessmentCategories))
+	for model, assessment := range m.AssessmentPerModel {
+		category := assessment.Category(m.TotalScore)
+		templateContext.ModelsPerCategory[category] = append(templateContext.ModelsPerCategory[category], model)
+	}
+	// TODO Generate svg using maybe https://github.com/wcharczuk/go-chart.
+
+	return pkgerrors.WithStack(markdownTemplate.Execute(writer, templateContext))
+}
+
+// WriteToFile writes the Markdown values in the template to the given file.
+func (t Markdown) WriteToFile(path string) (err error) {
+	t.CSVPath, err = filepath.Abs(t.CSVPath)
+	if err != nil {
+		return err
+	}
+	t.LogPath, err = filepath.Abs(t.LogPath)
+	if err != nil {
+		return err
+	}
+
+	if err = os.MkdirAll(filepath.Base(path), 0755); err != nil {
+		return pkgerrors.WithStack(err)
+	}
+	file, err := os.Create(path)
+	if err != nil {
+		return pkgerrors.WithStack(err)
+	}
+
+	return pkgerrors.WithStack(t.Format(file))
+}
diff --git a/evaluate/report/markdown_test.go b/evaluate/report/markdown_test.go
new file mode 100644
index 00000000..27093aaf
--- /dev/null
+++ b/evaluate/report/markdown_test.go
@@ -0,0 +1,129 @@
+package report
+
+import (
+	"bytes"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"github.com/symflower/eval-dev-quality/evaluate/metrics"
+	"github.com/zimmski/osutil/bytesutil"
+)
+
+func TestMarkdownFormat(t *testing.T) {
+	type testCase struct {
+		Name string
+
+		Markdown Markdown
+
+		ExpectedReport string
+		ExpectedError  error
+	}
+
+	validate := func(t *testing.T, tc *testCase) {
+		t.Run(tc.Name, func(t *testing.T) {
+			var buffer bytes.Buffer
+			actualError := tc.Markdown.Format(&buffer)
+			assert.Equal(t, tc.ExpectedError, actualError)
+			actualReport := buffer.String()
+
+			assert.Equalf(t, bytesutil.StringTrimIndentations(tc.ExpectedReport), actualReport, "Full output:\n%s", actualReport)
+		})
+	}
+
+	testTimeString := "2000-01-01 00:00:00"
+	testTime, err := time.Parse(time.DateTime, testTimeString)
+	require.NoError(t, err)
+
+	validate(t, &testCase{
+		Name: "No Models",
+
+		Markdown: Markdown{
+			DateTime: testTime,
+			Version:  "1234",
+
+			CSVPath: "some/csv/path.csv",
+			LogPath: "some/log/path.log",
+		},
+
+		ExpectedReport: `
+			# Evaluation from 2000-01-01 00:00:00
+
+			This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version 1234` + "`" + `.
+
+			## Results
+
+			> Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
+
+			The results of all models have been divided into the following categories:
+			- Category Unknown: Models in this category could not be categorized.
+			- Response Error: Models in this category encountered an error.
+			- Response Empty: Models in this category produced an empty response.
+			- No Code: Models in this category produced no code.
+			- Invalid Code: Models in this category produced invalid code.
+			- Executable Code: Models in this category produced executable code.
+			- Statement Coverage Reached: Models in this category produced code that reached full statement coverage.
+			- No Excess Response: Models in this category did not respond with more content than requested.
+
+			The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here](some/log/path.log). Detailed scoring can be found [here](some/csv/path.csv).
+
+		`,
+	})
+
+	validate(t, &testCase{
+		Name: "Simple Models",
+
+		Markdown: Markdown{
+			DateTime: testTime,
+			Version:  "1234",
+
+			CSVPath: "some/csv/path.csv",
+			LogPath: "some/log/path.log",
+
+			TotalScore: 1,
+			AssessmentPerModel: map[string]metrics.Assessments{
+				"ModelResponseError": metrics.NewAssessments(),
+				"ModelNoCode": metrics.Assessments{
+					metrics.AssessmentKeyResponseNoError:  1,
+					metrics.AssessmentKeyResponseNotEmpty: 1,
+				},
+			},
+		},
+
+		ExpectedReport: `
+			# Evaluation from 2000-01-01 00:00:00
+
+			This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version 1234` + "`" + `.
+
+			## Results
+
+			> Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
+
+			The results of all models have been divided into the following categories:
+			- Category Unknown: Models in this category could not be categorized.
+			- Response Error: Models in this category encountered an error.
+			- Response Empty: Models in this category produced an empty response.
+			- No Code: Models in this category produced no code.
+			- Invalid Code: Models in this category produced invalid code.
+			- Executable Code: Models in this category produced executable code.
+			- Statement Coverage Reached: Models in this category produced code that reached full statement coverage.
+			- No Excess Response: Models in this category did not respond with more content than requested.
+
+			The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here](some/log/path.log). Detailed scoring can be found [here](some/csv/path.csv).
+
+			### "Response Error"
+
+			Models in this category encountered an error.
+
+			- ` + "`ModelResponseError`" + `
+
+			### "No Code"
+
+			Models in this category produced no code.
+
+			- ` + "`ModelNoCode`" + `
+
+		`,
+	})
+}
diff --git a/version/version.go b/version/version.go
new file mode 100644
index 00000000..e71ccf5a
--- /dev/null
+++ b/version/version.go
@@ -0,0 +1,4 @@
+package version
+
+// Current holds the current version.
+var Current = "0.2.0"

From b6eccad7f8367aaef08b3d04b4aa0c54ced8f9a0 Mon Sep 17 00:00:00 2001
From: Simon Bauer <simon.bauer@symflower.com>
Date: Tue, 23 Apr 2024 11:51:01 +0200
Subject: [PATCH 3/3] SVG category export

Closes #49
---
 cmd/eval-dev-quality/cmd/evaluate.go        |   1 +
 cmd/eval-dev-quality/cmd/evaluate_test.go   |   7 +-
 evaluate/report/markdown.go                 |  81 ++++++++++++++-
 evaluate/report/markdown_test.go            | 108 +++++++++++++++++++-
 evaluate/report/testdata/all_categories.svg |  61 +++++++++++
 evaluate/report/testdata/two_categories.svg |  28 +++++
 evaluate/report/testdata/two_models.svg     |  24 +++++
 go.mod                                      |   4 +
 go.sum                                      |  39 +++++++
 9 files changed, 349 insertions(+), 4 deletions(-)
 create mode 100644 evaluate/report/testdata/all_categories.svg
 create mode 100644 evaluate/report/testdata/two_categories.svg
 create mode 100644 evaluate/report/testdata/two_models.svg

diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go
index 40b4eed3..f60b6244 100644
--- a/cmd/eval-dev-quality/cmd/evaluate.go
+++ b/cmd/eval-dev-quality/cmd/evaluate.go
@@ -260,6 +260,7 @@ func (command *Evaluate) Execute(args []string) (err error) {
 
 		CSVPath: csvReportPath,
 		LogPath: logFilePath,
+		SVGPath: filepath.Join(command.ResultPath, "categories.svg"),
 
 		AssessmentPerModel: assessmentsPerModel,
 		TotalScore:         totalScore,
diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go
index 95e10887..9258affa 100644
--- a/cmd/eval-dev-quality/cmd/evaluate_test.go
+++ b/cmd/eval-dev-quality/cmd/evaluate_test.go
@@ -79,6 +79,10 @@ func TestEvaluateExecute(t *testing.T) {
 			}
 		},
 		ExpectedResultFiles: map[string]func(t *testing.T, resultPath string, filePath string, data string){
+			"categories.svg": func(t *testing.T, resultPath string, filePath, data string) {
+				assert.Contains(t, data, "No Excess Response</text>") // Assert "no excess" category is present.
+				assert.Contains(t, data, "1</text>")                  // Assert the Y-axis label is at least one for one model in that category.
+			},
 			"evaluation.csv": func(t *testing.T, resultPath string, filePath, data string) {
 				assert.Equal(t, bytesutil.StringTrimIndentations(`
 					model,language,repository,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
@@ -87,7 +91,8 @@ func TestEvaluateExecute(t *testing.T) {
 			},
 			"evaluation.log": nil,
 			"report.md": func(t *testing.T, resultPath string, filePath, data string) {
-				// Ensure the report links to the CSV file and logs.
+				// Ensure the report links to the CSV file, SVG file and logs.
+				assert.Contains(t, data, filepath.Join(resultPath, "categories.svg"))
 				assert.Contains(t, data, filepath.Join(resultPath, "evaluation.csv"))
 				assert.Contains(t, data, filepath.Join(resultPath, "evaluation.log"))
 			},
diff --git a/evaluate/report/markdown.go b/evaluate/report/markdown.go
index 77511285..ed9b01b2 100644
--- a/evaluate/report/markdown.go
+++ b/evaluate/report/markdown.go
@@ -4,10 +4,13 @@ import (
 	"io"
 	"os"
 	"path/filepath"
+	"sort"
+	"strconv"
 	"text/template"
 	"time"
 
 	pkgerrors "github.com/pkg/errors"
+	"github.com/wcharczuk/go-chart/v2"
 	"github.com/zimmski/osutil/bytesutil"
 
 	"github.com/symflower/eval-dev-quality/evaluate/metrics"
@@ -24,6 +26,9 @@ type Markdown struct {
 	CSVPath string
 	// LogPath holds the path of detailed logs.
 	LogPath string
+	// SVGPath holds the path of the charted results.
+	// REMARK The charts will be generated automatically during the export if this path is set.
+	SVGPath string
 
-	// AssessmentPerModel holds
+	// AssessmentPerModel holds the assessments of every evaluated model, keyed by the model name.
 	AssessmentPerModel map[string]metrics.Assessments
@@ -44,6 +49,11 @@ type markdownTemplateContext struct {
 var markdownTemplate = template.Must(template.New("template-report").Parse(bytesutil.StringTrimIndentations(`
 	# Evaluation from {{.DateTime.Format "2006-01-02 15:04:05"}}
 
+	{{ with $svgPath := .SVGPath -}}
+	![Bar chart that categorizes all evaluated models.]({{$svgPath}})
+
+	{{ end -}}
+
 	This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version {{.Version}}` + "`" + `.
 
 	## Results
@@ -69,6 +79,52 @@ var markdownTemplate = template.Must(template.New("template-report").Parse(bytes
 	{{- end -}}
 `)))
 
+// barChartModelsPerCategoriesSVG generates a bar chart showing models per category and writes it out as an SVG.
+func barChartModelsPerCategoriesSVG(writer io.Writer, categories []*metrics.AssessmentCategory, modelsPerCategory map[*metrics.AssessmentCategory][]string) error {
+	bars := make([]chart.Value, 0, len(categories))
+	maxCount := 0
+	for _, category := range categories {
+		count := len(modelsPerCategory[category])
+		if count > maxCount {
+			maxCount = count
+		}
+		if count == 0 {
+			continue
+		}
+
+		bars = append(bars, chart.Value{
+			Label: category.Name,
+			Value: float64(count),
+		})
+	}
+	ticks := make([]chart.Tick, maxCount+1)
+	for i := range ticks {
+		ticks[i] = chart.Tick{
+			Value: float64(i),
+			Label: strconv.Itoa(i),
+		}
+	}
+	graph := chart.BarChart{
+		Title: "Models per Category",
+		Bars:  bars,
+		YAxis: chart.YAxis{
+			Ticks: ticks,
+		},
+
+		Background: chart.Style{
+			Padding: chart.Box{
+				Top:    60,
+				Bottom: 40,
+			},
+		},
+		Height:   300,
+		Width:    (len(bars) + 2) * 60,
+		BarWidth: 60,
+	}
+
+	return pkgerrors.WithStack(graph.Render(chart.SVG, writer))
+}
+
 // Format formats the markdown values in the template to the given writer.
 func (m Markdown) Format(writer io.Writer) error {
 	templateContext := markdownTemplateContext{
@@ -80,10 +136,36 @@ func (m Markdown) Format(writer io.Writer) error {
 		category := assessment.Category(m.TotalScore)
 		templateContext.ModelsPerCategory[category] = append(templateContext.ModelsPerCategory[category], model)
 	}
-	// TODO Generate svg using maybe https://github.com/wcharczuk/go-chart.
+	// Map iteration order is random, so sort the model names of every category to keep the report reproducible.
+	for _, modelNames := range templateContext.ModelsPerCategory {
+		sort.Strings(modelNames)
+	}
+
+	// Only render the chart if an SVG path is set, the template leaves out the image otherwise.
+	if m.SVGPath != "" {
+		if err := writeModelsPerCategorySVGFile(m.SVGPath, metrics.AllAssessmentCategories, templateContext.ModelsPerCategory); err != nil {
+			return err
+		}
+	}
 
 	return pkgerrors.WithStack(markdownTemplate.Execute(writer, templateContext))
 }
+
+// writeModelsPerCategorySVGFile renders the bar chart of models per category into a newly created SVG file at the given path.
+// An error of closing the file is returned instead of panicking, unless rendering already failed.
+func writeModelsPerCategorySVGFile(path string, categories []*metrics.AssessmentCategory, modelsPerCategory map[*metrics.AssessmentCategory][]string) (err error) {
+	svgFile, err := os.Create(path)
+	if err != nil {
+		return pkgerrors.WithStack(err)
+	}
+	defer func() {
+		if closeErr := svgFile.Close(); closeErr != nil && err == nil {
+			err = pkgerrors.WithStack(closeErr)
+		}
+	}()
+
+	return barChartModelsPerCategoriesSVG(svgFile, categories, modelsPerCategory)
+}
 
 // WriteToFile writes the Markdown values in the template to the given file.
 func (t Markdown) WriteToFile(path string) (err error) {
@@ -95,14 +168,27 @@ func (t Markdown) WriteToFile(path string) (err error) {
 	if err != nil {
 		return err
 	}
+	if t.SVGPath != "" {
+		t.SVGPath, err = filepath.Abs(t.SVGPath)
+		if err != nil {
+			return err
+		}
+	}
 
-	if err = os.MkdirAll(filepath.Base(path), 0755); err != nil {
+	// Create the directory that contains the report, "filepath.Base" would create a directory named like the report file itself.
+	if err = os.MkdirAll(filepath.Dir(path), 0755); err != nil {
 		return pkgerrors.WithStack(err)
 	}
 	file, err := os.Create(path)
 	if err != nil {
 		return pkgerrors.WithStack(err)
 	}
+	// Close the report file and return a failing close unless an earlier error is already being returned.
+	defer func() {
+		if closeErr := file.Close(); closeErr != nil && err == nil {
+			err = pkgerrors.WithStack(closeErr)
+		}
+	}()
 
 	return pkgerrors.WithStack(t.Format(file))
 }
diff --git a/evaluate/report/markdown_test.go b/evaluate/report/markdown_test.go
index 27093aaf..c61e80ec 100644
--- a/evaluate/report/markdown_test.go
+++ b/evaluate/report/markdown_test.go
@@ -2,33 +2,75 @@ package report
 
 import (
 	"bytes"
+	"os"
+	"path/filepath"
+	"strings"
 	"testing"
 	"time"
 
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
-	"github.com/symflower/eval-dev-quality/evaluate/metrics"
 	"github.com/zimmski/osutil/bytesutil"
+
+	"github.com/symflower/eval-dev-quality/evaluate/metrics"
 )
 
+// validateFileContent asserts that the file content matches the content of the given file path.
+// The expected file is created if it does not exist. If the contents don't match, the actual content is written to disk alongside the expected file.
+func validateFileContent(t *testing.T, expectedFilePath string, actualFileContent string) {
+	require.NotEmpty(t, expectedFilePath, "expected file path cannot be empty")
+	expectedContent, err := os.ReadFile(expectedFilePath)
+	if err != nil {
+		// Create the file if it does not exist already to make it easy to add new cases.
+		require.ErrorIs(t, err, os.ErrNotExist)
+		expectedContent = []byte("TODO")
+		require.NoError(t, os.WriteFile(expectedFilePath, expectedContent, 0644))
+		t.Logf("expected file %q does not exist yet, created it for you", expectedFilePath)
+	}
+
+	if !assert.Equalf(t, string(expectedContent), actualFileContent, "actual content:\n%s", actualFileContent) {
+		extension := filepath.Ext(expectedFilePath)
+		actualFile := strings.TrimSuffix(expectedFilePath, extension) + "_actual" + extension
+		require.NoError(t, os.WriteFile(actualFile, []byte(actualFileContent), 0644))
+		t.Logf("written actual file content for failing %q to %q", t.Name(), actualFile)
+	}
+}
+
 func TestMarkdownFormat(t *testing.T) {
 	type testCase struct {
 		Name string
 
+		// Markdown holds the Markdown values.
+		// REMARK Do not set the SVG path in the tests as it is set to a temporary file automatically.
 		Markdown Markdown
 
 		ExpectedReport string
-		ExpectedError  error
+		// ExpectedSVGFile is the path to the reference file for the generated SVG content.
+		// REMARK If no SVG reference file is set, none will be generated in the template.
+		ExpectedSVGFile string
+		ExpectedError   error
 	}
 
 	validate := func(t *testing.T, tc *testCase) {
 		t.Run(tc.Name, func(t *testing.T) {
+			temporaryDirectory := t.TempDir()
+			if tc.ExpectedSVGFile != "" {
+				tc.Markdown.SVGPath = filepath.Join(temporaryDirectory, "test.svg")
+			}
+
 			var buffer bytes.Buffer
 			actualError := tc.Markdown.Format(&buffer)
 			assert.Equal(t, tc.ExpectedError, actualError)
 			actualReport := buffer.String()
+			actualReport = strings.ReplaceAll(actualReport, temporaryDirectory, "$TEST_DIR")
 
 			assert.Equalf(t, bytesutil.StringTrimIndentations(tc.ExpectedReport), actualReport, "Full output:\n%s", actualReport)
+
+			if tc.ExpectedSVGFile != "" {
+				actualSVGContent, err := os.ReadFile(tc.Markdown.SVGPath)
+				assert.NoError(t, err)
+				validateFileContent(t, tc.ExpectedSVGFile, string(actualSVGContent))
+			}
 		})
 	}
 
@@ -94,6 +136,8 @@ func TestMarkdownFormat(t *testing.T) {
 		ExpectedReport: `
 			# Evaluation from 2000-01-01 00:00:00
 
+			![Bar chart that categorizes all evaluated models.]($TEST_DIR/test.svg)
+
 			This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version 1234` + "`" + `.
 
 			## Results
@@ -125,5 +169,65 @@ func TestMarkdownFormat(t *testing.T) {
 			- ` + "`ModelNoCode`" + `
 
 		`,
+		ExpectedSVGFile: "testdata/two_models.svg",
+	})
+}
+
+func TestBarChartModelsPerCategoriesSVG(t *testing.T) { // Renders bar charts via "barChartModelsPerCategoriesSVG" and compares them with reference SVG files.
+	type testCase struct {
+		Name string
+
+		Categories        []*metrics.AssessmentCategory        // Categories holds the categories handed to the chart function.
+		ModelsPerCategory map[*metrics.AssessmentCategory]uint // ModelsPerCategory holds how many (unnamed) models each category receives.
+
+		ExpectedFile  string
+		ExpectedError error
+	}
+
+	validate := func(t *testing.T, tc *testCase) {
+		t.Run(tc.Name, func(t *testing.T) {
+			var actualSVGContent bytes.Buffer
+			dummyModelsPerCategory := make(map[*metrics.AssessmentCategory][]string)
+			for category, count := range tc.ModelsPerCategory {
+				dummyModelsPerCategory[category] = make([]string, count) // Model names stay empty; presumably only their number matters for the chart (to be confirmed).
+			}
+
+			actualError := barChartModelsPerCategoriesSVG(&actualSVGContent, tc.Categories, dummyModelsPerCategory)
+			assert.Equal(t, tc.ExpectedError, actualError)
+
+			validateFileContent(t, tc.ExpectedFile, actualSVGContent.String())
+		})
+	}
+
+	validate(t, &testCase{
+		Name: "Two Categories",
+
+		Categories: []*metrics.AssessmentCategory{
+			metrics.AssessmentCategoryResponseError,
+			metrics.AssessmentCategoryResponseNoCode,
+		},
+		ModelsPerCategory: map[*metrics.AssessmentCategory]uint{
+			metrics.AssessmentCategoryResponseError:  1,
+			metrics.AssessmentCategoryResponseNoCode: 3,
+		},
+
+		ExpectedFile: "testdata/two_categories.svg",
+	})
+
+	validate(t, &testCase{
+		Name: "All Categories",
+
+		Categories: metrics.AllAssessmentCategories,
+		ModelsPerCategory: map[*metrics.AssessmentCategory]uint{
+			metrics.AssessmentCategoryResponseError:                1,
+			metrics.AssessmentCategoryResponseEmpty:                2,
+			metrics.AssessmentCategoryResponseNoCode:               3,
+			metrics.AssessmentCategoryCodeInvalid:                  4,
+			metrics.AssessmentCategoryCodeExecuted:                 5,
+			metrics.AssessmentCategoryCodeCoverageStatementReached: 6,
+			metrics.AssessmentCategoryCodeNoExcess:                 7,
+		},
+
+		ExpectedFile: "testdata/all_categories.svg",
 	})
 }
diff --git a/evaluate/report/testdata/all_categories.svg b/evaluate/report/testdata/all_categories.svg
new file mode 100644
index 00000000..93da3159
--- /dev/null
+++ b/evaluate/report/testdata/all_categories.svg
@@ -0,0 +1,61 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="540" height="300">\n<path  d="M 0 0
+L 540 0
+L 540 300
+L 0 300
+L 0 0" style="stroke-width:0;stroke:rgba(255,255,255,1.0);fill:rgba(255,255,255,1.0)"/><path  d="M 20 66
+L 512 66
+L 512 225
+L 20 225
+L 20 66" style="stroke-width:0;stroke:rgba(255,255,255,1.0);fill:rgba(255,255,255,1.0)"/><path  d="M 25 202
+L 85 202
+L 85 225
+L 25 225
+L 25 202" style="stroke-width:3;stroke:rgba(106,195,203,1.0);fill:rgba(106,195,203,1.0)"/><path  d="M 96 179
+L 156 179
+L 156 225
+L 96 225
+L 96 179" style="stroke-width:3;stroke:rgba(42,190,137,1.0);fill:rgba(42,190,137,1.0)"/><path  d="M 167 156
+L 227 156
+L 227 225
+L 167 225
+L 167 156" style="stroke-width:3;stroke:rgba(110,128,139,1.0);fill:rgba(110,128,139,1.0)"/><path  d="M 238 134
+L 298 134
+L 298 225
+L 238 225
+L 238 134" style="stroke-width:3;stroke:rgba(240,174,90,1.0);fill:rgba(240,174,90,1.0)"/><path  d="M 309 111
+L 369 111
+L 369 225
+L 309 225
+L 309 111" style="stroke-width:3;stroke:rgba(0,116,217,1.0);fill:rgba(0,116,217,1.0)"/><path  d="M 380 88
+L 440 88
+L 440 225
+L 380 225
+L 380 88" style="stroke-width:3;stroke:rgba(0,217,101,1.0);fill:rgba(0,217,101,1.0)"/><path  d="M 451 66
+L 511 66
+L 511 225
+L 451 225
+L 451 66" style="stroke-width:3;stroke:rgba(217,0,116,1.0);fill:rgba(217,0,116,1.0)"/><path  d="M 20 225
+L 512 225" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><path  d="M 20 225
+L 20 230" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="27" y="247" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Response</text><text x="41" y="264" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Error</text><path  d="M 91 225
+L 91 230" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="98" y="247" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Response</text><text x="108" y="264" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Empty</text><path  d="M 162 225
+L 162 230" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="172" y="247" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">No Code</text><path  d="M 233 225
+L 233 230" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="249" y="247" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Invalid</text><text x="253" y="264" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Code</text><path  d="M 304 225
+L 304 230" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="308" y="247" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Executable</text><text x="324" y="264" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Code</text><path  d="M 375 225
+L 375 230" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="380" y="247" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Statement</text><text x="383" y="264" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Coverage</text><text x="385" y="281" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Reached</text><path  d="M 446 225
+L 446 230" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="451" y="247" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">No Excess</text><text x="453" y="264" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Response</text><path  d="M 512 225
+L 512 66" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><path  d="M 512 225
+L 517 225" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="522" y="231" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">0</text><path  d="M 512 202
+L 517 202" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="522" y="208" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">1</text><path  d="M 512 179
+L 517 179" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="522" y="185" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">2</text><path  d="M 512 156
+L 517 156" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="522" y="162" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">3</text><path  d="M 512 134
+L 517 134" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="522" y="140" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">4</text><path  d="M 512 111
+L 517 111" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="522" y="117" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">5</text><path  d="M 512 88
+L 517 88" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="522" y="94" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">6</text><path  d="M 512 66
+L 517 66" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="522" y="72" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">7</text><path  d="M 20 225
+L 512 225" style="stroke-width:0;stroke:none;fill:none"/><path  d="M 20 202
+L 512 202" style="stroke-width:0;stroke:none;fill:none"/><path  d="M 20 179
+L 512 179" style="stroke-width:0;stroke:none;fill:none"/><path  d="M 20 156
+L 512 156" style="stroke-width:0;stroke:none;fill:none"/><path  d="M 20 134
+L 512 134" style="stroke-width:0;stroke:none;fill:none"/><path  d="M 20 111
+L 512 111" style="stroke-width:0;stroke:none;fill:none"/><path  d="M 20 88
+L 512 88" style="stroke-width:0;stroke:none;fill:none"/><text x="199" y="25" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:15.3px;font-family:'Roboto Medium',sans-serif">Models per Category</text></svg>
\ No newline at end of file
diff --git a/evaluate/report/testdata/two_categories.svg b/evaluate/report/testdata/two_categories.svg
new file mode 100644
index 00000000..7d1985f7
--- /dev/null
+++ b/evaluate/report/testdata/two_categories.svg
@@ -0,0 +1,28 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="240" height="300">\n<path  d="M 0 0
+L 240 0
+L 240 300
+L 0 300
+L 0 0" style="stroke-width:0;stroke:rgba(255,255,255,1.0);fill:rgba(255,255,255,1.0)"/><path  d="M 20 66
+L 212 66
+L 212 225
+L 20 225
+L 20 66" style="stroke-width:0;stroke:rgba(255,255,255,1.0);fill:rgba(255,255,255,1.0)"/><path  d="M 38 172
+L 98 172
+L 98 225
+L 38 225
+L 38 172" style="stroke-width:3;stroke:rgba(106,195,203,1.0);fill:rgba(106,195,203,1.0)"/><path  d="M 134 66
+L 194 66
+L 194 225
+L 134 225
+L 134 66" style="stroke-width:3;stroke:rgba(42,190,137,1.0);fill:rgba(42,190,137,1.0)"/><path  d="M 20 225
+L 212 225" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><path  d="M 20 225
+L 20 230" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="24" y="247" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Response Error</text><path  d="M 116 225
+L 116 230" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="139" y="247" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">No Code</text><path  d="M 212 225
+L 212 66" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><path  d="M 212 225
+L 217 225" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="222" y="231" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">0</text><path  d="M 212 172
+L 217 172" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="222" y="178" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">1</text><path  d="M 212 119
+L 217 119" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="222" y="125" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">2</text><path  d="M 212 66
+L 217 66" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="222" y="72" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">3</text><path  d="M 20 225
+L 212 225" style="stroke-width:0;stroke:none;fill:none"/><path  d="M 20 172
+L 212 172" style="stroke-width:0;stroke:none;fill:none"/><path  d="M 20 119
+L 212 119" style="stroke-width:0;stroke:none;fill:none"/><text x="61" y="22" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Models per Category</text></svg>
\ No newline at end of file
diff --git a/evaluate/report/testdata/two_models.svg b/evaluate/report/testdata/two_models.svg
new file mode 100644
index 00000000..804f122c
--- /dev/null
+++ b/evaluate/report/testdata/two_models.svg
@@ -0,0 +1,24 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="240" height="300">\n<path  d="M 0 0
+L 240 0
+L 240 300
+L 0 300
+L 0 0" style="stroke-width:0;stroke:rgba(255,255,255,1.0);fill:rgba(255,255,255,1.0)"/><path  d="M 20 66
+L 212 66
+L 212 225
+L 20 225
+L 20 66" style="stroke-width:0;stroke:rgba(255,255,255,1.0);fill:rgba(255,255,255,1.0)"/><path  d="M 38 66
+L 98 66
+L 98 225
+L 38 225
+L 38 66" style="stroke-width:3;stroke:rgba(106,195,203,1.0);fill:rgba(106,195,203,1.0)"/><path  d="M 134 66
+L 194 66
+L 194 225
+L 134 225
+L 134 66" style="stroke-width:3;stroke:rgba(42,190,137,1.0);fill:rgba(42,190,137,1.0)"/><path  d="M 20 225
+L 212 225" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><path  d="M 20 225
+L 20 230" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="24" y="247" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Response Error</text><path  d="M 116 225
+L 116 230" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="139" y="247" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">No Code</text><path  d="M 212 225
+L 212 66" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><path  d="M 212 225
+L 217 225" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="222" y="231" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">0</text><path  d="M 212 66
+L 217 66" style="stroke-width:0;stroke:rgba(51,51,51,1.0);fill:none"/><text x="222" y="72" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">1</text><path  d="M 20 225
+L 212 225" style="stroke-width:0;stroke:none;fill:none"/><text x="61" y="22" style="stroke-width:0;stroke:none;fill:rgba(51,51,51,1.0);font-size:12.8px;font-family:'Roboto Medium',sans-serif">Models per Category</text></svg>
\ No newline at end of file
diff --git a/go.mod b/go.mod
index 38a024a1..b7873d3a 100644
--- a/go.mod
+++ b/go.mod
@@ -8,13 +8,16 @@ require (
 	github.com/sashabaranov/go-openai v1.20.4
 	github.com/stretchr/testify v1.9.0
 	github.com/symflower/lockfile v0.0.0-20240419143922-aa3b60940c84
+	github.com/wcharczuk/go-chart/v2 v2.1.1
 	github.com/zimmski/osutil v1.1.1-0.20240329123339-0f31a74c026a
 	golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8
 )
 
 require (
 	github.com/avast/retry-go v3.0.0+incompatible // indirect
+	github.com/blend/go-sdk v1.20220411.3 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
 	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/rivo/uniseg v0.4.7 // indirect
@@ -23,6 +26,7 @@ require (
 	github.com/termie/go-shutil v0.0.0-20140729215957-bcacb06fecae // indirect
 	github.com/ulikunitz/xz v0.5.11 // indirect
 	github.com/yuin/goldmark v1.7.0 // indirect
+	golang.org/x/image v0.11.0 // indirect
 	golang.org/x/sys v0.18.0 // indirect
 	golang.org/x/term v0.18.0 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
diff --git a/go.sum b/go.sum
index 35701e8b..97db46a8 100644
--- a/go.sum
+++ b/go.sum
@@ -1,8 +1,12 @@
 github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0=
 github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
+github.com/blend/go-sdk v1.20220411.3 h1:GFV4/FQX5UzXLPwWV03gP811pj7B8J2sbuq+GJQofXc=
+github.com/blend/go-sdk v1.20220411.3/go.mod h1:7lnH8fTi6U4i1fArEXRyOIY2E1X4MALg09qsQqY1+ak=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
+github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
 github.com/jessevdk/go-flags v1.5.1-0.20210607101731-3927b71304df h1:JTDw/M13b6dZmEJI/vfcCLENqcjUHi9UBry+R0pjh5Q=
 github.com/jessevdk/go-flags v1.5.1-0.20210607101731-3927b71304df/go.mod h1:Fw0T6WPc1dYxT4mKEZRfG5kJhaTDP9pj1c2EWnYs/m4=
 github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw=
@@ -31,20 +35,55 @@ github.com/termie/go-shutil v0.0.0-20140729215957-bcacb06fecae h1:vgGSvdW5Lqg+I1
 github.com/termie/go-shutil v0.0.0-20140729215957-bcacb06fecae/go.mod h1:quDq6Se6jlGwiIKia/itDZxqC5rj6/8OdFyMMAwTxCs=
 github.com/ulikunitz/xz v0.5.11 h1:kpFauv27b6ynzBNT/Xy+1k+fK4WswhN/6PN5WhFAGw8=
 github.com/ulikunitz/xz v0.5.11/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
+github.com/wcharczuk/go-chart/v2 v2.1.1 h1:2u7na789qiD5WzccZsFz4MJWOJP72G+2kUuJoSNqWnE=
+github.com/wcharczuk/go-chart/v2 v2.1.1/go.mod h1:CyCAUt2oqvfhCl6Q5ZvAZwItgpQKZOkCJGb+VGv6l14=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
 github.com/yuin/goldmark v1.7.0 h1:EfOIvIMZIzHdB/R/zVrikYLPPwJlfMcNczJFMs1m6sA=
 github.com/yuin/goldmark v1.7.0/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E=
 github.com/zimmski/osutil v1.1.1-0.20240329123339-0f31a74c026a h1:FnAMI1+eOEJc3cr9r4O3XR632HLh6UYsG2gYLCbWEuE=
 github.com/zimmski/osutil v1.1.1-0.20240329123339-0f31a74c026a/go.mod h1:TZrA1ZvRIeylQ0ECaANmCVlT0WR/62zJxMdQX9SyLvY=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
 golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 h1:aAcj0Da7eBAtrTp03QXWvm88pSyOt+UgdZw2BFZ+lEw=
 golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8/go.mod h1:CQ1k9gNrJ50XIzaKCRR2hssIjF07kZFEiieALBM/ARQ=
+golang.org/x/image v0.11.0 h1:ds2RoQvBvYTiJkwpSFDwCcDFNX7DqjL2WsUgTNk0Ooo=
+golang.org/x/image v0.11.0/go.mod h1:bglhjqbqVuEb9e9+eNR45Jfu7D+T4Qan+NhQk8Ck2P8=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4=
 golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
 golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
 golang.org/x/term v0.18.0 h1:FcHjZXDMxI8mM3nwhX9HlKop4C0YQvCVCdwYl2wOtE8=
 golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=