From 8612a1b062d02c0bb05901d2434b8abaa3a5edba Mon Sep 17 00:00:00 2001
From: Rui Azevedo <rui.azevedo@symflower.com>
Date: Mon, 22 Jul 2024 11:47:53 +0100
Subject: [PATCH] Let the "report" command also generate a markdown report for
 the combined evaluations

Part of #205
---
 cmd/eval-dev-quality/cmd/evaluate.go    |   4 +-
 cmd/eval-dev-quality/cmd/report.go      |  22 ++++
 cmd/eval-dev-quality/cmd/report_test.go |  48 ++++---
 evaluate/report/csv.go                  |  39 ++++++
 evaluate/report/csv_test.go             | 163 ++++++++++++++++++++++++
 evaluate/report/markdown.go             |   2 +-
 6 files changed, 256 insertions(+), 22 deletions(-)

diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go
index 8701097c9..52cfe2400 100644
--- a/cmd/eval-dev-quality/cmd/evaluate.go
+++ b/cmd/eval-dev-quality/cmd/evaluate.go
@@ -446,8 +446,8 @@ func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err
 		command.logger.Panicf("ERROR: %s", err)
 	}
 
-	_ = assessmentsPerModel.WalkByScore(func(model model.Model, assessment metrics.Assessments, score uint64) (err error) {
-		command.logger.Printf("Evaluation score for %q (%q): %s", model.ID(), assessment.Category(totalScore).ID, assessment)
+	_ = assessmentsPerModel.WalkByScore(func(model string, assessment metrics.Assessments, score uint64) (err error) {
+		command.logger.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore).ID, assessment)
 
 		return nil
 	})
diff --git a/cmd/eval-dev-quality/cmd/report.go b/cmd/eval-dev-quality/cmd/report.go
index cf1cd2dcb..42297a0ec 100644
--- a/cmd/eval-dev-quality/cmd/report.go
+++ b/cmd/eval-dev-quality/cmd/report.go
@@ -3,11 +3,13 @@ package cmd
 import (
 	"os"
 	"path/filepath"
+	"time"
 
 	pkgerrors "github.com/pkg/errors"
 	"github.com/zimmski/osutil"
 	"golang.org/x/exp/maps"
 
+	"github.com/symflower/eval-dev-quality/evaluate"
 	"github.com/symflower/eval-dev-quality/evaluate/report"
 	"github.com/symflower/eval-dev-quality/log"
 )
@@ -21,6 +23,8 @@ type Report struct {
 
 	// logger holds the logger of the command.
 	logger *log.Logger
+	// timestamp holds the timestamp of the command execution.
+	timestamp time.Time
 }
 
 var _ SetLogger = (*Evaluate)(nil)
@@ -83,6 +87,24 @@ func (command *Report) Execute(args []string) (err error) {
 		command.logger.Panicf("ERROR: %s", err)
 	}
 
+	// Write markdown reports.
+	assessmentsPerModel, err := report.RecordsToAssessmentsPerModel(records)
+	if err != nil {
+		return err
+	}
+	if err := (report.Markdown{
+		DateTime: command.timestamp,
+		Version:  evaluate.Version,
+		Revision: evaluate.Revision,
+
+		CSVPath: "./evaluation.csv",
+		SVGPath: "./categories.svg",
+
+		AssessmentPerModel: assessmentsPerModel,
+	}).WriteToFile(filepath.Join(command.ResultPath, "README.md")); err != nil {
+		command.logger.Panicf("ERROR: %s", err)
+	}
+
 	return nil
 }
 
diff --git a/cmd/eval-dev-quality/cmd/report_test.go b/cmd/eval-dev-quality/cmd/report_test.go
index eaa6d8be0..f701484c3 100644
--- a/cmd/eval-dev-quality/cmd/report_test.go
+++ b/cmd/eval-dev-quality/cmd/report_test.go
@@ -10,7 +10,6 @@ import (
 
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
-	"github.com/symflower/eval-dev-quality/evaluate/metrics"
 	"github.com/symflower/eval-dev-quality/evaluate/report"
 	"github.com/symflower/eval-dev-quality/log"
 	"github.com/zimmski/osutil"
@@ -38,6 +37,15 @@ var gpt4EvaluationCSVFileContent = bytesutil.StringTrimIndentations(`
 	openrouter/openai/gpt-4,java,java/plain,write-tests,12,12,12,12,12,12,12,12,12,12
 `)
 
+// validateMarkdownLinks asserts that the Markdown report data links to the categories chart, to the evaluation CSV and to the directory of each given model.
+func validateMarkdownLinks(t *testing.T, data string, modelLogNames []string) {
+	assert.Contains(t, data, "](./categories.svg)")
+	assert.Contains(t, data, "](./evaluation.csv)")
+	for _, modelLogName := range modelLogNames {
+		assert.Contains(t, data, fmt.Sprintf("](./%s/)", modelLogName))
+	}
+}
+
 func TestReportExecute(t *testing.T) {
 	type testCase struct {
 		Name string
@@ -148,6 +156,10 @@ func TestReportExecute(t *testing.T) {
 
 		ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
 			"evaluation.csv": nil,
+			filepath.Join("result-directory", "categories.svg"): nil,
+			filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath string, data string) {
+				validateMarkdownLinks(t, data, []string{"openrouter_anthropic_claude-2.0"})
+			},
 			filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) {
 				expectedContent := fmt.Sprintf("%s\n%s", strings.Join(report.EvaluationHeader(), ","), claudeEvaluationCSVFileContent)
 				assert.Equal(t, expectedContent, data)
@@ -176,6 +188,14 @@ func TestReportExecute(t *testing.T) {
 			filepath.Join("docs", "v5", "claude", "evaluation.csv"):             nil,
 			filepath.Join("docs", "v5", "gemma", "evaluation.csv"):              nil,
 			filepath.Join("docs", "v5", "openrouter", "gpt4", "evaluation.csv"): nil,
+			filepath.Join("result-directory", "categories.svg"):                 nil,
+			filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath string, data string) {
+				validateMarkdownLinks(t, data, []string{
+					"openrouter_anthropic_claude-2.0",
+					"openrouter_google_gemma-7b-it",
+					"openrouter_openai_gpt-4",
+				})
+			},
 			filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) {
 				expectedContent := fmt.Sprintf("%s\n%s%s%s", strings.Join(report.EvaluationHeader(), ","), claudeEvaluationCSVFileContent, gemmaEvaluationCSVFileContent, gpt4EvaluationCSVFileContent)
 				assert.Equal(t, expectedContent, data)
@@ -201,6 +221,14 @@ func TestReportExecute(t *testing.T) {
 			filepath.Join("docs", "v5", "claude", "evaluation.csv"): nil,
 			filepath.Join("docs", "v5", "gemma", "evaluation.csv"):  nil,
 			filepath.Join("docs", "v5", "gpt4", "evaluation.csv"):   nil,
+			filepath.Join("result-directory", "categories.svg"):     nil,
+			filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath string, data string) {
+				validateMarkdownLinks(t, data, []string{
+					"openrouter_anthropic_claude-2.0",
+					"openrouter_google_gemma-7b-it",
+					"openrouter_openai_gpt-4",
+				})
+			},
 			filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) {
 				expectedContent := fmt.Sprintf("%s\n%s%s%s", strings.Join(report.EvaluationHeader(), ","), claudeEvaluationCSVFileContent, gemmaEvaluationCSVFileContent, gpt4EvaluationCSVFileContent)
 				assert.Equal(t, expectedContent, data)
@@ -293,21 +321,3 @@ func evaluationFileWithContent(t *testing.T, workingDirectory string, content st
 	require.NoError(t, os.MkdirAll(workingDirectory, 0700))
 	require.NoError(t, os.WriteFile(filepath.Join(workingDirectory, "evaluation.csv"), []byte(content), 0700))
 }
-
-// validateReportLinks checks if the Markdown report data contains all the links to other relevant report files.
-func validateReportLinks(t *testing.T, data string, modelLogNames []string) {
-	assert.Contains(t, data, "](./categories.svg)")
-	assert.Contains(t, data, "](./evaluation.csv)")
-	assert.Contains(t, data, "](./evaluation.log)")
-	for _, m := range modelLogNames {
-		assert.Contains(t, data, fmt.Sprintf("](./%s/)", m))
-	}
-}
-
-// validateSVGContent checks if the SVG data contains all given categories and an axis label for the maximal model count.
-func validateSVGContent(t *testing.T, data string, categories []*metrics.AssessmentCategory, maxModelCount uint) {
-	for _, category := range categories {
-		assert.Contains(t, data, fmt.Sprintf("%s</text>", category.Name))
-	}
-	assert.Contains(t, data, fmt.Sprintf("%d</text>", maxModelCount))
-}
diff --git a/evaluate/report/csv.go b/evaluate/report/csv.go
index cd808a079..7b1e95946 100644
--- a/evaluate/report/csv.go
+++ b/evaluate/report/csv.go
@@ -101,6 +101,45 @@ func RecordsFromEvaluationCSVFiles(evaluationCSVFilePaths []string) (records [][
 	return records, nil
 }
 
+// RecordsToAssessmentsPerModel converts evaluation records into assessments per model by adding up the assessments of all records of a model. The model is read from the first field and the assessment values from the sixth field on, and an error is returned for records with fewer than five fields or with invalid assessment values.
+func RecordsToAssessmentsPerModel(records [][]string) (assessmentsPerModel AssessmentPerModel, err error) {
+	assessmentsPerModel = map[string]metrics.Assessments{}
+	for _, record := range records {
+		if len(record) < 5 { // Slicing a shorter record below would panic instead of reporting an error.
+			return nil, pkgerrors.Errorf("expected at least 5 fields in record, but found %d", len(record))
+		}
+		assessment, err := assessmentFromRecord(record[5:])
+		if err != nil {
+			return nil, err
+		}
+		if existing, ok := assessmentsPerModel[record[0]]; ok {
+			existing.Add(assessment)
+		} else {
+			assessmentsPerModel[record[0]] = assessment
+		}
+	}
+
+	return assessmentsPerModel, nil
+}
+
+// assessmentFromRecord parses the assessment fields of a record into assessments, matching every field by its position to the key at the same index of "metrics.AllAssessmentKeysStrings".
+func assessmentFromRecord(assessmentFields []string) (assessments metrics.Assessments, err error) {
+	keys := metrics.AllAssessmentKeysStrings
+	if expected, actual := len(keys), len(assessmentFields); expected != actual {
+		return nil, pkgerrors.Errorf("expected %d assessments, but found %d", expected, actual)
+	}
+	assessments = metrics.NewAssessments()
+	for i, key := range keys {
+		value, parseErr := strconv.ParseUint(assessmentFields[i], 10, 64)
+		if parseErr != nil {
+			return nil, pkgerrors.WithStack(parseErr)
+		}
+		assessments[metrics.AssessmentKey(key)] = value
+	}
+
+	return assessments, nil
+}
+
 // SortEvaluationRecords sorts the evaluation records.
 func SortEvaluationRecords(records [][]string) {
 	sort.Slice(records, func(i, j int) bool {
diff --git a/evaluate/report/csv_test.go b/evaluate/report/csv_test.go
index 89fcdfd7c..d89b69c75 100644
--- a/evaluate/report/csv_test.go
+++ b/evaluate/report/csv_test.go
@@ -1,6 +1,7 @@
 package report
 
 import (
+	"fmt"
 	"os"
 	"path/filepath"
 	"strings"
@@ -311,3 +312,165 @@ func TestSortEvaluationRecords(t *testing.T) {
 		},
 	})
 }
+
+func TestAssessmentFromRecord(t *testing.T) {
+	type testCase struct {
+		Name string
+
+		Record []string
+
+		ExpectedAssessments metrics.Assessments
+		ExpectedErrText     string
+	}
+
+	validate := func(t *testing.T, tc *testCase) {
+		t.Run(tc.Name, func(t *testing.T) {
+			assessments, err := assessmentFromRecord(tc.Record)
+
+			if tc.ExpectedErrText == "" {
+				require.NoError(t, err)
+			} else {
+				assert.ErrorContains(t, err, tc.ExpectedErrText)
+			}
+
+			assert.Equal(t, tc.ExpectedAssessments, assessments)
+		})
+	}
+
+	validate(t, &testCase{
+		Name: "Invalid assessments",
+
+		Record: []string{"1", "2", "3"},
+
+		ExpectedErrText: fmt.Sprintf("expected %d assessments, but found %d", len(metrics.AllAssessmentKeysStrings), 3),
+	})
+	validate(t, &testCase{
+		Name: "Valid assessments",
+
+		Record: []string{"1", "2", "3", "4", "5", "6", "7", "8", "9"},
+
+		ExpectedAssessments: metrics.Assessments{
+			metrics.AssessmentKeyCoverage:                           1,
+			metrics.AssessmentKeyFilesExecuted:                      2,
+			metrics.AssessmentKeyFilesExecutedMaximumReachable:      3,
+			metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 4,
+			metrics.AssessmentKeyProcessingTime:                     5,
+			metrics.AssessmentKeyResponseCharacterCount:             6,
+			metrics.AssessmentKeyResponseNoError:                    7,
+			metrics.AssessmentKeyResponseNoExcess:                   8,
+			metrics.AssessmentKeyResponseWithCode:                   9,
+		},
+	})
+}
+
+func TestRecordsToAssessmentsPerModel(t *testing.T) {
+	type testCase struct {
+		Name string
+
+		Records [][]string
+
+		ExpectedAssessmentsPerModel AssessmentPerModel
+	}
+
+	validate := func(t *testing.T, tc *testCase) {
+		t.Run(tc.Name, func(t *testing.T) {
+			assessmentsPerModel, err := RecordsToAssessmentsPerModel(tc.Records)
+			require.NoError(t, err)
+
+			assert.Equal(t, tc.ExpectedAssessmentsPerModel, assessmentsPerModel)
+		})
+	}
+
+	validate(t, &testCase{
+		Name: "Single record",
+
+		Records: [][]string{
+			{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
+		},
+
+		ExpectedAssessmentsPerModel: map[string]metrics.Assessments{
+			"modelA": {
+				metrics.AssessmentKeyCoverage:                           1,
+				metrics.AssessmentKeyFilesExecuted:                      2,
+				metrics.AssessmentKeyFilesExecutedMaximumReachable:      3,
+				metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 4,
+				metrics.AssessmentKeyProcessingTime:                     5,
+				metrics.AssessmentKeyResponseCharacterCount:             6,
+				metrics.AssessmentKeyResponseNoError:                    7,
+				metrics.AssessmentKeyResponseNoExcess:                   8,
+				metrics.AssessmentKeyResponseWithCode:                   9,
+			},
+		},
+	})
+	validate(t, &testCase{
+		Name: "Multiple records from the same model",
+
+		Records: [][]string{
+			{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
+			{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
+			{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
+		},
+
+		ExpectedAssessmentsPerModel: map[string]metrics.Assessments{
+			"modelA": {
+				metrics.AssessmentKeyCoverage:                           3,
+				metrics.AssessmentKeyFilesExecuted:                      6,
+				metrics.AssessmentKeyFilesExecutedMaximumReachable:      9,
+				metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 12,
+				metrics.AssessmentKeyProcessingTime:                     15,
+				metrics.AssessmentKeyResponseCharacterCount:             18,
+				metrics.AssessmentKeyResponseNoError:                    21,
+				metrics.AssessmentKeyResponseNoExcess:                   24,
+				metrics.AssessmentKeyResponseWithCode:                   27,
+			},
+		},
+	})
+	validate(t, &testCase{
+		Name: "Multiple records from different models",
+
+		Records: [][]string{
+			{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
+			{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
+			{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
+			{"modelB", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
+			{"modelB", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
+			{"modelC", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
+		},
+
+		ExpectedAssessmentsPerModel: map[string]metrics.Assessments{
+			"modelA": {
+				metrics.AssessmentKeyCoverage:                           3,
+				metrics.AssessmentKeyFilesExecuted:                      6,
+				metrics.AssessmentKeyFilesExecutedMaximumReachable:      9,
+				metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 12,
+				metrics.AssessmentKeyProcessingTime:                     15,
+				metrics.AssessmentKeyResponseCharacterCount:             18,
+				metrics.AssessmentKeyResponseNoError:                    21,
+				metrics.AssessmentKeyResponseNoExcess:                   24,
+				metrics.AssessmentKeyResponseWithCode:                   27,
+			},
+			"modelB": {
+				metrics.AssessmentKeyCoverage:                           2,
+				metrics.AssessmentKeyFilesExecuted:                      4,
+				metrics.AssessmentKeyFilesExecutedMaximumReachable:      6,
+				metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 8,
+				metrics.AssessmentKeyProcessingTime:                     10,
+				metrics.AssessmentKeyResponseCharacterCount:             12,
+				metrics.AssessmentKeyResponseNoError:                    14,
+				metrics.AssessmentKeyResponseNoExcess:                   16,
+				metrics.AssessmentKeyResponseWithCode:                   18,
+			},
+			"modelC": {
+				metrics.AssessmentKeyCoverage:                           1,
+				metrics.AssessmentKeyFilesExecuted:                      2,
+				metrics.AssessmentKeyFilesExecutedMaximumReachable:      3,
+				metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 4,
+				metrics.AssessmentKeyProcessingTime:                     5,
+				metrics.AssessmentKeyResponseCharacterCount:             6,
+				metrics.AssessmentKeyResponseNoError:                    7,
+				metrics.AssessmentKeyResponseNoExcess:                   8,
+				metrics.AssessmentKeyResponseWithCode:                   9,
+			},
+		},
+	})
+}
diff --git a/evaluate/report/markdown.go b/evaluate/report/markdown.go
index 52c797840..ac16b2bd0 100644
--- a/evaluate/report/markdown.go
+++ b/evaluate/report/markdown.go
@@ -83,7 +83,7 @@ var markdownTemplate = template.Must(template.New("template-report").Parse(bytes
 	{{ range $category := .Categories -}}
 	- {{ $category.Name }}: {{ $category.Description }}
 	{{ end }}
-	The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CSVPath}}).
+	The following sections list all models with their categories.{{ with $logPath := .LogPath }} The complete log of the evaluation with all outputs can be found [here]({{$logPath}}).{{ end }} Detailed scoring can be found [here]({{.CSVPath}}).
 
 	{{ range $category := .Categories -}}
 	{{ with $modelNames := index $.ModelsPerCategory $category -}}