From 8612a1b062d02c0bb05901d2434b8abaa3a5edba Mon Sep 17 00:00:00 2001 From: Rui Azevedo Date: Mon, 22 Jul 2024 11:47:53 +0100 Subject: [PATCH] Let the "report" command also generate a markdown report for the combined evaluations Part of #205 --- cmd/eval-dev-quality/cmd/evaluate.go | 4 +- cmd/eval-dev-quality/cmd/report.go | 22 ++++ cmd/eval-dev-quality/cmd/report_test.go | 48 ++++--- evaluate/report/csv.go | 39 ++++++ evaluate/report/csv_test.go | 163 ++++++++++++++++++++++++ evaluate/report/markdown.go | 2 +- 6 files changed, 256 insertions(+), 22 deletions(-) diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go index 8701097c9..52cfe2400 100644 --- a/cmd/eval-dev-quality/cmd/evaluate.go +++ b/cmd/eval-dev-quality/cmd/evaluate.go @@ -446,8 +446,8 @@ func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err command.logger.Panicf("ERROR: %s", err) } - _ = assessmentsPerModel.WalkByScore(func(model model.Model, assessment metrics.Assessments, score uint64) (err error) { - command.logger.Printf("Evaluation score for %q (%q): %s", model.ID(), assessment.Category(totalScore).ID, assessment) + _ = assessmentsPerModel.WalkByScore(func(model string, assessment metrics.Assessments, score uint64) (err error) { + command.logger.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore).ID, assessment) return nil }) diff --git a/cmd/eval-dev-quality/cmd/report.go b/cmd/eval-dev-quality/cmd/report.go index cf1cd2dcb..42297a0ec 100644 --- a/cmd/eval-dev-quality/cmd/report.go +++ b/cmd/eval-dev-quality/cmd/report.go @@ -3,11 +3,13 @@ package cmd import ( "os" "path/filepath" + "time" pkgerrors "github.com/pkg/errors" "github.com/zimmski/osutil" "golang.org/x/exp/maps" + "github.com/symflower/eval-dev-quality/evaluate" "github.com/symflower/eval-dev-quality/evaluate/report" "github.com/symflower/eval-dev-quality/log" ) @@ -21,6 +23,8 @@ type Report struct { // logger holds the logger of the 
command. logger *log.Logger + // timestamp holds the timestamp of the command execution. + timestamp time.Time } var _ SetLogger = (*Evaluate)(nil) @@ -83,6 +87,24 @@ func (command *Report) Execute(args []string) (err error) { command.logger.Panicf("ERROR: %s", err) } + // Write markdown reports. + assessmentsPerModel, err := report.RecordsToAssessmentsPerModel(records) + if err != nil { + return err + } + if err := (report.Markdown{ + DateTime: command.timestamp, + Version: evaluate.Version, + Revision: evaluate.Revision, + + CSVPath: "./evaluation.csv", + SVGPath: "./categories.svg", + + AssessmentPerModel: assessmentsPerModel, + }).WriteToFile(filepath.Join(command.ResultPath, "README.md")); err != nil { + command.logger.Panicf("ERROR: %s", err) + } + return nil } diff --git a/cmd/eval-dev-quality/cmd/report_test.go b/cmd/eval-dev-quality/cmd/report_test.go index eaa6d8be0..f701484c3 100644 --- a/cmd/eval-dev-quality/cmd/report_test.go +++ b/cmd/eval-dev-quality/cmd/report_test.go @@ -10,7 +10,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/symflower/eval-dev-quality/evaluate/metrics" "github.com/symflower/eval-dev-quality/evaluate/report" "github.com/symflower/eval-dev-quality/log" "github.com/zimmski/osutil" @@ -38,6 +37,15 @@ var gpt4EvaluationCSVFileContent = bytesutil.StringTrimIndentations(` openrouter/openai/gpt-4,java,java/plain,write-tests,12,12,12,12,12,12,12,12,12,12 `) +// validateMarkdownLinks checks if the Markdown report data contains all the links to other relevant report files. 
+func validateMarkdownLinks(t *testing.T, data string, modelLogNames []string) { + assert.Contains(t, data, "](./categories.svg)") + assert.Contains(t, data, "](./evaluation.csv)") + for _, m := range modelLogNames { + assert.Contains(t, data, fmt.Sprintf("](./%s/)", m)) + } +} + func TestReportExecute(t *testing.T) { type testCase struct { Name string @@ -148,6 +156,10 @@ func TestReportExecute(t *testing.T) { ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ "evaluation.csv": nil, + filepath.Join("result-directory", "categories.svg"): nil, + filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath string, data string) { + validateMarkdownLinks(t, data, []string{"openrouter_anthropic_claude-2.0"}) + }, filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) { expectedContent := fmt.Sprintf("%s\n%s", strings.Join(report.EvaluationHeader(), ","), claudeEvaluationCSVFileContent) assert.Equal(t, expectedContent, data) @@ -176,6 +188,14 @@ func TestReportExecute(t *testing.T) { filepath.Join("docs", "v5", "claude", "evaluation.csv"): nil, filepath.Join("docs", "v5", "gemma", "evaluation.csv"): nil, filepath.Join("docs", "v5", "openrouter", "gpt4", "evaluation.csv"): nil, + filepath.Join("result-directory", "categories.svg"): nil, + filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath string, data string) { + validateMarkdownLinks(t, data, []string{ + "openrouter_anthropic_claude-2.0", + "openrouter_google_gemma-7b-it", + "openrouter_openai_gpt-4", + }) + }, filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) { expectedContent := fmt.Sprintf("%s\n%s%s%s", strings.Join(report.EvaluationHeader(), ","), claudeEvaluationCSVFileContent, gemmaEvaluationCSVFileContent, gpt4EvaluationCSVFileContent) assert.Equal(t, expectedContent, data) @@ -201,6 +221,14 @@ func TestReportExecute(t *testing.T) { filepath.Join("docs", "v5", 
"claude", "evaluation.csv"): nil, filepath.Join("docs", "v5", "gemma", "evaluation.csv"): nil, filepath.Join("docs", "v5", "gpt4", "evaluation.csv"): nil, + filepath.Join("result-directory", "categories.svg"): nil, + filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath string, data string) { + validateMarkdownLinks(t, data, []string{ + "openrouter_anthropic_claude-2.0", + "openrouter_google_gemma-7b-it", + "openrouter_openai_gpt-4", + }) + }, filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) { expectedContent := fmt.Sprintf("%s\n%s%s%s", strings.Join(report.EvaluationHeader(), ","), claudeEvaluationCSVFileContent, gemmaEvaluationCSVFileContent, gpt4EvaluationCSVFileContent) assert.Equal(t, expectedContent, data) @@ -293,21 +321,3 @@ func evaluationFileWithContent(t *testing.T, workingDirectory string, content st require.NoError(t, os.MkdirAll(workingDirectory, 0700)) require.NoError(t, os.WriteFile(filepath.Join(workingDirectory, "evaluation.csv"), []byte(content), 0700)) } - -// validateReportLinks checks if the Markdown report data contains all the links to other relevant report files. -func validateReportLinks(t *testing.T, data string, modelLogNames []string) { - assert.Contains(t, data, "](./categories.svg)") - assert.Contains(t, data, "](./evaluation.csv)") - assert.Contains(t, data, "](./evaluation.log)") - for _, m := range modelLogNames { - assert.Contains(t, data, fmt.Sprintf("](./%s/)", m)) - } -} - -// validateSVGContent checks if the SVG data contains all given categories and an axis label for the maximal model count. 
-func validateSVGContent(t *testing.T, data string, categories []*metrics.AssessmentCategory, maxModelCount uint) { - for _, category := range categories { - assert.Contains(t, data, fmt.Sprintf("%s", category.Name)) - } - assert.Contains(t, data, fmt.Sprintf("%d", maxModelCount)) -} diff --git a/evaluate/report/csv.go b/evaluate/report/csv.go index cd808a079..7b1e95946 100644 --- a/evaluate/report/csv.go +++ b/evaluate/report/csv.go @@ -101,6 +101,45 @@ func RecordsFromEvaluationCSVFiles(evaluationCSVFilePaths []string) (records [][ return records, nil } +// RecordsToAssessmentsPerModel converts evaluation records into assessments per model. +func RecordsToAssessmentsPerModel(records [][]string) (assessmentsPerModel AssessmentPerModel, err error) { + assessmentsPerModel = map[string]metrics.Assessments{} + + for _, record := range records { + model := record[0] + assessment, err := assessmentFromRecord(record[5:]) + if err != nil { + return nil, err + } + + if _, ok := assessmentsPerModel[model]; !ok { + assessmentsPerModel[model] = assessment + } else { + assessmentsPerModel[model].Add(assessment) + } + } + + return assessmentsPerModel, nil +} + +// assessmentFromRecord returns the assessments of a record. +func assessmentFromRecord(assessmentFields []string) (assessments metrics.Assessments, err error) { + if len(assessmentFields) != len(metrics.AllAssessmentKeysStrings) { + return nil, pkgerrors.Errorf("expected %d assessments, but found %d", len(metrics.AllAssessmentKeysStrings), len(assessmentFields)) + } + + assessments = metrics.NewAssessments() + for i, field := range assessmentFields { + assessmentKeyValue, err := strconv.ParseUint(field, 10, 64) + if err != nil { + return nil, pkgerrors.WithStack(err) + } + assessments[metrics.AssessmentKey(metrics.AllAssessmentKeysStrings[i])] = assessmentKeyValue + } + + return assessments, nil +} + // SortEvaluationRecords sorts the evaluation records. 
func SortEvaluationRecords(records [][]string) { sort.Slice(records, func(i, j int) bool { diff --git a/evaluate/report/csv_test.go b/evaluate/report/csv_test.go index 89fcdfd7c..d89b69c75 100644 --- a/evaluate/report/csv_test.go +++ b/evaluate/report/csv_test.go @@ -1,6 +1,7 @@ package report import ( + "fmt" "os" "path/filepath" "strings" @@ -311,3 +312,165 @@ func TestSortEvaluationRecords(t *testing.T) { }, }) } + +func TestAssessmentFromRecord(t *testing.T) { + type testCase struct { + Name string + + Record []string + + ExpectedAssessments metrics.Assessments + ExpectedErrText string + } + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + actualAssessments, actualErr := assessmentFromRecord(tc.Record) + + if len(tc.ExpectedErrText) > 0 { + assert.ErrorContains(t, actualErr, tc.ExpectedErrText) + } else { + require.NoError(t, actualErr) + } + + assert.Equal(t, tc.ExpectedAssessments, actualAssessments) + }) + } + + validate(t, &testCase{ + Name: "Invalid assessments", + + Record: []string{"1", "2", "3"}, + + ExpectedErrText: fmt.Sprintf("expected %d assessments, but found %d", len(metrics.AllAssessmentKeysStrings), 3), + }) + validate(t, &testCase{ + Name: "Valid assessments", + + Record: []string{"1", "2", "3", "4", "5", "6", "7", "8", "9"}, + + ExpectedAssessments: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyFilesExecutedMaximumReachable: 3, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 4, + metrics.AssessmentKeyProcessingTime: 5, + metrics.AssessmentKeyResponseCharacterCount: 6, + metrics.AssessmentKeyResponseNoError: 7, + metrics.AssessmentKeyResponseNoExcess: 8, + metrics.AssessmentKeyResponseWithCode: 9, + }, + }) +} + +func TestRecordsToAssessmentsPerModel(t *testing.T) { + type testCase struct { + Name string + + Records [][]string + + ExpectedAssessmentsPerModel AssessmentPerModel + } + + validate := func(t *testing.T, tc 
*testCase) { + t.Run(tc.Name, func(t *testing.T) { + actualAssessmentsPerModel, actualErr := RecordsToAssessmentsPerModel(tc.Records) + require.NoError(t, actualErr) + + assert.Equal(t, tc.ExpectedAssessmentsPerModel, actualAssessmentsPerModel) + }) + } + + validate(t, &testCase{ + Name: "Single record", + + Records: [][]string{ + []string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}, + }, + + ExpectedAssessmentsPerModel: map[string]metrics.Assessments{ + "modelA": metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyFilesExecutedMaximumReachable: 3, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 4, + metrics.AssessmentKeyProcessingTime: 5, + metrics.AssessmentKeyResponseCharacterCount: 6, + metrics.AssessmentKeyResponseNoError: 7, + metrics.AssessmentKeyResponseNoExcess: 8, + metrics.AssessmentKeyResponseWithCode: 9, + }, + }, + }) + validate(t, &testCase{ + Name: "Multiple records from the same model", + + Records: [][]string{ + []string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}, + []string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}, + []string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}, + }, + + ExpectedAssessmentsPerModel: map[string]metrics.Assessments{ + "modelA": metrics.Assessments{ + metrics.AssessmentKeyCoverage: 3, + metrics.AssessmentKeyFilesExecuted: 6, + metrics.AssessmentKeyFilesExecutedMaximumReachable: 9, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 12, + metrics.AssessmentKeyProcessingTime: 15, + metrics.AssessmentKeyResponseCharacterCount: 18, + metrics.AssessmentKeyResponseNoError: 21, + metrics.AssessmentKeyResponseNoExcess: 24, + metrics.AssessmentKeyResponseWithCode: 27, + }, + }, + }) + validate(t, &testCase{ + Name: "Multiple records 
from different models", + + Records: [][]string{ + []string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}, + []string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}, + []string{"modelA", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}, + []string{"modelB", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}, + []string{"modelB", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}, + []string{"modelC", "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}, + }, + + ExpectedAssessmentsPerModel: map[string]metrics.Assessments{ + "modelA": metrics.Assessments{ + metrics.AssessmentKeyCoverage: 3, + metrics.AssessmentKeyFilesExecuted: 6, + metrics.AssessmentKeyFilesExecutedMaximumReachable: 9, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 12, + metrics.AssessmentKeyProcessingTime: 15, + metrics.AssessmentKeyResponseCharacterCount: 18, + metrics.AssessmentKeyResponseNoError: 21, + metrics.AssessmentKeyResponseNoExcess: 24, + metrics.AssessmentKeyResponseWithCode: 27, + }, + "modelB": metrics.Assessments{ + metrics.AssessmentKeyCoverage: 2, + metrics.AssessmentKeyFilesExecuted: 4, + metrics.AssessmentKeyFilesExecutedMaximumReachable: 6, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 8, + metrics.AssessmentKeyProcessingTime: 10, + metrics.AssessmentKeyResponseCharacterCount: 12, + metrics.AssessmentKeyResponseNoError: 14, + metrics.AssessmentKeyResponseNoExcess: 16, + metrics.AssessmentKeyResponseWithCode: 18, + }, + "modelC": metrics.Assessments{ + metrics.AssessmentKeyCoverage: 1, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyFilesExecutedMaximumReachable: 3, + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 4, + metrics.AssessmentKeyProcessingTime: 5, + 
metrics.AssessmentKeyResponseCharacterCount: 6, + metrics.AssessmentKeyResponseNoError: 7, + metrics.AssessmentKeyResponseNoExcess: 8, + metrics.AssessmentKeyResponseWithCode: 9, + }, + }, + }) +} diff --git a/evaluate/report/markdown.go b/evaluate/report/markdown.go index 52c797840..ac16b2bd0 100644 --- a/evaluate/report/markdown.go +++ b/evaluate/report/markdown.go @@ -83,7 +83,7 @@ var markdownTemplate = template.Must(template.New("template-report").Parse(bytes {{ range $category := .Categories -}} - {{ $category.Name }}: {{ $category.Description }} {{ end }} - The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CSVPath}}). + The following sections list all models with their categories.{{ with $logPath := .LogPath }} The complete log of the evaluation with all outputs can be found [here]({{$logPath}}).{{ end }} Detailed scoring can be found [here]({{.CSVPath}}). {{ range $category := .Categories -}} {{ with $modelNames := index $.ModelsPerCategory $category -}}