Skip to content

Commit

Permalink
Let the "report" command also generate a markdown report for the comb…
Browse files Browse the repository at this point in the history
…ined evaluations

Part of #205
  • Loading branch information
ruiAzevedo19 committed Jul 22, 2024
1 parent 9496913 commit 8612a1b
Show file tree
Hide file tree
Showing 6 changed files with 256 additions and 22 deletions.
4 changes: 2 additions & 2 deletions cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -446,8 +446,8 @@ func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err
command.logger.Panicf("ERROR: %s", err)
}

_ = assessmentsPerModel.WalkByScore(func(model model.Model, assessment metrics.Assessments, score uint64) (err error) {
command.logger.Printf("Evaluation score for %q (%q): %s", model.ID(), assessment.Category(totalScore).ID, assessment)
_ = assessmentsPerModel.WalkByScore(func(model string, assessment metrics.Assessments, score uint64) (err error) {
command.logger.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore).ID, assessment)

return nil
})
Expand Down
22 changes: 22 additions & 0 deletions cmd/eval-dev-quality/cmd/report.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@ package cmd
import (
"os"
"path/filepath"
"time"

pkgerrors "github.com/pkg/errors"
"github.com/zimmski/osutil"
"golang.org/x/exp/maps"

"github.com/symflower/eval-dev-quality/evaluate"
"github.com/symflower/eval-dev-quality/evaluate/report"
"github.com/symflower/eval-dev-quality/log"
)
Expand All @@ -21,6 +23,8 @@ type Report struct {

// logger holds the logger of the command.
logger *log.Logger
// timestamp holds the timestamp of the command execution.
timestamp time.Time
}

var _ SetLogger = (*Evaluate)(nil)
Expand Down Expand Up @@ -83,6 +87,24 @@ func (command *Report) Execute(args []string) (err error) {
command.logger.Panicf("ERROR: %s", err)
}

// Write markdown reports.
assessmentsPerModel, err := report.RecordsToAssessmentsPerModel(records)
if err != nil {
return err
}
if err := (report.Markdown{
DateTime: command.timestamp,
Version: evaluate.Version,
Revision: evaluate.Revision,

CSVPath: "./evaluation.csv",
SVGPath: "./categories.svg",

AssessmentPerModel: assessmentsPerModel,
}).WriteToFile(filepath.Join(command.ResultPath, "README.md")); err != nil {
command.logger.Panicf("ERROR: %s", err)
}

return nil
}

Expand Down
48 changes: 29 additions & 19 deletions cmd/eval-dev-quality/cmd/report_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/symflower/eval-dev-quality/evaluate/metrics"
"github.com/symflower/eval-dev-quality/evaluate/report"
"github.com/symflower/eval-dev-quality/log"
"github.com/zimmski/osutil"
Expand Down Expand Up @@ -38,6 +37,15 @@ var gpt4EvaluationCSVFileContent = bytesutil.StringTrimIndentations(`
openrouter/openai/gpt-4,java,java/plain,write-tests,12,12,12,12,12,12,12,12,12,12
`)

// validateMarkdownLinks checks if the Markdown report data contains all the links to other relevant report files.
// It asserts links to the categories SVG and the evaluation CSV, plus one directory link per entry of "modelLogNames".
func validateMarkdownLinks(t *testing.T, data string, modelLogNames []string) {
	// Mark as helper so failures are attributed to the calling test case.
	t.Helper()

	assert.Contains(t, data, "](./categories.svg)")
	assert.Contains(t, data, "](./evaluation.csv)")
	for _, modelLogName := range modelLogNames {
		assert.Contains(t, data, fmt.Sprintf("](./%s/)", modelLogName))
	}
}

func TestReportExecute(t *testing.T) {
type testCase struct {
Name string
Expand Down Expand Up @@ -148,6 +156,10 @@ func TestReportExecute(t *testing.T) {

ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"evaluation.csv": nil,
filepath.Join("result-directory", "categories.svg"): nil,
filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath string, data string) {
validateMarkdownLinks(t, data, []string{"openrouter_anthropic_claude-2.0"})
},
filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) {
expectedContent := fmt.Sprintf("%s\n%s", strings.Join(report.EvaluationHeader(), ","), claudeEvaluationCSVFileContent)
assert.Equal(t, expectedContent, data)
Expand Down Expand Up @@ -176,6 +188,14 @@ func TestReportExecute(t *testing.T) {
filepath.Join("docs", "v5", "claude", "evaluation.csv"): nil,
filepath.Join("docs", "v5", "gemma", "evaluation.csv"): nil,
filepath.Join("docs", "v5", "openrouter", "gpt4", "evaluation.csv"): nil,
filepath.Join("result-directory", "categories.svg"): nil,
filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath string, data string) {
validateMarkdownLinks(t, data, []string{
"openrouter_anthropic_claude-2.0",
"openrouter_google_gemma-7b-it",
"openrouter_openai_gpt-4",
})
},
filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) {
expectedContent := fmt.Sprintf("%s\n%s%s%s", strings.Join(report.EvaluationHeader(), ","), claudeEvaluationCSVFileContent, gemmaEvaluationCSVFileContent, gpt4EvaluationCSVFileContent)
assert.Equal(t, expectedContent, data)
Expand All @@ -201,6 +221,14 @@ func TestReportExecute(t *testing.T) {
filepath.Join("docs", "v5", "claude", "evaluation.csv"): nil,
filepath.Join("docs", "v5", "gemma", "evaluation.csv"): nil,
filepath.Join("docs", "v5", "gpt4", "evaluation.csv"): nil,
filepath.Join("result-directory", "categories.svg"): nil,
filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath string, data string) {
validateMarkdownLinks(t, data, []string{
"openrouter_anthropic_claude-2.0",
"openrouter_google_gemma-7b-it",
"openrouter_openai_gpt-4",
})
},
filepath.Join("result-directory", "evaluation.csv"): func(t *testing.T, filePath, data string) {
expectedContent := fmt.Sprintf("%s\n%s%s%s", strings.Join(report.EvaluationHeader(), ","), claudeEvaluationCSVFileContent, gemmaEvaluationCSVFileContent, gpt4EvaluationCSVFileContent)
assert.Equal(t, expectedContent, data)
Expand Down Expand Up @@ -293,21 +321,3 @@ func evaluationFileWithContent(t *testing.T, workingDirectory string, content st
require.NoError(t, os.MkdirAll(workingDirectory, 0700))
require.NoError(t, os.WriteFile(filepath.Join(workingDirectory, "evaluation.csv"), []byte(content), 0700))
}

// validateReportLinks checks if the Markdown report data contains all the links to other relevant report files.
// It asserts links to the categories SVG, the evaluation CSV and the evaluation log, plus one directory link per entry of "modelLogNames".
func validateReportLinks(t *testing.T, data string, modelLogNames []string) {
	// Mark as helper so failures are attributed to the calling test case.
	t.Helper()

	assert.Contains(t, data, "](./categories.svg)")
	assert.Contains(t, data, "](./evaluation.csv)")
	assert.Contains(t, data, "](./evaluation.log)")
	for _, modelLogName := range modelLogNames {
		assert.Contains(t, data, fmt.Sprintf("](./%s/)", modelLogName))
	}
}

// validateSVGContent checks if the SVG data contains all given categories and an axis label for the maximal model count.
// Both are matched as the text directly preceding a closing "</text>" tag.
func validateSVGContent(t *testing.T, data string, categories []*metrics.AssessmentCategory, maxModelCount uint) {
	// Mark as helper so failures are attributed to the calling test case.
	t.Helper()

	for _, category := range categories {
		assert.Contains(t, data, fmt.Sprintf("%s</text>", category.Name))
	}
	assert.Contains(t, data, fmt.Sprintf("%d</text>", maxModelCount))
}
39 changes: 39 additions & 0 deletions evaluate/report/csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,45 @@ func RecordsFromEvaluationCSVFiles(evaluationCSVFilePaths []string) (records [][
return records, nil
}

// RecordsToAssessmentsPerModel converts evaluation records into assessments per model.
// The first field of every record is taken as the model name and the fields starting at index 5 are parsed as assessment values.
// Assessments of records that share a model name are added up.
// An error is returned if a record is too short to hold the leading fields or if its assessment fields cannot be converted.
func RecordsToAssessmentsPerModel(records [][]string) (assessmentsPerModel AssessmentPerModel, err error) {
	// assessmentFieldsOffset is the index of the first assessment field within a record.
	const assessmentFieldsOffset = 5

	assessmentsPerModel = map[string]metrics.Assessments{}

	for i, record := range records {
		// Guard the indexing below: a truncated or empty record must yield an error instead of a panic.
		if len(record) < assessmentFieldsOffset {
			return nil, pkgerrors.Errorf("record %d has %d fields, but at least %d are required", i, len(record), assessmentFieldsOffset)
		}

		model := record[0]
		assessment, err := assessmentFromRecord(record[assessmentFieldsOffset:])
		if err != nil {
			return nil, err
		}

		if existing, ok := assessmentsPerModel[model]; ok {
			existing.Add(assessment)
		} else {
			assessmentsPerModel[model] = assessment
		}
	}

	return assessmentsPerModel, nil
}

// assessmentFromRecord returns the assessments of a record.
// The given fields must hold exactly one unsigned integer value per entry of "metrics.AllAssessmentKeysStrings", in the same order.
// An error is returned if the number of fields does not match or if a field is not a valid unsigned integer.
func assessmentFromRecord(assessmentFields []string) (assessments metrics.Assessments, err error) {
	if len(assessmentFields) != len(metrics.AllAssessmentKeysStrings) {
		return nil, pkgerrors.Errorf("expected %d assessments, but found %d", len(metrics.AllAssessmentKeysStrings), len(assessmentFields))
	}

	assessments = metrics.NewAssessments()
	for i, field := range assessmentFields {
		assessmentKey := metrics.AllAssessmentKeysStrings[i]

		assessmentKeyValue, err := strconv.ParseUint(field, 10, 64)
		if err != nil {
			// Name the offending assessment so a broken CSV column can be located.
			return nil, pkgerrors.Wrapf(err, "parsing assessment %q", assessmentKey)
		}
		assessments[metrics.AssessmentKey(assessmentKey)] = assessmentKeyValue
	}

	return assessments, nil
}

// SortEvaluationRecords sorts the evaluation records.
func SortEvaluationRecords(records [][]string) {
sort.Slice(records, func(i, j int) bool {
Expand Down
163 changes: 163 additions & 0 deletions evaluate/report/csv_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package report

import (
"fmt"
"os"
"path/filepath"
"strings"
Expand Down Expand Up @@ -311,3 +312,165 @@ func TestSortEvaluationRecords(t *testing.T) {
},
})
}

// TestAssessmentFromRecord checks the conversion of assessment fields into assessments, including the rejection of a wrong field count.
func TestAssessmentFromRecord(t *testing.T) {
	type testCase struct {
		Name string

		Record []string

		ExpectedAssessments metrics.Assessments
		ExpectedErrText     string
	}

	// validate runs a single test case as a sub-test.
	validate := func(t *testing.T, tc *testCase) {
		t.Run(tc.Name, func(t *testing.T) {
			assessments, err := assessmentFromRecord(tc.Record)

			if tc.ExpectedErrText == "" {
				require.NoError(t, err)
			} else {
				assert.ErrorContains(t, err, tc.ExpectedErrText)
			}

			assert.Equal(t, tc.ExpectedAssessments, assessments)
		})
	}

	testCases := []*testCase{
		{
			Name: "Invalid assessments",

			Record: []string{"1", "2", "3"},

			ExpectedErrText: fmt.Sprintf("expected %d assessments, but found %d", len(metrics.AllAssessmentKeysStrings), 3),
		},
		{
			Name: "Valid assessments",

			Record: []string{"1", "2", "3", "4", "5", "6", "7", "8", "9"},

			ExpectedAssessments: metrics.Assessments{
				metrics.AssessmentKeyCoverage:                           1,
				metrics.AssessmentKeyFilesExecuted:                      2,
				metrics.AssessmentKeyFilesExecutedMaximumReachable:      3,
				metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 4,
				metrics.AssessmentKeyProcessingTime:                     5,
				metrics.AssessmentKeyResponseCharacterCount:             6,
				metrics.AssessmentKeyResponseNoError:                    7,
				metrics.AssessmentKeyResponseNoExcess:                   8,
				metrics.AssessmentKeyResponseWithCode:                   9,
			},
		},
	}
	for _, tc := range testCases {
		validate(t, tc)
	}
}

// TestRecordsToAssessmentsPerModel checks that evaluation records are grouped by model and that assessments of the same model are summed up.
func TestRecordsToAssessmentsPerModel(t *testing.T) {
	type testCase struct {
		Name string

		Records [][]string

		ExpectedAssessmentsPerModel AssessmentPerModel
	}

	// validate runs a single test case as a sub-test.
	validate := func(t *testing.T, tc *testCase) {
		t.Run(tc.Name, func(t *testing.T) {
			assessmentsPerModel, err := RecordsToAssessmentsPerModel(tc.Records)
			require.NoError(t, err)

			assert.Equal(t, tc.ExpectedAssessmentsPerModel, assessmentsPerModel)
		})
	}

	// recordOf returns the evaluation record shared by all test cases for the given model name.
	recordOf := func(model string) []string {
		return []string{model, "languageB", "repositoryA", "taskA", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}
	}

	testCases := []*testCase{
		{
			Name: "Single record",

			Records: [][]string{
				recordOf("modelA"),
			},

			ExpectedAssessmentsPerModel: map[string]metrics.Assessments{
				"modelA": {
					metrics.AssessmentKeyCoverage:                           1,
					metrics.AssessmentKeyFilesExecuted:                      2,
					metrics.AssessmentKeyFilesExecutedMaximumReachable:      3,
					metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 4,
					metrics.AssessmentKeyProcessingTime:                     5,
					metrics.AssessmentKeyResponseCharacterCount:             6,
					metrics.AssessmentKeyResponseNoError:                    7,
					metrics.AssessmentKeyResponseNoExcess:                   8,
					metrics.AssessmentKeyResponseWithCode:                   9,
				},
			},
		},
		{
			Name: "Multiple records from the same model",

			Records: [][]string{
				recordOf("modelA"),
				recordOf("modelA"),
				recordOf("modelA"),
			},

			ExpectedAssessmentsPerModel: map[string]metrics.Assessments{
				"modelA": {
					metrics.AssessmentKeyCoverage:                           3,
					metrics.AssessmentKeyFilesExecuted:                      6,
					metrics.AssessmentKeyFilesExecutedMaximumReachable:      9,
					metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 12,
					metrics.AssessmentKeyProcessingTime:                     15,
					metrics.AssessmentKeyResponseCharacterCount:             18,
					metrics.AssessmentKeyResponseNoError:                    21,
					metrics.AssessmentKeyResponseNoExcess:                   24,
					metrics.AssessmentKeyResponseWithCode:                   27,
				},
			},
		},
		{
			Name: "Multiple records from different models",

			Records: [][]string{
				recordOf("modelA"),
				recordOf("modelA"),
				recordOf("modelA"),
				recordOf("modelB"),
				recordOf("modelB"),
				recordOf("modelC"),
			},

			ExpectedAssessmentsPerModel: map[string]metrics.Assessments{
				"modelA": {
					metrics.AssessmentKeyCoverage:                           3,
					metrics.AssessmentKeyFilesExecuted:                      6,
					metrics.AssessmentKeyFilesExecutedMaximumReachable:      9,
					metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 12,
					metrics.AssessmentKeyProcessingTime:                     15,
					metrics.AssessmentKeyResponseCharacterCount:             18,
					metrics.AssessmentKeyResponseNoError:                    21,
					metrics.AssessmentKeyResponseNoExcess:                   24,
					metrics.AssessmentKeyResponseWithCode:                   27,
				},
				"modelB": {
					metrics.AssessmentKeyCoverage:                           2,
					metrics.AssessmentKeyFilesExecuted:                      4,
					metrics.AssessmentKeyFilesExecutedMaximumReachable:      6,
					metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 8,
					metrics.AssessmentKeyProcessingTime:                     10,
					metrics.AssessmentKeyResponseCharacterCount:             12,
					metrics.AssessmentKeyResponseNoError:                    14,
					metrics.AssessmentKeyResponseNoExcess:                   16,
					metrics.AssessmentKeyResponseWithCode:                   18,
				},
				"modelC": {
					metrics.AssessmentKeyCoverage:                           1,
					metrics.AssessmentKeyFilesExecuted:                      2,
					metrics.AssessmentKeyFilesExecutedMaximumReachable:      3,
					metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 4,
					metrics.AssessmentKeyProcessingTime:                     5,
					metrics.AssessmentKeyResponseCharacterCount:             6,
					metrics.AssessmentKeyResponseNoError:                    7,
					metrics.AssessmentKeyResponseNoExcess:                   8,
					metrics.AssessmentKeyResponseWithCode:                   9,
				},
			},
		},
	}
	for _, tc := range testCases {
		validate(t, tc)
	}
}
2 changes: 1 addition & 1 deletion evaluate/report/markdown.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ var markdownTemplate = template.Must(template.New("template-report").Parse(bytes
{{ range $category := .Categories -}}
- {{ $category.Name }}: {{ $category.Description }}
{{ end }}
The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CSVPath}}).
The following sections list all models with their categories.{{ with $logPath := .LogPath }} The complete log of the evaluation with all outputs can be found [here]({{$logPath}}).{{ end }} Detailed scoring can be found [here]({{.CSVPath}}).
{{ range $category := .Categories -}}
{{ with $modelNames := index $.ModelsPerCategory $category -}}
Expand Down

0 comments on commit 8612a1b

Please sign in to comment.