diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go index d03a529eb..40b4eed39 100644 --- a/cmd/eval-dev-quality/cmd/evaluate.go +++ b/cmd/eval-dev-quality/cmd/evaluate.go @@ -21,6 +21,7 @@ import ( _ "github.com/symflower/eval-dev-quality/provider/openrouter" // Register provider. _ "github.com/symflower/eval-dev-quality/provider/symflower" // Register provider. "github.com/symflower/eval-dev-quality/tools" + "github.com/symflower/eval-dev-quality/version" ) // Evaluate holds the "evaluation" command. @@ -58,10 +59,12 @@ const repositoryPlainName = "plain" // Execute executes the command. func (command *Evaluate) Execute(args []string) (err error) { - command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", time.Now().Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group. + evaluationTimestamp := time.Now() + command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", evaluationTimestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group. 
command.logger.Printf("Writing results to %s", command.ResultPath) - log, logClose, err := log.WithFile(command.logger, filepath.Join(command.ResultPath, "evaluation.log")) + logFilePath := filepath.Join(command.ResultPath, "evaluation.log") + log, logClose, err := log.WithFile(command.logger, logFilePath) if err != nil { return err } @@ -231,7 +234,8 @@ func (command *Evaluate) Execute(args []string) (err error) { if err != nil { log.Fatalf("ERROR: could not create result summary: %s", err) } - if err := os.WriteFile(filepath.Join(command.ResultPath, "evaluation.csv"), []byte(csv), 0644); err != nil { + csvReportPath := filepath.Join(command.ResultPath, "evaluation.csv") + if err := os.WriteFile(csvReportPath, []byte(csv), 0644); err != nil { log.Fatalf("ERROR: could not write result summary: %s", err) } @@ -249,7 +253,21 @@ func (command *Evaluate) Execute(args []string) (err error) { totalScore = uint(len(languagesSelected)) } - _ = metrics.WalkByScore(assessments.Collapse(), func(model string, assessment metrics.Assessments, score uint) error { + assessmentsPerModel := assessments.Collapse() + if err := (report.Markdown{ + DateTime: evaluationTimestamp, + Version: version.Current, + + CSVPath: csvReportPath, + LogPath: logFilePath, + + AssessmentPerModel: assessmentsPerModel, + TotalScore: totalScore, + }).WriteToFile(filepath.Join(command.ResultPath, "report.md")); err != nil { + return err + } + + _ = metrics.WalkByScore(assessmentsPerModel, func(model string, assessment metrics.Assessments, score uint) error { log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore).Name, assessment) return nil diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go index b0f435c20..95e10887c 100644 --- a/cmd/eval-dev-quality/cmd/evaluate_test.go +++ b/cmd/eval-dev-quality/cmd/evaluate_test.go @@ -22,7 +22,7 @@ func TestEvaluateExecute(t *testing.T) { Arguments []string ExpectedOutputValidate func(t 
*testing.T, output string, resultPath string) - ExpectedResultFiles map[string]func(t *testing.T, filePath string, data string) + ExpectedResultFiles map[string]func(t *testing.T, resultPath string, filePath string, data string) } validate := func(t *testing.T, tc *testCase) { @@ -54,7 +54,7 @@ func TestEvaluateExecute(t *testing.T) { if validate != nil { data, err := os.ReadFile(filepath.Join(temporaryPath, filePath)) if assert.NoError(t, err) { - validate(t, filePath, string(data)) + validate(t, temporaryPath, filePath, string(data)) } } } @@ -78,15 +78,19 @@ func TestEvaluateExecute(t *testing.T) { t.Logf("Output: %s", output) } }, - ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ - "evaluation.csv": func(t *testing.T, filePath, data string) { + ExpectedResultFiles: map[string]func(t *testing.T, resultPath string, filePath string, data string){ + "evaluation.csv": func(t *testing.T, resultPath string, filePath, data string) { assert.Equal(t, bytesutil.StringTrimIndentations(` model,language,repository,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code symflower/symbolic-execution,golang,golang/plain,6,1,1,1,1,1,1 `), data) }, "evaluation.log": nil, - "report.md": nil, + "report.md": func(t *testing.T, resultPath string, filePath, data string) { + // Ensure the report links to the CSV file and logs. 
+ assert.Contains(t, data, filepath.Join(resultPath, "evaluation.csv")) + assert.Contains(t, data, filepath.Join(resultPath, "evaluation.log")) + }, "symflower_symbolic-execution/golang/golang/plain.log": nil, }, }) diff --git a/evaluate/report/markdown.go b/evaluate/report/markdown.go new file mode 100644 index 000000000..775112857 --- /dev/null +++ b/evaluate/report/markdown.go @@ -0,0 +1,108 @@ +package report + +import ( + "io" + "os" + "path/filepath" + "text/template" + "time" + + pkgerrors "github.com/pkg/errors" + "github.com/zimmski/osutil/bytesutil" + + "github.com/symflower/eval-dev-quality/evaluate/metrics" +) + +// Markdown holds the values for exporting a Markdown report. +type Markdown struct { + // DateTime holds the timestamp of the evaluation. + DateTime time.Time + // Version holds the version of the evaluation tool. + Version string + + // CSVPath holds the path of detailed CSV results. + CSVPath string + // LogPath holds the path of detailed logs. + LogPath string + + // AssessmentPerModel holds + AssessmentPerModel map[string]metrics.Assessments + // TotalScore holds the total reachable score per task. + // REMARK Used for category computation. + TotalScore uint +} + +// markdownTemplateContext holds the template for a Markdown report. +type markdownTemplateContext struct { + Markdown + + Categories []*metrics.AssessmentCategory + ModelsPerCategory map[*metrics.AssessmentCategory][]string +} + +// markdownTemplate holds the template for a Markdown report. +var markdownTemplate = template.Must(template.New("template-report").Parse(bytesutil.StringTrimIndentations(` + # Evaluation from {{.DateTime.Format "2006-01-02 15:04:05"}} + + This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version {{.Version}}` + "`" + `. + + ## Results + + > Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot. 
+ + The results of all models have been divided into the following categories: + {{ range $category := .Categories -}} + - {{ $category.Name }}: {{ $category.Description }} + {{ end }} + The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CSVPath}}). + + {{ range $category := .Categories -}} + {{ with $modelNames := index $.ModelsPerCategory $category -}} + ### "{{ $category.Name }}" + + {{ $category.Description }} + + {{ range $modelName := $modelNames -}} + - ` + "`" + `{{ $modelName }}` + "`" + ` + {{ end }} + {{ end }} + {{- end -}} +`))) + +// Format formats the markdown values in the template to the given writer. +func (m Markdown) Format(writer io.Writer) error { + templateContext := markdownTemplateContext{ + Markdown: m, + Categories: metrics.AllAssessmentCategories, + } + templateContext.ModelsPerCategory = make(map[*metrics.AssessmentCategory][]string, len(metrics.AllAssessmentCategories)) + for model, assessment := range m.AssessmentPerModel { + category := assessment.Category(m.TotalScore) + templateContext.ModelsPerCategory[category] = append(templateContext.ModelsPerCategory[category], model) + } + // TODO Generate svg using maybe https://github.com/wcharczuk/go-chart. + + return pkgerrors.WithStack(markdownTemplate.Execute(writer, templateContext)) +} + +// WriteToFile writes the Markdown values in the template to the given file. 
+func (t Markdown) WriteToFile(path string) (err error) { + t.CSVPath, err = filepath.Abs(t.CSVPath) + if err != nil { + return err + } + t.LogPath, err = filepath.Abs(t.LogPath) + if err != nil { + return err + } + + if err = os.MkdirAll(filepath.Dir(path), 0755); err != nil { + return pkgerrors.WithStack(err) + } + file, err := os.Create(path) + if err != nil { + return pkgerrors.WithStack(err) + } + defer file.Close() + return pkgerrors.WithStack(t.Format(file)) +} diff --git a/evaluate/report/markdown_test.go b/evaluate/report/markdown_test.go new file mode 100644 index 000000000..27093aaf6 --- /dev/null +++ b/evaluate/report/markdown_test.go @@ -0,0 +1,129 @@ +package report + +import ( + "bytes" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/symflower/eval-dev-quality/evaluate/metrics" + "github.com/zimmski/osutil/bytesutil" +) + +func TestMarkdownFormat(t *testing.T) { + type testCase struct { + Name string + + Markdown Markdown + + ExpectedReport string + ExpectedError error + } + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + var buffer bytes.Buffer + actualError := tc.Markdown.Format(&buffer) + assert.Equal(t, tc.ExpectedError, actualError) + actualReport := buffer.String() + + assert.Equalf(t, bytesutil.StringTrimIndentations(tc.ExpectedReport), actualReport, "Full output:\n%s", actualReport) + }) + } + + testTimeString := "2000-01-01 00:00:00" + testTime, err := time.Parse(time.DateTime, testTimeString) + require.NoError(t, err) + + validate(t, &testCase{ + Name: "No Models", + + Markdown: Markdown{ + DateTime: testTime, + Version: "1234", + + CSVPath: "some/csv/path.csv", + LogPath: "some/log/path.log", + }, + + ExpectedReport: ` + # Evaluation from 2000-01-01 00:00:00 + + This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version 1234` + "`" + `. 
+ + ## Results + + > Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot. + + The results of all models have been divided into the following categories: + - Category Unknown: Models in this category could not be categorized. + - Response Error: Models in this category encountered an error. + - Response Empty: Models in this category produced an empty response. + - No Code: Models in this category produced no code. + - Invalid Code: Models in this category produced invalid code. + - Executable Code: Models in this category produced executable code. + - Statement Coverage Reached: Models in this category produced code that reached full statement coverage. + - No Excess Response: Models in this category did not respond with more content than requested. + + The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here](some/log/path.log). Detailed scoring can be found [here](some/csv/path.csv). + + `, + }) + + validate(t, &testCase{ + Name: "Simple Models", + + Markdown: Markdown{ + DateTime: testTime, + Version: "1234", + + CSVPath: "some/csv/path.csv", + LogPath: "some/log/path.log", + + TotalScore: 1, + AssessmentPerModel: map[string]metrics.Assessments{ + "ModelResponseError": metrics.NewAssessments(), + "ModelNoCode": metrics.Assessments{ + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNotEmpty: 1, + }, + }, + }, + + ExpectedReport: ` + # Evaluation from 2000-01-01 00:00:00 + + This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version 1234` + "`" + `. + + ## Results + + > Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot. + + The results of all models have been divided into the following categories: + - Category Unknown: Models in this category could not be categorized. 
+ - Response Error: Models in this category encountered an error. + - Response Empty: Models in this category produced an empty response. + - No Code: Models in this category produced no code. + - Invalid Code: Models in this category produced invalid code. + - Executable Code: Models in this category produced executable code. + - Statement Coverage Reached: Models in this category produced code that reached full statement coverage. + - No Excess Response: Models in this category did not respond with more content than requested. + + The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here](some/log/path.log). Detailed scoring can be found [here](some/csv/path.csv). + + ### "Response Error" + + Models in this category encountered an error. + + - ` + "`ModelResponseError`" + ` + + ### "No Code" + + Models in this category produced no code. + + - ` + "`ModelNoCode`" + ` + + `, + }) +} diff --git a/version/version.go b/version/version.go new file mode 100644 index 000000000..e71ccf5aa --- /dev/null +++ b/version/version.go @@ -0,0 +1,4 @@ +package version + +// Current holds the current version. +var Current = "0.2.0"