diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go index d03a529eb..04757e0c4 100644 --- a/cmd/eval-dev-quality/cmd/evaluate.go +++ b/cmd/eval-dev-quality/cmd/evaluate.go @@ -21,6 +21,7 @@ import ( _ "github.com/symflower/eval-dev-quality/provider/openrouter" // Register provider. _ "github.com/symflower/eval-dev-quality/provider/symflower" // Register provider. "github.com/symflower/eval-dev-quality/tools" + "github.com/symflower/eval-dev-quality/version" ) // Evaluate holds the "evaluation" command. @@ -58,7 +59,8 @@ const repositoryPlainName = "plain" // Execute executes the command. func (command *Evaluate) Execute(args []string) (err error) { - command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", time.Now().Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group. + evaluationTimestamp := time.Now() + command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", evaluationTimestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group. 
command.logger.Printf("Writing results to %s", command.ResultPath) log, logClose, err := log.WithFile(command.logger, filepath.Join(command.ResultPath, "evaluation.log")) @@ -231,7 +233,8 @@ func (command *Evaluate) Execute(args []string) (err error) { if err != nil { log.Fatalf("ERROR: could not create result summary: %s", err) } - if err := os.WriteFile(filepath.Join(command.ResultPath, "evaluation.csv"), []byte(csv), 0644); err != nil { + csvReportPath := filepath.Join(command.ResultPath, "evaluation.csv") + if err := os.WriteFile(csvReportPath, []byte(csv), 0644); err != nil { log.Fatalf("ERROR: could not write result summary: %s", err) } @@ -249,7 +252,21 @@ func (command *Evaluate) Execute(args []string) (err error) { totalScore = uint(len(languagesSelected)) } - _ = metrics.WalkByScore(assessments.Collapse(), func(model string, assessment metrics.Assessments, score uint) error { + assessmentsPerModel := assessments.Collapse() + if err := (report.Markdown{ + DateTime: evaluationTimestamp, + Version: version.Current, + + CSVPath: csvReportPath, + LogPath: command.ResultPath, + + AssessmentPerModel: assessmentsPerModel, + TotalScore: totalScore, + }).WriteToFile(filepath.Join(command.ResultPath, "report.md")); err != nil { + return err + } + + _ = metrics.WalkByScore(assessmentsPerModel, func(model string, assessment metrics.Assessments, score uint) error { log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore).Name, assessment) return nil diff --git a/evaluate/report/markdown.go b/evaluate/report/markdown.go new file mode 100644 index 000000000..d0347b4e2 --- /dev/null +++ b/evaluate/report/markdown.go @@ -0,0 +1,101 @@ +package report + +import ( + "io" + "os" + "path/filepath" + "text/template" + "time" + + pkgerrors "github.com/pkg/errors" + "github.com/zimmski/osutil/bytesutil" + + "github.com/symflower/eval-dev-quality/evaluate/metrics" +) + +// Markdown holds the values for exporting a Markdown report. 
+type Markdown struct {
+	// DateTime holds the timestamp of the evaluation.
+	DateTime time.Time
+	// Version holds the version of the evaluation tool.
+	Version string
+
+	// CSVPath holds the path of detailed CSV results.
+	CSVPath string
+	// LogPath holds the path of detailed logs.
+	LogPath string
+
+	// AssessmentPerModel holds the assessments per model.
+	AssessmentPerModel map[string]metrics.Assessments
+	// TotalScore holds the total reachable score per task.
+	// REMARK Used for category computation.
+	TotalScore uint
+}
+
+// markdownTemplateContext holds the template context for a Markdown report.
+type markdownTemplateContext struct {
+	Markdown
+
+	Categories        []*metrics.AssessmentCategory
+	ModelsPerCategory map[*metrics.AssessmentCategory][]string
+}
+
+// markdownTemplate holds the template for a Markdown report.
+var markdownTemplate = template.Must(template.New("template-report").Parse(bytesutil.StringTrimIndentations(`
+	# Evaluation from {{.DateTime.Format "2006-01-02 15:04:05"}}
+
+	This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version {{.Version}}` + "`" + `.
+
+	## Results
+
+	{{"{{< info-block >}}"}}
+	Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
+	{{"{{}}"}}
+
+	The results of all models have been divided into the following categories:
+	{{ range $category := .Categories -}}
+	- {{ $category.Name }}: {{ $category.Description }}
+	{{ end }}
+	The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CSVPath}}).
+ + {{ range $category := .Categories -}} + {{ with $modelNames := index $.ModelsPerCategory $category -}} + ### "{{ $category.Name }}" + + {{ $category.Description }} + + {{ range $modelName := $modelNames -}} + - ` + "`" + `{{ $modelName }}` + "`" + ` + {{ end }} + {{ end }} + {{- end -}} +`))) + +// Format formats the markdown values in the template to the given writer. +func (m Markdown) Format(writer io.Writer) error { + templateContext := markdownTemplateContext{ + Markdown: m, + Categories: metrics.AllAssessmentCategories, + } + templateContext.ModelsPerCategory = make(map[*metrics.AssessmentCategory][]string, len(metrics.AllAssessmentCategories)) + for model, assessment := range m.AssessmentPerModel { + category := assessment.Category(m.TotalScore) + templateContext.ModelsPerCategory[category] = append(templateContext.ModelsPerCategory[category], model) + } + // TODO Generate svg using maybe https://github.com/wcharczuk/go-chart. + + return pkgerrors.WithStack(markdownTemplate.Execute(writer, templateContext)) +} + +// WriteToFile writes the Markdown values in the template to the given file. 
+func (m Markdown) WriteToFile(path string) error {
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return pkgerrors.WithStack(err)
+	}
+	file, err := os.Create(path)
+	if err != nil {
+		return pkgerrors.WithStack(err)
+	}
+	defer file.Close()
+	return pkgerrors.WithStack(m.Format(file))
+}
diff --git a/evaluate/report/markdown_test.go b/evaluate/report/markdown_test.go
new file mode 100644
index 000000000..916224954
--- /dev/null
+++ b/evaluate/report/markdown_test.go
@@ -0,0 +1,132 @@
+package report
+
+import (
+	"bytes"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"github.com/symflower/eval-dev-quality/evaluate/metrics"
+	"github.com/zimmski/osutil/bytesutil"
+)
+
+func TestMarkdownFormat(t *testing.T) {
+	type testCase struct {
+		Name string
+
+		Markdown Markdown
+
+		ExpectedReport string
+		ExpectedError  error
+	}
+
+	validate := func(t *testing.T, tc *testCase) {
+		t.Run(tc.Name, func(t *testing.T) {
+			var buffer bytes.Buffer
+			actualError := tc.Markdown.Format(&buffer)
+
+			assert.Equalf(t, bytesutil.StringTrimIndentations(tc.ExpectedReport), buffer.String(), "Full output:\n%s", buffer.String())
+			assert.Equal(t, tc.ExpectedError, actualError)
+		})
+	}
+
+	testTimeString := "2000-01-01 00:00:00"
+	testTime, err := time.Parse(time.DateTime, testTimeString)
+	require.NoError(t, err)
+
+	validate(t, &testCase{
+		Name: "No Models",
+
+		Markdown: Markdown{
+			DateTime: testTime,
+			Version:  "1234",
+
+			CSVPath: "some/csv/path.csv",
+			LogPath: "some/log/path.log",
+		},
+
+		ExpectedReport: `
+			# Evaluation from 2000-01-01 00:00:00
+
+			This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version 1234` + "`" + `.
+
+			## Results
+
+			{{< info-block >}}
+			Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
+ {{}} + + The results of all models have been divided into the following categories: + - Category Unknown: Models in this category could not be categorized. + - Response Error: Models in this category encountered an error. + - Response Empty: Models in this category produced an empty response. + - No Code: Models in this category produced no code. + - Invalid Code: Models in this category produced invalid code. + - Executable Code: Models in this category produced executable code. + - Statement Coverage Reached: Models in this category produced code that reached full statement coverage. + - No Excess Response: Models in this category did not respond with more content than requested. + + The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here](some/log/path.log). Detailed scoring can be found [here](some/csv/path.csv). + + `, + }) + + validate(t, &testCase{ + Name: "Simple Models", + + Markdown: Markdown{ + DateTime: testTime, + Version: "1234", + + CSVPath: "some/csv/path.csv", + LogPath: "some/log/path.log", + + TotalScore: 1, + AssessmentPerModel: map[string]metrics.Assessments{ + "ModelResponseError": metrics.NewAssessments(), + "ModelNoCode": metrics.Assessments{ + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNotEmpty: 1, + }, + }, + }, + + ExpectedReport: ` + # Evaluation from 2000-01-01 00:00:00 + + This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version 1234` + "`" + `. + + ## Results + + {{< info-block >}} + Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot. + {{}} + + The results of all models have been divided into the following categories: + - Category Unknown: Models in this category could not be categorized. + - Response Error: Models in this category encountered an error. 
+ - Response Empty: Models in this category produced an empty response. + - No Code: Models in this category produced no code. + - Invalid Code: Models in this category produced invalid code. + - Executable Code: Models in this category produced executable code. + - Statement Coverage Reached: Models in this category produced code that reached full statement coverage. + - No Excess Response: Models in this category did not respond with more content than requested. + + The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here](some/log/path.log). Detailed scoring can be found [here](some/csv/path.csv). + + ### "Response Error" + + Models in this category encountered an error. + + - ` + "`ModelResponseError`" + ` + + ### "No Code" + + Models in this category produced no code. + + - ` + "`ModelNoCode`" + ` + + `, + }) +} diff --git a/version/version.go b/version/version.go new file mode 100644 index 000000000..e71ccf5aa --- /dev/null +++ b/version/version.go @@ -0,0 +1,4 @@ +package version + +// Current holds the current version. +var Current = "0.2.0"