
Commit

WIP
bauersimon committed Apr 22, 2024
1 parent 14f9b93 commit e7638dd
Showing 5 changed files with 268 additions and 13 deletions.
2 changes: 1 addition & 1 deletion cmd/eval-dev-quality/cmd/evaluate.go
@@ -222,7 +222,7 @@ func (command *Evaluate) Execute(args []string) (err error) {
}

_ = metrics.WalkByScore(assessments.Collapse(), func(model string, assessment metrics.Assessments, score uint) error {
log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore), assessment)
log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore).Name, assessment)

return nil
})
69 changes: 58 additions & 11 deletions evaluate/metrics/category.go
@@ -1,30 +1,77 @@
package metrics

import "fmt"

// AssessmentCategory represents a categorical ranking of a model based on Assessments.
type AssessmentCategory string
type AssessmentCategory struct {
// Name holds a unique short name of the category.
Name string
// Description holds the description of a category.
Description string
}

// AllAssessmentCategories holds all assessment categories.
var AllAssessmentCategories []*AssessmentCategory

// registerAssessmentCategory registers a new assessment category and panics if a category with the same name is already registered.
func registerAssessmentCategory(c AssessmentCategory) *AssessmentCategory {
for _, category := range AllAssessmentCategories {
if c.Name == category.Name {
panic(fmt.Sprintf("duplicated category name %q", c.Name))
}
}

AllAssessmentCategories = append(AllAssessmentCategories, &c)

return &c
}

var (
// AssessmentCategoryUnknown indicates that it is not possible to compute a model's category.
AssessmentCategoryUnknown = AssessmentCategory("category-unknown")
AssessmentCategoryUnknown = registerAssessmentCategory(AssessmentCategory{
Name: "Category Unknown",
Description: "Models in this category could not be categorized.",
})
// AssessmentCategoryResponseError indicates that a model has encountered an error trying to produce a response.
AssessmentCategoryResponseError = AssessmentCategory("response-error")
AssessmentCategoryResponseError = registerAssessmentCategory(AssessmentCategory{
Name: "Response Error",
Description: "Models in this category encountered an error.",
})
// AssessmentCategoryResponseEmpty indicates that a model has returned an empty response.
AssessmentCategoryResponseEmpty = AssessmentCategory("response-empty")
AssessmentCategoryResponseEmpty = registerAssessmentCategory(AssessmentCategory{
Name: "Response Empty",
Description: "Models in this category produced an empty response.",
})
// AssessmentCategoryResponseNoCode indicates that a model's response did not contain any source code.
AssessmentCategoryResponseNoCode = AssessmentCategory("response-no-code")
AssessmentCategoryResponseNoCode = registerAssessmentCategory(AssessmentCategory{
Name: "No Code",
Description: "Models in this category produced no code.",
})
// AssessmentCategoryCodeInvalid indicates that a model's generated code produced an error when executed.
AssessmentCategoryCodeInvalid = AssessmentCategory("code-invalid")
AssessmentCategoryCodeInvalid = registerAssessmentCategory(AssessmentCategory{
Name: "Invalid Code",
Description: "Models in this category produced invalid code.",
})
// AssessmentCategoryCodeExecuted indicates that a model's generated code could be executed without an error.
AssessmentCategoryCodeExecuted = AssessmentCategory("code-executed")
AssessmentCategoryCodeExecuted = registerAssessmentCategory(AssessmentCategory{
Name: "Executable Code",
Description: "Models in this category produced executable code.",
})
// AssessmentCategoryCodeCoverageStatementReached indicates that a model's generated code reached 100% statement coverage.
AssessmentCategoryCodeCoverageStatementReached = AssessmentCategory("code-coverage-statement")
AssessmentCategoryCodeCoverageStatementReached = registerAssessmentCategory(AssessmentCategory{
Name: "Statement Coverage Reached",
Description: "Models in this category produced code that reached full statement coverage.",
})
// AssessmentCategoryCodeNoExcess indicates that a model's response did not contain more content than requested.
AssessmentCategoryCodeNoExcess = AssessmentCategory("code-no-excess")
AssessmentCategoryCodeNoExcess = registerAssessmentCategory(AssessmentCategory{
Name: "No Excess Response",
Description: "Models in this category did not respond with more content than requested.",
})
)

// Category infers a categorical ranking of a model based on assessment values.
// A models overall category corresponds to the criterion where the model was consistently able to receive "total" amount of points. I.e. if there were 3 tasks in total and a model was able to produce executing code for all tasks, but only in one case the coverage goal was reached, then the category is only "CodeExecuted" because the coverage goal was not reached consistently.
func (a Assessments) Category(total uint) AssessmentCategory {
// A model's overall category corresponds to the criterion where the model was consistently able to receive the "total" amount of points. I.e., if there were 3 tasks in total and a model produced executable code for all tasks but reached the coverage goal in only one case, then the category is just "CodeExecuted" because the coverage goal was not reached consistently. The category is never "nil".
func (a Assessments) Category(total uint) *AssessmentCategory {
if total == 0 {
return AssessmentCategoryUnknown
}
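As a usage sketch (not part of this commit): the pointer returned by Category can be compared directly against the registered categories, and its Name and Description fields can be printed. The helper below is hypothetical; the assessments and total score are assumed to come from an evaluation run as in cmd/eval-dev-quality.

package example

import (
	"fmt"

	"github.com/symflower/eval-dev-quality/evaluate/metrics"
)

// printCategories is a hypothetical helper that resolves and prints the category of every model.
// The "assessmentsPerModel" and "totalScore" values are assumed to come from an evaluation run.
func printCategories(assessmentsPerModel map[string]metrics.Assessments, totalScore uint) {
	for model, assessments := range assessmentsPerModel {
		category := assessments.Category(totalScore) // Never "nil", falls back to "AssessmentCategoryUnknown".
		fmt.Printf("%s: %s (%s)\n", model, category.Name, category.Description)
	}
}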
2 changes: 1 addition & 1 deletion evaluate/metrics/category_test.go
@@ -13,7 +13,7 @@ func TestAssessmentsCategory(t *testing.T) {
Assessments Assessments
Total uint

ExpectedAssessmentCategory AssessmentCategory
ExpectedAssessmentCategory *AssessmentCategory
}

validate := func(t *testing.T, tc *testCase) {
106 changes: 106 additions & 0 deletions evaluate/report/markdown.go
@@ -0,0 +1,106 @@
package report

import (
"io"
"os"
"path/filepath"
"text/template"
"time"

pkgerrors "github.com/pkg/errors"
"github.com/zimmski/osutil/bytesutil"

"github.com/symflower/eval-dev-quality/evaluate/metrics"
)

// Markdown holds the values for exporting a Markdown report.
type Markdown struct {
// DateTime holds the timestamp of the evaluation.
DateTime time.Time
// Version holds the version of the evaluation tool.
Version string

// CSVPath holds the path of detailed CSV results.
CSVPath string
// LogPath holds the path of detailed logs.
LogPath string
// SVGPath holds the path of the charted results.
// REMARK The charts will be generated during the export.
SVGPath string

// CategoriesPerModel holds the assessments of every model, from which the model's category is derived.
CategoriesPerModel map[string]metrics.Assessments
// TotalScore holds the total score.
TotalScore uint
}

type markdownTemplateContext struct {
Markdown

Categories []*metrics.AssessmentCategory
ModelsPerCategory map[*metrics.AssessmentCategory][]string
}

// markdownTemplate holds the template for a Markdown report.
var markdownTemplate = template.Must(template.New("template-report").Parse(bytesutil.StringTrimIndentations(`
# Evaluation from {{.DateTime.Format "2006-01-02 15:04:05"}}
{{"{{<"}} img-blog name="{{.SVGPath}}" alt="Bar chart that categorizes all evaluated LLMs."{{">}}"}}
This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version {{.Version}}` + "`" + `.
## Results
{{"{{< info-block >}}"}}
Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
{{"{{</ info-block >}}"}}
The results of all models have been divided into the following categories:
{{- range $category := .Categories }}
- {{ $category.Name }}: {{ $category.Description}}
{{- end }}
The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CSVPath}})
{{- range $category := .Categories }}
### "{{ $category.Name }}"
{{ $category.Description}}
{{- range $modelName := index $.ModelsPerCategory $category}}
- ` + "`" + `{{ $modelName }}` + "`" + `
{{- end }}
{{- end }}
`)))

// Format renders the Markdown report using the template and writes it to the given writer.
func (m Markdown) Format(writer io.Writer) error {
templateContext := markdownTemplateContext{
Markdown: m,
Categories: metrics.AllAssessmentCategories,
}
templateContext.ModelsPerCategory = make(map[*metrics.AssessmentCategory][]string, len(metrics.AllAssessmentCategories))
for model, assessment := range m.CategoriesPerModel {
category := assessment.Category(m.TotalScore)
templateContext.ModelsPerCategory[category] = append(templateContext.ModelsPerCategory[category], model)
}
// TODO Generate svg using maybe https://github.com/wcharczuk/go-chart and save to SVGPath.

return pkgerrors.WithStack(markdownTemplate.Execute(writer, templateContext))
}

// WriteToFile renders the Markdown report and writes it to the given file, creating parent directories as needed.
func (m Markdown) WriteToFile(path string) error {
directory := filepath.Dir(path) // "filepath.Base" would yield the file name instead of its parent directory.
if err := os.MkdirAll(directory, 0755); err != nil {
return pkgerrors.WithStack(err)
}
file, err := os.Create(path)
if err != nil {
return pkgerrors.WithStack(err)
}
defer file.Close()

return pkgerrors.WithStack(m.Format(file))
}
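A hypothetical end-to-end sketch of the new report type (all paths and values below are placeholders, not part of this commit):

package example

import (
	"log"
	"time"

	"github.com/symflower/eval-dev-quality/evaluate/metrics"
	"github.com/symflower/eval-dev-quality/evaluate/report"
)

// writeReport assembles a Markdown report and writes it to disk.
// The "assessmentsPerModel" and "totalScore" arguments are assumed to come from an evaluation run; all paths are placeholders.
func writeReport(assessmentsPerModel map[string]metrics.Assessments, totalScore uint) {
	markdown := report.Markdown{
		DateTime: time.Now(),
		Version:  "dev",

		CSVPath: "evaluation.csv",
		LogPath: "evaluation.log",
		SVGPath: "categories.svg",

		CategoriesPerModel: assessmentsPerModel,
		TotalScore:         totalScore,
	}
	if err := markdown.WriteToFile("reports/evaluation.md"); err != nil {
		log.Fatalf("Cannot write report: %+v", err)
	}
}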
102 changes: 102 additions & 0 deletions evaluate/report/markdown_test.go
@@ -0,0 +1,102 @@
package report

import (
"bytes"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/zimmski/osutil/bytesutil"
)

func TestMarkdownFormat(t *testing.T) {
type testCase struct {
Name string

Markdown Markdown

ExpectedReport string
ExpectedError error
}

validate := func(t *testing.T, tc *testCase) {
t.Run(tc.Name, func(t *testing.T) {
var buffer bytes.Buffer
actualError := tc.Markdown.Format(&buffer)

assert.Equal(t, bytesutil.StringTrimIndentations(tc.ExpectedReport), buffer.String())
assert.Equal(t, tc.ExpectedError, actualError)
})
}

testTimeString := "2000-01-01 00:00:00"
testTime, err := time.Parse(time.DateTime, testTimeString)
require.NoError(t, err)

validate(t, &testCase{
Name: "No Models",

Markdown: Markdown{
DateTime: testTime,
Version: "1234",

CSVPath: "some/csv/path.csv",
LogPath: "some/log/path.log",
SVGPath: "some/svg/path.svg",
},

ExpectedReport: `
# Evaluation from 2000-01-01 00:00:00
{{< img-blog name="some/svg/path.svg" alt="Bar chart that categorizes all evaluated LLMs.">}}
This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version 1234` + "`" + `.
## Results
{{< info-block >}}
Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
{{</ info-block >}}
The results of all models have been divided into the following categories:
- Category Unknown: Models in this category could not be categorized.
- Response Error: Models in this category encountered an error.
- Response Empty: Models in this category produced an empty response.
- No Code: Models in this category produced no code.
- Invalid Code: Models in this category produced invalid code.
- Executable Code: Models in this category produced executable code.
- Statement Coverage Reached: Models in this category produced code that reached full statement coverage.
- No Excess Response: Models in this category did not respond with more content than requested.
The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here](some/log/path.log). Detailed scoring can be found [here](some/csv/path.csv)
### "Category Unknown"
Models in this category could not be categorized.
### "Response Error"
Models in this category encountered an error.
### "Response Empty"
Models in this category produced an empty response.
### "No Code"
Models in this category produced no code.
### "Invalid Code"
Models in this category produced invalid code.
### "Executable Code"
Models in this category produced executable code.
### "Statement Coverage Reached"
Models in this category produced code that reached full statement coverage.
### "No Excess Response"
Models in this category did not respond with more content than requested.
`,
})

// TODO Test with some results.
}
