Markdown export

symflower · Apr 22, 2024 · fd5f9ac · fd5f9ac
1 parent 99220e0
commit fd5f9ac
Show file tree

Hide file tree

Showing 4 changed files with 256 additions and 3 deletions.
diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go
@@ -22,6 +22,7 @@ import (
 	_ "github.com/symflower/eval-dev-quality/provider/openrouter" // Register provider.
 	_ "github.com/symflower/eval-dev-quality/provider/symflower"  // Register provider.
 	"github.com/symflower/eval-dev-quality/tools"
+	"github.com/symflower/eval-dev-quality/version"
 )
 
 // Evaluate holds the "evaluation" command.
@@ -49,7 +50,8 @@ const repositoryPlainName = "plain"
 
 // Execute executes the command.
 func (command *Evaluate) Execute(args []string) (err error) {
-	command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", time.Now().Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
+	evaluationTimestamp := time.Now()
+	command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", evaluationTimestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
 	golog.Printf("Writing results to %s", command.ResultPath)
 
 	log, logClose, err := log.FileAndSTDOUT(filepath.Join(command.ResultPath, "evaluation.log"))
@@ -203,7 +205,8 @@ func (command *Evaluate) Execute(args []string) (err error) {
 	if err != nil {
 		log.Fatalf("ERROR: could not create result summary: %s", err)
 	}
-	if err := os.WriteFile(filepath.Join(command.ResultPath, "evaluation.csv"), []byte(csv), 0644); err != nil {
+	csvReportPath := filepath.Join(command.ResultPath, "evaluation.csv")
+	if err := os.WriteFile(csvReportPath, []byte(csv), 0644); err != nil {
 		log.Fatalf("ERROR: could not write result summary: %s", err)
 	}
 
@@ -221,7 +224,21 @@ func (command *Evaluate) Execute(args []string) (err error) {
 		totalScore = uint(len(languages))
 	}
 
-	_ = metrics.WalkByScore(assessments.Collapse(), func(model string, assessment metrics.Assessments, score uint) error {
+	assessmentsPerModel := assessments.Collapse()
+	if err := (report.Markdown{
+		DateTime: evaluationTimestamp,
+		Version:  version.Current,
+
+		CSVPath: csvReportPath,
+		LogPath: command.ResultPath,
+
+		AssessmentPerModel: assessmentsPerModel,
+		TotalScore:         totalScore,
+	}).WriteToFile(filepath.Join(command.ResultPath, "report.md")); err != nil {
+		return err
+	}
+
+	_ = metrics.WalkByScore(assessmentsPerModel, func(model string, assessment metrics.Assessments, score uint) error {
 		log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore).Name, assessment)
 
 		return nil

diff --git a/evaluate/report/markdown.go b/evaluate/report/markdown.go
@@ -0,0 +1,100 @@
+package report
+
+import (
+	"io"
+	"os"
+	"path/filepath"
+	"sort"
+	"text/template"
+	"time"
+
+	pkgerrors "github.com/pkg/errors"
+	"github.com/zimmski/osutil/bytesutil"
+
+	"github.com/symflower/eval-dev-quality/evaluate/metrics"
+)
+
+// Markdown holds the values for exporting a Markdown report.
+type Markdown struct {
+	// DateTime holds the timestamp of the evaluation.
+	DateTime time.Time
+	// Version holds the version of the evaluation tool.
+	Version string
+
+	// CSVPath holds the path of detailed CSV results.
+	CSVPath string
+	// LogPath holds the path of detailed logs.
+	LogPath string
+
+	// AssessmentPerModel holds the assessments of each model, keyed by the model's name.
+	AssessmentPerModel map[string]metrics.Assessments
+	// TotalScore holds the total reachable score per task.
+	// REMARK Used for category computation.
+	TotalScore uint
+}
+
+// markdownTemplateContext holds the values that are handed to the template of a Markdown report.
+type markdownTemplateContext struct {
+	Markdown
+
+	// Categories holds the assessment categories that the report lists with their name and description.
+	Categories        []*metrics.AssessmentCategory
+	// ModelsPerCategory holds the names of the models that were assigned to each category.
+	ModelsPerCategory map[*metrics.AssessmentCategory][]string
+}
+
+// markdownTemplate holds the template for a Markdown report.
+// It is executed with a "markdownTemplateContext" and renders the report heading, the list of all categories, and one section per category that has at least one model.
+// REMARK The heading is followed by an empty line, which is what the expected reports of the tests contain.
+var markdownTemplate = template.Must(template.New("template-report").Parse(bytesutil.StringTrimIndentations(`
+	# Evaluation from {{.DateTime.Format "2006-01-02 15:04:05"}}
+
+	This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version {{.Version}}` + "`" + `.
+
+	## Results
+
+	{{"{{< info-block >}}"}}
+	Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
+	{{"{{</ info-block >}}"}}
+
+	The results of all models have been divided into the following categories:
+	{{ range $category := .Categories -}}
+	- {{ $category.Name }}: {{ $category.Description }}
+	{{ end }}
+	The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CSVPath}}).
+
+	{{ range $category := .Categories -}}
+	{{ with $modelNames := index $.ModelsPerCategory $category -}}
+	### "{{ $category.Name }}"
+
+	{{ $category.Description }}
+
+	{{ range $modelName := $modelNames -}}
+	- ` + "`" + `{{ $modelName }}` + "`" + `
+	{{ end }}
+	{{ end }}
+	{{- end -}}
+`)))
+
+// Format renders the Markdown report with "markdownTemplate" and writes it to the given writer.
+// Every model is assigned to the category that its assessment yields for "TotalScore".
+// The model names of each category are sorted, so the same values always produce the same report.
+// An error of executing the template is returned with a stack trace attached.
+func (m Markdown) Format(writer io.Writer) error {
+	templateContext := markdownTemplateContext{
+		Markdown:   m,
+		Categories: metrics.AllAssessmentCategories,
+	}
+	templateContext.ModelsPerCategory = make(map[*metrics.AssessmentCategory][]string, len(metrics.AllAssessmentCategories))
+	for model, assessment := range m.AssessmentPerModel {
+		category := assessment.Category(m.TotalScore)
+		templateContext.ModelsPerCategory[category] = append(templateContext.ModelsPerCategory[category], model)
+	}
+	// The iteration order of a map is not defined, so sort the collected names to keep the report stable between runs.
+	for _, models := range templateContext.ModelsPerCategory {
+		sort.Strings(models)
+	}
+	// TODO Generate svg using maybe https://github.com/wcharczuk/go-chart.
+
+	return pkgerrors.WithStack(markdownTemplate.Execute(writer, templateContext))
+}
+
+// WriteToFile renders the Markdown report and writes it to the file at the given path.
+// Missing parent directories of the path are created, and the file is created or truncated.
+// The file is always closed before returning, and a failure to close it after a successful write is reported as an error.
+func (m Markdown) WriteToFile(path string) error {
+	// Create the directory that contains the file, not a directory that is named like the file itself.
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return pkgerrors.WithStack(err)
+	}
+	file, err := os.Create(path)
+	if err != nil {
+		return pkgerrors.WithStack(err)
+	}
+
+	if err := m.Format(file); err != nil {
+		// The formatting error is the one worth reporting, so a failing close is deliberately ignored here.
+		_ = file.Close()
+
+		return pkgerrors.WithStack(err)
+	}
+
+	// Closing can surface a write error, so it must not be dropped.
+	return pkgerrors.WithStack(file.Close())
+}
diff --git a/evaluate/report/markdown_test.go b/evaluate/report/markdown_test.go
@@ -0,0 +1,132 @@
+package report
+
+import (
+	"bytes"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"github.com/symflower/eval-dev-quality/evaluate/metrics"
+	"github.com/zimmski/osutil/bytesutil"
+)
+
+// TestMarkdownFormat checks the report and the error that "Markdown.Format" produces for given Markdown values.
+func TestMarkdownFormat(t *testing.T) {
+	type testCase struct {
+		Name string
+
+		Markdown Markdown
+
+		ExpectedReport string
+		ExpectedError  error
+	}
+
+	// validate formats the Markdown values of the test case into a buffer and compares the output and the error with the expectations.
+	validate := func(t *testing.T, tc *testCase) {
+		t.Run(tc.Name, func(t *testing.T) {
+			var buffer bytes.Buffer
+			actualError := tc.Markdown.Format(&buffer)
+
+			// The expected report is passed through "bytesutil.StringTrimIndentations" before it is compared with the actual output.
+			assert.Equalf(t, bytesutil.StringTrimIndentations(tc.ExpectedReport), buffer.String(), "Full output:\n%s", buffer.String())
+			assert.Equal(t, tc.ExpectedError, actualError)
+		})
+	}
+
+	// Use a fixed timestamp so the heading of the expected reports is constant.
+	testTimeString := "2000-01-01 00:00:00"
+	testTime, err := time.Parse(time.DateTime, testTimeString)
+	require.NoError(t, err)
+
+	validate(t, &testCase{
+		Name: "No Models",
+
+		Markdown: Markdown{
+			DateTime: testTime,
+			Version:  "1234",
+
+			CSVPath: "some/csv/path.csv",
+			LogPath: "some/log/path.log",
+		},
+
+		ExpectedReport: `
+			# Evaluation from 2000-01-01 00:00:00
+
+			This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version 1234` + "`" + `.
+
+			## Results
+
+			{{< info-block >}}
+			Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
+			{{</ info-block >}}
+
+			The results of all models have been divided into the following categories:
+			- Category Unknown: Models in this category could not be categorized.
+			- Response Error: Models in this category encountered an error.
+			- Response Empty: Models in this category produced an empty response.
+			- No Code: Models in this category produced no code.
+			- Invalid Code: Models in this category produced invalid code.
+			- Executable Code: Models in this category produced executable code.
+			- Statement Coverage Reached: Models in this category produced code that reached full statement coverage.
+			- No Excess Response: Models in this category did not respond with more content than requested.
+
+			The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here](some/log/path.log). Detailed scoring can be found [here](some/csv/path.csv).
+
+		`,
+	})
+
+	validate(t, &testCase{
+		Name: "Simple Models",
+
+		Markdown: Markdown{
+			DateTime: testTime,
+			Version:  "1234",
+
+			CSVPath: "some/csv/path.csv",
+			LogPath: "some/log/path.log",
+
+			TotalScore: 1,
+			AssessmentPerModel: map[string]metrics.Assessments{
+				"ModelResponseError": metrics.NewAssessments(),
+				"ModelNoCode": metrics.Assessments{
+					metrics.AssessmentKeyResponseNoError:  1,
+					metrics.AssessmentKeyResponseNotEmpty: 1,
+				},
+			},
+		},
+
+		ExpectedReport: `
+			# Evaluation from 2000-01-01 00:00:00
+
+			This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version 1234` + "`" + `.
+
+			## Results
+
+			{{< info-block >}}
+			Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
+			{{</ info-block >}}
+
+			The results of all models have been divided into the following categories:
+			- Category Unknown: Models in this category could not be categorized.
+			- Response Error: Models in this category encountered an error.
+			- Response Empty: Models in this category produced an empty response.
+			- No Code: Models in this category produced no code.
+			- Invalid Code: Models in this category produced invalid code.
+			- Executable Code: Models in this category produced executable code.
+			- Statement Coverage Reached: Models in this category produced code that reached full statement coverage.
+			- No Excess Response: Models in this category did not respond with more content than requested.
+
+			The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here](some/log/path.log). Detailed scoring can be found [here](some/csv/path.csv).
+
+			### "Response Error"
+
+			Models in this category encountered an error.
+
+			- ` + "`ModelResponseError`" + `
+
+			### "No Code"
+
+			Models in this category produced no code.
+
+			- ` + "`ModelNoCode`" + `
+
+		`,
+	})
+}
diff --git a/version/version.go b/version/version.go
@@ -0,0 +1,4 @@
+package version
+
+// Current holds the current version of the evaluation tool.
+var Current = "0.2.0"