WIP

symflower · Apr 22, 2024 · 44dc8c9 · 44dc8c9
1 parent 14f9b93
commit 44dc8c9
Show file tree

Hide file tree

Showing 2 changed files with 116 additions and 0 deletions.
diff --git a/evaluate/report/markdown.go b/evaluate/report/markdown.go
@@ -0,0 +1,82 @@
+package report
+
+import (
+	"html/template"
+	"io"
+	"os"
+	"path/filepath"
+	"time"
+
+	pkgerrors "github.com/pkg/errors"
+	"github.com/zimmski/osutil/bytesutil"
+)
+
+// Markdown holds the values for exporting a Markdown report.
+//
+// The exported fields are read by the report template through their exact
+// names, so a field name and its template reference must stay in sync.
+type Markdown struct {
+	// DateTime is the timestamp printed in the report heading.
+	DateTime time.Time
+	// Version is the benchmark version mentioned in the report.
+	Version  string
+
+	// CVSPath is presumably the path of the CSV file with the detailed scoring.
+	// NOTE(review): the identifier looks like a typo for "CSVPath" - confirm before renaming, as callers and the template depend on the name.
+	CVSPath string
+	// LogPath is the path of the complete evaluation log linked from the report.
+	LogPath string
+	// SVGPath is the path of the chart image embedded into the report.
+	SVGPath string
+
+	// TODO The per-category data below is not wired in yet.
+	// Categories        []metrics.AssessmentCategory
+	// ModelsPerCategory map[metrics.AssessmentCategory]model.Model
+}
+
+// markdownTemplate holds the template for a Markdown report.
+//
+// The generated document contains shortcodes of the form "{{< name >}}" that
+// must reach the output verbatim. Their delimiters collide with the template
+// action delimiters, so every shortcode is emitted through the string
+// literals {{"{{"}} and {{"}}"}}. An unescaped shortcode is parsed as an
+// action and makes "template.Must" panic during package initialization.
+//
+// Only fields that exist on "Markdown" may be referenced, otherwise executing
+// the template returns an error.
+//
+// NOTE(review): "html/template" HTML-escapes interpolated values (e.g. a "+"
+// in a version string becomes "&#43;"). "text/template" is likely the better
+// fit for Markdown output, but switching requires changing the file imports.
+var markdownTemplate = template.Must(template.New("template-report").Parse(bytesutil.StringTrimIndentations(`
+	# Evaluation from {{.DateTime.Format "2006-01-02 15:04:05"}}
+
+	{{"{{"}}< img-blog name="{{.SVGPath}}" alt="Bar chart that categorizes all evaluated LLMs.">{{"}}"}}
+
+	This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + ` version
+	{{.Version}} ` + "`" + `.
+
+	## Results
+
+	{{"{{"}}< info-block >{{"}}"}}
+	Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
+	{{"{{"}}</ info-block >{{"}}"}}
+
+	The results of all models have been divided into the following categories:
+
+	TODO
+
+	- **"no result"** e.g. we received an empty answer
+	- **"no code"** i.e. the model simply does not understand how to write Go code
+	- **"insufficient/unusable code"** e.g. a kind of random collection of statements that are maybe trying to test something but are not
+	- **"incorrect testing method"** e.g. multiple compile errors but mostly the testing code itself was wrong
+	- **"mistake (types, imports)"** i.e. simple compile errors (e.g. incorrect imports), additional code that does not align with the signature of the function under test 🧢
+	- **"passes (overengineered)"** i.e. code compiles and reaches 100% coverage but there is code that should not go through a code review 🎩
+	- **"perfect solution"** i.e. if at all, minor objections 👑
+
+	The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CVSPath}})
+
+	### Result category: "$CATEGORY_NAME"
+
+	$CATEGORY_EXPLANATION
+
+	TODO
+
+	> -
+`)))
+
+// Format renders the report template with the receiver's values and writes the result to the given writer.
+// A failure while executing the template is returned with a stack trace attached.
+func (t Markdown) Format(writer io.Writer) error {
+	if err := markdownTemplate.Execute(writer, t); err != nil {
+		return pkgerrors.WithStack(err)
+	}
+
+	return nil
+}
+
+// WriteToFile writes the markdown values in the template to the given file.
+// Missing parent directories of the file are created, and an existing file is truncated.
+// The returned error carries a stack trace and is non-nil if creating the
+// directories, creating the file, rendering the report or closing the file fails.
+func (t Markdown) WriteToFile(path string) (err error) {
+	// The parent directory has to exist, not a directory named like the file itself.
+	if err = os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return pkgerrors.WithStack(err)
+	}
+
+	file, err := os.Create(path)
+	if err != nil {
+		return pkgerrors.WithStack(err)
+	}
+	defer func() {
+		// A failed close can mean lost data, so report it unless an earlier error is already being returned.
+		if closeErr := file.Close(); closeErr != nil && err == nil {
+			err = pkgerrors.WithStack(closeErr)
+		}
+	}()
+
+	// "Format" already attaches a stack trace to its error.
+	return t.Format(file)
+}
diff --git a/evaluate/report/markdown_test.go b/evaluate/report/markdown_test.go
@@ -0,0 +1,34 @@
+package report
+
+import (
+	"bytes"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/zimmski/osutil/bytesutil"
+)
+
+// TestMarkdownFormat checks the report that "Markdown.Format" renders for a set of report values.
+func TestMarkdownFormat(t *testing.T) {
+	type testCase struct {
+		// Name is the name of the sub-test.
+		Name string
+
+		// Markdown holds the report values that are rendered.
+		Markdown Markdown
+
+		// ExpectedReport is the expected output, common indentation is trimmed before comparing.
+		ExpectedReport string
+		// ExpectedError is the error that rendering is expected to return.
+		ExpectedError error
+	}
+
+	// validate renders the report of a test case into memory and compares output and error with the expectations.
+	validate := func(t *testing.T, tc *testCase) {
+		t.Run(tc.Name, func(t *testing.T) {
+			output := &bytes.Buffer{}
+			err := tc.Markdown.Format(output)
+
+			expectedReport := bytesutil.StringTrimIndentations(tc.ExpectedReport)
+			assert.Equal(t, expectedReport, output.String())
+			assert.Equal(t, tc.ExpectedError, err)
+		})
+	}
+
+	testCases := []*testCase{
+		{
+			Name: "Simple",
+		},
+	}
+	for _, tc := range testCases {
+		validate(t, tc)
+	}
+}