Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
bauersimon committed Apr 22, 2024
1 parent 14f9b93 commit 44dc8c9
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 0 deletions.
82 changes: 82 additions & 0 deletions evaluate/report/markdown.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
package report

import (
"html/template"
"io"
"os"
"path/filepath"
"time"

pkgerrors "github.com/pkg/errors"
"github.com/zimmski/osutil/bytesutil"
)

// Markdown bundles the values that are substituted into the Markdown report
// template when a report is exported.
type Markdown struct {
	// DateTime is the timestamp rendered in the report heading.
	DateTime time.Time
	// Version is rendered as the benchmark version in the report introduction.
	Version string

	// CVSPath, LogPath and SVGPath are locations of evaluation artifacts that
	// the report points to: LogPath is rendered as the link to the complete
	// evaluation log and SVGPath as the name of the embedded chart image.
	// NOTE(review): "CVSPath" looks like a transposition of "CSV" (the path
	// of the detailed scoring file) — confirm before renaming, since the
	// template and all callers have to agree on the field name.
	CVSPath, LogPath, SVGPath string

	// TODO: per-category data is not part of the report values yet:
	// Categories []metrics.AssessmentCategory
	// ModelsPerCategory map[metrics.AssessmentCategory]model.Model
}

// markdownTemplate holds the template for a Markdown report.
//
// The generated Markdown contains "{{< ... >}}" shortcodes that must reach the
// output verbatim. Because "{{" also opens a Go template action, every
// shortcode delimiter is emitted through the string actions {{"{{"}} and
// {{"}}"}}; a raw "{{<" is rejected by the template parser and would make
// template.Must panic during package initialization.
//
// The template is parsed once at package scope and is executed with a
// Markdown value, so every referenced field must exist on that type.
// The template text is kept unindented so that its content does not depend on
// how bytesutil.StringTrimIndentations treats leading whitespace.
var markdownTemplate = template.Must(template.New("template-report").Parse(bytesutil.StringTrimIndentations(`
# Evaluation from {{.DateTime.Format "2006-01-02 15:04:05"}}
{{"{{"}}< img-blog name="{{.SVGPath}}" alt="Bar chart that categorizes all evaluated LLMs.">{{"}}"}}
This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + ` version
{{.Version}} ` + "`" + `.
## Results
{{"{{"}}< info-block >{{"}}"}}
Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
{{"{{"}}</ info-block >{{"}}"}}
The results of all models have been divided into the following categories:
TODO
- **"no result"** e.g. we received an empty answer
- **"no code"** i.e. the model simply does not understand how to write Go code
- **"insufficient/unusable code"** e.g. a kind of random collection of statements that are maybe trying to test something but are not
- **"incorrect testing method"** e.g. multiple compile errors but mostly the testing code itself was wrong
- **"mistake (types, imports)"** i.e. simple compile errors (e.g. incorrect imports), additional code that does not align with the signature of the function under test 🧢
- **"passes (overengineered)"** i.e. code compiles and reaches 100% coverage but there is code that should not go through a code review 🎩
- **"perfect solution"** i.e. if at all, minor objections 👑
The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CVSPath}})
### Result category: "$CATEGORY_NAME"
$CATEGORY_EXPLANATION
TODO
> -
`)))

// Format renders the report template with the receiver's values and writes
// the result to the given writer. Any execution error is returned with a
// stack trace attached.
func (t Markdown) Format(writer io.Writer) error {
	if err := markdownTemplate.Execute(writer, t); err != nil {
		return pkgerrors.WithStack(err)
	}

	return nil
}

// WriteToFile renders the report template with the receiver's values and
// writes the result to the file at the given path. The parent directory of the
// path is created if it does not exist, and an existing file is truncated.
// It returns an error if the directory or file cannot be created, if rendering
// fails, or if closing the file fails.
func (t Markdown) WriteToFile(path string) (err error) {
	// The directory to create is the parent of the file, not the file's own
	// name: creating a directory named like the file would block os.Create.
	if err = os.MkdirAll(filepath.Dir(path), 0755); err != nil {
		return pkgerrors.WithStack(err)
	}

	file, err := os.Create(path)
	if err != nil {
		return pkgerrors.WithStack(err)
	}
	defer func() {
		// Closing can surface a deferred write error, so report it unless an
		// earlier error is already being returned.
		if closeErr := file.Close(); closeErr != nil && err == nil {
			err = pkgerrors.WithStack(closeErr)
		}
	}()

	return pkgerrors.WithStack(t.Format(file))
}
34 changes: 34 additions & 0 deletions evaluate/report/markdown_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package report

import (
"bytes"
"testing"

"github.com/stretchr/testify/assert"
"github.com/zimmski/osutil/bytesutil"
)

// TestMarkdownFormat checks the report that Markdown.Format writes, and the
// error it returns, against the expectations of each test case.
func TestMarkdownFormat(t *testing.T) {
	type testCase struct {
		// Name is used as the sub-test name.
		Name string

		// Markdown holds the values that are formatted.
		Markdown Markdown

		// ExpectedReport is compared to the written report after its
		// indentation has been trimmed.
		ExpectedReport string
		// ExpectedError is compared to the error returned by Format.
		ExpectedError error
	}

	validate := func(t *testing.T, tc *testCase) {
		t.Run(tc.Name, func(t *testing.T) {
			output := &bytes.Buffer{}
			err := tc.Markdown.Format(output)

			expectedReport := bytesutil.StringTrimIndentations(tc.ExpectedReport)
			assert.Equal(t, expectedReport, output.String())
			assert.Equal(t, tc.ExpectedError, err)
		})
	}

	// NOTE(review): this case sets no values and no expectations, so the zero
	// values are what gets asserted — presumably a placeholder to be filled in.
	validate(t, &testCase{
		Name: "Simple",
	})
}

0 comments on commit 44dc8c9

Please sign in to comment.