Skip to content

Commit

Permalink
Markdown export
Browse files Browse the repository at this point in the history
  • Loading branch information
bauersimon committed Apr 22, 2024
1 parent 99220e0 commit fd5f9ac
Show file tree
Hide file tree
Showing 4 changed files with 256 additions and 3 deletions.
23 changes: 20 additions & 3 deletions cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
_ "github.com/symflower/eval-dev-quality/provider/openrouter" // Register provider.
_ "github.com/symflower/eval-dev-quality/provider/symflower" // Register provider.
"github.com/symflower/eval-dev-quality/tools"
"github.com/symflower/eval-dev-quality/version"
)

// Evaluate holds the "evaluation" command.
Expand Down Expand Up @@ -49,7 +50,8 @@ const repositoryPlainName = "plain"

// Execute executes the command.
func (command *Evaluate) Execute(args []string) (err error) {
command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", time.Now().Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
evaluationTimestamp := time.Now()
command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", evaluationTimestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
golog.Printf("Writing results to %s", command.ResultPath)

log, logClose, err := log.FileAndSTDOUT(filepath.Join(command.ResultPath, "evaluation.log"))
Expand Down Expand Up @@ -203,7 +205,8 @@ func (command *Evaluate) Execute(args []string) (err error) {
if err != nil {
log.Fatalf("ERROR: could not create result summary: %s", err)
}
if err := os.WriteFile(filepath.Join(command.ResultPath, "evaluation.csv"), []byte(csv), 0644); err != nil {
csvReportPath := filepath.Join(command.ResultPath, "evaluation.csv")
if err := os.WriteFile(csvReportPath, []byte(csv), 0644); err != nil {
log.Fatalf("ERROR: could not write result summary: %s", err)
}

Expand All @@ -221,7 +224,21 @@ func (command *Evaluate) Execute(args []string) (err error) {
totalScore = uint(len(languages))
}

_ = metrics.WalkByScore(assessments.Collapse(), func(model string, assessment metrics.Assessments, score uint) error {
assessmentsPerModel := assessments.Collapse()
if err := (report.Markdown{
DateTime: evaluationTimestamp,
Version: version.Current,

CSVPath: csvReportPath,
LogPath: command.ResultPath,

AssessmentPerModel: assessmentsPerModel,
TotalScore: totalScore,
}).WriteToFile(filepath.Join(command.ResultPath, "report.md")); err != nil {
return err
}

_ = metrics.WalkByScore(assessmentsPerModel, func(model string, assessment metrics.Assessments, score uint) error {
log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore).Name, assessment)

return nil
Expand Down
100 changes: 100 additions & 0 deletions evaluate/report/markdown.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package report

import (
	"io"
	"os"
	"path/filepath"
	"sort"
	"text/template"
	"time"

	pkgerrors "github.com/pkg/errors"
	"github.com/zimmski/osutil/bytesutil"

	"github.com/symflower/eval-dev-quality/evaluate/metrics"
)

// Markdown holds the values for exporting a Markdown report.
type Markdown struct {
// DateTime holds the timestamp of the evaluation.
DateTime time.Time
// Version holds the version of the evaluation tool.
Version string

// CSVPath holds the path of detailed CSV results.
// REMARK The path is inserted verbatim as a link target into the report.
CSVPath string
// LogPath holds the path of detailed logs.
// REMARK The path is inserted verbatim as a link target into the report.
LogPath string

// AssessmentPerModel holds the assessments of each model, keyed by the model name.
AssessmentPerModel map[string]metrics.Assessments
// TotalScore holds the total reachable score per task.
// REMARK Used for category computation.
TotalScore uint
}

// markdownTemplateContext holds the data that is handed to the template of a Markdown report.
type markdownTemplateContext struct {
// Markdown embeds the report values so the template can access them directly, e.g. as ".Version".
Markdown

// Categories holds the assessment categories in the order they are listed in the report.
Categories []*metrics.AssessmentCategory
// ModelsPerCategory holds the names of the models that fall into each category.
// REMARK Categories without any model have no entry and their section is not rendered.
ModelsPerCategory map[*metrics.AssessmentCategory][]string
}

// markdownTemplate holds the template for a Markdown report.
// It is executed with a "markdownTemplateContext" and renders a header with timestamp and version, the list of all categories, and one section per category that contains at least one model.
// REMARK The "info-block" markers are wrapped in string literals so that they are emitted verbatim as "{{< info-block >}}" instead of being interpreted as template actions.
// REMARK The template is parsed once at package initialization and "template.Must" panics if it is invalid.
var markdownTemplate = template.Must(template.New("template-report").Parse(bytesutil.StringTrimIndentations(`
# Evaluation from {{.DateTime.Format "2006-01-02 15:04:05"}}
This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version {{.Version}}` + "`" + `.
## Results
{{"{{< info-block >}}"}}
Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
{{"{{</ info-block >}}"}}
The results of all models have been divided into the following categories:
{{ range $category := .Categories -}}
- {{ $category.Name }}: {{ $category.Description }}
{{ end }}
The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CSVPath}}).
{{ range $category := .Categories -}}
{{ with $modelNames := index $.ModelsPerCategory $category -}}
### "{{ $category.Name }}"
{{ $category.Description }}
{{ range $modelName := $modelNames -}}
- ` + "`" + `{{ $modelName }}` + "`" + `
{{ end }}
{{ end }}
{{- end -}}
`)))

// Format renders the Markdown report with the values of "m" to the given writer.
// Every model is grouped under the category its assessment reaches for "TotalScore", and the model names of each category are listed in ascending lexical order.
// The returned error carries a stack trace and is non-nil if executing the template or writing to the writer fails.
func (m Markdown) Format(writer io.Writer) error {
	templateContext := markdownTemplateContext{
		Markdown:          m,
		Categories:        metrics.AllAssessmentCategories,
		ModelsPerCategory: make(map[*metrics.AssessmentCategory][]string, len(metrics.AllAssessmentCategories)),
	}
	for model, assessment := range m.AssessmentPerModel {
		category := assessment.Category(m.TotalScore)
		templateContext.ModelsPerCategory[category] = append(templateContext.ModelsPerCategory[category], model)
	}
	// The iteration order of maps is randomized, so the collected model names must be sorted to make the report reproducible between runs.
	for _, models := range templateContext.ModelsPerCategory {
		sort.Strings(models)
	}
	// TODO Generate svg using maybe https://github.com/wcharczuk/go-chart.

	return pkgerrors.WithStack(markdownTemplate.Execute(writer, templateContext))
}

// WriteToFile renders the Markdown report with the values of "m" and writes it to the file at the given path.
// The directory that contains the file is created if it does not exist yet.
// The returned error carries a stack trace and is non-nil if the directory or the file cannot be created, the report cannot be rendered, or the file cannot be closed.
func (m Markdown) WriteToFile(path string) (err error) {
	// Create the parent directory of the report: "filepath.Base" would instead create a directory that is named like the report file itself, relative to the working directory.
	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
		return pkgerrors.WithStack(err)
	}
	file, err := os.Create(path)
	if err != nil {
		return pkgerrors.WithStack(err)
	}
	defer func() {
		// Always release the file handle, but only report a failing close if nothing else went wrong before, so the first error is not masked.
		if closeErr := file.Close(); closeErr != nil && err == nil {
			err = pkgerrors.WithStack(closeErr)
		}
	}()

	return pkgerrors.WithStack(m.Format(file))
}
132 changes: 132 additions & 0 deletions evaluate/report/markdown_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
package report

import (
"bytes"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/symflower/eval-dev-quality/evaluate/metrics"
"github.com/zimmski/osutil/bytesutil"
)

// TestMarkdownFormat checks that "Markdown.Format" renders the expected report text for a set of report values.
func TestMarkdownFormat(t *testing.T) {
// testCase holds the report values of a single scenario together with the expected rendered report and error.
type testCase struct {
Name string

Markdown Markdown

ExpectedReport string
ExpectedError error
}

// validate runs a single test case as a sub-test named after the case.
validate := func(t *testing.T, tc *testCase) {
t.Run(tc.Name, func(t *testing.T) {
// Render into memory so the complete output can be compared and printed on failure.
var buffer bytes.Buffer
actualError := tc.Markdown.Format(&buffer)

assert.Equalf(t, bytesutil.StringTrimIndentations(tc.ExpectedReport), buffer.String(), "Full output:\n%s", buffer.String())
assert.Equal(t, tc.ExpectedError, actualError)
})
}

// Use a fixed timestamp so the report heading is the same on every run.
testTimeString := "2000-01-01 00:00:00"
testTime, err := time.Parse(time.DateTime, testTimeString)
require.NoError(t, err)

// Without any assessments only the heading and the category overview are expected, but no category sections.
validate(t, &testCase{
Name: "No Models",

Markdown: Markdown{
DateTime: testTime,
Version: "1234",

CSVPath: "some/csv/path.csv",
LogPath: "some/log/path.log",
},

ExpectedReport: `
# Evaluation from 2000-01-01 00:00:00
This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version 1234` + "`" + `.
## Results
{{< info-block >}}
Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
{{</ info-block >}}
The results of all models have been divided into the following categories:
- Category Unknown: Models in this category could not be categorized.
- Response Error: Models in this category encountered an error.
- Response Empty: Models in this category produced an empty response.
- No Code: Models in this category produced no code.
- Invalid Code: Models in this category produced invalid code.
- Executable Code: Models in this category produced executable code.
- Statement Coverage Reached: Models in this category produced code that reached full statement coverage.
- No Excess Response: Models in this category did not respond with more content than requested.
The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here](some/log/path.log). Detailed scoring can be found [here](some/csv/path.csv).
`,
})

// Two models that end up in different categories: each one is expected in its own section, and categories without models get no section.
validate(t, &testCase{
Name: "Simple Models",

Markdown: Markdown{
DateTime: testTime,
Version: "1234",

CSVPath: "some/csv/path.csv",
LogPath: "some/log/path.log",

TotalScore: 1,
AssessmentPerModel: map[string]metrics.Assessments{
// Empty assessments are expected to be reported as "Response Error".
"ModelResponseError": metrics.NewAssessments(),
// A non-empty response without an error but nothing else is expected to be reported as "No Code".
"ModelNoCode": metrics.Assessments{
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyResponseNotEmpty: 1,
},
},
},

ExpectedReport: `
# Evaluation from 2000-01-01 00:00:00
This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version 1234` + "`" + `.
## Results
{{< info-block >}}
Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
{{</ info-block >}}
The results of all models have been divided into the following categories:
- Category Unknown: Models in this category could not be categorized.
- Response Error: Models in this category encountered an error.
- Response Empty: Models in this category produced an empty response.
- No Code: Models in this category produced no code.
- Invalid Code: Models in this category produced invalid code.
- Executable Code: Models in this category produced executable code.
- Statement Coverage Reached: Models in this category produced code that reached full statement coverage.
- No Excess Response: Models in this category did not respond with more content than requested.
The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here](some/log/path.log). Detailed scoring can be found [here](some/csv/path.csv).
### "Response Error"
Models in this category encountered an error.
- ` + "`ModelResponseError`" + `
### "No Code"
Models in this category produced no code.
- ` + "`ModelNoCode`" + `
`,
})
}
4 changes: 4 additions & 0 deletions version/version.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
package version

// Current holds the current version of the evaluation tool.
// REMARK The value is embedded into generated reports to document which version produced them.
var Current = "0.2.0"

0 comments on commit fd5f9ac

Please sign in to comment.