Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Svg reporting #64

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 24 additions & 5 deletions cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
_ "github.com/symflower/eval-dev-quality/provider/openrouter" // Register provider.
_ "github.com/symflower/eval-dev-quality/provider/symflower" // Register provider.
"github.com/symflower/eval-dev-quality/tools"
"github.com/symflower/eval-dev-quality/version"
)

// Evaluate holds the "evaluation" command.
Expand Down Expand Up @@ -58,10 +59,12 @@ const repositoryPlainName = "plain"

// Execute executes the command.
func (command *Evaluate) Execute(args []string) (err error) {
command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", time.Now().Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
evaluationTimestamp := time.Now()
command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", evaluationTimestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
command.logger.Printf("Writing results to %s", command.ResultPath)

log, logClose, err := log.WithFile(command.logger, filepath.Join(command.ResultPath, "evaluation.log"))
logFilePath := filepath.Join(command.ResultPath, "evaluation.log")
log, logClose, err := log.WithFile(command.logger, logFilePath)
if err != nil {
return err
}
Expand Down Expand Up @@ -231,7 +234,8 @@ func (command *Evaluate) Execute(args []string) (err error) {
if err != nil {
log.Fatalf("ERROR: could not create result summary: %s", err)
}
if err := os.WriteFile(filepath.Join(command.ResultPath, "evaluation.csv"), []byte(csv), 0644); err != nil {
csvReportPath := filepath.Join(command.ResultPath, "evaluation.csv")
if err := os.WriteFile(csvReportPath, []byte(csv), 0644); err != nil {
log.Fatalf("ERROR: could not write result summary: %s", err)
}

Expand All @@ -249,8 +253,23 @@ func (command *Evaluate) Execute(args []string) (err error) {
totalScore = uint(len(languagesSelected))
}

_ = metrics.WalkByScore(assessments.Collapse(), func(model string, assessment metrics.Assessments, score uint) error {
log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore), assessment)
assessmentsPerModel := assessments.Collapse()
if err := (report.Markdown{
DateTime: evaluationTimestamp,
Version: version.Current,

CSVPath: csvReportPath,
LogPath: logFilePath,
SVGPath: filepath.Join(command.ResultPath, "categories.svg"),

AssessmentPerModel: assessmentsPerModel,
TotalScore: totalScore,
}).WriteToFile(filepath.Join(command.ResultPath, "report.md")); err != nil {
return err
}

_ = metrics.WalkByScore(assessmentsPerModel, func(model string, assessment metrics.Assessments, score uint) error {
log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore).Name, assessment)

return nil
})
Expand Down
20 changes: 15 additions & 5 deletions cmd/eval-dev-quality/cmd/evaluate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func TestEvaluateExecute(t *testing.T) {
Arguments []string

ExpectedOutputValidate func(t *testing.T, output string, resultPath string)
ExpectedResultFiles map[string]func(t *testing.T, filePath string, data string)
ExpectedResultFiles map[string]func(t *testing.T, resultPath string, filePath string, data string)
}

validate := func(t *testing.T, tc *testCase) {
Expand Down Expand Up @@ -54,7 +54,7 @@ func TestEvaluateExecute(t *testing.T) {
if validate != nil {
data, err := os.ReadFile(filepath.Join(temporaryPath, filePath))
if assert.NoError(t, err) {
validate(t, filePath, string(data))
validate(t, temporaryPath, filePath, string(data))
}
}
}
Expand All @@ -73,19 +73,29 @@ func TestEvaluateExecute(t *testing.T) {
},

ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) {
assert.Contains(t, output, `Evaluation score for "symflower/symbolic-execution" ("code-no-excess"): score=6, coverage-statement=1, files-executed=1, response-no-error=1, response-no-excess=1, response-not-empty=1, response-with-code=1`)
assert.Contains(t, output, `Evaluation score for "symflower/symbolic-execution" ("No Excess Response"): score=6, coverage-statement=1, files-executed=1, response-no-error=1, response-no-excess=1, response-not-empty=1, response-with-code=1`)
if !assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) {
t.Logf("Output: %s", output)
}
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"evaluation.csv": func(t *testing.T, filePath, data string) {
ExpectedResultFiles: map[string]func(t *testing.T, resultPath string, filePath string, data string){
"categories.svg": func(t *testing.T, resultPath string, filePath, data string) {
assert.Contains(t, data, "No Excess Response</text>") // Assert "no excess" category is present.
assert.Contains(t, data, "1</text>") // Assert the Y-axis label is at least one for one model in that category.
},
"evaluation.csv": func(t *testing.T, resultPath string, filePath, data string) {
assert.Equal(t, bytesutil.StringTrimIndentations(`
model,language,repository,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
symflower/symbolic-execution,golang,golang/plain,6,1,1,1,1,1,1
`), data)
},
"evaluation.log": nil,
"report.md": func(t *testing.T, resultPath string, filePath, data string) {
// Ensure the report links to the CSV file, SVG file and logs.
assert.Contains(t, data, filepath.Join(resultPath, "categories.svg"))
assert.Contains(t, data, filepath.Join(resultPath, "evaluation.csv"))
assert.Contains(t, data, filepath.Join(resultPath, "evaluation.log"))
},
"symflower_symbolic-execution/golang/golang/plain.log": nil,
},
})
Expand Down
68 changes: 58 additions & 10 deletions evaluate/metrics/category.go
Original file line number Diff line number Diff line change
@@ -1,30 +1,78 @@
package metrics

import "fmt"

// AssessmentCategory represents a categorical ranking of a model based on Assessments.
type AssessmentCategory string
type AssessmentCategory struct {
	// Name holds a unique short name of the category.
	// REMARK Registering two categories with the same name panics.
	Name string
	// Description holds the description of a category, i.e. a full sentence explaining which models belong to it.
	Description string
}

// AllAssessmentCategories holds all assessment categories.
var AllAssessmentCategories []*AssessmentCategory

// registerAssessmentCategory appends the given category to "AllAssessmentCategories" and returns the pointer that has been registered.
// It panics if a category with the same name is already registered.
func registerAssessmentCategory(c AssessmentCategory) *AssessmentCategory {
	for i := range AllAssessmentCategories {
		if existing := AllAssessmentCategories[i]; existing.Name == c.Name {
			panic(fmt.Sprintf("duplicated category name %q", c.Name))
		}
	}

	// Hand out the very same pointer that is stored in the registry, so categories can be compared by identity.
	registered := &c
	AllAssessmentCategories = append(AllAssessmentCategories, registered)

	return registered
}

var (
// AssessmentCategoryUnknown indicates that it is not possible to compute a model's category.
AssessmentCategoryUnknown = AssessmentCategory("category-unknown")
AssessmentCategoryUnknown = registerAssessmentCategory(AssessmentCategory{
Name: "Category Unknown",
Description: "Models in this category could not be categorized.",
})
// AssessmentCategoryResponseError indicates that a model has encountered an error trying to produce a response.
AssessmentCategoryResponseError = AssessmentCategory("response-error")
AssessmentCategoryResponseError = registerAssessmentCategory(AssessmentCategory{
Name: "Response Error",
Description: "Models in this category encountered an error.",
})
// AssessmentCategoryResponseEmpty indicates that a model has returned an empty response.
AssessmentCategoryResponseEmpty = AssessmentCategory("response-empty")
AssessmentCategoryResponseEmpty = registerAssessmentCategory(AssessmentCategory{
Name: "Response Empty",
Description: "Models in this category produced an empty response.",
})
// AssessmentCategoryResponseNoCode indicates that a model's response did not contain any source code.
AssessmentCategoryResponseNoCode = AssessmentCategory("response-no-code")
AssessmentCategoryResponseNoCode = registerAssessmentCategory(AssessmentCategory{
Name: "No Code",
Description: "Models in this category produced no code.",
})
// AssessmentCategoryCodeInvalid indicates that a model's generated code produced an error when executed.
AssessmentCategoryCodeInvalid = AssessmentCategory("code-invalid")
AssessmentCategoryCodeInvalid = registerAssessmentCategory(AssessmentCategory{
Name: "Invalid Code",
Description: "Models in this category produced invalid code.",
})
// AssessmentCategoryCodeExecuted indicates that a model's generated code could be executed without an error.
AssessmentCategoryCodeExecuted = AssessmentCategory("code-executed")
AssessmentCategoryCodeExecuted = registerAssessmentCategory(AssessmentCategory{
Name: "Executable Code",
Description: "Models in this category produced executable code.",
})
// AssessmentCategoryCodeCoverageStatementReached indicates that a model's generated code reached 100% statement coverage.
AssessmentCategoryCodeCoverageStatementReached = AssessmentCategory("code-coverage-statement")
AssessmentCategoryCodeCoverageStatementReached = registerAssessmentCategory(AssessmentCategory{
Name: "Statement Coverage Reached",
Description: "Models in this category produced code that reached full statement coverage.",
})
// AssessmentCategoryCodeNoExcess indicates that a model's response did not contain more content than requested.
AssessmentCategoryCodeNoExcess = AssessmentCategory("code-no-excess")
AssessmentCategoryCodeNoExcess = registerAssessmentCategory(AssessmentCategory{
Name: "No Excess Response",
Description: "Models in this category did not respond with more content than requested.",
})
)

// Category infers a categorical ranking of a model based on assessment values.
// A model's overall category corresponds to the criterion where the model was consistently able to receive the "total" amount of points. I.e. if there were 3 tasks in total and a model was able to produce executing code for all tasks, but the coverage goal was reached in only one case, then the category is only "CodeExecuted" because the coverage goal was not reached consistently.
func (a Assessments) Category(total uint) AssessmentCategory {
// The returned category is never "nil".
func (a Assessments) Category(total uint) *AssessmentCategory {
if total == 0 {
return AssessmentCategoryUnknown
}
Expand Down
2 changes: 1 addition & 1 deletion evaluate/metrics/category_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ func TestAssessmentsCategory(t *testing.T) {
Assessments Assessments
Total uint

ExpectedAssessmentCategory AssessmentCategory
ExpectedAssessmentCategory *AssessmentCategory
}

validate := func(t *testing.T, tc *testCase) {
Expand Down
187 changes: 187 additions & 0 deletions evaluate/report/markdown.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
package report

import (
	"io"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"text/template"
	"time"

	pkgerrors "github.com/pkg/errors"
	"github.com/wcharczuk/go-chart/v2"
	"github.com/zimmski/osutil/bytesutil"

	"github.com/symflower/eval-dev-quality/evaluate/metrics"
)

// Markdown holds the values for exporting a Markdown report.
type Markdown struct {
	// DateTime holds the timestamp of the evaluation.
	DateTime time.Time
	// Version holds the version of the evaluation tool.
	Version string

	// CSVPath holds the path of detailed CSV results.
	CSVPath string
	// LogPath holds the path of detailed logs.
	LogPath string
	// SVGPath holds the path of the charted results.
	// REMARK The charts will be generated automatically during the export if this path is set.
	SVGPath string

	// AssessmentPerModel holds the assessments of each evaluated model, keyed by the model's name.
	AssessmentPerModel map[string]metrics.Assessments
	// TotalScore holds the total reachable score per task.
	// REMARK Used for category computation.
	TotalScore uint
}

// markdownTemplateContext holds the data that is handed to the template of a Markdown report.
type markdownTemplateContext struct {
	// Markdown embeds the report values, so the template can access them directly, e.g. as ".DateTime".
	Markdown

	// Categories holds the categories in the order they are listed in the report.
	Categories []*metrics.AssessmentCategory
	// ModelsPerCategory holds the names of the models for each category.
	// REMARK Categories without any model have no entry and are not rendered as a section.
	ModelsPerCategory map[*metrics.AssessmentCategory][]string
}

// markdownTemplate holds the template for a Markdown report.
// The template is executed with a "markdownTemplateContext" and renders, in this order:
// - a heading with the evaluation timestamp,
// - the bar chart image, but only if "SVGPath" is not empty,
// - the version of the tool,
// - the list of all categories with their descriptions,
// - links to the log and CSV files,
// - one section per category that has at least one model, listing the model names.
// REMARK The template text is passed through "bytesutil.StringTrimIndentations" before parsing, and "template.Must" panics during package initialization if the template cannot be parsed.
var markdownTemplate = template.Must(template.New("template-report").Parse(bytesutil.StringTrimIndentations(`
# Evaluation from {{.DateTime.Format "2006-01-02 15:04:05"}}

{{ with $svgPath := .SVGPath -}}
![Bar chart that categorizes all evaluated models.]({{$svgPath}})

{{ end -}}

This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version {{.Version}}` + "`" + `.

## Results

> Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.

The results of all models have been divided into the following categories:
{{ range $category := .Categories -}}
- {{ $category.Name }}: {{ $category.Description }}
{{ end }}
The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CSVPath}}).

{{ range $category := .Categories -}}
{{ with $modelNames := index $.ModelsPerCategory $category -}}
### "{{ $category.Name }}"

{{ $category.Description }}

{{ range $modelName := $modelNames -}}
- ` + "`" + `{{ $modelName }}` + "`" + `
{{ end }}
{{ end }}
{{- end -}}
`)))

// barChartModelsPerCategoriesSVG renders a bar chart with the number of models per category as an SVG to the given writer.
// Categories without any model do not receive a bar, and the Y-axis has one tick for every integer from zero up to the highest model count.
func barChartModelsPerCategoriesSVG(writer io.Writer, categories []*metrics.AssessmentCategory, modelsPerCategory map[*metrics.AssessmentCategory][]string) error {
	// barWidth holds the width of a single bar, which also determines the overall width of the chart.
	const barWidth = 60

	var highestCount int
	values := make([]chart.Value, 0, len(categories))
	for _, c := range categories {
		modelCount := len(modelsPerCategory[c])
		if modelCount == 0 {
			continue
		}
		if modelCount > highestCount {
			highestCount = modelCount
		}

		values = append(values, chart.Value{
			Label: c.Name,
			Value: float64(modelCount),
		})
	}

	// Use explicit integer ticks, since a count of models is never fractional.
	yTicks := make([]chart.Tick, 0, highestCount+1)
	for v := 0; v <= highestCount; v++ {
		yTicks = append(yTicks, chart.Tick{
			Value: float64(v),
			Label: strconv.Itoa(v),
		})
	}

	barChart := chart.BarChart{
		Title: "Models per Category",
		Bars:  values,
		YAxis: chart.YAxis{
			Ticks: yTicks,
		},

		Background: chart.Style{
			Padding: chart.Box{
				Top:    60,
				Bottom: 40,
			},
		},
		Height:   300,
		Width:    (len(values) + 2) * barWidth,
		BarWidth: barWidth,
	}
	if err := barChart.Render(chart.SVG, writer); err != nil {
		return pkgerrors.WithStack(err)
	}

	return nil
}

// Format formats the markdown values in the template to the given writer.
// Every model of "AssessmentPerModel" is assigned to its category using "TotalScore". The model names of each category are sorted alphabetically, so the rendered report is the same for the same input.
// If "SVGPath" is set, the bar chart of models per category is written to that file before the report is rendered.
// An error is returned if the SVG file cannot be created, rendered or closed, or if the template cannot be executed.
func (m Markdown) Format(writer io.Writer) error {
	templateContext := markdownTemplateContext{
		Markdown:          m,
		Categories:        metrics.AllAssessmentCategories,
		ModelsPerCategory: make(map[*metrics.AssessmentCategory][]string, len(metrics.AllAssessmentCategories)),
	}
	for model, assessment := range m.AssessmentPerModel {
		category := assessment.Category(m.TotalScore)
		templateContext.ModelsPerCategory[category] = append(templateContext.ModelsPerCategory[category], model)
	}
	// Map iteration order is random, so sort the model names to keep the report stable between runs.
	for _, modelNames := range templateContext.ModelsPerCategory {
		sort.Strings(modelNames)
	}

	if m.SVGPath != "" {
		if err := writeModelsPerCategorySVG(m.SVGPath, metrics.AllAssessmentCategories, templateContext.ModelsPerCategory); err != nil {
			return err
		}
	}

	return pkgerrors.WithStack(markdownTemplate.Execute(writer, templateContext))
}

// writeModelsPerCategorySVG creates the file at the given path and renders the bar chart of models per category into it.
func writeModelsPerCategorySVG(path string, categories []*metrics.AssessmentCategory, modelsPerCategory map[*metrics.AssessmentCategory][]string) (err error) {
	svgFile, err := os.Create(path)
	if err != nil {
		return pkgerrors.WithStack(err)
	}
	defer func() {
		// Report a failed close to the caller instead of panicking, but never mask an earlier rendering error.
		if closeErr := svgFile.Close(); closeErr != nil && err == nil {
			err = pkgerrors.WithStack(closeErr)
		}
	}()

	return barChartModelsPerCategoriesSVG(svgFile, categories, modelsPerCategory)
}

// WriteToFile writes the Markdown values in the template to the given file.
// The CSV, log and SVG paths are converted to absolute paths before rendering, so the links of the report do not depend on the working directory.
// The parent directory of the given path is created if it does not exist yet.
// An error is returned if a path cannot be made absolute, the directory or file cannot be created, the report cannot be rendered, or the file cannot be closed.
func (m Markdown) WriteToFile(path string) (err error) {
	// The receiver is a copy, so the caller's paths are left untouched.
	if m.CSVPath, err = filepath.Abs(m.CSVPath); err != nil {
		return pkgerrors.WithStack(err)
	}
	if m.LogPath, err = filepath.Abs(m.LogPath); err != nil {
		return pkgerrors.WithStack(err)
	}
	if m.SVGPath != "" {
		if m.SVGPath, err = filepath.Abs(m.SVGPath); err != nil {
			return pkgerrors.WithStack(err)
		}
	}

	// Create the directory that contains the report, not a directory named like the report file itself.
	if err = os.MkdirAll(filepath.Dir(path), 0755); err != nil {
		return pkgerrors.WithStack(err)
	}
	file, err := os.Create(path)
	if err != nil {
		return pkgerrors.WithStack(err)
	}
	defer func() {
		// Closing can surface a delayed write error, so report it unless rendering has already failed.
		if closeErr := file.Close(); closeErr != nil && err == nil {
			err = pkgerrors.WithStack(closeErr)
		}
	}()

	return pkgerrors.WithStack(m.Format(file))
}
Loading