Skip to content

Commit

Permalink
Merge pull request #57 from symflower/markdown-reporting
Browse files Browse the repository at this point in the history
Reporting with MarkDown file and SVG bar charts
  • Loading branch information
zimmski authored Apr 26, 2024
2 parents 02f967b + dd8a259 commit 2a84027
Show file tree
Hide file tree
Showing 13 changed files with 715 additions and 14 deletions.
22 changes: 19 additions & 3 deletions cmd/eval-dev-quality/cmd/evaluate.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ const repositoryPlainName = "plain"

// Execute executes the command.
func (command *Evaluate) Execute(args []string) (err error) {
command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", time.Now().Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
evaluationTimestamp := time.Now()
command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", evaluationTimestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
command.logger.Printf("Writing results to %s", command.ResultPath)

log, logClose, err := log.WithFile(command.logger, filepath.Join(command.ResultPath, "evaluation.log"))
Expand Down Expand Up @@ -273,8 +274,23 @@ func (command *Evaluate) Execute(args []string) (err error) {
totalScore = uint(len(languagesSelected))
}

_ = metrics.WalkByScore(assessments.Collapse(), func(model string, assessment metrics.Assessments, score uint) error {
log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore), assessment)
assessmentsPerModel := assessments.Collapse()
if err := (report.Markdown{
DateTime: evaluationTimestamp,
Version: evaluate.Version,

CSVPath: "./evaluation.csv",
LogPath: "./evaluation.log",
SVGPath: "./categories.svg",

AssessmentPerModel: assessmentsPerModel,
TotalScore: totalScore,
}).WriteToFile(filepath.Join(command.ResultPath, "README.md")); err != nil {
return err
}

_ = metrics.WalkByScore(assessmentsPerModel, func(model string, assessment metrics.Assessments, score uint) error {
log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore).ID, assessment)

return nil
})
Expand Down
79 changes: 79 additions & 0 deletions cmd/eval-dev-quality/cmd/evaluate_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package cmd

import (
"fmt"
"os"
"path/filepath"
"sort"
Expand All @@ -12,13 +13,32 @@ import (
"github.com/zimmski/osutil"
"github.com/zimmski/osutil/bytesutil"

"github.com/symflower/eval-dev-quality/evaluate/metrics"
"github.com/symflower/eval-dev-quality/log"
)

// validateReportLinks asserts that the Markdown report data links to every companion report file, i.e. the SVG chart, the CSV export and the evaluation log.
func validateReportLinks(t *testing.T, data string) {
	expectedLinkTargets := []string{
		"](./categories.svg)",
		"](./evaluation.csv)",
		"](./evaluation.log)",
	}

	// Every target is checked even if an earlier one is missing.
	for _, linkTarget := range expectedLinkTargets {
		assert.Contains(t, data, linkTarget)
	}
}

// validateSVGContent asserts that the SVG data contains a closing text element for the name of each given category, and one for the maximal model count which is expected as an axis label.
func validateSVGContent(t *testing.T, data string, categories []*metrics.AssessmentCategory, maxModelCount uint) {
	for i := range categories {
		categoryLabel := fmt.Sprintf("%s</text>", categories[i].Name)
		assert.Contains(t, data, categoryLabel)
	}

	axisLabel := fmt.Sprintf("%d</text>", maxModelCount)
	assert.Contains(t, data, axisLabel)
}

func TestEvaluateExecute(t *testing.T) {
type testCase struct {
Name string

Before func(t *testing.T, resultPath string)
After func(t *testing.T, resultPath string)

Arguments []string

ExpectedOutputValidate func(t *testing.T, output string, resultPath string)
Expand All @@ -29,6 +49,13 @@ func TestEvaluateExecute(t *testing.T) {
t.Run(tc.Name, func(t *testing.T) {
temporaryPath := t.TempDir()

if tc.Before != nil {
tc.Before(t, temporaryPath)
}
if tc.After != nil {
defer tc.After(t, temporaryPath)
}

logOutput, logger := log.Buffer()
defer func() {
if t.Failed() {
Expand Down Expand Up @@ -84,13 +111,19 @@ func TestEvaluateExecute(t *testing.T) {
}
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"categories.svg": func(t *testing.T, filePath, data string) {
validateSVGContent(t, data, []*metrics.AssessmentCategory{metrics.AssessmentCategoryCodeNoExcess}, 1)
},
"evaluation.csv": func(t *testing.T, filePath, data string) {
assert.Equal(t, bytesutil.StringTrimIndentations(`
model,language,repository,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
symflower/symbolic-execution,golang,golang/plain,6,1,1,1,1,1,1
`), data)
},
"evaluation.log": nil,
"README.md": func(t *testing.T, filePath, data string) {
validateReportLinks(t, data)
},
"symflower_symbolic-execution/golang/golang/plain.log": nil,
},
})
Expand All @@ -108,6 +141,9 @@ func TestEvaluateExecute(t *testing.T) {
}
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"categories.svg": func(t *testing.T, filePath, data string) {
validateSVGContent(t, data, []*metrics.AssessmentCategory{metrics.AssessmentCategoryCodeNoExcess}, 1)
},
"evaluation.csv": func(t *testing.T, filePath, data string) {
assert.Equal(t, bytesutil.StringTrimIndentations(`
model,language,repository,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
Expand All @@ -116,6 +152,9 @@ func TestEvaluateExecute(t *testing.T) {
`), data)
},
"evaluation.log": nil,
"README.md": func(t *testing.T, filePath, data string) {
validateReportLinks(t, data)
},
"symflower_symbolic-execution/golang/golang/plain.log": nil,
"symflower_symbolic-execution/java/java/plain.log": nil,
},
Expand All @@ -140,13 +179,19 @@ func TestEvaluateExecute(t *testing.T) {
}
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"categories.svg": func(t *testing.T, filePath, data string) {
validateSVGContent(t, data, []*metrics.AssessmentCategory{metrics.AssessmentCategoryCodeNoExcess}, 1)
},
"evaluation.csv": func(t *testing.T, filePath, data string) {
assert.Equal(t, bytesutil.StringTrimIndentations(`
model,language,repository,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
symflower/symbolic-execution,golang,golang/plain,6,1,1,1,1,1,1
`), data)
},
"evaluation.log": nil,
"README.md": func(t *testing.T, filePath, data string) {
validateReportLinks(t, data)
},
"symflower_symbolic-execution/golang/golang/plain.log": nil,
},
})
Expand All @@ -165,16 +210,50 @@ func TestEvaluateExecute(t *testing.T) {
}
},
ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"categories.svg": func(t *testing.T, filePath, data string) {
validateSVGContent(t, data, []*metrics.AssessmentCategory{metrics.AssessmentCategoryCodeNoExcess}, 1)
},
"evaluation.csv": func(t *testing.T, filePath, data string) {
assert.Equal(t, bytesutil.StringTrimIndentations(`
model,language,repository,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
symflower/symbolic-execution,golang,golang/plain,6,1,1,1,1,1,1
`), data)
},
"evaluation.log": nil,
"README.md": func(t *testing.T, filePath, data string) {
validateReportLinks(t, data)
},
"symflower_symbolic-execution/golang/golang/plain.log": nil,
},
})
})
})

// This case checks a beautiful bug where the Markdown export crashed when the current working directory contained a README.md file. While this is not the case during the tests (as the current work directory is the directory of this file), it certainly caused problems when our binary was executed from the repository root (which of course contained a README.md). Therefore, we sadly have to modify the current work directory right within the tests of this case to reproduce the problem and fix it forever.
validate(t, &testCase{
Name: "Current work directory contains a README.md",

Before: func(t *testing.T, resultPath string) {
if err := os.Remove("README.md"); err != nil {
require.Contains(t, err.Error(), "no such file or directory")
}
require.NoError(t, os.WriteFile("README.md", []byte(""), 0644))
},
After: func(t *testing.T, resultPath string) {
require.NoError(t, os.Remove("README.md"))
},

Arguments: []string{
"--language", "golang",
"--model", "symflower/symbolic-execution",
},

ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
"categories.svg": nil,
"evaluation.csv": nil,
"evaluation.log": nil,
"README.md": nil,
"symflower_symbolic-execution/golang/golang/plain.log": nil,
},
})
}
78 changes: 68 additions & 10 deletions evaluate/metrics/category.go
Original file line number Diff line number Diff line change
@@ -1,30 +1,88 @@
package metrics

import "fmt"

// AssessmentCategory represents a categorical ranking of a model based on Assessments.
type AssessmentCategory string
type AssessmentCategory struct {
// ID holds the unique identifier of the category. Registering a second category with the same ID via "registerAssessmentCategory" panics.
ID string
// Name holds a short human-readable name of the category.
Name string
// Description holds a sentence describing which models fall into the category.
Description string
}

// AllAssessmentCategories holds all assessment categories.
var AllAssessmentCategories []*AssessmentCategory

// registerAssessmentCategory appends the given category to "AllAssessmentCategories" and returns the pointer that was stored.
// It panics if a category with the same ID has already been registered.
func registerAssessmentCategory(c AssessmentCategory) *AssessmentCategory {
	registered := &c

	for i := range AllAssessmentCategories {
		if AllAssessmentCategories[i].ID != registered.ID {
			continue
		}

		panic(fmt.Sprintf("duplicated category ID %q", registered.ID))
	}

	AllAssessmentCategories = append(AllAssessmentCategories, registered)

	return registered
}

// The assessment categories. Every call to "registerAssessmentCategory" below also appends the created category to "AllAssessmentCategories".
var (
// AssessmentCategoryUnknown indicates that it is not possible to compute a model's category.
AssessmentCategoryUnknown = AssessmentCategory("category-unknown")
AssessmentCategoryUnknown = registerAssessmentCategory(AssessmentCategory{
ID: "category-unknown",
Name: "category unknown",
Description: "Models in this category could not be categorized.",
})
// AssessmentCategoryResponseError indicates that a model has encountered an error trying to produce a response.
AssessmentCategoryResponseError = AssessmentCategory("response-error")
AssessmentCategoryResponseError = registerAssessmentCategory(AssessmentCategory{
ID: "response-error",
Name: "response error",
Description: "Models in this category encountered an error.",
})
// AssessmentCategoryResponseEmpty indicates that a model has returned an empty response.
AssessmentCategoryResponseEmpty = AssessmentCategory("response-empty")
AssessmentCategoryResponseEmpty = registerAssessmentCategory(AssessmentCategory{
ID: "response-empty",
Name: "response empty",
Description: "Models in this category produced an empty response.",
})
// AssessmentCategoryResponseNoCode indicates that a model's response did not contain any source code.
AssessmentCategoryResponseNoCode = AssessmentCategory("response-no-code")
AssessmentCategoryResponseNoCode = registerAssessmentCategory(AssessmentCategory{
ID: "response-no-code",
Name: "no code",
Description: "Models in this category produced no code.",
})
// AssessmentCategoryCodeInvalid indicates that a model's generated code produced an error when executed.
AssessmentCategoryCodeInvalid = AssessmentCategory("code-invalid")
AssessmentCategoryCodeInvalid = registerAssessmentCategory(AssessmentCategory{
ID: "code-invalid",
Name: "invalid code",
Description: "Models in this category produced invalid code.",
})
// AssessmentCategoryCodeExecuted indicates that a model's generated code could be executed without an error.
AssessmentCategoryCodeExecuted = AssessmentCategory("code-executed")
AssessmentCategoryCodeExecuted = registerAssessmentCategory(AssessmentCategory{
ID: "code-executed",
Name: "executable code",
Description: "Models in this category produced executable code.",
})
// AssessmentCategoryCodeCoverageStatementReached indicates that a model's generated code reached 100% statement coverage.
AssessmentCategoryCodeCoverageStatementReached = AssessmentCategory("code-coverage-statement")
AssessmentCategoryCodeCoverageStatementReached = registerAssessmentCategory(AssessmentCategory{
ID: "code-coverage-statement",
Name: "statement coverage reached",
Description: "Models in this category produced code that reached full statement coverage.",
})
// AssessmentCategoryCodeNoExcess indicates that a model's response did not contain more content than requested.
AssessmentCategoryCodeNoExcess = AssessmentCategory("code-no-excess")
AssessmentCategoryCodeNoExcess = registerAssessmentCategory(AssessmentCategory{
ID: "code-no-excess",
Name: "no excess response",
Description: "Models in this category did not respond with more content than requested.",
})
)

// Category infers a categorical ranking of a model based on assessment values.
// A model's overall category corresponds to the criterion where the model was consistently able to receive "total" amount of points. I.e. if there were 3 tasks in total and a model was able to produce executing code for all tasks, but only in one case the coverage goal was reached, then the category is only "CodeExecuted" because the coverage goal was not reached consistently.
func (a Assessments) Category(total uint) AssessmentCategory {
// The returned category is never "nil".
func (a Assessments) Category(total uint) *AssessmentCategory {
if total == 0 {
return AssessmentCategoryUnknown
}
Expand Down
2 changes: 1 addition & 1 deletion evaluate/metrics/category_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ func TestAssessmentsCategory(t *testing.T) {
Assessments Assessments
Total uint

ExpectedAssessmentCategory AssessmentCategory
ExpectedAssessmentCategory *AssessmentCategory
}

validate := func(t *testing.T, tc *testCase) {
Expand Down
Loading

0 comments on commit 2a84027

Please sign in to comment.