Merge pull request #57 from symflower/markdown-reporting

Reporting with Markdown file and SVG bar charts
symflower · Apr 26, 2024 · 2a84027 · 2a84027
2 parents 02f967b + dd8a259
commit 2a84027
Show file tree

Hide file tree

Showing 13 changed files with 715 additions and 14 deletions.
diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go
@@ -62,7 +62,8 @@ const repositoryPlainName = "plain"
 
 // Execute executes the command.
 func (command *Evaluate) Execute(args []string) (err error) {
-	command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", time.Now().Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
+	evaluationTimestamp := time.Now()
+	command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", evaluationTimestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
 	command.logger.Printf("Writing results to %s", command.ResultPath)
 
 	log, logClose, err := log.WithFile(command.logger, filepath.Join(command.ResultPath, "evaluation.log"))
@@ -273,8 +274,23 @@ func (command *Evaluate) Execute(args []string) (err error) {
 		totalScore = uint(len(languagesSelected))
 	}
 
-	_ = metrics.WalkByScore(assessments.Collapse(), func(model string, assessment metrics.Assessments, score uint) error {
-		log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore), assessment)
+	assessmentsPerModel := assessments.Collapse()
+	if err := (report.Markdown{
+		DateTime: evaluationTimestamp,
+		Version:  evaluate.Version,
+
+		CSVPath: "./evaluation.csv",
+		LogPath: "./evaluation.log",
+		SVGPath: "./categories.svg",
+
+		AssessmentPerModel: assessmentsPerModel,
+		TotalScore:         totalScore,
+	}).WriteToFile(filepath.Join(command.ResultPath, "README.md")); err != nil {
+		return err
+	}
+
+	_ = metrics.WalkByScore(assessmentsPerModel, func(model string, assessment metrics.Assessments, score uint) error {
+		log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore).ID, assessment)
 
 		return nil
 	})

diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go
@@ -1,6 +1,7 @@
 package cmd
 
 import (
+	"fmt"
 	"os"
 	"path/filepath"
 	"sort"
@@ -12,13 +13,32 @@ import (
 	"github.com/zimmski/osutil"
 	"github.com/zimmski/osutil/bytesutil"
 
+	"github.com/symflower/eval-dev-quality/evaluate/metrics"
 	"github.com/symflower/eval-dev-quality/log"
 )
 
+// validateReportLinks checks if the Markdown report data contains all the links to other relevant report files.
+func validateReportLinks(t *testing.T, data string) {
+	assert.Contains(t, data, "](./categories.svg)")
+	assert.Contains(t, data, "](./evaluation.csv)")
+	assert.Contains(t, data, "](./evaluation.log)")
+}
+
+// validateSVGContent checks if the SVG data contains all given categories and an axis label for the maximal model count.
+func validateSVGContent(t *testing.T, data string, categories []*metrics.AssessmentCategory, maxModelCount uint) {
+	for _, category := range categories {
+		assert.Contains(t, data, fmt.Sprintf("%s</text>", category.Name))
+	}
+	assert.Contains(t, data, fmt.Sprintf("%d</text>", maxModelCount))
+}
+
 func TestEvaluateExecute(t *testing.T) {
 	type testCase struct {
 		Name string
 
+		Before func(t *testing.T, resultPath string)
+		After  func(t *testing.T, resultPath string)
+
 		Arguments []string
 
 		ExpectedOutputValidate func(t *testing.T, output string, resultPath string)
@@ -29,6 +49,13 @@ func TestEvaluateExecute(t *testing.T) {
 		t.Run(tc.Name, func(t *testing.T) {
 			temporaryPath := t.TempDir()
 
+			if tc.Before != nil {
+				tc.Before(t, temporaryPath)
+			}
+			if tc.After != nil {
+				defer tc.After(t, temporaryPath)
+			}
+
 			logOutput, logger := log.Buffer()
 			defer func() {
 				if t.Failed() {
@@ -84,13 +111,19 @@ func TestEvaluateExecute(t *testing.T) {
 				}
 			},
 			ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
+				"categories.svg": func(t *testing.T, filePath, data string) {
+					validateSVGContent(t, data, []*metrics.AssessmentCategory{metrics.AssessmentCategoryCodeNoExcess}, 1)
+				},
 				"evaluation.csv": func(t *testing.T, filePath, data string) {
 					assert.Equal(t, bytesutil.StringTrimIndentations(`
 						model,language,repository,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
 						symflower/symbolic-execution,golang,golang/plain,6,1,1,1,1,1,1
 					`), data)
 				},
 				"evaluation.log": nil,
+				"README.md": func(t *testing.T, filePath, data string) {
+					validateReportLinks(t, data)
+				},
 				"symflower_symbolic-execution/golang/golang/plain.log": nil,
 			},
 		})
@@ -108,6 +141,9 @@ func TestEvaluateExecute(t *testing.T) {
 				}
 			},
 			ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
+				"categories.svg": func(t *testing.T, filePath, data string) {
+					validateSVGContent(t, data, []*metrics.AssessmentCategory{metrics.AssessmentCategoryCodeNoExcess}, 1)
+				},
 				"evaluation.csv": func(t *testing.T, filePath, data string) {
 					assert.Equal(t, bytesutil.StringTrimIndentations(`
 						model,language,repository,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
@@ -116,6 +152,9 @@ func TestEvaluateExecute(t *testing.T) {
 					`), data)
 				},
 				"evaluation.log": nil,
+				"README.md": func(t *testing.T, filePath, data string) {
+					validateReportLinks(t, data)
+				},
 				"symflower_symbolic-execution/golang/golang/plain.log": nil,
 				"symflower_symbolic-execution/java/java/plain.log":     nil,
 			},
@@ -140,13 +179,19 @@ func TestEvaluateExecute(t *testing.T) {
 					}
 				},
 				ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
+					"categories.svg": func(t *testing.T, filePath, data string) {
+						validateSVGContent(t, data, []*metrics.AssessmentCategory{metrics.AssessmentCategoryCodeNoExcess}, 1)
+					},
 					"evaluation.csv": func(t *testing.T, filePath, data string) {
 						assert.Equal(t, bytesutil.StringTrimIndentations(`
 							model,language,repository,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
 							symflower/symbolic-execution,golang,golang/plain,6,1,1,1,1,1,1
 						`), data)
 					},
 					"evaluation.log": nil,
+					"README.md": func(t *testing.T, filePath, data string) {
+						validateReportLinks(t, data)
+					},
 					"symflower_symbolic-execution/golang/golang/plain.log": nil,
 				},
 			})
@@ -165,16 +210,50 @@ func TestEvaluateExecute(t *testing.T) {
 					}
 				},
 				ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
+					"categories.svg": func(t *testing.T, filePath, data string) {
+						validateSVGContent(t, data, []*metrics.AssessmentCategory{metrics.AssessmentCategoryCodeNoExcess}, 1)
+					},
 					"evaluation.csv": func(t *testing.T, filePath, data string) {
 						assert.Equal(t, bytesutil.StringTrimIndentations(`
 							model,language,repository,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
 							symflower/symbolic-execution,golang,golang/plain,6,1,1,1,1,1,1
 						`), data)
 					},
 					"evaluation.log": nil,
+					"README.md": func(t *testing.T, filePath, data string) {
+						validateReportLinks(t, data)
+					},
 					"symflower_symbolic-execution/golang/golang/plain.log": nil,
 				},
 			})
 		})
 	})
+
+	// This case checks a beautiful bug where the Markdown export crashed when the current working directory contained a README.md file. While this is not the case during the tests (as the current working directory is the directory of this file), it certainly caused problems when our binary was executed from the repository root (which of course contained a README.md). Therefore, we sadly have to modify the current working directory right within the tests of this case to reproduce the problem and fix it forever.
+	validate(t, &testCase{
+		Name: "Current work directory contains a README.md",
+
+		Before: func(t *testing.T, resultPath string) {
+			if err := os.Remove("README.md"); err != nil {
+				require.Contains(t, err.Error(), "no such file or directory")
+			}
+			require.NoError(t, os.WriteFile("README.md", []byte(""), 0644))
+		},
+		After: func(t *testing.T, resultPath string) {
+			require.NoError(t, os.Remove("README.md"))
+		},
+
+		Arguments: []string{
+			"--language", "golang",
+			"--model", "symflower/symbolic-execution",
+		},
+
+		ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
+			"categories.svg": nil,
+			"evaluation.csv": nil,
+			"evaluation.log": nil,
+			"README.md":      nil,
+			"symflower_symbolic-execution/golang/golang/plain.log": nil,
+		},
+	})
 }
diff --git a/evaluate/metrics/category.go b/evaluate/metrics/category.go
@@ -1,30 +1,88 @@
 package metrics
 
+import "fmt"
+
 // AssessmentCategory represents a categorical ranking of a model based on Assessments.
-type AssessmentCategory string
+type AssessmentCategory struct {
+	// ID holds a unique identifier.
+	ID string
+	// Name holds a short name.
+	Name string
+	// Description holds the description.
+	Description string
+}
+
+// AllAssessmentCategories holds all assessment categories.
+var AllAssessmentCategories []*AssessmentCategory
+
+// registerAssessmentCategory registers a new assessment category.
+func registerAssessmentCategory(c AssessmentCategory) *AssessmentCategory {
+	for _, category := range AllAssessmentCategories {
+		if c.ID == category.ID {
+			panic(fmt.Sprintf("duplicated category ID %q", c.ID))
+		}
+	}
+
+	AllAssessmentCategories = append(AllAssessmentCategories, &c)
+
+	return &c
+}
 
 var (
 	// AssessmentCategoryUnknown indicates that it is not possible to compute a model's category.
-	AssessmentCategoryUnknown = AssessmentCategory("category-unknown")
+	AssessmentCategoryUnknown = registerAssessmentCategory(AssessmentCategory{
+		ID:          "category-unknown",
+		Name:        "category unknown",
+		Description: "Models in this category could not be categorized.",
+	})
 	// AssessmentCategoryResponseError indicates that a model has encountered an error trying to produce a response.
-	AssessmentCategoryResponseError = AssessmentCategory("response-error")
+	AssessmentCategoryResponseError = registerAssessmentCategory(AssessmentCategory{
+		ID:          "response-error",
+		Name:        "response error",
+		Description: "Models in this category encountered an error.",
+	})
 	// AssessmentCategoryResponseEmpty indicates that a model has returned an empty response.
-	AssessmentCategoryResponseEmpty = AssessmentCategory("response-empty")
+	AssessmentCategoryResponseEmpty = registerAssessmentCategory(AssessmentCategory{
+		ID:          "response-empty",
+		Name:        "response empty",
+		Description: "Models in this category produced an empty response.",
+	})
 	// AssessmentCategoryResponseNoCode indicates that a model's response did not contain any source code.
-	AssessmentCategoryResponseNoCode = AssessmentCategory("response-no-code")
+	AssessmentCategoryResponseNoCode = registerAssessmentCategory(AssessmentCategory{
+		ID:          "response-no-code",
+		Name:        "no code",
+		Description: "Models in this category produced no code.",
+	})
 	// AssessmentCategoryCodeInvalid indicates that a model's generated code produced an error when executed.
-	AssessmentCategoryCodeInvalid = AssessmentCategory("code-invalid")
+	AssessmentCategoryCodeInvalid = registerAssessmentCategory(AssessmentCategory{
+		ID:          "code-invalid",
+		Name:        "invalid code",
+		Description: "Models in this category produced invalid code.",
+	})
 	// AssessmentCategoryCodeExecuted indicates that a model's generated code could be executed without an error.
-	AssessmentCategoryCodeExecuted = AssessmentCategory("code-executed")
+	AssessmentCategoryCodeExecuted = registerAssessmentCategory(AssessmentCategory{
+		ID:          "code-executed",
+		Name:        "executable code",
+		Description: "Models in this category produced executable code.",
+	})
 	// AssessmentCategoryCodeCoverageStatementReached indicates that a model's generated code reached 100% statement coverage.
-	AssessmentCategoryCodeCoverageStatementReached = AssessmentCategory("code-coverage-statement")
+	AssessmentCategoryCodeCoverageStatementReached = registerAssessmentCategory(AssessmentCategory{
+		ID:          "code-coverage-statement",
+		Name:        "statement coverage reached",
+		Description: "Models in this category produced code that reached full statement coverage.",
+	})
 	// AssessmentCategoryCodeNoExcess indicates that a model's response did not contain more content than requested.
-	AssessmentCategoryCodeNoExcess = AssessmentCategory("code-no-excess")
+	AssessmentCategoryCodeNoExcess = registerAssessmentCategory(AssessmentCategory{
+		ID:          "code-no-excess",
+		Name:        "no excess response",
+		Description: "Models in this category did not respond with more content than requested.",
+	})
 )
 
 // Category infers a categorical ranking of a model based on assessment values.
 // A models overall category corresponds to the criterion where the model was consistently able to receive "total" amount of points. I.e. if there were 3 tasks in total and a model was able to produce executing code for all tasks, but only in one case the coverage goal was reached, then the category is only "CodeExecuted" because the coverage goal was not reached consistently.
-func (a Assessments) Category(total uint) AssessmentCategory {
+// The returned category is never "nil".
+func (a Assessments) Category(total uint) *AssessmentCategory {
 	if total == 0 {
 		return AssessmentCategoryUnknown
 	}

diff --git a/evaluate/metrics/category_test.go b/evaluate/metrics/category_test.go
@@ -13,7 +13,7 @@ func TestAssessmentsCategory(t *testing.T) {
 		Assessments Assessments
 		Total       uint
 
-		ExpectedAssessmentCategory AssessmentCategory
+		ExpectedAssessmentCategory *AssessmentCategory
 	}
 
 	validate := func(t *testing.T, tc *testCase) {