Markdown export

Part of #49
symflower · Apr 24, 2024 · 155ab34 · 155ab34
1 parent 6f19eb3
commit 155ab34
Show file tree

Hide file tree

Showing 5 changed files with 272 additions and 9 deletions.
diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go
@@ -21,6 +21,7 @@ import (
 	_ "github.com/symflower/eval-dev-quality/provider/openrouter" // Register provider.
 	_ "github.com/symflower/eval-dev-quality/provider/symflower"  // Register provider.
 	"github.com/symflower/eval-dev-quality/tools"
+	"github.com/symflower/eval-dev-quality/version"
 )
 
 // Evaluate holds the "evaluation" command.
@@ -60,10 +61,12 @@ const repositoryPlainName = "plain"
 
 // Execute executes the command.
 func (command *Evaluate) Execute(args []string) (err error) {
-	command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", time.Now().Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
+	evaluationTimestamp := time.Now()
+	command.ResultPath = strings.ReplaceAll(command.ResultPath, "%datetime%", evaluationTimestamp.Format("2006-01-02-15:04:05")) // REMARK Use a datetime format with a dash, so directories can be easily marked because they are only one group.
 	command.logger.Printf("Writing results to %s", command.ResultPath)
 
-	log, logClose, err := log.WithFile(command.logger, filepath.Join(command.ResultPath, "evaluation.log"))
+	logFilePath := filepath.Join(command.ResultPath, "evaluation.log")
+	log, logClose, err := log.WithFile(command.logger, logFilePath)
 	if err != nil {
 		return err
 	}
@@ -237,7 +240,8 @@ func (command *Evaluate) Execute(args []string) (err error) {
 	if err != nil {
 		log.Panicf("ERROR: could not create result summary: %s", err)
 	}
-	if err := os.WriteFile(filepath.Join(command.ResultPath, "evaluation.csv"), []byte(csv), 0644); err != nil {
+	csvReportPath := filepath.Join(command.ResultPath, "evaluation.csv")
+	if err := os.WriteFile(csvReportPath, []byte(csv), 0644); err != nil {
 		log.Panicf("ERROR: could not write result summary: %s", err)
 	}
 
@@ -255,7 +259,21 @@ func (command *Evaluate) Execute(args []string) (err error) {
 		totalScore = uint(len(languagesSelected))
 	}
 
-	_ = metrics.WalkByScore(assessments.Collapse(), func(model string, assessment metrics.Assessments, score uint) error {
+	assessmentsPerModel := assessments.Collapse()
+	if err := (report.Markdown{
+		DateTime: evaluationTimestamp,
+		Version:  version.Current,
+
+		CSVPath: csvReportPath,
+		LogPath: logFilePath,
+
+		AssessmentPerModel: assessmentsPerModel,
+		TotalScore:         totalScore,
+	}).WriteToFile(filepath.Join(command.ResultPath, "report.md")); err != nil {
+		return err
+	}
+
+	_ = metrics.WalkByScore(assessmentsPerModel, func(model string, assessment metrics.Assessments, score uint) error {
 		log.Printf("Evaluation score for %q (%q): %s", model, assessment.Category(totalScore).Name, assessment)
 
 		return nil

diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go
@@ -22,7 +22,7 @@ func TestEvaluateExecute(t *testing.T) {
 		Arguments []string
 
 		ExpectedOutputValidate func(t *testing.T, output string, resultPath string)
-		ExpectedResultFiles    map[string]func(t *testing.T, filePath string, data string)
+		ExpectedResultFiles    map[string]func(t *testing.T, resultPath string, filePath string, data string)
 	}
 
 	validate := func(t *testing.T, tc *testCase) {
@@ -54,7 +54,7 @@ func TestEvaluateExecute(t *testing.T) {
 				if validate != nil {
 					data, err := os.ReadFile(filepath.Join(temporaryPath, filePath))
 					if assert.NoError(t, err) {
-						validate(t, filePath, string(data))
+						validate(t, temporaryPath, filePath, string(data))
 					}
 				}
 			}
@@ -78,15 +78,19 @@ func TestEvaluateExecute(t *testing.T) {
 				t.Logf("Output: %s", output)
 			}
 		},
-		ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){
-			"evaluation.csv": func(t *testing.T, filePath, data string) {
+		ExpectedResultFiles: map[string]func(t *testing.T, resultPath string, filePath string, data string){
+			"evaluation.csv": func(t *testing.T, resultPath string, filePath, data string) {
 				assert.Equal(t, bytesutil.StringTrimIndentations(`
 					model,language,repository,score,coverage-statement,files-executed,response-no-error,response-no-excess,response-not-empty,response-with-code
 					symflower/symbolic-execution,golang,golang/plain,6,1,1,1,1,1,1
 				`), data)
 			},
 			"evaluation.log": nil,
-			"report.md":      nil,
+			"report.md": func(t *testing.T, resultPath string, filePath, data string) {
+				// Ensure the report links to the CSV file and logs.
+				assert.Contains(t, data, filepath.Join(resultPath, "evaluation.csv"))
+				assert.Contains(t, data, filepath.Join(resultPath, "evaluation.log"))
+			},
 			"symflower_symbolic-execution/golang/golang/plain.log": nil,
 		},
 	})

diff --git a/evaluate/report/markdown.go b/evaluate/report/markdown.go
@@ -0,0 +1,108 @@
+package report
+
+import (
+	"io"
+	"os"
+	"path/filepath"
+	"sort"
+	"text/template"
+	"time"
+
+	pkgerrors "github.com/pkg/errors"
+	"github.com/zimmski/osutil/bytesutil"
+
+	"github.com/symflower/eval-dev-quality/evaluate/metrics"
+)
+
+// Markdown holds the values for exporting a Markdown report.
+type Markdown struct {
+	// DateTime holds the timestamp of the evaluation.
+	DateTime time.Time
+	// Version holds the version of the evaluation tool.
+	Version string
+
+	// CSVPath holds the path of detailed CSV results.
+	CSVPath string
+	// LogPath holds the path of detailed logs.
+	LogPath string
+
+	// AssessmentPerModel holds the assessments of each model, keyed by the model's name.
+	AssessmentPerModel map[string]metrics.Assessments
+	// TotalScore holds the total reachable score per task.
+	// REMARK Used for category computation.
+	TotalScore uint
+}
+
+// markdownTemplateContext holds the context for rendering the template of a Markdown report.
+// Categories lists the categories to render and ModelsPerCategory maps each category to the names of its models.
+type markdownTemplateContext struct {
+	Markdown
+
+	Categories        []*metrics.AssessmentCategory
+	ModelsPerCategory map[*metrics.AssessmentCategory][]string
+}
+
+// markdownTemplate holds the template for a Markdown report.
+// It is executed with a "markdownTemplateContext" and renders the report heading, the list of all categories, the links to the log and CSV files, and one section for every category that has at least one model.
+var markdownTemplate = template.Must(template.New("template-report").Parse(bytesutil.StringTrimIndentations(`
+	# Evaluation from {{.DateTime.Format "2006-01-02 15:04:05"}}
+
+	This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version {{.Version}}` + "`" + `.
+
+	## Results
+
+	> Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
+
+	The results of all models have been divided into the following categories:
+	{{ range $category := .Categories -}}
+	- {{ $category.Name }}: {{ $category.Description }}
+	{{ end }}
+	The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CSVPath}}).
+
+	{{ range $category := .Categories -}}
+	{{ with $modelNames := index $.ModelsPerCategory $category -}}
+	### "{{ $category.Name }}"
+
+	{{ $category.Description }}
+
+	{{ range $modelName := $modelNames -}}
+	- ` + "`" + `{{ $modelName }}` + "`" + `
+	{{ end }}
+	{{ end }}
+	{{- end -}}
+`)))
+
+// Format renders the Markdown report into the given writer.
+// Every model is assigned to the category its assessments reach for the total score, and the models of each category are listed in alphabetical order so that the report is reproducible.
+// Errors of the template execution are returned with a stack trace.
+func (m Markdown) Format(writer io.Writer) error {
+	modelsPerCategory := make(map[*metrics.AssessmentCategory][]string, len(metrics.AllAssessmentCategories))
+	for model, assessment := range m.AssessmentPerModel {
+		category := assessment.Category(m.TotalScore)
+		modelsPerCategory[category] = append(modelsPerCategory[category], model)
+	}
+	// The iteration order of maps is random, so the collected model names must be sorted to render a deterministic report.
+	for _, models := range modelsPerCategory {
+		sort.Strings(models)
+	}
+	// TODO Generate svg using maybe https://github.com/wcharczuk/go-chart.
+
+	templateContext := markdownTemplateContext{
+		Markdown:          m,
+		Categories:        metrics.AllAssessmentCategories,
+		ModelsPerCategory: modelsPerCategory,
+	}
+
+	return pkgerrors.WithStack(markdownTemplate.Execute(writer, templateContext))
+}
+
+// WriteToFile renders the Markdown report into the file at the given path.
+// The CSV and log paths are converted to absolute paths before rendering, and the parent directory of the report file is created if it does not exist.
+// The receiver is a copy, so the caller's paths are left untouched.
+func (m Markdown) WriteToFile(path string) (err error) {
+	m.CSVPath, err = filepath.Abs(m.CSVPath)
+	if err != nil {
+		return pkgerrors.WithStack(err)
+	}
+	m.LogPath, err = filepath.Abs(m.LogPath)
+	if err != nil {
+		return pkgerrors.WithStack(err)
+	}
+
+	// Create the directory that contains the report, not a directory that is named like the report file.
+	if err = os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return pkgerrors.WithStack(err)
+	}
+	file, err := os.Create(path)
+	if err != nil {
+		return pkgerrors.WithStack(err)
+	}
+	defer func() {
+		// Closing can fail when buffered data cannot be written, so report that error unless an earlier one is already being returned.
+		if closeErr := file.Close(); closeErr != nil && err == nil {
+			err = pkgerrors.WithStack(closeErr)
+		}
+	}()
+
+	return pkgerrors.WithStack(m.Format(file))
+}
diff --git a/evaluate/report/markdown_test.go b/evaluate/report/markdown_test.go
@@ -0,0 +1,129 @@
+package report
+
+import (
+	"bytes"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"github.com/symflower/eval-dev-quality/evaluate/metrics"
+	"github.com/zimmski/osutil/bytesutil"
+)
+
+// TestMarkdownFormat checks that "Markdown.Format" renders the expected report for an evaluation without models and for an evaluation with models in different categories.
+func TestMarkdownFormat(t *testing.T) {
+	type testCase struct {
+		Name string
+
+		Markdown Markdown
+
+		ExpectedReport string
+		ExpectedError  error
+	}
+
+	validate := func(t *testing.T, tc *testCase) {
+		t.Run(tc.Name, func(t *testing.T) {
+			// Render into an in-memory buffer so the complete report can be compared.
+			var buffer bytes.Buffer
+			actualError := tc.Markdown.Format(&buffer)
+			assert.Equal(t, tc.ExpectedError, actualError)
+			actualReport := buffer.String()
+
+			assert.Equalf(t, bytesutil.StringTrimIndentations(tc.ExpectedReport), actualReport, "Full output:\n%s", actualReport)
+		})
+	}
+
+	// Use a fixed timestamp so the rendered heading is the same on every run.
+	testTimeString := "2000-01-01 00:00:00"
+	testTime, err := time.Parse(time.DateTime, testTimeString)
+	require.NoError(t, err)
+
+	validate(t, &testCase{
+		Name: "No Models",
+
+		Markdown: Markdown{
+			DateTime: testTime,
+			Version:  "1234",
+
+			CSVPath: "some/csv/path.csv",
+			LogPath: "some/log/path.log",
+		},
+
+		ExpectedReport: `
+			# Evaluation from 2000-01-01 00:00:00
+
+			This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version 1234` + "`" + `.
+
+			## Results
+
+			> Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
+
+			The results of all models have been divided into the following categories:
+			- Category Unknown: Models in this category could not be categorized.
+			- Response Error: Models in this category encountered an error.
+			- Response Empty: Models in this category produced an empty response.
+			- No Code: Models in this category produced no code.
+			- Invalid Code: Models in this category produced invalid code.
+			- Executable Code: Models in this category produced executable code.
+			- Statement Coverage Reached: Models in this category produced code that reached full statement coverage.
+			- No Excess Response: Models in this category did not respond with more content than requested.
+
+			The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here](some/log/path.log). Detailed scoring can be found [here](some/csv/path.csv).
+
+		`,
+	})
+
+	validate(t, &testCase{
+		Name: "Simple Models",
+
+		Markdown: Markdown{
+			DateTime: testTime,
+			Version:  "1234",
+
+			CSVPath: "some/csv/path.csv",
+			LogPath: "some/log/path.log",
+
+			TotalScore: 1,
+			AssessmentPerModel: map[string]metrics.Assessments{
+				"ModelResponseError": metrics.NewAssessments(),
+				"ModelNoCode": metrics.Assessments{
+					metrics.AssessmentKeyResponseNoError:  1,
+					metrics.AssessmentKeyResponseNotEmpty: 1,
+				},
+			},
+		},
+
+		ExpectedReport: `
+			# Evaluation from 2000-01-01 00:00:00
+
+			This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + `version 1234` + "`" + `.
+
+			## Results
+
+			> Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
+
+			The results of all models have been divided into the following categories:
+			- Category Unknown: Models in this category could not be categorized.
+			- Response Error: Models in this category encountered an error.
+			- Response Empty: Models in this category produced an empty response.
+			- No Code: Models in this category produced no code.
+			- Invalid Code: Models in this category produced invalid code.
+			- Executable Code: Models in this category produced executable code.
+			- Statement Coverage Reached: Models in this category produced code that reached full statement coverage.
+			- No Excess Response: Models in this category did not respond with more content than requested.
+
+			The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here](some/log/path.log). Detailed scoring can be found [here](some/csv/path.csv).
+
+			### "Response Error"
+
+			Models in this category encountered an error.
+
+			- ` + "`ModelResponseError`" + `
+
+			### "No Code"
+
+			Models in this category produced no code.
+
+			- ` + "`ModelNoCode`" + `
+
+		`,
+	})
+}
diff --git a/version/version.go b/version/version.go
@@ -0,0 +1,4 @@
+package version
+
+// Current holds the current version of the evaluation tool.
+var Current = "0.2.0"