From 0c00a61a8770f57f0422109a0fa5ced982f5ced0 Mon Sep 17 00:00:00 2001
From: Simon Bauer <simon.bauer@symflower.com>
Date: Fri, 19 Apr 2024 17:07:00 +0200
Subject: [PATCH] WIP

---
 evaluate/report/markdown.go | 56 +++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 evaluate/report/markdown.go

diff --git a/evaluate/report/markdown.go b/evaluate/report/markdown.go
new file mode 100644
index 000000000..1c66f8cb2
--- /dev/null
+++ b/evaluate/report/markdown.go
@@ -0,0 +1,56 @@
+package report
+
+import (
+	"html/template"
+
+	"github.com/symflower/eval-dev-quality/evaluate/metrics"
+	"github.com/symflower/eval-dev-quality/model"
+	"github.com/zimmski/osutil/bytesutil"
+)
+
+type markdownTemplateContext struct { // markdownTemplateContext bundles the values intended for rendering a Markdown report; field meanings below are inferred from names and need confirmation.
+	DateTime string // Presumably the evaluation timestamp shown in the report heading — TODO confirm the expected format.
+	Version  string // Presumably the benchmark version stated in the report — TODO confirm.
+	LogPath  string // Presumably the location of the complete evaluation log linked from the report — TODO confirm.
+	CVSPath  string // NOTE(review): likely a typo for "CSVPath" (the report text refers to a CSV path); renaming affects every user of this field.
+
+	Categories        []metrics.AssessmentCategory               // Assessment categories; whether the slice order is meaningful is not visible here.
+	ModelsPerCategory map[metrics.AssessmentCategory]model.Model // Maps a category to a single model.Model. NOTE(review): the name suggests several models per category — confirm whether the value should be a slice.
+}
+
+// markdownTemplate holds the template for a Markdown report; template actions use "[[" and "]]" as delimiters because the text contains literal Hugo-style "{{< ... >}}" shortcodes, which the default "{{" "}}" delimiters would try to parse as actions and thereby panic in template.Must at package initialization.
+var markdownTemplate = template.Must(template.New("template-report").Delims("[[", "]]").Parse(bytesutil.StringTrimIndentations(`
+	# Evaluation from $DATETIME
+
+	{{< img-blog name="assets/$IMAGE.svg" alt="Bar chart that categorizes all evaluated LLMs.">}}
+
+	This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in version $VERSION.
+
+	## Results
+
+	{{< info-block >}}
+	Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
+	{{</ info-block >}}
+
+	The results of all models have been divided into the following categories:
+
+	TODO
+
+	- **"no result"** e.g. we received an empty answer
+	- **"no code"** i.e. the model simply does not understand how to write Go code
+	- **"insufficient/unusable code"** e.g. a kind of random collection of statements that are maybe trying to test something but are not
+	- **"incorrect testing method"** e.g. multiple compile errors but mostly the testing code itself was wrong
+	- **"mistake (types, imports)"** i.e. simple compile errors (e.g. incorrect imports), additional code that does not align with the signature of the function under test 🧢
+	- **"passes (overengineered)"** i.e. code compiles and reaches 100% coverage but there is code that should not go through a code review 🎩
+	- **"perfect solution"** i.e. if at all, minor objections 👑
+
+	The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]($LOGPATH). Detailed scoring can be found [here]($CSVPATH)
+
+	### Result category: "$CATEGORY_NAME"
+
+	$CATEGORY_EXPLANATION
+
+	TODO
+
+	> -
+`))) // NOTE(review): html/template HTML-escapes interpolated values; text/template is probably the better fit for Markdown output.