From 0c00a61a8770f57f0422109a0fa5ced982f5ced0 Mon Sep 17 00:00:00 2001
From: Simon Bauer
Date: Fri, 19 Apr 2024 17:07:00 +0200
Subject: [PATCH] WIP

---
 evaluate/report/markdown.go | 68 +++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 evaluate/report/markdown.go

diff --git a/evaluate/report/markdown.go b/evaluate/report/markdown.go
new file mode 100644
index 000000000..1c66f8cb2
--- /dev/null
+++ b/evaluate/report/markdown.go
@@ -0,0 +1,68 @@
+package report
+
+import (
+	"text/template"
+
+	"github.com/symflower/eval-dev-quality/evaluate/metrics"
+	"github.com/symflower/eval-dev-quality/model"
+	"github.com/zimmski/osutil/bytesutil"
+)
+
+// markdownTemplateContext holds the data for rendering the Markdown report template.
+type markdownTemplateContext struct {
+	// DateTime is rendered into the report title.
+	DateTime string
+	// Version is rendered as the benchmark version that generated the report.
+	Version string
+	// LogPath is rendered as the link target of the complete evaluation log.
+	LogPath string
+	// CVSPath is rendered as the link target of the detailed scoring.
+	// NOTE(review): the name looks like a typo of "CSVPath", it is kept so existing users of the field keep compiling.
+	CVSPath string
+
+	// Categories holds assessment categories, it is not yet referenced by the template.
+	Categories []metrics.AssessmentCategory
+	// ModelsPerCategory maps an assessment category to a model, it is not yet referenced by the template.
+	// NOTE(review): a category presumably holds several models, confirm if the value should be a slice.
+	ModelsPerCategory map[metrics.AssessmentCategory]model.Model
+}
+
+// markdownTemplate holds the template for a Markdown report.
+// It is parsed with "text/template" since the output is Markdown and must not be HTML-escaped.
+// Hugo shortcodes are opened with the escaped delimiter `{{"{{"}}` because a literal "{{<" would be lexed as a template action and make "template.Must" panic during package initialization.
+// The placeholders "$IMAGE", "$CATEGORY_NAME" and "$CATEGORY_EXPLANATION" have no field in "markdownTemplateContext" yet and are rendered as-is.
+var markdownTemplate = template.Must(template.New("template-report").Parse(bytesutil.StringTrimIndentations(`
+	# Evaluation from {{.DateTime}}
+
+	{{"{{"}}< img-blog name="assets/$IMAGE.svg" alt="Bar chart that categorizes all evaluated LLMs.">}}
+
+	This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in version {{.Version}}.
+
+	## Results
+
+	{{"{{"}}< info-block >}}
+	Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot.
+	{{"{{"}}</ info-block >}}
+
+	The results of all models have been divided into the following categories:
+
+	TODO
+
+	- **"no result"** e.g. we received an empty answer
+	- **"no code"** i.e. the model simply does not understand how to write Go code
+	- **"insufficient/unusable code"** e.g. a kind of random collection of statements that are maybe trying to test something but are not
+	- **"incorrect testing method"** e.g. multiple compile errors but mostly the testing code itself was wrong
+	- **"mistake (types, imports)"** i.e. simple compile errors (e.g. incorrect imports), additional code that does not align with the signature of the function under test 🧢
+	- **"passes (overengineered)"** i.e. code compiles and reaches 100% coverage but there is code that should not go through a code review 🎩
+	- **"perfect solution"** i.e. if at all, minor objections 👑
+
+	The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CVSPath}}).
+
+	### Result category: "$CATEGORY_NAME"
+
+	$CATEGORY_EXPLANATION
+
+	TODO
+
+	> -
+`)))