-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
14f9b93
commit 6f4046b
Showing
1 changed file
with
82 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
package report | ||
|
||
import ( | ||
"html/template" | ||
"io" | ||
"os" | ||
"path/filepath" | ||
"time" | ||
|
||
pkgerrors "github.com/pkg/errors" | ||
"github.com/zimmski/osutil/bytesutil" | ||
) | ||
|
||
// Markdown bundles every value that is substituted into the Markdown report
// template when a report is rendered.
type Markdown struct {
	// DateTime is the timestamp shown in the report heading.
	DateTime time.Time
	// Version is the benchmark version printed in the report introduction.
	Version string

	// CVSPath is the path of a file referenced by the report.
	// NOTE(review): presumably the CSV file with the detailed scoring, and the
	// name looks like a transposition of "CSV" - confirm before renaming, since
	// the field is part of the exported API.
	CVSPath string
	// LogPath is the path the report links to for the complete evaluation log.
	LogPath string
	// SVGPath is the name handed to the image shortcode for the bar chart.
	SVGPath string

	// Categories []metrics.AssessmentCategory
	// ModelsPerCategory map[metrics.AssessmentCategory]model.Model
}
|
||
// markdownTemplate holds the template for a Markdown report. | ||
var markdownTemplate = template.Must(template.New("template-report").Parse(bytesutil.StringTrimIndentations(` | ||
# Evaluation from {{.DateTime.Format "2006-01-02 15:04:05"}} | ||
{{"{{"}}< img-blog name="{{.SVGPath}}" alt="Bar chart that categorizes all evaluated LLMs.">{{"}}"}} | ||
This report was generated by [DevQualityEval benchmark](https://github.com/symflower/eval-dev-quality) in ` + "`" + ` version | ||
{{.Version}} ` + "`" + `. | ||
## Results | ||
{{< info-block >}} | ||
Keep in mind that LLMs are nondeterministic. The following results just reflect a current snapshot. | ||
{{</ info-block >}} | ||
The results of all models have been divided into the following categories: | ||
TODO | ||
- **"no result"** e.g. we received an empty answer | ||
- **"no code"** i.e. the model simply does not understand how to write Go code | ||
- **"insufficient/unusable code"** e.g. a kind of random collection of statements that are maybe trying to test something but are not | ||
- **"incorrect testing method"** e.g. multiple compile errors but mostly the testing code itself was wrong | ||
- **"mistake (types, imports)"** i.e. simple compile errors (e.g. incorrect imports), additional code that does not align with the signature of the function under test 🧢 | ||
- **"passes (overengineered)"** i.e. code compiles and reaches 100%% coverage but there is code that should not go through a code review 🎩 | ||
- **"perfect solution"** i.e. if at all, minor objections 👑 | ||
The following sections list all models with their categories. The complete log of the evaluation with all outputs can be found [here]({{.LogPath}}). Detailed scoring can be found [here]({{.CSVPath}}) | ||
### Result category: "$CATEGORY_NAME" | ||
$CATEGORY_EXPLANATION | ||
TODO | ||
> - | ||
`))) | ||
|
||
// Format formats the markdown values in the template to the given writer. | ||
func (t Markdown) Format(writer io.Writer) error { | ||
return pkgerrors.WithStack(markdownTemplate.Execute(writer, t)) | ||
} | ||
|
||
// WriteToFile writes the markdown values in the template to the given file. | ||
func (t Markdown) WriteToFile(path string) error { | ||
directory := filepath.Base(path) | ||
if err := os.MkdirAll(directory, 0755); err != nil { | ||
return pkgerrors.WithStack(err) | ||
} | ||
file, err := os.Create(path) | ||
if err != nil { | ||
return pkgerrors.WithStack(err) | ||
} | ||
|
||
return pkgerrors.WithStack(t.Format(file)) | ||
} |