diff --git a/cmd/eval-dev-quality/cmd/evaluate.go b/cmd/eval-dev-quality/cmd/evaluate.go index fdc714f1..cdac95e6 100644 --- a/cmd/eval-dev-quality/cmd/evaluate.go +++ b/cmd/eval-dev-quality/cmd/evaluate.go @@ -437,10 +437,6 @@ func (command *Evaluate) evaluateLocal(evaluationContext *evaluate.Context) (err return nil }) - if err := report.WriteCSVs(command.ResultPath); err != nil { - command.logger.Panicf("ERROR: %s", err) - } - return nil } diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go index a390b6dd..32bbc5da 100644 --- a/cmd/eval-dev-quality/cmd/evaluate_test.go +++ b/cmd/eval-dev-quality/cmd/evaluate_test.go @@ -246,38 +246,6 @@ func TestEvaluateExecute(t *testing.T) { assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) }, filepath.Join("result-directory", "evaluation.log"): nil, - filepath.Join("result-directory", "golang-summed.csv"): func(t *testing.T, filePath, data string) { - actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ - metrics.Assessments{ - metrics.AssessmentKeyCoverage: 20, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyResponseNoError: 2, - metrics.AssessmentKeyResponseNoExcess: 2, - metrics.AssessmentKeyResponseWithCode: 2, - }, - }, []uint64{28}) - // Assert non-deterministic behavior. - assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) - }, - filepath.Join("result-directory", "models-summed.csv"): func(t *testing.T, filePath, data string) { - actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ - metrics.Assessments{ - metrics.AssessmentKeyCoverage: 20, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyResponseNoError: 2, - metrics.AssessmentKeyResponseNoExcess: 2, - metrics.AssessmentKeyResponseWithCode: 2, - }, - }, []uint64{28}) - // Assert non-deterministic behavior. - assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) - }, filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath, data string) { validateReportLinks(t, data, []string{"symflower_symbolic-execution"}) }, @@ -363,54 +331,6 @@ func TestEvaluateExecute(t *testing.T) { assert.Equal(t, actualAssessments[3][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(139)) assert.Equal(t, actualAssessments[3][metrics.AssessmentKeyResponseCharacterCount], uint64(139)) }, - filepath.Join("result-directory", "golang-summed.csv"): func(t *testing.T, filePath, data string) { - actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ - metrics.Assessments{ - metrics.AssessmentKeyCoverage: 20, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyResponseNoError: 2, - metrics.AssessmentKeyResponseNoExcess: 2, - metrics.AssessmentKeyResponseWithCode: 2, - }, - }, []uint64{28}) - // Assert non-deterministic behavior. - assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) - }, - filepath.Join("result-directory", "java-summed.csv"): func(t *testing.T, filePath, data string) { - actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ - metrics.Assessments{ - metrics.AssessmentKeyCoverage: 20, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyResponseNoError: 2, - metrics.AssessmentKeyResponseNoExcess: 2, - metrics.AssessmentKeyResponseWithCode: 2, - }, - }, []uint64{28}) - // Assert non-deterministic behavior. - assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(278)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(278)) - }, - filepath.Join("result-directory", "models-summed.csv"): func(t *testing.T, filePath, data string) { - actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ - metrics.Assessments{ - metrics.AssessmentKeyCoverage: 40, - metrics.AssessmentKeyFilesExecuted: 4, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 4, - metrics.AssessmentKeyResponseNoError: 4, - metrics.AssessmentKeyResponseNoExcess: 4, - metrics.AssessmentKeyResponseWithCode: 4, - }, - }, []uint64{56}) - // Assert non-deterministic behavior. - assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(786)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(786)) - }, filepath.Join("result-directory", "evaluation.log"): nil, filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath, data string) { validateReportLinks(t, data, []string{"symflower_symbolic-execution"}) @@ -485,38 +405,6 @@ func TestEvaluateExecute(t *testing.T) { assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) }, filepath.Join("result-directory", "evaluation.log"): nil, - filepath.Join("result-directory", "golang-summed.csv"): func(t *testing.T, filePath, data string) { - actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ - metrics.Assessments{ - metrics.AssessmentKeyCoverage: 20, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyResponseNoError: 2, - metrics.AssessmentKeyResponseNoExcess: 2, - metrics.AssessmentKeyResponseWithCode: 2, - }, - }, []uint64{28}) - // Assert non-deterministic behavior. - assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) - }, - filepath.Join("result-directory", "models-summed.csv"): func(t *testing.T, filePath, data string) { - actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ - metrics.Assessments{ - metrics.AssessmentKeyCoverage: 20, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyResponseNoError: 2, - metrics.AssessmentKeyResponseNoExcess: 2, - metrics.AssessmentKeyResponseWithCode: 2, - }, - }, []uint64{28}) - // Assert non-deterministic behavior. - assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) - }, filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath, data string) { validateReportLinks(t, data, []string{"symflower_symbolic-execution"}) }, @@ -567,38 +455,6 @@ func TestEvaluateExecute(t *testing.T) { assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) }, filepath.Join("result-directory", "evaluation.log"): nil, - filepath.Join("result-directory", "golang-summed.csv"): func(t *testing.T, filePath, data string) { - actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ - metrics.Assessments{ - metrics.AssessmentKeyCoverage: 20, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyResponseNoError: 2, - metrics.AssessmentKeyResponseNoExcess: 2, - metrics.AssessmentKeyResponseWithCode: 2, - }, - }, []uint64{28}) - // Assert non-deterministic behavior. - assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) - }, - filepath.Join("result-directory", "models-summed.csv"): func(t *testing.T, filePath, data string) { - actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ - metrics.Assessments{ - metrics.AssessmentKeyCoverage: 20, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyResponseNoError: 2, - metrics.AssessmentKeyResponseNoExcess: 2, - metrics.AssessmentKeyResponseWithCode: 2, - }, - }, []uint64{28}) - // Assert non-deterministic behavior. - assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) - }, filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath, data string) { validateReportLinks(t, data, []string{"symflower_symbolic-execution"}) }, @@ -665,9 +521,7 @@ func TestEvaluateExecute(t *testing.T) { assert.Contains(t, data, "preloading model") assert.Contains(t, data, "unloading model") }, - filepath.Join("result-directory", "golang-summed.csv"): nil, - filepath.Join("result-directory", "models-summed.csv"): nil, - filepath.Join("result-directory", "README.md"): nil, + filepath.Join("result-directory", "README.md"): nil, filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "ollama_"+model.CleanModelNameForFileSystem(providertesting.OllamaTestModel), "golang", "golang", "plain.log"): nil, }, }) @@ -713,9 +567,7 @@ func TestEvaluateExecute(t *testing.T) { assert.Contains(t, data, fmt.Sprintf(`Evaluation score for "custom-ollama/%s"`, providertesting.OllamaTestModel)) assert.Contains(t, data, "response-no-error=2") }, - filepath.Join("result-directory", "golang-summed.csv"): nil, - filepath.Join("result-directory", "models-summed.csv"): nil, - filepath.Join("result-directory", "README.md"): nil, + filepath.Join("result-directory", "README.md"): nil, filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "custom-ollama_"+model.CleanModelNameForFileSystem(providertesting.OllamaTestModel), "golang", "golang", "plain.log"): nil, }, }) @@ -815,9 +667,7 @@ func TestEvaluateExecute(t *testing.T) { assert.Contains(t, data, "Run 2/3") assert.Contains(t, data, "Run 3/3") }, - filepath.Join("result-directory", "golang-summed.csv"): nil, - filepath.Join("result-directory", "models-summed.csv"): nil, - filepath.Join("result-directory", "README.md"): nil, + filepath.Join("result-directory", "README.md"): nil, filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "symflower_symbolic-execution", "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { assert.Equal(t, 3, strings.Count(data, `Evaluating model "symflower/symbolic-execution"`)) }, @@ -850,12 +700,10 @@ func TestEvaluateExecute(t *testing.T) { }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ - filepath.Join("result-directory", "categories.svg"): nil, - filepath.Join("result-directory", "evaluation.csv"): nil, - filepath.Join("result-directory", "evaluation.log"): nil, - filepath.Join("result-directory", "golang-summed.csv"): nil, - filepath.Join("result-directory", "models-summed.csv"): nil, - filepath.Join("result-directory", "README.md"): nil, + filepath.Join("result-directory", "categories.svg"): nil, + filepath.Join("result-directory", "evaluation.csv"): nil, + filepath.Join("result-directory", "evaluation.log"): nil, + filepath.Join("result-directory", "README.md"): nil, filepath.Join("result-directory", string(evaluatetask.IdentifierWriteTests), "symflower_symbolic-execution", "golang", "golang", "plain.log"): nil, }, }) @@ -873,12 +721,10 @@ func TestEvaluateExecute(t *testing.T) { }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ - filepath.Join("result-directory-0", "categories.svg"): nil, - filepath.Join("result-directory-0", "evaluation.csv"): nil, - filepath.Join("result-directory-0", "evaluation.log"): nil, - filepath.Join("result-directory-0", "golang-summed.csv"): nil, - filepath.Join("result-directory-0", "models-summed.csv"): nil, - filepath.Join("result-directory-0", "README.md"): nil, + filepath.Join("result-directory-0", "categories.svg"): nil, + filepath.Join("result-directory-0", "evaluation.csv"): nil, + filepath.Join("result-directory-0", "evaluation.log"): nil, + filepath.Join("result-directory-0", "README.md"): nil, filepath.Join("result-directory-0", string(evaluatetask.IdentifierWriteTests), "symflower_symbolic-execution", "golang", "golang", "plain.log"): nil, }, }) diff --git a/evaluate/report/csv.go b/evaluate/report/csv.go index d60d4871..0f7949a1 100644 --- a/evaluate/report/csv.go +++ b/evaluate/report/csv.go @@ -4,11 +4,8 @@ import ( "cmp" "encoding/csv" "io" - "os" - "path/filepath" "slices" "strconv" - "strings" pkgerrors "github.com/pkg/errors" "golang.org/x/exp/maps" @@ -19,14 +16,6 @@ import ( "github.com/symflower/eval-dev-quality/task" ) -// CSVFormatter defines a formatter for CSV data. -type CSVFormatter interface { - // Header returns the header description as a CSV row. - Header() (header []string) - // Rows returns all data as CSV rows. - Rows() (rows [][]string) -} - // EvaluationFile holds the evaluation CSV file writer. type EvaluationFile struct { // Holds the writer where the evaluation CSV is written to. @@ -72,217 +61,3 @@ func (e *EvaluationFile) WriteEvaluationRecord(model model.Model, language langu func evaluationHeader() (header []string) { return append([]string{"model-id", "language", "repository", "task", "score"}, metrics.AllAssessmentKeysStrings...) } - -// EvaluationRecord holds a line of the evaluation CSV. -type EvaluationRecord struct { - // ModelID holds the model id. - ModelID string - - // LanguageID holds the language id. - LanguageID string - - // Assessments holds the assessments of an entry. - Assessments metrics.Assessments -} - -// Clone clones an evaluation record. -func (e *EvaluationRecord) Clone() (new *EvaluationRecord) { - new = &EvaluationRecord{} - - new.ModelID = e.ModelID - new.LanguageID = e.LanguageID - new.Assessments = metrics.Merge(e.Assessments, nil) - - return new -} - -// EvaluationRecords holds all the evaluation records. -type EvaluationRecords []*EvaluationRecord - -// EvaluationRecordsPerModel holds the collection of evaluation records per model. -type EvaluationRecordsPerModel map[string]*EvaluationRecord - -// GroupByModel groups the evaluation records by model. -func (e EvaluationRecords) GroupByModel() EvaluationRecordsPerModel { - perModel := map[string]*EvaluationRecord{} - - for _, record := range e { - _, ok := perModel[record.ModelID] - if !ok { - perModel[record.ModelID] = record.Clone() - } else { - r := perModel[record.ModelID] - r.Assessments = metrics.Merge(r.Assessments, record.Assessments) - } - } - - return perModel -} - -// Header returns the header description as a CSV row. -func (EvaluationRecordsPerModel) Header() (header []string) { - return append([]string{"model-id", "score"}, metrics.AllAssessmentKeysStrings...) -} - -// Rows returns all data as CSV rows. -func (e EvaluationRecordsPerModel) Rows() (rows [][]string) { - models := maps.Keys(e) - slices.SortStableFunc(models, func(a, b string) int { - return cmp.Compare(a, b) - }) - - for _, model := range models { - record := e[model] - metrics := record.Assessments.StringCSV() - score := record.Assessments.Score() - - row := append([]string{record.ModelID, strconv.FormatUint(uint64(score), 10)}, metrics...) - rows = append(rows, row) - } - - return rows -} - -// EvaluationRecordsPerModel holds the collection of evaluation records per model. -type EvaluationRecordsPerLanguagePerModel map[string]EvaluationRecordsPerModel - -// GroupByLanguageAndModel groups the evaluation records by language and model. -func (e EvaluationRecords) GroupByLanguageAndModel() EvaluationRecordsPerLanguagePerModel { - perLanguageAndModel := map[string]EvaluationRecordsPerModel{} - - for _, record := range e { - perModel, ok := perLanguageAndModel[record.LanguageID] - if !ok { - perLanguageAndModel[record.LanguageID] = EvaluationRecordsPerModel{ - record.ModelID: record, - } - } else { - _, ok := perModel[record.ModelID] - if !ok { - perModel[record.ModelID] = record.Clone() - } else { - perModel[record.ModelID].Assessments = metrics.Merge(perModel[record.ModelID].Assessments, record.Assessments) - } - } - } - - return perLanguageAndModel -} - -// loadEvaluationRecords reads and returns the evaluation records from the evaluation CSV file. -func loadEvaluationRecords(evaluationFilePath string) (evaluationRecords EvaluationRecords, err error) { - evaluationFile, err := os.Open(evaluationFilePath) - if err != nil { - return nil, pkgerrors.WithStack(err) - } - defer evaluationFile.Close() - - reader := csv.NewReader(evaluationFile) - - // Check if the evaluation CSV header is correct. - if header, err := reader.Read(); err != nil { - return nil, pkgerrors.Wrap(err, "found error while reading evaluation file") - } else if strings.Join(header, ",") != strings.Join(evaluationHeader(), ",") { - return nil, pkgerrors.WithStack(pkgerrors.Errorf("expected header %+v\nfound header %+v", evaluationHeader(), header)) - } - - // Read the raw records from the evaluation CSV file. - records, err := reader.ReadAll() - if err != nil { - return nil, pkgerrors.WithStack(err) - } - - // Convert the raw records into assessments that can be easily manipulated. - evaluationRecords = EvaluationRecords{} - for _, record := range records { - evaluationRecord, err := convertRawRecordToEvaluationRecord(record) - if err != nil { - return nil, err - } - evaluationRecords = append(evaluationRecords, evaluationRecord) - } - - return evaluationRecords, nil -} - -// convertRawRecordToEvaluationRecord converts a raw CSV record into an evaluation record. -func convertRawRecordToEvaluationRecord(raw []string) (record *EvaluationRecord, err error) { - assessments := metrics.NewAssessments() - - modelID := raw[0] - if err != nil { - return nil, pkgerrors.WithStack(err) - } - - languageID := raw[1] - - rawMetrics := raw[5:] - for i, assessementKey := range metrics.AllAssessmentKeysStrings { - metric, err := strconv.ParseUint(rawMetrics[i], 10, 64) - if err != nil { - return nil, pkgerrors.WithStack(err) - } - - assessments[metrics.AssessmentKey(assessementKey)] = metric - } - - return &EvaluationRecord{ - ModelID: modelID, - - LanguageID: languageID, - - Assessments: assessments, - }, nil -} - -// generateCSV returns the whole CSV as string. -func generateCSV(formatter CSVFormatter) (csvData string, err error) { - var out strings.Builder - csv := csv.NewWriter(&out) - - if err := csv.Write(formatter.Header()); err != nil { - return "", pkgerrors.WithStack(err) - } - - for _, row := range formatter.Rows() { - if err := csv.Write(row); err != nil { - return "", pkgerrors.WithStack(err) - } - } - - csv.Flush() - - return out.String(), nil -} - -// WriteCSVs writes the various CSV reports to disk. -func WriteCSVs(resultPath string) (err error) { - evaluationRecords, err := loadEvaluationRecords(filepath.Join(resultPath, "evaluation.csv")) - if err != nil { - return err - } - - // Write the "models-summed.csv" containing the summary per model. - perModel := evaluationRecords.GroupByModel() - csvByModel, err := generateCSV(perModel) - if err != nil { - return pkgerrors.Wrap(err, "could not create models-summed.csv summary") - } - if err := os.WriteFile(filepath.Join(resultPath, "models-summed.csv"), []byte(csvByModel), 0644); err != nil { - return pkgerrors.Wrap(err, "could not write models-summed.csv summary") - } - - // Write the individual "language-summed.csv" containing the summary per model per language. - perLanguage := evaluationRecords.GroupByLanguageAndModel() - for language, modelsByLanguage := range perLanguage { - csvByLanguage, err := generateCSV(modelsByLanguage) - if err != nil { - return pkgerrors.Wrap(err, "could not create "+language+"-summed.csv summary") - } - if err := os.WriteFile(filepath.Join(resultPath, language+"-summed.csv"), []byte(csvByLanguage), 0644); err != nil { - return pkgerrors.Wrap(err, "could not write "+language+"-summed.csv summary") - } - } - - return nil -} diff --git a/evaluate/report/csv_test.go b/evaluate/report/csv_test.go index dcd4931b..d3f687f8 100644 --- a/evaluate/report/csv_test.go +++ b/evaluate/report/csv_test.go @@ -1,16 +1,12 @@ package report import ( - "os" - "path/filepath" "strings" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/zimmski/osutil" "github.com/zimmski/osutil/bytesutil" - "golang.org/x/exp/maps" "github.com/symflower/eval-dev-quality/evaluate/metrics" evaluatetask "github.com/symflower/eval-dev-quality/evaluate/task" @@ -19,84 +15,6 @@ import ( "github.com/symflower/eval-dev-quality/task" ) -func TestGenerateCSVForAssessmentPerModel(t *testing.T) { - type testCase struct { - Name string - - CSVFormatter CSVFormatter - - ExpectedString string - } - - validate := func(t *testing.T, tc *testCase) { - t.Run(tc.Name, func(t *testing.T) { - actualString, err := generateCSV(tc.CSVFormatter) - assert.NoError(t, err) - - assert.Equal(t, bytesutil.StringTrimIndentations(tc.ExpectedString), actualString) - }) - } - - validate(t, &testCase{ - Name: "Single empty model", - - CSVFormatter: EvaluationRecordsPerModel{ - "some-model-a": &EvaluationRecord{ - ModelID: "some-model-a", - LanguageID: "golang", - Assessments: metrics.NewAssessments(), - }, - }, - - ExpectedString: ` - model-id,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - some-model-a,0,0,0,0,0,0,0,0,0,0 - `, - }) - validate(t, &testCase{ - Name: "Multiple models with assessments", - - CSVFormatter: EvaluationRecordsPerModel{ - "some-model-a": &EvaluationRecord{ - ModelID: "some-model-a", - LanguageID: "golang", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 50, - metrics.AssessmentKeyResponseCharacterCount: 100, - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyResponseNoError: 3, - metrics.AssessmentKeyResponseNoExcess: 4, - metrics.AssessmentKeyResponseWithCode: 5, - metrics.AssessmentKeyProcessingTime: 200, - }, - }, - "some-model-b": &EvaluationRecord{ - ModelID: "some-model-b", - LanguageID: "java", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 100, - metrics.AssessmentKeyResponseCharacterCount: 200, - metrics.AssessmentKeyCoverage: 6, - metrics.AssessmentKeyFilesExecuted: 7, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 7, - metrics.AssessmentKeyResponseNoError: 8, - metrics.AssessmentKeyResponseNoExcess: 9, - metrics.AssessmentKeyResponseWithCode: 10, - metrics.AssessmentKeyProcessingTime: 400, - }, - }, - }, - - ExpectedString: ` - model-id,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - some-model-a,15,1,2,2,50,200,100,3,4,5 - some-model-b,40,6,7,7,100,400,200,8,9,10 - `, - }) -} - func TestNewEvaluationFile(t *testing.T) { var file strings.Builder _, err := NewEvaluationFile(&file) @@ -174,565 +92,3 @@ func TestWriteEvaluationRecord(t *testing.T) { `, }) } - -func TestLoadEvaluationRecords(t *testing.T) { - type testCase struct { - Name string - - Before func(resultPath string) - - ExpectedEvaluationRecords EvaluationRecords - ExpectedErr func(err error) - } - - validate := func(t *testing.T, tc *testCase) { - t.Run(tc.Name, func(t *testing.T) { - resultPath := t.TempDir() - - if tc.Before != nil { - tc.Before(resultPath) - } - - actualAssessments, actualErr := loadEvaluationRecords(filepath.Join(resultPath, "evaluation.csv")) - - if tc.ExpectedErr != nil { - tc.ExpectedErr(actualErr) - } else { - assert.NoError(t, actualErr) - assert.Equal(t, tc.ExpectedEvaluationRecords, actualAssessments) - } - }) - } - - validate(t, &testCase{ - Name: "Evaluation file does not exist", - - ExpectedErr: func(err error) { - if osutil.IsWindows() { - assert.ErrorContains(t, err, "The system cannot find the file specified") - } else { - assert.ErrorContains(t, err, "no such file or directory") - } - }, - }) - validate(t, &testCase{ - Name: "Evaluation file exists but it is empty", - - Before: func(resultPath string) { - file, err := os.Create(filepath.Join(resultPath, "evaluation.csv")) - require.NoError(t, err) - defer file.Close() - }, - - ExpectedErr: func(err error) { - assert.ErrorContains(t, err, "found error while reading evaluation file") - }, - }) - validate(t, &testCase{ - Name: "Evaluation file exists but with the wrong header", - - Before: func(resultPath string) { - header := bytesutil.StringTrimIndentations(` - model-id - `) - require.NoError(t, os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(header), 0644)) - }, - - ExpectedErr: func(err error) { - assert.ErrorContains(t, err, "found header [model-id]") - }, - }) - validate(t, &testCase{ - Name: "Single assessment", - - Before: func(resultPath string) { - fileContent := bytesutil.StringTrimIndentations(` - model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - openrouter/anthropic/claude-1.2,golang,golang/light,write-tests,982,750,18,18,70179,720571,71195,115,49,50 - `) - require.NoError(t, os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(fileContent), 0644)) - }, - - ExpectedEvaluationRecords: EvaluationRecords{ - &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "golang", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 750, - metrics.AssessmentKeyFilesExecuted: 18, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 18, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 70179, - metrics.AssessmentKeyProcessingTime: 720571, - metrics.AssessmentKeyResponseCharacterCount: 71195, - metrics.AssessmentKeyResponseNoError: 115, - metrics.AssessmentKeyResponseNoExcess: 49, - metrics.AssessmentKeyResponseWithCode: 50, - }, - }, - }, - }) - validate(t, &testCase{ - Name: "Multiple assessments", - - Before: func(resultPath string) { - fileContent := bytesutil.StringTrimIndentations(` - model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - openrouter/anthropic/claude-1.2,golang,golang/light,write-tests,982,750,18,18,70179,720571,71195,115,49,50 - openrouter/anthropic/claude-1.2,golang,golang/plain,write-tests,37,20,2,2,441,11042,523,5,5,5 - `) - require.NoError(t, os.WriteFile(filepath.Join(resultPath, "evaluation.csv"), []byte(fileContent), 0644)) - }, - - ExpectedEvaluationRecords: EvaluationRecords{ - &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "golang", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 750, - metrics.AssessmentKeyFilesExecuted: 18, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 18, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 70179, - metrics.AssessmentKeyProcessingTime: 720571, - metrics.AssessmentKeyResponseCharacterCount: 71195, - metrics.AssessmentKeyResponseNoError: 115, - metrics.AssessmentKeyResponseNoExcess: 49, - metrics.AssessmentKeyResponseWithCode: 50, - }, - }, - &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "golang", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 20, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 441, - metrics.AssessmentKeyProcessingTime: 11042, - metrics.AssessmentKeyResponseCharacterCount: 523, - metrics.AssessmentKeyResponseNoError: 5, - metrics.AssessmentKeyResponseNoExcess: 5, - metrics.AssessmentKeyResponseWithCode: 5, - }, - }, - }, - }) -} - -func TestEvaluationRecordsGroupByModel(t *testing.T) { - type testCase struct { - Name string - - EvaluationRecords EvaluationRecords - - ExpectedEvaluationRecords map[string]*EvaluationRecord - } - - validate := func(t *testing.T, tc *testCase) { - t.Run(tc.Name, func(t *testing.T) { - actualEvaluationRecords := tc.EvaluationRecords.GroupByModel() - - assert.ElementsMatch(t, maps.Keys(tc.ExpectedEvaluationRecords), maps.Keys(actualEvaluationRecords)) - - for modelID, expectedRecord := range tc.ExpectedEvaluationRecords { - actualRecord := actualEvaluationRecords[modelID] - assert.Equal(t, expectedRecord, actualRecord) - assert.Truef(t, expectedRecord.Assessments.Equal(actualRecord.Assessments), "model:%s\nexpected:%s\nactual:%s", modelID, tc.ExpectedEvaluationRecords, actualEvaluationRecords) - } - }) - } - - validate(t, &testCase{ - Name: "Single record", - - EvaluationRecords: EvaluationRecords{ - &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "golang", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, - metrics.AssessmentKeyProcessingTime: 4, - metrics.AssessmentKeyResponseCharacterCount: 5, - metrics.AssessmentKeyResponseNoError: 6, - metrics.AssessmentKeyResponseNoExcess: 7, - metrics.AssessmentKeyResponseWithCode: 8, - }, - }, - }, - ExpectedEvaluationRecords: map[string]*EvaluationRecord{ - "openrouter/anthropic/claude-1.2": &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "golang", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, - metrics.AssessmentKeyProcessingTime: 4, - metrics.AssessmentKeyResponseCharacterCount: 5, - metrics.AssessmentKeyResponseNoError: 6, - metrics.AssessmentKeyResponseNoExcess: 7, - metrics.AssessmentKeyResponseWithCode: 8, - }, - }, - }, - }) - validate(t, &testCase{ - Name: "Multiple records", - - EvaluationRecords: EvaluationRecords{ - &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "golang", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, - metrics.AssessmentKeyProcessingTime: 4, - metrics.AssessmentKeyResponseCharacterCount: 5, - metrics.AssessmentKeyResponseNoError: 6, - metrics.AssessmentKeyResponseNoExcess: 7, - metrics.AssessmentKeyResponseWithCode: 8, - }, - }, - &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "golang", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, - metrics.AssessmentKeyProcessingTime: 4, - metrics.AssessmentKeyResponseCharacterCount: 5, - metrics.AssessmentKeyResponseNoError: 6, - metrics.AssessmentKeyResponseNoExcess: 7, - metrics.AssessmentKeyResponseWithCode: 8, - }, - }, - &EvaluationRecord{ - ModelID: "ollama/codeqwen:latest", - LanguageID: "java", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, - metrics.AssessmentKeyProcessingTime: 4, - metrics.AssessmentKeyResponseCharacterCount: 5, - metrics.AssessmentKeyResponseNoError: 6, - metrics.AssessmentKeyResponseNoExcess: 7, - metrics.AssessmentKeyResponseWithCode: 8, - }, - }, - }, - ExpectedEvaluationRecords: map[string]*EvaluationRecord{ - "openrouter/anthropic/claude-1.2": &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "golang", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 2, - metrics.AssessmentKeyFilesExecuted: 4, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 4, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 6, - metrics.AssessmentKeyProcessingTime: 8, - metrics.AssessmentKeyResponseCharacterCount: 10, - metrics.AssessmentKeyResponseNoError: 12, - metrics.AssessmentKeyResponseNoExcess: 14, - metrics.AssessmentKeyResponseWithCode: 16, - }, - }, - "ollama/codeqwen:latest": &EvaluationRecord{ - ModelID: "ollama/codeqwen:latest", - LanguageID: "java", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, - metrics.AssessmentKeyProcessingTime: 4, - metrics.AssessmentKeyResponseCharacterCount: 5, - metrics.AssessmentKeyResponseNoError: 6, - metrics.AssessmentKeyResponseNoExcess: 7, - metrics.AssessmentKeyResponseWithCode: 8, - }, - }, - }, - }) -} - -func TestEvaluationRecordsGroupByLanguageAndModel(t *testing.T) { - type testCase struct { - Name string - - EvaluationRecords EvaluationRecords - - ExpectedEvaluationRecordsPerLanguagePerModel EvaluationRecordsPerLanguagePerModel - } - - validate := func(t *testing.T, tc *testCase) { - t.Run(tc.Name, func(t *testing.T) { - actualEvaluationRecordsPerLanguagePerModel := tc.EvaluationRecords.GroupByLanguageAndModel() - - assert.Equal(t, tc.ExpectedEvaluationRecordsPerLanguagePerModel, actualEvaluationRecordsPerLanguagePerModel) - }) - } - - validate(t, &testCase{ - Name: "Single record without assessments", - - EvaluationRecords: EvaluationRecords{ - &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "golang", - Assessments: metrics.NewAssessments(), - }, - }, - - ExpectedEvaluationRecordsPerLanguagePerModel: EvaluationRecordsPerLanguagePerModel{ - "golang": EvaluationRecordsPerModel{ - "openrouter/anthropic/claude-1.2": &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "golang", - Assessments: metrics.NewAssessments(), - }, - }, - }, - }) - validate(t, &testCase{ - Name: "Multiple records", - - EvaluationRecords: EvaluationRecords{ - &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "golang", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, - metrics.AssessmentKeyProcessingTime: 4, - metrics.AssessmentKeyResponseCharacterCount: 5, - metrics.AssessmentKeyResponseNoError: 6, - metrics.AssessmentKeyResponseNoExcess: 7, - metrics.AssessmentKeyResponseWithCode: 8, - }, - }, - &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "golang", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, - metrics.AssessmentKeyProcessingTime: 4, - metrics.AssessmentKeyResponseCharacterCount: 5, - metrics.AssessmentKeyResponseNoError: 6, - metrics.AssessmentKeyResponseNoExcess: 7, - metrics.AssessmentKeyResponseWithCode: 8, - }, - }, - &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "java", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, - metrics.AssessmentKeyProcessingTime: 4, - metrics.AssessmentKeyResponseCharacterCount: 5, - metrics.AssessmentKeyResponseNoError: 6, - metrics.AssessmentKeyResponseNoExcess: 7, - metrics.AssessmentKeyResponseWithCode: 8, - }, - }, - &EvaluationRecord{ - ModelID: "ollama/codeqwen:latest", - LanguageID: "golang", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, - metrics.AssessmentKeyProcessingTime: 4, - metrics.AssessmentKeyResponseCharacterCount: 5, - metrics.AssessmentKeyResponseNoError: 6, - metrics.AssessmentKeyResponseNoExcess: 7, - metrics.AssessmentKeyResponseWithCode: 8, - }, - }, - &EvaluationRecord{ - ModelID: "ollama/codeqwen:latest", - LanguageID: "java", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, - metrics.AssessmentKeyProcessingTime: 4, - metrics.AssessmentKeyResponseCharacterCount: 5, - metrics.AssessmentKeyResponseNoError: 6, - metrics.AssessmentKeyResponseNoExcess: 7, - metrics.AssessmentKeyResponseWithCode: 8, - }, - }, - }, - - ExpectedEvaluationRecordsPerLanguagePerModel: EvaluationRecordsPerLanguagePerModel{ - "golang": EvaluationRecordsPerModel{ - "openrouter/anthropic/claude-1.2": &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "golang", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 2, - metrics.AssessmentKeyFilesExecuted: 4, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 4, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 6, - metrics.AssessmentKeyProcessingTime: 8, - metrics.AssessmentKeyResponseCharacterCount: 10, - metrics.AssessmentKeyResponseNoError: 12, - metrics.AssessmentKeyResponseNoExcess: 14, - metrics.AssessmentKeyResponseWithCode: 16, - }, - }, - "ollama/codeqwen:latest": &EvaluationRecord{ - ModelID: "ollama/codeqwen:latest", - LanguageID: "golang", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, - metrics.AssessmentKeyProcessingTime: 4, - metrics.AssessmentKeyResponseCharacterCount: 5, - metrics.AssessmentKeyResponseNoError: 6, - metrics.AssessmentKeyResponseNoExcess: 7, - metrics.AssessmentKeyResponseWithCode: 8, - }, - }, - }, - "java": EvaluationRecordsPerModel{ - "openrouter/anthropic/claude-1.2": &EvaluationRecord{ - ModelID: "openrouter/anthropic/claude-1.2", - LanguageID: "java", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, - metrics.AssessmentKeyProcessingTime: 4, - metrics.AssessmentKeyResponseCharacterCount: 5, - metrics.AssessmentKeyResponseNoError: 6, - metrics.AssessmentKeyResponseNoExcess: 7, - metrics.AssessmentKeyResponseWithCode: 8, - }, - }, - "ollama/codeqwen:latest": &EvaluationRecord{ - ModelID: "ollama/codeqwen:latest", - LanguageID: "java", - Assessments: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 1, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 3, - metrics.AssessmentKeyProcessingTime: 4, - metrics.AssessmentKeyResponseCharacterCount: 5, - metrics.AssessmentKeyResponseNoError: 6, - metrics.AssessmentKeyResponseNoExcess: 7, - metrics.AssessmentKeyResponseWithCode: 8, - }, - }, - }, - }, - }) - -} - -func TestWriteCSVs(t *testing.T) { - type testCase struct { - Name string - - FileName string - - ExpectedFileContent string - } - - resultPath := t.TempDir() - - evaluationFilePath := filepath.Join(resultPath, "evaluation.csv") - evaluationFileContent := bytesutil.StringTrimIndentations(` - model-id,language,repository,task,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - openrouter/anthropic/claude-2.0,golang,golang/light,write-tests,24,1,2,2,3,4,5,6,7,8 - openrouter/anthropic/claude-2.0,golang,golang/plain,write-tests,24,1,2,2,3,4,5,6,7,8 - openrouter/anthropic/claude-2.0,java,java/light,write-tests,69,10,11,11,12,13,14,15,16,17 - openrouter/anthropic/claude-2.0,java,java/plain,write-tests,69,10,11,11,12,13,14,15,16,17 - openrouter/anthropic/claude-3-sonnet,golang,golang/light,write-tests,21,8,7,7,6,5,4,3,2,1 - openrouter/anthropic/claude-3-sonnet,golang,golang/plain,write-tests,21,8,7,7,6,5,4,3,2,1 - openrouter/anthropic/claude-3-sonnet,java,java/light,write-tests,69,10,11,11,12,13,14,15,16,17 - openrouter/anthropic/claude-3-sonnet,java,java/plain,write-tests,69,10,11,11,12,13,14,15,16,17 - openrouter/openai/gpt-4,golang,golang/light,write-tests,24,1,2,2,3,4,5,6,7,8 - openrouter/openai/gpt-4,golang,golang/plain,write-tests,24,1,2,2,3,4,5,6,7,8 - openrouter/openai/gpt-4,java,java/light,write-tests,24,1,2,2,3,4,5,6,7,8 - openrouter/openai/gpt-4,java,java/plain,write-tests,24,1,2,2,3,4,5,6,7,8 - `) - require.NoError(t, os.WriteFile(evaluationFilePath, []byte(evaluationFileContent), 0644)) - - err := WriteCSVs(resultPath) - require.NoError(t, err) - - validate := func(t *testing.T, tc *testCase) { - t.Run(tc.Name, func(t *testing.T) { - summedFilePath := filepath.Join(resultPath, tc.FileName) - - _, err = os.Stat(summedFilePath) - require.NoError(t, err) - - actualSummedFileContent, err := os.ReadFile(summedFilePath) - require.NoError(t, err) - - assert.Equal(t, bytesutil.StringTrimIndentations(tc.ExpectedFileContent), string(actualSummedFileContent)) - }) - } - - validate(t, &testCase{ - Name: "Models summed", - - FileName: "models-summed.csv", - - ExpectedFileContent: ` - model-id,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - openrouter/anthropic/claude-2.0,186,22,26,26,30,34,38,42,46,50 - openrouter/anthropic/claude-3-sonnet,180,36,36,36,36,36,36,36,36,36 - openrouter/openai/gpt-4,96,4,8,8,12,16,20,24,28,32 - `, - }) - validate(t, &testCase{ - Name: "Golang summed", - - FileName: "golang-summed.csv", - - ExpectedFileContent: ` - model-id,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - openrouter/anthropic/claude-2.0,48,2,4,4,6,8,10,12,14,16 - openrouter/anthropic/claude-3-sonnet,42,16,14,14,12,10,8,6,4,2 - openrouter/openai/gpt-4,48,2,4,4,6,8,10,12,14,16 - `, - }) - validate(t, &testCase{ - Name: "Java summed", - - FileName: "java-summed.csv", - - ExpectedFileContent: ` - model-id,score,coverage,files-executed,files-executed-maximum-reachable,generate-tests-for-file-character-count,processing-time,response-character-count,response-no-error,response-no-excess,response-with-code - openrouter/anthropic/claude-2.0,138,20,22,22,24,26,28,30,32,34 - openrouter/anthropic/claude-3-sonnet,138,20,22,22,24,26,28,30,32,34 - openrouter/openai/gpt-4,48,2,4,4,6,8,10,12,14,16 - `, - }) -}