From fcf95c49762a461ce3db4f43ab2438c984d64caa Mon Sep 17 00:00:00 2001 From: Rui Azevedo Date: Fri, 28 Jun 2024 10:36:20 +0100 Subject: [PATCH 1/2] refactor, Let the tasks return assessments per task identifiers, so each task can run the "symflower fix" and store its assessments separately Part of #213 --- evaluate/evaluate.go | 4 +- evaluate/metrics/testing/assessments.go | 42 +++++++++++++++++---- evaluate/report/collection.go | 7 ++++ evaluate/task/task-code-repair.go | 16 +++++--- evaluate/task/task-code-repair_test.go | 41 ++++++++++++-------- evaluate/task/task-write-test.go | 16 +++++--- evaluate/task/task-write-test_test.go | 9 +++-- evaluate/task/test-integration/task_test.go | 19 ++++++---- evaluate/task/testing/task.go | 4 +- task/task.go | 2 +- 10 files changed, 108 insertions(+), 52 deletions(-) diff --git a/evaluate/evaluate.go b/evaluate/evaluate.go index 2c708980..ca50401d 100644 --- a/evaluate/evaluate.go +++ b/evaluate/evaluate.go @@ -126,7 +126,7 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin } assessment, ps, err := task.Run(temporaryRepository) - assessments.Add(model, language, repositoryPath, taskIdentifier, assessment) + assessments.AddAssessmentPerTask(model, language, repositoryPath, assessment) if err != nil { ps = append(ps, err) } @@ -226,7 +226,7 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin } assessment, ps, err := task.Run(temporaryRepository) - assessments.Add(model, language, repositoryPath, taskIdentifier, assessment) + assessments.AddAssessmentPerTask(model, language, repositoryPath, assessment) problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...) 
if err != nil { ctx.Log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err) diff --git a/evaluate/metrics/testing/assessments.go b/evaluate/metrics/testing/assessments.go index dab179b4..3f652f15 100644 --- a/evaluate/metrics/testing/assessments.go +++ b/evaluate/metrics/testing/assessments.go @@ -1,10 +1,12 @@ package metricstesting import ( - "maps" "testing" + "golang.org/x/exp/maps" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/symflower/eval-dev-quality/evaluate/metrics" "github.com/symflower/eval-dev-quality/language" @@ -17,17 +19,41 @@ func AssertAssessmentsEqual(t *testing.T, expected metrics.Assessments, actual m expected = maps.Clone(expected) actual = maps.Clone(actual) - expected[metrics.AssessmentKeyProcessingTime] = 0 - actual[metrics.AssessmentKeyProcessingTime] = 0 - - expected[metrics.AssessmentKeyGenerateTestsForFileCharacterCount] = 0 - actual[metrics.AssessmentKeyGenerateTestsForFileCharacterCount] = 0 - expected[metrics.AssessmentKeyResponseCharacterCount] = 0 - actual[metrics.AssessmentKeyResponseCharacterCount] = 0 + clearNonDeterministicAssessmentValues(expected) + clearNonDeterministicAssessmentValues(actual) assert.Truef(t, expected.Equal(actual), "expected:%s\nactual:%s", expected, actual) } +// AssertTaskAssessmentsEqual checks if the given assessments per task are equal ignoring default and nondeterministic values. +func AssertTaskAssessmentsEqual(t *testing.T, expected map[task.Identifier]metrics.Assessments, actual map[task.Identifier]metrics.Assessments) { + expected = maps.Clone(expected) + actual = maps.Clone(actual) + + // The expected and actual maps must have the same task identifiers. + require.ElementsMatch(t, maps.Keys(expected), maps.Keys(actual)) + + // Ignore non-deterministic values. 
+ for _, assessment := range expected { + clearNonDeterministicAssessmentValues(assessment) + } + for _, assessment := range actual { + clearNonDeterministicAssessmentValues(assessment) + } + + for task, expectedAssessment := range expected { + actualAssessment := actual[task] + assert.Truef(t, expectedAssessment.Equal(actualAssessment), "task:%s\nexpected:%s\nactual:%s", task, expected, actual) + } +} + +// clearNonDeterministicAssessmentValues ignores non-deterministic values such as processing time and response character count. +func clearNonDeterministicAssessmentValues(assessment metrics.Assessments) { + assessment[metrics.AssessmentKeyProcessingTime] = 0 + assessment[metrics.AssessmentKeyGenerateTestsForFileCharacterCount] = 0 + assessment[metrics.AssessmentKeyResponseCharacterCount] = 0 +} + // AssessmentsWithProcessingTime is an empty assessment collection with positive processing time. var AssessmentsWithProcessingTime = metrics.Assessments{ metrics.AssessmentKeyProcessingTime: 1, diff --git a/evaluate/report/collection.go b/evaluate/report/collection.go index b7a1a280..33760a20 100644 --- a/evaluate/report/collection.go +++ b/evaluate/report/collection.go @@ -84,6 +84,13 @@ func (a *AssessmentStore) Add(model model.Model, l language.Language, repository assessments.Add(assessment) } +// AddAssessmentPerTask adds new assessments per task. +func (a *AssessmentStore) AddAssessmentPerTask(model model.Model, l language.Language, repositoryPath string, taskAssessment map[task.Identifier]metrics.Assessments) { + for taskIdentifier, assessment := range taskAssessment { + a.Add(model, l, repositoryPath, taskIdentifier, assessment) + } +} + // Walk walks over all entries. 
func (a *AssessmentStore) Walk(function func(m model.Model, l language.Language, r string, t task.Identifier, a metrics.Assessments) error) (err error) { models := maps.Keys(a.store) diff --git a/evaluate/task/task-code-repair.go b/evaluate/task/task-code-repair.go index 76e1ccfc..3b71b43d 100644 --- a/evaluate/task/task-code-repair.go +++ b/evaluate/task/task-code-repair.go @@ -52,7 +52,7 @@ func (t *TaskCodeRepair) Identifier() evaltask.Identifier { // Run performs source code repairing in a repository with compilation errors. // This task requires the repository to consist of multiple packages, with each containing one faulty implementation file and a corresponding test file. -func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessment metrics.Assessments, problems []error, err error) { +func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessment map[evaltask.Identifier]metrics.Assessments, problems []error, err error) { log, logClose, err := log.WithFile(t.Logger, filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID(), repository.Name()+".log")) if err != nil { return nil, nil, err @@ -75,7 +75,7 @@ func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessme } } - repositoryAssessment = metrics.NewAssessments() + modelAssessment := metrics.NewAssessments() for _, packagePath := range packagePaths { if err := repository.Reset(t.Logger); err != nil { t.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err) @@ -107,8 +107,8 @@ func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessme if assessments[metrics.AssessmentKeyProcessingTime] == 0 { return nil, nil, pkgerrors.Errorf("no model response time measurement present for %q at repository %q", t.Model.ID(), repository.Name()) } - repositoryAssessment.Add(assessments) - repositoryAssessment.Award(metrics.AssessmentKeyResponseNoError) + 
modelAssessment.Add(assessments) + modelAssessment.Award(metrics.AssessmentKeyResponseNoError) coverage, ps, err := t.Language.Execute(log, packagePath) problems = append(problems, ps...) @@ -118,8 +118,12 @@ func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessme continue } log.Printf("Executes tests with %d coverage objects", coverage) - repositoryAssessment.Award(metrics.AssessmentKeyFilesExecuted) - repositoryAssessment.AwardPoints(metrics.AssessmentKeyCoverage, coverage) + modelAssessment.Award(metrics.AssessmentKeyFilesExecuted) + modelAssessment.AwardPoints(metrics.AssessmentKeyCoverage, coverage) + } + + repositoryAssessment = map[evaltask.Identifier]metrics.Assessments{ + IdentifierCodeRepair: modelAssessment, } return repositoryAssessment, problems, nil diff --git a/evaluate/task/task-code-repair_test.go b/evaluate/task/task-code-repair_test.go index 7b893cd5..920f7fa0 100644 --- a/evaluate/task/task-code-repair_test.go +++ b/evaluate/task/task-code-repair_test.go @@ -13,6 +13,7 @@ import ( "github.com/symflower/eval-dev-quality/language/java" "github.com/symflower/eval-dev-quality/log" modeltesting "github.com/symflower/eval-dev-quality/model/testing" + "github.com/symflower/eval-dev-quality/task" "github.com/zimmski/osutil" "github.com/zimmski/osutil/bytesutil" ) @@ -71,10 +72,12 @@ func TestTaskCodeRepairRun(t *testing.T) { TestDataPath: temporaryDirectoryPath, RepositoryPath: filepath.Join("golang", "mistakes"), - ExpectedRepositoryAssessment: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 30, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, + ExpectedRepositoryAssessment: map[task.Identifier]metrics.Assessments{ + IdentifierCodeRepair: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 30, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + }, }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ 
filepath.Join(string(IdentifierCodeRepair), "mocked-model", "golang", "golang", "mistakes.log"): func(t *testing.T, filePath, data string) { @@ -134,10 +137,12 @@ func TestTaskCodeRepairRun(t *testing.T) { TestDataPath: temporaryDirectoryPath, RepositoryPath: filepath.Join("golang", "mistakes"), - ExpectedRepositoryAssessment: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 60, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyResponseNoError: 2, + ExpectedRepositoryAssessment: map[task.Identifier]metrics.Assessments{ + IdentifierCodeRepair: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 60, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + }, }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(IdentifierCodeRepair), "mocked-model", "golang", "golang", "mistakes.log"): func(t *testing.T, filePath, data string) { @@ -188,10 +193,12 @@ func TestTaskCodeRepairRun(t *testing.T) { TestDataPath: temporaryDirectoryPath, RepositoryPath: filepath.Join("java", "mistakes"), - ExpectedRepositoryAssessment: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 80, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, + ExpectedRepositoryAssessment: map[task.Identifier]metrics.Assessments{ + IdentifierCodeRepair: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 80, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + }, }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(IdentifierCodeRepair), "mocked-model", "java", "java", "mistakes.log"): func(t *testing.T, filePath, data string) { @@ -253,10 +260,12 @@ func TestTaskCodeRepairRun(t *testing.T) { TestDataPath: temporaryDirectoryPath, RepositoryPath: filepath.Join("java", "mistakes"), - ExpectedRepositoryAssessment: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 160, - 
metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyResponseNoError: 2, + ExpectedRepositoryAssessment: map[task.Identifier]metrics.Assessments{ + IdentifierCodeRepair: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 160, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + }, }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(IdentifierCodeRepair), "mocked-model", "java", "java", "mistakes.log"): func(t *testing.T, filePath, data string) { diff --git a/evaluate/task/task-write-test.go b/evaluate/task/task-write-test.go index f7fb1f55..2b991dea 100644 --- a/evaluate/task/task-write-test.go +++ b/evaluate/task/task-write-test.go @@ -43,7 +43,7 @@ func (t *TaskWriteTests) Identifier() evaltask.Identifier { } // TaskWriteTests generates test files for the given implementation file in a repository. -func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessment metrics.Assessments, problems []error, err error) { +func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessment map[evaltask.Identifier]metrics.Assessments, problems []error, err error) { dataPath := repository.DataPath() log, logClose, err := log.WithFile(t.Logger, filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID(), repository.Name()+".log")) @@ -62,7 +62,7 @@ func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessme return nil, problems, pkgerrors.WithStack(err) } - repositoryAssessment = metrics.NewAssessments() + modelAssessment := metrics.NewAssessments() for _, filePath := range filePaths { if err := repository.Reset(t.Logger); err != nil { t.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err) @@ -85,8 +85,8 @@ func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessme if assessments[metrics.AssessmentKeyProcessingTime] == 0 { 
return nil, nil, pkgerrors.Errorf("no model response time measurement present for %q at repository %q", t.Model.ID(), repository.Name()) } - repositoryAssessment.Add(assessments) - repositoryAssessment.Award(metrics.AssessmentKeyResponseNoError) + modelAssessment.Add(assessments) + modelAssessment.Award(metrics.AssessmentKeyResponseNoError) coverage, ps, err := t.Language.Execute(log, dataPath) problems = append(problems, ps...) @@ -96,8 +96,12 @@ func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessme continue } log.Printf("Executes tests with %d coverage objects", coverage) - repositoryAssessment.Award(metrics.AssessmentKeyFilesExecuted) - repositoryAssessment.AwardPoints(metrics.AssessmentKeyCoverage, coverage) + modelAssessment.Award(metrics.AssessmentKeyFilesExecuted) + modelAssessment.AwardPoints(metrics.AssessmentKeyCoverage, coverage) + } + + repositoryAssessment = map[evaltask.Identifier]metrics.Assessments{ + IdentifierWriteTests: modelAssessment, } return repositoryAssessment, problems, nil diff --git a/evaluate/task/task-write-test_test.go b/evaluate/task/task-write-test_test.go index 57b091e5..5b5b2cdc 100644 --- a/evaluate/task/task-write-test_test.go +++ b/evaluate/task/task-write-test_test.go @@ -13,6 +13,7 @@ import ( "github.com/symflower/eval-dev-quality/language/golang" "github.com/symflower/eval-dev-quality/log" modeltesting "github.com/symflower/eval-dev-quality/model/testing" + "github.com/symflower/eval-dev-quality/task" ) func TestTaskWriteTestsRun(t *testing.T) { @@ -59,9 +60,11 @@ func TestTaskWriteTestsRun(t *testing.T) { TestDataPath: temporaryDirectoryPath, RepositoryPath: filepath.Join("golang", "plain"), - ExpectedRepositoryAssessment: metrics.Assessments{ - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 2, + ExpectedRepositoryAssessment: map[task.Identifier]metrics.Assessments{ + IdentifierWriteTests: metrics.Assessments{ + metrics.AssessmentKeyFilesExecuted: 1, + 
metrics.AssessmentKeyResponseNoError: 2, + }, }, ExpectedProblemContains: []string{ "expected 'package', found does", diff --git a/evaluate/task/test-integration/task_test.go b/evaluate/task/test-integration/task_test.go index 9ea1481e..1c680a4a 100644 --- a/evaluate/task/test-integration/task_test.go +++ b/evaluate/task/test-integration/task_test.go @@ -12,6 +12,7 @@ import ( "github.com/symflower/eval-dev-quality/language/golang" "github.com/symflower/eval-dev-quality/log" "github.com/symflower/eval-dev-quality/model/symflower" + "github.com/symflower/eval-dev-quality/task" "github.com/symflower/eval-dev-quality/tools" toolstesting "github.com/symflower/eval-dev-quality/tools/testing" ) @@ -48,14 +49,16 @@ func TestTaskWriteTestsRun(t *testing.T) { TestDataPath: filepath.Join("..", "..", "..", "testdata"), RepositoryPath: filepath.Join("golang", "plain"), - ExpectedRepositoryAssessment: metrics.Assessments{ - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 254, - metrics.AssessmentKeyResponseCharacterCount: 254, - metrics.AssessmentKeyCoverage: 10, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, - metrics.AssessmentKeyResponseNoExcess: 1, - metrics.AssessmentKeyResponseWithCode: 1, + ExpectedRepositoryAssessment: map[task.Identifier]metrics.Assessments{ + evaluatetask.IdentifierWriteTests: metrics.Assessments{ + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 254, + metrics.AssessmentKeyResponseCharacterCount: 254, + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + }, }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), "symflower_symbolic-execution", "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { diff --git 
a/evaluate/task/testing/task.go b/evaluate/task/testing/task.go index 8d68b314..712959dc 100644 --- a/evaluate/task/testing/task.go +++ b/evaluate/task/testing/task.go @@ -24,7 +24,7 @@ type TestCaseTask struct { TestDataPath string RepositoryPath string - ExpectedRepositoryAssessment metrics.Assessments + ExpectedRepositoryAssessment map[evaltask.Identifier]metrics.Assessments ExpectedResultFiles map[string]func(t *testing.T, filePath string, data string) ExpectedProblemContains []string ExpectedError error @@ -33,7 +33,7 @@ type TestCaseTask struct { func (tc *TestCaseTask) Validate(t *testing.T, task evaltask.Task, repository evaltask.Repository, resultPath string) { actualRepositoryAssessment, actualProblems, actualErr := task.Run(repository) - metricstesting.AssertAssessmentsEqual(t, tc.ExpectedRepositoryAssessment, actualRepositoryAssessment) + metricstesting.AssertTaskAssessmentsEqual(t, tc.ExpectedRepositoryAssessment, actualRepositoryAssessment) if assert.Equal(t, len(tc.ExpectedProblemContains), len(actualProblems), "problems count") { for i, expectedProblem := range tc.ExpectedProblemContains { actualProblem := actualProblems[i] diff --git a/task/task.go b/task/task.go index 8df261ae..f0336d1d 100644 --- a/task/task.go +++ b/task/task.go @@ -39,7 +39,7 @@ type Task interface { Identifier() (identifier Identifier) // Run runs a task in a given repository. - Run(repository Repository) (assessments metrics.Assessments, problems []error, err error) + Run(repository Repository) (assessments map[Identifier]metrics.Assessments, problems []error, err error) } // Repository defines a repository to be evaluated. 
From eb9e3f053ba23bace88e79b18ad7cdf3a30dad92 Mon Sep 17 00:00:00 2001 From: Rui Azevedo Date: Fri, 28 Jun 2024 14:08:42 +0100 Subject: [PATCH 2/2] Apply "symflower fix" to a "write-test" result of a model when it errors, so model responses can possibly be fixed Closes #213 --- cmd/eval-dev-quality/cmd/evaluate_test.go | 282 ++++++++++++-------- evaluate/evaluate.go | 3 +- evaluate/evaluate_test.go | 162 ++++++++++- evaluate/metrics/assessment.go | 16 ++ evaluate/metrics/assessment_test.go | 51 ++++ evaluate/task/symflower-fix.go | 32 +++ evaluate/task/task-write-test.go | 56 +++- evaluate/task/task-write-test_test.go | 109 ++++++++ evaluate/task/task.go | 2 + evaluate/task/test-integration/task_test.go | 9 + tools/symflower.go | 2 +- 11 files changed, 594 insertions(+), 130 deletions(-) create mode 100644 evaluate/task/symflower-fix.go diff --git a/cmd/eval-dev-quality/cmd/evaluate_test.go b/cmd/eval-dev-quality/cmd/evaluate_test.go index 600f18c0..90b19ab5 100644 --- a/cmd/eval-dev-quality/cmd/evaluate_test.go +++ b/cmd/eval-dev-quality/cmd/evaluate_test.go @@ -199,17 +199,17 @@ func TestEvaluateExecute(t *testing.T) { ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) { actualAssessments := validateMetrics(t, extractMetricsLogsMatch, output, []metrics.Assessments{ metrics.Assessments{ - metrics.AssessmentKeyCoverage: 10, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, - metrics.AssessmentKeyResponseNoExcess: 1, - metrics.AssessmentKeyResponseWithCode: 1, + metrics.AssessmentKeyCoverage: 20, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + metrics.AssessmentKeyResponseNoExcess: 2, + metrics.AssessmentKeyResponseWithCode: 2, }, - }, []uint64{14}) + }, []uint64{28}) // Assert non-deterministic behavior. 
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ @@ -225,42 +225,52 @@ func TestEvaluateExecute(t *testing.T) { metrics.AssessmentKeyResponseNoExcess: 1, metrics.AssessmentKeyResponseWithCode: 1, }, - }, []uint64{14}) + metrics.Assessments{ + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + }, + }, []uint64{14, 14}) // Assert non-deterministic behavior. 
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + assert.Greater(t, actualAssessments[1][metrics.AssessmentKeyProcessingTime], uint64(0)) + assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) + assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) }, filepath.Join("result-directory", "evaluation.log"): nil, filepath.Join("result-directory", "golang-summed.csv"): func(t *testing.T, filePath, data string) { actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ metrics.Assessments{ - metrics.AssessmentKeyCoverage: 10, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, - metrics.AssessmentKeyResponseNoExcess: 1, - metrics.AssessmentKeyResponseWithCode: 1, + metrics.AssessmentKeyCoverage: 20, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + metrics.AssessmentKeyResponseNoExcess: 2, + metrics.AssessmentKeyResponseWithCode: 2, }, - }, []uint64{14}) + }, []uint64{28}) // Assert non-deterministic behavior. 
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) }, filepath.Join("result-directory", "models-summed.csv"): func(t *testing.T, filePath, data string) { actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ metrics.Assessments{ - metrics.AssessmentKeyCoverage: 10, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, - metrics.AssessmentKeyResponseNoExcess: 1, - metrics.AssessmentKeyResponseWithCode: 1, + metrics.AssessmentKeyCoverage: 20, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + metrics.AssessmentKeyResponseNoExcess: 2, + metrics.AssessmentKeyResponseWithCode: 2, }, - }, []uint64{14}) + }, []uint64{28}) // Assert non-deterministic behavior. 
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) }, filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath, data string) { validateReportLinks(t, data, []string{"symflower_symbolic-execution"}) @@ -280,17 +290,17 @@ func TestEvaluateExecute(t *testing.T) { ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) { actualAssessments := validateMetrics(t, extractMetricsLogsMatch, output, []metrics.Assessments{ metrics.Assessments{ - metrics.AssessmentKeyCoverage: 20, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyResponseNoError: 2, - metrics.AssessmentKeyResponseNoExcess: 2, - metrics.AssessmentKeyResponseWithCode: 2, + metrics.AssessmentKeyCoverage: 40, + metrics.AssessmentKeyFilesExecuted: 4, + metrics.AssessmentKeyResponseNoError: 4, + metrics.AssessmentKeyResponseNoExcess: 4, + metrics.AssessmentKeyResponseWithCode: 4, }, - }, []uint64{28}) + }, []uint64{56}) // Assert non-deterministic behavior. 
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(393)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(393)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(786)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(786)) assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ @@ -313,17 +323,6 @@ func TestEvaluateExecute(t *testing.T) { metrics.AssessmentKeyResponseNoExcess: 1, metrics.AssessmentKeyResponseWithCode: 1, }, - }, []uint64{14, 14}) - // Assert non-deterministic behavior. - assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) - assert.Greater(t, actualAssessments[1][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(139)) - assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyResponseCharacterCount], uint64(139)) - }, - filepath.Join("result-directory", "golang-summed.csv"): func(t *testing.T, filePath, data string) { - actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ metrics.Assessments{ metrics.AssessmentKeyCoverage: 10, metrics.AssessmentKeyFilesExecuted: 1, @@ -331,28 +330,44 @@ func TestEvaluateExecute(t *testing.T) { metrics.AssessmentKeyResponseNoExcess: 1, metrics.AssessmentKeyResponseWithCode: 1, }, - }, []uint64{14}) + metrics.Assessments{ + metrics.AssessmentKeyCoverage: 10, + 
metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + }, + }, []uint64{14, 14, 14, 14}) // Assert non-deterministic behavior. assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + assert.Greater(t, actualAssessments[1][metrics.AssessmentKeyProcessingTime], uint64(0)) + assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) + assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + assert.Greater(t, actualAssessments[2][metrics.AssessmentKeyProcessingTime], uint64(0)) + assert.Equal(t, actualAssessments[2][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(139)) + assert.Equal(t, actualAssessments[2][metrics.AssessmentKeyResponseCharacterCount], uint64(139)) + assert.Greater(t, actualAssessments[3][metrics.AssessmentKeyProcessingTime], uint64(0)) + assert.Equal(t, actualAssessments[3][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(139)) + assert.Equal(t, actualAssessments[3][metrics.AssessmentKeyResponseCharacterCount], uint64(139)) }, - filepath.Join("result-directory", "java-summed.csv"): func(t *testing.T, filePath, data string) { + filepath.Join("result-directory", "golang-summed.csv"): func(t *testing.T, filePath, data string) { actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ metrics.Assessments{ - metrics.AssessmentKeyCoverage: 10, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, - metrics.AssessmentKeyResponseNoExcess: 1, - metrics.AssessmentKeyResponseWithCode: 1, + metrics.AssessmentKeyCoverage: 20, + 
metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + metrics.AssessmentKeyResponseNoExcess: 2, + metrics.AssessmentKeyResponseWithCode: 2, }, - }, []uint64{14}) + }, []uint64{28}) // Assert non-deterministic behavior. assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(139)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(139)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) }, - filepath.Join("result-directory", "models-summed.csv"): func(t *testing.T, filePath, data string) { + filepath.Join("result-directory", "java-summed.csv"): func(t *testing.T, filePath, data string) { actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ metrics.Assessments{ metrics.AssessmentKeyCoverage: 20, @@ -364,8 +379,23 @@ func TestEvaluateExecute(t *testing.T) { }, []uint64{28}) // Assert non-deterministic behavior. 
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(393)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(393)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(278)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(278)) + }, + filepath.Join("result-directory", "models-summed.csv"): func(t *testing.T, filePath, data string) { + actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ + metrics.Assessments{ + metrics.AssessmentKeyCoverage: 40, + metrics.AssessmentKeyFilesExecuted: 4, + metrics.AssessmentKeyResponseNoError: 4, + metrics.AssessmentKeyResponseNoExcess: 4, + metrics.AssessmentKeyResponseWithCode: 4, + }, + }, []uint64{56}) + // Assert non-deterministic behavior. 
+ assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(786)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(786)) }, filepath.Join("result-directory", "evaluation.log"): nil, filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath, data string) { @@ -395,17 +425,17 @@ func TestEvaluateExecute(t *testing.T) { ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) { actualAssessments := validateMetrics(t, extractMetricsLogsMatch, output, []metrics.Assessments{ metrics.Assessments{ - metrics.AssessmentKeyCoverage: 10, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, - metrics.AssessmentKeyResponseNoExcess: 1, - metrics.AssessmentKeyResponseWithCode: 1, + metrics.AssessmentKeyCoverage: 20, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + metrics.AssessmentKeyResponseNoExcess: 2, + metrics.AssessmentKeyResponseWithCode: 2, }, - }, []uint64{14}) + }, []uint64{28}) // Assert non-deterministic behavior. 
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ @@ -421,42 +451,52 @@ func TestEvaluateExecute(t *testing.T) { metrics.AssessmentKeyResponseNoExcess: 1, metrics.AssessmentKeyResponseWithCode: 1, }, - }, []uint64{14}) + metrics.Assessments{ + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + }, + }, []uint64{14, 14}) // Assert non-deterministic behavior. 
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + assert.Greater(t, actualAssessments[1][metrics.AssessmentKeyProcessingTime], uint64(0)) + assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) + assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) }, filepath.Join("result-directory", "evaluation.log"): nil, filepath.Join("result-directory", "golang-summed.csv"): func(t *testing.T, filePath, data string) { actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ metrics.Assessments{ - metrics.AssessmentKeyCoverage: 10, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, - metrics.AssessmentKeyResponseNoExcess: 1, - metrics.AssessmentKeyResponseWithCode: 1, + metrics.AssessmentKeyCoverage: 20, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + metrics.AssessmentKeyResponseNoExcess: 2, + metrics.AssessmentKeyResponseWithCode: 2, }, - }, []uint64{14}) + }, []uint64{28}) // Assert non-deterministic behavior. 
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) }, filepath.Join("result-directory", "models-summed.csv"): func(t *testing.T, filePath, data string) { actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ metrics.Assessments{ - metrics.AssessmentKeyCoverage: 10, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, - metrics.AssessmentKeyResponseNoExcess: 1, - metrics.AssessmentKeyResponseWithCode: 1, + metrics.AssessmentKeyCoverage: 20, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + metrics.AssessmentKeyResponseNoExcess: 2, + metrics.AssessmentKeyResponseWithCode: 2, }, - }, []uint64{14}) + }, []uint64{28}) // Assert non-deterministic behavior. 
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) }, filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath, data string) { validateReportLinks(t, data, []string{"symflower_symbolic-execution"}) @@ -473,7 +513,7 @@ func TestEvaluateExecute(t *testing.T) { }, ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) { - assert.Regexp(t, `Evaluation score for "symflower/symbolic-execution" \("code-no-excess"\): cost=0.00, score=14, coverage=10, files-executed=1, generate-tests-for-file-character-count=254, processing-time=\d+, response-character-count=254, response-no-error=1, response-no-excess=1, response-with-code=1`, output) + assert.Regexp(t, `Evaluation score for "symflower/symbolic-execution" \("code-no-excess"\): cost=0.00, score=28, coverage=20, files-executed=2, generate-tests-for-file-character-count=508, processing-time=\d+, response-character-count=508, response-no-error=2, response-no-excess=2, response-with-code=2`, output) assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ @@ -489,42 +529,52 @@ func TestEvaluateExecute(t *testing.T) { metrics.AssessmentKeyResponseNoExcess: 1, metrics.AssessmentKeyResponseWithCode: 1, }, - }, []uint64{14}) + metrics.Assessments{ + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + 
}, + }, []uint64{14, 14}) // Assert non-deterministic behavior. assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + assert.Greater(t, actualAssessments[1][metrics.AssessmentKeyProcessingTime], uint64(0)) + assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) + assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) }, filepath.Join("result-directory", "evaluation.log"): nil, filepath.Join("result-directory", "golang-summed.csv"): func(t *testing.T, filePath, data string) { actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ metrics.Assessments{ - metrics.AssessmentKeyCoverage: 10, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, - metrics.AssessmentKeyResponseNoExcess: 1, - metrics.AssessmentKeyResponseWithCode: 1, + metrics.AssessmentKeyCoverage: 20, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + metrics.AssessmentKeyResponseNoExcess: 2, + metrics.AssessmentKeyResponseWithCode: 2, }, - }, []uint64{14}) + }, []uint64{28}) // Assert non-deterministic behavior. 
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) }, filepath.Join("result-directory", "models-summed.csv"): func(t *testing.T, filePath, data string) { actualAssessments := validateMetrics(t, extractMetricsCSVMatch, data, []metrics.Assessments{ metrics.Assessments{ - metrics.AssessmentKeyCoverage: 10, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, - metrics.AssessmentKeyResponseNoExcess: 1, - metrics.AssessmentKeyResponseWithCode: 1, + metrics.AssessmentKeyCoverage: 20, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + metrics.AssessmentKeyResponseNoExcess: 2, + metrics.AssessmentKeyResponseWithCode: 2, }, - }, []uint64{14}) + }, []uint64{28}) // Assert non-deterministic behavior. 
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(254)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(254)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(508)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(508)) }, filepath.Join("result-directory", "README.md"): func(t *testing.T, filePath, data string) { validateReportLinks(t, data, []string{"symflower_symbolic-execution"}) @@ -588,7 +638,7 @@ func TestEvaluateExecute(t *testing.T) { filepath.Join("result-directory", "evaluation.log"): func(t *testing.T, filePath, data string) { // Since the model is non-deterministic, we can only assert that the model did at least not error. assert.Contains(t, data, fmt.Sprintf(`Evaluation score for "ollama/%s"`, providertesting.OllamaTestModel)) - assert.Contains(t, data, "response-no-error=1") + assert.Contains(t, data, "response-no-error=2") assert.Contains(t, data, "preloading model") assert.Contains(t, data, "unloading model") }, @@ -638,7 +688,7 @@ func TestEvaluateExecute(t *testing.T) { filepath.Join("result-directory", "evaluation.log"): func(t *testing.T, filePath, data string) { // Since the model is non-deterministic, we can only assert that the model did at least not error. 
assert.Contains(t, data, fmt.Sprintf(`Evaluation score for "custom-ollama/%s"`, providertesting.OllamaTestModel)) - assert.Contains(t, data, "response-no-error=1") + assert.Contains(t, data, "response-no-error=2") }, filepath.Join("result-directory", "golang-summed.csv"): nil, filepath.Join("result-directory", "models-summed.csv"): nil, @@ -663,17 +713,17 @@ func TestEvaluateExecute(t *testing.T) { ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) { actualAssessments := validateMetrics(t, extractMetricsLogsMatch, output, []metrics.Assessments{ metrics.Assessments{ - metrics.AssessmentKeyCoverage: 30, - metrics.AssessmentKeyFilesExecuted: 3, - metrics.AssessmentKeyResponseNoError: 3, - metrics.AssessmentKeyResponseNoExcess: 3, - metrics.AssessmentKeyResponseWithCode: 3, + metrics.AssessmentKeyCoverage: 60, + metrics.AssessmentKeyFilesExecuted: 6, + metrics.AssessmentKeyResponseNoError: 6, + metrics.AssessmentKeyResponseNoExcess: 6, + metrics.AssessmentKeyResponseWithCode: 6, }, - }, []uint64{42}) + }, []uint64{84}) // Assert non-deterministic behavior. 
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(762)) - assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(762)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(1524)) + assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(1524)) assert.Equal(t, 1, strings.Count(output, "Evaluation score for")) }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ @@ -687,11 +737,21 @@ func TestEvaluateExecute(t *testing.T) { metrics.AssessmentKeyResponseNoExcess: 3, metrics.AssessmentKeyResponseWithCode: 3, }, - }, []uint64{42}) + metrics.Assessments{ + metrics.AssessmentKeyCoverage: 30, + metrics.AssessmentKeyFilesExecuted: 3, + metrics.AssessmentKeyResponseNoError: 3, + metrics.AssessmentKeyResponseNoExcess: 3, + metrics.AssessmentKeyResponseWithCode: 3, + }, + }, []uint64{42, 42}) // Assert non-deterministic behavior. 
assert.Greater(t, actualAssessments[0][metrics.AssessmentKeyProcessingTime], uint64(0)) assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(762)) assert.Equal(t, actualAssessments[0][metrics.AssessmentKeyResponseCharacterCount], uint64(762)) + assert.Greater(t, actualAssessments[1][metrics.AssessmentKeyProcessingTime], uint64(0)) + assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyGenerateTestsForFileCharacterCount], uint64(762)) + assert.Equal(t, actualAssessments[1][metrics.AssessmentKeyResponseCharacterCount], uint64(762)) }, filepath.Join("result-directory", "evaluation.log"): func(t *testing.T, filePath, data string) { assert.Contains(t, data, "Run 1/3") diff --git a/evaluate/evaluate.go b/evaluate/evaluate.go index ca50401d..ed11928f 100644 --- a/evaluate/evaluate.go +++ b/evaluate/evaluate.go @@ -249,7 +249,8 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin } } if isOnlyPlainRepositories { - totalScore = uint64(len(ctx.Languages)) * uint64(ctx.Runs) + // For every write-test task in the plain repository, each model is also executed with the `symflower fix` which results in double the total results. 
+ totalScore = 2 * uint64(len(ctx.Languages)) * uint64(ctx.Runs) } return assessments, totalScore diff --git a/evaluate/evaluate_test.go b/evaluate/evaluate_test.go index ec22a420..4bcd12e6 100644 --- a/evaluate/evaluate_test.go +++ b/evaluate/evaluate_test.go @@ -139,7 +139,7 @@ func TestEvaluate(t *testing.T) { return nil })) - assert.Equal(t, tc.ExpectedAssessments, actualAssessments) + assert.ElementsMatch(t, tc.ExpectedAssessments, actualAssessments) assert.Equal(t, tc.ExpectedTotalScore, actualTotalScore) if tc.ExpectedOutputValidate != nil { @@ -200,8 +200,15 @@ func TestEvaluate(t *testing.T) { Task: evaluatetask.IdentifierWriteTests, Assessment: metrics.Assessments{}, }, + &metricstesting.AssessmentTuple{ + Model: mockedModel, + Language: languageGolang, + RepositoryPath: repositoryPath, + Task: evaluatetask.IdentifierWriteTestsSymflowerFix, + Assessment: metrics.Assessments{}, + }, }, - ExpectedTotalScore: 1, + ExpectedTotalScore: 2, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), mockedModel.ID(), "golang", "golang", "plain.log"): nil, }, @@ -246,8 +253,15 @@ func TestEvaluate(t *testing.T) { Task: evaluatetask.IdentifierWriteTests, Assessment: metrics.Assessments{}, }, + &metricstesting.AssessmentTuple{ + Model: mockedModel, + Language: languageGolang, + RepositoryPath: repositoryPath, + Task: evaluatetask.IdentifierWriteTestsSymflowerFix, + Assessment: metrics.Assessments{}, + }, }, - ExpectedTotalScore: 1, + ExpectedTotalScore: 2, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { assert.Contains(t, data, ErrEmptyResponseFromModel.Error()) @@ -301,8 +315,19 @@ func TestEvaluate(t *testing.T) { metrics.AssessmentKeyResponseNoError: 1, }, }, + 
&metricstesting.AssessmentTuple{ + Model: mockedModel, + Language: languageGolang, + RepositoryPath: repositoryPath, + Task: evaluatetask.IdentifierWriteTestsSymflowerFix, + Assessment: map[metrics.AssessmentKey]uint64{ + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 14, + metrics.AssessmentKeyResponseCharacterCount: 14, + metrics.AssessmentKeyResponseNoError: 1, + }, + }, }, - ExpectedTotalScore: 1, + ExpectedTotalScore: 2, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { assert.Contains(t, data, "Attempt 1/3: "+ErrEmptyResponseFromModel.Error()) @@ -355,8 +380,19 @@ func TestEvaluate(t *testing.T) { metrics.AssessmentKeyResponseNoError: 1, }, }, + &metricstesting.AssessmentTuple{ + Model: mockedModel, + Language: languageGolang, + RepositoryPath: repositoryPath, + Task: evaluatetask.IdentifierWriteTestsSymflowerFix, + Assessment: map[metrics.AssessmentKey]uint64{ + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 14, + metrics.AssessmentKeyResponseCharacterCount: 14, + metrics.AssessmentKeyResponseNoError: 1, + }, + }, }, - ExpectedTotalScore: 1, + ExpectedTotalScore: 2, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { assert.Contains(t, data, "DONE 0 tests, 1 error") @@ -443,6 +479,17 @@ func TestEvaluate(t *testing.T) { metrics.AssessmentKeyResponseNoError: 1, }, }, + &metricstesting.AssessmentTuple{ + Model: mockedModel, + Language: languageGolang, + RepositoryPath: repositoryNextPath, + Task: evaluatetask.IdentifierWriteTestsSymflowerFix, + Assessment: map[metrics.AssessmentKey]uint64{ + 
metrics.AssessmentKeyCoverage: 0, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + }, + }, &metricstesting.AssessmentTuple{ Model: mockedModel, Language: languageGolang, @@ -454,6 +501,17 @@ func TestEvaluate(t *testing.T) { metrics.AssessmentKeyResponseNoError: 2, }, }, + &metricstesting.AssessmentTuple{ + Model: mockedModel, + Language: languageGolang, + RepositoryPath: repositoryPlainPath, + Task: evaluatetask.IdentifierWriteTestsSymflowerFix, + Assessment: map[metrics.AssessmentKey]uint64{ + metrics.AssessmentKeyCoverage: 0, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + }, + }, }, ExpectedTotalScore: 0, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ @@ -516,6 +574,17 @@ func TestEvaluate(t *testing.T) { metrics.AssessmentKeyResponseNoError: 2, }, }, + &metricstesting.AssessmentTuple{ + Model: mockedModel, + Language: languageGolang, + RepositoryPath: repositoryNextPath, + Task: evaluatetask.IdentifierWriteTestsSymflowerFix, + Assessment: map[metrics.AssessmentKey]uint64{ + metrics.AssessmentKeyCoverage: 0, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + }, + }, &metricstesting.AssessmentTuple{ Model: mockedModel, Language: languageGolang, @@ -527,6 +596,17 @@ func TestEvaluate(t *testing.T) { metrics.AssessmentKeyResponseNoError: 1, }, }, + &metricstesting.AssessmentTuple{ + Model: mockedModel, + Language: languageGolang, + RepositoryPath: repositoryPlainPath, + Task: evaluatetask.IdentifierWriteTestsSymflowerFix, + Assessment: map[metrics.AssessmentKey]uint64{ + metrics.AssessmentKeyCoverage: 0, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + }, + }, }, ExpectedTotalScore: 0, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ @@ -581,6 +661,13 @@ func TestEvaluate(t *testing.T) { Task: evaluatetask.IdentifierWriteTests, Assessment: 
map[metrics.AssessmentKey]uint64{}, }, + &metricstesting.AssessmentTuple{ + Model: mockedModel, + Language: languageGolang, + RepositoryPath: repositoryPlainPath, + Task: evaluatetask.IdentifierWriteTestsSymflowerFix, + Assessment: map[metrics.AssessmentKey]uint64{}, + }, }, ExpectedTotalScore: 0, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ @@ -634,8 +721,19 @@ func TestEvaluate(t *testing.T) { metrics.AssessmentKeyResponseNoError: 3, }, }, + &metricstesting.AssessmentTuple{ + Model: mockedModel, + Language: languageGolang, + RepositoryPath: repositoryPath, + Task: evaluatetask.IdentifierWriteTestsSymflowerFix, + Assessment: map[metrics.AssessmentKey]uint64{ + metrics.AssessmentKeyCoverage: 0, + metrics.AssessmentKeyFilesExecuted: 3, + metrics.AssessmentKeyResponseNoError: 3, + }, + }, }, - ExpectedTotalScore: 3, + ExpectedTotalScore: 6, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, }, @@ -690,8 +788,19 @@ func TestEvaluate(t *testing.T) { metrics.AssessmentKeyResponseNoError: 3, }, }, + &metricstesting.AssessmentTuple{ + Model: mockedModel, + Language: languageGolang, + RepositoryPath: repositoryPath, + Task: evaluatetask.IdentifierWriteTestsSymflowerFix, + Assessment: map[metrics.AssessmentKey]uint64{ + metrics.AssessmentKeyCoverage: 0, + metrics.AssessmentKeyFilesExecuted: 3, + metrics.AssessmentKeyResponseNoError: 3, + }, + }, }, - ExpectedTotalScore: 3, + ExpectedTotalScore: 6, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, }, @@ -776,8 +885,19 @@ func TestEvaluate(t *testing.T) { metrics.AssessmentKeyResponseNoError: 3, }, }, + 
&metricstesting.AssessmentTuple{ + Model: mockedModel, + Language: languageGolang, + RepositoryPath: repositoryPath, + Task: evaluatetask.IdentifierWriteTestsSymflowerFix, + Assessment: map[metrics.AssessmentKey]uint64{ + metrics.AssessmentKeyCoverage: 0, + metrics.AssessmentKeyFilesExecuted: 3, + metrics.AssessmentKeyResponseNoError: 3, + }, + }, }, - ExpectedTotalScore: 3, + ExpectedTotalScore: 6, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, }, @@ -845,8 +965,19 @@ func TestEvaluate(t *testing.T) { metrics.AssessmentKeyResponseNoError: 3, }, }, + &metricstesting.AssessmentTuple{ + Model: mockedModel, + Language: languageGolang, + RepositoryPath: repositoryPath, + Task: evaluatetask.IdentifierWriteTestsSymflowerFix, + Assessment: map[metrics.AssessmentKey]uint64{ + metrics.AssessmentKeyCoverage: 0, + metrics.AssessmentKeyFilesExecuted: 3, + metrics.AssessmentKeyResponseNoError: 3, + }, + }, }, - ExpectedTotalScore: 3, + ExpectedTotalScore: 6, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, }, @@ -895,8 +1026,19 @@ func TestEvaluate(t *testing.T) { metrics.AssessmentKeyResponseNoError: 1, }, }, + &metricstesting.AssessmentTuple{ + Model: mockedModel, + Language: languageGolang, + RepositoryPath: repositoryPath, + Task: evaluatetask.IdentifierWriteTestsSymflowerFix, + Assessment: map[metrics.AssessmentKey]uint64{ + metrics.AssessmentKeyCoverage: 0, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + }, + }, }, - ExpectedTotalScore: 1, + ExpectedTotalScore: 2, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ 
filepath.Join(string(evaluatetask.IdentifierWriteTests), evalmodel.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain.log"): nil, }, diff --git a/evaluate/metrics/assessment.go b/evaluate/metrics/assessment.go index 5b788ec8..a8b67974 100644 --- a/evaluate/metrics/assessment.go +++ b/evaluate/metrics/assessment.go @@ -152,3 +152,19 @@ func (a Assessments) StringCSV() (row []string) { return row } + +// CombineWithSymflowerFixAssessments combines the model assessments with the ones from "symflower fix". +func CombineWithSymflowerFixAssessments(model Assessments, fixed Assessments) (combined Assessments) { + combined = NewAssessments() + + combined[AssessmentKeyCoverage] = fixed[AssessmentKeyCoverage] + combined[AssessmentKeyFilesExecuted] = fixed[AssessmentKeyFilesExecuted] + combined[AssessmentKeyGenerateTestsForFileCharacterCount] = model[AssessmentKeyGenerateTestsForFileCharacterCount] + combined[AssessmentKeyProcessingTime] = model[AssessmentKeyProcessingTime] + fixed[AssessmentKeyProcessingTime] + combined[AssessmentKeyResponseCharacterCount] = model[AssessmentKeyResponseCharacterCount] + combined[AssessmentKeyResponseNoError] = model[AssessmentKeyResponseNoError] + combined[AssessmentKeyResponseNoExcess] = model[AssessmentKeyResponseNoExcess] + combined[AssessmentKeyResponseWithCode] = model[AssessmentKeyResponseWithCode] + + return combined +} diff --git a/evaluate/metrics/assessment_test.go b/evaluate/metrics/assessment_test.go index 1ae4fe32..c39e002d 100644 --- a/evaluate/metrics/assessment_test.go +++ b/evaluate/metrics/assessment_test.go @@ -272,3 +272,54 @@ func TestAssessmentsScore(t *testing.T) { ExpectedScore: uint64(9), }) } + +func TestCombineModelAndSymflowerFixAssessments(t *testing.T) { + type testCase struct { + Name string + + ModelAssessment Assessments + SymflowerFixAssessments Assessments + + ExpectedAssessments Assessments + } + + validate := func(t *testing.T, tc *testCase) { + t.Run(tc.Name, func(t *testing.T) { + 
actualAssessments := CombineWithSymflowerFixAssessments(tc.ModelAssessment, tc.SymflowerFixAssessments) + + assert.Equal(t, tc.ExpectedAssessments, actualAssessments) + }) + } + + validate(t, &testCase{ + Name: "Simple", + + ModelAssessment: Assessments{ + AssessmentKeyFilesExecuted: 1, + AssessmentKeyProcessingTime: uint64(200), + AssessmentKeyCoverage: 0, + AssessmentKeyResponseCharacterCount: 100, + AssessmentKeyGenerateTestsForFileCharacterCount: 50, + AssessmentKeyResponseNoError: 0, + AssessmentKeyResponseWithCode: 1, + AssessmentKeyResponseNoExcess: 1, + }, + SymflowerFixAssessments: Assessments{ + AssessmentKeyFilesExecuted: 1, + AssessmentKeyProcessingTime: uint64(100), + AssessmentKeyCoverage: 10, + AssessmentKeyResponseNoError: 1, + }, + + ExpectedAssessments: Assessments{ + AssessmentKeyFilesExecuted: 1, + AssessmentKeyProcessingTime: uint64(300), + AssessmentKeyCoverage: 10, + AssessmentKeyResponseCharacterCount: 100, + AssessmentKeyGenerateTestsForFileCharacterCount: 50, + AssessmentKeyResponseNoError: 0, + AssessmentKeyResponseWithCode: 1, + AssessmentKeyResponseNoExcess: 1, + }, + }) +} diff --git a/evaluate/task/symflower-fix.go b/evaluate/task/symflower-fix.go new file mode 100644 index 00000000..a441f4cb --- /dev/null +++ b/evaluate/task/symflower-fix.go @@ -0,0 +1,32 @@ +package task + +import ( + "context" + "time" + + pkgerrors "github.com/pkg/errors" + "github.com/symflower/eval-dev-quality/evaluate/metrics" + "github.com/symflower/eval-dev-quality/language" + "github.com/symflower/eval-dev-quality/log" + "github.com/symflower/eval-dev-quality/tools" + "github.com/symflower/eval-dev-quality/util" +) + +// symflowerFix runs the "symflower fix" command and returns its execution time in milliseconds. 
+func symflowerFix(logger *log.Logger, modelAssessment metrics.Assessments, repositoryPath string, language language.Language) (duration uint64, err error) { + start := time.Now() + _, err = util.CommandWithResult(context.Background(), logger, &util.Command{ + Command: []string{ + tools.SymflowerPath, "fix", + "--language", language.ID(), + "--workspace", repositoryPath, + }, + + Directory: repositoryPath, + }) + if err != nil { + return 0, pkgerrors.WithStack(err) + } + + return uint64(time.Since(start).Milliseconds()), nil +} diff --git a/evaluate/task/task-write-test.go b/evaluate/task/task-write-test.go index 2b991dea..9b7cb776 100644 --- a/evaluate/task/task-write-test.go +++ b/evaluate/task/task-write-test.go @@ -63,7 +63,11 @@ func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessme } modelAssessment := metrics.NewAssessments() + withSymflowerAssessment := metrics.NewAssessments() for _, filePath := range filePaths { + modelAssessmentForFile := metrics.NewAssessments() + withSymflowerAssessmentForFile := modelAssessmentForFile // The symflower assessment tracks how the model result can be improved in case of a failure, so just link to the model assessment until a failure actually happens. + if err := repository.Reset(t.Logger); err != nil { t.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err) } @@ -85,23 +89,61 @@ func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessme if assessments[metrics.AssessmentKeyProcessingTime] == 0 { return nil, nil, pkgerrors.Errorf("no model response time measurement present for %q at repository %q", t.Model.ID(), repository.Name()) } - modelAssessment.Add(assessments) - modelAssessment.Award(metrics.AssessmentKeyResponseNoError) + modelAssessmentForFile.Add(assessments) + modelAssessmentForFile.Award(metrics.AssessmentKeyResponseNoError) coverage, ps, err := t.Language.Execute(log, dataPath) problems = append(problems, ps...) 
if err != nil { problems = append(problems, pkgerrors.WithMessage(err, filePath)) - continue + // Run "symflower fix" if the model response fails to execute. + if t.Language.ID() == "golang" { // Currently we only support Go for "symflower fix". + log.Print("model response alone failed execution, attempting to fix with \"symflower fix \"") + + duration, err := symflowerFix(log, modelAssessment, dataPath, t.Language) + if err != nil { + problems = append(problems, err) + + modelAssessment.Add(modelAssessmentForFile) + withSymflowerAssessment.Add(withSymflowerAssessmentForFile) + + continue + } + + coverage, ps, err := t.Language.Execute(log, dataPath) + problems = append(problems, ps...) + if err != nil { + problems = append(problems, pkgerrors.WithMessage(err, "symflower fix")) + + modelAssessment.Add(modelAssessmentForFile) + withSymflowerAssessment.Add(withSymflowerAssessmentForFile) + + continue + } + log.Printf("with symflower repair: Executes tests with %d coverage objects", coverage) + + // Symflower was able to fix a failure so now update the assessment with the improved results. 
+ withSymflowerAssessmentForFile = metrics.NewAssessments() + withSymflowerAssessmentForFile[metrics.AssessmentKeyProcessingTime] = duration + withSymflowerAssessmentForFile.Award(metrics.AssessmentKeyFilesExecuted) + withSymflowerAssessmentForFile.AwardPoints(metrics.AssessmentKeyCoverage, coverage) + + withSymflowerAssessmentForFile = metrics.CombineWithSymflowerFixAssessments(modelAssessmentForFile, withSymflowerAssessmentForFile) + } + } else { + log.Printf("Executes tests with %d coverage objects", coverage) + modelAssessmentForFile.Award(metrics.AssessmentKeyFilesExecuted) + modelAssessmentForFile.AwardPoints(metrics.AssessmentKeyCoverage, coverage) } - log.Printf("Executes tests with %d coverage objects", coverage) - modelAssessment.Award(metrics.AssessmentKeyFilesExecuted) - modelAssessment.AwardPoints(metrics.AssessmentKeyCoverage, coverage) + + modelAssessment.Add(modelAssessmentForFile) + withSymflowerAssessment.Add(withSymflowerAssessmentForFile) } repositoryAssessment = map[evaltask.Identifier]metrics.Assessments{ - IdentifierWriteTests: modelAssessment, + IdentifierWriteTests: modelAssessment, + IdentifierWriteTestsSymflowerFix: withSymflowerAssessment, } return repositoryAssessment, problems, nil diff --git a/evaluate/task/task-write-test_test.go b/evaluate/task/task-write-test_test.go index 5b5b2cdc..03517352 100644 --- a/evaluate/task/task-write-test_test.go +++ b/evaluate/task/task-write-test_test.go @@ -14,6 +14,8 @@ import ( "github.com/symflower/eval-dev-quality/log" modeltesting "github.com/symflower/eval-dev-quality/model/testing" "github.com/symflower/eval-dev-quality/task" + "github.com/zimmski/osutil" + "github.com/zimmski/osutil/bytesutil" ) func TestTaskWriteTestsRun(t *testing.T) { @@ -65,9 +67,14 @@ func TestTaskWriteTestsRun(t *testing.T) { metrics.AssessmentKeyFilesExecuted: 1, metrics.AssessmentKeyResponseNoError: 2, }, + IdentifierWriteTestsSymflowerFix: metrics.Assessments{ + metrics.AssessmentKeyFilesExecuted: 1, + 
metrics.AssessmentKeyResponseNoError: 2, + }, }, ExpectedProblemContains: []string{ "expected 'package', found does", + "exit status 1", }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(IdentifierWriteTests), "mocked-model", "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { @@ -77,4 +84,106 @@ func TestTaskWriteTestsRun(t *testing.T) { }, }) }) + + t.Run("Symflower Fix", func(t *testing.T) { + t.Run("Go", func(t *testing.T) { + validateGo := func(t *testing.T, testName string, testFileContent string, expectedAssessments map[task.Identifier]metrics.Assessments, expectedProblems []string, assertTestsPass bool) { + temporaryDirectoryPath := t.TempDir() + repositoryPath := filepath.Join(temporaryDirectoryPath, "golang", "plain") + require.NoError(t, osutil.CopyTree(filepath.Join("..", "..", "testdata", "golang", "plain"), repositoryPath)) + + modelMock := modeltesting.NewMockModelNamed(t, "mocked-model") + modelMock.RegisterGenerateSuccess(t, IdentifierWriteTests, "plain_test.go", testFileContent, metricstesting.AssessmentsWithProcessingTime).Once() + + validate(t, &tasktesting.TestCaseTask{ + Name: testName, + + Model: modelMock, + Language: &golang.Language{}, + TestDataPath: temporaryDirectoryPath, + RepositoryPath: filepath.Join("golang", "plain"), + + ExpectedRepositoryAssessment: expectedAssessments, + ExpectedProblemContains: expectedProblems, + ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ + filepath.Join(string(IdentifierWriteTests), "mocked-model", "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { + assert.Contains(t, data, "Evaluating model \"mocked-model\"") + if assertTestsPass { + assert.Contains(t, data, "PASS: TestPlain") + } + }, + }, + }) + } + { + expectedAssessments := map[task.Identifier]metrics.Assessments{ + IdentifierWriteTests: metrics.Assessments{ + metrics.AssessmentKeyFilesExecuted: 1, + 
metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyCoverage: 10, + }, + IdentifierWriteTestsSymflowerFix: metrics.Assessments{ + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyCoverage: 10, + }, + } + validateGo(t, "Model generated correct test", bytesutil.StringTrimIndentations(` + package plain + + import "testing" + + func TestPlain(t *testing.T) { + plain() + } + `), expectedAssessments, nil, true) + } + { + expectedAssessments := map[task.Identifier]metrics.Assessments{ + IdentifierWriteTests: metrics.Assessments{ + metrics.AssessmentKeyResponseNoError: 1, + }, + IdentifierWriteTestsSymflowerFix: metrics.Assessments{ + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyCoverage: 10, + }, + } + expectedProblems := []string{ + "imported and not used", + } + validateGo(t, "Model generated test with unused import", bytesutil.StringTrimIndentations(` + package plain + + import ( + "testing" + "strings" + ) + + func TestPlain(t *testing.T) { + plain() + } + `), expectedAssessments, expectedProblems, true) + } + { + expectedAssessments := map[task.Identifier]metrics.Assessments{ + IdentifierWriteTests: metrics.Assessments{ + metrics.AssessmentKeyResponseNoError: 1, + }, + IdentifierWriteTestsSymflowerFix: metrics.Assessments{ + metrics.AssessmentKeyResponseNoError: 1, + }, + } + expectedProblems := []string{ + "expected declaration, found this", + "unable to format source code", + } + validateGo(t, "Model generated test that is unfixable", bytesutil.StringTrimIndentations(` + package plain + + this is not valid go code + `), expectedAssessments, expectedProblems, false) + } + }) + }) } diff --git a/evaluate/task/task.go b/evaluate/task/task.go index 5ddda6ef..cb422e10 100644 --- a/evaluate/task/task.go +++ b/evaluate/task/task.go @@ -33,6 +33,8 @@ func registerIdentifier(name string) (identifier evaltask.Identifier) { var ( // 
IdentifierWriteTests holds the identifier for the "write test" task. IdentifierWriteTests = registerIdentifier("write-tests") + // IdentifierWriteTestsSymflowerFix holds the identifier for the "write test" task with the "symflower fix" applied. + IdentifierWriteTestsSymflowerFix = registerIdentifier("write-tests-symflower-fix") // IdentifierCodeRepair holds the identifier for the "code repair" task. IdentifierCodeRepair = registerIdentifier("code-repair") ) diff --git a/evaluate/task/test-integration/task_test.go b/evaluate/task/test-integration/task_test.go index 1c680a4a..1fed8c94 100644 --- a/evaluate/task/test-integration/task_test.go +++ b/evaluate/task/test-integration/task_test.go @@ -59,6 +59,15 @@ func TestTaskWriteTestsRun(t *testing.T) { metrics.AssessmentKeyResponseNoExcess: 1, metrics.AssessmentKeyResponseWithCode: 1, }, + evaluatetask.IdentifierWriteTestsSymflowerFix: metrics.Assessments{ + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 254, + metrics.AssessmentKeyResponseCharacterCount: 254, + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + }, }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), "symflower_symbolic-execution", "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { diff --git a/tools/symflower.go b/tools/symflower.go index 8e08b212..c841a91d 100644 --- a/tools/symflower.go +++ b/tools/symflower.go @@ -91,7 +91,7 @@ func (*symflower) CheckVersion(logger *log.Logger, binaryPath string) (err error } // SymflowerVersionRequired holds the version of Symflower required for this revision of the evaluation. -const SymflowerVersionRequired = "37153" +const SymflowerVersionRequired = "38036" // RequiredVersion returns the required version of the tool. 
func (*symflower) RequiredVersion() string {