diff --git a/evaluate/evaluate.go b/evaluate/evaluate.go index 2c708980..ca50401d 100644 --- a/evaluate/evaluate.go +++ b/evaluate/evaluate.go @@ -126,7 +126,7 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin } assessment, ps, err := task.Run(temporaryRepository) - assessments.Add(model, language, repositoryPath, taskIdentifier, assessment) + assessments.AddAssessmentPerTask(model, language, repositoryPath, assessment) if err != nil { ps = append(ps, err) } @@ -226,7 +226,7 @@ func Evaluate(ctx *Context) (assessments *report.AssessmentStore, totalScore uin } assessment, ps, err := task.Run(temporaryRepository) - assessments.Add(model, language, repositoryPath, taskIdentifier, assessment) + assessments.AddAssessmentPerTask(model, language, repositoryPath, assessment) problemsPerModel[modelID] = append(problemsPerModel[modelID], ps...) if err != nil { ctx.Log.Printf("ERROR: Model %q encountered a hard error for language %q, repository %q: %+v", modelID, languageID, repositoryPath, err) diff --git a/evaluate/metrics/testing/assessments.go b/evaluate/metrics/testing/assessments.go index dab179b4..3f652f15 100644 --- a/evaluate/metrics/testing/assessments.go +++ b/evaluate/metrics/testing/assessments.go @@ -1,10 +1,12 @@ package metricstesting import ( - "maps" "testing" + "golang.org/x/exp/maps" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/symflower/eval-dev-quality/evaluate/metrics" "github.com/symflower/eval-dev-quality/language" @@ -17,17 +19,41 @@ func AssertAssessmentsEqual(t *testing.T, expected metrics.Assessments, actual m expected = maps.Clone(expected) actual = maps.Clone(actual) - expected[metrics.AssessmentKeyProcessingTime] = 0 - actual[metrics.AssessmentKeyProcessingTime] = 0 - - expected[metrics.AssessmentKeyGenerateTestsForFileCharacterCount] = 0 - actual[metrics.AssessmentKeyGenerateTestsForFileCharacterCount] = 0 - expected[metrics.AssessmentKeyResponseCharacterCount] = 0 
-	actual[metrics.AssessmentKeyResponseCharacterCount] = 0
+	clearNonDeterministicAssessmentValues(expected)
+	clearNonDeterministicAssessmentValues(actual)
 
 	assert.Truef(t, expected.Equal(actual), "expected:%s\nactual:%s", expected, actual)
 }
 
+// AssertTaskAssessmentsEqual checks if the given assessments per task are equal ignoring default and nondeterministic values.
+// Both arguments must contain exactly the same task identifiers, otherwise the test fails right away.
+// The given assessments are never modified because every per-task assessment is copied before its non-deterministic values are reset.
+func AssertTaskAssessmentsEqual(t *testing.T, expected map[task.Identifier]metrics.Assessments, actual map[task.Identifier]metrics.Assessments) {
+	// The expected and actual maps must have the same task identifiers.
+	require.ElementsMatch(t, maps.Keys(expected), maps.Keys(actual))
+
+	for taskIdentifier, expectedAssessment := range expected {
+		// Cloning just the outer maps would still share the inner assessments with the caller.
+		// Hence copy each assessment so that resetting values below does not leak into the caller's data.
+		expectedAssessment = maps.Clone(expectedAssessment)
+		actualAssessment := maps.Clone(actual[taskIdentifier])
+
+		// Ignore non-deterministic values.
+		clearNonDeterministicAssessmentValues(expectedAssessment)
+		clearNonDeterministicAssessmentValues(actualAssessment)
+
+		// Report only the assessments of the failing task instead of the complete maps.
+		assert.Truef(t, expectedAssessment.Equal(actualAssessment), "task:%s\nexpected:%s\nactual:%s", taskIdentifier, expectedAssessment, actualAssessment)
+	}
+}
+
+// clearNonDeterministicAssessmentValues resets non-deterministic values such as processing time and response character count to zero in place.
+func clearNonDeterministicAssessmentValues(assessment metrics.Assessments) {
+	assessment[metrics.AssessmentKeyProcessingTime] = 0
+	assessment[metrics.AssessmentKeyGenerateTestsForFileCharacterCount] = 0
+	assessment[metrics.AssessmentKeyResponseCharacterCount] = 0
+}
+
 // AssessmentsWithProcessingTime is an empty assessment collection with positive processing time.
var AssessmentsWithProcessingTime = metrics.Assessments{ metrics.AssessmentKeyProcessingTime: 1, diff --git a/evaluate/report/collection.go b/evaluate/report/collection.go index b7a1a280..33760a20 100644 --- a/evaluate/report/collection.go +++ b/evaluate/report/collection.go @@ -84,6 +84,13 @@ func (a *AssessmentStore) Add(model model.Model, l language.Language, repository assessments.Add(assessment) } +// AddAssessmentPerTask adds the given assessments to the store by calling Add once for every task identifier in the map. +func (a *AssessmentStore) AddAssessmentPerTask(model model.Model, l language.Language, repositoryPath string, taskAssessment map[task.Identifier]metrics.Assessments) { + for taskIdentifier, assessment := range taskAssessment { + a.Add(model, l, repositoryPath, taskIdentifier, assessment) + } +} + // Walk walks over all entries. func (a *AssessmentStore) Walk(function func(m model.Model, l language.Language, r string, t task.Identifier, a metrics.Assessments) error) (err error) { models := maps.Keys(a.store) diff --git a/evaluate/task/task-code-repair.go b/evaluate/task/task-code-repair.go index 76e1ccfc..3b71b43d 100644 --- a/evaluate/task/task-code-repair.go +++ b/evaluate/task/task-code-repair.go @@ -52,7 +52,7 @@ func (t *TaskCodeRepair) Identifier() evaltask.Identifier { // Run performs source code repairing in a repository with compilation errors. // This task requires the repository to consist of multiple packages, with each containing one faulty implementation file and a corresponding test file. 
-func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessment metrics.Assessments, problems []error, err error) { +func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessment map[evaltask.Identifier]metrics.Assessments, problems []error, err error) { log, logClose, err := log.WithFile(t.Logger, filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID(), repository.Name()+".log")) if err != nil { return nil, nil, err @@ -75,7 +75,7 @@ func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessme } } - repositoryAssessment = metrics.NewAssessments() + modelAssessment := metrics.NewAssessments() for _, packagePath := range packagePaths { if err := repository.Reset(t.Logger); err != nil { t.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err) @@ -107,8 +107,8 @@ func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessme if assessments[metrics.AssessmentKeyProcessingTime] == 0 { return nil, nil, pkgerrors.Errorf("no model response time measurement present for %q at repository %q", t.Model.ID(), repository.Name()) } - repositoryAssessment.Add(assessments) - repositoryAssessment.Award(metrics.AssessmentKeyResponseNoError) + modelAssessment.Add(assessments) + modelAssessment.Award(metrics.AssessmentKeyResponseNoError) coverage, ps, err := t.Language.Execute(log, packagePath) problems = append(problems, ps...) 
@@ -118,8 +118,12 @@ func (t *TaskCodeRepair) Run(repository evaltask.Repository) (repositoryAssessme continue } log.Printf("Executes tests with %d coverage objects", coverage) - repositoryAssessment.Award(metrics.AssessmentKeyFilesExecuted) - repositoryAssessment.AwardPoints(metrics.AssessmentKeyCoverage, coverage) + modelAssessment.Award(metrics.AssessmentKeyFilesExecuted) + modelAssessment.AwardPoints(metrics.AssessmentKeyCoverage, coverage) + } + + repositoryAssessment = map[evaltask.Identifier]metrics.Assessments{ + IdentifierCodeRepair: modelAssessment, } return repositoryAssessment, problems, nil diff --git a/evaluate/task/task-code-repair_test.go b/evaluate/task/task-code-repair_test.go index 7b893cd5..920f7fa0 100644 --- a/evaluate/task/task-code-repair_test.go +++ b/evaluate/task/task-code-repair_test.go @@ -13,6 +13,7 @@ import ( "github.com/symflower/eval-dev-quality/language/java" "github.com/symflower/eval-dev-quality/log" modeltesting "github.com/symflower/eval-dev-quality/model/testing" + "github.com/symflower/eval-dev-quality/task" "github.com/zimmski/osutil" "github.com/zimmski/osutil/bytesutil" ) @@ -71,10 +72,12 @@ func TestTaskCodeRepairRun(t *testing.T) { TestDataPath: temporaryDirectoryPath, RepositoryPath: filepath.Join("golang", "mistakes"), - ExpectedRepositoryAssessment: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 30, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, + ExpectedRepositoryAssessment: map[task.Identifier]metrics.Assessments{ + IdentifierCodeRepair: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 30, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + }, }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(IdentifierCodeRepair), "mocked-model", "golang", "golang", "mistakes.log"): func(t *testing.T, filePath, data string) { @@ -134,10 +137,12 @@ func TestTaskCodeRepairRun(t *testing.T) { 
TestDataPath: temporaryDirectoryPath, RepositoryPath: filepath.Join("golang", "mistakes"), - ExpectedRepositoryAssessment: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 60, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyResponseNoError: 2, + ExpectedRepositoryAssessment: map[task.Identifier]metrics.Assessments{ + IdentifierCodeRepair: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 60, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + }, }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(IdentifierCodeRepair), "mocked-model", "golang", "golang", "mistakes.log"): func(t *testing.T, filePath, data string) { @@ -188,10 +193,12 @@ func TestTaskCodeRepairRun(t *testing.T) { TestDataPath: temporaryDirectoryPath, RepositoryPath: filepath.Join("java", "mistakes"), - ExpectedRepositoryAssessment: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 80, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, + ExpectedRepositoryAssessment: map[task.Identifier]metrics.Assessments{ + IdentifierCodeRepair: metrics.Assessments{ + metrics.AssessmentKeyCoverage: 80, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + }, }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(IdentifierCodeRepair), "mocked-model", "java", "java", "mistakes.log"): func(t *testing.T, filePath, data string) { @@ -253,10 +260,12 @@ func TestTaskCodeRepairRun(t *testing.T) { TestDataPath: temporaryDirectoryPath, RepositoryPath: filepath.Join("java", "mistakes"), - ExpectedRepositoryAssessment: metrics.Assessments{ - metrics.AssessmentKeyCoverage: 160, - metrics.AssessmentKeyFilesExecuted: 2, - metrics.AssessmentKeyResponseNoError: 2, + ExpectedRepositoryAssessment: map[task.Identifier]metrics.Assessments{ + IdentifierCodeRepair: metrics.Assessments{ + 
metrics.AssessmentKeyCoverage: 160, + metrics.AssessmentKeyFilesExecuted: 2, + metrics.AssessmentKeyResponseNoError: 2, + }, }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(IdentifierCodeRepair), "mocked-model", "java", "java", "mistakes.log"): func(t *testing.T, filePath, data string) { diff --git a/evaluate/task/task-write-test.go b/evaluate/task/task-write-test.go index f7fb1f55..2b991dea 100644 --- a/evaluate/task/task-write-test.go +++ b/evaluate/task/task-write-test.go @@ -43,7 +43,7 @@ func (t *TaskWriteTests) Identifier() evaltask.Identifier { } // TaskWriteTests generates test files for the given implementation file in a repository. -func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessment metrics.Assessments, problems []error, err error) { +func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessment map[evaltask.Identifier]metrics.Assessments, problems []error, err error) { dataPath := repository.DataPath() log, logClose, err := log.WithFile(t.Logger, filepath.Join(t.ResultPath, string(t.Identifier()), model.CleanModelNameForFileSystem(t.Model.ID()), t.Language.ID(), repository.Name()+".log")) @@ -62,7 +62,7 @@ func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessme return nil, problems, pkgerrors.WithStack(err) } - repositoryAssessment = metrics.NewAssessments() + modelAssessment := metrics.NewAssessments() for _, filePath := range filePaths { if err := repository.Reset(t.Logger); err != nil { t.Logger.Panicf("ERROR: unable to reset temporary repository path: %s", err) @@ -85,8 +85,8 @@ func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessme if assessments[metrics.AssessmentKeyProcessingTime] == 0 { return nil, nil, pkgerrors.Errorf("no model response time measurement present for %q at repository %q", t.Model.ID(), repository.Name()) } - repositoryAssessment.Add(assessments) - 
repositoryAssessment.Award(metrics.AssessmentKeyResponseNoError) + modelAssessment.Add(assessments) + modelAssessment.Award(metrics.AssessmentKeyResponseNoError) coverage, ps, err := t.Language.Execute(log, dataPath) problems = append(problems, ps...) @@ -96,8 +96,12 @@ func (t *TaskWriteTests) Run(repository evaltask.Repository) (repositoryAssessme continue } log.Printf("Executes tests with %d coverage objects", coverage) - repositoryAssessment.Award(metrics.AssessmentKeyFilesExecuted) - repositoryAssessment.AwardPoints(metrics.AssessmentKeyCoverage, coverage) + modelAssessment.Award(metrics.AssessmentKeyFilesExecuted) + modelAssessment.AwardPoints(metrics.AssessmentKeyCoverage, coverage) + } + + repositoryAssessment = map[evaltask.Identifier]metrics.Assessments{ + IdentifierWriteTests: modelAssessment, } return repositoryAssessment, problems, nil diff --git a/evaluate/task/task-write-test_test.go b/evaluate/task/task-write-test_test.go index 57b091e5..5b5b2cdc 100644 --- a/evaluate/task/task-write-test_test.go +++ b/evaluate/task/task-write-test_test.go @@ -13,6 +13,7 @@ import ( "github.com/symflower/eval-dev-quality/language/golang" "github.com/symflower/eval-dev-quality/log" modeltesting "github.com/symflower/eval-dev-quality/model/testing" + "github.com/symflower/eval-dev-quality/task" ) func TestTaskWriteTestsRun(t *testing.T) { @@ -59,9 +60,11 @@ func TestTaskWriteTestsRun(t *testing.T) { TestDataPath: temporaryDirectoryPath, RepositoryPath: filepath.Join("golang", "plain"), - ExpectedRepositoryAssessment: metrics.Assessments{ - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 2, + ExpectedRepositoryAssessment: map[task.Identifier]metrics.Assessments{ + IdentifierWriteTests: metrics.Assessments{ + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 2, + }, }, ExpectedProblemContains: []string{ "expected 'package', found does", diff --git a/evaluate/task/test-integration/task_test.go 
b/evaluate/task/test-integration/task_test.go index 9ea1481e..1c680a4a 100644 --- a/evaluate/task/test-integration/task_test.go +++ b/evaluate/task/test-integration/task_test.go @@ -12,6 +12,7 @@ import ( "github.com/symflower/eval-dev-quality/language/golang" "github.com/symflower/eval-dev-quality/log" "github.com/symflower/eval-dev-quality/model/symflower" + "github.com/symflower/eval-dev-quality/task" "github.com/symflower/eval-dev-quality/tools" toolstesting "github.com/symflower/eval-dev-quality/tools/testing" ) @@ -48,14 +49,16 @@ func TestTaskWriteTestsRun(t *testing.T) { TestDataPath: filepath.Join("..", "..", "..", "testdata"), RepositoryPath: filepath.Join("golang", "plain"), - ExpectedRepositoryAssessment: metrics.Assessments{ - metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 254, - metrics.AssessmentKeyResponseCharacterCount: 254, - metrics.AssessmentKeyCoverage: 10, - metrics.AssessmentKeyFilesExecuted: 1, - metrics.AssessmentKeyResponseNoError: 1, - metrics.AssessmentKeyResponseNoExcess: 1, - metrics.AssessmentKeyResponseWithCode: 1, + ExpectedRepositoryAssessment: map[task.Identifier]metrics.Assessments{ + evaluatetask.IdentifierWriteTests: metrics.Assessments{ + metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 254, + metrics.AssessmentKeyResponseCharacterCount: 254, + metrics.AssessmentKeyCoverage: 10, + metrics.AssessmentKeyFilesExecuted: 1, + metrics.AssessmentKeyResponseNoError: 1, + metrics.AssessmentKeyResponseNoExcess: 1, + metrics.AssessmentKeyResponseWithCode: 1, + }, }, ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ filepath.Join(string(evaluatetask.IdentifierWriteTests), "symflower_symbolic-execution", "golang", "golang", "plain.log"): func(t *testing.T, filePath, data string) { diff --git a/evaluate/task/testing/task.go b/evaluate/task/testing/task.go index 8d68b314..712959dc 100644 --- a/evaluate/task/testing/task.go +++ b/evaluate/task/testing/task.go @@ -24,7 +24,7 @@ type 
TestCaseTask struct { TestDataPath string RepositoryPath string - ExpectedRepositoryAssessment metrics.Assessments + ExpectedRepositoryAssessment map[evaltask.Identifier]metrics.Assessments ExpectedResultFiles map[string]func(t *testing.T, filePath string, data string) ExpectedProblemContains []string ExpectedError error @@ -33,7 +33,7 @@ type TestCaseTask struct { func (tc *TestCaseTask) Validate(t *testing.T, task evaltask.Task, repository evaltask.Repository, resultPath string) { actualRepositoryAssessment, actualProblems, actualErr := task.Run(repository) - metricstesting.AssertAssessmentsEqual(t, tc.ExpectedRepositoryAssessment, actualRepositoryAssessment) + metricstesting.AssertTaskAssessmentsEqual(t, tc.ExpectedRepositoryAssessment, actualRepositoryAssessment) if assert.Equal(t, len(tc.ExpectedProblemContains), len(actualProblems), "problems count") { for i, expectedProblem := range tc.ExpectedProblemContains { actualProblem := actualProblems[i] diff --git a/task/task.go b/task/task.go index 8df261ae..f0336d1d 100644 --- a/task/task.go +++ b/task/task.go @@ -39,7 +39,7 @@ type Task interface { Identifier() (identifier Identifier) // Run runs a task in a given repository. - Run(repository Repository) (assessments metrics.Assessments, problems []error, err error) + Run(repository Repository) (assessments map[Identifier]metrics.Assessments, problems []error, err error) } // Repository defines a repository to be evaluated.