Skip to content

Commit

Permalink
Collect usage metrics of queries to be able to calculate costs
Browse files Browse the repository at this point in the history
Part of #347
  • Loading branch information
ahumenberger committed Mar 4, 2025
1 parent 853758b commit 602a273
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 33 deletions.
13 changes: 13 additions & 0 deletions evaluate/evaluate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"testing"
"time"

"github.com/sashabaranov/go-openai"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -428,6 +429,10 @@ func TestEvaluate(t *testing.T) {
Before: func(t *testing.T, logger *log.Logger, resultPath string) {
queryResult := &provider.QueryResult{
Message: "model-response",
Usage: openai.Usage{
PromptTokens: 123,
CompletionTokens: 456,
},
}
// Set up mocks when the test is running.
mockedQuery.On("Query", mock.Anything, mock.Anything, mock.Anything).Return(queryResult, nil).After(10 * time.Millisecond) // Simulate a model response delay because our internal safety measures trigger when a query is done in 0 milliseconds.
Expand Down Expand Up @@ -463,6 +468,8 @@ func TestEvaluate(t *testing.T) {
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 14,
metrics.AssessmentKeyResponseCharacterCount: 14,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyTokenInput: 123,
metrics.AssessmentKeyTokenOutput: 456,
},
},
&metricstesting.AssessmentTuple{
Expand All @@ -476,6 +483,8 @@ func TestEvaluate(t *testing.T) {
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 14,
metrics.AssessmentKeyResponseCharacterCount: 14,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyTokenInput: 123,
metrics.AssessmentKeyTokenOutput: 456,
},
},
&metricstesting.AssessmentTuple{
Expand All @@ -489,6 +498,8 @@ func TestEvaluate(t *testing.T) {
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 14,
metrics.AssessmentKeyResponseCharacterCount: 14,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyTokenInput: 123,
metrics.AssessmentKeyTokenOutput: 456,
},
},
&metricstesting.AssessmentTuple{
Expand All @@ -502,6 +513,8 @@ func TestEvaluate(t *testing.T) {
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 14,
metrics.AssessmentKeyResponseCharacterCount: 14,
metrics.AssessmentKeyResponseNoError: 1,
metrics.AssessmentKeyTokenInput: 123,
metrics.AssessmentKeyTokenOutput: 456,
},
},
},
Expand Down
21 changes: 13 additions & 8 deletions evaluate/metrics/assessment.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ import (
type AssessmentKey string

var (
// allAssessmentKeys holds all registered assessment keys.
allAssessmentKeys []AssessmentKey
// AllAssessmentKeys holds all registered assessment keys.
AllAssessmentKeys []AssessmentKey
// AllAssessmentKeysStrings holds all registered assessment keys as strings.
AllAssessmentKeysStrings []string
)
Expand All @@ -22,7 +22,7 @@ func RegisterAssessmentKey(key string) AssessmentKey {
assessment := AssessmentKey(key)
i := sort.SearchStrings(AllAssessmentKeysStrings, key)

allAssessmentKeys = slices.Insert(allAssessmentKeys, i, assessment)
AllAssessmentKeys = slices.Insert(AllAssessmentKeys, i, assessment)
AllAssessmentKeysStrings = slices.Insert(AllAssessmentKeysStrings, i, key)

return assessment
Expand Down Expand Up @@ -54,6 +54,11 @@ var (
// AssessmentKeyResponseNoExcess indicates that a model did not produce more content than requested.
// TODO Infer if a model produced "too much" code. https://github.com/symflower/eval-dev-quality/issues/44
AssessmentKeyResponseNoExcess = RegisterAssessmentKey("response-no-excess")

// AssessmentKeyTokenInput collects the number of input tokens.
AssessmentKeyTokenInput = RegisterAssessmentKey("token-input")
// AssessmentKeyTokenOutput collects the number of output tokens.
AssessmentKeyTokenOutput = RegisterAssessmentKey("token-output")
)

// Assessments holds a collection of numerical assessment metrics.
Expand All @@ -77,7 +82,7 @@ func (a Assessments) Equal(x Assessments) bool {
return a == nil && x == nil
}

for _, key := range allAssessmentKeys {
for _, key := range AllAssessmentKeys {
if a[key] != x[key] {
return false
}
Expand All @@ -101,9 +106,9 @@ func (a Assessments) String() string {
if a == nil {
a = NewAssessments()
}
entries := make([]string, len(allAssessmentKeys))
entries := make([]string, len(AllAssessmentKeys))

for i, key := range allAssessmentKeys {
for i, key := range AllAssessmentKeys {
entries[i] = fmt.Sprintf("%s=%d", key, a[key])
}

Expand All @@ -116,8 +121,8 @@ func (a Assessments) StringCSV() (row []string) {
a = NewAssessments()
}

row = make([]string, len(allAssessmentKeys))
for i, key := range allAssessmentKeys {
row = make([]string, len(AllAssessmentKeys))
for i, key := range AllAssessmentKeys {
row[i] = fmt.Sprintf("%d", a[key])
}

Expand Down
47 changes: 22 additions & 25 deletions evaluate/report/testing/csv.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package testing

import (
"regexp"
"strconv"
"strings"
"testing"

"github.com/stretchr/testify/assert"
Expand All @@ -18,32 +18,29 @@ func atoiUint64(t *testing.T, s string) uint64 {
return uint64(value)
}

// extractMetricsCSVMatch is a regular expression to extract metrics from CSV rows.
var extractMetricsCSVMatch = regexp.MustCompile(`(\S+),(\S+),(\S+),(\S+),(\S+),\d+,(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)`)

// ParseMetrics extracts multiple assessment metrics from the given string.
func ParseMetrics(t *testing.T, data string) (assessments metricstesting.AssessmentTuples) {
matches := extractMetricsCSVMatch.FindAllStringSubmatch(data, -1)

for _, match := range matches {
assessments = append(assessments, &metricstesting.AssessmentTuple{
Model: match[1],
Language: match[2],
RepositoryPath: match[3],
Case: match[4],
Task: task.Identifier(match[5]),
Assessment: metrics.Assessments{
metrics.AssessmentKeyCoverage: atoiUint64(t, match[6]),
metrics.AssessmentKeyFilesExecuted: atoiUint64(t, match[7]),
metrics.AssessmentKeyFilesExecutedMaximumReachable: atoiUint64(t, match[8]),
metrics.AssessmentKeyGenerateTestsForFileCharacterCount: atoiUint64(t, match[9]),
metrics.AssessmentKeyProcessingTime: atoiUint64(t, match[10]),
metrics.AssessmentKeyResponseCharacterCount: atoiUint64(t, match[11]),
metrics.AssessmentKeyResponseNoError: atoiUint64(t, match[12]),
metrics.AssessmentKeyResponseNoExcess: atoiUint64(t, match[13]),
metrics.AssessmentKeyResponseWithCode: atoiUint64(t, match[14]),
},
})
lines := strings.Split(strings.TrimSpace(data), "\n")
if len(lines) < 2 {
return assessments
}

for _, line := range lines[1:] {
cells := strings.Split(line, ",")

tuple := &metricstesting.AssessmentTuple{
Model: cells[0],
Language: cells[1],
RepositoryPath: cells[2],
Case: cells[3],
Task: task.Identifier(cells[4]),
Assessment: metrics.Assessments{},
}
for i, key := range metrics.AllAssessmentKeys {
tuple.Assessment[key] = atoiUint64(t, cells[i+6])
}

assessments = append(assessments, tuple)
}

return assessments
Expand Down
2 changes: 2 additions & 0 deletions model/llm/llm.go
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,8 @@ func handleQueryResult(ctx model.Context, queryResult *provider.QueryResult) (as
assessment[metrics.AssessmentKeyProcessingTime] = uint64(queryResult.Duration.Milliseconds())
assessment[metrics.AssessmentKeyResponseCharacterCount] = uint64(len(queryResult.Message))
assessment[metrics.AssessmentKeyGenerateTestsForFileCharacterCount] = uint64(len(sourceFileContent))
assessment[metrics.AssessmentKeyTokenInput] = uint64(queryResult.Usage.PromptTokens)
assessment[metrics.AssessmentKeyTokenOutput] = uint64(queryResult.Usage.CompletionTokens)

filePathAbsolute := filepath.Join(ctx.RepositoryPath, ctx.FilePath)
if err := os.MkdirAll(filepath.Dir(filePathAbsolute), 0755); err != nil {
Expand Down
1 change: 1 addition & 0 deletions provider/openai-api/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,6 @@ func QueryOpenAIAPIModel(ctx context.Context, client *openai.Client, modelIdenti

return &provider.QueryResult{
Message: apiResponse.Choices[0].Message.Content,
Usage: apiResponse.Usage,
}, nil
}
3 changes: 3 additions & 0 deletions provider/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"time"

pkgerrors "github.com/pkg/errors"
"github.com/sashabaranov/go-openai"

"github.com/symflower/eval-dev-quality/log"
"github.com/symflower/eval-dev-quality/model"
Expand Down Expand Up @@ -49,6 +50,8 @@ type QueryResult struct {
Message string
// Duration holds the duration of the result.
Duration time.Duration
// Usage holds the usage metrics of the query.
Usage openai.Usage
}

// Query is a provider that allows querying a model directly.
Expand Down

0 comments on commit 602a273

Please sign in to comment.