Skip to content

Commit

Permalink
use strings instead of hashes
Browse files Browse the repository at this point in the history
  • Loading branch information
ReallyLiri committed Nov 1, 2021
1 parent 9cece1c commit 1b68b90
Show file tree
Hide file tree
Showing 18 changed files with 417 additions and 399 deletions.
33 changes: 33 additions & 0 deletions dedup/dedup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package dedup

import "github.com/adrg/strutil/metrics"

// levenshtein is the package-wide Levenshtein metric that AreSimilar uses to
// measure the distance between two strings; init creates and configures it.
var levenshtein *metrics.Levenshtein

// Edit-cost weights for the package's Levenshtein metric: smallFactor is used
// as the replacement cost and bigFactor as the insertion and deletion cost.
// bigFactor also scales the longest input length when a distance is turned
// into a similarity score.
const (
	smallFactor = 1
	bigFactor   = 3
)

// init builds the shared Levenshtein metric. Comparisons are case sensitive,
// and an insertion or a deletion costs bigFactor while a replacement costs
// only smallFactor.
func init() {
	metric := metrics.NewLevenshtein()
	metric.CaseSensitive = true
	metric.ReplaceCost = smallFactor
	metric.InsertCost = bigFactor
	metric.DeleteCost = bigFactor
	levenshtein = metric
}

// max returns the larger of a and b; when both are equal that shared value is
// returned.
func max(a int, b int) int {
	if b > a {
		return b
	}
	return a
}

// AreSimilar reports whether a and b are at least similarityThreshold alike.
//
// The similarity score is 1 - distance/(bigFactor * longest), where distance
// is the weighted Levenshtein distance between the two strings and longest is
// the length of the longer one. Two empty strings are always reported as
// similar, whatever the threshold.
//
// Lengths are counted in runes, not bytes: the Levenshtein metric measures
// the distance over runes, so a byte count would enlarge the denominator and
// overstate the score for text containing multi-byte characters.
func AreSimilar(a string, b string, similarityThreshold float64) bool {
	maxLenFactor := bigFactor * max(len([]rune(a)), len([]rune(b)))
	if maxLenFactor == 0 {
		// Both inputs are empty; there is nothing to compare.
		return true
	}
	distance := levenshtein.Distance(a, b)
	score := 1 - float64(distance)/float64(maxLenFactor)
	return score >= similarityThreshold
}
62 changes: 62 additions & 0 deletions dedup/dedup_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package dedup

import (
"github.com/stretchr/testify/assert"
"testing"
)

// Test_dedup exercises AreSimilar on empty inputs and on pairs of event
// messages, checking for each pair a threshold at which it is reported as
// similar and/or one at which it is not.
func Test_dedup(t *testing.T) {

// Two empty strings are similar at every threshold.
assert.True(t, AreSimilar("", "", 0))
assert.True(t, AreSimilar("", "", 0.5))
assert.True(t, AreSimilar("", "", 1))

// A single character against the empty string is only similar at threshold 0.
assert.True(t, AreSimilar("a", "", 0))
assert.False(t, AreSimilar("a", "", 0.1))
assert.False(t, AreSimilar("", "a", 0.1))

// A long image-pull failure and a short ErrImagePull event that share their
// first line are similar at a low threshold ...
assert.True(t, AreSimilar(
`Event by kubelet: Failed x since , :
Failed to pull image "nginx:l4t3st": rpc error: code = Unknown desc = Error response from daemon: manifest for nginx:l4t3st not found: manifest unknown: manifest unknown`,
`Event by kubelet: Failed x since , :
Error: ErrImagePull`,
0.1,
))
// ... but not at a high one.
assert.False(t, AreSimilar(
`Event by kubelet: Failed x since , :
Failed to pull image "nginx:l4t3st": rpc error: code = Unknown desc = Error response from daemon: manifest for nginx:l4t3st not found: manifest unknown: manifest unknown`,
`Event by kubelet: Failed x since , :
Error: ErrImagePull`,
0.75,
))
// The same long message against the ImagePullBackOff event is not similar at
// the high threshold either.
assert.False(t, AreSimilar(
`Event by kubelet: Failed x since , :
Failed to pull image "nginx:l4t3st": rpc error: code = Unknown desc = Error response from daemon: manifest for nginx:l4t3st not found: manifest unknown: manifest unknown`,
`Event by kubelet: Failed x since , :
Error: ImagePullBackOff`,
0.75,
))

// The two short events (ErrImagePull vs ImagePullBackOff) are similar at 0.6
// but not at 0.95.
assert.True(t, AreSimilar(
`Event by kubelet: Failed x since , :
Error: ErrImagePull`,
`Event by kubelet: Failed x since , :
Error: ImagePullBackOff`,
0.6,
))
assert.False(t, AreSimilar(
`Event by kubelet: Failed x since , :
Error: ErrImagePull`,
`Event by kubelet: Failed x since , :
Error: ImagePullBackOff`,
0.95,
))

// Messages that differ only in the numeric task id are similar at 0.8.
assert.True(t, AreSimilar(
`Event by kernel-monitor: TaskHung since , :
INFO: task runc:[2:INIT]:293016 blocked for more than 327 seconds.`,
`Event by kernel-monitor: TaskHung since , :
INFO: task runc:[2:INIT]:309147 blocked for more than 327 seconds.`,
0.8,
))
}
34 changes: 34 additions & 0 deletions dedup/temporal.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package dedup

import (
"fmt"
log "github.com/sirupsen/logrus"
"strings"
)

// Markers that delimit a temporal segment inside a message: WrapTemporal puts
// them around a value, NormalizeTemporal and CleanTemporal take them out.
const (
	temporalStart = "<t>"
	temporalEnd   = "</t>"
)

// NormalizeTemporal strips temporal segments from message, removing both the
// markers and the text between them, and returns what is left.
//
// Every pass looks up the first start marker and the first end marker of the
// current message and cuts out everything from one to the other. The message
// is rescanned from the beginning after each cut. When a start marker is
// present but the first end marker is missing or precedes it, an error is
// logged and the message is returned as it stands at that point.
func NormalizeTemporal(message string) string {
	open := strings.Index(message, temporalStart)
	for open >= 0 {
		closing := strings.Index(message, temporalEnd)
		// open is non-negative here, so this also covers closing == -1.
		if closing < open {
			log.Errorf("invalid temporal format for %v", message)
			return message
		}
		message = message[:open] + message[closing+len(temporalEnd):]
		open = strings.Index(message, temporalStart)
	}
	return message
}

// CleanTemporal removes the temporal markers from message but keeps the text
// they enclosed. All start markers are removed first, then all end markers.
func CleanTemporal(message string) string {
	for _, marker := range []string{temporalStart, temporalEnd} {
		message = strings.ReplaceAll(message, marker, "")
	}
	return message
}

// WrapTemporal renders item in its default format and encloses the result
// between the temporal start and end markers.
func WrapTemporal(item interface{}) string {
	return temporalStart + fmt.Sprint(item) + temporalEnd
}
36 changes: 36 additions & 0 deletions dedup/temporal_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package dedup

import (
"github.com/stretchr/testify/assert"
"testing"
)

// Test_normalizeMessage checks that NormalizeTemporal removes every
// <t>...</t> segment together with its content, and that it returns the
// message unchanged from the point where the markers stop being well formed.
func Test_normalizeMessage(t *testing.T) {
	// Messages without markers pass through unchanged.
	assert.Equal(t, "", NormalizeTemporal(""))
	assert.Equal(t, "abc", NormalizeTemporal("abc"))
	assert.Equal(t, "hello world", NormalizeTemporal("hello world"))

	// A message that is a single segment normalizes to the empty string.
	assert.Equal(t, "", NormalizeTemporal("<t>hello world</t>"))
	assert.Equal(t, "", NormalizeTemporal("<t></t>"))

	// Only the segment itself is cut out, so the space on each side of it
	// survives and two consecutive spaces remain.
	assert.Equal(t, "The  is here", NormalizeTemporal("The <t>hello world</t> is here"))
	assert.Equal(t, "The  brown  jumps  the  dog", NormalizeTemporal("The <t>quick</t> brown <t>fox</t> jumps <t>over</t> the <t>lazy</t> dog"))

	// A start marker whose end marker is missing, or comes before it, leaves
	// the message untouched.
	assert.Equal(t, "t<t>t", NormalizeTemporal("t<t>t"))
	assert.Equal(t, "t</t>a<t>t", NormalizeTemporal("t</t>a<t>t"))

	// The first start marker is paired with the first end marker; anything in
	// between, including other marker-like text, is removed with it.
	assert.Equal(t, "tt", NormalizeTemporal("t<t>t<t></t>t"))
	assert.Equal(t, "tt", NormalizeTemporal("t<t>t<t/></t>t"))

	// A surplus end marker with no start marker left is kept as is.
	assert.Equal(t, "t</t>t", NormalizeTemporal("t<t>t</t></t>t"))
}

// Test_cleanMessage checks that CleanTemporal drops only the <t> and </t>
// markers themselves and keeps every other character, including for
// unbalanced or repeated markers.
func Test_cleanMessage(t *testing.T) {
	cases := []struct {
		input    string
		expected string
	}{
		{"", ""},
		{"abc", "abc"},
		{"hello world", "hello world"},
		{"<t>hello world</t>", "hello world"},
		{"<t></t>", ""},
		{"The <t>hello world</t> is here", "The hello world is here"},
		{"The <t>quick</t> brown <t>fox</t> jumps <t>over</t> the <t>lazy</t> dog", "The quick brown fox jumps over the lazy dog"},
		{"t<t>t", "tt"},
		{"t</t>a<t>t", "tat"},
		{"t<t>t<t></t>t", "ttt"},
		{"t<t>t<t/></t>t", "tt<t/>t"},
		{"t<t>t</t></t>t", "ttt"},
	}
	for _, tc := range cases {
		assert.Equal(t, tc.expected, CleanTemporal(tc.input))
	}
}
Loading

0 comments on commit 1b68b90

Please sign in to comment.