Skip to content

Commit

Permalink
fix letter dropping issues in dicttomap/cleaning process
Browse files Browse the repository at this point in the history
make hashtron accurate on cleaned words
  • Loading branch information
neurlang authored and Your Name committed Feb 26, 2025
1 parent 69730b7 commit 1868c0f
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 18 deletions.
15 changes: 15 additions & 0 deletions cmd/dicttomap/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,22 @@ func main() {
i = 0
}
}
if len(sword1) > 0 {
for i := range sword1 {
if nodel != nil && *nodel && sword2[i] == "" {
continue
}
mut.Lock()
histogram[[2]string{sword1[i], sword2[i]}]++
mut.Unlock()
}
osword[0] += " " + spacesep(sword1)
osword[1] += " " + spacesep(sword2)
}


if len(osword[0]) > 0 && len(osword[1]) > 0 {
//println(word1, word2, osword[0][1:], osword[1][1:])
osword[0] = osword[0][1:]
osword[1] = osword[1][1:]
writer.AddRow(osword[:])
Expand Down
29 changes: 11 additions & 18 deletions repo/hashtron_phonemizer_repo.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ type language struct {
SrcMultiSuffix []string `json:"SrcMultiSuffix"`
DstMultiSuffix []string `json:"DstMultiSuffix"`
DropLast []string `json:"DropLast"`
version int
mapSrcMultiLen int
mapSrcMultiSufLen int
mapSrcMulti map[string]struct{}
Expand Down Expand Up @@ -121,13 +120,7 @@ func (l *languages) SrcMulti(isReverse bool, lang string) map[string]struct{} {
}
return (*l)[lang+reverse].mapSrcMulti
}
// Version returns the version field stored on the language entry selected by
// lang. When isReverse is true the entry is looked up under lang with the
// "_reverse" suffix appended; otherwise lang is used as the key unchanged.
func (l *languages) Version(isReverse bool, lang string) int {
	key := lang
	if isReverse {
		key += "_reverse"
	}
	entry := (*l)[key]
	return entry.version
}

func (l *languages) DstMulti(isReverse bool, lang string) map[string]struct{} {
var reverse string
if isReverse {
Expand Down Expand Up @@ -322,8 +315,6 @@ func (r *HashtronPhonemizerRepository) LoadLanguage(isReverse bool, lang string)
net.NewLayer(1, 0)

(*r.nets)[lang+reverse] = &net

(*r.lang)[lang+reverse].version = 1

}
err := (*r.nets)[lang+reverse].ReadZlibWeights(bytesReader)
Expand Down Expand Up @@ -509,30 +500,32 @@ outer:
i := i - lastspace
r.mut.RLock()
net := (*r.nets)[lang+reverse]
version := r.lang.Version(isReverse, lang)
r.mut.RUnlock()
var multiword = lastspace > 0
if net == nil {
continue
}
var predicted int
for q := 0; (version == 0 && q == 0) || (version == 1 && q < len(srcaR)-i); q++ {
for q := 0; (!multiword && q == 0) || (multiword && q < len(srcaR)-i); q++ {
var input = phonemizer.NewSample{
SrcA: copystrings(srcaR[:len(srcaR)-q]),
DstA: copystrings(dstaR[0:i]),
SrcCut: copystrings(srcaR[0:i]),
SrcFut: copystrings(srcaR[i:len(srcaR)-q]),
Option: option,
}
var pred int
r.mut.RLock()
if version == 1 {
predicted += int(net.Infer2(input.V1()))
} else {
predicted += int(net.Infer2(&input))
if net.LenLayers() == 3 {
pred = int(net.Infer2(input.V1()))
} else { // 5 layers, old model
pred = int(net.Infer2(&input))
}
r.mut.RUnlock()
//fmt.Println(input.SrcA, input.DstA, input.SrcCut, input.SrcFut, input.Option, predicted)
predicted += pred
//fmt.Println(input.SrcA, input.DstA, input.SrcCut, input.SrcFut, input.Option, pred)
}
if (version == 0 && predicted == 1) || (version == 1 && 2*predicted > len(srcaR)) {
if (!multiword && predicted == 1) || (multiword && 2*predicted > len(srcaR)) {
if option == "_" {
option = ""
} else if strings.HasPrefix(option, "_") {
Expand Down

0 comments on commit 1868c0f

Please sign in to comment.