Suffix trees and suffix arrays are advanced data structures used for efficient string processing tasks such as substring search, longest common substring, and more. They are especially useful for problems involving many queries on the same text.
A suffix tree is a compressed trie of all suffixes of a given string. It provides fast substring searches and can be constructed in linear time.
Key Concepts:
Construction:
Implementation: Below is a basic implementation of a suffix tree.
gopackage main
import (
"fmt"
)
// SuffixTreeNode represents a node in the suffix tree
type SuffixTreeNode struct {
children map[byte]*SuffixTreeNode
start int
end *int
suffixLink *SuffixTreeNode
}
// SuffixTree represents the suffix tree
type SuffixTree struct {
text string
root *SuffixTreeNode
end int
active *activePoint
remainingSuffixCount int
}
// activePoint maintains the active point in the suffix tree
type activePoint struct {
node *SuffixTreeNode
edge byte
length int
}
// NewSuffixTreeNode creates a new suffix tree node
func NewSuffixTreeNode(start int, end *int) *SuffixTreeNode {
return &SuffixTreeNode{
children: make(map[byte]*SuffixTreeNode),
start: start,
end: end,
}
}
// NewSuffixTree creates a new suffix tree
func NewSuffixTree(text string) *SuffixTree {
tree := &SuffixTree{text: text}
tree.root = NewSuffixTreeNode(-1, nil)
tree.active = &activePoint{node: tree.root}
tree.buildSuffixTree()
return tree
}
// EdgeLength returns the length of the edge
func (n *SuffixTreeNode) EdgeLength(position int) int {
if n == nil {
return 0
}
return min(*n.end, position+1) - n.start
}
// BuildSuffixTree constructs the suffix tree using Ukkonen's algorithm
func (tree *SuffixTree) buildSuffixTree() {
size := len(tree.text)
tree.end = -1
for i := 0; i < size; i++ {
tree.extendSuffixTree(i)
}
}
// ExtendSuffixTree extends the suffix tree at a given position
func (tree *SuffixTree) extendSuffixTree(pos int) {
tree.end++
tree.remainingSuffixCount++
var lastNewNode *SuffixTreeNode
for tree.remainingSuffixCount > 0 {
if tree.active.length == 0 {
tree.active.edge = tree.text[pos]
}
if tree.active.node.children[tree.active.edge] == nil {
tree.active.node.children[tree.active.edge] = NewSuffixTreeNode(pos, &tree.end)
if lastNewNode != nil {
lastNewNode.suffixLink = tree.active.node
lastNewNode = nil
}
} else {
next := tree.active.node.children[tree.active.edge]
if tree.walkDown(next) {
continue
}
if tree.text[next.start+tree.active.length] == tree.text[pos] {
if lastNewNode != nil && tree.active.node != tree.root {
lastNewNode.suffixLink = tree.active.node
lastNewNode = nil
}
tree.active.length++
break
}
splitEnd := next.start + tree.active.length - 1
split := NewSuffixTreeNode(next.start, &splitEnd)
tree.active.node.children[tree.active.edge] = split
split.children[tree.text[pos]] = NewSuffixTreeNode(pos, &tree.end)
next.start += tree.active.length
split.children[tree.text[next.start]] = next
if lastNewNode != nil {
lastNewNode.suffixLink = split
}
lastNewNode = split
}
tree.remainingSuffixCount--
if tree.active.node == tree.root && tree.active.length > 0 {
tree.active.length--
tree.active.edge = tree.text[pos-tree.remainingSuffixCount+1]
} else if tree.active.node != tree.root {
tree.active.node = tree.active.node.suffixLink
}
}
}
// WalkDown performs the walk down operation
func (tree *SuffixTree) walkDown(currNode *SuffixTreeNode) bool {
if tree.active.length >= currNode.EdgeLength(tree.end) {
tree.active.edge = tree.text[currNode.start + currNode.EdgeLength(tree.end)]
tree.active.length -= currNode.EdgeLength(tree.end)
tree.active.node = currNode
return true
}
return false
}
// Min returns the minimum of two integers
func min(a, b int) int {
if a < b {
return a
}
return b
}
// PrintSuffixTree prints the suffix tree (for debugging purposes)
func (tree *SuffixTree) PrintSuffixTree() {
tree.print(tree.root, 0)
}
func (tree *SuffixTree) print(node *SuffixTreeNode, depth int) {
if node == nil {
return
}
if node.start != -1 {
fmt.Println(string(tree.text[node.start:min(*node.end, len(tree.text))]))
}
for _, child := range node.children {
tree.print(child, depth+1)
}
}
func main() {
text := "banana"
suffixTree := NewSuffixTree(text)
suffixTree.PrintSuffixTree()
}
A suffix array is an array of integers giving the starting positions of suffixes of a string in lexicographical order. It's more space-efficient than a suffix tree and can be built in linear time.
Construction:
Implementation:
gopackage main
import (
"fmt"
"sort"
)
// SuffixArrayEntry represents a suffix array entry
type SuffixArrayEntry struct {
index int
rank [2]int
}
// SuffixArray constructs the suffix array using the efficient approach
func SuffixArray(text string) []int {
n := len(text)
suffixes := make([]SuffixArrayEntry, n)
for i := range text {
suffixes[i] = SuffixArrayEntry{index: i, rank: [2]int{int(text[i]), -1}}
if i+1 < n {
suffixes[i].rank[1] = int(text[i+1])
}
}
sort.Slice(suffixes, func(i, j int) bool {
return suffixes[i].rank[0] < suffixes[j].rank[0]
})
index := make([]int, n)
k := 4
for k < 2*n {
rank := 0
prevRank := suffixes[0].rank
suffixes[0].rank[0] = rank
index[suffixes[0].index] = 0
for i := 1; i < n; i++ {
if suffixes[i].rank == prevRank {
prevRank = suffixes[i].rank
suffixes[i].rank[0] = rank
} else {
prevRank = suffixes[i].rank
rank++
suffixes[i].rank[0] = rank
}
index[suffixes[i].index] = i
}
for i := 0; i < n; i++ {
nextIndex := suffixes[i].index + k/2
if nextIndex < n {
suffixes[i].rank[1] = suffixes[index[nextIndex]].rank[0]
} else {
suffixes[i].rank[1] = -1
}
}
sort.Slice(suffixes, func(i, j int) bool {
return suffixes[i].rank[0] < suffixes[j].rank[0]
})
k *= 2
}
suffixArray := make([]int, n)
for i := range suffixes {
suffixArray[i] = suffixes[i].index
}
return suffixArray
}
func main() {
text := "banana"
suffixArray := SuffixArray(text)
fmt.Println("Suffix Array:", suffixArray)
}
Understanding and implementing these data structures will enhance your ability to solve complex string processing problems efficiently.