suffixarray

Imports

Imports #

"bytes"
"encoding/binary"
"errors"
"io"
"math"
"regexp"
"slices"
"sort"

Constants & Variables

bufSize const #

const bufSize = *ast.BinaryExpr

errTooBig var #

var errTooBig = *ast.CallExpr

maxData32 var #

Can change for testing

var maxData32 int = realMaxData32

realMaxData32 const #

const realMaxData32 = math.MaxInt32

Structs

Index struct #

Index implements a suffix array for fast substring search.

type Index struct {
data []byte
sa ints
}

ints struct #

An ints is either an []int32 or an []int64. That is, one of them is empty, and one is the real data. The int64 form is used when len(data) > maxData32

type ints struct {
int32 []int32
int64 []int64
}

Functions

Bytes method #

Bytes returns the data over which the index was created. It must not be modified.

func (x *Index) Bytes() []byte

FindAllIndex method #

FindAllIndex returns a sorted list of non-overlapping matches of the regular expression r, where a match is a pair of indices specifying the matched slice of x.Bytes(). If n < 0, all matches are returned in successive order. Otherwise, at most n matches are returned and they may not be successive. The result is nil if there are no matches, or if n == 0.

func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int)

Lookup method #

Lookup returns an unsorted list of at most n indices where the byte string s occurs in the indexed data. If n < 0, all occurrences are returned. The result is nil if s is empty, s is not found, or n == 0. Lookup time is O(log(N)*len(s) + len(result)) where N is the size of the indexed data.

func (x *Index) Lookup(s []byte, n int) (result []int)

New function #

New creates a new [Index] for data. [Index] creation time is O(N) for N = len(data).

func New(data []byte) *Index

Read method #

Read reads the index from r into x; x must not be nil.

func (x *Index) Read(r io.Reader) error

Write method #

Write writes the index x to w.

func (x *Index) Write(w io.Writer) error

assignID_32 function #

func assignID_32(text []int32, sa []int32, numLMS int) int

assignID_64 function #

func assignID_64(text []int64, sa []int64, numLMS int) int

assignID_8_32 function #

assignID_8_32 assigns a dense ID numbering to the set of LMS-substrings respecting string ordering and equality, returning the maximum assigned ID. For example given the input "ababab", the LMS-substrings are "aba", "aba", and "ab", renumbered as 2 2 1. sa[len(sa)-numLMS:] holds the LMS-substring indexes sorted in string order, so to assign numbers we can consider each in turn, removing adjacent duplicates. The new ID for the LMS-substring at index j is written to sa[j/2], overwriting the length previously stored there (by length_8_32 above).

func assignID_8_32(text []byte, sa []int32, numLMS int) int

assignID_8_64 function #

func assignID_8_64(text []byte, sa []int64, numLMS int) int

at method #

func (x *Index) at(i int) []byte

bucketMax_32 function #

func bucketMax_32(text []int32, freq []int32, bucket []int32)

bucketMax_64 function #

func bucketMax_64(text []int64, freq []int64, bucket []int64)

bucketMax_8_32 function #

bucketMax_8_32 stores into bucket[c] the maximum index in the bucket for character c in a bucket-sort of text. The bucket indexes for c are [min, max). That is, max is one past the final index in that bucket.

func bucketMax_8_32(text []byte, freq []int32, bucket []int32)

bucketMax_8_64 function #

func bucketMax_8_64(text []byte, freq []int64, bucket []int64)

bucketMin_32 function #

func bucketMin_32(text []int32, freq []int32, bucket []int32)

bucketMin_64 function #

func bucketMin_64(text []int64, freq []int64, bucket []int64)

bucketMin_8_32 function #

bucketMin_8_32 stores into bucket[c] the minimum index in the bucket for character c in a bucket-sort of text.

func bucketMin_8_32(text []byte, freq []int32, bucket []int32)

bucketMin_8_64 function #

func bucketMin_8_64(text []byte, freq []int64, bucket []int64)

expand_32 function #

func expand_32(text []int32, freq []int32, bucket []int32, sa []int32, numLMS int)

expand_64 function #

func expand_64(text []int64, freq []int64, bucket []int64, sa []int64, numLMS int)

expand_8_32 function #

expand_8_32 distributes the compacted, sorted LMS-suffix indexes from sa[:numLMS] into the tops of the appropriate buckets in sa, preserving the sorted order and making room for the L-type indexes to be slotted into the sorted sequence by induceL_8_32.

func expand_8_32(text []byte, freq []int32, bucket []int32, sa []int32, numLMS int)

expand_8_64 function #

func expand_8_64(text []byte, freq []int64, bucket []int64, sa []int64, numLMS int)

freq_32 function #

func freq_32(text []int32, freq []int32, bucket []int32) []int32

freq_64 function #

func freq_64(text []int64, freq []int64, bucket []int64) []int64

freq_8_32 function #

freq_8_32 returns the character frequencies for text, as a slice indexed by character value. If freq is nil, freq_8_32 uses and returns bucket. If freq is non-nil, freq_8_32 assumes that freq[0] >= 0 means the frequencies are already computed. If the frequency data is overwritten or uninitialized, the caller must set freq[0] = -1 to force recomputation the next time it is needed.

func freq_8_32(text []byte, freq []int32, bucket []int32) []int32

freq_8_64 function #

func freq_8_64(text []byte, freq []int64, bucket []int64) []int64

get method #

func (a *ints) get(i int) int64

induceL_32 function #

func induceL_32(text []int32, sa []int32, freq []int32, bucket []int32)

induceL_64 function #

func induceL_64(text []int64, sa []int64, freq []int64, bucket []int64)

induceL_8_32 function #

induceL_8_32 inserts L-type text indexes into sa, assuming that the leftmost S-type indexes are inserted into sa, in sorted order, in the right bucket halves. It leaves all the L-type indexes in sa, but the leftmost L-type indexes are negated, to mark them for processing by induceS_8_32.

func induceL_8_32(text []byte, sa []int32, freq []int32, bucket []int32)

induceL_8_64 function #

func induceL_8_64(text []byte, sa []int64, freq []int64, bucket []int64)

induceS_32 function #

func induceS_32(text []int32, sa []int32, freq []int32, bucket []int32)

induceS_64 function #

func induceS_64(text []int64, sa []int64, freq []int64, bucket []int64)

induceS_8_32 function #

func induceS_8_32(text []byte, sa []int32, freq []int32, bucket []int32)

induceS_8_64 function #

func induceS_8_64(text []byte, sa []int64, freq []int64, bucket []int64)

induceSubL_32 function #

func induceSubL_32(text []int32, sa []int32, freq []int32, bucket []int32)

induceSubL_64 function #

func induceSubL_64(text []int64, sa []int64, freq []int64, bucket []int64)

induceSubL_8_32 function #

induceSubL_8_32 inserts the L-type text indexes of LMS-substrings into sa, assuming that the final characters of the LMS-substrings are already inserted into sa, sorted by final character, and at the right (not left) end of the corresponding character bucket. Each LMS-substring has the form (as a regexp) /S+L+S/: one or more S-type, one or more L-type, final S-type. induceSubL_8_32 leaves behind only the leftmost L-type text index for each LMS-substring. That is, it removes the final S-type indexes that are present on entry, and it inserts but then removes the interior L-type indexes too. (Only the leftmost L-type index is needed by induceSubS_8_32.)

func induceSubL_8_32(text []byte, sa []int32, freq []int32, bucket []int32)

induceSubL_8_64 function #

func induceSubL_8_64(text []byte, sa []int64, freq []int64, bucket []int64)

induceSubS_32 function #

func induceSubS_32(text []int32, sa []int32, freq []int32, bucket []int32)

induceSubS_64 function #

func induceSubS_64(text []int64, sa []int64, freq []int64, bucket []int64)

induceSubS_8_32 function #

induceSubS_8_32 inserts the S-type text indexes of LMS-substrings into sa, assuming that the leftmost L-type text indexes are already inserted into sa, sorted by LMS-substring suffix, and at the left end of the corresponding character bucket. Each LMS-substring has the form (as a regexp) /S+L+S/: one or more S-type, one or more L-type, final S-type. induceSubS_8_32 leaves behind only the leftmost S-type text index for each LMS-substring, in sorted order, at the right end of sa. That is, it removes the L-type indexes that are present on entry, and it inserts but then removes the interior S-type indexes too, leaving the LMS-substring start indexes packed into sa[len(sa)-numLMS:]. (Only the LMS-substring start indexes are processed by the recursion.)

func induceSubS_8_32(text []byte, sa []int32, freq []int32, bucket []int32)

induceSubS_8_64 function #

func induceSubS_8_64(text []byte, sa []int64, freq []int64, bucket []int64)

len method #

func (a *ints) len() int

length_32 function #

func length_32(text []int32, sa []int32, numLMS int)

length_64 function #

func length_64(text []int64, sa []int64, numLMS int)

length_8_32 function #

length_8_32 computes and records the length of each LMS-substring in text. The length of the LMS-substring at index j is stored at sa[j/2], avoiding the LMS-substring indexes already stored in the top half of sa. (If index j is an LMS-substring start, then index j-1 is type L and cannot be.) There are two exceptions, made for optimizations in name_8_32 below. First, the final LMS-substring is recorded as having length 0, which is otherwise impossible, instead of giving it a length that includes the implicit sentinel. This ensures the final LMS-substring has length unequal to all others and therefore can be detected as different without text comparison (it is unequal because it is the only one that ends in the implicit sentinel, and the text comparison would be problematic since the implicit sentinel is not actually present at text[len(text)]). Second, to avoid text comparison entirely, if an LMS-substring is very short, sa[j/2] records its actual text instead of its length, so that if two such substrings have matching “length,” the text need not be read at all. The definition of “very short” is that the text bytes must pack into a uint32, and the unsigned encoding e must be ≥ len(text), so that it can be distinguished from a valid length.

func length_8_32(text []byte, sa []int32, numLMS int)

length_8_64 function #

func length_8_64(text []byte, sa []int64, numLMS int)

lookupAll method #

lookupAll returns a slice into the matching region of the index. The runtime is O(log(N)*len(s)).

func (x *Index) lookupAll(s []byte) ints

map_32 function #

map_32 maps the LMS-substrings in text to their new IDs, producing the subproblem for the recursion. The mapping itself was mostly applied by assignID_8_32: sa[i] is either 0, the ID for the LMS-substring at index 2*i, or the ID for the LMS-substring at index 2*i+1. To produce the subproblem we need only remove the zeros and change ID into ID-1 (our IDs start at 1, but text chars start at 0). map_32 packs the result, which is the input to the recursion, into the top of sa, so that the recursion result can be stored in the bottom of sa, which sets up for expand_8_32 well.

func map_32(sa []int32, numLMS int)

map_64 function #

func map_64(sa []int64, numLMS int)

placeLMS_32 function #

func placeLMS_32(text []int32, sa []int32, freq []int32, bucket []int32) int

placeLMS_64 function #

func placeLMS_64(text []int64, sa []int64, freq []int64, bucket []int64) int

placeLMS_8_32 function #

placeLMS_8_32 places into sa the indexes of the final characters of the LMS substrings of text, sorted into the rightmost ends of their correct buckets in the suffix array. The imaginary sentinel character at the end of the text is the final character of the final LMS substring, but there is no bucket for the imaginary sentinel character, which has a smaller value than any real character. The caller must therefore pretend that sa[-1] == len(text). The text indexes of LMS-substring characters are always ≥ 1 (the first LMS-substring must be preceded by one or more L-type characters that are not part of any LMS-substring), so using 0 as a “not present” suffix array entry is safe, both in this function and in most later functions (until induceL_8_32 below).

func placeLMS_8_32(text []byte, sa []int32, freq []int32, bucket []int32) int

placeLMS_8_64 function #

func placeLMS_8_64(text []byte, sa []int64, freq []int64, bucket []int64) int

readInt function #

readInt reads an int x from r using buf to buffer the read and returns x.

func readInt(r io.Reader, buf []byte) (int64, error)

readSlice function #

readSlice reads data[:n] from r and returns n. It uses buf to buffer the read.

func readSlice(r io.Reader, buf []byte, data ints) (n int, err error)

recurse_32 function #

recurse_32 calls sais_32 recursively to solve the subproblem we've built. The subproblem is at the right end of sa, the suffix array result will be written at the left end of sa, and the middle of sa is available for use as temporary frequency and bucket storage.

func recurse_32(sa []int32, oldTmp []int32, numLMS int, maxID int)

recurse_64 function #

func recurse_64(sa []int64, oldTmp []int64, numLMS int, maxID int)

sais_32 function #

func sais_32(text []int32, textMax int, sa []int32, tmp []int32)

sais_64 function #

func sais_64(text []int64, textMax int, sa []int64, tmp []int64)

sais_8_32 function #

sais_8_32 computes the suffix array of text. The text must contain only values in [0, textMax). The suffix array is stored in sa, which the caller must ensure is already zeroed. The caller must also provide temporary space tmp with len(tmp) ≥ textMax. If len(tmp) ≥ 2*textMax then the algorithm runs a little faster. If sais_8_32 modifies tmp, it sets tmp[0] = -1 on return.

func sais_8_32(text []byte, textMax int, sa []int32, tmp []int32)

sais_8_64 function #

func sais_8_64(text []byte, textMax int, sa []int64, tmp []int64)

set method #

func (a *ints) set(i int, v int64)

slice method #

func (a *ints) slice(i int, j int) ints

text_32 function #

text_32 returns the suffix array for the input text. It requires that len(text) fit in an int32 and that the caller zero sa.

func text_32(text []byte, sa []int32)

text_64 function #

func text_64(text []byte, sa []int64)

unmap_32 function #

func unmap_32(text []int32, sa []int32, numLMS int)

unmap_64 function #

func unmap_64(text []int64, sa []int64, numLMS int)

unmap_8_32 function #

unmap_8_32 unmaps the subproblem back to the original. sa[:numLMS] is the LMS-substring numbers, which don't matter much anymore. sa[len(sa)-numLMS:] is the sorted list of those LMS-substring numbers. The key part is that if the list says K that means the K'th substring. We can replace sa[:numLMS] with the indexes of the LMS-substrings. Then if the list says K it really means sa[K]. Having mapped the list back to LMS-substring indexes, we can place those into the right buckets.

func unmap_8_32(text []byte, sa []int32, numLMS int)

unmap_8_64 function #

func unmap_8_64(text []byte, sa []int64, numLMS int)

writeInt function #

writeInt writes an int x to w using buf to buffer the write.

func writeInt(w io.Writer, buf []byte, x int) error

writeSlice function #

writeSlice writes data[:n] to w and returns n. It uses buf to buffer the write.

func writeSlice(w io.Writer, buf []byte, data ints) (n int, err error)

Generated with Arrow