Imports #
"math"
"math/bits"
"sort"
"bufio"
"io"
"math/bits"
"strconv"
"sync"
"errors"
"fmt"
"io"
"math"
"math"
"io"
"math"
"math/bits"
"sort"
"bufio"
"io"
"math/bits"
"strconv"
"sync"
"errors"
"fmt"
"io"
"math"
"math"
"io"
const BestCompression = 9
const BestSpeed = 1
const DefaultCompression = *ast.UnaryExpr
HuffmanOnly disables Lempel-Ziv match searching and only performs Huffman entropy encoding. This mode is useful in compressing data that has already been compressed with an LZ style algorithm (e.g. Snappy or LZ4) that lacks an entropy encoder. Compression gains are achieved when certain bytes in the input stream occur more frequently than others. Note that HuffmanOnly produces a compressed output that is RFC 1951 compliant. That is, any valid DEFLATE decompressor will continue to be able to decompress this output.
const HuffmanOnly = *ast.UnaryExpr
const NoCompression = 0
const badCode = 255
The LZ77 step produces a sequence of literal tokens and
const baseMatchLength = 3
const baseMatchOffset = 1
bufferFlushSize indicates the buffer size after which bytes are flushed to the writer. Should preferably be a multiple of 6, since we accumulate 6 bytes between writes to the buffer.
const bufferFlushSize = 240
Reset the buffer offset when reaching this. Offsets are stored between blocks as int32 values. Since the offset we are checking against is at the beginning of the buffer, we need to subtract the current and input buffer to not risk overflowing the int32.
const bufferReset = *ast.BinaryExpr
bufferSize is the actual output byte buffer size. It must have additional headroom for a flush which can contain up to 8 bytes.
const bufferSize = *ast.BinaryExpr
var codeOrder = [...]int{...}
The number of codegen codes.
const codegenCodeCount = 19
The odd order in which the codegen code sizes are written.
var codegenOrder = []uint32{...}
The special code used to mark the end of a block.
const endBlockMarker = 256
var errWriterClosed = *ast.CallExpr
var fixedHuffmanDecoder huffmanDecoder
var fixedLiteralEncoding *huffmanEncoder = *ast.CallExpr
var fixedOffsetEncoding *huffmanEncoder = *ast.CallExpr
Initialize the fixedHuffmanDecoder only once upon first use.
var fixedOnce sync.Once
const hashBits = 17
const hashMask = *ast.BinaryExpr
const hashSize = *ast.BinaryExpr
const hashmul = 0x1e35a7bd
huffOffset is a static offset encoder used for huffman only encoding. It can be reused since we will not be encoding offset values.
var huffOffset *huffmanEncoder
const huffmanChunkBits = 9
const huffmanCountMask = 15
const huffmanNumChunks = *ast.BinaryExpr
const huffmanValueShift = 4
These constants are defined by the Snappy implementation so that its assembly implementation can fast-path some 16-bytes-at-a-time copies. They aren't necessary in the pure Go implementation, as we don't use those same optimizations, but using the same thresholds doesn't really hurt.
const inputMargin = *ast.BinaryExpr
The length indicated by length code X - LENGTH_CODES_START.
var lengthBase = []uint32{...}
The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH) is lengthCodes[length - MIN_MATCH_LENGTH]
var lengthCodes = [...]uint32{...}
The first length code.
const lengthCodesStart = 257
The number of extra bits needed by length code X - LENGTH_CODES_START.
var lengthExtraBits = []int8{...}
2 bits: type 0 = literal 1=EOF 2=Match 3=Unused 8 bits: xlength = length - MIN_MATCH_LENGTH 22 bits xoffset = offset - MIN_OFFSET_SIZE, or literal
const lengthShift = 22
var levels = []compressionLevel{...}
const literalType = *ast.BinaryExpr
const logWindowSize = 15
const matchType = *ast.BinaryExpr
const maxBitsLimit = 16
const maxCodeLen = 16
The maximum number of tokens we put into a single flate block, just to stop things from getting too large.
const maxFlateBlockTokens = *ast.BinaryExpr
const maxHashOffset = *ast.BinaryExpr
const maxMatchLength = 258
const maxMatchOffset = *ast.BinaryExpr
const maxNumDist = 30
The next three numbers come from the RFC section 3.2.7, with the additional proviso in section 3.2.5 which implies that distance codes 30 and 31 should never occur in compressed data.
const maxNumLit = 286
const maxStoreBlockSize = 65535
const minMatchLength = 4
These constants are defined by the Snappy implementation so that its assembly implementation can fast-path some 16-bytes-at-a-time copies. They aren't necessary in the pure Go implementation, as we don't use those same optimizations, but using the same thresholds doesn't really hurt.
const minNonLiteralBlockSize = *ast.BinaryExpr
const numCodes = 19
var offsetBase = []uint32{...}
The largest offset code.
const offsetCodeCount = 30
var offsetCodes = [...]uint32{...}
offset code word extra bits.
var offsetExtraBits = []int8{...}
const offsetMask = *ast.BinaryExpr
const skipNever = math.MaxInt32
const tableBits = 14
const tableMask = *ast.BinaryExpr
const tableShift = *ast.BinaryExpr
const tableSize = *ast.BinaryExpr
const typeMask = *ast.BinaryExpr
const windowMask = *ast.BinaryExpr
const windowSize = *ast.BinaryExpr
A CorruptInputError reports the presence of corrupt input at a given offset.
type CorruptInputError int64
An InternalError reports an error in the flate code itself.
type InternalError string
type byFreq []literalNode
type byLiteral []literalNode
type token uint32
The actual read interface needed by [NewReader]. If the passed in io.Reader does not also have ReadByte, the [NewReader] will introduce its own buffering.
type Reader interface {
io.Reader
io.ByteReader
}
Resetter resets a ReadCloser returned by [NewReader] or [NewReaderDict] to switch to a new underlying [Reader]. This permits reusing a ReadCloser instead of allocating a new one.
type Resetter interface {
Reset(r io.Reader, dict []byte) error
}
A ReadError reports an error encountered while reading input. Deprecated: No longer returned.
type ReadError struct {
Offset int64
Err error
}
A WriteError reports an error encountered while writing output. Deprecated: No longer returned.
type WriteError struct {
Offset int64
Err error
}
A Writer takes data written to it and writes the compressed form of that data to an underlying writer (see [NewWriter]).
type Writer struct {
d compressor
dict []byte
}
type compressionLevel struct {
level int
good int
lazy int
nice int
chain int
fastSkipHashing int
}
type compressor struct {
compressionLevel
w *huffmanBitWriter
bulkHasher func([]byte, []uint32)
fill func(*compressor, []byte) int
step func(*compressor)
bestSpeed *deflateFast
chainHead int
hashHead [hashSize]uint32
hashPrev [windowSize]uint32
hashOffset int
index int
window []byte
windowEnd int
blockStart int
byteAvailable bool
sync bool
tokens []token
length int
offset int
maxInsertIndex int
err error
hashMatch [*ast.BinaryExpr]uint32
}
Decompress state.
type decompressor struct {
r Reader
rBuf *bufio.Reader
roffset int64
b uint32
nb uint
h1 huffmanDecoder
h2 huffmanDecoder
bits *[*ast.BinaryExpr]int
codebits *[numCodes]int
dict dictDecoder
buf [4]byte
step func(*decompressor)
stepState int
final bool
err error
toRead []byte
hl *huffmanDecoder
hd *huffmanDecoder
copyLen int
copyDist int
}
deflateFast maintains the table for matches, and the previous byte block for cross block matching.
type deflateFast struct {
table [tableSize]tableEntry
prev []byte
cur int32
}
dictDecoder implements the LZ77 sliding dictionary as used in decompression. LZ77 decompresses data through sequences of two forms of commands: - Literal insertions: Runs of one or more symbols are inserted into the data stream as is. This is accomplished through the writeByte method for a single symbol, or combinations of writeSlice/writeMark for multiple symbols. Any valid stream must start with a literal insertion if no preset dictionary is used. - Backward copies: Runs of one or more symbols are copied from previously emitted data. Backward copies come as the tuple (dist, length) where dist determines how far back in the stream to copy from and length determines how many bytes to copy. Note that it is valid for the length to be greater than the distance. Since LZ77 uses forward copies, that situation is used to perform a form of run-length encoding on repeated runs of symbols. The writeCopy and tryWriteCopy are used to implement this command. For performance reasons, this implementation performs little to no sanity checks about the arguments. As such, the invariants documented for each method call must be respected.
type dictDecoder struct {
hist []byte
wrPos int
rdPos int
full bool
}
type dictWriter struct {
w io.Writer
}
hcode is a huffman code with a bit code and bit length.
type hcode struct {
code uint16
len uint16
}
type huffmanBitWriter struct {
writer io.Writer
bits uint64
nbits uint
bytes [bufferSize]byte
codegenFreq [codegenCodeCount]int32
nbytes int
literalFreq []int32
offsetFreq []int32
codegen []uint8
literalEncoding *huffmanEncoder
offsetEncoding *huffmanEncoder
codegenEncoding *huffmanEncoder
err error
}
type huffmanDecoder struct {
min int
chunks [huffmanNumChunks]uint32
links [][]uint32
linkMask uint32
}
type huffmanEncoder struct {
codes []hcode
freqcache []literalNode
bitCount [17]int32
lns byLiteral
lfs byFreq
}
A levelInfo describes the state of the constructed tree for a given depth.
type levelInfo struct {
level int32
lastFreq int32
nextCharFreq int32
nextPairFreq int32
needed int32
}
type literalNode struct {
literal uint16
freq int32
}
type tableEntry struct {
val uint32
offset int32
}
func (f *decompressor) Close() error
Close flushes and closes the writer.
func (w *Writer) Close() error
func (e *WriteError) Error() string
func (e *ReadError) Error() string
func (e InternalError) Error() string
func (e CorruptInputError) Error() string
Flush flushes any pending data to the underlying writer. It is useful mainly in compressed network protocols, to ensure that a remote reader has enough data to reconstruct a packet. Flush does not return until the data has been written. Calling Flush when there is no pending data still causes the [Writer] to emit a sync marker of at least 4 bytes. If the underlying writer returns an error, Flush returns that error. In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH.
func (w *Writer) Flush() error
func (s byFreq) Len() int
func (s byLiteral) Len() int
func (s byLiteral) Less(i int, j int) bool
func (s byFreq) Less(i int, j int) bool
NewReader returns a new ReadCloser that can be used to read the uncompressed version of r. If r does not also implement [io.ByteReader], the decompressor may read more data than necessary from r. The reader returns [io.EOF] after the final block in the DEFLATE stream has been encountered. Any trailing data after the final block is ignored. The [io.ReadCloser] returned by NewReader also implements [Resetter].
func NewReader(r io.Reader) io.ReadCloser
NewReaderDict is like [NewReader] but initializes the reader with a preset dictionary. The returned [Reader] behaves as if the uncompressed data stream started with the given dictionary, which has already been read. NewReaderDict is typically used to read data compressed by NewWriterDict. The ReadCloser returned by NewReaderDict also implements [Resetter].
func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser
NewWriter returns a new [Writer] compressing data at the given level. Following zlib, levels range from 1 ([BestSpeed]) to 9 ([BestCompression]); higher levels typically run slower but compress more. Level 0 ([NoCompression]) does not attempt any compression; it only adds the necessary DEFLATE framing. Level -1 ([DefaultCompression]) uses the default compression level. Level -2 ([HuffmanOnly]) will use Huffman compression only, giving a very fast compression for all types of input, but sacrificing considerable compression efficiency. If level is in the range [-2, 9] then the error returned will be nil. Otherwise the error returned will be non-nil.
func NewWriter(w io.Writer, level int) (*Writer, error)
NewWriterDict is like [NewWriter] but initializes the new [Writer] with a preset dictionary. The returned [Writer] behaves as if the dictionary had been written to it without producing any compressed output. The compressed data written to w can only be decompressed by a [Reader] initialized with the same dictionary.
func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error)
func (f *decompressor) Read(b []byte) (int, error)
func (f *decompressor) Reset(r io.Reader, dict []byte) error
Reset discards the writer's state and makes it equivalent to the result of [NewWriter] or [NewWriterDict] called with dst and w's level and dictionary.
func (w *Writer) Reset(dst io.Writer)
func (s byLiteral) Swap(i int, j int)
func (s byFreq) Swap(i int, j int)
func (w *dictWriter) Write(b []byte) (n int, err error)
Write writes data to w, which will eventually write the compressed form of data to its underlying writer.
func (w *Writer) Write(data []byte) (n int, err error)
Look at the leaves and assign them a bit count and an encoding as specified in RFC 1951 3.2.2
func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalNode)
availRead reports the number of bytes that can be flushed by readFlush.
func (dd *dictDecoder) availRead() int
availWrite reports the available amount of output buffer space.
func (dd *dictDecoder) availWrite() int
bitCounts computes the number of literals assigned to each bit size in the Huffman encoding. It is only called when list.length >= 3. The cases of 0, 1, and 2 literals are handled by special case code. list is an array of the literals with non-zero frequencies and their associated frequencies. The array is in order of increasing frequency and has as its last element a special element with frequency MaxInt32. maxBits is the maximum number of bits that should be used to encode any literal. It must be less than 16. bitCounts returns an integer slice in which slice[i] indicates the number of literals that should be encoded in i bits.
func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32
func (h *huffmanEncoder) bitLength(freq []int32) int
bulkHash4 will compute hashes using the same algorithm as hash4.
func bulkHash4(b []byte, dst []uint32)
func (d *compressor) close() error
copyData copies f.copyLen bytes from the underlying reader into f.hist. It pauses for reads when f.hist is full.
func (f *decompressor) copyData()
Copy a single uncompressed data block from input to output.
func (f *decompressor) dataBlock()
func (d *compressor) deflate()
dynamicSize returns the size of dynamically encoded data in bits.
func (w *huffmanBitWriter) dynamicSize(litEnc *huffmanEncoder, offEnc *huffmanEncoder, extraBits int) (size int, numCodegens int)
func emitLiteral(dst []token, lit []byte) []token
encSpeed will compress and store the currently added data, if enough has been accumulated or we at the end of the stream. Any error that occurred will be in d.err
func (d *compressor) encSpeed()
encode encodes a block given in src and appends tokens to dst and returns the result.
func (e *deflateFast) encode(dst []token, src []byte) []token
func (d *compressor) fillDeflate(b []byte) int
func (d *compressor) fillStore(b []byte) int
fillWindow will fill the current window with the supplied dictionary and calculate all hashes. This is much faster than doing a full encode. Should only be used after a reset.
func (d *compressor) fillWindow(b []byte)
Try to find a match starting at index whose length is greater than prevSize. We only look at chainCount possibilities before giving up.
func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length int, offset int, ok bool)
func (f *decompressor) finishBlock()
func fixedHuffmanDecoderInit()
fixedSize returns the size of dynamically encoded data in bits.
func (w *huffmanBitWriter) fixedSize(extraBits int) int
func (w *huffmanBitWriter) flush()
Update this Huffman Code object to be the minimum code for the specified frequency count. freq is an array of frequencies, in which freq[i] gives the frequency of literal i. maxBits The maximum number of bits to use for any literal.
func (h *huffmanEncoder) generate(freq []int32, maxBits int32)
RFC 1951 3.2.7 specifies a special run-length encoding for specifying the literal and offset lengths arrays (which are concatenated into a single array). This method generates that run-length encoding. The result is written into the codegen array, and the frequencies of each code is written into the codegenFreq array. Codes 0-15 are single byte codes. Codes 16-18 are followed by additional information. Code badCode is an end marker numLiterals The number of literals in literalEncoding numOffsets The number of offsets in offsetEncoding litenc, offenc The literal and offset encoder to use
func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc *huffmanEncoder, offEnc *huffmanEncoder)
Generates a HuffmanCode corresponding to the fixed literal table.
func generateFixedLiteralEncoding() *huffmanEncoder
func generateFixedOffsetEncoding() *huffmanEncoder
func hash(u uint32) uint32
hash4 returns a hash representation of the first 4 bytes of the supplied slice. The caller must ensure that len(b) >= 4.
func hash4(b []byte) uint32
histSize reports the total amount of historical data in the dictionary.
func (dd *dictDecoder) histSize() int
histogram accumulates a histogram of b in h. len(h) must be >= 256, and h's elements must be all zeroes.
func histogram(b []byte, h []int32)
Read the next Huffman-encoded symbol from f according to h.
func (f *decompressor) huffSym(h *huffmanDecoder) (int, error)
Decode a single Huffman block from f. hl and hd are the Huffman states for the lit/length values and the distance values, respectively. If hd == nil, using the fixed distance encoding associated with fixed Huffman blocks.
func (f *decompressor) huffmanBlock()
indexTokens indexes a slice of tokens, and updates literalFreq and offsetFreq, and generates literalEncoding and offsetEncoding. The number of literal and offset tokens is returned.
func (w *huffmanBitWriter) indexTokens(tokens []token) (numLiterals int, numOffsets int)
func (d *compressor) init(w io.Writer, level int) (err error)
func init()
Initialize Huffman decoding tables from array of code lengths. Following this function, h is guaranteed to be initialized into a complete tree (i.e., neither over-subscribed nor under-subscribed). The exception is a degenerate case where the tree has only a single symbol with length 1. Empty trees are permitted.
func (h *huffmanDecoder) init(lengths []int) bool
init initializes dictDecoder to have a sliding window dictionary of the given size. If a preset dict is provided, it will initialize the dictionary with the contents of dict.
func (dd *dictDecoder) init(size int, dict []byte)
func (d *compressor) initDeflate()
func (t token) length() uint32
func lengthCode(len uint32) uint32
Returns the literal of a literal token.
func (t token) literal() uint32
Convert a literal into a literal token.
func literalToken(literal uint32) token
func load32(b []byte, i int32) uint32
func load64(b []byte, i int32) uint64
func (f *decompressor) makeReader(r io.Reader)
matchLen returns the match length between src[s:] and src[t:]. t can be negative to indicate the match is starting in e.prev. We assume that src[s-4:s] and src[t-4:t] already match.
func (e *deflateFast) matchLen(s int32, t int32, src []byte) int32
matchLen returns the number of matching bytes in a and b up to length 'max'. Both slices must be at least 'max' bytes in size.
func matchLen(a []byte, b []byte, max int) int
Convert a < xlength, xoffset > pair into a match token.
func matchToken(xlength uint32, xoffset uint32) token
func maxNode() literalNode
func (f *decompressor) moreBits() error
func newDeflateFast() *deflateFast
func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter
func newHuffmanEncoder(size int) *huffmanEncoder
func (f *decompressor) nextBlock()
noEOF returns err, unless err == io.EOF, in which case it returns io.ErrUnexpectedEOF.
func noEOF(e error) error
Returns the extra offset of a match token.
func (t token) offset() uint32
Returns the offset code corresponding to a specific offset.
func offsetCode(off uint32) uint32
readFlush returns a slice of the historical buffer that is ready to be emitted to the user. The data returned by readFlush must be fully consumed before calling any other dictDecoder methods.
func (dd *dictDecoder) readFlush() []byte
func (f *decompressor) readHuffman() error
func (d *compressor) reset(w io.Writer)
Reset resets the encoding history. This ensures that no matches are made to the previous block.
func (e *deflateFast) reset()
func (w *huffmanBitWriter) reset(writer io.Writer)
func reverseBits(number uint16, bitLength byte) uint16
set sets the code and length of an hcode.
func (h *hcode) set(code uint16, length uint16)
shiftOffsets will shift down all match offset. This is only called in rare situations to prevent integer overflow. See https://golang.org/issue/18636 and https://github.com/golang/go/issues/34121.
func (e *deflateFast) shiftOffsets()
func (s *byLiteral) sort(a []literalNode)
func (s *byFreq) sort(a []literalNode)
func (d *compressor) store()
storeHuff compresses and stores the currently added data when the d.window is full or we are at the end of the stream. Any error that occurred will be in d.err
func (d *compressor) storeHuff()
storedSize calculates the stored size, including header. The function returns the size in bits and whether the block fits inside a single block.
func (w *huffmanBitWriter) storedSize(in []byte) (int, bool)
func (d *compressor) syncFlush() error
tryWriteCopy tries to copy a string at a given (distance, length) to the output. This specialized version is optimized for short distances. This method is designed to be inlined for performance reasons. This invariant must be kept: 0 < dist <= histSize()
func (dd *dictDecoder) tryWriteCopy(dist int, length int) int
func (d *compressor) write(b []byte) (n int, err error)
func (w *huffmanBitWriter) write(b []byte)
func (w *huffmanBitWriter) writeBits(b int32, nb uint)
func (d *compressor) writeBlock(tokens []token, index int) error
writeBlock will write a block of tokens with the smallest encoding. The original input can be supplied, and if the huffman encoded data is larger than the original bytes, the data will be written as a stored block. If the input is nil, the tokens will always be Huffman encoded.
func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte)
writeBlockDynamic encodes a block using a dynamic Huffman table. This should be used if the symbols used have a disproportionate histogram distribution. If input is supplied and the compression savings are below 1/16th of the input size the block is stored.
func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte)
writeBlockHuff encodes a block of bytes as either Huffman encoded literals or uncompressed bytes if the results only gains very little from compression.
func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte)
writeByte writes a single byte to the dictionary. This invariant must be kept: 0 < availWrite()
func (dd *dictDecoder) writeByte(c byte)
func (w *huffmanBitWriter) writeBytes(bytes []byte)
func (w *huffmanBitWriter) writeCode(c hcode)
writeCopy copies a string at a given (dist, length) to the output. This returns the number of bytes copied and may be less than the requested length if the available space in the output buffer is too small. This invariant must be kept: 0 < dist <= histSize()
func (dd *dictDecoder) writeCopy(dist int, length int) int
Write the header of a dynamic Huffman block to the output stream. numLiterals The number of literals specified in codegen numOffsets The number of offsets specified in codegen numCodegens The number of codegens used in codegen
func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool)
func (w *huffmanBitWriter) writeFixedHeader(isEof bool)
writeMark advances the writer pointer by cnt. This invariant must be kept: 0 <= cnt <= availWrite()
func (dd *dictDecoder) writeMark(cnt int)
writeSlice returns a slice of the available buffer to write data to. This invariant will be kept: len(s) <= availWrite()
func (dd *dictDecoder) writeSlice() []byte
func (d *compressor) writeStoredBlock(buf []byte) error
func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool)
writeTokens writes a slice of tokens to the output. codes for literal and offset encoding must be supplied.
func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes []hcode, oeCodes []hcode)
Generated with Arrow