atris commented on a change in pull request #7405: URL: https://github.com/apache/pinot/pull/7405#discussion_r716411483
########## File path: pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/nativefst/ImmutableFST.java ########## @@ -0,0 +1,406 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.utils.nativefst; + +import java.io.DataInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.EnumSet; +import java.util.Map; +import java.util.Set; +import org.apache.pinot.segment.local.io.readerwriter.PinotDataBufferMemoryManager; +import org.apache.pinot.segment.local.realtime.impl.dictionary.OffHeapMutableBytesStore; +import org.apache.pinot.spi.utils.Pair; + + +/** + * FST binary format implementation + * + * <p> + * This version indicates the dictionary was built with these flags: + * {@link FSTFlags#FLEXIBLE}, {@link FSTFlags#STOPBIT} and + * {@link FSTFlags#NEXTBIT}. The internal representation of the FST must + * therefore follow this description (please note this format describes only a + * single transition (arc), not the entire dictionary file). + * + * <pre> + * ---- this node header present only if automaton was compiled with NUMBERS option. + * Byte + * +-+-+-+-+-+-+-+-+\ + * 0 | | | | | | | | | \ LSB + * +-+-+-+-+-+-+-+-+ + + * 1 | | | | | | | | | | number of strings recognized + * +-+-+-+-+-+-+-+-+ +----- by the automaton starting + * : : : : : : : : : | from this node. + * +-+-+-+-+-+-+-+-+ + + * ctl-1 | | | | | | | | | / MSB + * +-+-+-+-+-+-+-+-+/ + * + * ---- remaining part of the node + * Length of output symbols dictionary -- Integer + * <Arc ID, Output Symbol> + * <Arc ID, Output Symbol> + * <Arc ID, Output Symbol> + * . + * . + * . + * <Arc ID, Output Symbol> (Length) + * + * Byte + * +-+-+-+-+-+-+-+-+\ + * 0 | | | | | | | | | +------ label + * +-+-+-+-+-+-+-+-+/ + * + * +------------- node pointed to is next + * | +----------- the last arc of the node + * | | +--------- the arc is final + * | | | + * +-----------+ + * | | | | | + * ___+___ | | | | + * / \ | | | | + * MSB LSB | + * 7 6 5 4 3 2 1 0 | + * +-+-+-+-+-+-+-+-+ | + * 1 | | | | | | | | | \ \ + * +-+-+-+-+-+-+-+-+ \ \ LSB + * +-+-+-+-+-+-+-+-+ + + * 2 | | | | | | | | | | + * +-+-+-+-+-+-+-+-+ | + * 3 | | | | | | | | | +----- target node address (in bytes) + * +-+-+-+-+-+-+-+-+ | (not present except for the byte + * : : : : : : : : : | with flags if the node pointed to + * +-+-+-+-+-+-+-+-+ + is next) + * gtl | | | | | | | | | / MSB + * +-+-+-+-+-+-+-+-+ / + * gtl+1 (gtl = gotoLength) + * </pre> + */ +public final class ImmutableFST extends FST { + /** + * Default filler byte. + */ + public final static byte DEFAULT_FILLER = '_'; + + /** + * Default annotation byte. + */ + public final static byte DEFAULT_ANNOTATION = '+'; + + /** + * Automaton version as in the file header. + */ + public static final byte VERSION = 5; + + /** + * Bit indicating that an arc corresponds to the last character of a sequence + * available when building the automaton. + */ + public static final int BIT_FINAL_ARC = 1 << 0; + + /** + * Bit indicating that an arc is the last one of the node's list and the + * following one belongs to another node. + */ + public static final int BIT_LAST_ARC = 1 << 1; + + /** + * Bit indicating that the target node of this arc follows it in the + * compressed automaton structure (no goto field). + */ + public static final int BIT_TARGET_NEXT = 1 << 2; + + /** + * An offset in the arc structure, where the address and flags field begins. + * In version 5 of FST automata, this value is constant (1, skip label). + */ + public final static int ADDRESS_OFFSET = 1; + + private static final int PER_BUFFER_SIZE = 16; + + /** + * An array of bytes with the internal representation of the automaton. Please + * see the documentation of this class for more information on how this + * structure is organized. + */ + public final OffHeapMutableBytesStore _mutableBytesStore; + /** + * The length of the node header structure (if the automaton was compiled with + * <code>NUMBERS</code> option). Otherwise zero. + */ + public final int _nodeDataLength; + /** + * Number of bytes each address takes in full, expanded form (goto length). + */ + public final int _gotoLength; + /** Filler character. */ + public final byte _filler; + /** Annotation character. */ + public final byte _annotation; + public Map<Integer, Integer> _outputSymbols; + /** + * Flags for this automaton version. + */ + private Set<FSTFlags> _flags; + + /** + * Read and wrap a binary automaton in FST version 5. + */ + ImmutableFST(InputStream stream, boolean hasOutputSymbols, PinotDataBufferMemoryManager memoryManager) + throws IOException { + DataInputStream in = new DataInputStream(stream); + + this._filler = in.readByte(); + this._annotation = in.readByte(); + final byte hgtl = in.readByte(); + + _mutableBytesStore = new OffHeapMutableBytesStore(memoryManager, "ImmutableFST"); + + /* + * Determine if the automaton was compiled with NUMBERS. If so, modify + * ctl and goto fields accordingly. + */ + _flags = EnumSet.of(FSTFlags.FLEXIBLE, FSTFlags.STOPBIT, FSTFlags.NEXTBIT); + if ((hgtl & 0xf0) != 0) { + _flags.add(FSTFlags.NUMBERS); + } + + _flags = Collections.unmodifiableSet(_flags); + + this._nodeDataLength = (hgtl >>> 4) & 0x0f; + this._gotoLength = hgtl & 0x0f; + + if (hasOutputSymbols) { + final int outputSymbolsLength = in.readInt(); + byte[] outputSymbolsBuffer = readRemaining(in, outputSymbolsLength); + + if (outputSymbolsBuffer.length > 0) { + String outputSymbolsSerialized = new String(outputSymbolsBuffer); + + _outputSymbols = buildMap(outputSymbolsSerialized); + } + } + + readRemaining(in); + } + + protected final void readRemaining(InputStream in) + throws IOException { + byte[] buffer = new byte[PER_BUFFER_SIZE]; + while ((in.read(buffer)) >= 0) { + _mutableBytesStore.add(buffer); + } + } + + /** + * Returns the start node of this automaton. + */ + @Override + public int getRootNode() { + // Skip dummy node marking terminating state. + final int epsilonNode = skipArc(getFirstArc(0)); + + // And follow the epsilon node's first (and only) arc. + return getDestinationNodeOffset(getFirstArc(epsilonNode)); + } + + /** + * {@inheritDoc} + */ + @Override + public final int getFirstArc(int node) { + return _nodeDataLength + node; + } + + /** + * {@inheritDoc} + */ + @Override + public final int getNextArc(int arc) { + if (isArcLast(arc)) { + return 0; + } else { + return skipArc(arc); + } + } + + /** + * {@inheritDoc} + */ + @Override + public int getArc(int node, byte label) { + for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { + if (getArcLabel(arc) == label) { + return arc; + } + } + + // An arc labeled with "label" not found. + return 0; + } + + /** + * {@inheritDoc} + */ + @Override + public int getEndNode(int arc) { + final int nodeOffset = getDestinationNodeOffset(arc); + return nodeOffset; + } + + /** + * {@inheritDoc} + */ + @Override + public byte getArcLabel(int arc) { + return getByte(arc, 0); + } + + @Override + public int getOutputSymbol(int arc) { + return _outputSymbols.get(arc); + } + + /** + * {@inheritDoc} + */ + @Override + public boolean isArcFinal(int arc) { + return (getByte(arc, ADDRESS_OFFSET) & BIT_FINAL_ARC) != 0; + } + + /** + * {@inheritDoc} + */ + @Override + public boolean isArcTerminal(int arc) { + return (0 == getDestinationNodeOffset(arc)); + } + + /** + * Returns the number encoded at the given node. The number equals the count + * of the set of suffixes reachable from <code>node</code> (called its right + * language). + */ + @Override + public int getRightLanguageCount(int node) { + assert getFlags().contains(FSTFlags.NUMBERS) : "This FST was not compiled with NUMBERS."; + return decodeFromBytes(node, _nodeDataLength); + } + + /** + * {@inheritDoc} + * + * <p> + * For this automaton version, an additional {@link FSTFlags#NUMBERS} flag may + * be set to indicate the automaton contains extra fields for each node. + * </p> + */ + @Override + public Set<FSTFlags> getFlags() { + return _flags; + } + + /** + * Returns <code>true</code> if this arc has <code>NEXT</code> bit set. + * + * @see #BIT_LAST_ARC + * @param arc The node's arc identifier. + * @return Returns true if the argument is the last arc of a node. + */ + public boolean isArcLast(int arc) { + return (getByte(arc, ADDRESS_OFFSET) & BIT_LAST_ARC) != 0; + } + + /** + * @see #BIT_TARGET_NEXT + * @param arc The node's arc identifier. + * @return Returns true if {@link #BIT_TARGET_NEXT} is set for this arc. + */ + public boolean isNextSet(int arc) { + + return (getByte(arc, ADDRESS_OFFSET) & BIT_TARGET_NEXT) != 0; + } + + /** + * Returns an n-byte integer encoded in byte-packed representation. + */ + final int decodeFromBytes(final int start, final int n) { + int r = 0; + + for (int i = n; --i >= 0; ) { + Pair<Integer, Integer> offheapOffsets = getOffheapOffsets(start + i); + byte[] inputData = _mutableBytesStore.get(offheapOffsets.getFirst()); + + r = r << 8 | (inputData[offheapOffsets.getSecond()] & 0xff); + } + return r; + } + + /** + * Returns the address of the node pointed to by this arc. + */ + final int getDestinationNodeOffset(int arc) { + if (isNextSet(arc)) { + /* The destination node follows this arc in the array. */ + return skipArc(arc); + } else { + /* + * The destination node address has to be extracted from the arc's + * goto field. + */ + return decodeFromBytes(arc + ADDRESS_OFFSET, _gotoLength) >>> 3; + } + } + + /** + * Read the arc's layout and skip as many bytes, as needed. + */ + private int skipArc(int offset) { + return offset + (isNextSet(offset) ? 1 + 1 /* label + flags */ : 1 + _gotoLength /* label + flags/address */); + } + + private byte getByte(int seek, int offset) { + Pair<Integer, Integer> offheapOffsets = getOffheapOffsets(seek); + + int fooArc = offheapOffsets.getFirst(); + byte[] retVal = _mutableBytesStore.get((fooArc)); + + int barArc = offheapOffsets.getSecond(); + int target = barArc + offset; + + if (target >= PER_BUFFER_SIZE) { + retVal = _mutableBytesStore.get(fooArc + 1); + target = target - PER_BUFFER_SIZE; + } + + return retVal[target]; + } + + private Pair<Integer, Integer> getOffheapOffsets(int seek) { + int fooArc = seek >= PER_BUFFER_SIZE ? seek / PER_BUFFER_SIZE : 0; + int barArc = seek >= PER_BUFFER_SIZE ? seek - ((fooArc) * PER_BUFFER_SIZE) : seek; + + assert fooArc < _mutableBytesStore.getNumValues(); + assert barArc < PER_BUFFER_SIZE; + + return new Pair<>(fooArc, barArc); + } Review comment: Since barArc depends on fooArc, is inlining possible here? ########## File path: pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/nativefst/ByteSequenceIterator.java ########## @@ -0,0 +1,180 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.utils.nativefst; + +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.Iterator; +import java.util.NoSuchElementException; + + +/** + * An iterator that traverses the right language of a given node (all sequences + * reachable from a given node). + */ +public final class ByteSequenceIterator implements Iterator<ByteBuffer> { + /** + * Default expected depth of the recursion stack (estimated longest sequence + * in the automaton). Buffers expand by the same value if exceeded. + */ + private final static int EXPECTED_MAX_STATES = 15; + + /** The FST to which this iterator belongs. */ + private final FST _fst; + + /** An internal cache for the next element in the FST */ + private ByteBuffer _nextElement; + + /** + * A buffer for the current sequence of bytes from the current node to the + * root. + */ + private byte[] _buffer = new byte[EXPECTED_MAX_STATES]; + + /** Reusable byte buffer wrapper around {@link #_buffer}. */ + private ByteBuffer _bufferWrapper = ByteBuffer.wrap(_buffer); + + /** An arc stack for DFS when processing the automaton. */ + private int[] _arcs = new int[EXPECTED_MAX_STATES]; + + /** Current processing depth in {@link #_arcs}. */ + private int _position; + + /** + * Create an instance of the iterator for a given node. + * @param fst The automaton to iterate over. + * @param node The starting node's identifier (can be the {@link FST#getRootNode()}). + */ + public ByteSequenceIterator(FST fst, int node) { + this._fst = fst; + + if (fst.getFirstArc(node) != 0) { + restartFrom(node); + } + } + + /** + * Restart walking from <code>node</code>. Allows iterator reuse. + * + * @param node Restart the iterator from <code>node</code>. + * @return Returns <code>this</code> for call chaining. + */ + public ByteSequenceIterator restartFrom(int node) { + _position = 0; + _bufferWrapper.clear(); + _nextElement = null; + + pushNode(node); + return this; + } + + /** Returns <code>true</code> if there are still elements in this iterator. */ + @Override + public boolean hasNext() { + if (_nextElement == null) { + _nextElement = advance(); + } + + return _nextElement != null; + } + + /** + * @return Returns a {@link ByteBuffer} with the sequence corresponding to the + * next final state in the automaton. + */ + @Override + public ByteBuffer next() { + if (_nextElement != null) { + final ByteBuffer cache = _nextElement; + _nextElement = null; + return cache; + } else { + final ByteBuffer cache = advance(); + if (cache == null) { + throw new NoSuchElementException(); + } + return cache; + } + } + + /** + * Advances to the next available final state. + */ + private final ByteBuffer advance() { + if (_position == 0) { + return null; + } + + while (_position > 0) { + final int lastIndex = _position - 1; + final int arc = _arcs[lastIndex]; + + if (arc == 0) { + // Remove the current node from the queue. + _position--; + continue; + } + + // Go to the next arc, but leave it on the stack + // so that we keep the recursion depth level accurate. + _arcs[lastIndex] = _fst.getNextArc(arc); + + // Expand buffer if needed. + final int bufferLength = this._buffer.length; + if (lastIndex >= bufferLength) { + this._buffer = Arrays.copyOf(_buffer, bufferLength + EXPECTED_MAX_STATES); + this._bufferWrapper = ByteBuffer.wrap(_buffer); + } + _buffer[lastIndex] = _fst.getArcLabel(arc); + + if (!_fst.isArcTerminal(arc)) { + // Recursively descend into the arc's node. + pushNode(_fst.getEndNode(arc)); + } + + if (_fst.isArcFinal(arc)) { + _bufferWrapper.clear(); + _bufferWrapper.limit(lastIndex + 1); + return _bufferWrapper; + } + } + + return null; + } + + /** + * Not implemented in this iterator. + */ + @Override + public void remove() { + throw new UnsupportedOperationException("Read-only iterator."); + } + + /** + * Descends to a given node, adds its arcs to the stack to be traversed. + */ + private void pushNode(int node) { + // Expand buffers if needed. + if (_position == _arcs.length) { + _arcs = Arrays.copyOf(_arcs, _arcs.length + EXPECTED_MAX_STATES); + } + + _arcs[_position++] = _fst.getFirstArc(node); + } +} Review comment: Strange. My IDE shows newlines at the end of all files, and I think Checkstyle checks the same as well? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
