Github user tokee commented on a diff in the pull request:
https://github.com/apache/lucene-solr/pull/525#discussion_r245993566
--- Diff: lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java ---
@@ -0,0 +1,601 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene80;
+
+import java.io.DataInput;
+import java.io.IOException;
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BitSetIterator;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.RoaringDocIdSet;
+
+/**
+ * Disk-based implementation of a {@link DocIdSetIterator} which can return
+ * the index of the current document, i.e. the ordinal of the current document
+ * among the list of documents that this iterator can return. This is useful
+ * to implement sparse doc values by only having to encode values for documents
+ * that actually have a value.
+ * <p>Implementation-wise, this {@link DocIdSetIterator} is inspired by
+ * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536}
+ * documents independently and picks between 3 encodings depending on the
+ * density of the range:<ul>
+ *   <li>{@code ALL} if the range contains 65536 documents exactly,
+ *   <li>{@code DENSE} if the range contains 4096 documents or more; in that
+ *       case documents are stored in a bit set,
+ *   <li>{@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are
+ *       stored in a {@link DataInput#readShort() short}.
+ * </ul>
+ * <p>Only ranges that contain at least one value are encoded.
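+ * <p>As an illustrative sketch (the names here are placeholders, not this
+ * class's actual code), the encoding choice follows from a block's cardinality:
+ * <pre>{@code
+ *   if (cardinality == 65536)      encoding = ALL;    // no payload needed
+ *   else if (cardinality > 4095)   encoding = DENSE;  // bit set of 1024 longs
+ *   else                           encoding = SPARSE; // one short per docID
+ * }</pre>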
+ * <p>This implementation uses 6 bytes per document in the worst-case, which happens
+ * in the case that all ranges contain exactly one document.
+ *
+ *
+ * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup
+ * structures are used: a lookup table for block offset and index, and a rank structure
+ * for DENSE block index lookups.
+ *
+ * The lookup table is an array of {@code int}-pairs, with a pair for each block. It allows
+ * for jumping directly to the block, as opposed to iterating forward from the current
+ * position one block at a time.
+ *
+ * Each int-pair entry consists of 2 logical parts:
+ *
+ * The first 32-bit int holds the index (number of set bits in the blocks) up to just before the
+ * wanted block. The maximum number of set bits is the maximum number of documents, which is < 2^31.
+ *
+ * The next int holds the offset in bytes into the underlying slice. As there is a maximum of 2^16
+ * blocks, it follows that the maximum size of any block must not exceed 2^15 bytes to avoid
+ * overflow (2^16 bytes if the int is treated as unsigned). This is currently the case, with the
+ * largest block being DENSE and using 2^13 + 36 bytes.
+ *
+ * The cache overhead is numDocs/1024 bytes.
+ *
+ * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits).
+ * In the case of non-existing blocks, the entry in the lookup table has index equal to the
+ * previous entry and offset equal to the next non-empty block.
+ *
+ * The block lookup table is stored at the end of the total block structure.
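+ * <p>As an illustrative sketch (names are placeholders), an entry for
+ * {@code targetBlock} could be resolved as:
+ * <pre>{@code
+ *   int index  = jumpTable.readInt(targetBlock * 8L);     // set bits before the block
+ *   int offset = jumpTable.readInt(targetBlock * 8L + 4); // byte offset into the slice
+ * }</pre>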
+ *
+ *
+ * The rank structure for DENSE blocks is an array of byte-pairs with an entry for each
+ * sub-block (default 512 bits) out of the 65536 bits in the outer DENSE block.
+ *
+ * Each rank-entry states the number of set bits within the block, up to but not including
+ * the bit at the start of the sub-block.
+ * Note that the rank entry of the first sub-block is always 0 and that the last entry can
+ * at most be 65536-2 = 65534 and thus will always fit into a byte-pair of 16 bits.
+ *
+ * The rank structure for a given DENSE block is stored at the beginning of the DENSE block.
+ * This ensures locality and keeps logistics simple.
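+ * <p>As an illustrative sketch, with the default sub-block size of 512 bits, the
+ * rank-entry covering a given {@code bitInBlock} could be decoded as:
+ * <pre>{@code
+ *   int entry = (bitInBlock >> 9) << 1; // 2 bytes per rank-entry
+ *   int rank  = ((rankTable[entry] & 0xFF) << 8) | (rankTable[entry + 1] & 0xFF);
+ * }</pre>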
+ *
+ * @lucene.internal
+ */
+final class IndexedDISI extends DocIdSetIterator {
+
+ // jump-table time/space trade-offs to consider:
+ // The block offsets and the block indexes could be stored in more compressed form with
+ // two PackedInts or two MonotonicDirectReaders.
+ // The DENSE ranks (default 128 shorts = 256 bytes) could likewise be compressed. But as there
+ // are at least 4096 set bits in DENSE blocks, there will be at least one rank with 2^12 bits, so it
+ // is doubtful if there is much to gain here.
+
+ private static final int BLOCK_SIZE = 65536; // The number of docIDs that a single block represents
+
+ private static final int DENSE_BLOCK_LONGS = BLOCK_SIZE/Long.SIZE; // 1024
+ public static final byte DEFAULT_DENSE_RANK_POWER = 9; // Every 512 docIDs / 8 longs
+
+ static final int MAX_ARRAY_LENGTH = (1 << 12) - 1;
+
+ private static void flush(
+     int block, FixedBitSet buffer, int cardinality, byte denseRankPower, IndexOutput out) throws IOException {
+ assert block >= 0 && block < 65536;
+ out.writeShort((short) block);
+ assert cardinality > 0 && cardinality <= 65536;
+ out.writeShort((short) (cardinality - 1));
+ if (cardinality > MAX_ARRAY_LENGTH) {
+ if (cardinality != 65536) { // all docs are set
+ if (denseRankPower != -1) {
+ final byte[] rank = createRank(buffer, denseRankPower);
+ out.writeBytes(rank, rank.length);
+ }
+ for (long word : buffer.getBits()) {
+ out.writeLong(word);
+ }
+ }
+ } else {
+ BitSetIterator it = new BitSetIterator(buffer, cardinality);
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ out.writeShort((short) doc);
+ }
+ }
+ }
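+
+ // On-disk layout produced by flush (summary of the code above):
+ //   short blockID | short (cardinality-1) | payload
+ // where the payload is empty for ALL (cardinality == 65536), an optional rank
+ // table followed by 1024 longs for DENSE, and cardinality shorts for SPARSE.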
+
+ // Creates a DENSE rank-entry (the number of set bits up to a given point) for the buffer.
+ // One rank-entry for every {@code 2^denseRankPower} bits, with each rank-entry using 2 bytes.
+ // Represented as a byte[] for fast flushing and mirroring of the retrieval representation.
+ private static byte[] createRank(FixedBitSet buffer, byte denseRankPower) {
+ final int longsPerRank = 1 << (denseRankPower-6);
+ final int rankMark = longsPerRank-1;
+ final int rankIndexShift = denseRankPower-7; // 6 for the long (2^6) + 1 for 2 bytes/entry
+ final byte[] rank = new byte[DENSE_BLOCK_LONGS >> rankIndexShift];
+ final long[] bits = buffer.getBits();
+ int bitCount = 0;
+ for (int word = 0 ; word < DENSE_BLOCK_LONGS ; word++) {
+ if ((word & rankMark) == 0) { // Every longsPerRank longs
+ rank[word >> rankIndexShift] = (byte)(bitCount>>8);
+ rank[(word >> rankIndexShift)+1] = (byte)(bitCount & 0xFF);
+ }
+ bitCount += Long.bitCount(bits[word]);
+ }
+ return rank;
+ }
+
+ /**
+ * Writes the docIDs from it to out, in logical blocks, one for each 65536 docIDs in
+ * monotonically increasing gap-less order. DENSE blocks use {@link #DEFAULT_DENSE_RANK_POWER}
+ * of 9 (every 512 docIDs / 8 longs).
+ * The caller must keep track of the number of jump-table entries (returned by this method) as
+ * well as the denseRankPower (9 for this method) and provide them when constructing an
+ * IndexedDISI for reading.
+ * @param it the document IDs.
+ * @param out destination for the blocks.
+ * @throws IOException if there was an error writing to out.
+ * @return the number of jump-table entries following the blocks, -1 for no entries.
+ *         This should be stored in meta and used when creating an instance of IndexedDISI.
+ */
+ static short writeBitSet(DocIdSetIterator it, IndexOutput out) throws IOException {
+ return writeBitSet(it, out, DEFAULT_DENSE_RANK_POWER);
+ }
+
+ /**
+ * Writes the docIDs from it to out, in logical blocks, one for each 65536 docIDs in
+ * monotonically increasing gap-less order.
+ * The caller must keep track of the number of jump-table entries (returned by this method) as
+ * well as the denseRankPower and provide them when constructing an IndexedDISI for reading.
+ * @param it the document IDs.
+ * @param out destination for the blocks.
+ * @param denseRankPower for {@link Method#DENSE} blocks, a rank will be written every
+ *                       {@code 2^denseRankPower} docIDs.
+ *                       Values < 7 (every 128 docIDs) or > 15 (every 32768 docIDs) disable DENSE rank.
+ *                       Recommended values are 8-12: every 256-4096 docIDs or 4-64 longs.
+ *                       {@link #DEFAULT_DENSE_RANK_POWER} is 9: every 512 docIDs.
+ *                       This should be stored in meta and used when creating an instance of IndexedDISI.
+ * @throws IOException if there was an error writing to out.
+ * @return the number of jump-table entries following the blocks, -1 for no entries.
+ *         This should be stored in meta and used when creating an instance of IndexedDISI.
+ */
+ static short writeBitSet(DocIdSetIterator it, IndexOutput out, byte denseRankPower) throws IOException {
+ final long origo = out.getFilePointer(); // All jumps are relative to the origo
+ if (denseRankPower < 7 || denseRankPower > 15) {
+ denseRankPower = -1; // Disabled
+ }
+ int totalCardinality = 0;
+ int blockCardinality = 0;
+ final FixedBitSet buffer = new FixedBitSet(1<<16);
+ int[] jumps = new int[ArrayUtil.oversize(1, Integer.BYTES*2)];
+ int prevBlock = -1;
+ int jumpBlockIndex = 0;
+
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ final int block = doc >>> 16;
+ if (prevBlock != -1 && block != prevBlock) {
+ // Track offset+index from previous block up to current
+ jumps = addJumps(jumps, out.getFilePointer()-origo, totalCardinality, jumpBlockIndex, prevBlock+1);
+ jumpBlockIndex = prevBlock+1;
+ // Flush block
+ flush(prevBlock, buffer, blockCardinality, denseRankPower, out);
+ // Reset for next block
+ buffer.clear(0, buffer.length());
+ totalCardinality += blockCardinality;
+ blockCardinality = 0;
+ }
+ buffer.set(doc & 0xFFFF);
+ blockCardinality++;
+ prevBlock = block;
+ }
+ if (blockCardinality > 0) {
+ jumps = addJumps(jumps, out.getFilePointer()-origo, totalCardinality, jumpBlockIndex, prevBlock+1);
+ flush(prevBlock, buffer, blockCardinality, denseRankPower, out);
+ buffer.clear(0, buffer.length());
+ prevBlock++;
+ }
+ final int lastBlock = prevBlock == -1 ? 0 : prevBlock; // There will always be at least 1 block (NO_MORE_DOCS)
+ // Last entry is a SPARSE with blockIndex == 32767 and the single entry 65535, which becomes the docID NO_MORE_DOCS
+ // To avoid creating 65K jump-table entries, only a single entry is created pointing to the offset of the
+ // NO_MORE_DOCS block, but with the jumpBlockIndex set to the logical EMPTY block after all real blocks.
+ jumps = addJumps(jumps, out.getFilePointer()-origo, DocIdSetIterator.NO_MORE_DOCS & 0xFFFF0000,
+     lastBlock, lastBlock+1);
+ buffer.set(DocIdSetIterator.NO_MORE_DOCS & 0xFFFF);
+ flush(DocIdSetIterator.NO_MORE_DOCS >>> 16, buffer, 1, denseRankPower, out);
+ // offset+index jump-table stored at the end
+ return flushBlockJumps(jumps, lastBlock+1, out, origo);
+ }
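+
+ // Illustrative usage sketch (the meta/data output names are assumptions of this
+ // example, not prescribed by this class):
+ //   short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, DEFAULT_DENSE_RANK_POWER);
+ //   meta.writeShort(jumpTableEntryCount);     // needed, with the denseRankPower,
+ //   meta.writeByte(DEFAULT_DENSE_RANK_POWER); // to construct the reader side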
+
+ // Adds entries to the offset & index jump-table for blocks
+ private static int[] addJumps(int[] jumps, long offset, int index, int startBlock, int endBlock) {
+ assert offset < Integer.MAX_VALUE : "Logically the offset should not exceed 2^30 but was >= Integer.MAX_VALUE";
+ jumps = ArrayUtil.grow(jumps, (endBlock+1)*2);
+ for (int b = startBlock; b < endBlock; b++) {
+ jumps[b*2] = index;
+ jumps[b*2+1] = (int) offset;
+ }
+ return jumps;
+ }
+
+ // Flushes the offset & index jump-table for blocks. This should be the last data written to out.
+ // This method returns the blockCount for the blocks reachable via the jump-table, or -1 for no jump-table.
+ private static short flushBlockJumps(int[] jumps, int blockCount, IndexOutput out, long origo) throws IOException {
+ if (blockCount == 2) { // Jumps with a single real entry + NO_MORE_DOCS is just wasted space so we ignore that
+ blockCount = 0;
+ }
+ for (int i = 0 ; i < blockCount ; i++) {
+ out.writeInt(jumps[i*2]); // index
+ out.writeInt(jumps[i*2+1]); // offset
+ }
+ // As there are at most 32k blocks, the count is a short
+ // The jumpTableOffset will be at lastPos - (blockCount * Long.BYTES)
+ return (short)blockCount;
+ }
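+
+ // Illustrative sketch of locating the jump-table at read time: it occupies the
+ // last blockCount * Long.BYTES bytes of the written data.
+ //   long jumpTableBytes = jumpTableEntryCount * (long) Long.BYTES;
+ //   RandomAccessInput jumpTable = in.randomAccessSlice(offset + length - jumpTableBytes, jumpTableBytes);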
+
+ /** The slice that stores the {@link DocIdSetIterator}. */
+ private final IndexInput slice;
+ private final int jumpTableEntryCount;
+ private final byte denseRankPower;
+ private final RandomAccessInput jumpTable; // Skip blocks of 64K bits
+ private final byte[] denseRankTable;
+ private final long cost;
+
+ /**
+ * @param in backing data.
+ * @param offset starting offset for blocks in backing data.
+ * @param length the number of bytes in the backing data.
+ * @param jumpTableEntryCount the number of blocks covered by the jump-table.
+ * @param cost normally the number of logical docIDs.
+ */
+ IndexedDISI(IndexInput in, long offset, long length, int jumpTableEntryCount, byte denseRankPower, long cost) throws IOException {
+ this(in.slice("docs", offset, length),
--- End diff ---
Doing so makes the separation cleaner. Nice catch. I have implemented it.