Github user dsmiley commented on a diff in the pull request:
https://github.com/apache/lucene-solr/pull/416#discussion_r210273819
--- Diff: solr/core/src/java/org/apache/solr/response/transform/ChildDocTransformer.java ---
@@ -0,0 +1,253 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.response.transform;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.Multimap;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.search.join.BitSetProducer;
+import org.apache.lucene.util.BitSet;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.response.DocsStreamer;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.search.DocSet;
+import org.apache.solr.search.SolrDocumentFetcher;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.search.SolrReturnFields;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static org.apache.solr.response.transform.ChildDocTransformerFactory.NUM_SEP_CHAR;
+import static org.apache.solr.response.transform.ChildDocTransformerFactory.PATH_SEP_CHAR;
+import static org.apache.solr.schema.IndexSchema.NEST_PATH_FIELD_NAME;
+
+class ChildDocTransformer extends DocTransformer {
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ private static final String ANON_CHILD_KEY = "_childDocuments_";
+
+ private final String name;
+ private final BitSetProducer parentsFilter;
+ private final DocSet childDocSet;
+ private final int limit;
+
+  private final SolrReturnFields childReturnFields = new SolrReturnFields();
+
+  ChildDocTransformer(String name, BitSetProducer parentsFilter, DocSet childDocSet, int limit) {
+ this.name = name;
+ this.parentsFilter = parentsFilter;
+ this.childDocSet = childDocSet;
+ this.limit = limit;
+ }
+
+ @Override
+ public String getName() {
+ return name;
+ }
+
+ @Override
+ public void transform(SolrDocument rootDoc, int rootDocId) {
+    // note: this algorithm works both if we have _nest_path_ and also if we don't!
+
+ try {
+
+      // look up what the *previous* rootDocId is, and figure out which segment this is
+ final SolrIndexSearcher searcher = context.getSearcher();
+      final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
+ final int seg = ReaderUtil.subIndex(rootDocId, leaves);
+ final LeafReaderContext leafReaderContext = leaves.get(seg);
+ final int segBaseId = leafReaderContext.docBase;
+ final int segRootId = rootDocId - segBaseId;
+      final BitSet segParentsBitSet = parentsFilter.getBitSet(leafReaderContext);
+      final int segPrevRootId = segParentsBitSet.prevSetBit(segRootId - 1); // can return -1 and that's okay
+
+ // we'll need this soon...
+      final SortedDocValues segPathDocValues = DocValues.getSorted(leafReaderContext.reader(), NEST_PATH_FIELD_NAME);
+
+      // the key in the Map is the document's ancestor's key (one above the parent), while the key in the intermediate
+      // MultiMap is the direct child document's key (of the parent document)
+      Map<String, Multimap<String, SolrDocument>> pendingParentPathsToChildren = new HashMap<>();
+
+ IndexSchema schema = searcher.getSchema();
+ SolrDocumentFetcher docFetcher = searcher.getDocFetcher();
+ Set<String> dvFieldsToReturn = docFetcher.getNonStoredDVs(true);
+ final int lastChildId = segBaseId + segPrevRootId + 1;
+ boolean shouldDecorateWithDVs = dvFieldsToReturn.size() > 0;
+
+      // Loop over each child ID up to the parent (exclusive).
+      for (int docId = limit == -1 ? lastChildId : calcLimitIndex(segBaseId, segRootId, segPrevRootId + 1); docId < rootDocId; ++docId) {
+
+        // get the path. (note: will default to ANON_CHILD_KEY if not in the schema or oddly blank)
+        String fullDocPath = getPathByDocId(docId - segBaseId, segPathDocValues);
+
+ // Is this doc a direct ancestor of another doc we've seen?
+        boolean isAncestor = pendingParentPathsToChildren.containsKey(fullDocPath);
+
+        // Do we need to do anything with this doc (either an ancestor or matched by the child query)?
+        if (isAncestor || childDocSet == null || childDocSet.exists(docId)) {
+ // load the doc
+          SolrDocument doc = DocsStreamer.convertLuceneDocToSolrDoc(docFetcher.doc(docId), schema, childReturnFields);
+ if (shouldDecorateWithDVs) {
+            docFetcher.decorateDocValueFields(doc, docId, dvFieldsToReturn);
+ }
+
+ if (isAncestor) {
+ // if this path has pending child docs, add them.
+            addChildrenToParent(doc, pendingParentPathsToChildren.remove(fullDocPath)); // no longer pending
+ }
+
+ // get parent path
+ String parentDocPath = getParentPath(fullDocPath);
+ String lastPath = getLastPath(fullDocPath);
+ // put into pending:
+          // trim the path if the doc was inside an array, see trimLastPoundIfArray()
+ // e.g. toppings#1/ingredients#1 -> outer map key toppings#1
+ // -> inner MultiMap key ingredients
+ // or lonely#/lonelyGrandChild# -> outer map key lonely#
+ // -> inner MultiMap key lonelyGrandChild#
+          pendingParentPathsToChildren.computeIfAbsent(parentDocPath, x -> ArrayListMultimap.create())
+              .put(trimLastPoundIfArray(lastPath), doc); // multimap add (won't replace)
+ }
+ }
+
+ // only children of parent remain
+ assert pendingParentPathsToChildren.keySet().size() == 1;
+
+      addChildrenToParent(rootDoc, pendingParentPathsToChildren.remove(null));
+
+ } catch (IOException e) {
+      //TODO DWS: reconsider this unusual error handling approach; shouldn't we rethrow?
+ log.warn("Could not fetch child documents", e);
+ rootDoc.put(getName(), "Could not fetch child documents");
+ }
+ }
+
+  private static void addChildrenToParent(SolrDocument parent, Multimap<String, SolrDocument> children) {
+ for(String childLabel: children.keySet()) {
+ addChildrenToParent(parent, children.get(childLabel), childLabel);
+ }
+ }
+
+  private static void addChildrenToParent(SolrDocument parent, Collection<SolrDocument> children, String cDocsPath) {
+    // if there is no path, we do not need to add the child document's relation to its parent document.
+ if (cDocsPath.equals(ANON_CHILD_KEY)) {
+ parent.addChildDocuments(children);
+ return;
+ }
+ // lookup leaf key for these children using path
+ // depending on the label, add to the parent at the right key/label
+ String trimmedPath = trimLastPound(cDocsPath);
+    // if the child doc's path does not end with #, it is an array (the same string is returned by ChildDocTransformer#trimLastPound)
+ if (!parent.containsKey(trimmedPath) && (trimmedPath == cDocsPath)) {
+ List<SolrDocument> list = new ArrayList<>(children);
+ parent.setField(trimmedPath, list);
+ return;
+ }
+ // is single value
+ parent.setField(trimmedPath, ((List)children).get(0));
+ }
+
+ private static String getLastPath(String path) {
+ int lastIndexOfPathSepChar = path.lastIndexOf(PATH_SEP_CHAR);
+ if(lastIndexOfPathSepChar == -1) {
+ return path;
+ }
+ return path.substring(lastIndexOfPathSepChar + 1);
+ }
+
+ private static String trimLastPoundIfArray(String path) {
+    // remove the array index after the last pound sign if there is one, e.g. toppings#1 -> toppings
+    // or return the original string if the child doc is not in an array, e.g. ingredients# -> ingredients#
+ final int indexOfSepChar = path.lastIndexOf(NUM_SEP_CHAR);
+ if (indexOfSepChar == -1) {
+ return path;
+ }
+ int lastIndex = path.length() - 1;
+ boolean singleDocVal = indexOfSepChar == lastIndex;
+ return singleDocVal ? path: path.substring(0, indexOfSepChar);
+ }
+
+ private static String trimLastPound(String path) {
+    // remove the last pound sign and the index after it, e.g. toppings#1 -> toppings
+ int lastIndex = path.lastIndexOf('#');
+ return lastIndex == -1 ? path : path.substring(0, lastIndex);
+ }
+
+ /**
+ * Returns the *parent* path for this document.
+ * Children of the root will yield null.
+ */
+ private static String getParentPath(String currDocPath) {
+ // chop off leaf (after last '/')
+ // if child of leaf then return null (special value)
+ int lastPathIndex = currDocPath.lastIndexOf(PATH_SEP_CHAR);
+    return lastPathIndex == -1 ? null : currDocPath.substring(0, lastPathIndex);
+ }
+
+  /** Looks up the nest path. If there is none, returns {@link #ANON_CHILD_KEY}. */
+  private static String getPathByDocId(int segDocId, SortedDocValues segPathDocValues) throws IOException {
+    int numToAdvance = segPathDocValues.docID() == -1 ? segDocId : segDocId - (segPathDocValues.docID());
+ assert numToAdvance >= 0;
+ boolean advanced = segPathDocValues.advanceExact(segDocId);
+ if (!advanced) {
+ return ANON_CHILD_KEY;
+ }
+ return segPathDocValues.binaryValue().utf8ToString();
+ }
+
+ /**
+ *
+ * @param segDocBaseId base docID of the segment
+   * @param RootId docID of the current root document
+   * @param lastDescendantId lowest docID of the root document's descendants
+   * @return the docID to start the loop from, so as not to surpass the limit of descendants to match specified by the query
+   */
+  private int calcLimitIndex(int segDocBaseId, int RootId, int lastDescendantId) {
--- End diff ---
(I commented elsewhere that I don't understand your claim about the ordering.)
I'm now commenting here on this line to suggest simplifying (at least
conceptually) this method. You're having it take a mixture of arguments with
varying segment vs. global bases, and I think it can be simplified. This
method's signature would be simpler as:
`calcDocIdToIterateFrom(firstChildDocId, rootDocId, limit)`, and the caller
could then set docId to this directly in the loop init part and not
conditionally look at limit.
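
For illustration, a minimal sketch of what such a helper might look like. The
name comes from the suggestion above; the exact limit semantics (-1 meaning
"no limit", otherwise the maximum number of descendants to visit) are my
assumption, not something already spelled out in the patch:

```java
// Hypothetical sketch only, not part of the patch.
// Assumes limit == -1 means "no limit"; otherwise at most `limit`
// descendants (the ones closest to the root) are iterated.
private static int calcDocIdToIterateFrom(int firstChildDocId, int rootDocId, int limit) {
  if (limit == -1) {
    return firstChildDocId; // no limit: start at the root's first descendant
  }
  // never start before the first descendant, but start late enough that
  // at most `limit` docs lie between the start and rootDocId (exclusive)
  return Math.max(firstChildDocId, rootDocId - limit);
}
```

The loop init then becomes a single unconditional call, e.g.
`for (int docId = calcDocIdToIterateFrom(firstChildDocId, rootDocId, limit); docId < rootDocId; ++docId)`,
where `firstChildDocId` would be the global docID of the root's first
descendant (what the patch currently computes as `segBaseId + segPrevRootId + 1`).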
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]