cpoerschke commented on a change in pull request #123: URL: https://github.com/apache/solr/pull/123#discussion_r631256745
########## File path: solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/FieldValueFeature.java ########## @@ -146,5 +171,100 @@ public float getMaxScore(int upTo) throws IOException { return Float.POSITIVE_INFINITY; } } + + /** + * A FeatureScorer that reads the docValues for a field + */ + public class DocValuesFieldValueFeatureScorer extends FeatureWeight.FeatureScorer { + final LeafReaderContext context; + final DocIdSetIterator docValues; + final FieldType schemaFieldType; + DocValuesType docValuesType = DocValuesType.NONE; + + public DocValuesFieldValueFeatureScorer(final FeatureWeight weight, final LeafReaderContext context, + final DocIdSetIterator itr, final SchemaField schemaField) { Review comment: ```suggestion final DocIdSetIterator itr, final FieldType fieldType) { ``` minor/subjective: since/if only part of `SchemaField` is used we could pass in only that here ########## File path: solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/FieldValueFeature.java ########## @@ -146,5 +171,100 @@ public float getMaxScore(int upTo) throws IOException { return Float.POSITIVE_INFINITY; } } + + /** + * A FeatureScorer that reads the docValues for a field + */ + public class DocValuesFieldValueFeatureScorer extends FeatureWeight.FeatureScorer { + final LeafReaderContext context; + final DocIdSetIterator docValues; Review comment: ```suggestion final DocValuesIterator docValues; ``` if `DocValuesIterator` was possible here then use of its `advanceExact` could be an alternative to `DocIdSetIterator.advance`. ########## File path: solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/FieldValueFeature.java ########## @@ -146,5 +171,100 @@ public float getMaxScore(int upTo) throws IOException { return Float.POSITIVE_INFINITY; } } + + /** + * A FeatureScorer that reads the docValues for a field + */ + public class DocValuesFieldValueFeatureScorer extends FeatureWeight.FeatureScorer { + final LeafReaderContext context; + final DocIdSetIterator docValues; + final FieldType schemaFieldType; + DocValuesType docValuesType = DocValuesType.NONE; + + public DocValuesFieldValueFeatureScorer(final FeatureWeight weight, final LeafReaderContext context, + final DocIdSetIterator itr, final SchemaField schemaField) { + super(weight, itr); + this.context = context; + schemaFieldType = schemaField.getType(); + + try { + FieldInfo fieldInfo = context.reader().getFieldInfos().fieldInfo(field); + // if fieldInfo is null, just use NONE-Type. This causes no problems, because we won't call score() anyway + docValuesType = fieldInfo != null ? fieldInfo.getDocValuesType() : DocValuesType.NONE; + switch (docValuesType) { + case NUMERIC: + docValues = DocValues.getNumeric(context.reader(), field); + break; + case SORTED: + docValues = DocValues.getSorted(context.reader(), field); + break; + case BINARY: + case SORTED_NUMERIC: + case SORTED_SET: + case NONE: + default: + docValues = null; Review comment: There is a `SchemaField.docValuesType()` method, have you considered using that in the caller e.g. where the `schemaField.hasDocValues()` already happens? If it can be used there then less or no need to consider `docValues == null` as a possibility in `DocValuesFieldValueFeatureScorer` here i.e. an exception could be thrown for `docValues == null` since it would indicate a bug and the `score()` method could presume non-null. ########## File path: solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/FieldValueFeature.java ########## @@ -146,5 +171,100 @@ public float getMaxScore(int upTo) throws IOException { return Float.POSITIVE_INFINITY; } } + + /** + * A FeatureScorer that reads the docValues for a field + */ + public class DocValuesFieldValueFeatureScorer extends FeatureWeight.FeatureScorer { + final LeafReaderContext context; + final DocIdSetIterator docValues; + final FieldType schemaFieldType; + DocValuesType docValuesType = DocValuesType.NONE; + + public DocValuesFieldValueFeatureScorer(final FeatureWeight weight, final LeafReaderContext context, + final DocIdSetIterator itr, final SchemaField schemaField) { + super(weight, itr); + this.context = context; + schemaFieldType = schemaField.getType(); + + try { + FieldInfo fieldInfo = context.reader().getFieldInfos().fieldInfo(field); + // if fieldInfo is null, just use NONE-Type. This causes no problems, because we won't call score() anyway + docValuesType = fieldInfo != null ? fieldInfo.getDocValuesType() : DocValuesType.NONE; + switch (docValuesType) { + case NUMERIC: + docValues = DocValues.getNumeric(context.reader(), field); + break; + case SORTED: + docValues = DocValues.getSorted(context.reader(), field); + break; + case BINARY: + case SORTED_NUMERIC: + case SORTED_SET: + case NONE: + default: + docValues = null; + } + } catch (IOException e) { + throw new IllegalArgumentException("Could not read docValues for field " + field + " with docValuesType " + + docValuesType.name()); + } + } + + @Override + public float score() throws IOException { + if (docValues != null && docValues.advance(itr.docID()) < DocIdSetIterator.NO_MORE_DOCS) { + switch (docValuesType) { + case NUMERIC: + if (NumberType.FLOAT.equals(schemaFieldType.getNumberType())) { + // convert float value that was stored as long back to float + return Float.intBitsToFloat((int) ((NumericDocValues) docValues).longValue()); + } else if (NumberType.DOUBLE.equals(schemaFieldType.getNumberType())) { Review comment: minor/subjective: `schemaFieldType.getNumberType()` happening in the constructor (once per segment) rather than here (once per document) ########## File path: solr/contrib/ltr/src/test-files/solr/collection1/conf/schema.xml ########## @@ -18,13 +18,22 @@ <schema name="example" version="1.5"> <fields> - <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" /> + <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false"/> + <field name="final-score" type="string" indexed="true" stored="true" multiValued="false"/> + <field name="final-score-float" type="float" indexed="true" stored="true" multiValued="false"/> Review comment: Consider avoiding `-` in field name based on https://issues.apache.org/jira/browse/SOLR-8110 and https://issues.apache.org/jira/browse/SOLR-3407 info. Assuming of course that the `-` here is not material to the use of the field. ########## File path: solr/contrib/ltr/src/test/org/apache/solr/ltr/TestLTROnSolrCloud.java ########## @@ -107,21 +105,21 @@ public void testSimpleQuery() throws Exception { final Float original_result7_score = (Float)queryResponse.getResults().get(7).get("score"); final String result0_features= FeatureLoggerTestUtils.toFeatureVector( - "powpularityS","64.0", "c3","2.0", "original","0.0"); + "powpularityS","64.0", "c3","2.0", "original","0.0", "dvIntFieldFeature","8.0","dvLongFieldFeature","8.0","dvFloatFieldFeature","0.8","dvDoubleFieldFeature","0.8","dvStrNumFieldFeature","8.0","dvStrBoolFieldFeature","1.0"); final String result1_features= FeatureLoggerTestUtils.toFeatureVector( - "powpularityS","49.0", "c3","2.0", "original","1.0"); + "powpularityS","49.0", "c3","2.0", "original","1.0", "dvIntFieldFeature","7.0","dvLongFieldFeature","7.0","dvFloatFieldFeature","0.7","dvDoubleFieldFeature","0.7","dvStrNumFieldFeature","7.0","dvStrBoolFieldFeature","0.0"); final String result2_features= FeatureLoggerTestUtils.toFeatureVector( - "powpularityS","36.0", "c3","2.0", "original","2.0"); + "powpularityS","36.0", "c3","2.0", "original","2.0", "dvIntFieldFeature","6.0","dvLongFieldFeature","6.0","dvFloatFieldFeature","0.6","dvDoubleFieldFeature","0.6","dvStrNumFieldFeature","6.0","dvStrBoolFieldFeature","1.0"); final String result3_features= FeatureLoggerTestUtils.toFeatureVector( - "powpularityS","25.0", "c3","2.0", "original","3.0"); + "powpularityS","25.0", "c3","2.0", "original","3.0", "dvIntFieldFeature","5.0","dvLongFieldFeature","5.0","dvFloatFieldFeature","0.5","dvDoubleFieldFeature","0.5","dvStrNumFieldFeature","5.0","dvStrBoolFieldFeature","0.0"); final String result4_features= FeatureLoggerTestUtils.toFeatureVector( - "powpularityS","16.0", "c3","2.0", "original","4.0"); + "powpularityS","16.0", "c3","2.0", "original","4.0", "dvIntFieldFeature","4.0","dvLongFieldFeature","4.0","dvFloatFieldFeature","0.4","dvDoubleFieldFeature","0.4","dvStrNumFieldFeature","4.0","dvStrBoolFieldFeature","1.0"); final String result5_features= FeatureLoggerTestUtils.toFeatureVector( - "powpularityS", "9.0", "c3","2.0", "original","5.0"); + "powpularityS", "9.0", "c3","2.0", "original","5.0", "dvIntFieldFeature","3.0","dvLongFieldFeature","3.0","dvFloatFieldFeature","0.3","dvDoubleFieldFeature","0.3","dvStrNumFieldFeature","3.0","dvStrBoolFieldFeature","0.0"); final String result6_features= FeatureLoggerTestUtils.toFeatureVector( - "powpularityS", "4.0", "c3","2.0", "original","6.0"); + "powpularityS", "4.0", "c3","2.0", "original","6.0", "dvIntFieldFeature","2.0","dvLongFieldFeature","2.0","dvFloatFieldFeature","0.2","dvDoubleFieldFeature","0.2","dvStrNumFieldFeature","2.0","dvStrBoolFieldFeature","1.0"); final String result7_features= FeatureLoggerTestUtils.toFeatureVector( - "powpularityS", "1.0", "c3","2.0", "original","7.0"); + "powpularityS", "1.0", "c3","2.0", "original","7.0", "dvIntFieldFeature","-1.0","dvLongFieldFeature","-2.0","dvFloatFieldFeature","-3.0","dvDoubleFieldFeature","-4.0","dvStrNumFieldFeature","-5.0","dvStrBoolFieldFeature","0.0"); Review comment: minor: very long lines, could we wrap somehow? ########## File path: solr/contrib/ltr/src/test/org/apache/solr/ltr/TestLTROnSolrCloud.java ########## @@ -232,32 +230,47 @@ private void createCollection(String name, String config, int numShards, int num solrCluster.waitForActiveCollection(name, numShards, numShards * numReplicas); } - void indexDocument(String collection, String id, String title, String description, int popularity) throws Exception{ SolrInputDocument doc = new SolrInputDocument(); doc.setField("id", id); doc.setField("title", title); doc.setField("description", description); doc.setField("popularity", popularity); + if(popularity != 1) { + // check that empty values will be read as default + doc.setField("dvIntField", popularity); + doc.setField("dvLongField", popularity); + doc.setField("dvFloatField", ((float) popularity) / 10); + doc.setField("dvDoubleField", ((double) popularity) / 10); + doc.setField("dvStrNumField", popularity); + doc.setField("dvStrBoolField", popularity % 2 == 0 ? "T" : "F"); + } solrCluster.getSolrClient().add(collection, doc); } private void indexDocuments(final String collection) throws Exception { final int collectionSize = 8; - for (int docId = 1; docId <= collectionSize; docId++) { + // put documents in reversed order to check that advanceExact is working correctly + for (int docId = collectionSize; docId >= 1; docId--) { Review comment: Similar to multi-segment case coverage below, randomisation could be used here to cover both ascending and descending order. Or another way could be to form a list of ids and to shuffle it with the randomiser and then to use it to index. ########## File path: solr/contrib/ltr/src/test/org/apache/solr/ltr/TestLTROnSolrCloud.java ########## @@ -232,32 +230,47 @@ private void createCollection(String name, String config, int numShards, int num solrCluster.waitForActiveCollection(name, numShards, numShards * numReplicas); } - void indexDocument(String collection, String id, String title, String description, int popularity) throws Exception{ SolrInputDocument doc = new SolrInputDocument(); doc.setField("id", id); doc.setField("title", title); doc.setField("description", description); doc.setField("popularity", popularity); + if(popularity != 1) { + // check that empty values will be read as default + doc.setField("dvIntField", popularity); + doc.setField("dvLongField", popularity); + doc.setField("dvFloatField", ((float) popularity) / 10); + doc.setField("dvDoubleField", ((double) popularity) / 10); + doc.setField("dvStrNumField", popularity); + doc.setField("dvStrBoolField", popularity % 2 == 0 ? "T" : "F"); + } solrCluster.getSolrClient().add(collection, doc); } private void indexDocuments(final String collection) throws Exception { final int collectionSize = 8; - for (int docId = 1; docId <= collectionSize; docId++) { + // put documents in reversed order to check that advanceExact is working correctly + for (int docId = collectionSize; docId >= 1; docId--) { final int popularity = docId; indexDocument(collection, String.valueOf(docId), "a1", "bloom", popularity); + if(docId == collectionSize / 2) { + // commit in the middle in order to check that everything works fine for multi-segment case + solrCluster.getSolrClient().commit(collection); + } Review comment: It's great to expand the test coverage here to check that the multi-segment case works. However _always_ having the commit in the middle also changes what the existing test does i.e. the single-segment or fewer segments scenario. A _"best of both worlds"_ way could be use of randomisation i.e. only _sometimes_ commit in the middle e.g. ``` if (random.nextBoolean()) solrCluster.getSolrClient().commit(collection); ``` which is supported by the test framework. ########## File path: solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/FieldValueFeature.java ########## @@ -57,50 +68,65 @@ public void setField(String field) { } @Override - public LinkedHashMap<String,Object> paramsToMap() { - final LinkedHashMap<String,Object> params = defaultParamsToMap(); + public LinkedHashMap<String, Object> paramsToMap() { + final LinkedHashMap<String, Object> params = defaultParamsToMap(); params.put("field", field); return params; } @Override protected void validate() throws FeatureException { if (field == null || field.isEmpty()) { - throw new FeatureException(getClass().getSimpleName()+ - ": field must be provided"); + throw new FeatureException(getClass().getSimpleName() + ": field must be provided"); } } - public FieldValueFeature(String name, Map<String,Object> params) { + public FieldValueFeature(String name, Map<String, Object> params) { super(name, params); } @Override - public FeatureWeight createWeight(IndexSearcher searcher, boolean needsScores, - SolrQueryRequest request, Query originalQuery, Map<String,String[]> efi) - throws IOException { + public FeatureWeight createWeight(IndexSearcher searcher, boolean needsScores, SolrQueryRequest request, + Query originalQuery, Map<String, String[]> efi) throws IOException { return new FieldValueFeatureWeight(searcher, request, originalQuery, efi); } public class FieldValueFeatureWeight extends FeatureWeight { + private final SchemaField schemaField; public FieldValueFeatureWeight(IndexSearcher searcher, SolrQueryRequest request, Query originalQuery, Map<String,String[]> efi) { super(FieldValueFeature.this, searcher, request, originalQuery, efi); + if (searcher instanceof SolrIndexSearcher) { + schemaField = ((SolrIndexSearcher) searcher).getSchema().getFieldOrNull(field); + } else { + schemaField = null; + } } + /** + * Return a FeatureScorer that uses docValues or storedFields if no docValues are present + * @param context the segment this FeatureScorer is working with + * @return FeatureScorer for the current segment and field + * @throws IOException as defined by abstract class Feature + */ Review comment: Thanks for adding the javadocs here and on the `[DocValues]FieldValueFeatureScorer` classes below! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org For additional commands, e-mail: issues-h...@solr.apache.org