tomglk commented on a change in pull request #123: URL: https://github.com/apache/solr/pull/123#discussion_r633062781
########## File path: solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/FieldValueFeature.java ########## @@ -57,50 +67,79 @@ public void setField(String field) { } @Override - public LinkedHashMap<String,Object> paramsToMap() { - final LinkedHashMap<String,Object> params = defaultParamsToMap(); + public LinkedHashMap<String, Object> paramsToMap() { + final LinkedHashMap<String, Object> params = defaultParamsToMap(); params.put("field", field); return params; } @Override protected void validate() throws FeatureException { if (field == null || field.isEmpty()) { - throw new FeatureException(getClass().getSimpleName()+ - ": field must be provided"); + throw new FeatureException(getClass().getSimpleName() + ": field must be provided"); } } - public FieldValueFeature(String name, Map<String,Object> params) { + public FieldValueFeature(String name, Map<String, Object> params) { super(name, params); } @Override - public FeatureWeight createWeight(IndexSearcher searcher, boolean needsScores, - SolrQueryRequest request, Query originalQuery, Map<String,String[]> efi) - throws IOException { + public FeatureWeight createWeight(IndexSearcher searcher, boolean needsScores, SolrQueryRequest request, + Query originalQuery, Map<String, String[]> efi) throws IOException { return new FieldValueFeatureWeight(searcher, request, originalQuery, efi); } public class FieldValueFeatureWeight extends FeatureWeight { + private final SchemaField schemaField; public FieldValueFeatureWeight(IndexSearcher searcher, - SolrQueryRequest request, Query originalQuery, Map<String,String[]> efi) { + SolrQueryRequest request, Query originalQuery, Map<String, String[]> efi) { super(FieldValueFeature.this, searcher, request, originalQuery, efi); + if (searcher instanceof SolrIndexSearcher) { + schemaField = ((SolrIndexSearcher) searcher).getSchema().getFieldOrNull(field); + } else { + schemaField = null; + } } + /** + * Return a FeatureScorer that uses docValues or storedFields if no docValues are present + * + * @param context the segment this FeatureScorer is working with + * @return FeatureScorer for the current segment and field + * @throws IOException as defined by abstract class Feature + */ @Override public FeatureScorer scorer(LeafReaderContext context) throws IOException { + if (schemaField != null && !schemaField.stored() && schemaField.hasDocValues()) { + + FieldInfo fieldInfo = context.reader().getFieldInfos().fieldInfo(field); + DocValuesType docValuesType = fieldInfo != null ? fieldInfo.getDocValuesType() : DocValuesType.NONE; + + if (DocValuesType.NUMERIC.equals(docValuesType)) { + return new NumericDocValuesFieldValueFeatureScorer(this, context, + DocIdSetIterator.all(DocIdSetIterator.NO_MORE_DOCS), schemaField.getType().getNumberType()); + } else if (DocValuesType.SORTED.equals(docValuesType)) { + return new SortedDocValuesFieldValueFeatureScorer(this, context, + DocIdSetIterator.all(DocIdSetIterator.NO_MORE_DOCS)); + // If type is NONE, this segment has no docs with this field. That's not a problem, because we won't call score() anyway Review comment: I think that's a great change, it really improves the readability! ########## File path: solr/contrib/ltr/src/test/org/apache/solr/ltr/feature/TestFieldValueFeature.java ########## @@ -70,86 +78,92 @@ public void after() throws Exception { @Test public void testRanking() throws Exception { - - final SolrQuery query = new SolrQuery(); - query.setQuery("title:w1"); - query.add("fl", "*, score"); - query.add("rows", "4"); - - // Normal term match - assertJQ("/query" + query.toQueryString(), "/response/numFound/==4"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='1'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[1]/id=='8'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[2]/id=='6'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[3]/id=='7'"); - - query.add("rq", "{!ltr model=popularity-model reRankDocs=4}"); - - assertJQ("/query" + query.toQueryString(), "/response/numFound/==4"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='8'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[1]/id=='7'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[2]/id=='6'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[3]/id=='1'"); - - query.setQuery("*:*"); - query.remove("rows"); - query.add("rows", "8"); - query.remove("rq"); - query.add("rq", "{!ltr model=popularity-model reRankDocs=8}"); - - assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='8'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[1]/id=='7'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[2]/id=='6'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[3]/id=='5'"); + for (String field : FIELD_NAMES) { + + final SolrQuery query = new SolrQuery(); + query.setQuery("title:w1"); + query.add("fl", "*, score"); + query.add("rows", "4"); + + // Normal term match + assertJQ("/query" + query.toQueryString(), "/response/numFound/==4"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='1'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[1]/id=='8'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[2]/id=='6'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[3]/id=='7'"); + + query.add("rq", "{!ltr model="+field+"-model reRankDocs=4}"); + + assertJQ("/query" + query.toQueryString(), "/response/numFound/==4"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='8'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[1]/id=='7'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[2]/id=='6'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[3]/id=='1'"); + + query.setQuery("*:*"); + query.remove("rows"); + query.add("rows", "8"); + query.remove("rq"); + query.add("rq", "{!ltr model="+field+"-model reRankDocs=8}"); + + assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='8'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[1]/id=='7'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[2]/id=='6'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[3]/id=='5'"); + } } @Test public void testIfADocumentDoesntHaveAFieldDefaultValueIsReturned() throws Exception { - SolrQuery query = new SolrQuery(); - query.setQuery("id:42"); - query.add("fl", "*, score"); - query.add("rows", "4"); - - assertJQ("/query" + query.toQueryString(), "/response/numFound/==1"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='42'"); - query = new SolrQuery(); - query.setQuery("id:42"); - query.add("rq", "{!ltr model=popularity-model reRankDocs=4}"); - query.add("fl", "[fv]"); - assertJQ("/query" + query.toQueryString(), "/response/numFound/==1"); - assertJQ("/query" + query.toQueryString(), - "/response/docs/[0]/=={'[fv]':'"+FeatureLoggerTestUtils.toFeatureVector("popularity",Float.toString(FIELD_VALUE_FEATURE_DEFAULT_VAL))+"'}"); - + for (String field : FIELD_NAMES) { + SolrQuery query = new SolrQuery(); + query.setQuery("id:42"); + query.add("fl", "*, score"); + query.add("rows", "4"); + + assertJQ("/query" + query.toQueryString(), "/response/numFound/==1"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='42'"); + query = new SolrQuery(); + query.setQuery("id:42"); + query.add("rq", "{!ltr model="+field+"-model reRankDocs=4}"); + query.add("fl", "[fv]"); + assertJQ("/query" + query.toQueryString(), "/response/numFound/==1"); + assertJQ("/query" + query.toQueryString(), + "/response/docs/[0]/=={'[fv]':'"+FeatureLoggerTestUtils.toFeatureVector(field,Float.toString(FIELD_VALUE_FEATURE_DEFAULT_VAL))+"'}"); Review comment: I think the problem is `final private boolean extractAllFeatures;` in `LTRScoringQuery`. It is set to true and therefore all previous features are added to the response instead of just the model features. This is the logic (line 210 in LTRScoringQuery): ```java if (this.extractAllFeatures) { features = allFeatures; } else{ features = modelFeatures; } ``` I didn't find a way to set it to false with a query param. So we would have two options: - use this constructor `public LTRScoringQuery(LTRScoringModel ltrScoringModel, boolean extractAllFeatures)` - adjust the xpath to something like `[ends-with(., '<fieldname>=0.0')]` I am trying to adjust the xpath, because I would prefer that solution, but I was not able to find the right pattern so far. ########## File path: solr/contrib/ltr/src/test/org/apache/solr/ltr/feature/TestFieldValueFeature.java ########## @@ -70,86 +78,92 @@ public void after() throws Exception { @Test public void testRanking() throws Exception { - - final SolrQuery query = new SolrQuery(); - query.setQuery("title:w1"); - query.add("fl", "*, score"); - query.add("rows", "4"); - - // Normal term match - assertJQ("/query" + query.toQueryString(), "/response/numFound/==4"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='1'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[1]/id=='8'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[2]/id=='6'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[3]/id=='7'"); - - query.add("rq", "{!ltr model=popularity-model reRankDocs=4}"); - - assertJQ("/query" + query.toQueryString(), "/response/numFound/==4"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='8'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[1]/id=='7'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[2]/id=='6'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[3]/id=='1'"); - - query.setQuery("*:*"); - query.remove("rows"); - query.add("rows", "8"); - query.remove("rq"); - query.add("rq", "{!ltr model=popularity-model reRankDocs=8}"); - - assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='8'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[1]/id=='7'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[2]/id=='6'"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[3]/id=='5'"); + for (String field : FIELD_NAMES) { + + final SolrQuery query = new SolrQuery(); + query.setQuery("title:w1"); + query.add("fl", "*, score"); + query.add("rows", "4"); + + // Normal term match + assertJQ("/query" + query.toQueryString(), "/response/numFound/==4"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='1'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[1]/id=='8'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[2]/id=='6'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[3]/id=='7'"); + + query.add("rq", "{!ltr model="+field+"-model reRankDocs=4}"); + + assertJQ("/query" + query.toQueryString(), "/response/numFound/==4"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='8'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[1]/id=='7'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[2]/id=='6'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[3]/id=='1'"); + + query.setQuery("*:*"); + query.remove("rows"); + query.add("rows", "8"); + query.remove("rq"); + query.add("rq", "{!ltr model="+field+"-model reRankDocs=8}"); + + assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='8'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[1]/id=='7'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[2]/id=='6'"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[3]/id=='5'"); + } } @Test public void testIfADocumentDoesntHaveAFieldDefaultValueIsReturned() throws Exception { - SolrQuery query = new SolrQuery(); - query.setQuery("id:42"); - query.add("fl", "*, score"); - query.add("rows", "4"); - - assertJQ("/query" + query.toQueryString(), "/response/numFound/==1"); - assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='42'"); - query = new SolrQuery(); - query.setQuery("id:42"); - query.add("rq", "{!ltr model=popularity-model reRankDocs=4}"); - query.add("fl", "[fv]"); - assertJQ("/query" + query.toQueryString(), "/response/numFound/==1"); - assertJQ("/query" + query.toQueryString(), - "/response/docs/[0]/=={'[fv]':'"+FeatureLoggerTestUtils.toFeatureVector("popularity",Float.toString(FIELD_VALUE_FEATURE_DEFAULT_VAL))+"'}"); - + for (String field : FIELD_NAMES) { + SolrQuery query = new SolrQuery(); + query.setQuery("id:42"); + query.add("fl", "*, score"); + query.add("rows", "4"); + + assertJQ("/query" + query.toQueryString(), "/response/numFound/==1"); + assertJQ("/query" + query.toQueryString(), "/response/docs/[0]/id=='42'"); + query = new SolrQuery(); + query.setQuery("id:42"); + query.add("rq", "{!ltr model="+field+"-model reRankDocs=4}"); + query.add("fl", "[fv]"); + assertJQ("/query" + query.toQueryString(), "/response/numFound/==1"); + assertJQ("/query" + query.toQueryString(), + "/response/docs/[0]/=={'[fv]':'"+FeatureLoggerTestUtils.toFeatureVector(field,Float.toString(FIELD_VALUE_FEATURE_DEFAULT_VAL))+"'}"); Review comment: I think the problem is `final private boolean extractAllFeatures;` in `LTRScoringQuery`. It is set to true and therefore all previous features are added to the response instead of just the model features. This is the logic (line 210 in LTRScoringQuery): ```java if (this.extractAllFeatures) { features = allFeatures; } else{ features = modelFeatures; } ``` I didn't find a way to set it to false with a query param. So we would have two options: - use this constructor `public LTRScoringQuery(LTRScoringModel ltrScoringModel, boolean extractAllFeatures)` - adjust the xpath to something like `[ends-with(., '<fieldname>=0.0')]` I am trying to adjust the xpath, because I would prefer that solution, but I was not able to find the right pattern so far. **Edit:** Ok, I took a break and realized my mistake (the break seemed to be necessary *smh*). I took another look at the jsonpath-adjustment. A pattern like this `"['response']['docs'][0][?(@['[fv]']=~ /.* <fieldName> =0.0.*/i )]"` should work but it would require an additional dependency for `jayway.jsonpath` and we would have to manually call `JsonPath.read(...)` and assert that the result is not null. I am not sure, whether that's nice, but maybe you want to build up upon that / can generate an idea from here? ########## File path: solr/contrib/ltr/src/test/org/apache/solr/ltr/feature/TestFieldValueFeature.java ########## @@ -56,11 +62,13 @@ public void before() throws Exception { assertU(commit()); - loadFeature("popularity", FieldValueFeature.class.getName(), - "{\"field\":\"popularity\"}"); + for (String field : FIELD_NAMES) { + loadFeature(field, FieldValueFeature.class.getName(), + "{\"field\":\""+field+"\"}"); - loadModel("popularity-model", LinearModel.class.getName(), - new String[] {"popularity"}, "{\"weights\":{\"popularity\":1.0}}"); + loadModel(field + "-model", LinearModel.class.getName(), + new String[] {field}, "{\"weights\":{\""+field+"\":1.0}}"); + } Review comment: Thanks for continuing the investigation! I think the example you posted here should work, although I am still a bit irritated regarding the `extractAllFeatures = true`. As user I would expect it to always return all the features that were created, no matter if that happened before or after the model. But the code makes it explicit, that these information should not be modified (`allFeatures` is an unmodifiableList). I think that a nicely placed docString could avoid some confusion. ########## File path: solr/contrib/ltr/src/java/org/apache/solr/ltr/feature/FieldValueFeature.java ########## @@ -106,18 +106,29 @@ public FieldValueFeatureWeight(IndexSearcher searcher, /** * Return a FeatureScorer that uses docValues or storedFields if no docValues are present + * * @param context the segment this FeatureScorer is working with * @return FeatureScorer for the current segment and field * @throws IOException as defined by abstract class Feature */ @Override public FeatureScorer scorer(LeafReaderContext context) throws IOException { if (schemaField != null && !schemaField.stored() && schemaField.hasDocValues()) { - return new DocValuesFieldValueFeatureScorer(this, context, - DocIdSetIterator.all(DocIdSetIterator.NO_MORE_DOCS), schemaField.getType()); + + FieldInfo fieldInfo = context.reader().getFieldInfos().fieldInfo(field); + DocValuesType docValuesType = fieldInfo != null ? fieldInfo.getDocValuesType() : DocValuesType.NONE; + + if (DocValuesType.NUMERIC.equals(docValuesType) || DocValuesType.SORTED.equals(docValuesType)) { + return new DocValuesFieldValueFeatureScorer(this, context, + DocIdSetIterator.all(DocIdSetIterator.NO_MORE_DOCS), schemaField.getType(), docValuesType); + // If type is NONE, this segment has no docs with this field. That's not a problem, because we won't call score() anyway Review comment: Great observation! I completely overlooked this. Your reasoning makes complete sense to me and some debugging verified the behavior you described. Even if it may be an edge case, I think it makes sense to make it explicit. In terms of performance you are also right. Trying to read the stored fields before returning the default value surely is more costly than just returning the value. I started to create a `DefaultValueFieldValueFeatureScorer` that always returns the default value, but until now I am struggling to find a way to test this. I hope your bike is well now! :) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org For additional commands, e-mail: issues-h...@solr.apache.org