cpoerschke commented on code in PR #2120:
URL: https://github.com/apache/solr/pull/2120#discussion_r1417761015


##########
solr/modules/analysis-extras/src/java/org/apache/solr/update/processor/TextToVectorProcessor.java:
##########
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import ai.onnxruntime.OrtException;
+import java.io.File;
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import opennlp.dl.vectors.SentenceVectorsDL;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.update.AddUpdateCommand;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TextToVectorProcessor extends UpdateRequestProcessor {
+
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  private static final String INPUT_FIELD_PARAM = "inputField";
+  private static final String OUTPUT_FIELD_PARAM = "outputField";
+  private static final String MODEL_FILE_NAME_PARAM = "model";
+  private static final String VOCAB_FILE_NAME_PARAM = "vocab";
+
+  private static final String DEFAULT_INPUT_FIELDNAME = "name";
+  private static final String DEFAULT_OUTPUT_FIELDNAME = "film_vector";
+  private static final String DEFAULT_MODEL_FILE_NAME =
+      "/Users/cpoerschke/opennlp-data/onnx/sentence-transformers/model.onnx";
+  private static final String DEFAULT_VOCAB_FILE_NAME =
+      "/Users/cpoerschke/opennlp-data/onnx/sentence-transformers/vocab.txt";
+
+  private String inputFieldname = DEFAULT_INPUT_FIELDNAME;
+  private String outputFieldname = DEFAULT_OUTPUT_FIELDNAME;
+  private String modelFileName = DEFAULT_MODEL_FILE_NAME;
+  private String vocabFileName = DEFAULT_VOCAB_FILE_NAME;
+
+  private final SentenceVectorsDL sentenceVectorsDL;
+
+  TextToVectorProcessor(
+      SolrParams parameters,
+      SolrQueryRequest request,
+      SolrQueryResponse response,
+      UpdateRequestProcessor nextProcessor) {
+    super(nextProcessor);
+
+    if (parameters != null) {
+      this.modelFileName = parameters.get(MODEL_FILE_NAME_PARAM, 
DEFAULT_MODEL_FILE_NAME);
+      this.vocabFileName = parameters.get(VOCAB_FILE_NAME_PARAM, 
DEFAULT_VOCAB_FILE_NAME);
+    }
+
+    SentenceVectorsDL sv = null;
+    try {
+      sv = new SentenceVectorsDL(new File(this.modelFileName), new 
File(this.vocabFileName));
+    } catch (IOException ioe) {
+      log.warn("SentenceVectorsDL initialisation failed", ioe);
+    } catch (OrtException oe) {
+      log.warn("SentenceVectorsDL initialisation failed", oe);
+    }
+    this.sentenceVectorsDL = sv;
+  }
+
+  @Override
+  public void processAdd(AddUpdateCommand command) throws IOException {
+    final SolrInputDocument document = command.getSolrInputDocument();
+    if (document.containsKey(this.inputFieldname)) {
+      final String inputText = (String) 
document.getFieldValue(this.inputFieldname);
+      final float[] vectors = getVectors(inputText);
+      if (vectors != null) {
+        if (!document.containsKey(this.outputFieldname)) {
+          if (log.isInfoEnabled()) {
+            log.info(
+                "for {}='{}' adding {}={}",
+                this.inputFieldname,
+                inputText,
+                this.outputFieldname,
+                vectors);

Review Comment:
   Illustration: the INFO log line emitted by this `log.info` call, with the entire vector written out:
   
   ```
    2> 2050 INFO  
(TEST-TestOpenNLPExtractNamedEntitiesUpdateProcessorFactory.testTextToVector-seed#[E1B49D36ACB1A7A6])
 [n: c: s: r: x: t:] o.a.s.u.p.TextToVectorProcessor for name='Hello World' 
adding film_vector=[-0.3888032, 0.34699255, 0.11776694, -0.34095168, 
-0.06666583, -0.48905686, -0.17004992, 0.13088302, 0.063909784, -0.45450905, 
0.28404686, 0.11651428, 0.8480127, -0.3692923, -0.26619208, 0.07025662, 
-0.33313373, -0.12756985, -0.10953906, -0.05785429, -0.1686004, 0.6803354, 
-0.0545584, 0.14137872, -0.1302295, 0.42987126, 0.18358722, 0.039669953, 
0.34926295, -0.7637918, -0.47905764, -0.011129548, -0.21528748, 0.010859001, 
-0.36128473, 0.20746648, -0.15468447, -0.16313638, -0.32861906, -0.11707684, 
-0.22934996, -0.07812902, 0.41594654, -0.3178699, 0.70466787, 0.57724303, 
-0.08186829, -0.19486351, 0.56620926, -0.315683, -0.3664256, 0.62871945, 
-0.56286377, -0.29139203, -0.19633804, -0.25078568, 0.025854347, 0.2275373, 
0.016218923, -0.23669145, 0.0415261, -0.29401213, 0.14258103, 
-0.121401176, -0.008730955, 0.039560817, 0.0053293803, -0.24547896, 
0.44033033, 0.7013321, 0.9053267, 0.059785035, -0.084875785, -0.16118778, 
-0.1711878, -0.012526851, 0.93482494, 0.045238227, -0.08164379, -0.23231106, 
-0.28685895, 0.13048773, -0.06715582, -0.3958803, -0.40276945, 0.24914381, 
0.1663518, 0.17067464, -0.5423983, -0.042805046, -0.44677478, 0.14615427, 
-0.25440606, 0.13382448, -0.76335627, -0.052609626, 0.26569095, -0.2802104, 
2.5820546E-4, 4.7013593, 0.34225053, 0.9730423, -0.09100162, 0.1523611, 
0.35319296, 0.2618392, -0.27190408, 0.3904167, 0.46644256, 0.096129775, 
0.038468562, -0.68113154, -0.14383656, -0.14390369, -0.12763882, 0.03943621, 
0.13934013, 0.069219515, -0.011218052, 0.059729867, 0.41935375, -0.7073411, 
-0.54900736, -0.23348936, -0.48685154, -1.0377865, -0.3907155, -5.1925535E-32, 
0.34958348, 0.24257527, 0.0922519, -0.056983564, 0.05145917, -0.03483889, 
-0.10464314, -0.24817131, -0.3975137, 0.15658592, 0.014809836, 0.016810402, 
0.13939738, 3.682524E-4, 
 -0.35828018, -0.22547285, -7.414073E-5, 0.11041657, 0.1177998, 0.121245965, 
0.21362822, 0.2089299, 0.29802522, -0.2965596, 0.45853406, -0.3814977, 
0.1616222, 0.14375491, -0.079968534, -0.42107645, 0.07313761, 0.0041592177, 
-0.09183425, 0.13059235, -0.15606982, -0.33942413, 0.038978532, 0.10325194, 
0.58120537, -0.066942625, 0.317512, 0.020956634, -0.09412029, -0.20082667, 
-0.01501832, -0.25486082, -0.009016886, 0.3213461, -0.4404407, -0.5431586, 
-0.02560106, 0.009370689, -0.15988332, 0.7154971, 0.45263883, -0.4779931, 
0.43939096, 0.6388841, -0.21583866, -0.1684763, 0.36716363, -0.32485852, 
-0.03232416, 0.7199912, -0.050039202, -0.16872749, 0.34808043, 0.2741264, 
-0.3621533, 0.09292322, -0.47831714, -0.0050955545, -0.08602172, -0.036092795, 
-0.12762114, 0.3099814, -0.28370318, -0.02446674, 0.7231124, 0.20177883, 
0.5636964, 0.1983404, -0.24632987, -0.17683306, 0.20640743, 0.11485933, 
0.07848426, -0.58890146, -0.11557844, 0.0061890427, -0.7215529, 0.16266064, 
0.6440193, 0.12441084, 
0.068208665, 5.1678064E-32, 0.38650972, -0.016602185, 0.1250662, -0.20177147, 
-0.17153032, 0.08130391, 0.11415684, 0.42104292, -0.5687257, 0.4979761, 
0.16585845, -0.20500977, 0.10016692, -0.21252581, 0.33173344, 0.68953454, 
0.48526096, 0.23091117, -0.16490223, -0.45430768, -0.17198756, 0.015296657, 
-0.27280217, 0.41654357, -0.20554197, 0.13391481, 0.08810015, -0.13057417, 
0.22373128, -0.58190167, -0.0899881, -0.5290033, 0.08644417, -0.4024223, 
-0.001209748, 0.1443516, 0.06197461, -0.11183194, 0.5300779, -0.27028757, 
-0.11026083, -0.07564972, 0.23398972, 0.104088485, 0.108091846, 0.08388511, 
-0.018295512, -0.5918972, 0.13884187, -0.049967855, 0.08229362, 0.20356356, 
0.19936256, -0.27707797, 0.13065122, -0.18001778, -0.22446743, -0.18393275, 
-0.17757297, -0.418278, 0.13075297, -0.53767633, -0.7188872, 0.10799691, 
-0.91248226, 0.50794154, 0.13374779, 0.061186384, -0.049560227, -0.30017966, 
0.053840984, -0.6838608, 0.18776295, 0.27523986, 0.030647015, -0.050793342, 
0.16383554, -0.6075103, 
0.19556412, 0.018092714, -0.16764373, 0.14828643, 0.36529306, 0.23918682, 
-0.14223269, 0.09385912, -0.06041352, 0.5
     2> 251344, 0.07537809, 0.21631652, -0.83510923, 0.3721457, -0.27135444, 
0.005293347, -0.43568593, -8.978257E-8, -0.1748908, -0.10325901, 0.30206668, 
-0.25535515, -0.2113391, 0.19971393, 0.054012146, -0.10861126, 0.16375908, 
-0.32779583, -0.14241043, -0.16339755, 0.08492915, -0.05674003, -0.6035583, 
0.47954202, 0.056245763, -0.43246943, 0.46378663, 0.21455076, 0.06528324, 
0.094717205, 0.050139554, 0.022383839, -0.008147487, 0.60859174, 0.0068899076, 
-0.31076998, -0.47079635, -0.41923416, -0.05839868, -0.20116055, 0.3302345, 
-0.28399584, -0.16132203, -0.737176, 0.08288847, -0.2685848, 0.089237176, 
-0.061198384, 0.37296394, -0.24046403, -0.27589902, -0.02441192, 0.32220256, 
0.1953786, 0.06121865, -0.26500437, 0.38481644, -0.29773325, -0.04927551, 
0.17562498, -0.092829935, 0.38234922, -0.01803736, 0.5299904, 0.4966593, 
0.4951843, 0.54891765, 0.10730604, -0.068107314, 0.21509248, -0.014800385, 
0.1159264]
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org
For additional commands, e-mail: issues-h...@solr.apache.org

Reply via email to