dsmiley commented on code in PR #2809:
URL: https://github.com/apache/solr/pull/2809#discussion_r1855551197


##########
solr/modules/llm/src/java/org/apache/solr/llm/embedding/SolrEmbeddingModel.java:
##########
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.llm.embedding;
+
+import dev.langchain4j.data.embedding.Embedding;
+import dev.langchain4j.model.embedding.EmbeddingModel;
+import java.lang.reflect.Method;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.Objects;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.solr.llm.store.EmbeddingModelException;
+import org.apache.solr.llm.store.rest.ManagedEmbeddingModelStore;
+
+/**
+ * This object wraps a {@link dev.langchain4j.model.embedding.EmbeddingModel} 
to encode text to
+ * vector. It's meant to be used as a managed resource with the {@link 
ManagedEmbeddingModelStore}
+ */
+public class SolrEmbeddingModel implements Accountable {
+  private static final long BASE_RAM_BYTES =
+      RamUsageEstimator.shallowSizeOfInstance(SolrEmbeddingModel.class);
+  private static final String TIMEOUT_PARAM = "timeout";
+  private static final String MAX_SEGMENTS_PER_BATCH_PARAM = 
"maxSegmentsPerBatch";
+  private static final String MAX_RETRIES_PARAM = "maxRetries";
+
+  private final String name;
+  private final Map<String, Object> params;
+  private final EmbeddingModel textToVector;
+  private final Integer hashCode;
+
+  public static SolrEmbeddingModel getInstance(
+      SolrResourceLoader solrResourceLoader,
+      String className,
+      String name,
+      Map<String, Object> params)
+      throws EmbeddingModelException {
+    try {
+      /*
+       * The idea here is to build a {@link 
dev.langchain4j.model.embedding.EmbeddingModel} using inversion
+       * of control.
+       * Each model has its own list of parameters we don't know beforehand, 
but each {@link dev.langchain4j.model.embedding.EmbeddingModel} class
+       * has its own builder that uses setters with the same name of the 
parameter in input.
+       * */
+      EmbeddingModel textToVector;
+      Class<?> modelClass = solrResourceLoader.findClass(className, 
EmbeddingModel.class);
+      var builder = modelClass.getMethod("builder").invoke(null);
+      if (params != null) {
+        /**
+         * Some {@link dev.langchain4j.model.embedding.EmbeddingModel} classes 
have params of
+         * specific types that must be constructed, for primitive types we can 
resort to the
+         * default. N.B. when adding support to new models, pay attention to 
all the parameters they
+         * support, some of them may require to be handled in here as separate 
switch cases
+         */
+        for (String paramName : params.keySet()) {
+          switch (paramName) {
+            case TIMEOUT_PARAM:
+              Duration timeOut = Duration.ofSeconds((Long) 
params.get(paramName));
+              builder.getClass().getMethod(paramName, 
Duration.class).invoke(builder, timeOut);
+              break;
+            case MAX_SEGMENTS_PER_BATCH_PARAM:
+              builder
+                  .getClass()
+                  .getMethod(paramName, Integer.class)
+                  .invoke(builder, ((Long) params.get(paramName)).intValue());
+              break;
+            case MAX_RETRIES_PARAM:
+              builder
+                  .getClass()
+                  .getMethod(paramName, Integer.class)
+                  .invoke(builder, ((Long) params.get(paramName)).intValue());
+              break;
+            default:
+              ArrayList<Method> paramNameMatches = new ArrayList<>();
+              for (var method : builder.getClass().getMethods()) {
+                if (paramName.equals(method.getName()) && 
method.getParameterCount() == 1) {
+                  paramNameMatches.add(method);
+                }
+              }
+              if (paramNameMatches.size() == 1) {
+                paramNameMatches.get(0).invoke(builder, params.get(paramName));
+              } else {
+                builder

Review Comment:
   Have you tested this code path?



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########


Review Comment:
   Possibly rename later (after we debate the text of the doc), as I don't yet 
know what "embedding text" is and this doc doesn't yet explain what that is.



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.
+
+
+== Text Embedding Concepts
+
+=== From Text to Vector
+
+The task of sentence similarity aims to encode text to vector in a way that 
sentences semantically similar are encoded to vectors close in a vector space 
(using a vector distance metric).
+
+
+=== Large Language Models 
+
+Large Language Models can be fine-tuned for such task.
+The resulting model is able to encode text to a numerical vector.
+
+For additional information you can refer to this 
https://sease.io/2021/12/using-bert-to-improve-search-relevance.html[blog post].
+
+==== Embedding Services
+
+Training, fine-tuning and operating such Large Language Models is expensive.
+
+Many companies focus on this aspect and let users access APIs to encode the 
text (at the price of a license fee).
+
+Apache Solr uses https://github.com/langchain4j/langchain4j[LangChain4j] to 
connect to such apis.
+
+[IMPORTANT]
+====
+At the moment a subset of the embedding models supported by LangChain4j is 
supported by Solr.
+
+*Disclaimer*: Apache Solr is *in no way* affiliated to any of these 
corporations or services.
+
+If you want to add support for additional services or improve the support for 
the existing ones, feel free to contribute:
+
+* https://github.com/apache/solr/blob/main/CONTRIBUTING.md[Contributing to 
Solr]

Review Comment:
   That's a given for any open-source project.



##########
solr/modules/llm/src/java/org/apache/solr/llm/store/EmbeddingModelStore.java:
##########
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.llm.store;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.solr.llm.embedding.SolrEmbeddingModel;
+
+/** Simple store to manage CRUD operations on the {@link SolrEmbeddingModel} */
+public class EmbeddingModelStore {
+
+  private final Map<String, SolrEmbeddingModel> availableModels;
+
+  public EmbeddingModelStore() {
+    availableModels = Collections.synchronizedMap(new LinkedHashMap<>());

Review Comment:
   the javadocs for synchronizedMap indicate that we must additionally 
synchronize when iterating all the values.  That occurs in getModels but you 
didn't do that.



##########
solr/modules/llm/src/java/org/apache/solr/llm/store/EmbeddingModelStore.java:
##########
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.llm.store;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.solr.llm.embedding.SolrEmbeddingModel;
+
+/** Simple store to manage CRUD operations on the {@link SolrEmbeddingModel} */
+public class EmbeddingModelStore {
+
+  private final Map<String, SolrEmbeddingModel> availableModels;
+
+  public EmbeddingModelStore() {
+    availableModels = Collections.synchronizedMap(new LinkedHashMap<>());
+  }
+
+  public SolrEmbeddingModel getModel(String name) {
+    return availableModels.get(name);
+  }
+
+  public void clear() {
+    availableModels.clear();
+  }
+
+  public List<SolrEmbeddingModel> getModels() {
+    final List<SolrEmbeddingModel> availableModelsValues =
+        new ArrayList<SolrEmbeddingModel>(availableModels.values());
+    return Collections.unmodifiableList(availableModelsValues);
+  }
+
+  @Override
+  public String toString() {
+    return "ModelStore [availableModels=" + availableModels.keySet() + "]";
+  }
+
+  public SolrEmbeddingModel delete(String modelName) {
+    return availableModels.remove(modelName);
+  }
+
+  public void addModel(SolrEmbeddingModel modeldata) throws 
EmbeddingModelException {
+    final String name = modeldata.getName();
+    if (availableModels.containsKey(name)) {

Review Comment:
   Not thread-safe (check-then-act on the map); try putIfAbsent instead



##########
solr/modules/llm/src/java/org/apache/solr/llm/embedding/SolrEmbeddingModel.java:
##########
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.llm.embedding;
+
+import dev.langchain4j.data.embedding.Embedding;
+import dev.langchain4j.model.embedding.EmbeddingModel;
+import java.lang.reflect.Method;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.Objects;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.solr.llm.store.EmbeddingModelException;
+import org.apache.solr.llm.store.rest.ManagedEmbeddingModelStore;
+
+/**
+ * This object wraps a {@link dev.langchain4j.model.embedding.EmbeddingModel} 
to encode text to
+ * vector. It's meant to be used as a managed resource with the {@link 
ManagedEmbeddingModelStore}
+ */
+public class SolrEmbeddingModel implements Accountable {
+  private static final long BASE_RAM_BYTES =
+      RamUsageEstimator.shallowSizeOfInstance(SolrEmbeddingModel.class);
+  private static final String TIMEOUT_PARAM = "timeout";
+  private static final String MAX_SEGMENTS_PER_BATCH_PARAM = 
"maxSegmentsPerBatch";
+  private static final String MAX_RETRIES_PARAM = "maxRetries";
+
+  private final String name;
+  private final Map<String, Object> params;
+  private final EmbeddingModel textToVector;
+  private final Integer hashCode;
+
+  public static SolrEmbeddingModel getInstance(
+      SolrResourceLoader solrResourceLoader,
+      String className,
+      String name,
+      Map<String, Object> params)
+      throws EmbeddingModelException {
+    try {
+      /*
+       * The idea here is to build a {@link 
dev.langchain4j.model.embedding.EmbeddingModel} using inversion
+       * of control.
+       * Each model has its own list of parameters we don't know beforehand, 
but each {@link dev.langchain4j.model.embedding.EmbeddingModel} class
+       * has its own builder that uses setters with the same name of the 
parameter in input.
+       * */
+      EmbeddingModel textToVector;
+      Class<?> modelClass = solrResourceLoader.findClass(className, 
EmbeddingModel.class);
+      var builder = modelClass.getMethod("builder").invoke(null);
+      if (params != null) {
+        /**
+         * Some {@link dev.langchain4j.model.embedding.EmbeddingModel} classes 
have params of
+         * specific types that must be constructed, for primitive types we can 
resort to the
+         * default. N.B. when adding support to new models, pay attention to 
all the parameters they
+         * support, some of them may require to be handled in here as separate 
switch cases
+         */
+        for (String paramName : params.keySet()) {
+          switch (paramName) {
+            case TIMEOUT_PARAM:
+              Duration timeOut = Duration.ofSeconds((Long) 
params.get(paramName));
+              builder.getClass().getMethod(paramName, 
Duration.class).invoke(builder, timeOut);
+              break;
+            case MAX_SEGMENTS_PER_BATCH_PARAM:
+              builder
+                  .getClass()
+                  .getMethod(paramName, Integer.class)
+                  .invoke(builder, ((Long) params.get(paramName)).intValue());
+              break;
+            case MAX_RETRIES_PARAM:
+              builder
+                  .getClass()
+                  .getMethod(paramName, Integer.class)
+                  .invoke(builder, ((Long) params.get(paramName)).intValue());
+              break;
+            default:
+              ArrayList<Method> paramNameMatches = new ArrayList<>();

Review Comment:
   this section could use a comment or two or three to explain what/why



##########
solr/modules/llm/src/test-files/solr/collection1/conf/schema.xml:
##########
@@ -0,0 +1,52 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Test schema file for DenseVectorField -->
+
+<schema name="schema-densevector" version="1.0">

Review Comment:
   wow, version 1.0, which was maybe when Solr was introduced.  Should use the 
latest version (1.7)



##########
solr/modules/llm/src/java/org/apache/solr/llm/embedding/SolrEmbeddingModel.java:
##########
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.llm.embedding;
+
+import dev.langchain4j.data.embedding.Embedding;
+import dev.langchain4j.model.embedding.EmbeddingModel;
+import java.lang.reflect.Method;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.Objects;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.solr.llm.store.EmbeddingModelException;
+import org.apache.solr.llm.store.rest.ManagedEmbeddingModelStore;
+
+/**
+ * This object wraps a {@link dev.langchain4j.model.embedding.EmbeddingModel} 
to encode text to
+ * vector. It's meant to be used as a managed resource with the {@link 
ManagedEmbeddingModelStore}
+ */
+public class SolrEmbeddingModel implements Accountable {
+  private static final long BASE_RAM_BYTES =
+      RamUsageEstimator.shallowSizeOfInstance(SolrEmbeddingModel.class);
+  private static final String TIMEOUT_PARAM = "timeout";
+  private static final String MAX_SEGMENTS_PER_BATCH_PARAM = 
"maxSegmentsPerBatch";
+  private static final String MAX_RETRIES_PARAM = "maxRetries";
+
+  private final String name;
+  private final Map<String, Object> params;
+  private final EmbeddingModel textToVector;
+  private final Integer hashCode;

Review Comment:
   why not a primitive?



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.
+
+
+== Text Embedding Concepts
+
+=== From Text to Vector
+
+The task of sentence similarity aims to encode text to vector in a way that 
sentences semantically similar are encoded to vectors close in a vector space 
(using a vector distance metric).
+
+
+=== Large Language Models 
+
+Large Language Models can be fine-tuned for such task.
+The resulting model is able to encode text to a numerical vector.
+
+For additional information you can refer to this 
https://sease.io/2021/12/using-bert-to-improve-search-relevance.html[blog post].

Review Comment:
   I read _some_ of that.  It was a long interesting document.  It didn't say 
"LLM" but it speaks of "language models", which I assume are large.  It does 
say "Embedding" a bunch of times.  On that subject, that term doesn't seem 
particularly relevant to this module.  What's relevant, as far as Solr is 
concerned, is: *text in, numerical vector out*.  Easy; no terminology or education 
on embeddings or LLMs for that matter even needed.  Heck next year if some new 
tech / approach can do this, this module will still exist but "LLM" may become 
anachronistic.  Your call on this.



##########
solr/modules/llm/src/test/org/apache/solr/llm/TestLlmBase.java:
##########
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.llm;
+
+import java.lang.invoke.MethodHandles;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.commons.io.file.PathUtils;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.llm.store.rest.ManagedEmbeddingModelStore;
+import org.apache.solr.util.RestTestBase;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestLlmBase extends RestTestBase {
+
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  protected static Path tmpSolrHome;
+  protected static Path tmpConfDir;
+
+  public static final String MODEL_FILE_NAME = 
"_schema_embedding-model-store.json";
+  protected static final String COLLECTION = "collection1";
+  protected static final String CONF_DIR = COLLECTION + "/conf";
+
+  protected static Path embeddingModelStoreFile = null;
+
+  protected static String IDField = "id";
+  protected static String stringField = "string_field";
+  protected static String vectorField = "vector";
+  protected static String vectorField2 = "vector2";
+  protected static String vectorFieldByteEncoding = "vector_byte_encoding";
+
+  protected static void setupTest(
+      String solrconfig, String schema, boolean buildIndex, boolean 
persistModelStore)
+      throws Exception {
+    initFolders(persistModelStore);
+    createJettyAndHarness(
+        tmpSolrHome.toAbsolutePath().toString(), solrconfig, schema, "/solr", 
true, null);
+    if (buildIndex) prepareIndex();
+  }
+
+  protected static void initFolders(boolean isPersistent) throws Exception {
+    tmpSolrHome = createTempDir();
+    tmpConfDir = tmpSolrHome.resolve(CONF_DIR);
+    tmpConfDir.toFile().deleteOnExit();
+    PathUtils.copyDirectory(TEST_PATH(), tmpSolrHome.toAbsolutePath());
+
+    final Path modelStore = tmpConfDir.resolve(MODEL_FILE_NAME);
+
+    if (isPersistent) {
+      embeddingModelStoreFile = modelStore;
+    }
+
+    if (Files.exists(modelStore)) {
+      if (log.isInfoEnabled()) {
+        log.info("remove model store config file in {}", 
modelStore.toAbsolutePath());
+      }
+      Files.delete(modelStore);
+    }
+
+    System.setProperty("managed.schema.mutable", "true");
+  }
+
+  protected static void afterTest() throws Exception {
+    if (null != restTestHarness) {
+      restTestHarness.close();
+      restTestHarness = null;
+    }
+    solrClientTestRule.reset();
+    if (null != tmpSolrHome) {
+      PathUtils.deleteDirectory(tmpSolrHome);
+      tmpSolrHome = null;
+    }
+    System.clearProperty("managed.schema.mutable");
+  }
+
+  /** produces a model encoded in json * */
+  public static String getModelInJson(String name, String className, String 
params) {
+    final StringBuilder sb = new StringBuilder();
+    sb.append("{\n");
+    sb.append("\"name\":").append('"').append(name).append('"').append(",\n");
+    
sb.append("\"class\":").append('"').append(className).append('"').append(",\n");
+    if (params != null) {
+      sb.append(",\n");
+      sb.append("\"params\":").append(params);
+    }
+    sb.append("\n}\n");
+    return sb.toString();
+  }
+
+  protected static void loadModel(String name, String className, String 
params) throws Exception {
+    final String model = getModelInJson(name, className, params);
+    log.info("loading model \n{} ", model);

Review Comment:
   newlines are unusual in log messages.  If you think this one is helpful then 
okay.



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.
+
+
+== Text Embedding Concepts
+
+=== From Text to Vector
+
+The task of sentence similarity aims to encode text to vector in a way that 
sentences semantically similar are encoded to vectors close in a vector space 
(using a vector distance metric).

Review Comment:
   ```suggestion
   The task of sentence similarity aims to encode text to numerical vectors in 
a way that semantically similar sentences are encoded to vectors close in a 
vector space.
   Vector distance metrics (algorithms) compute a pairwise similarity, 
producing a score.
   ```
   
   The heading "From Text to Vector" above this sounds nice yet the text about 
sentence similarity doesn't seem to go with it at all.



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.
+
+
+== Text Embedding Concepts
+
+=== From Text to Vector
+
+The task of sentence similarity aims to encode text to vector in a way that 
sentences semantically similar are encoded to vectors close in a vector space 
(using a vector distance metric).
+
+
+=== Large Language Models 
+
+Large Language Models can be fine-tuned for such task.

Review Comment:
   ```suggestion
   Large Language Models can be fine-tuned for such a task.
   ```
   
   But moreover this point seems irrelevant.  This module will not be fine 
tuning a model, nor are we advising the user to somehow do so.



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.

Review Comment:
   I'm not well-versed in this tech so I'd like to give a shot at rewording 
this a little in hopes a wide audience will grok what this is about:
   ```suggestion
   This module brings the power of *Large Language Models* (*LLM*s) to Solr.  
More specifically, it provides a text-to-vector capability, used on documents 
or queries, via integrating with popular external services that do this.  The 
state-of-the-art of such services use an LLM, hence the name of this module. 
   _Without_ this module, vectors must be supplied _to_ Solr for indexing & 
searching, possibly coordinating with such services.
   ```



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.
+
+
+== Text Embedding Concepts
+
+=== From Text to Vector
+
+The task of sentence similarity aims to encode text to vector in a way that 
sentences semantically similar are encoded to vectors close in a vector space 
(using a vector distance metric).
+
+
+=== Large Language Models 
+
+Large Language Models can be fine-tuned for such task.
+The resulting model is able to encode text to a numerical vector.
+
+For additional information you can refer to this 
https://sease.io/2021/12/using-bert-to-improve-search-relevance.html[blog post].
+
+==== Embedding Services
+
+Training, fine-tuning and operating such Large Language Models is expensive.
+

Review Comment:
   No reason to be allergic to multi-sentence paragraphs; the text here flows 
nicely.



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.
+

Review Comment:
   I think it's really important (with an "IMPORTANT" box up front) to 
emphasize that this module is going to send your documents and text off to some 
hosted service on the internet.  There are cost, privacy, performance, and 
service availability implications on such a strong dependency that should be 
diligently examined before employing this module in a serious way.



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.
+
+
+== Text Embedding Concepts
+
+=== From Text to Vector
+
+The task of sentence similarity aims to encode text to vector in a way that 
sentences semantically similar are encoded to vectors close in a vector space 
(using a vector distance metric).
+
+
+=== Large Language Models 
+
+Large Language Models can be fine-tuned for such task.
+The resulting model is able to encode text to a numerical vector.

Review Comment:
   Result of what?



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.
+
+
+== Text Embedding Concepts
+
+=== From Text to Vector
+
+The task of sentence similarity aims to encode text to vector in a way that 
sentences semantically similar are encoded to vectors close in a vector space 
(using a vector distance metric).
+
+
+=== Large Language Models 
+
+Large Language Models can be fine-tuned for such task.
+The resulting model is able to encode text to a numerical vector.
+
+For additional information you can refer to this 
https://sease.io/2021/12/using-bert-to-improve-search-relevance.html[blog post].
+
+==== Embedding Services
+
+Training, fine-tuning and operating such Large Language Models is expensive.
+
+Many companies focus on this aspect and let users access APIs to encode the 
text (at the price of a license fee).
+
+Apache Solr uses https://github.com/langchain4j/langchain4j[LangChain4j] to 
connect to such apis.
+
+[IMPORTANT]
+====
+At the moment a subset of the embedding models supported by LangChain4j is 
supported by Solr.
+
+*Disclaimer*: Apache Solr is *in no way* affiliated to any of these 
corporations or services.
+
+If you want to add support for additional services or improve the support for 
the existing ones, feel free to contribute:
+
+* https://github.com/apache/solr/blob/main/CONTRIBUTING.md[Contributing to 
Solr]
+====
+
+== Module
+
+This is provided via the `llm` xref:configuration-guide:solr-modules.adoc[Solr 
Module] that needs to be enabled before use.
+
+At the moment the only supported way to interact with Large Language Models is 
via embedding text.

Review Comment:
   versus what?



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.
+
+
+== Text Embedding Concepts
+
+=== From Text to Vector
+
+The task of sentence similarity aims to encode text to vector in a way that 
sentences semantically similar are encoded to vectors close in a vector space 
(using a vector distance metric).
+
+
+=== Large Language Models 
+
+Large Language Models can be fine-tuned for such task.
+The resulting model is able to encode text to a numerical vector.
+
+For additional information you can refer to this 
https://sease.io/2021/12/using-bert-to-improve-search-relevance.html[blog post].
+
+==== Embedding Services
+
+Training, fine-tuning and operating such Large Language Models is expensive.
+
+Many companies focus on this aspect and let users access APIs to encode the 
text (at the price of a license fee).
+
+Apache Solr uses https://github.com/langchain4j/langchain4j[LangChain4j] to 
connect to such apis.
+
+[IMPORTANT]
+====
+At the moment a subset of the embedding models supported by LangChain4j is 
supported by Solr.
+
+*Disclaimer*: Apache Solr is *in no way* affiliated to any of these 
corporations or services.
+
+If you want to add support for additional services or improve the support for 
the existing ones, feel free to contribute:
+
+* https://github.com/apache/solr/blob/main/CONTRIBUTING.md[Contributing to 
Solr]
+====
+
+== Module
+
+This is provided via the `llm` xref:configuration-guide:solr-modules.adoc[Solr 
Module] that needs to be enabled before use.
+
+At the moment the only supported way to interact with Large Language Models is 
via embedding text.
+
+In the future additional components to empower Solr with LLM will be added.
+
+
+== LLM Configuration
+
+Large-Language-Model is a module and therefore its plugins must be configured 
in `solrconfig.xml`.

Review Comment:
   Not entirely wrong but an awkward cause-effect.  Like here is a car and 
therefore it goes.
   
   Basically just say, you need to register / configure the plugins provided by 
the LLM module that you want to use.  This is done in solrconfig.xml.  These 
plugins self-actuate a RESTful model API, similar to the LTR module.



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.
+
+
+== Text Embedding Concepts
+
+=== From Text to Vector
+
+The task of sentence similarity aims to encode text to vector in a way that 
sentences semantically similar are encoded to vectors close in a vector space 
(using a vector distance metric).
+
+
+=== Large Language Models 
+
+Large Language Models can be fine-tuned for such task.
+The resulting model is able to encode text to a numerical vector.
+
+For additional information you can refer to this 
https://sease.io/2021/12/using-bert-to-improve-search-relevance.html[blog post].
+
+==== Embedding Services
+
+Training, fine-tuning and operating such Large Language Models is expensive.
+
+Many companies focus on this aspect and let users access APIs to encode the 
text (at the price of a license fee).
+
+Apache Solr uses https://github.com/langchain4j/langchain4j[LangChain4j] to 
connect to such apis.
+
+[IMPORTANT]
+====
+At the moment a subset of the embedding models supported by LangChain4j is 
supported by Solr.

Review Comment:
   This sentence is good to know but doesn't seem worthy of IMPORTANT; could 
easily be the sentence immediately following introducing LangChain4j.



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.
+
+
+== Text Embedding Concepts
+
+=== From Text to Vector
+
+The task of sentence similarity aims to encode text to vector in a way that 
sentences semantically similar are encoded to vectors close in a vector space 
(using a vector distance metric).
+
+
+=== Large Language Models 
+
+Large Language Models can be fine-tuned for such task.
+The resulting model is able to encode text to a numerical vector.
+
+For additional information you can refer to this 
https://sease.io/2021/12/using-bert-to-improve-search-relevance.html[blog post].
+
+==== Embedding Services
+
+Training, fine-tuning and operating such Large Language Models is expensive.
+
+Many companies focus on this aspect and let users access APIs to encode the 
text (at the price of a license fee).
+
+Apache Solr uses https://github.com/langchain4j/langchain4j[LangChain4j] to 
connect to such apis.
+
+[IMPORTANT]
+====
+At the moment a subset of the embedding models supported by LangChain4j is 
supported by Solr.
+
+*Disclaimer*: Apache Solr is *in no way* affiliated to any of these 
corporations or services.
+
+If you want to add support for additional services or improve the support for 
the existing ones, feel free to contribute:
+
+* https://github.com/apache/solr/blob/main/CONTRIBUTING.md[Contributing to 
Solr]
+====
+
+== Module
+
+This is provided via the `llm` xref:configuration-guide:solr-modules.adoc[Solr 
Module] that needs to be enabled before use.
+
+At the moment the only supported way to interact with Large Language Models is 
via embedding text.
+
+In the future additional components to empower Solr with LLM will be added.
+
+
+== LLM Configuration
+
+Large-Language-Model is a module and therefore its plugins must be configured 
in `solrconfig.xml`.
+
+=== Minimum Requirements

Review Comment:
   I think this heading can go



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.
+
+
+== Text Embedding Concepts
+
+=== From Text to Vector
+
+The task of sentence similarity aims to encode text to vector in a way that 
sentences semantically similar are encoded to vectors close in a vector space 
(using a vector distance metric).
+
+
+=== Large Language Models 
+
+Large Language Models can be fine-tuned for such task.
+The resulting model is able to encode text to a numerical vector.
+
+For additional information you can refer to this 
https://sease.io/2021/12/using-bert-to-improve-search-relevance.html[blog post].
+
+==== Embedding Services
+
+Training, fine-tuning and operating such Large Language Models is expensive.
+
+Many companies focus on this aspect and let users access APIs to encode the 
text (at the price of a license fee).
+
+Apache Solr uses https://github.com/langchain4j/langchain4j[LangChain4j] to 
connect to such apis.
+
+[IMPORTANT]
+====
+At the moment a subset of the embedding models supported by LangChain4j is 
supported by Solr.
+
+*Disclaimer*: Apache Solr is *in no way* affiliated to any of these 
corporations or services.
+
+If you want to add support for additional services or improve the support for 
the existing ones, feel free to contribute:
+
+* https://github.com/apache/solr/blob/main/CONTRIBUTING.md[Contributing to 
Solr]
+====
+
+== Module
+
+This is provided via the `llm` xref:configuration-guide:solr-modules.adoc[Solr 
Module] that needs to be enabled before use.
+
+At the moment the only supported way to interact with Large Language Models is 
via embedding text.
+
+In the future additional components to empower Solr with LLM will be added.
+
+
+== LLM Configuration
+
+Large-Language-Model is a module and therefore its plugins must be configured 
in `solrconfig.xml`.
+
+=== Minimum Requirements
+
+* Declaration of the `text_to_vector` query parser.
++
+[source,xml]
+----
+<queryParser name="text_to_vector" 
class="org.apache.solr.llm.search.TextToVectorQParserPlugin"/>
+----
+
+== Text Embedding Lifecycle

Review Comment:
   "Text Embedding" hasn't been defined yet, and I don't think you define it at 
all.  I naively hope to avoid this concept but I'll leave it to you if it's 
important terminology.



##########
solr/modules/llm/src/test/org/apache/solr/llm/store/rest/TestModelManager.java:
##########
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.llm.store.rest;
+
+import dev.langchain4j.model.cohere.CohereEmbeddingModel;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.solr.llm.TestLlmBase;
+import org.apache.solr.llm.search.TextEmbedderQParserPlugin;
+import org.apache.solr.rest.ManagedResource;
+import org.apache.solr.rest.ManagedResourceStorage;
+import org.apache.solr.rest.RestManager;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestModelManager extends TestLlmBase {
+
+  @BeforeClass
+  public static void init() throws Exception {
+    setuptest(false);
+  }
+
+  @Test
+  public void test() throws Exception {
+    final SolrResourceLoader loader = new SolrResourceLoader(tmpSolrHome);
+
+    final RestManager.Registry registry = loader.getManagedResourceRegistry();
+    assertNotNull(
+        "Expected a non-null RestManager.Registry from the 
SolrResourceLoader!", registry);
+
+    final String resourceId = "/schema/mstore1";
+    registry.registerManagedResource(
+        resourceId, ManagedEmbeddingModelStore.class, new 
TextEmbedderQParserPlugin());
+
+    final NamedList<String> initArgs = new NamedList<>();
+
+    final RestManager restManager = new RestManager();
+    restManager.init(loader, initArgs, new 
ManagedResourceStorage.InMemoryStorageIO());
+
+    final ManagedResource res = restManager.getManagedResource(resourceId);
+    assertTrue(res instanceof ManagedEmbeddingModelStore);
+    assertEquals(res.getResourceId(), resourceId);
+  }
+
+  @Test
+  public void testRestManagerEndpoints() throws Exception {
+    assertJQ("/schema/managed", "/responseHeader/status==0");
+
+    final String cohereModelClassName = CohereEmbeddingModel.class.getName();
+
+    // Add models
+    String model = "{ \"name\":\"testModel1\", \"class\":\"" + 
cohereModelClassName + "\"}";
+    // fails since it does not have params
+    assertJPut(ManagedEmbeddingModelStore.REST_END_POINT, model, 
"/responseHeader/status==400");
+    // success
+    model =
+        "{ name:\"testModel2\", class:\""
+            + cohereModelClassName
+            + "\","
+            + "params:{"
+            + "baseUrl:\"https://api.cohere.ai/v1/\","
+            + "apiKey:\"cohereApiKey2\","
+            + "modelName:\"embed-english-light-v3.0\","
+            + "inputType:\"search_document\","
+            + "logRequests:true,"
+            + "logResponses:false"
+            + "}}";
+    assertJPut(ManagedEmbeddingModelStore.REST_END_POINT, model, 
"/responseHeader/status==0");
+    // success
+    final String multipleModels =
+        "[{ name:\"testModel3\", class:\""

Review Comment:
   He was thinking out loud, not telling you (or even implying) that you should do it differently. 
 If I may do the same, I'll say I look forward to us using text blocks, now 
available on main due to Java 21.  I suppose you will back-port this to 9.x so 
I suppose you won't bother using Java 21 features yet.



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.
+
+
+== Text Embedding Concepts
+
+=== From Text to Vector
+
+The task of sentence similarity aims to encode text to vector in a way that 
sentences semantically similar are encoded to vectors close in a vector space 
(using a vector distance metric).
+
+
+=== Large Language Models 
+
+Large Language Models can be fine-tuned for such task.
+The resulting model is able to encode text to a numerical vector.
+
+For additional information you can refer to this 
https://sease.io/2021/12/using-bert-to-improve-search-relevance.html[blog post].
+
+==== Embedding Services
+
+Training, fine-tuning and operating such Large Language Models is expensive.
+
+Many companies focus on this aspect and let users access APIs to encode the 
text (at the price of a license fee).
+
+Apache Solr uses https://github.com/langchain4j/langchain4j[LangChain4j] to 
connect to such apis.
+
+[IMPORTANT]
+====
+At the moment a subset of the embedding models supported by LangChain4j is 
supported by Solr.
+
+*Disclaimer*: Apache Solr is *in no way* affiliated to any of these 
corporations or services.
+
+If you want to add support for additional services or improve the support for 
the existing ones, feel free to contribute:
+
+* https://github.com/apache/solr/blob/main/CONTRIBUTING.md[Contributing to 
Solr]
+====
+
+== Module
+
+This is provided via the `llm` xref:configuration-guide:solr-modules.adoc[Solr 
Module] that needs to be enabled before use.
+
+At the moment the only supported way to interact with Large Language Models is 
via embedding text.
+
+In the future additional components to empower Solr with LLM will be added.
+
+
+== LLM Configuration
+
+Large-Language-Model is a module and therefore its plugins must be configured 
in `solrconfig.xml`.
+
+=== Minimum Requirements
+
+* Declaration of the `text_to_vector` query parser.
++
+[source,xml]
+----
+<queryParser name="text_to_vector" 
class="org.apache.solr.llm.search.TextToVectorQParserPlugin"/>
+----
+
+== Text Embedding Lifecycle
+
+
+=== Models
+
+* A model encodes text to a vector.
+* A model in Solr is a reference to an external API that runs the Large 
Language Model responsible for text embedding.
+
+*N.B.* the Solr embedding model specifies the parameters to access the APIs, 
the model doesn't run internally in Solr
+
+
+A model is described by these parameters:
+
+
+`class`::
++
+[%autowidth,frame=none]
+|===
+s|Required |Default: none
+|===
++
+The model implementation.
+Accepted values: 
+
+* `dev.langchain4j.model.huggingface.HuggingFaceEmbeddingModel`.
+* `dev.langchain4j.model.mistralai.MistralAiEmbeddingModel`.
+* `dev.langchain4j.model.openai.OpenAiEmbeddingModel`.
+* `dev.langchain4j.model.cohere.CohereEmbeddingModel`.
+
+
+`name`::
++
+[%autowidth,frame=none]
+|===
+s|Required |Default: none
+|===
++
+The identifier of your model, this is used by any component that intends to 
use the model (`text_to_vector` query parser).
+
+`params`::
++
+[%autowidth,frame=none]
+|===
+|Optional |Default: none
+|===
++
+Each model class has potentially different params.
+Many are shared but for the full set of parameters of the model you are 
interested in please refer to the official documentation of the LangChain4j 
version included in Solr: 
https://docs.langchain4j.dev/category/embedding-models[Embedding Models in 
LangChain4j].
+
+
+=== Supported Models
+Apache Solr uses https://github.com/langchain4j/langchain4j[LangChain4j] to 
support text embedding.
+The models currently supported are:
+
+[tabs#supported-models]
+======
+Hugging Face::
++
+====
+
+[source,json]
+----
+{
+  "class": "dev.langchain4j.model.huggingface.HuggingFaceEmbeddingModel",
+  "name": "<a-name-for-your-model>",
+  "params": {
+    "accessToken": "<your-huggingface-api-key>",
+    "modelId": "<a-huggingface-embedding-model>"
+  }
+}
+----
+====
+
+MistralAI::
++
+====
+[source,json]
+----
+{
+  "class": "dev.langchain4j.model.mistralai.MistralAiEmbeddingModel",
+  "name": "<a-name-for-your-model>",
+  "params": {
+    "baseUrl": "https://api.mistral.ai/v1",
+    "apiKey": "<your-mistralAI-api-key>",
+    "modelName": "<a-mistralAI-embedding-model>",
+    "timeout": 60,
+    "logRequests": true,
+    "logResponses": true,
+    "maxRetries": 5
+  }
+}
+----
+====
+
+OpenAI::
++
+====
+[source,json]
+----
+{
+  "class": "dev.langchain4j.model.openai.OpenAiEmbeddingModel",
+  "name": "<a-name-for-your-model>",
+  "params": {
+    "baseUrl": "https://api.openai.com/v1",
+    "apiKey": "<your-openAI-api-key>",
+    "modelName": "<a-openAI-embedding-model>",
+    "timeout": 60,
+    "logRequests": true,
+    "logResponses": true,
+    "maxRetries": 5
+  }
+}
+----
+====
+
+Cohere::
++
+====
+[source,json]
+----
+{
+  "class": "dev.langchain4j.model.cohere.CohereEmbeddingModel",
+  "name": "<a-name-for-your-model>",
+  "params": {
+    "baseUrl": "https://api.cohere.ai/v1/",
+    "apiKey": "<your-cohere-api-key>",
+    "modelName": "<a-cohere-embedding-model>",
+    "inputType": "search_document",
+    "timeout": 60,
+    "logRequests": true,
+    "logResponses": true
+  }
+}
+----
+====
+======
+
+=== Uploading a Model
+
+To upload the model in a `/path/myModel.json` file, please run:
+
+[source,bash]
+----
+curl -XPUT 
'http://localhost:8983/solr/techproducts/schema/embedding-model-store' 
--data-binary "@/path/myModel.json" -H 'Content-type:application/json'
+----
+
+
+To view all models:
+
+[source,text]
+http://localhost:8983/solr/techproducts/schema/embedding-model-store
+
+To delete the `currentModel` model:
+
+[source,bash]
+----
+curl -XDELETE 
'http://localhost:8983/solr/techproducts/schema/embedding-model-store/currentModel'
+----
+
+
+To view the model you just uploaded please open the following URL in a browser:
+
+[source,text]
+http://localhost:8983/solr/techproducts/schema/embedding-model-store
+
+.Example: /path/myModel.json
+[source,json]
+----
+{
+  "class": "dev.langchain4j.model.openai.OpenAiEmbeddingModel",
+  "name": "openai-1",
+  "params": {
+    "baseUrl": "https://api.openai.com/v1",
+    "apiKey": "apiKey-openAI",
+    "modelName": "text-embedding-3-small",
+    "timeout": 60,
+    "logRequests": true,
+    "logResponses": true,
+    "maxRetries": 5
+  }
+}
+
+----
+
+=== Running an embedding Query
+To run a query that embeds your query text, using a model you previously 
uploaded is simple:
+
+[source,text]
+?q={!text_to_vector model=a-model f=vector topK=10}hello world query

Review Comment:
   Did you consider adding capabilities to KnnQParser instead of defining a new 
one?  Subclassing seems to lose the fact that we are doing KNN.  I suppose the 
issue is modularization, as KNN is in Solr-Core but what you are adding is 
elsewhere.  Nonetheless, some interfaces could be in solr-core.  



##########
solr/solr-ref-guide/modules/query-guide/pages/embedding-text.adoc:
##########
@@ -0,0 +1,269 @@
+= Embedding Text
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+With the *Large Language Model* (or *LLM* for short) module you can interact 
with Large Language Models in Solr to encode text to vectors at indexing and 
query time.
+
+
+== Text Embedding Concepts
+
+=== From Text to Vector
+
+The task of sentence similarity aims to encode text to vector in a way that 
sentences semantically similar are encoded to vectors close in a vector space 
(using a vector distance metric).
+
+
+=== Large Language Models 
+
+Large Language Models can be fine-tuned for such task.
+The resulting model is able to encode text to a numerical vector.
+
+For additional information you can refer to this 
https://sease.io/2021/12/using-bert-to-improve-search-relevance.html[blog post].
+
+==== Embedding Services
+
+Training, fine-tuning and operating such Large Language Models is expensive.
+
+Many companies focus on this aspect and let users access APIs to encode the 
text (at the price of a license fee).
+
+Apache Solr uses https://github.com/langchain4j/langchain4j[LangChain4j] to 
connect to such apis.
+
+[IMPORTANT]
+====
+At the moment a subset of the embedding models supported by LangChain4j is 
supported by Solr.
+
+*Disclaimer*: Apache Solr is *in no way* affiliated to any of these 
corporations or services.
+
+If you want to add support for additional services or improve the support for 
the existing ones, feel free to contribute:
+
+* https://github.com/apache/solr/blob/main/CONTRIBUTING.md[Contributing to 
Solr]
+====
+
+== Module
+
+This is provided via the `llm` xref:configuration-guide:solr-modules.adoc[Solr 
Module] that needs to be enabled before use.
+
+At the moment the only supported way to interact with Large Language Models is 
via embedding text.
+
+In the future additional components to empower Solr with LLM will be added.
+
+
+== LLM Configuration
+
+Large-Language-Model is a module and therefore its plugins must be configured 
in `solrconfig.xml`.
+
+=== Minimum Requirements
+
+* Declaration of the `text_to_vector` query parser.
++
+[source,xml]
+----
+<queryParser name="text_to_vector" 
class="org.apache.solr.llm.search.TextToVectorQParserPlugin"/>
+----
+
+== Text Embedding Lifecycle
+
+
+=== Models
+
+* A model encodes text to a vector.
+* A model in Solr is a reference to an external API that runs the Large 
Language Model responsible for text embedding.
+
+*N.B.* the Solr embedding model specifies the parameters to access the APIs, 
the model doesn't run internally in Solr

Review Comment:
   I forget what N.B. means.
   In theory, could someone write a custom plugin that actually does the 
embedding or whatever in-process?  I'd say that would be "embedding" this but 
that word is used in different ways on this page.



##########
solr/modules/llm/src/test/org/apache/solr/llm/TestLlmBase.java:
##########
@@ -0,0 +1,297 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.llm;
+
+import java.lang.invoke.MethodHandles;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import org.apache.commons.io.file.PathUtils;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.util.Utils;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.solr.llm.embedding.SolrEmbeddingModel;
+import org.apache.solr.llm.store.EmbeddingModelException;
+import org.apache.solr.llm.store.rest.ManagedEmbeddingModelStore;
+import org.apache.solr.util.RestTestBase;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestLlmBase extends RestTestBase {
+
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  protected static final SolrResourceLoader solrResourceLoader =
+      new SolrResourceLoader(Path.of("").toAbsolutePath());
+
+  protected static Path tmpSolrHome;
+  protected static Path tmpConfDir;
+
+  public static final String MODEL_FILE_NAME = 
"_schema_embedding-model-store.json";
+  protected static final String COLLECTION = "collection1";
+  protected static final String CONF_DIR = COLLECTION + "/conf";
+
+  protected static Path embeddingModelStoreFile = null;
+
+  protected static String IDField = "id";
+  protected static String stringField = "string_field";
+  protected static String vectorField = "vector";
+  protected static String vectorField2 = "vector2";
+  protected static String vectorFieldByteEncoding = "vector_byte_encoding";
+
+  protected static void setuptest(boolean bulkIndex) throws Exception {
+    setuptest("solrconfig-llm.xml", "schema.xml");
+    if (bulkIndex) prepareIndex();
+  }
+
+  protected static void setupPersistenttest(boolean bulkIndex) throws 
Exception {
+    setupPersistentTest("solrconfig-llm.xml", "schema.xml");
+    if (bulkIndex) prepareIndex();
+  }
+
+  public static ManagedEmbeddingModelStore getManagedModelStore() {
+    try (SolrCore core = 
solrClientTestRule.getCoreContainer().getCore(DEFAULT_TEST_CORENAME)) {
+      return ManagedEmbeddingModelStore.getManagedModelStore(core);
+    }
+  }
+
+  protected static void setupTestInit(String solrconfig, String schema, 
boolean isPersistent)
+      throws Exception {
+    tmpSolrHome = createTempDir();
+    tmpConfDir = tmpSolrHome.resolve(CONF_DIR);
+    tmpConfDir.toFile().deleteOnExit();
+    PathUtils.copyDirectory(TEST_PATH(), tmpSolrHome.toAbsolutePath());
+
+    final Path mstore = tmpConfDir.resolve(MODEL_FILE_NAME);
+
+    if (isPersistent) {
+      embeddingModelStoreFile = mstore;
+    }
+
+    if (Files.exists(mstore)) {
+      if (log.isInfoEnabled()) {
+        log.info("remove model store config file in {}", 
mstore.toAbsolutePath());
+      }
+      Files.delete(mstore);
+    }
+    if (!solrconfig.equals("solrconfig-llm.xml")) {
+      Files.copy(
+          tmpSolrHome.resolve(CONF_DIR).resolve(solrconfig),
+          tmpSolrHome.resolve(CONF_DIR).resolve("solrconfig-llm.xml"));
+    }
+    if (!schema.equals("schema.xml")) {
+      Files.copy(
+          tmpSolrHome.resolve(CONF_DIR).resolve(schema),
+          tmpSolrHome.resolve(CONF_DIR).resolve("schema.xml"));
+    }
+
+    System.setProperty("managed.schema.mutable", "true");
+  }
+
+  public static void setuptest(String solrconfig, String schema) throws 
Exception {
+
+    setupTestInit(solrconfig, schema, false);
+    System.setProperty("enable.update.log", "false");
+
+    createJettyAndHarness(
+        tmpSolrHome.toAbsolutePath().toString(), solrconfig, schema, "/solr", 
true, null);
+  }
+
+  public static void setupPersistentTest(String solrconfig, String schema) 
throws Exception {
+
+    setupTestInit(solrconfig, schema, true);
+
+    createJettyAndHarness(
+        tmpSolrHome.toAbsolutePath().toString(), solrconfig, schema, "/solr", 
true, null);
+  }
+
+  protected static void aftertest() throws Exception {
+    if (null != restTestHarness) {
+      restTestHarness.close();
+      restTestHarness = null;
+    }
+    solrClientTestRule.reset();
+    if (null != tmpSolrHome) {
+      PathUtils.deleteDirectory(tmpSolrHome);
+      tmpSolrHome = null;
+    }
+    System.clearProperty("managed.schema.mutable");
+  }
+
+  public static void makeRestTestHarnessNull() {
+    restTestHarness = null;
+  }
+
+  /** produces a model encoded in json * */
+  public static String getModelInJson(String name, String className, String 
params) {

Review Comment:
   I suggest not using StringBuilder here; it's needlessly verbose.  I think 
IntelliJ will switch it for you.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org
For additional commands, e-mail: issues-h...@solr.apache.org

Reply via email to