This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new f679a5e3 OPENNLP-976: Implement GermEval2014 Format (#971)
f679a5e3 is described below
commit f679a5e37d0c72b7872260ac2c175fb566e0cf7c
Author: Richard Zowalla <[email protected]>
AuthorDate: Fri Mar 6 07:33:11 2026 +0100
OPENNLP-976: Implement GermEval2014 Format (#971)
---
.../tools/cmdline/StreamFactoryRegistry.java | 2 +
.../formats/GermEval2014NameSampleStream.java | 250 +++++++++++++++++
.../GermEval2014NameSampleStreamFactory.java | 104 +++++++
.../GermEval2014NameSampleStreamFactoryTest.java | 133 +++++++++
.../formats/GermEval2014NameSampleStreamTest.java | 298 +++++++++++++++++++++
.../opennlp/tools/formats/germeval2014.sample | 44 +++
.../tools/eval/GermEval2014NameFinderEval.java | 216 +++++++++++++++
7 files changed, 1047 insertions(+)
diff --git
a/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index c4bef61f..cfa46f1f 100644
---
a/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++
b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -30,6 +30,7 @@ import
opennlp.tools.formats.ConllXSentenceSampleStreamFactory;
import opennlp.tools.formats.ConllXTokenSampleStreamFactory;
import opennlp.tools.formats.DocumentSampleStreamFactory;
import opennlp.tools.formats.EvalitaNameSampleStreamFactory;
+import opennlp.tools.formats.GermEval2014NameSampleStreamFactory;
import opennlp.tools.formats.LanguageDetectorSampleStreamFactory;
import opennlp.tools.formats.LemmatizerSampleStreamFactory;
import opennlp.tools.formats.NameSampleDataStreamFactory;
@@ -107,6 +108,7 @@ public final class StreamFactoryRegistry {
Conll02NameSampleStreamFactory.registerFactory();
Conll03NameSampleStreamFactory.registerFactory();
EvalitaNameSampleStreamFactory.registerFactory();
+ GermEval2014NameSampleStreamFactory.registerFactory();
ConllXPOSSampleStreamFactory.registerFactory();
ConllXSentenceSampleStreamFactory.registerFactory();
ConllXTokenSampleStreamFactory.registerFactory();
diff --git
a/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStream.java
b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStream.java
new file mode 100644
index 00000000..ed2e1a53
--- /dev/null
+++
b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStream.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.commons.Internal;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.StringUtil;
+
+/**
+ * Parser for the GermEval 2014 Named Entity Recognition Shared Task data.
+ * <p>
+ * The data is in a tab-separated format with four columns:
+ * <ol>
+ * <li>Token index (1-based per sentence)</li>
+ * <li>Token text</li>
+ * <li>Outer named entity tag (IOB2 scheme)</li>
+ * <li>Nested/embedded named entity tag (IOB2 scheme)</li>
+ * </ol>
+ * Comment lines starting with {@code #} mark document boundaries and contain
+ * source URL and date metadata. Blank lines separate sentences.
+ * <p>
+ * The data uses four main entity types: Person (PER), Location (LOC),
+ * Organization (ORG) and Other (OTH), with additional {@code deriv} and
+ * {@code part} suffixes for derived forms and name parts respectively.
+ * <p>
+ * Since {@link NameSample} does not support overlapping spans, this stream
+ * requires selecting either the {@link NerLayer#OUTER outer} or
+ * {@link NerLayer#INNER inner} annotation layer via a {@link NerLayer}
parameter.
+ * <p>
+ * Data can be found on
+ * <a href="https://sites.google.com/site/germeval2014ner/data">this web
site</a>.
+ * <p>
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ */
+@Internal
+public class GermEval2014NameSampleStream implements ObjectStream<NameSample> {
+
+ /**
+ * Selects which NER annotation layer to read from the GermEval 2014 data.
+ * Both layers are parsed from every line; the selected one decides which
+ * tag column is converted into name spans.
+ */
+ public enum NerLayer {
+ /** The outer (top-level) named entity annotations (third tab-separated column). */
+ OUTER,
+ /** The nested/embedded named entity annotations (fourth tab-separated column). */
+ INNER
+ }
+
+ public static final int GENERATE_PERSON_ENTITIES = 0x01;
+ public static final int GENERATE_ORGANIZATION_ENTITIES = 0x01 << 1;
+ public static final int GENERATE_LOCATION_ENTITIES = 0x01 << 2;
+ public static final int GENERATE_MISC_ENTITIES = 0x01 << 3;
+
+ private final ObjectStream<String> lineStream;
+ private final int types;
+ private final NerLayer layer;
+
+ /**
+ * Initializes a {@link GermEval2014NameSampleStream}.
+ *
+ * @param lineStream An {@link ObjectStream} over the lines
+ * in the GermEval 2014 data file.
+ * @param types The entity types to include in the Name Sample object stream,
+ * given as a bitwise OR of the {@code GENERATE_*_ENTITIES} flags.
+ * Annotations whose type flag is not set are handled like {@code O} tags.
+ * @param layer The {@link NerLayer} to read.
+ */
+ public GermEval2014NameSampleStream(final ObjectStream<String> lineStream,
+ final int types, final NerLayer layer) {
+ this.lineStream = lineStream;
+ this.types = types;
+ this.layer = layer;
+ }
+
+ /**
+ * Initializes a {@link GermEval2014NameSampleStream} that reads the input
+ * line by line as UTF-8 text.
+ *
+ * @param in The {@link InputStreamFactory} for the input file.
+ * @param types The entity types to include in the Name Sample object stream,
+ * given as a bitwise OR of the {@code GENERATE_*_ENTITIES} flags.
+ * @param layer The {@link NerLayer} to read.
+ * @throws IOException Thrown if IO errors occurred.
+ */
+ public GermEval2014NameSampleStream(final InputStreamFactory in, final int
types,
+ final NerLayer layer) throws IOException
{
+ this(new PlainTextByLineStream(in, StandardCharsets.UTF_8), types, layer);
+ }
+
+ /**
+ * Builds the typed {@link Span} for one entity.
+ *
+ * @param begin Index of the first token of the entity.
+ * @param end Index one past the last token of the entity.
+ * @param beginTag The tag of the first token (e.g. {@code B-PER}); its type
+ *                 part determines the span type.
+ * @return The {@link Span} covering {@code [begin, end)} with the mapped type.
+ * @throws InvalidFormatException Thrown if the tag carries an unknown type.
+ */
+ static Span extract(final int begin, final int end, final String beginTag)
+     throws InvalidFormatException {
+   return new Span(begin, end, mapTagToType(beginTag));
+ }
+
+ /**
+ * Maps a GermEval 2014 tag to the span type name used by this stream.
+ * <p>
+ * The type part of a tag is one of {@code PER}, {@code LOC}, {@code ORG} or
+ * {@code OTH}, optionally followed by {@code deriv} or {@code part}. The
+ * result is the mapped base name with the same suffix appended,
+ * e.g. {@code B-LOCderiv} becomes {@code locationderiv}.
+ *
+ * @param tag A tag with a two character prefix such as {@code B-} or {@code I-}.
+ * @return The span type name.
+ * @throws InvalidFormatException Thrown if the type part is not recognized.
+ */
+ private static String mapTagToType(final String tag) throws InvalidFormatException {
+   // Drop the two character B- / I- prefix
+   final String rawType = tag.substring(2);
+
+   final String baseName;
+   if (rawType.startsWith("PER")) {
+     baseName = "person";
+   } else if (rawType.startsWith("LOC")) {
+     baseName = "location";
+   } else if (rawType.startsWith("ORG")) {
+     baseName = "organization";
+   } else if (rawType.startsWith("OTH")) {
+     baseName = "misc";
+   } else {
+     throw new InvalidFormatException("Unknown type: " + rawType);
+   }
+
+   // Whatever follows the three letter code must be a known suffix (or nothing)
+   final String suffix = rawType.substring(3);
+   final boolean knownSuffix = suffix.isEmpty() || suffix.equals("deriv") || suffix.equals("part");
+   if (!knownSuffix) {
+     throw new InvalidFormatException("Unknown type: " + rawType);
+   }
+   return baseName + suffix;
+ }
+
+ private boolean isTypeEnabled(final String tag) {
+ if (tag.startsWith("B-PER") || tag.startsWith("I-PER")) {
+ return (types & GENERATE_PERSON_ENTITIES) != 0;
+ }
+ if (tag.startsWith("B-ORG") || tag.startsWith("I-ORG")) {
+ return (types & GENERATE_ORGANIZATION_ENTITIES) != 0;
+ }
+ if (tag.startsWith("B-LOC") || tag.startsWith("I-LOC")) {
+ return (types & GENERATE_LOCATION_ENTITIES) != 0;
+ }
+ if (tag.startsWith("B-OTH") || tag.startsWith("I-OTH")) {
+ return (types & GENERATE_MISC_ENTITIES) != 0;
+ }
+ return tag.equals("O");
+ }
+
+ /**
+ * Converts the IOB2 tags of one sentence into entity {@link Span}s.
+ * <p>
+ * A {@code B-} tag opens a span, following {@code I-} tags extend it and an
+ * {@code O} tag (or the next {@code B-} tag, or the end of the sentence)
+ * closes it. Tags whose entity type is not enabled are handled like {@code O}.
+ *
+ * @param tags The tags of the selected layer, one per token.
+ * @return The spans found, in sentence order; empty if there are none.
+ * @throws IOException Thrown as {@link InvalidFormatException} if a tag is
+ *                     neither {@code O} nor prefixed with {@code B-} / {@code I-},
+ *                     or if an enabled {@code B-} tag carries an unknown type.
+ */
+ private List<Span> convertTagsToSpans(final List<String> tags) throws IOException {
+   final List<Span> names = new ArrayList<>();
+
+   int beginIndex = -1;
+   int endIndex = -1;
+
+   for (int i = 0; i < tags.size(); i++) {
+     final String rawTag = tags.get(i);
+     final boolean isEntityTag = rawTag.startsWith("B-") || rawTag.startsWith("I-");
+
+     // Validate before filtering: otherwise a malformed tag fails the type
+     // check below, is rewritten to "O" and slips through unnoticed.
+     if (!isEntityTag && !rawTag.equals("O")) {
+       throw new InvalidFormatException("Invalid tag: " + rawTag);
+     }
+
+     // Entities of a type that was not requested are handled like "O"
+     final String tag = (isEntityTag && !isTypeEnabled(rawTag)) ? "O" : rawTag;
+
+     if (tag.startsWith("B-")) {
+       // A new entity starts; close the one that is still open, if any
+       if (beginIndex != -1) {
+         names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
+       }
+       beginIndex = i;
+       endIndex = i + 1;
+     } else if (tag.startsWith("I-")) {
+       // Only extend an open span; an I- tag without a preceding B- tag is ignored
+       if (beginIndex != -1) {
+         endIndex++;
+       }
+     } else if (beginIndex != -1) {
+       // "O" closes the open span
+       names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
+       beginIndex = -1;
+       endIndex = -1;
+     }
+   }
+
+   // if one span remains, create it here
+   if (beginIndex != -1) {
+     names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
+   }
+
+   return names;
+ }
+
+ /**
+ * Reads the next sentence and converts it into a {@link NameSample}.
+ * <p>
+ * Blank lines in front of a sentence are skipped, a blank line after at least
+ * one token ends the sentence. A comment line starting with {@code #} marks a
+ * document boundary: the sample returned for the following sentence has its
+ * clear-adaptive-data flag set, even if blank lines sit between the comment
+ * and the first token.
+ *
+ * @return The next {@link NameSample}, or {@code null} if the underlying
+ *         stream has no more sentences.
+ * @throws IOException Thrown if reading fails, or as {@link InvalidFormatException}
+ *                     if a data line has fewer than four tab-separated fields
+ *                     or contains an invalid tag.
+ */
+ @Override
+ public NameSample read() throws IOException {
+
+   final List<String> sentence = new ArrayList<>();
+   final List<String> outerTags = new ArrayList<>();
+   final List<String> innerTags = new ArrayList<>();
+
+   boolean isClearAdaptiveData = false;
+
+   String line;
+   while ((line = lineStream.read()) != null) {
+
+     if (StringUtil.isEmpty(line)) {
+       if (sentence.isEmpty()) {
+         // Nothing collected yet: skip the blank line instead of recursing,
+         // so a document boundary seen before it is not forgotten
+         continue;
+       }
+       // Empty line indicates end of sentence
+       break;
+     }
+
+     // Comment lines starting with # mark document boundaries
+     if (line.startsWith("#")) {
+       isClearAdaptiveData = true;
+       continue;
+     }
+
+     final String[] fields = line.split("\t");
+
+     if (fields.length < 4) {
+       throw new InvalidFormatException("Expected at least four tab-separated fields per line "
+           + "in GermEval 2014 data, got " + fields.length + " for line '" + line + "'!");
+     }
+
+     sentence.add(fields[1]);
+     outerTags.add(fields[2]);
+     // The last column may carry trailing whitespace
+     innerTags.add(fields[3].trim());
+   }
+
+   if (sentence.isEmpty()) {
+     // source stream is not returning anymore lines
+     return null;
+   }
+
+   final List<String> selectedTags = (layer == NerLayer.OUTER) ? outerTags : innerTags;
+   final List<Span> names = convertTagsToSpans(selectedTags);
+
+   return new NameSample(sentence.toArray(new String[0]),
+       names.toArray(new Span[0]), isClearAdaptiveData);
+ }
+
+ @Override
+ public void reset() throws IOException, UnsupportedOperationException {
+ lineStream.reset(); // this stream keeps no state between reads; resetting is delegated to the wrapped line stream
+ }
+
+ @Override
+ public void close() throws IOException {
+ lineStream.close(); // closes the wrapped line stream
+ }
+}
diff --git
a/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactory.java
b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactory.java
new file mode 100644
index 00000000..760fb371
--- /dev/null
+++
b/opennlp-core/opennlp-formats/src/main/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactory.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.commons.Internal;
+import opennlp.tools.formats.GermEval2014NameSampleStream.NerLayer;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ *
+ * @see GermEval2014NameSampleStream
+ */
+@Internal
+public class GermEval2014NameSampleStreamFactory extends
+ LanguageSampleStreamFactory<NameSample,
GermEval2014NameSampleStreamFactory.Parameters> {
+
+ public interface Parameters extends BasicFormatParams {
+ @ParameterDescription(valueName = "per,loc,org,misc")
+ String getTypes(); // create() enables an entity type when this value contains its key (per, org, loc, misc)
+
+ @ParameterDescription(valueName = "outer|inner", description = "NER
annotation layer to use. " +
+ "Use 'outer' for top-level entities or 'inner' for nested/embedded
entities.")
+ String getLayer(); // create() accepts "outer" and "inner" and treats null like "outer"
+ }
+
+ public static void registerFactory() { // registers this factory for NameSample under the format name "germeval2014"
+ StreamFactoryRegistry.registerFactory(NameSample.class,
+ "germeval2014", new
GermEval2014NameSampleStreamFactory(Parameters.class));
+ }
+
+ protected GermEval2014NameSampleStreamFactory(final Class<Parameters>
params) {
+ super(params); // the parameter interface is handed to the superclass unchanged
+ }
+
+ /**
+ * Creates a {@link GermEval2014NameSampleStream} from command line arguments.
+ * <p>
+ * The {@code -types} value selects the entity types by substring
+ * ({@code per}, {@code org}, {@code loc}, {@code misc}); the {@code -layer}
+ * value selects the annotation layer, where a missing value means the outer
+ * layer. The language is set to {@code deu}.
+ *
+ * @param args The command line arguments.
+ * @return The configured sample stream.
+ * @throws TerminateToolException Thrown if the layer is not supported or the
+ *                                input stream cannot be created.
+ */
+ @Override
+ public ObjectStream<NameSample> create(final String[] args) {
+
+   final Parameters params = validateBasicFormatParameters(args, Parameters.class);
+
+   language = "deu";
+
+   // Translate the requested type keys into the bit mask the stream expects
+   final String requestedTypes = params.getTypes();
+   int typesToGenerate = 0;
+   if (requestedTypes.contains("per")) {
+     typesToGenerate |= GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES;
+   }
+   if (requestedTypes.contains("org")) {
+     typesToGenerate |= GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES;
+   }
+   if (requestedTypes.contains("loc")) {
+     typesToGenerate |= GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES;
+   }
+   if (requestedTypes.contains("misc")) {
+     typesToGenerate |= GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES;
+   }
+
+   // No layer given means the outer layer
+   final String layerParam = params.getLayer();
+   final NerLayer layer;
+   if ("inner".equals(layerParam)) {
+     layer = NerLayer.INNER;
+   } else if (layerParam == null || "outer".equals(layerParam)) {
+     layer = NerLayer.OUTER;
+   } else {
+     throw new TerminateToolException(1, "Unsupported layer: " + layerParam
+         + ". Use 'outer' or 'inner'.");
+   }
+
+   try {
+     return new GermEval2014NameSampleStream(
+         FormatUtil.createInputStreamFactory(params.getData()), typesToGenerate, layer);
+   } catch (final IOException e) {
+     throw new TerminateToolException(-1,
+         "IO Error while creating an Input Stream: " + e.getMessage(), e);
+   }
+ }
+}
diff --git
a/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactoryTest.java
b/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactoryTest.java
new file mode 100644
index 00000000..c6528a06
--- /dev/null
+++
b/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamFactoryTest.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import opennlp.tools.cmdline.ObjectStreamFactory;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertInstanceOf;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+public class GermEval2014NameSampleStreamFactoryTest extends
+ AbstractSampleStreamFactoryTest<NameSample,
GermEval2014NameSampleStreamFactory.Parameters> {
+
+ private static final String SAMPLE = "germeval2014.sample";
+
+ // SUT
+ private GermEval2014NameSampleStreamFactory factory;
+
+ private String sampleFileFullPath;
+
+ @Override
+ protected AbstractSampleStreamFactory<NameSample,
+ GermEval2014NameSampleStreamFactory.Parameters> getFactory() {
+ return factory;
+ }
+
+ @Override
+ protected String getDataFilePath() {
+ return sampleFileFullPath;
+ }
+
+ @BeforeAll
+ static void initEnv() {
+ GermEval2014NameSampleStreamFactory.registerFactory();
+ }
+
+ @BeforeEach
+ void setUp() {
+ final ObjectStreamFactory<NameSample,
GermEval2014NameSampleStreamFactory.Parameters> f =
+ StreamFactoryRegistry.getFactory(NameSample.class, "germeval2014");
+ assertInstanceOf(GermEval2014NameSampleStreamFactory.class, f);
+ factory = ((GermEval2014NameSampleStreamFactory) f);
+ assertEquals(GermEval2014NameSampleStreamFactory.Parameters.class,
factory.params);
+ sampleFileFullPath = getResourceWithoutPrefix(FORMAT_SAMPLE_DIR +
SAMPLE).getPath();
+ }
+
+ @Test
+ void testCreateWithValidParameter() throws IOException {
+ try (final ObjectStream<NameSample> stream = factory.create(
+ new String[]{"-types", "per,loc,org,misc", "-layer", "outer",
+ "-data", sampleFileFullPath})) {
+ final NameSample sample = stream.read();
+ assertNotNull(sample);
+ }
+ }
+
+ @ParameterizedTest
+ @ValueSource(strings = {"outer", "inner"})
+ void testCreateWithDifferentLayers(final String layer) throws IOException {
+ try (final ObjectStream<NameSample> stream = factory.create(
+ new String[]{"-types", "per,loc,org,misc", "-layer", layer,
+ "-data", sampleFileFullPath})) {
+ final NameSample sample = stream.read();
+ assertNotNull(sample);
+ }
+ }
+
+ @ParameterizedTest
+ @ValueSource(strings = {"", "per", "loc", "org", "misc", "per,loc,org,misc"})
+ void testCreateWithDifferentTypes(final String types) throws IOException {
+ try (final ObjectStream<NameSample> stream = factory.create(
+ new String[]{"-types", types, "-layer", "outer", "-data",
sampleFileFullPath})) {
+ final NameSample sample = stream.read();
+ assertNotNull(sample);
+ }
+ }
+
+ @Test
+ void testCreateWithInvalidLayer() {
+ assertThrows(TerminateToolException.class, () -> {
+ try (final ObjectStream<NameSample> stream = factory.create(
+ new String[]{"-types", "per,loc,org,misc", "-layer", "xyz",
+ "-data", sampleFileFullPath})) {
+ final NameSample sample = stream.read();
+ assertNotNull(sample);
+ }
+ });
+ }
+
+ /*
+ * Note: Overriding this test case, as more params are required!
+ */
+ @Test
+ @Override
+ protected void testCreateWithInvalidDataFilePath() {
+ assertThrows(TerminateToolException.class, () -> {
+ try (final ObjectStream<NameSample> stream = factory.create(new String[]
+ {"-types", "per,loc,org,misc", "-layer", "outer",
+ "-data", sampleFileFullPath + "xyz"})) {
+ final NameSample sample = stream.read();
+ assertNotNull(sample);
+ }
+ });
+ }
+}
diff --git
a/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamTest.java
b/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamTest.java
new file mode 100644
index 00000000..0118a313
--- /dev/null
+++
b/opennlp-core/opennlp-formats/src/test/java/opennlp/tools/formats/GermEval2014NameSampleStreamTest.java
@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.formats.GermEval2014NameSampleStream.NerLayer;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Test for the {@link GermEval2014NameSampleStream} class.
+ */
+public class GermEval2014NameSampleStreamTest extends AbstractSampleStreamTest
{
+
+ private static final String SAMPLE = "germeval2014.sample";
+
+ private static final int ALL_TYPES =
+ GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES
+ | GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES
+ | GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES
+ | GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES;
+
+ @Test
+ void testParsingSampleFirstSentence() throws IOException {
+ try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES,
NerLayer.OUTER)) {
+ final NameSample sample = sampleStream.read();
+
+ Assertions.assertNotNull(sample);
+ // First sentence: 14 tokens
+ Assertions.assertEquals(14, sample.getSentence().length);
+ Assertions.assertEquals("Schartau", sample.getSentence()[0]);
+ Assertions.assertEquals(".", sample.getSentence()[13]);
+
+ // Comment line means clear adaptive data
+ Assertions.assertTrue(sample.isClearAdaptiveDataSet());
+
+ // 4 outer entities: Schartau (PER), Tagesspiegel (ORG), Fischer (PER),
Berlin (LOC)
+ Assertions.assertEquals(4, sample.getNames().length);
+
+ // Verify Schartau = PER at position 0
+ final Span schartau = findSpanAt(sample.getNames(), 0);
+ Assertions.assertNotNull(schartau);
+ Assertions.assertEquals("person", schartau.getType());
+ Assertions.assertEquals(0, schartau.getStart());
+ Assertions.assertEquals(1, schartau.getEnd());
+
+ // Verify Tagesspiegel = ORG at position 4
+ final Span tagesspiegel = findSpanAt(sample.getNames(), 4);
+ Assertions.assertNotNull(tagesspiegel);
+ Assertions.assertEquals("organization", tagesspiegel.getType());
+
+ // Verify Fischer = PER at position 9
+ final Span fischer = findSpanAt(sample.getNames(), 9);
+ Assertions.assertNotNull(fischer);
+ Assertions.assertEquals("person", fischer.getType());
+
+ // Verify Berlin = LOC at position 12
+ final Span berlin = findSpanAt(sample.getNames(), 12);
+ Assertions.assertNotNull(berlin);
+ Assertions.assertEquals("location", berlin.getType());
+ }
+ }
+
+ @Test
+ void testOuterLayerEntities() throws IOException {
+ try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES,
NerLayer.OUTER)) {
+ // Skip first sentence
+ sampleStream.read();
+ final NameSample sample = sampleStream.read();
+
+ Assertions.assertNotNull(sample);
+ // Second sentence: 13 tokens (Bayern München...)
+ Assertions.assertEquals(13, sample.getSentence().length);
+ Assertions.assertEquals("Bayern", sample.getSentence()[0]);
+ Assertions.assertEquals("München", sample.getSentence()[1]);
+ Assertions.assertTrue(sample.isClearAdaptiveDataSet());
+
+ // Outer layer: Bayern München (ORG), deutschen (LOCderiv) = 2 spans
+ Assertions.assertEquals(2, sample.getNames().length);
+
+ // Bayern München = ORG (0,2)
+ final Span org = findSpanAt(sample.getNames(), 0);
+ Assertions.assertNotNull(org);
+ Assertions.assertEquals("organization", org.getType());
+ Assertions.assertEquals(2, org.getEnd());
+
+ // deutschen = LOCderiv (10,11)
+ final Span locDeriv = findSpanAt(sample.getNames(), 10);
+ Assertions.assertNotNull(locDeriv);
+ Assertions.assertEquals("locationderiv", locDeriv.getType());
+ Assertions.assertEquals(11, locDeriv.getEnd());
+ }
+ }
+
+ @Test
+ void testInnerLayerEntities() throws IOException {
+ try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES,
NerLayer.INNER)) {
+ // Skip first sentence (all inner tags are O)
+ final NameSample first = sampleStream.read();
+ Assertions.assertNotNull(first);
+ Assertions.assertEquals(0, first.getNames().length);
+
+ // Second sentence has inner layer entities
+ final NameSample sample = sampleStream.read();
+ Assertions.assertNotNull(sample);
+
+ // Inner layer: Bayern (LOC), München (LOC) = 2 spans
+ Assertions.assertEquals(2, sample.getNames().length);
+
+ final Span bayernLoc = findSpanAt(sample.getNames(), 0);
+ Assertions.assertNotNull(bayernLoc);
+ Assertions.assertEquals("location", bayernLoc.getType());
+ Assertions.assertEquals(1, bayernLoc.getEnd());
+
+ final Span muenchenLoc = findSpanAt(sample.getNames(), 1);
+ Assertions.assertNotNull(muenchenLoc);
+ Assertions.assertEquals("location", muenchenLoc.getType());
+ Assertions.assertEquals(2, muenchenLoc.getEnd());
+ }
+ }
+
+ @Test
+ void testMiscEntityType() throws IOException {
+ try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES,
NerLayer.OUTER)) {
+ sampleStream.read(); // skip 1st
+ sampleStream.read(); // skip 2nd
+ final NameSample sample = sampleStream.read();
+
+ Assertions.assertNotNull(sample);
+ // Third sentence: "Ecce homo ist ein Werk ."
+ Assertions.assertEquals(6, sample.getSentence().length);
+ Assertions.assertFalse(sample.isClearAdaptiveDataSet());
+
+ // Ecce homo = OTH -> misc
+ Assertions.assertEquals(1, sample.getNames().length);
+ final Span oth = sample.getNames()[0];
+ Assertions.assertEquals("misc", oth.getType());
+ Assertions.assertEquals(0, oth.getStart());
+ Assertions.assertEquals(2, oth.getEnd());
+ }
+ }
+
+ @Test
+ void testPartEntityType() throws IOException {
+ try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES,
NerLayer.OUTER)) {
+ sampleStream.read(); // skip 1st
+ sampleStream.read(); // skip 2nd
+ sampleStream.read(); // skip 3rd
+ final NameSample sample = sampleStream.read();
+
+ Assertions.assertNotNull(sample);
+ // Fourth sentence: "ARD-Programmchef Volker Herres sagte ."
+ Assertions.assertEquals(5, sample.getSentence().length);
+
+ // ARD-Programmchef = ORGpart, Volker Herres = PER
+ Assertions.assertEquals(2, sample.getNames().length);
+
+ final Span orgPart = findSpanAt(sample.getNames(), 0);
+ Assertions.assertNotNull(orgPart);
+ Assertions.assertEquals("organizationpart", orgPart.getType());
+ Assertions.assertEquals(1, orgPart.getEnd());
+
+ final Span person = findSpanAt(sample.getNames(), 1);
+ Assertions.assertNotNull(person);
+ Assertions.assertEquals("person", person.getType());
+ Assertions.assertEquals(3, person.getEnd());
+ }
+ }
+
+ @Test
+ void testStreamExhaustion() throws IOException {
+ try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES,
NerLayer.OUTER)) {
+ sampleStream.read(); // 1st
+ sampleStream.read(); // 2nd
+ sampleStream.read(); // 3rd
+ sampleStream.read(); // 4th
+ Assertions.assertNull(sampleStream.read()); // end of stream
+ }
+ }
+
+ @Test
+ void testFilterPersonEntitiesOnly() throws IOException {
+ try (final ObjectStream<NameSample> sampleStream =
+ openData(GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES,
NerLayer.OUTER)) {
+ final NameSample sample = sampleStream.read();
+
+ Assertions.assertNotNull(sample);
+ // Only PER entities from first sentence: Schartau, Fischer
+ Assertions.assertEquals(2, sample.getNames().length);
+ for (final Span name : sample.getNames()) {
+ Assertions.assertTrue(name.getType().startsWith("person"));
+ }
+ }
+ }
+
+ @Test
+ void testFilterLocationEntitiesOnly() throws IOException {
+ try (final ObjectStream<NameSample> sampleStream =
+ openData(GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES,
NerLayer.OUTER)) {
+ final NameSample sample = sampleStream.read();
+
+ Assertions.assertNotNull(sample);
+ // Only LOC entities from first sentence: Berlin
+ Assertions.assertEquals(1, sample.getNames().length);
+ Assertions.assertEquals("location", sample.getNames()[0].getType());
+ }
+ }
+
+ @Test
+ void testFilterNoEntities() throws IOException {
+ try (final ObjectStream<NameSample> sampleStream = openData(0,
NerLayer.OUTER)) {
+ final NameSample sample = sampleStream.read();
+
+ Assertions.assertNotNull(sample);
+ Assertions.assertEquals(0, sample.getNames().length);
+ }
+ }
+
+ @Test
+ void testReset() throws IOException {
+ try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES,
NerLayer.OUTER)) {
+ final NameSample sample = sampleStream.read();
+ sampleStream.reset();
+
+ Assertions.assertEquals(sample, sampleStream.read());
+ }
+ }
+
+ @Test
+ void testDocumentBoundaryClearsAdaptiveData() throws IOException {
+ try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES,
NerLayer.OUTER)) {
+ final NameSample first = sampleStream.read();
+ Assertions.assertTrue(first.isClearAdaptiveDataSet()); // has # comment
+
+ final NameSample second = sampleStream.read();
+ Assertions.assertTrue(second.isClearAdaptiveDataSet()); // has # comment
+
+ final NameSample third = sampleStream.read();
+ Assertions.assertFalse(third.isClearAdaptiveDataSet()); // no # comment
+
+ final NameSample fourth = sampleStream.read();
+ Assertions.assertFalse(fourth.isClearAdaptiveDataSet()); // no # comment
+ }
+ }
+
+ /**
+ * Reads the whole sample file on the outer layer and verifies the complete
+ * set of span types it produces, including the {@code deriv} and
+ * {@code part} variants.
+ */
+ @Test
+ void testAllEntityTypesPresent() throws IOException {
+   try (final ObjectStream<NameSample> sampleStream = openData(ALL_TYPES, NerLayer.OUTER)) {
+     final Set<String> foundTypes = new HashSet<>();
+     NameSample sample;
+     while ((sample = sampleStream.read()) != null) {
+       for (final Span name : sample.getNames()) {
+         foundTypes.add(name.getType());
+       }
+     }
+     // The outer layer of the sample file contains exactly these six types
+     final Set<String> expectedTypes = new HashSet<>(Arrays.asList(
+         "person", "organization", "location", "locationderiv", "misc", "organizationpart"));
+     Assertions.assertEquals(expectedTypes, foundTypes);
+   }
+ }
+
+ private ObjectStream<NameSample> openData(final int types, final NerLayer
layer)
+ throws IOException {
+ return new GermEval2014NameSampleStream(getFactory(SAMPLE), types, layer);
+ }
+
+ private Span findSpanAt(final Span[] spans, final int start) {
+ for (final Span span : spans) {
+ if (span.getStart() == start) {
+ return span;
+ }
+ }
+ return null;
+ }
+}
diff --git
a/opennlp-core/opennlp-formats/src/test/resources/opennlp/tools/formats/germeval2014.sample
b/opennlp-core/opennlp-formats/src/test/resources/opennlp/tools/formats/germeval2014.sample
new file mode 100644
index 00000000..b28ae382
--- /dev/null
+++
b/opennlp-core/opennlp-formats/src/test/resources/opennlp/tools/formats/germeval2014.sample
@@ -0,0 +1,44 @@
+# n-tv.de vom 26.02.2005 [2005-02-26]
+1 Schartau B-PER O
+2 sagte O O
+3 dem O O
+4 " O O
+5 Tagesspiegel B-ORG O
+6 " O O
+7 vom O O
+8 Freitag O O
+9 , O O
+10 Fischer B-PER O
+11 sei O O
+12 in O O
+13 Berlin B-LOC O
+14 . O O
+
+# stern.de vom 21.03.2006 [2006-03-21]
+1 Bayern B-ORG B-LOC
+2 München I-ORG B-LOC
+3 ist O O
+4 wieder O O
+5 alleiniger O O
+6 Favorit O O
+7 auf O O
+8 den O O
+9 Gewinn O O
+10 der O O
+11 deutschen B-LOCderiv O
+12 Fußball-Meisterschaft O O
+13 . O O
+
+1 Ecce B-OTH O
+2 homo I-OTH O
+3 ist O O
+4 ein O O
+5 Werk O O
+6 . O O
+
+1 ARD-Programmchef B-ORGpart O
+2 Volker B-PER O
+3 Herres I-PER O
+4 sagte O O
+5 . O O
+
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/GermEval2014NameFinderEval.java
b/opennlp-tools/src/test/java/opennlp/tools/eval/GermEval2014NameFinderEval.java
new file mode 100644
index 00000000..c67e84bc
--- /dev/null
+++
b/opennlp-tools/src/test/java/opennlp/tools/eval/GermEval2014NameFinderEval.java
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.eval;
+
+import java.io.File;
+import java.io.IOException;
+import java.math.BigInteger;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.formats.GermEval2014NameSampleStream;
+import opennlp.tools.formats.GermEval2014NameSampleStream.NerLayer;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.namefind.TokenNameFinderEvaluator;
+import opennlp.tools.namefind.TokenNameFinderFactory;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.model.ModelUtil;
+
+/**
+ * Evaluates the name finder against the GermEval 2014 NER corpus (German).
+ * <p>
+ * Download the data files from the GermEval 2014 shared task
+ * <a href="https://sites.google.com/site/germeval2014ner/data">site</a>
+ * and place them into this directory: {@code $OPENNLP_DATA_DIR/germeval2014/}.
+ * <p>
+ * Expected files:
+ * <ul>
+ * <li>{@code NER-de-train.tsv} - Training data</li>
+ * <li>{@code NER-de-test.tsv} - Test data</li>
+ * </ul>
+ */
+public class GermEval2014NameFinderEval extends AbstractEvalTest {
+
+ private static final int ALL_TYPES =
+ GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES
+ | GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES
+ | GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES
+ | GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES;
+
+ private static File trainingFile;
+ private static File testFile;
+
+ private TokenNameFinderModel train(final File trainFile, final
TrainingParameters params,
+ final int types) throws IOException {
+
+ final ObjectStream<NameSample> samples = new GermEval2014NameSampleStream(
+ new MarkableFileInputStreamFactory(trainFile), types, NerLayer.OUTER);
+
+ return NameFinderME.train("deu", null, samples, params, new
TokenNameFinderFactory());
+ }
+
+ private void eval(final TokenNameFinderModel model, final File testData,
+ final int types, final double expectedFMeasure) throws
IOException {
+
+ final ObjectStream<NameSample> samples = new GermEval2014NameSampleStream(
+ new MarkableFileInputStreamFactory(testData), types, NerLayer.OUTER);
+
+ final TokenNameFinderEvaluator evaluator = new
TokenNameFinderEvaluator(new NameFinderME(model));
+ evaluator.evaluate(samples);
+
+ Assertions.assertEquals(expectedFMeasure,
evaluator.getFMeasure().getFMeasure(), ACCURACY_DELTA);
+ }
+
+ @BeforeAll
+ static void verifyTrainingData() throws Exception {
+
+ trainingFile = new File(getOpennlpDataDir(),
"germeval2014/NER-de-train.tsv");
+ testFile = new File(getOpennlpDataDir(), "germeval2014/NER-de-test.tsv");
+
+ verifyTrainingData(new GermEval2014NameSampleStream(
+ new MarkableFileInputStreamFactory(trainingFile),
+ ALL_TYPES, NerLayer.OUTER),
+ new BigInteger("175386258960384643455328517118707394452"));
+ verifyTrainingData(new GermEval2014NameSampleStream(
+ new MarkableFileInputStreamFactory(testFile),
+ ALL_TYPES, NerLayer.OUTER),
+ new BigInteger("112232325598196372951673841456976805014"));
+ }
+
+ // -- Person entity evaluation --
+
+ @Test
+ void evalPersonPerceptron() throws IOException {
+ final TrainingParameters params = createPerceptronParams();
+
+ final TokenNameFinderModel model = train(trainingFile, params,
+ GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES);
+
+ eval(model, testFile,
+ GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES,
0.6086631814787155d);
+ }
+
+ @Test
+ void evalPersonMaxentGis() throws IOException {
+ final TrainingParameters params =
ModelUtil.createDefaultTrainingParameters();
+
+ final TokenNameFinderModel model = train(trainingFile, params,
+ GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES);
+
+ eval(model, testFile,
+ GermEval2014NameSampleStream.GENERATE_PERSON_ENTITIES,
0.5204518893650175d);
+ }
+
+ // -- Organization entity evaluation --
+
+ @Test
+ void evalOrganizationPerceptron() throws IOException {
+ final TrainingParameters params = createPerceptronParams();
+
+ final TokenNameFinderModel model = train(trainingFile, params,
+ GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES);
+
+ eval(model, testFile,
+ GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES,
0.5588235294117646d);
+ }
+
+ @Test
+ void evalOrganizationMaxentGis() throws IOException {
+ final TrainingParameters params =
ModelUtil.createDefaultTrainingParameters();
+
+ final TokenNameFinderModel model = train(trainingFile, params,
+ GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES);
+
+ eval(model, testFile,
+ GermEval2014NameSampleStream.GENERATE_ORGANIZATION_ENTITIES,
0.4594180704441041d);
+ }
+
+ // -- Location entity evaluation --
+
+ @Test
+ void evalLocationPerceptron() throws IOException {
+ final TrainingParameters params = createPerceptronParams();
+
+ final TokenNameFinderModel model = train(trainingFile, params,
+ GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES);
+
+ eval(model, testFile,
+ GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES,
0.6705613411226822d);
+ }
+
+ @Test
+ void evalLocationMaxentGis() throws IOException {
+ final TrainingParameters params =
ModelUtil.createDefaultTrainingParameters();
+
+ final TokenNameFinderModel model = train(trainingFile, params,
+ GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES);
+
+ eval(model, testFile,
+ GermEval2014NameSampleStream.GENERATE_LOCATION_ENTITIES,
0.5537280701754386d);
+ }
+
+ // -- Misc (OTH) entity evaluation --
+
+ @Test
+ void evalMiscPerceptron() throws IOException {
+ final TrainingParameters params = createPerceptronParams();
+
+ final TokenNameFinderModel model = train(trainingFile, params,
+ GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES);
+
+ eval(model, testFile,
+ GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES,
0.4482142857142857d);
+ }
+
+ @Test
+ void evalMiscMaxentGis() throws IOException {
+ final TrainingParameters params =
ModelUtil.createDefaultTrainingParameters();
+
+ final TokenNameFinderModel model = train(trainingFile, params,
+ GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES);
+
+ eval(model, testFile,
+ GermEval2014NameSampleStream.GENERATE_MISC_ENTITIES,
0.3932267168391345d);
+ }
+
+ // -- Combined (all types) evaluation --
+
+ @Test
+ void evalCombinedPerceptron() throws IOException {
+ final TrainingParameters params = createPerceptronParams();
+
+ final TokenNameFinderModel model = train(trainingFile, params, ALL_TYPES);
+
+ eval(model, testFile, ALL_TYPES, 0.6016631636662707d);
+ }
+
+ @Test
+ void evalCombinedMaxentGis() throws IOException {
+ final TrainingParameters params =
ModelUtil.createDefaultTrainingParameters();
+
+ final TokenNameFinderModel model = train(trainingFile, params, ALL_TYPES);
+
+ eval(model, testFile, ALL_TYPES, 0.5229054890631449d);
+ }
+}