(opennlp-sandbox) 01/01: Drop tf-ner-poc component - deletes the tf-ner-poc component entirely - reasons: TensorFlow 1.15 is EOL, no Java migration path to TF 2.x

mawiesne Fri, 20 Mar 2026 14:22:22 -0700

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch cleanup/drop-tf-ner-poc
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git


commit dd0c5d71b0e18b7e53dca4ab0d4ee14accd92b9a
Author: Martin Wiesner <[email protected]>
AuthorDate: Fri Mar 20 22:21:14 2026 +0100

    Drop tf-ner-poc component
    - deletes the tf-ner-poc component entirely
    - reasons: TensorFlow 1.15 is EOL, no Java migration path to TF 2.x
---
 README.md                                          |   1 -
 pom.xml                                            |   1 -
 tf-ner-poc/pom.xml                                 |  81 ----
 .../main/java/org/apache/opennlp/ModelUtil.java    |  50 ---
 .../apache/opennlp/namecat/NameCategorizer.java    | 106 -----
 .../apache/opennlp/namefinder/FeedDictionary.java  | 148 -------
 .../org/apache/opennlp/namefinder/IndexTagger.java |  57 ---
 .../namefinder/PredictionConfiguration.java        |  57 ---
 .../apache/opennlp/namefinder/SequenceTagging.java | 133 ------
 .../org/apache/opennlp/namefinder/TokenIds.java    |  37 --
 .../org/apache/opennlp/namefinder/Viterbi.java     | 157 -------
 .../org/apache/opennlp/namefinder/WordIndexer.java | 171 -------
 .../org/apache/opennlp/normalizer/Normalizer.java  | 147 ------
 tf-ner-poc/src/main/python/doccat/doccat.py        | 217 ---------
 tf-ner-poc/src/main/python/namecat/namecat.py      | 238 ----------
 .../src/main/python/namefinder/namefinder.py       | 493 ---------------------
 tf-ner-poc/src/main/python/namefinder/split.py     |  61 ---
 .../src/main/python/normalizer/date_generator.py   |  86 ----
 .../src/main/python/normalizer/normalizer.py       | 322 --------------
 .../opennlp/namefinder/FeedDictionaryTest.java     |  64 ---
 .../org/apache/opennlp/namefinder/PredictTest.java |  62 ---
 .../apache/opennlp/namefinder/WordIndexerTest.java | 146 ------
 tf-ner-poc/src/test/resources/chars.txt.gz         | Bin 152 -> 0 bytes
 tf-ner-poc/src/test/resources/tags.txt.gz          | Bin 64 -> 0 bytes
 tf-ner-poc/src/test/resources/words.txt.gz         | Bin 89404 -> 0 bytes
 25 files changed, 2835 deletions(-)

diff --git a/README.md b/README.md
index 107fda3..dc9d61a 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,6 @@ Currently, the library has different components:
 * `opennlp-wsd`: A set of components that allow for word sense disambiguation.
 * `summarizer`: A set of classes providing text summarization.
 * `tagging-server`: A RESTful webservice to allow for NER, POS tagging, sentence detection and tokenization.
-* `tf-ner-poc`: An adapter component for [Tensorflow](https://www.tensorflow.org), in an early proof-of-concept (poc) stage.
 * `wikinews-importer`: A set of classes to process and annotate text formatted in [MediaWiki markup](https://www.mediawiki.org/wiki/Help:Formatting).
 
 ## Getting Started
diff --git a/pom.xml b/pom.xml
index 90f6c33..e078240 100644
--- a/pom.xml
+++ b/pom.xml
@@ -108,7 +108,6 @@
         <module>opennlp-grpc</module>
         <module>opennlp-similarity</module>
         <module>opennlp-wsd</module>
-        <module>tf-ner-poc</module>
         <module>summarizer</module>
         <module>tagging-server</module>
         <module>wikinews-importer</module>
diff --git a/tf-ner-poc/pom.xml b/tf-ner-poc/pom.xml
deleted file mode 100644
index 72ad8b1..0000000
--- a/tf-ner-poc/pom.xml
+++ /dev/null
@@ -1,81 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements.  See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership.  The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing,
-   software distributed under the License is distributed on an
-   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-   KIND, either express or implied.  See the License for the
-   specific language governing permissions and limitations
-   under the License.
--->
-
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-    <modelVersion>4.0.0</modelVersion>
-    <parent>
-        <groupId>org.apache.opennlp</groupId>
-        <artifactId>opennlp-sandbox</artifactId>
-        <version>3.0.0-SNAPSHOT</version>
-    </parent>
-
-    <artifactId>tf-ner-poc</artifactId>
-    <name>Apache OpenNLP TF NER poc</name>
-
-    <properties>
-        <tensorflow.version>1.15.0</tensorflow.version>
-    </properties>
-
-    <dependencies>
-        <dependency>
-            <groupId>org.tensorflow</groupId>
-            <artifactId>tensorflow</artifactId>
-            <version>${tensorflow.version}</version>
-        </dependency>
-
-        <dependency>
-            <groupId>org.apache.opennlp</groupId>
-            <artifactId>opennlp-runtime</artifactId>
-        </dependency>
-
-        <dependency>
-            <groupId>org.junit.jupiter</groupId>
-            <artifactId>junit-jupiter-api</artifactId>
-        </dependency>
-
-        <dependency>
-            <groupId>org.junit.jupiter</groupId>
-            <artifactId>junit-jupiter-engine</artifactId>
-        </dependency>
-
-        <dependency>
-            <groupId>org.junit.jupiter</groupId>
-            <artifactId>junit-jupiter-params</artifactId>
-        </dependency>
-    </dependencies>
-
-    <build>
-        <plugins>
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-compiler-plugin</artifactId>
-                <configuration>
-                    <source>${maven.compiler.source}</source>
-                    <target>${maven.compiler.target}</target>
-                </configuration>
-            </plugin>
-
-        </plugins>
-    </build>
-
-</project>
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/ModelUtil.java b/tf-ner-poc/src/main/java/org/apache/opennlp/ModelUtil.java
deleted file mode 100644
index 1f5b2d2..0000000
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/ModelUtil.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.opennlp;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
-
-public class ModelUtil {
-
-  public static Path writeModelToTmpDir(InputStream modelIn) throws IOException {
-    Path tmpDir = Files.createTempDirectory("opennlp2");
-
-    try (ZipInputStream zis = new ZipInputStream(modelIn)) {
-      ZipEntry zipEntry = zis.getNextEntry();
-      while(zipEntry != null){
-        Path newFile = tmpDir.resolve(zipEntry.getName());
-
-        Files.createDirectories(newFile.getParent());
-        Files.copy(zis, newFile);
-
-        // TODO: How to delete the tmp directory after we are done loading from it ?!
-        newFile.toFile().deleteOnExit();
-
-        zipEntry = zis.getNextEntry();
-      }
-      zis.closeEntry();
-    }
-
-    return tmpDir;
-  }
-}
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namecat/NameCategorizer.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namecat/NameCategorizer.java
deleted file mode 100644
index de0215a..0000000
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/namecat/NameCategorizer.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.opennlp.namecat;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.opennlp.ModelUtil;
-import org.tensorflow.SavedModelBundle;
-import org.tensorflow.Session;
-import org.tensorflow.Tensor;
-
-public class NameCategorizer {
-
-  private final Session session;
-  private final Map<Character, Integer> charMap = new HashMap<>();
-  private final Map<Integer, String> labelMap;
-
-  public NameCategorizer(InputStream modelZipPackage) throws IOException {
-
-    Path tmpModelPath = ModelUtil.writeModelToTmpDir(modelZipPackage);
-
-    try (BufferedReader in = Files.newBufferedReader(
-            tmpModelPath.resolve("char_dict.txt"), StandardCharsets.UTF_8)) {
-      in.lines().forEach(ch -> charMap.put(ch.charAt(0), charMap.size()));
-    }
-
-    labelMap = new HashMap<>();
-    try (BufferedReader in = Files.newBufferedReader(
-            tmpModelPath.resolve("label_dict.txt"), StandardCharsets.UTF_8)) {
-      in.lines().forEach(label -> labelMap.put(labelMap.size(), label));
-    }
-
-    SavedModelBundle model = SavedModelBundle.load(tmpModelPath.toString(), "serve");
-    session = model.session();
-  }
-
-  private static int argmax(float[] x) {
-    if (x == null || x.length == 0) {
-      throw new IllegalArgumentException("Vector x is null or empty");
-    }
-
-    int maxIdx = 0;
-    for (int i = 1; i < x.length; i++) {
-      if (x[maxIdx] < x[i])
-        maxIdx = i;
-    }
-    return maxIdx;
-  }
-
-  public String[] categorize(String[] names) {
-    if (names.length == 0) {
-      return new String[0];
-    }
-
-    int maxLength = Arrays.stream(names).mapToInt(String::length).max().getAsInt();
-
-    int[][] charIds = new int[names.length][maxLength];
-    int[] nameLengths = new int[names.length];
-
-    for (int nameIndex = 0; nameIndex < names.length; nameIndex++) {
-      for (int charIndex = 0; charIndex < names[nameIndex].length(); charIndex++) {
-        charIds[nameIndex][charIndex] = charMap.get(names[nameIndex].charAt(charIndex));
-      }
-      nameLengths[nameIndex] = names[nameIndex].length();
-    }
-
-    try (Tensor<?> dropout = Tensor.create(1f, Float.class);
-         Tensor<?> charTensor = Tensor.create(charIds);
-         Tensor<?> nameLength = Tensor.create(nameLengths)) {
-      List<Tensor<?>> result = session.runner()
-          .feed("dropout_keep_prop", dropout)
-          .feed("char_ids", charTensor)
-          .feed("name_lengths", nameLength)
-          .fetch("norm_probs", 0).run();
-
-      try (Tensor<?> probTensor = result.get(0)) {
-        float[][] probs = probTensor.copyTo(new float[names.length][labelMap.size()]);
-        return Arrays.stream(probs).map(prob -> labelMap.get(argmax(prob))).toArray(String[]::new);
-      }
-    }
-  }
-}
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/FeedDictionary.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/FeedDictionary.java
deleted file mode 100644
index c7b7234..0000000
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/FeedDictionary.java
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.opennlp.namefinder;
-
-import java.util.Arrays;
-
-import org.tensorflow.Tensor;
-
-public class FeedDictionary implements AutoCloseable  {
-
-  static final int PAD_VALUE = 0;
-
-  private final Tensor<Float> dropoutTensor;
-  private final Tensor<Integer> charIdsTensor;
-  private final Tensor<Integer> wordLengthsTensor;
-  private final Tensor<Integer> wordIdsTensor;
-  private final int[] sentenceLengths;
-  private final Tensor<Integer> sentenceLengthsTensor;
-  private final int maxSentenceLength;
-  private final int maxCharLength;
-  private final int numberOfSentences;
-
-  public int[] getSentenceLengths() {
-    return sentenceLengths;
-  }
-
-  public int getMaxSentenceLength() {
-    return maxSentenceLength;
-  }
-
-  public int getNumberOfSentences() {
-    return numberOfSentences;
-  }
-
-  public Tensor<Float> getDropoutTensor() {
-    return dropoutTensor;
-  }
-
-  public Tensor<Integer> getCharIdsTensor() {
-    return charIdsTensor;
-  }
-
-  public Tensor<Integer> getSentenceLengthsTensor() {
-    return sentenceLengthsTensor;
-  }
-
-  public Tensor<Integer> getWordLengthsTensor() {
-    return wordLengthsTensor;
-  }
-
-  public Tensor<Integer> getWordIdsTensor() {
-    return wordIdsTensor;
-  }
-
-  private FeedDictionary(final float dropout, final int[][][] charIds,
-                         final int[][] wordLengths, final int[][] wordIds,
-                         final int[] sentenceLengths, final int maxSentenceLength,
-                         final int maxCharLength, final int numberOfSentences) {
-
-    dropoutTensor = Tensor.create(dropout, Float.class);
-    charIdsTensor = Tensor.create(charIds, Integer.class);
-    wordLengthsTensor = Tensor.create(wordLengths, Integer.class);
-    wordIdsTensor = Tensor.create(wordIds, Integer.class);
-    this.sentenceLengths = sentenceLengths;
-    sentenceLengthsTensor = Tensor.create(sentenceLengths, Integer.class);
-    this.maxSentenceLength = maxSentenceLength;
-    this.maxCharLength = maxCharLength;
-    this.numberOfSentences = numberOfSentences;
-
-  }
-
-  @Override
-  public void close() {
-    dropoutTensor.close();
-    charIdsTensor.close();
-    wordLengthsTensor.close();
-    wordIdsTensor.close();
-    sentenceLengthsTensor.close();
-  }
-
-  // multi sentences
-  public static FeedDictionary create(TokenIds sentences) {
-
-    int numberOfSentences = sentences.getWordIds().length;
-
-    int[][][] charIds = new int[numberOfSentences][][];
-    int[][] wordLengths = new int[numberOfSentences][];
-
-    int maxSentenceLength = Arrays.stream(sentences.getWordIds()).map(s -> s.length).reduce(Integer::max).get();
-    Padded paddedSentences = padArrays(sentences.getWordIds(), maxSentenceLength);
-    int[][] wordIds = paddedSentences.ids;
-    int[] sentenceLengths = paddedSentences.lengths;
-
-    int maxCharLength = Arrays.stream(sentences.getCharIds()).flatMap(s -> Arrays.stream(s).map(c -> c.length)).reduce(Integer::max).get();
-    for (int i=0; i < numberOfSentences; i++) {
-      Padded paddedWords = padArrays(sentences.getCharIds()[i], maxCharLength);
-      charIds[i] = paddedWords.ids;
-      wordLengths[i] = paddedWords.lengths;
-    }
-
-    return new FeedDictionary(1.0f, charIds, wordLengths, wordIds, sentenceLengths, maxSentenceLength, maxCharLength, numberOfSentences);
-
-  }
-
-  private static Padded padArrays(int[][] ids, int length) {
-
-    int[][] paddedIds = new int[ids.length][length];
-    int[] lengths = new int[ids.length];
-
-    for (int i = 0; i < ids.length; i++) {
-      int[] src = ids[i];
-      int[] dest = new int[length];
-      System.arraycopy(src, 0, dest, 0, src.length);
-      if (src.length < length)
-        Arrays.fill(dest, src.length, length, PAD_VALUE);
-      paddedIds[i] = dest;
-      lengths[i] = src.length;
-    }
-
-    return new Padded(paddedIds, lengths);
-
-  }
-
-  private static class Padded {
-    private final int[][] ids;
-    private final int[] lengths;
-    
-    Padded(int[][] ids, int[] lengths) {
-      this.ids = ids;
-      this.lengths = lengths;
-    }
-  }
-}
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/IndexTagger.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/IndexTagger.java
deleted file mode 100644
index dfa451f..0000000
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/IndexTagger.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.opennlp.namefinder;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-
-public class IndexTagger {
-
-  private final Map<Integer, String> idx2Tag = new HashMap<>();
-
-  public IndexTagger(InputStream vocabTags) throws IOException {
-    try(BufferedReader in = new BufferedReader(
-            new InputStreamReader(vocabTags, StandardCharsets.UTF_8))) {
-      String tag;
-      int idx = 0;
-      while ((tag = in.readLine()) != null) {
-        idx2Tag.put(idx, tag);
-        idx += 1;
-      }
-    }
-  }
-
-  public String getTag(Integer idx) {
-    return idx2Tag.get(idx);
-  }
-
-  public Map<Integer, String> getIdx2Tag() {
-    return Collections.unmodifiableMap(idx2Tag);
-  }
-
-  public int getNumberOfTags() {
-    return idx2Tag.size();
-  }
-
-}
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/PredictionConfiguration.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/PredictionConfiguration.java
deleted file mode 100644
index 30d18d9..0000000
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/PredictionConfiguration.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.opennlp.namefinder;
-
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-
-public class PredictionConfiguration {
-
-  private final String vocabWords;
-  private final String vocabChars;
-  private final String vocabTags;
-  private final String savedModel;
-
-  public PredictionConfiguration(String vocabWords, String vocabChars, String vocabTags, String savedModel) {
-    this.vocabWords = vocabWords;
-    this.vocabChars = vocabChars;
-    this.vocabTags = vocabTags;
-    this.savedModel = savedModel;
-  }
-
-  public String getVocabWords() {
-    return vocabWords;
-  }
-
-  public String getVocabChars() {
-    return vocabChars;
-  }
-
-  public String getVocabTags() {
-    return vocabTags;
-  }
-
-  public String getSavedModel() {
-    return savedModel;
-  }
-
-  public InputStream getVocabWordsInputStream() throws IOException{
-    return new FileInputStream(getVocabWords());
-  }
-}
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/SequenceTagging.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/SequenceTagging.java
deleted file mode 100644
index 9d33b56..0000000
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/SequenceTagging.java
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.opennlp.namefinder;
-
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.opennlp.ModelUtil;
-import org.tensorflow.SavedModelBundle;
-import org.tensorflow.Session;
-import org.tensorflow.Tensor;
-
-import opennlp.tools.namefind.BioCodec;
-import opennlp.tools.namefind.TokenNameFinder;
-import opennlp.tools.util.Span;
-
-public class SequenceTagging implements TokenNameFinder, AutoCloseable {
-  private final SavedModelBundle model;
-  private final Session session;
-  private final WordIndexer wordIndexer;
-  private final IndexTagger indexTagger;
-
-  public SequenceTagging(PredictionConfiguration config) throws IOException {
-    model = SavedModelBundle.load(config.getSavedModel(), "serve");
-    session = model.session();
-
-    this.wordIndexer = new WordIndexer(new FileInputStream(config.getVocabWords()),
-            new FileInputStream(config.getVocabChars()));
-
-    this.indexTagger = new IndexTagger((new FileInputStream(config.getVocabTags())));
-  }
-
-  public SequenceTagging(InputStream modelZipPackage) throws IOException {
-
-    Path tmpDir = ModelUtil.writeModelToTmpDir(modelZipPackage);
-
-    try (InputStream wordsIn = Files.newInputStream(tmpDir.resolve("word_dict.txt"));
-         InputStream charsIn = Files.newInputStream(tmpDir.resolve("char_dict.txt"))) {
-      wordIndexer = new WordIndexer(wordsIn, charsIn);
-    }
-
-    try (InputStream in = Files.newInputStream(tmpDir.resolve("label_dict.txt"))) {
-      indexTagger = new IndexTagger(in);
-    }
-
-    model = SavedModelBundle.load(tmpDir.toString(), "serve");
-    session = model.session();
-  }
-
-  @Override
-  public Span[] find(String[] sentence) {
-    if (sentence.length > 0) {
-      TokenIds tokenIds = wordIndexer.toTokenIds(sentence);
-      return new BioCodec().decode(Arrays.asList(predict(tokenIds)[0]));
-    }
-    else {
-      return new Span[0];
-    }
-  }
-
-  public String[][] predict(String[][] sentences) {
-    TokenIds tokenIds = wordIndexer.toTokenIds(sentences);
-    return predict(tokenIds);
-  }
-
-  private String[][] predict(TokenIds tokenIds) {
-
-    try (FeedDictionary fd = FeedDictionary.create(tokenIds)) {
-
-      List<Tensor<?>> run = session.runner()
-          .feed("chars/char_ids:0", fd.getCharIdsTensor())
-          .feed("dropout_keep_prop:0", fd.getDropoutTensor())
-          .feed("words/sequence_lengths:0", fd.getSentenceLengthsTensor())
-          .feed("words/word_ids:0", fd.getWordIdsTensor())
-          .feed("chars/word_lengths:0", fd.getWordLengthsTensor())
-          .fetch("logits", 0)
-          .fetch("trans_params", 0).run();
-
-      float[][][] logits = new float[fd.getNumberOfSentences()][fd.getMaxSentenceLength()][indexTagger.getNumberOfTags()];
-      run.get(0).copyTo(logits);
-
-      float[][] trans_params = new float[indexTagger.getNumberOfTags()][indexTagger.getNumberOfTags()];
-      run.get(1).copyTo(trans_params);
-
-      String[][] returnValue = new String[fd.getNumberOfSentences()][];
-      for (int i = 0; i < logits.length; i++) {
-        float[][] logit = Arrays.copyOf(logits[i], fd.getSentenceLengths()[i]);
-        returnValue[i] = Viterbi.decode(logit, trans_params).stream().map(indexTagger::getTag).toArray(String[]::new);
-      }
-
-      for (int i = 0; i < returnValue[0].length; i++) {
-        if (returnValue[0][i] == null) {
-          returnValue[0][i] = "other";
-        }
-      }
-
-      for (Tensor<?> t : run) {
-        t.close();
-      }
-
-      return returnValue;
-    }
-  }
-
-  @Override
-  public void clearAdaptiveData() {
-  }
-
-  @Override
-  public void close() {
-    session.close();
-  }
-}
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/TokenIds.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/TokenIds.java
deleted file mode 100644
index 621ab3e..0000000
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/TokenIds.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.opennlp.namefinder;
-
-public final class TokenIds {
-
-  private final int[][][] charIds;
-  private final int[][] wordIds;
-
-  public TokenIds(int[][][] charIds, int[][] wordIds) {
-    this.charIds = charIds;
-    this.wordIds = wordIds;
-  }
-
-  public int[][][] getCharIds() {
-    return charIds;
-  }
-
-  public int[][] getWordIds() {
-    return wordIds;
-  }
-}
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/Viterbi.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/Viterbi.java
deleted file mode 100644
index 254afc5..0000000
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/Viterbi.java
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.opennlp.namefinder;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-
-public class Viterbi {
-
-  /*
-  """Viterbi the highest scoring sequence of tags outside of TensorFlow.
-  This should only be used at test time.
-  Args:
-    score: A [seq_len, num_tags] matrix of unary potentials.
-    transition_params: A [num_tags, num_tags] matrix of binary potentials.
-  Returns:
-    viterbi: A [seq_len] list of integers containing the highest scoring tag
-        indices.
-    viterbi_score: A float containing the score for the Viterbi sequence.
-  """
-   */
-
-  private static float[][] zeros_like(float[][] matrix) {
-    float[][] returnValue = new float[matrix.length][matrix[0].length];
-    for (int i=0; i<matrix.length; i++)
-      Arrays.fill(returnValue[i], 0.0f);
-    return returnValue;
-  }
-
-  private static int[][] zeros_like(int[] shape) {
-    int[][] returnValue = new int[shape[0]][shape[1]];
-    for (int i=0; i<shape[0]; i++)
-      Arrays.fill(returnValue[i], 0);
-    return returnValue;
-  }
-
-  private static int[] shape(float[][] var) {
-    return new int[] {var.length, var[0].length};
-  }
-
-  private static float[][] expand_dims_axis_one_plus_array(float[] array, float[][] plus) {
-    int[] plus_shape = shape(plus);
-    if (plus_shape[0] != array.length)
-      throw new RuntimeException("Not same shape");
-    float[][] returnValue = new float[plus_shape[0]][plus_shape[1]];
-    for (int i=0; i < array.length; i++) {
-      for (int j=0; j < plus_shape[1]; j++) {
-        returnValue[i][j] = array[i] + plus[i][j];
-      }
-    }
-    return returnValue;
-  }
-
-  private static float[] max_columnwise(float[][] array) {
-    float[] returnValue = new float[array[0].length];
-    for (int col=0; col < array[0].length; col++) {
-      returnValue[col] = Float.MIN_VALUE;
-      for (float[] floats : array) {
-        returnValue[col] = Float.max(returnValue[col], floats[col]);
-      }
-    }
-
-    return returnValue;
-  }
-
-  private static float max(float[] array) {
-    float returnValue = Float.MIN_VALUE;
-    for (float v : array) {
-      returnValue = Float.max(returnValue, v);
-    }
-    return returnValue;
-  }
-
-  private static int[] argmax_columnwise(float[][] array) {
-    int[] returnValue = new int[array[0].length];
-    for (int col=0; col < array[0].length; col++) {
-      float max = Float.MIN_VALUE;
-      int idx = -1;
-      for (int row=0; row < array.length; row++) {
-        if (Float.compare(array[row][col], max) > 0) {
-          max = array[row][col];
-          idx = row;
-        }
-      }
-      returnValue[col] = idx;
-    }
-    return returnValue;
-  }
-
-  private static int argmax(float[] array) {
-    int returnValue = -1;
-    float max = Float.MIN_VALUE;
-    for (int col=0; col < array.length; col++) {
-      if (Float.compare(array[col], max) > 0) {
-        max = array[col];
-        returnValue = col;
-      }
-    }
-    return returnValue;
-  }
-
-  public static float[] plus(float[] a, float[] b) {
-    if (a.length == b.length) {
-      float[] returnValue = new float[a.length];
-      for(int i = 0; i < a.length; ++i) {
-        returnValue[i] = Float.sum(a[i], b[i]);
-      }
-      return returnValue;
-    } else {
-      throw new IllegalArgumentException("Arrays doesn't have same shape.");
-    }
-  }
-
-  public static List<Integer> decode(float[][] score, float[][] transition_params) {
-
-    float[][] trellis = zeros_like(score);
-    int[][] backpointers = zeros_like(shape(score));
-
-    trellis[0] = score[0];
-
-    for (int t=1; t < score.length; t++) {
-      float[][] v = expand_dims_axis_one_plus_array(trellis[t - 1], transition_params);
-      trellis[t] = plus(score[t], max_columnwise(v));
-      backpointers[t] = argmax_columnwise(v);
-    }
-
-    List<Integer> viterbi = new ArrayList<>();
-    viterbi.add(argmax(trellis[trellis.length - 1]));
-
-    for (int i=backpointers.length - 1; i >= 1; i--) {
-      int[] bp = backpointers[i];
-      viterbi.add(bp[viterbi.get(viterbi.size() - 1)]);
-    }
-
-    Collections.reverse(viterbi);
-
-    return viterbi;
-  }
-
-}
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java
deleted file mode 100644
index 729ecf1..0000000
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.opennlp.namefinder;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.regex.Pattern;
-
-import opennlp.tools.util.StringUtil;
-
-public class WordIndexer {
-
-  private final Map<Character, Integer> char2idx;
-  private final Map<String, Integer> word2idx;
-
-  public static final String UNK = "$UNK$";
-  public static String NUM = "$NUM$";
-
-  private final boolean lowerCase = false;
-  private final boolean allowUnk = true;
-
-  private final Pattern digitPattern = 
Pattern.compile("\\d+(,\\d+)*(\\.\\d+)?");
-
-  public WordIndexer(InputStream vocabWords, InputStream vocabChars) throws 
IOException {
-    this.word2idx = new HashMap<>();
-    this.char2idx = new HashMap<>();
-
-    readVocabWords(vocabWords);
-    readVocacChars(vocabChars);
-  }
-
-  private void readVocacChars(InputStream vocabChars) throws IOException {
-    try(BufferedReader in = new BufferedReader(new 
InputStreamReader(vocabChars, StandardCharsets.UTF_8))) {
-      String ch;
-      int idx = 0;
-      while ((ch = in.readLine()) != null) {
-        char2idx.put(ch.charAt(0), idx);
-        idx += 1;
-      }
-    }
-  }
-
-  private void readVocabWords(InputStream vocabWords) throws IOException {
-    try(BufferedReader in = new BufferedReader(new 
InputStreamReader(vocabWords, StandardCharsets.UTF_8))) {
-      String word;
-      int idx = 0;
-      while ((word = in.readLine()) != null) {
-        word2idx.put(word, idx);
-        idx += 1;
-      }
-    }
-  }
-
-  public TokenIds toTokenIds(String[] tokens) {
-    String[][] sentences = new String[1][];
-    sentences[0] = tokens;
-    return toTokenIds(sentences);
-  }
-
-  public TokenIds toTokenIds(String[][] sentences) {
-    int[][][] charIds = new int[sentences.length][][];
-    int[][] wordIds = new int[sentences.length][];
-
-    for (int i = 0; i < sentences.length; i++) {
-      String[] sentenceWords = sentences[i];
-
-      int[][] sentcharIds = new int[sentenceWords.length][];
-      int[] sentwordIds = new int[sentenceWords.length];
-
-      for (int j=0; j < sentenceWords.length; j++) {
-        Ids ids = apply(sentenceWords[j]);
-
-        sentcharIds[j] = Arrays.copyOf(ids.getChars(), ids.getChars().length);
-        sentwordIds[j] = ids.getWord();
-      }
-
-      charIds[i] = sentcharIds;
-      wordIds[i] = sentwordIds;
-    }
-
-    return new TokenIds(charIds, wordIds);
-  }
-
-
-  private Ids apply(String word) {
-    // 0. get chars of words
-    int[] charIds = new int[word.length()];
-    int skipChars = 0;
-    for (int i = 0; i < word.length(); i++) {
-      char ch = word.charAt(i);
-      // ignore chars out of vocabulary
-      if (char2idx.containsKey(ch))
-        charIds[i - skipChars] = char2idx.get(ch);
-      else
-        skipChars += 1;
-    }
-
-    // 1. preprocess word
-    if (lowerCase) {
-      word = StringUtil.toLowerCase(word);
-    }
-
-    // if (digitPattern.matcher(word).find())
-    //  word = NUM;
-
-    // 2. get id of word
-    Integer wordId;
-    if (word2idx.containsKey(word)) {
-      wordId = word2idx.get(word);
-    } else {
-      if (allowUnk)
-        wordId = word2idx.get(UNK);
-      else
-        throw new RuntimeException("Unknown word '" + word + "' is not 
allowed.");
-    }
-
-    // 3. return tuple char ids, word id
-    Ids tokenIds = new Ids();
-    if (skipChars > 0) {
-      tokenIds.setChars(Arrays.copyOf(charIds, charIds.length - skipChars));
-    } else {
-      tokenIds.setChars(charIds);
-    }
-    tokenIds.setWord(wordId);
-
-    return tokenIds;
-  }
-
-  public static class Ids {
-
-    private int[] chars;
-    private int word;
-
-    public int[] getChars() {
-      return chars;
-    }
-
-    public void setChars(int[] chars) {
-      this.chars = chars;
-    }
-
-    public int getWord() {
-      return word;
-    }
-
-    public void setWord(int word) {
-      this.word = word;
-    }
-  }
-}
diff --git 
a/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java 
b/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java
deleted file mode 100644
index 281a7bf..0000000
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.opennlp.normalizer;
-
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
-
-import org.apache.opennlp.ModelUtil;
-import org.tensorflow.SavedModelBundle;
-import org.tensorflow.Session;
-import org.tensorflow.Tensor;
-
-public class Normalizer {
-
-  private static final char END_MARKER = 3;
-
-  private final Session session;
-  private final Map<Character, Integer> sourceCharMap;
-  private final Map<Integer, Character> targetCharMap;
-
-  public Normalizer(InputStream modelZipPackage) throws IOException {
-
-    Path tmpModelPath = ModelUtil.writeModelToTmpDir(modelZipPackage);
-    try(InputStream sourceCharMapIn = new FileInputStream(
-        tmpModelPath.resolve("source_char_dict.txt").toFile())) {
-      sourceCharMap = loadCharMap(sourceCharMapIn).entrySet().stream()
-          .collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));
-    }
-
-    try(InputStream targetCharMapIn = new FileInputStream(
-        tmpModelPath.resolve("target_char_dict.txt").toFile())) {
-      targetCharMap = loadCharMap(targetCharMapIn);
-    }
-
-    try (SavedModelBundle model = 
SavedModelBundle.load(tmpModelPath.toString(), "serve")) {
-      session = model.session();
-    }
-  }
-
-  private static Map<Integer, Character> loadCharMap(InputStream in) throws 
IOException {
-    try(BufferedReader reader = new BufferedReader(
-        new InputStreamReader(in, StandardCharsets.UTF_8))) {
-      Map<Integer, Character> characterMap = new HashMap<>();
-
-      String tag;
-      while ((tag = reader.readLine()) != null) {
-        characterMap.put(characterMap.size(), tag.charAt(0));
-      }
-
-      return Collections.unmodifiableMap(characterMap);
-    }
-  }
-
-  public String[] normalize(String[] texts) {
-
-    if (texts.length == 0) {
-      return new String[0];
-    }
-
-    int[] textLengths = 
Arrays.stream(texts).mapToInt(String::length).toArray();
-    int maxLength = Arrays.stream(textLengths).max().getAsInt();
-
-    int[][] charIds = new int[texts.length][maxLength];
-
-    for (int textIndex = 0; textIndex < texts.length; textIndex++) {
-      for (int charIndex = 0; charIndex < texts[textIndex].length(); 
charIndex++) {
-        charIds[textIndex][charIndex] =
-                sourceCharMap.getOrDefault(texts[textIndex].charAt(charIndex), 
0);
-      }
-
-      textLengths[textIndex] = texts[textIndex].length();
-    }
-
-    try (Tensor<?> charTensor = Tensor.create(charIds);
-         Tensor<?> textLength = Tensor.create(textLengths);
-         Tensor<?> batchSize = Tensor.create(texts.length)) {
-
-      List<Tensor<?>> result = session.runner()
-          .feed("encoder_char_ids", charTensor)
-          .feed("encoder_lengths", textLength)
-          .feed("batch_size", batchSize)
-          .fetch("decode", 0).run();
-
-      try (Tensor<?> translationTensor = result.get(0)) {
-        int[][] translations =
-            translationTensor.copyTo(new int[texts.length][(int) 
translationTensor.shape()[1]]);
-
-        List<String> normalizedTexts = new ArrayList<>();
-
-        for (int[] translation : translations) {
-          StringBuilder normalizedText = new StringBuilder();
-          for (int i : translation) {
-            normalizedText.append(targetCharMap.get(i));
-          }
-
-          // Remove the end marker from the translated string
-          for (int ci = normalizedText.length() - 1; ci >= 0; ci--) {
-            if (END_MARKER == normalizedText.charAt(ci)) {
-              normalizedText.setLength(ci);
-            }
-          }
-
-          normalizedTexts.add(normalizedText.toString());
-        }
-
-        return normalizedTexts.toArray(new String[0]);
-      }
-    }
-  }
-
-  public static void main(String[] args) throws Exception {
-    Normalizer normalizer = new Normalizer(new 
FileInputStream("python/normalizer/normalizer.zip"));
-
-    String[] result = normalizer.normalize(new String[] {
-        "18 Mars 2012"
-    });
-
-    System.out.println(result[0]);
-  }
-}
diff --git a/tf-ner-poc/src/main/python/doccat/doccat.py 
b/tf-ner-poc/src/main/python/doccat/doccat.py
deleted file mode 100644
index ef55f94..0000000
--- a/tf-ner-poc/src/main/python/doccat/doccat.py
+++ /dev/null
@@ -1,217 +0,0 @@
-#
-#  Licensed to the Apache Software Foundation (ASF) under one
-#  or more contributor license agreements.  See the NOTICE file
-#  distributed with this work for additional information
-#  regarding copyright ownership.  The ASF licenses this file
-#  to you under the Apache License, Version 2.0 (the
-#  "License"); you may not use this file except in compliance
-#  with the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing,
-#  software distributed under the License is distributed on an
-#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#  KIND, either express or implied.  See the License for the
-#  specific language governing permissions and limitations
-#  under the License.
-#
-
-import re
-import tensorflow as tf
-import sys
-from util import load_glove
-from util import write_mapping
-from math import floor
-import random
-import numpy as np
-
-class Doccat:
-    def __init__(self, vector_size=100):
-        self.__vector_size = vector_size
-
-    def load_data(self, file):
-        with open(file, encoding="utf-8") as f:
-            labels = []
-            docs = []
-            for line in f:
-                parts = re.split(r'\t+', line)
-                labels.append(parts[0].strip())
-                docs.append(parts[1].strip())
-        return labels, docs
-
-    def create_placeholders(self):
-
-        dropout_keep_prob = tf.placeholder(tf.float32, 
name="dropout_keep_prop")
-
-        # shape is batch_size, and number of tokens
-        token_ids_ph = tf.placeholder(tf.int32, shape=[None, None], 
name="token_ids")
-
-        # shape is batch_size
-        token_lengths_ph = tf.placeholder(tf.int32, shape=[None], 
name="token_lengths")
-
-        # shape is batch_size
-        y_ph = tf.placeholder(tf.int32, shape=[None], name="y")
-
-        return dropout_keep_prob, token_ids_ph, token_lengths_ph, y_ph
-
-    def create_graph(self, dropout_keep_prob, token_ids_ph, name_lengths_ph, 
y_ph, embedding_dict, nclasses):
-
-
-
-        # This is a hack to make it load an embedding matrix larger than 2GB
-        # Don't hardcode this 300
-        embedding_placeholder = tf.placeholder(dtype=tf.float32, 
name="embedding_placeholder",
-                                               shape=(len(embedding_dict), 
self.__vector_size))
-        embedding_matrix = tf.Variable(embedding_placeholder, 
dtype=tf.float32, trainable=False, name="glove_embeddings")
-
-        token_embeddings = tf.nn.embedding_lookup(embedding_matrix, 
token_ids_ph)
-
-
-        char_hidden_size = 256
-        cell_fw = tf.contrib.rnn.LSTMCell(char_hidden_size, 
state_is_tuple=True)
-        cell_bw = tf.contrib.rnn.LSTMCell(char_hidden_size, 
state_is_tuple=True)
-
-        _, ((_, output_fw), (_, output_bw)) = 
tf.nn.bidirectional_dynamic_rnn(cell_fw,
-                                                                              
cell_bw,
-                                                                              
token_embeddings,
-                                                                              
sequence_length=name_lengths_ph,
-                                                                              
dtype=tf.float32)
-
-        output = tf.concat([output_fw, output_bw], axis=-1)
-
-        output = tf.nn.dropout(output, dropout_keep_prob)
-
-        W = tf.get_variable("W", shape=[2*char_hidden_size, nclasses])
-        b = tf.get_variable("b", shape=[nclasses])
-        logits = tf.nn.xw_plus_b(output, W, b, name="logits")
-
-        # softmax ...
-        probs = tf.exp(logits)
-        norm_probs = tf.identity(probs / tf.reduce_sum(probs, 1, 
keepdims=True), name="norm_probs")
-
-        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, 
labels=y_ph)
-        mean_loss = tf.reduce_mean(loss)
-
-        train_op = tf.train.AdamOptimizer().minimize(loss)
-        #train_op = 
tf.train.RMSPropOptimizer(learning_rate=0.001).minimize(loss)
-
-        return embedding_placeholder, train_op, norm_probs
-
-
-def encode_doc(word_dict, doc):
-    encoded_doc = []
-    for c in doc:
-        if c in word_dict:
-            encoded_doc.append(word_dict[c])
-        else:
-            encoded_doc.append(word_dict["__UNK__"])
-
-    return encoded_doc
-
-
-def mini_batch(label_dict, word_dict, labels, docs, batch_size, batch_index):
-    begin = batch_size * batch_index
-    end = min(batch_size * (batch_index + 1), len(labels))
-
-    max_length = 0
-    for i in range(begin, end):
-        length = len(docs[i])
-        if length > max_length:
-            max_length = length
-
-    doc_batch = []
-    label_batch = []
-    doc_length = []
-    for i in range(begin, end):
-        label_batch.append( label_dict[labels[i]])
-        doc_batch.append(encode_doc(word_dict, docs[i]) + [0] * max(max_length 
- len(docs[i]), 0))
-        doc_length.append(len(docs[i]))
-
-    return label_batch, np.asarray(doc_batch), doc_length
-
-def main():
-
-    if len(sys.argv) != 5:
-        print("Usage doccat.py embedding_file train_file dev_file test_file")
-        return
-
-    doccat = Doccat(100)
-
-    labels_train, docs_train = doccat.load_data(sys.argv[2])
-    labels_dev, docs_dev = doccat.load_data(sys.argv[3])
-    labels_test, docs_test = doccat.load_data(sys.argv[4])
-
-
-    word_dict, rev_word_dict, embeddings, vector_size = load_glove(sys.argv[1])
-
-    # Encode labels into ids
-    label_dict = {}
-    for label in labels_train:
-        if not label in label_dict:
-            label_dict[label] = len(label_dict)
-
-
-    dropout_keep_prob, token_ids_ph, token_lengths_ph, y_ph = 
doccat.create_placeholders()
-
-    embedding_ph, train_op, probs_op = doccat.create_graph(dropout_keep_prob, 
token_ids_ph,
-                                                           token_lengths_ph, 
y_ph,
-                                                           embeddings, 
len(label_dict))
-
-    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
-                                            log_device_placement=True))
-
-    with sess.as_default():
-        init=tf.global_variables_initializer()
-        sess.run(init, feed_dict={embedding_ph: embeddings})
-        batch_size = 20
-        for epoch in range(50):
-            print("Epoch " + str(epoch))
-            acc_train = []
-
-            batch_indexes = list(range(floor(len(docs_train) / batch_size)))
-            random.Random(epoch).shuffle(batch_indexes)
-
-            for batch_index in batch_indexes:
-                label_train_batch, doc_train_batch, name_train_length = \
-                    mini_batch(label_dict, word_dict, labels_train, 
docs_train, batch_size, batch_index)
-
-                feed_dict = {dropout_keep_prob: 0.5, token_ids_ph: 
doc_train_batch, token_lengths_ph: name_train_length, y_ph: label_train_batch}
-                _, probs = sess.run([train_op, probs_op], feed_dict)
-
-                acc_train.append((batch_size - 
np.sum(np.minimum(np.abs(label_train_batch - np.argmax(probs, axis=1)),
-                                                                 
np.full((batch_size), 1)))) / batch_size)
-
-            print("Train acc: " + str(np.mean(acc_train)))
-
-            acc_dev = []
-            for batch_index in range(floor(len(docs_dev) / batch_size)):
-                label_dev_batch, doc_dev_batch, doc_dev_length = \
-                    mini_batch(label_dict, word_dict, labels_dev, docs_dev, 
batch_size, batch_index)
-
-                feed_dict = {dropout_keep_prob: 1, token_ids_ph: 
doc_dev_batch, token_lengths_ph: doc_dev_length, y_ph: label_dev_batch}
-                probs = sess.run(probs_op, feed_dict)
-
-                acc_dev.append((batch_size - 
np.sum(np.minimum(np.abs(label_dev_batch - np.argmax(probs, axis=1)),
-                                                               
np.full((batch_size), 1)))) / batch_size)
-
-            print("Dev acc: " + str(np.mean(acc_dev)))
-
-        with TemporaryDirectory() as temp_dir:
-            temp_model_dir = temp_dir + "/model"
-
-            builder = tf.saved_model.builder.SavedModelBuilder(temp_model_dir)
-            builder.add_meta_graph_and_variables(sess, 
[tf.saved_model.tag_constants.SERVING])
-            builder.save()
-
-            write_mapping(label_dict, temp_model_dir + "/label_dict.txt")
-
-            zipf = zipfile.ZipFile("doccat-" + str(epoch) +".zip", 'w', 
zipfile.ZIP_DEFLATED)
-
-            for root, dirs, files in os.walk(temp_model_dir):
-                for file in files:
-                    modelFile = os.path.join(root, file)
-                    zipf.write(modelFile, arcname=os.path.relpath(modelFile, 
temp_model_dir))
-
-if __name__ == "__main__":
-    main()
diff --git a/tf-ner-poc/src/main/python/namecat/namecat.py 
b/tf-ner-poc/src/main/python/namecat/namecat.py
deleted file mode 100644
index cc3f28a..0000000
--- a/tf-ner-poc/src/main/python/namecat/namecat.py
+++ /dev/null
@@ -1,238 +0,0 @@
-#
-#  Licensed to the Apache Software Foundation (ASF) under one
-#  or more contributor license agreements.  See the NOTICE file
-#  distributed with this work for additional information
-#  regarding copyright ownership.  The ASF licenses this file
-#  to you under the Apache License, Version 2.0 (the
-#  "License"); you may not use this file except in compliance
-#  with the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing,
-#  software distributed under the License is distributed on an
-#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#  KIND, either express or implied.  See the License for the
-#  specific language governing permissions and limitations
-#  under the License.
-#
-
-import re
-import tensorflow as tf
-import sys
-from math import floor
-import numpy as np
-import random
-import zipfile
-import os
-from tempfile import TemporaryDirectory
-
-def load_data(file):
-    with open(file, encoding="utf-8") as f:
-        labels = []
-        names = []
-        for line in f:
-            parts = re.split(r'\t+', line)
-            labels.append(parts[0].strip())
-            names.append(parts[1].strip())
-    return labels, names
-
-# create placeholders
-def create_placeholders():
-
-    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prop")
-
-    # shape is batch_size, and length of name
-    char_ids_ph = tf.placeholder(tf.int32, shape=[None, None], name="char_ids")
-
-    # shape is batch_size
-    name_lengths_ph = tf.placeholder(tf.int32, shape=[None], 
name="name_lengths")
-
-    # shape is batch_size
-    y_ph = tf.placeholder(tf.int32, shape=[None], name="y")
-    return dropout_keep_prob, char_ids_ph, name_lengths_ph, y_ph
-
-def create_graph(dropout_keep_prob, char_ids_ph, name_lengths_ph, y_ph, 
nchars, nclasses):
-
-
-    dim_char = 100
-
-    K = tf.get_variable(name="char_embeddings", dtype=tf.float32,
-                        shape=[nchars, dim_char])
-
-    char_embeddings = tf.nn.embedding_lookup(K, char_ids_ph)
-
-    char_embeddings = tf.nn.dropout(char_embeddings, dropout_keep_prob)
-
-    char_hidden_size = 256
-    cell_fw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True)
-    cell_bw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True)
-
-    _, ((_, output_fw), (_, output_bw)) = 
tf.nn.bidirectional_dynamic_rnn(cell_fw,
-                                                                      cell_bw,
-                                                                      
char_embeddings,
-                                                                      
sequence_length=name_lengths_ph,
-                                                                      
dtype=tf.float32)
-
-    output = tf.concat([output_fw, output_bw], axis=-1)
-
-    output = tf.nn.dropout(output, dropout_keep_prob)
-
-    W = tf.get_variable("W", shape=[2*char_hidden_size, nclasses])
-    b = tf.get_variable("b", shape=[nclasses])
-    logits = tf.nn.xw_plus_b(output, W, b, name="logits")
-
-    # softmax ...
-    probs = tf.exp(logits)
-    norm_probs = tf.identity(probs / tf.reduce_sum(probs, 1, keepdims=True), 
name="norm_probs")
-
-    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, 
labels=y_ph)
-    mean_loss = tf.reduce_mean(loss)
-
-    train_op = tf.train.AdamOptimizer().minimize(loss)
-    #train_op = tf.train.RMSPropOptimizer(learning_rate=0.001).minimize(loss)
-
-    return train_op, norm_probs
-
-
-def encode_name(char_dict, name):
-    encoded_name = []
-    for c in name:
-        encoded_name.append(char_dict[c])
-    return encoded_name
-
-def mini_batch(label_dict, char_dict, labels, names, batch_size, batch_index):
-    begin = batch_size * batch_index
-    end = min(batch_size * (batch_index + 1), len(labels))
-
-    max_length = 0
-    for i in range(begin, end):
-        length = len(names[i])
-        if length > max_length:
-            max_length = length
-
-    name_batch = []
-    label_batch = []
-    name_length = []
-    for i in range(begin, end):
-        label_batch.append( label_dict[labels[i]])
-        name_batch.append(encode_name(char_dict, names[i]) + [0] * 
max(max_length - len(names[i]), 0))
-        name_length.append(len(names[i]))
-
-    return label_batch, np.asarray(name_batch), name_length
-
-def write_mapping(tags, output_filename):
-    with open(output_filename, 'w', encoding='utf-8') as f:
-        for i, tag in enumerate(tags):
-            f.write(tag)
-            f.write("\n")
-
-def main():
-
-    if len(sys.argv) != 4:
-        print("Usage namecat.py train_file dev_file test_file")
-        return
-
-    labels_train, names_train = load_data(sys.argv[1])
-    labels_dev, names_dev = load_data(sys.argv[2])
-    labels_test, names_test = load_data(sys.argv[3])
-
-    # Encode labels into ids
-    label_dict = {}
-    for label in labels_train:
-        if not label in label_dict:
-            label_dict[label] = len(label_dict)
-
-    # Create char dict from names ...
-
-    char_set = set()
-    for name in names_train + names_dev + names_train:
-        char_set = char_set.union(name)
-
-    char_dict = {k: v for v, k in enumerate(char_set)}
-    char_dict[chr(0)] = 0
-
-    dropout_keep_prob, char_ids_ph, name_lengths_ph, y_ph = 
create_placeholders()
-
-    train_op, probs_op = create_graph(dropout_keep_prob, char_ids_ph, 
name_lengths_ph, y_ph, len(char_set), len(label_dict))
-
-    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
-                                            log_device_placement=True))
-
-    with sess.as_default():
-        init=tf.global_variables_initializer()
-        sess.run(init)
-
-        batch_size = 20
-        for epoch in range(20):
-            print("Epoch " + str(epoch))
-            acc_train = []
-
-            batch_indexes = list(range(floor(len(names_train) / batch_size)))
-
-            # Shuffle the data
-            combined = list(zip(names_train, labels_train))
-            random.shuffle(combined)
-            names_train[:], labels_train[:] = zip(*combined)
-
-            for batch_index in batch_indexes:
-                label_train_batch, name_train_batch, name_train_length = \
-                    mini_batch(label_dict, char_dict, labels_train, 
names_train, batch_size, batch_index)
-
-                # Add char dropout here ...
-                for i, j in np.ndindex(name_train_batch.shape):
-                    if random.uniform(0, 1) <= 0.0005:
-                        name_train_batch[i][j] = 0
-
-                feed_dict = {dropout_keep_prob: 0.5, char_ids_ph: 
name_train_batch, name_lengths_ph: name_train_length, y_ph: label_train_batch}
-                _, probs = sess.run([train_op, probs_op], feed_dict)
-
-                acc_train.append((batch_size - 
np.sum(np.minimum(np.abs(label_train_batch - np.argmax(probs, axis=1)),
-                                                                        
np.full((batch_size), 1)))) / batch_size)
-
-            print("Train acc: " + str(np.mean(acc_train)))
-
-            acc_dev = []
-            for batch_index in range(floor(len(names_dev) / batch_size)):
-                label_dev_batch, name_dev_batch, name_dev_length = \
-                    mini_batch(label_dict, char_dict, labels_dev, names_dev, 
batch_size, batch_index)
-
-                feed_dict = {dropout_keep_prob: 1, char_ids_ph: 
name_dev_batch, name_lengths_ph: name_dev_length, y_ph: label_dev_batch}
-                probs = sess.run(probs_op, feed_dict)
-
-                acc_dev.append((batch_size - 
np.sum(np.minimum(np.abs(label_dev_batch - np.argmax(probs, axis=1)),
-                                                               
np.full((batch_size), 1)))) / batch_size)
-
-            print("Dev acc: " + str(np.mean(acc_dev)))
-
-        #acc_test = []
-        #for batch_index in range(floor(len(names_test) / batch_size)):
-        #    label_test_batch, name_test_batch, name_test_length = \
-        #        mini_batch(label_dict, char_dict, labels_test, names_test, 
batch_size, batch_index)
-
-        #    feed_dict = {char_ids_ph: name_test_batch, name_lengths_ph: 
name_test_length, y_ph: label_test_batch}
-        #    probs = sess.run(probs_op, feed_dict)
-
-        #    acc_test.append((batch_size  - np.sum(np.abs(label_test_batch - 
np.argmax(probs, axis=1)))) / batch_size)
-
-        #print("Test acc: " + str(np.mean(acc_test)))
-
-        with TemporaryDirectory() as temp_dir:
-            temp_model_dir = temp_dir + "/model"
-
-            builder = tf.saved_model.builder.SavedModelBuilder(temp_model_dir)
-            builder.add_meta_graph_and_variables(sess, 
[tf.saved_model.tag_constants.SERVING])
-            builder.save()
-
-            write_mapping(label_dict, temp_model_dir + "/label_dict.txt")
-            write_mapping(char_dict, temp_model_dir + "/char_dict.txt")
-
-            zipf = zipfile.ZipFile("namecat-" + str(epoch) +".zip", 'w', 
zipfile.ZIP_DEFLATED)
-
-            for root, dirs, files in os.walk(temp_model_dir):
-                for file in files:
-                    modelFile = os.path.join(root, file)
-                    zipf.write(modelFile, arcname=os.path.relpath(modelFile, 
temp_model_dir))
-
-if __name__ == "__main__":
-    main()
diff --git a/tf-ner-poc/src/main/python/namefinder/namefinder.py 
b/tf-ner-poc/src/main/python/namefinder/namefinder.py
deleted file mode 100644
index f180f2b..0000000
--- a/tf-ner-poc/src/main/python/namefinder/namefinder.py
+++ /dev/null
@@ -1,493 +0,0 @@
-#
-#  Licensed to the Apache Software Foundation (ASF) under one
-#  or more contributor license agreements.  See the NOTICE file
-#  distributed with this work for additional information
-#  regarding copyright ownership.  The ASF licenses this file
-#  to you under the Apache License, Version 2.0 (the
-#  "License"); you may not use this file except in compliance
-#  with the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing,
-#  software distributed under the License is distributed on an
-#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#  KIND, either express or implied.  See the License for the
-#  specific language governing permissions and limitations
-#  under the License.
-#
-
-# This poc is based on source code taken from:
-# https://github.com/guillaumegenthial/sequence_tagging
-
-import sys
-from math import floor
-import tensorflow as tf
-import re
-import numpy as np
-import zipfile
-import os
-from tempfile import TemporaryDirectory
-
-# global variables for unknown word and numbers
-__UNK__ = '__UNK__'
-__NUM__ = '__NUM__'
-
-
-# Parse the OpenNLP Name Finder format into begin, end, type triples
-class NameSample:
-
-    def __init__(self, line):
-        self.tokens = []
-        self.names = []
-        start_regex = re.compile("<START(:([^:>\\s]*))?>")
-        parts = line.split()
-        start_index = -1
-        word_index = 0
-        for i in range(0, len(parts)):
-            if start_regex.match(parts[i]):
-                start_index = word_index
-                name_type = start_regex.search(parts[i]).group(2)
-                if name_type is None:
-                    name_type = "default"
-            elif parts[i] == "<END>":
-                self.names.append((start_index, word_index, name_type))
-            else:
-                self.tokens.append(parts[i])
-                word_index += 1
-
-
-class VectorException(Exception):
-    def __init__(self, value):
-        self.value = value
-
-    def __str__(self):
-        return repr(self.value)
-
-
-class NameFinder:
-    label_dict = {}
-
-    def __init__(self, use_lower_case_embeddings=False, vector_size=100):
-        self.__vector_size = vector_size
-        self.__use_lower_case_embeddings = use_lower_case_embeddings
-
-    def load_data(self, word_dict, file):
-        with open(file) as f:
-            raw_data = f.readlines()
-
-        sentences = []
-        labels = []
-        chars_set = set()
-
-        for line in raw_data:
-            name_sample = NameSample(line)
-            sentence = []
-            tokens = []
-
-            if len(name_sample.tokens) == 0:
-                continue
-
-            for token in name_sample.tokens:
-
-                chars_set.update(list(token))  # Add all chars to the set
-                tokens.append(token)  # Add original token so chars can be 
encoded correctly later
-
-                if self.__use_lower_case_embeddings:
-                    token = token.lower()
-
-                # TODO: implement NUM encoding
-
-                if word_dict.get(token) is not None:
-                    vector = word_dict[token]
-                else:
-                    vector = word_dict[__UNK__]
-
-                sentence.append(vector)
-
-            label = ["other"] * len(name_sample.tokens)
-            for name in name_sample.names:
-                label[name[0]] = name[2] + "-start"
-                for i in range(name[0] + 1, name[1]):
-                    label[i] = name[2] + "-cont"
-
-            sentences.append((sentence, tokens))  # Add a tuple of list of 
word vectors and list of original words
-            labels.append(label)
-
-            for label_string in label:
-                if label_string not in self.label_dict:
-                    self.label_dict[label_string] = len(self.label_dict)
-
-        return sentences, labels, chars_set
-
-    def encode_labels(self, labels):
-        return list(map(lambda l: self.label_dict[l], labels))
-
-    def mini_batch(self, char_dict, sentences, labels, batch_size, 
batch_index):
-        begin = batch_size * batch_index
-        end = min(batch_size * (batch_index + 1), len(labels))
-
-        # Determine the max sentence length in the batch
-        max_length = 0
-        for i in range(begin, end):
-            length = len(sentences[i][0])
-            if length > max_length:
-                max_length = length
-
-        sb = []
-        lb = []
-        seq_length = []
-        for i in range(begin, end):
-            sb.append(sentences[i][0] + [0] * max(max_length - 
len(sentences[i][0]), 0))
-            lb.append(self.encode_labels(labels[i]) + [0] * max(max_length - 
len(labels[i]), 0))
-            seq_length.append(len(sentences[i][0]))
-
-        # Determine the max word length in the batch
-        max_word_length = 0
-        for i in range(begin, end):
-            for word in sentences[i][1]:
-                length = len(word)
-                if length > max_word_length:
-                    max_word_length = length
-
-        cb = []
-        wlb = []
-        for i in range(begin, end):
-            sentence_word_length = []
-            sentence_word_chars = []
-
-            for word in sentences[i][1]:
-                word_chars = []
-                for c in word:
-                    word_chars.append(char_dict[c])
-
-                sentence_word_length.append(len(word_chars))
-                word_chars = word_chars + [0] * max(max_word_length - 
len(word_chars), 0)
-                sentence_word_chars.append(word_chars)
-
-            for i in range(max(max_length - len(sentence_word_chars), 0)):
-                sentence_word_chars.append([0] * max_word_length)
-
-            cb.append(sentence_word_chars)
-            wlb.append(sentence_word_length + [0] * max(max_length - 
len(sentence_word_length), 0))
-
-        return sb, cb, wlb, lb, seq_length
-
-    # probably not necessary to pass in the embedding_dict, can be passed to 
init directly
-    def create_graph(self, nchars, embedding_dict):
-
-        dropout_keep_prob = tf.placeholder(tf.float32, 
name="dropout_keep_prop")
-
-        with tf.variable_scope("chars"):
-            # shape = (batch size, max length of sentence, max length of word)
-            char_ids = tf.placeholder(tf.int32, shape=[None, None, None], 
name="char_ids")
-
-            # shape = (batch_size, max_length of sentence)
-            word_lengths_ph = tf.placeholder(tf.int32, shape=[None, None], 
name="word_lengths")
-
-            dim_char = 100
-
-            # 1. get character embeddings
-            K = tf.get_variable(name="char_embeddings", dtype=tf.float32,
-                                shape=[nchars, dim_char])
-
-            # shape = (batch, sentence, word, dim of char embeddings)
-            char_embeddings = tf.nn.embedding_lookup(K, char_ids)
-
-            # 2. put the time dimension on axis=1 for dynamic_rnn
-            s = tf.shape(char_embeddings)  # store old shape
-            # shape = (batch x sentence, word, dim of char embeddings)
-            char_embeddings = tf.reshape(char_embeddings, shape=[s[0] * s[1], 
s[-2], dim_char])
-            word_lengths = tf.reshape(word_lengths_ph, shape=[s[0] * s[1]])
-
-            # 3. bi lstm on chars
-            char_hidden_size = 100
-            cell_fw = tf.contrib.rnn.LSTMCell(char_hidden_size, 
state_is_tuple=True)
-            cell_bw = tf.contrib.rnn.LSTMCell(char_hidden_size, 
state_is_tuple=True)
-
-            _, ((_, output_fw), (_, output_bw)) = 
tf.nn.bidirectional_dynamic_rnn(cell_fw,
-                                                                               
   cell_bw,
-                                                                               
   char_embeddings,
-                                                                               
   sequence_length=word_lengths,
-                                                                               
   dtype=tf.float32)
-            # shape = (batch x sentence, 2 x char_hidden_size)
-            output = tf.concat([output_fw, output_bw], axis=-1)
-
-            # shape = (batch, sentence, 2 x char_hidden_size)
-            char_rep = tf.reshape(output, shape=[-1, s[1], 2 * 
char_hidden_size])
-
-        with tf.variable_scope("words"):
-            token_ids = tf.placeholder(tf.int32, shape=[None, None], 
name="word_ids")
-            sequence_lengths = tf.placeholder(tf.int32, shape=[None], 
name="sequence_lengths")
-
-            # This is a hack to make it load an embedding matrix larger than 
2GB
-            # Don't hardcode this 300
-            embedding_placeholder = tf.placeholder(dtype=tf.float32, 
name="embedding_placeholder",
-                                                   shape=(len(embedding_dict), 
self.__vector_size))
-            embedding_matrix = tf.Variable(embedding_placeholder, 
dtype=tf.float32, trainable=False,
-                                           name="glove_embeddings")
-
-            token_embeddings = tf.nn.embedding_lookup(embedding_matrix, 
token_ids)
-
-            # shape = (batch, sentence, 2 x char_hidden_size + 
word_vector_size)
-            word_embeddings = tf.concat([token_embeddings, char_rep], axis=-1)
-
-            word_embeddings = tf.nn.dropout(word_embeddings, dropout_keep_prob)
-
-        hidden_size = 300
-
-        # Lets add a char lstm layer to reproduce the state of the art results 
...
-
-        with tf.variable_scope("bi-lstm"):
-            # Add LSTM layer
-            cell_fw = tf.contrib.rnn.LSTMCell(hidden_size)
-            cell_bw = tf.contrib.rnn.LSTMCell(hidden_size)
-
-            (output_fw, output_bw), _ = 
tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, word_embeddings,
-                                                                        
sequence_length=sequence_lengths,
-                                                                        
dtype=tf.float32)
-
-            context_rep = tf.concat([output_fw, output_bw], axis=-1)
-
-            context_rep = tf.nn.dropout(context_rep, dropout_keep_prob)
-
-            labels = tf.placeholder(tf.int32, shape=[None, None], 
name="labels")
-
-        ntags = len(self.label_dict)
-
-        W = tf.get_variable("W", shape=[2 * hidden_size, ntags], 
dtype=tf.float32)
-        b = tf.get_variable("b", shape=[ntags], dtype=tf.float32, 
initializer=tf.zeros_initializer())
-        ntime_steps = tf.shape(context_rep)[1]
-        context_rep_flat = tf.reshape(context_rep, [-1, 2 * hidden_size])
-        pred = tf.matmul(context_rep_flat, W) + b
-        self.logits = tf.reshape(pred, [-1, ntime_steps, ntags], name="logits")
-
-        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
-            self.logits, labels, sequence_lengths)
-
-        self.transition_params = tf.identity(transition_params, 
name="trans_params")
-
-        loss = tf.reduce_mean(-log_likelihood)
-
-        train_op = tf.train.AdamOptimizer().minimize(loss)
-
-        return embedding_placeholder, token_ids, char_ids, word_lengths_ph, \
-               sequence_lengths, labels, dropout_keep_prob, train_op
-
-    def predict_batch(self, sess, token_ids_ph, char_ids_ph, word_lengths_ph,
-                      sequence_lengths_ph, sentences, char_ids, word_length, 
lengths, dropout_keep_prob):
-
-        feed_dict = {token_ids_ph: sentences, char_ids_ph: char_ids, 
word_lengths_ph: word_length,
-                     sequence_lengths_ph: lengths, dropout_keep_prob: 1}
-
-        viterbi_sequences = []
-        logits, trans_params = sess.run([self.logits, self.transition_params], 
feed_dict=feed_dict)
-
-        for logit, sequence_length in zip(logits, lengths):
-            if sequence_length != 0:
-                logit = logit[:sequence_length]  # keep only the valid steps
-                viterbi_seq, viterbi_score = 
tf.contrib.crf.viterbi_decode(logit, trans_params)
-                viterbi_sequences += [viterbi_seq]
-            else:
-                viterbi_sequences += []
-
-        return viterbi_sequences, lengths
-
-
-def get_chunk_type(tok, idx_to_tag):
-    tag_name = idx_to_tag[tok]
-    tag_class = tag_name.split('-')[0]
-    tag_type = tag_name.split('-')[-1]
-    return tag_class, tag_type
-
-
-def get_chunks(seq, tags):
-    default = tags["other"]
-    idx_to_tag = {idx: tag for tag, idx in tags.items()}
-    chunks = []
-    chunk_type, chunk_start = None, None
-    for i, tok in enumerate(seq):
-        # End of a chunk 1
-        if tok == default and chunk_type is not None:
-            # Add a chunk.
-            chunk = (chunk_type, chunk_start, i)
-            chunks.append(chunk)
-            chunk_type, chunk_start = None, None
-
-        # End of a chunk + start of a chunk!
-        elif tok != default:
-            tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag)
-            if chunk_type is None:
-                chunk_type, chunk_start = tok_chunk_type, i
-            elif tok_chunk_type != chunk_type or tok_chunk_class == "B":
-                chunk = (chunk_type, chunk_start, i)
-                chunks.append(chunk)
-                chunk_type, chunk_start = tok_chunk_type, i
-        else:
-            pass
-
-    # end condition
-    if chunk_type is not None:
-        chunk = (chunk_type, chunk_start, len(seq))
-        chunks.append(chunk)
-
-    return chunks
-
-
-def write_mapping(tags, output_filename):
-    with open(output_filename, 'w', encoding='utf-8') as f:
-        for (tag, i) in sorted(tags.items(), key=lambda x: x[1]):
-            f.write('{}\n'.format(tag))
-
-
-def load_glove(glove_file):
-    with open(glove_file) as f:
-
-        word_dict = {}
-        embeddings = []
-
-        vector_size = -1
-
-        for line in f:
-            parts = line.strip().split(" ")
-
-            if vector_size == -1:
-                if len(parts) == 2:
-                    vector_size = int(parts[1])
-                    continue
-                vector_size = len(parts) - 1
-
-            if len(parts) != vector_size + 1:
-                raise VectorException("Bad Vector in line: {}, size: {} 
vector: {}".format(len(line), len(parts), line))
-                continue
-            word_dict[parts[0]] = len(word_dict)
-            embeddings.append(np.array(parts[1:], dtype=np.float32))
-
-    # add unknown word symbol and number symbol
-    if __UNK__ not in word_dict:
-        word_dict[__UNK__] = len(word_dict)
-        unk_random = 0.08 * np.random.random_sample(vector_size) - 0.04
-        embeddings.append(unk_random.astype(np.float32))
-    if __NUM__ not in word_dict:
-        word_dict[__NUM__] = len(word_dict)
-        embeddings.append(np.zeros(vector_size, dtype=np.float32))
-
-    # Create a reverse word dict
-    rev_word_dict = {}
-    for word, id in word_dict.items():
-        rev_word_dict[id] = word
-
-    return word_dict, rev_word_dict, np.asarray(embeddings), vector_size
-
-
-def main():
-    if len(sys.argv) != 5:
-        print("Usage namefinder.py embedding_file train_file dev_file 
test_file")
-        return
-
-    word_dict, rev_word_dict, embeddings, vector_size = load_glove(sys.argv[1])
-
-    name_finder = NameFinder(vector_size)
-
-    sentences, labels, char_set = name_finder.load_data(word_dict, sys.argv[2])
-    sentences_dev, labels_dev, char_set_dev = name_finder.load_data(word_dict, 
sys.argv[3])
-
-    char_dict = {k: v for v, k in enumerate(char_set | char_set_dev)}
-
-    embedding_ph, token_ids_ph, char_ids_ph, word_lengths_ph, 
sequence_lengths_ph, labels_ph, dropout_keep_prob, train_op \
-        = name_finder.create_graph(len(char_set | char_set_dev), embeddings)
-
-    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
-                                            log_device_placement=True))
-
-    best_f1 = 0.0
-    no_improvement = 0
-    with sess.as_default():
-        init = tf.global_variables_initializer()
-        sess.run(init, feed_dict={embedding_ph: embeddings})
-
-        batch_size = 20
-        for epoch in range(100):
-            print("Epoch " + str(epoch))
-
-            for batch_index in range(floor(len(sentences) / batch_size)):
-                if batch_index % 200 == 0:
-                    print("batch_index " + str(batch_index))
-
-                # mini_batch should also return char_ids and word length ...
-                sentences_batch, chars_batch, word_length_batch, labels_batch, 
lengths = \
-                    name_finder.mini_batch(char_dict, sentences, labels, 
batch_size, batch_index)
-
-                feed_dict = {token_ids_ph: sentences_batch, char_ids_ph: 
chars_batch,
-                             word_lengths_ph: word_length_batch, 
sequence_lengths_ph: lengths,
-                             labels_ph: labels_batch, dropout_keep_prob: 0.5}
-
-                train_op.run(feed_dict, sess)
-
-            accs = []
-            correct_preds, total_correct, total_preds = 0., 0., 0.
-            for batch_index in range(floor(len(sentences_dev) / batch_size)):
-                sentences_test_batch, chars_batch_test, 
word_length_batch_test, \
-                labels_test_batch, length_test = 
name_finder.mini_batch(char_dict,
-                                                                        
sentences_dev,
-                                                                        
labels_dev,
-                                                                        
batch_size,
-                                                                        
batch_index)
-
-                labels_pred, sequence_lengths = name_finder.predict_batch(
-                    sess, token_ids_ph, char_ids_ph, word_lengths_ph, 
sequence_lengths_ph,
-                    sentences_test_batch, chars_batch_test, 
word_length_batch_test, length_test, dropout_keep_prob)
-
-                for lab, lab_pred, length in zip(labels_test_batch, 
labels_pred,
-                                                 sequence_lengths):
-                    lab = lab[:length]
-                    lab_pred = lab_pred[:length]
-                    accs += [a == b for (a, b) in zip(lab, lab_pred)]
-
-                    lab_chunks = set(get_chunks(lab, name_finder.label_dict))
-                    lab_pred_chunks = set(get_chunks(lab_pred, 
name_finder.label_dict))
-
-                    correct_preds += len(lab_chunks & lab_pred_chunks)
-                    total_preds += len(lab_pred_chunks)
-                    total_correct += len(lab_chunks)
-
-            p = correct_preds / total_preds if correct_preds > 0 else 0
-            r = correct_preds / total_correct if correct_preds > 0 else 0
-            f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
-            acc = np.mean(accs)
-
-            if f1 > best_f1:
-
-                best_f1 = f1
-                no_improvement = 0
-
-                with TemporaryDirectory() as temp_dir:
-                    temp_model_dir = temp_dir + "/model"
-
-                    builder = 
tf.saved_model.builder.SavedModelBuilder(temp_model_dir)
-                    builder.add_meta_graph_and_variables(sess, 
[tf.saved_model.tag_constants.SERVING])
-                    builder.save()
-
-                    write_mapping(word_dict, temp_model_dir + '/word_dict.txt')
-                    write_mapping(name_finder.label_dict, temp_model_dir + 
"/label_dict.txt")
-                    write_mapping(char_dict, temp_model_dir + "/char_dict.txt")
-
-                    zipf = zipfile.ZipFile("namefinder-" + str(epoch) + 
".zip", 'w', zipfile.ZIP_DEFLATED)
-
-                    for root, dirs, files in os.walk(temp_model_dir):
-                        for file in files:
-                            modelFile = os.path.join(root, file)
-                            zipf.write(modelFile, 
arcname=os.path.relpath(modelFile, temp_model_dir))
-            else:
-                no_improvement += 1
-
-            print("ACC " + str(acc))
-            print("F1  " + str(f1) + "  P " + str(p) + "  R " + str(r))
-
-            if no_improvement > 5:
-                print("No further improvement. Stopping.")
-                break
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tf-ner-poc/src/main/python/namefinder/split.py 
b/tf-ner-poc/src/main/python/namefinder/split.py
deleted file mode 100644
index 1e5ea4d..0000000
--- a/tf-ner-poc/src/main/python/namefinder/split.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#
-#  Licensed to the Apache Software Foundation (ASF) under one
-#  or more contributor license agreements.  See the NOTICE file
-#  distributed with this work for additional information
-#  regarding copyright ownership.  The ASF licenses this file
-#  to you under the Apache License, Version 2.0 (the
-#  "License"); you may not use this file except in compliance
-#  with the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing,
-#  software distributed under the License is distributed on an
-#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#  KIND, either express or implied.  See the License for the
-#  specific language governing permissions and limitations
-#  under the License.
-#
-
-import random
-import sys
-
-def main():
-
-    if len(sys.argv) != 5:
-        print("Usage split.py data_file train_file dev_file test_file")
-        return
-
-    train = []
-    dev = []
-    test = []
-
-    with open(sys.argv[1]) as f:
-        for line in f:
-
-            if len(line.strip()) == 0:
-                continue
-
-            rand = random.random()
-            if rand < 0.8:
-                train.append(line)
-            elif rand < 0.9:
-                dev.append(line)
-            elif rand <= 1.0:
-                test.append(line)
-
-    with open(sys.argv[2], 'w') as f:
-        for item in train:
-            f.write("%s" % item)
-
-    with open(sys.argv[3], 'w') as f:
-        for item in dev:
-            f.write("%s" % item)
-
-    with open(sys.argv[4], 'w') as f:
-        for item in test:
-            f.write("%s" % item)
-
-if __name__ == "__main__":
-    main()
-
diff --git a/tf-ner-poc/src/main/python/normalizer/date_generator.py 
b/tf-ner-poc/src/main/python/normalizer/date_generator.py
deleted file mode 100644
index 965aec0..0000000
--- a/tf-ner-poc/src/main/python/normalizer/date_generator.py
+++ /dev/null
@@ -1,86 +0,0 @@
-#
-#  Licensed to the Apache Software Foundation (ASF) under one
-#  or more contributor license agreements.  See the NOTICE file
-#  distributed with this work for additional information
-#  regarding copyright ownership.  The ASF licenses this file
-#  to you under the Apache License, Version 2.0 (the
-#  "License"); you may not use this file except in compliance
-#  with the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing,
-#  software distributed under the License is distributed on an
-#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#  KIND, either express or implied.  See the License for the
-#  specific language governing permissions and limitations
-#  under the License.
-#
-
-from faker import Faker
-from babel.dates import format_date
-import random
-from datetime import datetime
-
-fake = Faker()
-
-# TOOD: If possible set date range on Faker
-
-FORMATS = ['short',
-           'medium',
-           'long',
-           'dd MMM YYY',
-           'dd MMM',
-           'dd, MMM YYY',
-           'dd, MMM',
-           'd MMM YYY',
-           'd MMM',
-           'd MMMM YYY',
-           'MMMM YYY',
-           'd MMMM, YYY',
-           'd MMM, YYY',
-           'd MM YY',
-           'd MMMM YYY',
-           'MMMM d YYY',
-           'MMMM YYY',
-           'MMM YYY',
-           'MMMM d, YYY',
-           'YYY',
-           'dd.MM.YY',
-           'dd.MM',
-           'full',
-           'full',
-           'full',
-           'full',
-           'full',
-           'full',
-           'full',
-           'full',
-           'full',
-           'full']
-
-# TODO: maybe avoid duplicates, output dates also for other locales such as 
german, and french ...
-
-with open('date_dev_deu.txt', 'w', encoding="utf-8") as f:
-    for i in range(2000):
-        dt = fake.date_time_ad(start_datetime=datetime(1900, 1, 1))
-
-        format = random.choice(FORMATS)
-        source_date = format_date(dt, format=format,  locale='en_US')
-        target_date = format_date(dt, format='YYYYMMdd',  locale='en_US')
-
-        if "short" not in format \
-                and "medium" not in format \
-                and "long" not in format \
-                and "full" not in format :
-
-            if "Y" not in format:
-                target_date = "0000" + target_date[4:]
-
-            if "d" not in format:
-                target_date = target_date[:6] + "00"
-
-            if "M" not in format:
-                target_date = target_date[:4] + "00" + target_date[6:]
-
-        f.write(target_date + '\t' + source_date + '\n')
\ No newline at end of file
diff --git a/tf-ner-poc/src/main/python/normalizer/normalizer.py 
b/tf-ner-poc/src/main/python/normalizer/normalizer.py
deleted file mode 100644
index 04be1bb..0000000
--- a/tf-ner-poc/src/main/python/normalizer/normalizer.py
+++ /dev/null
@@ -1,322 +0,0 @@
-#
-#  Licensed to the Apache Software Foundation (ASF) under one
-#  or more contributor license agreements.  See the NOTICE file
-#  distributed with this work for additional information
-#  regarding copyright ownership.  The ASF licenses this file
-#  to you under the Apache License, Version 2.0 (the
-#  "License"); you may not use this file except in compliance
-#  with the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing,
-#  software distributed under the License is distributed on an
-#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#  KIND, either express or implied.  See the License for the
-#  specific language governing permissions and limitations
-#  under the License.
-#
-import os
-import re
-import zipfile
-from tempfile import TemporaryDirectory
-
-import tensorflow as tf
-import numpy as np
-import random
-from math import floor
-import sys
-
-def load_data(file):
-    with open(file, encoding="utf-8") as f:
-        target = []
-        source = []
-        for line in f:
-            parts = re.split(r'\t+', line)
-            target.append(parts[0].strip());
-            source.append(parts[1].strip())
-    return source, target
-
-def encode_name(char_dict, names):
-
-    max_length = 0
-    for name in names:
-        length = len(name)
-        if length > max_length:
-            max_length = length
-
-    # TODO: To be able to use padding for variable length sequences
-    #       pad with the eos marker
-
-    encoded_names = np.zeros((len(names), max_length))
-
-    for bi in range(len(names)):
-        for ci in range(len(names[bi])):
-            encoded_names.itemset((bi, ci), char_dict[names[bi][ci]])
-
-    return encoded_names
-
-def mini_batch(target_char_dict, target, source_char_dict, source, batch_size, 
batch_index):
-
-    begin = batch_index
-    end = min(batch_index + batch_size, len(source))
-
-    target_batch = target[begin : end]
-
-    target_length = []
-    for i in range(begin, end):
-        target_length.append(len(target[i]) + 1) # TODO: The correction should 
be done in the graph ...
-
-    source_batch = source[batch_index : batch_index + batch_size]
-    source_length = []
-    for i in range(begin, end):
-        source_length.append(len(source[i]))
-
-    return encode_name(target_char_dict, target_batch), 
np.asarray(target_length), \
-           encode_name(source_char_dict, source_batch), 
np.asarray(source_length)
-
-def create_graph(mode, batch_size, encoder_nchars, max_target_length, 
decoder_nchars):
-
-    # Hyper  parameters
-    encoder_char_dim = 100
-    num_units = 256
-
-    batch_size_ph = tf.placeholder_with_default(batch_size, shape=(), 
name="batch_size")
-
-    # Encoder
-    encoder_char_ids_ph = tf.placeholder(tf.int32, shape=[None, None], 
name="encoder_char_ids")
-    encoder_lengths_ph = tf.placeholder(tf.int32, shape=[None], 
name="encoder_lengths")
-
-    encoder_embedding_weights = tf.get_variable(name="char_embeddings", 
dtype=tf.float32,
-                        shape=[encoder_nchars, encoder_char_dim])
-
-    encoder_emb_inp = tf.nn.embedding_lookup(encoder_embedding_weights, 
encoder_char_ids_ph)
-
-    if "TRAIN" == mode:
-        encoder_emb_inp = tf.nn.dropout(encoder_emb_inp, 0.7)
-
-    encoder_emb_inp = tf.transpose(encoder_emb_inp, perm=[1, 0, 2])
-
-    encoder_cell = tf.nn.rnn_cell.LSTMCell(num_units)
-    initial_state = encoder_cell.zero_state(batch_size_ph, dtype=tf.float32)
-
-    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
-        encoder_cell, encoder_emb_inp, initial_state=initial_state,
-        sequence_length=encoder_lengths_ph,
-        time_major=True, swap_memory=True)
-
-    # Decoder
-    decoder_char_ids_ph = tf.placeholder(tf.int32, shape=[None, None], 
name="decoder_char_ids")
-    decoder_lengths = tf.placeholder(tf.int32, shape=[None], 
name="decoder_lengths")
-
-    # decoder output (decoder_input shifted to the left by one)
-
-    decoder_char_dim = 100
-    decoder_embedding_weights = 
tf.get_variable(name="decoder_char_embeddings", dtype=tf.float32,
-                                             shape=[decoder_nchars, 
decoder_char_dim])
-
-    projection_layer = tf.layers.Dense(units=decoder_nchars, use_bias=True) # 
To predict one output char at a time ...
-
-    attention_states = tf.transpose(encoder_outputs, [1, 0, 2])
-
-    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
-        num_units, attention_states,
-        memory_sequence_length=encoder_lengths_ph)
-
-    decoder_cell = tf.nn.rnn_cell.LSTMCell(num_units)
-
-    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, 
attention_mechanism,
-        attention_layer_size=num_units)
-
-    # decoder_initial_state = encoder_state
-    decoder_initial_state = decoder_cell.zero_state(dtype=tf.float32, 
batch_size=batch_size_ph)
-
-    if "TRAIN" == mode:
-
-        decoder_input = tf.pad(decoder_char_ids_ph, tf.constant([[0,0], 
[1,0]]),
-                               'CONSTANT', constant_values=(decoder_nchars-2))
-
-        decoder_emb_inp = tf.nn.embedding_lookup(decoder_embedding_weights, 
decoder_input)
-        decoder_emb_inp = tf.transpose(decoder_emb_inp, perm=[1, 0, 2])
-
-        helper = tf.contrib.seq2seq.TrainingHelper(
-            decoder_emb_inp, [max_target_length for _ in range(batch_size)], 
time_major=True)
-
-
-        decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper,
-                                                  decoder_initial_state, 
output_layer=projection_layer)
-
-        outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, 
output_time_major=True, swap_memory=True )
-
-        logits = outputs.rnn_output
-        train_prediction = outputs.sample_id
-
-        decoder_output = tf.pad(tf.transpose(decoder_char_ids_ph, perm=[1, 
0]), tf.constant([[0,1], [0,0]]),
-                                'CONSTANT', constant_values=(decoder_nchars-1))
-
-        crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
-            labels=decoder_output, logits=logits, name="crossent")
-
-        loss = tf.reduce_sum(crossent * tf.to_float(decoder_lengths)) / 
(batch_size * max_target_length)
-
-        # Optimizer
-        # TODO: Tutorial suggest to swap to SGD for alter iterations
-        # optimizer = tf.train.AdamOptimizer()
-        optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001)
-        gradients, v = zip(*optimizer.compute_gradients(loss))
-        gradients, _ = tf.clip_by_global_norm(gradients, 10.0)
-        optimize = optimizer.apply_gradients(zip(gradients, v))
-
-        return encoder_char_ids_ph, encoder_lengths_ph, decoder_char_ids_ph, decoder_lengths, optimize, train_prediction, outputs
-
-    if "EVAL" == mode:
-        helperE = tf.contrib.seq2seq.GreedyEmbeddingHelper(
-            decoder_embedding_weights,
-            tf.fill([batch_size_ph], decoder_nchars-2), decoder_nchars-1)
-        decoderE = tf.contrib.seq2seq.BasicDecoder(
-            decoder_cell, helperE, decoder_initial_state,
-            output_layer=projection_layer)
-        outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoderE, maximum_iterations=20)
-
-        translations = tf.identity(outputs.sample_id, name="decode")
-
-        return encoder_char_ids_ph, encoder_lengths_ph, translations
-
-def encode_chars(names):
-    char_set = set()
-    for name in names:
-        char_set = char_set.union(name)
-    return {k: v for v, k in enumerate(char_set)}
-
-# TODO: Deduplicate this, same as in namefinder.py
-def write_mapping(tags, output_filename):
-    with open(output_filename, 'w', encoding='utf-8') as f:
-        for i, tag in enumerate(tags):
-            f.write('{}\n'.format(tag))
-
-def main():
-
-    if len(sys.argv) != 4:
-        print("Usage normalizer.py train_file dev_file test_file")
-        return
-
-    checkpoints_path = "/tmp/model/checkpoints"
-
-    source_train, target_train = load_data(sys.argv[1])
-    source_dev, target_dev = load_data(sys.argv[2])
-    source_test, target_test = load_data(sys.argv[3])
-
-    source_char_dict = encode_chars(source_train + source_dev + source_test)
-    source_char_dict[chr(0)] = 0
-
-    target_char_dict = encode_chars(target_train + target_dev + target_test)
-
-    # char id 2 is STX (Start of Text), and 3 ETX (End of Text)
-    target_char_dict[chr(2)] = len(target_char_dict)
-    target_char_dict[chr(3)] = len(target_char_dict)
-
-    target_dict_rev = {v: k for k, v in target_char_dict.items()}
-
-    batch_size = 20
-
-    target_max_len = -1
-    for token in (target_train + target_dev + target_test):
-        target_max_len = max(target_max_len, len(token))
-
-    # Increase size by one for termination char
-    target_max_len += 1
-
-    train_graph = tf.Graph()
-    eval_graph = tf.Graph()
-
-    with train_graph.as_default():
-        t_encoder_char_ids_ph, t_encoder_lengths_ph, t_decoder_char_ids_ph, t_decoder_lengths, t_adam_optimize, t_train_prediction, t_dec_out = \
-            create_graph("TRAIN", batch_size, len(source_char_dict), target_max_len, len(target_char_dict))
-        train_saver = tf.train.Saver()
-        train_sess = tf.Session()
-        train_sess.run(tf.global_variables_initializer())
-
-    with eval_graph.as_default():
-        e_encoder_char_ids_ph, e_encoder_lengths_ph, e_dec_out = \
-            create_graph("EVAL", batch_size, len(source_char_dict), target_max_len, len(target_char_dict))
-        eval_saver = tf.train.Saver()
-
-        eval_sess = tf.Session(graph=eval_graph)
-
-    for epoch in range(20):
-        print("Epoch " + str(epoch))
-
-        with train_graph.as_default():
-            for batch_index in range(floor(len(source_train) / batch_size)):
-                if batch_index > 0 and batch_index % 100 == 0:
-                    print("batch_index " + str(batch_index))
-
-                target_batch, target_length, source_batch, source_length = \
-                    mini_batch(target_char_dict, target_train, source_char_dict, source_train, batch_size, batch_index)
-
-                # TODO: Add char dropout here ...
-                for i, j in np.ndindex(source_batch.shape):
-                    if random.uniform(0, 1) <= 0.0005:
-                        source_batch[i][j] = 0
-
-                feed_dict = {t_encoder_lengths_ph: source_length, t_encoder_char_ids_ph: source_batch,
-                             t_decoder_lengths: target_length, t_decoder_char_ids_ph: target_batch}
-
-                t1, dec1 = train_sess.run([t_adam_optimize, t_dec_out], feed_dict)
-                dec2 = train_sess.run([t_dec_out], feed_dict)
-                tv=1
-
-            # Save train model, and restore it into the eval session
-            checkpoint_path = train_saver.save(train_sess, checkpoints_path, global_step=epoch)
-            eval_saver.restore(eval_sess, checkpoint_path)
-
-        with eval_graph.as_default():
-            count_correct = 0
-            for batch_index in range(floor(len(source_dev) / batch_size)):
-                target_batch, target_length, source_batch, source_length = \
-                    mini_batch(target_char_dict, target_dev, source_char_dict, source_dev, batch_size, batch_index)
-
-                begin = batch_index
-                end = min(batch_index + batch_size, len(source_dev))
-                target_strings = target_dev[begin:end]
-
-                feed_dict = {e_encoder_lengths_ph: source_length, e_encoder_char_ids_ph: source_batch}
-                result = eval_sess.run(e_dec_out, feed_dict)
-
-                decoded_dates = []
-
-                for coded_date in result:
-                    date = ""
-                    for char_id in coded_date:
-                        if not char_id == len(target_char_dict) - 1:
-                            date = date + (target_dict_rev[char_id])
-                    decoded_dates.append(date)
-
-                for i in range(len(target_strings)):
-                    if target_strings[i] == decoded_dates[i]:
-                        count_correct = count_correct + 1
-
-            print("Dev: " + str(count_correct / len(target_dev)))
-
-    with TemporaryDirectory() as temp_dir:
-
-        temp_model_dir = temp_dir + "/model"
-
-
-        with eval_graph.as_default():
-            builder = tf.saved_model.builder.SavedModelBuilder(temp_model_dir)
-            builder.add_meta_graph_and_variables(eval_sess, [tf.saved_model.tag_constants.SERVING])
-            builder.save()
-
-        write_mapping(source_char_dict, temp_model_dir + '/source_char_dict.txt')
-        write_mapping(target_char_dict, temp_model_dir + '/target_char_dict.txt')
-
-        zipf = zipfile.ZipFile("normalizer.zip", 'w', zipfile.ZIP_DEFLATED)
-
-        for root, dirs, files in os.walk(temp_model_dir):
-            for file in files:
-                modelFile = os.path.join(root, file)
-                zipf.write(modelFile, arcname=os.path.relpath(modelFile, temp_model_dir))
-
-if __name__ == "__main__":
-    main()
diff --git a/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/FeedDictionaryTest.java b/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/FeedDictionaryTest.java
deleted file mode 100644
index edd9843..0000000
--- a/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/FeedDictionaryTest.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.opennlp.namefinder;
-
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
-import java.util.zip.GZIPInputStream;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
-
-class FeedDictionaryTest {
-
-  private static WordIndexer indexer;
-
-  @BeforeAll
-  static void beforeClass() {
-    try (InputStream words = new GZIPInputStream(FeedDictionaryTest.class.getResourceAsStream("/words.txt.gz"));
-         InputStream chars = new GZIPInputStream(FeedDictionaryTest.class.getResourceAsStream("/chars.txt.gz"))) {
-
-      indexer = new WordIndexer(words, chars);
-    } catch (Exception ex) {
-      indexer = null;
-    }
-    assertNotNull(indexer);
-  }
-
-  @Test
-  void testToTokenIds() {
-    String text1 = "Stormy Cars ' friend says she also plans to sue Michael Cohen .";
-    TokenIds oneSentence = indexer.toTokenIds(text1.split("\\s+"));
-    assertNotNull(oneSentence);
-    assertEquals(13, oneSentence.getWordIds()[0].length, "Expect 13 tokenIds");
-
-    String[] text2 = new String[] {"I wish I was born in Copenhagen Denmark",
-        "Donald Trump died on his way to Tivoli Gardens in Denmark ."};
-    List<String[]> collect = Arrays.stream(text2).map(s -> s.split("\\s+")).toList();
-    TokenIds twoSentences = indexer.toTokenIds(collect.toArray(new String[2][]));
-    assertNotNull(twoSentences);
-    assertEquals(8, twoSentences.getWordIds()[0].length, "Expect 8 tokenIds");
-    assertEquals(12, twoSentences.getWordIds()[1].length, "Expect 12 tokenIds");
-  }
-}
diff --git a/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/PredictTest.java b/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/PredictTest.java
deleted file mode 100644
index 4501630..0000000
--- a/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/PredictTest.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.opennlp.namefinder;
-
-import java.io.IOException;
-import java.nio.file.Path;
-
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-
-import opennlp.tools.util.Span;
-
-class PredictTest {
-
-  // Note: As of Feb 2023, this test won't work on all platforms and, for instance, fails with
-  //  "Cannot find TensorFlow native library for OS: darwin, architecture: aarch64"
-  // That's why it is disabled via the architecture system property.
-  // @DisabledIfSystemProperty(named = "os.arch", matches = "aarch64")
-  @Test
-  @Disabled
-  // TODO This test won't work as the required TF model is missing and needs to be re-trained.
-  //      Further details, see: https://github.com/apache/opennlp-sandbox/pull/89
-  void testFindTokens() throws IOException {
-
-    // can be changed to File or InputStream
-    String words = PredictTest.class.getResource("/words.txt.gz").getPath();
-    String chars = PredictTest.class.getResource("/chars.txt.gz").getPath();
-    String tags = PredictTest.class.getResource("/tags.txt.gz").getPath();
-    // Load model takes a String path!!
-    Path model = Path.of("savedmodel");
-
-    PredictionConfiguration config = new PredictionConfiguration(words, chars, tags, model.toString());
-
-    try (SequenceTagging tagger = new SequenceTagging(config)) {
-      String[] tokens = "Stormy Cars ' friend says she also plans to sue Michael Cohen .".split("\\s+");
-      Span[] pred = tagger.find(tokens);
-
-      for (int i = 0; i < tokens.length; i++) {
-        System.out.print(tokens[i] + "/" + pred[i] + " ");
-      }
-      System.out.println();
-    }
-
-  }
-}
diff --git a/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/WordIndexerTest.java b/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/WordIndexerTest.java
deleted file mode 100644
index 0254612..0000000
--- a/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/WordIndexerTest.java
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.opennlp.namefinder;
-
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
-import java.util.zip.GZIPInputStream;
-
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-
-import static org.junit.jupiter.api.Assertions.assertArrayEquals;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
-
-class WordIndexerTest {
-
-  private static WordIndexer indexer;
-
-  @BeforeAll
-  static void beforeClass() {
-    try (InputStream words = new GZIPInputStream(WordIndexerTest.class.getResourceAsStream("/words.txt.gz"));
-         InputStream chars = new GZIPInputStream(WordIndexerTest.class.getResourceAsStream("/chars.txt.gz"))) {
-      indexer = new WordIndexer(words, chars);
-    } catch (Exception ex) {
-      indexer = null;
-    }
-    assertNotNull(indexer);
-  }
-
-  @Test
-  void testToTokenIdsWithOneSentence() {
-    String text = "Stormy Cars ' friend says she also plans to sue Michael Cohen .";
-
-    TokenIds ids = indexer.toTokenIds(text.split("\\s+"));
-    assertEquals(13, ids.getWordIds()[0].length, "Expect 13 tokenIds");
-
-    assertArrayEquals(new int[] {7, 30, 34, 80, 42, 3}, ids.getCharIds()[0][0]);
-    assertArrayEquals(new int[] {51, 41, 80, 54}, ids.getCharIds()[0][1]);
-    assertArrayEquals(new int[] {64}, ids.getCharIds()[0][2]);
-    assertArrayEquals(new int[] {47, 80, 82, 83, 31, 23}, ids.getCharIds()[0][3]);
-    assertArrayEquals(new int[] {54, 41, 3, 54}, ids.getCharIds()[0][4]);
-    assertArrayEquals(new int[] {54, 76, 83}, ids.getCharIds()[0][5]);
-    assertArrayEquals(new int[] {41, 55, 54, 34}, ids.getCharIds()[0][6]);
-    assertArrayEquals(new int[] {46, 55, 41, 31, 54}, ids.getCharIds()[0][7]);
-    assertArrayEquals(new int[] {30, 34}, ids.getCharIds()[0][8]);
-    assertArrayEquals(new int[] {54, 50, 83}, ids.getCharIds()[0][9]);
-    assertArrayEquals(new int[] {39, 82, 20, 76, 41, 83, 55}, ids.getCharIds()[0][10]);
-    assertArrayEquals(new int[] {51, 34, 76, 83, 31}, ids.getCharIds()[0][11]);
-    assertArrayEquals(new int[] {65}, ids.getCharIds()[0][12]);
-
-    // TODO investigate why the 3 commented checks are different: Different data / assertions?
-    assertEquals(2720, ids.getWordIds()[0][0]);
-    // assertEquals(15275,ids.getWordIds()[0][1]);
-    assertEquals(3256, ids.getWordIds()[0][2]);
-    assertEquals(11348, ids.getWordIds()[0][3]);
-    assertEquals(21054, ids.getWordIds()[0][4]);
-    assertEquals(18337, ids.getWordIds()[0][5]);
-    assertEquals(7885, ids.getWordIds()[0][6]);
-    assertEquals(7697, ids.getWordIds()[0][7]);
-    assertEquals(16601, ids.getWordIds()[0][8]);
-    assertEquals(2720, ids.getWordIds()[0][9]);
-    // assertEquals(17408, ids.getWordIds()[0][10]);
-    // assertEquals(11541, ids.getWordIds()[0][11]);
-    assertEquals(2684, ids.getWordIds()[0][12]);
-
-  }
-
-  @Test
-  void testToTokenIdsWithTwoSentences() {
-
-    String[] text = new String[] {"I wish I was born in Copenhagen Denmark",
-        "Donald Trump died on his way to Tivoli Gardens in Denmark ."};
-
-    List<String[]> collect = Arrays.stream(text).map(s -> s.split("\\s+")).toList();
-
-    TokenIds ids = indexer.toTokenIds(collect.toArray(new String[2][]));
-
-    assertEquals(8, ids.getWordIds()[0].length);
-    assertEquals(12, ids.getWordIds()[1].length);
-
-    assertArrayEquals(new int[] {4}, ids.getCharIds()[0][0]);
-    assertArrayEquals(new int[] {6, 82, 54, 76}, ids.getCharIds()[0][1]);
-    assertArrayEquals(new int[] {4}, ids.getCharIds()[0][2]);
-    assertArrayEquals(new int[] {6, 41, 54}, ids.getCharIds()[0][3]);
-    assertArrayEquals(new int[] {59, 34, 80, 31}, ids.getCharIds()[0][4]);
-    assertArrayEquals(new int[] {82, 31}, ids.getCharIds()[0][5]);
-    assertArrayEquals(new int[] {51, 34, 46, 83, 31, 76, 41, 28, 83, 31}, ids.getCharIds()[0][6]);
-    assertArrayEquals(new int[] {36, 83, 31, 42, 41, 80, 49}, ids.getCharIds()[0][7]);
-
-    assertArrayEquals(new int[] {36, 34, 31, 41, 55, 23}, ids.getCharIds()[1][0]);
-    assertArrayEquals(new int[] {52, 80, 50, 42, 46}, ids.getCharIds()[1][1]);
-    assertArrayEquals(new int[] {23, 82, 83, 23}, ids.getCharIds()[1][2]);
-    assertArrayEquals(new int[] {34, 31}, ids.getCharIds()[1][3]);
-    assertArrayEquals(new int[] {76, 82, 54}, ids.getCharIds()[1][4]);
-    assertArrayEquals(new int[] {6, 41, 3}, ids.getCharIds()[1][5]);
-    assertArrayEquals(new int[] {30, 34}, ids.getCharIds()[1][6]);
-    assertArrayEquals(new int[] {52, 82, 11, 34, 55, 82}, ids.getCharIds()[1][7]);
-    assertArrayEquals(new int[] {74, 41, 80, 23, 83, 31, 54}, ids.getCharIds()[1][8]);
-    assertArrayEquals(new int[] {82, 31}, ids.getCharIds()[1][9]);
-    assertArrayEquals(new int[] {36, 83, 31, 42, 41, 80, 49}, ids.getCharIds()[1][10]);
-    assertArrayEquals(new int[] {65}, ids.getCharIds()[1][11]);
-
-    // TODO investigate why the 6 commented checks are different: Different data / assertions?
-    // assertEquals(21931, ids.getWordIds()[0][0]);
-    assertEquals(20473, ids.getWordIds()[0][1]);
-    // assertEquals(21931, ids.getWordIds()[0][2]);
-    assertEquals(5477, ids.getWordIds()[0][3]);
-    assertEquals(11538, ids.getWordIds()[0][4]);
-    assertEquals(21341, ids.getWordIds()[0][5]);
-    // assertEquals(14024, ids.getWordIds()[0][6]);
-    // assertEquals(7420, ids.getWordIds()[0][7]);
-
-    // assertEquals(12492, ids.getWordIds()[1][0]);
-    assertEquals(2720, ids.getWordIds()[1][1]);
-    assertEquals(9476, ids.getWordIds()[1][2]);
-    assertEquals(16537, ids.getWordIds()[1][3]);
-    assertEquals(18966, ids.getWordIds()[1][4]);
-    assertEquals(21088, ids.getWordIds()[1][5]);
-    assertEquals(16601, ids.getWordIds()[1][6]);
-    assertEquals(2720, ids.getWordIds()[1][7]);
-    assertEquals(2720, ids.getWordIds()[1][8]);
-    assertEquals(21341, ids.getWordIds()[1][9]);
-    // assertEquals(7420, ids.getWordIds()[1][10]);
-    assertEquals(2684, ids.getWordIds()[1][11]);
-  }
-
-}
diff --git a/tf-ner-poc/src/test/resources/chars.txt.gz b/tf-ner-poc/src/test/resources/chars.txt.gz
deleted file mode 100644
index c31b81a..0000000
Binary files a/tf-ner-poc/src/test/resources/chars.txt.gz and /dev/null differ
diff --git a/tf-ner-poc/src/test/resources/tags.txt.gz b/tf-ner-poc/src/test/resources/tags.txt.gz
deleted file mode 100644
index 0f0ceda..0000000
Binary files a/tf-ner-poc/src/test/resources/tags.txt.gz and /dev/null differ
diff --git a/tf-ner-poc/src/test/resources/words.txt.gz b/tf-ner-poc/src/test/resources/words.txt.gz
deleted file mode 100644
index 5f55ec0..0000000
Binary files a/tf-ner-poc/src/test/resources/words.txt.gz and /dev/null differ

(opennlp-sandbox) 01/01: Drop tf-ner-poc component - deletes the tf-ner-poc component entirely - reasons: TensorFlow 1.15 is EOL, no Java migration path to TF 2.x

Reply via email to