This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch cleanup/drop-tf-ner-poc in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit dd0c5d71b0e18b7e53dca4ab0d4ee14accd92b9a Author: Martin Wiesner <[email protected]> AuthorDate: Fri Mar 20 22:21:14 2026 +0100 Drop tf-ner-poc component - deletes the tf-ner-poc component entirely - reasons: TensorFlow 1.15 is EOL, no Java migration path to TF 2.x --- README.md | 1 - pom.xml | 1 - tf-ner-poc/pom.xml | 81 ---- .../main/java/org/apache/opennlp/ModelUtil.java | 50 --- .../apache/opennlp/namecat/NameCategorizer.java | 106 ----- .../apache/opennlp/namefinder/FeedDictionary.java | 148 ------- .../org/apache/opennlp/namefinder/IndexTagger.java | 57 --- .../namefinder/PredictionConfiguration.java | 57 --- .../apache/opennlp/namefinder/SequenceTagging.java | 133 ------ .../org/apache/opennlp/namefinder/TokenIds.java | 37 -- .../org/apache/opennlp/namefinder/Viterbi.java | 157 ------- .../org/apache/opennlp/namefinder/WordIndexer.java | 171 ------- .../org/apache/opennlp/normalizer/Normalizer.java | 147 ------ tf-ner-poc/src/main/python/doccat/doccat.py | 217 --------- tf-ner-poc/src/main/python/namecat/namecat.py | 238 ---------- .../src/main/python/namefinder/namefinder.py | 493 --------------------- tf-ner-poc/src/main/python/namefinder/split.py | 61 --- .../src/main/python/normalizer/date_generator.py | 86 ---- .../src/main/python/normalizer/normalizer.py | 322 -------------- .../opennlp/namefinder/FeedDictionaryTest.java | 64 --- .../org/apache/opennlp/namefinder/PredictTest.java | 62 --- .../apache/opennlp/namefinder/WordIndexerTest.java | 146 ------ tf-ner-poc/src/test/resources/chars.txt.gz | Bin 152 -> 0 bytes tf-ner-poc/src/test/resources/tags.txt.gz | Bin 64 -> 0 bytes tf-ner-poc/src/test/resources/words.txt.gz | Bin 89404 -> 0 bytes 25 files changed, 2835 deletions(-) diff --git a/README.md b/README.md index 107fda3..dc9d61a 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,6 @@ Currently, the library has different components: * `opennlp-wsd`: A set of components that allow for word sense disambiguation. * `summarizer`: A set of classes providing text summarization. * `tagging-server`: A RESTful webservice to allow for NER, POS tagging, sentence detection and tokenization. -* `tf-ner-poc`: An adapter component for [Tensorflow](https://www.tensorflow.org), in an early proof-of-concept (poc) stage. * `wikinews-importer`: A set of classes to process and annotate text formatted in [MediaWiki markup](https://www.mediawiki.org/wiki/Help:Formatting). ## Getting Started diff --git a/pom.xml b/pom.xml index 90f6c33..e078240 100644 --- a/pom.xml +++ b/pom.xml @@ -108,7 +108,6 @@ <module>opennlp-grpc</module> <module>opennlp-similarity</module> <module>opennlp-wsd</module> - <module>tf-ner-poc</module> <module>summarizer</module> <module>tagging-server</module> <module>wikinews-importer</module> diff --git a/tf-ner-poc/pom.xml b/tf-ner-poc/pom.xml deleted file mode 100644 index 72ad8b1..0000000 --- a/tf-ner-poc/pom.xml +++ /dev/null @@ -1,81 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<project xmlns="http://maven.apache.org/POM/4.0.0" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <groupId>org.apache.opennlp</groupId> - <artifactId>opennlp-sandbox</artifactId> - <version>3.0.0-SNAPSHOT</version> - </parent> - - <artifactId>tf-ner-poc</artifactId> - <name>Apache OpenNLP TF NER poc</name> - - <properties> - <tensorflow.version>1.15.0</tensorflow.version> - </properties> - - <dependencies> - <dependency> - <groupId>org.tensorflow</groupId> - <artifactId>tensorflow</artifactId> - <version>${tensorflow.version}</version> - </dependency> - - <dependency> - <groupId>org.apache.opennlp</groupId> - <artifactId>opennlp-runtime</artifactId> - </dependency> - - <dependency> - <groupId>org.junit.jupiter</groupId> - <artifactId>junit-jupiter-api</artifactId> - </dependency> - - <dependency> - <groupId>org.junit.jupiter</groupId> - <artifactId>junit-jupiter-engine</artifactId> - </dependency> - - <dependency> - <groupId>org.junit.jupiter</groupId> - <artifactId>junit-jupiter-params</artifactId> - </dependency> - </dependencies> - - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-compiler-plugin</artifactId> - <configuration> - <source>${maven.compiler.source}</source> - <target>${maven.compiler.target}</target> - </configuration> - </plugin> - - </plugins> - </build> - -</project> diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/ModelUtil.java b/tf-ner-poc/src/main/java/org/apache/opennlp/ModelUtil.java deleted file mode 100644 index 1f5b2d2..0000000 --- a/tf-ner-poc/src/main/java/org/apache/opennlp/ModelUtil.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; - -public class ModelUtil { - - public static Path writeModelToTmpDir(InputStream modelIn) throws IOException { - Path tmpDir = Files.createTempDirectory("opennlp2"); - - try (ZipInputStream zis = new ZipInputStream(modelIn)) { - ZipEntry zipEntry = zis.getNextEntry(); - while(zipEntry != null){ - Path newFile = tmpDir.resolve(zipEntry.getName()); - - Files.createDirectories(newFile.getParent()); - Files.copy(zis, newFile); - - // TODO: How to delete the tmp directory after we are done loading from it ?! - newFile.toFile().deleteOnExit(); - - zipEntry = zis.getNextEntry(); - } - zis.closeEntry(); - } - - return tmpDir; - } -} diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namecat/NameCategorizer.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namecat/NameCategorizer.java deleted file mode 100644 index de0215a..0000000 --- a/tf-ner-poc/src/main/java/org/apache/opennlp/namecat/NameCategorizer.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp.namecat; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.opennlp.ModelUtil; -import org.tensorflow.SavedModelBundle; -import org.tensorflow.Session; -import org.tensorflow.Tensor; - -public class NameCategorizer { - - private final Session session; - private final Map<Character, Integer> charMap = new HashMap<>(); - private final Map<Integer, String> labelMap; - - public NameCategorizer(InputStream modelZipPackage) throws IOException { - - Path tmpModelPath = ModelUtil.writeModelToTmpDir(modelZipPackage); - - try (BufferedReader in = Files.newBufferedReader( - tmpModelPath.resolve("char_dict.txt"), StandardCharsets.UTF_8)) { - in.lines().forEach(ch -> charMap.put(ch.charAt(0), charMap.size())); - } - - labelMap = new HashMap<>(); - try (BufferedReader in = Files.newBufferedReader( - tmpModelPath.resolve("label_dict.txt"), StandardCharsets.UTF_8)) { - in.lines().forEach(label -> labelMap.put(labelMap.size(), label)); - } - - SavedModelBundle model = SavedModelBundle.load(tmpModelPath.toString(), "serve"); - session = model.session(); - } - - private static int argmax(float[] x) { - if (x == null || x.length == 0) { - throw new IllegalArgumentException("Vector x is null or empty"); - } - - int maxIdx = 0; - for (int i = 1; i < x.length; i++) { - if (x[maxIdx] < x[i]) - maxIdx = i; - } - return maxIdx; - } - - public String[] categorize(String[] names) { - if (names.length == 0) { - return new String[0]; - } - - int maxLength = Arrays.stream(names).mapToInt(String::length).max().getAsInt(); - - int[][] charIds = new int[names.length][maxLength]; - int[] nameLengths = new int[names.length]; - - for (int nameIndex = 0; nameIndex < names.length; nameIndex++) { - for (int charIndex = 0; charIndex < names[nameIndex].length(); charIndex++) { - charIds[nameIndex][charIndex] = charMap.get(names[nameIndex].charAt(charIndex)); - } - nameLengths[nameIndex] = names[nameIndex].length(); - } - - try (Tensor<?> dropout = Tensor.create(1f, Float.class); - Tensor<?> charTensor = Tensor.create(charIds); - Tensor<?> nameLength = Tensor.create(nameLengths)) { - List<Tensor<?>> result = session.runner() - .feed("dropout_keep_prop", dropout) - .feed("char_ids", charTensor) - .feed("name_lengths", nameLength) - .fetch("norm_probs", 0).run(); - - try (Tensor<?> probTensor = result.get(0)) { - float[][] probs = probTensor.copyTo(new float[names.length][labelMap.size()]); - return Arrays.stream(probs).map(prob -> labelMap.get(argmax(prob))).toArray(String[]::new); - } - } - } -} diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/FeedDictionary.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/FeedDictionary.java deleted file mode 100644 index c7b7234..0000000 --- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/FeedDictionary.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp.namefinder; - -import java.util.Arrays; - -import org.tensorflow.Tensor; - -public class FeedDictionary implements AutoCloseable { - - static final int PAD_VALUE = 0; - - private final Tensor<Float> dropoutTensor; - private final Tensor<Integer> charIdsTensor; - private final Tensor<Integer> wordLengthsTensor; - private final Tensor<Integer> wordIdsTensor; - private final int[] sentenceLengths; - private final Tensor<Integer> sentenceLengthsTensor; - private final int maxSentenceLength; - private final int maxCharLength; - private final int numberOfSentences; - - public int[] getSentenceLengths() { - return sentenceLengths; - } - - public int getMaxSentenceLength() { - return maxSentenceLength; - } - - public int getNumberOfSentences() { - return numberOfSentences; - } - - public Tensor<Float> getDropoutTensor() { - return dropoutTensor; - } - - public Tensor<Integer> getCharIdsTensor() { - return charIdsTensor; - } - - public Tensor<Integer> getSentenceLengthsTensor() { - return sentenceLengthsTensor; - } - - public Tensor<Integer> getWordLengthsTensor() { - return wordLengthsTensor; - } - - public Tensor<Integer> getWordIdsTensor() { - return wordIdsTensor; - } - - private FeedDictionary(final float dropout, final int[][][] charIds, - final int[][] wordLengths, final int[][] wordIds, - final int[] sentenceLengths, final int maxSentenceLength, - final int maxCharLength, final int numberOfSentences) { - - dropoutTensor = Tensor.create(dropout, Float.class); - charIdsTensor = Tensor.create(charIds, Integer.class); - wordLengthsTensor = Tensor.create(wordLengths, Integer.class); - wordIdsTensor = Tensor.create(wordIds, Integer.class); - this.sentenceLengths = sentenceLengths; - sentenceLengthsTensor = Tensor.create(sentenceLengths, Integer.class); - this.maxSentenceLength = maxSentenceLength; - this.maxCharLength = maxCharLength; - this.numberOfSentences = numberOfSentences; - - } - - @Override - public void close() { - dropoutTensor.close(); - charIdsTensor.close(); - wordLengthsTensor.close(); - wordIdsTensor.close(); - sentenceLengthsTensor.close(); - } - - // multi sentences - public static FeedDictionary create(TokenIds sentences) { - - int numberOfSentences = sentences.getWordIds().length; - - int[][][] charIds = new int[numberOfSentences][][]; - int[][] wordLengths = new int[numberOfSentences][]; - - int maxSentenceLength = Arrays.stream(sentences.getWordIds()).map(s -> s.length).reduce(Integer::max).get(); - Padded paddedSentences = padArrays(sentences.getWordIds(), maxSentenceLength); - int[][] wordIds = paddedSentences.ids; - int[] sentenceLengths = paddedSentences.lengths; - - int maxCharLength = Arrays.stream(sentences.getCharIds()).flatMap(s -> Arrays.stream(s).map(c -> c.length)).reduce(Integer::max).get(); - for (int i=0; i < numberOfSentences; i++) { - Padded paddedWords = padArrays(sentences.getCharIds()[i], maxCharLength); - charIds[i] = paddedWords.ids; - wordLengths[i] = paddedWords.lengths; - } - - return new FeedDictionary(1.0f, charIds, wordLengths, wordIds, sentenceLengths, maxSentenceLength, maxCharLength, numberOfSentences); - - } - - private static Padded padArrays(int[][] ids, int length) { - - int[][] paddedIds = new int[ids.length][length]; - int[] lengths = new int[ids.length]; - - for (int i = 0; i < ids.length; i++) { - int[] src = ids[i]; - int[] dest = new int[length]; - System.arraycopy(src, 0, dest, 0, src.length); - if (src.length < length) - Arrays.fill(dest, src.length, length, PAD_VALUE); - paddedIds[i] = dest; - lengths[i] = src.length; - } - - return new Padded(paddedIds, lengths); - - } - - private static class Padded { - private final int[][] ids; - private final int[] lengths; - - Padded(int[][] ids, int[] lengths) { - this.ids = ids; - this.lengths = lengths; - } - } -} diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/IndexTagger.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/IndexTagger.java deleted file mode 100644 index dfa451f..0000000 --- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/IndexTagger.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp.namefinder; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -public class IndexTagger { - - private final Map<Integer, String> idx2Tag = new HashMap<>(); - - public IndexTagger(InputStream vocabTags) throws IOException { - try(BufferedReader in = new BufferedReader( - new InputStreamReader(vocabTags, StandardCharsets.UTF_8))) { - String tag; - int idx = 0; - while ((tag = in.readLine()) != null) { - idx2Tag.put(idx, tag); - idx += 1; - } - } - } - - public String getTag(Integer idx) { - return idx2Tag.get(idx); - } - - public Map<Integer, String> getIdx2Tag() { - return Collections.unmodifiableMap(idx2Tag); - } - - public int getNumberOfTags() { - return idx2Tag.size(); - } - -} diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/PredictionConfiguration.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/PredictionConfiguration.java deleted file mode 100644 index 30d18d9..0000000 --- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/PredictionConfiguration.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp.namefinder; - -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; - -public class PredictionConfiguration { - - private final String vocabWords; - private final String vocabChars; - private final String vocabTags; - private final String savedModel; - - public PredictionConfiguration(String vocabWords, String vocabChars, String vocabTags, String savedModel) { - this.vocabWords = vocabWords; - this.vocabChars = vocabChars; - this.vocabTags = vocabTags; - this.savedModel = savedModel; - } - - public String getVocabWords() { - return vocabWords; - } - - public String getVocabChars() { - return vocabChars; - } - - public String getVocabTags() { - return vocabTags; - } - - public String getSavedModel() { - return savedModel; - } - - public InputStream getVocabWordsInputStream() throws IOException{ - return new FileInputStream(getVocabWords()); - } -} diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/SequenceTagging.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/SequenceTagging.java deleted file mode 100644 index 9d33b56..0000000 --- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/SequenceTagging.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp.namefinder; - -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.List; - -import org.apache.opennlp.ModelUtil; -import org.tensorflow.SavedModelBundle; -import org.tensorflow.Session; -import org.tensorflow.Tensor; - -import opennlp.tools.namefind.BioCodec; -import opennlp.tools.namefind.TokenNameFinder; -import opennlp.tools.util.Span; - -public class SequenceTagging implements TokenNameFinder, AutoCloseable { - private final SavedModelBundle model; - private final Session session; - private final WordIndexer wordIndexer; - private final IndexTagger indexTagger; - - public SequenceTagging(PredictionConfiguration config) throws IOException { - model = SavedModelBundle.load(config.getSavedModel(), "serve"); - session = model.session(); - - this.wordIndexer = new WordIndexer(new FileInputStream(config.getVocabWords()), - new FileInputStream(config.getVocabChars())); - - this.indexTagger = new IndexTagger((new FileInputStream(config.getVocabTags()))); - } - - public SequenceTagging(InputStream modelZipPackage) throws IOException { - - Path tmpDir = ModelUtil.writeModelToTmpDir(modelZipPackage); - - try (InputStream wordsIn = Files.newInputStream(tmpDir.resolve("word_dict.txt")); - InputStream charsIn = Files.newInputStream(tmpDir.resolve("char_dict.txt"))) { - wordIndexer = new WordIndexer(wordsIn, charsIn); - } - - try (InputStream in = Files.newInputStream(tmpDir.resolve("label_dict.txt"))) { - indexTagger = new IndexTagger(in); - } - - model = SavedModelBundle.load(tmpDir.toString(), "serve"); - session = model.session(); - } - - @Override - public Span[] find(String[] sentence) { - if (sentence.length > 0) { - TokenIds tokenIds = wordIndexer.toTokenIds(sentence); - return new BioCodec().decode(Arrays.asList(predict(tokenIds)[0])); - } - else { - return new Span[0]; - } - } - - public String[][] predict(String[][] sentences) { - TokenIds tokenIds = wordIndexer.toTokenIds(sentences); - return predict(tokenIds); - } - - private String[][] predict(TokenIds tokenIds) { - - try (FeedDictionary fd = FeedDictionary.create(tokenIds)) { - - List<Tensor<?>> run = session.runner() - .feed("chars/char_ids:0", fd.getCharIdsTensor()) - .feed("dropout_keep_prop:0", fd.getDropoutTensor()) - .feed("words/sequence_lengths:0", fd.getSentenceLengthsTensor()) - .feed("words/word_ids:0", fd.getWordIdsTensor()) - .feed("chars/word_lengths:0", fd.getWordLengthsTensor()) - .fetch("logits", 0) - .fetch("trans_params", 0).run(); - - float[][][] logits = new float[fd.getNumberOfSentences()][fd.getMaxSentenceLength()][indexTagger.getNumberOfTags()]; - run.get(0).copyTo(logits); - - float[][] trans_params = new float[indexTagger.getNumberOfTags()][indexTagger.getNumberOfTags()]; - run.get(1).copyTo(trans_params); - - String[][] returnValue = new String[fd.getNumberOfSentences()][]; - for (int i = 0; i < logits.length; i++) { - float[][] logit = Arrays.copyOf(logits[i], fd.getSentenceLengths()[i]); - returnValue[i] = Viterbi.decode(logit, trans_params).stream().map(indexTagger::getTag).toArray(String[]::new); - } - - for (int i = 0; i < returnValue[0].length; i++) { - if (returnValue[0][i] == null) { - returnValue[0][i] = "other"; - } - } - - for (Tensor<?> t : run) { - t.close(); - } - - return returnValue; - } - } - - @Override - public void clearAdaptiveData() { - } - - @Override - public void close() { - session.close(); - } -} diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/TokenIds.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/TokenIds.java deleted file mode 100644 index 621ab3e..0000000 --- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/TokenIds.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp.namefinder; - -public final class TokenIds { - - private final int[][][] charIds; - private final int[][] wordIds; - - public TokenIds(int[][][] charIds, int[][] wordIds) { - this.charIds = charIds; - this.wordIds = wordIds; - } - - public int[][][] getCharIds() { - return charIds; - } - - public int[][] getWordIds() { - return wordIds; - } -} diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/Viterbi.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/Viterbi.java deleted file mode 100644 index 254afc5..0000000 --- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/Viterbi.java +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp.namefinder; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -public class Viterbi { - - /* - """Viterbi the highest scoring sequence of tags outside of TensorFlow. - This should only be used at test time. - Args: - score: A [seq_len, num_tags] matrix of unary potentials. - transition_params: A [num_tags, num_tags] matrix of binary potentials. - Returns: - viterbi: A [seq_len] list of integers containing the highest scoring tag - indices. - viterbi_score: A float containing the score for the Viterbi sequence. - """ - */ - - private static float[][] zeros_like(float[][] matrix) { - float[][] returnValue = new float[matrix.length][matrix[0].length]; - for (int i=0; i<matrix.length; i++) - Arrays.fill(returnValue[i], 0.0f); - return returnValue; - } - - private static int[][] zeros_like(int[] shape) { - int[][] returnValue = new int[shape[0]][shape[1]]; - for (int i=0; i<shape[0]; i++) - Arrays.fill(returnValue[i], 0); - return returnValue; - } - - private static int[] shape(float[][] var) { - return new int[] {var.length, var[0].length}; - } - - private static float[][] expand_dims_axis_one_plus_array(float[] array, float[][] plus) { - int[] plus_shape = shape(plus); - if (plus_shape[0] != array.length) - throw new RuntimeException("Not same shape"); - float[][] returnValue = new float[plus_shape[0]][plus_shape[1]]; - for (int i=0; i < array.length; i++) { - for (int j=0; j < plus_shape[1]; j++) { - returnValue[i][j] = array[i] + plus[i][j]; - } - } - return returnValue; - } - - private static float[] max_columnwise(float[][] array) { - float[] returnValue = new float[array[0].length]; - for (int col=0; col < array[0].length; col++) { - returnValue[col] = Float.MIN_VALUE; - for (float[] floats : array) { - returnValue[col] = Float.max(returnValue[col], floats[col]); - } - } - - return returnValue; - } - - private static float max(float[] array) { - float returnValue = Float.MIN_VALUE; - for (float v : array) { - returnValue = Float.max(returnValue, v); - } - return returnValue; - } - - private static int[] argmax_columnwise(float[][] array) { - int[] returnValue = new int[array[0].length]; - for (int col=0; col < array[0].length; col++) { - float max = Float.MIN_VALUE; - int idx = -1; - for (int row=0; row < array.length; row++) { - if (Float.compare(array[row][col], max) > 0) { - max = array[row][col]; - idx = row; - } - } - returnValue[col] = idx; - } - return returnValue; - } - - private static int argmax(float[] array) { - int returnValue = -1; - float max = Float.MIN_VALUE; - for (int col=0; col < array.length; col++) { - if (Float.compare(array[col], max) > 0) { - max = array[col]; - returnValue = col; - } - } - return returnValue; - } - - public static float[] plus(float[] a, float[] b) { - if (a.length == b.length) { - float[] returnValue = new float[a.length]; - for(int i = 0; i < a.length; ++i) { - returnValue[i] = Float.sum(a[i], b[i]); - } - return returnValue; - } else { - throw new IllegalArgumentException("Arrays doesn't have same shape."); - } - } - - public static List<Integer> decode(float[][] score, float[][] transition_params) { - - float[][] trellis = zeros_like(score); - int[][] backpointers = zeros_like(shape(score)); - - trellis[0] = score[0]; - - for (int t=1; t < score.length; t++) { - float[][] v = expand_dims_axis_one_plus_array(trellis[t - 1], transition_params); - trellis[t] = plus(score[t], max_columnwise(v)); - backpointers[t] = argmax_columnwise(v); - } - - List<Integer> viterbi = new ArrayList<>(); - viterbi.add(argmax(trellis[trellis.length - 1])); - - for (int i=backpointers.length - 1; i >= 1; i--) { - int[] bp = backpointers[i]; - viterbi.add(bp[viterbi.get(viterbi.size() - 1)]); - } - - Collections.reverse(viterbi); - - return viterbi; - } - -} diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java deleted file mode 100644 index 729ecf1..0000000 --- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp.namefinder; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.regex.Pattern; - -import opennlp.tools.util.StringUtil; - -public class WordIndexer { - - private final Map<Character, Integer> char2idx; - private final Map<String, Integer> word2idx; - - public static final String UNK = "$UNK$"; - public static String NUM = "$NUM$"; - - private final boolean lowerCase = false; - private final boolean allowUnk = true; - - private final Pattern digitPattern = Pattern.compile("\\d+(,\\d+)*(\\.\\d+)?"); - - public WordIndexer(InputStream vocabWords, InputStream vocabChars) throws IOException { - this.word2idx = new HashMap<>(); - this.char2idx = new HashMap<>(); - - readVocabWords(vocabWords); - readVocacChars(vocabChars); - } - - private void readVocacChars(InputStream vocabChars) throws IOException { - try(BufferedReader in = new BufferedReader(new InputStreamReader(vocabChars, StandardCharsets.UTF_8))) { - String ch; - int idx = 0; - while ((ch = in.readLine()) != null) { - char2idx.put(ch.charAt(0), idx); - idx += 1; - } - } - } - - private void readVocabWords(InputStream vocabWords) throws IOException { - try(BufferedReader in = new BufferedReader(new InputStreamReader(vocabWords, StandardCharsets.UTF_8))) { - String word; - int idx = 0; - while ((word = in.readLine()) != null) { - word2idx.put(word, idx); - idx += 1; - } - } - } - - public TokenIds toTokenIds(String[] tokens) { - String[][] sentences = new String[1][]; - sentences[0] = tokens; - return toTokenIds(sentences); - } - - public TokenIds toTokenIds(String[][] sentences) { - int[][][] charIds = new int[sentences.length][][]; - int[][] wordIds = new int[sentences.length][]; - - for (int i = 0; i < sentences.length; i++) { - String[] sentenceWords = sentences[i]; - - int[][] sentcharIds = new int[sentenceWords.length][]; - int[] sentwordIds = new int[sentenceWords.length]; - - for (int j=0; j < sentenceWords.length; j++) { - Ids ids = apply(sentenceWords[j]); - - sentcharIds[j] = Arrays.copyOf(ids.getChars(), ids.getChars().length); - sentwordIds[j] = ids.getWord(); - } - - charIds[i] = sentcharIds; - wordIds[i] = sentwordIds; - } - - return new TokenIds(charIds, wordIds); - } - - - private Ids apply(String word) { - // 0. get chars of words - int[] charIds = new int[word.length()]; - int skipChars = 0; - for (int i = 0; i < word.length(); i++) { - char ch = word.charAt(i); - // ignore chars out of vocabulary - if (char2idx.containsKey(ch)) - charIds[i - skipChars] = char2idx.get(ch); - else - skipChars += 1; - } - - // 1. preprocess word - if (lowerCase) { - word = StringUtil.toLowerCase(word); - } - - // if (digitPattern.matcher(word).find()) - // word = NUM; - - // 2. get id of word - Integer wordId; - if (word2idx.containsKey(word)) { - wordId = word2idx.get(word); - } else { - if (allowUnk) - wordId = word2idx.get(UNK); - else - throw new RuntimeException("Unknown word '" + word + "' is not allowed."); - } - - // 3. return tuple char ids, word id - Ids tokenIds = new Ids(); - if (skipChars > 0) { - tokenIds.setChars(Arrays.copyOf(charIds, charIds.length - skipChars)); - } else { - tokenIds.setChars(charIds); - } - tokenIds.setWord(wordId); - - return tokenIds; - } - - public static class Ids { - - private int[] chars; - private int word; - - public int[] getChars() { - return chars; - } - - public void setChars(int[] chars) { - this.chars = chars; - } - - public int getWord() { - return word; - } - - public void setWord(int word) { - this.word = word; - } - } -} diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java b/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java deleted file mode 100644 index 281a7bf..0000000 --- a/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp.normalizer; - -import java.io.BufferedReader; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -import org.apache.opennlp.ModelUtil; -import org.tensorflow.SavedModelBundle; -import org.tensorflow.Session; -import org.tensorflow.Tensor; - -public class Normalizer { - - private static final char END_MARKER = 3; - - private final Session session; - private final Map<Character, Integer> sourceCharMap; - private final Map<Integer, Character> targetCharMap; - - public Normalizer(InputStream modelZipPackage) throws IOException { - - Path tmpModelPath = ModelUtil.writeModelToTmpDir(modelZipPackage); - try(InputStream sourceCharMapIn = new FileInputStream( - tmpModelPath.resolve("source_char_dict.txt").toFile())) { - sourceCharMap = loadCharMap(sourceCharMapIn).entrySet().stream() - .collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey)); - } - - try(InputStream targetCharMapIn = new FileInputStream( - tmpModelPath.resolve("target_char_dict.txt").toFile())) { - targetCharMap = loadCharMap(targetCharMapIn); - } - - try (SavedModelBundle model = SavedModelBundle.load(tmpModelPath.toString(), "serve")) { - session = model.session(); - } - } - - private static Map<Integer, Character> loadCharMap(InputStream in) throws IOException { - try(BufferedReader reader = new BufferedReader( - new InputStreamReader(in, StandardCharsets.UTF_8))) { - Map<Integer, Character> characterMap = new HashMap<>(); - - String tag; - while ((tag = reader.readLine()) != null) { - characterMap.put(characterMap.size(), tag.charAt(0)); - } - - return Collections.unmodifiableMap(characterMap); - } - } - - public String[] normalize(String[] texts) { - - if (texts.length == 0) { - return new String[0]; - } - - int[] textLengths = Arrays.stream(texts).mapToInt(String::length).toArray(); - int maxLength = Arrays.stream(textLengths).max().getAsInt(); - - int[][] charIds = new int[texts.length][maxLength]; - - for (int textIndex = 0; textIndex < texts.length; textIndex++) { - for (int charIndex = 0; charIndex < texts[textIndex].length(); charIndex++) { - charIds[textIndex][charIndex] = - sourceCharMap.getOrDefault(texts[textIndex].charAt(charIndex), 0); - } - - textLengths[textIndex] = texts[textIndex].length(); - } - - try (Tensor<?> charTensor = Tensor.create(charIds); - Tensor<?> textLength = Tensor.create(textLengths); - Tensor<?> batchSize = Tensor.create(texts.length)) { - - List<Tensor<?>> result = session.runner() - .feed("encoder_char_ids", charTensor) - .feed("encoder_lengths", textLength) - .feed("batch_size", batchSize) - .fetch("decode", 0).run(); - - try (Tensor<?> translationTensor = result.get(0)) { - int[][] translations = - translationTensor.copyTo(new int[texts.length][(int) translationTensor.shape()[1]]); - - List<String> normalizedTexts = new ArrayList<>(); - - for (int[] translation : translations) { - StringBuilder normalizedText = new StringBuilder(); - for (int i : translation) { - normalizedText.append(targetCharMap.get(i)); - } - - // Remove the end marker from the translated string - for (int ci = normalizedText.length() - 1; ci >= 0; ci--) { - if (END_MARKER == normalizedText.charAt(ci)) { - normalizedText.setLength(ci); - } - } - - normalizedTexts.add(normalizedText.toString()); - } - - return normalizedTexts.toArray(new String[0]); - } - } - } - - public static void main(String[] args) throws Exception { - Normalizer normalizer = new Normalizer(new FileInputStream("python/normalizer/normalizer.zip")); - - String[] result = normalizer.normalize(new String[] { - "18 Mars 2012" - }); - - System.out.println(result[0]); - } -} diff --git a/tf-ner-poc/src/main/python/doccat/doccat.py b/tf-ner-poc/src/main/python/doccat/doccat.py deleted file mode 100644 index ef55f94..0000000 --- a/tf-ner-poc/src/main/python/doccat/doccat.py +++ /dev/null @@ -1,217 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import re -import tensorflow as tf -import sys -from util import load_glove -from util import write_mapping -from math import floor -import random -import numpy as np - -class Doccat: - def __init__(self, vector_size=100): - self.__vector_size = vector_size - - def load_data(self, file): - with open(file, encoding="utf-8") as f: - labels = [] - docs = [] - for line in f: - parts = re.split(r'\t+', line) - labels.append(parts[0].strip()) - docs.append(parts[1].strip()) - return labels, docs - - def create_placeholders(self): - - dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prop") - - # shape is batch_size, and number of tokens - token_ids_ph = tf.placeholder(tf.int32, shape=[None, None], name="token_ids") - - # shape is batch_size - token_lengths_ph = tf.placeholder(tf.int32, shape=[None], name="token_lengths") - - # shape is batch_size - y_ph = tf.placeholder(tf.int32, shape=[None], name="y") - - return dropout_keep_prob, token_ids_ph, token_lengths_ph, y_ph - - def create_graph(self, dropout_keep_prob, token_ids_ph, name_lengths_ph, y_ph, embedding_dict, nclasses): - - - - # This is a hack to make it load an embedding matrix larger than 2GB - # Don't hardcode this 300 - embedding_placeholder = tf.placeholder(dtype=tf.float32, name="embedding_placeholder", - shape=(len(embedding_dict), self.__vector_size)) - embedding_matrix = tf.Variable(embedding_placeholder, dtype=tf.float32, trainable=False, name="glove_embeddings") - - token_embeddings = tf.nn.embedding_lookup(embedding_matrix, token_ids_ph) - - - char_hidden_size = 256 - cell_fw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True) - cell_bw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True) - - _, ((_, output_fw), (_, output_bw)) = tf.nn.bidirectional_dynamic_rnn(cell_fw, - cell_bw, - token_embeddings, - sequence_length=name_lengths_ph, - dtype=tf.float32) - - output = tf.concat([output_fw, output_bw], axis=-1) - - output = tf.nn.dropout(output, dropout_keep_prob) - - W = tf.get_variable("W", shape=[2*char_hidden_size, nclasses]) - b = tf.get_variable("b", shape=[nclasses]) - logits = tf.nn.xw_plus_b(output, W, b, name="logits") - - # softmax ... - probs = tf.exp(logits) - norm_probs = tf.identity(probs / tf.reduce_sum(probs, 1, keepdims=True), name="norm_probs") - - loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y_ph) - mean_loss = tf.reduce_mean(loss) - - train_op = tf.train.AdamOptimizer().minimize(loss) - #train_op = tf.train.RMSPropOptimizer(learning_rate=0.001).minimize(loss) - - return embedding_placeholder, train_op, norm_probs - - -def encode_doc(word_dict, doc): - encoded_doc = [] - for c in doc: - if c in word_dict: - encoded_doc.append(word_dict[c]) - else: - encoded_doc.append(word_dict["__UNK__"]) - - return encoded_doc - - -def mini_batch(label_dict, word_dict, labels, docs, batch_size, batch_index): - begin = batch_size * batch_index - end = min(batch_size * (batch_index + 1), len(labels)) - - max_length = 0 - for i in range(begin, end): - length = len(docs[i]) - if length > max_length: - max_length = length - - doc_batch = [] - label_batch = [] - doc_length = [] - for i in range(begin, end): - label_batch.append( label_dict[labels[i]]) - doc_batch.append(encode_doc(word_dict, docs[i]) + [0] * max(max_length - len(docs[i]), 0)) - doc_length.append(len(docs[i])) - - return label_batch, np.asarray(doc_batch), doc_length - -def main(): - - if len(sys.argv) != 5: - print("Usage doccat.py embedding_file train_file dev_file test_file") - return - - doccat = Doccat(100) - - labels_train, docs_train = doccat.load_data(sys.argv[2]) - labels_dev, docs_dev = doccat.load_data(sys.argv[3]) - labels_test, docs_test = doccat.load_data(sys.argv[4]) - - - word_dict, rev_word_dict, embeddings, vector_size = load_glove(sys.argv[1]) - - # Encode labels into ids - label_dict = {} - for label in labels_train: - if not label in label_dict: - label_dict[label] = len(label_dict) - - - dropout_keep_prob, token_ids_ph, token_lengths_ph, y_ph = doccat.create_placeholders() - - embedding_ph, train_op, probs_op = doccat.create_graph(dropout_keep_prob, token_ids_ph, - token_lengths_ph, y_ph, - embeddings, len(label_dict)) - - sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, - log_device_placement=True)) - - with sess.as_default(): - init=tf.global_variables_initializer() - sess.run(init, feed_dict={embedding_ph: embeddings}) - batch_size = 20 - for epoch in range(50): - print("Epoch " + str(epoch)) - acc_train = [] - - batch_indexes = list(range(floor(len(docs_train) / batch_size))) - random.Random(epoch).shuffle(batch_indexes) - - for batch_index in batch_indexes: - label_train_batch, doc_train_batch, name_train_length = \ - mini_batch(label_dict, word_dict, labels_train, docs_train, batch_size, batch_index) - - feed_dict = {dropout_keep_prob: 0.5, token_ids_ph: doc_train_batch, token_lengths_ph: name_train_length, y_ph: label_train_batch} - _, probs = sess.run([train_op, probs_op], feed_dict) - - acc_train.append((batch_size - np.sum(np.minimum(np.abs(label_train_batch - np.argmax(probs, axis=1)), - np.full((batch_size), 1)))) / batch_size) - - print("Train acc: " + str(np.mean(acc_train))) - - acc_dev = [] - for batch_index in range(floor(len(docs_dev) / batch_size)): - label_dev_batch, doc_dev_batch, doc_dev_length = \ - mini_batch(label_dict, word_dict, labels_dev, docs_dev, batch_size, batch_index) - - feed_dict = {dropout_keep_prob: 1, token_ids_ph: doc_dev_batch, token_lengths_ph: doc_dev_length, y_ph: label_dev_batch} - probs = sess.run(probs_op, feed_dict) - - acc_dev.append((batch_size - np.sum(np.minimum(np.abs(label_dev_batch - np.argmax(probs, axis=1)), - np.full((batch_size), 1)))) / batch_size) - - print("Dev acc: " + str(np.mean(acc_dev))) - - with TemporaryDirectory() as temp_dir: - temp_model_dir = temp_dir + "/model" - - builder = tf.saved_model.builder.SavedModelBuilder(temp_model_dir) - builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING]) - builder.save() - - write_mapping(label_dict, temp_model_dir + "/label_dict.txt") - - zipf = zipfile.ZipFile("doccat-" + str(epoch) +".zip", 'w', zipfile.ZIP_DEFLATED) - - for root, dirs, files in os.walk(temp_model_dir): - for file in files: - modelFile = os.path.join(root, file) - zipf.write(modelFile, arcname=os.path.relpath(modelFile, temp_model_dir)) - -if __name__ == "__main__": - main() diff --git a/tf-ner-poc/src/main/python/namecat/namecat.py b/tf-ner-poc/src/main/python/namecat/namecat.py deleted file mode 100644 index cc3f28a..0000000 --- a/tf-ner-poc/src/main/python/namecat/namecat.py +++ /dev/null @@ -1,238 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import re -import tensorflow as tf -import sys -from math import floor -import numpy as np -import random -import zipfile -import os -from tempfile import TemporaryDirectory - -def load_data(file): - with open(file, encoding="utf-8") as f: - labels = [] - names = [] - for line in f: - parts = re.split(r'\t+', line) - labels.append(parts[0].strip()) - names.append(parts[1].strip()) - return labels, names - -# create placeholders -def create_placeholders(): - - dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prop") - - # shape is batch_size, and length of name - char_ids_ph = tf.placeholder(tf.int32, shape=[None, None], name="char_ids") - - # shape is batch_size - name_lengths_ph = tf.placeholder(tf.int32, shape=[None], name="name_lengths") - - # shape is batch_size - y_ph = tf.placeholder(tf.int32, shape=[None], name="y") - return dropout_keep_prob, char_ids_ph, name_lengths_ph, y_ph - -def create_graph(dropout_keep_prob, char_ids_ph, name_lengths_ph, y_ph, nchars, nclasses): - - - dim_char = 100 - - K = tf.get_variable(name="char_embeddings", dtype=tf.float32, - shape=[nchars, dim_char]) - - char_embeddings = tf.nn.embedding_lookup(K, char_ids_ph) - - char_embeddings = tf.nn.dropout(char_embeddings, dropout_keep_prob) - - char_hidden_size = 256 - cell_fw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True) - cell_bw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True) - - _, ((_, output_fw), (_, output_bw)) = tf.nn.bidirectional_dynamic_rnn(cell_fw, - cell_bw, - char_embeddings, - sequence_length=name_lengths_ph, - dtype=tf.float32) - - output = tf.concat([output_fw, output_bw], axis=-1) - - output = tf.nn.dropout(output, dropout_keep_prob) - - W = tf.get_variable("W", shape=[2*char_hidden_size, nclasses]) - b = tf.get_variable("b", shape=[nclasses]) - logits = tf.nn.xw_plus_b(output, W, b, name="logits") - - # softmax ... - probs = tf.exp(logits) - norm_probs = tf.identity(probs / tf.reduce_sum(probs, 1, keepdims=True), name="norm_probs") - - loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y_ph) - mean_loss = tf.reduce_mean(loss) - - train_op = tf.train.AdamOptimizer().minimize(loss) - #train_op = tf.train.RMSPropOptimizer(learning_rate=0.001).minimize(loss) - - return train_op, norm_probs - - -def encode_name(char_dict, name): - encoded_name = [] - for c in name: - encoded_name.append(char_dict[c]) - return encoded_name - -def mini_batch(label_dict, char_dict, labels, names, batch_size, batch_index): - begin = batch_size * batch_index - end = min(batch_size * (batch_index + 1), len(labels)) - - max_length = 0 - for i in range(begin, end): - length = len(names[i]) - if length > max_length: - max_length = length - - name_batch = [] - label_batch = [] - name_length = [] - for i in range(begin, end): - label_batch.append( label_dict[labels[i]]) - name_batch.append(encode_name(char_dict, names[i]) + [0] * max(max_length - len(names[i]), 0)) - name_length.append(len(names[i])) - - return label_batch, np.asarray(name_batch), name_length - -def write_mapping(tags, output_filename): - with open(output_filename, 'w', encoding='utf-8') as f: - for i, tag in enumerate(tags): - f.write(tag) - f.write("\n") - -def main(): - - if len(sys.argv) != 4: - print("Usage namecat.py train_file dev_file test_file") - return - - labels_train, names_train = load_data(sys.argv[1]) - labels_dev, names_dev = load_data(sys.argv[2]) - labels_test, names_test = load_data(sys.argv[3]) - - # Encode labels into ids - label_dict = {} - for label in labels_train: - if not label in label_dict: - label_dict[label] = len(label_dict) - - # Create char dict from names ... - - char_set = set() - for name in names_train + names_dev + names_train: - char_set = char_set.union(name) - - char_dict = {k: v for v, k in enumerate(char_set)} - char_dict[chr(0)] = 0 - - dropout_keep_prob, char_ids_ph, name_lengths_ph, y_ph = create_placeholders() - - train_op, probs_op = create_graph(dropout_keep_prob, char_ids_ph, name_lengths_ph, y_ph, len(char_set), len(label_dict)) - - sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, - log_device_placement=True)) - - with sess.as_default(): - init=tf.global_variables_initializer() - sess.run(init) - - batch_size = 20 - for epoch in range(20): - print("Epoch " + str(epoch)) - acc_train = [] - - batch_indexes = list(range(floor(len(names_train) / batch_size))) - - # Shuffle the data - combined = list(zip(names_train, labels_train)) - random.shuffle(combined) - names_train[:], labels_train[:] = zip(*combined) - - for batch_index in batch_indexes: - label_train_batch, name_train_batch, name_train_length = \ - mini_batch(label_dict, char_dict, labels_train, names_train, batch_size, batch_index) - - # Add char dropout here ... - for i, j in np.ndindex(name_train_batch.shape): - if random.uniform(0, 1) <= 0.0005: - name_train_batch[i][j] = 0 - - feed_dict = {dropout_keep_prob: 0.5, char_ids_ph: name_train_batch, name_lengths_ph: name_train_length, y_ph: label_train_batch} - _, probs = sess.run([train_op, probs_op], feed_dict) - - acc_train.append((batch_size - np.sum(np.minimum(np.abs(label_train_batch - np.argmax(probs, axis=1)), - np.full((batch_size), 1)))) / batch_size) - - print("Train acc: " + str(np.mean(acc_train))) - - acc_dev = [] - for batch_index in range(floor(len(names_dev) / batch_size)): - label_dev_batch, name_dev_batch, name_dev_length = \ - mini_batch(label_dict, char_dict, labels_dev, names_dev, batch_size, batch_index) - - feed_dict = {dropout_keep_prob: 1, char_ids_ph: name_dev_batch, name_lengths_ph: name_dev_length, y_ph: label_dev_batch} - probs = sess.run(probs_op, feed_dict) - - acc_dev.append((batch_size - np.sum(np.minimum(np.abs(label_dev_batch - np.argmax(probs, axis=1)), - np.full((batch_size), 1)))) / batch_size) - - print("Dev acc: " + str(np.mean(acc_dev))) - - #acc_test = [] - #for batch_index in range(floor(len(names_test) / batch_size)): - # label_test_batch, name_test_batch, name_test_length = \ - # mini_batch(label_dict, char_dict, labels_test, names_test, batch_size, batch_index) - - # feed_dict = {char_ids_ph: name_test_batch, name_lengths_ph: name_test_length, y_ph: label_test_batch} - # probs = sess.run(probs_op, feed_dict) - - # acc_test.append((batch_size - np.sum(np.abs(label_test_batch - np.argmax(probs, axis=1)))) / batch_size) - - #print("Test acc: " + str(np.mean(acc_test))) - - with TemporaryDirectory() as temp_dir: - temp_model_dir = temp_dir + "/model" - - builder = tf.saved_model.builder.SavedModelBuilder(temp_model_dir) - builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING]) - builder.save() - - write_mapping(label_dict, temp_model_dir + "/label_dict.txt") - write_mapping(char_dict, temp_model_dir + "/char_dict.txt") - - zipf = zipfile.ZipFile("namecat-" + str(epoch) +".zip", 'w', zipfile.ZIP_DEFLATED) - - for root, dirs, files in os.walk(temp_model_dir): - for file in files: - modelFile = os.path.join(root, file) - zipf.write(modelFile, arcname=os.path.relpath(modelFile, temp_model_dir)) - -if __name__ == "__main__": - main() diff --git a/tf-ner-poc/src/main/python/namefinder/namefinder.py b/tf-ner-poc/src/main/python/namefinder/namefinder.py deleted file mode 100644 index f180f2b..0000000 --- a/tf-ner-poc/src/main/python/namefinder/namefinder.py +++ /dev/null @@ -1,493 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -# This poc is based on source code taken from: -# https://github.com/guillaumegenthial/sequence_tagging - -import sys -from math import floor -import tensorflow as tf -import re -import numpy as np -import zipfile -import os -from tempfile import TemporaryDirectory - -# global variables for unknown word and numbers -__UNK__ = '__UNK__' -__NUM__ = '__NUM__' - - -# Parse the OpenNLP Name Finder format into begin, end, type triples -class NameSample: - - def __init__(self, line): - self.tokens = [] - self.names = [] - start_regex = re.compile("<START(:([^:>\\s]*))?>") - parts = line.split() - start_index = -1 - word_index = 0 - for i in range(0, len(parts)): - if start_regex.match(parts[i]): - start_index = word_index - name_type = start_regex.search(parts[i]).group(2) - if name_type is None: - name_type = "default" - elif parts[i] == "<END>": - self.names.append((start_index, word_index, name_type)) - else: - self.tokens.append(parts[i]) - word_index += 1 - - -class VectorException(Exception): - def __init__(self, value): - self.value = value - - def __str__(self): - return repr(self.value) - - -class NameFinder: - label_dict = {} - - def __init__(self, use_lower_case_embeddings=False, vector_size=100): - self.__vector_size = vector_size - self.__use_lower_case_embeddings = use_lower_case_embeddings - - def load_data(self, word_dict, file): - with open(file) as f: - raw_data = f.readlines() - - sentences = [] - labels = [] - chars_set = set() - - for line in raw_data: - name_sample = NameSample(line) - sentence = [] - tokens = [] - - if len(name_sample.tokens) == 0: - continue - - for token in name_sample.tokens: - - chars_set.update(list(token)) # Add all chars to the set - tokens.append(token) # Add original token so chars can be encoded correctly later - - if self.__use_lower_case_embeddings: - token = token.lower() - - # TODO: implement NUM encoding - - if word_dict.get(token) is not None: - vector = word_dict[token] - else: - vector = word_dict[__UNK__] - - sentence.append(vector) - - label = ["other"] * len(name_sample.tokens) - for name in name_sample.names: - label[name[0]] = name[2] + "-start" - for i in range(name[0] + 1, name[1]): - label[i] = name[2] + "-cont" - - sentences.append((sentence, tokens)) # Add a tuple of list of word vectors and list of original words - labels.append(label) - - for label_string in label: - if label_string not in self.label_dict: - self.label_dict[label_string] = len(self.label_dict) - - return sentences, labels, chars_set - - def encode_labels(self, labels): - return list(map(lambda l: self.label_dict[l], labels)) - - def mini_batch(self, char_dict, sentences, labels, batch_size, batch_index): - begin = batch_size * batch_index - end = min(batch_size * (batch_index + 1), len(labels)) - - # Determine the max sentence length in the batch - max_length = 0 - for i in range(begin, end): - length = len(sentences[i][0]) - if length > max_length: - max_length = length - - sb = [] - lb = [] - seq_length = [] - for i in range(begin, end): - sb.append(sentences[i][0] + [0] * max(max_length - len(sentences[i][0]), 0)) - lb.append(self.encode_labels(labels[i]) + [0] * max(max_length - len(labels[i]), 0)) - seq_length.append(len(sentences[i][0])) - - # Determine the max word length in the batch - max_word_length = 0 - for i in range(begin, end): - for word in sentences[i][1]: - length = len(word) - if length > max_word_length: - max_word_length = length - - cb = [] - wlb = [] - for i in range(begin, end): - sentence_word_length = [] - sentence_word_chars = [] - - for word in sentences[i][1]: - word_chars = [] - for c in word: - word_chars.append(char_dict[c]) - - sentence_word_length.append(len(word_chars)) - word_chars = word_chars + [0] * max(max_word_length - len(word_chars), 0) - sentence_word_chars.append(word_chars) - - for i in range(max(max_length - len(sentence_word_chars), 0)): - sentence_word_chars.append([0] * max_word_length) - - cb.append(sentence_word_chars) - wlb.append(sentence_word_length + [0] * max(max_length - len(sentence_word_length), 0)) - - return sb, cb, wlb, lb, seq_length - - # probably not necessary to pass in the embedding_dict, can be passed to init directly - def create_graph(self, nchars, embedding_dict): - - dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prop") - - with tf.variable_scope("chars"): - # shape = (batch size, max length of sentence, max length of word) - char_ids = tf.placeholder(tf.int32, shape=[None, None, None], name="char_ids") - - # shape = (batch_size, max_length of sentence) - word_lengths_ph = tf.placeholder(tf.int32, shape=[None, None], name="word_lengths") - - dim_char = 100 - - # 1. get character embeddings - K = tf.get_variable(name="char_embeddings", dtype=tf.float32, - shape=[nchars, dim_char]) - - # shape = (batch, sentence, word, dim of char embeddings) - char_embeddings = tf.nn.embedding_lookup(K, char_ids) - - # 2. put the time dimension on axis=1 for dynamic_rnn - s = tf.shape(char_embeddings) # store old shape - # shape = (batch x sentence, word, dim of char embeddings) - char_embeddings = tf.reshape(char_embeddings, shape=[s[0] * s[1], s[-2], dim_char]) - word_lengths = tf.reshape(word_lengths_ph, shape=[s[0] * s[1]]) - - # 3. bi lstm on chars - char_hidden_size = 100 - cell_fw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True) - cell_bw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True) - - _, ((_, output_fw), (_, output_bw)) = tf.nn.bidirectional_dynamic_rnn(cell_fw, - cell_bw, - char_embeddings, - sequence_length=word_lengths, - dtype=tf.float32) - # shape = (batch x sentence, 2 x char_hidden_size) - output = tf.concat([output_fw, output_bw], axis=-1) - - # shape = (batch, sentence, 2 x char_hidden_size) - char_rep = tf.reshape(output, shape=[-1, s[1], 2 * char_hidden_size]) - - with tf.variable_scope("words"): - token_ids = tf.placeholder(tf.int32, shape=[None, None], name="word_ids") - sequence_lengths = tf.placeholder(tf.int32, shape=[None], name="sequence_lengths") - - # This is a hack to make it load an embedding matrix larger than 2GB - # Don't hardcode this 300 - embedding_placeholder = tf.placeholder(dtype=tf.float32, name="embedding_placeholder", - shape=(len(embedding_dict), self.__vector_size)) - embedding_matrix = tf.Variable(embedding_placeholder, dtype=tf.float32, trainable=False, - name="glove_embeddings") - - token_embeddings = tf.nn.embedding_lookup(embedding_matrix, token_ids) - - # shape = (batch, sentence, 2 x char_hidden_size + word_vector_size) - word_embeddings = tf.concat([token_embeddings, char_rep], axis=-1) - - word_embeddings = tf.nn.dropout(word_embeddings, dropout_keep_prob) - - hidden_size = 300 - - # Lets add a char lstm layer to reproduce the state of the art results ... - - with tf.variable_scope("bi-lstm"): - # Add LSTM layer - cell_fw = tf.contrib.rnn.LSTMCell(hidden_size) - cell_bw = tf.contrib.rnn.LSTMCell(hidden_size) - - (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, word_embeddings, - sequence_length=sequence_lengths, - dtype=tf.float32) - - context_rep = tf.concat([output_fw, output_bw], axis=-1) - - context_rep = tf.nn.dropout(context_rep, dropout_keep_prob) - - labels = tf.placeholder(tf.int32, shape=[None, None], name="labels") - - ntags = len(self.label_dict) - - W = tf.get_variable("W", shape=[2 * hidden_size, ntags], dtype=tf.float32) - b = tf.get_variable("b", shape=[ntags], dtype=tf.float32, initializer=tf.zeros_initializer()) - ntime_steps = tf.shape(context_rep)[1] - context_rep_flat = tf.reshape(context_rep, [-1, 2 * hidden_size]) - pred = tf.matmul(context_rep_flat, W) + b - self.logits = tf.reshape(pred, [-1, ntime_steps, ntags], name="logits") - - log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood( - self.logits, labels, sequence_lengths) - - self.transition_params = tf.identity(transition_params, name="trans_params") - - loss = tf.reduce_mean(-log_likelihood) - - train_op = tf.train.AdamOptimizer().minimize(loss) - - return embedding_placeholder, token_ids, char_ids, word_lengths_ph, \ - sequence_lengths, labels, dropout_keep_prob, train_op - - def predict_batch(self, sess, token_ids_ph, char_ids_ph, word_lengths_ph, - sequence_lengths_ph, sentences, char_ids, word_length, lengths, dropout_keep_prob): - - feed_dict = {token_ids_ph: sentences, char_ids_ph: char_ids, word_lengths_ph: word_length, - sequence_lengths_ph: lengths, dropout_keep_prob: 1} - - viterbi_sequences = [] - logits, trans_params = sess.run([self.logits, self.transition_params], feed_dict=feed_dict) - - for logit, sequence_length in zip(logits, lengths): - if sequence_length != 0: - logit = logit[:sequence_length] # keep only the valid steps - viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(logit, trans_params) - viterbi_sequences += [viterbi_seq] - else: - viterbi_sequences += [] - - return viterbi_sequences, lengths - - -def get_chunk_type(tok, idx_to_tag): - tag_name = idx_to_tag[tok] - tag_class = tag_name.split('-')[0] - tag_type = tag_name.split('-')[-1] - return tag_class, tag_type - - -def get_chunks(seq, tags): - default = tags["other"] - idx_to_tag = {idx: tag for tag, idx in tags.items()} - chunks = [] - chunk_type, chunk_start = None, None - for i, tok in enumerate(seq): - # End of a chunk 1 - if tok == default and chunk_type is not None: - # Add a chunk. - chunk = (chunk_type, chunk_start, i) - chunks.append(chunk) - chunk_type, chunk_start = None, None - - # End of a chunk + start of a chunk! - elif tok != default: - tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag) - if chunk_type is None: - chunk_type, chunk_start = tok_chunk_type, i - elif tok_chunk_type != chunk_type or tok_chunk_class == "B": - chunk = (chunk_type, chunk_start, i) - chunks.append(chunk) - chunk_type, chunk_start = tok_chunk_type, i - else: - pass - - # end condition - if chunk_type is not None: - chunk = (chunk_type, chunk_start, len(seq)) - chunks.append(chunk) - - return chunks - - -def write_mapping(tags, output_filename): - with open(output_filename, 'w', encoding='utf-8') as f: - for (tag, i) in sorted(tags.items(), key=lambda x: x[1]): - f.write('{}\n'.format(tag)) - - -def load_glove(glove_file): - with open(glove_file) as f: - - word_dict = {} - embeddings = [] - - vector_size = -1 - - for line in f: - parts = line.strip().split(" ") - - if vector_size == -1: - if len(parts) == 2: - vector_size = int(parts[1]) - continue - vector_size = len(parts) - 1 - - if len(parts) != vector_size + 1: - raise VectorException("Bad Vector in line: {}, size: {} vector: {}".format(len(line), len(parts), line)) - continue - word_dict[parts[0]] = len(word_dict) - embeddings.append(np.array(parts[1:], dtype=np.float32)) - - # add unknown word symbol and number symbol - if __UNK__ not in word_dict: - word_dict[__UNK__] = len(word_dict) - unk_random = 0.08 * np.random.random_sample(vector_size) - 0.04 - embeddings.append(unk_random.astype(np.float32)) - if __NUM__ not in word_dict: - word_dict[__NUM__] = len(word_dict) - embeddings.append(np.zeros(vector_size, dtype=np.float32)) - - # Create a reverse word dict - rev_word_dict = {} - for word, id in word_dict.items(): - rev_word_dict[id] = word - - return word_dict, rev_word_dict, np.asarray(embeddings), vector_size - - -def main(): - if len(sys.argv) != 5: - print("Usage namefinder.py embedding_file train_file dev_file test_file") - return - - word_dict, rev_word_dict, embeddings, vector_size = load_glove(sys.argv[1]) - - name_finder = NameFinder(vector_size) - - sentences, labels, char_set = name_finder.load_data(word_dict, sys.argv[2]) - sentences_dev, labels_dev, char_set_dev = name_finder.load_data(word_dict, sys.argv[3]) - - char_dict = {k: v for v, k in enumerate(char_set | char_set_dev)} - - embedding_ph, token_ids_ph, char_ids_ph, word_lengths_ph, sequence_lengths_ph, labels_ph, dropout_keep_prob, train_op \ - = name_finder.create_graph(len(char_set | char_set_dev), embeddings) - - sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, - log_device_placement=True)) - - best_f1 = 0.0 - no_improvement = 0 - with sess.as_default(): - init = tf.global_variables_initializer() - sess.run(init, feed_dict={embedding_ph: embeddings}) - - batch_size = 20 - for epoch in range(100): - print("Epoch " + str(epoch)) - - for batch_index in range(floor(len(sentences) / batch_size)): - if batch_index % 200 == 0: - print("batch_index " + str(batch_index)) - - # mini_batch should also return char_ids and word length ... - sentences_batch, chars_batch, word_length_batch, labels_batch, lengths = \ - name_finder.mini_batch(char_dict, sentences, labels, batch_size, batch_index) - - feed_dict = {token_ids_ph: sentences_batch, char_ids_ph: chars_batch, - word_lengths_ph: word_length_batch, sequence_lengths_ph: lengths, - labels_ph: labels_batch, dropout_keep_prob: 0.5} - - train_op.run(feed_dict, sess) - - accs = [] - correct_preds, total_correct, total_preds = 0., 0., 0. - for batch_index in range(floor(len(sentences_dev) / batch_size)): - sentences_test_batch, chars_batch_test, word_length_batch_test, \ - labels_test_batch, length_test = name_finder.mini_batch(char_dict, - sentences_dev, - labels_dev, - batch_size, - batch_index) - - labels_pred, sequence_lengths = name_finder.predict_batch( - sess, token_ids_ph, char_ids_ph, word_lengths_ph, sequence_lengths_ph, - sentences_test_batch, chars_batch_test, word_length_batch_test, length_test, dropout_keep_prob) - - for lab, lab_pred, length in zip(labels_test_batch, labels_pred, - sequence_lengths): - lab = lab[:length] - lab_pred = lab_pred[:length] - accs += [a == b for (a, b) in zip(lab, lab_pred)] - - lab_chunks = set(get_chunks(lab, name_finder.label_dict)) - lab_pred_chunks = set(get_chunks(lab_pred, name_finder.label_dict)) - - correct_preds += len(lab_chunks & lab_pred_chunks) - total_preds += len(lab_pred_chunks) - total_correct += len(lab_chunks) - - p = correct_preds / total_preds if correct_preds > 0 else 0 - r = correct_preds / total_correct if correct_preds > 0 else 0 - f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 - acc = np.mean(accs) - - if f1 > best_f1: - - best_f1 = f1 - no_improvement = 0 - - with TemporaryDirectory() as temp_dir: - temp_model_dir = temp_dir + "/model" - - builder = tf.saved_model.builder.SavedModelBuilder(temp_model_dir) - builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING]) - builder.save() - - write_mapping(word_dict, temp_model_dir + '/word_dict.txt') - write_mapping(name_finder.label_dict, temp_model_dir + "/label_dict.txt") - write_mapping(char_dict, temp_model_dir + "/char_dict.txt") - - zipf = zipfile.ZipFile("namefinder-" + str(epoch) + ".zip", 'w', zipfile.ZIP_DEFLATED) - - for root, dirs, files in os.walk(temp_model_dir): - for file in files: - modelFile = os.path.join(root, file) - zipf.write(modelFile, arcname=os.path.relpath(modelFile, temp_model_dir)) - else: - no_improvement += 1 - - print("ACC " + str(acc)) - print("F1 " + str(f1) + " P " + str(p) + " R " + str(r)) - - if no_improvement > 5: - print("No further improvement. Stopping.") - break - - -if __name__ == "__main__": - main() diff --git a/tf-ner-poc/src/main/python/namefinder/split.py b/tf-ner-poc/src/main/python/namefinder/split.py deleted file mode 100644 index 1e5ea4d..0000000 --- a/tf-ner-poc/src/main/python/namefinder/split.py +++ /dev/null @@ -1,61 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import random -import sys - -def main(): - - if len(sys.argv) != 5: - print("Usage split.py data_file train_file dev_file test_file") - return - - train = [] - dev = [] - test = [] - - with open(sys.argv[1]) as f: - for line in f: - - if len(line.strip()) == 0: - continue - - rand = random.random() - if rand < 0.8: - train.append(line) - elif rand < 0.9: - dev.append(line) - elif rand <= 1.0: - test.append(line) - - with open(sys.argv[2], 'w') as f: - for item in train: - f.write("%s" % item) - - with open(sys.argv[3], 'w') as f: - for item in dev: - f.write("%s" % item) - - with open(sys.argv[4], 'w') as f: - for item in test: - f.write("%s" % item) - -if __name__ == "__main__": - main() - diff --git a/tf-ner-poc/src/main/python/normalizer/date_generator.py b/tf-ner-poc/src/main/python/normalizer/date_generator.py deleted file mode 100644 index 965aec0..0000000 --- a/tf-ner-poc/src/main/python/normalizer/date_generator.py +++ /dev/null @@ -1,86 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -from faker import Faker -from babel.dates import format_date -import random -from datetime import datetime - -fake = Faker() - -# TOOD: If possible set date range on Faker - -FORMATS = ['short', - 'medium', - 'long', - 'dd MMM YYY', - 'dd MMM', - 'dd, MMM YYY', - 'dd, MMM', - 'd MMM YYY', - 'd MMM', - 'd MMMM YYY', - 'MMMM YYY', - 'd MMMM, YYY', - 'd MMM, YYY', - 'd MM YY', - 'd MMMM YYY', - 'MMMM d YYY', - 'MMMM YYY', - 'MMM YYY', - 'MMMM d, YYY', - 'YYY', - 'dd.MM.YY', - 'dd.MM', - 'full', - 'full', - 'full', - 'full', - 'full', - 'full', - 'full', - 'full', - 'full', - 'full'] - -# TODO: maybe avoid duplicates, output dates also for other locales such as german, and french ... - -with open('date_dev_deu.txt', 'w', encoding="utf-8") as f: - for i in range(2000): - dt = fake.date_time_ad(start_datetime=datetime(1900, 1, 1)) - - format = random.choice(FORMATS) - source_date = format_date(dt, format=format, locale='en_US') - target_date = format_date(dt, format='YYYYMMdd', locale='en_US') - - if "short" not in format \ - and "medium" not in format \ - and "long" not in format \ - and "full" not in format : - - if "Y" not in format: - target_date = "0000" + target_date[4:] - - if "d" not in format: - target_date = target_date[:6] + "00" - - if "M" not in format: - target_date = target_date[:4] + "00" + target_date[6:] - - f.write(target_date + '\t' + source_date + '\n') \ No newline at end of file diff --git a/tf-ner-poc/src/main/python/normalizer/normalizer.py b/tf-ner-poc/src/main/python/normalizer/normalizer.py deleted file mode 100644 index 04be1bb..0000000 --- a/tf-ner-poc/src/main/python/normalizer/normalizer.py +++ /dev/null @@ -1,322 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -import os -import re -import zipfile -from tempfile import TemporaryDirectory - -import tensorflow as tf -import numpy as np -import random -from math import floor -import sys - -def load_data(file): - with open(file, encoding="utf-8") as f: - target = [] - source = [] - for line in f: - parts = re.split(r'\t+', line) - target.append(parts[0].strip()); - source.append(parts[1].strip()) - return source, target - -def encode_name(char_dict, names): - - max_length = 0 - for name in names: - length = len(name) - if length > max_length: - max_length = length - - # TODO: To be able to use padding for variable length sequences - # pad with the eos marker - - encoded_names = np.zeros((len(names), max_length)) - - for bi in range(len(names)): - for ci in range(len(names[bi])): - encoded_names.itemset((bi, ci), char_dict[names[bi][ci]]) - - return encoded_names - -def mini_batch(target_char_dict, target, source_char_dict, source, batch_size, batch_index): - - begin = batch_index - end = min(batch_index + batch_size, len(source)) - - target_batch = target[begin : end] - - target_length = [] - for i in range(begin, end): - target_length.append(len(target[i]) + 1) # TODO: The correction should be done in the graph ... - - source_batch = source[batch_index : batch_index + batch_size] - source_length = [] - for i in range(begin, end): - source_length.append(len(source[i])) - - return encode_name(target_char_dict, target_batch), np.asarray(target_length), \ - encode_name(source_char_dict, source_batch), np.asarray(source_length) - -def create_graph(mode, batch_size, encoder_nchars, max_target_length, decoder_nchars): - - # Hyper parameters - encoder_char_dim = 100 - num_units = 256 - - batch_size_ph = tf.placeholder_with_default(batch_size, shape=(), name="batch_size") - - # Encoder - encoder_char_ids_ph = tf.placeholder(tf.int32, shape=[None, None], name="encoder_char_ids") - encoder_lengths_ph = tf.placeholder(tf.int32, shape=[None], name="encoder_lengths") - - encoder_embedding_weights = tf.get_variable(name="char_embeddings", dtype=tf.float32, - shape=[encoder_nchars, encoder_char_dim]) - - encoder_emb_inp = tf.nn.embedding_lookup(encoder_embedding_weights, encoder_char_ids_ph) - - if "TRAIN" == mode: - encoder_emb_inp = tf.nn.dropout(encoder_emb_inp, 0.7) - - encoder_emb_inp = tf.transpose(encoder_emb_inp, perm=[1, 0, 2]) - - encoder_cell = tf.nn.rnn_cell.LSTMCell(num_units) - initial_state = encoder_cell.zero_state(batch_size_ph, dtype=tf.float32) - - encoder_outputs, encoder_state = tf.nn.dynamic_rnn( - encoder_cell, encoder_emb_inp, initial_state=initial_state, - sequence_length=encoder_lengths_ph, - time_major=True, swap_memory=True) - - # Decoder - decoder_char_ids_ph = tf.placeholder(tf.int32, shape=[None, None], name="decoder_char_ids") - decoder_lengths = tf.placeholder(tf.int32, shape=[None], name="decoder_lengths") - - # decoder output (decoder_input shifted to the left by one) - - decoder_char_dim = 100 - decoder_embedding_weights = tf.get_variable(name="decoder_char_embeddings", dtype=tf.float32, - shape=[decoder_nchars, decoder_char_dim]) - - projection_layer = tf.layers.Dense(units=decoder_nchars, use_bias=True) # To predict one output char at a time ... - - attention_states = tf.transpose(encoder_outputs, [1, 0, 2]) - - attention_mechanism = tf.contrib.seq2seq.LuongAttention( - num_units, attention_states, - memory_sequence_length=encoder_lengths_ph) - - decoder_cell = tf.nn.rnn_cell.LSTMCell(num_units) - - decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism, - attention_layer_size=num_units) - - # decoder_initial_state = encoder_state - decoder_initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=batch_size_ph) - - if "TRAIN" == mode: - - decoder_input = tf.pad(decoder_char_ids_ph, tf.constant([[0,0], [1,0]]), - 'CONSTANT', constant_values=(decoder_nchars-2)) - - decoder_emb_inp = tf.nn.embedding_lookup(decoder_embedding_weights, decoder_input) - decoder_emb_inp = tf.transpose(decoder_emb_inp, perm=[1, 0, 2]) - - helper = tf.contrib.seq2seq.TrainingHelper( - decoder_emb_inp, [max_target_length for _ in range(batch_size)], time_major=True) - - - decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, - decoder_initial_state, output_layer=projection_layer) - - outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True, swap_memory=True ) - - logits = outputs.rnn_output - train_prediction = outputs.sample_id - - decoder_output = tf.pad(tf.transpose(decoder_char_ids_ph, perm=[1, 0]), tf.constant([[0,1], [0,0]]), - 'CONSTANT', constant_values=(decoder_nchars-1)) - - crossent = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=decoder_output, logits=logits, name="crossent") - - loss = tf.reduce_sum(crossent * tf.to_float(decoder_lengths)) / (batch_size * max_target_length) - - # Optimizer - # TODO: Tutorial suggest to swap to SGD for alter iterations - # optimizer = tf.train.AdamOptimizer() - optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001) - gradients, v = zip(*optimizer.compute_gradients(loss)) - gradients, _ = tf.clip_by_global_norm(gradients, 10.0) - optimize = optimizer.apply_gradients(zip(gradients, v)) - - return encoder_char_ids_ph, encoder_lengths_ph, decoder_char_ids_ph, decoder_lengths, optimize, train_prediction, outputs - - if "EVAL" == mode: - helperE = tf.contrib.seq2seq.GreedyEmbeddingHelper( - decoder_embedding_weights, - tf.fill([batch_size_ph], decoder_nchars-2), decoder_nchars-1) - decoderE = tf.contrib.seq2seq.BasicDecoder( - decoder_cell, helperE, decoder_initial_state, - output_layer=projection_layer) - outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoderE, maximum_iterations=20) - - translations = tf.identity(outputs.sample_id, name="decode") - - return encoder_char_ids_ph, encoder_lengths_ph, translations - -def encode_chars(names): - char_set = set() - for name in names: - char_set = char_set.union(name) - return {k: v for v, k in enumerate(char_set)} - -# TODO: Deduplicate this, same as in namefinder.py -def write_mapping(tags, output_filename): - with open(output_filename, 'w', encoding='utf-8') as f: - for i, tag in enumerate(tags): - f.write('{}\n'.format(tag)) - -def main(): - - if len(sys.argv) != 4: - print("Usage normalizer.py train_file dev_file test_file") - return - - checkpoints_path = "/tmp/model/checkpoints" - - source_train, target_train = load_data(sys.argv[1]) - source_dev, target_dev = load_data(sys.argv[2]) - source_test, target_test = load_data(sys.argv[3]) - - source_char_dict = encode_chars(source_train + source_dev + source_test) - source_char_dict[chr(0)] = 0 - - target_char_dict = encode_chars(target_train + target_dev + target_test) - - # char id 2 is STX (Start of Text), and 3 ETX (End of Text) - target_char_dict[chr(2)] = len(target_char_dict) - target_char_dict[chr(3)] = len(target_char_dict) - - target_dict_rev = {v: k for k, v in target_char_dict.items()} - - batch_size = 20 - - target_max_len = -1 - for token in (target_train + target_dev + target_test): - target_max_len = max(target_max_len, len(token)) - - # Increase size by one for termination char - target_max_len += 1 - - train_graph = tf.Graph() - eval_graph = tf.Graph() - - with train_graph.as_default(): - t_encoder_char_ids_ph, t_encoder_lengths_ph, t_decoder_char_ids_ph, t_decoder_lengths, t_adam_optimize, t_train_prediction, t_dec_out = \ - create_graph("TRAIN", batch_size, len(source_char_dict), target_max_len, len(target_char_dict)) - train_saver = tf.train.Saver() - train_sess = tf.Session() - train_sess.run(tf.global_variables_initializer()) - - with eval_graph.as_default(): - e_encoder_char_ids_ph, e_encoder_lengths_ph, e_dec_out = \ - create_graph("EVAL", batch_size, len(source_char_dict), target_max_len, len(target_char_dict)) - eval_saver = tf.train.Saver() - - eval_sess = tf.Session(graph=eval_graph) - - for epoch in range(20): - print("Epoch " + str(epoch)) - - with train_graph.as_default(): - for batch_index in range(floor(len(source_train) / batch_size)): - if batch_index > 0 and batch_index % 100 == 0: - print("batch_index " + str(batch_index)) - - target_batch, target_length, source_batch, source_length = \ - mini_batch(target_char_dict, target_train, source_char_dict, source_train, batch_size, batch_index) - - # TODO: Add char dropout here ... - for i, j in np.ndindex(source_batch.shape): - if random.uniform(0, 1) <= 0.0005: - source_batch[i][j] = 0 - - feed_dict = {t_encoder_lengths_ph: source_length, t_encoder_char_ids_ph: source_batch, - t_decoder_lengths: target_length, t_decoder_char_ids_ph: target_batch} - - t1, dec1 = train_sess.run([t_adam_optimize, t_dec_out], feed_dict) - dec2 = train_sess.run([t_dec_out], feed_dict) - tv=1 - - # Save train model, and restore it into the eval session - checkpoint_path = train_saver.save(train_sess, checkpoints_path, global_step=epoch) - eval_saver.restore(eval_sess, checkpoint_path) - - with eval_graph.as_default(): - count_correct = 0 - for batch_index in range(floor(len(source_dev) / batch_size)): - target_batch, target_length, source_batch, source_length = \ - mini_batch(target_char_dict, target_dev, source_char_dict, source_dev, batch_size, batch_index) - - begin = batch_index - end = min(batch_index + batch_size, len(source_dev)) - target_strings = target_dev[begin:end] - - feed_dict = {e_encoder_lengths_ph: source_length, e_encoder_char_ids_ph: source_batch} - result = eval_sess.run(e_dec_out, feed_dict) - - decoded_dates = [] - - for coded_date in result: - date = "" - for char_id in coded_date: - if not char_id == len(target_char_dict) - 1: - date = date + (target_dict_rev[char_id]) - decoded_dates.append(date) - - for i in range(len(target_strings)): - if target_strings[i] == decoded_dates[i]: - count_correct = count_correct + 1 - - print("Dev: " + str(count_correct / len(target_dev))) - - with TemporaryDirectory() as temp_dir: - - temp_model_dir = temp_dir + "/model" - - - with eval_graph.as_default(): - builder = tf.saved_model.builder.SavedModelBuilder(temp_model_dir) - builder.add_meta_graph_and_variables(eval_sess, [tf.saved_model.tag_constants.SERVING]) - builder.save() - - write_mapping(source_char_dict, temp_model_dir + '/source_char_dict.txt') - write_mapping(target_char_dict, temp_model_dir + '/target_char_dict.txt') - - zipf = zipfile.ZipFile("normalizer.zip", 'w', zipfile.ZIP_DEFLATED) - - for root, dirs, files in os.walk(temp_model_dir): - for file in files: - modelFile = os.path.join(root, file) - zipf.write(modelFile, arcname=os.path.relpath(modelFile, temp_model_dir)) - -if __name__ == "__main__": - main() diff --git a/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/FeedDictionaryTest.java b/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/FeedDictionaryTest.java deleted file mode 100644 index edd9843..0000000 --- a/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/FeedDictionaryTest.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.opennlp.namefinder; - -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.InputStream; -import java.util.Arrays; -import java.util.List; -import java.util.zip.GZIPInputStream; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; - -class FeedDictionaryTest { - - private static WordIndexer indexer; - - @BeforeAll - static void beforeClass() { - try (InputStream words = new GZIPInputStream(FeedDictionaryTest.class.getResourceAsStream("/words.txt.gz")); - InputStream chars = new GZIPInputStream(FeedDictionaryTest.class.getResourceAsStream("/chars.txt.gz"))) { - - indexer = new WordIndexer(words, chars); - } catch (Exception ex) { - indexer = null; - } - assertNotNull(indexer); - } - - @Test - void testToTokenIds() { - String text1 = "Stormy Cars ' friend says she also plans to sue Michael Cohen ."; - TokenIds oneSentence = indexer.toTokenIds(text1.split("\\s+")); - assertNotNull(oneSentence); - assertEquals(13, oneSentence.getWordIds()[0].length, "Expect 13 tokenIds"); - - String[] text2 = new String[] {"I wish I was born in Copenhagen Denmark", - "Donald Trump died on his way to Tivoli Gardens in Denmark ."}; - List<String[]> collect = Arrays.stream(text2).map(s -> s.split("\\s+")).toList(); - TokenIds twoSentences = indexer.toTokenIds(collect.toArray(new String[2][])); - assertNotNull(twoSentences); - assertEquals(8, twoSentences.getWordIds()[0].length, "Expect 8 tokenIds"); - assertEquals(12, twoSentences.getWordIds()[1].length, "Expect 12 tokenIds"); - } -} diff --git a/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/PredictTest.java b/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/PredictTest.java deleted file mode 100644 index 4501630..0000000 --- a/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/PredictTest.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.opennlp.namefinder; - -import java.io.IOException; -import java.nio.file.Path; - -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -import opennlp.tools.util.Span; - -class PredictTest { - - // Note: As of Feb 2023, this test won't work on all platforms and, for instance, fails with - // "Cannot find TensorFlow native library for OS: darwin, architecture: aarch64" - // That's why it is disabled via the architecture system property. - // @DisabledIfSystemProperty(named = "os.arch", matches = "aarch64") - @Test - @Disabled - // TODO This test won't work as the required TF model is missing and needs to be re-trained. - // Further details, see: https://github.com/apache/opennlp-sandbox/pull/89 - void testFindTokens() throws IOException { - - // can be changed to File or InputStream - String words = PredictTest.class.getResource("/words.txt.gz").getPath(); - String chars = PredictTest.class.getResource("/chars.txt.gz").getPath(); - String tags = PredictTest.class.getResource("/tags.txt.gz").getPath(); - // Load model takes a String path!! - Path model = Path.of("savedmodel"); - - PredictionConfiguration config = new PredictionConfiguration(words, chars, tags, model.toString()); - - try (SequenceTagging tagger = new SequenceTagging(config)) { - String[] tokens = "Stormy Cars ' friend says she also plans to sue Michael Cohen .".split("\\s+"); - Span[] pred = tagger.find(tokens); - - for (int i = 0; i < tokens.length; i++) { - System.out.print(tokens[i] + "/" + pred[i] + " "); - } - System.out.println(); - } - - } -} diff --git a/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/WordIndexerTest.java b/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/WordIndexerTest.java deleted file mode 100644 index 0254612..0000000 --- a/tf-ner-poc/src/test/java/org/apache/opennlp/namefinder/WordIndexerTest.java +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.opennlp.namefinder; - -import java.io.InputStream; -import java.util.Arrays; -import java.util.List; -import java.util.zip.GZIPInputStream; - -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.assertArrayEquals; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; - -class WordIndexerTest { - - private static WordIndexer indexer; - - @BeforeAll - static void beforeClass() { - try (InputStream words = new GZIPInputStream(WordIndexerTest.class.getResourceAsStream("/words.txt.gz")); - InputStream chars = new GZIPInputStream(WordIndexerTest.class.getResourceAsStream("/chars.txt.gz"))) { - indexer = new WordIndexer(words, chars); - } catch (Exception ex) { - indexer = null; - } - assertNotNull(indexer); - } - - @Test - void testToTokenIdsWithOneSentence() { - String text = "Stormy Cars ' friend says she also plans to sue Michael Cohen ."; - - TokenIds ids = indexer.toTokenIds(text.split("\\s+")); - assertEquals(13, ids.getWordIds()[0].length, "Expect 13 tokenIds"); - - assertArrayEquals(new int[] {7, 30, 34, 80, 42, 3}, ids.getCharIds()[0][0]); - assertArrayEquals(new int[] {51, 41, 80, 54}, ids.getCharIds()[0][1]); - assertArrayEquals(new int[] {64}, ids.getCharIds()[0][2]); - assertArrayEquals(new int[] {47, 80, 82, 83, 31, 23}, ids.getCharIds()[0][3]); - assertArrayEquals(new int[] {54, 41, 3, 54}, ids.getCharIds()[0][4]); - assertArrayEquals(new int[] {54, 76, 83}, ids.getCharIds()[0][5]); - assertArrayEquals(new int[] {41, 55, 54, 34}, ids.getCharIds()[0][6]); - assertArrayEquals(new int[] {46, 55, 41, 31, 54}, ids.getCharIds()[0][7]); - assertArrayEquals(new int[] {30, 34}, ids.getCharIds()[0][8]); - assertArrayEquals(new int[] {54, 50, 83}, ids.getCharIds()[0][9]); - assertArrayEquals(new int[] {39, 82, 20, 76, 41, 83, 55}, ids.getCharIds()[0][10]); - assertArrayEquals(new int[] {51, 34, 76, 83, 31}, ids.getCharIds()[0][11]); - assertArrayEquals(new int[] {65}, ids.getCharIds()[0][12]); - - // TODO investigate why the 3 commented checks are different: Different data / assertions? - assertEquals(2720, ids.getWordIds()[0][0]); - // assertEquals(15275,ids.getWordIds()[0][1]); - assertEquals(3256, ids.getWordIds()[0][2]); - assertEquals(11348, ids.getWordIds()[0][3]); - assertEquals(21054, ids.getWordIds()[0][4]); - assertEquals(18337, ids.getWordIds()[0][5]); - assertEquals(7885, ids.getWordIds()[0][6]); - assertEquals(7697, ids.getWordIds()[0][7]); - assertEquals(16601, ids.getWordIds()[0][8]); - assertEquals(2720, ids.getWordIds()[0][9]); - // assertEquals(17408, ids.getWordIds()[0][10]); - // assertEquals(11541, ids.getWordIds()[0][11]); - assertEquals(2684, ids.getWordIds()[0][12]); - - } - - @Test - void testToTokenIdsWithTwoSentences() { - - String[] text = new String[] {"I wish I was born in Copenhagen Denmark", - "Donald Trump died on his way to Tivoli Gardens in Denmark ."}; - - List<String[]> collect = Arrays.stream(text).map(s -> s.split("\\s+")).toList(); - - TokenIds ids = indexer.toTokenIds(collect.toArray(new String[2][])); - - assertEquals(8, ids.getWordIds()[0].length); - assertEquals(12, ids.getWordIds()[1].length); - - assertArrayEquals(new int[] {4}, ids.getCharIds()[0][0]); - assertArrayEquals(new int[] {6, 82, 54, 76}, ids.getCharIds()[0][1]); - assertArrayEquals(new int[] {4}, ids.getCharIds()[0][2]); - assertArrayEquals(new int[] {6, 41, 54}, ids.getCharIds()[0][3]); - assertArrayEquals(new int[] {59, 34, 80, 31}, ids.getCharIds()[0][4]); - assertArrayEquals(new int[] {82, 31}, ids.getCharIds()[0][5]); - assertArrayEquals(new int[] {51, 34, 46, 83, 31, 76, 41, 28, 83, 31}, ids.getCharIds()[0][6]); - assertArrayEquals(new int[] {36, 83, 31, 42, 41, 80, 49}, ids.getCharIds()[0][7]); - - assertArrayEquals(new int[] {36, 34, 31, 41, 55, 23}, ids.getCharIds()[1][0]); - assertArrayEquals(new int[] {52, 80, 50, 42, 46}, ids.getCharIds()[1][1]); - assertArrayEquals(new int[] {23, 82, 83, 23}, ids.getCharIds()[1][2]); - assertArrayEquals(new int[] {34, 31}, ids.getCharIds()[1][3]); - assertArrayEquals(new int[] {76, 82, 54}, ids.getCharIds()[1][4]); - assertArrayEquals(new int[] {6, 41, 3}, ids.getCharIds()[1][5]); - assertArrayEquals(new int[] {30, 34}, ids.getCharIds()[1][6]); - assertArrayEquals(new int[] {52, 82, 11, 34, 55, 82}, ids.getCharIds()[1][7]); - assertArrayEquals(new int[] {74, 41, 80, 23, 83, 31, 54}, ids.getCharIds()[1][8]); - assertArrayEquals(new int[] {82, 31}, ids.getCharIds()[1][9]); - assertArrayEquals(new int[] {36, 83, 31, 42, 41, 80, 49}, ids.getCharIds()[1][10]); - assertArrayEquals(new int[] {65}, ids.getCharIds()[1][11]); - - // TODO investigate why the 6 commented checks are different: Different data / assertions? - // assertEquals(21931, ids.getWordIds()[0][0]); - assertEquals(20473, ids.getWordIds()[0][1]); - // assertEquals(21931, ids.getWordIds()[0][2]); - assertEquals(5477, ids.getWordIds()[0][3]); - assertEquals(11538, ids.getWordIds()[0][4]); - assertEquals(21341, ids.getWordIds()[0][5]); - // assertEquals(14024, ids.getWordIds()[0][6]); - // assertEquals(7420, ids.getWordIds()[0][7]); - - // assertEquals(12492, ids.getWordIds()[1][0]); - assertEquals(2720, ids.getWordIds()[1][1]); - assertEquals(9476, ids.getWordIds()[1][2]); - assertEquals(16537, ids.getWordIds()[1][3]); - assertEquals(18966, ids.getWordIds()[1][4]); - assertEquals(21088, ids.getWordIds()[1][5]); - assertEquals(16601, ids.getWordIds()[1][6]); - assertEquals(2720, ids.getWordIds()[1][7]); - assertEquals(2720, ids.getWordIds()[1][8]); - assertEquals(21341, ids.getWordIds()[1][9]); - // assertEquals(7420, ids.getWordIds()[1][10]); - assertEquals(2684, ids.getWordIds()[1][11]); - } - -} diff --git a/tf-ner-poc/src/test/resources/chars.txt.gz b/tf-ner-poc/src/test/resources/chars.txt.gz deleted file mode 100644 index c31b81a..0000000 Binary files a/tf-ner-poc/src/test/resources/chars.txt.gz and /dev/null differ diff --git a/tf-ner-poc/src/test/resources/tags.txt.gz b/tf-ner-poc/src/test/resources/tags.txt.gz deleted file mode 100644 index 0f0ceda..0000000 Binary files a/tf-ner-poc/src/test/resources/tags.txt.gz and /dev/null differ diff --git a/tf-ner-poc/src/test/resources/words.txt.gz b/tf-ner-poc/src/test/resources/words.txt.gz deleted file mode 100644 index 5f55ec0..0000000 Binary files a/tf-ner-poc/src/test/resources/words.txt.gz and /dev/null differ
