[ https://issues.apache.org/jira/browse/FLINK-1735?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14537318#comment-14537318 ]
ASF GitHub Bot commented on FLINK-1735: --------------------------------------- Github user aalexandrov commented on a diff in the pull request: https://github.com/apache/flink/pull/665#discussion_r30004988 --- Diff: flink-staging/flink-ml/src/test/scala/org/apache/flink/ml/feature/extraction/FeatureHasherSuite.scala --- @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.feature.extraction + +import org.apache.flink.api.scala.{ExecutionEnvironment, _} +import org.apache.flink.ml.math.SparseVector +import org.apache.flink.test.util.FlinkTestBase +import org.scalatest.{FlatSpec, Matchers} + +class FeatureHasherSuite + extends FlatSpec + with Matchers + with FlinkTestBase { + + behavior of "Flink's Feature Hasher" + + import FeatureHasherData._ + + it should "transform a sequence of strings into a sparse feature vector of given size" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + for (numFeatures <- numFeaturesTest) { + val inputDS = env.fromCollection(input) + + val transformer = FeatureHasher() + .setNumFeatures(numFeatures) + + val transformedDS = transformer.transform(inputDS) + val results = transformedDS.collect() + + for ((result, expectedResult) <- results zip expectedResults(numFeatures)) { + result.equalsVector(expectedResult) should be(true) + } + } + } + + it should "transform a sequence of strings into a sparse feature vector of given size," + + "with non negative entries" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + for (numFeatures <- numFeaturesTest) { + val inputDS = env.fromCollection(input) + + val transformer = FeatureHasher() + .setNumFeatures(numFeatures).setNonNegative(true) + + val transformedDS = transformer.transform(inputDS) + val results = transformedDS.collect() + + for ((result, expectedResult) <- results zip expectedResultsNonNegative(numFeatures)) { + result.equalsVector(expectedResult) should be(true) + } + } + } + + it should "transform a sequence of strings into a sparse feature vector of default size," + + " when parameter is less than 1" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val inputDS = env.fromCollection(input) + + val numFeatures = 0 + + val transformer = FeatureHasher() + 
.setNumFeatures(numFeatures).setNonNegative(false) + + val transformedDS = transformer.transform(inputDS) + val results = transformedDS.collect() + + for (result <- results) { + result.size should equal(Math.pow(2, 20).toInt) + } + } +} + +object FeatureHasherData { + + val input = Seq( + "Two households both alike in dignity".split(" ").toSeq, + "In fair Verona where we lay our scene".split(" ").toSeq, + "From ancient grudge break to new mutiny".split(" ").toSeq, + "Where civil blood makes civil hands unclean".split(" ").toSeq, + "From forth the fatal loins of these two foes".split(" ").toSeq + ) + + /* 2^30 features can't be tested right now because the implementation of Vector.equalsVector + performs an index wise comparison on the two vectors, which takes approx. forever */ + val numFeaturesTest = Seq(Math.pow(2, 4).toInt, Math.pow(2, 5).toInt, 1234, + Math.pow(2, 16).toInt, Math.pow(2, 20).toInt) //, Math.pow(2, 30).toInt) + + val expectedResults = List( + 16 -> List( + SparseVector.fromCOO(16, Map((0, 1.0), (1, 1.0), (2, -1.0), (14, -1.0))), --- End diff -- you can use arrow notation for the `pair` entries in the `Map` constructor, e.g., ```scala Map(0 -> 1.0, 1 -> 1.0, ...) ``` > Add FeatureHasher to machine learning library > --------------------------------------------- > > Key: FLINK-1735 > URL: https://issues.apache.org/jira/browse/FLINK-1735 > Project: Flink > Issue Type: New Feature > Components: Machine Learning Library > Reporter: Till Rohrmann > Assignee: Felix Neutatz > Labels: ML > > Using the hashing trick [1,2] is a common way to vectorize arbitrary feature > values. The hash of the feature value is used to calculate its index for a > vector entry. In order to mitigate possible collisions, a second hashing > function is used to calculate the sign for the update value which is added to > the vector entry. This way, it is likely that collision will simply cancel > out. 
> A feature hasher would also be helpful for NLP problems where it could be > used to vectorize bag-of-words or n-gram feature vectors. > Resources: > [1] [https://en.wikipedia.org/wiki/Feature_hashing] > [2] > [http://scikit-learn.org/stable/modules/feature_extraction.html#feature-extraction] -- This message was sent by Atlassian JIRA (v6.3.4#6332)