[ 
https://issues.apache.org/jira/browse/FLINK-1735?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14537318#comment-14537318
 ] 

ASF GitHub Bot commented on FLINK-1735:
---------------------------------------

Github user aalexandrov commented on a diff in the pull request:

    https://github.com/apache/flink/pull/665#discussion_r30004988
  
    --- Diff: 
flink-staging/flink-ml/src/test/scala/org/apache/flink/ml/feature/extraction/FeatureHasherSuite.scala
 ---
    @@ -0,0 +1,245 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.flink.ml.feature.extraction
    +
    +import org.apache.flink.api.scala.{ExecutionEnvironment, _}
    +import org.apache.flink.ml.math.SparseVector
    +import org.apache.flink.test.util.FlinkTestBase
    +import org.scalatest.{FlatSpec, Matchers}
    +
    +class FeatureHasherSuite
    +  extends FlatSpec
    +  with Matchers
    +  with FlinkTestBase {
    +
    +  behavior of "Flink's Feature Hasher"
    +
    +  import FeatureHasherData._
    +
    +  it should "transform a sequence of strings into a sparse feature vector 
of given size" in {
    +    val env = ExecutionEnvironment.getExecutionEnvironment
    +
    +    env.setParallelism(2)
    +
    +    for (numFeatures <- numFeaturesTest) {
    +      val inputDS = env.fromCollection(input)
    +
    +      val transformer = FeatureHasher()
    +        .setNumFeatures(numFeatures)
    +
    +      val transformedDS = transformer.transform(inputDS)
    +      val results = transformedDS.collect()
    +
    +      for ((result, expectedResult) <- results zip 
expectedResults(numFeatures)) {
    +        result.equalsVector(expectedResult) should be(true)
    +      }
    +    }
    +  }
    +
    +  it should "transform a sequence of strings into a sparse feature vector 
of given size," +
    +    "with non negative entries" in {
    +    val env = ExecutionEnvironment.getExecutionEnvironment
    +
    +    env.setParallelism(2)
    +
    +    for (numFeatures <- numFeaturesTest) {
    +      val inputDS = env.fromCollection(input)
    +
    +      val transformer = FeatureHasher()
    +        .setNumFeatures(numFeatures).setNonNegative(true)
    +
    +      val transformedDS = transformer.transform(inputDS)
    +      val results = transformedDS.collect()
    +
    +      for ((result, expectedResult) <- results zip 
expectedResultsNonNegative(numFeatures)) {
    +        result.equalsVector(expectedResult) should be(true)
    +      }
    +    }
    +  }
    +
    +  it should "transform a sequence of strings into a sparse feature vector 
of default size," +
    +    " when parameter is less than 1" in {
    +    val env = ExecutionEnvironment.getExecutionEnvironment
    +
    +    env.setParallelism(2)
    +
    +    val inputDS = env.fromCollection(input)
    +
    +    val numFeatures = 0
    +
    +    val transformer = FeatureHasher()
    +      .setNumFeatures(numFeatures).setNonNegative(false)
    +
    +    val transformedDS = transformer.transform(inputDS)
    +    val results = transformedDS.collect()
    +
    +    for (result <- results) {
    +      result.size should equal(Math.pow(2, 20).toInt)
    +    }
    +  }
    +}
    +
    +object FeatureHasherData {
    +
    +  val input = Seq(
    +    "Two households both alike in dignity".split(" ").toSeq,
    +    "In fair Verona where we lay our scene".split(" ").toSeq,
    +    "From ancient grudge break to new mutiny".split(" ").toSeq,
    +    "Where civil blood makes civil hands unclean".split(" ").toSeq,
    +    "From forth the fatal loins of these two foes".split(" ").toSeq
    +  )
    +
    +  /* 2^30 features can't be tested right now because the implementation of 
Vector.equalsVector
    +  performs an index wise comparison on the two vectors, which takes 
approx. forever */
    +  val numFeaturesTest = Seq(Math.pow(2, 4).toInt, Math.pow(2, 5).toInt, 
1234,
    +    Math.pow(2, 16).toInt, Math.pow(2, 20).toInt) //, Math.pow(2, 
30).toInt)
    +
    +  val expectedResults = List(
    +    16 -> List(
    +      SparseVector.fromCOO(16, Map((0, 1.0), (1, 1.0), (2, -1.0), (14, 
-1.0))),
    --- End diff --
    
    You can use arrow notation (`key -> value`) for the pair entries in the `Map` constructor, 
e.g.,
    
    ```scala
    Map(0 -> 1.0, 1 -> 1.0, ...)
    ``` 


> Add FeatureHasher to machine learning library
> ---------------------------------------------
>
>                 Key: FLINK-1735
>                 URL: https://issues.apache.org/jira/browse/FLINK-1735
>             Project: Flink
>          Issue Type: New Feature
>          Components: Machine Learning Library
>            Reporter: Till Rohrmann
>            Assignee: Felix Neutatz
>              Labels: ML
>
> Using the hashing trick [1,2] is a common way to vectorize arbitrary feature 
> values. The hash of the feature value is used to calculate its index for a 
> vector entry. In order to mitigate possible collisions, a second hashing 
> function is used to calculate the sign for the update value which is added to 
> the vector entry. This way, it is likely that collisions will simply cancel 
> out.
> A feature hasher would also be helpful for NLP problems where it could be 
> used to vectorize bag-of-words or n-gram feature vectors.
> Resources:
> [1] [https://en.wikipedia.org/wiki/Feature_hashing]
> [2] 
> [http://scikit-learn.org/stable/modules/feature_extraction.html#feature-extraction]



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to