res = r
>
> metric = dp
>
> }
>
> }
>
> return res.getAs[String]("_c1")
>
> }
>
>
> def cosineSimilarity(vectorA: Vector, vectorB: Vector) = {
>
> var dotProduct = 0.0
>
> var normA = 0.0
>
> var normB = 0.0
>>
>> val dp = dorProduct(tfIdfSrc, tfIdfDst)
>>
>> if (dp > metric) {
>>
>> res = r
>>
>> metric = dp
>>
>> }
>>
>> }
>>
>> return res.getAs[String]("_c1")
&
* vectorB(i)
>
> normA += Math.pow(vectorA(i), 2)
>
> normB += Math.pow(vectorB(i), 2)
>
> }
>
> (dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)))
>
> }
>
>
> def dorProduct(vectorA: Vector, vectorB: Vector) = {
>
> var dp = 0.
tProduct / (Math.sqrt(normA) * Math.sqrt(normB)))
}
/**
 * Dot product of two vectors: the sum of `vectorA(i) * vectorB(i)` over every
 * index of `vectorA`, accumulated in ascending index order.
 *
 * Only `vectorA.size` is consulted, so `vectorB` is read at exactly those indices.
 * NOTE(review): `Vector` is presumably Spark's ML vector type — confirm; what
 * happens when `vectorB` is shorter than `vectorA` depends on that type's `apply`.
 *
 * @param vectorA left operand; its size determines how many components are summed
 * @param vectorB right operand, read at the same indices as `vectorA`
 * @return the accumulated sum, `0.0` when `vectorA` has no elements
 */
def dorProduct(vectorA: Vector, vectorB: Vector) = {
  (0 until vectorA.size).foldLeft(0.0) { (acc, i) =>
    acc + vectorA(i) * vectorB(i)
  }
}
On Sun, Nov 13, 2016 at 7:04 PM, Meeraj Kunnumpur wrote:
Hello,
I have a dataset containing TF-IDF vectors for a corpus of documents. How
do I perform a nearest neighbour search on the dataset, using cosine
similarity?
val df = spark.read.option("header", "false").csv("data")
val tk = new Tokenizer().setInput