Question

我有这个火花Udf的余弦相似性。

def cosineSimilarity(df):
    """ Cosine similarity of the each document with other

    """

    from pyspark.sql.functions import udf
    from pyspark.sql.types import DoubleType
    from scipy.spatial import distance

    cosine = udf(lambda v1, v2: (
     float(1-distance.cosine(v1, v2)) if v1 is not None and v2 is not None else None),
     DoubleType())

    # Creating a cross product of the table to get the cosine similarity vectors 

    crosstabDF=df.withColumnRenamed('id','id_1').withColumnRenamed('w2v_vector','w2v_vector_1')\
    .join(df.withColumnRenamed('id','id_2').withColumnRenamed('w2v_vector','w2v_vector_2'))

    similardocsDF= crosstabDF.withColumn('cosinesim', cosine("w2v_vector_1","w2v_vector_1"))

    return similardocsDF

similardocsDF=cosineSimilarity(w2vdf.select('id','w2v_vector'))

similardocsDF.cache().take(4)

我的数据如下：

w2vdf.select('id','tokenised_text','w2v_vector').take(1)
#tfidfDf.take(1)


Out[18]:
[Row(id=-33753621, tokenised_text=[u'if', u'you', u'hate', u'dealing', u'with', u'bank', u'tellers', u'or', u'customer', u'service', u'representatives,', u'then', u'the', u'royal', u'bank', u'of', u'scotland', u'might', u'have', u'a', u'solution', u'for', u'you.if', u'this', u'program', u'is', u'successful,', u'it', u'could', u'be', u'a', u'big', u'step', u'forward', u'on', u'the', u'road', u'to', u'automated', u'customer', u'service', u'through', u'the', u'use', u'of', u'ai,', u'notes', u'laurie', u'beaver,', u'research', u'associate', u'for', u'bi', u'intelligence,', u'business', u"insider's", u'premium', u'research', u"service.it's", u'noteworthy', u'that', u'luvo', u'does', u'not', u'operate', u'via', u'a', u'third-party', u'app', u'such', u'as', u'facebook', u'messenger,', u'wechat,', u'or', u'kik,', u'all', u'of', u'which', u'are', u'currently', u'trying', u'to', u'create', u'bots', u'that', u'would', u'assist', u'in', u'customer', u'service', u'within', u'their', u'respective', u'platforms.luvo', u'would', u'be', u'available', u'through', u'the', u'web', u'and', u'through', u'smartphones.', u'it', u'would', u'also', u'use', u'machine', u'learning', u'to', u'learn', u'from', u'its', u'mistakes,', u'which', u'should', u'ultimately', u'help', u'with', u'its', u'response', u'accuracy.down', u'the', u'road,', u'luvo', u'would', u'become', u'a', u'supplement', u'to', u'the', u'human', u'staff.', u'it', u'can', u'currently', u'answer', u'20', u'set', u'questions', u'but', u'as', u'that', u'number', u'grows,', u'it', u'would', u'allow', u'the', u'human', u'employees', u'to', u'more', u'complicated', u'issues.', u'if', u'a', u'problem', u'is', u'beyond', u"luvo's", u'comprehension,', u'then', u'it', u'would', u'refer', u'the', u'customer', u'to', u'a', u'bank', u'employee;', u'however,\xa0a', u'user', u'could', u'choose', u'to', u'speak', u'with', u'a', u'human', u'instead', u'of', u'luvo', u'anyway.ai', u'such', u'as', u'luvo,', u'if', u'successful,', u'could', u'help', u'businesses', u'become', u'more', u'efficient', u'and', u'increase', u'their', u'productivity,', u'while', u'simultaneously', u'improving', u'customer', u'service', u'capacity,', u'which', u'would', u'consequently\xa0save', u'money', u'that', u'would', u'otherwise', u'go', u'toward', u'manpower.and', u'this', u'trend', u'is', u'already', u'starting.', u'google,', u'microsoft,', u'and', u'ibm', u'are', u'investing', u'significantly', u'into', u'ai', u'research.', u'furthermore,', u'the', u'global', u'ai', u'market', u'is', u'estimated', u'to', u'grow', u'from', u'approximately', u'$420', u'million', u'in', u'2014', u'to', u'$5.05', u'billion', u'in', u'2020,', u'according', u'to', u'a', u'forecast', u'by', u'research', u'and', u'markets.\xa0the', u'move', u'toward', u'ai', u'would', u'be', u'just', u'one', u'more', u'way', u'in', u'which', u'the', u'digital', u'age', u'is', u'disrupting', u'retail', u'banking.', u'customers,', u'particularly', u'millennials,', u'are', u'increasingly', u'moving', u'toward', u'digital', u'banking,', u'and', u'as', u'a', u'result,', u"they're", u'walking', u'into', u'their', u"banks'", u'traditional', u'brick-and-mortar', u'branches', u'less', u'often', u'than', u'ever', u'before.'], w2v_vector=DenseVector([-0.0394, -0.0388, 0.0368, -0.0455, 0.0602, -0.0734, 0.0515, -0.0064, -0.068, -0.0438, 0.0671, 0.007, -0.0227, -0.0393, -0.0254, -0.024, 0.0115, 0.0415, -0.0116, -0.0169, 0.0545, -0.0439, 0.0414, 0.0312, -0.028, -0.0085, 0.0234, -0.1321, -0.0364, 0.0921, 0.0208, 0.0156, 0.0071, 0.0186, -0.0455, -0.0634, 0.0379, 0.0148, 0.0401, -0.0395, 0.0334, 0.0026, -0.0748, -0.0242, -0.0373, 0.0602, -0.0341, -0.0181, 0.0723, 0.0012, -0.1177, 0.0319, 0.0322, -0.1054, -0.0011, -0.0415, -0.0161, -0.0472, -0.0785, -0.0219, -0.0311, 0.0296, -0.0149, 0.04, 0.0001, 0.0337, 0.0841, -0.0344, -0.0171, 0.0425, -0.0122, 0.0838, 0.034, 0.0054, 0.0171, 0.0209, 0.0286, -0.0227, -0.0147, 0.0532, -0.027, -0.0645, -0.0858, -0.1444, 0.0824, 0.0128, -0.0485, -0.0378, -0.0229, 0.0331, -0.0248, 0.0427, -0.0624, -0.0324, -0.0271, 0.0135, 0.0504, 0.0028, -0.0772, 0.0121, -0.09, 0.031, -0.0771, -0.0703, 0.0947, 0.0997, -0.0084, 0.0774, 0.0281, 0.0405, -0.0475, 0.0217, 0.0591, 0.0241, -0.0287, 0.1064, 0.059, -0.06, 0.0422, 0.0908, 0.0341, 0.028, -0.0334, 0.0065, -0.0289, -0.0851, -0.0208, 0.0598, -0.0218, 0.001, 0.0049, 0.0257, 0.0076, -0.0599, 0.006, -0.0494, -0.0081, 0.0066, 0.0131, -0.0299, 0.0159, -0.0383, 0.0402, -0.0571, 0.0359, 0.0009, 0.0404, -0.0207, 0.0044, -0.0089, 0.0306, -0.0405, -0.0012, 0.0159, -0.005, -0.031, -0.0016, -0.0081, 0.0123, -0.0364, 0.0161, -0.0383, -0.0303, -0.0073, -0.0184, 0.0399, 0.0412, 0.0278, 0.0455, -0.0304, 0.0145, -0.0163, 0.0631, -0.0423, 0.0239, 0.0801, -0.0659, -0.0382, 0.0138, 0.051, 0.0056, -0.1605, 0.0018, 0.0077, -0.0076, 0.0119, 0.0397, -0.0823, -0.0462, 0.0465, 0.0735, 0.0283, -0.0205, -0.012, 0.0662, 0.0429, 0.0089, -0.0562, 0.1624, 0.0192, 0.0098, -0.0483, 0.0248, 0.0005, -0.0619, -0.0115, 0.0424, -0.0875, 0.0383, -0.0463, -0.0044, -0.0218, 0.014, -0.0404, -0.0198, -0.0162, -0.018, -0.0377, -0.0291, -0.0273, -0.0713, -0.0047, 0.0263, 0.0809, -0.0477, 0.0056, -0.0563, -0.061, -0.0185, 0.0223, -0.0718, 0.0163, 0.0061, -0.0716, -0.0081, 0.0079, 0.0156, -0.0124, -0.0223, -0.0092, -0.0621, 0.0033, 0.031, 0.0509, -0.0548, -0.0121, -0.0276, 0.0176, -0.04, 0.0382, -0.0737, 0.0202, -0.0314, -0.0702, 0.0685, -0.0928, 0.0698, -0.0484, 0.0541, -0.0539, 0.0895, 0.0076, -0.0134, -0.0116, 0.0227, -0.0361, -0.0729, -0.0068, -0.0501, 0.0137, -0.0134, 0.0039, -0.0463, 0.0289, -0.0336, -0.0731, -0.0362, -0.0195, 0.0466, -0.0132, 0.0336, 0.0108, 0.0219, -0.0702, -0.0117, -0.0285, 0.0644, -0.0806, 0.002, -0.0603, 0.0365, 0.0333, 0.0197, -0.037, 0.0983, 0.0011, 0.0436, 0.0506, -0.0089, -0.0134]))]

但是当我运行上面的代码时，它仍然运行了2个小时仍然没有完成。我在Mac上安装的本地火花上运行它。总观察值为59K，文本doc的w2vec矢量嵌入为300维。

Answer 1

总结对这个问题的评论：你计算相似性的方法是非常低效的（它的实际蛮力）。

您应该考虑使用其他算法，例如DIMSUM： https://databricks.com/blog/2014/10/20/efficient-similarity-algorithm-now-in-spark-twitter.html

自2.0.0起，它在pyspark中可用：http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.RowMatrix.columnSimilarities

Spark Udf需要时间来运行

1 个答案: