Topic modeling: Python code example

Example: Latent Dirichlet Allocation (LDA), a topic model designed for text documents

# Latent Dirichlet Allocation (LDA), a topic model designed for text documents

import tempfile

from pyspark.ml.clustering import LDA, DistributedLDAModel, LocalLDAModel
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.sql import SparkSession
# Reuse the active session (e.g. the `spark` object the PySpark shell
# provides) or start a new one, so the example also runs as a plain script.
spark = SparkSession.builder.getOrCreate()

# Two tiny documents over a two-term vocabulary. Dense and sparse vectors can
# be mixed in the same "features" column.
df = spark.createDataFrame(
    [
        [1, Vectors.dense([0.0, 1.0])],
        [2, SparseVector(2, {0: 1.0})],
    ],
    ["id", "features"],
)

# Fit a 2-topic model with a fixed seed. With optimizer="em" the fitted model
# is distributed (see the isDistributed() result below).
lda = LDA(k=2, seed=1, optimizer="em")
model = lda.fit(df)

# The trailing comments show the value each expression echoes when the
# example is run interactively (PySpark shell / notebook).
model.isDistributed()
# True

# Convert the distributed model into a local one.
localModel = model.toLocal()
localModel.isDistributed()
# False

model.vocabSize()
# 2

# Per-topic term indices with their weights.
model.describeTopics().show()
# +-----+-----------+--------------------+
# |topic|termIndices|         termWeights|
# +-----+-----------+--------------------+
# |    0|     [1, 0]|[0.50401530077160...|
# |    1|     [0, 1]|[0.50401530077160...|
# +-----+-----------+--------------------+
# ...

model.topicsMatrix()
# DenseMatrix(2, 2, [0.496, 0.504, 0.504, 0.496], 0)

# Persistence round-trips for the estimator and both model flavours.
# Everything is written under a throwaway directory that is removed when the
# `with` block exits, so use the re-loaded objects inside the block.
with tempfile.TemporaryDirectory() as temp_path:
    # Estimator (unfitted LDA with its params).
    lda_path = temp_path + "/lda"
    lda.save(lda_path)
    sameLDA = LDA.load(lda_path)

    # Distributed model must be re-loaded through DistributedLDAModel.
    distributed_model_path = temp_path + "/lda_distributed_model"
    model.save(distributed_model_path)
    sameModel = DistributedLDAModel.load(distributed_model_path)

    # Local model must be re-loaded through LocalLDAModel.
    local_model_path = temp_path + "/lda_local_model"
    localModel.save(local_model_path)
    sameLocalModel = LocalLDAModel.load(local_model_path)