TF-IDF calculator code example

Example: Compute the Inverse Document Frequency

# Compute the Inverse Document Frequency (IDF)
#
# Fits a pyspark.ml IDF estimator on a three-row DataFrame of term-frequency
# vectors, applies the fitted model, and then round-trips both the estimator
# and the fitted model through save/load. The trailing comment on each
# print() shows the expected output (the repr of the value).

import tempfile

from pyspark.ml.feature import IDF, IDFModel
from pyspark.ml.linalg import DenseVector
from pyspark.sql import SparkSession

# Reuse the active session (e.g. the one the pyspark shell provides) or
# start a new one, so the example also runs as a standalone script.
spark = SparkSession.builder.getOrCreate()

# One row per document; the single "tf" column holds the term-frequency vector.
df = spark.createDataFrame(
    [
        (DenseVector([1.0, 2.0]),),
        (DenseVector([0.0, 1.0]),),
        (DenseVector([3.0, 0.2]),),
    ],
    ["tf"],
)

# With minDocFreq=3 the first term (non-zero in only two documents) is
# filtered out, and the second term occurs in every document, so both IDF
# weights come out as 0.
idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf")
model = idf.fit(df)
print(repr(model.idf))  # DenseVector([0.0, 0.0])
print(repr(model.transform(df).head().idf))  # DenseVector([0.0, 0.0])

# setParams mutates the estimator in place: every later fit on `idf` writes
# to "freqs" unless overridden. The already-fitted `model` keeps "idf".
idf.setParams(outputCol="freqs")
freqs_rows = idf.fit(df).transform(df).collect()
print(repr(freqs_rows[1].freqs))  # DenseVector([0.0, 0.0])

# A param map passed to fit() overrides the estimator's params for that one
# fit only; lowering minDocFreq to 1 lets the first term keep its weight.
params = {idf.minDocFreq: 1, idf.outputCol: "vector"}
print(repr(idf.fit(df, params).transform(df).head().vector))  # DenseVector([0.2877, 0.0])

# Persistence round-trip. The directory is fresh, so the save targets do not
# exist yet, and it is removed again when the block exits.
# NOTE(review): assumes Spark resolves this path on the local filesystem
# (local mode) - on a cluster, use a path on shared storage instead.
with tempfile.TemporaryDirectory() as temp_path:
    idfPath = temp_path + "/idf"
    idf.save(idfPath)
    loadedIDF = IDF.load(idfPath)
    print(loadedIDF.getMinDocFreq() == idf.getMinDocFreq())  # True

    modelPath = temp_path + "/idf-model"
    model.save(modelPath)
    loadedModel = IDFModel.load(modelPath)
    print(
        loadedModel.transform(df).head().idf == model.transform(df).head().idf
    )  # True