TF-IDF vectorizer: inverse document frequency weighting code example
Example: Compute the Inverse Document Frequency
# Example: fit an IDF estimator on term-frequency vectors, apply the fitted
# model, override params at fit time, and round-trip both the estimator and
# the model through save/load.
#
# NOTE(review): `spark` (used to build the DataFrame) and `temp_path` (used as
# the save location) are not defined in this snippet; they must already exist
# in the calling environment (e.g. a doctest/test harness) -- confirm.
# NOTE: the bare expressions below display a value only in an interactive
# session or doctest; when run as a plain script their results are discarded.
from pyspark.ml.feature import IDF, IDFModel  # was missing: IDF/IDFModel were used unimported
from pyspark.ml.linalg import DenseVector

# Three rows, one column "tf", each holding a two-element dense vector.
df = spark.createDataFrame(
    [
        (DenseVector([1.0, 2.0]),),
        (DenseVector([0.0, 1.0]),),
        (DenseVector([3.0, 0.2]),),
    ],
    ["tf"],
)

# Estimator reading column "tf" and writing column "idf", with minDocFreq=3.
idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf")
model = idf.fit(df)

# Inspect the fitted IDF vector and the transformed first row.
model.idf
model.transform(df).head().idf

# Change the output column to "freqs", refit, and read the second row.
# NOTE(review): setParams is called on `idf` itself, so later uses of `idf`
# (including idf.save below) presumably see outputCol="freqs" -- confirm.
idf.setParams(outputCol="freqs").fit(df).transform(df).collect()[1].freqs

# Per-call param overrides passed to fit(): minDocFreq=1, outputCol="vector".
params = {idf.minDocFreq: 1, idf.outputCol: "vector"}
idf.fit(df, params).transform(df).head().vector

# Persist the estimator and check the reloaded copy has the same minDocFreq.
idfPath = temp_path + "/idf"
idf.save(idfPath)
loadedIDF = IDF.load(idfPath)
loadedIDF.getMinDocFreq() == idf.getMinDocFreq()

# Persist the fitted model and check the reloaded copy transforms the first
# row to the same "idf" value as the original model.
modelPath = temp_path + "/idf-model"
model.save(modelPath)
loadedModel = IDFModel.load(modelPath)
loadedModel.transform(df).head().idf == model.transform(df).head().idf