Example 1: Gradient-Boosted Trees (GBTs) learning algorithm for regression
# Train a GBTRegressor on a two-row toy dataset, query the fitted model, and
# round-trip both the estimator and the model through save/load.
# NOTE(review): `spark` and `temp_path` are not defined in this snippet; it
# presumably expects an active SparkSession and a writable path prefix to be
# supplied by the surrounding environment -- confirm before running standalone.
from numpy import allclose
from pyspark.ml.linalg import Vectors
# The estimator and model classes were used below without being imported.
from pyspark.ml.regression import GBTRegressionModel, GBTRegressor

# Two labelled rows: a dense single-feature vector and a size-1 sparse vector
# with no stored entries.
df = spark.createDataFrame(
    [
        (1.0, Vectors.dense(1.0)),
        (0.0, Vectors.sparse(1, [], [])),
    ],
    ["label", "features"],
)
gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
print(gbt.getImpurity())
model = gbt.fit(df)

# The bare expressions below are meant for interactive inspection; run as a
# plain script their values are evaluated and discarded.
model.featureImportances
model.numFeatures
# Five weights are compared, matching maxIter=5 above.
allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1])

# Predict on a dense and on a sparse single-row DataFrame.
test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
model.transform(test0).head().prediction
test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
model.transform(test1).head().prediction

# Persist the (unfitted) estimator and reload it.
gbtr_path = temp_path + "gbtr"
gbt.save(gbtr_path)
gbt2 = GBTRegressor.load(gbtr_path)
gbt2.getMaxDepth()

# Persist the fitted model, reload it, and compare it with the in-memory one.
model_path = temp_path + "gbtr_model"
model.save(model_path)
model2 = GBTRegressionModel.load(model_path)
model.featureImportances == model2.featureImportances
model.treeWeights == model2.treeWeights
model.trees
Example 2: Gradient-Boosted Trees (GBTs) learning algorithm for classification
# Train a GBTClassifier on a two-row toy dataset (labels indexed with
# StringIndexer), query the fitted model, and round-trip both the estimator
# and the model through save/load.
# NOTE(review): `spark` and `temp_path` are not defined in this snippet; it
# presumably expects an active SparkSession and a writable path prefix to be
# supplied by the surrounding environment -- confirm before running standalone.
from numpy import allclose
# The estimator and model classes were used below without being imported.
from pyspark.ml.classification import GBTClassificationModel, GBTClassifier
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors

# Two labelled rows: a dense single-feature vector and a size-1 sparse vector
# with no stored entries.
df = spark.createDataFrame(
    [
        (1.0, Vectors.dense(1.0)),
        (0.0, Vectors.sparse(1, [], [])),
    ],
    ["label", "features"],
)

# Index the raw label column; the classifier is trained on "indexed".
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(df)
td = si_model.transform(df)
gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
model = gbt.fit(td)

# The bare expressions below are meant for interactive inspection; run as a
# plain script their values are evaluated and discarded.
# Attribute name fixed: it is `featureImportances` (as used further down),
# not `featuresImportances`.
model.featureImportances
# Expected value: SparseVector(1, {0: 1.0})
# Five weights are compared, matching maxIter=5 above; the previous
# four-element list could not be compared element-wise.
allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1])

# Predict on a dense and on a sparse single-row DataFrame. Each row must be
# a one-element tuple so that it maps onto the single "features" column.
test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
model.transform(test0).head().prediction
test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
model.transform(test1).head().prediction
model.totalNumNodes
print(model.toDebugString)

# Persist the (unfitted) estimator and reload it.
gbtc_path = temp_path + "gbtc"
gbt.save(gbtc_path)
gbt2 = GBTClassifier.load(gbtc_path)
gbt2.getMaxDepth()

# Persist the fitted model, reload it, and compare it with the in-memory one.
model_path = temp_path + "gbtc_model"
model.save(model_path)
model2 = GBTClassificationModel.load(model_path)
model.featureImportances == model2.featureImportances
model.treeWeights == model2.treeWeights
model.trees