环境准备
import pandas as pd
from pyspark.sql import SparkSession
# Start a Spark session, or reuse the one that is already running.
spark = SparkSession.builder.getOrCreate()

# Read both CSV files with pandas. solTrainY is used as the label and
# solTrainXtrans as the predictors further down in this script.
solTrainY = pd.read_csv('Documents/solTrainY.csv')
solTrainXtrans = pd.read_csv('Documents/solTrainXtrans.csv')

# Glue the two frames together column-wise (solTrainY first), then hand the
# combined pandas frame to Spark.
df = spark.createDataFrame(
    pd.concat([solTrainY, solTrainXtrans], axis="columns")
)
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor,RandomForestRegressor,GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# Treat every column except the first as a predictor. The first column is the
# one contributed by solTrainY, which was concatenated ahead of the predictors
# (assumes solTrainY holds exactly one column -- TODO confirm).
new_col = df.columns[1:]

# Pack the predictor columns into a single vector column named "features".
vecAssembler = VectorAssembler(
    inputCols=new_col,
    outputCol="features",
)
在 spark-ml 中,回归树相当于只有一棵树的随机森林。由于笔记本内存有限,这里只调优了树的深度(maxDepth)和直方图箱数(maxBins)两个参数。
# Single regression tree: label column 'x', assembled vector column 'features'.
rt = DecisionTreeRegressor(labelCol='x', featuresCol='features')

# Assemble the feature vector first, then fit the tree.
pipeline = Pipeline(stages=[vecAssembler, rt])

# Candidate settings: 4 tree depths x 4 bin counts = 16 combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rt.maxDepth, [2, 3, 4, 5])
    .addGrid(rt.maxBins, [8, 16, 32, 64])
    .build()
)

# Cross-validate the whole pipeline over the grid, scoring each candidate
# against label column 'x' with RegressionEvaluator's default metric.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(labelCol='x'),
    numFolds=3,  # use 3+ folds in practice
)
cvModel = crossval.fit(df)

# Cross-validated average metric, one value per entry of paramGrid
# (shown as cell output when run in a notebook).
cvModel.avgMetrics
[1.375145526899881,
1.3742227339596311,
1.3742227339596311,
1.3742227339596311,
1.145979683867352,
1.207687674896534,
1.199449418675475,
1.1973652530565928,
1.0414651087721725,
1.0787561062855386,
1.058464583664638,
1.0506277015788559,
0.9994746563115181,
1.0270945545741466,
0.9893333399004255,
0.9838269753761816]
随机森林,只调优了树的数目和采样率
# Random forest regressor: label column 'x', assembled vector column 'features'.
rf = RandomForestRegressor(labelCol='x', featuresCol='features')

# Assemble the feature vector first, then fit the forest.
pipeline = Pipeline(stages=[vecAssembler, rf])

# Candidate settings: 2 forest sizes x 2 subsampling rates = 4 combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20])
    .addGrid(rf.subsamplingRate, [0.8, 1.0])
    .build()
)

# Cross-validate the whole pipeline over the grid, scoring each candidate
# against label column 'x' with RegressionEvaluator's default metric.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(labelCol='x'),
    numFolds=3,  # use 3+ folds in practice
)
cvModel = crossval.fit(df)

# Cross-validated average metric, one value per entry of paramGrid
# (shown as cell output when run in a notebook).
cvModel.avgMetrics
[0.8366883165688397, 0.854227893368852, 0.822243917232207, 0.8174713058234282]
梯度提升树。参数非常多,个人经验是树的深度可以浅一些。
# Gradient-boosted trees: shallow trees (depth 3) and 100 boosting iterations,
# label column 'x', assembled vector column 'features'.
gb = GBTRegressor(featuresCol='features', labelCol='x', maxDepth=3, maxIter=100)

# Assemble the feature vector first, then fit the boosted model.
pipeline = Pipeline(stages=[vecAssembler, gb])

# Single-point grid that sets the subsampling rate to 0.8.
# The Param must be the one owned by `gb`: a Param taken from a different
# estimator (the random forest's) does not belong to any stage of this
# pipeline, so the 0.8 setting would not be applied to the boosted model.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gb.subsamplingRate, [0.8])
    .build()
)

# Cross-validate the whole pipeline over the grid, scoring each candidate
# against label column 'x' with RegressionEvaluator's default metric.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(labelCol='x'),
    numFolds=3,  # use 3+ folds in practice
)
cvModel = crossval.fit(df)

# Cross-validated average metric, one value per entry of paramGrid
# (shown as cell output when run in a notebook).
cvModel.avgMetrics
[0.7461370059969544]
后面可以看一看源码。
Original: https://blog.csdn.net/littlehuangnan/article/details/126668375
Author: littlehuangnan
Title: 用pyspark学习《应用预测建模》(七)回归树、随机森林、梯度提升
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/634665/
转载文章受原作者版权保护。转载请注明原作者出处!