为什么使用spark.ml

sklearn只能单机计算，适合小数据的方法验证
spark有集群模式，适合大型数据

spark.ml基础

数据格式：spark的DataFrame（与pandas的DataFrame区分）
Transformer：可以将一个DataFrame变换成另一个DataFrame，常用于数据前处理；训练好的模型（Model）也是一种Transformer。
Estimator：是一个学习算法，对一个DataFrame进行fit后得到一个Transformer（即训练好的模型），再用该模型对test数据进行transform并验证。

spark.ml机器学习流程

源数据ETL
数据预处理：如从pandas的DataFrame到Spark的DataFrame;将字符型特征转化为数值
特征提取
模型的训练和验证

spark.ml分类实战代码

1.导入数据，并由pd.DataFrame转化为Spark.DataFrame

from pyspark.sql import SparkSession
import pandas as pd
from pyspark.ml.linalg import Vector,Vectors
from pyspark.sql import Row

# Create the SparkSession used below. getOrCreate() hands back the session
# that already exists (e.g. inside the pyspark shell) instead of building a
# second one, so this is safe in both a shell and a standalone script.
spark = SparkSession.builder.appName("iris classification").getOrCreate()

# Read the comma-separated file with pandas. The first line is skipped and no
# header row is parsed, so columns are addressed by position.
pd_df = pd.read_table('/home/iris.txt', header=None, sep=',', skiprows=1)

# Column 0 is skipped, the last column is the label and every column in
# between is a numeric feature. For the 6-column layout this file is expected
# to have, the last column is index 5.
label_col = pd_df.shape[1] - 1
m = [
    {
        'features': Vectors.dense([float(value) for value in record[1:label_col]]),
        'label': str(record[label_col]),
    }
    for record in pd_df.itertuples(index=False, name=None)
]

# Build the Spark DataFrame with a "features" vector column and a string
# "label" column.
data = spark.createDataFrame(m)

# Register the DataFrame as a temporary view and read it back through SQL.
data.createOrReplaceTempView("iris")
df = spark.sql("select * from iris")

2.数据预处理

# These feature transformers are not imported earlier in the walkthrough, so
# bring them into scope before they are used.
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer

# Index the string label column into a numeric "indexedLabel" column.
labelIndexer = (
    StringIndexer()
    .setInputCol("label")
    .setOutputCol("indexedLabel")
    .fit(df)
)

# Index the feature vector into "indexedFeatures"; maxCategories is set to 4.
featureIndexer = (
    VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(4)
    .fit(df)
)

# labelConverter turns the numeric predictions back into the original string
# labels, using the label list learned by labelIndexer.
labelConverter = (
    IndexToString()
    .setInputCol("prediction")
    .setOutputCol("predictedLabel")
    .setLabels(labelIndexer.labels)
)

# Randomly split the data set into training (70%) and test (30%) parts.
trainingData, testData = data.randomSplit([0.7, 0.3])

3.构建pipeline

# Pipeline is used below but was not imported anywhere before this point.
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassificationModel,DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Configure the decision tree. Parameters can be set through setters, as here,
# or through a ParamMap; explainParams() lists the available parameters.
dtClassifier = (
    DecisionTreeClassifier()
    .setLabelCol("indexedLabel")
    .setFeaturesCol("indexedFeatures")
)

# Chain label indexing, feature indexing, the classifier and the
# index-to-label conversion into one pipeline.
pipelinedClassifier = Pipeline().setStages(
    [labelIndexer, featureIndexer, dtClassifier, labelConverter]
)

# Train the decision tree model on the training split.
modelClassifier = pipelinedClassifier.fit(trainingData)

# Predict on the held-out test split.
predictionsClassifier = modelClassifier.transform(testData)

# Show predicted label, true label and features for up to 20 rows.
predictionsClassifier.select("predictedLabel", "label", "features").show(20)

# Accuracy is computed on the indexed labels against the numeric predictions.
evaluatorClassifier = (
    MulticlassClassificationEvaluator()
    .setLabelCol("indexedLabel")
    .setPredictionCol("prediction")
    .setMetricName("accuracy")
)

accuracy = evaluatorClassifier.evaluate(predictionsClassifier)

print("Test Error = " + str(1.0 - accuracy))

spark.ml回归实战代码

import pandas as pd
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext, SparkSession

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vector,Vectors
# Load and parse the data file, converting it to a DataFrame.
# data = spark.read.format("libsvm").load("file:///home/liyiguo/spark/sample_libsvm_data.txt")

# Create a local SparkSession, or reuse the one that already exists.
spark = SparkSession.builder.master("local").appName("Word Count").getOrCreate()

# Read the tab-separated file with pandas. The first line is skipped and no
# header row is parsed, so columns are addressed by position.
pd_df = pd.read_table('/home/liyiguo/regressin_data.txt', header=None, sep='\t', skiprows=1)

# Column 0 is skipped, the last column is the numeric regression target and
# every column in between is a numeric feature.
label_col = pd_df.shape[1] - 1
m = [
    {
        'features': Vectors.dense([float(value) for value in record[1:label_col]]),
        'label': float(record[label_col]),
    }
    for record in pd_df.itertuples(index=False, name=None)
]

data = spark.createDataFrame(m)

# Rescale the features with MinMaxScaler into "scaledFeatures".
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="scaledFeatures", outputCol="indexedFeatures", maxCategories=4)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures")

# Chain scaler, indexer and forest in a Pipeline
pipeline = Pipeline(stages=[scaler, featureIndexer, rf])

# Train model.  This also fits the scaler and the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# The fitted forest is the LAST pipeline stage: the stages are
# [scaler, featureIndexer, rf], so index 1 would be the fitted VectorIndexer.
rfModel = model.stages[-1]
print(rfModel)  # summary only