python数据分析之分类模型与回归模型-第七次笔记
2021/7/13 20:36:06
本文主要是介绍python数据分析之分类模型与回归模型-第七次笔记,对大家解决编程问题具有一定的参考价值,需要的程序猿们随着小编来一起学习吧!
python数据分析之分类模型与回归模型-第七次笔记
1.分类模型
- 1.1 KNN 算法
- 1.2 朴素贝叶斯算法
- 1.3 支持向量机 SVM 算法
- 1.4 集成方法—随机森林算法
- 1.5 集成方法—Adaboost 算法
- 1.6 决策树
2.回归模型
- 2.1 线性回归
- 2.2 岭回归
- 2.3 Lasso 回归
- 2.4 逻辑回归
- 2.5 人工神经网络
- 2.6 GBDT,回归树和提升树
提取数据
# Split the data into training / validation / test sets at a 6:2:2 ratio.
from sklearn.model_selection import train_test_split

f_v = features.values
f_names = features.columns.values
l_v = label.values

# First hold out 20% of all rows as the validation set ...
X_tt, X_validation, Y_tt, Y_validation = train_test_split(
    f_v, l_v, test_size=0.2
)
# ... then take 25% of the remaining 80% (20% of the total) as the test set,
# which leaves 60% of the rows for training.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_tt, Y_tt, test_size=0.25
)
1.分类模型
1.1KNN 算法
# ---- 1.1 KNN ----
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

# Registry of (display name, estimator) pairs. It has to exist before the
# first append; every classifier snippet adds its candidates to it.
models = []
models.append(("KNN", KNeighborsClassifier(n_neighbors=3)))


# ---- 1.2 Naive Bayes ----
from sklearn.naive_bayes import GaussianNB, BernoulliNB

models.append(("GaussianNB", GaussianNB()))
models.append(("BernoulliNB", BernoulliNB()))


# ---- 1.3 Support vector machine (SVM) ----
from sklearn.svm import SVC

# Author's note: the C parameter controls the fitting precision.
models.append(("SVM Classifier", SVC(C=1000)))


# ---- 1.4 Ensemble method: random forest ----
from sklearn.ensemble import RandomForestClassifier

# Random forest with library defaults.
models.append(("OriginalRandomForest", RandomForestClassifier()))
# Random forest with 11 trees and max_features=None.
models.append(
    ("RandomForest", RandomForestClassifier(n_estimators=11, max_features=None))
)


# ---- 1.5 Ensemble method: AdaBoost ----
from sklearn.ensemble import AdaBoostClassifier

# Alternative configuration noted by the author:
# base_estimator=SVC(), n_estimators=100, algorithm="SAMME".
models.append(("Adaboost", AdaBoostClassifier(n_estimators=100)))


# ---- 1.6 Decision tree ----
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Author's note: min_impurity_split=0.1 (minimum impurity required to split)
# can be used as a pruning control.
# Decision tree with the default (Gini impurity) criterion.
models.append(("DecisionTreeGini", DecisionTreeClassifier()))
# Decision tree with the entropy criterion.
models.append(("DecisionTreeEntropy", DecisionTreeClassifier(criterion="entropy")))


# ==== 2. Regression models ====

# ---- 2.1 Linear regression ----
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Ordinary least squares variant, left disabled in the original notes:
# regr = LinearRegression()


# ---- 2.2 Ridge regression ----
regr = Ridge(alpha=1)


# ---- 2.3 Lasso regression ----
regr = Lasso(alpha=0.001)
2.4逻辑回归
# Author's note: logistic regression is also a kind of linear regression.
# The estimator must be imported before it can be registered.
from sklearn.linear_model import LogisticRegression

models.append(
    (
        "LogisticRegression",
        LogisticRegression(C=1000, tol=1e-10, solver="sag", max_iter=10000),
    )
)
2.5人工神经网络
# Artificial neural network built with Keras.
import matplotlib.pyplot as plt
import numpy as np
# Sequential is the container that stacks the layers.
from keras.models import Sequential
# Dense is a fully connected layer; Activation applies an activation function.
from keras.layers import Dense, Activation
from sklearn.metrics import roc_curve, auc, roc_auc_score

mdl = Sequential()
# First layer: 50 units; input_dim is the number of feature columns.
mdl.add(Dense(50, input_dim=len(f_v[0])))
mdl.add(Activation("sigmoid"))
# Output layer: 2 units because the label has two classes.
mdl.add(Dense(2))
mdl.add(Activation("softmax"))

# `loss` is the objective being minimised and `optimizer` the update rule.
# Adam is used here; to train with plain stochastic gradient descent, pass a
# keras.optimizers.SGD instance as `optimizer` instead.
mdl.compile(loss="mean_squared_error", optimizer="adam")

# One-hot encode the labels: class 1 -> [0, 1], anything else -> [1, 0].
one_hot_labels = np.array([[0, 1] if y == 1 else [1, 0] for y in Y_train])
# epochs = number of passes over the data; batch_size = samples per update.
mdl.fit(X_train, one_hot_labels, epochs=1000, batch_size=8999)

xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]
f = plt.figure()
for i, (X_part, Y_part) in enumerate(xy_lst):
    # predict() returns one score per output unit, not a class label.
    Y_pred = mdl.predict(X_part)
    print(Y_pred)
    # Column 1 is the score of class 1 under the one-hot encoding above.
    Y_pred = Y_pred[:, 1]
    # One ROC curve per split: train, validation, test.
    f.add_subplot(1, 3, i + 1)
    fpr, tpr, _ = roc_curve(Y_part, Y_pred)
    plt.plot(fpr, tpr)
    print("NN", "AUC", auc(fpr, tpr))
    print("NN", "AUC_Score", roc_auc_score(Y_part, Y_pred))
plt.show()
2.6GBDT,回归树和提升树
# ---- 2.6 GBDT (gradient boosted decision trees) ----
from sklearn.ensemble import GradientBoostingClassifier

# Author's note: max_depth=6 is the usual depth; n_estimators is the number
# of trees.
models.append(("GBDT", GradientBoostingClassifier(max_depth=6, n_estimators=100)))


# ---- Model evaluation ----
# Accuracy, recall and F-score are used to judge how good each model is.
from sklearn.metrics import accuracy_score, recall_score, f1_score

for clf_name, clf in models:
    clf.fit(X_train, Y_train)
    xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]
    # i is the split index: 0 = train, 1 = validation, 2 = test.
    for i, (X_part, Y_part) in enumerate(xy_lst):
        Y_pred = clf.predict(X_part)
        print(i)
        print(clf_name, "-ACC", accuracy_score(Y_part, Y_pred))
        print(clf_name, "-REC", recall_score(Y_part, Y_pred))
        print(clf_name, "-Fl", f1_score(Y_part, Y_pred))


# ---- Complete program ----
# encoding: utf-8
# time: 2018/08/08
# name: py粉
import os

import numpy as np
import pandas as pd
import pydotplus
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import Normalizer

# Add the Graphviz binaries to PATH (used when rendering decision trees).
os.environ["PATH"] += os.pathsep + "E:/Program/Graphviz/bin/"


def hr_preprocessing(sl=False, le=False, npr=False, amh=False, tsc=False,
                     wa=False, pl5=False, dp=False, slr=False,
                     lower_d=False, ld_n=1):
    """Load the HR data set, clean it and scale/encode every feature.

    Args:
        sl: scaler for ``satisfaction_level``.
        le: scaler for ``last_evaluation``.
        npr: scaler for ``number_project``.
        amh: scaler for ``average_monthly_hours``.
        tsc: scaler for ``time_spend_company``.
        wa: scaler for ``Work_accident``.
        pl5: scaler for ``promotion_last_5years``.
            For the seven flags above, False selects ``MinMaxScaler`` and
            True selects ``StandardScaler``.
        dp: encoding for ``department``: False = label encoding followed by
            min-max scaling, True = one-hot encoding.
        slr: encoding for ``salary``: False = ordinal mapping via
            ``map_salary`` followed by min-max scaling, True = one-hot.
        lower_d: when True, reduce the feature matrix with PCA.
        ld_n: number of PCA components used when ``lower_d`` is True.

    Returns:
        ``(features, label)``. ``features`` is a DataFrame, or a NumPy array
        when ``lower_d`` is True; ``label`` is the ``left`` column.
    """
    # Raw string: the backslashes are path separators, not escape sequences.
    # The context manager closes the file once it has been parsed.
    with open(r"D:\Python\python'数据分析与建模实现\data\HR.csv") as f:
        df = pd.read_csv(f)

    # 1. Clean the data.
    # Columns: satisfaction_level, last_evaluation, number_project,
    # average_monthly_hours, time_spend_company, Work_accident,
    # left, promotion_last_5years, department, salary
    df = df.dropna(subset=["satisfaction_level", "last_evaluation"])
    # Build both masks on the same frame and combine them, so the second
    # mask never has to be re-aligned against an already filtered frame.
    df = df[(df["satisfaction_level"] <= 1) & (df["salary"] != "nme")]

    # 2. Extract the label.
    label = df["left"]
    df = df.drop("left", axis=1)

    # 3. Feature selection (nothing is dropped).
    # 4. Feature processing: scale the numeric columns.
    numeric_flags = [sl, le, npr, amh, tsc, wa, pl5]
    numeric_columns = [
        "satisfaction_level", "last_evaluation", "number_project",
        "average_monthly_hours", "time_spend_company", "Work_accident",
        "promotion_last_5years",
    ]
    for use_standard, column in zip(numeric_flags, numeric_columns):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        # Scalers expect a 2-D column vector; flatten the result back to 1-D.
        df[column] = scaler.fit_transform(
            df[column].values.reshape(-1, 1)
        ).reshape(1, -1)[0]

    # Encode the categorical columns.
    for one_hot, column in zip([slr, dp], ["salary", "department"]):
        if one_hot:
            # pandas ships its own one-hot encoder.
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == "salary":
            df[column] = [map_salary(s) for s in df["salary"].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        df[column] = MinMaxScaler().fit_transform(
            df[column].values.reshape(-1, 1)
        ).reshape(1, -1)[0]

    if lower_d:
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label


# Ordinal codes for the "salary" values.
d = {"low": 0, "medium": 1, "high": 2}


def map_salary(s):
    """Return the ordinal code for salary level *s*; unknown values map to 0."""
    return d.get(s, 0)


def _evaluate_neural_network(X_train, Y_train, xy_lst):
    """Train a small Keras network and plot one ROC curve per data split."""
    import matplotlib.pyplot as plt
    # Sequential is the container that stacks the layers.
    from keras.models import Sequential
    # Dense is a fully connected layer; Activation applies an activation.
    from keras.layers import Dense, Activation
    from sklearn.metrics import roc_curve, auc, roc_auc_score

    mdl = Sequential()
    # First layer: 50 units; input_dim is the number of feature columns.
    mdl.add(Dense(50, input_dim=len(X_train[0])))
    mdl.add(Activation("sigmoid"))
    # Output layer: 2 units because the label has two classes.
    mdl.add(Dense(2))
    mdl.add(Activation("softmax"))
    # `loss` is the objective being minimised, `optimizer` the update rule.
    mdl.compile(loss="mean_squared_error", optimizer="adam")

    # One-hot encode the labels: class 1 -> [0, 1], anything else -> [1, 0].
    one_hot_labels = np.array([[0, 1] if y == 1 else [1, 0] for y in Y_train])
    # epochs = number of passes over the data; batch_size = samples per update.
    mdl.fit(X_train, one_hot_labels, epochs=1000, batch_size=8999)

    f = plt.figure()
    for i, (X_part, Y_part) in enumerate(xy_lst):
        # predict() returns one score per output unit, not a class label.
        Y_pred = mdl.predict(X_part)
        print(Y_pred)
        # Column 1 is the score of class 1 under the one-hot encoding above.
        Y_pred = Y_pred[:, 1]
        f.add_subplot(1, 3, i + 1)
        fpr, tpr, _ = roc_curve(Y_part, Y_pred)
        plt.plot(fpr, tpr)
        print("NN", "AUC", auc(fpr, tpr))
        print("NN", "AUC_Score", roc_auc_score(Y_part, Y_pred))
    plt.show()


def _evaluate_classifiers(X_train, Y_train, xy_lst):
    """Fit each scikit-learn classifier and print ACC/REC/F1 per data split."""
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, recall_score, f1_score
    from sklearn.naive_bayes import GaussianNB, BernoulliNB
    from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier, export_graphviz

    models = [
        ("KNN", KNeighborsClassifier(n_neighbors=3)),
        # Naive Bayes.
        ("GaussianNB", GaussianNB()),
        ("BernoulliNB", BernoulliNB()),
        # Author's note: min_impurity_split=0.1 can be used for pruning.
        # Decision tree with the default (Gini impurity) criterion.
        ("DecisionTreeGini", DecisionTreeClassifier()),
        # Decision tree with the entropy criterion.
        ("DecisionTreeEntropy", DecisionTreeClassifier(criterion="entropy")),
        # Author's note: the C parameter controls the fitting precision.
        ("SVM Classifier", SVC(C=1000)),
        # Random forest with library defaults.
        ("OriginalRandomForest", RandomForestClassifier()),
        # Random forest with 11 trees and max_features=None.
        ("RandomForest",
         RandomForestClassifier(n_estimators=11, max_features=None)),
        # Alternative AdaBoost configuration noted by the author:
        # base_estimator=SVC(), n_estimators=100, algorithm="SAMME".
        ("Adaboost", AdaBoostClassifier(n_estimators=100)),
        # Author's note: logistic regression is also a kind of linear
        # regression.
        ("LogisticRegression",
         LogisticRegression(C=1000, tol=1e-10, solver="sag", max_iter=10000)),
        # Author's note: max_depth=6 is the usual depth; n_estimators is the
        # number of trees.
        ("GBDT", GradientBoostingClassifier(max_depth=6, n_estimators=100)),
    ]

    for clf_name, clf in models:
        clf.fit(X_train, Y_train)
        # i is the split index: 0 = train, 1 = validation, 2 = test.
        for i, (X_part, Y_part) in enumerate(xy_lst):
            Y_pred = clf.predict(X_part)
            print(i)
            print(clf_name, "-ACC", accuracy_score(Y_part, Y_pred))
            print(clf_name, "-REC", recall_score(Y_part, Y_pred))
            print(clf_name, "-Fl", f1_score(Y_part, Y_pred))
        # To render a fitted decision tree to PDF (requires Graphviz):
        #     from io import StringIO
        #     dot_data = StringIO()
        #     export_graphviz(clf, out_file=dot_data,
        #                     feature_names=features.columns.values,
        #                     class_names=["NL", "L"],
        #                     filled=True,
        #                     rounded=True,
        #                     special_characters=True)
        #     graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        #     graph.write_pdf("dt_tree_2.pdf")


def hr_modeling(features, label, run_nn=True, run_classifiers=False):
    """Split the data 6:2:2 and evaluate models on train/validation/test.

    Args:
        features: DataFrame of processed features.
        label: Series holding the binary target.
        run_nn: train the Keras network and plot its ROC curves. Defaults to
            True, which matches the original behaviour.
        run_classifiers: also fit the scikit-learn classifiers and print
            accuracy, recall and F1. Defaults to False because the original
            returned right after the neural network, leaving this comparison
            unreachable.
    """
    from sklearn.model_selection import train_test_split

    f_v = features.values
    l_v = label.values
    # Hold out 20% for validation, then 25% of the remaining 80% for test.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(
        f_v, l_v, test_size=0.2
    )
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_tt, Y_tt, test_size=0.25
    )
    xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]

    if run_nn:
        _evaluate_neural_network(X_train, Y_train, xy_lst)
    if run_classifiers:
        _evaluate_classifiers(X_train, Y_train, xy_lst)


def regr_test(features, label):
    """Fit a ridge regression and print its coefficients and error metrics.

    The model is fitted and scored on the same rows, so the printed MSE, MAE
    and R2 are in-sample values.
    """
    print("X", features)
    print("Y", label)
    from sklearn.linear_model import LinearRegression, Ridge, Lasso
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

    # Ridge regression. Alternatives from the notes: LinearRegression() or
    # Lasso(alpha=0.001).
    regr = Ridge(alpha=1)
    regr.fit(features.values, label.values)
    Y_pred = regr.predict(features.values)
    print("Coef:", regr.coef_)
    print("MSE:", mean_squared_error(label.values, Y_pred))
    print("MAE:", mean_absolute_error(label.values, Y_pred))
    print("R2:", r2_score(label.values, Y_pred))


def main():
    """Preprocess the HR data and run the regression demo."""
    # Data cleaning and feature processing.
    features, label = hr_preprocessing()
    # Linear regression: predict last_evaluation from two numeric features.
    regr_test(
        features[["number_project", "average_monthly_hours"]],
        features["last_evaluation"],
    )
    # For the classification / ensemble comparison, call
    # hr_modeling(features, label) here instead.


if __name__ == '__main__':
    main()
这篇关于python数据分析之分类模型与回归模型-第七次笔记的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!
- 2024-12-24Python编程入门指南
- 2024-12-24Python编程基础入门
- 2024-12-24Python编程基础:变量与数据类型
- 2024-12-23使用python部署一个usdt合约,部署自己的usdt稳定币
- 2024-12-20Python编程入门指南
- 2024-12-20Python编程基础与进阶
- 2024-12-19Python基础编程教程
- 2024-12-19python 文件的后缀名是什么 怎么运行一个python文件?-icode9专业技术文章分享
- 2024-12-19使用python 把docx转为pdf文件有哪些方法?-icode9专业技术文章分享
- 2024-12-19python怎么更换换pip的源镜像?-icode9专业技术文章分享