2021/11/15 1:09:57
决策树(Decision Tree)是在已知各种情况发生概率的基础上,通过构成决策树来求取净现值的期望值大于等于零的概率,评价项目风险,判断其可行性的决策分析方法,是直观运用概率分析的一种图解法。由于这种决策分支画成图形很像一棵树的枝干,故称决策树。
# ID3 decision tree (information gain) for the dating-decision example.
from collections import Counter
from math import log
import operator

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Mapping from the categorical text values in the spreadsheet to numeric codes.
productDict = {'高':1,'一般':2,'低':3,'中':2, '帅':1, '普通':2, '丑':3, '胖':3, '匀称':2,'瘦':1, '是':1, '否':0}


def Importdata(datafile):
    """Load the Excel sheet and encode its text columns as numbers.

    Args:
        datafile: Path of the Excel file; read with ``pd.read_excel``
            (use ``pd.read_csv`` instead for a CSV file).

    Returns:
        ``(data, labels)``: ``data`` is a list of rows
        ``[income, hight, look, shape, is_meet]`` and ``labels`` is the list
        of the four feature names (everything except ``is_meet``).

    Raises:
        ValueError: If a cell holds a value that ``productDict`` cannot map.
    """
    dataa = pd.read_excel(datafile)

    # (source text column, encoded numeric column); the class column is last.
    columnMap = [
        ('收入', 'income'),
        ('身高', 'hight'),
        ('长相', 'look'),
        ('体型', 'shape'),
        ('是否见面', 'is_meet'),
    ]
    for source, target in columnMap:
        dataa[target] = dataa[source].map(productDict)

    # Select the encoded columns by name instead of by position so that an
    # extra column in the sheet cannot shift the selection.
    encoded = [target for _, target in columnMap]
    if dataa[encoded].isnull().values.any():
        # map() yields NaN for unknown text; NaN never compares equal to
        # itself, which would later produce empty splits.
        raise ValueError(
            "{} contains values that are missing from productDict".format(datafile)
        )

    data = dataa[encoded].values.tolist()
    labels = encoded[:-1]
    return data, labels


def dataentropy(data, feat=None):
    """Return the Shannon entropy (base 2) of the class column of ``data``.

    Args:
        data: List of rows; the last element of each row is the class.
        feat: Unused. Kept so existing two-argument calls keep working.

    Returns:
        The entropy as a number; 0 for empty or single-class data.
    """
    total = len(data)
    if total == 0:
        return 0
    classCounts = Counter(row[-1] for row in data)
    return sum(
        -(count / float(total)) * log(count / float(total), 2)
        for count in classCounts.values()
    )


def splitData(data, i, value):
    """Return the rows whose column ``i`` equals ``value``, without column ``i``."""
    return [row[:i] + row[i + 1:] for row in data if row[i] == value]


def BestSplit(data):
    """Return the index of the feature with the largest information gain.

    Features that take a single value in ``data`` cannot split it and are
    ignored. Ties are resolved in favour of the lowest index.

    Returns:
        The feature index, or -1 when no feature can split ``data``.
    """
    numFea = len(data[0]) - 1  # the last column is the class, not a feature
    total = float(len(data))
    baseEnt = dataentropy(data)
    # Start below any reachable gain so that a zero-gain feature is still a
    # valid choice; -1 must only mean "nothing left to split on".
    bestInfo = float('-inf')
    bestFeat = -1
    for i in range(numFea):
        uniqueVals = set(row[i] for row in data)
        if len(uniqueVals) < 2:
            continue
        newEnt = 0
        for value in uniqueVals:
            subData = splitData(data, i, value)
            newEnt += len(subData) / total * dataentropy(subData)
        info = baseEnt - newEnt  # information gain of feature i
        if info > bestInfo:
            bestInfo = info
            bestFeat = i
    return bestFeat


def majorityCnt(classList):
    """Return the most frequent class; ties go to the class seen first."""
    counts = Counter(classList)
    # max() returns the first maximal key in insertion order.
    return max(counts, key=counts.get)


def createTree(data, labels):
    """Recursively build the ID3 tree as nested dicts.

    Args:
        data: List of rows, features first and class last.
        labels: Feature names aligned with the feature columns. The list is
            not modified.

    Returns:
        A class value for a leaf, otherwise
        ``{feature_name: {feature_value: subtree}}``.
    """
    classList = [row[-1] for row in data]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # pure node
    if len(data[0]) == 1:
        return majorityCnt(classList)  # no feature columns left
    bestFeat = BestSplit(data)
    if bestFeat == -1:
        # Rows agree on every remaining feature but not on the class. Using
        # -1 as an index would split on the class column itself.
        return majorityCnt(classList)
    bestLab = labels[bestFeat]
    # Build a new list rather than deleting from the caller's list.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestLab: {}}
    for value in set(row[bestFeat] for row in data):
        myTree[bestLab][value] = createTree(
            splitData(data, bestFeat, value), subLabels
        )
    return myTree


# Main program; guarded so the functions can be imported without reading the file.
if __name__ == "__main__":
    datafile = 'data/dateperson/date01.xlsx'  # location of the data file
    data, labels = Importdata(datafile)

    jc = createTree(data, labels)

    print(jc)
1 PS C:\coding\machinelearning>ID3相亲决策实验.py 2 {'income': {1: 1, 2: {'hight': {1: {'look': {1: 1, 2: 1, 3: {'shape': {0: 0, 1: 1}}}}, 2: 1, 3: 0}}, 3: {'hight': {1: {'look': {2: 1, 3: 0}}, 2: 0, 3: 0}}}} 3 PS C:\coding\machinelearning>
# C4.5 decision tree (information gain ratio) for the dating-decision example.
from collections import Counter
from math import log
import operator

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Mapping from the categorical text values in the spreadsheet to numeric codes.
productDict = {'高':1,'一般':2,'低':3,'中':2, '帅':1, '普通':2, '丑':3, '胖':3, '匀称':2,'瘦':1, '是':1, '否':0}


def Importdata(datafile):
    """Load the Excel sheet and encode its text columns as numbers.

    Args:
        datafile: Path of the Excel file; read with ``pd.read_excel``
            (use ``pd.read_csv`` instead for a CSV file).

    Returns:
        ``(data, labels)``: ``data`` is a list of rows
        ``[income, hight, look, shape, is_meet]`` and ``labels`` is the list
        of the four feature names (everything except ``is_meet``).

    Raises:
        ValueError: If a cell holds a value that ``productDict`` cannot map.
    """
    dataa = pd.read_excel(datafile)

    # (source text column, encoded numeric column); the class column is last.
    columnMap = [
        ('收入', 'income'),
        ('身高', 'hight'),
        ('长相', 'look'),
        ('体型', 'shape'),
        ('是否见面', 'is_meet'),
    ]
    for source, target in columnMap:
        dataa[target] = dataa[source].map(productDict)

    # Select the encoded columns by name instead of by position so that an
    # extra column in the sheet cannot shift the selection.
    encoded = [target for _, target in columnMap]
    if dataa[encoded].isnull().values.any():
        # map() yields NaN for unknown text; NaN never compares equal to
        # itself, which would later produce empty splits.
        raise ValueError(
            "{} contains values that are missing from productDict".format(datafile)
        )

    data = dataa[encoded].values.tolist()
    labels = encoded[:-1]
    return data, labels


def dataentropy(data, feat=None):
    """Return the Shannon entropy (base 2) of the class column of ``data``.

    Args:
        data: List of rows; the last element of each row is the class.
        feat: Unused. Kept so existing two-argument calls keep working.

    Returns:
        The entropy as a number; 0 for empty or single-class data.
    """
    total = len(data)
    if total == 0:
        return 0
    classCounts = Counter(row[-1] for row in data)
    return sum(
        -(count / float(total)) * log(count / float(total), 2)
        for count in classCounts.values()
    )


def splitData(data, i, value):
    """Return the rows whose column ``i`` equals ``value``, without column ``i``."""
    return [row[:i] + row[i + 1:] for row in data if row[i] == value]


def BestSplit(data):
    """Return the index of the feature with the largest information gain ratio.

    The gain ratio is the information gain divided by the split information,
    i.e. the entropy of the feature's own value distribution. A feature with
    a single value has zero split information and is skipped. Ties are
    resolved in favour of the lowest index.

    Returns:
        The feature index, or -1 when no feature can split ``data``.
    """
    numFea = len(data[0]) - 1  # the last column is the class, not a feature
    total = float(len(data))
    baseEnt = dataentropy(data)
    # Start below any reachable ratio so that a zero-gain feature is still a
    # valid choice; -1 must only mean "nothing left to split on".
    bestGainRate = float('-inf')
    bestFeat = -1
    for i in range(numFea):
        valueCounts = Counter(row[i] for row in data)
        if len(valueCounts) < 2:
            continue  # split information would be 0
        newEnt = 0
        splitInfo = 0
        for value, count in valueCounts.items():
            prob = count / total
            newEnt += prob * dataentropy(splitData(data, i, value))
            # Split information uses the share of rows per feature value,
            # not the class entropy of any one subset.
            splitInfo -= prob * log(prob, 2)
        info = baseEnt - newEnt  # information gain of feature i
        GainRate = info / splitInfo
        if GainRate > bestGainRate:
            bestGainRate = GainRate
            bestFeat = i
    return bestFeat


def majorityCnt(classList):
    """Return the most frequent class; ties go to the class seen first."""
    counts = Counter(classList)
    # max() returns the first maximal key in insertion order.
    return max(counts, key=counts.get)


def createTree(data, labels):
    """Recursively build the C4.5 tree as nested dicts.

    Args:
        data: List of rows, features first and class last.
        labels: Feature names aligned with the feature columns. The list is
            not modified.

    Returns:
        A class value for a leaf, otherwise
        ``{feature_name: {feature_value: subtree}}``.
    """
    classList = [row[-1] for row in data]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # pure node
    if len(data[0]) == 1:
        return majorityCnt(classList)  # no feature columns left
    bestFeat = BestSplit(data)
    if bestFeat == -1:
        # Rows agree on every remaining feature but not on the class. Using
        # -1 as an index would split on the class column itself.
        return majorityCnt(classList)
    bestLab = labels[bestFeat]
    # Build a new list rather than deleting from the caller's list.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestLab: {}}
    for value in set(row[bestFeat] for row in data):
        myTree[bestLab][value] = createTree(
            splitData(data, bestFeat, value), subLabels
        )
    return myTree


# Main program; guarded so the functions can be imported without reading the file.
if __name__ == "__main__":
    datafile = 'data/dateperson/date01.xlsx'  # location of the data file
    data, labels = Importdata(datafile)
    jc = createTree(data, labels)

    print(jc)
1 PS C:\coding\machinelearning>C4.5相亲决策实验.py 2 {'income': {1: 1, 2: {'look': {1: 1, 2: 1, 3: {'shape': {1: {'hight': {0: 0, 1: 1}}, 3: {'hight': {0: 0, 1: 1}}}}}}, 3: {'shape': {0: 0, 1: 1}}}} 3 PS C:\coding\machinelearning>
# CART decision tree on the Titanic data with scikit-learn.

# Data handling
import numpy as np
import pandas as pd
# Train/test split and hyper-parameter search
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# Model and preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
# Evaluation metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
# NOTE: the unused `from numpy.lib.type_check import real` was dropped; it
# imports from a private NumPy path that is not guaranteed to exist in newer
# NumPy releases, and nothing in this script uses `real`.

# Load the data
data = pd.read_csv('data/titanic/train.csv')
print(data.head())
data.info()

# Number of missing values per column
total = data.isnull().sum().sort_values(ascending=False)
# Share of missing values per column
percent = (data.isnull().sum()/data.isnull().count()).sort_values(ascending = False)
miss_data = pd.concat([total, percent], axis = 1, keys = ['Miss_Total', 'Miss_Percent'])
miss_data.head()  # only displayed in a notebook; has no effect in a script

# Missing-value handling.
# Drop the 'deck' column
del data['deck']
# Fill missing ages with the median
data['age'] = data['age'].fillna(data['age'].median())
# Fill missing embarkation towns with the mode
data['embark_town'] = data['embark_town'].fillna(data['embark_town'].mode()[0])
# Inspect the data after cleaning
data.info()

# Encode the string-valued columns as integers
label = LabelEncoder()
data['sex'] = label.fit_transform(data['sex'])
data['class'] = label.fit_transform(data['class'])
data['alone'] = label.fit_transform(data['alone'])
data['embark_town'] = label.fit_transform(data['embark_town'])

# Keep the model inputs plus the target column
feature_cols = ['class', 'age', 'n_siblings_spouses', 'parch', 'fare', 'sex', 'alone', 'embark_town']
features = feature_cols + ['survived']
data = data[features]
data.head()  # only displayed in a notebook; has no effect in a script

# Split into training and test sets
X = data[feature_cols]
# A 1-D Series: scikit-learn expects a 1-D target and warns on a column vector.
y = data['survived']
# random_state fixes the seed so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Train the model
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_predict = dtc.predict(X_test)


# Model scores: accuracy, recall, precision, F1
accuracyScore = accuracy_score(y_test, y_predict)
recallScore = recall_score(y_test, y_predict)
precisionScore = precision_score(y_test, y_predict)
f1Score = f1_score(y_test, y_predict)
print("DecisionTreeClassifier Results")
print("Accuracy :", accuracyScore)
print("Recall :", recallScore)
print("Precision :", precisionScore)
print("F1 Score :", f1Score)


param = {'max_depth': [1, 3, 5, 7]}
# Tune max_depth with a 5-fold grid search scored by F1
gsearch = GridSearchCV(estimator=dtc, param_grid=param, cv=5, scoring='f1')
gsearch.fit(X_train, y=y_train)
print("最优参数:{}".format(gsearch.best_params_))
print("最优模型:{}".format((gsearch.best_estimator_)))
print("模型最高分:{:.3f}".format(gsearch.score(X_test, y_test)))


# Predict with the best model found by the search. With the default
# refit=True, best_estimator_ is already refitted on the whole training set,
# so the depth is no longer hard-coded here.
dtc = gsearch.best_estimator_
y_predict = dtc.predict(X_test[:10])
# Print the predictions
print('===================预测值=======================')
print(y_predict)
# Print the true values
print('===================真实值=======================')
realz = np.array(y_test[:10]).ravel()
print(realz)
Accuracy = accuracy_score(realz, y_predict)
print('准确率为:{:.2f}%'.format(Accuracy*100))
1 PS C:\coding\machinelearning>CART实验(titanic).py 2 survived sex age n_siblings_spouses parch fare class deck embark_town alone 3 0 0 male 22.0 1 0 7.2500 Third unknown Southampton n 4 1 1 female 38.0 1 0 71.2833 First C Cherbourg n 5 2 1 female 26.0 0 0 7.9250 Third unknown Southampton y 6 3 1 female 35.0 1 0 53.1000 First C Southampton n 7 4 0 male 28.0 0 0 8.4583 Third unknown Queenstown y 8 <class 'pandas.core.frame.DataFrame'> 9 RangeIndex: 627 entries, 0 to 626 10 Data columns (total 10 columns): 11 # Column Non-Null Count Dtype 12 --- ------ -------------- ----- 13 0 survived 627 non-null int64 14 1 sex 627 non-null object 15 2 age 627 non-null float64 16 3 n_siblings_spouses 627 non-null int64 17 4 parch 627 non-null int64 18 5 fare 627 non-null float64 19 6 class 627 non-null object 20 7 deck 627 non-null object 21 8 embark_town 627 non-null object 22 9 alone 627 non-null object 23 dtypes: float64(2), int64(3), object(5) 24 memory usage: 49.1+ KB 25 <class 'pandas.core.frame.DataFrame'> 26 RangeIndex: 627 entries, 0 to 626 27 Data columns (total 9 columns): 28 # Column Non-Null Count Dtype 29 --- ------ -------------- ----- 30 0 survived 627 non-null int64 31 1 sex 627 non-null object 32 2 age 627 non-null float64 33 3 n_siblings_spouses 627 non-null int64 34 4 parch 627 non-null int64 35 5 fare 627 non-null float64 36 6 class 627 non-null object 37 7 embark_town 627 non-null object 38 8 alone 627 non-null object 39 dtypes: float64(2), int64(3), object(4) 40 memory usage: 44.2+ KB 41 DecisionTreeClassifier Results 42 Accuracy : 0.7698412698412699 43 Recall : 0.7142857142857143 44 Precision : 0.7 45 F1 Score : 0.7070707070707072 46 最优参数:{'max_depth': 5} 47 最优模型:DecisionTreeClassifier(max_depth=5) 48 模型最高分:0.731 49 ===================预测值======================= 50 [1 1 0 1 0 0 0 1 0 0] 51 ===================真实值======================= 52 [1 0 0 1 0 0 0 1 0 0] 53 准确率为:90.00% 54 PS C:\coding\machinelearning>
- 2024-11-20 实战:30 行代码做一个网页端的 AI 聊天助手
- 2024-11-18 5分钟搞懂大模型的重复惩罚后处理
- 2024-11-18 基于Ollama和pgai的个人知识助手项目:用Postgres和向量扩展打造智能数据库
- 2024-11-15 我用同一个提示测试了4款AI工具,看看谁设计的界面更棒
- 2024-11-15 深度学习面试的时候,如何回答1x1卷积的作用
- 2024-11-15 检索增强生成即服务:开发者的得力新帮手
- 2024-11-15 技术与传统:人工智能时代的最后一袭纱丽
- 2024-11-15 未结构化数据不仅仅是给嵌入用的:利用隐藏结构提升检索性能
- 2024-11-15 Emotion项目实战:新手入门教程
- 2024-11-15 7 个开源库助你构建增强检索生成(RAG)、代理和 AI 搜索