WHCSRL 技术网

机器学习Sklearn实战——adaboost_Grateful

pandas批量处理体测成绩

  1. import numpy as np
  2. import pandas as pd
  3. from pandas import Series,DataFrame
  4. import matplotlib.pyplot as plt
  5. data = pd.read_excel("/Users/zhucan/Desktop/18级高一体测成绩汇总.xls")
  1. cond = data["班级"] != "班级"
  2. data = data[cond]
  3. data.fillna(0,inplace=True)
  4. data.isnull().any() #没有空数据了

结果:

  1. 班级 False
  2. 性别 False
  3. 姓名 False
  4. 1000米 False
  5. 50米跑 False
  6. 跳远 False
  7. 体前屈 False
  8. 引体 False
  9. 肺活量 False
  10. 身高 False
  11. 体重 False
  12. dtype: bool
# Preview the first rows of the cleaned table.
data.head()

  1. #1000米成绩有string 有int
  2. def convert(x):
  3. if isinstance(x,str):
  4. minute,second = x.split("'")
  5. int(minute)
  6. minute = int(minute)
  7. second = int(second)
  8. return minute + second/100.0
  9. else:
  10. return x
  11. data["1000米"] = data["1000米"].map(convert)

  1. score = pd.read_excel("/Users/zhucan/Desktop/体侧成绩评分表.xls",header=[0,1])
  2. score

  1. def convert(item):
  2. m,s = item.strip('"').split("'")
  3. m,s =int(m),int(s)
  4. return m+s/100.0
  5. score.iloc[:,-4] = score.iloc[:,-4].map(convert)
  6. def convert(item):
  7. m,s = item.strip('"').split("'")
  8. m,s =int(m),int(s)
  9. return m+s/100.0
  10. score.iloc[:,-2] = score.iloc[:,-2].map(convert)
  11. score

# Rename columns to match the score table's naming ("男1000", "男50米跑" …).
data.columns = ['班级', '性别', '姓名', '男1000', '男50米跑', '跳远', '体前屈', '引体', '肺活量', '身高', '体重']
  1. data["男50米跑"] = data["男50米跑"].astype(np.float)
  2. for col in ["男1000","男50米跑"]:
  3. #获取成绩的标准
  4. s = score[col]
  5. def convert(x):
  6. for i in range(len(s)):
  7. if x<=s["成绩"].iloc[0]:
  8. if x == 0:
  9. return 0 #没有参加这个项目
  10. return 100
  11. elif x>s["成绩"].iloc[-1]:
  12. return 0 #跑的太慢
  13. elif (x>s["成绩"].iloc[i-1]) and (x<=s["成绩"].iloc[i]):
  14. return s["分数"].iloc[i]
  15. data[col + "成绩"] = data[col].map(convert)

  1. for col in ['跳远', '体前屈', '引体', '肺活量']:
  2. s = score["男"+col]
  3. def convert(x):
  4. for i in range(len(s)):
  5. if x>s["成绩"].iloc[i]:
  6. return s["分数"].iloc[i]
  7. return 0
  8. data[col+"成绩"] = data[col].map(convert)

# Inspect the columns now present (raw results plus the new *成绩 columns).
data.columns

 结果:

  1. Index(['班级', '性别', '姓名', '男1000', '男50米跑', '跳远', '体前屈', '引体', '肺活量', '身高',
  2. '体重', '男1000成绩', '男50米跑成绩', '跳远成绩', '体前屈成绩', '引体成绩', '肺活量成绩'],
  3. dtype='object')
  1. #根据索引的顺序,去data取值
  2. cols = ['班级', '性别', '姓名', '男1000','男1000成绩','男50米跑','男50米跑成绩','跳远','跳远成绩','体前屈','体前屈成绩','引体','引体成绩', '肺活量','肺活量成绩','身高','体重']
  3. data[cols]

  1. #计算BMI
  2. data["BMI"] = data["体重"]/data["身高"]
  3. def convert(x):
  4. if x>100:
  5. return x/100
  6. else:
  7. return x
  8. data["身高"] = data["身高"].map(convert)
  9. data["BMI"] = data["体重"]/(data["身高"])**2
  1. def convert_bmi(x):
  2. if x >= 26.4:
  3. return 60
  4. elif (x <= 16.4) or (x > 23.3 and x <= 26.3):
  5. return 80
  6. elif x >= 16.5 and x <= 23.2:
  7. return 100
  8. else:
  9. return 0
  10. data["BMI_score"] = data["BMI"].map(convert_bmi)
  1. #统计分析
  2. data["BMI_score"].value_counts().plot(kind = "pie",autopct = "%%0.2f%%%%")
  3. #统计分析
  4. data["BMI_score"].value_counts().plot(kind = "bar")

# How many students of each BMI band fall into each 1000m score bucket.
data.groupby(["男1000成绩"])["BMI_score"].count().plot(kind = "bar")

adaboost

 

 值越大,特征越明显,越容易被分开;越靠后的学习器,权重越大

梯度提升树没有修改原来的数据,使用的是残差,最终结果就是最后一棵树

上面的图不是GBDT

Boosting与Bagging模型相比,Boosting可以同时降低偏差和方差,Bagging只能降低模型的方差。在实际应用中,Boosting算法也还是存在明显的高方差问题,也就是过拟合。 

  1. import numpy as np
  2. y = np.array([0,1]*5)
  3. y_ = np.array([0,0,0,0,0,0,0,1,0,1])
  4. w = 0.1*(y != y_).sum()
  5. round(w,1)

结果:

0.3
  1. 0.5*np.log((1-0.3)/0.3)
  2. round((0.5*np.log((1-0.3)/0.3)),2)

 结果:

0.42

 

adaboost原理案例举例

  1. from sklearn.ensemble import AdaBoostClassifier
  2. from sklearn import tree
  3. import matplotlib.pyplot as plt
  4. X = np.arange(10).reshape(-1,1)
  5. y = np.array([1,1,1,-1,-1,-1,1,1,1,-1])
  6. ada = AdaBoostClassifier(n_estimators=3)
  7. ada.fit(X,y)
  1. plt.figure(figsize = (9,6))
  2. _ = tree.plot_tree(ada[0])

  1. y_ = ada[0].predict(X),4
  2. y_

结果:

array([ 1,  1,  1, -1, -1, -1, -1, -1, -1, -1])
  1. #误差率
  2. e1 = np.round(0.1*(y != y_).sum(),4)
  3. e1

结果:

0.3
  1. #计算第一棵树权重
  2. #随机森林中每棵树的权重是一样的
  3. #adaboost提升树中每棵树的权重不同
  4. a1 = np.round(1/2*np.log((1-e1)/e1),4)
  5. a1

结果:

0.4236
  1. #样本预测准确:更新的权重
  2. w2 = 0.1*np.e**(-a1*y*y_)
  3. w2 = w2/w2.sum()
  4. np.round(w2,4)

结果:

  1. array([0.0714, 0.0714, 0.0714, 0.0714, 0.0714, 0.0714, 0.1667, 0.1667,
  2. 0.1667, 0.0714])
  1. #样本预测准确:更新的权重
  2. w2 = 0.1*np.e**(-a1*y*y_)
  3. w2 = w2/w2.sum()
  4. np.round(w2,4)

结果:

  1. array([0.0714, 0.0714, 0.0714, 0.0714, 0.0714, 0.0714, 0.1667, 0.1667,
  2. 0.1667, 0.0714])

从上述第一轮的整个迭代过程可以看出:被误分类样本的权值之和影响误差率,误差率影响基本分类器在最终分类器中所占的权重

分类函数f1(x)= a1*G1(x)= 0.4236G1(x)

  1. plt.figure(figsize = (9,6))
  2. _ = tree.plot_tree(ada[1])

  1. e2 = 0.0714*3
  2. e2

结果:

0.2142
  1. a2 = np.round(1/2*np.log((1-e2)/e2),4)
  2. a2

 结果:

0.6499
  1. y_ = ada[1].predict(X)
  2. #样本预测准确:更新的权重
  3. w3 = w2*np.e**(-a2*y*y_)
  4. w3 = w3/w3.sum()
  5. np.round(w3,4)

结果:

  1. array([0.0454, 0.0454, 0.0454, 0.1667, 0.1667, 0.1667, 0.106 , 0.106 ,
  2. 0.106 , 0.0454])
  1. plt.figure(figsize = (9,6))
  2. _ = tree.plot_tree(ada[2])

树划分按照gini系数;结果和按照误差率是一致的~ 

  1. y_ = ada[2].predict(X)
  2. e3 = (w3*(y_ != y)).sum()
  3. a3 = 1/2*np.log((1-e3)/e3)
  4. a3
  5. #样本预测准确:更新的权重
  6. w4 = w3*np.e**(-a3*y*y_)
  7. w4 = w4/w4.sum()
  8. np.round(w4,4)

结果:

  1. array([0.125 , 0.125 , 0.125 , 0.1019, 0.1019, 0.1019, 0.0648, 0.0648,
  2. 0.0648, 0.125 ])
# Show the three tree weights side by side (Jupyter display()).
display(a1,a2,a3)

 结果:

  1. 0.4236
  2. 0.6498960745553556
  3. 0.7521752700597043

弱分类器合并成强分类器

综上,将上面计算得到的a1、a2、a3各值代入G(x)中,G(x) = sign[f3(x)] = sign[ a1 * G1(x) + a2 * G2(x) + a3 * G3(x) ],得到最终的分类器为:

G(x) = sign[f3(x)] = sign[ 0.4236G1(x) + 0.6499G2(x) + 0.7522G3(x) ]

# Final ensemble prediction, for comparison with the manual computation below.
ada.predict(X)

 结果:

array([ 1,  1,  1, -1, -1, -1,  1,  1,  1, -1])
  1. y_predict = a1*ada[0].predict(X) + a2*ada[1].predict(X) +a3*ada[2].predict(X)
  2. y_predict
  3. np.sign(y_predict).astype(np.int)
array([ 1,  1,  1, -1, -1, -1,  1,  1,  1, -1])
推荐阅读