Commonly Used Techniques for Mathematical Modeling
Reading data with pandas
- import numpy as np
- import pandas as pd
- import random
- Molecular_Descriptor = pd.read_excel('Molecular_Descriptor.xlsx',header=0)
- Molecular_Descriptor.head()
Checking the data for anomalies (NaN/Inf)
- # Check the data for NaN and Inf values
- print(Molecular_Descriptor.isnull().any())
- print(np.isnan(Molecular_Descriptor).any())
- print(np.isfinite(Molecular_Descriptor).all())
- print(np.isinf(Molecular_Descriptor).any())
Extracting specific columns
Molecular_Descriptor.iloc[:,1:]
Extracting DataFrame data as NumPy arrays
- # .values extracts the data in a DataFrame as a NumPy array
- X = Molecular_Descriptor.iloc[:,1:].values
- # ERα_activity is assumed to be another DataFrame loaded the same way (e.g. from an ERα_activity Excel file)
- Y = ERα_activity.iloc[:,2].values
Splitting the data into training and test sets
- from sklearn.model_selection import train_test_split
- X_train, X_test, y_train, y_test = train_test_split(X,
- Y,
- test_size=0.2,
- random_state=0)
-
- # Print the sizes of the original, training, and test sets
- print("The length of original data X is:", X.shape[0])
- print("The length of train Data is:", X_train.shape[0])
- print("The length of test Data is:", X_test.shape[0])
Random forest regression
- # Import the random forest regressor
- from sklearn.ensemble import RandomForestRegressor
- # Import the sklearn metrics module
- from sklearn import metrics
- # Define the regressor
- RFregressor = RandomForestRegressor(n_estimators=200, random_state=0)
- # Train the model
- RFregressor.fit(X_train, y_train)
- # Predict on the test set
- y_pred = RFregressor.predict(X_test)
- # Print the regression evaluation metrics
- print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
- print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
- print('Root Mean Squared Error:',
-       np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
- # Get the feature importances
- print(RFregressor.feature_importances_)
GBDT regression
- from sklearn.ensemble import GradientBoostingRegressor
- gbdt = GradientBoostingRegressor(random_state=0)
- gbdt.fit(X_train, y_train)
- y_pred = gbdt.predict(X_test)
- print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
- print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
- print('Root Mean Squared Error:',
- np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
Visualizing feature importance
- import matplotlib.pyplot as plt
- plt.rcParams["font.sans-serif"] = ["SimHei"]  # Display Chinese labels correctly
- plt.rcParams["axes.unicode_minus"] = False    # Keep the minus sign from rendering as a box
-
- plt.rcParams['savefig.dpi'] = 150  # Saved figure DPI
- plt.rcParams['figure.dpi'] = 150   # Display DPI
-
-
- def plot_feature_importance(dataset, model_bst):
-     '''
-     dataset : the dataset (DataFrame)
-     model_bst : the trained model
-     '''
-     list_feature_name = list(dataset.columns[1:])
-     list_feature_importance = list(model_bst.feature_importances_)
-     dataframe_feature_importance = pd.DataFrame(
-         {'feature_name': list_feature_name, 'importance': list_feature_importance})
-     dataframe_feature_importance20 = dataframe_feature_importance.sort_values(by='importance', ascending=False)[:20]
-     print(dataframe_feature_importance20)
-     x = range(len(dataframe_feature_importance20['feature_name']))
-     plt.xticks(x, dataframe_feature_importance20['feature_name'], rotation=90, fontsize=8)
-     plt.plot(x, dataframe_feature_importance20['importance'])
-     plt.xlabel("分子描述符")
-     plt.ylabel("重要程度")
-     plt.title('重要程度可视化')
-     plt.grid()
-     # Save the figure
-     # plt.savefig('重要程度可视化.png')
-     plt.show()
-     return dataframe_feature_importance20['feature_name']
-
- if __name__ == '__main__':
-     # Pass in the dataset DataFrame and the trained model to evaluate feature importance
-     gbdt_name = plot_feature_importance(Molecular_Descriptor, gbdt)
Output:
Plotting a 3D scatter plot
- z = list(range(0, 729))  # one index per molecular descriptor
- plt.rcParams['savefig.dpi'] = 150  # Saved figure DPI
- plt.rcParams['figure.dpi'] = 150   # Display DPI
- plt.rcParams["font.sans-serif"] = ["SimHei"]  # Display Chinese labels correctly
- plt.rcParams["axes.unicode_minus"] = False    # Keep the minus sign from rendering as a box
- from mpl_toolkits.mplot3d import Axes3D
- x = RFregressor.feature_importances_  # feature importances from the random forest trained above
- y = gbdt.feature_importances_         # feature importances from the GBDT model
- fig = plt.figure()
- plt.subplots_adjust(right=0.8)
- ax = fig.add_subplot(111, projection='3d')  # Create a set of 3D axes
- ax.scatter(x, y, z, c='b', s=5, alpha=1)
- # Set the x/y tick label sizes
- plt.xticks(fontsize=7)
- plt.yticks(fontsize=7)
- # Set the tick label size for all axes at once
- plt.tick_params(labelsize=7)
- # Set the x/y/z axis labels
- plt.xlabel("x轴", fontsize=8)
- plt.ylabel("y轴", fontsize=8)
- ax.set_zlabel('z轴', fontsize=8)
- plt.savefig('这是三维图.png')
Importing custom packages with automatic reload in Jupyter Notebook when the .py file changes
- %load_ext autoreload
- %autoreload 2
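With these two magics enabled, a locally defined module can be imported once and later edits to its .py file are picked up without restarting the kernel. A minimal sketch, assuming a hypothetical my_utils.py next to the notebook that defines preprocess():
- # my_utils.py is a hypothetical local module sitting next to the notebook
- import my_utils
- cleaned = my_utils.preprocess(Molecular_Descriptor)
- # After editing my_utils.py, simply re-run this cell; autoreload picks up the new code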
Dropping DataFrame rows whose value in a given column is duplicated
dataframe_feature_importance = dataframe_feature_importance.drop_duplicates(subset=['feature_name'], keep='first', inplace=False)
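A small sketch of the behavior on a toy DataFrame (hypothetical values): rows that repeat a feature_name keep only their first occurrence.
- demo = pd.DataFrame({'feature_name': ['gmin', 'hmax', 'gmin'],
-                      'importance': [0.30, 0.20, 0.10]})
- print(demo.drop_duplicates(subset=['feature_name'], keep='first'))
- # Keeps rows 0 ('gmin', 0.30) and 1 ('hmax', 0.20); the second 'gmin' row is dropped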
LASSO regression
- from sklearn import linear_model
-
- model = linear_model.LassoCV()
- model.fit(X_train, y_train)
- y_predict = model.predict(X_test)
- print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_predict))
- print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_predict))
- print('Root Mean Squared Error:',
- np.sqrt(metrics.mean_squared_error(y_test, y_predict)))
Plotting the regression error (predicted vs. true values)
- x_t = np.linspace(0, len(np.array(y_test)), len(np.array(y_test)))
- plt.plot(x_t, y_test, marker='.', label="origin data")
- # plt.xticks([])
- plt.plot(x_t, y_predict, 'r-', marker='.', label="predict", lw=1)
- plt.xlabel('样本编号')
- plt.ylabel('预测结果')
- # plt.figure(figsize=(10,100))
- plt.legend(labels=['test','predict'],loc='best')
- # plt.xticks([])
- score = model.score(X_test,y_test)
- print(score)
- plt.text(140, 3, 'score=%.4f' % score, fontdict={'size': 15, 'color': 'red'})
- plt.savefig('Lasso.png')
Output:
AdaBoost regression
- from sklearn.ensemble import AdaBoostRegressor
- from sklearn.tree import DecisionTreeRegressor
- clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=3),
-                         n_estimators=5000, random_state=123)
- clf.fit(X_train, y_train)
- y_predict = clf.predict(X_test)
- print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_predict))
- print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_predict))
- print('Root Mean Squared Error:',
-       np.sqrt(metrics.mean_squared_error(y_test, y_predict)))
LightGBM regression
- import lightgbm as lgb
-
- clf = lgb.LGBMRegressor(
- boosting_type='gbdt',
- random_state=2019,
- objective='regression')
- # Train the model
- clf.fit(X_train, y_train)
- y_predict = clf.predict(X_test)
- print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_predict))
- print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_predict))
- print('Root Mean Squared Error:',
- np.sqrt(metrics.mean_squared_error(y_test, y_predict)))
XGBoost
- import xgboost as xgb
- clf = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=5000, silent=False, objective='reg:gamma')
- # Train the model
- clf.fit(X=X_train, y=y_train)
- y_predict = clf.predict(X_test)
- print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_predict))
- print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_predict))
- print('Root Mean Squared Error:',
- np.sqrt(metrics.mean_squared_error(y_test, y_predict)))
Plotting learning curves
- from sklearn.model_selection import learning_curve
- from sklearn.model_selection import ShuffleSplit
- def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
-     plt.figure()
-     plt.title(title)
-     if ylim is not None:
-         plt.ylim(*ylim)
-     plt.xlabel("Training examples")
-     plt.ylabel("Score")
-     train_sizes, train_scores, test_scores = learning_curve(
-         estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
-     train_scores_mean = np.mean(train_scores, axis=1)
-     train_scores_std = np.std(train_scores, axis=1)
-     test_scores_mean = np.mean(test_scores, axis=1)
-     test_scores_std = np.std(test_scores, axis=1)
-     plt.grid()
-
-     plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
-                      train_scores_mean + train_scores_std, alpha=0.1,
-                      color="r")
-     plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
-                      test_scores_mean + test_scores_std, alpha=0.1, color="g")
-     plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
-              label="Training score")
-     plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
-              label="Cross-validation score")
-
-     plt.legend(loc="best")
-     return plt
- if __name__ == '__main__':
-     title = "Learning Curves"
-     # Shuffle-split cross validation (10 splits, 20% of the data held out each time)
-     # to obtain smoother mean train and validation score curves.
-     cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
-     estimator = lgb.LGBMRegressor(learning_rate=0.001,
-                                   max_depth=-1,
-                                   n_estimators=10000,
-                                   boosting_type='gbdt',
-                                   random_state=2019,
-                                   objective='regression')
-     # model, plot title, data, labels, CV splitter
-     # (XX and YY are assumed to be the full feature matrix and targets, e.g. X and Y from above)
-     p = plot_learning_curve(estimator, title, XX, YY, cv=cv, n_jobs=4)
-     p.savefig('LearnCurves.png')
Output:
Plotting the distribution of DataFrame columns
- # Molecular descriptors (columns) to plot
- name = ['gmin', 'MDEC-22', 'minaaN', 'maxHBint10', 'minHBint10', 'maxdO',
-         'C2SP1', 'BCUTw-1h', 'BCUTp-1l', 'MDEN-33', 'VC-4', 'nAtomLAC',
-         'SHBint10', 'minHBint4', 'C2SP2', 'MDEC-24', 'hmax', 'SHBint9',
-         'fragC', 'LipinskiFailures']
- # Extract the specified columns
- t = Molecular_Descriptor[name]
- # Min-max normalization
- t = (t - t.min()) / (t.max() - t.min())
-
- t.plot(alpha=0.8)
- # Stretch the figure horizontally so the x-axis is readable
- N = 100
- plt.legend(loc=2, bbox_to_anchor=(1.05, 1.0), borderaxespad=0)
- # change x internal size
- plt.gca().margins(x=0)
- plt.gcf().canvas.draw()
- tl = plt.gca().get_xticklabels()
- # maxsize = max([t.get_window_extent().width for t in tl])
- maxsize = 30
- m = 0.2  # inch margin
- s = maxsize / plt.gcf().dpi * N + 2 * m
- margin = m / plt.gcf().get_size_inches()[0]
-
- plt.gcf().subplots_adjust(left=margin, right=1. - margin)
- plt.gcf().set_size_inches(s, plt.gcf().get_size_inches()[1])
- # Tidy up the layout
- plt.tight_layout()
- plt.savefig("数据分布.png")
Output:
SVM classification
- from sklearn.svm import SVC
- from sklearn import metrics
- # Define the SVM classifier
- clf = SVC()
- # Train the model
- clf.fit(X_train, y_train)
- # Predict on the test set
- y_pred = clf.predict(X_test)
- # Evaluate the model
- print('Accuracy=%.4f' % metrics.accuracy_score(y_test, y_pred))
- print('Recall=%.4f' % metrics.recall_score(y_test, y_pred, pos_label=1))
- print('Precision=%.4f' % metrics.precision_score(y_test, y_pred, pos_label=1))
- print('F1=%.4f' % metrics.f1_score(y_test, y_pred, average='weighted'))
Tuning the SVM with Bayesian optimization
- from sklearn.model_selection import cross_val_score
- from sklearn.svm import SVC
-
- from bayes_opt import BayesianOptimization
-
-
- def svc_cv(C, gamma, X_train, y_train):
-     """SVC cross validation.
-     This function will instantiate a SVC classifier with parameters C and
-     gamma. Combined with data and targets this will in turn be used to perform
-     cross validation. The result of cross validation is returned.
-     Our goal is to find combinations of C and gamma that maximizes the roc_auc
-     metric.
-     """
-     # Set up the classifier
-     estimator = SVC(C=C, gamma=gamma, random_state=2)
-     # Cross validation
-     cval = cross_val_score(estimator, X_train, y_train, scoring='roc_auc', cv=4)
-     return cval.mean()
-
- def optimize_svc(X_train, y_train):
-     """Apply Bayesian Optimization to SVC parameters."""
-
-     def svc_crossval(expC, expGamma):
-         """Wrapper of SVC cross validation.
-         Notice how we transform between regular and log scale. While this
-         is not technically necessary, it greatly improves the performance
-         of the optimizer.
-         """
-         C = 10 ** expC
-         gamma = 10 ** expGamma
-         return svc_cv(C=C, gamma=gamma, X_train=X_train, y_train=y_train)
-
-     optimizer = BayesianOptimization(
-         f=svc_crossval,
-         # Hyperparameter search ranges
-         pbounds={"expC": (-3, 2), "expGamma": (-4, -1)},
-         random_state=1234,
-         verbose=2
-     )
-     optimizer.maximize(n_iter=10)
-
-     print("Final result:", optimizer.max)
-
- if __name__ == '__main__':
-     # Start the hyperparameter search
-     optimize_svc(X_train, y_train)
-
Output:
- | iter | target | expC | expGamma |
- -------------------------------------------------
- | 1 | 0.8239 | -2.042 | -2.134 |
- | 2 | 0.8973 | -0.8114 | -1.644 |
- | 3 | 0.8791 | 0.8999 | -3.182 |
- | 4 | 0.8635 | -1.618 | -1.594 |
- | 5 | 0.9104 | 1.791 | -1.372 |
- | 6 | 0.9213 | 1.099 | -1.502 |
- | 7 | 0.9165 | 0.2084 | -1.0 |
- | 8 | 0.8727 | 2.0 | -4.0 |
- | 9 | 0.9117 | 1.131 | -1.0 |
- | 10 | 0.9241 | 0.3228 | -1.88 |
- | 11 | 0.9346 | 2.0 | -2.322 |
- | 12 | 0.9335 | 1.429 | -2.239 |
- | 13 | 0.7927 | -3.0 | -4.0 |
- | 14 | 0.927 | 2.0 | -2.715 |
- | 15 | 0.9354 | 1.742 | -2.249 |
- =================================================
- Final result: {'target': 0.9353828944247531, 'params': {'expC': 1.7417094883510253, 'expGamma': -2.248984327197053}}
In this table, iter is the iteration index, target is the score achieved by the model (higher is better), and expC and expGamma are the parameters being tuned by Bayesian optimization.
Follow-up:
How do you use the result? Simply retrain the classifier with the hyperparameters found by the search, 'params': {'expC': 1.7417094883510253, 'expGamma': -2.248984327197053}:
- clf = SVC(C=10**1.74,gamma=10**(-2.248))
- clf.fit(X_train,y_train)
- y_pred = clf.predict(X_test)
Plotting the ROC curve
- import matplotlib.pyplot as plt
- from sklearn.metrics import roc_curve, auc
-
- plt.rcParams['savefig.dpi'] = 150  # Saved figure DPI
- plt.rcParams['figure.dpi'] = 150   # Display DPI
- # Pass in the true labels and the predictions
- fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
-
- for i, value in enumerate(thresholds):
-     print("%f %f %f" % (fpr[i], tpr[i], value))
- roc_auc = auc(fpr, tpr)
-
- plt.plot(fpr, tpr, 'k--', label='ROC (area = {0:.2f})'.format(roc_auc), lw=2, c='r')
-
- plt.xlim([-0.05, 1.05])  # Pad the x/y limits slightly so the curve does not overlap the axes
- plt.ylim([-0.05, 1.05])
- plt.xlabel('False Positive Rate')
- plt.ylabel('True Positive Rate')  # Chinese labels also work once a suitable font is configured
- plt.title('ROC Curve')
- plt.legend(loc="lower right")
- plt.savefig('Caco-2分类ROC曲线.png')
- plt.show()
- print(roc_auc)
Output:
PCA dimensionality reduction
- from sklearn.decomposition import PCA
- # Define the PCA transformer; n_components is the target dimensionality
- pca = PCA(n_components=50)
- # X.shape = (1974, 729)
- # Transform the data: (1974, 729) -> (1974, 50)
- new_X = pca.fit_transform(X)
- # new_X.shape = (1974, 50)
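One way to sanity-check the choice of n_components (not covered in the original notes) is to look at how much of the variance the retained components explain; a minimal sketch using the fitted pca object:
- # Total fraction of the variance captured by the 50 retained components
- print(pca.explained_variance_ratio_.sum())
- # Cumulative explained variance; useful for picking the smallest n_components above a target such as 0.95
- print(np.cumsum(pca.explained_variance_ratio_))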
Visualizing the PCA-reduced data
- # plt.rcParams['savefig.dpi'] = 150  # Saved figure DPI
- # plt.rcParams['figure.dpi'] = 150   # Display DPI
- plt.rcParams["font.sans-serif"] = ["SimHei"]  # Display Chinese labels correctly
- plt.rcParams["axes.unicode_minus"] = False    # Keep the minus sign from rendering as a box
- from mpl_toolkits.mplot3d import Axes3D
- # Reduce the test set to 3 dimensions
- pca = PCA(n_components=3)
- pca_test = pca.fit_transform(X_test)
- pca_test.shape
- fig = plt.figure()
- plt.subplots_adjust(right=0.8)
- ax = fig.add_subplot(111, projection='3d')  # Create a set of 3D axes
- # Separate the class 0 and class 1 samples according to the predictions
- label0 = pca_test[y_pred == 0]
- label1 = pca_test[y_pred == 1]
- ax.scatter(label0[:, 0], label0[:, 1], label0[:, 2], label=0, alpha=0.8)
- ax.scatter(label1[:, 0], label1[:, 1], label1[:, 2], label=1, alpha=0.8)
- plt.legend()
- plt.savefig('Caco2分类三维图像.png')
Output:
Solving for extrema (constrained optimization)
- # coding=utf-8
- from scipy.optimize import minimize
- import numpy as np
-
- # Parameter ranges / constraints
- l_x_min = [0, 1, 2, 3]
- l_x_max = [4, 5, 6, 7]
- def fun():
-     # minimize can only find minima; to find a maximum, negate the objective (this example finds a maximum)
-     v = lambda x: -1 * (coef[0]*x[0] + coef[1]*x[1] + coef[2]*x[2] + coef[3]*x[3] + intercept)
-     return v
- def con():
-     # Constraints come in two kinds: 'eq' and 'ineq'.
-     # 'eq' means the expression must equal 0; 'ineq' means the expression must be >= 0.
-     # {'type': 'ineq', 'fun': lambda x: x[0] - l_x_min[0]} means x[0] - l_x_min[0] >= 0
-     cons = ({'type': 'ineq', 'fun': lambda x: x[0] - l_x_min[0]},
-             {'type': 'ineq', 'fun': lambda x: -x[0] + l_x_max[0]},
-             {'type': 'ineq', 'fun': lambda x: x[1] - l_x_min[1]},
-             {'type': 'ineq', 'fun': lambda x: -x[1] + l_x_max[1]},
-             {'type': 'ineq', 'fun': lambda x: x[2] - l_x_min[2]},
-             {'type': 'ineq', 'fun': lambda x: -x[2] + l_x_max[2]},
-             {'type': 'ineq', 'fun': lambda x: x[3] - l_x_min[3]},
-             {'type': 'ineq', 'fun': lambda x: -x[3] + l_x_max[3]})
-
-     return cons
-
- if __name__ == "__main__":
-     # Define the constants coef and intercept used by fun() here (see the note after the output)
-
-     cons = con()
-     # Initial guess
-     x0 = np.random.rand(4)
-
-     res = minimize(fun(), x0, method='SLSQP', constraints=cons)
-     print(res.fun)
-     print(res.success)
-     print(res.x)
Output explanation:
- # Example:
- [output]:
- -1114.4862509294192  # Because the objective was negated at the start, multiply by -1 again: the maximum is 1114.4862509294192
- True  # An extremum was found successfully
- [-1.90754988e-10 6.36254335e+00 -1.25920646e-10 1.90480000e-01]  # The values of x at which the extremum is attained
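The snippet above never defines coef and intercept. One possible way to obtain them, assuming the linear objective is taken from a fitted linear model such as the LassoCV model trained earlier (a hypothetical choice, not stated in the original notes):
- # Assumption: use the fitted LassoCV model's coefficients as the linear objective;
- # only the first four coefficients are used here, matching the four decision variables x[0]..x[3]
- coef = model.coef_[:4]
- intercept = model.intercept_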