WHCSRL 技术网

数学建模的常用功能

目录

pandas读取数据

查看数据异常

提取指定列

将dataframe数据以numpy形式提取

数据划分

随机森林回归

GBDT回归

特征重要性可视化

 输出:

​ 绘制3D散点图

导入自定义包且.py文件修改时jupyter notebook自动同步

 dataframe删除某列中重复字段并删除对应行

LASSO回归

 绘制回归误差图

输出:

​ Adaboost回归

LightGBM回归 

XGBoost

绘制学习曲线

 输出:

绘制dataframe数据分布图

输出:

SVM分类

使用贝叶斯优化SVM

输出:

后续:

 绘制ROC曲线

输出:

 PCA降维

PCA降维可视化

输出:

求解极值

 输出解释:


pandas读取数据

  1. import numpy as np
  2. import pandas as pd
  3. import random
  4. Molecular_Descriptor = pd.read_excel('Molecular_Descriptor.xlsx',header=0)
  5. Molecular_Descriptor.head()

查看数据异常

  1. #判断数据NAN,INF
  2. print(Molecular_Descriptor.isnull().any())
  3. print(np.isnan(Molecular_Descriptor).any())
  4. print(np.isfinite(Molecular_Descriptor).all())
  5. print(np.isinf(Molecular_Descriptor).all())

提取指定列

Molecular_Descriptor.iloc[:,1:]

将dataframe数据以numpy形式提取

  1. # .values能够将dataframe中的数据以numpy的形式读取
  2. X = Molecular_Descriptor.iloc[:,1:].values
  3. Y = ERα_activity.iloc[:,2].values

数据划分

  1. from sklearn.model_selection import train_test_split
  2. X_train, X_test, y_train, y_test = train_test_split(X,
  3. Y,
  4. test_size=0.2,
  5. random_state=0)
  6. #打印出原始样本集、训练集和测试集的数目
  7. print("The length of original data X is:", X.shape[0])
  8. print("The length of train Data is:", X_train.shape[0])
  9. print("The length of test Data is:", X_test.shape[0])

随机森林回归

  1. #导入随机森林库
  2. from sklearn.ensemble import RandomForestRegressor
  3. #导入sklearn度量库
  4. from sklearn import metrics
  5. #定义分类器
  6. RFregressor = RandomForestRegressor(n_estimators=200, random_state=0)
  7. #模型训练
  8. RFregressor.fit(X_train, y_train)
  9. #模型预测
  10. y_pred = RFregressor.predict(X_test)
  11. #输出回归模型评价指标
  12. print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
  13. print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
  14. print('Root Mean Squared Error:',
  15. np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
  16. #获得特征重要性
  17. print(RFregressor.feature_importances_)

GBDT回归

  1. from sklearn.ensemble import GradientBoostingRegressor
  2. gbdt = GradientBoostingRegressor(random_state=0)
  3. gbdt.fit(X_train, y_train)
  4. y_pred = gbdt.predict(X_test)
  5. print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
  6. print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
  7. print('Root Mean Squared Error:',
  8. np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

特征重要性可视化

  1. import matplotlib.pyplot as plt
  2. plt.rcParams["font.sans-serif"] = ["SimHei"] # 用来正常显示中文标签
  3. plt.rcParams["axes.unicode_minus"] = False # 解决负号"-"显示为方块的问题
  4. plt.rcParams['savefig.dpi'] = 150 #图片像素
  5. plt.rcParams['figure.dpi'] = 150 #分辨率
  6. def plot_feature_importance(dataset, model_bst):
  7. '''
  8. dataset : 数据集 dataframe
  9. model_bst : 训练好的模型
  10. '''
  11. list_feature_name = list(dataset.columns[1:])
  12. list_feature_importance = list(model_bst.feature_importances_)
  13. dataframe_feature_importance = pd.DataFrame(
  14. {'feature_name': list_feature_name, 'importance': list_feature_importance})
  15. dataframe_feature_importance20 = dataframe_feature_importance.sort_values(by='importance', ascending=False)[:20]
  16. print(dataframe_feature_importance20)
  17. x = range(len(dataframe_feature_importance20['feature_name']))
  18. plt.xticks(x, dataframe_feature_importance20['feature_name'], rotation=90, fontsize=8)
  19. plt.plot(x, dataframe_feature_importance20['importance'])
  20. plt.xlabel("分子描述符")
  21. plt.ylabel("重要程度")
  22. plt.title('重要程度可视化')
  23. plt.grid()
  24. #保存图像
  25. #plt.savefig('重要程度可视化.png')
  26. plt.show()
  27. return dataframe_feature_importance20['feature_name']
  28. if __name__ == '__main__':
  29. # 传入数据集dataframe , 模型对特征重要性进行评估
  30. gbdt_name = plot_feature_importance(Molecular_Descriptor,gbdt)

 输出:

 绘制3D散点图

  1. z = list(range(0,729))
  2. plt.rcParams['savefig.dpi'] = 150 #图片像素
  3. plt.rcParams['figure.dpi'] = 150 #分辨率
  4. plt.rcParams["font.sans-serif"] = ["SimHei"] # 用来正常显示中文标签
  5. plt.rcParams["axes.unicode_minus"] = False # 解决负号"-"显示为方块的问题
  6. from mpl_toolkits.mplot3d import Axes3D
  7. x = RFregressor.feature_importances_
  8. y = gbdt.feature_importances_
  9. fig = plt.figure()
  10. plt.subplots_adjust(right=0.8)
  11. ax = fig.add_subplot(111, projection='3d') # 创建一个三维的绘图工程
  12. ax.scatter(x,y,z,c='b',s=5,alpha=1)
  13. #设置x、y轴坐标刻标以及对应的标签
  14. plt.xticks(fontsize=7)
  15. plt.yticks(fontsize=7)
  16. #统一设置x、y、z轴标签字体
  17. plt.tick_params(labelsize=7)
  18. #设置x、y、z标签
  19. plt.xlabel("x轴",fontsize=8)
  20. plt.ylabel("y轴",fontsize=8)
  21. ax.set_zlabel('z轴',fontsize=8)
  22. plt.savefig('这是三维图.png')

导入自定义包且.py文件修改时jupyter notebook自动同步

  1. %load_ext autoreload
  2. %autoreload 2

 dataframe删除某列中重复字段并删除对应行

dataframe_feature_importance = dataframe_feature_importance.drop_duplicates(subset=['feature_name'], keep='first', inplace=False)

LASSO回归

  1. from sklearn import linear_model
  2. model = linear_model.LassoCV()
  3. model.fit(X_train, y_train)
  4. y_predict = model.predict(X_test)
  5. print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_predict))
  6. print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_predict))
  7. print('Root Mean Squared Error:',
  8. np.sqrt(metrics.mean_squared_error(y_test, y_predict)))

 绘制回归误差图

  1. x_t = np.linspace(0, len(np.array(y_test)), len(np.array(y_test)))
  2. plt.plot(x_t, y_test, marker='.', label="origin data")
  3. # plt.xticks([])
  4. plt.plot(x_t, y_predict, 'r-', marker='.', label="predict", lw=1)
  5. plt.xlabel('样本编号')
  6. plt.ylabel('预测结果')
  7. # plt.figure(figsize=(10,100))
  8. plt.legend(labels=['test','predict'],loc='best')
  9. # plt.xticks([])
  10. score = model.score(X_test,y_test)
  11. print(score)
  12. plt.text(140, 3, 'score=%.4f' % score, fontdict={'size': 15, 'color': 'red'})
  13. plt.savefig('Lasso.png')

输出:

 Adaboost回归

  1. from sklearn.ensemble import AdaBoostRegressor; from sklearn.tree import DecisionTreeRegressor
  2. clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth=3),
  3. n_estimators=5000, random_state=123)
  4. clf.fit(X_train,y_train)
  5. y_predict = clf.predict(X_test)
  6. print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_predict))
  7. print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_predict))
  8. print('Root Mean Squared Error:',
  9. np.sqrt(metrics.mean_squared_error(y_test, y_predict)))

LightGBM回归 

  1. import lightgbm as lgb
  2. clf = lgb.LGBMRegressor(
  3. boosting_type='gbdt',
  4. random_state=2019,
  5. objective='regression')
  6. # 训练模型
  7. clf.fit(X=X_train, y=y_train, eval_metric='MSE', verbose=50)
  8. y_predict = clf.predict(X_test)
  9. print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_predict))
  10. print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_predict))
  11. print('Root Mean Squared Error:',
  12. np.sqrt(metrics.mean_squared_error(y_test, y_predict)))

XGBoost

  1. import xgboost as xgb
  2. clf = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=5000, silent=False, objective='reg:gamma')
  3. # 训练模型
  4. clf.fit(X=X_train, y=y_train)
  5. y_predict = clf.predict(X_test)
  6. print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_predict))
  7. print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_predict))
  8. print('Root Mean Squared Error:',
  9. np.sqrt(metrics.mean_squared_error(y_test, y_predict)))

绘制学习曲线

  1. from sklearn.model_selection import learning_curve
  2. from sklearn.model_selection import ShuffleSplit
  3. def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
  4. plt.figure()
  5. plt.title(title)
  6. if ylim is not None:
  7. plt.ylim(*ylim)
  8. plt.xlabel("Training examples")
  9. plt.ylabel("Score")
  10. train_sizes, train_scores, test_scores = learning_curve(
  11. estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
  12. train_scores_mean = np.mean(train_scores, axis=1)
  13. train_scores_std = np.std(train_scores, axis=1)
  14. test_scores_mean = np.mean(test_scores, axis=1)
  15. test_scores_std = np.std(test_scores, axis=1)
  16. plt.grid()
  17. plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
  18. train_scores_mean + train_scores_std, alpha=0.1,
  19. color="r")
  20. plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
  21. test_scores_mean + test_scores_std, alpha=0.1, color="g")
  22. plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
  23. label="Training score")
  24. plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
  25. label="Cross-validation score")
  26. plt.legend(loc="best")
  27. return plt
  28. if __name__ == '__main__':
  29. title = "Learning Curves"
  30. # Cross validation with 100 iterations to get smoother mean test and train
  31. # score curves, each time with 20% data randomly selected as a validation set.
  32. cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
  33. estimator =lgb.LGBMRegressor(learning_rate=0.001,
  34. max_depth=-1,
  35. n_estimators=10000,
  36. boosting_type='gbdt',
  37. random_state=2019,
  38. objective='regression',)
  39. #模型 图像标题 数据 标签 K折
  40. p = plot_learning_curve(estimator, title, XX, YY, cv=cv, n_jobs=4)
  41. p.savefig('LearnCurves.png')

 输出:

绘制dataframe数据分布图

  1. #
  2. name = ['gmin', 'MDEC-22', 'minaaN', 'maxHBint10', 'minHBint10', 'maxdO',
  3. 'C2SP1', 'BCUTw-1h', 'BCUTp-1l', 'MDEN-33', 'VC-4', 'nAtomLAC',
  4. 'SHBint10', 'minHBint4', 'C2SP2', 'MDEC-24', 'hmax', 'SHBint9',
  5. 'fragC', 'LipinskiFailures']
  6. # 提取数据指定列
  7. t = Molecular_Descriptor[name]
  8. #数据归一化
  9. t = (t-t.min())/(t.max()-t.min())
  10. t.plot(alpha=0.8)
  11. #横向拉长x轴
  12. N=100
  13. plt.legend(loc=2, bbox_to_anchor=(1.05,1.0),borderaxespad= 0)
  14. # change x internal size
  15. plt.gca().margins(x=0)
  16. plt.gcf().canvas.draw()
  17. tl = plt.gca().get_xticklabels()
  18. # maxsize = max([t.get_window_extent().width for t in tl])
  19. maxsize = 30
  20. m = 0.2 # inch margin
  21. s = maxsize / plt.gcf().dpi * N + 2 * m
  22. margin = m / plt.gcf().get_size_inches()[0]
  23. plt.gcf().subplots_adjust(left=margin, right=1. - margin)
  24. plt.gcf().set_size_inches(s, plt.gcf().get_size_inches()[1])
  25. #合理布局
  26. plt.tight_layout()
  27. plt.savefig("数据分布.png")

输出:

SVM分类

  1. from sklearn.svm import SVC
  2. from sklearn import metrics
  3. #定义SVM分类器
  4. clf = SVC()
  5. #模型训练
  6. clf.fit(X_train,y_train)
  7. #模型预测
  8. y_pred = clf.predict(X_test)
  9. #模型评估
  10. print('准确率=%.4f'%metrics.accuracy_score(y_test,y_pred))
  11. print('召回率=%.4f'%metrics.recall_score(y_test, y_pred, pos_label=1))
  12. print('精准率=%.4f'%metrics.precision_score(y_test, y_pred, pos_label=1) )
  13. print('F1=%.4f'%metrics.f1_score(y_test, y_pred, average='weighted',pos_label=1) )

使用贝叶斯优化SVM

  1. from sklearn.datasets import make_classification
  2. from sklearn.model_selection import cross_val_score
  3. from sklearn.ensemble import RandomForestClassifier as RFC
  4. from sklearn.svm import SVC
  5. from bayes_opt import BayesianOptimization
  6. from bayes_opt.util import Colours
  7. def svc_cv(C, gamma, X_train, y_train):
  8. """SVC cross validation.
  9. This function will instantiate a SVC classifier with parameters C and
  10. gamma. Combined with data and targets this will in turn be used to perform
  11. cross validation. The result of cross validation is returned.
  12. Our goal is to find combinations of C and gamma that maximizes the roc_auc
  13. metric.
  14. """
  15. #设置分类器
  16. estimator = SVC(C=C, gamma=gamma, random_state=2)
  17. #交叉验证
  18. cval = cross_val_score(estimator, X_train, y_train, scoring='roc_auc', cv=4)
  19. return cval.mean()
  20. def optimize_svc(X_train, y_train):
  21. """Apply Bayesian Optimization to SVC parameters."""
  22. def svc_crossval(expC, expGamma):
  23. """Wrapper of SVC cross validation.
  24. Notice how we transform between regular and log scale. While this
  25. is not technically necessary, it greatly improves the performance
  26. of the optimizer.
  27. """
  28. C = 10 ** expC
  29. gamma = 10 ** expGamma
  30. return svc_cv(C=C, gamma=gamma, X_train=X_train, y_train=y_train)
  31. optimizer = BayesianOptimization(
  32. f=svc_crossval,
  33. #设置超参范围
  34. pbounds={"expC": (-3, 2), "expGamma": (-4, -1)},
  35. random_state=1234,
  36. verbose=2
  37. )
  38. optimizer.maximize(n_iter=10)
  39. print("Final result:", optimizer.max)
  40. if __name__ == '__main__':
  41. #开始搜索超参
  42. optimize_svc(X_train, y_train)

输出:

  1. | iter | target | expC | expGamma |
  2. -------------------------------------------------
  3. | 1 | 0.8239 | -2.042 | -2.134 |
  4. | 2 | 0.8973 | -0.8114 | -1.644 |
  5. | 3 | 0.8791 | 0.8999 | -3.182 |
  6. | 4 | 0.8635 | -1.618 | -1.594 |
  7. | 5 | 0.9104 | 1.791 | -1.372 |
  8. | 6 | 0.9213 | 1.099 | -1.502 |
  9. | 7 | 0.9165 | 0.2084 | -1.0 |
  10. | 8 | 0.8727 | 2.0 | -4.0 |
  11. | 9 | 0.9117 | 1.131 | -1.0 |
  12. | 10 | 0.9241 | 0.3228 | -1.88 |
  13. | 11 | 0.9346 | 2.0 | -2.322 |
  14. | 12 | 0.9335 | 1.429 | -2.239 |
  15. | 13 | 0.7927 | -3.0 | -4.0 |
  16. | 14 | 0.927 | 2.0 | -2.715 |
  17. | 15 | 0.9354 | 1.742 | -2.249 |
  18. =================================================
  19. Final result: {'target': 0.9353828944247531, 'params': {'expC': 1.7417094883510253, 'expGamma': -2.248984327197053}}

 iter为迭代次数,target为模型所获得的分数(越高越好),expC、expGamma为需要贝叶斯优化的参数

后续:

如何使用?根据搜索到的超参数'params': {'expC': 1.7417094883510253, 'expGamma': -2.248984327197053}重新训练分类器即可

  1. clf = SVC(C=10**1.74,gamma=10**(-2.248))
  2. clf.fit(X_train,y_train)
  3. y_pred = clf.predict(X_test)

 绘制ROC曲线

  1. import matplotlib.pyplot as plt
  2. from sklearn.metrics import roc_curve, auc
  3. import matplotlib.pyplot as plt
  4. plt.rcParams['savefig.dpi'] = 150 #图片像素
  5. plt.rcParams['figure.dpi'] = 150 #分辨率
  6. #传入真实值和预测值
  7. fpr, tpr, thersholds = roc_curve(y_test, y_pred, pos_label=1)
  8. for i, value in enumerate(thersholds):
  9. print("%f %f %f" % (fpr[i], tpr[i], value))
  10. roc_auc = auc(fpr, tpr)
  11. plt.plot(fpr, tpr, 'k--', label='ROC (area = {0:.2f})'.format(roc_auc), lw=2,c='r')
  12. plt.xlim([-0.05, 1.05]) # 设置x、y轴的上下限,以免和边缘重合,更好的观察图像的整体
  13. plt.ylim([-0.05, 1.05])
  14. plt.xlabel('False Positive Rate')
  15. plt.ylabel('True Positive Rate') # 可以使用中文,但需要导入一些库即字体
  16. plt.title('ROC Curve')
  17. plt.legend(loc="lower right")
  18. plt.savefig('Caco-2分类ROC曲线.png')
  19. plt.show()
  20. print(roc_auc)

输出:

 PCA降维

  1. from sklearn.decomposition import PCA
  2. #定义PCA分类器,n_components为需要降到的维数
  3. pca = PCA(n_components=50)
  4. # X.shape = (1974,729)
  5. #数据转换 (1974,729) -> (1974,50)
  6. new_X = pca.fit_transform(X)
  7. #new_X.shape = (1974,50)

PCA降维可视化

  1. # plt.rcParams['savefig.dpi'] = 150 #图片像素
  2. # plt.rcParams['figure.dpi'] = 150 #分辨率
  3. plt.rcParams["font.sans-serif"] = ["SimHei"] # 用来正常显示中文标签
  4. plt.rcParams["axes.unicode_minus"] = False # 解决负号"-"显示为方块的问题
  5. from mpl_toolkits.mplot3d import Axes3D
  6. # 降到3维
  7. pca = PCA(n_components=3)
  8. pca_test = pca.fit_transform(X_test)
  9. pca_test.shape
  10. fig = plt.figure()
  11. plt.subplots_adjust(right=0.8)
  12. ax = fig.add_subplot(111, projection='3d') # 创建一个三维的绘图工程
  13. # y_pred==0  (无实际作用的残留表达式)
  14. #分离0 1
  15. label0 = pca_test[y_pred==0]
  16. label1 = pca_test[y_pred==1]
  17. # label0
  18. ax.scatter(label0[:,0],label0[:,1],label0[:,2],label=0,alpha=0.8)
  19. ax.scatter(label1[:,0],label1[:,1],label1[:,2],label=1,alpha=0.8)
  20. plt.legend()
  21. plt.savefig('Caco2分类三维图像.png')

输出:

求解极值

  1. # coding=utf-8
  2. from scipy.optimize import minimize
  3. import numpy as np
  4. #设置参数范围/约束条件
  5. l_x_min = [0,1,2,3]
  6. l_x_max = [4,5,6,7]
  7. def fun():
  8. #minimize只能求极小值,如果需要极大值,则在函数前添加负号,本案例为求极大值
  9. v=lambda x: -1*(coef[0]*x[0]+coef[1]*x[1]+coef[2]*x[2]+coef[3]*x[3]+intercept)
  10. return v
  11. def con():
  12. # 约束条件 分为eq 和ineq
  13. #eq表示 函数结果等于0 ; ineq 表示 表达式大于等于0
  14. #{'type': 'ineq', 'fun': lambda x: x[0] - l_x_min[0]}表示 x[0] - l_x_min[0]>=0
  15. cons = ({'type': 'ineq', 'fun': lambda x: x[0] - l_x_min[0]},
  16. {'type': 'ineq', 'fun': lambda x: -x[0] + l_x_max[0]},
  17. {'type': 'ineq', 'fun': lambda x: x[1] - l_x_min[1]},
  18. {'type': 'ineq', 'fun': lambda x: -x[1] + l_x_max[1]},
  19. {'type': 'ineq', 'fun': lambda x: x[2] - l_x_min[2]},
  20. {'type': 'ineq', 'fun': lambda x: -x[2] + l_x_max[2]},
  21. {'type': 'ineq', 'fun': lambda x: x[3] - l_x_min[3]},
  22. {'type': 'ineq', 'fun': lambda x: -x[3] + l_x_max[3]})
  23. return cons
  24. if __name__ == "__main__":
  25. #定义常量值
  26. cons = con()
  27. #设置初始猜测值
  28. x0 = np.random.rand(4)
  29. res = minimize(fun(), x0, method='SLSQP',constraints=cons)
  30. print(res.fun)
  31. print(res.success)
  32. print(res.x)

输出解释:

  1. #举例:
  2. [output]:
  3. -1114.4862509294192 # 由于在开始时给函数添加符号,最后还需要*-1,因此极大值为1114.4862509294192
  4. True #成功找到极值
  5. [-1.90754988e-10 6.36254335e+00 -1.25920646e-10 1.90480000e-01] #该极值对应x解

推荐阅读