Python应用于乳腺癌预测!为何Python会这么叼呢?你还不来分羹?
数据探索见:python:乳腺癌预测之数据探索 实验器材 ● UCI ● python ● seaborn 进群:548377875 即可获取数十套PDF哦! 实验内容 数据预处理 对诊断结果进行二值化,方便适应所有的预测算法。同时采用 preprocessing.scale 进行量化处理 拆分数据集 按照80/20方式,拆分训练集和测试集。 训练集按照交叉验证方式进行训练。 train_x1,test_x1,train_y,test_y = train_test_split(x_value_scaled,y_values,test_size=0.2) 选择最好的预测诊断算法 本次实验分别实验了 逻辑回归,随机森林,svm,线性SVM,决策树,高斯贝叶斯,梯度迭代决策树 几种算法。并利用learning_curve来判断是否过拟合。 本次预测实验的评价标准为预测精确度。 先来定义几个常用的函数
1 来看看初步的算法筛选 总体上看逻辑回归,svm 效果是最好的。看看是不是存在过拟合的情况 并没有存在过拟合的情况。 随机森林和决策树(未调参)情况下,存在一定的过拟合情况。 2 超参数调整 从上面的实验情况看,LR和SVM是比较好的精度。现在来进一步调整下这两个算法的超参数。
C: float, optional (default=1.0) Penalty parameter C of the error term. C越小决策平面越光滑,因为对误分类的惩罚较小,C越大越倾向于精确地分类,并且此时有更多的自由去选择更多的向量作为支持向量 kernel: string, optional (default='rbf') Specifies the kernel type to be used in the algorithm. It must be one of 'linear','poly','rbf','sigmoid','precomputed' or a callable. If none is given,'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape (n_samples,n_samples). 在测试集上,有3个恶性的被判定为良性。
C: float, default: 1.0 含义如SVM的C Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. 测试集判断的情况是一样的。 3 算法融合 采用VotingClassifier的硬分类方式。采用SVM和LR融合 可以加入不同方式的算法来进一步融合,这里选择KNN
# coding: utf-8
# Breast-cancer prediction notebook, part 1 (reconstructed from a scrape-damaged
# dump: the scrape repeatedly dropped text after commas, so several calls below
# were missing arguments and have been restored — each repair is commented).
#
# Article conclusion (translated from the Chinese original): the experiment
# predicts breast cancer with several classifiers, checks overfitting with
# learning curves, tunes LR/SVM hyper-parameters, and fuses models by voting.

# In[1]: imports
import itertools
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import (RandomForestClassifier,
                              GradientBoostingClassifier,
                              VotingClassifier)
from sklearn.neural_network import MLPClassifier as MLPC

get_ipython().run_line_magic('matplotlib', 'inline')  # notebook-only magic

# In[2]-[7]: load the UCI breast-cancer CSV and take a first look.
data = pd.read_csv('f:/dm/data.csv')
col = data.columns
col
data.isnull().sum()
data.head()
data.info()
x_values = data.drop(['diagnosis', 'id'], axis=1)  # features only
y_values = data['diagnosis']                       # target: benign/malignant
data.describe()


# In[8]:
def plot_box(data, cols=3):
    """Draw one box plot per column of `data`, arranged on a grid `cols` wide."""
    size = len(data.columns)
    rows = size // cols + 1
    fig = plt.figure(figsize=(13, 10))
    cnt = 1
    for col_name in data.columns:
        ax = fig.add_subplot(rows, cols, cnt)
        plt.boxplot(data[col_name])
        ax.set_xlabel(col_name)
        cnt = cnt + 1
    plt.tight_layout()
    plt.show()

plot_box(x_values.iloc[:, 0:8], 4)
# In[9]:
plot_box(x_values.iloc[:, 8:16], 4)
# In[10]:
plot_box(x_values.iloc[:, 16:32], 4)


# In[11]:
def plot_distribution(data, target_col):
    """For each feature column, plot a histogram and a KDE split by `target_col`."""
    sns.set_style("whitegrid")
    for col_name in data.columns:
        if col_name != target_col:
            title = "# of %s vs %s " % (col_name, target_col)
            distributionOne = sns.FacetGrid(data, hue=target_col, aspect=2.5)
            distributionOne.map(plt.hist, col_name, bins=30)
            distributionOne.add_legend()
            distributionOne.set_axis_labels(col_name, 'Count')
            distributionOne.fig.suptitle(title)
            # Density view of the same feature.  The scrape dropped the hue and
            # column arguments here — reconstructed to mirror the histogram
            # facet above; TODO confirm against the original notebook.
            distributionTwo = sns.FacetGrid(data, hue=target_col, aspect=2.5)
            distributionTwo.map(sns.kdeplot, col_name, shade=True)
            distributionTwo.set(xlim=(0, data[col_name].max()))
            distributionTwo.add_legend()
            distributionTwo.set_axis_labels(col_name, 'Proportion')
            distributionTwo.fig.suptitle(title)

plot_distribution(data, 'diagnosis')
# In[12]:
g = sns.heatmap(x_values.corr(), cmap="BrBG", annot=False)
# In[13]:
plot_distribution(data[(data['area_mean'] > 500) & (data['area_mean'] < 800)],
                  'diagnosis')
# In[14]:
g = sns.heatmap(x_values.iloc[:, 1:10].corr(), annot=False)


# In[15]:
def diagnosis_to_binary(data):
    """Map the categorical diagnosis labels to integers 0/1 in place."""
    data["diagnosis"] = data["diagnosis"].astype("category")
    data["diagnosis"].cat.categories = [0, 1]
    data["diagnosis"] = data["diagnosis"].astype("int")

diagnosis_to_binary(data)
# Scrape damage repaired: the dump read `data.drop(['diagnosis',axis = 1)`.
# 'id' must be dropped from the features too (it becomes the index just below),
# matching the identical drop performed in cell In[6].
x_values = data.drop(['diagnosis', 'id'], axis=1)
y_values = data['diagnosis']
x_value_scaled = preprocessing.scale(x_values)  # zero mean / unit variance
x_value_scaled = pd.DataFrame(x_value_scaled, columns=x_values.columns,
                              index=data["id"])
x_value_all = x_value_scaled
#x_value_all['diag'] = y_values.tolist()
#x_value_all.head()
# In[16]:
#x_value_scaled.groupby([u'diag']).agg({'compactness_mean': [np.mean]}).reset_index()

# In[17]: PCA keeping 99% of the variance
variance_pct = .99  # minimum fraction of variance kept by the transform
pca = PCA(n_components=variance_pct)
x_transformed = pca.fit_transform(x_value_scaled, y_values)
x_values_scaled_PCA = pd.DataFrame(x_transformed)
# In[18]:
g = sns.heatmap(x_values_scaled_PCA.corr(), annot=False)

# ## Split the data set
# In[19]:
x_value_scaled.head()
# In[20]:
y_values.head()


# In[21]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot a learning curve (training vs cross-validation score).

    http://scikit-learn.org/stable/modules/learning_curve.html
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    # Scrape damage repaired: X and y were missing from this call.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    # alpha=0.1 restored to match the training band above (lost by the scrape).
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1,
                     color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt


def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix', cmap=plt.cm.Blues):
    """Render a confusion matrix, optionally row-normalized.

    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix,without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    # Scrape damage repaired: `classes` was dropped from the xticks call
    # (the yticks call below still had it).
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

dict_characters = {1: 'Malignant', 0: 'Benign'}


# In[22]:
def compareABunchOfDifferentModelsAccuracy(a, b, c, d):
    """
    compare performance of classifiers on X_train,X_test,Y_train,Y_test
    http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score
    http://scikit-learn.org/stable/modules/model_evaluation.html#accuracy-score

    a/b are the training features/labels; c/d (test set) are accepted for
    signature symmetry with the selectParametersFor* helpers but unused here.
    """
    print(' Compare Multiple Classifiers: ')
    print('K-Fold Cross-Validation Accuracy: ')
    names = []
    models = []
    resultsAccuracy = []
    models.append(('LR', LogisticRegression()))
    models.append(('RF', RandomForestClassifier()))
    models.append(('KNN', KNeighborsClassifier()))
    models.append(('SVM', SVC()))
    models.append(('LSVM', LinearSVC()))
    models.append(('GNB', GaussianNB()))
    models.append(('DTC', DecisionTreeClassifier()))
    models.append(('GBC', GradientBoostingClassifier()))
    # Learning curves first, to eyeball overfitting per model.
    for name, model in models:
        plot_learning_curve(model, 'Learning Curve For %s Classifier' % (name),
                            a, b, ylim=(0.8, 1.1), cv=10)
    for name, model in models:
        model.fit(a, b)
        # shuffle=True added: without it random_state is meaningless and
        # recent scikit-learn versions raise a ValueError.
        kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
        accuracy_results = model_selection.cross_val_score(
            model, a, b, cv=kfold, scoring='accuracy')
        resultsAccuracy.append(accuracy_results)
        names.append(name)
        accuracyMessage = "%s: %f (%f)" % (name, accuracy_results.mean(),
                                           accuracy_results.std())
        print(accuracyMessage)
    # Boxplot
    fig = plt.figure()
    fig.suptitle('Algorithm Comparison: Accuracy')
    ax = fig.add_subplot(111)
    plt.boxplot(resultsAccuracy)
    ax.set_xticklabels(names)
    ax.set_ylabel('Cross-Validation: Accuracy Score')
    plt.show()


# In[56]: 80/20 train/test split (reconstructed from the intact copy of this
# line quoted in the article body).
train_x1, test_x1, train_y, test_y = train_test_split(
    x_value_scaled, y_values, test_size=0.2)
# In[57]:
train_x1.columns
# In[58]:
#'texture_se','texture_worst'
drop_list = []
train_x = train_x1.drop(drop_list, axis=1)
test_x = test_x1.drop(drop_list, axis=1)
# In[59]:
# Breast-cancer prediction notebook, part 2 (reconstructed from a scrape-damaged
# dump).  The scrape truncated the signatures to `(a, d)` while every body
# clearly uses `b` (training labels) and `c` (test features); the original
# four-argument signatures are restored, and the call sites fixed to match.

compareABunchOfDifferentModelsAccuracy(train_x, train_y, test_x, test_y)


# In[60]:
def selectParametersForSVM(a, b, c, d):
    """Grid-search an SVC on the training set, then evaluate on the test set.

    a, b: training features / labels.
    c, d: test features / labels.
    Prints CV accuracy, plots (normalized) confusion matrices and a learning
    curve, and returns the predictions on `c`.
    """
    model = SVC()
    parameters = {'C': [0.01, 0.1, 0.5, 5.0, 10, 25, 50, 100],
                  'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
    accuracy_scorer = make_scorer(accuracy_score)
    grid_obj = GridSearchCV(model, parameters, scoring=accuracy_scorer)
    grid_obj = grid_obj.fit(a, b)
    model = grid_obj.best_estimator_  # refit the best configuration
    model.fit(a, b)
    print('Selected Parameters for SVM: ')
    print(model, " ")
    # shuffle=True added: without it random_state is meaningless and recent
    # scikit-learn versions raise a ValueError.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
    accuracy = model_selection.cross_val_score(model, a, b, cv=kfold,
                                               scoring='accuracy')
    mean = accuracy.mean()
    stdev = accuracy.std()
    print('Support Vector Machine - Training set accuracy: %s (%s)' % (mean, stdev))
    print('')
    prediction = model.predict(c)
    cnf_matrix = confusion_matrix(d, prediction)
    np.set_printoptions(precision=2)
    class_names = dict_characters
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          title='Confusion matrix')
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                          title='Normalized confusion matrix')
    # ylim lower bound 0.85 survives in the scrape; the upper bound is a
    # reconstruction — TODO confirm against the original notebook.
    plot_learning_curve(model, 'Learning Curve For SVM Classifier',
                        a, b, ylim=(0.85, 1.05), cv=10)
    return prediction


# In[61]:
def selectParametersForLR(a, b, c, d):
    """Grid-search LogisticRegression's C, then evaluate on the test set.

    Same contract as selectParametersForSVM; returns the predictions on `c`.
    """
    model = LogisticRegression()
    # The scrape kept only the endpoints `[0.01,100]`; the full grid is
    # reconstructed to mirror the SVM grid — TODO confirm.
    parameters = {'C': [0.01, 0.1, 0.5, 5.0, 10, 25, 50, 100]}
    accuracy_scorer = make_scorer(accuracy_score)
    grid_obj = GridSearchCV(model, parameters, scoring=accuracy_scorer)
    grid_obj = grid_obj.fit(a, b)
    model = grid_obj.best_estimator_
    model.fit(a, b)
    print('Selected Parameters for LR: ')
    print(model, " ")
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
    accuracy = model_selection.cross_val_score(model, a, b, cv=kfold,
                                               scoring='accuracy')
    mean = accuracy.mean()
    stdev = accuracy.std()
    print('Logistic Regression - Training set accuracy: %s (%s)' % (mean, stdev))
    print('')
    prediction = model.predict(c)
    cnf_matrix = confusion_matrix(d, prediction)
    np.set_printoptions(precision=2)
    class_names = dict_characters
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          title='Confusion matrix')
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                          title='Normalized confusion matrix')
    plot_learning_curve(model, 'Learning Curve For LR Classifier',
                        a, b, ylim=(0.85, 1.05), cv=10)
    return prediction


# In[62]:
prediction = selectParametersForLR(train_x, train_y, test_x, test_y)
# In[63]:
prediction = selectParametersForSVM(train_x, train_y, test_x, test_y)
# Collect the misclassified test samples for manual inspection.
x_err_data = pd.DataFrame(columns=train_x.columns)
real_ = test_y.tolist()
indexs = []
err_diag = []
k = 0
for i in range(len(prediction)):
    if prediction[i] != real_[i]:
        x_err_data.loc[k] = test_x.iloc[i].tolist()
        indexs.append(test_x.index[i])
        err_diag.append(test_y.iloc[i])
        k = k + 1
x_err_data.index = indexs
x_err_data["diag"] = err_diag
x_err_data
# In[64]: look up one misclassified patient by id
data[data['id'] == 91594602]


# In[65]:
def selectParametersForMLPC(a, b, c, d):
    """Grid-search an MLP classifier, then evaluate on the test set.

    http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    http://scikit-learn.org/stable/modules/grid_search.html#grid-search
    """
    model = MLPC()
    parameters = {'verbose': [False],
                  'activation': ['logistic', 'relu'],
                  'max_iter': [1000, 2000],
                  'learning_rate': ['constant', 'adaptive']}
    accuracy_scorer = make_scorer(accuracy_score)
    grid_obj = GridSearchCV(model, parameters, scoring=accuracy_scorer)
    grid_obj = grid_obj.fit(a, b)
    model = grid_obj.best_estimator_
    model.fit(a, b)
    print('Selected Parameters for Multi-Layer Perceptron NN: ')
    print(model)
    print('')
    kfold = model_selection.KFold(n_splits=10)
    accuracy = model_selection.cross_val_score(model, a, b, cv=kfold,
                                               scoring='accuracy')
    mean = accuracy.mean()
    stdev = accuracy.std()
    print('SKlearn Multi-Layer Perceptron - Training set accuracy: %s (%s)' % (mean, stdev))
    print('')
    prediction = model.predict(c)
    cnf_matrix = confusion_matrix(d, prediction)
    np.set_printoptions(precision=2)
    class_names = dict_characters
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          title='Confusion matrix')
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                          title='Normalized confusion matrix')
    plot_learning_curve(model, 'Learning Curve For MLPC Classifier',
                        a, b, ylim=(0.85, 1.1), cv=10)


# In[66]:
selectParametersForMLPC(train_x, train_y, test_x, test_y)


# In[71]:
def runVotingClassifier(a, b, c, d):
    """Hard-voting ensemble of the tuned SVM and LR plus a default KNN.

    http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
    http://scikit-learn.org/stable/modules/ensemble.html#voting-classifier
    Prints CV accuracy and plots test-set confusion matrices + learning curve.
    """
    votingC = VotingClassifier(
        estimators=[('SVM', SVC(C=5.0, cache_size=200, class_weight=None,
                                coef0=0.0, decision_function_shape='ovr',
                                degree=3, gamma='auto', kernel='rbf',
                                max_iter=-1, probability=False,
                                random_state=None, shrinking=True,
                                tol=0.001, verbose=False)),
                    ('LR', LogisticRegression(C=0.1, dual=False,
                                              fit_intercept=True,
                                              intercept_scaling=1,
                                              max_iter=100, multi_class='ovr',
                                              n_jobs=1, penalty='l2',
                                              solver='liblinear', tol=0.0001,
                                              verbose=0, warm_start=False))],
        voting='hard')
    # KNN is appended via the estimators list in the original; kept inline here.
    votingC = VotingClassifier(
        estimators=votingC.estimators + [('KNN', KNeighborsClassifier())],
        voting='hard')
    votingC = votingC.fit(a, b)
    kfold = model_selection.KFold(n_splits=10)
    accuracy = model_selection.cross_val_score(votingC, a, b, cv=kfold,
                                               scoring='accuracy')
    meanC = accuracy.mean()
    stdevC = accuracy.std()
    print('Ensemble Voting Classifier - Training set accuracy: %s (%s)' % (meanC, stdevC))
    print('')
    prediction = votingC.predict(c)
    cnf_matrix = confusion_matrix(d, prediction)
    np.set_printoptions(precision=2)
    class_names = dict_characters
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          title='Confusion matrix')
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                          title='Normalized confusion matrix')
    plot_learning_curve(votingC, 'Learning Curve For Ensemble Voting Classifier',
                        a, b, ylim=(0.85, 1.1), cv=10)


# In[72]:
runVotingClassifier(train_x, train_y, test_x, test_y)
# (Scraped-page footer — editor credit and site disclaimer — removed.)