
# 核心功能实现代码
# (1) PSI-BLAST 批量生成
# Batch-generate PSSM profiles with PSI-BLAST: for every FASTA file in
# input_dir, run psiblast (3 iterations against uniref50) and write the
# ASCII PSSM for that UniProt id into output_dir.
input_dir = '/home/bhliu/data/negativetraindata1/01/'
output_dir = '/home/bhliu/pssm/negativepssm/'
names = [name for name in os.listdir(input_dir)
         if os.path.isfile(os.path.join(input_dir, name))]
for each_item in names:
    # File names look like "<uniprotid>.fasta"; keep the part before the dot.
    uniprotid = each_item.split('.')[0]
    cmd = ('/home/bhliu/ncbi-blast-2.12.0+/bin/psiblast -num_iterations 3'
           ' -db /home/bhliu/uniref50.fasta'
           ' -query ' + input_dir + uniprotid + '.fasta'
           ' -out_ascii_pssm ' + output_dir + uniprotid + '.pssm')
    # print(cmd)
    # NOTE(review): os.system passes through the shell; subprocess.run with an
    # argument list would be safer if file names ever contain spaces.
    os.system(cmd)
# (2) 特征提取
def PSSM_composition(proteinSeq_pro, proteinSeq, pssmMatrix):
    """PSSM-composition feature extraction.

    For each of the 20 standard amino acids (``proteinSeq_pro``), sum the
    PSSM rows of the sequence positions holding that residue, dividing each
    contribution by the number of PSSM rows (the sequence length).

    :param proteinSeq_pro: 20 reference amino-acid letters.
    :param proteinSeq: the protein sequence (one letter per PSSM row).
    :param pssmMatrix: per-residue PSSM scores, one row of 20 values per
        sequence position.
    :return: flat list of 20*20 = 400 feature values.
    """
    n_rows = len(pssmMatrix)  # hoisted: was recomputed in the innermost loop
    # 20x20 accumulator, one row per reference amino acid.
    composition = [[0.0 for _ in range(20)] for _ in range(20)]
    for k in range(20):
        for i in range(len(proteinSeq)):
            if proteinSeq_pro[k] == proteinSeq[i]:
                row = pssmMatrix[i]
                for j in range(20):
                    composition[k][j] += row[j] / n_rows
    # Flatten row-major; renamed so the result no longer shadows the function.
    return [value for row in composition for value in row]
def S_FPSSM(proteinSeq_pro, proteinSeq, pssmMatrix):
    """S-FPSSM feature extraction.

    Clamps PSSM scores to the range [0, 7] (F-PSSM transform), then for each
    of the 20 reference amino acids sums the clamped rows of the sequence
    positions holding that residue.

    :param proteinSeq_pro: 20 reference amino-acid letters.
    :param proteinSeq: the protein sequence (one letter per PSSM row).
    :param pssmMatrix: per-residue PSSM scores; NOT modified.
    :return: flat list of 20*20 = 400 feature values.
    """
    # BUGFIX: the original aliased pssmMatrix and clamped the caller's matrix
    # in place; work on a row-wise copy instead.
    filtered = [row[:] for row in pssmMatrix]
    for i in range(len(proteinSeq)):
        for k in range(20):
            if filtered[i][k] <= 0.0:
                filtered[i][k] = 0.0
            elif filtered[i][k] >= 7.0:
                filtered[i][k] = 7.0
    s_fpssm = [[0.0 for _ in range(20)] for _ in range(20)]
    for k in range(20):
        for i in range(len(proteinSeq)):
            if proteinSeq_pro[k] == proteinSeq[i]:
                for j in range(20):
                    s_fpssm[k][j] += filtered[i][j]
    return [value for row in s_fpssm for value in row]
def RPM_PSSM(proteinSeq_pro, proteinSeq, pssmMatrix):
    """RPM-PSSM feature extraction.

    Zeroes out non-positive PSSM scores, then for each of the 20 reference
    amino acids averages (over the sequence length) the rows of the sequence
    positions holding that residue.

    :param proteinSeq_pro: 20 reference amino-acid letters.
    :param proteinSeq: the protein sequence (one letter per PSSM row).
    :param pssmMatrix: per-residue PSSM scores; NOT modified.
    :return: flat list of 20*20 = 400 feature values.
    """
    seq_len = len(proteinSeq)
    # BUGFIX: the original aliased pssmMatrix and zeroed the caller's matrix
    # in place; work on a row-wise copy instead.
    ppssm = [row[:] for row in pssmMatrix]
    for i in range(seq_len):
        for k in range(20):
            if ppssm[i][k] <= 0.0:
                ppssm[i][k] = 0
    rpm = [[0.0 for _ in range(20)] for _ in range(20)]
    for k in range(20):
        for i in range(seq_len):
            if proteinSeq_pro[k] == proteinSeq[i]:
                for j in range(20):
                    rpm[k][j] += ppssm[i][j] / seq_len
    return [value for row in rpm for value in row]
# (3) 打分函数
def model_process(svm_model, test_data_x, test_data_y, cv_num, model_name):
    """Predict on the test split and print confusion-matrix based metrics.

    :param svm_model: any fitted classifier exposing ``predict``.
    :param test_data_x: test features.
    :param test_data_y: true binary labels (1 = positive class).
    :param cv_num: value printed under the "总体精度" heading by the caller.
    :param model_name: label used in the printed metric line.
    """
    p_lable = svm_model.predict(test_data_x)
    # Compute the confusion matrix once and reuse it (the original built it
    # twice: once for printing, once for the metrics).
    matric = confusion_matrix(test_data_y, p_lable)
    print('总体精度为 : {}'.format(cv_num))
    print('混淆矩阵为 :\n {}'.format(matric))
    print('kappa系数为 :\n {}'.format(cohen_kappa_score(test_data_y, p_lable)))
    # Binary-classification cells; +0.0 forces float arithmetic below.
    TP = matric[1, 1] + 0.0
    TN = matric[0, 0] + 0.0
    FP = matric[0, 1] + 0.0
    FN = matric[1, 0] + 0.0
    # NOTE(review): these divisions raise ZeroDivisionError on a degenerate
    # confusion matrix (e.g. a test split with a single class) — confirm the
    # callers always supply both classes.
    recall = TP / (TP + FN)
    precise = TP / (TP + FP)
    specificity = TN / (TN + FP)
    f1_score = 2 * (TP / (2 * TP + FP + FN))
    MCC = ((TP * TN) - (FN * FP)) / math.sqrt((TP + FN) * (TN + FP) * (TP + FP) * (TN + FN))
    print('{}的召回率(recall)为{:.4},特异度(specificity)为{:.4},精确率(precision)为{:.4},F1 score为{:.4},MCC值为{:.4} '.format(model_name, recall,specificity,precise,f1_score,MCC))
    # Number of samples predicted as the negative class.
    print(p_lable.tolist().count(0))
# (4) 交叉验证模型性能
def model_cv(path):
    """Run 5-fold cross-validation of RF / MLP / SVM / stacking models on
    one PSSM feature csv and print auc / acc / f1 for each.

    :param path: one of 's_fpssm_train.csv', 'pssm_composition_train.csv',
        'rpm_pssm_train.csv'; anything else raises ValueError.
    """
    from sklearn.preprocessing import StandardScaler

    # Grid-searched SVM hyperparameters per feature set: (label, C, gamma).
    svm_params = {
        's_fpssm_train.csv': ('S-FPSSM', 8, 2 ** -7),
        'pssm_composition_train.csv': ('PSSM-composition', 2, 2 ** -9),
        'rpm_pssm_train.csv': ('RPM-PSSM', 2, 2 ** -8),
    }
    if path not in svm_params:
        # BUGFIX: the original fell through all branches and hit a NameError
        # on svc_model/rf_model; fail fast with a clear message instead.
        raise ValueError('unknown feature csv: {}'.format(path))
    j, svm_C, svm_gamma = svm_params[path]

    Pssm = csv_read.csv_read(path)
    scaler = StandardScaler()
    # NOTE(review): the scaler is fitted on ALL samples before the split,
    # which leaks test statistics into training — confirm this is intended.
    scaler.fit(Pssm)
    pssm = scaler.transform(Pssm)
    # First 4000 rows are virulence factors (label 1), next 4000 are not (0).
    y = np.array([1 for m in range(4000)] + [0 for n in range(4000)])
    train_x, test_x, train_y, test_y = train_test_split(pssm, y, test_size=0.2, random_state=420)

    svc_model = SVC(C=svm_C, kernel="rbf", gamma=svm_gamma, cache_size=5000)
    # RF settings were identical across all three branches in the original.
    rf_model = RandomForestClassifier(random_state=0, n_estimators=181,
                                      max_depth=10, max_features='auto')
    mlp_model = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto',
                              early_stopping=False, epsilon=1e-08,
                              hidden_layer_sizes=(256, 256, 256), learning_rate='constant',
                              learning_rate_init=0.001, max_iter=200, momentum=0.9,
                              nesterovs_momentum=True, power_t=0.5, random_state=1,
                              shuffle=True, solver='adam')
    lr = LogisticRegression()
    sclf = StackingClassifier(classifiers=[rf_model, mlp_model, svc_model],
                              meta_classifier=lr)

    print('5-fold cross validation:\n')
    for basemodel, label in zip([rf_model, mlp_model, svc_model, sclf],
                                ['Random Forest', 'MLP', 'SVM', 'StackingClassifier']):
        # Same output order as before: auc, then acc, then f1.
        for metric, scoring in (('auc', 'roc_auc'), ('acc', 'accuracy'), ('f1', 'f1')):
            scores = model_selection.cross_val_score(basemodel, train_x, train_y,
                                                     cv=5, scoring=scoring)
            print("%s: %0.4f (+/- %0.2f) [%s]"
                  % (metric, scores.mean(), scores.std(), label))
# (5) 构建模型并绘制 ROC 曲线
def svm_roc():
    """Fit one tuned SVM per PSSM feature csv and plot all ROC curves on a
    single figure; prints each feature set's AUC."""
    from sklearn.preprocessing import StandardScaler
    # BUGFIX: the original imported plot_roc_curve (removed in sklearn >= 1.2,
    # an ImportError) and roc_auc_score without using them.
    from sklearn.metrics import roc_curve, auc

    # Grid-searched hyperparameters per feature csv: (label, C, gamma).
    # degree=1 in the original is ignored by the rbf kernel and was dropped.
    configs = {
        's_fpssm_train.csv': ('S-FPSSM', 8, 2 ** -7),
        'pssm_composition_train.csv': ('PSSM-composition', 2, 2 ** -9),
        'rpm_pssm_train.csv': ('RPM-PSSM', 2, 2 ** -8),
    }
    fig, ax = plt.subplots(figsize=(10, 8))
    for csv_file, (j, svm_C, svm_gamma) in configs.items():
        Pssm = csv_read.csv_read(csv_file)
        # Standardize features (fit on all samples, as in the original).
        scaler = StandardScaler()
        scaler.fit(Pssm)
        pssm = scaler.transform(Pssm)
        # First 4000 rows are virulence factors (label 1), next 4000 are not (0).
        y = np.array([1 for m in range(4000)] + [0 for n in range(4000)])
        train_x, test_x, train_y, test_y = train_test_split(pssm, y, test_size=0.2, random_state=420)
        svc_model = SVC(C=svm_C, kernel="rbf", gamma=svm_gamma,
                        cache_size=5000).fit(train_x, train_y)
        fpr_svm, tpr_svm, thresholds = roc_curve(test_y, svc_model.decision_function(test_x))
        auc_value = auc(fpr_svm, tpr_svm)  # computed once, reused below
        # BUGFIX: the original hardcoded "RPM-PSSM" here for all three curves.
        print(j + "的AUC为:", auc_value)
        ax.plot(fpr_svm, tpr_svm, linewidth=2,
                label=j + '(AUC={})'.format(str(round(auc_value, 3))))
    # Chance-level diagonal.
    ax.plot([0, 1], [0, 1], linestyle='--', color='navy')
    plt.legend(fontsize=12, loc="lower right")
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(" SVM ROC")
    plt.show()
def rf_roc():
    """Fit one tuned random forest per PSSM feature csv, print its metrics
    via model_process, and plot all ROC curves on a single figure."""
    from sklearn.preprocessing import StandardScaler
    # BUGFIX: dropped the unused plot_roc_curve import (removed in
    # sklearn >= 1.2, an ImportError) and unused roc_auc_score.
    from sklearn.metrics import roc_curve, auc

    # Tuned hyperparameters per feature csv: (label, n_estimators, max_depth).
    configs = {
        's_fpssm_train.csv': ('S-FPSSM', 900, 11),
        'pssm_composition_train.csv': ('PSSM-composition', 550, 11),
        'rpm_pssm_train.csv': ('RPM-PSSM', 550, 13),
    }
    fig, ax = plt.subplots(figsize=(10, 8))
    for csv_file, (j, n_trees, depth) in configs.items():
        Pssm = csv_read.csv_read(csv_file)
        # Standardize features (fit on all samples, as in the original).
        scaler = StandardScaler()
        scaler.fit(Pssm)
        pssm = scaler.transform(Pssm)
        # First 4000 rows are virulence factors (label 1), next 4000 are not (0).
        y = np.array([1 for m in range(4000)] + [0 for n in range(4000)])
        train_x, test_x, train_y, test_y = train_test_split(pssm, y, test_size=0.2, random_state=420)
        rf_model = RandomForestClassifier(random_state=0, n_estimators=n_trees,
                                          max_depth=depth, max_features='auto',
                                          min_samples_split=50).fit(train_x, train_y)
        model_process.model_process(rf_model, test_x, test_y, 1, j)
        # Positive-class probability as the ROC score (renamed from the
        # misleading score_svm of the original).
        score_rf = rf_model.predict_proba(test_x)[:, 1]
        fpr_rf, tpr_rf, thres_rf = roc_curve(test_y, score_rf)
        auc_value = auc(fpr_rf, tpr_rf)  # computed once, reused below
        print(j + "的AUC为:", auc_value)
        ax.plot(fpr_rf, tpr_rf, linewidth=2,
                label=j + '(AUC={})'.format(str(round(auc_value, 3))))
    # Chance-level diagonal.
    ax.plot([0, 1], [0, 1], linestyle='--', color='navy')
    plt.legend(fontsize=12, loc="lower right")
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(" RF ROC")
    plt.show()
def mlp_roc():
    """Fit an identical MLP on each PSSM feature csv and plot all ROC curves
    on a single figure; prints each feature set's AUC."""
    from sklearn.preprocessing import StandardScaler
    # BUGFIX: dropped the unused plot_roc_curve import (removed in
    # sklearn >= 1.2, an ImportError) and unused roc_auc_score.
    from sklearn.metrics import roc_curve, auc

    # Plot label for each feature csv (replaces the if/elif chain that could
    # leave j unbound for an unknown file name).
    labels = {
        's_fpssm_train.csv': 'S-FPSSM',
        'pssm_composition_train.csv': 'PSSM-composition',
        'rpm_pssm_train.csv': 'RPM-PSSM',
    }
    fig, ax = plt.subplots(figsize=(10, 8))
    for csv_file, j in labels.items():
        Pssm = csv_read.csv_read(csv_file)
        # Standardize features (fit on all samples, as in the original).
        scaler = StandardScaler()
        scaler.fit(Pssm)
        pssm = scaler.transform(Pssm)
        # First 4000 rows are virulence factors (label 1), next 4000 are not (0).
        y = np.array([1 for m in range(4000)] + [0 for n in range(4000)])
        train_x, test_x, train_y, test_y = train_test_split(pssm, y, test_size=0.2, random_state=420)
        mlp_model = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto',
                                  early_stopping=False, epsilon=1e-08,
                                  hidden_layer_sizes=(256, 256, 256), learning_rate='constant',
                                  learning_rate_init=0.001, max_iter=200, momentum=0.9,
                                  nesterovs_momentum=True, power_t=0.5, random_state=1,
                                  shuffle=True, solver='adam').fit(train_x, train_y)
        # Positive-class probability as the ROC score (renamed from the
        # misleading score_svm of the original).
        score_mlp = mlp_model.predict_proba(test_x)[:, 1]
        fpr_mlp, tpr_mlp, thres_mlp = roc_curve(test_y, score_mlp)
        auc_value = auc(fpr_mlp, tpr_mlp)  # computed once, reused below
        print(j + "的AUC为:", auc_value)
        ax.plot(fpr_mlp, tpr_mlp, linewidth=2,
                label=j + '(AUC={})'.format(str(round(auc_value, 3))))
    # Chance-level diagonal.
    ax.plot([0, 1], [0, 1], linestyle='--', color='navy')
    plt.legend(fontsize=12, loc="lower right")
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(" MLP ROC")
    plt.show()
# --- 页面抓取残留(博客页脚,非代码) ---
# 欢迎分享,转载请注明来源:内存溢出
# 微信扫一扫 / 支付宝扫一扫
# 评论列表(0条)