前言:
如今大家对“python句向量”可能比较关注,也都需要了解一些“python句向量”的相关知识。小编在网上搜集了一些有关“python句向量”的相关内容,希望大家能喜欢,我们一起来了解一下吧!
实现功能:
python机器学习案例-支持向量机建模及评估。
实现代码:
1
# 导入需要的库
2
from warnings import simplefilter
3
simplefilter(action='ignore', category=FutureWarning)
4
import pandas as pd
5
from sklearn.model_selection import train_test_split
6
import seaborn as sns
7
import matplotlib.pyplot as plt
8
from sklearn import metrics
9
from sklearn.metrics import roc_curve, auc
10
from sklearn.svm import SVC
11
12
# =============读取数据===========
13
def Read_data(file):
    """Load the heart-disease CSV and give its columns descriptive names.

    Parameters
    ----------
    file
        Anything ``pandas.read_csv`` accepts (path or file-like object).

    Returns
    -------
    pandas.DataFrame
        The parsed table with its columns renamed positionally to the
        14 names listed below.
    """
    column_names = [
        'age',
        'sex',
        'chest_pain_type',
        'resting_blood_pressure',
        'cholesterol',
        'fasting_blood_sugar',
        'rest_ecg',
        'max_heart_rate_achieved',
        'exercise_induced_angina',
        'st_depression',
        'st_slope',
        'num_major_vessels',
        'thalassemia',
        'target',
    ]
    frame = pd.read_csv(file)
    # Positional rename: whatever header the file carries is overwritten.
    frame.columns = column_names
    return frame
18
19
# ===========数据清洗==============
20
def data_clean(data):
    """Drop duplicate and incomplete rows, then cap blood-pressure outliers.

    The input frame is not modified; a cleaned copy is returned.

    Steps:
      1. Report whether duplicated rows exist and drop them.
      2. Report whether missing values exist and drop the affected rows.
      3. Report outliers in ``resting_blood_pressure`` with both the
         mean +/- 2*std rule and the 1.5*IQR box-plot rule, then replace
         values outside the box-plot fences with the nearest value that
         lies inside them.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a numeric ``resting_blood_pressure`` column.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame (original index labels are kept).
    """
    column = 'resting_blood_pressure'

    # --- Duplicates ---
    print('存在' if data.duplicated().any() else '不存在', '重复观测值')
    # drop_duplicates() returns a new frame; the result has to be kept.
    cleaned = data.drop_duplicates()

    # --- Missing values ---
    # Reduce over the cells: iterating a DataFrame yields its column
    # labels, which are always truthy and say nothing about NaNs.
    has_missing = cleaned.isnull().values.any()
    print('存在' if has_missing else '不存在', '缺失值')
    # Incomplete rows are deleted. If rows must be kept instead, use one of
    # cleaned.ffill(), cleaned.bfill(), cleaned.fillna(value=2) or
    # cleaned.fillna({column: cleaned[column].mean()}).
    # .copy() guarantees the .loc writes below hit an independent frame.
    cleaned = cleaned.dropna().copy()

    # --- Outliers ---
    pressure = cleaned[column]

    # Standard-deviation check (report only).
    mean = pressure.mean()
    std = pressure.std()
    print('存在' if (pressure > mean + 2 * std).any() else '不存在', '上限异常值')
    print('存在' if (pressure < mean - 2 * std).any() else '不存在', '下限异常值')

    # Box-plot (IQR) check.
    q1 = pressure.quantile(0.25)
    q3 = pressure.quantile(0.75)
    iqr = q3 - q1
    up = q3 + 1.5 * iqr
    dw = q1 - 1.5 * iqr
    too_high = pressure > up
    too_low = pressure < dw
    print('存在' if too_high.any() else '不存在', '上限异常值')
    print('存在' if too_low.any() else '不存在', '下限异常值')

    # Compute both caps before writing, and write through .loc on the frame
    # so the replacement is guaranteed to land in the returned data.
    upper_cap = pressure[~too_high].max()
    lower_cap = pressure[~too_low].min()
    if too_high.any():
        cleaned.loc[too_high, column] = upper_cap
    if too_low.any():
        cleaned.loc[too_low, column] = lower_cap
    return cleaned
50
51
#==============数据编码=============
52
def data_encoding(data):
    """One-hot encode the categorical features and z-score the numeric ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the 13 feature columns listed below plus ``target``.

    Returns
    -------
    pandas.DataFrame
        The standardised continuous columns and ``target`` first, followed
        by one indicator column per category of each discrete feature.
        ``target`` is passed through unchanged.

    Notes
    -----
    Mean and standard deviation are computed over every row of ``data``.
    """
    discrete_features = [
        'sex', "chest_pain_type", "fasting_blood_sugar", "rest_ecg",
        "exercise_induced_angina", "st_slope", "thalassemia",
    ]
    continuous_features = [
        "age", "resting_blood_pressure", "cholesterol",
        "max_heart_rate_achieved", "st_depression", "num_major_vessels",
    ]
    # Explicit selection fixes the column order of the result and drops any
    # extra columns the caller's frame may carry.
    ordered_columns = [
        "age", 'sex', "chest_pain_type", "resting_blood_pressure",
        "cholesterol", "fasting_blood_sugar", "rest_ecg",
        "max_heart_rate_achieved", "exercise_induced_angina",
        "st_depression", "st_slope", "num_major_vessels", "thalassemia",
        "target",
    ]
    selected = data[ordered_columns]

    # "target" is not in the encoded list, so get_dummies leaves it as is.
    df = pd.get_dummies(selected, columns=discrete_features)

    center = df[continuous_features].mean()
    spread = df[continuous_features].std()
    # A constant column has std 0 and a single-row frame has std NaN; both
    # would turn the feature into NaN. Dividing by 1 instead maps such a
    # column to all zeros.
    spread = spread.where(spread > 0, 1.0)
    df[continuous_features] = (df[continuous_features] - center) / spread
    return df
62
63
#=============数据集划分==============
64
def data_partition(data, test_size=.2, random_state=10):
    """Split the encoded frame into train and test features/labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Encoded data containing a ``target`` column; every other column is
        used as a feature.
    test_size : float, optional
        Forwarded to ``train_test_split``. Defaults to the previously
        hard-coded 0.2.
    random_state : int, optional
        Forwarded to ``train_test_split``. Defaults to the previously
        hard-coded 10.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, feature)`` - note the order:
        both training objects come first. ``feature`` is the list of
        feature column names.
    """
    # Show the class counts so an imbalanced target is visible in the log.
    print(data["target"].value_counts())

    # X holds the feature columns, y the target column.
    X = data.drop('target', axis=1)
    y = data['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    feature = list(X.columns)
    return X_train, y_train, X_test, y_test, feature
73
74
#===========绘制ROC曲线================
75
def Draw_ROC(list1, list2):
    """Plot the ROC curve with its AUC in the legend and show the figure.

    Parameters
    ----------
    list1
        True labels; label ``1`` is treated as the positive class.
    list2
        Scores or probabilities for the positive class, aligned with
        ``list1``.

    Returns
    -------
    None
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr_model, tpr_model, _ = roc_curve(list1, list2, pos_label=1)
    roc_auc_model = auc(fpr_model, tpr_model)

    sns.set(font_scale=1.2)
    # Set the font after sns.set(), as in the original call order.
    plt.rc('font', family='Times New Roman')

    plt.plot(fpr_model, tpr_model, 'blue', label='AUC = %0.2f' % roc_auc_model)
    plt.legend(loc='lower right', fontsize=12)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.show()
90
91
#============支持向量机===================
92
def SVM(X_train, y_train, X_test, y_test, feature):
    """Fit an SVC, print train/test diagnostics, and return ROC inputs.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels.
    feature
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    tuple of (list, list)
        The test labels and, for each test row, the value in column index 1
        of ``predict_proba`` (presumably the probability of label 1 -
        confirm against ``svc.classes_``).
    """
    # probability=True is required for predict_proba below.
    svc = SVC(probability=True)
    svc.fit(X_train, y_train)

    print("\nFinally results of SVM fitting:")
    print("Accuracy on training set: {:.3f}".format(svc.score(X_train, y_train)))
    print("Accuracy on test set: {:.3f}".format(svc.score(X_test, y_test)))

    predict_target = svc.predict(X_test)
    # Class probabilities; keep only the second column.
    predict_target_prob_svc = svc.predict_proba(X_test)[:, 1]

    print('正确预测数量:')
    print(sum(predict_target == y_test))

    print('SVM验证集:')
    print(metrics.classification_report(y_test, predict_target))
    print(metrics.confusion_matrix(y_test, predict_target))

    print('SVM训练集:')
    train_predictions = svc.predict(X_train)
    print(metrics.classification_report(y_train, train_predictions))
    print(metrics.confusion_matrix(y_train, train_predictions))

    return list(y_test), list(predict_target_prob_svc)
116
117
#============主函数==============
118
if __name__ == "__main__":
    # Raw string: the Windows path contains backslashes followed by
    # characters that are not valid escapes, which newer Python versions
    # warn about. The runtime value is identical to the original literal.
    data1 = Read_data(r"F:\数据杂坛\0504\heartdisease\Heart-Disease-Data-Set-main\UCI Heart Disease Dataset.csv")
    data1 = data_clean(data1)
    data2 = data_encoding(data1)
    X_train, y_train, X_test, y_test, feature = data_partition(data2)

    # Train and evaluate the SVM, then plot its ROC curve on the test set.
    y_test, predict_target_prob_svc = SVM(X_train, y_train, X_test, y_test, feature)
    Draw_ROC(y_test, predict_target_prob_svc)
实现效果:
喜欢记得点赞,在看,收藏,
关注V订阅号:数据杂坛,获取完整代码和效果,将持续更新!
标签: #python句向量