-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBreast Cancer Detection Using SVM with SMOTE and Model Optimization.py
161 lines (130 loc) · 5.93 KB
/
Breast Cancer Detection Using SVM with SMOTE and Model Optimization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Required libraries
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
# Load the dataset.
# NOTE: this is an absolute, machine-specific path; edit DATA_PATH to run elsewhere.
DATA_PATH = "C:\\Users\\LENOVO\\Desktop\\support_vector_machine\\breast-cancer.csv"
modelData = pd.read_csv(DATA_PATH)
# Show the column names
print("Column Names:\n", modelData.columns)
# Missing-value check (count of nulls per column)
print("Missing Data Check:\n", modelData.isnull().sum())
# Drop the unneeded "X" and "id" columns when present; errors="ignore" makes
# the call a no-op for any column that does not exist.
modelData = modelData.drop(columns=["X", "id"], errors="ignore")
# Columns kept for modelling: the target plus the ten "*_mean" features
selectedVars = [
    "diagnosis", "radius_mean", "texture_mean", "perimeter_mean", "area_mean",
    "smoothness_mean", "compactness_mean", "concavity_mean", "concave points_mean",
    "symmetry_mean", "fractal_dimension_mean",
]
# Take an explicit copy so the assignment below writes to an independent frame
# instead of a selection of modelData (avoids pandas' SettingWithCopyWarning).
modelDataSelected = modelData[selectedVars].copy()
# Encode the categorical target: "B" -> 0, "M" -> 1
modelDataSelected["diagnosis"] = modelDataSelected["diagnosis"].map({"B": 0, "M": 1})
# Series.map turns every label outside the mapping into NaN; fail here with a
# clear message rather than later inside model fitting.
if modelDataSelected["diagnosis"].isna().any():
    raise ValueError("Column 'diagnosis' contains values other than 'B' and 'M'.")
# Visualization: scatter plot of radius_mean against texture_mean,
# drawn as one series per diagnosis class (0 first, then 1).
plt.figure(figsize=(10, 5))
for class_value, colour, class_label in ((0, "blue", "Benign"), (1, "orange", "Malignant")):
    class_rows = modelDataSelected[modelDataSelected["diagnosis"] == class_value]
    plt.scatter(
        class_rows["radius_mean"],
        class_rows["texture_mean"],
        c=colour,
        marker="o",
        label=class_label,
    )
plt.xlabel("Radius Mean")
plt.ylabel("Texture Mean")
plt.legend(loc="best")
plt.title("Scatter Plot: Radius Mean vs. Texture Mean")
plt.show()
# Visualization: pairwise plots of three features, coloured by diagnosis
pairplot_columns = ["radius_mean", "texture_mean", "area_mean"]
sns.pairplot(modelDataSelected, hue="diagnosis", vars=pairplot_columns)
plt.show()
# Model inputs: feature matrix (everything except the target) and target vector
X = modelDataSelected.drop(columns=["diagnosis"])
y = modelDataSelected["diagnosis"]
# Split into training and test sets BEFORE scaling, so that the scaler's mean
# and variance are estimated from training rows only and no test-set
# statistics leak into the training data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=125)
# Standardization: fit on the training split, then apply the same transform
# to the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Models: one SVC per kernel, each created with probability=True
modelLinear, modelRadial, modelPoly, modelSigmoid = (
    svm.SVC(kernel=kernel_name, probability=True)
    for kernel_name in ("linear", "rbf", "poly", "sigmoid")
)
kernel_models = (modelLinear, modelRadial, modelPoly, modelSigmoid)
# Training: fit every model on the training split, in the order listed above
for classifier in kernel_models:
    classifier.fit(X_train, y_train)
# Predictions on the held-out test split
predLinear, predRadial, predPoly, predSigmoid = (
    classifier.predict(X_test) for classifier in kernel_models
)
# Performance: confusion matrix and classification report for each model
models = dict(
    zip(
        ("Linear SVM", "Radial SVM", "Polynomial SVM", "Sigmoid SVM"),
        (predLinear, predRadial, predPoly, predSigmoid),
    )
)
for model_label, y_pred in models.items():
    matrix = confusion_matrix(y_test, y_pred)
    print(f"\n{model_label} Confusion Matrix:\n", matrix)
    report = classification_report(y_test, y_pred)
    print(f"{model_label} Classification Report:\n", report)
    # Plot the same confusion matrix as a figure
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
    plt.title(f"{model_label} Confusion Matrix")
    plt.show()
# Hyperparameter optimization: 5-fold grid search over C and gamma
# for an RBF-kernel SVC.
param_grid = {
    "C": [0.1, 1, 10, 100],
    "gamma": [1, 0.1, 0.01, 0.001],
}
rbf_estimator = svm.SVC(kernel="rbf", probability=True)
grid_radial = GridSearchCV(rbf_estimator, param_grid, cv=5, verbose=1)
grid_radial.fit(X_train, y_train)
# Best parameter combination found by the search
print("Best Parameters (Radial SVM):", grid_radial.best_params_)
# Evaluate the refitted best estimator on the test split
best_radial = grid_radial.best_estimator_
best_pred_radial = best_radial.predict(X_test)
best_radial_report = classification_report(y_test, best_pred_radial)
print("\nBest Radial SVM Classification Report:\n", best_radial_report)
# Balance the classes of the training split with SMOTE
# (only the training data is resampled; the test split is left as is).
print("Before SMOTE:", Counter(y_train))
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_balanced, y_train_balanced = resampled
print("After SMOTE:", Counter(y_train_balanced))
# Train a linear SVM on the balanced data and report on the original test split
modelLinear_balanced = svm.SVC(kernel="linear", probability=True)
modelLinear_balanced.fit(X_train_balanced, y_train_balanced)
predLinear_balanced = modelLinear_balanced.predict(X_test)
balanced_report = classification_report(y_test, predLinear_balanced)
print("Balanced Linear SVM Classification Report:\n", balanced_report)
# ROC-AUC score and ROC curve for the (untuned) radial SVM.
# The ranking scores come from decision_function, not from predict_proba.
y_test_binary = y_test
pred_prob = modelRadial.decision_function(X_test)
fpr, tpr, thresholds = roc_curve(y_test_binary, pred_prob)
roc_auc = roc_auc_score(y_test_binary, pred_prob)
curve_label = f"Radial SVM (AUC = {roc_auc:.2f})"
plt.plot(fpr, tpr, label=curve_label)
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()
# Accuracy comparison of the four kernel models on the test split.
# Each score is computed exactly once and reused for both the bar chart and
# the results table, so the two outputs cannot disagree and no model is
# re-evaluated needlessly.
accuracy_scores = {
    "Linear": modelLinear.score(X_test, y_test),
    "Radial": modelRadial.score(X_test, y_test),
    "Polynomial": modelPoly.score(X_test, y_test),
    "Sigmoid": modelSigmoid.score(X_test, y_test),
}
# Materialize the dict views as lists before handing them to matplotlib
model_names = list(accuracy_scores)
model_accuracies = list(accuracy_scores.values())
plt.bar(model_names, model_accuracies, color=["blue", "green", "orange", "red"])
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy")
plt.show()
# Results table built from the same scores shown in the chart
results = pd.DataFrame({
    "Model": model_names,
    "Accuracy": model_accuracies,
})
print(results)