In this project, we use the Breast Cancer Wisconsin (Diagnostic) dataset to build a model that can help diagnose patients.
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
# Load the Breast Cancer Wisconsin (Diagnostic) dataset shipped with scikit-learn.
cancer = load_breast_cancer()

# Inspect what the loader returned: the description text, the available keys
# and the names of the measured features.
cancer.DESCR
cancer.keys()
cancer['feature_names']

# Combine the feature matrix and the label vector into a single table whose
# last column, 'target', holds the diagnosis label.
feature_and_target = np.column_stack([cancer['data'], cancer['target']])
column_labels = np.concatenate([cancer['feature_names'], ['target']])
df = pd.DataFrame(data=feature_and_target, columns=column_labels)
df.head()
# Row positions of each diagnosis class; 'target' is 0.0 for malignant and
# 1.0 for benign samples.
names = ['malignant', 'benign']
mal = np.where(df['target'] == 0.0)
ben = np.where(df['target'] == 1.0)

# Number of samples per class, keyed by the lowercase class name.
class_sizes = {label: np.size(rows) for label, rows in zip(names, (mal, ben))}

# Two-column table (Result, Count) with capitalised labels for display.
display_labels = {'malignant': 'Malignant', 'benign': 'Benign'}
counts = (
    pd.Series(class_sizes)
    .rename_axis('Result')
    .reset_index(name='Count')
    .replace(display_labels)
)

# Pie chart showing the share of malignant versus benign samples.
palette = px.colors.sequential.PuRd
fig = px.pie(counts, values='Count', names='Result', color_discrete_sequence=palette)
fig.update_layout(title='Malignant or Benign?')
fig.show()
# Separate the measurements (X) from the diagnosis label (y).
y = df.get('target')
X = df.drop(columns='target')
X.head()
y.head()

# Hold out part of the data for evaluation; random_state pins the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Nearest-neighbour classifier that looks at a single neighbour.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

# Mean accuracy on the held-out samples.
knn.score(X_test, y_test)
# Build a synthetic "average" sample: the mean of every feature, shaped as a
# single row (1, n_features). The means are taken from X rather than from
# df.mean()[:-1], so the result does not depend on 'target' being the last
# column of df.
means = X.mean().to_numpy().reshape(1, -1)

# knn was fitted on a DataFrame, so scikit-learn recorded the feature names.
# Predicting on a bare ndarray triggers the "X does not have valid feature
# names" warning; wrapping the row with the training column labels keeps the
# input consistent with what the model saw during fit.
mean_sample = pd.DataFrame(means, columns=X.columns)
knn.predict(mean_sample)

# Predicted labels for every held-out sample.
pred = knn.predict(X_test)
pred
# Boolean masks selecting each class (0 = malignant, 1 = benign) within the
# training and test partitions.
train_is_mal = y_train == 0
train_is_ben = y_train == 1
test_is_mal = y_test == 0
test_is_ben = y_test == 1

# Per-class subsets of the training data.
mal_train_X, mal_train_y = X_train.loc[train_is_mal], y_train.loc[train_is_mal]
ben_train_X, ben_train_y = X_train.loc[train_is_ben], y_train.loc[train_is_ben]

# Per-class subsets of the test data.
mal_test_X, mal_test_y = X_test.loc[test_is_mal], y_test.loc[test_is_mal]
ben_test_X, ben_test_y = X_test.loc[test_is_ben], y_test.loc[test_is_ben]

# Accuracy of the fitted classifier on each subset, in the same order as the
# labels in scores_names.
scores_names = ['Malignant Training', 'Benign Training', 'Malignant Test', 'Benign Test']
subsets = [
    (mal_train_X, mal_train_y),
    (ben_train_X, ben_train_y),
    (mal_test_X, mal_test_y),
    (ben_test_X, ben_test_y),
]
scores = [knn.score(features, labels) for features, labels in subsets]

# Tidy table of subset label and accuracy rounded to two decimals.
scores_df = pd.DataFrame({'Feature': scores_names, 'Score': scores}).round(2)
import plotly.graph_objects as go
# Bar colours: the two training bars in 'lightsalmon', the two test bars in
# 'olive'.
colors = ['lightsalmon'] * 2 + ['olive'] * 2
texts = scores_df['Score']

# One bar per subset, annotated with its rounded accuracy.
accuracy_bars = go.Bar(
    x=['Malignant Training', 'Benign Training', 'Malignant Test', 'Benign Test'],
    y=scores_df['Score'],
    marker_color=colors,
    text=scores_df['Score'],
    textposition='auto',
    textfont_size=17,
    width=[0.8] * 4,
)
fig = go.Figure(data=[accuracy_bars])
fig.update_layout(
    title_text='Training and Test Accuracies for Malignant and Benign Cells',
    width=700,
    height=500,
    template='none',
)