import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, f1_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import statsmodels.api as sm
from sklearn.metrics import roc_curve
%matplotlib inline
data_location = "dataset_infartos.csv"
data_inf = pd.read_csv(data_location)
data_inf.head()
  | male | age | education | currentSmoker | cigsPerDay | BPMeds | prevalentStroke | prevalentHyp | diabetes | totChol | sysBP | diaBP | BMI | heartRate | glucose | TenYearCHD |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 39 | 4.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 195.0 | 106.0 | 70.0 | 26.97 | 80.0 | 77.0 | 0 |
1 | 0 | 46 | 2.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 250.0 | 121.0 | 81.0 | 28.73 | 95.0 | 76.0 | 0 |
2 | 1 | 48 | 1.0 | 1 | 20.0 | 0.0 | 0 | 0 | 0 | 245.0 | 127.5 | 80.0 | 25.34 | 75.0 | 70.0 | 0 |
3 | 0 | 61 | 3.0 | 1 | 30.0 | 0.0 | 0 | 1 | 0 | 225.0 | 150.0 | 95.0 | 28.58 | 65.0 | 103.0 | 1 |
4 | 0 | 46 | 3.0 | 1 | 23.0 | 0.0 | 0 | 0 | 0 | 285.0 | 130.0 | 84.0 | 23.10 | 85.0 | 85.0 | 0 |
data_inf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   male             4238 non-null   int64
 1   age              4238 non-null   int64
 2   education        4133 non-null   float64
 3   currentSmoker    4238 non-null   int64
 4   cigsPerDay       4209 non-null   float64
 5   BPMeds           4185 non-null   float64
 6   prevalentStroke  4238 non-null   int64
 7   prevalentHyp     4238 non-null   int64
 8   diabetes         4238 non-null   int64
 9   totChol          4188 non-null   float64
 10  sysBP            4238 non-null   float64
 11  diaBP            4238 non-null   float64
 12  BMI              4219 non-null   float64
 13  heartRate        4237 non-null   float64
 14  glucose          3850 non-null   float64
 15  TenYearCHD       4238 non-null   int64
dtypes: float64(9), int64(7)
memory usage: 529.9 KB
# Count the missing values per column
cantidad_nulos = data_inf.isnull().sum()
cantidad_nulos
male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64
total = data_inf.shape[0]
nulos = data_inf.isnull().sum()
porcentaje_nulos = 100 * nulos / total
porcentaje_nulos
male               0.000000
age                0.000000
education          2.477584
currentSmoker      0.000000
cigsPerDay         0.684285
BPMeds             1.250590
prevalentStroke    0.000000
prevalentHyp       0.000000
diabetes           0.000000
totChol            1.179802
sysBP              0.000000
diaBP              0.000000
BMI                0.448325
heartRate          0.023596
glucose            9.155262
TenYearCHD         0.000000
dtype: float64
# Distribution of the target variable: risk of coronary heart disease in the next 10 years. The dataset is imbalanced.
data_inf["TenYearCHD"].value_counts()
0    3594
1     644
Name: TenYearCHD, dtype: int64
# Drop the rows with missing values; they are a very small percentage of the dataset
data_inf_sin_nulos = data_inf.dropna()
data_inf_sin_nulos.isnull().sum()
male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64
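# Aside (not used below): dropping rows discards about 14% of the data
# (4238 -> 3656), mostly because of glucose. A minimal alternative sketch,
# assuming median imputation is acceptable for these columns, using
# scikit-learn's SimpleImputer:
from sklearn.impute import SimpleImputer

cols_with_nulls = ["education", "cigsPerDay", "BPMeds", "totChol", "BMI", "heartRate", "glucose"]
imputer = SimpleImputer(strategy="median")
data_inf_imputed = data_inf.copy()
data_inf_imputed[cols_with_nulls] = imputer.fit_transform(data_inf[cols_with_nulls])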
# Check how many observations fall in each category of the target variable
data_inf_sin_nulos["TenYearCHD"].value_counts()
0    3099
1     557
Name: TenYearCHD, dtype: int64
# Count the "yes" cases (TenYearCHD = 1)
riesgo_si_mask = (data_inf_sin_nulos["TenYearCHD"] > 0)
data_riesgo_si = data_inf_sin_nulos[riesgo_si_mask]
data_riesgo_si.shape
(557, 16)
# Count the "no" cases (TenYearCHD = 0)
riesgo_no_mask = (data_inf_sin_nulos["TenYearCHD"] < 1)
data_riesgo_no = data_inf_sin_nulos[riesgo_no_mask]
data_riesgo_no.shape
(3099, 16)
# Sample the no-risk class down to about 10% more rows than the at-risk class (round(557*1.1) = 613), for a near-balanced dataset
data_riesgo_no_557 = data_riesgo_no.sample(n=round(557*1.1) , random_state=123)
data_riesgo_no_557.head(3)
  | male | age | education | currentSmoker | cigsPerDay | BPMeds | prevalentStroke | prevalentHyp | diabetes | totChol | sysBP | diaBP | BMI | heartRate | glucose | TenYearCHD |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3749 | 0 | 54 | 1.0 | 0 | 0.0 | 0.0 | 0 | 0 | 1 | 223.0 | 110.0 | 67.5 | 21.22 | 78.0 | 294.0 | 0 |
1995 | 0 | 50 | 2.0 | 0 | 0.0 | 1.0 | 0 | 1 | 0 | 241.0 | 132.0 | 85.0 | 23.81 | 55.0 | 84.0 | 0 |
3829 | 1 | 55 | 4.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 232.0 | 132.5 | 87.0 | 20.72 | 72.0 | 71.0 | 0 |
# Concatenate the two subsets
data_para_regresion = pd.concat([data_riesgo_si,data_riesgo_no_557],axis=0)
data_para_regresion.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1170 entries, 3 to 163
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   male             1170 non-null   int64
 1   age              1170 non-null   int64
 2   education        1170 non-null   float64
 3   currentSmoker    1170 non-null   int64
 4   cigsPerDay       1170 non-null   float64
 5   BPMeds           1170 non-null   float64
 6   prevalentStroke  1170 non-null   int64
 7   prevalentHyp     1170 non-null   int64
 8   diabetes         1170 non-null   int64
 9   totChol          1170 non-null   float64
 10  sysBP            1170 non-null   float64
 11  diaBP            1170 non-null   float64
 12  BMI              1170 non-null   float64
 13  heartRate        1170 non-null   float64
 14  glucose          1170 non-null   float64
 15  TenYearCHD       1170 non-null   int64
dtypes: float64(9), int64(7)
memory usage: 155.4 KB
total = data_para_regresion.shape[0]
nulos = data_para_regresion.isnull().sum()
porcentaje_nulos = 100 * nulos / total
porcentaje_nulos
male               0.0
age                0.0
education          0.0
currentSmoker      0.0
cigsPerDay         0.0
BPMeds             0.0
prevalentStroke    0.0
prevalentHyp       0.0
diabetes           0.0
totChol            0.0
sysBP              0.0
diaBP              0.0
BMI                0.0
heartRate          0.0
glucose            0.0
TenYearCHD         0.0
dtype: float64
# Check whether the dataset for the regression is now better balanced
data_para_regresion.TenYearCHD.value_counts()
0    613
1    557
Name: TenYearCHD, dtype: int64
data_para_regresion.cigsPerDay.value_counts()
0.0     607
20.0    215
30.0     61
15.0     61
40.0     31
10.0     30
5.0      28
3.0      24
9.0      21
1.0      17
43.0     15
25.0     13
6.0       6
35.0      6
2.0       5
7.0       4
17.0      3
50.0      3
60.0      3
8.0       3
45.0      2
18.0      2
12.0      2
4.0       2
16.0      1
19.0      1
11.0      1
23.0      1
14.0      1
13.0      1
Name: cigsPerDay, dtype: int64
# Blood pressure medication
data_raw_bpmed = pd.DataFrame(
    {'Possible CHD': ['No', 'No', 'Yes', 'Yes'],
     'Number of People': [data_riesgo_no_557.BPMeds.value_counts()[0], data_riesgo_no_557.BPMeds.value_counts()[1],
                          data_riesgo_si.BPMeds.value_counts()[0], data_riesgo_si.BPMeds.value_counts()[1]],
     'On blood pressure medication': ['No', 'Yes', 'No', 'Yes']
    })
fig = px.bar(data_raw_bpmed, x="On blood pressure medication", y="Number of People",
             color='Possible CHD', barmode='group', color_discrete_sequence=["#00CC96", "#EF553B"])
fig.show()
# Stroke
data_raw_acv = pd.DataFrame(
    {'Possible CHD': ['No', 'No', 'Yes', 'Yes'],
     'Number of People': [data_riesgo_no_557.prevalentStroke.value_counts()[0], data_riesgo_no_557.prevalentStroke.value_counts()[1],
                          data_riesgo_si.prevalentStroke.value_counts()[0], data_riesgo_si.prevalentStroke.value_counts()[1]],
     'Prior stroke': ['No', 'Yes', 'No', 'Yes']
    })
fig = px.bar(data_raw_acv, x="Prior stroke", y="Number of People",
             color='Possible CHD', barmode='group', color_discrete_sequence=["#00CC96", "#EF553B"])
fig.show()
# Hypertension
data_raw_ht = pd.DataFrame(
    {'Possible CHD': ['No', 'No', 'Yes', 'Yes'],
     'Number of People': [data_riesgo_no_557.prevalentHyp.value_counts()[0], data_riesgo_no_557.prevalentHyp.value_counts()[1],
                          data_riesgo_si.prevalentHyp.value_counts()[0], data_riesgo_si.prevalentHyp.value_counts()[1]],
     'Hypertensive': ['No', 'Yes', 'No', 'Yes']
    })
fig = px.bar(data_raw_ht, x="Hypertensive", y="Number of People",
             color='Possible CHD', barmode='group', color_discrete_sequence=["#00CC96", "#EF553B"])
fig.show()
# Diabetes
data_raw_diab = pd.DataFrame(
    {'Possible CHD': ['No', 'No', 'Yes', 'Yes'],
     'Number of People': [data_riesgo_no_557.diabetes.value_counts()[0], data_riesgo_no_557.diabetes.value_counts()[1],
                          data_riesgo_si.diabetes.value_counts()[0], data_riesgo_si.diabetes.value_counts()[1]],
     'Diabetic': ['No', 'Yes', 'No', 'Yes']
    })
fig = px.bar(data_raw_diab, x="Diabetic", y="Number of People",
             color='Possible CHD', barmode='group', color_discrete_sequence=["#00CC96", "#EF553B"])
fig.show()
# Smoking
data_raw_smoker = pd.DataFrame(
    {'Possible CHD': ['No', 'No', 'Yes', 'Yes'],
     'Number of People': [data_riesgo_no_557.currentSmoker.value_counts()[0], data_riesgo_no_557.currentSmoker.value_counts()[1],
                          data_riesgo_si.currentSmoker.value_counts()[0], data_riesgo_si.currentSmoker.value_counts()[1]],
     'Smoker': ['No', 'Yes', 'No', 'Yes']
    })
fig = px.bar(data_raw_smoker, x="Smoker", y="Number of People",
             color='Possible CHD', barmode='relative', color_discrete_sequence=["#00CC96", "#EF553B"])
fig.show()
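# The five chart blocks above repeat the same pattern; a sketch of a small
# (hypothetical) helper that builds the identical grouped bar chart for any
# binary column, e.g. plot_binary_comparison("diabetes", "Diabetic"):
def plot_binary_comparison(col, x_label):
    counts = pd.DataFrame({
        "Possible CHD": ["No", "No", "Yes", "Yes"],
        "Number of People": [
            data_riesgo_no_557[col].value_counts().get(0, 0),
            data_riesgo_no_557[col].value_counts().get(1, 0),
            data_riesgo_si[col].value_counts().get(0, 0),
            data_riesgo_si[col].value_counts().get(1, 0),
        ],
        x_label: ["No", "Yes", "No", "Yes"],
    })
    fig = px.bar(counts, x=x_label, y="Number of People", color="Possible CHD",
                 barmode="group", color_discrete_sequence=["#00CC96", "#EF553B"])
    fig.show()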
data_para_regresion
  | male | age | education | currentSmoker | cigsPerDay | BPMeds | prevalentStroke | prevalentHyp | diabetes | totChol | sysBP | diaBP | BMI | heartRate | glucose | TenYearCHD |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 0 | 61 | 3.0 | 1 | 30.0 | 0.0 | 0 | 1 | 0 | 225.0 | 150.0 | 95.0 | 28.58 | 65.0 | 103.0 | 1 |
6 | 0 | 63 | 1.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 205.0 | 138.0 | 71.0 | 33.11 | 60.0 | 85.0 | 1 |
15 | 0 | 38 | 2.0 | 1 | 20.0 | 0.0 | 0 | 1 | 0 | 221.0 | 140.0 | 90.0 | 21.35 | 95.0 | 70.0 | 1 |
17 | 0 | 46 | 2.0 | 1 | 20.0 | 0.0 | 0 | 0 | 0 | 291.0 | 112.0 | 78.0 | 23.38 | 80.0 | 89.0 | 1 |
25 | 1 | 47 | 4.0 | 1 | 20.0 | 0.0 | 0 | 0 | 0 | 294.0 | 102.0 | 68.0 | 24.18 | 62.0 | 66.0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1719 | 1 | 48 | 1.0 | 1 | 20.0 | 0.0 | 0 | 0 | 0 | 268.0 | 116.5 | 82.0 | 21.34 | 60.0 | 82.0 | 0 |
3964 | 0 | 39 | 3.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 213.0 | 125.0 | 87.0 | 16.73 | 110.0 | 75.0 | 0 |
3325 | 1 | 62 | 1.0 | 1 | 5.0 | 0.0 | 0 | 1 | 0 | 243.0 | 157.0 | 96.0 | 28.83 | 75.0 | 71.0 | 0 |
1933 | 1 | 41 | 2.0 | 1 | 30.0 | 0.0 | 0 | 0 | 0 | 224.0 | 114.0 | 68.0 | 21.42 | 66.0 | 107.0 | 0 |
163 | 1 | 57 | 2.0 | 1 | 1.0 | 0.0 | 0 | 0 | 0 | 242.0 | 102.0 | 61.0 | 23.50 | 70.0 | 82.0 | 0 |
1170 rows × 16 columns
# Compare ages between the at-risk and no-risk groups
fig = px.box(data_para_regresion, x="TenYearCHD", y="age")
fig.show()
# Compare education between the at-risk and no-risk groups
fig = px.box(data_para_regresion, x="TenYearCHD", y="education")
fig.show()
# Compare total cholesterol
fig = px.box(data_para_regresion, x="TenYearCHD", y="totChol")
fig.show()
# Systolic blood pressure
fig = px.box(data_para_regresion, x="TenYearCHD", y="sysBP")
fig.show()
# Diastolic blood pressure
fig = px.box(data_para_regresion, x="TenYearCHD", y="diaBP")
fig.show()
# BMI
fig = px.box(data_para_regresion, x="TenYearCHD", y="BMI")
fig.show()
# Glucose levels
fig = px.box(data_para_regresion, x="TenYearCHD", y="glucose")
fig.show()
# Compare heart rate
fig = px.box(data_para_regresion, x="TenYearCHD", y="heartRate")
fig.show()
cigarrillo_CHD_grouped = data_para_regresion.groupby(["TenYearCHD", "cigsPerDay"])
cigarrillo_CHD_count = cigarrillo_CHD_grouped.size().reset_index(name="Group_Count")
cigarrillo_CHD_count.shape
(50, 3)
cigarrillo_CHD_count.head(30)
  | TenYearCHD | cigsPerDay | Group_Count |
---|---|---|---|
0 | 0 | 0.0 | 335 |
1 | 0 | 1.0 | 10 |
2 | 0 | 2.0 | 3 |
3 | 0 | 3.0 | 15 |
4 | 0 | 4.0 | 2 |
5 | 0 | 5.0 | 15 |
6 | 0 | 6.0 | 3 |
7 | 0 | 7.0 | 1 |
8 | 0 | 8.0 | 3 |
9 | 0 | 9.0 | 12 |
10 | 0 | 10.0 | 14 |
11 | 0 | 12.0 | 1 |
12 | 0 | 13.0 | 1 |
13 | 0 | 14.0 | 1 |
14 | 0 | 15.0 | 30 |
15 | 0 | 16.0 | 1 |
16 | 0 | 17.0 | 1 |
17 | 0 | 19.0 | 1 |
18 | 0 | 20.0 | 112 |
19 | 0 | 25.0 | 4 |
20 | 0 | 30.0 | 24 |
21 | 0 | 35.0 | 2 |
22 | 0 | 40.0 | 12 |
23 | 0 | 43.0 | 4 |
24 | 0 | 45.0 | 2 |
25 | 0 | 50.0 | 2 |
26 | 0 | 60.0 | 2 |
27 | 1 | 0.0 | 272 |
28 | 1 | 1.0 | 7 |
29 | 1 | 2.0 | 2 |
# Bubble chart: cigarettes per day vs. CHD risk, bubble size = group count
fig = px.scatter(data_frame = cigarrillo_CHD_count, x = "TenYearCHD", y = "cigsPerDay",
size = "Group_Count", color = "cigsPerDay", opacity = 0.6,
width=700, height=800)
fig.show()
EDAD_CHD_grouped = data_para_regresion.groupby(["TenYearCHD", "age"])
EDAD_CHD_count = EDAD_CHD_grouped.size().reset_index(name="Group_Count")
EDAD_CHD_count.shape
(72, 3)
EDAD_CHD_count.head(50)
  | TenYearCHD | age | Group_Count |
---|---|---|---|
0 | 0 | 33 | 2 |
1 | 0 | 34 | 2 |
2 | 0 | 35 | 4 |
3 | 0 | 36 | 18 |
4 | 0 | 37 | 13 |
5 | 0 | 38 | 17 |
6 | 0 | 39 | 30 |
7 | 0 | 40 | 32 |
8 | 0 | 41 | 34 |
9 | 0 | 42 | 30 |
10 | 0 | 43 | 17 |
11 | 0 | 44 | 20 |
12 | 0 | 45 | 24 |
13 | 0 | 46 | 24 |
14 | 0 | 47 | 27 |
15 | 0 | 48 | 26 |
16 | 0 | 49 | 17 |
17 | 0 | 50 | 25 |
18 | 0 | 51 | 23 |
19 | 0 | 52 | 15 |
20 | 0 | 53 | 17 |
21 | 0 | 54 | 27 |
22 | 0 | 55 | 21 |
23 | 0 | 56 | 16 |
24 | 0 | 57 | 16 |
25 | 0 | 58 | 9 |
26 | 0 | 59 | 12 |
27 | 0 | 60 | 17 |
28 | 0 | 61 | 18 |
29 | 0 | 62 | 12 |
30 | 0 | 63 | 9 |
31 | 0 | 64 | 17 |
32 | 0 | 65 | 8 |
33 | 0 | 66 | 3 |
34 | 0 | 67 | 7 |
35 | 0 | 68 | 3 |
36 | 0 | 69 | 1 |
37 | 1 | 35 | 2 |
38 | 1 | 36 | 2 |
39 | 1 | 37 | 4 |
40 | 1 | 38 | 7 |
41 | 1 | 39 | 6 |
42 | 1 | 40 | 11 |
43 | 1 | 41 | 10 |
44 | 1 | 42 | 14 |
45 | 1 | 43 | 11 |
46 | 1 | 44 | 11 |
47 | 1 | 45 | 13 |
48 | 1 | 46 | 13 |
49 | 1 | 47 | 18 |
# Bubble chart: age vs. CHD risk, bubble size = group count
fig = px.scatter(data_frame = EDAD_CHD_count, x = "TenYearCHD", y = "age",
size = "Group_Count", color = "age", opacity = 0.6,
width=700, height=800)
fig.show()
fig = px.density_heatmap(data_para_regresion, x="age", y="TenYearCHD", facet_row="male" , color_continuous_scale=px.colors.diverging.PRGn)
fig.show()
# Count how many women and men are in the dataset
data_para_regresion.male.value_counts()
0    608
1    562
Name: male, dtype: int64
data_inf_sin_nulos.head(4)  # our cleaned but still imbalanced DataFrame
  | male | age | education | currentSmoker | cigsPerDay | BPMeds | prevalentStroke | prevalentHyp | diabetes | totChol | sysBP | diaBP | BMI | heartRate | glucose | TenYearCHD |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 39 | 4.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 195.0 | 106.0 | 70.0 | 26.97 | 80.0 | 77.0 | 0 |
1 | 0 | 46 | 2.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 250.0 | 121.0 | 81.0 | 28.73 | 95.0 | 76.0 | 0 |
2 | 1 | 48 | 1.0 | 1 | 20.0 | 0.0 | 0 | 0 | 0 | 245.0 | 127.5 | 80.0 | 25.34 | 75.0 | 70.0 | 0 |
3 | 0 | 61 | 3.0 | 1 | 30.0 | 0.0 | 0 | 1 | 0 | 225.0 | 150.0 | 95.0 | 28.58 | 65.0 | 103.0 | 1 |
X = data_inf_sin_nulos[["male", "age", "education", "currentSmoker", "cigsPerDay", "BPMeds", "prevalentStroke", "prevalentHyp", "diabetes", "totChol", "sysBP", "diaBP", "BMI", "heartRate", "glucose"]]
y = data_inf_sin_nulos.TenYearCHD
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 12)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
logistic_regression = LogisticRegression(penalty='none')  # no regularization (scikit-learn >= 1.2 spells this penalty=None)
logistic_regression.fit(X_train, y_train);
y_test_pred = logistic_regression.predict(X_test)
y_test_pred_proba = logistic_regression.predict_proba(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
(tn, fp, fn, tp)
(767, 8, 127, 12)
print("habia" , tn+fp , "que NO iban a tener problemas cardiacos , de los cuales :" , tn , "fueron acertados y los que fueron incorrectos son " , fp)
print("habia" , fn+tp , "que SI iban a tener problemas cardiacos , de los cuales :" , tp , "fueron acertados y los que fueron incorrectos son " , fn)
habia 775 que NO iban a tener problemas cardiacos , de los cuales : 767 fueron acertados y los que fueron incorrectos son 8 habia 139 que SI iban a tener problemas cardiacos , de los cuales : 12 fueron acertados y los que fueron incorrectos son 127
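# A compact view of the same numbers: scikit-learn's classification_report
# prints per-class precision/recall/F1 in one call, making the very low
# recall on the positive class immediately visible.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_test_pred, target_names=["No CHD", "CHD"]))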
sns.heatmap(confusion_matrix(y_test, y_test_pred), annot=True, fmt='g',
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'], cmap='Blues')
<AxesSubplot:>
acc=accuracy_score(y_test, y_test_pred)
f1=f1_score(y_test, y_test_pred)
recall=recall_score(y_test, y_test_pred)
preci=precision_score(y_test, y_test_pred)
my_scores=[acc,f1,recall,preci]
fig = px.bar(x=["accuracy_score", "f1_score", "recall_score", "precision_score"], y=my_scores,
             title="Logistic Regression model metrics (imbalanced data)")
fig.show()
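# Aside: undersampling (done below) is one remedy for the poor recall. A
# hedged alternative sketch that keeps all rows instead reweights the loss
# with class_weight='balanced' on this same imbalanced split:
log_reg_balanced = LogisticRegression(class_weight="balanced", max_iter=1000)
log_reg_balanced.fit(X_train, y_train)
print("balanced recall:", recall_score(y_test, log_reg_balanced.predict(X_test)))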
# Separate the predictor variables from the target variable
X = data_para_regresion[["male", "age", "education", "currentSmoker", "cigsPerDay", "BPMeds", "prevalentStroke", "prevalentHyp", "diabetes", "totChol", "sysBP", "diaBP", "BMI", "heartRate", "glucose"]]
y = data_para_regresion.TenYearCHD
# Stratified train/test split to preserve the class proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 12)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
logistic_regression = LogisticRegression(penalty='none')  # no regularization (scikit-learn >= 1.2 spells this penalty=None)
logistic_regression.fit(X_train, y_train);
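# Aside: a Pipeline bundles the scaler and the model, so the scaler can never
# be fit on test data by mistake. A minimal equivalent sketch (applied to the
# raw, unscaled X rather than the already-scaled arrays above; the variable
# names here are illustrative):
from sklearn.pipeline import make_pipeline
X_tr_raw, X_te_raw, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=12)
pipe = make_pipeline(StandardScaler(), LogisticRegression(penalty='none'))
pipe.fit(X_tr_raw, y_tr)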
y_train_pred = logistic_regression.predict(X_train)
#y_train_pred
accuracy_score(y_train, y_train_pred)
0.6716077537058153
confusion_matrix(y_train, y_train_pred)
array([[322, 137],
       [151, 267]], dtype=int64)
tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()
(tn, fp, fn, tp)
(322, 137, 151, 267)
print("habia" , tn+fp , "que NO iban a tener problemas cardiacos , de los cuales :" , tn , "fueron acertados y los que fueron incorrectos son " , fp)
print("habia" , fn+tp , "que SI iban a tener problemas cardiacos , de los cuales :" , tp , "fueron acertados y los que fueron incorrectos son " , fn)
habia 459 que NO iban a tener problemas cardiacos , de los cuales : 322 fueron acertados y los que fueron incorrectos son 137 habia 418 que SI iban a tener problemas cardiacos , de los cuales : 267 fueron acertados y los que fueron incorrectos son 151
y_test_pred = logistic_regression.predict(X_test)
#y_test_pred
y_test_pred_proba = logistic_regression.predict_proba(X_test)
#y_test_pred_proba
accuracy_score(y_test, y_test_pred)
0.6962457337883959
f1_score(y_test, y_test_pred)
0.6691449814126393
recall_score(y_test, y_test_pred)
0.6474820143884892
precision_score(y_test, y_test_pred)
0.6923076923076923
confusion_matrix(y_test, y_test_pred)
array([[114,  40],
       [ 49,  90]], dtype=int64)
# Confusion matrix
sns.heatmap(confusion_matrix(y_test, y_test_pred), annot=True, fmt='g',
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'], cmap='Blues')
<AxesSubplot:>
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
(tn, fp, fn, tp)
(114, 40, 49, 90)
print("habia" , tn+fp , "que NO iban a tener problemas cardiacos , de los cuales :" , tn , "fueron acertados y los que fueron incorrectos son " , fp)
print("habia" , fn+tp , "que SI iban a tener problemas cardiacos , de los cuales :" , tp , "fueron acertados y los que fueron incorrectos son " , fn)
habia 154 que NO iban a tener problemas cardiacos , de los cuales : 114 fueron acertados y los que fueron incorrectos son 40 habia 139 que SI iban a tener problemas cardiacos , de los cuales : 90 fueron acertados y los que fueron incorrectos son 49
logistic_regression.intercept_
array([-0.10166629])
logistic_regression.coef_
array([[ 0.2851362 ,  0.50653973,  0.0458867 ,  0.13492714,  0.1003944 ,
        -0.06420957,  0.10135185,  0.09317174,  0.10326965,  0.03427066,
         0.48445815, -0.11120784,  0.06289528, -0.01043178,  0.19476592]])
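# The raw coefficient array is hard to read; a sketch pairing each
# coefficient with its column name (X still carries the original columns):
coef_by_feature = pd.Series(logistic_regression.coef_[0], index=X.columns)
print(coef_by_feature.sort_values(ascending=False))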
y_test.value_counts()
0    154
1    139
Name: TenYearCHD, dtype: int64
acc=accuracy_score(y_test, y_test_pred)
f1=f1_score(y_test, y_test_pred)
recall=recall_score(y_test, y_test_pred)
preci=precision_score(y_test, y_test_pred)
my_scores=[acc,f1,recall,preci]
fig = px.bar(x=["accuracy_score", "f1_score", "recall_score", "precision_score"], y=my_scores,
             title="Logistic Regression model metrics")
fig.show()
# Fit the same model with statsmodels to obtain p-values
X_train_stats = sm.add_constant(X_train)
# Instantiate the model
logit = sm.Logit(y_train, X_train_stats)
# Fit the model
result = logit.fit()
# Print the summary
print(result.summary2())
Optimization terminated successfully.
         Current function value: 0.605124
         Iterations 6
                        Results: Logit
=================================================================
Model:              Logit            Pseudo R-squared: 0.126
Dependent Variable: TenYearCHD       AIC:              1093.3868
Date:               2021-06-22 23:43 BIC:              1169.8109
No. Observations:   877              Log-Likelihood:   -530.69
Df Model:           15               LL-Null:          -606.93
Df Residuals:       861              LLR p-value:      7.7697e-25
Converged:          1.0000           Scale:            1.0000
No. Iterations:     6.0000
-------------------------------------------------------------------
          Coef.   Std.Err.     z     P>|z|    [0.025    0.975]
-------------------------------------------------------------------
const    -0.1017    0.0743  -1.3676  0.1714  -0.2474    0.0440
x1        0.2851    0.0807   3.5336  0.0004   0.1270    0.4433
x2        0.5065    0.0882   5.7461  0.0000   0.3338    0.6793
x3        0.0459    0.0763   0.6010  0.5478  -0.1038    0.1955
x4        0.1349    0.1214   1.1115  0.2664  -0.1030    0.3729
x5        0.1004    0.1230   0.8163  0.4144  -0.1407    0.3415
x6       -0.0642    0.0803  -0.7999  0.4238  -0.2215    0.0931
x7        0.1014    0.0870   1.1648  0.2441  -0.0692    0.2719
x8        0.0931    0.1044   0.8922  0.3723  -0.1115    0.2977
x9        0.1033    0.1117   0.9243  0.3553  -0.1157    0.3223
x10       0.0343    0.0789   0.4345  0.6639  -0.1203    0.1888
x11       0.4845    0.1546   3.1338  0.0017   0.1815    0.7875
x12      -0.1112    0.1293  -0.8596  0.3900  -0.3647    0.1423
x13       0.0629    0.0810   0.7763  0.4376  -0.0959    0.2217
x14      -0.0104    0.0767  -0.1360  0.8919  -0.1608    0.1399
x15       0.1948    0.1246   1.5636  0.1179  -0.0494    0.4389
=================================================================
# Feature mapping: x1 male, x2 age, x3 education, x4 currentSmoker,
# x5 cigsPerDay, x6 BPMeds, x7 prevalentStroke, x8 prevalentHyp,
# x9 diabetes, x10 totChol, x11 sysBP, x12 diaBP, x13 BMI,
# x14 heartRate, x15 glucose
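# Sketch: the manual x1..x15 mapping above can be avoided by giving
# statsmodels a DataFrame with named columns (indices reset so X and y align):
X_train_named = sm.add_constant(pd.DataFrame(X_train, columns=X.columns))
logit_named = sm.Logit(y_train.reset_index(drop=True), X_train_named)
print(logit_named.fit().summary2())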
# P-values taken from the summary above, smallest first
# (note: sysBP, p = 0.0017, is also significant but is not charted here)
age_p = 0.0001
male_p = 0.0004
stroke_p = 0.2441
smoker_p = 0.2664
diabetes_p = 0.3553
hypertension_p = 0.3723
p_values = [age_p, male_p, stroke_p, smoker_p, diabetes_p, hypertension_p]
fig = px.bar(x=["1_Age", "2_Male", "3_Stroke", "4_Smoker", "5_Diabetes", "6_Hypertension"],
             y=p_values, title="Most relevant predictor variables")
fig.show()
heart_rate_p = 0.8919
cholesterol_p = 0.6639
education_p = 0.5478
p_values = [heart_rate_p, cholesterol_p, education_p]
fig = px.bar(x=["Heart rate", "Cholesterol", "Education"], y=p_values,
             title="Least relevant predictor variables")
fig.show()
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
GaussianNB()
y_pred = gnb.predict(X_test)
accuracy_score(y_test, y_pred)
0.6313993174061433
f1_score(y_test, y_pred)
0.443298969072165
recall_score(y_test, y_pred)
0.30935251798561153
precision_score(y_test, y_pred)
0.7818181818181819
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
(tn, fp, fn, tp)
(142, 12, 96, 43)
print("habia" , tn+fp , "que NO iban a tener problemas cardiacos , de los cuales :" , tn , "fueron acertados y los que fueron incorrectos son " , fp)
print("habia" , fn+tp , "que SI iban a tener problemas cardiacos , de los cuales :" , tp , "fueron acertados y los que fueron incorrectos son " , fn)
habia 154 que NO iban a tener problemas cardiacos , de los cuales : 142 fueron acertados y los que fueron incorrectos son 12 habia 139 que SI iban a tener problemas cardiacos , de los cuales : 43 fueron acertados y los que fueron incorrectos son 96
acc=accuracy_score(y_test, y_pred)
f1=f1_score(y_test, y_pred)
recall=recall_score(y_test, y_pred)
preci=precision_score(y_test, y_pred)
my_scores=[acc,f1,recall,preci]
fig = px.bar(x=["accuracy_score", "f1_score", "recall_score", "precision_score"], y=my_scores,
             title="Naive Bayes model metrics")
fig.show()
# Note: X_train/X_test were already standardized above, so re-fitting the
# scaler here is redundant (re-standardizing standardized data is a no-op).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# k = 59 anticipates the k sweeps further below
knn = KNeighborsClassifier(n_neighbors=59)
# Keep the logistic regression probabilities for the ROC comparison later
y_test_pred_proba = logistic_regression.predict_proba(X_test)
# Fit on the training data
knn.fit(X_train, y_train)
# Predict labels for the test data
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)
0.6552901023890785
f1_score(y_test, y_pred)
0.5738396624472575
recall_score(y_test, y_pred)
0.4892086330935252
precision_score(y_test, y_pred)
0.6938775510204082
acc=accuracy_score(y_test, y_pred)
f1=f1_score(y_test, y_pred)
recall=recall_score(y_test, y_pred)
preci=precision_score(y_test, y_pred)
my_scores=[acc,f1,recall,preci]
fig = px.bar(x=["accuracy_score", "f1_score", "recall_score", "precision_score"], y=my_scores,
             title="KNN model metrics")
fig.show()
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
(tn, fp, fn, tp)
(124, 30, 71, 68)
print("habia" , tn+fp , "que NO iban a tener problemas cardiacos , de los cuales :" , tn , "fueron acertados y los que fueron incorrectos son " , fp)
print("habia" , fn+tp , "que SI iban a tener problemas cardiacos , de los cuales :" , tp , "fueron acertados y los que fueron incorrectos son " , fn)
habia 154 que NO iban a tener problemas cardiacos , de los cuales : 124 fueron acertados y los que fueron incorrectos son 30 habia 139 que SI iban a tener problemas cardiacos , de los cuales : 68 fueron acertados y los que fueron incorrectos son 71
X_train_train, X_validation, y_train_train, y_validation = train_test_split(X_train, y_train, random_state=2)
k_range = list(range(1, 50))
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_train, y_train_train)
    y_pred = knn.predict(X_validation)
    scores.append(f1_score(y_validation, y_pred, zero_division=1))
plt.figure(figsize=(4, 3))
plt.plot(k_range, scores)
plt.xlabel('Value of K'); plt.ylabel('Validation F1')
plt.grid(); plt.show()
X_train_train, X_validation, y_train_train, y_validation = train_test_split(X_train, y_train, random_state=2)
k_range = list(range(1, 100))
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_train, y_train_train)
    y_pred = knn.predict(X_validation)
    scores.append(recall_score(y_validation, y_pred))
plt.figure(figsize=(4, 3))
plt.plot(k_range, scores)
plt.xlabel('Value of K'); plt.ylabel('Validation recall')
plt.grid(); plt.show()
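# A single validation split makes these curves noisy; a hedged sketch that
# instead averages 5-fold cross-validated recall over the same k range:
from sklearn.model_selection import cross_val_score
cv_scores = [cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             X_train, y_train, cv=5, scoring="recall").mean()
             for k in k_range]
print("best k by CV recall:", k_range[int(np.argmax(cv_scores))])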
from sklearn.metrics import roc_curve, auc
y_probs_nb = gnb.predict_proba(X_test)  # predicted probabilities for Naive Bayes
fpr_nb, tpr_nb, thr_nb = roc_curve(y_test, y_probs_nb[:, 1])  # ROC curve for Naive Bayes
fpr_log, tpr_log, thr_log = roc_curve(y_test, y_test_pred_proba[:, 1])  # ROC curve for Logistic Regression
plt.axis([0, 1.01, 0, 1.01])
plt.xlabel('1 - Specificity')
plt.ylabel('TPR / Sensitivity')
plt.title('ROC Curve')
plt.plot(fpr_nb, tpr_nb)
plt.plot(fpr_log, tpr_log)
plt.plot(np.arange(0, 1, step=0.01), np.arange(0, 1, step=0.01))  # chance diagonal
plt.legend(['NB', 'Logit'])
plt.show()
print('AUC-NB=', auc(fpr_nb, tpr_nb))
print('AUC-Logit=', auc(fpr_log, tpr_log))
AUC-NB= 0.7181164159581427
AUC-Logit= 0.738064094179202
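# KNN exposes predict_proba too, so it can join the same comparison; a sketch
# (note: this uses the most recently fitted knn from the sweep above):
y_probs_knn = knn.predict_proba(X_test)
fpr_knn, tpr_knn, thr_knn = roc_curve(y_test, y_probs_knn[:, 1])
print('AUC-KNN=', auc(fpr_knn, tpr_knn))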