Factores qeu utiliza Torche para la on¡btención de indices economicos
personal computer, stove, washing machine, refrigerator, cellular phone, landline phone, internet access, inside toilet, electricity, domestic service, cable or satellite TV, shop or business, land or farm, secondresidence, animals, agricultural machinery or equipment, savings account, checking account, credit card, and cars.
Paht recomendado https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
#conda list
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
#df1 = pd.read_stata('Datos/ESRU-EMOVI-2017/ESRU-EMOVI-2017-Entrevistado.dta', iterator = True)
# df1.variable_labels()
# df1.values.labels()
df = pd.read_stata('Datos/ESRU-EMOVI-2017/ESRU-EMOVI-2017-Entrevistado.dta',
convert_categoricals= False)
df.dtypes
df
folio | Estado | folio_ageb | consecutivo | Origen | Latitud | Longitud | LatitudGP | LongitudGP | recontacto | ... | region | cdmx | tot_int | rururb | cmo1_2 | cmo2_2 | cmo3_2 | cmo4_2 | cmo5_2 | tamhog | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0100100010286020830102 | 1.0 | 0100100010286 | 1 | 1.0 | 21.901323 | -102.310598 | 21.901477 | -102.310429 | 2 | ... | 3.0 | NaN | 5.0 | 0.0 | 41 | . | . | 13 | . | 5.0 |
1 | 0100100010286020850201 | 1.0 | 0100100010286 | 1 | 2.0 | 21.901323 | -102.310598 | 21.900773 | -102.311138 | 1 | ... | 3.0 | NaN | 1.0 | 0.0 | 41 | . | . | 41 | . | 1.0 |
2 | 0100100010286025830201 | 1.0 | 0100100010286 | 1 | 1.0 | 21.900830 | -102.311818 | 21.900549 | -102.313361 | 1 | ... | 3.0 | NaN | 2.0 | 0.0 | 81 | . | . | 11 | . | 2.0 |
3 | 0100100010286025840101 | 1.0 | 0100100010286 | 1 | 1.0 | 21.901188 | -102.310700 | 21.900765 | -102.313144 | 1 | ... | 3.0 | NaN | 1.0 | 0.0 | 52 | . | . | . | . | 1.0 |
4 | 0100100010286025850101 | 1.0 | 0100100010286 | 1 | 2.0 | 21.901188 | -102.310700 | 21.900577 | -102.312733 | 1 | ... | 3.0 | NaN | 2.0 | 0.0 | 52 | . | . | . | . | 2.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17660 | 3205700010022019460402 | 32.0 | 3205700010022 | 2 | 1.0 | 22.755409 | -102.513985 | 22.755409 | -102.513985 | 1 | ... | 2.0 | NaN | 4.0 | 0.0 | 71 | . | . | . | 71 | 4.0 |
17661 | 3205700010022025450501 | 32.0 | 3205700010022 | 2 | 1.0 | 22.288405 | -101.577532 | 22.288405 | -101.577532 | 1 | ... | 2.0 | NaN | 4.0 | 0.0 | . | . | . | 82 | 71 | 4.0 |
17662 | 3205700010022025460301 | 32.0 | 3205700010022 | 2 | 1.0 | 22.758625 | -102.499375 | 22.758625 | -102.499375 | 1 | ... | 2.0 | NaN | 6.0 | 0.0 | . | . | . | 53 | 52 | 6.0 |
17663 | 3205700010022025460302 | 32.0 | 3205700010022 | 2 | 1.0 | 22.755420 | -102.513997 | 22.755420 | -102.513997 | 1 | ... | 2.0 | NaN | 5.0 | 0.0 | . | 52 | . | 53 | 62 | 5.0 |
17664 | 3205700010022025460501 | 32.0 | 3205700010022 | 7 | 1.0 | 22.758625 | -102.499375 | 22.758625 | -102.499375 | 1 | ... | 2.0 | NaN | 10.0 | 0.0 | . | . | . | . | 41 | 10.0 |
17665 rows × 366 columns
# escolaridad
#df.dropna(subset=['p13'])
#print(df['p13'].unique(), type(df[df.p13 != np.nan]['p13'].unique()[-1]) )
#l = 0
#for i in df['p13']:
# if np.isnan(i):
# df.loc[l,'p13']=1
# l += 1
#print(df['p13'].unique(), type(df[df.p13 != np.nan]['p13'].unique()[-1]), np.isnan(df[df.p13 != np.nan]['p13'].unique()[-1]) )
#df
from sklearn.model_selection import train_test_split
# pregunta p02, comparten el mimso gasto para comer: 1 ) Si 2 ) No
df2 = df[df.p02 == 1 ]
#edad
df2 = df2[(25<=df2.p05) & (df2.p05<=50)]
# pregunta p08, es el jefe del hogar: 1 ) Si
#df2 = df2[df2.p08 == 1 ]
# pregunta p12. actualmente estudia, 1)Si 2)No
df2 = df2[df2.p12 == 2 ]
# aós alacnazado podria ser importante p14
# p13, nivel escuela 1-12. 97 = no fue a la escuela
#df2 = df2[df2.p13 = 97 ] # quitamos la no asistencia
#print(df2.p13.unique())
# p63, Escuela publica o privada, 8 = No aplica
# estonces sino aplica con quedamos con el más alto
# esta valirvale es mas complicada, por el momento no la utiizamos
# (preguntas por cada caso)
# sosten principal
df2=df2[(df2.p26==1) | (df2.p26==2)]
# infromación sosoten principal
df2 = df2[((((( (~df2.p43.isna()&~np.isnan(df2.p43) )& df2.p43!=98) & df2.p26==1) & df2.cmo1_2!="." )&
(~np.isnan(df2.p38_11)) ) & ~df2.p38_11.isna()) |
((((((~df2.p43m.isna()&~np.isnan(df2.p43m) ) & df2.p43m!=98) & df2.p26==2) & df2.cmo2_2!="." )&
(~np.isnan(df2.p38m_11))) & ~df2.p38_11.isna())]
print('shape', df2.shape)
shape (8783, 366)
df2 = df2[(~np.isnan(df2.p13) & (~df2.p13.isna()))]
df2 = df2[df2.SINCO3!=" "]
# p68, Personas que trabajan, 1. Si, 2. No
# p69, negocio vacaciones, 1. Si, 2. No
df2 = df2[(df2.p68 == 1) | (df2.p69 == 1)] # quitamos los no ocupados
# p120, material de la casa opciones: 1,2,3
# p121, numero de cuartos para dormir, libre
# p122, numeor totales de cuartos, libre
# p123, casa propia o del conyugue, 1. Sí, 2. No
# p125, servicios básicos de la vivienda,
# 125a - 125e (preguntas por cada caso, 1. Si, 2. No)
# p126, articulos propiedad del hogar
# 126a - 126r (preguntas por cada caso, 1. Si , 2. No)
# p127, prestamo variable 1-9, 8 es otro caso
#df2 = df2[df2.p127 != 8 ] # quitamos la otra posibilidad
# p128, ahorros, tarjeta de credito, cuent abancaria,
# a-f (preguntas por cada caso) 1. Si 2. No
# p129, pertenencias propias o de conyugue,
# a-e (pregunta por cada caso) 1. Si, 2. No
# p130, apoyo economico porgramas u otro medio,
# a-f, (preguntas por cada caso)
# p131, numero de automoviles propios, libre
# p132, numero de miembros que aportan ingreso al hogar
df2 = df2[df2.p132 == 1] # un solo sosten
# Cohort de ingreso todas las personas que aportan ingresos
df2 = df2[((df2.p133 != 8) & ( df2.p133 != 9) )& (~np.isnan(df2.p133) & (~df2.p133.isna())) ] #ingreso no reportado o no diponible
print('valores cohort de ingreso',df2.p133.unique())
print('shape', df2.shape)
print("unique",df2.p43.unique())
# p134, condiciones del barrio
# a-i, (preguntas por cada caso) 1. Si , 2. No, 8 son respuesta
p = "p134"
#for _ in "abcdefghi":
# df2 = df2[df2[p+_] != 8]
# p147 percepción de 1 más pobre + 10 más rico
#df2.p08.plot.hist()
#df2.p133.plot.hist()
Estado = ["Estado"]
p5 = ['p05']
p6 = ['p06']
p13 = ['p13']
SINCO = ['SINCO3']
p63 = ['p63a','p63b','p63c','p63d'] # por el momento la quitamos de la estimación
p120 = ['p120']
p121 = ['p121']
p122 = ['p122']
p123 = ['p123']
p125 = ['p125a','p125b','p125c','p125d','p125e']
import string
p126 = ['p126' + i for i in string.ascii_lowercase[0:18] ]
p127 = ['p127']
p128 = ['p128a','p128b','p128c','p128d','p128e','p128f']
p129 = ['p129a','p129b','p129c','p129d','p129e']
p130 = ['p130a','p130b','p130c','p130d','p130e', 'p130f']
p131 = ['p131']
p132 = ['p132']
p134 = ['p134a','p134b','p134c','p134d','p134e', 'p134f','p134g','p134h','p134i']
p147 = ['p147']
#indexX = p13 + p120 + p121 + p122 + p123 + p125 + p126 + p127 + p128 + p129 + p130 + p131
#indexX = p13 + p121 + p122 + p127 + p131
#indexX = p13 + p131
indexX = p5 + p6 + p13 + SINCO + ['region']
indexY = ['p133']
#M = df2[indexX+indexY].dropna()
M = df2[indexX+indexY]
#M.loc[M.p133 == 2,'p133'] = 1
#M.loc[M.p133 != 1,'p133'] = M.p133 - 1
#a = StandardScaler().fit(M)
#M_scaled = a.transform(M)
#M_sca = pd.DataFrame(M_scaled)
#M_sca.describe()
#ax = StandardScaler().fit(M[indexX])
#X = ax.transform(M[indexX])
#M.SINCO3=pd.to_numeric(M.SINCO3)
X = M[indexX].to_numpy()
Xc = M[indexX].astype('category')
y = M[indexY].to_numpy()
yc = M[indexY].astype('category')
y = np.ravel(y)
y = y-1;
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc, yc, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc, yc, test_size=0.2, random_state=0)
# dummies
X_train, X_test, y_train, y_test = train_test_split(pd.get_dummies(Xc), yc, test_size=0.2, random_state=21, stratify=yc)
y_train=y_train.to_numpy().ravel()
y_test=y_test.to_numpy().ravel()
print('valores unicos para claseificar',yc.p133.unique())
print('valores del entrenamiento',np.unique(y_train))
print('valores del test',np.unique(y_test))
df2.loc[df2.p26==1,"p43"].unique()
X_train
valores cohort de ingreso [4 3 5 1 2 6 7] shape (2191, 366) unique [11. nan 6. 2. 98. 12. 4. 10. 9. 3. 1. 8. 5. 7.] valores unicos para claseificar [4, 3, 5, 1, 2, 6, 7] Categories (7, int64): [4, 3, 5, 1, 2, 6, 7] valores del entrenamiento [1 2 3 4 5 6 7] valores del test [1 2 3 4 5 6 7]
p05_25.0 | p05_26.0 | p05_27.0 | p05_28.0 | p05_29.0 | p05_30.0 | p05_31.0 | p05_32.0 | p05_33.0 | p05_34.0 | ... | SINCO3_9722 | SINCO3_9843 | SINCO3_9899 | SINCO3_9998 | SINCO3_9999 | region_1.0 | region_2.0 | region_3.0 | region_4.0 | region_5.0 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9702 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1432 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
1391 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3540 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
6772 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6286 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
12819 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
12353 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1845 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1414 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
1752 rows × 305 columns
Glosario scikit-learn https://scikit-learn.org/stable/glossary.html#glossary
Documentación SVC https://scikit-learn.org/stable/modules/svm.html#classification
Documentación funcion lineal https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
clas_linsvm = LinearSVC(max_iter = 3000, penalty = "l2",
loss = "squared_hinge", dual = True,
C = 0.01)
y_pred_linsvm = clas_linsvm.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_linsvm)
print('Accuracy: %.2f' % (accuracy*100))
# NO ganamos nada con categoricas
#y_pred_linsvmc = clas_linsvm.fit(Xc_train, yc_train).predict(Xc_test)
#accuracy = accuracy_score(yc_test, y_pred_linsvmc)
#print('Accuracy: %.2f' % (accuracy*100))
Accuracy: 31.21
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import itertools
def matrizviz(matrix):
plt.clf()
# place labels at the top
plt.gca().xaxis.tick_top()
plt.gca().xaxis.set_label_position('top')
# plot the matrix per se
plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
# plot colorbar to the right
plt.colorbar()
fmt = 'd'
class_names = ['1','2','3','4','5', '6',
'7']
# write the number of predictions in each bucket
thresh = matrix.max() / 2.
for i, j in itertools.product(range(matrix.shape[0]), range(matrix.shape[1])):
# if background is dark, use a white number, and vice-versa
plt.text(j, i, format(matrix[i, j], fmt),
horizontalalignment="center",
color="white" if matrix[i, j] > thresh else "black")
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names, rotation=45)
plt.yticks(tick_marks, class_names)
plt.tight_layout()
plt.ylabel('Intervalo registrado',size=14)
plt.xlabel('Intervalo predecido',size=14)
plt.show()
matrix = confusion_matrix(y_test,y_pred_linsvm)
matrizviz(matrix)
matrix = confusion_matrix(y_test,y_pred_linsvm, normalize = 'true')
disp = ConfusionMatrixDisplay(confusion_matrix=matrix)
disp.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f87aec3c1f0>
Documentación vecinos https://scikit-learn.org/stable/modules/neighbors.html
Documentación k-vecinos https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
Documentación R-vecinos https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsClassifier.html#sklearn.neighbors.RadiusNeighborsClassifier
# clasifcación k vecinos
from sklearn.neighbors import KNeighborsClassifier
clas_neigk = KNeighborsClassifier(n_neighbors = 5, weights='distance',
leaf_size = 10, p = 2,
algorithm = 'ball_tree')#'kd_tree')#'brute')
y_pred_neigk = clas_neigk.fit(X_train, y_train).predict(X_test)
# no obtenemos mejora ocn categoricas
accuracy = accuracy_score(y_test, y_pred_neigk)
print('Accuracy k vecinos: %.2f' % (accuracy*100))
### buscamos la cantidad vecinos mejor
neighbors = np.arange(1, 150)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
# Loop over different values of k
for i, k in enumerate(neighbors):
# Setup a k-NN Classifier with k neighbors: knn
knn = KNeighborsClassifier(n_neighbors = k, algorithm='brute')
# Fit the classifier to the training data
knn.fit(X_train,y_train)
#Compute accuracy on the training set
train_accuracy[i] = knn.score(X_train, y_train)
#Compute accuracy on the testing set
test_accuracy[i] = knn.score(X_test, y_test)
# Generate plot
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label = 'Testing Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()
# clasificacion r-vecinos
from sklearn.neighbors import RadiusNeighborsClassifier
clas_neigr = RadiusNeighborsClassifier(radius = 4, weights='distance',
leaf_size = 10, p = 2,
outlier_label = 'most_frequent',
algorithm = 'ball_tree')#'kd_tree')#'brute')
y_pred_neigr = clas_neigr.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_neigr)
print('Accuracy r vecinos: %.2f' % (accuracy*100))
# clasifiacion centroide más cercano
from sklearn.neighbors import NearestCentroid
clas_neigc = NearestCentroid(shrink_threshold = 0.01)
y_pred_neigc= clas_neigc.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_neigc)
print('Accuracy centroide: %.2f' % (accuracy*100))
# clasifiacion con centoide encogido
for shrinkage in np.linspace(0, 1.0, num=5):
clas_neigce = NearestCentroid(shrink_threshold=shrinkage)
y_pred_neigce = clas_neigce.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_neigce)
print(shrinkage,'Accuracy centroide encogido: %.2f' % (accuracy*100))
# Tranformación de vecinos orehecho
from sklearn.manifold import Isomap
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import make_pipeline
clas_neigt = make_pipeline(
KNeighborsTransformer(n_neighbors=5))
# NO jalo
#y_pred_neigt= clas_neigt.fit(X_train, y_train).predict(X_test)
#accuracy = accuracy_score(y_test, y_pred_neigt)
#print('Accuracy: %.2f' % (accuracy*100))
# clasifiacion con componentes de vecindad
from sklearn.neighbors import NeighborhoodComponentsAnalysis
nca = NeighborhoodComponentsAnalysis( max_iter = 100)
nca.fit(X_train, y_train)
clas_neigk.fit(X_train, y_train)
print(clas_neigk.score(nca.transform(X_test), y_test))
# clasificaciñon combianndo el anterior con el primero
from sklearn.pipeline import Pipeline
nca = NeighborhoodComponentsAnalysis(random_state=42, max_iter = 100)
nca_pipe = Pipeline([('nca', nca), ('knn', clas_neigk)])
nca_pipe.fit(X_train, y_train)
y_pred_nca = nca_pipe.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nca)
print('Accuracy combinado: %.2f' % (accuracy*100))
#print(nca_pipe.score(X_test, y_test))
# clasificaciñon combianndo nca con cpmponentes reducidas y el primero
from sklearn.pipeline import Pipeline
nca = NeighborhoodComponentsAnalysis(random_state=42, max_iter = 100,
n_components=2)
nca_pipe = Pipeline([('nca', nca), ('knn', clas_neigk)])
nca_pipe.fit(X_train, y_train)
y_pred_nca = nca_pipe.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nca)
print('Accuracy combinado: %.2f' % (accuracy*100))
#print(nca_pipe.score(X_test, y_test))
Accuracy k vecinos: 30.07
Accuracy r vecinos: 29.38 Accuracy centroide: 27.56 0.0 Accuracy centroide encogido: 27.56 0.25 Accuracy centroide encogido: 25.97 0.5 Accuracy centroide encogido: 25.06 0.75 Accuracy centroide encogido: 25.06 1.0 Accuracy centroide encogido: 24.83 0.2642369020501139 Accuracy combinado: 29.38 Accuracy combinado: 28.25
Docuemntación SVM https://scikit-learn.org/stable/modules/svm.html#classification
# Soporte no lineal
from sklearn.svm import SVC
clas_nlinsvm = SVC(max_iter = -1, C = 1,
kernel = 'rbf', gamma = 'auto')
y_pred_nlinsvm = clas_nlinsvm.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nlinsvm)
print('Accuracy SVM no lineal : %.2f' % (accuracy*100))
# Sopoete no luenal con escalamiento
from sklearn.preprocessing import StandardScaler
clas_nelinsvm = make_pipeline(StandardScaler(), SVC(max_iter = -1, C = 1,
kernel = 'rbf',gamma='auto'))
y_pred_nelinsvm = clas_nelinsvm.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nelinsvm)
print('Accuracy SVM no lineal datos escalados: %.2f' % (accuracy*100))
# Soporte vectorial en busqueda de parametros
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
scaler = StandardScaler()
X_trainsvmp = scaler.fit_transform(X_train)
C_range = np.logspace(-2, 3, 13)
gamma_range = np.logspace(-9, 3, 13)
#param_grid = dict(gamma=gamma_range, C=C_range)
#cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
#grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
#grid.fit(X_trainsvmp, y_train)
#print("The best parameters are %s with a score of %0.2f"
# % (grid.best_params_, grid.best_score_))
# The best parameters
# are {'C': 1.2115276586285888, 'gamma': 0.01} with a score of 0.37
# Soporte no lineal con los parametros ajustados
from sklearn.svm import SVC
clas_nlinsvmpa = SVC(max_iter = -1, C = 1.2115,
kernel = 'rbf', gamma = 0.01)
y_pred_nlinsvmpa = clas_nlinsvmpa.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nlinsvmpa)
print('Accuracy SVM no lineal para ajustdos: %.2f' % (accuracy*100))
# Soporte no lineal con los parametros ajustados y pesos
from sklearn.svm import SVC
clas_nlinsvmpap = SVC(max_iter = -1, C = 1.2115,
kernel = 'rbf', gamma = 0.01,
class_weight={1: 500, 2:100})
y_pred_nlinsvmpap = clas_nlinsvmpap.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nlinsvmpap)
print('Accuracy SVM no lineal para ajus pesados : %.2f' % (accuracy*100))
# reporte de la clasificación
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_nlinsvm ))
# Datos escalados u parametros ajustados
clas_nlsvmpae = make_pipeline(StandardScaler(),
SVC(max_iter = -1, C = 1.2115,
kernel = 'rbf', gamma = 0.01))
y_pred_nlsvmpae = clas_nlsvmpae.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nlsvmpae)
print('Accuracy SVM no lineal datos escalados y ajustados: %.2f' % (accuracy*100))
Accuracy SVM no lineal : 26.88 Accuracy SVM no lineal datos escalados: 29.84 Accuracy SVM no lineal para ajustdos: 28.47 Accuracy SVM no lineal para ajus pesados : 19.82 precision recall f1-score support 1 0.00 0.00 0.00 81 2 0.00 0.00 0.00 73 3 0.27 1.00 0.42 118 4 0.00 0.00 0.00 94 5 0.00 0.00 0.00 52 6 0.00 0.00 0.00 17 7 0.00 0.00 0.00 4 accuracy 0.27 439 macro avg 0.04 0.14 0.06 439 weighted avg 0.07 0.27 0.11 439
/Users/rafamtz/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
Accuracy SVM no lineal datos escalados y ajustados: 31.21
Docuemntación https://scikit-learn.org/stable/modules/sgd.html#classification
# gradiente estocastigo
from sklearn.linear_model import SGDClassifier
clas_sgd = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200)
y_pred_sgd = clas_sgd.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_sgd)
print('Accuracy sgd: %.2f' % (accuracy*100))
# gradiento estocastico con promedio
clas_sgda = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200,
average=True)#, class_weight={1: 50,2:100,3:50})
y_pred_sgda = clas_sgda.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_sgda)
print('Accuracy sgd average: %.2f' % (accuracy*100))
Accuracy sgd: 28.47 Accuracy sgd average: 29.84
Docuemntación perceptron https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html#sklearn.linear_model.Perceptron
Perceptron is a classification algorithm which shares the same underlying implementation with SGDClassifier. In fact, Perceptron() is equivalent to SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", penalty=None).
from sklearn.linear_model import Perceptron
clas_per = Perceptron(tol=1e-3, random_state=0)
y_pred_per = clas_per.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_per)
print('Accuracy perceptron: %.2f' % (accuracy*100))
Accuracy perceptron: 24.60
docuemntación función https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html#sklearn.linear_model.PassiveAggressiveClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
clas_pac = PassiveAggressiveClassifier(max_iter=1000, random_state=0,tol=1e-3)
y_pred_pac = clas_pac.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_pac)
print('Accuracy perceptron: %.2f' % (accuracy*100))
Accuracy perceptron: 22.10
Doceumntación https://scikit-learn.org/stable/modules/lda_qda.html
# discriminate lineal
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clas_dl = LinearDiscriminantAnalysis()
y_pred_dl = clas_dl.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_dl)
print('Accuracy perceptron: %.2f' % (accuracy*100))
# discriminante cuadratico
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clas_dc = QuadraticDiscriminantAnalysis()
y_pred_dc = clas_dl.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_dc)
print('Accuracy perceptron: %.2f' % (accuracy*100))
Accuracy perceptron: 31.66 Accuracy perceptron: 31.66
Docuemntación https://scikit-learn.org/stable/modules/gaussian_process.html
# tarda mucho y es pobre en el uso más simple
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
kernel = 1.0 * RBF(1.0)
clas_gpc = GaussianProcessClassifier(kernel=kernel,random_state=0)
y_pred_gpc = clas_gpc.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_gpc)
print('Accuracy perceptron: %.2f' % (accuracy*100))
documentación https://scikit-learn.org/stable/modules/kernel_approximation.html
from sklearn.kernel_approximation import RBFSampler
rbf_feature = RBFSampler(gamma=1, random_state=1)
Xtrain_features = rbf_feature.fit_transform(X_train)
clas_ak = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200,
average=True)
Xtest_features = rbf_feature.fit_transform(X_test)
y_pred_ak = clas_ak.fit(Xtrain_features, y_train).predict(Xtest_features)
accuracy = accuracy_score(y_test, y_pred_ak)
print('Accuracy kernel: %.2f' % (accuracy*100))
Accuracy kernel: 26.65
Docuemntación https://scikit-learn.org/stable/modules/naive_bayes.html
from sklearn.naive_bayes import GaussianNB
clas_nb = GaussianNB()
y_pred_nb = clas_nb.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nb)
print('Accuracy bayes: %.2f' % (accuracy*100))
# multninomial bayes
from sklearn.naive_bayes import MultinomialNB
clas_mb = MultinomialNB()
y_pred_mb = clas_mb.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_mb)
print('Accuracy bayes multinomial: %.2f' % (accuracy*100))
# Bayes con complemento
from sklearn.naive_bayes import ComplementNB
clas_cb = ComplementNB()
y_pred_cb = clas_cb.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_cb)
print('Accuracy bayes complemento: %.2f' % (accuracy*100))
# Bayes categorico no jalo
#from sklearn.naive_bayes import CategoricalNB
#clas_cab = CategoricalNB()
#y_pred_cab = clas_cab.fit(Xc_train, yc_train).predict(Xc_test)
#accuracy = accuracy_score(y_test, y_pred_cab)
#print('Accuracy bayes categorico: %.2f' % (accuracy*100))
Accuracy bayes: 12.76 Accuracy bayes multinomial: 32.35 Accuracy bayes complemento: 31.66
Docuemnatación https://scikit-learn.org/stable/modules/linear_model.html#classification
from sklearn.linear_model import RidgeClassifier
clas_ridge = RidgeClassifier(max_iter = 100, normalize = True)
y_pred_ridge = clas_ridge.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_ridge)
print('Accuracy ridge: %.2f' % (accuracy*100))
Accuracy ridge: 32.12
Docuemntación https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
# regresiónlogistica multinomial
from sklearn.linear_model import LogisticRegression
def logistica(X_train, y_train, X_test, y_test):
y_pred_rlmulti = LogisticRegression(solver='saga',
max_iter=200,
random_state=42,
multi_class='multinomial').fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_rlmulti)
print('Accuracy reg log multi: %.2f' % (accuracy*100))
# regresión logistica ovr
y_pred_rlovr = LogisticRegression(solver='liblinear',
max_iter=200,
random_state=42,
multi_class='ovr').fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_rlovr)
print('Accuracy reg log ovr: %.2f' % (accuracy*100))
logistica(X_train, y_train, X_test, y_test)
Accuracy reg log multi: 33.26 Accuracy reg log ovr: 33.26
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
def arbolbos(X_train, y_train, X_test, y_test):
clas_tree = DecisionTreeClassifier()
clas_forest = RandomForestClassifier( max_depth = 14,max_features=None, min_samples_leaf = 20)
y_pred_tree = clas_tree.fit(X_train, y_train).predict(X_test)
y_pred_forest = clas_forest.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_forest)
print('Accuracy bosque: %.2f' % (accuracy*100))
accuracy = accuracy_score(y_test, y_pred_tree)
print('Accuracy arbol: %.2f' % (accuracy*100))
#print(np.unique(y_test))
#y_pred_tree.shape
#y_pred_forest.shape
arbolbos(X_train, y_train, X_test, y_test)
Accuracy bosque: 31.66 Accuracy arbol: 27.33
clas_tree = DecisionTreeClassifier()
clas_forest = RandomForestClassifier( max_depth = 14,max_features=None, min_samples_leaf = 20)
y_pred_tree = clas_tree.fit(X_train, y_train).predict(X_test)
y_pred_forest = clas_forest.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_forest)
print('Accuracy bosque: %.2f' % (accuracy*100))
accuracy = accuracy_score(y_test, y_pred_tree)
print('Accuracy arbol: %.2f' % (accuracy*100))
validacion = pd.DataFrame({'Actual': y_test,'Predicción': y_pred_tree, 'Predicción F': y_pred_forest,
'Diferencia': y_test-y_pred_forest})
validacion
a = np.abs(y_test-y_pred_forest)
print(a.mean())
accuracy = accuracy_score(y_test, y_pred_forest)
print('Accuracy: %.2f' % (accuracy*100))
accuracy = accuracy_score(y_test, y_pred_tree)
print('Accuracy: %.2f' % (accuracy*100))
Accuracy bosque: 31.21 Accuracy arbol: 27.11 1.0660592255125285 Accuracy: 31.21 Accuracy: 27.11
matrix = confusion_matrix(y_test,y_pred_forest)
matrizviz(matrix)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
# feature selection
def select_features(X_train, y_train, X_test):
fs = SelectKBest(score_func=chi2, k=40)
#mutual_info_classif
#chi2
fs.fit(X_train, y_train)
X_train_fs = fs.transform(X_train)
X_test_fs = fs.transform(X_test)
return X_train_fs, X_test_fs, fs
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
# what are scores for the features
for i in range(len(fs.scores_)):
print('Feature %d: %f' % (i, fs.scores_[i]))
# plot the scores
plt.bar([i for i in range(len(fs.scores_))], fs.scores_)
plt.show()
Feature 0: 7.758087 Feature 1: 3.472157 Feature 2: 2.595307 Feature 3: 6.248632 Feature 4: 4.726717 Feature 5: 6.306663 Feature 6: 4.133533 Feature 7: 10.739050 Feature 8: 3.123382 Feature 9: 5.434297 Feature 10: 7.799463 Feature 11: 4.850173 Feature 12: 1.752700 Feature 13: 14.082139 Feature 14: 5.281824 Feature 15: 5.425489 Feature 16: 10.172653 Feature 17: 5.905378 Feature 18: 2.450644 Feature 19: 5.441612 Feature 20: 4.283228 Feature 21: 3.215003 Feature 22: 2.278915 Feature 23: 24.341717 Feature 24: 0.934253 Feature 25: 3.232639 Feature 26: 20.353787 Feature 27: 31.402141 Feature 28: 3.728972 Feature 29: 66.162677 Feature 30: 21.399958 Feature 31: 24.663924 Feature 32: 11.698925 Feature 33: 30.832805 Feature 34: 4.777782 Feature 35: 9.725263 Feature 36: 3.802018 Feature 37: 56.757117 Feature 38: 152.077957 Feature 39: 84.660611 Feature 40: 24.027463 Feature 41: 24.391304 Feature 42: 5.211538 Feature 43: 13.424624 Feature 44: 6.053682 Feature 45: nan Feature 46: 7.423077 Feature 47: 54.605932 Feature 48: nan Feature 49: nan Feature 50: 108.500000 Feature 51: 3.071218 Feature 52: 4.553784 Feature 53: 7.423077 Feature 54: 7.368984 Feature 55: 24.391304 Feature 56: 8.844580 Feature 57: 2.711864 Feature 58: 36.192994 Feature 59: 2.198178 Feature 60: 7.368984 Feature 61: 17.249253 Feature 62: 10.050084 Feature 63: 63.557759 Feature 64: 3.684492 Feature 65: 24.391304 Feature 66: 7.423077 Feature 67: 3.684492 Feature 68: 5.000000 Feature 69: 1.798785 Feature 70: 2.711864 Feature 71: 2.711864 Feature 72: 7.423077 Feature 73: 7.368984 Feature 74: 24.391304 Feature 75: 19.250489 Feature 76: 4.553784 Feature 77: 4.067471 Feature 78: 60.449419 Feature 79: 13.695652 Feature 80: 9.792267 Feature 81: 3.684492 Feature 82: 4.067471 Feature 83: 32.783538 Feature 84: 7.260767 Feature 85: 6.882126 Feature 86: 32.756561 Feature 87: 24.391304 Feature 88: 8.701056 Feature 89: 3.684492 Feature 90: 24.391304 Feature 91: 6.053682 Feature 92: 3.684492 Feature 93: 7.423077 Feature 94: 2.711864 Feature 95: 5.233301 Feature 96: 3.684492 Feature 97: 2.711864 Feature 98: 18.541531 Feature 99: 2.711864 Feature 100: nan Feature 101: 24.391304 Feature 102: 3.684492 Feature 103: 7.423077 Feature 104: 24.391304 Feature 105: 5.680463 Feature 106: 2.584904 Feature 107: 7.423077 Feature 108: 3.684492 Feature 109: 10.936828 Feature 110: 56.961538 Feature 111: 2.584904 Feature 112: 5.807845 Feature 113: 8.915888 Feature 114: 2.198178 Feature 115: 4.067471 Feature 116: 5.374458 Feature 117: 6.053682 Feature 118: nan Feature 119: 4.483277 Feature 120: 4.067471 Feature 121: 14.846154 Feature 122: 11.851394 Feature 123: 4.553784 Feature 124: 7.423077 Feature 125: 3.684492 Feature 126: 2.584904 Feature 127: 2.711864 Feature 128: 5.000000 Feature 129: 6.053682 Feature 130: 4.483277 Feature 131: 2.711864 Feature 132: 6.197778 Feature 133: 4.087524 Feature 134: 7.423077 Feature 135: 4.770673 Feature 136: 3.197442 Feature 137: 3.510650 Feature 138: 2.001417 Feature 139: 5.811084 Feature 140: 5.376025 Feature 141: 3.071218 Feature 142: 7.423077 Feature 143: 3.684492 Feature 144: 3.684492 Feature 145: 5.112424 Feature 146: 10.911699 Feature 147: 5.075884 Feature 148: 4.457944 Feature 149: nan Feature 150: 8.175048 Feature 151: 3.684492 Feature 152: 11.942026 Feature 153: 2.711864 Feature 154: 3.885910 Feature 155: 1.126194 Feature 156: 1.569344 Feature 157: 7.368984 Feature 158: 11.698869 Feature 159: 5.143686 Feature 160: 4.722799 Feature 161: 6.619567 Feature 162: 14.354318 Feature 163: 7.423077 Feature 164: 4.080061 Feature 165: 14.051162 Feature 166: 3.684492 Feature 167: 3.356974 Feature 168: 7.423077 Feature 169: 7.423077 Feature 170: 4.099089 Feature 171: 11.053476 Feature 172: 5.000000 Feature 173: 2.711864 Feature 174: nan Feature 175: 11.826486 Feature 176: 11.434916 Feature 177: 3.823224 Feature 178: 2.711864 Feature 179: 43.088169 Feature 180: 8.135593 Feature 181: 3.684492 Feature 182: 4.292452 Feature 183: 3.885910 Feature 184: 6.983729 Feature 185: 6.466599 Feature 186: 4.457944 Feature 187: 3.684492 Feature 188: nan Feature 189: 24.391304 Feature 190: nan Feature 191: 4.457944 Feature 192: 3.627007 Feature 193: 7.708914 Feature 194: 4.457944 Feature 195: 10.000000 Feature 196: 4.457944 Feature 197: nan Feature 198: 4.457944 Feature 199: 19.584533 Feature 200: 3.684492 Feature 201: nan Feature 202: 14.692690 Feature 203: 7.423077 Feature 204: 4.457944 Feature 205: nan Feature 206: 4.483277 Feature 207: 2.245626 Feature 208: 2.299950 Feature 209: 4.067471 Feature 210: 14.846154 Feature 211: 2.417578 Feature 212: 2.369637 Feature 213: 3.369190 Feature 214: 5.000000 Feature 215: 5.244672 Feature 216: 2.711864 Feature 217: 2.711864 Feature 218: 4.067471 Feature 219: 17.831776 Feature 220: 7.340504 Feature 221: 2.056603 Feature 222: 25.902028 Feature 223: 3.949153 Feature 224: 5.423729 Feature 225: 2.198178 Feature 226: 5.000000 Feature 227: 4.457944 Feature 228: 3.949153 Feature 229: 5.175739 Feature 230: 3.684492 Feature 231: 2.711864 Feature 232: 2.584904 Feature 233: 2.711864 Feature 234: 2.576350 Feature 235: 4.553784 Feature 236: 24.391304 Feature 237: 7.423077 Feature 238: 2.198178 Feature 239: 2.198178 Feature 240: 5.169808 Feature 241: 2.584904 Feature 242: 2.711864 Feature 243: 5.000000 Feature 244: 2.711864 Feature 245: 4.136478 Feature 246: 3.684492 Feature 247: nan Feature 248: 3.768467 Feature 249: nan Feature 250: 7.423077 Feature 251: 3.684492 Feature 252: 2.711864 Feature 253: 10.466602 Feature 254: 5.998212 Feature 255: 8.835109 Feature 256: 3.071218 Feature 257: 3.987294 Feature 258: 6.326316 Feature 259: 4.285201 Feature 260: 2.247473 Feature 261: 7.423077 Feature 262: 2.382987 Feature 263: 5.162867 Feature 264: 16.411303 Feature 265: nan Feature 266: 12.595383 Feature 267: 7.423077 Feature 268: 5.423729 Feature 269: 1.618100 Feature 270: 4.396356 Feature 271: 4.653407 Feature 272: 7.423077 Feature 273: 1.569344 Feature 274: 2.198178 Feature 275: nan Feature 276: 7.423077 Feature 277: 2.488776 Feature 278: 6.075509 Feature 279: 4.475792 Feature 280: 3.768467 Feature 281: 19.173287 Feature 282: 8.402497 Feature 283: 3.684492 Feature 284: 5.428950 Feature 285: 3.728972 Feature 286: 3.187869 Feature 287: 4.457944 Feature 288: 13.424624 Feature 289: 12.961225 Feature 290: 1.003221 Feature 291: 2.711864 Feature 292: 8.915888 Feature 293: 3.535847 Feature 294: 2.711864 Feature 295: 2.584904 Feature 296: 4.457944 Feature 297: 2.606478 Feature 298: 28.876576 Feature 299: 6.816186 Feature 300: 28.312784 Feature 301: 9.474995 Feature 302: 9.658377 Feature 303: 29.711773 Feature 304: 107.546986
clas_forest_chi = RandomForestClassifier(max_depth = 14,max_features=None, min_samples_leaf = 20)
yhat =clas_forest_chi.fit(X_train_fs, y_train).predict(X_test_fs)
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))
matrix = confusion_matrix(y_test,yhat)
matrizviz(matrix)
Accuracy: 28.47
#from sklearn.preprocessing import OneHotEncoder
Xc_train, Xc_test, yc_train, yc_test = train_test_split(pd.get_dummies(Xc), yc, test_size=0.2, random_state=0)
Xc_train_fs, Xc_test_fs, fsc = select_features(Xc_train, yc_train.to_numpy().ravel(), Xc_test)
plt.bar([i for i in range(len(fsc.scores_))], fsc.scores_)
plt.show()
ypc_forest = clas_forest.fit(Xc_train, yc_train.to_numpy().ravel()).predict(Xc_test)
matrix = confusion_matrix(yc_test,ypc_forest)
matrizviz(matrix)
yhat =clas_forest_chi.fit(Xc_train_fs, yc_train.to_numpy().ravel()).predict(Xc_test_fs)
accuracy = accuracy_score(yc_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))
matrix = confusion_matrix(yc_test,yhat)
matrizviz(matrix)
Xc_train_fs.shape
Accuracy: 28.70
(1752, 40)
ypc_forest = clas_forest.fit(Xc_train, yc_train.to_numpy().ravel()).predict(Xc_test)
accuracy = accuracy_score(yc_test, ypc_forest)
print('Accuracy: %.2f' % (accuracy*100))
Accuracy: 30.07
from yellowbrick.classifier import ClassPredictionError
#clases = ['a','aa','b','bb','c','cc','d']
#visualizer = ClassPredictionError(
# RandomForestClassifier(max_depth = None, max_features=None, min_samples_leaf = 5), classes= clases)
#visualizer.fit(X_train, y_train)
#visualizer.score(X_test, y_test)
#visualizer.show()
from sklearn.neural_network import MLPClassifier
X_train, X_test, y_train, y_test = train_test_split(pd.get_dummies(Xc), yc, test_size=0.2, random_state=21, stratify=yc)
y_train=y_train.to_numpy().ravel()
y_test=y_test.to_numpy().ravel()
#clas_mpl = MLPClassifier(random_state=1,hidden_layer_sizes=(100, 100, 100),
# solver='lbfgs',alpha = 1e-5, learning_rate = 'adaptive',
# max_iter=2000)
clf = MLPClassifier(solver='lbfgs', alpha=0.5, max_iter = 200,
hidden_layer_sizes=(10, 10, 10,10,10,10,10,10), random_state=1)
#y_pred_clas = clf.fit(X_train_fs, y_train).predict(X_test_fs)
y_pred_clas = clf.fit(Xc_train, yc_train).predict(Xc_test)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred_clas, normalize ='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
accuracy = accuracy_score(y_test, y_pred_clas)
print('Accuracy: %.2f' % (accuracy*100))
Es una combinación de varios entimadores dado un algoritmo de aprendizaje para mejprar la generalización /robustes, tres categorias
from sklearn.ensemble import BaggingClassifier
# con maquinas de soporte vectorial
clas_bsvc = BaggingClassifier(base_estimator=SVC(),
n_estimators=10, random_state=0)
y_pred_bsvc = clas_bsvc.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_bsvc)
print('Accuracy baggin con maquina: %.2f' % (accuracy*100))
# con k vecinos cercanos
clas_bkvecinos = BaggingClassifier(KNeighborsClassifier(),
max_samples=0.5, max_features=0.5)
y_pred_bkvecinos = clas_bkvecinos.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_bkvecinos)
print('Accuracy baggin con k vecinos: %.2f' % (accuracy*100))
# los randomforest es un metodo de ensambe
# extremadamente arboles aleatorios
from sklearn.ensemble import ExtraTreesClassifier
clas_etc = ExtraTreesClassifier(n_estimators=10, max_depth=None,
min_samples_split=2, random_state=0)
y_pred_etc = clas_etc.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_etc)
print('Accuracy baggin con extra forest: %.2f' % (accuracy*100))
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
clas_ada = AdaBoostClassifier(n_estimators=50)
y_pred_ada = clas_ada.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_ada)
print('Accuracy ada : %.2f' % (accuracy*100))
# Gradinte tree Boosting
from sklearn.ensemble import GradientBoostingClassifier
clas_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
max_depth=1, random_state=0)
y_pred_gb = clas_gb.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_gb)
print('Accuracy gb : %.2f' % (accuracy*100))
print(clas_gb.feature_importances_)
# hist gradiente
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
clas_hgb = HistGradientBoostingClassifier(max_iter=100).fit(X_train, y_train)
y_pred_hgb = clas_hgb.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_hgb)
print('Accuracy hgb : %.2f' % (accuracy*100))
# Voting Clasificador
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
eclf = VotingClassifier(
estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft')#'hard')
for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
# se puede hacer lo mismo pero buscando el mejor parametro
#stakingclasifcador
from sklearn.ensemble import StackingClassifier
estimators = [
('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
('svr', make_pipeline(StandardScaler(),
LinearSVC(random_state=42)))
]
clf = StackingClassifier(
estimators=estimators, final_estimator=LogisticRegression()
)
clf.fit(X_train, y_train).score(X_test, y_test)
Xc.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3010 entries, 7 to 17653 Data columns (total 47 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 p13 3010 non-null category 1 p120 3010 non-null category 2 p121 3010 non-null category 3 p122 3010 non-null category 4 p123 3010 non-null category 5 p125a 3010 non-null category 6 p125b 3010 non-null category 7 p125c 3010 non-null category 8 p125d 3010 non-null category 9 p125e 3010 non-null category 10 p126a 3010 non-null category 11 p126b 3010 non-null category 12 p126c 3010 non-null category 13 p126d 3010 non-null category 14 p126e 3010 non-null category 15 p126f 3010 non-null category 16 p126g 3010 non-null category 17 p126h 3010 non-null category 18 p126i 3010 non-null category 19 p126j 3010 non-null category 20 p126k 3010 non-null category 21 p126l 3010 non-null category 22 p126m 3010 non-null category 23 p126n 3010 non-null category 24 p126o 3010 non-null category 25 p126p 3010 non-null category 26 p126q 3010 non-null category 27 p126r 3010 non-null category 28 p127 3010 non-null category 29 p128a 3010 non-null category 30 p128b 3010 non-null category 31 p128c 3010 non-null category 32 p128d 3010 non-null category 33 p128e 3010 non-null category 34 p128f 3010 non-null category 35 p129a 3010 non-null category 36 p129b 3010 non-null category 37 p129c 3010 non-null category 38 p129d 3010 non-null category 39 p129e 3010 non-null category 40 p130a 3010 non-null category 41 p130b 3010 non-null category 42 p130c 3010 non-null category 43 p130d 3010 non-null category 44 p130e 3010 non-null category 45 p130f 3010 non-null category 46 p131 3010 non-null category dtypes: category(47) memory usage: 247.7 KB
Xc.describe()
p13 | p120 | p121 | p122 | p123 | p125a | p125b | p125c | p125d | p125e | ... | p129c | p129d | p129e | p130a | p130b | p130c | p130d | p130e | p130f | p131 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3010.0 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | ... | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 |
unique | 13.0 | 3 | 8 | 10 | 2 | 2 | 2 | 2 | 2 | 2 | ... | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 5 |
top | 4.0 | 2 | 2 | 3 | 1 | 1 | 1 | 1 | 1 | 2 | ... | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0 |
freq | 776.0 | 1901 | 1360 | 854 | 1955 | 2778 | 2969 | 2696 | 1604 | 2723 | ... | 2754 | 2860 | 2931 | 2638 | 2874 | 2897 | 2902 | 2914 | 2981 | 1825 |
4 rows × 47 columns
Xc.value_counts()
p13 p120 p121 p122 p123 p125a p125b p125c p125d p125e p126a p126b p126c p126d p126e p126f p126g p126h p126i p126j p126k p126l p126m p126n p126o p126p p126q p126r p127 p128a p128b p128c p128d p128e p128f p129a p129b p129c p129d p129e p130a p130b p130c p130d p130e p130f p131 4.0 2 1 1 1 1 1 1 2 2 1 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2.0 2 3 3 1 1 1 1 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 11.0 3 3 4 1 1 1 1 1 2 1 1 1 1 1 2 2 1 2 2 2 1 2 2 1 2 2 2 4 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2.0 2 3 4 1 1 1 1 1 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 1 1 2 1 1 1 2 2 1 2 1 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 .. 6.0 2 2 3 1 1 1 1 2 1 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 2 2 2 7 2 2 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 1 2 1 2 1 2 2 2 2 1 2 1 2 2 2 2 2 2 6 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 1 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1.0 1 1 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 1 Length: 3002, dtype: int64
Xc.value_counts(normalize= True)
p13 p120 p121 p122 p123 p125a p125b p125c p125d p125e p126a p126b p126c p126d p126e p126f p126g p126h p126i p126j p126k p126l p126m p126n p126o p126p p126q p126r p127 p128a p128b p128c p128d p128e p128f p129a p129b p129c p129d p129e p130a p130b p130c p130d p130e p130f p131 4.0 2 1 1 1 1 1 1 2 2 1 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0.000664 2.0 2 3 3 1 1 1 1 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0.000664 11.0 3 3 4 1 1 1 1 1 2 1 1 1 1 1 2 2 1 2 2 2 1 2 2 1 2 2 2 4 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 0.000664 2.0 2 3 4 1 1 1 1 1 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0.000664 1 1 2 1 1 1 2 2 1 2 1 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0.000664 ... 6.0 2 2 3 1 1 1 1 2 1 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 2 2 2 7 2 2 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 0.000332 1 2 1 2 1 2 1 2 2 2 2 1 2 1 2 2 2 2 2 2 6 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0.000332 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 0.000332 1 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 1 0.000332 1.0 1 1 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0.000332 Length: 3002, dtype: float64
Xc.dtypes
p13 category p120 category p121 category p122 category p123 category p125a category p125b category p125c category p125d category p125e category p126a category p126b category p126c category p126d category p126e category p126f category p126g category p126h category p126i category p126j category p126k category p126l category p126m category p126n category p126o category p126p category p126q category p126r category p127 category p128a category p128b category p128c category p128d category p128e category p128f category p129a category p129b category p129c category p129d category p129e category p130a category p130b category p130c category p130d category p130e category p130f category p131 category dtype: object
Xc.p127.cat.categories
Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')
Xc.p127.value_counts(dropna=False)
5 1118 1 559 2 513 4 364 7 168 6 100 9 96 8 46 3 46 Name: p127, dtype: int64
Mc = M.astype('category')
import seaborn as sns
sns.catplot(x = "p133",
data = Mc,
col="p13",
col_wrap=4,
kind = 'count',
hue = 'p122')
plt.show()
Mc.p13.cat.codes
7 2 13 9 23 10 26 1 28 1 .. 17627 1 17629 1 17631 2 17633 3 17653 3 Length: 3010, dtype: int8
Mc.p13
7 3.0 13 10.0 23 11.0 26 2.0 28 2.0 ... 17627 2.0 17629 2.0 17631 3.0 17633 4.0 17653 4.0 Name: p13, Length: 3010, dtype: category Categories (13, float64): [1.0, 2.0, 3.0, 4.0, ..., 10.0, 11.0, 12.0, 97.0]
codigos = Mc.p13.cat.codes
categorias = Mc.p13
mapa = dict(zip(codigos,categorias))
mapa
{2: 3.0, 9: 10.0, 10: 11.0, 1: 2.0, 6: 7.0, 3: 4.0, 4: 5.0, 5: 6.0, 7: 8.0, 11: 12.0, 12: 97.0, 8: 9.0, 0: 1.0}
a = StandardScaler().fit(M)
M_scaled = a.transform(M)
M_sca = pd.DataFrame(M_scaled)
M_sca.describe()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | ... | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 |
mean | 6.705600e-17 | 1.583672e-15 | 3.205114e-15 | 1.178146e-15 | 1.643462e-15 | -1.311612e-16 | -1.701066e-15 | -1.516690e-16 | 2.397270e-15 | 3.677383e-16 | ... | 2.397123e-16 | 9.867891e-16 | 3.595131e-16 | -4.730251e-16 | 2.076597e-16 | 5.109423e-16 | 2.085449e-16 | 4.261357e-16 | -3.061412e-17 | 1.306522e-15 |
std | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | ... | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 |
min | -4.187076e-01 | -2.488140e+00 | -1.145163e+00 | -1.845989e+00 | -7.346033e-01 | -2.889867e-01 | -1.175132e-01 | -3.412753e-01 | -9.362471e-01 | -3.080228e+00 | ... | -4.366539e+00 | -6.091081e+00 | -2.662968e+00 | -4.596994e+00 | -5.063316e+00 | -5.183664e+00 | -5.509462e+00 | -1.013869e+01 | -7.177844e-01 | -9.940309e-01 |
25% | -3.542515e-01 | -5.819803e-01 | -1.145163e+00 | -4.542729e-01 | -7.346033e-01 | -2.889867e-01 | -1.175132e-01 | -3.412753e-01 | -9.362471e-01 | 3.246513e-01 | ... | 2.290143e-01 | 1.641745e-01 | 3.755209e-01 | 2.175335e-01 | 1.974990e-01 | 1.929137e-01 | 1.815059e-01 | 9.863204e-02 | -7.177844e-01 | -9.940309e-01 |
50% | -2.253393e-01 | -5.819803e-01 | -7.667550e-02 | 2.415853e-01 | -7.346033e-01 | -2.889867e-01 | -1.175132e-01 | -3.412753e-01 | -9.362471e-01 | 3.246513e-01 | ... | 2.290143e-01 | 1.641745e-01 | 3.755209e-01 | 2.175335e-01 | 1.974990e-01 | 1.929137e-01 | 1.815059e-01 | 9.863204e-02 | -7.177844e-01 | -1.495395e-01 |
75% | -9.642716e-02 | 1.324179e+00 | 9.918119e-01 | 9.374435e-01 | 1.361279e+00 | -2.889867e-01 | -1.175132e-01 | -3.412753e-01 | 1.068094e+00 | 3.246513e-01 | ... | 2.290143e-01 | 1.641745e-01 | 3.755209e-01 | 2.175335e-01 | 1.974990e-01 | 1.929137e-01 | 1.815059e-01 | 9.863204e-02 | 8.123368e-01 | 6.949519e-01 |
max | 5.769076e+00 | 1.324179e+00 | 6.334249e+00 | 4.416734e+00 | 1.361279e+00 | 3.460367e+00 | 8.509679e+00 | 2.930186e+00 | 1.068094e+00 | 3.246513e-01 | ... | 2.290143e-01 | 1.641745e-01 | 3.755209e-01 | 2.175335e-01 | 1.974990e-01 | 1.929137e-01 | 1.815059e-01 | 9.863204e-02 | 5.402700e+00 | 3.228426e+00 |
8 rows × 48 columns
def histograma(ingresoc,c1,c2):
a = df2[df2.p133 == ingresoc]
L = [c1,c2]
a[L].plot.hist()
plt.show()
histograma(1,'p13','p131')
histograma(2,'p13','p131')
histograma(3,'p13','p131')
histograma(4,'p13','p131')
histograma(5,'p13','p131')
histograma(6,'p13','p131')
histograma(7,'p13','p131')
print(df2.p133.unique())
import seaborn as sns
def filtro(ingresoc,c1,c2):
a = df2[df2.p133 == ingresoc]
L = [c1,c2]
return a[L]
sns.pairplot(filtro(1,'p13','p131'))
<seaborn.axisgrid.PairGrid at 0x7fb42182d2b0>
X_train, X_test, y_train, y_test = train_test_split(pd.get_dummies(Xc), yc, test_size=0.2, random_state=0)
y_train=y_train.to_numpy().ravel()
y_test=y_test.to_numpy().ravel()
from sklearn.feature_selection import VarianceThreshold
# feature selection
def select_var(X_train, X_test,tres):
fs = VarianceThreshold(threshold=(tres * (1 - tres)))
#mutual_info_classif
#chi2
X_train_fs = fs.fit_transform(X_train)
X_test_fs = fs.fit_transform(X_test)
return X_train_fs, X_test_fs, fs
X_train_fs, X_test_fs, fs = select_var(X_train, X_test, 0.8)
print(X_train_fs.shape)
print(X_train.shape)
print(X_test_fs.shape)
logistica(X_train_fs, y_train, X_test_fs, y_test)
arbolbos(X_train_fs, y_train, X_test_fs, y_test)
(1752, 6) (1752, 305) (439, 6) Accuracy reg log multi: 27.11 Accuracy reg log ovr: 27.11 Accuracy bosque: 27.79 Accuracy arbol: 27.79
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif
def select_uni(X_train,y_train, X_test,i,k):
metodo = (chi2, mutual_info_classif, f_classif)
fs = SelectKBest(score_func= metodo[i], k=k)
#mutual_info_classif
#chi2
fs.fit(X_train, y_train)
X_train_fs = fs.transform(X_train)
X_test_fs = fs.transform(X_test)
plt.bar([i for i in range(len(fs.scores_))], fs.scores_)
plt.show()
return X_train_fs, X_test_fs, fs
X_train_fs, X_test_fs, fs = select_uni(X_train, y_train, X_test, 2, 60)
print(X_train_fs.shape)
print(X_train.shape)
print(X_test_fs.shape)
logistica(X_train_fs, y_train, X_test_fs, y_test)
arbolbos(X_train_fs, y_train, X_test_fs, y_test)
/Users/rafamtz/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_selection/_univariate_selection.py:114: UserWarning: Features [ 50 64 66 67 74 87 99 104 142 144 190 201 205 216 217 251 261 265 276 283] are constant. warnings.warn("Features %s are constant." % constant_features_idx, /Users/rafamtz/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_selection/_univariate_selection.py:116: RuntimeWarning: invalid value encountered in true_divide f = msb / msw
(1752, 60) (1752, 305) (439, 60) Accuracy reg log multi: 29.61 Accuracy reg log ovr: 28.93 Accuracy bosque: 29.16 Accuracy arbol: 27.56
from sklearn.feature_selection import RFE
estimator = SVC(kernel="linear")
selector = RFE(estimator, n_features_to_select=None, step=1)
selector = selector.fit(X_train, y_train)
#selector.support_
#selector.ranking_
X_train_fs = selector.transform(X_train)
X_test_fs = selector.transform(X_test)
logistica(X_train_fs, y_train, X_test_fs, y_test)
arbolbos(X_train_fs, y_train, X_test_fs, y_test)
Accuracy reg log multi: 28.47 Accuracy reg log ovr: 28.02 Accuracy bosque: 27.79 Accuracy arbol: 26.20