Factors that Torche uses to construct economic indices:
personal computer, stove, washing machine, refrigerator, cellular phone, landline phone, internet access, inside toilet, electricity, domestic service, cable or satellite TV, shop or business, land or farm, second residence, animals, agricultural machinery or equipment, savings account, checking account, credit card, and cars.
(A hedged PCA sketch of an asset index built from these items appears after the variable lists below.)
Recommended path (scikit-learn estimator map): https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
#conda list
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
#df1 = pd.read_stata('Datos/ESRU-EMOVI-2017/ESRU-EMOVI-2017-Entrevistado.dta', iterator = True)
# df1.variable_labels()
# df1.values.labels()
df = pd.read_stata('Datos/ESRU-EMOVI-2017/ESRU-EMOVI-2017-Entrevistado.dta',
convert_categoricals= False)
df.dtypes
df
folio | Estado | folio_ageb | consecutivo | Origen | Latitud | Longitud | LatitudGP | LongitudGP | recontacto | ... | region | cdmx | tot_int | rururb | cmo1_2 | cmo2_2 | cmo3_2 | cmo4_2 | cmo5_2 | tamhog | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0100100010286020830102 | 1.0 | 0100100010286 | 1 | 1.0 | 21.901323 | -102.310598 | 21.901477 | -102.310429 | 2 | ... | 3.0 | NaN | 5.0 | 0.0 | 41 | . | . | 13 | . | 5.0 |
1 | 0100100010286020850201 | 1.0 | 0100100010286 | 1 | 2.0 | 21.901323 | -102.310598 | 21.900773 | -102.311138 | 1 | ... | 3.0 | NaN | 1.0 | 0.0 | 41 | . | . | 41 | . | 1.0 |
2 | 0100100010286025830201 | 1.0 | 0100100010286 | 1 | 1.0 | 21.900830 | -102.311818 | 21.900549 | -102.313361 | 1 | ... | 3.0 | NaN | 2.0 | 0.0 | 81 | . | . | 11 | . | 2.0 |
3 | 0100100010286025840101 | 1.0 | 0100100010286 | 1 | 1.0 | 21.901188 | -102.310700 | 21.900765 | -102.313144 | 1 | ... | 3.0 | NaN | 1.0 | 0.0 | 52 | . | . | . | . | 1.0 |
4 | 0100100010286025850101 | 1.0 | 0100100010286 | 1 | 2.0 | 21.901188 | -102.310700 | 21.900577 | -102.312733 | 1 | ... | 3.0 | NaN | 2.0 | 0.0 | 52 | . | . | . | . | 2.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17660 | 3205700010022019460402 | 32.0 | 3205700010022 | 2 | 1.0 | 22.755409 | -102.513985 | 22.755409 | -102.513985 | 1 | ... | 2.0 | NaN | 4.0 | 0.0 | 71 | . | . | . | 71 | 4.0 |
17661 | 3205700010022025450501 | 32.0 | 3205700010022 | 2 | 1.0 | 22.288405 | -101.577532 | 22.288405 | -101.577532 | 1 | ... | 2.0 | NaN | 4.0 | 0.0 | . | . | . | 82 | 71 | 4.0 |
17662 | 3205700010022025460301 | 32.0 | 3205700010022 | 2 | 1.0 | 22.758625 | -102.499375 | 22.758625 | -102.499375 | 1 | ... | 2.0 | NaN | 6.0 | 0.0 | . | . | . | 53 | 52 | 6.0 |
17663 | 3205700010022025460302 | 32.0 | 3205700010022 | 2 | 1.0 | 22.755420 | -102.513997 | 22.755420 | -102.513997 | 1 | ... | 2.0 | NaN | 5.0 | 0.0 | . | 52 | . | 53 | 62 | 5.0 |
17664 | 3205700010022025460501 | 32.0 | 3205700010022 | 7 | 1.0 | 22.758625 | -102.499375 | 22.758625 | -102.499375 | 1 | ... | 2.0 | NaN | 10.0 | 0.0 | . | . | . | . | 41 | 10.0 |
17665 rows × 366 columns
# schooling
#df.dropna(subset=['p13'])
#print(df['p13'].unique(), type(df[df.p13 != np.nan]['p13'].unique()[-1]) )
#l = 0
#for i in df['p13']:
# if np.isnan(i):
# df.loc[l,'p13']=1
# l += 1
#print(df['p13'].unique(), type(df[df.p13 != np.nan]['p13'].unique()[-1]), np.isnan(df[df.p13 != np.nan]['p13'].unique()[-1]) )
#df
from sklearn.model_selection import train_test_split
# question p02, do they share the same food budget: 1) Yes 2) No
df2 = df[df.p02 == 1 ]
# age
df2 = df2[(25<=df2.p05) & (df2.p05<=50)]
# question p08, is this person the head of the household: 1) Yes
#df2 = df2[df2.p08 == 1 ]
# question p12, currently studying: 1) Yes 2) No
df2 = df2[df2.p12 == 2 ]
# years completed (p14) could also be important
# p13, school level 1-12; 97 = did not attend school
#df2 = df2[df2.p13 != 97 ] # drop those who did not attend school
#print(df2.p13.unique())
# p63, public or private school, 8 = does not apply
# so if it does not apply we keep the highest level
# this variable is more complicated, so we do not use it for now
# (one question per case)
# main economic provider
df2=df2[(df2.p26==1) | (df2.p26==2)]
# information on the main provider
df2 = df2[((~df2.p43.isna()) & (df2.p43 != 98) & (df2.p26 == 1) &
           (df2.cmo1_2 != ".") & (~df2.p38_11.isna())) |
          ((~df2.p43m.isna()) & (df2.p43m != 98) & (df2.p26 == 2) &
           (df2.cmo2_2 != ".") & (~df2.p38m_11.isna()) & (~df2.p38_11.isna()))]
print('shape', df2.shape)
shape (8783, 366)
df2 = df2[(~np.isnan(df2.p13) & (~df2.p13.isna()))]
df2 = df2[df2.SINCO3!=" "]
# p68, people who work, 1. Yes, 2. No
# p69, business / on vacation, 1. Yes, 2. No
df2 = df2[(df2.p68 == 1) | (df2.p69 == 1)] # drop the non-employed
# p120, housing material, options: 1, 2, 3
# p121, number of bedrooms, free answer
# p122, total number of rooms, free answer
# p123, home owned by the respondent or spouse, 1. Yes, 2. No
# p125, basic housing services,
# 125a - 125e (one question per case, 1. Yes, 2. No)
# p126, items owned by the household
# 126a - 126r (one question per case, 1. Yes, 2. No)
# p127, loan variable 1-9, 8 is "other"
#df2 = df2[df2.p127 != 8 ] # drop the "other" option
# p128, savings, credit card, bank account,
# a-f (one question per case) 1. Yes 2. No
# p129, belongings owned by the respondent or spouse,
# a-e (one question per case) 1. Yes, 2. No
# p130, economic support from programs or other sources,
# a-f (one question per case)
# p131, number of cars owned, free answer
# p132, number of household members who contribute income
df2 = df2[df2.p132 == 1] # a single provider
# Income cohort for everyone who contributes income
df2 = df2[((df2.p133 != 8) & (df2.p133 != 9)) & (~df2.p133.isna())] # income not reported or not available
print('valores cohort de ingreso',df2.p133.unique())
print('shape', df2.shape)
print("unique",df2.p43.unique())
# p134, neighborhood conditions
# a-i (one question per case) 1. Yes, 2. No, 8 = no answer
p = "p134"
#for _ in "abcdefghi":
#    df2 = df2[df2[p+_] != 8]
# p147, perception from 1 = poorest to 10 = richest
#df2.p08.plot.hist()
#df2.p133.plot.hist()
Estado = ["Estado"]
p5 = ['p05']
p6 = ['p06']
p13 = ['p13']
SINCO = ['SINCO3']
p63 = ['p63a','p63b','p63c','p63d'] # excluded from the estimation for now
p120 = ['p120']
p121 = ['p121']
p122 = ['p122']
p123 = ['p123']
p125 = ['p125a','p125b','p125c','p125d','p125e']
import string
p126 = ['p126' + i for i in string.ascii_lowercase[0:18] ]
p127 = ['p127']
p128 = ['p128a','p128b','p128c','p128d','p128e','p128f']
p129 = ['p129a','p129b','p129c','p129d','p129e']
p130 = ['p130a','p130b','p130c','p130d','p130e', 'p130f']
p131 = ['p131']
p132 = ['p132']
p134 = ['p134a','p134b','p134c','p134d','p134e', 'p134f','p134g','p134h','p134i']
p147 = ['p147']
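The asset items listed at the top (the Torche factors) correspond to the p126 ownership questions. As a hedged illustration only, and not Torche's actual methodology, the first principal component of those yes/no items can serve as a rough one-dimensional asset index; the sketch assumes the df2 filter and the p126 list defined above.
from sklearn.decomposition import PCA
# Recode the ownership items: 1 = Yes -> 1, anything else (2 = No, NaN) -> 0
owned = (df2[p126] == 1).astype(int)
# First principal component as a rough one-dimensional asset index
# (illustrative sketch; the exact index construction may differ)
asset_index = PCA(n_components=1).fit_transform(owned)[:, 0]
print(pd.Series(asset_index, index=df2.index).describe())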
#indexX = p13 + p120 + p121 + p122 + p123 + p125 + p126 + p127 + p128 + p129 + p130 + p131
#indexX = p13 + p121 + p122 + p127 + p131
#indexX = p13 + p131
indexX = p5 + p6 + p13 + SINCO + ['region']
indexY = ['p133']
#M = df2[indexX+indexY].dropna()
M = df2[indexX+indexY]
#M.loc[M.p133 == 2,'p133'] = 1
#M.loc[M.p133 != 1,'p133'] = M.p133 - 1
#a = StandardScaler().fit(M)
#M_scaled = a.transform(M)
#M_sca = pd.DataFrame(M_scaled)
#M_sca.describe()
#ax = StandardScaler().fit(M[indexX])
#X = ax.transform(M[indexX])
#M.SINCO3=pd.to_numeric(M.SINCO3)
X = M[indexX].to_numpy()
Xc = M[indexX].astype('category')
y = M[indexY].to_numpy()
yc = M[indexY].astype('category')
y = np.ravel(y)
y = y-1;
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc, yc, test_size=0.2, random_state=0)
# dummies
X_train, X_test, y_train, y_test = train_test_split(pd.get_dummies(Xc), yc, test_size=0.2, random_state=21, stratify=yc)
y_train=y_train.to_numpy().ravel()
y_test=y_test.to_numpy().ravel()
print('valores unicos para claseificar',yc.p133.unique())
print('valores del entrenamiento',np.unique(y_train))
print('valores del test',np.unique(y_test))
df2.loc[df2.p26==1,"p43"].unique()
X_train
valores cohort de ingreso [4 3 5 1 2 6 7]
shape (2191, 366)
unique [11. nan 6. 2. 98. 12. 4. 10. 9. 3. 1. 8. 5. 7.]
valores unicos para claseificar [4, 3, 5, 1, 2, 6, 7]
Categories (7, int64): [4, 3, 5, 1, 2, 6, 7]
valores del entrenamiento [1 2 3 4 5 6 7]
valores del test [1 2 3 4 5 6 7]
p05_25.0 | p05_26.0 | p05_27.0 | p05_28.0 | p05_29.0 | p05_30.0 | p05_31.0 | p05_32.0 | p05_33.0 | p05_34.0 | ... | SINCO3_9722 | SINCO3_9843 | SINCO3_9899 | SINCO3_9998 | SINCO3_9999 | region_1.0 | region_2.0 | region_3.0 | region_4.0 | region_5.0 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9702 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1432 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
1391 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3540 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
6772 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6286 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
12819 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
12353 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1845 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1414 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
1752 rows × 305 columns
scikit-learn glossary https://scikit-learn.org/stable/glossary.html#glossary
SVC documentation https://scikit-learn.org/stable/modules/svm.html#classification
LinearSVC documentation https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
clas_linsvm = LinearSVC(max_iter = 3000, penalty = "l2",
loss = "squared_hinge", dual = True,
C = 0.01)
y_pred_linsvm = clas_linsvm.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_linsvm)
print('Accuracy: %.2f' % (accuracy*100))
# we gain NOTHING by using the categorical encoding
#y_pred_linsvmc = clas_linsvm.fit(Xc_train, yc_train).predict(Xc_test)
#accuracy = accuracy_score(yc_test, y_pred_linsvmc)
#print('Accuracy: %.2f' % (accuracy*100))
Accuracy: 31.21
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import itertools
def matrizviz(matrix):
    plt.clf()
    # place labels at the top
    plt.gca().xaxis.tick_top()
    plt.gca().xaxis.set_label_position('top')
    # plot the matrix per se
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    # plot colorbar to the right
    plt.colorbar()
    fmt = 'd'
    class_names = ['1', '2', '3', '4', '5', '6', '7']
    # write the number of predictions in each bucket
    thresh = matrix.max() / 2.
    for i, j in itertools.product(range(matrix.shape[0]), range(matrix.shape[1])):
        # if background is dark, use a white number, and vice-versa
        plt.text(j, i, format(matrix[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if matrix[i, j] > thresh else "black")
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)
    plt.tight_layout()
    plt.ylabel('Intervalo registrado', size=14)
    plt.xlabel('Intervalo predecido', size=14)
    plt.show()
matrix = confusion_matrix(y_test,y_pred_linsvm)
matrizviz(matrix)
matrix = confusion_matrix(y_test,y_pred_linsvm, normalize = 'true')
disp = ConfusionMatrixDisplay(confusion_matrix=matrix)
disp.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f87aec3c1f0>
Nearest neighbors documentation https://scikit-learn.org/stable/modules/neighbors.html
KNeighborsClassifier documentation https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
RadiusNeighborsClassifier documentation https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsClassifier.html#sklearn.neighbors.RadiusNeighborsClassifier
# k-nearest neighbors classification
from sklearn.neighbors import KNeighborsClassifier
clas_neigk = KNeighborsClassifier(n_neighbors = 5, weights='distance',
leaf_size = 10, p = 2,
algorithm = 'ball_tree')#'kd_tree')#'brute')
y_pred_neigk = clas_neigk.fit(X_train, y_train).predict(X_test)
# no improvement with categoricals
accuracy = accuracy_score(y_test, y_pred_neigk)
print('Accuracy k vecinos: %.2f' % (accuracy*100))
### search for the best number of neighbors
neighbors = np.arange(1, 150)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
# Loop over different values of k
for i, k in enumerate(neighbors):
    # Setup a k-NN Classifier with k neighbors: knn
    knn = KNeighborsClassifier(n_neighbors = k, algorithm='brute')
    # Fit the classifier to the training data
    knn.fit(X_train, y_train)
    # Compute accuracy on the training set
    train_accuracy[i] = knn.score(X_train, y_train)
    # Compute accuracy on the testing set
    test_accuracy[i] = knn.score(X_test, y_test)
# Generate plot
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label = 'Testing Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()
# radius neighbors classification
from sklearn.neighbors import RadiusNeighborsClassifier
clas_neigr = RadiusNeighborsClassifier(radius = 4, weights='distance',
leaf_size = 10, p = 2,
outlier_label = 'most_frequent',
algorithm = 'ball_tree')#'kd_tree')#'brute')
y_pred_neigr = clas_neigr.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_neigr)
print('Accuracy r vecinos: %.2f' % (accuracy*100))
# nearest centroid classification
from sklearn.neighbors import NearestCentroid
clas_neigc = NearestCentroid(shrink_threshold = 0.01)
y_pred_neigc= clas_neigc.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_neigc)
print('Accuracy centroide: %.2f' % (accuracy*100))
# shrunken nearest-centroid classification
for shrinkage in np.linspace(0, 1.0, num=5):
    clas_neigce = NearestCentroid(shrink_threshold=shrinkage)
    y_pred_neigce = clas_neigce.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_neigce)
    print(shrinkage,'Accuracy centroide encogido: %.2f' % (accuracy*100))
# precomputed neighbors transformation
from sklearn.manifold import Isomap
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import make_pipeline
clas_neigt = make_pipeline(
KNeighborsTransformer(n_neighbors=5))
# did not work: the pipeline ends in a transformer, so it cannot predict
#y_pred_neigt= clas_neigt.fit(X_train, y_train).predict(X_test)
#accuracy = accuracy_score(y_test, y_pred_neigt)
#print('Accuracy: %.2f' % (accuracy*100))
# classification with neighborhood components analysis (NCA)
from sklearn.neighbors import NeighborhoodComponentsAnalysis
nca = NeighborhoodComponentsAnalysis( max_iter = 100)
nca.fit(X_train, y_train)
clas_neigk.fit(nca.transform(X_train), y_train)
print(clas_neigk.score(nca.transform(X_test), y_test))
# classification combining NCA with the k-neighbors classifier
from sklearn.pipeline import Pipeline
nca = NeighborhoodComponentsAnalysis(random_state=42, max_iter = 100)
nca_pipe = Pipeline([('nca', nca), ('knn', clas_neigk)])
nca_pipe.fit(X_train, y_train)
y_pred_nca = nca_pipe.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nca)
print('Accuracy combinado: %.2f' % (accuracy*100))
#print(nca_pipe.score(X_test, y_test))
# classification combining NCA with reduced components and the k-neighbors classifier
from sklearn.pipeline import Pipeline
nca = NeighborhoodComponentsAnalysis(random_state=42, max_iter = 100,
n_components=2)
nca_pipe = Pipeline([('nca', nca), ('knn', clas_neigk)])
nca_pipe.fit(X_train, y_train)
y_pred_nca = nca_pipe.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nca)
print('Accuracy combinado: %.2f' % (accuracy*100))
#print(nca_pipe.score(X_test, y_test))
Accuracy k vecinos: 30.07
Accuracy r vecinos: 29.38
Accuracy centroide: 27.56
0.0 Accuracy centroide encogido: 27.56
0.25 Accuracy centroide encogido: 25.97
0.5 Accuracy centroide encogido: 25.06
0.75 Accuracy centroide encogido: 25.06
1.0 Accuracy centroide encogido: 24.83
0.2642369020501139
Accuracy combinado: 29.38
Accuracy combinado: 28.25
SVM documentation https://scikit-learn.org/stable/modules/svm.html#classification
# Non-linear support vector classification (SVC)
from sklearn.svm import SVC
clas_nlinsvm = SVC(max_iter = -1, C = 1,
kernel = 'rbf', gamma = 'auto')
y_pred_nlinsvm = clas_nlinsvm.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nlinsvm)
print('Accuracy SVM no lineal : %.2f' % (accuracy*100))
# Non-linear SVC with scaling
from sklearn.preprocessing import StandardScaler
clas_nelinsvm = make_pipeline(StandardScaler(), SVC(max_iter = -1, C = 1,
kernel = 'rbf',gamma='auto'))
y_pred_nelinsvm = clas_nelinsvm.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nelinsvm)
print('Accuracy SVM no lineal datos escalados: %.2f' % (accuracy*100))
# Support vector classification with a parameter grid search
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
scaler = StandardScaler()
X_trainsvmp = scaler.fit_transform(X_train)
C_range = np.logspace(-2, 3, 13)
gamma_range = np.logspace(-9, 3, 13)
#param_grid = dict(gamma=gamma_range, C=C_range)
#cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
#grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
#grid.fit(X_trainsvmp, y_train)
#print("The best parameters are %s with a score of %0.2f"
# % (grid.best_params_, grid.best_score_))
# The best parameters
# are {'C': 1.2115276586285888, 'gamma': 0.01} with a score of 0.37
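As a quick, hedged alternative to the full (commented-out) search above, a reduced grid around the quoted values can be run in reasonable time; this is only a sketch and assumes the scaled X_trainsvmp and y_train defined above.
# Hypothetical reduced grid around the best values quoted above
param_grid_small = dict(gamma=[0.001, 0.01, 0.1], C=[0.1, 1.2115, 10])
cv_small = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
grid_small = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid_small, cv=cv_small, n_jobs=-1)
grid_small.fit(X_trainsvmp, y_train)
print("Best (reduced grid):", grid_small.best_params_, grid_small.best_score_)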
# Non-linear SVC with the tuned parameters
from sklearn.svm import SVC
clas_nlinsvmpa = SVC(max_iter = -1, C = 1.2115,
kernel = 'rbf', gamma = 0.01)
y_pred_nlinsvmpa = clas_nlinsvmpa.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nlinsvmpa)
print('Accuracy SVM no lineal para ajustdos: %.2f' % (accuracy*100))
# Non-linear SVC with tuned parameters and class weights
from sklearn.svm import SVC
clas_nlinsvmpap = SVC(max_iter = -1, C = 1.2115,
kernel = 'rbf', gamma = 0.01,
class_weight={1: 500, 2:100})
y_pred_nlinsvmpap = clas_nlinsvmpap.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nlinsvmpap)
print('Accuracy SVM no lineal para ajus pesados : %.2f' % (accuracy*100))
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_nlinsvm ))
# Scaled data and tuned parameters
clas_nlsvmpae = make_pipeline(StandardScaler(),
SVC(max_iter = -1, C = 1.2115,
kernel = 'rbf', gamma = 0.01))
y_pred_nlsvmpae = clas_nlsvmpae.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nlsvmpae)
print('Accuracy SVM no lineal datos escalados y ajustados: %.2f' % (accuracy*100))
Accuracy SVM no lineal : 26.88
Accuracy SVM no lineal datos escalados: 29.84
Accuracy SVM no lineal para ajustdos: 28.47
Accuracy SVM no lineal para ajus pesados : 19.82
              precision    recall  f1-score   support
           1       0.00      0.00      0.00        81
           2       0.00      0.00      0.00        73
           3       0.27      1.00      0.42       118
           4       0.00      0.00      0.00        94
           5       0.00      0.00      0.00        52
           6       0.00      0.00      0.00        17
           7       0.00      0.00      0.00         4
    accuracy                           0.27       439
   macro avg       0.04      0.14      0.06       439
weighted avg       0.07      0.27      0.11       439
/Users/rafamtz/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
Accuracy SVM no lineal datos escalados y ajustados: 31.21
Documentation https://scikit-learn.org/stable/modules/sgd.html#classification
# stochastic gradient descent
from sklearn.linear_model import SGDClassifier
clas_sgd = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200)
y_pred_sgd = clas_sgd.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_sgd)
print('Accuracy sgd: %.2f' % (accuracy*100))
# averaged stochastic gradient descent
clas_sgda = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200,
average=True)#, class_weight={1: 50,2:100,3:50})
y_pred_sgda = clas_sgda.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_sgda)
print('Accuracy sgd average: %.2f' % (accuracy*100))
Accuracy sgd: 28.47
Accuracy sgd average: 29.84
Perceptron documentation https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html#sklearn.linear_model.Perceptron
Perceptron is a classification algorithm which shares the same underlying implementation with SGDClassifier. In fact, Perceptron() is equivalent to SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", penalty=None).
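A small, hedged check of that documented equivalence on the split already defined above (assuming the dummy-encoded X_train / y_train / X_test): with the same random_state, the two models should yield identical predictions. The per_chk / sgd_chk names are illustrative.
from sklearn.linear_model import Perceptron, SGDClassifier
per_chk = Perceptron(random_state=0)
sgd_chk = SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant",
                        penalty=None, random_state=0)
per_chk.fit(X_train, y_train)
sgd_chk.fit(X_train, y_train)
# should print True if the two implementations coincide
print(np.array_equal(per_chk.predict(X_test), sgd_chk.predict(X_test)))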
from sklearn.linear_model import Perceptron
clas_per = Perceptron(tol=1e-3, random_state=0)
y_pred_per = clas_per.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_per)
print('Accuracy perceptron: %.2f' % (accuracy*100))
Accuracy perceptron: 24.60
PassiveAggressiveClassifier documentation https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html#sklearn.linear_model.PassiveAggressiveClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
clas_pac = PassiveAggressiveClassifier(max_iter=1000, random_state=0,tol=1e-3)
y_pred_pac = clas_pac.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_pac)
print('Accuracy passive aggressive: %.2f' % (accuracy*100))
Accuracy passive aggressive: 22.10
Discriminant analysis documentation https://scikit-learn.org/stable/modules/lda_qda.html
# linear discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clas_dl = LinearDiscriminantAnalysis()
y_pred_dl = clas_dl.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_dl)
print('Accuracy LDA: %.2f' % (accuracy*100))
# quadratic discriminant analysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clas_dc = QuadraticDiscriminantAnalysis()
y_pred_dc = clas_dc.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_dc)
print('Accuracy QDA: %.2f' % (accuracy*100))
Accuracy LDA: 31.66
Accuracy QDA: 31.66
Gaussian process documentation https://scikit-learn.org/stable/modules/gaussian_process.html
# takes a long time and gives poor results in its simplest use
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
kernel = 1.0 * RBF(1.0)
clas_gpc = GaussianProcessClassifier(kernel=kernel,random_state=0)
y_pred_gpc = clas_gpc.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_gpc)
print('Accuracy gaussian process: %.2f' % (accuracy*100))
Kernel approximation documentation https://scikit-learn.org/stable/modules/kernel_approximation.html
from sklearn.kernel_approximation import RBFSampler
rbf_feature = RBFSampler(gamma=1, random_state=1)
Xtrain_features = rbf_feature.fit_transform(X_train)
clas_ak = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200,
average=True)
Xtest_features = rbf_feature.transform(X_test)  # transform only, reuse the fit from the training data
y_pred_ak = clas_ak.fit(Xtrain_features, y_train).predict(Xtest_features)
accuracy = accuracy_score(y_test, y_pred_ak)
print('Accuracy kernel: %.2f' % (accuracy*100))
Accuracy kernel: 26.65
Naive Bayes documentation https://scikit-learn.org/stable/modules/naive_bayes.html
from sklearn.naive_bayes import GaussianNB
clas_nb = GaussianNB()
y_pred_nb = clas_nb.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nb)
print('Accuracy bayes: %.2f' % (accuracy*100))
# multinomial naive Bayes
from sklearn.naive_bayes import MultinomialNB
clas_mb = MultinomialNB()
y_pred_mb = clas_mb.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_mb)
print('Accuracy bayes multinomial: %.2f' % (accuracy*100))
# complement naive Bayes
from sklearn.naive_bayes import ComplementNB
clas_cb = ComplementNB()
y_pred_cb = clas_cb.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_cb)
print('Accuracy bayes complemento: %.2f' % (accuracy*100))
# categorical naive Bayes did not work
#from sklearn.naive_bayes import CategoricalNB
#clas_cab = CategoricalNB()
#y_pred_cab = clas_cab.fit(Xc_train, yc_train).predict(Xc_test)
#accuracy = accuracy_score(y_test, y_pred_cab)
#print('Accuracy bayes categorico: %.2f' % (accuracy*100))
Accuracy bayes: 12.76
Accuracy bayes multinomial: 32.35
Accuracy bayes complemento: 31.66
Linear models documentation https://scikit-learn.org/stable/modules/linear_model.html#classification
from sklearn.linear_model import RidgeClassifier
clas_ridge = RidgeClassifier(max_iter = 100, normalize = True)
y_pred_ridge = clas_ridge.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_ridge)
print('Accuracy ridge: %.2f' % (accuracy*100))
Accuracy ridge: 32.12
Logistic regression documentation https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
# multinomial logistic regression
from sklearn.linear_model import LogisticRegression
def logistica(X_train, y_train, X_test, y_test):
    y_pred_rlmulti = LogisticRegression(solver='saga',
                                        max_iter=200,
                                        random_state=42,
                                        multi_class='multinomial').fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_rlmulti)
    print('Accuracy reg log multi: %.2f' % (accuracy*100))
    # one-vs-rest (ovr) logistic regression
    y_pred_rlovr = LogisticRegression(solver='liblinear',
                                      max_iter=200,
                                      random_state=42,
                                      multi_class='ovr').fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_rlovr)
    print('Accuracy reg log ovr: %.2f' % (accuracy*100))
logistica(X_train, y_train, X_test, y_test)
Accuracy reg log multi: 33.26
Accuracy reg log ovr: 33.26
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
def arbolbos(X_train, y_train, X_test, y_test):
    clas_tree = DecisionTreeClassifier()
    clas_forest = RandomForestClassifier(max_depth=14, max_features=None, min_samples_leaf=20)
    y_pred_tree = clas_tree.fit(X_train, y_train).predict(X_test)
    y_pred_forest = clas_forest.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_forest)
    print('Accuracy bosque: %.2f' % (accuracy*100))
    accuracy = accuracy_score(y_test, y_pred_tree)
    print('Accuracy arbol: %.2f' % (accuracy*100))
    #print(np.unique(y_test))
    #y_pred_tree.shape
    #y_pred_forest.shape
arbolbos(X_train, y_train, X_test, y_test)
Accuracy bosque: 31.66
Accuracy arbol: 27.33
clas_tree = DecisionTreeClassifier()
clas_forest = RandomForestClassifier( max_depth = 14,max_features=None, min_samples_leaf = 20)
y_pred_tree = clas_tree.fit(X_train, y_train).predict(X_test)
y_pred_forest = clas_forest.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_forest)
print('Accuracy bosque: %.2f' % (accuracy*100))
accuracy = accuracy_score(y_test, y_pred_tree)
print('Accuracy arbol: %.2f' % (accuracy*100))
validacion = pd.DataFrame({'Actual': y_test,'Predicción': y_pred_tree, 'Predicción F': y_pred_forest,
'Diferencia': y_test-y_pred_forest})
validacion
a = np.abs(y_test-y_pred_forest)
print(a.mean())
accuracy = accuracy_score(y_test, y_pred_forest)
print('Accuracy: %.2f' % (accuracy*100))
accuracy = accuracy_score(y_test, y_pred_tree)
print('Accuracy: %.2f' % (accuracy*100))
Accuracy bosque: 31.21
Accuracy arbol: 27.11
1.0660592255125285
Accuracy: 31.21
Accuracy: 27.11
matrix = confusion_matrix(y_test,y_pred_forest)
matrizviz(matrix)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
# feature selection
def select_features(X_train, y_train, X_test):
    fs = SelectKBest(score_func=chi2, k=40)
    #mutual_info_classif
    #chi2
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
# what are scores for the features
for i in range(len(fs.scores_)):
    print('Feature %d: %f' % (i, fs.scores_[i]))
# plot the scores
plt.bar([i for i in range(len(fs.scores_))], fs.scores_)
plt.show()
Feature 0: 7.758087 Feature 1: 3.472157 Feature 2: 2.595307 Feature 3: 6.248632 Feature 4: 4.726717 Feature 5: 6.306663 Feature 6: 4.133533 Feature 7: 10.739050 Feature 8: 3.123382 Feature 9: 5.434297 Feature 10: 7.799463 Feature 11: 4.850173 Feature 12: 1.752700 Feature 13: 14.082139 Feature 14: 5.281824 Feature 15: 5.425489 Feature 16: 10.172653 Feature 17: 5.905378 Feature 18: 2.450644 Feature 19: 5.441612 Feature 20: 4.283228 Feature 21: 3.215003 Feature 22: 2.278915 Feature 23: 24.341717 Feature 24: 0.934253 Feature 25: 3.232639 Feature 26: 20.353787 Feature 27: 31.402141 Feature 28: 3.728972 Feature 29: 66.162677 Feature 30: 21.399958 Feature 31: 24.663924 Feature 32: 11.698925 Feature 33: 30.832805 Feature 34: 4.777782 Feature 35: 9.725263 Feature 36: 3.802018 Feature 37: 56.757117 Feature 38: 152.077957 Feature 39: 84.660611 Feature 40: 24.027463 Feature 41: 24.391304 Feature 42: 5.211538 Feature 43: 13.424624 Feature 44: 6.053682 Feature 45: nan Feature 46: 7.423077 Feature 47: 54.605932 Feature 48: nan Feature 49: nan Feature 50: 108.500000 Feature 51: 3.071218 Feature 52: 4.553784 Feature 53: 7.423077 Feature 54: 7.368984 Feature 55: 24.391304 Feature 56: 8.844580 Feature 57: 2.711864 Feature 58: 36.192994 Feature 59: 2.198178 Feature 60: 7.368984 Feature 61: 17.249253 Feature 62: 10.050084 Feature 63: 63.557759 Feature 64: 3.684492 Feature 65: 24.391304 Feature 66: 7.423077 Feature 67: 3.684492 Feature 68: 5.000000 Feature 69: 1.798785 Feature 70: 2.711864 Feature 71: 2.711864 Feature 72: 7.423077 Feature 73: 7.368984 Feature 74: 24.391304 Feature 75: 19.250489 Feature 76: 4.553784 Feature 77: 4.067471 Feature 78: 60.449419 Feature 79: 13.695652 Feature 80: 9.792267 Feature 81: 3.684492 Feature 82: 4.067471 Feature 83: 32.783538 Feature 84: 7.260767 Feature 85: 6.882126 Feature 86: 32.756561 Feature 87: 24.391304 Feature 88: 8.701056 Feature 89: 3.684492 Feature 90: 24.391304 Feature 91: 6.053682 Feature 92: 3.684492 Feature 93: 7.423077 Feature 94: 2.711864 Feature 95: 5.233301 Feature 96: 3.684492 Feature 97: 2.711864 Feature 98: 18.541531 Feature 99: 2.711864 Feature 100: nan Feature 101: 24.391304 Feature 102: 3.684492 Feature 103: 7.423077 Feature 104: 24.391304 Feature 105: 5.680463 Feature 106: 2.584904 Feature 107: 7.423077 Feature 108: 3.684492 Feature 109: 10.936828 Feature 110: 56.961538 Feature 111: 2.584904 Feature 112: 5.807845 Feature 113: 8.915888 Feature 114: 2.198178 Feature 115: 4.067471 Feature 116: 5.374458 Feature 117: 6.053682 Feature 118: nan Feature 119: 4.483277 Feature 120: 4.067471 Feature 121: 14.846154 Feature 122: 11.851394 Feature 123: 4.553784 Feature 124: 7.423077 Feature 125: 3.684492 Feature 126: 2.584904 Feature 127: 2.711864 Feature 128: 5.000000 Feature 129: 6.053682 Feature 130: 4.483277 Feature 131: 2.711864 Feature 132: 6.197778 Feature 133: 4.087524 Feature 134: 7.423077 Feature 135: 4.770673 Feature 136: 3.197442 Feature 137: 3.510650 Feature 138: 2.001417 Feature 139: 5.811084 Feature 140: 5.376025 Feature 141: 3.071218 Feature 142: 7.423077 Feature 143: 3.684492 Feature 144: 3.684492 Feature 145: 5.112424 Feature 146: 10.911699 Feature 147: 5.075884 Feature 148: 4.457944 Feature 149: nan Feature 150: 8.175048 Feature 151: 3.684492 Feature 152: 11.942026 Feature 153: 2.711864 Feature 154: 3.885910 Feature 155: 1.126194 Feature 156: 1.569344 Feature 157: 7.368984 Feature 158: 11.698869 Feature 159: 5.143686 Feature 160: 4.722799 Feature 161: 6.619567 Feature 162: 14.354318 Feature 163: 7.423077 Feature 164: 4.080061 Feature 165: 
14.051162 Feature 166: 3.684492 Feature 167: 3.356974 Feature 168: 7.423077 Feature 169: 7.423077 Feature 170: 4.099089 Feature 171: 11.053476 Feature 172: 5.000000 Feature 173: 2.711864 Feature 174: nan Feature 175: 11.826486 Feature 176: 11.434916 Feature 177: 3.823224 Feature 178: 2.711864 Feature 179: 43.088169 Feature 180: 8.135593 Feature 181: 3.684492 Feature 182: 4.292452 Feature 183: 3.885910 Feature 184: 6.983729 Feature 185: 6.466599 Feature 186: 4.457944 Feature 187: 3.684492 Feature 188: nan Feature 189: 24.391304 Feature 190: nan Feature 191: 4.457944 Feature 192: 3.627007 Feature 193: 7.708914 Feature 194: 4.457944 Feature 195: 10.000000 Feature 196: 4.457944 Feature 197: nan Feature 198: 4.457944 Feature 199: 19.584533 Feature 200: 3.684492 Feature 201: nan Feature 202: 14.692690 Feature 203: 7.423077 Feature 204: 4.457944 Feature 205: nan Feature 206: 4.483277 Feature 207: 2.245626 Feature 208: 2.299950 Feature 209: 4.067471 Feature 210: 14.846154 Feature 211: 2.417578 Feature 212: 2.369637 Feature 213: 3.369190 Feature 214: 5.000000 Feature 215: 5.244672 Feature 216: 2.711864 Feature 217: 2.711864 Feature 218: 4.067471 Feature 219: 17.831776 Feature 220: 7.340504 Feature 221: 2.056603 Feature 222: 25.902028 Feature 223: 3.949153 Feature 224: 5.423729 Feature 225: 2.198178 Feature 226: 5.000000 Feature 227: 4.457944 Feature 228: 3.949153 Feature 229: 5.175739 Feature 230: 3.684492 Feature 231: 2.711864 Feature 232: 2.584904 Feature 233: 2.711864 Feature 234: 2.576350 Feature 235: 4.553784 Feature 236: 24.391304 Feature 237: 7.423077 Feature 238: 2.198178 Feature 239: 2.198178 Feature 240: 5.169808 Feature 241: 2.584904 Feature 242: 2.711864 Feature 243: 5.000000 Feature 244: 2.711864 Feature 245: 4.136478 Feature 246: 3.684492 Feature 247: nan Feature 248: 3.768467 Feature 249: nan Feature 250: 7.423077 Feature 251: 3.684492 Feature 252: 2.711864 Feature 253: 10.466602 Feature 254: 5.998212 Feature 255: 8.835109 Feature 256: 3.071218 Feature 257: 3.987294 Feature 258: 6.326316 Feature 259: 4.285201 Feature 260: 2.247473 Feature 261: 7.423077 Feature 262: 2.382987 Feature 263: 5.162867 Feature 264: 16.411303 Feature 265: nan Feature 266: 12.595383 Feature 267: 7.423077 Feature 268: 5.423729 Feature 269: 1.618100 Feature 270: 4.396356 Feature 271: 4.653407 Feature 272: 7.423077 Feature 273: 1.569344 Feature 274: 2.198178 Feature 275: nan Feature 276: 7.423077 Feature 277: 2.488776 Feature 278: 6.075509 Feature 279: 4.475792 Feature 280: 3.768467 Feature 281: 19.173287 Feature 282: 8.402497 Feature 283: 3.684492 Feature 284: 5.428950 Feature 285: 3.728972 Feature 286: 3.187869 Feature 287: 4.457944 Feature 288: 13.424624 Feature 289: 12.961225 Feature 290: 1.003221 Feature 291: 2.711864 Feature 292: 8.915888 Feature 293: 3.535847 Feature 294: 2.711864 Feature 295: 2.584904 Feature 296: 4.457944 Feature 297: 2.606478 Feature 298: 28.876576 Feature 299: 6.816186 Feature 300: 28.312784 Feature 301: 9.474995 Feature 302: 9.658377 Feature 303: 29.711773 Feature 304: 107.546986
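The indices above are hard to read on their own; as a hedged follow-up, the scores can be mapped back to the dummy-column names (assuming X_train is still the pd.get_dummies(Xc) frame that fs was fitted on).
# Name the chi2 scores with the dummy-column labels and list the top-ranked features
scores_named = pd.Series(fs.scores_, index=X_train.columns)
print(scores_named.sort_values(ascending=False).head(15))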
clas_forest_chi = RandomForestClassifier(max_depth = 14,max_features=None, min_samples_leaf = 20)
yhat =clas_forest_chi.fit(X_train_fs, y_train).predict(X_test_fs)
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))
matrix = confusion_matrix(y_test,yhat)
matrizviz(matrix)
Accuracy: 28.47
#from sklearn.preprocessing import OneHotEncoder
Xc_train, Xc_test, yc_train, yc_test = train_test_split(pd.get_dummies(Xc), yc, test_size=0.2, random_state=0)
Xc_train_fs, Xc_test_fs, fsc = select_features(Xc_train, yc_train.to_numpy().ravel(), Xc_test)
plt.bar([i for i in range(len(fsc.scores_))], fsc.scores_)
plt.show()
ypc_forest = clas_forest.fit(Xc_train, yc_train.to_numpy().ravel()).predict(Xc_test)
matrix = confusion_matrix(yc_test,ypc_forest)
matrizviz(matrix)
yhat =clas_forest_chi.fit(Xc_train_fs, yc_train.to_numpy().ravel()).predict(Xc_test_fs)
accuracy = accuracy_score(yc_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))
matrix = confusion_matrix(yc_test,yhat)
matrizviz(matrix)
Xc_train_fs.shape
Accuracy: 28.70
(1752, 40)
ypc_forest = clas_forest.fit(Xc_train, yc_train.to_numpy().ravel()).predict(Xc_test)
accuracy = accuracy_score(yc_test, ypc_forest)
print('Accuracy: %.2f' % (accuracy*100))
Accuracy: 30.07
from yellowbrick.classifier import ClassPredictionError
#clases = ['a','aa','b','bb','c','cc','d']
#visualizer = ClassPredictionError(
# RandomForestClassifier(max_depth = None, max_features=None, min_samples_leaf = 5), classes= clases)
#visualizer.fit(X_train, y_train)
#visualizer.score(X_test, y_test)
#visualizer.show()
from sklearn.neural_network import MLPClassifier
X_train, X_test, y_train, y_test = train_test_split(pd.get_dummies(Xc), yc, test_size=0.2, random_state=21, stratify=yc)
y_train=y_train.to_numpy().ravel()
y_test=y_test.to_numpy().ravel()
#clas_mpl = MLPClassifier(random_state=1,hidden_layer_sizes=(100, 100, 100),
# solver='lbfgs',alpha = 1e-5, learning_rate = 'adaptive',
# max_iter=2000)
clf = MLPClassifier(solver='lbfgs', alpha=0.5, max_iter = 200,
hidden_layer_sizes=(10, 10, 10,10,10,10,10,10), random_state=1)
#y_pred_clas = clf.fit(X_train_fs, y_train).predict(X_test_fs)
y_pred_clas = clf.fit(X_train, y_train).predict(X_test)  # use the stratified split defined just above
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred_clas, normalize ='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
accuracy = accuracy_score(y_test, y_pred_clas)
print('Accuracy: %.2f' % (accuracy*100))
Ensemble methods combine several estimators built from a given learning algorithm to improve generalization / robustness; three families are shown below: bagging (averaging), boosting, and voting / stacking.
from sklearn.ensemble import BaggingClassifier
# bagging with support vector machines
clas_bsvc = BaggingClassifier(base_estimator=SVC(),
n_estimators=10, random_state=0)
y_pred_bsvc = clas_bsvc.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_bsvc)
print('Accuracy baggin con maquina: %.2f' % (accuracy*100))
# bagging with k-nearest neighbors
clas_bkvecinos = BaggingClassifier(KNeighborsClassifier(),
max_samples=0.5, max_features=0.5)
y_pred_bkvecinos = clas_bkvecinos.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_bkvecinos)
print('Accuracy baggin con k vecinos: %.2f' % (accuracy*100))
# random forests are themselves an ensemble method
# extremely randomized trees (extra-trees)
from sklearn.ensemble import ExtraTreesClassifier
clas_etc = ExtraTreesClassifier(n_estimators=10, max_depth=None,
min_samples_split=2, random_state=0)
y_pred_etc = clas_etc.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_etc)
print('Accuracy baggin con extra forest: %.2f' % (accuracy*100))
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
clas_ada = AdaBoostClassifier(n_estimators=50)
y_pred_ada = clas_ada.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_ada)
print('Accuracy ada : %.2f' % (accuracy*100))
# Gradient tree boosting
from sklearn.ensemble import GradientBoostingClassifier
clas_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
max_depth=1, random_state=0)
y_pred_gb = clas_gb.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_gb)
print('Accuracy gb : %.2f' % (accuracy*100))
print(clas_gb.feature_importances_)
# histogram-based gradient boosting
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
clas_hgb = HistGradientBoostingClassifier(max_iter=100).fit(X_train, y_train)
y_pred_hgb = clas_hgb.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred_hgb)
print('Accuracy hgb : %.2f' % (accuracy*100))
# Voting classifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
eclf = VotingClassifier(
estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
voting='soft')#'hard')
for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
# the same can be done while searching for the best parameters
# stacking classifier
from sklearn.ensemble import StackingClassifier
estimators = [
('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
('svr', make_pipeline(StandardScaler(),
LinearSVC(random_state=42)))
]
clf = StackingClassifier(
estimators=estimators, final_estimator=LogisticRegression()
)
clf.fit(X_train, y_train).score(X_test, y_test)
Xc.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3010 entries, 7 to 17653 Data columns (total 47 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 p13 3010 non-null category 1 p120 3010 non-null category 2 p121 3010 non-null category 3 p122 3010 non-null category 4 p123 3010 non-null category 5 p125a 3010 non-null category 6 p125b 3010 non-null category 7 p125c 3010 non-null category 8 p125d 3010 non-null category 9 p125e 3010 non-null category 10 p126a 3010 non-null category 11 p126b 3010 non-null category 12 p126c 3010 non-null category 13 p126d 3010 non-null category 14 p126e 3010 non-null category 15 p126f 3010 non-null category 16 p126g 3010 non-null category 17 p126h 3010 non-null category 18 p126i 3010 non-null category 19 p126j 3010 non-null category 20 p126k 3010 non-null category 21 p126l 3010 non-null category 22 p126m 3010 non-null category 23 p126n 3010 non-null category 24 p126o 3010 non-null category 25 p126p 3010 non-null category 26 p126q 3010 non-null category 27 p126r 3010 non-null category 28 p127 3010 non-null category 29 p128a 3010 non-null category 30 p128b 3010 non-null category 31 p128c 3010 non-null category 32 p128d 3010 non-null category 33 p128e 3010 non-null category 34 p128f 3010 non-null category 35 p129a 3010 non-null category 36 p129b 3010 non-null category 37 p129c 3010 non-null category 38 p129d 3010 non-null category 39 p129e 3010 non-null category 40 p130a 3010 non-null category 41 p130b 3010 non-null category 42 p130c 3010 non-null category 43 p130d 3010 non-null category 44 p130e 3010 non-null category 45 p130f 3010 non-null category 46 p131 3010 non-null category dtypes: category(47) memory usage: 247.7 KB
Xc.describe()
p13 | p120 | p121 | p122 | p123 | p125a | p125b | p125c | p125d | p125e | ... | p129c | p129d | p129e | p130a | p130b | p130c | p130d | p130e | p130f | p131 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3010.0 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | ... | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 | 3010 |
unique | 13.0 | 3 | 8 | 10 | 2 | 2 | 2 | 2 | 2 | 2 | ... | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 5 |
top | 4.0 | 2 | 2 | 3 | 1 | 1 | 1 | 1 | 1 | 2 | ... | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0 |
freq | 776.0 | 1901 | 1360 | 854 | 1955 | 2778 | 2969 | 2696 | 1604 | 2723 | ... | 2754 | 2860 | 2931 | 2638 | 2874 | 2897 | 2902 | 2914 | 2981 | 1825 |
4 rows × 47 columns
Xc.value_counts()
p13 p120 p121 p122 p123 p125a p125b p125c p125d p125e p126a p126b p126c p126d p126e p126f p126g p126h p126i p126j p126k p126l p126m p126n p126o p126p p126q p126r p127 p128a p128b p128c p128d p128e p128f p129a p129b p129c p129d p129e p130a p130b p130c p130d p130e p130f p131 4.0 2 1 1 1 1 1 1 2 2 1 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2.0 2 3 3 1 1 1 1 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 11.0 3 3 4 1 1 1 1 1 2 1 1 1 1 1 2 2 1 2 2 2 1 2 2 1 2 2 2 4 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2.0 2 3 4 1 1 1 1 1 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 1 1 2 1 1 1 2 2 1 2 1 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 .. 6.0 2 2 3 1 1 1 1 2 1 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 2 2 2 7 2 2 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 1 2 1 2 1 2 2 2 2 1 2 1 2 2 2 2 2 2 6 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 1 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1.0 1 1 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 1 Length: 3002, dtype: int64
Xc.value_counts(normalize= True)
p13 p120 p121 p122 p123 p125a p125b p125c p125d p125e p126a p126b p126c p126d p126e p126f p126g p126h p126i p126j p126k p126l p126m p126n p126o p126p p126q p126r p127 p128a p128b p128c p128d p128e p128f p129a p129b p129c p129d p129e p130a p130b p130c p130d p130e p130f p131 4.0 2 1 1 1 1 1 1 2 2 1 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0.000664 2.0 2 3 3 1 1 1 1 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0.000664 11.0 3 3 4 1 1 1 1 1 2 1 1 1 1 1 2 2 1 2 2 2 1 2 2 1 2 2 2 4 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 0.000664 2.0 2 3 4 1 1 1 1 1 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0.000664 1 1 2 1 1 1 2 2 1 2 1 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0.000664 ... 6.0 2 2 3 1 1 1 1 2 1 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 2 2 2 7 2 2 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 0.000332 1 2 1 2 1 2 1 2 2 2 2 1 2 1 2 2 2 2 2 2 6 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0.000332 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 0.000332 1 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 1 0.000332 1.0 1 1 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0.000332 Length: 3002, dtype: float64
Xc.dtypes
p13 category p120 category p121 category p122 category p123 category p125a category p125b category p125c category p125d category p125e category p126a category p126b category p126c category p126d category p126e category p126f category p126g category p126h category p126i category p126j category p126k category p126l category p126m category p126n category p126o category p126p category p126q category p126r category p127 category p128a category p128b category p128c category p128d category p128e category p128f category p129a category p129b category p129c category p129d category p129e category p130a category p130b category p130c category p130d category p130e category p130f category p131 category dtype: object
Xc.p127.cat.categories
Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')
Xc.p127.value_counts(dropna=False)
5    1118
1     559
2     513
4     364
7     168
6     100
9      96
8      46
3      46
Name: p127, dtype: int64
Mc = M.astype('category')
import seaborn as sns
sns.catplot(x = "p133",
data = Mc,
col="p13",
col_wrap=4,
kind = 'count',
hue = 'p122')
plt.show()
Mc.p13.cat.codes
7 2 13 9 23 10 26 1 28 1 .. 17627 1 17629 1 17631 2 17633 3 17653 3 Length: 3010, dtype: int8
Mc.p13
7 3.0 13 10.0 23 11.0 26 2.0 28 2.0 ... 17627 2.0 17629 2.0 17631 3.0 17633 4.0 17653 4.0 Name: p13, Length: 3010, dtype: category Categories (13, float64): [1.0, 2.0, 3.0, 4.0, ..., 10.0, 11.0, 12.0, 97.0]
codigos = Mc.p13.cat.codes
categorias = Mc.p13
mapa = dict(zip(codigos,categorias))
mapa
{2: 3.0, 9: 10.0, 10: 11.0, 1: 2.0, 6: 7.0, 3: 4.0, 4: 5.0, 5: 6.0, 7: 8.0, 11: 12.0, 12: 97.0, 8: 9.0, 0: 1.0}
a = StandardScaler().fit(M)
M_scaled = a.transform(M)
M_sca = pd.DataFrame(M_scaled)
M_sca.describe()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | ... | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 | 3.010000e+03 |
mean | 6.705600e-17 | 1.583672e-15 | 3.205114e-15 | 1.178146e-15 | 1.643462e-15 | -1.311612e-16 | -1.701066e-15 | -1.516690e-16 | 2.397270e-15 | 3.677383e-16 | ... | 2.397123e-16 | 9.867891e-16 | 3.595131e-16 | -4.730251e-16 | 2.076597e-16 | 5.109423e-16 | 2.085449e-16 | 4.261357e-16 | -3.061412e-17 | 1.306522e-15 |
std | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | ... | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 | 1.000166e+00 |
min | -4.187076e-01 | -2.488140e+00 | -1.145163e+00 | -1.845989e+00 | -7.346033e-01 | -2.889867e-01 | -1.175132e-01 | -3.412753e-01 | -9.362471e-01 | -3.080228e+00 | ... | -4.366539e+00 | -6.091081e+00 | -2.662968e+00 | -4.596994e+00 | -5.063316e+00 | -5.183664e+00 | -5.509462e+00 | -1.013869e+01 | -7.177844e-01 | -9.940309e-01 |
25% | -3.542515e-01 | -5.819803e-01 | -1.145163e+00 | -4.542729e-01 | -7.346033e-01 | -2.889867e-01 | -1.175132e-01 | -3.412753e-01 | -9.362471e-01 | 3.246513e-01 | ... | 2.290143e-01 | 1.641745e-01 | 3.755209e-01 | 2.175335e-01 | 1.974990e-01 | 1.929137e-01 | 1.815059e-01 | 9.863204e-02 | -7.177844e-01 | -9.940309e-01 |
50% | -2.253393e-01 | -5.819803e-01 | -7.667550e-02 | 2.415853e-01 | -7.346033e-01 | -2.889867e-01 | -1.175132e-01 | -3.412753e-01 | -9.362471e-01 | 3.246513e-01 | ... | 2.290143e-01 | 1.641745e-01 | 3.755209e-01 | 2.175335e-01 | 1.974990e-01 | 1.929137e-01 | 1.815059e-01 | 9.863204e-02 | -7.177844e-01 | -1.495395e-01 |
75% | -9.642716e-02 | 1.324179e+00 | 9.918119e-01 | 9.374435e-01 | 1.361279e+00 | -2.889867e-01 | -1.175132e-01 | -3.412753e-01 | 1.068094e+00 | 3.246513e-01 | ... | 2.290143e-01 | 1.641745e-01 | 3.755209e-01 | 2.175335e-01 | 1.974990e-01 | 1.929137e-01 | 1.815059e-01 | 9.863204e-02 | 8.123368e-01 | 6.949519e-01 |
max | 5.769076e+00 | 1.324179e+00 | 6.334249e+00 | 4.416734e+00 | 1.361279e+00 | 3.460367e+00 | 8.509679e+00 | 2.930186e+00 | 1.068094e+00 | 3.246513e-01 | ... | 2.290143e-01 | 1.641745e-01 | 3.755209e-01 | 2.175335e-01 | 1.974990e-01 | 1.929137e-01 | 1.815059e-01 | 9.863204e-02 | 5.402700e+00 | 3.228426e+00 |
8 rows × 48 columns
def histograma(ingresoc, c1, c2):
    a = df2[df2.p133 == ingresoc]
    L = [c1, c2]
    a[L].plot.hist()
    plt.show()
histograma(1,'p13','p131')
histograma(2,'p13','p131')
histograma(3,'p13','p131')
histograma(4,'p13','p131')
histograma(5,'p13','p131')
histograma(6,'p13','p131')
histograma(7,'p13','p131')
print(df2.p133.unique())
import seaborn as sns
def filtro(ingresoc, c1, c2):
    a = df2[df2.p133 == ingresoc]
    L = [c1, c2]
    return a[L]
sns.pairplot(filtro(1,'p13','p131'))
<seaborn.axisgrid.PairGrid at 0x7fb42182d2b0>
X_train, X_test, y_train, y_test = train_test_split(pd.get_dummies(Xc), yc, test_size=0.2, random_state=0)
y_train=y_train.to_numpy().ravel()
y_test=y_test.to_numpy().ravel()
from sklearn.feature_selection import VarianceThreshold
# feature selection
def select_var(X_train, X_test, tres):
    fs = VarianceThreshold(threshold=(tres * (1 - tres)))
    X_train_fs = fs.fit_transform(X_train)
    # transform only (not fit_transform) so the test set keeps the same columns as the training set
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
X_train_fs, X_test_fs, fs = select_var(X_train, X_test, 0.8)
print(X_train_fs.shape)
print(X_train.shape)
print(X_test_fs.shape)
logistica(X_train_fs, y_train, X_test_fs, y_test)
arbolbos(X_train_fs, y_train, X_test_fs, y_test)
(1752, 6)
(1752, 305)
(439, 6)
Accuracy reg log multi: 27.11
Accuracy reg log ovr: 27.11
Accuracy bosque: 27.79
Accuracy arbol: 27.79
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif
def select_uni(X_train, y_train, X_test, i, k):
    metodo = (chi2, mutual_info_classif, f_classif)
    fs = SelectKBest(score_func=metodo[i], k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    plt.bar([j for j in range(len(fs.scores_))], fs.scores_)
    plt.show()
    return X_train_fs, X_test_fs, fs
X_train_fs, X_test_fs, fs = select_uni(X_train, y_train, X_test, 2, 60)
print(X_train_fs.shape)
print(X_train.shape)
print(X_test_fs.shape)
logistica(X_train_fs, y_train, X_test_fs, y_test)
arbolbos(X_train_fs, y_train, X_test_fs, y_test)
/Users/rafamtz/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_selection/_univariate_selection.py:114: UserWarning: Features [ 50 64 66 67 74 87 99 104 142 144 190 201 205 216 217 251 261 265 276 283] are constant. warnings.warn("Features %s are constant." % constant_features_idx, /Users/rafamtz/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_selection/_univariate_selection.py:116: RuntimeWarning: invalid value encountered in true_divide f = msb / msw
(1752, 60)
(1752, 305)
(439, 60)
Accuracy reg log multi: 29.61
Accuracy reg log ovr: 28.93
Accuracy bosque: 29.16
Accuracy arbol: 27.56
from sklearn.feature_selection import RFE
estimator = SVC(kernel="linear")
selector = RFE(estimator, n_features_to_select=None, step=1)
selector = selector.fit(X_train, y_train)
#selector.support_
#selector.ranking_
X_train_fs = selector.transform(X_train)
X_test_fs = selector.transform(X_test)
logistica(X_train_fs, y_train, X_test_fs, y_test)
arbolbos(X_train_fs, y_train, X_test_fs, y_test)
Accuracy reg log multi: 28.47
Accuracy reg log ovr: 28.02
Accuracy bosque: 27.79
Accuracy arbol: 26.20