{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![](https://raw.githubusercontent.com/rafneta/CienciaDatosPythonCIDE/master/imagenes/banner.png)\n",
    "\n",
    "\n",
    "# NumPy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import wooldridge as woo\n",
    "import statsmodels.formula.api as smf\n",
    "import matplotlib.pyplot as plt\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Regresión Simple \n",
    "\n",
    "\n",
    "$$y = \\beta_0 +\\beta_1 x + u$$\n",
    "\n",
    "$$\\hat\\beta_0 = \\bar{y}-\\hat\\beta_1 \\bar{x}$$\n",
    "$$\\hat\\beta_1= \\frac{Cov(x,y)}{Var(x)}$$\n",
    "\n",
    "### Wooldridge 2016, ejemplo-2-3 "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cargamos las regresiones"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "archi_o = open(r'../Lab8/regsmf.pkl', 'rb')\n",
    "reg_smf = pickle.load(archi_o)\n",
    "archi_o.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>         <td>salary</td>      <th>  R-squared:         </th> <td>   0.013</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.008</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   2.767</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Thu, 15 Apr 2021</td> <th>  Prob (F-statistic):</th>  <td>0.0978</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>14:45:18</td>     <th>  Log-Likelihood:    </th> <td> -1804.5</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>   209</td>      <th>  AIC:               </th> <td>   3613.</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>   207</td>      <th>  BIC:               </th> <td>   3620.</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     1</td>      <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "      <td></td>         <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Intercept</th> <td>  963.1913</td> <td>  213.240</td> <td>    4.517</td> <td> 0.000</td> <td>  542.790</td> <td> 1383.592</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>roe</th>       <td>   18.5012</td> <td>   11.123</td> <td>    1.663</td> <td> 0.098</td> <td>   -3.428</td> <td>   40.431</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td>311.096</td> <th>  Durbin-Watson:     </th> <td>   2.105</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.000</td>  <th>  Jarque-Bera (JB):  </th> <td>31120.902</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td> 6.915</td>  <th>  Prob(JB):          </th> <td>    0.00</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td>61.158</td>  <th>  Cond. No.          </th> <td>    43.3</td> \n",
       "</tr>\n",
       "</table><br/><br/>Notes:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                 salary   R-squared:                       0.013\n",
       "Model:                            OLS   Adj. R-squared:                  0.008\n",
       "Method:                 Least Squares   F-statistic:                     2.767\n",
       "Date:                Thu, 15 Apr 2021   Prob (F-statistic):             0.0978\n",
       "Time:                        14:45:18   Log-Likelihood:                -1804.5\n",
       "No. Observations:                 209   AIC:                             3613.\n",
       "Df Residuals:                     207   BIC:                             3620.\n",
       "Df Model:                           1                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "Intercept    963.1913    213.240      4.517      0.000     542.790    1383.592\n",
       "roe           18.5012     11.123      1.663      0.098      -3.428      40.431\n",
       "==============================================================================\n",
       "Omnibus:                      311.096   Durbin-Watson:                   2.105\n",
       "Prob(Omnibus):                  0.000   Jarque-Bera (JB):            31120.902\n",
       "Skew:                           6.915   Prob(JB):                         0.00\n",
       "Kurtosis:                      61.158   Cond. No.                         43.3\n",
       "==============================================================================\n",
       "\n",
       "Notes:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "\"\"\""
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reg_smf.fit().summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Bondad de ajuste\n",
    "\n",
    "- Suma Total de Cuadrados\n",
    "\n",
    "$$\n",
    "\\mathrm{STC} \\equiv \\sum_{i=1}^{n}\\left(y_{i}-\\bar{y}\\right)^{2}\n",
    "$$\n",
    "\n",
    "- Suma Explicada de Cuadrados\n",
    "\n",
    "$$\n",
    "\\mathrm{SEC} \\equiv \\sum_{i=1}^{n}\\left(\\hat{y}_{i}-\\bar{y}\\right)^{2}\n",
    "$$\n",
    "\n",
    "\n",
    "- Suma Residual de Cuadrados\n",
    "\n",
    "$$\n",
    "\\mathrm{SRC} \\equiv \\sum_{i=1}^{n} \\hat{u}_{i}^{2}\n",
    "$$\n",
    "\n",
    "**Bondad de ajuste**\n",
    "\n",
    "$$\n",
    "R^{2} \\equiv \\mathrm{SEC} / \\mathrm{STC}=1-\\mathrm{SRC} / \\mathrm{STC}\n",
    "$$\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1224.058071\n",
       "1    1164.854261\n",
       "2    1397.969216\n",
       "3    1072.348338\n",
       "4    1218.507712\n",
       "dtype: float64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y = reg_smf.endog\n",
    "x = reg_smf.exog[:,1]\n",
    "\n",
    "ym = np.mean(y)\n",
    "yh = reg_smf.fit().fittedvalues\n",
    "yh.head()\n",
    "# imprimir tipos de datos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1224.058071\n",
       "1    1164.854261\n",
       "2    1397.969216\n",
       "3    1072.348338\n",
       "4    1218.507712\n",
       "dtype: float64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "yh = reg_smf.fit().predict(exog=dict(roe=x))\n",
    "yh.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "391732982.00956935"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stc = np.sum(np.power(y-ym,2))\n",
    "stc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5166419.039866708"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sec = np.sum(np.power(yh-ym,2))\n",
    "sec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "386566562.96970266"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uh = y - yh   # residuales\n",
    "\n",
    "src = np.sum(uh ** 2)\n",
    "src"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stc == sec + src"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bondad de ajuste 0.01318862408103405\n"
     ]
    }
   ],
   "source": [
    "r2 = 1 - src/stc\n",
    "print(\"Bondad de ajuste\",r2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Errores estándar\n",
    "\n",
    "- Estimador de la varianza $\\sigma^2$ de los errores $u_i$\n",
    "\n",
    "$$\n",
    "\\hat{\\sigma}^{2}=\\frac{1}{(n-2)} \\sum_{i=1}^{n} \\hat{u}_{i}^{2}=\\operatorname{SRC} /(n-2)\n",
    "$$\n",
    "\n",
    "- Error estándar de la regresion (EER, error estandar de la estimación, raí< del error cuadratico medio) es la estimación de la desviación estandar $\\sigma$ de los errores $u_i$\n",
    "\n",
    "$$\\hat\\sigma = \\sqrt{\\hat{\\sigma}^{2}}=\\sqrt{\\frac{1}{(n-2)} \\sum_{i=1}^{n} \\hat{u}_{i}^{2}}=\\sqrt{\\operatorname{SRC} /(n-2)}$$\n",
    "\n",
    "\n",
    "- Error estándar de $\\hat\\beta_0$, es el estimador de la desviación estándar de $\\hat\\beta_0$ \n",
    "\n",
    "$$\n",
    "ee(\\beta_0)=\\hat{\\operatorname{de}}\\left(\\hat{\\beta}_{0}\\right)=\\hat{\\sigma}  \\sqrt{\\frac{\\mathrm{SC}_x}{n\\mathrm{STC}_x}}=\\hat{\\sigma} \\sqrt{\\frac{\\sum_{i=1}^{n}x_i^2}{n\\sum_{i=1}^{n}\\left(x_{i}-\\bar{x}\\right)^{2}}}\n",
    "$$\n",
    "\n",
    "\n",
    "- Error estándar de $\\hat\\beta_1$, es el estimador de la desviación estándar de $\\hat\\beta_1$ \n",
    "\n",
    "$$\n",
    "ee(\\beta_1)=\\hat{\\operatorname{de}}\\left(\\hat{\\beta}_{1}\\right)=\\hat{\\sigma} / \\sqrt{\\mathrm{STC}_x}=\\hat{\\sigma} /\\sqrt{\\sum_{i=1}^{n}\\left(x_{i}-\\bar{x}\\right)^{2}}\n",
    "$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1867471.3186942157"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n = reg_smf.nobs\n",
    "varhu = src/(n-2)\n",
    "varhu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1366.5545428903363"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eer = np.sqrt(varhu)\n",
    "eer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "213.24025690501887"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "xm = np.mean(x)\n",
    "stcx = np.sum(np.power(x-xm,2))\n",
    "scx = np.sum(np.power(x,2))\n",
    "eeb0 = eer*np.sqrt(scx/(n*stcx))\n",
    "eeb0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11.123250903287637"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eeb1 = eer/np.sqrt(stcx)\n",
    "eeb1\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Estadísticos \n",
    "\n",
    "Bajo las condiciones adecuadas se tiene que:\n",
    "\n",
    "$$\n",
    "\\left(\\hat{\\beta}_{j}-\\beta_{j}\\right) / \\operatorname{ee}\\left(\\hat{\\beta}_{j}\\right) \\sim t_{n-2},\\quad j\\in\\{0,1\\}\n",
    "$$\n",
    "\n",
    "Se pueden plantear las siguientes pruebas de hipótesis\n",
    "\n",
    "$$H_0:\\beta_j=0   \\qquad H_a: \\beta_j\\neq 0$$\n",
    "\n",
    "Para cada muestra de datos $(x,y)_m$ se tienen las estimaciones $\\hat\\beta_j$ de $\\beta_j$, es por ello que se habla de la distribución de la variable aleatoria indicada.  \n",
    "\n",
    "Para una muestra dada, si aplicamos la hipótesis nula \n",
    "\n",
    "$$\\hat{\\beta}_{j}/ \\operatorname{ee}\\left(\\hat{\\beta}_{j}\\right)=t_{\\hat\\beta_j}$$\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4.516930107158806"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regresion = reg_smf.fit()\n",
    "b0=regresion.params[0]\n",
    "tb0 = b0/eeb0\n",
    "tb0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.66328949208065"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b1=regresion.params[1]\n",
    "tb1 = b1/eeb1\n",
    "tb1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class = \"slide-w\">\n",
    "<div class = \"video-container\">\n",
    "<iframe src=\"https://www.desmos.com/calculator/jh4tz6vbyw\" width=\"100%\" height=\"650px\" scrolling=\"no\" frameborder=\"0\" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>\n",
    "    </div>\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "<div class = \"slide-w\">\n",
    "<div class = \"video-container\">\n",
    "<iframe src=\"https://www.desmos.com/calculator/qsklmkjpxf\" width=\"100%\" height=\"650px\" scrolling=\"no\" frameborder=\"0\" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>\n",
    "    </div>\n",
    "</div>\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Valor $p$ \n",
    "\n",
    "\n",
    "$$p_{\\hat\\beta_j}\\text{-value}=P(|t_{n-2}|> |t_{\\hat\\beta_j}|) = P(t_{n-2}<-|t_{\\hat\\beta_j}|\\cup |t_{\\hat\\beta_j}|<t_{n-2})=P(t_{n-2}<-|t_{\\hat\\beta_j}|)+P(|t_{\\hat\\beta_j}|<t_{n-2})$$\n",
    "\n",
    "\n",
    "Para ello necesitamos el submódulo de scipy: [stats](https://docs.scipy.org/doc/scipy/reference/stats.html), en este caso la distribución $t$ tiene la siguiente descripción\n",
    "\n",
    "$$\n",
    "f(x, \\nu)=\\frac{\\Gamma((\\nu+1) / 2)}{\\sqrt{\\pi \\nu} \\Gamma(\\nu / 2)}\\left(1+x^{2} / \\nu\\right)^{-(\\nu+1) / 2}\n",
    "$$\n",
    "\n",
    "donde $\\nu$ son los grados de libertad, y `t.pdf(x, df, loc, scale)`, con la sigeuinte tranformación `y = (x - loc) / scale.`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0533519915197438e-05"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from scipy.stats import t\n",
    "\n",
    "df = 207\n",
    "\n",
    "rv = t(df)\n",
    "\n",
    "pb0 = rv.cdf(-tb0) + (1-rv.cdf(tb0)) # argumentos de cdf: loc = 0, scale = 1 \n",
    "pb0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.09776774891928593"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pb1 = rv.cdf(-tb1) + (1-rv.cdf(tb1))\n",
    "pb1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Intercept    0.000011\n",
       "roe          0.097768\n",
       "dtype: float64"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regresion.pvalues"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Regresión Múltiple\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "toc-showcode": false,
  "toc-showmarkdowntxt": false,
  "toc-showtags": true
 },
 "nbformat": 4,
 "nbformat_minor": 5
}