Case 1:

O data set MEDIFIS contém informações sobre diversas medidas físicas de homens (1) e mulheres (0). Estamos interessados em classificar pessoas por sexo, segundo as medidas físicas que elas apresentam.

Os dados estão disponíveis aqui

library(MASS)
library(dplyr)
medifis <- read.table("https://raw.githubusercontent.com/jvega68/EA3/master/datos/medifis.dat")
glimpse(medifis)
Rows: 27
Columns: 8
$ V1 <int> 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, …
$ V2 <dbl> 159, 164, 172, 167, 164, 161, 168, 181, 18…
$ V3 <dbl> 49, 62, 65, 52, 51, 67, 48, 74, 74, 50, 65…
$ V4 <dbl> 36, 39, 38, 37, 36, 38, 39, 43, 41, 36, 36…
$ V5 <dbl> 68.0, 73.0, 75.0, 73.0, 71.0, 71.0, 72.5, …
$ V6 <dbl> 42.0, 44.0, 48.0, 41.5, 44.5, 44.0, 41.0, …
$ V7 <dbl> 57.0, 55.0, 58.0, 58.0, 54.0, 56.0, 54.5, …
$ V8 <dbl> 40, 44, 44, 44, 40, 42, 43, 47, 47, 41, 41…
colnames(medifis) <- c("sexo", "altura", "peso", "pe", "braco", "costas", "diam_craneo", "joelho")
glimpse(medifis)
Rows: 27
Columns: 8
$ sexo        <int> 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, …
$ altura      <dbl> 159, 164, 172, 167, 164, 161, 168, 181,…
$ peso        <dbl> 49, 62, 65, 52, 51, 67, 48, 74, 74, 50,…
$ pe          <dbl> 36, 39, 38, 37, 36, 38, 39, 43, 41, 36,…
$ braco       <dbl> 68.0, 73.0, 75.0, 73.0, 71.0, 71.0, 72.…
$ costas      <dbl> 42.0, 44.0, 48.0, 41.5, 44.5, 44.0, 41.…
$ diam_craneo <dbl> 57.0, 55.0, 58.0, 58.0, 54.0, 56.0, 54.…
$ joelho      <dbl> 40, 44, 44, 44, 40, 42, 43, 47, 47, 41,…

EAD

library(tidyr)
library(ggplot2)
medifis %>% pivot_longer(!sexo, names_to = "variaveis") %>% ggplot() + 
  geom_boxplot(aes(y = value, group = factor(sexo), fill = factor(sexo))) + 
  facet_wrap(.~variaveis, scales = "free_y")

Suposições:

biotools::boxM(medifis[,-1], medifis$sexo)

    Box's M-test for Homogeneity of Covariance Matrices

data:  medifis[, -1]
Chi-Sq (approx.) = 39.871, df = 28, p-value = 0.06789
mvnormalTest::mardia(medifis[medifis$sexo == 0, -1])
$mv.test

$uv.shapiro
            W      p-value UV.Normality
altura      0.9509 0.5383  Yes         
peso        0.8966 0.0845  Yes         
pe          0.8904 0.068   Yes         
braco       0.94   0.3821  Yes         
costas      0.9515 0.5477  Yes         
diam_craneo 0.9602 0.6951  Yes         
joelho      0.9475 0.4865  Yes         
mvnormalTest::mardia(medifis[medifis$sexo == 1, -1])
$mv.test

$uv.shapiro
            W      p-value UV.Normality
altura      0.9613 0.8028  Yes         
peso        0.9453 0.5702  Yes         
pe          0.9602 0.7868  Yes         
braco       0.9215 0.2984  Yes         
costas      0.9567 0.7361  Yes         
diam_craneo 0.9822 0.9909  Yes         
joelho      0.8495 0.0362  No          

Análise Discriminante

discrinante_linear <- lda(sexo ~ ., data = medifis)
discrinante_linear
Call:
lda(sexo ~ ., data = medifis)

Prior probabilities of groups:
        0         1 
0.5555556 0.4444444 

Group means:
    altura  peso       pe    braco   costas diam_craneo
0 161.7333 55.60 36.83333 70.03333 43.33333    56.63333
1 177.5833 74.25 41.66667 77.75000 49.00000    58.00000
    joelho
0 41.06667
1 45.62500

Coefficients of linear discriminants:
                     LD1
altura      -0.088142097
peso        -0.005872569
pe           0.650276334
braco        0.134222958
costas       0.123645380
diam_craneo -0.202387598
joelho       0.101000944
classificacao <- predict(discrinante_linear, medifis[, -1])$class
table(classificacao, medifis$sexo)
             
classificacao  0  1
            0 15  0
            1  0 12

Significa que o modelo nunca erra?

Como podemos estimar a probabilidade de erro?

  • Fórmula fechada (apenas valido neste caso em particular)
  • Validação cruzada.

Coefficients of linear discriminants?

discrinante_linear$scaling
                     LD1
altura      -0.088142097
peso        -0.005872569
pe           0.650276334
braco        0.134222958
costas       0.123645380
diam_craneo -0.202387598
joelho       0.101000944
  • Fisher (1938), sob outra abordagem que não assume normalidade, derivou uma forma de classificar (método conhecido como Análise Discriminante Linear de Fisher).
  • A ideia é transformar \(\textbf{x} \in \mathbb{R}^p\) em \(z = \alpha'\textbf{x} \in \mathbb{R}\) de forma que \(z\) separe \(P_1\) de \(P_2\) o máximo que puder.
  • Sejam \(z_{11}, \cdots, z_{1n_1}\) as transformações \(z = \alpha'\textbf{x}\) para \(\textbf{x} \in P_1\) e sejam \(z_{21}, \cdots, z_{2n_2}\) as transformações \(z = \alpha'\textbf{x}\) para \(\textbf{x} \in P_2\)
  • Uma forma de medir a separação de ambos os grupos é atraves das suas médias \(\bar{z}_1\) e \(\bar{z}_2\). Assim queremos maximizar \[\phi = \dfrac{(\bar{z}_1 - \bar{z}_2)^2}{s^2_z} = \dfrac{(\alpha' [\bar{x}_1 - \bar{x}_2])^2}{\alpha' \textbf{S}_x \alpha} = \dfrac{\alpha' \overbrace{(\bar{x}_1 - \bar{x}_2)(\bar{x}_1 - \bar{x}_2)'}^{\textbf{B}}\alpha}{\alpha' \textbf{S}_x \alpha}\]

Derivando w.r.t \(\alpha\) e igualando a zero, temos: \[\dfrac{2\textbf{B}\alpha(\alpha'\textbf{S}_x \alpha) - \alpha' \textbf{B}\alpha 2 \textbf{S}_x \alpha}{(\alpha'\textbf{S}_x \alpha)^2} = 0\]

\[\textbf{B}\alpha \alpha'\textbf{S}_x\alpha = \alpha'\textbf{B}\alpha \textbf{S}_x \alpha \rightarrow \textbf{B}\alpha = \underbrace{\dfrac{\alpha'\textbf{B}\alpha}{\alpha'\textbf{S}_x \alpha}}_{\phi} \textbf{S}_x \alpha \rightarrow \textbf{S}_x^{-1}\textbf{B}\alpha = \phi \alpha\]

x <- medifis %>% filter(sexo == 0) %>% dplyr::select(!sexo)
y <- medifis %>% filter(sexo == 1) %>% dplyr::select(!sexo)
mu_1 <- matrix(apply(x, 2, mean), ncol = 1)
mu_2 <- matrix(apply(y, 2, mean), ncol = 1)
n_1 <- nrow(x)
n_2 <- nrow(y)
S <- (n_1 - 1)*cov(x) + (n_2 - 1)*cov(y)
S <- S/(n_1 + n_2 - 2)
B <- (mu_1 - mu_2) %*% t(mu_1 - mu_2)
eigen(solve(S) %*% B)$vectors[,1]
[1]  0.122807669+0i  0.008182203+0i -0.906024743+0i
[4] -0.187011759+0i -0.172274105+0i  0.281985000+0i
[7] -0.140723796+0i
discrinante_linear$scaling/sqrt(sum(discrinante_linear$scaling^2))
                     LD1
altura      -0.122807669
peso        -0.008182203
pe           0.906024743
braco        0.187011759
costas       0.172274105
diam_craneo -0.281985000
joelho       0.140723796

Observação:

LS0tCnRpdGxlOiAiTUU3MzE6IEFuw6FsaXNlIERpc2NyaW1pbm5hdGUiCmF1dGhvcjogIlByb2YuIENhcmxvcyBUcnVjw61vcyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCiMjIENhc2UgMToKCk8gX2RhdGEgc2V0XyBNRURJRklTIGNvbnTDqW0gaW5mb3JtYcOnw7VlcyBzb2JyZSBkaXZlcnNhcyBtZWRpZGFzIGbDrXNpY2FzIGRlIGhvbWVucyAoMSkgZSBtdWxoZXJlcyAoMCkuIEVzdGFtb3MgaW50ZXJlc3NhZG9zIGVtIGNsYXNzaWZpY2FyIHBlc3NvYXMgcG9yIHNleG8sIHNlZ3VuZG8gYXMgbWVkaWRhcyBmw61zaWNhcyBxdWUgZWxhcyBhcHJlc2VudGFtLgoKPiBPcyBkYWRvcyBlc3TDo28gZGlzcG9uw612ZWlzIFthcXVpXShodHRwczovL3Jhdy5naXRodWJ1c2VyY29udGVudC5jb20vanZlZ2E2OC9FQTMvbWFzdGVyL2RhdG9zL21lZGlmaXMuZGF0KQoKYGBge3IsIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CmxpYnJhcnkoTUFTUykKbGlicmFyeShkcGx5cikKbWVkaWZpcyA8LSByZWFkLnRhYmxlKCJodHRwczovL3Jhdy5naXRodWJ1c2VyY29udGVudC5jb20vanZlZ2E2OC9FQTMvbWFzdGVyL2RhdG9zL21lZGlmaXMuZGF0IikKZ2xpbXBzZShtZWRpZmlzKQpgYGAKCmBgYHtyfQpjb2xuYW1lcyhtZWRpZmlzKSA8LSBjKCJzZXhvIiwgImFsdHVyYSIsICJwZXNvIiwgInBlIiwgImJyYWNvIiwgImNvc3RhcyIsICJkaWFtX2NyYW5lbyIsICJqb2VsaG8iKQpnbGltcHNlKG1lZGlmaXMpCmBgYAoKCiMjIyBFQUQKCmBgYHtyLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQpsaWJyYXJ5KHRpZHlyKQpsaWJyYXJ5KGdncGxvdDIpCm1lZGlmaXMgJT4lIHBpdm90X2xvbmdlcighc2V4bywgbmFtZXNfdG8gPSAidmFyaWF2ZWlzIikgJT4lIGdncGxvdCgpICsgCiAgZ2VvbV9ib3hwbG90KGFlcyh5ID0gdmFsdWUsIGdyb3VwID0gZmFjdG9yKHNleG8pLCBmaWxsID0gZmFjdG9yKHNleG8pKSkgKyAKICBmYWNldF93cmFwKC5+dmFyaWF2ZWlzLCBzY2FsZXMgPSAiZnJlZV95IikKYGBgCgoKIyMjIFN1cG9zacOnw7VlczoKCmBgYHtyfQpiaW90b29sczo6Ym94TShtZWRpZmlzWywtMV0sIG1lZGlmaXMkc2V4bykKYGBgCgoKCmBgYHtyfQptdm5vcm1hbFRlc3Q6Om1hcmRpYShtZWRpZmlzW21lZGlmaXMkc2V4byA9PSAwLCAtMV0pCm12bm9ybWFsVGVzdDo6bWFyZGlhKG1lZGlmaXNbbWVkaWZpcyRzZXhvID09IDEsIC0xXSkKYGBgCgoKCiMjIyBBbsOhbGlzZSBEaXNjcmltaW5hbnRlCgpgYGB7cn0KZGlzY3JpbmFudGVfbGluZWFyIDwtIGxkYShzZXhvIH4gLiwgZGF0YSA9IG1lZGlmaXMpCmRpc2NyaW5hbnRlX2xpbmVhcgpgYGAKCgpgYGB7cn0KY2xhc3NpZmljYWNhbyA8LSBwcmVkaWN0KGRpc2NyaW5hbnRlX2xpbmVhciwgbWVkaWZpc1ssIC0xXSkkY2xhc3MKdGFibGUoY2xhc3NpZmljYWNhbywgbWVkaWZpcyRzZXhvKQpgYGAKCj4gU2lnbmlmaWNhIHF1ZSBvIG1vZGVsbyBudW5jYSBlcnJhPyAKCkNvbW8gcG9kZW1vcyBlc3RpbWFyIGEgcHJvYmFiaWxpZGFkZSBkZSBlcnJvPwoKLSBGw7NybXVsYSBmZWNoYWRhIChhcGVuYXMgdmFsaWRvIG5lc3RlIGNhc28gZW0gcGFydGljdWxhcikKLSBWYWxpZGHDp8OjbyBjcnV6YWRhLgoKCgoqKkNvZWZmaWNpZW50cyBvZiBsaW5lYXIgZGlzY3JpbWluYW50cz8qKgoKCiFbXShpbWFnZW5zL2R1dmlkYV9tZW1lLmpwZykKCmBgYHtyfQpkaXNjcmluYW50ZV9saW5lYXIkc2NhbGluZwpgYGAKCgotIEZpc2hlciAoMTkzOCksIHNvYiBvdXRyYSBhYm9yZGFnZW0gcXVlIG7Do28gYXNzdW1lIG5vcm1hbGlkYWRlLCBkZXJpdm91IHVtYSBmb3JtYSBkZSBjbGFzc2lmaWNhciAobcOpdG9kbyBjb25oZWNpZG8gY29tbyAqKkFuw6FsaXNlIERpc2NyaW1pbmFudGUgTGluZWFyIGRlIEZpc2hlcioqKS4KLSBBIGlkZWlhIMOpIHRyYW5zZm9ybWFyICRcdGV4dGJme3h9IFxpbiBcbWF0aGJie1J9XnAkIGVtICR6ID0gXGFscGhhJ1x0ZXh0YmZ7eH0gXGluIFxtYXRoYmJ7Un0kIGRlIGZvcm1hIHF1ZSAkeiQgc2VwYXJlICRQXzEkIGRlICRQXzIkIG8gbcOheGltbyBxdWUgcHVkZXIuCi0gU2VqYW0gJHpfezExfSwgXGNkb3RzLCB6X3sxbl8xfSQgYXMgdHJhbnNmb3JtYcOnw7VlcyAkeiA9IFxhbHBoYSdcdGV4dGJme3h9JCBwYXJhICRcdGV4dGJme3h9IFxpbiBQXzEkIGUgc2VqYW0gJHpfezIxfSwgXGNkb3RzLCB6X3sybl8yfSQgYXMgdHJhbnNmb3JtYcOnw7VlcyAkeiA9IFxhbHBoYSdcdGV4dGJme3h9JCBwYXJhICRcdGV4dGJme3h9IFxpbiBQXzIkIAotIFVtYSBmb3JtYSBkZSBtZWRpciBhIHNlcGFyYcOnw6NvIGRlIGFtYm9zIG9zIGdydXBvcyDDqSBhdHJhdmVzIGRhcyBzdWFzIG3DqWRpYXMgJFxiYXJ7en1fMSQgZSAkXGJhcnt6fV8yJC4gQXNzaW0gcXVlcmVtb3MgbWF4aW1pemFyICQkXHBoaSA9IFxkZnJhY3soXGJhcnt6fV8xIC0gXGJhcnt6fV8yKV4yfXtzXjJfen0gPSBcZGZyYWN7KFxhbHBoYScgW1xiYXJ7eH1fMSAtIFxiYXJ7eH1fMl0pXjJ9e1xhbHBoYScgXHRleHRiZntTfV94IFxhbHBoYX0gPSBcZGZyYWN7XGFscGhhJyBcb3ZlcmJyYWNleyhcYmFye3h9XzEgLSBcYmFye3h9XzIpKFxiYXJ7eH1fMSAtIFxiYXJ7eH1fMiknfV57XHRleHRiZntCfX1cYWxwaGF9e1xhbHBoYScgXHRleHRiZntTfV94IFxhbHBoYX0kJAoKRGVyaXZhbmRvIHcuci50ICRcYWxwaGEkIGUgaWd1YWxhbmRvIGEgemVybywgdGVtb3M6ICQkXGRmcmFjezJcdGV4dGJme0J9XGFscGhhKFxhbHBoYSdcdGV4dGJme1N9X3ggXGFscGhhKSAtIFxhbHBoYScgXHRleHRiZntCfVxhbHBoYSAyIFx0ZXh0YmZ7U31feCBcYWxwaGF9eyhcYWxwaGEnXHRleHRiZntTfV94IFxhbHBoYSleMn0gPSAwJCQKCiQkXHRleHRiZntCfVxhbHBoYSBcYWxwaGEnXHRleHRiZntTfV94XGFscGhhID0gXGFscGhhJ1x0ZXh0YmZ7Qn1cYWxwaGEgXHRleHRiZntTfV94IFxhbHBoYSBccmlnaHRhcnJvdyBcdGV4dGJme0J9XGFscGhhID0gXHVuZGVyYnJhY2V7XGRmcmFje1xhbHBoYSdcdGV4dGJme0J9XGFscGhhfXtcYWxwaGEnXHRleHRiZntTfV94IFxhbHBoYX19X3tccGhpfSBcdGV4dGJme1N9X3ggXGFscGhhIFxyaWdodGFycm93IFx0ZXh0YmZ7U31feF57LTF9XHRleHRiZntCfVxhbHBoYSA9IFxwaGkgXGFscGhhJCQKCgoKYGBge3J9CnggPC0gbWVkaWZpcyAlPiUgZmlsdGVyKHNleG8gPT0gMCkgJT4lIGRwbHlyOjpzZWxlY3QoIXNleG8pCnkgPC0gbWVkaWZpcyAlPiUgZmlsdGVyKHNleG8gPT0gMSkgJT4lIGRwbHlyOjpzZWxlY3QoIXNleG8pCm11XzEgPC0gbWF0cml4KGFwcGx5KHgsIDIsIG1lYW4pLCBuY29sID0gMSkKbXVfMiA8LSBtYXRyaXgoYXBwbHkoeSwgMiwgbWVhbiksIG5jb2wgPSAxKQpuXzEgPC0gbnJvdyh4KQpuXzIgPC0gbnJvdyh5KQpTIDwtIChuXzEgLSAxKSpjb3YoeCkgKyAobl8yIC0gMSkqY292KHkpClMgPC0gUy8obl8xICsgbl8yIC0gMikKQiA8LSAobXVfMSAtIG11XzIpICUqJSB0KG11XzEgLSBtdV8yKQplaWdlbihzb2x2ZShTKSAlKiUgQikkdmVjdG9yc1ssMV0KYGBgCgoKYGBge3J9CmRpc2NyaW5hbnRlX2xpbmVhciRzY2FsaW5nL3NxcnQoc3VtKGRpc2NyaW5hbnRlX2xpbmVhciRzY2FsaW5nXjIpKQpgYGAKCiMjIE9ic2VydmHDp8OjbzoKCi0gVmFsaWRhw6fDo28gY3J1emFkYSDDqSB1bWEgdMOpY25pY2EgcGFyYSBlc3RpbWFyIGEgcHJvYmFiaWxpZGFkZSBkZSBlcnJvIChwb2lzIG7Do28gc2VtcHJlIHBvZGVtb3MgZXN0aW1hciBlc3RhIHByb2JhYmlsaWRhZGUgZGUgZm9ybWEgYW5hbMOtdGljYSkuCi0gTmEgcHLDoXRpY2EsIHF1YW5kbyBvIHRhbWFuaG8gYW1vc3RyYWwgw6kgcGVxdWVubywgbWVzbW8gcXVlICRcU2lnbWFfMSBcbmVxIFxTaWdtYV8yJC4gQW7DoWxpc2UgZGlzY3JpbWluYW50ZSBsaW5lYXIgcG9kZSBhcHJlc2VudGFyIHVtIG1lbGhvciBkZXNlbXBlbmhvLiBJc3RvIMOpIGRldmlkbyBhIHF1ZSBhbsOhbGlzZSBkaXNjcmluYW50ZSBxdWFkcsOhdGljYSBpbXBsaWNhIGVzdGltYXIgbXVpdG9zIHBhcsOibWV0cm9zIGFkaWNpb25haXMuCi0gRXhpc3RlbSBkaXZlcnNhcyBleHRlbnPDtWVzIGRlIGFuw6FsaXNlIGRpc2NyaW1pbmFudGUgKGFsZ3VtYXMgZGVsYXMgc2Vyw6NvIGFwcmVzZW50YWRhcyBub3MgdHJhYmFsaG9zIGRhIHByw7N4aW1hIHNlbWFuYSkKCgo=