- Basic Question: How to represent any data in two dimensions?
- Projection
- Squeeze data onto a table: close points stay close.
- Both ideas yield sensible visualization - but show a different aspect of the data.
April 26, 2019
library(datasets); class(eurodist)
## [1] "dist"
Athens Barcelona Brussels Calais Cherbourg Cologne Barcelona 3313 Brussels 2963 1318 Calais 3175 1326 204 Cherbourg 3339 1294 583 460 Cologne 2762 1498 206 409 785
eurocmd <- cmdscale(eurodist) plot(eurocmd, type = "n") text(eurocmd, rownames(eurocmd), cex = 0.7)
plot(eurocmd[,1], -eurocmd[,2], type = "n", asp = 1) text(eurocmd[,1], -eurocmd[,2], rownames(eurocmd), cex = 0.7)
data("USairpollution", package = "HSAUR2") summary(USairpollution)
## SO2 temp manu popul ## Min. : 8.00 Min. :43.50 Min. : 35.0 Min. : 71.0 ## 1st Qu.: 13.00 1st Qu.:50.60 1st Qu.: 181.0 1st Qu.: 299.0 ## Median : 26.00 Median :54.60 Median : 347.0 Median : 515.0 ## Mean : 30.05 Mean :55.76 Mean : 463.1 Mean : 608.6 ## 3rd Qu.: 35.00 3rd Qu.:59.30 3rd Qu.: 462.0 3rd Qu.: 717.0 ## Max. :110.00 Max. :75.50 Max. :3344.0 Max. :3369.0 ## wind precip predays ## Min. : 6.000 Min. : 7.05 Min. : 36.0 ## 1st Qu.: 8.700 1st Qu.:30.96 1st Qu.:103.0 ## Median : 9.300 Median :38.74 Median :115.0 ## Mean : 9.444 Mean :36.77 Mean :113.9 ## 3rd Qu.:10.600 3rd Qu.:43.11 3rd Qu.:128.0 ## Max. :12.700 Max. :59.80 Max. :166.0
dat <- USairpollution # less typing
xs <- apply(dat, 2, function(x) (x - min(x))/(diff(range(x)))) poldist <- dist(xs) pol.mds <- cmdscale(poldist, k = 2, eig = TRUE) x <- pol.mds$points pol.mds
## $points ## [,1] [,2] ## Albany 0.140558172 -0.046859954 ## Albuquerque -0.364824787 -0.636602091 ## Atlanta -0.155922591 0.244511276 ## Baltimore 0.153189990 0.067519907 ## Buffalo 0.256244063 0.003022604 ## Charleston -0.128730958 0.215783429 ## Chicago 1.197000315 0.009638168 ## Cincinnati -0.084166097 0.106828800 ## Cleveland 0.531787447 0.056305378 ## Columbus 0.025412911 0.033574934 ## Dallas -0.258008194 -0.062640448 ## Denver -0.110682033 -0.510378502 ## Des Moines -0.007603614 -0.244344703 ## Detroit 0.341537781 -0.105917971 ## Hartford 0.206766531 0.105259858 ## Houston -0.188167760 0.243707765 ## Indianapolis 0.069589745 0.010565926 ## Jacksonville -0.349520267 0.412490203 ## Kansas City -0.106424371 -0.085118726 ## Little Rock -0.355970056 0.194004542 ## Louisville -0.046780470 0.144850917 ## Memphis -0.249259311 0.208737990 ## Miami -0.449823739 0.604996816 ## Milwaukee 0.217298744 -0.249612250 ## Minneapolis 0.326439578 -0.242858309 ## Nashville -0.215002650 0.211835269 ## New Orleans -0.410715158 0.438263300 ## Norfolk -0.066285208 0.149134571 ## Omaha -0.063335982 -0.241936316 ## Philadelphia 0.521031706 0.081089446 ## Phoenix -0.695773353 -0.527859295 ## Pittsburgh 0.314965899 0.074640031 ## Providence 0.466505620 0.110503750 ## Richmond -0.191967563 0.140461889 ## Salt Lake City -0.111111665 -0.461383196 ## San Francisco -0.253430076 -0.401897024 ## Seattle 0.170829143 0.147411289 ## St. Louis 0.162208664 -0.016576959 ## Washington -0.031338057 0.041417952 ## Wichita -0.149744969 -0.268806546 ## Wilmington -0.056777379 0.046236280 ## ## $eig ## [1] 4.456648e+00 2.819944e+00 2.256196e+00 1.651762e+00 6.199354e-01 ## [6] 1.904906e-01 3.068220e-02 1.558353e-15 9.406328e-16 2.494225e-16 ## [11] 1.736021e-16 1.471280e-16 1.356518e-16 8.017147e-17 7.511957e-17 ## [16] 6.686099e-17 5.684599e-17 5.034791e-17 4.025565e-17 3.312471e-17 ## [21] 2.974204e-17 1.555983e-17 1.132251e-17 3.668800e-18 -5.206488e-18 ## [26] -8.948794e-18 -9.519928e-18 -1.506805e-17 -1.853275e-17 -2.314710e-17 ## [31] -2.858271e-17 -3.093804e-17 -3.151435e-17 -3.396470e-17 -7.209856e-17 ## [36] -7.714641e-17 -1.524915e-16 -2.390840e-16 -2.833661e-16 -3.238640e-16 ## [41] -1.263609e-15 ## ## $x ## NULL ## ## $ac ## [1] 0 ## ## $GOF ## [1] 0.6050889 0.6050889
plot(x[,1], x[,2], type = "n") text(x[,1], x[,2], labels = rownames(x), cex = 0.7)
stars(xs, draw.segments = TRUE, key.loc = c(15,2))
- Jacksonville and New Orleans have similar profile.
\[ d(x,y) \ge 0 \\ d(x,y) = 0 \text{ iff } x = y \\ d(x,y) = d(y,x) \\ d(x,z) \le d(x,y) + d(y,z) \]
Given a dissimilarity matrix \(D = (d_{ij})\), MDS seeks to find \(x_1, \ldots, x_n \in \mathbb{R}^p\), such that: \[ d_{ij} \approx \Vert x_i - x_j \Vert^2 \text{ as close as possible} \]
Such a distance is called non-Euclidean distance.
library(randomForest) ind <- sample(2,nrow(iris),replace=TRUE,prob=c(0.7,0.3)) trainData <- iris[ind==1,] testData <- iris[ind==2,] iris_rf <- randomForest(Species~.,data=trainData,ntree=100,proximity=TRUE) table(predict(iris_rf),trainData$Species)
## ## setosa versicolor virginica ## setosa 35 0 0 ## versicolor 0 37 2 ## virginica 0 1 34
plot(iris_rf)
varImpPlot(iris_rf)
irisPred<-predict(iris_rf,newdata=testData) table(irisPred, testData$Species)
## ## irisPred setosa versicolor virginica ## setosa 15 0 0 ## versicolor 0 11 2 ## virginica 0 1 12
The margin of a data point is defined as the proportion of votes for the correct class minus maximum proportion of votes for the other classes.
Thus under majority votes, positive margin means correct classification, and vice versa.
plot(margin(iris_rf,testData$Species))
randomForest
is the fraction of trees in which elements \(i\) and \(j\) fall in the same terminal node.set.seed(71) iris.rf <- randomForest(Species ~ ., data=iris, importance=TRUE, proximity=TRUE) ## Do MDS on 1 - proximity: iris.mds <- cmdscale(1 - iris.rf$proximity, eig=TRUE) ## Look at variable importance: round(importance(iris.rf), 2)
## setosa versicolor virginica MeanDecreaseAccuracy ## Sepal.Length 5.88 5.87 9.21 10.62 ## Sepal.Width 5.23 0.31 4.71 4.94 ## Petal.Length 21.60 31.41 27.71 32.39 ## Petal.Width 22.96 33.74 32.07 33.85 ## MeanDecreaseGini ## Sepal.Length 9.37 ## Sepal.Width 2.45 ## Petal.Length 42.13 ## Petal.Width 45.28
op <- par(pty="s") pairs(cbind(iris[,1:4], iris.mds$points), cex=0.6, gap=0, col=c("red", "green", "blue")[as.numeric(iris$Species)], main="Iris Data: Predictors and MDS of Proximity Based on RandomForest")
par(op)
data("voting", package = "HSAUR") require("MASS") colnames(voting)
## [1] "Hunt(R)" "Sandman(R)" "Howard(D)" ## [4] "Thompson(D)" "Freylinghuysen(R)" "Forsythe(R)" ## [7] "Widnall(R)" "Roe(D)" "Heltoski(D)" ## [10] "Rodino(D)" "Minish(D)" "Rinaldo(R)" ## [13] "Maraziti(R)" "Daniels(D)" "Patten(D)"
voting.mds <- cmdscale(voting, eig=TRUE) names(voting.mds)
## [1] "points" "eig" "x" "ac" "GOF"
plot(voting.mds$points[,1], voting.mds$points[,2], type = "n", xlab = "Coordinate 1", ylab = "Coordinate 2", xlim = range(voting.mds$points[,1])*1.2) text(voting.mds$points[,1], voting.mds$points[,2], labels = colnames(voting))
voting_mds <- isoMDS(voting)
## initial value 15.268246 ## iter 5 value 10.264075 ## final value 9.879047 ## converged
plot(voting_mds$points[,1], voting_mds$points[,2], type = "n", xlab = "Coordinate 1", ylab = "Coordinate 2", xlim = range(voting_mds$points[,1])*1.2) text(voting_mds$points[,1], voting_mds$points[,2], labels = colnames(voting))