Multidimensional Scaling

Athens Barcelona Brussels Calais Cherbourg Cologne Barcelona 3313 Brussels 2963 1318 Calais 3175 1326 204 Cherbourg 3339 1294 583 460 Cologne 2762 1498 206 409 785

## SO2 temp manu popul ## Min. : 8.00 Min. :43.50 Min. : 35.0 Min. : 71.0 ## 1st Qu.: 13.00 1st Qu.:50.60 1st Qu.: 181.0 1st Qu.: 299.0 ## Median : 26.00 Median :54.60 Median : 347.0 Median : 515.0 ## Mean : 30.05 Mean :55.76 Mean : 463.1 Mean : 608.6 ## 3rd Qu.: 35.00 3rd Qu.:59.30 3rd Qu.: 462.0 3rd Qu.: 717.0 ## Max. :110.00 Max. :75.50 Max. :3344.0 Max. :3369.0 ## wind precip predays ## Min. : 6.000 Min. : 7.05 Min. : 36.0 ## 1st Qu.: 8.700 1st Qu.:30.96 1st Qu.:103.0 ## Median : 9.300 Median :38.74 Median :115.0 ## Mean : 9.444 Mean :36.77 Mean :113.9 ## 3rd Qu.:10.600 3rd Qu.:43.11 3rd Qu.:128.0 ## Max. :12.700 Max. :59.80 Max. :166.0

xs <- apply(dat, 2, function(x) (x - min(x))/(diff(range(x))))
poldist <- dist(xs)
pol.mds <- cmdscale(poldist, k = 2, eig = TRUE)
x <- pol.mds$points
pol.mds

## $points
##                        [,1]         [,2]
## Albany          0.140558172 -0.046859954
## Albuquerque    -0.364824787 -0.636602091
## Atlanta        -0.155922591  0.244511276
## Baltimore       0.153189990  0.067519907
## Buffalo         0.256244063  0.003022604
## Charleston     -0.128730958  0.215783429
## Chicago         1.197000315  0.009638168
## Cincinnati     -0.084166097  0.106828800
## Cleveland       0.531787447  0.056305378
## Columbus        0.025412911  0.033574934
## Dallas         -0.258008194 -0.062640448
## Denver         -0.110682033 -0.510378502
## Des Moines     -0.007603614 -0.244344703
## Detroit         0.341537781 -0.105917971
## Hartford        0.206766531  0.105259858
## Houston        -0.188167760  0.243707765
## Indianapolis    0.069589745  0.010565926
## Jacksonville   -0.349520267  0.412490203
## Kansas City    -0.106424371 -0.085118726
## Little Rock    -0.355970056  0.194004542
## Louisville     -0.046780470  0.144850917
## Memphis        -0.249259311  0.208737990
## Miami          -0.449823739  0.604996816
## Milwaukee       0.217298744 -0.249612250
## Minneapolis     0.326439578 -0.242858309
## Nashville      -0.215002650  0.211835269
## New Orleans    -0.410715158  0.438263300
## Norfolk        -0.066285208  0.149134571
## Omaha          -0.063335982 -0.241936316
## Philadelphia    0.521031706  0.081089446
## Phoenix        -0.695773353 -0.527859295
## Pittsburgh      0.314965899  0.074640031
## Providence      0.466505620  0.110503750
## Richmond       -0.191967563  0.140461889
## Salt Lake City -0.111111665 -0.461383196
## San Francisco  -0.253430076 -0.401897024
## Seattle         0.170829143  0.147411289
## St. Louis       0.162208664 -0.016576959
## Washington     -0.031338057  0.041417952
## Wichita        -0.149744969 -0.268806546
## Wilmington     -0.056777379  0.046236280
## 
## $eig
##  [1]  4.456648e+00  2.819944e+00  2.256196e+00  1.651762e+00  6.199354e-01
##  [6]  1.904906e-01  3.068220e-02  1.558353e-15  9.406328e-16  2.494225e-16
## [11]  1.736021e-16  1.471280e-16  1.356518e-16  8.017147e-17  7.511957e-17
## [16]  6.686099e-17  5.684599e-17  5.034791e-17  4.025565e-17  3.312471e-17
## [21]  2.974204e-17  1.555983e-17  1.132251e-17  3.668800e-18 -5.206488e-18
## [26] -8.948794e-18 -9.519928e-18 -1.506805e-17 -1.853275e-17 -2.314710e-17
## [31] -2.858271e-17 -3.093804e-17 -3.151435e-17 -3.396470e-17 -7.209856e-17
## [36] -7.714641e-17 -1.524915e-16 -2.390840e-16 -2.833661e-16 -3.238640e-16
## [41] -1.263609e-15
## 
## $x
## NULL
## 
## $ac
## [1] 0
## 
## $GOF
## [1] 0.6050889 0.6050889

library(randomForest) ind <- sample(2,nrow(iris),replace=TRUE,prob=c(0.7,0.3)) trainData <- iris[ind==1,] testData <- iris[ind==2,] iris_rf <- randomForest(Species~.,data=trainData,ntree=100,proximity=TRUE) table(predict(iris_rf),trainData$Species)

set.seed(71) iris.rf <- randomForest(Species ~ ., data=iris, importance=TRUE, proximity=TRUE) ## Do MDS on 1 - proximity: iris.mds <- cmdscale(1 - iris.rf$proximity, eig=TRUE) ## Look at variable importance: round(importance(iris.rf), 2)

## setosa versicolor virginica MeanDecreaseAccuracy ## Sepal.Length 5.88 5.87 9.21 10.62 ## Sepal.Width 5.23 0.31 4.71 4.94 ## Petal.Length 21.60 31.41 27.71 32.39 ## Petal.Width 22.96 33.74 32.07 33.85 ## MeanDecreaseGini ## Sepal.Length 9.37 ## Sepal.Width 2.45 ## Petal.Length 42.13 ## Petal.Width 45.28

op <- par(pty="s") pairs(cbind(iris[,1:4], iris.mds$points), cex=0.6, gap=0, col=c("red", "green", "blue")[as.numeric(iris$Species)], main="Iris Data: Predictors and MDS of Proximity Based on RandomForest")

## [1] "Hunt(R)" "Sandman(R)" "Howard(D)" ## [4] "Thompson(D)" "Freylinghuysen(R)" "Forsythe(R)" ## [7] "Widnall(R)" "Roe(D)" "Heltoski(D)" ## [10] "Rodino(D)" "Minish(D)" "Rinaldo(R)" ## [13] "Maraziti(R)" "Daniels(D)" "Patten(D)"

plot(voting.mds$points[,1], voting.mds$points[,2], type = "n", xlab = "Coordinate 1", ylab = "Coordinate 2", xlim = range(voting.mds$points[,1])*1.2) text(voting.mds$points[,1], voting.mds$points[,2], labels = colnames(voting))

plot(voting_mds$points[,1], voting_mds$points[,2], type = "n", xlab = "Coordinate 1", ylab = "Coordinate 2", xlim = range(voting_mds$points[,1])*1.2) text(voting_mds$points[,1], voting_mds$points[,2], labels = colnames(voting))

Multidimensional Scaling

Which idea is better?

Idea of MDS

Goal of MDS

Examples first - Classical MDS

First Try MDS

Do we recover?

Flip Axes

Another Example

Try MDS at 2-D

Plot the 2-D map

Star plot

Thoery of MDS

Distance

Thoery of MDS

Thoery (continued)

Thoery (hand-waving)

Thoery

Classical MDS

Random Forest and MDS

Plot the Random Forest

Variable Importance

Prediction on new data

Margin of Random Forest

MDS

MDS on the full Iris Data

Plot the variables

Example

Solution - Metric

Plot - Metric

Non-metric

Plot