Cluster analysis of college major and salary in R

Introduction

Choosing a major can be a daunting task. I have worked on clustering analysis to figure out high paying majors based on salary data in initial, mid and end career stages.

library(tidyverse)
library(factoextra)
library(cluster)

df <- readr::read_csv("main_data.csv", col_names = c("College.Major", "Starting.Median.Salary", 
                                                     "Mid.Career.Median.Salary", "Career.Percent.Growth", 
                                                     "Percentile.10", "Percentile.25", "Percentile.75", 
                                                     "Percentile.90"), skip=1)

head(df)

## # A tibble: 6 x 8
##   College.Major Starting.Median~ Mid.Career.Medi~ Career.Percent.~ Percentile.10
##   <chr>                    <dbl>            <dbl>            <dbl>         <dbl>
## 1 Accounting               46000            77100             67.6         42200
## 2 Aerospace En~            57700           101000             75           64300
## 3 Agriculture              42600            71900             68.8         36300
## 4 Anthropology             36800            61500             67.1         33800
## 5 Architecture             41600            76800             84.6         50600
## 6 Art History              35800            64900             81.3         28800
## # ... with 3 more variables: Percentile.25 <dbl>, Percentile.75 <dbl>,
## #   Percentile.90 <dbl>

Preparing data for clustering analysis

k.means.data <- df %>% 
    select(c("Starting.Median.Salary", "Mid.Career.Median.Salary", "Percentile.10", "Percentile.90")) %>%
    scale()

head(k.means.data)

##      Starting.Median.Salary Mid.Career.Median.Salary Percentile.10
## [1,]              0.1805388                0.1438303    -0.1006601
## [2,]              1.4304232                1.6293723     1.7408869
## [3,]             -0.1826754               -0.1793839    -0.5922949
## [4,]             -0.8022762               -0.8258122    -0.8006147
## [5,]             -0.2895031                0.1251833     0.5992944
## [6,]             -0.9091039               -0.6144799    -1.2172543
##      Percentile.90
## [1,]     0.3315471
## [2,]     0.6546924
## [3,]     0.2597370
## [4,]    -0.1711234
## [5,]    -0.2429334
## [6,]    -0.6378888

fviz_nbclust(k.means.data, kmeans , method = "wss")

fviz_nbclust(k.means.data, kmeans , method = "silhouette")

gap_stat <- cluster::clusGap(k.means.data, FUNcluster = kmeans, nstart=25, K.max=10, B=50)

fviz_gap_stat(gap_stat)

K-Means clustering

num_cluster <- 3
set.seed(3)
k_means <- kmeans(k.means.data, centers=num_cluster, iter.max = 15, nstart = 25)

df.labelled <- df %>% 
    mutate(clusters = k_means$cluster)

ggplot(df.labelled, aes(Mid.Career.Median.Salary, Starting.Median.Salary, color=clusters))+
    geom_point()+
    xlab("Median Salary mid career")+
    ylab("Median salary start career")+
    scale_x_continuous(labels= scales::dollar)+
    scale_y_continuous(labels= scales::dollar)

Arranging data by factoring percentile category

df_perc <- df.labelled%>% select(c("College.Major", "Percentile.10", "Percentile.25", "Mid.Career.Median.Salary", "Percentile.75", "Percentile.90", "clusters")) %>%
    gather(key = "Percentile", value="Salary", -c(College.Major, clusters))%>%
    mutate(Percentile=factor(Percentile,levels=c('Percentile.10','Percentile.25',
                                                 'Mid.Career.Median.Salary',
                                                 'Percentile.75','Percentile.90')))

ggplot(df_perc[df_perc$clusters==1,], aes(x=Percentile,y=Salary, group=College.Major, color=College.Major, order=Salary)) +
    geom_point() +
    geom_line() +
    ggtitle('Cluster 1:  The Liberal Arts') +
    theme(axis.text.x = element_text(size=7, angle=25))

ggplot(df_perc[df_perc$clusters==2,], aes(x=Percentile,y=Salary, group=College.Major, color=College.Major, order=Salary)) +
    geom_point() +
    geom_line() +
    ggtitle('Cluster 2:  The Goldilocks') +
    theme(axis.text.x = element_text(size=7, angle=25))

ggplot(df_perc[df_perc$clusters==3,], aes(x=Percentile,y=Salary, group=College.Major, color=College.Major, order=Salary)) +
    geom_point() +
    geom_line() +
    ggtitle('Cluster 2: The Over Achievers') +
    theme(axis.text.x = element_text(size=7, angle=25))

High paying careers

arrange(df.labelled,desc(Career.Percent.Growth))

## # A tibble: 50 x 9
##    College.Major Starting.Median~ Mid.Career.Medi~ Career.Percent.~
##    <chr>                    <dbl>            <dbl>            <dbl>
##  1 Math                     45400            92400            104. 
##  2 Philosophy               39900            81200            104. 
##  3 Internationa~            40900            80900             97.8
##  4 Economics                50100            98600             96.8
##  5 Marketing                40800            79600             95.1
##  6 Physics                  50300            97300             93.4
##  7 Political Sc~            40800            78200             91.7
##  8 Chemistry                42600            79900             87.6
##  9 Journalism               35600            66700             87.4
## 10 Architecture             41600            76800             84.6
## # ... with 40 more rows, and 5 more variables: Percentile.10 <dbl>,
## #   Percentile.25 <dbl>, Percentile.75 <dbl>, Percentile.90 <dbl>,
## #   clusters <int>

Cluster analysis of college major and salary in R

Jay Khedekar

Introduction

Preparing data for clustering analysis

K-Means clustering

Arranging data by factoring percentile category

High paying careers