Choosing a major can be a daunting task. I have worked on clustering analysis to figure out high paying majors based on salary data in initial, mid and end career stages.
library(tidyverse)
library(factoextra)
library(cluster)
<- readr::read_csv("main_data.csv", col_names = c("College.Major", "Starting.Median.Salary",
df "Mid.Career.Median.Salary", "Career.Percent.Growth",
"Percentile.10", "Percentile.25", "Percentile.75",
"Percentile.90"), skip=1)
head(df)
## # A tibble: 6 x 8
## College.Major Starting.Median~ Mid.Career.Medi~ Career.Percent.~ Percentile.10
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Accounting 46000 77100 67.6 42200
## 2 Aerospace En~ 57700 101000 75 64300
## 3 Agriculture 42600 71900 68.8 36300
## 4 Anthropology 36800 61500 67.1 33800
## 5 Architecture 41600 76800 84.6 50600
## 6 Art History 35800 64900 81.3 28800
## # ... with 3 more variables: Percentile.25 <dbl>, Percentile.75 <dbl>,
## # Percentile.90 <dbl>
<- df %>%
k.means.data select(c("Starting.Median.Salary", "Mid.Career.Median.Salary", "Percentile.10", "Percentile.90")) %>%
scale()
head(k.means.data)
## Starting.Median.Salary Mid.Career.Median.Salary Percentile.10
## [1,] 0.1805388 0.1438303 -0.1006601
## [2,] 1.4304232 1.6293723 1.7408869
## [3,] -0.1826754 -0.1793839 -0.5922949
## [4,] -0.8022762 -0.8258122 -0.8006147
## [5,] -0.2895031 0.1251833 0.5992944
## [6,] -0.9091039 -0.6144799 -1.2172543
## Percentile.90
## [1,] 0.3315471
## [2,] 0.6546924
## [3,] 0.2597370
## [4,] -0.1711234
## [5,] -0.2429334
## [6,] -0.6378888
fviz_nbclust(k.means.data, kmeans , method = "wss")
fviz_nbclust(k.means.data, kmeans , method = "silhouette")
<- cluster::clusGap(k.means.data, FUNcluster = kmeans, nstart=25, K.max=10, B=50) gap_stat
fviz_gap_stat(gap_stat)
<- 3
num_cluster set.seed(3)
<- kmeans(k.means.data, centers=num_cluster, iter.max = 15, nstart = 25) k_means
<- df %>%
df.labelled mutate(clusters = k_means$cluster)
ggplot(df.labelled, aes(Mid.Career.Median.Salary, Starting.Median.Salary, color=clusters))+
geom_point()+
xlab("Median Salary mid career")+
ylab("Median salary start career")+
scale_x_continuous(labels= scales::dollar)+
scale_y_continuous(labels= scales::dollar)
<- df.labelled%>% select(c("College.Major", "Percentile.10", "Percentile.25", "Mid.Career.Median.Salary", "Percentile.75", "Percentile.90", "clusters")) %>%
df_perc gather(key = "Percentile", value="Salary", -c(College.Major, clusters))%>%
mutate(Percentile=factor(Percentile,levels=c('Percentile.10','Percentile.25',
'Mid.Career.Median.Salary',
'Percentile.75','Percentile.90')))
ggplot(df_perc[df_perc$clusters==1,], aes(x=Percentile,y=Salary, group=College.Major, color=College.Major, order=Salary)) +
geom_point() +
geom_line() +
ggtitle('Cluster 1: The Liberal Arts') +
theme(axis.text.x = element_text(size=7, angle=25))
ggplot(df_perc[df_perc$clusters==2,], aes(x=Percentile,y=Salary, group=College.Major, color=College.Major, order=Salary)) +
geom_point() +
geom_line() +
ggtitle('Cluster 2: The Goldilocks') +
theme(axis.text.x = element_text(size=7, angle=25))
ggplot(df_perc[df_perc$clusters==3,], aes(x=Percentile,y=Salary, group=College.Major, color=College.Major, order=Salary)) +
geom_point() +
geom_line() +
ggtitle('Cluster 2: The Over Achievers') +
theme(axis.text.x = element_text(size=7, angle=25))
arrange(df.labelled,desc(Career.Percent.Growth))
## # A tibble: 50 x 9
## College.Major Starting.Median~ Mid.Career.Medi~ Career.Percent.~
## <chr> <dbl> <dbl> <dbl>
## 1 Math 45400 92400 104.
## 2 Philosophy 39900 81200 104.
## 3 Internationa~ 40900 80900 97.8
## 4 Economics 50100 98600 96.8
## 5 Marketing 40800 79600 95.1
## 6 Physics 50300 97300 93.4
## 7 Political Sc~ 40800 78200 91.7
## 8 Chemistry 42600 79900 87.6
## 9 Journalism 35600 66700 87.4
## 10 Architecture 41600 76800 84.6
## # ... with 40 more rows, and 5 more variables: Percentile.10 <dbl>,
## # Percentile.25 <dbl>, Percentile.75 <dbl>, Percentile.90 <dbl>,
## # clusters <int>