# install and load some libraries
# https://cran.r-project.org/web/packages/ggplot2/index.html
if (!require('ggplot2')) install.packages('ggplot2'); library('ggplot2')
# https://cran.r-project.org/web/packages/dplyr/index.html
if (!require('dplyr')) install.packages('dplyr'); library('dplyr')
# https://CRAN.R-project.org/package=ggthemes
if (!require('ggthemes')) install.packages('ggthemes'); library('ggthemes')
Loading the dataset
# adult.data has no header row, so the 15 column names are supplied
# explicitly, in file order, and passed to read.csv() via col.names.
column.names <- c(
  "age", "workclass", "fnlwgt", "education", "education.num",
  "marital.status", "occupation", "relationship", "race", "sex",
  "capital.gain", "capital.loss", "hours.per.week", "native.country",
  "income"
)
# Load the raw census file. header = FALSE because the file has no header
# row; the names come from column.names instead.
# strip.white = TRUE trims leading/trailing white space from unquoted
# character fields, so values are stored as "Private" rather than " Private".
# NOTE(review): the original comment said strip.white is used to keep out the
# trailing empty rows - confirm whether that is still needed, since read.csv()
# already skips blank lines by default (blank.lines.skip = TRUE).
adult.data <- read.csv("../data/adult.data", stringsAsFactors = FALSE,
                       header = FALSE, col.names = column.names,
                       strip.white = TRUE)
#rows <- dim(adult.data)[1]
We verify the structure of the dataset
# Compact overview of the data frame: dimensions, column types and the
# first values of every column.
str(adult.data)
## 'data.frame': 32561 obs. of 15 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : chr "State-gov" "Self-emp-not-inc" "Private" "Private" ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : chr "Bachelors" "Bachelors" "HS-grad" "11th" ...
## $ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital.status: chr "Never-married" "Married-civ-spouse" "Divorced" "Married-civ-spouse" ...
## $ occupation : chr "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
## $ relationship : chr "Not-in-family" "Husband" "Not-in-family" "Husband" ...
## $ race : chr "White" "White" "White" "Black" ...
## $ sex : chr "Male" "Male" "Male" "Male" ...
## $ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: int 40 13 40 40 40 40 16 45 50 40 ...
## $ native.country: chr "United-States" "United-States" "United-States" "United-States" ...
## $ income : chr "<=50K" "<=50K" "<=50K" "<=50K" ...
The dataset has 32561 rows and 15 variables.
We check that the description of the variables and the loaded variable types correspond to those of the file:
We get the basic statistics:
# Per-column summary: quartiles and mean for the integer columns,
# length/class/mode for the character columns.
summary(adult.data)
## age workclass fnlwgt education
## Min. :17.00 Length:32561 Min. : 12285 Length:32561
## 1st Qu.:28.00 Class :character 1st Qu.: 117827 Class :character
## Median :37.00 Mode :character Median : 178356 Mode :character
## Mean :38.58 Mean : 189778
## 3rd Qu.:48.00 3rd Qu.: 237051
## Max. :90.00 Max. :1484705
## education.num marital.status occupation relationship
## Min. : 1.00 Length:32561 Length:32561 Length:32561
## 1st Qu.: 9.00 Class :character Class :character Class :character
## Median :10.00 Mode :character Mode :character Mode :character
## Mean :10.08
## 3rd Qu.:12.00
## Max. :16.00
## race sex capital.gain capital.loss
## Length:32561 Length:32561 Min. : 0 Min. : 0.0
## Class :character Class :character 1st Qu.: 0 1st Qu.: 0.0
## Mode :character Mode :character Median : 0 Median : 0.0
## Mean : 1078 Mean : 87.3
## 3rd Qu.: 0 3rd Qu.: 0.0
## Max. :99999 Max. :4356.0
## hours.per.week native.country income
## Min. : 1.00 Length:32561 Length:32561
## 1st Qu.:40.00 Class :character Class :character
## Median :40.00 Mode :character Mode :character
## Mean :40.44
## 3rd Qu.:45.00
## Max. :99.00
We look for variables with null data:
# Number of NA values in each column.
colSums(is.na(adult.data))
## age workclass fnlwgt education education.num
## 0 0 0 0 0
## marital.status occupation relationship race sex
## 0 0 0 0 0
## capital.gain capital.loss hours.per.week native.country income
## 0 0 0 0 0
# Number of empty-string values in each column.
colSums(adult.data=="")
## age workclass fnlwgt education education.num
## 0 0 0 0 0
## marital.status occupation relationship race sex
## 0 0 0 0 0
## capital.gain capital.loss hours.per.week native.country income
## 0 0 0 0 0
At first glance, it is observed that there are no null values. We also check that the values of the text fields are valid values.
# Distinct values of workclass, to spot invalid or placeholder categories.
unique(adult.data$workclass)
## [1] "State-gov" "Self-emp-not-inc" "Private" "Federal-gov"
## [5] "Local-gov" "?" "Self-emp-inc" "Without-pay"
## [9] "Never-worked"
# Distinct values of education.
unique(adult.data$education)
## [1] "Bachelors" "HS-grad" "11th" "Masters" "9th"
## [6] "Some-college" "Assoc-acdm" "Assoc-voc" "7th-8th" "Doctorate"
## [11] "Prof-school" "5th-6th" "10th" "1st-4th" "Preschool"
## [16] "12th"
# Distinct values of marital.status.
unique(adult.data$marital.status)
## [1] "Never-married" "Married-civ-spouse" "Divorced"
## [4] "Married-spouse-absent" "Separated" "Married-AF-spouse"
## [7] "Widowed"
# Distinct values of occupation, to spot invalid or placeholder categories.
unique(adult.data$occupation)
## [1] "Adm-clerical" "Exec-managerial" "Handlers-cleaners"
## [4] "Prof-specialty" "Other-service" "Sales"
## [7] "Craft-repair" "Transport-moving" "Farming-fishing"
## [10] "Machine-op-inspct" "Tech-support" "?"
## [13] "Protective-serv" "Armed-Forces" "Priv-house-serv"
# Distinct values of relationship.
unique(adult.data$relationship)
## [1] "Not-in-family" "Husband" "Wife" "Own-child"
## [5] "Unmarried" "Other-relative"
# Distinct values of race.
unique(adult.data$race)
## [1] "White" "Black" "Asian-Pac-Islander"
## [4] "Amer-Indian-Eskimo" "Other"
# Distinct values of sex.
unique(adult.data$sex)
## [1] "Male" "Female"
# Distinct values of native.country, to spot invalid or placeholder
# categories.
unique(adult.data$native.country)
## [1] "United-States" "Cuba"
## [3] "Jamaica" "India"
## [5] "?" "Mexico"
## [7] "South" "Puerto-Rico"
## [9] "Honduras" "England"
## [11] "Canada" "Germany"
## [13] "Iran" "Philippines"
## [15] "Italy" "Poland"
## [17] "Columbia" "Cambodia"
## [19] "Thailand" "Ecuador"
## [21] "Laos" "Taiwan"
## [23] "Haiti" "Portugal"
## [25] "Dominican-Republic" "El-Salvador"
## [27] "France" "Guatemala"
## [29] "China" "Japan"
## [31] "Yugoslavia" "Peru"
## [33] "Outlying-US(Guam-USVI-etc)" "Scotland"
## [35] "Trinadad&Tobago" "Greece"
## [37] "Nicaragua" "Vietnam"
## [39] "Hong" "Ireland"
## [41] "Hungary" "Holand-Netherlands"
# Distinct values of income (the target variable).
unique(adult.data$income)
## [1] "<=50K" ">50K"
# Number of "?" placeholder values in each column.
colSums(adult.data=="?")
## age workclass fnlwgt education education.num
## 0 1836 0 0 0
## marital.status occupation relationship race sex
## 0 1843 0 0 0
## capital.gain capital.loss hours.per.week native.country income
## 0 0 0 583 0
The unknown values have been labeled as ‘?’. We have unknown values in workclass, occupation, and native.country.
Next, we derive some new variables. We will create capital as capital.gain minus capital.loss. First, we add a new field age.segment with the age discretized in intervals.
# Discretize age into labelled intervals. cut() uses right-closed bins, so
# the break at 17 puts age 17 in "< 18" and age 18 in "18-27", and so on up
# to (87, 100] for "> 87".
adult.data$age.segment <- cut(
  adult.data$age,
  breaks = c(0, 17, 27, 37, 47, 57, 67, 77, 87, 100),
  labels = c("< 18", "18-27", "28-37", "38-47",
             "48-57", "58-67", "68-77", "78-87", "> 87")
)
# Number of records in each age segment.
ggplot(adult.data, aes(x = age.segment)) +
  geom_bar(fill = color.main) +
  theme_clean()
As we had seen from the IQR, the segments with the largest population in the census are those between the ages of 18 and 48.
Let’s look at the variables related to education:
# Records per education.num value, filled by the education label. The
# palette is sized to the number of distinct education.num values.
ggplot(adult.data, aes(x = education.num, fill = education)) +
  geom_bar() +
  scale_fill_manual(
    values = big.palette(length(unique(adult.data$education.num)))
  ) +
  theme_clean()
We can see the educational system of the United States in wikipedia (https://en.wikipedia.org/wiki/Education_in_the_United_States), so we can create a new discretized variable based on the education.num field that will be grouped in the following intervals :
# Group education.num into four right-closed intervals:
# (0, 4], (4, 8], (8, 12] and (12, 17].
adult.data$education.segment <- cut(
  adult.data$education.num,
  breaks = c(0, 4, 8, 12, 17),
  labels = c("0 to 4", "5 to 8", "9 to 12", ">= 13")
)
# Number of records in each education segment.
ggplot(adult.data, aes(x = education.segment)) +
  geom_bar(fill = color.main) +
  theme_clean()
In both graphs we can see how, in our data, a large part of the individuals have attended some type of higher education.
Now, we create the new variable capital.
# Net capital: gains minus losses (negative when losses exceed gains).
adult.data$capital <- adult.data$capital.gain - adult.data$capital.loss
# Histogram of net capital in bins of width 5000, followed by its summary
# statistics.
ggplot(adult.data, aes(x = capital)) +
  geom_histogram(binwidth = 5000, fill = color.main) +
  theme_clean()
summary(adult.data$capital)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4356.0 0.0 0.0 990.4 0.0 99999.0
We standardize the created variable.
# Standardize net capital (subtract the mean, divide by the standard
# deviation). scale() returns a one-column matrix, which is why summary()
# reports the column as V1.
adult.data$capital.scaled <- scale(adult.data$capital, center = TRUE,
                                   scale = TRUE)
summary(adult.data$capital.scaled)
## V1
## Min. :-0.7216
## 1st Qu.:-0.1337
## Median :-0.1337
## Mean : 0.0000
## 3rd Qu.:-0.1337
## Max. :13.3633
And we discretize it.
# Discretize the standardized capital into three right-closed intervals:
# (-1, 0] "Losses", (0, 1] "Gains", (1, 14] "Large Gains".
# NOTE(review): a net capital of 0 standardizes to about -0.13 (the median
# of capital.scaled), so records with neither gains nor losses fall into
# "Losses". Keep this in mind when interpreting the segment counts.
adult.data$capital.segment <- cut(
  adult.data$capital.scaled,
  breaks = c(-1.0, 0.0, 1.0, 14.0),
  labels = c("Losses", "Gains", "Large Gains")
)
# Number of records in each capital segment.
ggplot(adult.data, aes(x = capital.segment)) +
  geom_bar(fill = color.main) +
  theme_clean()
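We observe that the most frequent segment is "Losses". Note, however, that this segment also includes everyone with a net capital of 0, because 0 standardizes to about -0.13, which is below the 0.0 break.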
Finally, we add a new field hours.per.week.segment with the hours worked per week discretized in intervals.
# Discretize hours.per.week into ten-hour, right-closed intervals
# ((0, 9], (9, 19], ..., (89, 100]) labelled by the hours they contain.
adult.data$hours.per.week.segment <- cut(
  adult.data$hours.per.week,
  breaks = c(0, 9, 19, 29, 39, 49, 59, 69, 79, 89, 100),
  labels = c("1-9", "10-19", "20-29",
             "30-39", "40-49", "50-59",
             "60-69", "70-79", "80-89", "90-99")
)
# Number of records in each hours-per-week segment.
ggplot(adult.data, aes(x = hours.per.week.segment)) +
  geom_bar(fill = color.main) +
  theme_clean()
We verify, as we already knew from the interquartile range (IQR), that the highest frequency occurs in the range of 40 to 49 hours per week.
We are going to analyze the relationships of the variables with the income variable.
First we will do it in relative terms.
# Share of each income class within age bins of width 5; position = "fill"
# rescales every bar to a total height of 1.
ggplot(adult.data, aes(x = age, fill = income)) +
  geom_histogram(binwidth = 5, position = "fill") +
  labs(x = "Age", y = "Frequency") +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean()
The percentage of income over 50K increases with age up to 50 - 55 years. From that moment it begins to decrease. From the retirement age, the proportion remains stable.
Let’s see the frequency distribution.
# Stacked record counts per income class within age bins of width 3.
ggplot(adult.data, aes(x = age, fill = income)) +
  geom_histogram(binwidth = 3) +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean()
We show the relationship between the working class and occupation. First in absolute terms.
# Stacked record counts per workclass, filled by occupation. The x-axis
# labels are rotated 90 degrees.
ggplot(adult.data, aes(x = workclass, fill = occupation)) +
  geom_bar() +
  scale_fill_manual(
    values = big.palette(length(unique(adult.data$occupation)))
  ) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
Most of the dataset corresponds to the Private sector.
Let’s see the frequency distribution.
# Share of each occupation within every workclass (bars rescaled to 1).
ggplot(adult.data, aes(x = workclass, fill = occupation)) +
  geom_bar(position = "fill") +
  ylab("Frequency") +
  scale_fill_manual(
    values = big.palette(length(unique(adult.data$occupation)))
  ) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
The graph shows us the distribution of occupations by type of work.
We generate the graphs to see the relationship of these two variables with income.
Firstly by workclass:
# Income by workclass: stacked record counts.
ggplot(adult.data, aes(x = workclass, fill = income)) +
  geom_bar() +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
# Income by workclass: share of each income class (bars rescaled to 1).
ggplot(adult.data, aes(x = workclass, fill = income)) +
  geom_bar(position = "fill") +
  labs(y = "Frequency") +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
In the first graph, it appears that private sector workers are the most likely to have an income of over 50K. But, in the second graph, we see that, it is the one with the lowest probability of exceeding 50K.
Now, we see it by occupation:
# Income by occupation: stacked record counts.
ggplot(adult.data, aes(x = occupation, fill = income)) +
  geom_bar() +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
# Income by occupation: share of each income class (bars rescaled to 1).
ggplot(adult.data, aes(x = occupation, fill = income)) +
  geom_bar(position = "fill") +
  labs(y = "Frequency") +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
We will analyze the relationship between education and income with the new discretized variable that we have created.
# Income by education segment: stacked record counts.
ggplot(adult.data, aes(x = education.segment, fill = income)) +
  geom_bar() +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean()
# Income by education segment: share of each income class (bars rescaled
# to 1).
ggplot(adult.data, aes(x = education.segment, fill = income)) +
  geom_bar(position = "fill") +
  labs(y = "Frequency") +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean()
As expected, income increases with education.
Let’s see, first, the relationship between these variables.
# Stacked record counts per marital status, filled by relationship.
ggplot(adult.data, aes(x = marital.status, fill = relationship)) +
  geom_bar() +
  scale_fill_manual(
    values = big.palette(length(unique(adult.data$relationship)))
  ) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
Let’s see the frequency distribution.
# Share of each relationship within every marital status (bars rescaled
# to 1).
ggplot(adult.data, aes(x = marital.status, fill = relationship)) +
  geom_bar(position = "fill") +
  ylab("Frequency") +
  scale_fill_manual(
    values = big.palette(length(unique(adult.data$relationship)))
  ) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
We generate the graphs to see the relationship of these two variables with the income.
Firstly by marital status:
# Income by marital status: stacked record counts.
ggplot(adult.data, aes(x = marital.status, fill = income)) +
  geom_bar() +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
# Income by marital status: share of each income class (bars rescaled
# to 1).
ggplot(adult.data, aes(x = marital.status, fill = income)) +
  geom_bar(position = "fill") +
  labs(y = "Frequency") +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
It seems that there is a higher percentage of people with incomes above 50K among married people.
By relationship:
# Income by relationship: stacked record counts.
ggplot(adult.data, aes(x = relationship, fill = income)) +
  geom_bar() +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
# Income by relationship: share of each income class (bars rescaled to 1).
ggplot(adult.data, aes(x = relationship, fill = income)) +
  geom_bar(position = "fill") +
  labs(y = "Frequency") +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
Considering that we have seen that the probability of having an income greater than 50K was greater among married people, it is not surprising that here we see that it is greater in the relationships Husband and Wife.
# Share of each income class per relationship, with one panel per marital
# status.
ggplot(adult.data, aes(x = relationship, fill = income)) +
  geom_bar(position = "fill") +
  facet_wrap(~marital.status) +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
We show the graph of income by race.
# Income by race: stacked record counts.
ggplot(adult.data, aes(x = race, fill = income)) +
  geom_bar() +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
As it is not very clear, we show the frequency distribution:
# Income by race: share of each income class (bars rescaled to 1).
ggplot(adult.data, aes(x = race, fill = income)) +
  geom_bar(position = "fill") +
  labs(y = "Frequency") +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
The highest percentages of people with incomes over 50K are among Asian-Pac-Islander people, followed by white people.
We see the distribution by gender.
# Income by sex: stacked record counts.
ggplot(adult.data, aes(x = sex, fill = income)) +
  geom_bar() +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean()
At first glance, it can be seen that the proportion of women with income above 50K is lower than the proportion of men with income above 50K. Still, we show the frequency distribution.
# Income by sex: share of each income class (bars rescaled to 1).
ggplot(adult.data, aes(x = sex, fill = income)) +
  geom_bar(position = "fill") +
  labs(y = "Frequency") +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean()
It is found that the percentage of men with incomes above 50K is more than double that of women.
We analyze the variable created capital.
# Income by capital segment: share of each income class (bars rescaled
# to 1).
ggplot(adult.data, aes(x = capital.segment, fill = income)) +
  geom_bar(position = "fill") +
  labs(y = "Frequency") +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean()
It seems that people with higher incomes also have higher capital increases. We have discovered the wheel!
We show the graph of the income by hours per week
# Income by hours-per-week segment: stacked record counts.
ggplot(adult.data, aes(x = hours.per.week.segment, fill = income)) +
  geom_bar() +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean()
# Income by hours-per-week segment: share of each income class (bars
# rescaled to 1).
ggplot(adult.data, aes(x = hours.per.week.segment, fill = income)) +
  geom_bar(position = "fill") +
  labs(y = "Frequency") +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean()
After working 50 hours a week, the percentage of people with incomes greater than $ 50,000 almost doubles.
We show income by country of origin.
# Income by native country: stacked record counts.
ggplot(adult.data, aes(x = native.country, fill = income)) +
  geom_bar() +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
Obviously, much of the dataset is native to the United States.
# Income by native country: share of each income class (bars rescaled
# to 1).
ggplot(adult.data, aes(x = native.country, fill = income)) +
  geom_bar(position = "fill") +
  labs(y = "Frequency") +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))
There are important differences in income depending on the country of origin.
The fnlwgt parameter represents the amount of population that each entry in the data set represents.
We show its distribution.
# Histogram of fnlwgt in bins of width 100000.
ggplot(adult.data, aes(x = fnlwgt)) +
  geom_histogram(binwidth = 100000, fill = color.main) +
  theme_clean()
We show the relationship of the parameter with income.
# Share of each income class within fnlwgt bins of width 100000 (bars
# rescaled to 1).
ggplot(adult.data, aes(x = fnlwgt, fill = income)) +
  geom_histogram(binwidth = 100000, position = "fill") +
  ylab("Frequency") +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean()