# install and load some libraries
# https://cran.r-project.org/web/packages/ggplot2/index.html
if (!require('ggplot2')) install.packages('ggplot2'); library('ggplot2')
# https://cran.r-project.org/web/packages/dplyr/index.html
if (!require('dplyr')) install.packages('dplyr'); library('dplyr')
# https://CRAN.R-project.org/package=ggthemes 
if (!require('ggthemes')) install.packages('ggthemes'); library('ggthemes')

Preprocessing data


Loading the dataset

# adult.data has no header row, so the column names are supplied explicitly
column.names <- c(
  'age', 'workclass', 'fnlwgt', 'education', 'education.num',
  'marital.status', 'occupation', 'relationship', 'race', 'sex',
  'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
  'income'
)

# strip.white = TRUE removes leading and trailing white space from unquoted
# character fields; text columns are kept as character (not factor)
adult.data <- read.csv(
  file = '../data/adult.data',
  header = FALSE,
  col.names = column.names,
  strip.white = TRUE,
  stringsAsFactors = FALSE
)

#rows <- dim(adult.data)[1]

We verify the structure of the dataset

str(adult.data)
## 'data.frame':    32561 obs. of  15 variables:
##  $ age           : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ workclass     : chr  "State-gov" "Self-emp-not-inc" "Private" "Private" ...
##  $ fnlwgt        : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education     : chr  "Bachelors" "Bachelors" "HS-grad" "11th" ...
##  $ education.num : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital.status: chr  "Never-married" "Married-civ-spouse" "Divorced" "Married-civ-spouse" ...
##  $ occupation    : chr  "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
##  $ relationship  : chr  "Not-in-family" "Husband" "Not-in-family" "Husband" ...
##  $ race          : chr  "White" "White" "White" "Black" ...
##  $ sex           : chr  "Male" "Male" "Male" "Male" ...
##  $ capital.gain  : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 13 40 40 40 40 16 45 50 40 ...
##  $ native.country: chr  "United-States" "United-States" "United-States" "United-States" ...
##  $ income        : chr  "<=50K" "<=50K" "<=50K" "<=50K" ...

The dataset has 32561 rows and 15 variables.

We check that the description of the variables and the loaded variable types correspond to those of the file:

  • age: continuous.
  • workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
  • fnlwgt: continuous.
  • education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
  • education-num: continuous.
  • marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
  • occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
  • relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
  • race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
  • sex: Female, Male.
  • capital-gain: continuous.
  • capital-loss: continuous.
  • hours-per-week: continuous.
  • native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
  • income: >50K, <=50K.

We get the basic statistics:

summary(adult.data)
##       age         workclass             fnlwgt         education        
##  Min.   :17.00   Length:32561       Min.   :  12285   Length:32561      
##  1st Qu.:28.00   Class :character   1st Qu.: 117827   Class :character  
##  Median :37.00   Mode  :character   Median : 178356   Mode  :character  
##  Mean   :38.58                      Mean   : 189778                     
##  3rd Qu.:48.00                      3rd Qu.: 237051                     
##  Max.   :90.00                      Max.   :1484705                     
##  education.num   marital.status      occupation        relationship      
##  Min.   : 1.00   Length:32561       Length:32561       Length:32561      
##  1st Qu.: 9.00   Class :character   Class :character   Class :character  
##  Median :10.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :10.08                                                           
##  3rd Qu.:12.00                                                           
##  Max.   :16.00                                                           
##      race               sex             capital.gain    capital.loss   
##  Length:32561       Length:32561       Min.   :    0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:    0   1st Qu.:   0.0  
##  Mode  :character   Mode  :character   Median :    0   Median :   0.0  
##                                        Mean   : 1078   Mean   :  87.3  
##                                        3rd Qu.:    0   3rd Qu.:   0.0  
##                                        Max.   :99999   Max.   :4356.0  
##  hours.per.week  native.country        income         
##  Min.   : 1.00   Length:32561       Length:32561      
##  1st Qu.:40.00   Class :character   Class :character  
##  Median :40.00   Mode  :character   Mode  :character  
##  Mean   :40.44                                        
##  3rd Qu.:45.00                                        
##  Max.   :99.00

We look for variables with null data:

colSums(is.na(adult.data))
##            age      workclass         fnlwgt      education  education.num 
##              0              0              0              0              0 
## marital.status     occupation   relationship           race            sex 
##              0              0              0              0              0 
##   capital.gain   capital.loss hours.per.week native.country         income 
##              0              0              0              0              0
colSums(adult.data=="")
##            age      workclass         fnlwgt      education  education.num 
##              0              0              0              0              0 
## marital.status     occupation   relationship           race            sex 
##              0              0              0              0              0 
##   capital.gain   capital.loss hours.per.week native.country         income 
##              0              0              0              0              0

At first glance, it is observed that there are no null values. We also check that the values of the text fields are valid values.

unique(adult.data$workclass)
## [1] "State-gov"        "Self-emp-not-inc" "Private"          "Federal-gov"     
## [5] "Local-gov"        "?"                "Self-emp-inc"     "Without-pay"     
## [9] "Never-worked"
unique(adult.data$education)
##  [1] "Bachelors"    "HS-grad"      "11th"         "Masters"      "9th"         
##  [6] "Some-college" "Assoc-acdm"   "Assoc-voc"    "7th-8th"      "Doctorate"   
## [11] "Prof-school"  "5th-6th"      "10th"         "1st-4th"      "Preschool"   
## [16] "12th"
unique(adult.data$marital.status)
## [1] "Never-married"         "Married-civ-spouse"    "Divorced"             
## [4] "Married-spouse-absent" "Separated"             "Married-AF-spouse"    
## [7] "Widowed"
unique(adult.data$occupation)
##  [1] "Adm-clerical"      "Exec-managerial"   "Handlers-cleaners"
##  [4] "Prof-specialty"    "Other-service"     "Sales"            
##  [7] "Craft-repair"      "Transport-moving"  "Farming-fishing"  
## [10] "Machine-op-inspct" "Tech-support"      "?"                
## [13] "Protective-serv"   "Armed-Forces"      "Priv-house-serv"
unique(adult.data$relationship)
## [1] "Not-in-family"  "Husband"        "Wife"           "Own-child"     
## [5] "Unmarried"      "Other-relative"
unique(adult.data$race)
## [1] "White"              "Black"              "Asian-Pac-Islander"
## [4] "Amer-Indian-Eskimo" "Other"
unique(adult.data$sex)
## [1] "Male"   "Female"
unique(adult.data$native.country)
##  [1] "United-States"              "Cuba"                      
##  [3] "Jamaica"                    "India"                     
##  [5] "?"                          "Mexico"                    
##  [7] "South"                      "Puerto-Rico"               
##  [9] "Honduras"                   "England"                   
## [11] "Canada"                     "Germany"                   
## [13] "Iran"                       "Philippines"               
## [15] "Italy"                      "Poland"                    
## [17] "Columbia"                   "Cambodia"                  
## [19] "Thailand"                   "Ecuador"                   
## [21] "Laos"                       "Taiwan"                    
## [23] "Haiti"                      "Portugal"                  
## [25] "Dominican-Republic"         "El-Salvador"               
## [27] "France"                     "Guatemala"                 
## [29] "China"                      "Japan"                     
## [31] "Yugoslavia"                 "Peru"                      
## [33] "Outlying-US(Guam-USVI-etc)" "Scotland"                  
## [35] "Trinadad&Tobago"            "Greece"                    
## [37] "Nicaragua"                  "Vietnam"                   
## [39] "Hong"                       "Ireland"                   
## [41] "Hungary"                    "Holand-Netherlands"
unique(adult.data$income)
## [1] "<=50K" ">50K"
colSums(adult.data=="?")
##            age      workclass         fnlwgt      education  education.num 
##              0           1836              0              0              0 
## marital.status     occupation   relationship           race            sex 
##              0           1843              0              0              0 
##   capital.gain   capital.loss hours.per.week native.country         income 
##              0              0              0            583              0

The unknown values have been labeled as ‘?’. We have unknown values in workclass, occupation, and native.country.

What we have for now is:

  • The minimum age of the data set is 17 years and the maximum 90. The interquartile range (IQR) is 28 to 48 years.
  • The education numeric field ranges from 1 to 16.
  • We can create a new variable capital as capital.gain minus capital.loss.
  • Weekly work hours range from 1 to 99 and the interquartile range (IQR) ranges from 40 to 45 hours per week.

We add a new field age.segment with the age discretized into intervals.

# Bin age into ten-year bands. cut() intervals are right-closed, so the first
# band (0,17] holds age 17 and the last band (87,100] holds ages above 87
adult.data$age.segment <- cut(
  adult.data$age,
  breaks = c(0, seq(17, 87, by = 10), 100),
  labels = c("< 18", "18-27", "28-37", "38-47", "48-57",
             "58-67", "68-77", "78-87", "> 87")
)

# Number of individuals in each age band
ggplot(adult.data, aes(x = age.segment)) +
  geom_bar(fill = color.main) +
  theme_clean()

As the quartiles (28 to 48 years) suggested, the segments with the largest population in the census are those covering the ages of 18 to 47.

Let’s look at the variables related to education:

# Bars are counted per education.num level and coloured by education label.
# The palette is sized from education, the variable actually mapped to fill,
# in the same way the other multi-colour plots size theirs.
ggplot(data = adult.data) +
  aes(x = education.num, fill = education) +
  geom_bar() +
  scale_fill_manual(
    values = big.palette(length(unique(adult.data$education)))
  ) +
  theme_clean()

We can see the educational system of the United States in wikipedia (https://en.wikipedia.org/wiki/Education_in_the_United_States), so we can create a new discretized variable based on the education.num field that will be grouped in the following intervals :

  • From 0 to 4
  • From 5 to 8
  • From 9 to 12
  • From 13
# Group the numeric education level into four bands. cut() intervals are
# right-closed, so the bands are (0,4], (4,8], (8,12] and (12,17]
adult.data$education.segment <- cut(
  adult.data$education.num,
  breaks = c(0, 4, 8, 12, 17),
  labels = c("0 to 4", "5 to 8", "9 to 12", ">= 13")
)

# Number of individuals in each education band
ggplot(adult.data, aes(x = education.segment)) +
  geom_bar(fill = color.main) +
  theme_clean()

In both graphs we can see how, in our data, a large part of the individuals have attended some type of higher education.

Now, we create the new variable capital.

# Net capital movement: gains minus losses
adult.data$capital <- with(adult.data, capital.gain - capital.loss)

# Histogram of net capital in bins of width 5000
ggplot(adult.data, aes(x = capital)) +
  geom_histogram(binwidth = 5000, fill = color.main) +
  theme_clean()

summary(adult.data$capital)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -4356.0     0.0     0.0   990.4     0.0 99999.0

We standardize the created variable.

# Standardize capital (subtract the mean, divide by the standard deviation);
# centering and scaling are scale()'s defaults. scale() returns a one-column
# matrix, which is why summary() reports the column as "V1"
adult.data$capital.scaled <- scale(adult.data$capital)
summary(adult.data$capital.scaled)
##        V1         
##  Min.   :-0.7216  
##  1st Qu.:-0.1337  
##  Median :-0.1337  
##  Mean   : 0.0000  
##  3rd Qu.:-0.1337  
##  Max.   :13.3633

And we discretize it.

# Classify by the sign of the raw capital first: capital.scaled is centered on
# the mean, so a capital of exactly 0 has a negative scaled value and must not
# be counted as a loss. "Large Gains" keeps the original cut-off of more than
# one standard deviation above the mean (capital.scaled > 1).
adult.data$capital.segment <- factor(
  case_when(
    adult.data$capital < 0 ~ "Losses",
    adult.data$capital == 0 ~ "No change",
    as.vector(adult.data$capital.scaled) <= 1.0 ~ "Gains",
    TRUE ~ "Large Gains"
  ),
  levels = c("Losses", "No change", "Gains", "Large Gains")
)

# Number of individuals in each capital segment
ggplot(data = adult.data) +
  aes(x = capital.segment) +
  geom_bar(fill = color.main) +
  theme_clean()

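We observe that the most populated segment is the one that contains a capital of zero: the median and both quartiles of capital are 0, so the most frequent case is to have neither capital gains nor capital losses.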

Finally, we add a new field hours.per.week.segment with the hours worked per weeks in intervals.

# Bin weekly hours into ten-hour bands. cut() intervals are right-closed, so
# the bands are (0,9], (9,19], ..., (79,89] and (89,100]
adult.data$hours.per.week.segment <- cut(
  adult.data$hours.per.week,
  breaks = c(0, seq(9, 89, by = 10), 100),
  labels = c("1-9", "10-19", "20-29", "30-39", "40-49",
             "50-59", "60-69", "70-79", "80-89", "90-99")
)

# Number of individuals in each band of weekly hours
ggplot(adult.data, aes(x = hours.per.week.segment)) +
  geom_bar(fill = color.main) +
  theme_clean()

We verify, as we already knew from the interquartile range (IQR), that the highest frequency occurs in the range of 40 to 49 hours per week.


Exploring data


We are going to analyze the relationships of the variables with the income variable.

Age

First we will do it in relative terms.

ggplot(data = adult.data) +
  aes(x = age, fill = income) +
  geom_histogram(binwidth=5, position="fill") +
  labs(x="Age", y="Frequency") +
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean() 

The percentage of income over 50K increases with age up to 50 - 55 years. From that moment it begins to decrease. From the retirement age, the proportion remains stable.

Let’s see the frequency distribution.

ggplot(data = adult.data) +
  aes(x=age,fill=income) +
  geom_histogram(binwidth =3) + 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean()

Type of job

We show the relationship between workclass (the type of employer) and occupation. First in absolute terms.

ggplot(data = adult.data) +
  aes(x=workclass, fill=occupation) +
  geom_bar() +  
  scale_fill_manual(values = big.palette(length(unique(adult.data$occupation))))+ 
  theme_clean() + 
  theme(axis.text.x = element_text(angle = 90))

Most of the dataset corresponds to the Private sector.

Let’s see the relative frequency distribution.

ggplot(data = adult.data) +
  aes(x = workclass, fill = occupation) +
  geom_bar(position="fill") +
  labs(y="Frequency") +
  scale_fill_manual(values = big.palette(length(unique(adult.data$occupation))))+ 
  theme_clean() + 
  theme(axis.text.x = element_text(angle = 90))

The graph shows us the distribution of occupations by type of work.

We generate the graphs to see the relationship of these two variables with income.

Firstly by workclass:

ggplot(data = adult.data) +
  aes(x=workclass,fill=income) + 
  geom_bar()+ 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))

ggplot(data = adult.data) +
  aes(x=workclass,fill=income) + 
  geom_bar(position="fill") + 
  ylab("Frequency") +
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))

In the first graph, it appears that private sector workers are the most likely to have an income of over 50K. But, in the second graph, we see that, it is the one with the lowest probability of exceeding 50K.

Now, we see it by occupation:

ggplot(data = adult.data) +
  aes(x=occupation,fill=income) + 
  geom_bar()+ 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))

ggplot(data = adult.data) +
  aes(x=occupation,fill=income) + 
  geom_bar(position="fill") + 
  ylab("Frequency") + 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))

Education

We will analyze the relationship between education and income with the new discretized variable that we have created.

ggplot(data = adult.data) +
  aes(x=education.segment ,fill=income) + 
  geom_bar()+ 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean()

ggplot(data = adult.data) +
  aes(x=education.segment,fill=income) + 
  geom_bar(position="fill") + 
  ylab("Frequency") + 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean() 

As expected, income increases with education.

Marital status and relationship

Let’s see, first, the relationship between these variables.

ggplot(data = adult.data) +
  aes(x=marital.status, fill=relationship) +
  geom_bar() +
  scale_fill_manual(values = big.palette(length(unique(adult.data$relationship))))+ 
  theme_clean() + 
  theme(axis.text.x = element_text(angle = 90))

Let’s see the relative frequency distribution.

ggplot(data = adult.data) +
  aes(x = marital.status, fill = relationship) +
  geom_bar(position="fill") +
  labs(y="Frequency") +
  scale_fill_manual(values = big.palette(length(unique(adult.data$relationship))))+ 
  theme_clean() + 
  theme(axis.text.x = element_text(angle = 90))

We generate the graphs to see the relationship of these two variables with the income.

Firstly by marital status:

ggplot(data = adult.data) +
  aes(x=marital.status,fill=income) + 
  geom_bar()+ 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))

ggplot(data = adult.data) +
  aes(x=marital.status,fill=income) + 
  geom_bar(position="fill") + 
  ylab("Frequency") + 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))

It seems that there is a higher percentage of people with incomes above 50K among married people.

By relationship:

ggplot(data = adult.data) +
  aes(x=relationship,fill=income) + 
  geom_bar()+ 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))

ggplot(data = adult.data) +
  aes(x=relationship,fill=income) + 
  geom_bar(position="fill") + 
  ylab("Frequency") + 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))

Considering that we have seen that the probability of having an income greater than 50K was greater among married people, it is not surprising that here we see that it is greater in the relationships Husband and Wife.

# Share of each income class per relationship, one panel per marital status.
# position = "fill" rescales every bar to 1, so the y axis is labelled like the
# other proportion plots instead of being left with the default "count" title.
ggplot(data = adult.data) +
  aes(x = relationship, fill = income) +
  geom_bar(position = "fill") +
  facet_wrap(~marital.status) +
  ylab("Frequency") +
  scale_fill_manual(values = c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))

Race

We show the graph of income by race.

ggplot(data = adult.data) +
  aes(x=race,fill=income) + 
  geom_bar()+ 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))

As it is not very clear, we show the relative frequency distribution:

ggplot(data = adult.data) +
  aes(x=race,fill=income) + 
  geom_bar(position="fill") + 
  ylab("Frequency") + 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))

The highest percentages of people with incomes over 50K are among Asian-Pac-Islander people, followed by white people.

Gender

We see the distribution by gender.

ggplot(data = adult.data) +
  aes(x=sex,fill=income) + 
  geom_bar() + 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean()

At first glance, it can be seen that the proportion of women with income above 50K is lower than the proportion of men with income above 50K. Still, we show the frequency distribution.

ggplot(data = adult.data) +
  aes(x=sex,fill=income) + 
  geom_bar(position="fill") + 
  ylab("Frequency") + 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean()

It is found that the percentage of men with incomes above 50K is more than double that of women.

Capital gains and losses

We analyze the variable created capital.

ggplot(data = adult.data) +
  aes(x=capital.segment, fill=income) + 
  geom_bar(position="fill") + 
  ylab("Frequency") + 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean()

It seems that people with higher incomes also have higher capital increases. We have discovered the wheel!

Hours per week

We show the graph of the income by hours per week

ggplot(data = adult.data) +
  aes(x=hours.per.week.segment,fill=income) + 
  geom_bar()+ 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean()

ggplot(data = adult.data) +
  aes(x=hours.per.week.segment,fill=income) + 
  geom_bar(position="fill") + 
  ylab("Frequency") + 
  theme_clean() +
  scale_fill_manual(values=c(color.less50K, color.more50k)) 

For people working 50 or more hours a week, the percentage with incomes greater than $50,000 almost doubles.

Native country

We show income by country of origin.

ggplot(data = adult.data) +
  aes(x=native.country,fill=income) + 
  geom_bar() + 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))

Obviously, much of the dataset is native to the United States.

ggplot(data = adult.data) +
  aes(x=native.country,fill=income) + 
  geom_bar(position="fill") + 
  ylab("Frequency") + 
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean() +
  theme(axis.text.x = element_text(angle = 90))

There are important differences in income depending on the country of origin.

Final weight

The fnlwgt parameter represents the amount of population that each entry in the data set represents.

We show its distribution.

ggplot(data = adult.data) +
  aes(x=fnlwgt) +
  geom_histogram(binwidth=100000, fill=color.main) + 
  theme_clean()

We show the relationship of the parameter with income.

ggplot(data = adult.data) +
  aes(x = fnlwgt, fill = income) +
  geom_histogram(binwidth=100000, position="fill") +
  labs(y="Frequency") +
  scale_fill_manual(values=c(color.less50K, color.more50k)) +
  theme_clean()