R Programming Assignment Help | Exploratory Data Analysis (EDA) With R Programming

Final Output In Fancy Format

Install and load all of the required libraries listed below:

library(dplyr)
library(plotly)
library(purrr) 
library(cluster)
library(NbClust)
library(factoextra)
library(IRdisplay)
library(plyr)
library(tidyverse)
library(ggpubr)
library(GGally)
library(factoextra)
library(RColorBrewer)
library(ggplotify)
library(hrbrthemes)
library(dendextend)
library(plyr)

Read Data

# Load the customer dataset from a CSV file in the current working directory.
data <- read.csv("Mall_Customers.csv")
# Auto-print the whole data frame.
data

Result

...

Check the data type of each column

# Print a compact summary of the data frame: dimensions, column names, column
# types and the first few values of each column.
str(data)

Output:

## 'data.frame':    200 obs. of  5 variables:
##  $ CustomerID            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender                : chr  "Male" "Male" "Female" "Female" ...
##  $ Age                   : int  19 21 20 23 31 22 35 23 64 30 ...
##  $ Annual.Income..k..    : int  15 15 16 16 17 17 18 18 19 19 ...
##  $ Spending.Score..1.100.: int  39 81 6 77 40 76 6 94 3 72 ...

# Shared ggplot2 theme for the graphs: bold 14pt plot titles and 16pt axis
# tick labels and axis titles.
My_Theme <- theme(
  plot.title = element_text(size = 14, face = "bold"),
  axis.text = element_text(size = 16),
  axis.title = element_text(size = 16)
)

# Gender distribution: number of customers in each gender category
table(data[["Gender"]])

Output:

## 
## Female   Male 
##    112     88

Bar Plot For Male And Female Count

# Bar chart of the number of customers per gender.
# geom_bar() counts the rows in each category by default, so no explicit
# y aesthetic is needed. geom_histogram() is meant for binning continuous
# variables, and the `..count..` notation was deprecated in ggplot2 3.4.0.
ggplot(data, aes(x = Gender)) +
  geom_bar(fill = "#9C9CEE", alpha = 0.8) +
  scale_y_continuous(breaks = seq(0, 120, 20)) +
  labs(x = "Gender", y = "Count") +
  My_Theme

Output:

Histogram For Distribution of Age

# Kernel density estimate of customer age, drawn over the histogram below
density1 <- density(data$Age)

# Histogram of age on the primary y axis, with the density curve on a
# secondary y axis ("y2") placed on the right-hand side
p_age <- plot_ly(data, x = ~Age) %>%
  add_histogram(color = I("mediumpurple"), name = "Histogram") %>%
  add_lines(
    x = density1$x,
    y = density1$y,
    fill = "tozeroy",
    color = I("lavender"),
    yaxis = "y2",
    name = "Density"
  ) %>%
  layout(
    title = "Distribution of Age ",
    xaxis = list(title = "Age"),
    yaxis2 = list(overlaying = "y", side = "right"),
    showlegend = FALSE
  )
p_age

Output:

Histogram For Distribution of Income

# Kernel density estimate of annual income, drawn over the histogram below
density2 <- density(data$Annual.Income..k..)

# Histogram of income on the primary y axis, with the density curve on a
# secondary y axis ("y2") placed on the right-hand side
p_income <- plot_ly(data, x = ~Annual.Income..k..) %>%
  add_histogram(color = I("mediumpurple"), name = "Histogram") %>%
  add_lines(
    x = density2$x,
    y = density2$y,
    fill = "tozeroy",
    color = I("lavender"),
    yaxis = "y2",
    name = "Density"
  ) %>%
  layout(
    title = "Distribution of Income ",
    xaxis = list(title = "Annual Income (k$)"),
    yaxis2 = list(overlaying = "y", side = "right"),
    showlegend = FALSE
  )
p_income

Output:

Histogram For Distribution of Spending Score

# Kernel density estimate of spending score, drawn over the histogram below
density3 <- density(data$Spending.Score..1.100.)

# Histogram of spending score on the primary y axis, with the density curve on
# a secondary y axis ("y2") placed on the right-hand side
p_score <- plot_ly(data, x = ~Spending.Score..1.100.) %>%
  add_histogram(color = I("mediumpurple"), name = "Histogram") %>%
  add_lines(
    x = density3$x,
    y = density3$y,
    fill = "tozeroy",
    color = I("lavender"),
    yaxis = "y2",
    name = "Density"
  ) %>%
  layout(
    title = "Distribution of Spending score ",
    xaxis = list(title = "Spending Score"),
    yaxis2 = list(overlaying = "y", side = "right"),
    showlegend = FALSE
  )
p_score

Output:

Multiplot

# Draw a histogram, a boxplot and a density plot of one variable split by a
# grouping variable, and arrange the three panels in a single figure.
#
# Arguments:
#   data_   data frame handed to ggplot() as the plot data
#   x_      variable mapped to the x axis of every panel
#   group_  grouping variable mapped to fill (and to y in the boxplot)
#
# Returns the figure built by ggarrange(): the histogram (panel "A") on the
# left, with the boxplot ("B") stacked above the density plot ("C") on the
# right.
multi_plot <- function(data_, x_, group_) {
  # Text of the expressions the caller supplied; used for the title and the
  # x-axis labels.
  x_label <- deparse(substitute(x_))
  group_label <- deparse(substitute(group_))

  # Histogram --------------------
  hist_plot <- ggplot(data = data_, aes(x = x_, fill = group_)) +
    geom_histogram(alpha = 0.5, show.legend = FALSE, binwidth = 8) +
    labs(
      title = paste(x_label, group_label, sep = " - "),
      subtitle = "Histogram",
      x = x_label
    )

  # Boxplot ----------------------
  box_plot <- ggplot(
    data = data_,
    aes(x = x_, y = group_, fill = group_)
  ) +
    geom_boxplot(alpha = 0.5, show.legend = FALSE) +
    labs(title = "", subtitle = "Boxplot", x = x_label, y = "")

  # Density Plot -----------------
  den_plot <- ggplot(data = data_, aes(x = x_, fill = group_)) +
    geom_density(alpha = 0.5, show.legend = FALSE) +
    labs(subtitle = "Density Plot", x = x_label)

  # Right-hand column: boxplot on top, density plot underneath
  right_column <- ggarrange(
    box_plot,
    den_plot,
    nrow = 2,
    labels = c("B", "C")
  )

  ggarrange(hist_plot, right_column, ncol = 2, labels = "A")
}

# Age summary per gender: minimum, mean rounded to the nearest integer,
# median and maximum
ddply(
  .data = data,
  .variables = "Gender",
  .fun = summarise,
  min = min(Age),
  mean = round(mean(Age)),
  median = median(Age),
  max = max(Age)
)

Output:

##   Gender min mean median max
## 1 Female  18   38     35  68
## 2   Male  18   40     37  70

# multi_plot() labels its panels with the text of the arguments it is called
# with, so bind the two columns to short names and pass those: the title then
# reads "Age - Gender" instead of "data$Age - data$Gender".
Gender <- data$Gender
Age <- data$Age
multi_plot(data, Age, Gender)

Output:

Scatter Plot

# Spending score against annual income, coloured by gender (legend hidden)
scatter_ans <- ggplot(
  data,
  aes(
    x = Annual.Income..k..,
    y = Spending.Score..1.100.,
    colour = Gender
  )
) +
  geom_point(size = 2, alpha = 0.6, show.legend = FALSE) +
  labs(title = "Scatterplots", subtitle = "Age - Income - Score")

# Annual income against age, coloured by gender
scatter_aa <- ggplot(
  data,
  aes(x = Age, y = Annual.Income..k.., colour = Gender)
) +
  geom_point(size = 2, alpha = 0.6)

# Spending score against age, coloured by gender
scatter_ags <- ggplot(
  data,
  aes(x = Age, y = Spending.Score..1.100., colour = Gender)
) +
  geom_point(size = 2, alpha = 0.6)

# Only the age vs. income plot is displayed here
scatter_aa

Output:

# Fix the random seed so the clustering result is reproducible
set.seed(50)

# Build a k-means model with k = 6 on the three numeric feature columns
# (columns 3 to 5 of the data frame)
model_customers <- kmeans(
  x = data[, c("Age", "Annual.Income..k..", "Spending.Score..1.100.")],
  centers = 6
)
model_customers

Output:

## K-means clustering with 6 clusters of sizes 39, 21, 22, 45, 35, 38
## 
## Cluster means:
##        Age Annual.Income..k.. Spending.Score..1.100.
## 1 32.69231           86.53846               82.12821
## 2 44.14286           25.14286               19.52381
## 3 25.27273           25.72727               79.36364
## 4 56.15556           53.37778               49.08889
## 5 41.68571           88.22857               17.28571
## 6 27.00000           56.65789               49.13158
## 
## Clustering vector:
##   [1] 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2
##  [38] 3 2 3 4 3 4 6 2 3 4 6 6 6 4 6 6 4 4 4 4 4 6 4 4 6 4 4 4 6 4 4 6 6 4 4 4 4
##  [75] 4 6 4 6 6 4 4 6 4 4 6 4 4 6 6 4 4 6 4 6 6 6 4 6 4 6 6 4 4 6 4 6 4 4 4 4 4
## [112] 6 6 6 6 6 4 4 4 4 6 6 6 1 6 1 5 1 5 1 5 1 6 1 5 1 5 1 5 1 5 1 6 1 5 1 5 1
## [149] 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5
## [186] 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1
## 
## Within cluster sum of squares by cluster:
## [1] 13972.359  7732.381  4099.818  8062.133 16690.857  7742.895
##  (between_SS / total_SS =  81.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Cluster label assigned to each customer by the k-means model
clusters <- model_customers$cluster

# 3D scatter plot of income, spending score and age, with one colour per
# cluster
p_clusters <- plot_ly(
  data,
  x = ~Annual.Income..k..,
  y = ~Spending.Score..1.100.,
  z = ~Age
) %>%
  add_markers(color = factor(clusters)) %>%
  layout(
    scene = list(
      xaxis = list(title = "Annual Income (k$)"),
      yaxis = list(title = "Spending Score"),
      zaxis = list(title = "Age")
    )
  )
p_clusters

Output: