# Load the required packages from the library
library(tidyverse)
library(dplyr)
# Load the built-in iris data set into the workspace
data(list = "iris")
# 1) Examine the structure of the iris data set. How many observations and
# variables are in the data set?
# Print a compact summary of iris: its class, dimensions and column types
utils::str(iris)
# data frame | 150 observations | 5 variables
# 2) Create a new data frame iris1 that contains only the species virginica
# and versicolor with sepal lengths longer than 6 cm and sepal widths longer
# than 2.5 cm. How many observations and variables are in the data set?
# Keep only virginica and versicolor rows whose sepal length exceeds 6 and
# whose sepal width exceeds 2.5, then inspect the resulting data frame.
iris1 <- iris %>%
  filter(
    Species %in% c("virginica", "versicolor"),
    Sepal.Length > 6,
    Sepal.Width > 2.5
  )
str(iris1)
# data frame | 56 observations | 5 variables
# 3) Create an iris2 data frame from iris1 that contains only the columns for
# Species, Sepal.Length, and Sepal.Width. How many observations and variables
# are in the data set?
# Reduce iris1 to the species label and the two sepal measurements, then
# inspect the result.
iris2 <- iris1 %>%
  select(Species, Sepal.Length, Sepal.Width)
str(iris2)
# data frame | 56 observations | 3 variables
# 4) Create an iris3 data frame from iris2 that orders the observations from
# largest to smallest sepal length. Show the first 6 rows of this data set.
# Sort iris2 from the largest to the smallest sepal length and show the first
# six rows. arrange() takes its sort keys through `...` and has no `by`
# argument, so the key is passed unnamed.
iris3 <- arrange(iris2, desc(Sepal.Length))
head(iris3, n = 6)
#     Species Sepal.Length Sepal.Width
# 1 virginica          7.9         3.8
# 2 virginica          7.7         3.8
# 3 virginica          7.7         2.6
# 4 virginica          7.7         2.8
# 5 virginica          7.7         3.0
# 6 virginica          7.6         3.0
# 5) Create an iris4 data frame from iris3 that creates a column with a sepal
# area (length * width) value for each observation. How many observations and
# variables are in the data set?
# Append a Sepal.Area column (sepal length times sepal width) to iris3, then
# inspect the result.
iris4 <- iris3 %>%
  mutate(Sepal.Area = Sepal.Length * Sepal.Width)
str(iris4)
# data frame | 56 observations | 4 variables
# 6) Create iris5 that calculates the average sepal length, the average sepal
# width, and the sample size of the entire iris4 data frame and print iris5.
# Collapse iris4 into a single summary row: mean sepal length, mean sepal
# width and the number of rows.
iris5 <- summarise(
  iris4,
  avg_sepal_length = mean(Sepal.Length),
  avg_sepal_width = mean(Sepal.Width),
  sample_size = n()
)
print(iris5)
#   avg_sepal_length avg_sepal_width sample_size
# 1         6.698214        3.041071          56
# 7) Create iris6 that calculates the average sepal length, the average sepal
# width, and the sample size for each species in the iris4 data frame and
# print iris6.
# Mean sepal length, mean sepal width and row count of iris4, computed
# separately for each species.
iris6 <- summarise(
  group_by(iris4, Species),
  avg_sepal_length = mean(Sepal.Length),
  avg_sepal_width = mean(Sepal.Width),
  sample_size = n()
)
print(iris6)
#   Species    avg_sepal_length avg_sepal_width sample_size
# 1 versicolor             6.48            2.99          17
# 2 virginica              6.79            3.06          39
# 8) Rework all of your previous statements (except for iris5) into an
# extended piping operation that uses iris as the input and generates
# irisFinal as the output.
# Single pipeline from the raw iris data to the per-species summary.
irisFinal <- iris %>%
  # keep the two species of interest with large enough sepals
  filter(
    Species %in% c("virginica", "versicolor"),
    Sepal.Length > 6,
    Sepal.Width > 2.5
  ) %>%
  # drop the petal columns
  select(Species, Sepal.Length, Sepal.Width) %>%
  # largest sepal length first
  arrange(desc(Sepal.Length)) %>%
  # add the sepal area column
  mutate(Sepal.Area = Sepal.Length * Sepal.Width) %>%
  # summarise within each species
  group_by(Species) %>%
  summarise(
    avg_sepal_length = mean(Sepal.Length),
    avg_sepal_width = mean(Sepal.Width),
    sample_size = n()
  )
print(irisFinal)
#   Species    avg_sepal_length avg_sepal_width sample_size
# 1 versicolor             6.48            2.99          17
# 2 virginica              6.79            3.06          39
# 9) Create a 'longer' data frame using the original iris data set with three
# columns named "Species", "Measure", "Value". The column "Species" will
# retain the species names of the data set. The column "Measure" will include
# whether the value corresponds to Sepal.Length, Sepal.Width, Petal.Length,
# or Petal.Width and the column "Value" will include the numerical values of
# those measurements.
# Reshape iris to long format: every column except Species (the four
# sepal/petal measurements) is stacked, with the source column name stored in
# Measure and its number in Value. Then show the first rows.
iris_long <- iris %>%
  pivot_longer(
    cols = !Species,
    names_to = "Measure",
    values_to = "Value"
  ) %>%
  select(Species, Measure, Value)
head(iris_long)
#   Species Measure      Value
# 1 setosa  Sepal.Length   5.1
# 2 setosa  Sepal.Width    3.5
# 3 setosa  Petal.Length   1.4
# 4 setosa  Petal.Width    0.2
# 5 setosa  Sepal.Length   4.9
# 6 setosa  Sepal.Width    3