R - Basic R stat on iris

R - Basic R stat on iris - 2020/01/08

jchung56 2020. 1. 8. 18:49

title: "stat1"
output: html_document

# Global knitr chunk option: echo = TRUE (chunk source is echoed in the rendered document).
knitr::opts_chunk$set(echo = TRUE)

##Basic R coding for Statistics --------------------------------------------------------------------

Using the iris dataset that ships with R (datasets package)

Learning R on iris dataset

Author: Juhee Chung

The iris dataset, introduced by Ronald Fisher in his 1936 paper "The use of multiple measurements in taxonomic problems", covers three iris species (setosa, virginica, versicolor) and records four features for each sample. These features quantify the morphological variation of the iris flower across the three species; all measurements are given in centimeters.

# Attach the datasets package, load iris, and take a first look at it:
# per-column summary statistics, the first rows, and the column types.
library("datasets")
data(iris)
summary(object = iris)
head(x = iris)
str(object = iris)

# Columns: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species

# The dplyr package
# Install dplyr only when it is missing. An unconditional install.packages()
# re-downloads the package on every run and can abort a non-interactive run
# when no CRAN mirror is configured.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

# Lower-case the column names (e.g. Sepal.Length -> sepal.length).
names(iris) <- tolower(names(iris))

# library() stops with an error if dplyr cannot be attached, so no extra
# require() call is needed.
library(dplyr)

# apply(): run a function over the rows or columns of a matrix/array
# Usage: apply(X, MARGIN, FUN, ...) with MARGIN = 1 (rows) or MARGIN = 2 (columns)

# iris3: 3-dimensional array structure

d <- matrix(seq_len(9), ncol = 3)
d

apply(d, MARGIN = 1, FUN = sum) # row sums: 1+4+7, 2+5+8, 3+6+9
apply(d, MARGIN = 2, FUN = sum) # column sums: 1+2+3, 4+5+6, 7+8+9

# Column totals of the four measurement columns of iris
apply(iris[, 1:4], MARGIN = 2, FUN = sum)
# sepal.length  sepal.width petal.length  petal.width
#        876.5        458.6        563.7        179.9

# colSums() returns the same totals directly
colSums(iris[, 1:4])
# sepal.length  sepal.width petal.length  petal.width
#        876.5        458.6        563.7        179.9

# lapply(X, FUN, ...): apply FUN to every element of X and always return a list

result <- lapply(seq_len(3), function(v) v * 2)
result
result[[1]]    # first element of the list
unlist(result) # flatten the list into a vector

x <- list(a = 1:3, b = 4:6)
x
lapply(x, mean)

# A data frame is a list of columns, so this gives the mean of each column
lapply(iris[, 1:4], mean)
# $sepal.length
# [1] 5.843333
#
# $sepal.width
# [1] 3.057333
#
# $petal.length
# [1] 3.758
#
# $petal.width
# [1] 1.199333

# colMeans() returns the same means as a named vector
colMeans(iris[, 1:4])
# sepal.length  sepal.width petal.length  petal.width
#     5.843333     3.057333     3.758000     1.199333

# Turning the list returned by lapply() into a one-row data frame:
# 1. unlist()        - list to vector
# 2. matrix()        - vector to matrix
# 3. as.data.frame() - matrix to data frame
# 4. names()         - copy the column names of iris onto the new data frame
d <- as.data.frame(
  matrix(
    unlist(lapply(iris[, 1:4], mean)),
    ncol = 4,
    byrow = TRUE
  )
)
d
names(d) <- names(iris)[1:4]
d

# Shorter route: cbind the per-column means, then wrap them in a data frame
data.frame(do.call(cbind, lapply(iris[, 1:4], mean)))

# When the list elements mix data types, unlist() flattens everything into a
# single vector; use do.call(rbind, ...) to stack the data frames instead.
x <- list(
  data.frame(name = "juhee", value = 1),
  data.frame(name = "chan", value = 2)
)
unlist(x)
do.call(rbind, x) # do.call(): response time is a little slow

# sapply(): like lapply(), but the result is simplified to a vector or matrix

sapply(iris[, 1:4], mean)        # returns a vector
lapply(iris[, 1:4], mean)        # returns a list
class(sapply(iris[, 1:4], mean)) # [1] "numeric"
class(lapply(iris[, 1:4], mean)) # [1] "list"

# Named vector of column means, shown as one column and then as one row
x <- sapply(iris[, 1:4], mean)
as.data.frame(x)
as.data.frame(t(x)) # transpose of x

# Class of every column
sapply(iris, class)

# Element-wise comparison: one logical column per input column
y <- sapply(iris[, 1:4], function(col) col > 3)
class(y) # "matrix" (R >= 4.0 prints "matrix" "array")
head(y, n = 10)

# tapply(): apply a function to each group of a vector
# Usage: tapply(vector, index, function)

# Every element falls into the same group -> a single total
tapply(1:10, rep(1, times = 10), sum)

# Two groups: odd numbers (TRUE) and even numbers (FALSE)
tapply(1:10, (1:10) %% 2 == 1, sum)
# FALSE  TRUE
#    30    25

# Mean sepal length per species
tapply(iris$sepal.length, iris$species, mean)
#    setosa versicolor  virginica
#      5.006      5.936      6.588

m <- matrix(
  1:8,
  ncol = 2,
  dimnames = list(
    c("Spring", "Summer", "Fall", "Winter"),
    c("Male", "Female")
  )
)
m

# Two grouping vectors, one entry per matrix cell in column-major order:
# the first pairs the rows (1,1,2,2 per column), the second marks the column.
tapply(
  m,
  list(rep(c(1, 1, 2, 2), times = 2), rep(c(1, 2), each = 4)),
  sum
)

# mapply(): call a function with the i-th element of each argument vector

# Random number generators used as examples:
# rnorm(n, mean = 0, sd = 1) # normal distribution
# runif(n, min = 0, max = 1) # uniform distribution
# rpois(n, lambda)           # Poisson distribution
# rexp(n, rate = 1)          # exponential distribution

rnorm(n = 10, mean = 0, sd = 1)

# Equivalent to rnorm(1, 0, 1), rnorm(2, 10, 1), rnorm(3, 100, 1)
mapply(rnorm, n = c(1, 2, 3), mean = c(0, 10, 100), sd = c(1, 1, 1))

# With a single argument list, mapply() behaves like sapply(): column means
mapply(mean, iris[, 1:4])

# doBy: useful tool for managing data (grouping by a specific value)
# NOTE(review): doBy is attached here, but the verbs used below (filter,
# select, mutate, arrange) are presumably the dplyr ones -- confirm doBy is needed.
library(doBy)

# filter(): keep only the rows of species virginica
virginica <- iris %>% filter(species == "virginica")

head(virginica, n = 10)

# Two conditions passed to filter(): virginica rows with sepal length above 6
sepallen6 <- iris %>% filter(species == "virginica", sepal.length > 6)
tail(sepallen6)
head(sepallen6)


# select(): pick columns, either by listing them or as a range
selected <- iris %>% select(sepal.length, sepal.width, petal.length)
selected2 <- iris %>% select(sepal.length:petal.length)

head(selected, n = 3)
identical(selected, selected2)

# mutate(): add a logical column that is TRUE when the sepal width exceeds
# half of the sepal length
newcol <- iris %>% mutate(greater.half = sepal.width > 0.5 * sepal.length)

tail(newcol)

# arrange(): sort the rows by petal width
newcol2 <- newcol %>% arrange(petal.width)
head(newcol2)

# virginica rows only, sorted by sepal width
arr.vir <- arrange(filter(newcol, species == "virginica"), sepal.width)

arr.vir[30:35, ]

# Scatter plot: sepal length on the x axis, sepal width on the y axis.
plot(iris$sepal.length , iris$sepal.width) 

# Histogram of sepal width.
hist(iris$sepal.width)

# Mean sepal length over the rows of arr.vir, ignoring missing values.
summarise(arr.vir, mean.length = mean(sepal.length, na.rm = TRUE))

# Scatter-plot matrix of columns 1-4, coloured by column 5.
# oma sets the outer margins -- presumably widened on the right (12) to leave
# room for the legend that is commented out below.
pairs(iris[,1:4],col=iris[,5],oma=c(4,4,6,12))
# NOTE(review): par(xpd=TRUE) changes a global graphics setting and is never
# restored; it looks like it was meant to let the legend draw outside the plot.
par(xpd=TRUE)
#legend(0.85,0.6, as.vector(unique(iris$species)),fill=c(1,2,3))

library(cluster) 
library(fpc)
 library(ggplot2)
library(corrplot)

# ---- k-means clustering of the iris measurements ----

# Reload the dataset so that iris has its original, capitalised column names
# (Sepal.Length, ..., Species), which the code below relies on.
data(iris)

# Fix the RNG seed so the clustering below is reproducible.
set.seed(8593)

# Work on a copy (iris2) so the original iris data stays untouched and the
# object used for this plot is easy to identify in the environment.
iris2 <- iris
iris2


# Remove the species column by assigning NULL, leaving only the four
# measurement columns for clustering.
iris2$Species <- NULL


# Ask for 3 clusters because we already know there are three species; with an
# unknown number of groups the cluster count would have to be chosen first.
# The outer parentheses print the fitted object: the cluster sizes (50, 62 and
# 38 in the author's run), the cluster assigned to each row, and the
# within-cluster sum of squares per cluster.
(kmeans.result <- kmeans(iris2, 3))

# Cross-tabulate the true species against the assigned cluster. In the
# author's run setosa was recovered exactly, while versicolor and virginica
# overlapped with each other.
table(iris$Species, kmeans.result$cluster)

# Plot the observations on the two sepal dimensions, coloured by cluster.
plot(iris2[c("Sepal.Length", "Sepal.Width")], col = kmeans.result$cluster)

# Mark the cluster centres to make the clusters easier to tell apart.
# kmeans.result$centers is a matrix (one row per cluster), so the two columns
# must be selected with [, cols]. Indexing it with a bare character vector
# treats it as a plain vector, yields NA, and points() then draws nothing.
cluster.centers <- kmeans.result$centers[, c("Sepal.Length", "Sepal.Width")]
points(cluster.centers, col = 1:3, pch = 8, cex = 2)
# Second clustering run, visualised with plotcluster() and clusplot()
# (presumably from the fpc and cluster packages attached earlier -- confirm).
# Reload iris before building the clustering input.
data(iris) 

# Drop column 5 so only the four measurement columns are clustered.
 data_for_clustering <- iris[,-5] 

# k-means with 3 centres. No set.seed() call precedes this line within the
# block, so the result depends on the RNG state left by earlier code.
clusters_iris <- kmeans(data_for_clustering, centers = 3) 

# Two plots of the data using the cluster assignment of each row.
plotcluster(data_for_clustering,clusters_iris$cluster) 
clusplot(data_for_clustering, clusters_iris$cluster, color = TRUE, shade = TRUE)

# Scatter plot of petal length (x) against sepal length (y), one colour per
# species.
# NOTE(review): "lenght" in the title string is a typo for "length"; it is left
# unchanged here because it is part of the rendered output.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Sepal.Length, colour = Species)
) +
  geom_point() +
  ggtitle('iris species by petal and sepal lenght')

Considering this plot, we can see that the setosa species is clearly separated from the other two species. Now we need to classify this data using only petal length and sepal length.

#Regression
\begin{tabular}{l|c|c}
 & $H_0$ False & $H_0$ True \\
\hline
Reject $H_0$ & correct & type I error \\
Fail to reject $H_0$ & type II error & correct \\
\end{tabular}