R - Basic R stat on iris - 2020/01/08
title: "stat1"
output: html_document
# Set the default chunk option echo = TRUE for the chunks that follow.
knitr::opts_chunk$set(echo = TRUE)
##Basic R coding for Statistics --------------------------------------------------------------------
using iris dataset in R library
Learning R on iris dataset
Author: Juhee Chung
Iris, introduced by Ronald Fisher in his 1936 paper *The use of multiple measurements in taxonomic problems*, contains three plant species (setosa, virginica, versicolor) and four features measured for each sample. These features quantify the morphological variation of the iris flower across the three species; all measurements are given in centimeters.
# Attach the built-in datasets package and load a fresh copy of iris.
library("datasets")
data(iris)
# First look at the data: per-column summary, the first six rows, and the
# structure (column types) of the data frame.
summary(object = iris)
head(x = iris, n = 6L)
str(object = iris)
# Columns: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species
# The dplyr package
# Install dplyr only when it is not available yet, so that re-running or
# knitting this document does not reinstall the package on every execution.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
# Lower-case the column names (Sepal.Length -> sepal.length, ...); the code
# below refers to the columns by these lower-case names.
names(iris) <- tolower(names(iris))
# Attach dplyr. library() stops with an error if the package cannot be
# loaded, so a separate require() call is not needed.
library(dplyr)
# apply(): run a function over the rows or the columns of a matrix/array
# (iris3 is the 3-dimensional array version of the iris data)
d <- matrix(seq_len(9), nrow = 3, ncol = 3)
d
# apply(X, MARGIN, FUN, ...): MARGIN = 1 works row-wise, MARGIN = 2 column-wise
apply(X = d, MARGIN = 1, FUN = sum) # row sums: 1+4+7, 2+5+8, 3+6+9
apply(X = d, MARGIN = 2, FUN = sum) # column sums: 1+2+3, 4+5+6, 7+8+9
# Column totals of the four numeric iris measurements
apply(X = iris[, 1:4], MARGIN = 2, FUN = sum)
# sepal.length sepal.width petal.length petal.width
#        876.5       458.6        563.7       179.9
# colSums() gives the same totals directly
colSums(iris[, 1:4])
# sepal.length sepal.width petal.length petal.width
#        876.5       458.6        563.7       179.9
# lapply(X, FUN, ...): apply FUN to every element of X; the result is a list
result <- lapply(seq_len(3), function(value) {
  value * 2
})
result
result[[1]]
# Flatten the list into a plain vector
unlist(result)
x <- list(a = 1:3, b = 4:6)
x
lapply(X = x, FUN = mean)
# A data frame is a list of columns, so this yields the mean of each column
lapply(X = iris[, 1:4], FUN = mean)
# $sepal.length
# [1] 5.843333
# $sepal.width
# [1] 3.057333
# $petal.length
# [1] 3.758
# $petal.width
# [1] 1.199333
# colMeans() returns the same values as a named numeric vector
colMeans(iris[, 1:4])
# sepal.length sepal.width petal.length petal.width
#     5.843333    3.057333     3.758000    1.199333
# Turning the list returned by lapply() into a one-row data frame:
# 1. unlist()        - list to vector
# 2. matrix()        - vector to matrix
# 3. as.data.frame() - matrix to data frame
# 4. names()         - copy the variable names of the iris columns onto the
#                      columns of the new data frame
d <- as.data.frame(
  matrix(
    unlist(lapply(iris[, 1:4], mean)),
    ncol = 4,
    byrow = TRUE
  )
)
d
names(d) <- names(iris)[1:4]
d
# Same result in one step: bind the per-column means side by side
data.frame(do.call("cbind", lapply(iris[, 1:4], mean)))
# If the data types are different, you have to use do.call():
# unlist() flattens the data frames into a single vector ...
x <- list(
  data.frame(name = "juhee", value = 1),
  data.frame(name = "chan", value = 2)
)
unlist(x)
# ... whereas do.call(rbind, x) stacks them row by row
# (do.call() : response time is a little slow)
do.call("rbind", x)
# sapply(): like lapply(), but the result is simplified to a vector or matrix
sapply(X = iris[, 1:4], FUN = mean) # returns a vector
lapply(X = iris[, 1:4], FUN = mean) # returns a list
class(sapply(X = iris[, 1:4], FUN = mean)) # [1] "numeric"
class(lapply(X = iris[, 1:4], FUN = mean)) # [1] "list"
x <- sapply(X = iris[, 1:4], FUN = mean)
# One column holding the four means ...
as.data.frame(x)
# ... or one row with four columns, via the transpose of x
as.data.frame(t(x))
# Class of every column of iris
sapply(X = iris, FUN = class)
# Element-wise comparison: the result is a logical matrix
y <- sapply(X = iris[, 1:4], FUN = function(column) {
  column > 3
})
class(y) # "matrix" (R >= 4.0 reports "matrix" "array")
head(y, 10)
# tapply(): apply a function to the values of each group
# tapply(vector, index, function)
tapply(X = 1:10, INDEX = rep(1, 10), FUN = sum)
# Group by "is the number odd?"
tapply(X = 1:10, INDEX = 1:10 %% 2 == 1, FUN = sum)
# FALSE  TRUE
#    30    25
# Mean sepal length per species
tapply(X = iris[["sepal.length"]], INDEX = iris[["species"]], FUN = mean)
#    setosa versicolor  virginica
#     5.006      5.936      6.588
m <- matrix(
  1:8,
  ncol = 2,
  dimnames = list(
    c("Spring", "Summer", "Fall", "Winter"),
    c("Male", "Female")
  )
)
m
# Two grouping vectors: the first splits the seasons into two halves, the
# second separates the Male column from the Female column
tapply(
  X = m,
  INDEX = list(
    c(1, 1, 2, 2, 1, 1, 2, 2),
    c(1, 1, 1, 1, 2, 2, 2, 2)
  ),
  FUN = sum
)
# mapply(): call a function once per position of several argument vectors
# Random number generators used as examples:
# rnorm(n, mean = 0, sd = 1)  # normal distribution
# runif(n, min = 0, max = 1)  # uniform distribution
# rpois(n, lambda)            # poisson distribution
# rexp(n, rate = 1)           # exponential distribution
rnorm(n = 10, mean = 0, sd = 1)
# Calls rnorm(1, 0, 1), rnorm(2, 10, 1) and rnorm(3, 100, 1)
mapply(rnorm, c(1, 2, 3), c(0, 10, 100), c(1, 1, 1))
# Mean of each numeric iris column
mapply(FUN = mean, iris[, 1:4])
#doBy : useful tool for managing data
#grouping by specific value
library(doBy)
# filter(): keep only the rows of species virginica
virginica <- iris %>% filter(species == "virginica")
head(virginica, 10)
# virginica rows whose sepal length is greater than 6
sepallen6 <- iris %>% filter(species == "virginica", sepal.length > 6)
tail(sepallen6)
head(sepallen6)
# select(): pick the specified columns, either one by one or as a range
selected <- iris %>% select(sepal.length, sepal.width, petal.length)
selected2 <- iris %>% select(sepal.length:petal.length)
head(selected, 3)
identical(selected, selected2)
# mutate(): add a column flagging rows where the sepal width is more than
# half of the sepal length
newcol <- iris %>% mutate(greater.half = sepal.width > 0.5 * sepal.length)
tail(newcol)
# arrange(): sort the rows by petal width
newcol2 <- newcol %>% arrange(petal.width)
head(newcol2)
# virginica rows only, sorted by sepal width
arr.vir <- arrange(filter(newcol, species == "virginica"), sepal.width)
arr.vir[30:35, ]
# Scatter plot of sepal length against sepal width, then a histogram of
# the sepal width
plot(x = iris$sepal.length, y = iris$sepal.width)
hist(x = iris$sepal.width)
# Mean sepal length of the sorted virginica rows
arr.vir %>% summarise(mean.length = mean(sepal.length, na.rm = TRUE))
# Scatter-plot matrix of the four measurements, coloured by the fifth
# column; the enlarged outer margin leaves room for a legend
pairs(x = iris[, 1:4], col = iris[, 5], oma = c(4, 4, 6, 12))
# Allow drawing outside the plot region (needed for the legend below)
par(xpd = TRUE)
# legend(0.85, 0.6, as.vector(unique(iris$species)), fill = c(1, 2, 3))
library(cluster)
library(fpc)
library(ggplot2)
library(corrplot)
# Reload the original iris data (restores the capitalised column names).
data(iris)
# Fix the random seed so the k-means result is reproducible.
set.seed(8593)
# Work on a copy named iris2, so the original data stays untouched and the
# object used for this plot is easy to identify in the environment.
iris2 <- iris
iris2
# Remove the species label by assigning NULL; only the four numeric
# measurements are clustered.
iris2$Species <- NULL
# Ask for 3 clusters because the data is known to contain three species;
# the procedure would differ if the number of clusters were unknown.
(kmeans.result <- kmeans(iris2, 3))
# The printed result shows the cluster sizes (the original author reported
# 50, 62 and 38), the cluster means, the clustering vector (the cluster
# assigned to each row) and the within-cluster sum of squares.
# Cross-tabulate the true species against the assigned cluster. The
# original author observed that setosa is recovered correctly, while
# versicolor and virginica overlap with each other.
table(iris$Species, kmeans.result$cluster)
# Plot the observations, coloured by cluster.
plot(iris2[c("Sepal.Length", "Sepal.Width")], col = kmeans.result$cluster)
# Mark the cluster centres to distinguish the clusters better.
# `centers` is a matrix, so the two columns must be selected with
# [, columns]; indexing it with a bare character vector treats the matrix
# as a plain vector and returns NA, so no centres were drawn.
centers_xy <- kmeans.result$centers[
  , c("Sepal.Length", "Sepal.Width"),
  drop = FALSE
]
points(centers_xy, col = 1:3, pch = 8, cex = 2)
# Reload iris and cluster the four numeric measurements (label column dropped)
data("iris")
data_for_clustering <- iris[, names(iris) != "Species"]
clusters_iris <- kmeans(x = data_for_clustering, centers = 3)
# Visualise the clusters with fpc::plotcluster() and cluster::clusplot()
plotcluster(data_for_clustering, clusters_iris$cluster)
clusplot(
  data_for_clustering,
  clusters_iris$cluster,
  color = TRUE,
  shade = TRUE
)
# Petal length against sepal length, coloured by the true species
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Sepal.Length, colour = Species)
) +
  geom_point() +
  ggtitle('iris species by petal and sepal lenght')
Looking at this plot, we can see that the setosa species is clearly separated from the other two species. Next, we need to classify this data using only Petal.Length and Sepal.Length.
#Regression
\begin{tabular}{l|c|c}
 & $H_0$ false & $H_0$ true \\
\hline
Reject $H_0$ & correct & Type I error \\
Fail to reject $H_0$ & Type II error & correct \\
\end{tabular}