# R 입문 및 기초 프로그래밍 6장

<프로그램>

# 제6장 자료분석을 위한 주요 통계 함수

# 6.1 기술통계량과 분할표의 재구성

score1 <- read.csv("top50.csv", sep=",", header = TRUE)
data1 <- score1[sapply(score1, is.numeric)] # 숫자형 변수만 추출
names(data1) # 변수명 확인

mean(data1$Size) # Size 변수의 평균

median(data1$Size) # Size 변수의 중위수

quantile(data1$Size) # Size 변수의 사분위수
fivenum(data1$Size) # Size 변수의 사분위수

var(data1$Size) # Size 변수의 분산
var(data1) # 전체 변수의 분산-공분산

sd(data1$Size) # 표준편차

min(data1$Size) # 최소값

max(data1$Size) # 최대값

range(data1$Size) # 범위

summary(data1$Size) # 최소, 4분위수, 중앙값, 평균, 최대

x <- c(3.7,3.3,3.5,2.8) # 평균을 구할 자료
wt <- c(5,5,4,1)/15 # 가중치
wm <- weighted.mean(x,wt) # 가중한 평균
wm

score = c(7,7,9,7,10,6,6,7,9,7,9,7,9,10,10,10,9,6,6,6,8,8,8,7,7)
n <- length(score)

par(mfrow = c(2,2)) # 화면 분할, 행을 중심
barplot(score, col = "red") # 원자료를 그래프화
barplot(table(score), col = "blue") # 요약자료(도수)를 그래프화
barplot(table(score)/n, col = "dark red") # 요약자료(비율)를 그래프화
pie(table(score), col = rainbow(24), radius = 1.0) # pie chart

data(warpbreaks) # 분할표 작성용 데이터 로드
warpbreaks
table(warpbreaks$wool, warpbreaks$tension) # wool by tension 빈도표

data(airquality) # 분할표 작성용 데이터 로드
c1 <- cut(airquality$Temp, quantile(airquality$Temp)) # Temp의 구간을 4분위수로 나눔
table(c1, airquality$Month) # c1 by Month 빈도표

data(Titanic) # flat table 작성용 데이터 로드
Titanic
ftable(Titanic, row.vars = 1:3) # flat table 행변수=1:3
ftable(Titanic, row.vars = 1:2, col.vars = "Survived") # flat table 행변수=1:2, 열변수=Survived
ftable(Titanic, row.vars = 2:1, col.vars = "Survived") # flat table 행변수=2:1, 열변수=Survived

data(mtcars) # 테이블 작성용 데이터 로드
mtcars
table(mtcars$vs, mtcars$am, mtcars$gear) # 분할표 작성
x <- ftable(mtcars[c("cyl","vs","am","gear")])
x
ftable(x, row.vars = c(2,4))
mytab <- ftable(Titanic, row.vars = c(1,3), type = "r")
mytab
prop.table(mytab, 1)


# 6.2 확률분포함수

# 분포관련 함수의 첫 글자가
# d 이면 probability density function x에 해당하는 확률 ex) dnorm
# p 이면 cumulative density function x에 해당하는 누적확률 ex) pnorm
# q 이면 누적 확률에 해당하는 x 값, 즉 역함수 ex) qnorm
# r 이면 난수생성 함수 ex) rnorm

# 이산형분포 : binom(이항분포), multinom(다항분포), pois(포아송분포), geom(기하분포), hyper(초기하분포)
# 연속형분포 : unif(균일분포), norm(정규분포), t(t분포), f(F분포), chisq(카이제곱분포), exp(지수분포)

z0 <- qnorm(p = c(.975, .95, .9), mean = 0, sd = 1) # 누적확률 .975 등에 해당하는 x 값 구하기
z1 <- round(z0, digit = 3) # 반올림
z1

p0 <- pnorm(q = c(-1.28, -1.64, -1.95), mean = 0, sd = 1, lower.tail = FALSE) #lower.tail = FALSE이므로 P[X > -1.28]를 구함
p1 <- round(p0, digit = 4)
p1

op <- par(yaxs = "i")
plot(function(x) dnorm(x, mean = 0, sd = 1), -3, 3,
ylim = c(0, 0.5), main = "Normal Distribution N(0,1)") # 정규분포 그래프 그리기
par(op)

par(mfrow = c(2,2)) # 화면 분할, 행을 중심
plot(function(x) dchisq(x, ncp=0, df=5), 0, 50,
ylim = c(0, 0.2), main = "Chi-Squared Distribution with df = 5") # 카이제곱분포 그리기
plot(function(x) dchisq(x, ncp=0, df=10), 0, 50,
ylim = c(0, 0.2), main = "Chi-Squared Distribution with df = 10") # 카이제곱분포 그리기
plot(function(x) dchisq(x, ncp=0, df=20), 0, 50,
ylim = c(0, 0.2), main = "Chi-Squared Distribution with df = 20") # 카이제곱분포 그리기
plot(function(x) dchisq(x, ncp=0, df=30), 0, 50,
ylim = c(0, 0.2), main = "Chi-Squared Distribution with df = 30") # 카이제곱분포 그리기
par(op)


set.seed(5000)
x <- runif(100, min = 0, max = 100) # 균일분포에서 난수 생성하기
x
breaks <- seq(0, 100, 20) # 0, 20, 40, 60, 80, 100 생성
barplot(table(cut(x, breaks))) # 구간별 개수로 그래프 그리기

outmean <- numeric(0) # numeric data type의 객체 만들기
for(i in 1:100) {
xx <- sample(1:5, 100, replace = TRUE) # 1에서 5까지의 숫자 중 랜덤하게 복원 100개 추출
outmean[i] <- mean(xx)
}
outmean
hist(outmean)


# 6.3 가설검증 관련 함수들

# 6.3.1 t.test wilcox.test var.test cor.test

extra1 <- c(0.7, -1.6, -0.2, -1.2, -0.1, 3.4, 3.7, 0.8, 0.0, 2.0)
length(extra1)
t.test(extra1, alternative = "two.sided", mu = 0) # one sample 양측 t-검정

extra2 <- c(1.9, 0.8, 1.1, 0.1, -0.1, 4.4, 5.5, 1.6, 4.6, 3.4)
length(extra2)

var.test(extra1, extra2) # 두 샘플의 등분산성 검정

sleep <- rbind(cbind(extra1, rep(1,10)),cbind(extra2, rep(2,10))) # t-검정을 위하여 자료 만들기
colnames(sleep) <- c("extra", "group") # 변수 이름 주기
sleep

t.test(extra ~ group, data = sleep, var.equal = TRUE) # t-검정 실시
# t.test(extra1, extra2, var.equal = TRUE) 위의 분석과 동일

boxplot(extra ~ group, data = sleep)


x <- c(1.83, 0.5, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.3) # paired comparison test 짝비교
y <- c(0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29)
t.test(x, y, paired = TRUE) # paried t-test
t.test(x - y) # x에서 y를 뺀 값을 one sample t-test


x <- c(44.4, 45.9, 41.9, 53.3, 44.7, 44.1, 50.7, 45.2, 60.1)
y <- c( 2.6, 3.1, 2.5, 5.0, 3.6, 4.0, 5.2, 2.8, 3.8)
cor.test(x, y, method = "pearson", alternative = "two.sided") # 상관분석

# 6.3.2 binom.test, prop.test 함수

# 100명 중 후보 A 지지:67명, 후보 B지지:33명 이항검증 Exact binomial test
binom.test(67, 100, p = 1/2, alternative = "two.sided")

# 4개 그룹의 흡연율이 모두 동일한가?
smokers <- c(83, 90, 129, 70)
patients <- c(86, 93, 136, 82)
prop.test(smokers, patients)

# 6.3.3 oneway.test, kruskal.test 함수
score <- c(64, 72, 68, 77, 56, 95,
78, 91, 97, 82, 85, 77,
75, 93, 78, 71, 63, 76,
55, 66, 49, 64, 70, 68)
group <- as.factor(c(rep(1,6),rep(2,6),rep(3,6),rep(4,6)))
edu <- data.frame(cbind(score, group))
names(edu)

oneway.test(edu$score ~ edu$group, data = edu) # 일원분산분석
anova(lm(edu$score ~ edu$group, data = edu))


# 6.3.4 chisq.test, fisher.test, mcnemar.test

x <- matrix(c(12,5,7,7), nc=2)
x
chisq.test(x) # 카이제곱 검정
chisq.test(x, simulate.p.value = TRUE, B = 10000) # 몬테칼로 모의실험을 통한 유의확률 계산

x <- c(A = 20, B = 15, C = 25)
x
chisq.test(x) # 동일 비율인지 카이제곱 검정
chisq.test(x, p = c(1,1,3)/5) # 1:1:3에 비율인지 카이제곱 검정


<프로그램 결과>

R version 2.8.0 (2008-10-20)
Copyright (C) 2008 The R Foundation for Statistical Computing
ISBN 3-900051-07-0

R은 free 소프트웨어이고, [완전하게 무보증]입니다.
일정한 조건에 따르면, 자유롭게 이것을 재배포할수가 있습니다.
배포 조건의 상세한것에 대해서는 'license()' 또는 'licence()' 라고 입력해주십시오

R는 많은 공헌자에의한 공동 프로젝트입니다
더 자세한것에 대해서는 'contributors()'라고 입력해 주십시오.
또는, R나 R의 패키지를 출판물로 인용할때의 형식에 대해서는
'citation()'라고 입력해주십시오
'demo()'라고 입력하면, demos를 볼수가 있습니다.
'help()'라고 한다면, on-line help가 나옵니다.
'help.start()'로 HTML 브라우저에 의한 help가 보여집니다
'q()'라고 입력하면 R를 종료합니다
> # R 입문 및 기초 프로그래밍
>
> # 제6장 자료분석을 위한 주요 통계 함수
>
> # 6.1 기술통계량과 분할표의 재구성
>
> score1 <- read.csv("top50.csv", sep=",", header = TRUE)
> data1 <- score1[sapply(score1, is.numeric)] # 숫자형 변수만 추출
> names(data1) # 변수명 확인
[1] "Rank" "Total" "Alumni" "Award" "HiCi" "NS" "SCI" "Size"
>
> mean(data1$Size) # Size 변수의 평균
[1] 36.546
>
> median(data1$Size) # Size 변수의 중위수
[1] 34.9
>
> quantile(data1$Size) # Size 변수의 사분위수
0% 25% 50% 75% 100%
12.80 20.90 34.90 45.05 100.00
> fivenum(data1$Size) # Size 변수의 사분위수
[1] 12.8 20.7 34.9 45.1 100.0
>
> var(data1$Size) # Size 변수의 분산
[1] 310.707
> var(data1) # 전체 변수의 분산-공분산
Rank Total Alumni Award HiCi NS
Rank 209.1837 -197.3527 -236.5000 -267.77429 -183.2665 -193.9914
Total -197.3527 220.4495 275.3721 308.37487 202.9632 212.4297
Alumni -236.5000 275.3721 538.1906 438.98106 185.7502 215.9799
Award -267.7743 308.3749 438.9811 590.31620 228.8054 261.4273
HiCi -183.2665 202.9632 185.7502 228.80541 256.5221 213.9668
NS -193.9914 212.4297 215.9799 261.42726 213.9668 243.1977
SCI -101.5918 107.5739 113.6661 38.01971 121.6901 109.3743
Size -163.3371 176.6299 194.6954 282.24778 119.7899 165.5881
SCI Size
Rank -101.59184 -163.33714
Total 107.57392 176.62991
Alumni 113.66612 194.69543
Award 38.01971 282.24778
HiCi 121.69008 119.78991
NS 109.37433 165.58810
SCI 176.42490 26.75588
Size 26.75588 310.70702
>
> sd(data1$Size) # 표준편차
[1] 17.62688
>
> min(data1$Size) # 최소값
[1] 12.8
>
> max(data1$Size) # 최대값
[1] 100
>
> range(data1$Size) # 범위
[1] 12.8 100.0
>
> summary(data1$Size) # 최소, 4분위수, 중앙값, 평균, 최대
Min. 1st Qu. Median Mean 3rd Qu. Max.
12.80 20.90 34.90 36.55 45.05 100.00
>
> x <- c(3.7,3.3,3.5,2.8) # 평균을 구할 자료
> wt <- c(5,5,4,1)/15 # 가중치
> wm <- weighted.mean(x,wt) # 가중한 평균
> wm
[1] 3.453333
>
> score = c(7,7,9,7,10,6,6,7,9,7,9,7,9,10,10,10,9,6,6,6,8,8,8,7,7)
> n <- length(score)
>
> par(mfrow = c(2,2)) # 화면 분할, 행을 중심
> barplot(score, col = "red") # 원자료를 그래프화
> barplot(table(score), col = "blue") # 요약자료(도수)를 그래프화
> barplot(table(score)/n, col = "dark red") # 요약자료(비율)를 그래프화
> pie(table(score), col = rainbow(24), radius = 1.0) # pie chart
>
> data(warpbreaks) # 분할표 작성용 데이터 로드
> warpbreaks
breaks wool tension
1 26 A L
2 30 A L
3 54 A L
4 25 A L
5 70 A L
6 52 A L
7 51 A L
8 26 A L
9 67 A L
10 18 A M
11 21 A M
12 29 A M
13 17 A M
14 12 A M
15 18 A M
16 35 A M
17 30 A M
18 36 A M
19 36 A H
20 21 A H
21 24 A H
22 18 A H
23 10 A H
24 43 A H
25 28 A H
26 15 A H
27 26 A H
28 27 B L
29 14 B L
30 29 B L
31 19 B L
32 29 B L
33 31 B L
34 41 B L
35 20 B L
36 44 B L
37 42 B M
38 26 B M
39 19 B M
40 16 B M
41 39 B M
42 28 B M
43 21 B M
44 39 B M
45 29 B M
46 20 B H
47 21 B H
48 24 B H
49 17 B H
50 13 B H
51 15 B H
52 15 B H
53 16 B H
54 28 B H
> table(warpbreaks$wool, warpbreaks$tension) # wool by tension 빈도표

L M H
A 9 9 9
B 9 9 9
>
> data(airquality) # 분할표 작성용 데이터 로드
> c1 <- cut(airquality$Temp, quantile(airquality$Temp)) # Temp의 구간을 4분위수로 나눔
> table(c1, airquality$Month) # c1 by Month 빈도표

c1 5 6 7 8 9
(56,72] 24 3 0 1 10
(72,79] 5 15 2 9 10
(79,85] 1 7 19 7 5
(85,97] 0 5 10 14 5
>
> data(Titanic) # flat table 작성용 데이터 로드
> Titanic
, , Age = Child, Survived = No

Sex
Class Male Female
1st 0 0
2nd 0 0
3rd 35 17
Crew 0 0

, , Age = Adult, Survived = No

Sex
Class Male Female
1st 118 4
2nd 154 13
3rd 387 89
Crew 670 3

, , Age = Child, Survived = Yes

Sex
Class Male Female
1st 5 1
2nd 11 13
3rd 13 14
Crew 0 0

, , Age = Adult, Survived = Yes

Sex
Class Male Female
1st 57 140
2nd 14 80
3rd 75 76
Crew 192 20

> ftable(Titanic, row.vars = 1:3) # flat table 행변수=1:3
Survived No Yes
Class Sex Age
1st Male Child 0 5
Adult 118 57
Female Child 0 1
Adult 4 140
2nd Male Child 0 11
Adult 154 14
Female Child 0 13
Adult 13 80
3rd Male Child 35 13
Adult 387 75
Female Child 17 14
Adult 89 76
Crew Male Child 0 0
Adult 670 192
Female Child 0 0
Adult 3 20
> ftable(Titanic, row.vars = 1:2, col.vars = "Survived") # flat table 행변수=1:2, 열변수=Survived
Survived No Yes
Class Sex
1st Male 118 62
Female 4 141
2nd Male 154 25
Female 13 93
3rd Male 422 88
Female 106 90
Crew Male 670 192
Female 3 20
> ftable(Titanic, row.vars = 2:1, col.vars = "Survived") # flat table 행변수=2:1, 열변수=Survived
Survived No Yes
Sex Class
Male 1st 118 62
2nd 154 25
3rd 422 88
Crew 670 192
Female 1st 4 141
2nd 13 93
3rd 106 90
Crew 3 20
>
> data(mtcars) # 테이블 작성용 데이터 로드
> mtcars
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
> table(mtcars$vs, mtcars$am, mtcars$gear) # 분할표 작성
, , = 3


0 1
0 12 0
1 3 0

, , = 4


0 1
0 0 2
1 4 6

, , = 5


0 1
0 0 4
1 0 1

> x <- ftable(mtcars[c("cyl","vs","am","gear")])
> x
gear 3 4 5
cyl vs am
4 0 0 0 0 0
1 0 0 1
1 0 1 2 0
1 0 6 1
6 0 0 0 0 0
1 0 2 1
1 0 2 2 0
1 0 0 0
8 0 0 12 0 0
1 0 0 2
1 0 0 0 0
1 0 0 0
> ftable(x, row.vars = c(2,4))
cyl 4 6 8
am 0 1 0 1 0 1
vs gear
0 3 0 0 0 0 12 0
4 0 0 0 2 0 0
5 0 1 0 1 0 2
1 3 1 0 2 0 0 0
4 2 6 2 0 0 0
5 0 1 0 0 0 0
> mytab <- ftable(Titanic, row.vars = c(1,3), type = "r")
> mytab
Sex Male Female
Survived No Yes No Yes
Class Age
1st Child 0 5 0 1
Adult 118 57 4 140
2nd Child 0 11 0 13
Adult 154 14 13 80
3rd Child 35 13 17 14
Adult 387 75 89 76
Crew Child 0 0 0 0
Adult 670 192 3 20
> prop.table(mytab, 1)
Sex Male Female
Survived No Yes No Yes
Class Age
1st Child 0.000000000 0.833333333 0.000000000 0.166666667
Adult 0.369905956 0.178683386 0.012539185 0.438871473
2nd Child 0.000000000 0.458333333 0.000000000 0.541666667
Adult 0.590038314 0.053639847 0.049808429 0.306513410
3rd Child 0.443037975 0.164556962 0.215189873 0.177215190
Adult 0.617224880 0.119617225 0.141945774 0.121212121
Crew Child NaN NaN NaN NaN
Adult 0.757062147 0.216949153 0.003389831 0.022598870
>
>
> # 6.2 확률분포함수
>
> # 분포관련 함수의 첫 글자가
> # d 이면 probability density function x에 해당하는 확률 ex) dnorm
> # p 이면 cumulative density function x에 해당하는 누적확률 ex) pnorm
> # q 이면 누적 확률에 해당하는 x 값, 즉 역함수 ex) qnorm
> # r 이면 난수생성 함수 ex) rnorm
>
> # 이산형분포 : binom(이항분포), multinom(다항분포), pois(포아송분포), geom(기하분포), hyper(초기하분포)
> # 연속형분포 : unif(균일분포), norm(정규분포), t(t분포), f(F분포), chisq(카이제곱분포), exp(지수분포)
>
> z0 <- qnorm(p = c(.975, .95, .9), mean = 0, sd = 1) # 누적확률 .975 등에 해당하는 x 값 구하기
> z1 <- round(z0, digit = 3) # 반올림
> z1
[1] 1.960 1.645 1.282
>
> p0 <- pnorm(q = c(-1.28, -1.64, -1.95), mean = 0, sd = 1, lower.tail = FALSE) #lower.tail = FALSE이므로 P[X > -1.28]를 구함
> p1 <- round(p0, digit = 4)
> p1
[1] 0.8997 0.9495 0.9744
>
> op <- par(yaxs = "i")
> plot(function(x) dnorm(x, mean = 0, sd = 1), -3, 3,
+ ylim = c(0, 0.5), main = "Normal Distribution N(0,1)") # 정규분포 그래프 그리기
> par(op)
>
> par(mfrow = c(2,2)) # 화면 분할, 행을 중심
> plot(function(x) dchisq(x, ncp=0, df=5), 0, 50,
+ ylim = c(0, 0.2), main = "Chi-Squared Distribution with df = 5") # 카이제곱분포 그리기
> plot(function(x) dchisq(x, ncp=0, df=10), 0, 50,
+ ylim = c(0, 0.2), main = "Chi-Squared Distribution with df = 10") # 카이제곱분포 그리기
> plot(function(x) dchisq(x, ncp=0, df=20), 0, 50,
+ ylim = c(0, 0.2), main = "Chi-Squared Distribution with df = 20") # 카이제곱분포 그리기
> plot(function(x) dchisq(x, ncp=0, df=30), 0, 50,
+ ylim = c(0, 0.2), main = "Chi-Squared Distribution with df = 30") # 카이제곱분포 그리기
> par(op)
>
>
> set.seed(5000)
> x <- runif(100, min = 0, max = 100) # 균일분포에서 난수 생성하기
> x
[1] 44.8220263 44.2002986 12.7015065 98.4049592 1.4467274 69.6072417
[7] 77.7870707 0.7336880 31.7917702 46.8641533 34.5300433 91.8756908
[13] 85.0416658 62.6601236 14.5366424 62.8844190 48.5009914 21.1912350
[19] 89.7244355 23.8245935 63.5789237 4.5388972 33.2479654 72.6832475
[25] 39.0438123 53.6073128 74.9977842 62.6367480 89.5192926 17.0346685
[31] 64.3730106 16.2617763 14.3332748 53.2736583 13.5270498 27.0631616
[37] 42.6475586 77.4119293 91.3527230 42.0280545 72.5862657 22.3027105
[43] 21.0247756 91.2986977 39.7505348 22.1824197 54.9299244 0.4067294
[49] 44.7970997 52.3048372 83.0666517 28.6782759 31.9669676 11.6344629
[55] 9.0248491 0.8261689 66.9839170 81.4940627 56.0016208 4.8549011
[61] 82.7248210 71.1825128 97.1149146 80.3429085 12.5407167 22.1994634
[67] 56.6060178 29.5777780 16.0533162 39.8827740 45.4359173 44.4860377
[73] 46.0990052 78.0708252 11.0269129 41.2984187 25.3414123 54.2448909
[79] 54.3813558 90.8013948 92.7887089 21.0828942 94.7526456 73.3672802
[85] 77.8470048 13.3707660 64.9445184 5.2184763 82.1628311 40.2009236
[91] 63.9286351 61.2824839 24.9548796 84.3814270 14.5189138 53.3733159
[97] 58.7353092 73.9414691 81.2902514 33.4766168
> breaks <- seq(0, 100, 20) # 0, 20, 40, 60, 80, 100 생성
> barplot(table(cut(x, breaks))) # 구간별 개수로 그래프 그리기
>
> outmean <- numeric(0) # numeric data type의 객체 만들기
> for(i in 1:100) {
+ xx <- sample(1:5, 100, replace = TRUE) # 1에서 5까지의 숫자 중 랜덤하게 복원 100개 추출
+ outmean[i] <- mean(xx)
+ }
> outmean
[1] 2.89 2.98 3.10 2.88 2.87 3.07 2.72 3.01 3.04 3.14 2.88 2.98 2.78 3.21
[15] 3.01 2.85 2.81 2.89 3.06 3.16 3.32 3.03 3.16 3.21 2.74 2.76 2.84 3.11
[29] 2.88 2.90 3.04 3.08 3.17 2.98 3.07 3.03 2.92 3.12 3.02 3.08 2.99 3.00
[43] 2.86 2.94 2.98 3.02 3.12 2.89 3.09 3.02 3.07 3.05 3.14 3.15 2.93 2.89
[57] 3.12 3.15 2.89 3.01 3.20 3.15 3.09 3.28 2.93 2.99 3.09 2.82 3.16 3.07
[71] 3.09 2.90 2.87 2.93 2.86 2.99 3.32 3.04 2.80 3.03 2.97 3.05 2.99 2.89
[85] 2.88 3.17 3.04 3.00 3.07 2.88 2.92 3.12 2.92 2.79 2.67 2.94 3.06 2.98
[99] 2.87 2.77
> hist(outmean)
>
>
> # 6.3 가설검증 관련 함수들
>
> # 6.3.1 t.test wilcox.test var.test cor.test
>
> extra1 <- c(0.7, -1.6, -0.2, -1.2, -0.1, 3.4, 3.7, 0.8, 0.0, 2.0)
> length(extra1)
[1] 10
> t.test(extra1, alternative = "two.sided", mu = 0) # one sample 양측 t-검정

One Sample t-test

data: extra1
t = 1.3257, df = 9, p-value = 0.2176
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
-0.5297804 2.0297804
sample estimates:
mean of x
0.75

>
> extra2 <- c(1.9, 0.8, 1.1, 0.1, -0.1, 4.4, 5.5, 1.6, 4.6, 3.4)
> length(extra2)
[1] 10
>
> var.test(extra1, extra2) # 두 샘플의 등분산성 검정

F test to compare two variances

data: extra1 and extra2
F = 0.7983, num df = 9, denom df = 9, p-value = 0.7427
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
0.198297 3.214123
sample estimates:
ratio of variances
0.7983426

>
> sleep <- rbind(cbind(extra1, rep(1,10)),cbind(extra2, rep(2,10))) # t-검정을 위하여 자료 만들기
> colnames(sleep) <- c("extra", "group") # 변수 이름 주기
> sleep
extra group
[1,] 0.7 1
[2,] -1.6 1
[3,] -0.2 1
[4,] -1.2 1
[5,] -0.1 1
[6,] 3.4 1
[7,] 3.7 1
[8,] 0.8 1
[9,] 0.0 1
[10,] 2.0 1
[11,] 1.9 2
[12,] 0.8 2
[13,] 1.1 2
[14,] 0.1 2
[15,] -0.1 2
[16,] 4.4 2
[17,] 5.5 2
[18,] 1.6 2
[19,] 4.6 2
[20,] 3.4 2
>
> t.test(extra ~ group, data = sleep, var.equal = TRUE) # t-검정 실시

Two Sample t-test

data: extra by group
t = -1.8608, df = 18, p-value = 0.07919
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-3.3638740 0.2038740
sample estimates:
mean in group 1 mean in group 2
0.75 2.33

> # t.test(extra1, extra2, var.equal = TRUE) 위의 분석과 동일
>
> boxplot(extra ~ group, data = sleep)
>
>
> x <- c(1.83, 0.5, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.3) # paired comparison test 짝비교
> y <- c(0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29)
> t.test(x, y, paired = TRUE) # paried t-test

Paired t-test

data: x and y
t = 3.0354, df = 8, p-value = 0.01618
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.1037787 0.7599991
sample estimates:
mean of the differences
0.4318889

> t.test(x - y) # x에서 y를 뺀 값을 one sample t-test

One Sample t-test

data: x - y
t = 3.0354, df = 8, p-value = 0.01618
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
0.1037787 0.7599991
sample estimates:
mean of x
0.4318889

>
>
> x <- c(44.4, 45.9, 41.9, 53.3, 44.7, 44.1, 50.7, 45.2, 60.1)
> y <- c( 2.6, 3.1, 2.5, 5.0, 3.6, 4.0, 5.2, 2.8, 3.8)
> cor.test(x, y, method = "pearson", alternative = "two.sided") # 상관분석

Pearson's product-moment correlation

data: x and y
t = 1.8411, df = 7, p-value = 0.1082
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.1497426 0.8955795
sample estimates:
cor
0.5711816

>
> # 6.3.2 binom.test, prop.test 함수
>
> # 100명 중 후보 A 지지:67명, 후보 B지지:33명 이항검증 Exact binomial test
> binom.test(67, 100, p = 1/2, alternative = "two.sided")

Exact binomial test

data: 67 and 100
number of successes = 67, number of trials = 100, p-value =
0.0008737
alternative hypothesis: true probability of success is not equal to 0.5
95 percent confidence interval:
0.5688272 0.7608015
sample estimates:
probability of success
0.67

>
> # 4개 그룹의 흡연율이 모두 동일한가?
> smokers <- c(83, 90, 129, 70)
> patients <- c(86, 93, 136, 82)
> prop.test(smokers, patients)

4-sample test for equality of proportions without continuity
correction

data: smokers out of patients
X-squared = 12.6004, df = 3, p-value = 0.005585
alternative hypothesis: two.sided
sample estimates:
prop 1 prop 2 prop 3 prop 4
0.9651163 0.9677419 0.9485294 0.8536585

>
> # 6.3.3 oneway.test, kruskal.test 함수
> score <- c(64, 72, 68, 77, 56, 95,
+ 78, 91, 97, 82, 85, 77,
+ 75, 93, 78, 71, 63, 76,
+ 55, 66, 49, 64, 70, 68)
> group <- as.factor(c(rep(1,6),rep(2,6),rep(3,6),rep(4,6)))
> edu <- data.frame(cbind(score, group))
> names(edu)
[1] "score" "group"
>
> oneway.test(edu$score ~ edu$group, data = edu) # 일원분산분석

One-way analysis of means (not assuming equal variances)

data: edu$score and edu$group
F = 7.4757, num df = 3.000, denom df = 10.953, p-value = 0.005353

> anova(lm(edu$score ~ edu$group, data = edu))
Analysis of Variance Table

Response: edu$score
Df Sum Sq Mean Sq F value Pr(>F)
edu$group 1 456.3 456.3 3.1388 0.0903 .
Residuals 22 3198.2 145.4
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
>
> # 6.3.4 chisq.test, fisher.test, mcnemar.test
>
> x <- matrix(c(12,5,7,7), nc=2)
> x
[,1] [,2]
[1,] 12 7
[2,] 5 7
> chisq.test(x) # 카이제곱 검정

Pearson's Chi-squared test with Yates' continuity correction

data: x
X-squared = 0.6411, df = 1, p-value = 0.4233

> chisq.test(x, simulate.p.value = TRUE, B = 10000) # 몬테칼로 모의실험을 통한 유의확률 계산

Pearson's Chi-squared test with simulated p-value (based on 10000
replicates)

data: x
X-squared = 1.3716, df = NA, p-value = 0.2907

>
> x <- c(A = 20, B = 15, C = 25)
> x
A B C
20 15 25
> chisq.test(x) # 동일 비율인지 카이제곱 검정

Chi-squared test for given probabilities

data: x
X-squared = 2.5, df = 2, p-value = 0.2865

> chisq.test(x, p = c(1,1,3)/5) # 1:1:3에 비율인지 카이제곱 검정

Chi-squared test for given probabilities

data: x
X-squared = 9.4444, df = 2, p-value = 0.008895

>

+ Recent posts