# 應用統計筆記
## 區間估計
[觀念參考](http://web.cjcu.edu.tw/~jdwu/stat01/lect003.pdf)
## 變數類型
```graphviz
digraph hierarchy {
nodesep=1.0
node [color=Black, fontname=Courier,shape=box]
edge [color=Black]
"all variables"->{"numerical/qualitative" "categorical/qualitative"}
"numerical/qualitative"->{ratio interval}
"categorical/qualitative"->nominal
{rank=same;nominal ordinal }
}
```
**Type of variable**
* Nominal(名目): A nominal variable is a type of variable that is used to name, label or categorize particular attributes that are being measured, Like ++Gender++。
* Ordinal(順序): Ordinal variable is a type of measurement variable that takes values with an order or rank.**The difference in the rank of an ordinal variable is not equal.**
* Interval(區間): The interval variable is a measurement variable that is used to define **values measured along a scale**, with **each point placed at an equal distance from one another**. Like ++Temperature++, ++Time++, ++IQ Test++ (zero has no meaning).
* Ratio(比例): The only difference between the ratio variable and interval variable is that **the ratio variable already has a zero value** (zero has meaning).
### Population and sample
* Population: Populations can include people, but other examples include objects, events, businesses, and so on.
* Population Parameters: A characteristic of **an entire population**.
* Sample statistics: A statistic is a characteristic of **a sample**.
* case/observation: Objects in the sample collection
* sample: A sample is the specific group that you will collect data from.
## Mar. 17
### barplot 長條圖(用在類別資料上)
```r
#bar chart
t1<-table(lg$gender)
barplot(t1)
barplot(t1,main="Gender in TW legislature",
xlab="Gender",ylab="Frequency")
barplot(t1,main="Gender in TW legislature",
xlab="Gender",ylab="Frequency",
col="red",border="red")
barplot(t1,main="Gender in TW legislature",
xlab="Gender",ylab="Frequency",
col=c("#FF8C00","#4169E1"))
#names.arg 給類別名稱
barplot(t1,main="Gender in TW legislature",
xlab="Gender",ylab="Frequency",
col=c("#FF8C00","#4169E1"),
names.arg = c("Female","Male"))
#相對次數分配表
t2<-prop.table(t1)
barplot(t2,main="Gender in TW legislature",
xlab="Gender",ylab="Proportion")
```
### histogram 直方圖(用在數值資料上 )(連續的)
```r
#histogram
hist(lg$age)
hist(lg$age,main="Age in TW legislature",
xlab="Age")
# breaks 用來設定長條的數量
hist(lg$age,main="Age in TW legislature",
xlab="Age",breaks=30)
range(lg$age) # 得到最小跟最大值
# breaks 如果給c(...) 代表 25到30畫一個,30到35畫一個...
# (但給的範圍必須小於最小值和大於最大值)
hist(lg$age,main="Age in TW legislature",
xlab="Age",breaks=c(25,30,35,40,45,50,55,70,75,80))
#seq(25,80,5) = c(25,30,35,40,45,50,55,70,75,80)
hist(lg$age,main="Age in TW legislature",
xlab="Age",breaks=seq(25,80,5))
hist(lg$age,main="Age in TW legislature",
xlab="Age",breaks=seq(25,80,5),col="red")
hist(lg$age,main="Age in TW legislature",
xlab="Age",breaks=seq(25,80,5),col=c("#BC8F8F","#6A5ACD"))
hist(lg$age[lg$party=="KMT"],main="Age in TW legislature",
xlab="Age",breaks=c(25,30,35,40,45,50,55,60,65,70,75,80))
hist(lg$age[lg$party=="DPP"],main="Age in TW legislature",
xlab="Age",breaks=c(25,30,35,40,45,50,55,60,65,70,75,80))
#abline
hist(lg$age,main="Age in TW legislature",
xlab="Age",breaks=c(25,30,35,40,45,50,55,60,65,70,75,80))
mean(lg$age)
# abline 在已經有的圖上 畫直線
# v 畫直的
# h 畫橫的
abline(v=mean(lg$age))
# lty 是設定線的形狀
# 4 for ·-·-
abline(v=mean(lg$age),col="red",lwd=3,lty=4)
abline(h=15,col="red",lwd=5,lty=4)
hist(lg$age,main="Age in TW legislature",
xlab="Age",breaks=c(25,30,35,40,45,50,55,60,65,70,75,80), freq=F)
# freq 設定false 為density plot
```
## Mar. 24 : Normal Distribution and 31 : Correlations and Regressions (pnorm)
### boxplot 盒型圖
```r
#boxplot 盒型圖
boxplot(lg$age)
summary(lg$age) #顯示 Min. 1st Qu. Median Mean 3rd Qu. max.
boxplot(lg$age,main="Age in TW legislature",ylab="Age")
boxplot(lg$age[lg$party=="KMT"],main="Age of KMT legislators",ylab="Age")
#前面是用來畫圖的變項 "~" 後面是用來顯示分組的變項
boxplot(lg$age~lg$party)
boxplot(lg$age~lg$party,main="Age in TW legislature",ylab="Age",
xlab="Party")
boxplot(lg$age~lg$party,main="Age in TW legislature",ylab="Age",
xlab="Party",col=c("green","blue","gray","yellow",
"gray","orange"))
```

```r
largeparty<-lg[lg$party%in%c("KMT","DPP"),]
boxplot(largeparty$age~largeparty$party)
#用as.character 來讓我的類別只剩下DPP KMT,因為原本的型態還留著其他政黨的變量(NONE, NPP, othrer, PFP)
boxplot(largeparty$age~as.character(largeparty$party))
```
### scatterplot 散佈圖
```r
#scatterplot
plot(x=lg$age,y=lg$term)
plot(x=lg$age,y=lg$term, main="Age and term in TW legislature",
xlab="Age",ylab="Term")
plot(x=lg$age,y=lg$term,main="Age and term in TW legislature",
xlab="Age",ylab="Term",col="red")
plot(x=lg$age,y=lg$term,main="Age and term in TW legislature",
xlab="Age",ylab="Term",col="red",pch=3)
# pch 是用來設定點的形狀
# 3 is +
# 16 is ·
plot(x=lg$age[lg$party=="KMT"],y=lg$term[lg$party=="KMT"],main="Age and term in TW legislature",
xlab="Age",ylab="Term",col="blue",pch=16)
#points 是將圖來畫在前一個plot裡(疊圖),如下圖
points(x=lg$age[lg$party=="DPP"],y=lg$term[lg$party=="DPP"],
col="dark green",pch=16)
```

### norm
```r
#pnorm: get the propability of -Inf to Z
pnorm(2) # 0.9772499
pnorm(2)-pnorm(0) # 0.9772499 - 0.5
1-pnorm(1.5) # get z to Inf
#pnorm(x,平均,標準差)
pnorm(130,100,15) #0.9772499
###############
#qnorm: what is the Z-score of the pth quantile of the normal distribution?
# qnorm 是用來得到我們的z值
# 例如我們用pnorm(2)會得到負無窮大到2的機率值 = 0.9772499
# 而用 qnorm(0.9772499) 會得到接近2的值 = 2.1
qnorm(0.9772499) # 2
qnorm(0.5)
#qnorm(機率,平均,標準差)
qnorm(0.08, 100, 15) # 78.92393
###############
#rnorm: generate random numbers from a normal distribution
# 從標準常態分佈中 抽五個數出來
n5<-rnorm(5)
hist(n5)
n100<-rnorm(100)
hist(n100)
n10000<-rnorm(10000)
hist(n10000)
###############
#generate Q-Q plot to compare a normal distribution with the observed data
hist(lg$age)
qqnorm(lg$age)
qqline(lg$age)
```
## Apr. 7 (Foundation for Inference)
```r
#correlation
#畫出資料的散佈圖
#plot(x, y)
plot(x=puhe$polity2,y=puhe$wdi_puhe)
#算出兩者的相關係數
# use="complete" --> 確保遺漏值並不會放入計算,也就是說當兩個變量都沒有遺漏,才會列入計算
cor(puhe$polity2,puhe$wdi_puhe,use="complete")
#regression 迴歸分析
# lm = linear model
#lm(y~x)
m1<-lm(wdi_puhe~polity2,data=puhe)
summary(m1) # 回歸模型的細節
abline(m1,col="red")# 將回歸線畫在散佈圖上
plot(m1)
############
#regression
y<-c(10.7, 11.1, 11.5, 11.7, 12, 12, 12.1, 12.7, 12.5, 13.3, 12.5)
x<-c(3.6, 3.5, 4.9, 5.9, 5.6, 4.9, 5.6, 8.5, 7.7, 7, 6)
ybar<-mean(y)
ybar-y
xbar<-mean(x)
xbar-x
sum((y-ybar)*(x-xbar))
sum((x-xbar)^2)
#slope
sum((y-ybar)*(x-xbar))/sum((x-xbar)^2)
ybar-0.4066376*xbar
reg1<-lm(y~x)
summary(reg1)
```
---
## Apr. 14: Estimating Unknown Parameters
### CLT (中央極限定理)
```r
# runif(觀察值,區間)
population<-runif(10000,0,10)
hist(population)
# 抽樣 每次抽出30個樣本
s1<-sample(population,30)
# 計算 樣本平均數(30 個觀察值)
m1<-mean(s1)
s2<-sample(population,30)
m2<-mean(s2)
s3<-sample(population,30)
m3<-mean(s3)
samplemean <- c(m1,m2,m3)
#直方圖
hist(samplemean)
#畫出機率密度出來
plot(density(samplemean))
mean(samplemean)
####################
#looping
n30<-NULL
for(i in 1:100){
x<-sample(population,30)
n30[i]<-mean(x)
}
hist(n30)
plot(density(n30))
mean(n30)
####################
n100<-NULL
for(i in 1:100){
x<-sample(population,100)
n100[i]<-mean(x)
}
hist(n100)
plot(density(n100))
mean(n100)
##################
n3<-NULL
for(i in 1:100){
x<-sample(population,3)
n3[i]<-mean(x)
}
plot(density(n3))
mean(n100)
```
## Apr. 21: Hypothesis Testing
### 信賴區間的計算
```r=
#calculating CI from a normal distribution
#upper bound: point estimate + 1.96*s/sqrt(n)
#lower bound: point estimate - 1.96*s/sqrt(n)
point<-5
s<-2
n<-40
#95% CI
point+qnorm(0.975)*s/sqrt(n)
point-qnorm(0.975)*s/sqrt(n)
point+qnorm(0.025)*s/sqrt(n)
#80% CI
point+qnorm(0.9)*s/sqrt(n)
point-qnorm(0.9)*s/sqrt(n)
point+qnorm(0.1)*s/sqrt(n)
#population percentage
#upper bound: point estimate + 1.96*sqrt(p(1-p)/n)
#lower bound: point estimate - 1.96*sqrt(p(1-p)/n)
```
---
| | do not reject H~0~ | reject H~0~ in favor of H~A~ |
| ---- | ---- | ---- |
| **H~0~ True** | okay | Type 1 error |
| **H~A~ True** | Type 2 error | okay |
#### Decision Error
- **Type1 Error: H~0~ is True but we reject H~0~ in favor of H~A~**
If making a Type 1 Error is dangerous or especially costly, we should choose a **small** significance level
- **Type2 Error: H~A~ is False but we do not reject H~0~**
If making a Type 2 Error is dangerous or especially costly, we should choose a **higher** significance level
## May 5: One-sample Tests
```r=
# t 分配
# used when we rarely know σ
# As the sample size increases, t distribution converges to the standard normal
# df are the number of values that are free to vary
#calculating CI from a t distribution
point<-5
s<-2
n<-25
#95% CI
# df 為自由度 ( n - 1)
point+qt(0.975, df=24)*s/sqrt(n)
point-qt(0.975, df=24)*s/sqrt(n)
point+qt(0.025, df=24)*s/sqrt(n)
```
## May 12: Two-sample Tests
### p value
- **p-value** is a way of quantifying the strength of the
evidence against H~0~ and in favor of H~A~
- If p-value is **small**, we reject H~0~

```r
#calculating a p-value from a normal distribution
z<-(7.42-7)/(1.75/sqrt(110))
pnorm(z)
1-pnorm(z) #one-sided
(1-pnorm(z))*2 #two-sided
#calculating a p-value from a t-distribution
t<-(7.42-7)/(1.75/sqrt(20))
1-pt(t,df=20-1)
```

### one-sample test
```r
#one-sample test (one-sided)
#H0: mu=5
#Ha: mu>5
# t.test(觀察值, 虛無假設 H0,"greater" or "two.sided" )
t.test(QoG_Data$atop_number,mu=5,alternative = "greater")
#one-sample test (two-sided)
#H0: mu=5
#Ha: mu != 5
t.test(QoG_Data$atop_number,mu=5,alternative = "two.sided")
```
## May 19: Linear Regressions
```r
#indepedent samples
#H0: mu1-mu2=0
#H1: mu1-mu2=/=0
m1<-mean(teds2012$like_tsai[teds2012$gender=="M"],na.rm=T)
n1<-length(na.omit(teds2012$like_tsai[teds2012$gender=="M"]))
s1<-sd(teds2012$like_tsai[teds2012$gender=="M"],na.rm=T)
m2<-mean(teds2012$like_tsai[teds2012$gender=="F"],na.rm=T)
n2<-length(na.omit(teds2012$like_tsai[teds2012$gender=="F"]))
s2<-sd(teds2012$like_tsai[teds2012$gender=="F"],na.rm=T)
s<-sqrt(s1^2/n1+s2^2/n2)
(m1-m2-0)/s
(1-pnorm((m1-m2-0)/s))*2
##
t.test(teds2012$like_tsai[teds2012$gender=="M"],
teds2012$like_tsai[teds2012$gender=="F"],mu=0,
alternative="two.sided")
```
```r
#paired samples
#H0: mu_diff=0
#H1: mu_diff>0
diff<-teds2012$like_ma-teds2012$like_wu
m<-mean(diff,na.rm=T)
s<-sd(diff,na.rm=T)
n<-length(na.omit(diff))
(m-0)/(s/sqrt(n))
pnorm((m-0)/(s/sqrt(n)))
1-pnorm((m-0)/(s/sqrt(n)))
###
t.test(teds2012$like_ma, teds2012$like_wu,
mu=0,alternative="greater",
paired=T)
```
## May. 26: Linear Regressions
```r
############### multiple regressions
model1<-lm(wdi_puhe~polity2+gdppc_ln, data=puhe)
summary(model1)
confint(model1, level=.95)
nobs(model1)
plot(model1)
############### regressions with dummy variables
puhe$regime<-NA
puhe$regime[puhe$polity2>0]<-1
puhe$regime[puhe$polity2<=0]<-0
model2<-lm(wdi_puhe~regime+gdppc_ln, data=puhe)
summary(model2)
plot(puhe$gdppc_ln[puhe$regime==1],puhe$wdi_puhe[puhe$regime==1], xlab="log GDP pc",
ylab="Public Health Expenditure (% GDP pc)",ylim=c(0,12),
pch=20,col="#F8766D")
points(puhe$gdppc_ln[puhe$regime==0],puhe$wdi_puhe[puhe$regime==0],
col="#00C094",pch=20)
abline(a=-4.4170,b=0.8246,col="#00C094")
abline(a=-4.4170+1.5883,b=0.8246,col="#F8766D")
###############
model4<-lm(wdi_puhe~regime+gdppc_ln+regime:gdppc_ln, data=puhe)
summary(model4)
plot(puhe$gdppc_ln[puhe$regime==1],puhe$wdi_puhe[puhe$regime==1], xlab="log GDP pc",
ylab="Public Health Expenditure (% GDP pc)",ylim=c(0,12),
pch=20,col="#F8766D")
points(puhe$gdppc_ln[puhe$regime==0],puhe$wdi_puhe[puhe$regime==0],
col="#00C094",pch=20)
abline(a=1.68497,b=0.06896,col="#00C094")
abline(a=1.68497-6.64448,b=0.06896+1.00320,col="#F8766D")
```
## Jun. 9
卡方檢定
```r
#chi-square test
#H0: independent
#Ha: not independent
campaign<-c(41,108,81,124,263,161,302,475,156)
campaign.m<-matrix(campaign,nrow=3)
chisq.test(campaign.m)
chisq.test(campaign.m)$expected
```