應用統計筆記

# 應用統計筆記 ## 區間估計 [觀念參考](http://web.cjcu.edu.tw/~jdwu/stat01/lect003.pdf) ## 變數類型 ```graphviz digraph hierarchy { nodesep=1.0 node [color=Black, fontname=Courier,shape=box] edge [color=Black] "all variables"->{"numerical/qualitative" "categorical/qualitative"} "numerical/qualitative"->{ratio interval} "categorical/qualitative"->nominal {rank=same;nominal ordinal } } ``` **Type of variable** * Nominal(名目): A nominal variable is a type of variable that is used to name, label or categorize particular attributes that are being measured, Like ++Gender++。 * Ordinal(順序): Ordinal variable is a type of measurement variable that takes values with an order or rank.**The difference in the rank of an ordinal variable is not equal.** * Interval(區間): The interval variable is a measurement variable that is used to define **values measured along a scale**, with **each point placed at an equal distance from one another**. Like ++Temperature++, ++Time++, ++IQ Test++ (zero has no meaning). * Ratio(比例): The only difference between the ratio variable and interval variable is that **the ratio variable already has a zero value** (zero has meaning). ### Population and sample * Population: Populations can include people, but other examples include objects, events, businesses, and so on. * Population Parameters: A characteristic of **an entire population**. * Sample statistics: A statistic is a characteristic of **a sample**. * case/observation: Objects in the sample collection * sample: A sample is the specific group that you will collect data from. ## Mar. 17 ### barplot 長條圖（用在類別資料上） ```r #bar chart t1<-table(lg$gender) barplot(t1) barplot(t1,main="Gender in TW legislature", xlab="Gender",ylab="Frequency") barplot(t1,main="Gender in TW legislature", xlab="Gender",ylab="Frequency", col="red",border="red") barplot(t1,main="Gender in TW legislature", xlab="Gender",ylab="Frequency", col=c("#FF8C00","#4169E1")) #names.arg 給類別名稱 barplot(t1,main="Gender in TW legislature", xlab="Gender",ylab="Frequency", col=c("#FF8C00","#4169E1"), names.arg = c("Female","Male")) #相對次數分配表 t2<-prop.table(t1) barplot(t2,main="Gender in TW legislature", xlab="Gender",ylab="Proportion") ``` ### histogram 直方圖（用在數值資料上）（連續的） ```r #histogram hist(lg$age) hist(lg$age,main="Age in TW legislature", xlab="Age") # breaks 用來設定長條的數量 hist(lg$age,main="Age in TW legislature", xlab="Age",breaks=30) range(lg$age) # 得到最小跟最大值 # breaks 如果給c(...) 代表 25到30畫一個,30到35畫一個... # （但給的範圍必須小於最小值和大於最大值） hist(lg$age,main="Age in TW legislature", xlab="Age",breaks=c(25,30,35,40,45,50,55,70,75,80)) #seq(25,80,5) = c(25,30,35,40,45,50,55,70,75,80) hist(lg$age,main="Age in TW legislature", xlab="Age",breaks=seq(25,80,5)) hist(lg$age,main="Age in TW legislature", xlab="Age",breaks=seq(25,80,5),col="red") hist(lg$age,main="Age in TW legislature", xlab="Age",breaks=seq(25,80,5),col=c("#BC8F8F","#6A5ACD")) hist(lg$age[lg$party=="KMT"],main="Age in TW legislature", xlab="Age",breaks=c(25,30,35,40,45,50,55,60,65,70,75,80)) hist(lg$age[lg$party=="DPP"],main="Age in TW legislature", xlab="Age",breaks=c(25,30,35,40,45,50,55,60,65,70,75,80)) #abline hist(lg$age,main="Age in TW legislature", xlab="Age",breaks=c(25,30,35,40,45,50,55,60,65,70,75,80)) mean(lg$age) # abline 在已經有的圖上畫直線 # v 畫直的 # h 畫橫的 abline(v=mean(lg$age)) # lty 是設定線的形狀 # 4 for ·-·- abline(v=mean(lg$age),col="red",lwd=3,lty=4) abline(h=15,col="red",lwd=5,lty=4) hist(lg$age,main="Age in TW legislature", xlab="Age",breaks=c(25,30,35,40,45,50,55,60,65,70,75,80), freq=F) # freq 設定false 為density plot ``` ## Mar. 24 : Normal Distribution and 31 : Correlations and Regressions (pnorm) ### boxplot 盒型圖 ```r #boxplot 盒型圖 boxplot(lg$age) summary(lg$age) #顯示 Min. 1st Qu. Median Mean 3rd Qu. max. boxplot(lg$age,main="Age in TW legislature",ylab="Age") boxplot(lg$age[lg$party=="KMT"],main="Age of KMT legislators",ylab="Age") #前面是用來畫圖的變項 "~" 後面是用來顯示分組的變項 boxplot(lg$age~lg$party) boxplot(lg$age~lg$party,main="Age in TW legislature",ylab="Age", xlab="Party") boxplot(lg$age~lg$party,main="Age in TW legislature",ylab="Age", xlab="Party",col=c("green","blue","gray","yellow", "gray","orange")) ``` ![](https://i.imgur.com/Hl6vIh6.png) ```r largeparty<-lg[lg$party%in%c("KMT","DPP"),] boxplot(largeparty$age~largeparty$party) #用as.character 來讓我的類別只剩下DPP KMT，因為原本的型態還留著其他政黨的變量(NONE, NPP, othrer, PFP) boxplot(largeparty$age~as.character(largeparty$party)) ``` ### scatterplot 散佈圖 ```r #scatterplot plot(x=lg$age,y=lg$term) plot(x=lg$age,y=lg$term, main="Age and term in TW legislature", xlab="Age",ylab="Term") plot(x=lg$age,y=lg$term,main="Age and term in TW legislature", xlab="Age",ylab="Term",col="red") plot(x=lg$age,y=lg$term,main="Age and term in TW legislature", xlab="Age",ylab="Term",col="red",pch=3) # pch 是用來設定點的形狀 # 3 is + # 16 is · plot(x=lg$age[lg$party=="KMT"],y=lg$term[lg$party=="KMT"],main="Age and term in TW legislature", xlab="Age",ylab="Term",col="blue",pch=16) #points 是將圖來畫在前一個plot裡（疊圖），如下圖 points(x=lg$age[lg$party=="DPP"],y=lg$term[lg$party=="DPP"], col="dark green",pch=16) ``` ![](https://i.imgur.com/9lEBS5v.png) ### norm ```r #pnorm: get the propability of -Inf to Z pnorm(2) # 0.9772499 pnorm(2)-pnorm(0) # 0.9772499 - 0.5 1-pnorm(1.5) # get z to Inf #pnorm(x,平均,標準差) pnorm(130,100,15) #0.9772499 ############### #qnorm: what is the Z-score of the pth quantile of the normal distribution? # qnorm 是用來得到我們的z值 # 例如我們用pnorm(2)會得到負無窮大到2的機率值 = 0.9772499 # 而用 qnorm(0.9772499) 會得到接近2的值 = 2.1 qnorm(0.9772499) # 2 qnorm(0.5) #qnorm(機率,平均,標準差) qnorm(0.08, 100, 15) # 78.92393 ############### #rnorm: generate random numbers from a normal distribution # 從標準常態分佈中抽五個數出來 n5<-rnorm(5) hist(n5) n100<-rnorm(100) hist(n100) n10000<-rnorm(10000) hist(n10000) ############### #generate Q-Q plot to compare a normal distribution with the observed data hist(lg$age) qqnorm(lg$age) qqline(lg$age) ``` ## Apr. 7 (Foundation for Inference) ```r #correlation #畫出資料的散佈圖 #plot(x, y) plot(x=puhe$polity2,y=puhe$wdi_puhe) #算出兩者的相關係數 # use="complete" --> 確保遺漏值並不會放入計算，也就是說當兩個變量都沒有遺漏，才會列入計算 cor(puhe$polity2,puhe$wdi_puhe,use="complete") #regression 迴歸分析 # lm = linear model #lm(y~x) m1<-lm(wdi_puhe~polity2,data=puhe) summary(m1) # 回歸模型的細節 abline(m1,col="red")# 將回歸線畫在散佈圖上 plot(m1) ############ #regression y<-c(10.7, 11.1, 11.5, 11.7, 12, 12, 12.1, 12.7, 12.5, 13.3, 12.5) x<-c(3.6, 3.5, 4.9, 5.9, 5.6, 4.9, 5.6, 8.5, 7.7, 7, 6) ybar<-mean(y) ybar-y xbar<-mean(x) xbar-x sum((y-ybar)*(x-xbar)) sum((x-xbar)^2) #slope sum((y-ybar)*(x-xbar))/sum((x-xbar)^2) ybar-0.4066376*xbar reg1<-lm(y~x) summary(reg1) ``` --- ## Apr. 14: Estimating Unknown Parameters ### CLT (中央極限定理) ```r # runif(觀察值,區間) population<-runif(10000,0,10) hist(population) # 抽樣每次抽出30個樣本 s1<-sample(population,30) # 計算樣本平均數(30 個觀察值) m1<-mean(s1) s2<-sample(population,30) m2<-mean(s2) s3<-sample(population,30) m3<-mean(s3) samplemean <- c(m1,m2,m3) #直方圖 hist(samplemean) #畫出機率密度出來 plot(density(samplemean)) mean(samplemean) #################### #looping n30<-NULL for(i in 1:100){ x<-sample(population,30) n30[i]<-mean(x) } hist(n30) plot(density(n30)) mean(n30) #################### n100<-NULL for(i in 1:100){ x<-sample(population,100) n100[i]<-mean(x) } hist(n100) plot(density(n100)) mean(n100) ################## n3<-NULL for(i in 1:100){ x<-sample(population,3) n3[i]<-mean(x) } plot(density(n3)) mean(n100) ``` ## Apr. 21: Hypothesis Testing ### 信賴區間的計算 ```r= #calculating CI from a normal distribution #upper bound: point estimate + 1.96*s/sqrt(n) #lower bound: point estimate - 1.96*s/sqrt(n) point<-5 s<-2 n<-40 #95% CI point+qnorm(0.975)*s/sqrt(n) point-qnorm(0.975)*s/sqrt(n) point+qnorm(0.025)*s/sqrt(n) #80% CI point+qnorm(0.9)*s/sqrt(n) point-qnorm(0.9)*s/sqrt(n) point+qnorm(0.1)*s/sqrt(n) #population percentage #upper bound: point estimate + 1.96*sqrt(p(1-p)/n) #lower bound: point estimate - 1.96*sqrt(p(1-p)/n) ``` --- | | do not reject H~0~ | reject H~0~ in favor of H~A~ | | ---- | ---- | ---- | | **H~0~ True** | okay | Type 1 error | | **H~A~ True** | Type 2 error | okay | #### Decision Error - **Type1 Error: H~0~ is True but we reject H~0~ in favor of H~A~** If making a Type 1 Error is dangerous or especially costly, we should choose a **small** significance level - **Type2 Error: H~A~ is False but we do not reject H~0~** If making a Type 2 Error is dangerous or especially costly, we should choose a **higher** significance level ## May 5: One-sample Tests ```r= # t 分配 # used when we rarely know σ # As the sample size increases, t distribution converges to the standard normal # df are the number of values that are free to vary #calculating CI from a t distribution point<-5 s<-2 n<-25 #95% CI # df 為自由度 ( n - 1) point+qt(0.975, df=24)*s/sqrt(n) point-qt(0.975, df=24)*s/sqrt(n) point+qt(0.025, df=24)*s/sqrt(n) ``` ## May 12: Two-sample Tests ### p value - **p-value** is a way of quantifying the strength of the evidence against H~0~ and in favor of H~A~ - If p-value is **small**, we reject H~0~ ![](https://i.imgur.com/lfwIG24.png) ```r #calculating a p-value from a normal distribution z<-(7.42-7)/(1.75/sqrt(110)) pnorm(z) 1-pnorm(z) #one-sided (1-pnorm(z))*2 #two-sided #calculating a p-value from a t-distribution t<-(7.42-7)/(1.75/sqrt(20)) 1-pt(t,df=20-1) ``` ![](https://i.imgur.com/T85rCJh.png) ### one-sample test ```r #one-sample test (one-sided) #H0: mu=5 #Ha: mu>5 # t.test(觀察值, 虛無假設 H0,"greater" or "two.sided" ) t.test(QoG_Data$atop_number,mu=5,alternative = "greater") #one-sample test (two-sided) #H0: mu=5 #Ha: mu != 5 t.test(QoG_Data$atop_number,mu=5,alternative = "two.sided") ``` ## May 19: Linear Regressions ```r #indepedent samples #H0: mu1-mu2=0 #H1: mu1-mu2=/=0 m1<-mean(teds2012$like_tsai[teds2012$gender=="M"],na.rm=T) n1<-length(na.omit(teds2012$like_tsai[teds2012$gender=="M"])) s1<-sd(teds2012$like_tsai[teds2012$gender=="M"],na.rm=T) m2<-mean(teds2012$like_tsai[teds2012$gender=="F"],na.rm=T) n2<-length(na.omit(teds2012$like_tsai[teds2012$gender=="F"])) s2<-sd(teds2012$like_tsai[teds2012$gender=="F"],na.rm=T) s<-sqrt(s1^2/n1+s2^2/n2) (m1-m2-0)/s (1-pnorm((m1-m2-0)/s))*2 ## t.test(teds2012$like_tsai[teds2012$gender=="M"], teds2012$like_tsai[teds2012$gender=="F"],mu=0, alternative="two.sided") ``` ```r #paired samples #H0: mu_diff=0 #H1: mu_diff>0 diff<-teds2012$like_ma-teds2012$like_wu m<-mean(diff,na.rm=T) s<-sd(diff,na.rm=T) n<-length(na.omit(diff)) (m-0)/(s/sqrt(n)) pnorm((m-0)/(s/sqrt(n))) 1-pnorm((m-0)/(s/sqrt(n))) ### t.test(teds2012$like_ma, teds2012$like_wu, mu=0,alternative="greater", paired=T) ``` ## May. 26: Linear Regressions ```r ############### multiple regressions model1<-lm(wdi_puhe~polity2+gdppc_ln, data=puhe) summary(model1) confint(model1, level=.95) nobs(model1) plot(model1) ############### regressions with dummy variables puhe$regime<-NA puhe$regime[puhe$polity2>0]<-1 puhe$regime[puhe$polity2<=0]<-0 model2<-lm(wdi_puhe~regime+gdppc_ln, data=puhe) summary(model2) plot(puhe$gdppc_ln[puhe$regime==1],puhe$wdi_puhe[puhe$regime==1], xlab="log GDP pc", ylab="Public Health Expenditure (% GDP pc)",ylim=c(0,12), pch=20,col="#F8766D") points(puhe$gdppc_ln[puhe$regime==0],puhe$wdi_puhe[puhe$regime==0], col="#00C094",pch=20) abline(a=-4.4170,b=0.8246,col="#00C094") abline(a=-4.4170+1.5883,b=0.8246,col="#F8766D") ############### model4<-lm(wdi_puhe~regime+gdppc_ln+regime:gdppc_ln, data=puhe) summary(model4) plot(puhe$gdppc_ln[puhe$regime==1],puhe$wdi_puhe[puhe$regime==1], xlab="log GDP pc", ylab="Public Health Expenditure (% GDP pc)",ylim=c(0,12), pch=20,col="#F8766D") points(puhe$gdppc_ln[puhe$regime==0],puhe$wdi_puhe[puhe$regime==0], col="#00C094",pch=20) abline(a=1.68497,b=0.06896,col="#00C094") abline(a=1.68497-6.64448,b=0.06896+1.00320,col="#F8766D") ``` ## Jun. 9 卡方檢定 ```r #chi-square test #H0: independent #Ha: not independent campaign<-c(41,108,81,124,263,161,302,475,156) campaign.m<-matrix(campaign,nrow=3) chisq.test(campaign.m) chisq.test(campaign.m)$expected ```