Skip to content

Instantly share code, notes, and snippets.

@kawasaki2013
Last active November 19, 2018 14:37
Show Gist options
  • Select an option

  • Save kawasaki2013/dc88cd5859d29a30740bc0ec0e1f21a4 to your computer and use it in GitHub Desktop.

Select an option

Save kawasaki2013/dc88cd5859d29a30740bc0ec0e1f21a4 to your computer and use it in GitHub Desktop.
「テキストデータの統計科学入門」サポートサイト

# Chapter 5: Zipf's-law plots (rank vs. frequency).

ABE <- read.csv("http://mjin.doshisha.ac.jp/iwanami/data/ABE.csv", header = TRUE, row.names = 1)
FUKUDA <- read.csv("http://mjin.doshisha.ac.jp/iwanami/data/FUKUDA.csv", header = TRUE, row.names = 1)
par(mfrow = c(1, 2), mar = c(4.5, 4, 1, 1))
# Raw rank-frequency plot, then the log-log plot; Zipf's law predicts an
# approximately straight line on the log-log scale.
plot(seq_len(nrow(FUKUDA)), FUKUDA[, 1], xlab = "ランク", ylab = "頻度")
plot(log(seq_len(nrow(FUKUDA))), log(FUKUDA[, 1]), xlab = "ランクの対数", ylab = "頻度の対数")
fukuda.lm <- lm(log(FUKUDA[, 1]) ~ log(seq_len(nrow(FUKUDA))))
# BUG FIX: the original passed `lw = 2`, which is not a graphical parameter
# and was silently ignored; `lwd = 2` is the intended line width.
abline(fukuda.lm, lwd = 2)

# Computing the K characteristic value (Table 5.2).

sum(FUKUDA)  # total number of tokens (延べ語数)
# Frequency spectrum: column 1 = frequency class, column 2 = number of types.
fukuda <- data.frame(table(FUKUDA))
# The book has the reader open a data sheet with fix() and convert the first
# column (a factor) to numeric by hand; do the same conversion
# programmatically so the script runs unattended and reproducibly.
fukuda[, 1] <- as.numeric(as.character(fukuda[, 1]))
sum(fukuda[, 2])  # number of distinct types (異なり語数)

# Calculation on p. 56: the K characteristic value from the frequency
# spectrum (fukuda[,1] = frequency class, fukuda[,2] = number of types).

sum(fukuda[,1]^2*fukuda[,2])  # second moment S2
#[1] 168941
# K = 10^4 * (S2 - S1) / S1^2, plugged in with S2 = 168941 and S1 = 3423
# (the token total, as the general form two lines below shows).
10^4*(168941-3423)/3423^2
#[1] 141.2640
# The same value computed directly from the spectrum.
10^4*(sum(fukuda[,1]^2*fukuda[,2])-sum(fukuda[,1]*fukuda[,2]))/sum(fukuda[,1]*fukuda[,2])^2
#[1] 141.2640

# p. 57: the same K computation for the ABE data.

abe <- data.frame(table(ABE))
# Convert the factor frequency column to numeric (replaces the interactive
# fix() edit described in the book, so the script runs unattended).
abe[, 1] <- as.numeric(as.character(abe[, 1]))
sum(abe[, 1]^2 * abe[, 2])          # second moment S2
#[1] 260965
sum(abe[, 1] * abe[, 2])            # token total S1
#[1] 4343
10^4 * (sum(abe[, 1]^2 * abe[, 2]) - sum(abe[, 1] * abe[, 2])) / sum(abe[, 1] * abe[, 2])^2
#[1] 136.0549

# p. 67: rank-frequency curves for the ABE data.

abet<-ABE
# Columns: ランク = rank (1..n), 度数 = the frequencies taken from abet.
abe<-data.frame(ランク=1:nrow(abet),度数=abet)
par(mfrow=c(1,2))
plot(abe,type="l")       # raw scale
plot(log(abe),type="l")  # log-log scale

# Frequency spectrum of the ABE data and its K characteristic value.
abeD <- data.frame(table(abet))
# Convert the factor frequency column to numeric (replaces the interactive
# fix() edit described in the book).
abeD[, 1] <- as.numeric(as.character(abeD[, 1]))
sum(abeD[, 2])                  # number of types
#[1] 1221
sum(abeD[, 1] * abeD[, 2])      # number of tokens
#[1] 4343
1221/4343                       # type-token ratio
#[1] 0.2811421
sum(abeD[, 1]^2 * abeD[, 2])    # second moment
#[1] 260965
10^4 * (260965 - 4343) / 4343^2 # K characteristic value
#[1] 136.0549
 

# TF-IDF weighting.
#
# x: a frequency matrix (the book's Table 4 layout; N = nrow(x) rows,
#    I = ncol(x) columns). For each column i, df is the number of rows with
#    a positive count in that column, and every entry of the column is
#    weighted by log(N / df) — exactly the book's computation.
# Returns: list(TD = weighted matrix, same dimensions as x).
TFIDF <- function(x) {
  N <- nrow(x)
  I <- ncol(x)
  TD <- matrix(0, N, I)
  # df per column, vectorized (the original's redundant inner brace block
  # counted the same thing with an explicit loop).
  df <- colSums(x > 0)
  for (i in seq_len(I)) {
    TD[, i] <- x[, i] * log(N / df[i])
  }
  list(TD = TD)
}

# Table 4 example data: 5 rows x 4 columns, entered row by row.
hyou4 <- rbind(
  c(1, 0, 2, 1),
  c(3, 2, 1, 0),
  c(0, 1, 0, 2),
  c(0, 3, 2, 1),
  c(0, 2, 3, 0)
)

TFIDF(hyou4)

# Chapter 6: networks — directed graph from an adjacency matrix.

# NOTE(review): install.packages() at the top of a script re-installs on
# every run; consider guarding with requireNamespace(). Left as in the book.
install.packages("igraph");library(igraph)
test<-matrix(c(0,0,1,0,1,0,0,0,1,0,0,0,1,1,0,0),4,4)
# Wrapping in () both assigns and prints the igraph object.
(test.g<-graph.adjacency(test))
V(test.g)$name<-c("A","B","C","D")
plot(test.g,vertex.label=V(test.g)$name,layout=layout.circle)

# Undirected graph from a symmetric adjacency matrix.

test2 <- matrix(c(0,1,1,1,1,0,0,1,1,0,0,0,1,1,0,0), 4, 4)
test2.g <- graph.adjacency(test2, mode = "undirected")
V(test2.g)$name <- c("A", "B", "C", "D")
# BUG FIX: the original labelled this plot with V(test.g)$name — the vertex
# names of the *directed* graph from the previous section — instead of this
# graph's own names.
plot(test2.g, vertex.label = V(test2.g)$name, layout = layout.circle)

# pp. 69-70: co-occurrence network of the FUKUDA data.
# (The original had the bare text "p.69~p.70" here, a syntax error.)

fukudaNbi <- read.csv("http://mjin.doshisha.ac.jp/iwanami/data/fukudabi.csv", header = FALSE)
fukudaNbi[86:87, ]
# Keep only word pairs that co-occur more than once.
fukuda2 <- fukudaNbi[fukudaNbi[, 3] > 1, ]
fuku.g <- graph.data.frame(fukuda2)
E(fuku.g)$weight <- fukuda2[, 3]
tkplot(fuku.g, vertex.label = V(fuku.g)$name, vertex.size = 1,
       layout = layout.fruchterman.reingold, edge.label = E(fuku.g)$weight)

#7.3.1 Basic statistics and histograms

IOScomma<-read.csv("http://mjin.doshisha.ac.jp/iwanami/data/IOScomma.csv",head=TRUE,row.names=1)
# Rows 1-20 and 21-40 are the two authors' texts (izumi / okamoto below);
# column 4 is the "と、" percentage — see the later box-plot and scatter-plot
# sections that label these columns.
izumi<-IOScomma[1:20,4]
okamoto<-IOScomma[21:40,4]
mean(izumi)	# mean
var(izumi)	# variance
min(izumi)	# minimum
max(izumi)	# maximum
range(izumi)	# data range
summary(izumi)	# quartiles and mean

summary(okamoto)
mean(okamoto)
var(okamoto)
min(okamoto)
max(okamoto)
range(okamoto)

# Histograms

hist(izumi)			# draw the histogram
hist(izumi,plot = FALSE)	# return the bin counts without plotting

hist(okamoto)			# draw the histogram
hist(okamoto,plot = FALSE)	# return the bin counts without plotting

# 7.3.2 Box plots: the two authors side by side, with the raw observations
# jittered on top. (Restored: this code had been collapsed into a comment.)
temp <- data.frame(泉鏡花 = izumi, 岡本綺堂 = okamoto)
boxplot(temp, col = "lightblue", cex = 1.5)
points(c(rep(1, 20), rep(2, 20)) + rnorm(40, 0, 0.1),
       c(temp[, 1], temp[, 2]), col = "gray", cex = 0.8, pch = 4)

# 7.3.3 Scatter plot of the "と、" rate vs. the "た、" rate, labelled by text
# name, with dashed reference lines. (Restored from a collapsed comment line.)
par(mfrow = c(1, 1), mar = c(4.1, 4.1, 1, 1))
plot(IOScomma[, 4], IOScomma[, 13], type = "n",
     xlab = "「と、」の百分率", ylab = "「た、」の百分率")
text(IOScomma[, 4], IOScomma[, 13], rownames(IOScomma))
lines(c(8.3, 8.3), c(-1, 6), lty = 2, col = 2)
lines(c(min(IOScomma[, 4]) - 1, 8.3), c(1.45, 1.45), lty = 2, col = 2)

# 7.4 Variable selection. For every column of x, decompose the total sum of
# squares (SST) over the 60 texts into within-author (SSW, three groups of
# 20) and between-author (SSB = SST - SSW) parts, and rank the columns by
# the B/W ratio. (Restored from a collapsed comment line.)
hensa <- function(x) {
  nc <- ncol(x)
  kekka <- matrix(0, nc, 4)
  rownames(kekka) <- colnames(x)
  for (i in 1:nc) {
    iSS <- var(x[1:20, i]) * 19    # group 1 sum of squares
    oSS <- var(x[21:40, i]) * 19   # group 2 sum of squares
    sSS <- var(x[41:60, i]) * 19   # group 3 sum of squares
    SST <- var(x[, i]) * 59        # total sum of squares
    SSW <- iSS + oSS + sSS
    kekka[i, 1] <- SST
    kekka[i, 2] <- SSW
    kekka[i, 3] <- SST - SSW
    kekka[i, 4] <- kekka[i, 3] / SSW
  }
  kekka <- kekka[sort.list(kekka[, 4], decreasing = TRUE), ]
  colnames(kekka) <- c("SS", "SSw", "SSb", "B/W")
  list(kekka = kekka)
}
round(hensa(IOScomma[, -22])$kekka, 2)

# Box plots of the six variables with the largest B/W ratios, one panel per
# variable, three author groups each. (Restored from a collapsed comment line.)
par(mfrow = c(2, 3), mar = c(3, 2, 3, 2))
boxplot(data.frame(泉 = IOScomma$と.[1:20], 岡本 = IOScomma$と.[21:40], 島崎 = IOScomma$と.[41:60]), main = "「と、」", col = "lightblue")
boxplot(data.frame(泉 = IOScomma$を.[1:20], 岡本 = IOScomma$を.[21:40], 島崎 = IOScomma$を.[41:60]), main = "「を、」", col = "lightblue")
boxplot(data.frame(泉 = IOScomma$の.[1:20], 岡本 = IOScomma$の.[21:40], 島崎 = IOScomma$の.[41:60]), main = "「の、」", col = "lightblue")
boxplot(data.frame(泉 = IOScomma$た.[1:20], 岡本 = IOScomma$た.[21:40], 島崎 = IOScomma$た.[41:60]), main = "「た、」", col = "lightblue")
boxplot(data.frame(泉 = IOScomma$が.[1:20], 岡本 = IOScomma$が.[21:40], 島崎 = IOScomma$が.[41:60]), main = "「が、」", col = "lightblue")
boxplot(data.frame(泉 = IOScomma$で.[1:20], 岡本 = IOScomma$で.[21:40], 島崎 = IOScomma$で.[41:60]), main = "「で、」", col = "lightblue")

# Chapter 8: probability distributions — fitting a Poisson model to observed
# class frequencies. (Restored from a collapsed comment line.)
kekka <- matrix(0, 6, 4)
colnames(kekka) <- c("実測度数", "実測相対度数", "推測度数", "推測相対度数")
kekka[, 1] <- c(466, 541, 391, 172, 64, 15)
kekka[, 2] <- kekka[, 1] / sum(kekka[, 1])
X <- 1:6
EX <- sum(X * kekka[, 2])
# The Poisson variable is X - 1, so lambda = E[X] - 1.
ramuda <- EX - 1
for (i in 1:6) {
  if (i == 1) {
    kekka[i, 4] <- ((ramuda^(i - 1)) / prod(1)) * exp(-ramuda)
  } else {
    kekka[i, 4] <- ((ramuda^(i - 1)) / prod(1:(i - 1))) * exp(-ramuda)
  }
}
kekka[, 3] <- round(kekka[, 4] * sum(kekka[, 1]))
round(kekka, 4)

# Normal density curves for several parameter choices, with annotated arrows.
# (Restored from a collapsed comment line. The annotations say 分散
# "variance" while dnorm's third argument is the sd; values passed as given.)
par(mfrow = c(1, 1), mar = c(4.1, 2, 1, 1), bty = "l")
x <- seq(-4, 4, 0.05)
curve(dnorm(x, 0, 0.5), -4, 4, ylab = "", type = "l", col = "gray", lwd = 3)
arrows(0.3, 0.7, 1.2, 0.7, length = 0.1, angle = 20, code = 1)
text(2.2, 0.7, "平均=0,分散=0.5", cex = 1.2)
points(x, dnorm(x, 0, 1), type = "l", col = "blue", lwd = 3)
arrows(0.3, 0.38, 1.2, 0.38, length = 0.1, angle = 20, code = 1)
text(2.2, 0.38, "平均=0,分散=1 ", cex = 1.2)
curve(dnorm(x, 0, 1.5), add = TRUE, col = "green", lwd = 3)
arrows(0.3, 0.26, 1.2, 0.26, length = 0.1, angle = 20, code = 1)
text(2.2, 0.26, "平均=0,分散=1.5", cex = 1.2)
curve(dnorm(x, -2, 1), add = TRUE, col = "orange", lwd = 3)
arrows(-2, 0.4, -2.5, 0.45, length = 0.1, angle = 20, code = 1)
text(-2.3, 0.48, "平均=-2,分散が1", cex = 1.2)

legend(-3.8, 0.8, c("平均0,分散0.5",
"平均0,分散1", "平均0,分散1.5", "平均-2,分散1"), lwd = 2,
       col = c("gray", "blue", "green", "orange"), cex = 1.2, bty = "n")

# Log-normal density curves. dlnorm's parameters are the mean and sd of
# log(X), as the final annotation points out. (Reformatted: the original had
# several statements jammed onto each line.)

par(mfrow = c(1, 1), mar = c(4.1, 2, 1, 1), bty = "l")
x <- seq(0, 8, 0.05)
curve(dlnorm(x, 0, 0.5), 0, 8, type = "l", col = "gray", lwd = 3, ylab = "")
arrows(1.1, 0.7, 1.4, 0.7, length = 0.1, angle = 20, code = 1)

points(x, dlnorm(x, 0.5, 0.5), type = "l", col = "blue", lwd = 3)
arrows(2, 0.4, 2.4, 0.4, length = 0.1, angle = 20, code = 1)

curve(dlnorm(x, 1.5, 0.5), add = TRUE, col = "green", lwd = 3)
arrows(4.5, 0.18, 4.9, 0.18, length = 0.1, angle = 20, code = 1)

curve(dlnorm(x, 1.5, 0.1), add = TRUE, col = "orange", lwd = 3)
arrows(5, 0.5, 5.3, 0.5, length = 0.1, angle = 20, code = 1)

text(2.5, 0.7, "平均=0,標準偏差=0.5", cex = 1.2)
text(3.5, 0.4, "平均=0.5,標準偏差=0.5", cex = 1.2)
text(6, 0.18, "平均=1.5,標準偏差=0.5", cex = 1.2)
text(6.8, 0.5, "平均=1.5,対数の標準偏差=0.5", cex = 1.2)
text(7, 0.8, "平均、標準偏差は\n確率変数の対数の\n平均と標準偏差!", cex = 1.2)

# p. 99: sentence-length data for three works, plotted over time.
# BUG FIX: the original line began with the bare text "p.99" (a syntax
# error), passed the broken argument `head=,`, and read the CSV into
# `sen_length` while the plotting code below uses `natsumeSL`; read it under
# the name the code actually uses.
natsumeSL <- read.csv("http://mjin.doshisha.ac.jp/iwanami/data/SL.csv", header = TRUE, row.names = 1)

matplot(natsumeSL, type = "b", pch = 1:3, xlab = "", ylab = "", axes = FALSE)
axis(1, 1:length(natsumeSL[, 2]), row.names(natsumeSL))
axis(2)
legend(11, 0.15, c("虞美人草", "草枕", "吾輩は猫である"), col = 1:3, lty = 1:3, pch = 1:3, cex = 1.5)

# Chapter 9: likelihood computation.
# bs columns: 1 = class (1..10), 2 = observed counts, 3 = relative
# frequencies; 4/5 = Poisson fitted counts/probabilities, 6/7 = log-normal
# fitted counts/probabilities. (Restored from collapsed comment lines; the
# extraction had also eaten the `*` multiplication operators.)
bs <- matrix(0, 10, 7)
bs[, 1] <- 1:10
bs[, 2] <- c(53, 1988, 3538, 1927, 999, 409, 189, 53, 34, 10)
bs[, 3] <- bs[, 2] / sum(bs[, 2])
Su <- sum(bs[, 2])

# Poisson model: the variable is class - 1, so lambda = mean - 1.
M1mean <- sum(bs[, 1] * bs[, 3])
ramuda <- M1mean - 1
fact <- function(n) { if (n <= 1) 1 else prod(1:n) }  # factorial
for (i in 1:10) {
  bs[i, 5] <- exp(-ramuda) * ramuda^(i - 1) / fact(i - 1)
}
# Equivalently, with the built-in Poisson density:
bs[, 5] <- dpois(bs[, 1] - 1, ramuda)
bs[, 4] <- round(Su * bs[, 5])

# Log-normal model: moments of log(class) under the observed distribution.
x <- bs[, 1]
xln <- log(x)
xlmean <- sum(xln * bs[, 3])
xlvar <- sum(xln^2 * bs[, 3]) - xlmean^2
bs[, 7] <- (2 * pi * xlvar * x^2)^(-1/2) * exp(-(xln - xlmean)^2 / (2 * xlvar))
# Equivalently, with the built-in log-normal density:
bs[, 7] <- dlnorm(bs[, 1], xlmean, sqrt(xlvar))
bs[, 6] <- round(Su * bs[, 7])

round(bs, 3)

# AIC of the Poisson model: -2*loglik + 2*(number of parameters).
# (Restored from collapsed comment lines; `*` operators had been eaten.)
lm1 <- sum(bs[, 2] * log(bs[, 5]))
-2 * lm1 + 2

# AIC of the log-normal model (two parameters).
lm2 <- sum(bs[, 2] * log(bs[, 7]))
-2 * lm2 + 2 * 2

# Compare observed vs. fitted relative frequencies.
par(mar = c(2, 4, 1, 1))
matplot(bs[, c(3, 5, 7)], type = "b", ylab = "相対度数", pch = 1:3, col = c(1, 2, 4))
legend(6, max(bs[, 3]), c("実測値", "ポアソンモデル", "対数正規モデル"), col = c(1, 2, 4), lty = 1:3, pch = 1:3)

# Chapter 10: inference — Welch two-sample t test comparing the two samples.
# (Restored from a collapsed comment line.)
t15 <- c(0.818, 2.160, 1.074, 0.606, 1.571, 1.068, 3.364, 0.534, 0.606, 0.882,
         1.032, 0.789, 0.450, 1.200, 0.708, 0.641, 0.821, 0.538, 0.459, 0.624,
         0.185, 1.077, 0.604, 0.783, 0.172)
s02 <- c(0.653, 0.694, 0.800, 0.513, 0.500, 0.821, 0.418, 1.118, 1.129, 0.778,
         0.470, 0.301, 0.612, 0.642, 0.444, 0.826, 0.628, 0.546, 0.412, 0.500,
         0.470, 0.712, 0.667, 0.381, 0.524, 0.694, 0.577, 0.635, 0.500, 0.333,
         0.282, 0.337)
t.test(t15, s02)

# Two-proportion z test on the AF data for the counts 28 and 5.
# (Reformatted: the original had all statements on one line.)
AF <- read.csv("http://mjin.doshisha.ac.jp/iwanami/data/AFjiritsu.csv", header = TRUE, row.names = 1)
xs <- apply(AF, 2, sum)[1]   # total of column 1
ys <- apply(AF, 2, sum)[2]   # total of column 2
P <- round((28 + 5) / (xs + ys), 4)   # pooled proportion
(28 / xs - 5 / ys) / sqrt(P * (1 - P) * (1 / xs + 1 / ys))

# Row-wise two-proportion z tests.
# data: a two-column count table (one row per word). For each row, the two
# columns' proportions are compared with a pooled-variance z statistic; the
# result keeps the original counts plus z and one-sided p values, sorted by
# z in decreasing order and rounded to 4 digits.
# (Reformatted: the original had the whole function on one line.)
zp <- function(data) {
  x <- data[, 1]
  y <- data[, 2]
  nr <- length(x)
  res <- matrix(0, nr, 2)
  colnames(res) <- c("z値", "p値")
  xsum <- sum(x)
  ysum <- sum(y)
  P <- (x + y) / (xsum + ysum)   # pooled proportion per row
  siguma <- sqrt(P * (1 - P) * (1 / xsum + 1 / ysum))
  xp <- x / xsum
  yp <- y / ysum
  res[, 1] <- abs(xp - yp) / siguma
  res[, 2] <- 1 - pnorm(res[, 1])
  kekka <- data.frame(data, res)
  res <- kekka[sort.list(kekka[, 3], decreasing = TRUE), ]
  list(kekka = round(res, 4))
}
round(head(zp(AF)$kekka, 25), 4)

# Chapter 11: inference and feature extraction — Table 11.3. Fit a Poisson
# model and compute the chi-squared statistic against the fitted
# frequencies. (Restored from a collapsed comment line.)
kekka <- matrix(0, 6, 4)
colnames(kekka) <- c("実測度数", "実測相対度数", "推測度数", "推測相対度数")
kekka[, 1] <- c(466, 541, 391, 172, 64, 15)
kekka[, 2] <- kekka[, 1] / sum(kekka[, 1])
X <- 1:6
EX <- sum(X * kekka[, 2])
ramuda <- EX - 1   # Poisson variable is X - 1
for (i in 1:6) {
  if (i == 1) {
    kekka[i, 4] <- ((ramuda^(i - 1)) / prod(1)) * exp(-ramuda)
  } else {
    kekka[i, 4] <- ((ramuda^(i - 1)) / prod(1:(i - 1))) * exp(-ramuda)
  }
}
kekka[, 3] <- round(kekka[, 4] * sum(kekka[, 1]))
kekka
# Chi-squared statistic, n = 1649 observations.
sum((kekka[, 1] - kekka[, 4] * 1649)^2 / (kekka[, 4] * 1649))

# Contingency table (Tables 11.4/11.5): particle frequencies in two years.
# (Restored from a collapsed comment line.)
table11.5 <- matrix(c(41, 103, 26, 107, 30, 290, 82, 429, 6, 303, 61, 803), 6, 2)
rownames(table11.5) <- c("は", "が", "て", "ら", "に", "その他")
colnames(table11.5) <- c("大正15年", "昭和2年")
chisq.test(table11.5)
# Expected frequencies under independence.
chisq.test(table11.5)$expected

# Table 11.7: 2x2 table, one-sided Fisher exact test.
table11.7 <- matrix(c(3, 1, 1, 3), 2, 2)
fisher.test(table11.7, alternative = "greater")

# p. 143: association measures (phi, contingency coefficient, Cramer's V)
# via the vcd package. (Restored from a collapsed comment line.)
install.packages("vcd"); library(vcd)
assocstats(table11.5)

# Computation for Table 11.12: row-wise Fisher exact tests.
# For each row of x, build a 2-row table (that row's counts vs. the
# remaining column totals), record the exact-test p value, and sort the
# result by the p-value column. (Restored from a collapsed line.)
fisher2 <- function(x) {
  tal <- apply(x, 2, sum)   # column totals
  nr <- nrow(x)
  nc <- ncol(x)
  te <- matrix(0, nr, 1)
  y <- cbind(x, Fisher.p値 = te)
  for (i in 1:nr) {
    temp <- rbind(x[i, ], tal - x[i, ])
    y[i, nc + 1] <- fisher.test(temp)$p.value
    # NOTE(review): this sort is recomputed on every pass; only the final
    # iteration matters. Kept inside the loop to match the book's code.
    yy <- y[sort.list(y[, 4]), ]
  }
  list(kekka = yy)
}

# Apply fisher2 to the noun-frequency data and inspect the smallest p values.
# (Reformatted: the original had all statements on one line.)
AFAmeishi <- read.csv("http://mjin.doshisha.ac.jp/iwanami/data/AFAmeishi.csv", header = TRUE, row.names = 1)
kekka <- fisher2(AFAmeishi)$kekka
head(kekka)

# Chapter 12: principal component analysis.
# (Restored/reformatted: several of these lines had the code collapsed into
# comments or jammed onto one line.)
gaiyou <- read.csv("http://mjin.doshisha.ac.jp/iwanami/data/gaiyou.csv", header = TRUE, row.names = 1)
gaiyou.pc <- prcomp(gaiyou[, 1:50], scale = TRUE)
par(mar = c(rep(2.5, 4)))
biplot(gaiyou.pc, var.axes = FALSE)

# Biplot of PC1 vs. PC3.
x <- gaiyou.pc$rotation[, c(1, 3)]
y <- gaiyou.pc$x[, c(1, 3)]
par(mar = c(rep(2.5, 4)))
biplot(x, y, var.axes = FALSE)

# Second principal component scores per text.
par(mar = c(6, 4, 2, 2))
barplot(gaiyou.pc$x[, 2], las = 2)

# PC2 loadings: the largest and smallest after sorting.
y <- gaiyou.pc$rotation[, 2]
par(mar = c(4, 3, 2, 2))
y1 <- y[sort.list(y, decreasing = TRUE)]  # spelled out (was partial `dec=`)
barplot(y1[c(1:10, 40:50)], las = 2)

# Correspondence analysis (MASS).
library(MASS)
gaiyou.coa <- corresp(gaiyou[, 1:50], 2)
biplot(gaiyou.coa, cex = 1.2)

# The same analysis by hand: SVD of the row-normalized matrix.
gaiyou <- read.csv("http://mjin.doshisha.ac.jp/iwanami/data/gaiyou.csv", header = TRUE, row.names = 1)
gaiyou <- gaiyou / apply(gaiyou, 1, sum)
gaiyou.svd <- svd(gaiyou[, 1:67])
rownames(gaiyou.svd$u) <- rownames(gaiyou)
rownames(gaiyou.svd$v) <- colnames(gaiyou[, 1:67])
biplot(gaiyou.svd$u[, 1:2], gaiyou.svd$v[, 1:2])

# Chapter 13: cluster analysis.

sb <- read.csv("http://mjin.doshisha.ac.jp/iwanami/data/sb2.csv", header = TRUE, row.names = 1)
# Row-normalize to relative frequencies.
sb.p <- sb / apply(sb, 1, sum)

# Classical multidimensional scaling on the Euclidean distance.
# (Restored from a collapsed comment line.)
par(mar = c(4.1, 4.1, 1, 1))
sb.ed <- dist(sb.p[, 1:32])
sb.sdm <- cmdscale(sb.ed, 3)
plot(sb.sdm, type = "n", xlab = "第1固有ベクトル", ylab = "第2固有ベクトル")
text(sb.sdm, rownames(sb.sdm), col = c(rep(1:2, 11)))

# IR (information-radius / symmetrised KL) distance between the rows of x.
# A small constant is added so zero counts survive the logarithms, then each
# row is normalised to a probability vector. Returns an n x n symmetric
# matrix with zero diagonal. (Restored from a collapsed comment line; the
# extraction had also eaten the `*` operators inside the sum.)
myldist <- function(x) {
  x <- x + 1e-006             # guard against log(0)
  x <- (x / apply(x, 1, sum)) # row-normalise
  apply(x, 1, function(y) {   # each row y ...
    apply(x, 1, function(z) { # ... against each row z
      sum((y * log(2 * y / (y + z)) + z * log(2 * z / (y + z))) / 2)
    })
  })
}

# MDS with the IR distance. (This and the following sections were restored
# from collapsed comment lines.)
sb.ld <- myldist(sb.p[, 1:32])
sbld.sdm <- cmdscale(sb.ld, 3)
plot(sbld.sdm, type = "n", xlab = "第1固有ベクトル", ylab = "第2固有ベクトル")
text(sbld.sdm, rownames(sbld.sdm), col = c(rep(1:2, 11)))

# 3-D scatter plot (rgl).
install.packages("rgl"); library(rgl)
sb.ed <- dist(sb.p[, 1:32])
sb.sdm <- cmdscale(sb.ed, 3)
x <- sb.sdm[, 1]; y <- sb.sdm[, 2]; z <- sb.sdm[, 3]
plot3d(x, y, z, type = "n")
text3d(x, y, z, text = rownames(sb.sdm), adj = 0.5)
grid3d(c("x", "y+", "z"))

# 3-D scatter plot with the IR distance.
sb.ld <- myldist(sb.p[, 1:32])
sbld.sdm <- cmdscale(sb.ld, 3)
x <- sbld.sdm[, 1]
# FIX: the original assigned y <- sbld.sdm[,1] and immediately overwrote it
# with column 2; the dead assignment is dropped.
y <- sbld.sdm[, 2]
z <- sbld.sdm[, 3]
plot3d(x, y, z, type = "n")
text3d(x, y, z, text = rownames(sb.sdm), adj = 0.5)
grid3d(c("x", "y+", "z"))

# Hierarchical clustering (Ward) on the Euclidean distance.
par(mar = c(1, 2, 1, 1))
sb.ed <- dist(sb.p[, 1:32])
sb.hc <- hclust(sb.ed, "ward")
plot(sb.hc, hang = -1, main = "")
rect.hclust(sb.hc, k = 2, border = "red")

# Hierarchical clustering on the IR distance.
sb.ld <- as.dist(myldist(sb.p[, 1:32]))
sbld.hc <- hclust(sb.ld, "ward")
plot(sbld.hc, hang = -1, main = "")
rect.hclust(sbld.hc, k = 2, border = "red")

# k-means with 2 clusters; cross-tabulate against the known labels.
sb.km <- kmeans(sb.p[, 1:32], 2)
summary(sb.km)
sb.km$cluster
# FIX: spell out $cluster (the original's $clust relied on partial matching).
table(rep(2:1, 11), sb.km$cluster)

# Neighbour-joining tree (ape).
install.packages("ape")
library(ape)
par(mar = c(rep(0.2, 4)))
plot(nj(dist(sb.p[, 1:32])), type = "u", lab4ut = "axial", cex = 1.5)

# Chapter 14: text classification with classification trees (mvpart).
# (Restored throughout from collapsed comment lines — in particular the
# for-loop below was split across a dead comment and a dangling "}".)
install.packages("mvpart"); library(mvpart)
sb3 <- read.csv("http://mjin.doshisha.ac.jp/iwanami/data/sb3.csv", header = TRUE, row.names = 1)
# size = number of terminal nodes of the tree.
(sb3.rp <- mvpart(y ~ ., sb3, size = 4))

# Prediction: hold out 6 of the 33 texts.
set.seed(1)
sam <- sample(1:33, 6)
train <- sb3[-sam, ]
test <- sb3[sam, ]

sb3.rp2 <- mvpart(y ~ ., train)
sb3.pr <- predict(sb3.rp2, test[, -48], type = "class")
table(test$y, sb3.pr)

# Growing a small "forest": 6 trees on random subsets of rows and columns.
par(mfrow = c(3, 2))
for (i in 1:6) {
  sam <- sample(1:33, 6)
  train <- sb3[-sam, ]
  samv <- sample(1:46, 7)
  train <- cbind(sb3[-sam, samv], y = sb3[-sam, 48])
  sb3.rp2 <- mvpart(y ~ ., train)
}

# Random forest on the full data.
install.packages("randomForest"); library(randomForest)
set.seed(10)
(sb3.rf <- randomForest(y ~ ., sb3))

sam <- sample(1:33, 6)
train <- sb3[-sam, ]
samv <- sample(1:46, 7)
train <- cbind(sb3[-sam, samv], y = sb3[-sam, 48])
mvpart(y ~ ., train, big.pts = TRUE, digits = 2)

sb3.pr <- predict(sb3.rp2, test[, -48], type = "class")
table(test$y, sb3.pr)

# Draw the tree; tadj shifts the group labels slightly downward.
plot(sb3.rp, uniform = TRUE)
text(sb3.rp, all = TRUE, use.n = TRUE, tadj = 1)

# Compositions written by 11 people.
saku11 <- read.csv("http://mjin.doshisha.ac.jp/iwanami/data/saku11.csv", header = TRUE, row.names = 1)
saku11p <- 100 * saku11 / apply(saku11, 1, sum)
y <- rep(LETTERS[1:10], each = 1, times = 11)
saku11y <- cbind(saku11p, y = y)
saku11y2 <- saku11y[sort.list(saku11y$y), ]
randomForest(y ~ ., saku11y2)

saku11y33 <- saku11y2[1:33, ]
saku11y33$y <- rep(LETTERS[1:3], each = 11)
saku.rp <- rpart(y ~ ., saku11y33)
plot(saku.rp, uniform = TRUE)
text(saku.rp, all = TRUE, use.n = TRUE, tadj = 1)

# Time-series analysis chapter: re-read the FUKUDA frequency data.
# (Restored from collapsed comment lines.)
FUKUDA <- read.csv("http://mjin.doshisha.ac.jp/iwanami/data/FUKUDA.csv", header = TRUE, row.names = 1)

# Zipf's-law plots (raw and log-log) with a fitted regression line.
par(mar = c(4.5, 4, 1, 1), mfrow = c(1, 2))
plot(1:nrow(FUKUDA), FUKUDA[, 1], xlab = "ランク", ylab = "頻度")
x <- log(1:nrow(FUKUDA))
y <- log(FUKUDA[, 1])
plot(x, y, xlab = "ランクの対数", ylab = "頻度の対数")
(fukuda.lm <- lm(y ~ x))
# lwd (not lw) is the line-width parameter.
abline(fukuda.lm, lwd = 2, col = "red")

# Stepwise linear regression: predict the first-publication year (y) from
# the other variables. (Reformatted from collapsed lines.)
akuta <- read.csv("http://mjin.doshisha.ac.jp/iwanami/data/akuta250.csv", header = TRUE, row.names = 1)
akuta.lm <- lm(y ~ ., data = akuta)
akuta.lm2 <- step(akuta.lm, trace = 0)

# Prediction on a held-out sample of 10 texts.

set.seed(100)
sam <- sample(1:250, 10)
test <- akuta[sam, ]
train <- akuta[-sam, ]
te.lm <- lm(y ~ ., data = train)
te.lm2 <- step(te.lm, trace = 0)
te.pr <- predict(te.lm2, test[, -40])
temp <- round(cbind(test$y, te.pr, test$y - te.pr), 1)
colnames(temp) <- c("初出年", "予測値", "残差")
temp

# p. 222: the same prediction with a regression tree.
te.tr <- mvpart(y ~ ., data = train)
te.pr <- predict(te.tr, test)
temp <- round(cbind(test$y, te.pr, test$y - te.pr), 1)
colnames(temp) <- c("初出年", "予測値", "残差")
temp

# p. 223: the same prediction with a random forest. (The original line began
# with the bare text "p.223", a syntax error.)
te.rf <- randomForest(y ~ ., data = train)
te.pr <- predict(te.rf, test[, -40])
temp <- round(cbind(test$y, te.pr, test$y - te.pr), 1)
rownames(temp) <- rownames(test)
colnames(temp) <- c("初出年", "予測値", "残差")
temp

# Variable importance from a random forest on the full data.
akuta.rf <- randomForest(y ~ ., data = akuta, important = TRUE)
varImpPlot(akuta.rf)

# BUG FIX: glm's third argument is the family; lower-case `gamma` is the
# gamma *function* and is not a valid family — the Gamma family is spelled
# `Gamma`.
akuta.ga <- glm(y ~ ., train, Gamma)
predict(akuta.ga, test)

# Chapter 16: association-rule mining on questionnaire answers (arules).
# (Restored from a collapsed comment line.)
data1 <- list(
  c("学費", "下げ", "講義", "充実", "はかっ", "欲しい", "適当", "授業", "し", "いる", "思わ", "れる", "先生", "かなり", "居る"),
  c("学費", "もう少し", "安く", "し", "欲しい"),
  c("休み", "期間", "多い", "割", "学費", "高い", "何", "使わ", "いる", "はっきりし", "欲しい"),
  c("授業", "担当", "教員", "生徒", "選ば", "せ", "欲しい"),
  c("学費", "削減", "あと", "ロッカー"),
  c("個人", "ロッカー", "作っ", "下さい", "自動車", "通学", "認め"),
  c("学費", "軽減"),
  c("学費", "もっと", "安く", "し", "欲しい"),
  c("クーラー", "つけ", "欲しい"),
  c("学費", "安く", "し", "下さい")
)
install.packages("arules"); library(arules)
data1.tran <- as(data1, "transactions")
#class(data.tran)
#as(data.tran,"matrix")
as(data1.tran, "data.frame")
itemFrequencyPlot(data1.tran, type = "absolute")
data.ap <- apriori(data1.tran)
# NOTE(review): SORT() is the pre-1.0 arules spelling; current versions use
# sort(). Kept as in the book — verify against the installed arules version.
inspect(head(SORT(data.ap, by = "support"), n = 10))

# Rules whose right-hand side contains a given word.
data.ap1 <- subset(data.ap, subset = rhs %in% "欲しい" & lift > 2)
data.ap1 <- subset(data.ap, subset = rhs %in% "安く")
inspect(SORT(data.ap1)[1:3])

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment