StanでGLM
WPからの引っ越し記事なのでレイアウトが崩れてます。
GLM図鑑 by Stan
- 線形回帰モデル
- ロバスト回帰モデル
- ポアソン回帰モデル
- ロジステック回帰モデル
- 二項回帰モデル
- 多項ロジスティクス回帰モデル
- ガンマ回帰モデル
- ベータ回帰モデル
- 対数正規回帰モデル
- 負の二項回帰モデル
- ゼロ過剰ポアソン回帰モデル
- ゼロ過剰ガンマ回帰モデル
- ゼロ過剰ベータ回帰モデル
道具立て
必用に応じてお使いください。
library(tidyverse) library(rstan) library(extraDistr) library(metRology) library(gamlss.dist) library(rtdists) rstan_options(auto_write = TRUE) options(mc.cores = parallel::detectCores())
信用区間、事後予測区間については、少しコードは違いますがこちらを参照ください。
線形回帰モデル
- 誤差構造:正規分布
- リンク関数:恒等リンク関数
サンプルデータ
N <- 100 X <- rnorm(n = N, mean = 0, sd = 1) Y <- rnorm(n = N, mean = 2*X, sd = 1) df <- data_frame(X, Y) standata <- list(N = nrow(df), Y = df$Y, X = df$X)
Stanコード
stancode <- " data{ int N; vector[N] Y; vector[N] X; } parameters{ real b0; real b1; real <lower=0> sigma; } model{ for (i in 1:N) Y[i] ~ normal(b0 + b1*X[i], sigma); } generated quantities{ vector[N] pred_Y; real log_lik[N]; //pred_Y[i] = normal_rng(b0 + b1*X1[i] + b2*-mean(X2), sigma);X2の平均を入れておけば //X1が動いたときのYの変化が確認できる for (i in 1:N){ pred_Y[i] = normal_rng(b0 + b1*X[i] , sigma); log_lik[i] = normal_lpdf(Y[i] | b0 + b1*X[i], sigma); } } "
事後予測分布
fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4) summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)] rstan::extract(fit)$pred_Y %>% data.frame() %>% apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% t() %>% `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% cbind(df) %>% ggplot() + geom_point(aes(X, Y), col = "#4169e1") + geom_line(aes(X, y = pred50), colour = "#4169e1") + geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") + geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") + theme_classic() + theme(axis.text = element_text(size = 15), axis.title = element_text(size = 20))
ロバスト回帰モデル
- 誤差構造:スチューデントのt分布
- リンク関数:恒等リンク関数
サンプルデータ
N <- 100 X <- rnorm(n = N, mean = 0, sd = 1) Y <- metRology::rt.scaled(n = N, df = 2, mean = 2* X, sd = 1) df <- data_frame(X, Y) standata <- list(N = nrow(df), Y = df$Y, X = df$X)
Stanコード
stancode <- " data{ int N; vector[N] Y; vector[N] X; } parameters{ real b0; real b1; real <lower=0> sigma; real <lower=1> nu; } model{ for (i in 1:N) Y[i] ~ student_t(nu, b0 + b1*X[i], sigma); } generated quantities{ vector[N] pred_Y; real log_lik[N]; for (i in 1:N){ pred_Y[i] = student_t_rng(nu, b0 + b1*X[i] , sigma); log_lik[i] = student_t_lpdf(Y[i] | nu, b0 + b1*X[i], sigma); } } "
事後予測分布
fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4) summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)] rstan::extract(fit)$pred_Y %>% data.frame() %>% apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% t() %>% `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% cbind(df) %>% ggplot() + geom_point(aes(X, Y), col = "#4169e1") + geom_line(aes(X, y = pred50), colour = "#4169e1") + geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") + geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") + theme_classic() + theme(axis.text = element_text(size = 15), axis.title = element_text(size = 20))
ポアソン回帰モデル
- 誤差構造:ポアソン分布
- リンク関数:対数リンク関数
サンプルデータ
N <- 100 X <- rnorm(n = N, mean = 0, sd = 1) Y <- rpois(n = N, lambda = exp(0.5*X)) df <- data_frame(X, Y) standata <- list(N = nrow(df), Y = df$Y, X = df$X)
Stanコード
stancode <- " data{ int N; int Y[N]; //離散変数なのでintで渡す vector[N] X; } parameters{ real b0; real b1; } model{ for (i in 1:N) Y[i] ~ poisson_log(b0 + b1*X[i]); } generated quantities{ vector[N] pred_Y; real log_lik[N]; for (i in 1:N){ pred_Y[i] = poisson_log_rng(b0 + b1*X[i]); log_lik[i] = poisson_log_lpmf(Y[i] | b0 + b1*X[i]); } } "exo
事後予測分布
fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4) summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)] rstan::extract(fit)$pred_Y %>% data.frame() %>% apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% t() %>% `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% cbind(df) %>% ggplot() + geom_point(aes(X, Y), col = "#4169e1") + geom_line(aes(X, y = pred50), colour = "#4169e1") + geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") + geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") + theme_classic() + theme(axis.text = element_text(size = 15), axis.title = element_text(size = 20))
ロジステック回帰モデル
- 誤差構造:ベルヌイ分布
- リンク関数:ロジットリンク関数(逆ロジット変換)
サンプルデータ
N <- 100 X <- rnorm(n = N, mean = 0, sd = 1) Y <- extraDistr::rbern(n = N, 1/(1 + exp(-(2*X)))) df <- data_frame(X, Y) standata <- list(N = nrow(df), Y = df$Y, X = df$X)
Stanコード
stancode <- " data{ int N; int Y[N]; //離散変数なのでintで渡す vector[N] X; } parameters{ real b0; real b1; } transformed parameters { vector <lower=0, upper=1>[N] p; for (i in 1:N){ p[i] = inv_logit(b0 + b1*X[i]); } } model{ for (i in 1:N) Y[i] ~ bernoulli(p[i]); } generated quantities{ vector[N] pred_Y; real log_lik[N]; for (i in 1:N){ pred_Y[i] = bernoulli_rng(p[i]); log_lik[i] = bernoulli_lpmf(Y[i] | p[i]); } } " #ロジットの逆関数の分布からサンプリングできるbernoulli_logit(θ)でもよい。 #model{ # for (i in 1:N) # Y[i] ~ bernoulli_logit(b0 + b1*X[i]); #}
確信区間分布
fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4) summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)] rstan::extract(fit)$p %>% #pred_Yではなくp data.frame() %>% apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% t() %>% `colnames<-`(c("cred2.5", "cred10", "cred50", "cred90", "cred97.5")) %>% cbind(df) %>% ggplot() + geom_point(aes(X, Y), col = "#4169e1") + geom_line(aes(X, y = cred50), colour = "#4169e1") + geom_ribbon(aes(X, ymin = cred2.5, ymax = cred97.5), alpha = 0.1, fill = "#4169e1") + geom_ribbon(aes(X, ymin = cred10, ymax = cred90), alpha = 0.3, fill = "#4169e1") + theme_classic() + theme(axis.text = element_text(size = 15), axis.title = element_text(size = 20))
二項回帰モデル
- 誤差構造:二項分布
- リンク関数:ロジットリンク関数(逆ロジット変換)
サンプルデータ
df <- read.csv("data4a.csv", header = TRUE) %>% mutate(ratio = Y/N) standata <- list(N = nrow(df), K = df$N, Y = df$Y, X = df$X) #データは北大久保先生のHPよりお借りしました。 <a href="http://hosho.ees.hokudai.ac.jp/~kubo/ce/IwanamiBook.html">http://hosho.ees.hokudai.ac.jp/~kubo/ce/IwanamiBook.html</a>
Stanコード
stancode <- " data{ int N; int K[N]; int Y[N]; vector[N] X; } parameters{ real b0; real b1; } transformed parameters { vector[N] p; for (i in 1:N) p[i] = inv_logit(b0 + b1*X[i]); } model{ for(i in 1:N) Y[i] ~ binomial(K[i], p[i]); } generated quantities{ vector[N] pred_Y; real log_lik[N]; for (i in 1:N){ pred_Y[i] = binomial_rng(K[i], p[i]); log_lik[i] = binomial_lpmf(Y[i] |K[i], p[i]); } } "
信用区間分布&事後予測分布
rstan::extract(fit)$p %>% data.frame() %>% apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% t() %>% `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% cbind(df) %>% ggplot() + geom_point(aes(X, ratio), col = "#4169e1") + geom_line(aes(X, y = pred50), colour = "#4169e1") + geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") + geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") + theme_classic() + theme(axis.text = element_text(size = 15), axis.title = element_text(size = 20))
fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4) summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)] rstan::extract(fit)$pred_Y %>% data.frame() %>% apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% t() %>% `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% cbind(df) %>% ggplot() + geom_point(aes(X, Y), col = "#4169e1") + geom_line(aes(X, y = pred50), colour = "#4169e1") + geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") + geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") + theme_classic() + theme(axis.text = element_text(size = 15), axis.title = element_text(size = 20))
多項ロジスティクス回帰モデル
- 誤差構造:多項分布
- リンク関数:ロジットリンク関数(逆ロジット変換)
サンプルデータ
X <- cbind(1, df[,-2]) standata <- list(N = nrow(df), D = ncol(X), K = length(unique(df$Y)), X = X, Y = df$Y)
Stanコード
stancode <- " data { int N; int D; int K; matrix[N,D] X; int<lower=1, upper=K> Y[N]; } transformed data { vector[D] Zero_vec; Zero_vec = rep_vector(0, D); } parameters { matrix[D,K-1] beta_raw; } transformed parameters { matrix[D,K] beta; matrix[N,K] mu; beta = append_col(Zero_vec, beta_raw); mu = X*beta; } model { for (n in 1:N) //Y[n] ~ categorical(softmax(mu[n,]')); と同じ Y[n] ~ categorical_logit(mu[n,]'); } generated quantities{ vector[N] pred_Y; real log_lik[N]; for(i in 1:N){ pred_Y[i] = categorical_logit_rng(mu[i,]'); log_lik[i] = categorical_logit_lpmf(Y[i] | mu[i,]'); } } "
事後分布
fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 1) summary(fit)$summary[c("beta[1,2]", "beta[1,3]", "beta[2,2]", "beta[2,3]"), c(1, 4, 8, 10)] mean 2.5% 97.5% Rhat beta[1,2] 0.5724148 0.3611360 0.7907723 0.9992367 beta[1,3] -0.4137401 -0.6906688 -0.1503502 0.9990503 beta[2,2] -0.0252887 -0.2288971 0.1781029 0.9992045 beta[2,3] -0.1228497 -0.3827433 0.1365426 1.0023173
ガンマ回帰モデル
- 誤差構造:ガンマ分布
- リンク関数:対数リンク関数
サンプルデータ
N <- 100 X <- rnorm(n = N, mean = 0, sd = 1) mu <- exp(1+0.3*X) phi <- 1 #集中度パラメタ shape <- mu^2/phi #形状(shape)パラメタ rate <- mu/phi #比率(rate)パラメタ Y <- rgamma(n = N, shape = shape, rate = rate) df <- data_frame(X, Y) standata <- list(N = nrow(df), Y = df$Y, X = df$X)
Stanコード
stancode <- " data{ int N; vector[N] Y; vector[N] X; } parameters{ real b0; real b1; real <lower=0> phi; } transformed parameters { vector <lower=0>[N] shape; vector <lower=0>[N] rate; vector <lower=0>[N] mu; for (i in 1 : N){ mu[i] = exp(b0 + b1*X[i]); shape[i] = pow(mu[i], 2) / phi; rate[i] = mu[i] / phi; } } model{ for(i in 1:N){ Y[i] ~ gamma(shape[i], rate[i]); } } generated quantities{ vector[N] pred_Y; real log_lik[N]; for (i in 1:N){ pred_Y[i] = gamma_rng(shape[i], rate[i]); log_lik[i] = gamma_lpdf(Y[i] | shape[i], rate[i]); } } "
事後予測分布
rstan::extract(fit)$pred_Y %>% data.frame() %>% apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% t() %>% `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% cbind(df) %>% ggplot() + geom_point(aes(X, Y), col = "#4169e1") + geom_line(aes(X, y = pred50), colour = "#4169e1") + geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") + geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") + theme_classic() + theme(axis.text = element_text(size = 15), axis.title = element_text(size = 20))
ベータ回帰モデル
- 誤差構造:ベータ分布
- リンク関数:ロジットリンク関数
サンプルデータ
N <- 100 X <- rnorm(n = N, mean = 0, sd = 1) mu <- 1/(1 + exp(-(0.5 + 2*X))) phi <- 0.1 a <- mu/phi b <- (1 - mu)/phi Y <- rbeta(n = N, a, b) df <- data_frame(X, Y) standata <- list(N = nrow(df), Y = df$Y, X = df$X)
Stanコード
stancode <- " data{ int N; vector<lower=0, upper=1>[N] Y; vector[N] X; } parameters{ real b0; real b1; real<lower=0> phi; } transformed parameters { vector <lower=0>[N] a; vector <lower=0>[N] b; vector[N] mu; for(i in 1:N){ mu[i] = inv_logit(b0 + b1*X[i]); a[i] = phi * mu[i] ; b[i] = phi * (1 - mu[i]) ; } } model{ for(i in 1:N){ Y[i] ~ beta(a[i], b[i]); } } generated quantities{ vector[N] pred_Y; real log_lik[N]; for (i in 1:N){ pred_Y[i] = beta_rng(a[i], b[i]); log_lik[i] = beta_lpdf(Y[i] | a[i], b[i]); } } "
事後予測分布
fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4) summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)] rstan::extract(fit)$pred_Y %>% data.frame() %>% apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% t() %>% `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% cbind(df) %>% ggplot() + geom_point(aes(X, Y), col = "#4169e1") + geom_line(aes(X, y = pred50), colour = "#4169e1") + geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") + geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") + theme_classic() + theme(axis.text = element_text(size = 15), axis.title = element_text(size = 20))
対数正規回帰モデル
- 誤差構造:対数正規分布
- リンク関数:恒等リンク関数
サンプルデータ
X <- rnorm(n = N, mean = 0, sd = 1) Y <- rlnorm(n = N, meanlog = 2*X, sdlog = 1) df <- data_frame(X, Y) standata <- list(N = nrow(df), Y = df$Y, X = df$X)
Stanコード
stancode <- " data{ int N; vector[N] Y; vector[N] X; } parameters{ real b0; real b1; real<lower=0> sigma; } model{ for(i in 1:N){ Y[i] ~ lognormal(b0 + b1*X[i], sigma); } } generated quantities{ vector[N] pred_Y; real log_lik[N]; for (i in 1:N){ pred_Y[i] = lognormal_rng(b0 + b1*X[i], sigma); log_lik[i] = lognormal_lpdf(Y[i] | b0 + b1*X[i], sigma); } } "
事後予測分布
fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4) summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)] rstan::extract(fit)$pred_Y %>% data.frame() %>% apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% t() %>% `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% cbind(df) %>% ggplot() + geom_point(aes(X, Y), col = "#4169e1") + geom_line(aes(X, y = pred50), colour = "#4169e1") + geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") + geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") + theme_classic() + theme(axis.text = element_text(size = 15), axis.title = element_text(size = 20))
負の二項回帰モデル
- 誤差構造:負の二項分布
- リンク関数:対数リンク関数
サンプルデータ
N <- 100 X <- rnorm(n = N, mean = 0, sd = 1) Y <- rnbinom(n = N, mu = 4, size = 1) df <- data_frame(X, Y) standata <- list(N = nrow(df), Y = df$Y, X = df$X)
Stanコード
stancode <- " data{ int N; int Y[N]; vector[N] X; } parameters{ real b0; real b1; real<lower=0> alpha; } transformed parameters { vector[N] mu; for (i in 1:N){ mu[i] = exp(b0 + b1*X[i]); } } model{ for(i in 1:N){ Y[i] ~ neg_binomial_2(mu[i], alpha); } } generated quantities{ vector[N] pred_Y; real log_lik[N]; for (i in 1:N){ pred_Y[i] = neg_binomial_2_rng(mu[i], alpha); log_lik[i] = neg_binomial_2_lpmf(Y[i] | mu[i], alpha); } } " #neg_binomial_2_log(mu)で直接対数変換せずにサンプリングしてもいい #model{ #for(i in 1:N){ # Y[i] ~ neg_binomial_2_log(b0 + b1*X[i], alpha); # } #}
事後予測分布
fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4) summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)] rstan::extract(fit)$pred_Y %>% data.frame() %>% apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% t() %>% `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% cbind(df) %>% ggplot() + geom_point(aes(X, Y), col = "#4169e1") + geom_line(aes(X, y = pred50), colour = "#4169e1") + geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") + geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") + theme_classic() + theme(axis.text = element_text(size = 15), axis.title = element_text(size = 20))
ゼロ過剰ポアソン回帰モデル
サンプルデータ
N <- 100 X <- rnorm(n = N, mean = 0, sd = 1) Y <- rpois(n = N, lambda = exp(0.8*X)) df <- data_frame(X, Y) standata <- list(N = nrow(df), Y = df$Y, X = df$X)
Stanコード
stancode <- " data{ int N; int Y[N]; vector[N] X; } parameters{ real a0; real a1; real b0; real b1; } model{ vector[N] theta; vector[N] lambda; for(i in 1:N){ theta[i] = inv_logit(a0 + a1*X[i]); lambda[i] = exp(b0 + b1*X[i]); if(Y[i]==0) target += log_sum_exp(log(1-theta[i]), log(theta[i]) + poisson_lpmf(0| lambda[i])); else target += log(theta[i]) + poisson_lpmf(Y[i]| lambda[i]); } } "
事後予測分布
fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4) summary(fit)$summary[c("a0", "a1", "b0", "b1"), c(1, 4, 8, 10)] mean 2.5% 97.5% Rhat a0 20.02717748 4.7168032 32.8796014 1.005368 a1 1.60100422 -5.0632239 9.3640185 1.001425 b0 -0.09690579 -0.3120695 0.1158623 1.000140 b1 0.89315760 0.7568918 1.0262100 1.000192
ゼロ過剰ガンマ回帰モデル
サンプルデータ
N <- 100 X <- rnorm(n = N, mean = 0, sd = 1) mu <- exp(1 + 2*X) phi <- 1 shape <- mu^2/phi rate <- mu/phi Y <- ifelse(extraDistr::rbern(N, 1/(1 + exp(-(1 + 2*X)))) < 1, 0, rgamma(N, shape = shape, rate = rate)) df <- data_frame(X, Y) standata <- list(N = nrow(df), Y = df$Y, X = df$X)
Stanコード
stancode <- " data{ int N; real Y[N]; vector[N] X; } parameters{ real a0; real a1; real b0; real b1; real <lower=0> phi; } transformed parameters{ vector <lower=0> [N] shape; vector <lower=0> [N] rate; vector <lower=0> [N] mu; vector <lower=0> [N] theta; for (i in 1:N){ mu[i] = exp(a0 + a1*X[i]); shape[i] = pow(mu[i], 2)/phi; rate[i] = mu[i]/phi; theta[i] = inv_logit(b0 + b1*X[i]); } } model{ for( i in 1 : N){ if ( Y[i] == 0 ){ target += bernoulli_lpmf(0| theta[i]); }else{ target += bernoulli_lpmf(1| theta[i]) + gamma_lpdf(Y[i]| shape[i], rate[i]); } } } "
事後分布
fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4) summary(fit)$summary[c("a0", "a1", "b0", "b1"), c(1, 4, 8, 10)] mean 2.5% 97.5% Rhat a0 3.7284473 3.4287448 4.0449271 1.0010297 a1 0.6528083 0.4604300 0.8524688 1.0000245 b0 1.5223662 0.8335802 2.3302660 0.9999711 b1 2.7099104 1.7278128 3.9400902 0.9993525
ゼロ過剰ベータ回帰モデル
サンプルデータ
N <- 100 X <- rnorm(n = N, mean = 0, sd = 1) phi = 0.1 theta <- 1/(1 + exp(-(0.5 + 0.5*X))) a <- theta/phi b <- (1 - theta)/phi Y <- ifelse(extraDistr::rbern(n = N, theta) == 0, 0, rbeta(n = N, a, b)) df <- data_frame(X, Y) standata <- list(N = nrow(df), Y = df$Y, X = df$X)
Stanコード
stancode <- " data{ int N; vector<lower=0, upper=1>[N] Y; vector[N] X; } parameters{ real a0; real a1; real b0; real b1; real <lower=0> phi; } transformed parameters{ vector[N] theta1; vector[N] theta2; vector <lower=0>[N] a; vector <lower=0>[N] b; for(i in 1:N){ theta1[i] = inv_logit(a0 + a1*X[i]); theta2[i] = inv_logit(b0 + b1*X[i]); a[i] = theta2[i] * phi; b[i] = (1 - theta2[i]) * phi; } } model { for(i in 1:N){ if(Y[i] == 0){ target += bernoulli_lpmf(0| theta1[i]); }else{ target += bernoulli_lpmf(1| theta1[i]) + beta_lpdf(Y[i]| a[i], b[i]); } } } "
事後分布
fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4) summary(fit)$summary[c("a0", "a1", "b0", "b1"), c(1, 4, 8, 10)] mean 2.5% 97.5% Rhat a0 0.5072671 0.09042595 0.9397354 1.000795 a1 0.5365692 0.07548989 1.0071972 1.000013 b0 0.6072267 0.44246342 0.7689407 1.000972 b1 0.5019813 0.33206450 0.6705875 1.000977
参考文献および参考サイト
- Stan モデリング言語: ユーザーガイド・リファレンスマニュアル
- StanとRで学ぶベイズ統計モデリング
- Stan超初心者入門
- Stan中級編
- Stan Advent Calender 2017(day 19)
kai