Rのこと。

記事は引っ越し作業中。2023年中までに引っ越しを完了させてブログは削除予定

StanでGLM

WPからの引っ越し記事なのでレイアウトが崩れてます。

GLM図鑑 by Stan

  • 線形回帰モデル
  • ロバスト回帰モデル
  • ポアソン回帰モデル
  • ロジステック回帰モデル
  • 二項回帰モデル
  • 多項ロジスティクス回帰モデル
  • ガンマ回帰モデル
  • ベータ回帰モデル
  • 対数正規回帰モデル
  • 負の二項回帰モデル
  • ゼロ過剰ポアソン回帰モデル
  • ゼロ過剰ガンマ回帰モデル
  • ゼロ過剰ベータ回帰モデル

道具立て

必用に応じてお使いください。

library(tidyverse)  
library(rstan)  
library(extraDistr) 
library(metRology)  
library(gamlss.dist)
library(rtdists)  
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

信用区間、事後予測区間については、少しコードは違いますがこちらを参照ください。

線形回帰モデル

  1. 誤差構造:正規分布
  2. リンク関数:恒等リンク関数

サンプルデータ

N <- 100
X <- rnorm(n = N, mean = 0, sd = 1)
Y  <- rnorm(n = N, mean = 2*X, sd = 1)
df <- data_frame(X, Y)

standata <- list(N = nrow(df),
                 Y = df$Y,
                 X = df$X)

Stanコード

stancode <- "
data{
  int N;
  vector[N] Y;
  vector[N] X;
}

parameters{
  real b0;
  real b1;
  real <lower=0> sigma;
}

model{
  for (i in 1:N)
    Y[i] ~ normal(b0 + b1*X[i], sigma);
}

generated quantities{
  vector[N] pred_Y;
  real log_lik[N];

  //pred_Y[i] = normal_rng(b0 + b1*X1[i] + b2*-mean(X2), sigma);X2の平均を入れておけば
  //X1が動いたときのYの変化が確認できる
  for (i in 1:N){
    pred_Y[i] = normal_rng(b0 + b1*X[i] , sigma);
    log_lik[i] = normal_lpdf(Y[i] | b0 + b1*X[i], sigma);
  }
}
"

事後予測分布

fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4)
summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)]

rstan::extract(fit)$pred_Y %>%
  data.frame() %>% 
  apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% 
  t() %>% 
  `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% 
  cbind(df) %>% 
  ggplot() + 
  geom_point(aes(X, Y), col = "#4169e1") + 
  geom_line(aes(X, y = pred50), colour = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") +
  theme_classic() +
  theme(axis.text  = element_text(size = 15),
        axis.title = element_text(size = 20))

ロバスト回帰モデル

  1. 誤差構造:スチューデントのt分布
  2. リンク関数:恒等リンク関数

サンプルデータ

N <- 100
X <- rnorm(n = N, mean = 0, sd = 1)
Y <- metRology::rt.scaled(n = N, df = 2, mean = 2* X, sd = 1)

df <- data_frame(X, Y)

standata <- list(N = nrow(df),
                 Y = df$Y,
                 X = df$X)

Stanコード

stancode <- "
data{
  int N;
  vector[N] Y;
  vector[N] X;
}

parameters{
  real b0;
  real b1;
  real <lower=0> sigma;
  real <lower=1> nu;
}

model{
  for (i in 1:N)
    Y[i] ~ student_t(nu, b0 + b1*X[i], sigma);
}

generated quantities{
  vector[N] pred_Y;
  real log_lik[N];

  for (i in 1:N){
    pred_Y[i] = student_t_rng(nu, b0 + b1*X[i] , sigma);
    log_lik[i] = student_t_lpdf(Y[i] | nu, b0 + b1*X[i], sigma);
  }
}
"

事後予測分布

fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4)
summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)]

rstan::extract(fit)$pred_Y %>%
  data.frame() %>% 
  apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% 
  t() %>% 
  `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% 
  cbind(df) %>% 
  ggplot() + 
  geom_point(aes(X, Y), col = "#4169e1") + 
  geom_line(aes(X, y = pred50), colour = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") +
  theme_classic() +
  theme(axis.text  = element_text(size = 15),
        axis.title = element_text(size = 20))

ポアソン回帰モデル

  1. 誤差構造:ポアソン分布
  2. リンク関数:対数リンク関数

サンプルデータ

N <- 100
X <- rnorm(n = N, mean = 0, sd = 1)
Y <- rpois(n = N, lambda = exp(0.5*X))

df <- data_frame(X, Y)

standata <- list(N = nrow(df),
                 Y = df$Y,
                 X = df$X)

Stanコード

stancode <- "
data{
  int N;
  int Y[N]; //離散変数なのでintで渡す
  vector[N] X;
}

parameters{
  real b0;
  real b1;
}

model{
  for (i in 1:N)
    Y[i] ~ poisson_log(b0 + b1*X[i]);
}

generated quantities{
  vector[N] pred_Y;
  real log_lik[N];

  for (i in 1:N){
    pred_Y[i] = poisson_log_rng(b0 + b1*X[i]);
    log_lik[i] = poisson_log_lpmf(Y[i] | b0 + b1*X[i]);
  }
}
"exo

事後予測分布

fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4)
summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)]

rstan::extract(fit)$pred_Y %>%
  data.frame() %>% 
  apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% 
  t() %>% 
  `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% 
  cbind(df) %>% 
  ggplot() + 
  geom_point(aes(X, Y), col = "#4169e1") + 
  geom_line(aes(X, y = pred50), colour = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") +
  theme_classic() +
  theme(axis.text  = element_text(size = 15),
        axis.title = element_text(size = 20))

ロジステック回帰モデル

  1. 誤差構造:ベルヌイ分布
  2. リンク関数:ロジットリンク関数(逆ロジット変換)

サンプルデータ

N <- 100
X <- rnorm(n = N, mean = 0, sd = 1)
Y <- extraDistr::rbern(n = N, 1/(1 + exp(-(2*X))))

df <- data_frame(X, Y)

standata <- list(N = nrow(df),
                 Y = df$Y,
                 X = df$X)

Stanコード

stancode <- "
data{
  int N;
  int Y[N]; //離散変数なのでintで渡す
  vector[N] X;
}

parameters{
  real b0;
  real b1;
}

transformed parameters {
  vector <lower=0, upper=1>[N] p;
  
  for (i in 1:N){
    p[i] = inv_logit(b0 + b1*X[i]);
  }
}

model{
  for (i in 1:N)
    Y[i] ~ bernoulli(p[i]);
}

generated quantities{
  vector[N] pred_Y;
  real log_lik[N];

  for (i in 1:N){
    pred_Y[i] = bernoulli_rng(p[i]);
    log_lik[i] = bernoulli_lpmf(Y[i] | p[i]);
  }
}
"
#ロジットの逆関数の分布からサンプリングできるbernoulli_logit(θ)でもよい。
#model{
#  for (i in 1:N)
#    Y[i] ~ bernoulli_logit(b0 + b1*X[i]);
#}

確信区間分布

fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4)
summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)]

rstan::extract(fit)$p %>% #pred_Yではなくp
  data.frame() %>% 
  apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% 
  t() %>% 
  `colnames<-`(c("cred2.5", "cred10", "cred50", "cred90", "cred97.5")) %>% 
  cbind(df) %>% 
  ggplot() + 
  geom_point(aes(X, Y), col = "#4169e1") + 
  geom_line(aes(X, y = cred50), colour = "#4169e1") +
  geom_ribbon(aes(X, ymin = cred2.5, ymax = cred97.5), alpha = 0.1, fill = "#4169e1") +
  geom_ribbon(aes(X, ymin = cred10, ymax = cred90), alpha = 0.3, fill = "#4169e1") +
  theme_classic() +
  theme(axis.text  = element_text(size = 15),
        axis.title = element_text(size = 20))

二項回帰モデル

  1. 誤差構造:二項分布
  2. リンク関数:ロジットリンク関数(逆ロジット変換)

サンプルデータ

df <- read.csv("data4a.csv", header  = TRUE) %>% 
  mutate(ratio = Y/N)

standata <- list(N = nrow(df),
                 K = df$N,
                 Y = df$Y,
                 X = df$X)
                 
#データは北大久保先生のHPよりお借りしました。
<a href="http://hosho.ees.hokudai.ac.jp/~kubo/ce/IwanamiBook.html">http://hosho.ees.hokudai.ac.jp/~kubo/ce/IwanamiBook.html</a>

Stanコード

stancode <- "
data{
  int N;
  int K[N]; 
  int Y[N]; 
  vector[N] X;
}

parameters{
  real b0;
  real b1;
}

transformed parameters {
  vector[N] p;

  for (i in 1:N)
    p[i] = inv_logit(b0 + b1*X[i]);

}

model{
  for(i in 1:N)
    Y[i] ~ binomial(K[i], p[i]);
}

generated quantities{
  vector[N] pred_Y;
  real log_lik[N];

  for (i in 1:N){
    pred_Y[i] = binomial_rng(K[i], p[i]);
    log_lik[i] = binomial_lpmf(Y[i] |K[i], p[i]);
  }
}
"

信用区間分布&事後予測分布

rstan::extract(fit)$p %>%
  data.frame() %>% 
  apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% 
  t() %>% 
  `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% 
  cbind(df) %>% 
  ggplot() + 
  geom_point(aes(X, ratio), col = "#4169e1") + 
  geom_line(aes(X, y = pred50), colour = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") +
  theme_classic() +
  theme(axis.text  = element_text(size = 15),
        axis.title = element_text(size = 20))
fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4)
summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)]

rstan::extract(fit)$pred_Y %>%
  data.frame() %>% 
  apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% 
  t() %>% 
  `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% 
  cbind(df) %>% 
  ggplot() + 
  geom_point(aes(X, Y), col = "#4169e1") + 
  geom_line(aes(X, y = pred50), colour = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") +
  theme_classic() +
  theme(axis.text  = element_text(size = 15),
        axis.title = element_text(size = 20))

多項ロジスティクス回帰モデル

  1. 誤差構造:多項分布
  2. リンク関数:ロジットリンク関数(逆ロジット変換)

サンプルデータ

X <- cbind(1, df[,-2])
standata <- list(N = nrow(df),
                 D = ncol(X),
                 K = length(unique(df$Y)),
                 X = X,
                 Y = df$Y)

Stanコード

stancode <- "
data {
  int N;
  int D;
  int K;
  matrix[N,D] X;
  int<lower=1, upper=K> Y[N];
}

transformed data {
  vector[D] Zero_vec;
  Zero_vec = rep_vector(0, D);
}

parameters {
  matrix[D,K-1] beta_raw;
}

transformed parameters {
  matrix[D,K] beta;
  matrix[N,K] mu;
  beta = append_col(Zero_vec, beta_raw);
  mu = X*beta;
}

model {
  for (n in 1:N)
    //Y[n] ~ categorical(softmax(mu[n,]')); と同じ
    Y[n] ~ categorical_logit(mu[n,]');
}
generated quantities{
  vector[N] pred_Y;
  real log_lik[N];

  for(i in 1:N){
    pred_Y[i] = categorical_logit_rng(mu[i,]');
    log_lik[i] = categorical_logit_lpmf(Y[i] | mu[i,]');
  }
}
"

事後分布

fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 1)
summary(fit)$summary[c("beta[1,2]", "beta[1,3]", "beta[2,2]", "beta[2,3]"), c(1, 4, 8, 10)]

                mean       2.5%      97.5%      Rhat
beta[1,2]  0.5724148  0.3611360  0.7907723 0.9992367
beta[1,3] -0.4137401 -0.6906688 -0.1503502 0.9990503
beta[2,2] -0.0252887 -0.2288971  0.1781029 0.9992045
beta[2,3] -0.1228497 -0.3827433  0.1365426 1.0023173

ガンマ回帰モデル

  1. 誤差構造:ガンマ分布
  2. リンク関数:対数リンク関数

サンプルデータ

N <- 100
X <- rnorm(n = N, mean = 0, sd = 1)

mu <- exp(1+0.3*X)  
phi <- 1  #集中度パラメタ
shape <- mu^2/phi  #形状(shape)パラメタ
rate <- mu/phi  #比率(rate)パラメタ
Y <- rgamma(n = N, shape = shape, rate = rate)

df <- data_frame(X, Y)

standata <- list(N = nrow(df),
                 Y = df$Y,
                 X = df$X)

Stanコード

stancode <- "
data{
  int N;
  vector[N] Y; 
  vector[N] X;
}

parameters{
  real b0;
  real b1;
  real <lower=0> phi;
}

transformed parameters {
  vector <lower=0>[N] shape;
  vector <lower=0>[N] rate;
  vector <lower=0>[N] mu;

  for (i in 1 : N){
    mu[i] = exp(b0 + b1*X[i]);
    shape[i] = pow(mu[i], 2) / phi;
    rate[i] = mu[i] / phi;
  }
}

model{
  for(i in 1:N){
        Y[i] ~ gamma(shape[i], rate[i]);
  }
}

generated quantities{
  vector[N] pred_Y;
  real log_lik[N];

  for (i in 1:N){
    pred_Y[i] = gamma_rng(shape[i], rate[i]);
    log_lik[i] = gamma_lpdf(Y[i] | shape[i], rate[i]);
  }
}
"

事後予測分布

rstan::extract(fit)$pred_Y %>%
  data.frame() %>% 
  apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% 
  t() %>% 
  `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% 
  cbind(df) %>% 
  ggplot() + 
  geom_point(aes(X, Y), col = "#4169e1") + 
  geom_line(aes(X, y = pred50), colour = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") +
  theme_classic() +
  theme(axis.text  = element_text(size = 15),
        axis.title = element_text(size = 20))

ベータ回帰モデル

  1. 誤差構造:ベータ分布
  2. リンク関数:ロジットリンク関数

サンプルデータ

N <- 100
X <- rnorm(n = N, mean = 0, sd = 1)

mu <- 1/(1 + exp(-(0.5 + 2*X)))
phi <- 0.1
a <- mu/phi
b <- (1 - mu)/phi
Y <- rbeta(n = N, a, b)

df <- data_frame(X, Y)

standata <- list(N = nrow(df),
                 Y = df$Y,
                 X = df$X)

Stanコード

stancode <- "
data{
  int N;
  vector<lower=0, upper=1>[N] Y; 
  vector[N] X;
}

parameters{
  real b0;
  real b1;
  real<lower=0> phi;
}

transformed parameters {
  vector <lower=0>[N] a;
  vector <lower=0>[N] b;
  vector[N] mu;

  for(i in 1:N){
    mu[i] = inv_logit(b0 + b1*X[i]);
    a[i] = phi * mu[i] ;
    b[i] = phi * (1 - mu[i]) ;
  }
}

model{
  for(i in 1:N){
        Y[i] ~ beta(a[i], b[i]);
  }
}

generated quantities{
  vector[N] pred_Y;
  real log_lik[N];

  for (i in 1:N){
    pred_Y[i] = beta_rng(a[i], b[i]);
    log_lik[i] = beta_lpdf(Y[i] | a[i], b[i]);
  }
}
"

事後予測分布

fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4)
summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)]

rstan::extract(fit)$pred_Y %>%
  data.frame() %>% 
  apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% 
  t() %>% 
  `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% 
  cbind(df) %>% 
  ggplot() + 
  geom_point(aes(X, Y), col = "#4169e1") + 
  geom_line(aes(X, y = pred50), colour = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") +
  theme_classic() +
  theme(axis.text  = element_text(size = 15),
        axis.title = element_text(size = 20))

対数正規回帰モデル

  1. 誤差構造:対数正規分布
  2. リンク関数:恒等リンク関数

サンプルデータ

X <- rnorm(n = N, mean = 0, sd = 1)
Y <- rlnorm(n = N, meanlog = 2*X, sdlog = 1)

df <- data_frame(X, Y)

standata <- list(N = nrow(df),
                 Y = df$Y,
                 X = df$X)

Stanコード

stancode <- "
data{
  int N;
  vector[N] Y; 
  vector[N] X;
}

parameters{
  real b0;
  real b1;
  real<lower=0> sigma;
}

model{
  for(i in 1:N){
        Y[i] ~ lognormal(b0 + b1*X[i], sigma);
  }
}

generated quantities{
  vector[N] pred_Y;
  real log_lik[N];

  for (i in 1:N){
    pred_Y[i] = lognormal_rng(b0 + b1*X[i], sigma);
    log_lik[i] = lognormal_lpdf(Y[i] | b0 + b1*X[i], sigma);
  }
}
"

事後予測分布

fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4)
summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)]

rstan::extract(fit)$pred_Y %>%
  data.frame() %>% 
  apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% 
  t() %>% 
  `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% 
  cbind(df) %>% 
  ggplot() + 
  geom_point(aes(X, Y), col = "#4169e1") + 
  geom_line(aes(X, y = pred50), colour = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") +
  theme_classic() +
  theme(axis.text  = element_text(size = 15),
        axis.title = element_text(size = 20))

負の二項回帰モデル

  1. 誤差構造:負の二項分布
  2. リンク関数:対数リンク関数

サンプルデータ

N <- 100
X <- rnorm(n = N, mean = 0, sd = 1)
Y <- rnbinom(n = N, mu = 4, size = 1)

df <- data_frame(X, Y)

standata <- list(N = nrow(df),
                 Y = df$Y,
                 X = df$X)

Stanコード

stancode <- "
data{
  int N;
  int Y[N]; 
  vector[N] X;
}

parameters{
  real b0;
  real b1;
  real<lower=0> alpha;
}

transformed parameters {
  vector[N] mu;

  for (i in 1:N){
    mu[i] = exp(b0 + b1*X[i]);
  }
}

model{
  for(i in 1:N){
    Y[i] ~ neg_binomial_2(mu[i], alpha);
  }
}

generated quantities{
  vector[N] pred_Y;
  real log_lik[N];

  for (i in 1:N){
    pred_Y[i] = neg_binomial_2_rng(mu[i], alpha);
    log_lik[i] = neg_binomial_2_lpmf(Y[i] | mu[i], alpha);
  }
}
"

#neg_binomial_2_log(mu)で直接対数変換せずにサンプリングしてもいい
#model{
#for(i in 1:N){
#  Y[i] ~ neg_binomial_2_log(b0 + b1*X[i], alpha);
# }
#}

事後予測分布

fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4)
summary(fit)$summary[c("b0", "b1"), c(1, 4, 8, 10)]

rstan::extract(fit)$pred_Y %>%
  data.frame() %>% 
  apply(., 2, quantile, prob = c(0.025, 0.10, 0.5, 0.90, 0.975)) %>% 
  t() %>% 
  `colnames<-`(c("pred2.5", "pred10", "pred50", "pred90", "pred97.5")) %>% 
  cbind(df) %>% 
  ggplot() + 
  geom_point(aes(X, Y), col = "#4169e1") + 
  geom_line(aes(X, y = pred50), colour = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred2.5, ymax = pred97.5), alpha = 0.1, fill = "#4169e1") +
  geom_ribbon(aes(X, ymin = pred10, ymax = pred90), alpha = 0.3, fill = "#4169e1") +
  theme_classic() +
  theme(axis.text  = element_text(size = 15),
        axis.title = element_text(size = 20))

ゼロ過剰ポアソン回帰モデル

サンプルデータ

N <- 100
X <- rnorm(n = N, mean = 0, sd = 1)
Y <- rpois(n = N, lambda = exp(0.8*X))

df <- data_frame(X, Y)

standata <- list(N = nrow(df),
                 Y = df$Y,
                 X = df$X)

Stanコード

stancode <- "
data{
  int N;
  int Y[N];
  vector[N] X;
}

parameters{
  real a0;
  real a1;
  real b0;
  real b1;
}

model{
  vector[N] theta;
  vector[N] lambda;

  for(i in 1:N){
    theta[i] = inv_logit(a0 + a1*X[i]);
    lambda[i] = exp(b0 + b1*X[i]);

  if(Y[i]==0)
    target += log_sum_exp(log(1-theta[i]), log(theta[i]) + poisson_lpmf(0| lambda[i]));
  else
    target += log(theta[i]) + poisson_lpmf(Y[i]| lambda[i]);
  }
}
"

事後予測分布

fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4)
summary(fit)$summary[c("a0", "a1", "b0", "b1"), c(1, 4, 8, 10)]
          mean       2.5%      97.5%     Rhat
a0 20.02717748  4.7168032 32.8796014 1.005368
a1  1.60100422 -5.0632239  9.3640185 1.001425
b0 -0.09690579 -0.3120695  0.1158623 1.000140
b1  0.89315760  0.7568918  1.0262100 1.000192

ゼロ過剰ガンマ回帰モデル

サンプルデータ

N <- 100
X <- rnorm(n = N, mean = 0, sd = 1)
mu <- exp(1 + 2*X)
phi <- 1
shape <- mu^2/phi
rate <- mu/phi
Y <- ifelse(extraDistr::rbern(N, 1/(1 + exp(-(1 + 2*X)))) < 1,
            0,
            rgamma(N, shape = shape, rate = rate)) 

df <- data_frame(X, Y)

standata <- list(N = nrow(df),
                 Y = df$Y,
                 X = df$X)

Stanコード

stancode <- "
data{
  int N;
  real Y[N];
  vector[N] X;
}

parameters{
  real a0;
  real a1;
  real b0;
  real b1;
  real <lower=0> phi;
}

transformed parameters{
  vector <lower=0> [N] shape;
  vector <lower=0> [N] rate;
  vector <lower=0> [N] mu;
  vector <lower=0> [N] theta;

  for (i in 1:N){
    mu[i] = exp(a0 + a1*X[i]);
    shape[i] = pow(mu[i], 2)/phi;
    rate[i] = mu[i]/phi;
    theta[i] = inv_logit(b0 + b1*X[i]);
  }
}
model{
  for( i in 1 : N){
    if ( Y[i] == 0 ){
      target += bernoulli_lpmf(0| theta[i]);
    }else{
      target += bernoulli_lpmf(1| theta[i]) + gamma_lpdf(Y[i]| shape[i], rate[i]);
    }
  }
}
"

事後分布

fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4)
summary(fit)$summary[c("a0", "a1", "b0", "b1"), c(1, 4, 8, 10)]
        mean      2.5%     97.5%      Rhat
a0 3.7284473 3.4287448 4.0449271 1.0010297
a1 0.6528083 0.4604300 0.8524688 1.0000245
b0 1.5223662 0.8335802 2.3302660 0.9999711
b1 2.7099104 1.7278128 3.9400902 0.9993525

ゼロ過剰ベータ回帰モデル

サンプルデータ

N <- 100
X <- rnorm(n = N, mean = 0, sd = 1)

phi = 0.1
theta <- 1/(1 + exp(-(0.5 + 0.5*X)))
a <- theta/phi  
b <- (1 - theta)/phi  

Y <- ifelse(extraDistr::rbern(n = N, theta) == 0,
            0,
            rbeta(n = N, a, b))

df <- data_frame(X, Y)

standata <- list(N = nrow(df),
                 Y = df$Y,
                 X = df$X)

Stanコード

stancode <- "
data{
  int N;
  vector<lower=0, upper=1>[N] Y;
  vector[N] X;
}

parameters{
  real a0;
  real a1;
  real b0;
  real b1;
  real <lower=0> phi;
}

transformed parameters{
  vector[N] theta1;
  vector[N] theta2;
  vector <lower=0>[N] a;
  vector <lower=0>[N] b;

  for(i in 1:N){
    theta1[i] = inv_logit(a0 + a1*X[i]);
    theta2[i] = inv_logit(b0 + b1*X[i]);
    a[i] = theta2[i] * phi;
    b[i] = (1 - theta2[i]) * phi;
  }
}

model {
  for(i in 1:N){
    if(Y[i] == 0){
      target += bernoulli_lpmf(0| theta1[i]);
    }else{
      target += bernoulli_lpmf(1| theta1[i]) + beta_lpdf(Y[i]| a[i], b[i]);
    }
  }
}
"

事後分布

fit <- stan(model_code = stancode, data = standata, iter = 2000, chains = 4)
summary(fit)$summary[c("a0", "a1", "b0", "b1"), c(1, 4, 8, 10)]

        mean       2.5%     97.5%     Rhat
a0 0.5072671 0.09042595 0.9397354 1.000795
a1 0.5365692 0.07548989 1.0071972 1.000013
b0 0.6072267 0.44246342 0.7689407 1.000972
b1 0.5019813 0.33206450 0.6705875 1.000977

参考文献および参考サイト

  1. Stan モデリング言語: ユーザーガイド・リファレンスマニュアル
  2. StanとRで学ぶベイズ統計モデリング
  3. Stan超初心者入門
  4. Stan中級編
  5. Stan Advent Calender 2017(day 19)

kai