ZiGDAG/README.Rmd at master · junsoukchoi/ZiGDAG · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
---
title: "ZiGDAG: R package for model-based causal discovery for zero-inflated count data"

output:
  github_document:
        toc: true
        toc_depth: 4
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

The R package `ZiGDAG` implements zero-inflated generalized hypergeometric directed acyclic graphs (ZiG-DAGs) for inference of causal structure from observational zero-inflated count data. For the structure learning of ZiG-DAGs, score-based greedy search algorithms are implemented.

## Installation

To install the latest version from Github, use

```{r install, tidy = 'formatR', eval = FALSE, echo = TRUE}
library(devtools)
devtools::install_github("junsoukchoi/ZiGDAG")
```

## Example of linear ZiG-DAGs

```{r example1, eval = FALSE, echo = TRUE}
library(ZiGDAG)
library(igraph)
library(DGLMExtPois)
# set a random seed
set.seed(1)

# generate synthetic data from a linear ZiG-DAG, where zero-inflated hyper-Poisson distributions are assumed for each node
# generate the DAG with 5 nodes: 1 -> 2 -> 3 -> 4 -> 5
p = 5   # number of variables
E_true = matrix(0, p, p)
for (j in 2 : p) E_true[j, j - 1] = 1

# generate model parameters for the linear ZiG-DAG
alpha_true  = matrix(0, p, p)
alpha_true[E_true == 1] = runif(sum(E_true), 0.5, 2)
beta_true   = matrix(0, p, p)
beta_true[E_true == 1]  = runif(sum(E_true), -2, -0.5)
delta_true  = runif(p, -1.5, -1)
gamma_true  = runif(p, 1, 1.5)
lambda_true = exp(runif(p, -2, 2))

# generate synthetic data from the specified linear ZiG-DAG with sample size n = 500
n   = 500  # sample size
dat = matrix(0, n, p)
order_nodes = as_ids(topo_sort(graph_from_adjacency_matrix(t(E_true))))
for (j in order_nodes)
{
   pi_true = as.vector(exp(dat %*% alpha_true[j, ] + delta_true[j]))
   pi_true = pi_true / (1 + pi_true)
   pi_true[is.nan(pi_true)] = 1
   mu_true = as.vector(exp(dat %*% beta_true[j, ] + gamma_true[j]))
   dat[ , j] = (1 - rbinom(n, 1, pi_true)) * rhP(n, lambda_true[j], mu_true)
}

# learn the causal structure of linear ZiG-DAG from synthetic data
# apply the hill-climbing algorithm for linear ZiG-DAG to synthetic data
fit = linear.zigdag(dat, ghpd = "hyper.poisson", method = "hc")
# adjacency matrix of the estimated causal DAG
E_est = fit$est$E
```

## Example of nonlinear ZiG-DAGs

```{r example2, eval = FALSE, echo = TRUE}
library(ZiGDAG)
library(igraph)
library(DGLMExtPois)
# set a random seed
set.seed(1)

# generate synthetic data from a nonlinear ZiG-DAG, where zero-inflated hyper-Poisson distributions are assumed for each node
# generate the DAG with 5 nodes: 1 -> 2 -> 3 -> 4 -> 5
p = 5   # number of variables
E_true = matrix(0, p, p)
for (j in 2 : p) E_true[j, j - 1] = 1

# generate nonlinear functions and parameters for the nonlinear ZiG-DAG
f_nonlinear = function(z, type)
{
   if (type == 1) return(0.5 * z * (z - 3))
   if (type == 2) return(sin(z))
}

g_nonlinear = function(z, type)
{
   if (type == 1) return(-0.5 * (z - 1.5)^2)
   if (type == 2) return(cos(z))
}

f_true = matrix(0, p, p)
f_true[E_true == 1] = sample(1 : 2, size = sum(E_true), replace = TRUE)
g_true = matrix(0, p, p)
g_true[E_true == 1] = sample(1 : 2, size = sum(E_true), replace = TRUE)
delta_true  = runif(p, -1.5, -1)
gamma_true  = runif(p, 1, 1.5)
lambda_true = exp(runif(p, -2, 2))

# generate synthetic data from the specified nonlinear ZiG-DAG with sample size n = 500
n   = 500  # sample size
dat = matrix(0, n, p)
order_nodes = as_ids(topo_sort(graph_from_adjacency_matrix(t(E_true))))
for (j in order_nodes)
{
   p_j  = sum(E_true[j, ] == 1)
   pi_true = rep(delta_true[j], n)
   mu_true = rep(gamma_true[j], n)
   if (p_j > 0)
   {
      pa_j = which(E_true[j, ] == 1)
      for (a in 1 : p_j)
      {
         k = pa_j[a]
         pi_true = pi_true + f_nonlinear(dat[ , k], type = f_true[j, k])
         mu_true = mu_true + g_nonlinear(dat[ , k], type = g_true[j, k])
      }
   }

   pi_true = exp(pi_true) / (1 + exp(pi_true))
   pi_true[is.nan(pi_true)] = 1
   mu_true = exp(mu_true)
   dat[ , j] = (1 - rbinom(n, 1, pi_true)) * rhP(n, lambda_true[j], mu_true)
}

# learn the causal structure of nonlinear ZiG-DAG from synthetic data
# apply the hill-climbing algorithm for nonlinear ZiG-DAG to synthetic data
# 4 cubic B-spline functions are used to approximate nonlinear functions in nonlinear ZiG-DAG models.
fit = nonlinear.zigdag(dat, nbasis = 4)
# adjacency matrix of the estimated causal DAG
E_est = fit$est$E
```