# Publications, Technical Reports and Theses

## 2021 |

Zoltowski, David M; Cai, Diana; Adams, Ryan P Slice sampling reparameterization gradients Conference Advances in Neural Information Processing Systems 34 (NeurIPS), 2021. @conference{zoltowski2021slice, title = {Slice sampling reparameterization gradients}, author = {David M. Zoltowski and Diana Cai and Ryan P. Adams}, year = {2021}, date = {2021-12-01}, booktitle = {Advances in Neural Information Processing Systems 34 (NeurIPS)}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |

Sun, Xingyuan; Xue, Tianju; Rusinkiewicz, Szymon; Adams, Ryan P Amortized synthesis of constrained configurations using a differentiable surrogate Conference Advances in Neural Information Processing Systems 34 (NeurIPS), 2021. @conference{sun2021amortized, title = {Amortized synthesis of constrained configurations using a differentiable surrogate}, author = {Xingyuan Sun and Tianju Xue and Szymon Rusinkiewicz and Ryan P. Adams}, year = {2021}, date = {2021-12-01}, booktitle = {Advances in Neural Information Processing Systems 34 (NeurIPS)}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |

Rahme, Jad; Ghosh, Dibya; Kumar, Aviral; Zhang, Amy; Adams, Ryan P; Levine, Sergey Understanding generalization in RL via the epistemic POMDP Conference Advances in Neural Information Processing Systems 34 (NeurIPS), 2021. @conference{rahme2021epistemic, title = {Understanding generalization in RL via the epistemic POMDP}, author = {Jad Rahme and Dibya Ghosh and Aviral Kumar and Amy Zhang and Ryan P. Adams and Sergey Levine}, year = {2021}, date = {2021-12-01}, booktitle = {Advances in Neural Information Processing Systems 34 (NeurIPS)}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |

Gundersen, Gregory W; Cai, Diana; Zhou, Chuteng; Engelhardt, Barbara E; Adams, Ryan P Active multi-fidelity Bayesian online changepoint detection Conference Proceedings of the 37th Conference on Uncertainty in Artificial Intelligence (UAI), 2021. @conference{gundersen2021active, title = {Active multi-fidelity Bayesian online changepoint detection}, author = {Gregory W. Gundersen and Diana Cai and Chuteng Zhou and Barbara E. Engelhardt and Ryan P. Adams}, year = {2021}, date = {2021-06-01}, booktitle = {Proceedings of the 37th Conference on Uncertainty in Artificial Intelligence (UAI)}, abstract = {Online algorithms for detecting changepoints, or abrupt shifts in the behavior of a time series, are often deployed with limited resources, e.g., to edge computing settings such as mobile phones or industrial sensors. In these scenarios it may be beneficial to trade the cost of collecting an environmental measurement against the quality or "fidelity" of this measurement and how the measurement affects changepoint estimation. For instance, one might decide between inertial measurements or GPS to determine changepoints for motion. A Bayesian approach to changepoint detection is particularly appealing because we can represent our posterior uncertainty about changepoints and make active, cost-sensitive decisions about data fidelity to reduce this posterior uncertainty. Moreover, the total cost could be dramatically lowered through active fidelity switching, while remaining robust to changes in data distribution. We propose a multi-fidelity approach that makes cost-sensitive decisions about which data fidelity to collect based on maximizing information gain with respect to changepoints. We evaluate this framework on synthetic, video, and audio data and show that this information-based approach results in accurate predictions while reducing total cost.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Online algorithms for detecting changepoints, or abrupt shifts in the behavior of a time series, are often deployed with limited resources, e.g., to edge computing settings such as mobile phones or industrial sensors. In these scenarios it may be beneficial to trade the cost of collecting an environmental measurement against the quality or "fidelity" of this measurement and how the measurement affects changepoint estimation. For instance, one might decide between inertial measurements or GPS to determine changepoints for motion. A Bayesian approach to changepoint detection is particularly appealing because we can represent our posterior uncertainty about changepoints and make active, cost-sensitive decisions about data fidelity to reduce this posterior uncertainty. Moreover, the total cost could be dramatically lowered through active fidelity switching, while remaining robust to changes in data distribution. We propose a multi-fidelity approach that makes cost-sensitive decisions about which data fidelity to collect based on maximizing information gain with respect to changepoints. We evaluate this framework on synthetic, video, and audio data and show that this information-based approach results in accurate predictions while reducing total cost. |

Oktay, Deniz; McGreivy, Nick; Aduol, Joshua; Beatson, Alex; Adams, Ryan P Randomized Automatic Differentiation Conference Proceedings of the International Conference on Learning Representations (ICLR), 2021. @conference{oktay2021randomized, title = {Randomized Automatic Differentiation}, author = {Deniz Oktay and Nick McGreivy and Joshua Aduol and Alex Beatson and Ryan P. Adams}, year = {2021}, date = {2021-04-01}, booktitle = {Proceedings of the International Conference on Learning Representations (ICLR)}, abstract = {The successes of deep learning, variational inference, and many other fields have been aided by specialized implementations of reverse-mode automatic differentiation (AD) to compute gradients of mega-dimensional objectives. The AD techniques underlying these tools were designed to compute exact gradients to numerical precision, but modern machine learning models are almost always trained with stochastic gradient descent. Why spend computation and memory on exact (minibatch) gradients only to use them for stochastic optimization? We develop a general framework and approach for randomized automatic differentiation (RAD), which can allow unbiased gradient estimates to be computed with reduced memory in return for variance. We examine limitations of the general approach, and argue that we must leverage problem specific structure to realize benefits. We develop RAD techniques for a variety of simple neural network architectures, and show that for a fixed memory budget, RAD converges in fewer iterations than using a small batch size for feedforward networks, and in a similar number for recurrent networks. We also show that RAD can be applied to scientific computing, and use it to develop a low-memory stochastic gradient method for optimizing the control parameters of a linear reaction-diffusion PDE representing a fission reactor.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } The successes of deep learning, variational inference, and many other fields have been aided by specialized implementations of reverse-mode automatic differentiation (AD) to compute gradients of mega-dimensional objectives. The AD techniques underlying these tools were designed to compute exact gradients to numerical precision, but modern machine learning models are almost always trained with stochastic gradient descent. Why spend computation and memory on exact (minibatch) gradients only to use them for stochastic optimization? We develop a general framework and approach for randomized automatic differentiation (RAD), which can allow unbiased gradient estimates to be computed with reduced memory in return for variance. We examine limitations of the general approach, and argue that we must leverage problem specific structure to realize benefits. We develop RAD techniques for a variety of simple neural network architectures, and show that for a fixed memory budget, RAD converges in fewer iterations than using a small batch size for feedforward networks, and in a similar number for recurrent networks. We also show that RAD can be applied to scientific computing, and use it to develop a low-memory stochastic gradient method for optimizing the control parameters of a linear reaction-diffusion PDE representing a fission reactor. |

Shields, Benjamin J; Stevens, Jason; Li, Jun; Parasram, Marvin; Damani, Farhan; Alvarado, Jesus Martinez I; Janey, Jacob M; Adams, Ryan P; Doyle, Abigail G Bayesian reaction optimization as a tool for chemical synthesis Journal Article Nature, 590 , pp. 89-96, 2021. @article{shields2021bayesian, title = {Bayesian reaction optimization as a tool for chemical synthesis}, author = {Benjamin J. Shields and Jason Stevens and Jun Li and Marvin Parasram and Farhan Damani and Jesus I. Martinez Alvarado and Jacob M. Janey and Ryan P. Adams and Abigail G. Doyle}, year = {2021}, date = {2021-04-01}, journal = {Nature}, volume = {590}, pages = {89-96}, keywords = {}, pubstate = {published}, tppubtype = {article} } |

## 2020 |

Ash, Jordan T; Adams, Ryan P On warm-starting neural network training Conference Advances in Neural Information Processing Systems 33 (NeurIPS), 2020. @conference{ash2020warm, title = {On warm-starting neural network training}, author = {Jordan T. Ash and Ryan P. Adams}, year = {2020}, date = {2020-12-01}, booktitle = {Advances in Neural Information Processing Systems 33 (NeurIPS)}, abstract = {In many real-world deployments of machine learning systems, data arrive piecemeal. These learning scenarios may be passive, where data arrive incrementally due to structural properties of the problem (e.g., daily financial data) or active, where samples are selected according to a measure of their quality (e.g., experimental design). In both of these cases, we are building a sequence of models that incorporate an increasing amount of data. We would like each of these models in the sequence to be performant and take advantage of all the data that are available to that point. Conventional intuition suggests that when solving a sequence of related optimization problems of this form, it should be possible to initialize using the solution of the previous iterate---to warm start'' the optimization rather than initialize from scratch---and see reductions in wall-clock time. However, in practice this warm-starting seems to yield poorer generalization performance than models that have fresh random initializations, even though the final training losses are similar. While it appears that some hyperparameter settings allow a practitioner to close this generalization gap, they seem to only do so in regimes that damage the wall-clock gains of the warm start. Nevertheless, it is highly desirable to be able to warm-start neural network training, as it would dramatically reduce the resource usage associated with the construction of performant deep learning systems. In this work, we take a closer look at this empirical phenomenon and try to understand when and how it occurs. We also provide a surprisingly simple trick that overcomes this pathology in several important situations, and present experiments that elucidate some of its properties.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } In many real-world deployments of machine learning systems, data arrive piecemeal. These learning scenarios may be passive, where data arrive incrementally due to structural properties of the problem (e.g., daily financial data) or active, where samples are selected according to a measure of their quality (e.g., experimental design). In both of these cases, we are building a sequence of models that incorporate an increasing amount of data. We would like each of these models in the sequence to be performant and take advantage of all the data that are available to that point. Conventional intuition suggests that when solving a sequence of related optimization problems of this form, it should be possible to initialize using the solution of the previous iterate---to warm start'' the optimization rather than initialize from scratch---and see reductions in wall-clock time. However, in practice this warm-starting seems to yield poorer generalization performance than models that have fresh random initializations, even though the final training losses are similar. While it appears that some hyperparameter settings allow a practitioner to close this generalization gap, they seem to only do so in regimes that damage the wall-clock gains of the warm start. Nevertheless, it is highly desirable to be able to warm-start neural network training, as it would dramatically reduce the resource usage associated with the construction of performant deep learning systems. In this work, we take a closer look at this empirical phenomenon and try to understand when and how it occurs. We also provide a surprisingly simple trick that overcomes this pathology in several important situations, and present experiments that elucidate some of its properties. |

Liu, Sulin; Sun, Xingyuan; Ramadge, Peter J; Adams, Ryan P Task-agnostic amortized inference of Gaussian process hyperparameters Conference Advances in Neural Information Processing Systems 33 (NeurIPS), 2020. @conference{liu2020task, title = {Task-agnostic amortized inference of Gaussian process hyperparameters}, author = {Sulin Liu and Xingyuan Sun and Peter J. Ramadge and Ryan P. Adams}, year = {2020}, date = {2020-12-01}, booktitle = {Advances in Neural Information Processing Systems 33 (NeurIPS)}, abstract = {Gaussian processes (GPs) are flexible priors for modeling functions. However, their success depends on the kernel accurately reflecting the properties of the data. One of the appeals of the GP framework is that the marginal likelihood of the kernel hyperparameters is often available in closed form, enabling optimization and sampling procedures to fit these hyperparameters to data. Unfortunately, point-wise evaluation of the marginal likelihood is expensive due to the need to solve a linear system; searching or sampling the space of hyperparameters thus often dominates the practical cost of using GPs. We introduce an approach to the identification of kernel hyperparameters in GP regression and related problems that sidesteps the need for costly marginal likelihoods. Our strategy is to "amortize" inference over hyperparameters by training a single neural network, which consumes a set of regression data and produces an estimate of the kernel function, useful across different tasks. To accommodate the varying dimension and cardinality of different regression problems, we use a hierarchical self-attention-based neural network that produces estimates of the hyperparameters which are invariant to the order of the input data points and data dimensions. We show that a single neural model trained on synthetic data is able to generalize directly to several different unseen real-world GP use cases. Our experiments demonstrate that the estimated hyperparameters are comparable in quality to those from the conventional model selection procedures, while being much faster to obtain, significantly accelerating GP regression and its related applications such as Bayesian optimization and Bayesian quadrature. The code and pre-trained model are available at https://github.com/PrincetonLIPS/AHGP.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Gaussian processes (GPs) are flexible priors for modeling functions. However, their success depends on the kernel accurately reflecting the properties of the data. One of the appeals of the GP framework is that the marginal likelihood of the kernel hyperparameters is often available in closed form, enabling optimization and sampling procedures to fit these hyperparameters to data. Unfortunately, point-wise evaluation of the marginal likelihood is expensive due to the need to solve a linear system; searching or sampling the space of hyperparameters thus often dominates the practical cost of using GPs. We introduce an approach to the identification of kernel hyperparameters in GP regression and related problems that sidesteps the need for costly marginal likelihoods. Our strategy is to "amortize" inference over hyperparameters by training a single neural network, which consumes a set of regression data and produces an estimate of the kernel function, useful across different tasks. To accommodate the varying dimension and cardinality of different regression problems, we use a hierarchical self-attention-based neural network that produces estimates of the hyperparameters which are invariant to the order of the input data points and data dimensions. We show that a single neural model trained on synthetic data is able to generalize directly to several different unseen real-world GP use cases. Our experiments demonstrate that the estimated hyperparameters are comparable in quality to those from the conventional model selection procedures, while being much faster to obtain, significantly accelerating GP regression and its related applications such as Bayesian optimization and Bayesian quadrature. The code and pre-trained model are available at https://github.com/PrincetonLIPS/AHGP. |

Xue, Tianju; Beatson, Alex; Chiaramonte, Maurizio; Roeder, Geoffrey; Ash, Jordan T; Menguc, Yigit; Adiaenssens, Sigrid; Adams, Ryan P; Mao, Sheng A data-driven computational scheme for the nonlinear mechanical properties of cellular mechanical meta-materials under large deformation Journal Article Soft Matter, 16 , pp. 7524-7534, 2020. @article{xue2020data, title = {A data-driven computational scheme for the nonlinear mechanical properties of cellular mechanical meta-materials under large deformation}, author = {Tianju Xue and Alex Beatson and Maurizio Chiaramonte and Geoffrey Roeder and Jordan T. Ash and Yigit Menguc and Sigrid Adiaenssens and Ryan P. Adams and Sheng Mao}, year = {2020}, date = {2020-08-01}, journal = {Soft Matter}, volume = {16}, pages = {7524-7534}, keywords = {}, pubstate = {published}, tppubtype = {article} } |

Xue, Tianju; Beatson, Alex; Adiaenssens, Sigrid; Adams, Ryan P Amortized Finite Element Analysis for Fast PDE-Constrained Optimization Conference Proceedings of the 37th International Conference on Machine Learning (ICML), 2020. @conference{xue2020amortized, title = {Amortized Finite Element Analysis for Fast PDE-Constrained Optimization}, author = {Tianju Xue and Alex Beatson and Sigrid Adiaenssens and Ryan P. Adams}, year = {2020}, date = {2020-07-01}, booktitle = {Proceedings of the 37th International Conference on Machine Learning (ICML)}, abstract = {Optimizing the parameters of partial differential equations (PDEs), i.e., PDE-constrained optimization (PDE-CO), allows us to model natural systems from observations or perform rational design of structures with complicated mechanical, thermal, or electromagnetic properties. However, PDE-CO is often computationally prohibitive due to the need to solve the PDE—typically via finite element analysis (FEA)—at each step of the optimization procedure. In this paper we propose amortized finite element analysis (AmorFEA), in which a neural network learns to produce accurate PDE solutions, while preserving many of the advantages of traditional finite element methods. This network is trained to directly minimize the potential energy from which the PDE and finite element method are derived, avoiding the need to generate costly supervised training data by solving PDEs with traditional FEA. As FEA is a variational procedure, AmorFEA is a direct analogue to popular amortized inference approaches in latent variable models, with the finite element basis acting as the variational family. AmorFEA can perform PDE-CO without the need to repeatedly solve the associated PDE, accelerating optimization when compared to a traditional workflow using FEA and the adjoint method.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Optimizing the parameters of partial differential equations (PDEs), i.e., PDE-constrained optimization (PDE-CO), allows us to model natural systems from observations or perform rational design of structures with complicated mechanical, thermal, or electromagnetic properties. However, PDE-CO is often computationally prohibitive due to the need to solve the PDE—typically via finite element analysis (FEA)—at each step of the optimization procedure. In this paper we propose amortized finite element analysis (AmorFEA), in which a neural network learns to produce accurate PDE solutions, while preserving many of the advantages of traditional finite element methods. This network is trained to directly minimize the potential energy from which the PDE and finite element method are derived, avoiding the need to generate costly supervised training data by solving PDEs with traditional FEA. As FEA is a variational procedure, AmorFEA is a direct analogue to popular amortized inference approaches in latent variable models, with the finite element basis acting as the variational family. AmorFEA can perform PDE-CO without the need to repeatedly solve the associated PDE, accelerating optimization when compared to a traditional workflow using FEA and the adjoint method. |

Beatson, Alex; Ash, Jordan T; Roeder, Geoffrey; Xue, Tianju; Adams, Ryan P Learning Composable Energy Surrogates for PDE Order Reduction Conference Advances in Neural Information Processing Systems 33 (NeurIPS), 2020. @conference{beatson2020composable, title = {Learning Composable Energy Surrogates for PDE Order Reduction}, author = {Alex Beatson and Jordan T. Ash and Geoffrey Roeder and Tianju Xue and Ryan P. Adams}, url = {https://arxiv.org/abs/2005.06549}, year = {2020}, date = {2020-05-13}, booktitle = {Advances in Neural Information Processing Systems 33 (NeurIPS)}, abstract = {Meta-materials are an important emerging class of engineered materials in which complex macroscopic behaviour--whether electromagnetic, thermal, or mechanical--arises from modular substructure. Simulation and optimization of these materials are computationally challenging, as rich substructures necessitate high-fidelity finite element meshes to solve the governing PDEs. To address this, we leverage parametric modular structure to learn component-level surrogates, enabling cheaper high-fidelity simulation. We use a neural network to model the stored potential energy in a component given boundary conditions. This yields a structured prediction task: macroscopic behavior is determined by the minimizer of the system's total potential energy, which can be approximated by composing these surrogate models. Composable energy surrogates thus permit simulation in the reduced basis of component boundaries. Costly ground-truth simulation of the full structure is avoided, as training data are generated by performing finite element analysis with individual components. Using dataset aggregation to choose training boundary conditions allows us to learn energy surrogates which produce accurate macroscopic behavior when composed, accelerating simulation of parametric meta-materials.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Meta-materials are an important emerging class of engineered materials in which complex macroscopic behaviour--whether electromagnetic, thermal, or mechanical--arises from modular substructure. Simulation and optimization of these materials are computationally challenging, as rich substructures necessitate high-fidelity finite element meshes to solve the governing PDEs. To address this, we leverage parametric modular structure to learn component-level surrogates, enabling cheaper high-fidelity simulation. We use a neural network to model the stored potential energy in a component given boundary conditions. This yields a structured prediction task: macroscopic behavior is determined by the minimizer of the system's total potential energy, which can be approximated by composing these surrogate models. Composable energy surrogates thus permit simulation in the reduced basis of component boundaries. Costly ground-truth simulation of the full structure is avoided, as training data are generated by performing finite element analysis with individual components. Using dataset aggregation to choose training boundary conditions allows us to learn energy surrogates which produce accurate macroscopic behavior when composed, accelerating simulation of parametric meta-materials. |

Luo, Yucen; Beatson, Alex; Norouzi, Mohammad; Zhu, Jun; Duvenaud, David; Adams, Ryan P; Chen, Ricky T Q SUMO: Unbiased Estimation of Log Marginal Probability for Latent Variable Models Conference Proceedings of the Eighth International Conference on Learning Representations (ICLR), 2020. @conference{luo2020sumo, title = {SUMO: Unbiased Estimation of Log Marginal Probability for Latent Variable Models}, author = {Yucen Luo and Alex Beatson and Mohammad Norouzi and Jun Zhu and David Duvenaud and Ryan P. Adams and Ricky T. Q. Chen}, url = {https://openreview.net/forum?id=SylkYeHtwr}, year = {2020}, date = {2020-04-30}, booktitle = {Proceedings of the Eighth International Conference on Learning Representations (ICLR)}, abstract = {The standard variational lower bounds used to train latent variable models produce biased estimates of most quantities of interest. We introduce an unbiased estimator of the log marginal likelihood and its gradients for latent variable models based on randomized truncation of infinite series. If parameterized by an encoder-decoder architecture, the parameters of the encoder can be optimized to minimize its variance of this estimator. We show that models trained using our estimator give better test-set likelihoods than a standard importance-sampling based approach for the same average computational cost. This estimator also allows use of latent variable models for tasks where unbiased estimators, rather than marginal likelihood lower bounds, are preferred, such as minimizing reverse KL divergences and estimating score functions.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } The standard variational lower bounds used to train latent variable models produce biased estimates of most quantities of interest. We introduce an unbiased estimator of the log marginal likelihood and its gradients for latent variable models based on randomized truncation of infinite series. If parameterized by an encoder-decoder architecture, the parameters of the encoder can be optimized to minimize its variance of this estimator. We show that models trained using our estimator give better test-set likelihoods than a standard importance-sampling based approach for the same average computational cost. This estimator also allows use of latent variable models for tasks where unbiased estimators, rather than marginal likelihood lower bounds, are preferred, such as minimizing reverse KL divergences and estimating score functions. |

## 2019 |

Fedorov, Igor; Adams, Ryan P; Mattina, Matthew; Whatmough, Paul N SpArSe: Sparse Architecture Search for CNNs on Resource-Constrained Microcontrollers Conference Advances in Neural Information Processing Systems 32 (NeurIPS), 2019. @conference{fedorov2019sparse, title = {SpArSe: Sparse Architecture Search for CNNs on Resource-Constrained Microcontrollers}, author = {Igor Fedorov and Ryan P. Adams and Matthew Mattina and Paul N. Whatmough}, url = {https://www.cs.princeton.edu/~rpa/pubs/fedorov2019sparse.pdf}, year = {2019}, date = {2019-12-04}, booktitle = {Advances in Neural Information Processing Systems 32 (NeurIPS)}, abstract = {The vast majority of processors in the world are actually microcontroller units (MCUs), which find widespread use performing simple control tasks in applications ranging from automobiles to medical devices and office equipment. The Internet of Things (IoT) promises to inject machine learning into many of these every-day objects via tiny, cheap MCUs. However, these resource-impoverished hardware platforms severely limit the complexity of machine learning models that can be deployed. For example, although convolutional neural networks (CNNs) achieve state-of-the-art results on many visual recognition tasks, CNN inference on MCUs is challenging due to severe finite memory limitations. To circumvent the memory challenge associated with CNNs, various alternatives have been proposed that do fit within the memory budget of an MCU, albeit at the cost of prediction accuracy. This paper challenges the idea that CNNs are not suitable for deployment on MCUs. We demonstrate that it is possible to automatically design CNNs which generalize well, while also being small enough to fit onto memory-limited MCUs. Our Sparse Architecture Search method combines neural architecture search with pruning in a single, unified approach, which learns superior models on four popular IoT datasets. The CNNs we find are more accurate and up to 4.35× smaller than previous approaches, while meeting the strict MCU working memory constraint.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } The vast majority of processors in the world are actually microcontroller units (MCUs), which find widespread use performing simple control tasks in applications ranging from automobiles to medical devices and office equipment. The Internet of Things (IoT) promises to inject machine learning into many of these every-day objects via tiny, cheap MCUs. However, these resource-impoverished hardware platforms severely limit the complexity of machine learning models that can be deployed. For example, although convolutional neural networks (CNNs) achieve state-of-the-art results on many visual recognition tasks, CNN inference on MCUs is challenging due to severe finite memory limitations. To circumvent the memory challenge associated with CNNs, various alternatives have been proposed that do fit within the memory budget of an MCU, albeit at the cost of prediction accuracy. This paper challenges the idea that CNNs are not suitable for deployment on MCUs. We demonstrate that it is possible to automatically design CNNs which generalize well, while also being small enough to fit onto memory-limited MCUs. Our Sparse Architecture Search method combines neural architecture search with pruning in a single, unified approach, which learns superior models on four popular IoT datasets. The CNNs we find are more accurate and up to 4.35× smaller than previous approaches, while meeting the strict MCU working memory constraint. |

Seff, Ari; Zhou, Wenda; Damani, Farhan; Doyle, Abigail; Adams, Ryan P Discrete Object Generation with Reversible Inductive Construction Conference Advances in Neural Information Processing Systems 32 (NeurIPS), 2019. @conference{seff2019discrete, title = {Discrete Object Generation with Reversible Inductive Construction}, author = {Ari Seff and Wenda Zhou and Farhan Damani and Abigail Doyle and Ryan P. Adams}, url = {https://www.cs.princeton.edu/~rpa/pubs/seff2019discrete.pdf}, year = {2019}, date = {2019-12-04}, booktitle = {Advances in Neural Information Processing Systems 32 (NeurIPS)}, abstract = {The success of generative modeling in continuous domains has led to a surge of interest in generating discrete data such as molecules, source code, and graphs. However, construction histories for these discrete objects are typically not unique and so generative models must reason about intractably large spaces in order to learn. Additionally, structured discrete domains are often characterized by strict constraints on what constitutes a valid object and generative models must respect these requirements in order to produce useful novel samples. Here, we present a generative model for discrete objects employing a Markov chain where transitions are restricted to a set of local operations that preserve validity. Building off of generative interpretations of denoising autoencoders, the Markov chain alternates between producing 1) a sequence of corrupted objects that are valid but not from the data distribution, and 2) a learned reconstruction distribution that attempts to fix the corruptions while also preserving validity. This approach constrains the generative model to only produce valid objects, requires the learner to only discover local modifications to the objects, and avoids marginalization over an unknown and potentially large space of construction histories. We evaluate the proposed approach on two highly structured discrete domains, molecules and Laman graphs, and find that it compares favorably to alternative methods at capturing distributional statistics for a host of semantically relevant metrics.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } The success of generative modeling in continuous domains has led to a surge of interest in generating discrete data such as molecules, source code, and graphs. However, construction histories for these discrete objects are typically not unique and so generative models must reason about intractably large spaces in order to learn. Additionally, structured discrete domains are often characterized by strict constraints on what constitutes a valid object and generative models must respect these requirements in order to produce useful novel samples. Here, we present a generative model for discrete objects employing a Markov chain where transitions are restricted to a set of local operations that preserve validity. Building off of generative interpretations of denoising autoencoders, the Markov chain alternates between producing 1) a sequence of corrupted objects that are valid but not from the data distribution, and 2) a learned reconstruction distribution that attempts to fix the corruptions while also preserving validity. This approach constrains the generative model to only produce valid objects, requires the learner to only discover local modifications to the objects, and avoids marginalization over an unknown and potentially large space of construction histories. We evaluate the proposed approach on two highly structured discrete domains, molecules and Laman graphs, and find that it compares favorably to alternative methods at capturing distributional statistics for a host of semantically relevant metrics. |

Ash, Jordan T; Adams, Ryan P On the Difficulty of Warm-Starting Neural Network Training Technical Report 2019. @techreport{ash2018warm, title = {On the Difficulty of Warm-Starting Neural Network Training}, author = {Jordan T. Ash and Ryan P. Adams}, url = {https://arxiv.org/abs/1910.08475}, year = {2019}, date = {2019-10-18}, abstract = {In many real-world deployments of machine learning systems, data arrive piecemeal. These learning scenarios may be passive, where data arrive incrementally due to structural properties of the problem (e.g., daily financial data) or active, where samples are selected according to a measure of their quality (e.g., experimental design). In both of these cases, we are building a sequence of models that incorporate an increasing amount of data. We would like each of these models in the sequence to be performant and take advantage of all the data that are available to that point. Conventional intuition suggests that when solving a sequence of related optimization problems of this form, it should be possible to initialize using the solution of the previous iterate---to "warm start" the optimization rather than initialize from scratch---and see reductions in wall-clock time. However, in practice this warm-starting seems to yield poorer generalization performance than models that have fresh random initializations, even though the final training losses are similar. While it appears that some hyperparameter settings allow a practitioner to close this generalization gap, they seem to only do so in regimes that damage the wall-clock gains of the warm start. Nevertheless, it is highly desirable to be able to warm-start neural network training, as it would dramatically reduce the resource usage associated with the construction of performant deep learning systems. In this work, we take a closer look at this empirical phenomenon and try to understand when and how it occurs. Although the present investigation did not lead to a solution, we hope that a thorough articulation of the problem will spur new research that may lead to improved methods that consume fewer resources during training.}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } In many real-world deployments of machine learning systems, data arrive piecemeal. These learning scenarios may be passive, where data arrive incrementally due to structural properties of the problem (e.g., daily financial data) or active, where samples are selected according to a measure of their quality (e.g., experimental design). In both of these cases, we are building a sequence of models that incorporate an increasing amount of data. We would like each of these models in the sequence to be performant and take advantage of all the data that are available to that point. Conventional intuition suggests that when solving a sequence of related optimization problems of this form, it should be possible to initialize using the solution of the previous iterate---to "warm start" the optimization rather than initialize from scratch---and see reductions in wall-clock time. However, in practice this warm-starting seems to yield poorer generalization performance than models that have fresh random initializations, even though the final training losses are similar. While it appears that some hyperparameter settings allow a practitioner to close this generalization gap, they seem to only do so in regimes that damage the wall-clock gains of the warm start. Nevertheless, it is highly desirable to be able to warm-start neural network training, as it would dramatically reduce the resource usage associated with the construction of performant deep learning systems. In this work, we take a closer look at this empirical phenomenon and try to understand when and how it occurs. Although the present investigation did not lead to a solution, we hope that a thorough articulation of the problem will spur new research that may lead to improved methods that consume fewer resources during training. |

Rahme, Jad; Adams, Ryan P A Theoretical Connection Between Statistical Physics and Reinforcement Learning Technical Report 2019. @techreport{rahme2019statistical, title = {A Theoretical Connection Between Statistical Physics and Reinforcement Learning}, author = {Jad Rahme and Ryan P. Adams}, url = {https://arxiv.org/abs/1906.10228}, year = {2019}, date = {2019-06-24}, abstract = {Sequential decision making in the presence of uncertainty and stochastic dynamics gives rise to distributions over state/action trajectories in reinforcement learning (RL) and optimal control problems. This observation has led to a variety of connections between RL and inference in probabilistic graphical models (PGMs). Here we explore a different dimension to this relationship, examining reinforcement learning using the tools and abstractions of statistical physics. The central object in the statistical physics abstraction is the idea of a partition function , and here we construct a partition function from the ensemble of possible trajectories that an agent might take in a Markov decision process. Although value functions and Q-functions can be derived from this partition function and interpreted via average energies, the -function provides an object with its own Bellman equation that can form the basis of alternative dynamic programming approaches. Moreover, when the MDP dynamics are deterministic, the Bellman equation for is linear, allowing direct solutions that are unavailable for the nonlinear equations associated with traditional value functions. The policies learned via these -based Bellman updates are tightly linked to Boltzmann-like policy parameterizations. In addition to sampling actions proportionally to the exponential of the expected cumulative reward as Boltzmann policies would, these policies take entropy into account favoring states from which many outcomes are possible.}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } Sequential decision making in the presence of uncertainty and stochastic dynamics gives rise to distributions over state/action trajectories in reinforcement learning (RL) and optimal control problems. This observation has led to a variety of connections between RL and inference in probabilistic graphical models (PGMs). Here we explore a different dimension to this relationship, examining reinforcement learning using the tools and abstractions of statistical physics. The central object in the statistical physics abstraction is the idea of a partition function , and here we construct a partition function from the ensemble of possible trajectories that an agent might take in a Markov decision process. Although value functions and Q-functions can be derived from this partition function and interpreted via average energies, the -function provides an object with its own Bellman equation that can form the basis of alternative dynamic programming approaches. Moreover, when the MDP dynamics are deterministic, the Bellman equation for is linear, allowing direct solutions that are unavailable for the nonlinear equations associated with traditional value functions. The policies learned via these -based Bellman updates are tightly linked to Boltzmann-like policy parameterizations. In addition to sampling actions proportionally to the exponential of the expected cumulative reward as Boltzmann policies would, these policies take entropy into account favoring states from which many outcomes are possible. |

Beatson, Alex; Adams, Ryan P Efficient Optimization of Loops and Limits with Randomized Telescoping Sums Conference Proceedings of the 36th International Conference on Machine Learning (ICML), 2019. @conference{beatson2019efficient, title = {Efficient Optimization of Loops and Limits with Randomized Telescoping Sums}, author = {Alex Beatson and Ryan P. Adams}, url = {https://www.cs.princeton.edu/~rpa/pubs/beatson2019efficient.pdf}, year = {2019}, date = {2019-06-13}, booktitle = {Proceedings of the 36th International Conference on Machine Learning (ICML)}, abstract = {We consider optimization problems in which the objective requires an inner loop with many steps or is the limit of a sequence of increasingly costly approximations. Meta-learning, training recurrent neural networks, and optimization of the solutions to differential equations are all examples of optimization problems with this character. In such problems, it can be expensive to compute the objective function value and its gradient, but truncating the loop or using less accurate approximations can induce biases that damage the overall solution. We propose randomized telescope (RT) gradient estimators, which represent the objective as the sum of a telescoping series and sample linear combinations of terms to provide cheap unbiased gradient estimates. We identify conditions under which RT estimators achieve optimization convergence rates independent of the length of the loop or the required accuracy of the approximation. We also derive a method for tuning RT estimators online to maximize a lower bound on the expected decrease in loss per unit of computation. We evaluate our adaptive RT estimators on a range of applications including meta-optimization of learning rates, variational inference of ODE parameters, and training an LSTM to model long sequences.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } We consider optimization problems in which the objective requires an inner loop with many steps or is the limit of a sequence of increasingly costly approximations. Meta-learning, training recurrent neural networks, and optimization of the solutions to differential equations are all examples of optimization problems with this character. In such problems, it can be expensive to compute the objective function value and its gradient, but truncating the loop or using less accurate approximations can induce biases that damage the overall solution. We propose randomized telescope (RT) gradient estimators, which represent the objective as the sum of a telescoping series and sample linear combinations of terms to provide cheap unbiased gradient estimates. We identify conditions under which RT estimators achieve optimization convergence rates independent of the length of the loop or the required accuracy of the approximation. We also derive a method for tuning RT estimators online to maximize a lower bound on the expected decrease in loss per unit of computation. We evaluate our adaptive RT estimators on a range of applications including meta-optimization of learning rates, variational inference of ODE parameters, and training an LSTM to model long sequences. |

Zhou, Wenda; Veitch, Victor; Austern, Morgane; Adams, Ryan P; Orbanz, Peter Non-Vacuous Generalization Bounds at the ImageNet Scale: A PAC-Bayesian Compression Approach Conference Proceedings of the Seventh International Conference on Learning Representations (ICLR), 2019. @conference{zhou2019nonvacuous, title = {Non-Vacuous Generalization Bounds at the ImageNet Scale: A PAC-Bayesian Compression Approach}, author = {Wenda Zhou and Victor Veitch and Morgane Austern and Ryan P. Adams and Peter Orbanz}, url = {https://www.cs.princeton.edu/~rpa/pubs/zhou2019nonvacuous.pdf}, year = {2019}, date = {2019-04-18}, booktitle = {Proceedings of the Seventh International Conference on Learning Representations (ICLR)}, abstract = {Modern neural networks are highly overparameterized, with capacity to substantially overfit to training data. Nevertheless, these networks often generalize well in practice. It has also been observed that trained networks can often be "compressed" to much smaller representations. The purpose of this paper is to connect these two empirical observations. Our main technical result is a generalization bound for compressed networks based on the compressed size. Combined with off-the-shelf compression algorithms, the bound leads to state of the art generalization guarantees; in particular, we provide the first non-vacuous generalization guarantees for realistic architectures applied to the ImageNet classification problem. As additional evidence connecting compression and generalization, we show that compressibility of models that tend to overfit is limited: We establish an absolute limit on expected compressibility as a function of expected generalization error, where the expectations are over the random choice of training examples. The bounds are complemented by empirical results that show an increase in overfitting implies an increase in the number of bits required to describe a trained network.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Modern neural networks are highly overparameterized, with capacity to substantially overfit to training data. Nevertheless, these networks often generalize well in practice. It has also been observed that trained networks can often be "compressed" to much smaller representations. The purpose of this paper is to connect these two empirical observations. Our main technical result is a generalization bound for compressed networks based on the compressed size. Combined with off-the-shelf compression algorithms, the bound leads to state of the art generalization guarantees; in particular, we provide the first non-vacuous generalization guarantees for realistic architectures applied to the ImageNet classification problem. As additional evidence connecting compression and generalization, we show that compressibility of models that tend to overfit is limited: We establish an absolute limit on expected compressibility as a function of expected generalization error, where the expectations are over the random choice of training examples. The bounds are complemented by empirical results that show an increase in overfitting implies an increase in the number of bits required to describe a trained network. |

Wei, Jennifer N; Belanger, David; Adams, Ryan P; Sculley, D Rapid Prediction of Electron–Ionization Mass Spectrometry Using Neural Networks Journal Article ACS Central Science, 5 (4), pp. 700-708, 2019. @article{wei2019rapid, title = {Rapid Prediction of Electron–Ionization Mass Spectrometry Using Neural Networks}, author = {Jennifer N. Wei and David Belanger and Ryan P. Adams and D. Sculley}, url = {https://www.cs.princeton.edu/~rpa/pubs/wei2019rapid.pdf}, year = {2019}, date = {2019-03-19}, journal = {ACS Central Science}, volume = {5}, number = {4}, pages = {700-708}, abstract = {When confronted with a substance of unknown identity, researchers often perform mass spectrometry on the sample and compare the observed spectrum to a library of previously collected spectra to identify the molecule. While popular, this approach will fail to identify molecules that are not in the existing library. In response, we propose to improve the library’s coverage by augmenting it with synthetic spectra that are predicted from candidate molecules using machine learning. We contribute a lightweight neural network model that quickly predicts mass spectra for small molecules, averaging 5 ms per molecule with a recall-at-10 accuracy of 91.8%. Achieving high-accuracy predictions requires a novel neural network architecture that is designed to capture typical fragmentation patterns from electron ionization. We analyze the effects of our modeling innovations on library matching performance and compare our models to prior machine-learning-based work on spectrum prediction.}, keywords = {}, pubstate = {published}, tppubtype = {article} } When confronted with a substance of unknown identity, researchers often perform mass spectrometry on the sample and compare the observed spectrum to a library of previously collected spectra to identify the molecule. While popular, this approach will fail to identify molecules that are not in the existing library. In response, we propose to improve the library’s coverage by augmenting it with synthetic spectra that are predicted from candidate molecules using machine learning. We contribute a lightweight neural network model that quickly predicts mass spectra for small molecules, averaging 5 ms per molecule with a recall-at-10 accuracy of 91.8%. Achieving high-accuracy predictions requires a novel neural network architecture that is designed to capture typical fragmentation patterns from electron ionization. We analyze the effects of our modeling innovations on library matching performance and compare our models to prior machine-learning-based work on spectrum prediction. |

Regier, Jeffrey; Miller, Andrew C; Schlegel, David; Adams, Ryan P; McAuliffe, Jon D; Prabhat, Approximate inference for constructing astronomical catalogs from images Journal Article Annals of Applied Statistics, 13 (3), pp. 1884-1926, 2019. @article{regier2019approximate, title = {Approximate inference for constructing astronomical catalogs from images}, author = {Jeffrey Regier and Andrew C. Miller and David Schlegel and Ryan P. Adams and Jon D. McAuliffe and Prabhat}, url = {https://www.cs.princeton.edu/~rpa/pubs/regier2019approximate.pdf}, year = {2019}, date = {2019-03-01}, journal = {Annals of Applied Statistics}, volume = {13}, number = {3}, pages = {1884-1926}, abstract = {We present a new, fully generative model for constructing astronomical catalogs from optical telescope image sets. Each pixel intensity is treated as a random variable with parameters that depend on the latent properties of stars and galaxies. These latent properties are themselves modeled as random. We compare two procedures for posterior inference. One procedure is based on Markov chain Monte Carlo (MCMC) while the other is based on variational inference (VI). The MCMC procedure excels at quantifying uncertainty, while the VI procedure is 1000 times faster. On a supercomputer, the VI procedure efficiently uses 665,000 CPU cores to construct an astronomical catalog from 50 terabytes of images in 14.6 minutes, demonstrating the scaling characteristics necessary to construct catalogs for upcoming astronomical surveys.}, keywords = {}, pubstate = {published}, tppubtype = {article} } We present a new, fully generative model for constructing astronomical catalogs from optical telescope image sets. Each pixel intensity is treated as a random variable with parameters that depend on the latent properties of stars and galaxies. These latent properties are themselves modeled as random. We compare two procedures for posterior inference. One procedure is based on Markov chain Monte Carlo (MCMC) while the other is based on variational inference (VI). The MCMC procedure excels at quantifying uncertainty, while the VI procedure is 1000 times faster. On a supercomputer, the VI procedure efficiently uses 665,000 CPU cores to construct an astronomical catalog from 50 terabytes of images in 14.6 minutes, demonstrating the scaling characteristics necessary to construct catalogs for upcoming astronomical surveys. |

## 2018 |

Cai, Diana; Mitzenmacher, Michael; Adams, Ryan P A Bayesian Nonparametric View on Count-Min Sketch Conference Advances in Neural Information Processing Systems 31 (NeurIPS), 2018. @conference{cai2018bayesian, title = {A Bayesian Nonparametric View on Count-Min Sketch}, author = {Diana Cai and Michael Mitzenmacher and Ryan P. Adams}, url = {https://www.cs.princeton.edu/~rpa/pubs/cai2018bayesian.pdf}, year = {2018}, date = {2018-12-06}, booktitle = {Advances in Neural Information Processing Systems 31 (NeurIPS)}, abstract = {The count-min sketch is a time- and memory-efficient randomized data structure that provides a point estimate of the number of times an item has appeared in a data stream. The count-min sketch and related hash-based data structures are ubiquitous in systems that must track frequencies of data such as URLs, IP addresses, and language n-grams. We present a Bayesian view on the count-min sketch, using the same data structure, but providing a posterior distribution over the frequencies that characterizes the uncertainty arising from the hash-based approximation. In particular, we take a nonparametric approach and consider tokens generated from a Dirichlet process (DP) random measure, which allows for an unbounded number of unique tokens. Using properties of the DP, we show that it is possible to straightforwardly compute posterior marginals of the unknown true counts and that the modes of these marginals recover the count-min sketch estimator, inheriting the associated probabilistic guarantees. Using simulated data with known ground truth, we investigate the properties of these estimators. Lastly, we also study a modified problem in which the observation stream consists of collections of tokens (i.e., documents) arising from a random measure drawn from a stable beta process, which allows for power law scaling behavior in the number of unique tokens.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } The count-min sketch is a time- and memory-efficient randomized data structure that provides a point estimate of the number of times an item has appeared in a data stream. The count-min sketch and related hash-based data structures are ubiquitous in systems that must track frequencies of data such as URLs, IP addresses, and language n-grams. We present a Bayesian view on the count-min sketch, using the same data structure, but providing a posterior distribution over the frequencies that characterizes the uncertainty arising from the hash-based approximation. In particular, we take a nonparametric approach and consider tokens generated from a Dirichlet process (DP) random measure, which allows for an unbounded number of unique tokens. Using properties of the DP, we show that it is possible to straightforwardly compute posterior marginals of the unknown true counts and that the modes of these marginals recover the count-min sketch estimator, inheriting the associated probabilistic guarantees. Using simulated data with known ground truth, we investigate the properties of these estimators. Lastly, we also study a modified problem in which the observation stream consists of collections of tokens (i.e., documents) arising from a random measure drawn from a stable beta process, which allows for power law scaling behavior in the number of unique tokens. |

Reshef, Yakir A; Finucane, Hilary; Kelley, David R; Gusev, Alexander; Kotliar, Dylan; Ulirsch, Jacob C; Hormozdiari, Farhad; Nasser, Joseph; O'Connor, Luke; van de Geijn, Bryce; Loh, Po-Ru; Grossman, Shari; Bhatia, Gaurav; Gazal, Steven; Palamara, Pier Francesco; Pinello, Luca; Patterson, Nick; Adams, Ryan P; Price, Alkes Detecting genome-wide directional effects of transcription factor binding on polygenic disease risk Journal Article Nature Genetics, 50 , pp. 143-1493, 2018. @article{reshef2018detecting, title = {Detecting genome-wide directional effects of transcription factor binding on polygenic disease risk}, author = {Yakir A. Reshef and Hilary Finucane and David R. Kelley and Alexander Gusev and Dylan Kotliar and Jacob C. Ulirsch and Farhad Hormozdiari and Joseph Nasser and Luke O'Connor and Bryce van de Geijn and Po-Ru Loh and Shari Grossman and Gaurav Bhatia and Steven Gazal and Pier Francesco Palamara and Luca Pinello and Nick Patterson and Ryan P. Adams and Alkes Price}, url = {https://www.nature.com/articles/s41588-018-0196-7}, year = {2018}, date = {2018-09-03}, journal = {Nature Genetics}, volume = {50}, pages = {143-1493}, abstract = {Biological interpretation of genome-wide association study data frequently involves assessing whether SNPs linked to a biological process, for example, binding of a transcription factor, show unsigned enrichment for disease signal. However, signed annotations quantifying whether each SNP allele promotes or hinders the biological process can enable stronger statements about disease mechanism. We introduce a method, signed linkage disequilibrium profile regression, for detecting genome-wide directional effects of signed functional annotations on disease risk. We validate the method via simulations and application to molecular quantitative trait loci in blood, recovering known transcriptional regulators. We apply the method to expression quantitative trait loci in 48 Genotype-Tissue Expression tissues, identifying 651 transcription factor-tissue associations including 30 with robust evidence of tissue specificity. We apply the method to 46 diseases and complex traits (average n = 290 K), identifying 77 annotation-trait associations representing 12 independent transcription factor-trait associations, and characterize the underlying transcriptional programs using gene-set enrichment analyses. Our results implicate new causal disease genes and new disease mechanisms.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Biological interpretation of genome-wide association study data frequently involves assessing whether SNPs linked to a biological process, for example, binding of a transcription factor, show unsigned enrichment for disease signal. However, signed annotations quantifying whether each SNP allele promotes or hinders the biological process can enable stronger statements about disease mechanism. We introduce a method, signed linkage disequilibrium profile regression, for detecting genome-wide directional effects of signed functional annotations on disease risk. We validate the method via simulations and application to molecular quantitative trait loci in blood, recovering known transcriptional regulators. We apply the method to expression quantitative trait loci in 48 Genotype-Tissue Expression tissues, identifying 651 transcription factor-tissue associations including 30 with robust evidence of tissue specificity. We apply the method to 46 diseases and complex traits (average n = 290 K), identifying 77 annotation-trait associations representing 12 independent transcription factor-trait associations, and characterize the underlying transcriptional programs using gene-set enrichment analyses. Our results implicate new causal disease genes and new disease mechanisms. |

Gilmer, Justin; Adams, Ryan P; Goodfellow, Ian; Andersen, David; Dahl, George E Motivating the Rules of the Game for Adversarial Example Research Technical Report 2018. @techreport{gilmer2018adversarial, title = {Motivating the Rules of the Game for Adversarial Example Research}, author = {Justin Gilmer and Ryan P. Adams and Ian Goodfellow and David Andersen and George E. Dahl}, url = {https://arxiv.org/abs/1807.06732}, year = {2018}, date = {2018-07-18}, abstract = {Advances in machine learning have led to broad deployment of systems with impressive performance on important problems. Nonetheless, these systems can be induced to make errors on data that are surprisingly similar to examples the learned system handles correctly. The existence of these errors raises a variety of questions about out-of-sample generalization and whether bad actors might use such examples to abuse deployed systems. As a result of these security concerns, there has been a flurry of recent papers proposing algorithms to defend against such malicious perturbations of correctly handled examples. It is unclear how such misclassifications represent a different kind of security problem than other errors, or even other attacker-produced examples that have no specific relationship to an uncorrupted input. In this paper, we argue that adversarial example defense papers have, to date, mostly considered abstract, toy games that do not relate to any specific security concern. Furthermore, defense papers have not yet precisely described all the abilities and limitations of attackers that would be relevant in practical security. Towards this end, we establish a taxonomy of motivations, constraints, and abilities for more plausible adversaries. Finally, we provide a series of recommendations outlining a path forward for future work to more clearly articulate the threat model and perform more meaningful evaluation.}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } Advances in machine learning have led to broad deployment of systems with impressive performance on important problems. Nonetheless, these systems can be induced to make errors on data that are surprisingly similar to examples the learned system handles correctly. The existence of these errors raises a variety of questions about out-of-sample generalization and whether bad actors might use such examples to abuse deployed systems. As a result of these security concerns, there has been a flurry of recent papers proposing algorithms to defend against such malicious perturbations of correctly handled examples. It is unclear how such misclassifications represent a different kind of security problem than other errors, or even other attacker-produced examples that have no specific relationship to an uncorrupted input. In this paper, we argue that adversarial example defense papers have, to date, mostly considered abstract, toy games that do not relate to any specific security concern. Furthermore, defense papers have not yet precisely described all the abilities and limitations of attackers that would be relevant in practical security. Towards this end, we establish a taxonomy of motivations, constraints, and abilities for more plausible adversaries. Finally, we provide a series of recommendations outlining a path forward for future work to more clearly articulate the threat model and perform more meaningful evaluation. |

Adams, Ryan P; Pennington, Jeffrey; Johnson, Matthew J; Smith, Jamie; Ovadia, Yaniv; Patton, Brian; Saunderson, James Estimating the Spectral Density of Large Implicit Matrices Technical Report 2018. @techreport{adams2018spectral, title = {Estimating the Spectral Density of Large Implicit Matrices}, author = {Ryan P. Adams and Jeffrey Pennington and Matthew J. Johnson and Jamie Smith and Yaniv Ovadia and Brian Patton and James Saunderson}, url = {https://arxiv.org/abs/1802.03451}, year = {2018}, date = {2018-02-09}, abstract = {Many important problems are characterized by the eigenvalues of a large matrix. For example, the difficulty of many optimization problems, such as those arising from the fitting of large models in statistics and machine learning, can be investigated via the spectrum of the Hessian of the empirical loss function. Network data can be understood via the eigenstructure of a graph Laplacian matrix using spectral graph theory. Quantum simulations and other many-body problems are often characterized via the eigenvalues of the solution space, as are various dynamic systems. However, naive eigenvalue estimation is computationally expensive even when the matrix can be represented; in many of these situations the matrix is so large as to only be available implicitly via products with vectors. Even worse, one may only have noisy estimates of such matrix vector products. In this work, we combine several different techniques for randomized estimation and show that it is possible to construct unbiased estimators to answer a broad class of questions about the spectra of such implicit matrices, even in the presence of noise. We validate these methods on large-scale problems in which graph theory and random matrix theory provide ground truth.}, keywords = {}, pubstate = {published}, tppubtype = {techreport} } Many important problems are characterized by the eigenvalues of a large matrix. For example, the difficulty of many optimization problems, such as those arising from the fitting of large models in statistics and machine learning, can be investigated via the spectrum of the Hessian of the empirical loss function. Network data can be understood via the eigenstructure of a graph Laplacian matrix using spectral graph theory. Quantum simulations and other many-body problems are often characterized via the eigenvalues of the solution space, as are various dynamic systems. However, naive eigenvalue estimation is computationally expensive even when the matrix can be represented; in many of these situations the matrix is so large as to only be available implicitly via products with vectors. Even worse, one may only have noisy estimates of such matrix vector products. In this work, we combine several different techniques for randomized estimation and show that it is possible to construct unbiased estimators to answer a broad class of questions about the spectra of such implicit matrices, even in the presence of noise. We validate these methods on large-scale problems in which graph theory and random matrix theory provide ground truth. |

Saeedi, Ardavan; Hoffman, Matthew D; DiVerdi, Stephen J; Ghandeharioun, Asma; Johnson, Matthew J; Adams, Ryan P Multimodal Prediction and Personalization of Photo Edits with Deep Generative Models Conference Proceedings of the 21st International Conference on Artificial Intelligence and Statistics (AISTATS), 2018, (arXiv:1704.04997 [stat.ML]). @conference{saeedi2018multimodal, title = {Multimodal Prediction and Personalization of Photo Edits with Deep Generative Models}, author = {Ardavan Saeedi and Matthew D. Hoffman and Stephen J. DiVerdi and Asma Ghandeharioun and Matthew J. Johnson and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/saeedi2018multimodal.pdf}, year = {2018}, date = {2018-01-01}, booktitle = {Proceedings of the 21st International Conference on Artificial Intelligence and Statistics (AISTATS)}, abstract = {Professional-grade software applications are powerful but complicated−expert users can achieve impressive results, but novices often struggle to complete even basic tasks. Photo editing is a prime example: after loading a photo, the user is confronted with an array of cryptic sliders like "clarity", "temp", and "highlights". An automatically generated suggestion could help, but there is no single "correct" edit for a given image−different experts may make very different aesthetic decisions when faced with the same image, and a single expert may make different choices depending on the intended use of the image (or on a whim). We therefore want a system that can propose multiple diverse, high-quality edits while also learning from and adapting to a user's aesthetic preferences. In this work, we develop a statistical model that meets these objectives. Our model builds on recent advances in neural network generative modeling and scalable inference, and uses hierarchical structure to learn editing patterns across many diverse users. Empirically, we find that our model outperforms other approaches on this challenging multimodal prediction task.}, note = {arXiv:1704.04997 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Professional-grade software applications are powerful but complicated−expert users can achieve impressive results, but novices often struggle to complete even basic tasks. Photo editing is a prime example: after loading a photo, the user is confronted with an array of cryptic sliders like "clarity", "temp", and "highlights". An automatically generated suggestion could help, but there is no single "correct" edit for a given image−different experts may make very different aesthetic decisions when faced with the same image, and a single expert may make different choices depending on the intended use of the image (or on a whim). We therefore want a system that can propose multiple diverse, high-quality edits while also learning from and adapting to a user's aesthetic preferences. In this work, we develop a statistical model that meets these objectives. Our model builds on recent advances in neural network generative modeling and scalable inference, and uses hierarchical structure to learn editing patterns across many diverse users. Empirically, we find that our model outperforms other approaches on this challenging multimodal prediction task. |

Gómez-Bombarelli, Rafael; Wei, Jennifer; Duvenaud, David; Hernández-Lobato, Jose Miguel; Sánchez-Lengeling, Benjamin; Sheberla, Dennis; Aguilera-Iparraguirre, Jorge; Hirzel, Timothy D; Adams, Ryan P; Aspuru-Guzik, Alan Automatic Chemical Design Using a Data-Driven Continuous Representation of Molecules Journal Article ACS Central Science, 4 (2), pp. 268–276, 2018, (arXiv:1610.02415 [cs.LG]). @article{bombarelli2018automatic, title = {Automatic Chemical Design Using a Data-Driven Continuous Representation of Molecules}, author = {Rafael Gómez-Bombarelli and Jennifer Wei and David Duvenaud and Jose Miguel Hernández-Lobato and Benjamin Sánchez-Lengeling and Dennis Sheberla and Jorge Aguilera-Iparraguirre and Timothy D. Hirzel and Ryan P. Adams and Alan Aspuru-Guzik}, url = {http://www.cs.princeton.edu/~rpa/pubs/bombarelli2018automatic.pdf}, year = {2018}, date = {2018-01-01}, journal = {ACS Central Science}, volume = {4}, number = {2}, pages = {268--276}, abstract = {We report a method to convert discrete representations of molecules to and from a multidimensional continuous representation. This model allows us to generate new molecules for efficient exploration and optimization through open-ended spaces of chemical compounds. A deep neural network was trained on hundreds of thousands of existing chemical structures to construct three coupled functions: an encoder, a decoder and a predictor. The encoder converts the discrete representation of a molecule into a real-valued continuous vector, and the decoder converts these continuous vectors back to discrete molecular representations. The predictor estimates chemical properties from the latent continuous vector representation of the molecule. Continuous representations allow us to automatically generate novel chemical structures by performing simple operations in the latent space, such as decoding random vectors, perturbing known chemical structures, or interpolating between molecules. Continuous representations also allow the use of powerful gradient-based optimization to efficiently guide the search for optimized functional compounds. We demonstrate our method in the domain of drug-like molecules and also in the set of molecules with fewer that nine heavy atoms.}, note = {arXiv:1610.02415 [cs.LG]}, keywords = {}, pubstate = {published}, tppubtype = {article} } We report a method to convert discrete representations of molecules to and from a multidimensional continuous representation. This model allows us to generate new molecules for efficient exploration and optimization through open-ended spaces of chemical compounds. A deep neural network was trained on hundreds of thousands of existing chemical structures to construct three coupled functions: an encoder, a decoder and a predictor. The encoder converts the discrete representation of a molecule into a real-valued continuous vector, and the decoder converts these continuous vectors back to discrete molecular representations. The predictor estimates chemical properties from the latent continuous vector representation of the molecule. Continuous representations allow us to automatically generate novel chemical structures by performing simple operations in the latent space, such as decoding random vectors, perturbing known chemical structures, or interpolating between molecules. Continuous representations also allow the use of powerful gradient-based optimization to efficiently guide the search for optimized functional compounds. We demonstrate our method in the domain of drug-like molecules and also in the set of molecules with fewer that nine heavy atoms. |

## 2017 |

Linderman, Scott W; Johnson, Matthew J; Miller, Andrew C; Adams, Ryan P; Blei, David M; Paninski, Liam Recurrent Switching Linear Dynamical Systems Conference Proceedings of the 20th International Conference on Artificial Intelligence and Statistics (AISTATS), 2017, (arXiv:1610.08466 [stat.ML]). @conference{linderman2017recurrent, title = {Recurrent Switching Linear Dynamical Systems}, author = {Scott W. Linderman and Matthew J. Johnson and Andrew C. Miller and Ryan P. Adams and David M. Blei and Liam Paninski}, url = {http://www.cs.princeton.edu/~rpa/pubs/linderman2017recurrent.pdf}, year = {2017}, date = {2017-01-01}, booktitle = {Proceedings of the 20th International Conference on Artificial Intelligence and Statistics (AISTATS)}, abstract = {Many natural systems, such as neurons firing in the brain or basketball teams traversing a court, give rise to time series data with complex, nonlinear dynamics. We can gain insight into these systems by decomposing the data into segments that are each explained by simpler dynamic units. Building on switching linear dynamical systems (SLDS), we present a new model class that not only discovers these dynamical units, but also explains how their switching behavior depends on observations or continuous latent states. These "recurrent" switching linear dynamical systems provide further insight by discovering the conditions under which each unit is deployed, something that traditional SLDS models fail to do. We leverage recent algorithmic advances in approximate inference to make Bayesian inference in these models easy, fast, and scalable.}, note = {arXiv:1610.08466 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Many natural systems, such as neurons firing in the brain or basketball teams traversing a court, give rise to time series data with complex, nonlinear dynamics. We can gain insight into these systems by decomposing the data into segments that are each explained by simpler dynamic units. Building on switching linear dynamical systems (SLDS), we present a new model class that not only discovers these dynamical units, but also explains how their switching behavior depends on observations or continuous latent states. These "recurrent" switching linear dynamical systems provide further insight by discovering the conditions under which each unit is deployed, something that traditional SLDS models fail to do. We leverage recent algorithmic advances in approximate inference to make Bayesian inference in these models easy, fast, and scalable. |

Miller, Andrew C; Foti, Nicholas J; Adams, Ryan P Variational Boosting: Iteratively Refining Posterior Approximations Conference Proceedings of the 34th International Conference on Machine Learning (ICML), 2017, (arXiv:1611.06585 [stat.ML]). @conference{miller2017boosting, title = {Variational Boosting: Iteratively Refining Posterior Approximations}, author = {Andrew C. Miller and Nicholas J. Foti and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/miller2017boosting.pdf}, year = {2017}, date = {2017-01-01}, booktitle = {Proceedings of the 34th International Conference on Machine Learning (ICML)}, abstract = {We propose a black-box variational inference method to approximate intractable distributions with an increasingly rich approximating class. Our method, termed variational boosting, iteratively refines an existing variational approximation by solving a sequence of optimization problems, allowing the practitioner to trade computation time for accuracy. We show how to expand the variational approximating class by incorporating additional covariance structure and by introducing new components to form a mixture. We apply variational boosting to synthetic and real statistical models, and show that resulting posterior inferences compare favorably to existing posterior approximation algorithms in both accuracy and efficiency.}, note = {arXiv:1611.06585 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } We propose a black-box variational inference method to approximate intractable distributions with an increasingly rich approximating class. Our method, termed variational boosting, iteratively refines an existing variational approximation by solving a sequence of optimization problems, allowing the practitioner to trade computation time for accuracy. We show how to expand the variational approximating class by incorporating additional covariance structure and by introducing new components to form a mixture. We apply variational boosting to synthetic and real statistical models, and show that resulting posterior inferences compare favorably to existing posterior approximation algorithms in both accuracy and efficiency. |

Huggins, Jonathan; Adams, Ryan P; Broderick, Tamara PASS-GLM: Polynomial Approximate Sufficient Statistics for Scalable Bayesian GLM Inference Conference Advances in Neural Information Processing Systems (NeurIPS) 30, 2017, (arXiv:1709.09216 [stat.CO]). @conference{huggins2017pass, title = {PASS-GLM: Polynomial Approximate Sufficient Statistics for Scalable Bayesian GLM Inference}, author = {Jonathan Huggins and Ryan P. Adams and Tamara Broderick}, url = {http://www.cs.princeton.edu/~rpa/pubs/huggins2017pass.pdf}, year = {2017}, date = {2017-01-01}, booktitle = {Advances in Neural Information Processing Systems (NeurIPS) 30}, abstract = {Generalized linear models (GLMs) -- such as logistic regression, Poisson regression, and robust regression -- provide interpretable models for diverse data types. Probabilistic approaches, particularly Bayesian ones, allow coherent estimates of uncertainty, incorporation of prior information, and sharing of power across experiments via hierarchical models. In practice, however, the approximate Bayesian methods necessary for inference have either failed to scale to large data sets or failed to provide theoretical guarantees on the quality of inference. We propose a new approach based on constructing polynomial approximate sufficient statistics for GLMs (PASS-GLM). We demonstrate that our method admits a simple algorithm as well as trivial streaming and distributed extensions that do not compound error across computations. We provide theoretical guarantees on the quality of point (MAP) estimates, the approximate posterior, and posterior mean and uncertainty estimates. We validate our approach empirically in the case of logistic regression using a quadratic approximation and show competitive performance with stochastic gradient descent, MCMC, and the Laplace approximation in terms of speed and multiple measures of accuracy -- including on an advertising data set with 40 million data points and 20,000 covariates.}, note = {arXiv:1709.09216 [stat.CO]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Generalized linear models (GLMs) -- such as logistic regression, Poisson regression, and robust regression -- provide interpretable models for diverse data types. Probabilistic approaches, particularly Bayesian ones, allow coherent estimates of uncertainty, incorporation of prior information, and sharing of power across experiments via hierarchical models. In practice, however, the approximate Bayesian methods necessary for inference have either failed to scale to large data sets or failed to provide theoretical guarantees on the quality of inference. We propose a new approach based on constructing polynomial approximate sufficient statistics for GLMs (PASS-GLM). We demonstrate that our method admits a simple algorithm as well as trivial streaming and distributed extensions that do not compound error across computations. We provide theoretical guarantees on the quality of point (MAP) estimates, the approximate posterior, and posterior mean and uncertainty estimates. We validate our approach empirically in the case of logistic regression using a quadratic approximation and show competitive performance with stochastic gradient descent, MCMC, and the Laplace approximation in terms of speed and multiple measures of accuracy -- including on an advertising data set with 40 million data points and 20,000 covariates. |

Miller, Andrew C; Foti, Nicholas J; d'Amour, Alexander; Adams, Ryan P Reducing Reparameterization Gradient Variance Conference Advances in Neural Information Processing Systems (NIPS) 30, 2017, (arXiv:1705.07880 [stat.ML]). @conference{miller2017reducing, title = {Reducing Reparameterization Gradient Variance}, author = {Andrew C. Miller and Nicholas J. Foti and Alexander d'Amour and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/miller2017reducing.pdf}, year = {2017}, date = {2017-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 30}, abstract = {Optimization with noisy gradients has become ubiquitous in statistics and machine learning. Reparameterization gradients, or gradient estimates computed via the "reparameterization trick," represent a class of noisy gradients often used in Monte Carlo variational inference (MCVI). However, when these gradient estimators are too noisy, the optimization procedure can be slow or fail to converge. One way to reduce noise is to use more samples for the gradient estimate, but this can be computationally expensive. Instead, we view the noisy gradient as a random variable, and form an inexpensive approximation of the generating procedure for the gradient sample. This approximation has high correlation with the noisy gradient by construction, making it a useful control variate for variance reduction. We demonstrate our approach on non-conjugate multi-level hierarchical models and a Bayesian neural net where we observed gradient variance reductions of multiple orders of magnitude (20-2,000x).}, note = {arXiv:1705.07880 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Optimization with noisy gradients has become ubiquitous in statistics and machine learning. Reparameterization gradients, or gradient estimates computed via the "reparameterization trick," represent a class of noisy gradients often used in Monte Carlo variational inference (MCVI). However, when these gradient estimators are too noisy, the optimization procedure can be slow or fail to converge. One way to reduce noise is to use more samples for the gradient estimate, but this can be computationally expensive. Instead, we view the noisy gradient as a random variable, and form an inexpensive approximation of the generating procedure for the gradient sample. This approximation has high correlation with the noisy gradient by construction, making it a useful control variate for variance reduction. We demonstrate our approach on non-conjugate multi-level hierarchical models and a Bayesian neural net where we observed gradient variance reductions of multiple orders of magnitude (20-2,000x). |

## 2016 |

Grosse, Roger B; Ghahramani, Zoubin; Adams, Ryan P Sandwiching the Marginal Likelihood Using Bidirectional Monte Carlo Unpublished 2016, (arXiv:1511.02543 [stat.ML]). @unpublished{grosse2015sandwiching, title = {Sandwiching the Marginal Likelihood Using Bidirectional Monte Carlo}, author = {Roger B. Grosse and Zoubin Ghahramani and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/grosse2015sandwiching.pdf}, year = {2016}, date = {2016-01-01}, abstract = {Computing the marginal likelihood (ML) of a model requires marginalizing out all of the parameters and latent variables, a difficult high-dimensional summation or integration problem. To make matters worse, it is often hard to measure the accuracy of one’s ML estimates. We present bidirectional Monte Carlo, a technique for obtaining accurate log-ML estimates on data simulated from a model. This method obtains stochastic lower bounds on the log-ML using annealed importance sampling or sequential Monte Carlo, and obtains stochastic upper bounds by running these same algorithms in reverse starting from an exact posterior sample. The true value can be sandwiched between these two stochastic bounds with high probability. Using the ground truth log-ML estimates obtained from our method, we quantitatively evaluate a wide variety of existing ML estimators on several latent variable models: clustering, a low rank approximation, and a binary attributes model. These experiments yield insights into how to accurately estimate marginal likelihoods.}, note = {arXiv:1511.02543 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {unpublished} } Computing the marginal likelihood (ML) of a model requires marginalizing out all of the parameters and latent variables, a difficult high-dimensional summation or integration problem. To make matters worse, it is often hard to measure the accuracy of one’s ML estimates. We present bidirectional Monte Carlo, a technique for obtaining accurate log-ML estimates on data simulated from a model. This method obtains stochastic lower bounds on the log-ML using annealed importance sampling or sequential Monte Carlo, and obtains stochastic upper bounds by running these same algorithms in reverse starting from an exact posterior sample. The true value can be sandwiched between these two stochastic bounds with high probability. Using the ground truth log-ML estimates obtained from our method, we quantitatively evaluate a wide variety of existing ML estimators on several latent variable models: clustering, a low rank approximation, and a binary attributes model. These experiments yield insights into how to accurately estimate marginal likelihoods. |

Shahriari, Bobak; Swersky, Kevin; Wang, Ziyu; Adams, Ryan P; de Freitas, Nando Taking the Human Out of the Loop: A Review of Bayesian Optimization Journal Article Proceedings of the IEEE, 104 (1), pp. 148–175, 2016. @article{shahriari2016loop, title = {Taking the Human Out of the Loop: A Review of Bayesian Optimization}, author = {Bobak Shahriari and Kevin Swersky and Ziyu Wang and Ryan P. Adams and Nando de Freitas}, url = {http://www.cs.princeton.edu/~rpa/pubs/shahriari2016loop.pdf}, year = {2016}, date = {2016-01-01}, journal = {Proceedings of the IEEE}, volume = {104}, number = {1}, pages = {148--175}, abstract = {Big Data applications are typically associated with systems involving large numbers of users, massive complex software systems, and large-scale heterogeneous computing and storage architectures. The construction of such systems involves many distributed design choices. The end products (e.g., recommendation systems, medical analysis tools, real-time game engines, speech recognizers) thus involve many tunable configuration parameters. These parameters are often specified and hard-coded into the software by various developers or teams. If optimized jointly, these parameters can result in significant improvements. Bayesian optimization is a powerful tool for the joint optimization of design choices that is gaining great popularity in recent years. It promises greater automation so as to increase both product quality and human productivity. This review paper introduces Bayesian optimization, highlights some of its methodological aspects, and showcases a wide range of applications.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Big Data applications are typically associated with systems involving large numbers of users, massive complex software systems, and large-scale heterogeneous computing and storage architectures. The construction of such systems involves many distributed design choices. The end products (e.g., recommendation systems, medical analysis tools, real-time game engines, speech recognizers) thus involve many tunable configuration parameters. These parameters are often specified and hard-coded into the software by various developers or teams. If optimized jointly, these parameters can result in significant improvements. Bayesian optimization is a powerful tool for the joint optimization of design choices that is gaining great popularity in recent years. It promises greater automation so as to increase both product quality and human productivity. This review paper introduces Bayesian optimization, highlights some of its methodological aspects, and showcases a wide range of applications. |

Gómez-Bombarelli, Rafael; Aguilera-Iparraguirre, Jorge; Hirzel, Timothy D; Duvenaud, David; Maclaurin, Dougal; Blood-Forsythe, Martin A; Chae, Hyun Sik; Einzinger, Markus; Ha, Dong-Gwang; Wu, Tony; Markopolous, Georgios; Jeon, Soonok; Kang, Hosuk; Miyazaki, Hiroshi; Numata, Masaki; Kim, Sunghan; Huang, Wenliang; Hong, Seong Ik; Baldo, Marc; Adams, Ryan P; Aspuru-Guzik, Alan Design of Efficient Molecular Organic Light-Emitting Diodes by a High-Throughput Virtual Screening and Experimental Approach Journal Article Nature Materials, 15 (10), pp. 1120–1127, 2016. @article{bombarelli2016oleds, title = {Design of Efficient Molecular Organic Light-Emitting Diodes by a High-Throughput Virtual Screening and Experimental Approach}, author = {Rafael Gómez-Bombarelli and Jorge Aguilera-Iparraguirre and Timothy D. Hirzel and David Duvenaud and Dougal Maclaurin and Martin A. Blood-Forsythe and Hyun Sik Chae and Markus Einzinger and Dong-Gwang Ha and Tony Wu and Georgios Markopolous and Soonok Jeon and Hosuk Kang and Hiroshi Miyazaki and Masaki Numata and Sunghan Kim and Wenliang Huang and Seong Ik Hong and Marc Baldo and Ryan P. Adams and Alan Aspuru-Guzik}, url = {http://www.cs.princeton.edu/~rpa/pubs/bombarelli2016oleds.pdf}, year = {2016}, date = {2016-01-01}, journal = {Nature Materials}, volume = {15}, number = {10}, pages = {1120--1127}, abstract = {Virtual screening is becoming a ground-breaking tool for molecular discovery due to the exponential growth of available computer time and constant improvement of simulation and machine learning techniques. We report an integrated organic functional material design process that incorporates theoretical insight, quantum chemistry, cheminformatics, machine learning, industrial expertise, organic synthesis, molecular characterization, device fabrication and optoelectronic testing. After exploring a search space of 1.6 million molecules and screening over 400,000 of them using time-dependent density functional theory, we identified thousands of promising novel organic light-emitting diode molecules across the visible spectrum. Our team collaboratively selected the best candidates from this set. The experimentally determined external quantum efficiencies for these synthesized candidates were as large as 22%.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Virtual screening is becoming a ground-breaking tool for molecular discovery due to the exponential growth of available computer time and constant improvement of simulation and machine learning techniques. We report an integrated organic functional material design process that incorporates theoretical insight, quantum chemistry, cheminformatics, machine learning, industrial expertise, organic synthesis, molecular characterization, device fabrication and optoelectronic testing. After exploring a search space of 1.6 million molecules and screening over 400,000 of them using time-dependent density functional theory, we identified thousands of promising novel organic light-emitting diode molecules across the visible spectrum. Our team collaboratively selected the best candidates from this set. The experimentally determined external quantum efficiencies for these synthesized candidates were as large as 22%. |

Doshi-Velez, Finale; Wallace, Byrown; Adams, Ryan P Graph-Sparse LDA: A Topic Model with Structured Sparsity Conference Proceedings of the 29th AAAI Conference on Artificial Intelligence, 2016, (arXiv:1410.4510 [stat.ML]). @conference{velez2016graph, title = {Graph-Sparse LDA: A Topic Model with Structured Sparsity}, author = {Finale Doshi-Velez and Byrown Wallace and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/velez2016graph.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {Proceedings of the 29th AAAI Conference on Artificial Intelligence}, abstract = {Originally designed to model text, topic modeling has become a powerful tool for uncovering latent structure in domains including medicine, finance, and vision. The goals for the model vary depending on the application: in some cases, the discovered topics may be used for prediction or some other downstream task. In other cases, the content of the topic itself may be of intrinsic scientific interest. Unfortunately, even using modern sparse techniques, the discovered topics are often difficult to interpret due to the high dimensionality of the underlying space. To improve topic interpretability, we introduce Graph-Sparse LDA, a hierarchical topic model that leverages knowledge of relationships between words (e.g., as encoded by an ontology). In our model, topics are summarized by a few latent concept-words from the underlying graph that explain the observed words. Graph-Sparse LDA recovers sparse, interpretable summaries on two real-world biomedical datasets while matching state-of-the-art prediction performance.}, note = {arXiv:1410.4510 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Originally designed to model text, topic modeling has become a powerful tool for uncovering latent structure in domains including medicine, finance, and vision. The goals for the model vary depending on the application: in some cases, the discovered topics may be used for prediction or some other downstream task. In other cases, the content of the topic itself may be of intrinsic scientific interest. Unfortunately, even using modern sparse techniques, the discovered topics are often difficult to interpret due to the high dimensionality of the underlying space. To improve topic interpretability, we introduce Graph-Sparse LDA, a hierarchical topic model that leverages knowledge of relationships between words (e.g., as encoded by an ontology). In our model, topics are summarized by a few latent concept-words from the underlying graph that explain the observed words. Graph-Sparse LDA recovers sparse, interpretable summaries on two real-world biomedical datasets while matching state-of-the-art prediction performance. |

Hennek, Jonathan William; Kumar, Ashok A; Wiltschko, Alexander B; Patton, Matthew; Lee, Si Yi Ryan; Brugnara, Carlo; Adams, Ryan P; Whitesides, George M Diagnosis of Iron Deficiency Anemia Using Density-based Fractionation of Red Blood Cells Journal Article Lab on a Chip VOLUME = "16", pp. 3929–3939, 2016. @article{hennek2016blood, title = {Diagnosis of Iron Deficiency Anemia Using Density-based Fractionation of Red Blood Cells}, author = {Jonathan William Hennek and Ashok A. Kumar and Alexander B. Wiltschko and Matthew Patton and Si Yi Ryan Lee and Carlo Brugnara and Ryan P. Adams and George M. Whitesides}, url = {http://www.cs.princeton.edu/~rpa/pubs/hennek2016blood.pdf}, year = {2016}, date = {2016-01-01}, journal = {Lab on a Chip VOLUME = "16"}, pages = {3929--3939}, abstract = {Iron deficiency anemia (IDA) is a nutritional disorder that impacts over one billion people worldwide; it may cause permanent cognitive impairment in children, fatigue in adults, and suboptimal outcomes in pregnancy. IDA can be diagnosed by detection of red blood cells (RBCs) that are characteristically small (microcytic) and deficient in hemoglobin (hypochromic), typically by examining the results of a complete blood count performed by a hematology analyzer. These instruments are expensive, not portable, and require trained personnel; they are, therefore, unavailable in many low-resource settings. This paper describes a low-cost and rapid method to diagnose IDA using aqueous multiphase systems (AMPS)—thermodynamically stable mixtures of biocompatible polymers and salt that spontaneously form discrete layers having sharp steps in density. AMPS are preloaded into a microhematocrit tube and used with a drop of blood from a fingerstick. After only two minutes in a low-cost centrifuge, the tests (n = 152) were read by eye with a sensitivity of 84% (72–93%) and a specificity of 78% (68–86%), corresponding to an area under the curve (AUC) of 0.89. The AMPS test outperforms diagnosis by hemoglobin alone (AUC = 0.73) and is comparable to methods used in clinics like reticulocyte hemoglobin concentration (AUC = 0.91). Standard machine learning tools were used to analyze images of the resulting tests captured by a standard desktop scanner to 1) slightly improve diagnosis of IDA—sensitivity of 90% (83–96%) and a specificity of 77% (64–87%), and 2) predict several important red blood cell parameters, such as mean corpuscular hemoglobin concentration. These results suggest that the use of AMPS combined with machine learning provides an approach to developing point-of-care hematology.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Iron deficiency anemia (IDA) is a nutritional disorder that impacts over one billion people worldwide; it may cause permanent cognitive impairment in children, fatigue in adults, and suboptimal outcomes in pregnancy. IDA can be diagnosed by detection of red blood cells (RBCs) that are characteristically small (microcytic) and deficient in hemoglobin (hypochromic), typically by examining the results of a complete blood count performed by a hematology analyzer. These instruments are expensive, not portable, and require trained personnel; they are, therefore, unavailable in many low-resource settings. This paper describes a low-cost and rapid method to diagnose IDA using aqueous multiphase systems (AMPS)—thermodynamically stable mixtures of biocompatible polymers and salt that spontaneously form discrete layers having sharp steps in density. AMPS are preloaded into a microhematocrit tube and used with a drop of blood from a fingerstick. After only two minutes in a low-cost centrifuge, the tests (n = 152) were read by eye with a sensitivity of 84% (72–93%) and a specificity of 78% (68–86%), corresponding to an area under the curve (AUC) of 0.89. The AMPS test outperforms diagnosis by hemoglobin alone (AUC = 0.73) and is comparable to methods used in clinics like reticulocyte hemoglobin concentration (AUC = 0.91). Standard machine learning tools were used to analyze images of the resulting tests captured by a standard desktop scanner to 1) slightly improve diagnosis of IDA—sensitivity of 90% (83–96%) and a specificity of 77% (64–87%), and 2) predict several important red blood cell parameters, such as mean corpuscular hemoglobin concentration. These results suggest that the use of AMPS combined with machine learning provides an approach to developing point-of-care hematology. |

Angelino, Elaine; Johnson, Matthew J; Adams, Ryan P Patterns of Scalable Bayesian Inference Journal Article Foundations and Trends in Machine Learning, 9 (2-3), pp. 119–247, 2016, (arXiv:1602.05221 [stat.ML]). @article{angelino2016patterns, title = {Patterns of Scalable Bayesian Inference}, author = {Elaine Angelino and Matthew J. Johnson and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/angelino2016patterns.pdf}, year = {2016}, date = {2016-01-01}, journal = {Foundations and Trends in Machine Learning}, volume = {9}, number = {2-3}, pages = {119--247}, abstract = {Datasets are growing not just in size but in complexity, creating a demand for rich models and quantification of uncertainty. Bayesian methods are an excellent fit for this demand, but scaling Bayesian inference is a challenge. In response to this challenge, there has been considerable recent work based on varying assumptions about model structure, underlying computational resources, and the importance of asymptotic correctness. As a result, there is a zoo of ideas with few clear overarching principles. In this paper, we seek to identify unifying principles, patterns, and intuitions for scaling Bayesian inference. We review existing work on utilizing modern computing resources with both MCMC and variational approximation techniques. From this taxonomy of ideas, we characterize the general principles that have proven successful for designing scalable inference procedures and comment on the path forward.}, note = {arXiv:1602.05221 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {article} } Datasets are growing not just in size but in complexity, creating a demand for rich models and quantification of uncertainty. Bayesian methods are an excellent fit for this demand, but scaling Bayesian inference is a challenge. In response to this challenge, there has been considerable recent work based on varying assumptions about model structure, underlying computational resources, and the importance of asymptotic correctness. As a result, there is a zoo of ideas with few clear overarching principles. In this paper, we seek to identify unifying principles, patterns, and intuitions for scaling Bayesian inference. We review existing work on utilizing modern computing resources with both MCMC and variational approximation techniques. From this taxonomy of ideas, we characterize the general principles that have proven successful for designing scalable inference procedures and comment on the path forward. |

Rao, Vinayak; Adams, Ryan P; Dunson, David B Bayesian Inference for Matérn Repulsive Processes Journal Article Journal of the Royal Statistical Society: Series B (Statistical Methodology), 79 (3), pp. 877–897, 2016, (arXiv:1308.1136 [stat.ME]). @article{rao2016matern, title = {Bayesian Inference for Matérn Repulsive Processes}, author = {Vinayak Rao and Ryan P. Adams and David B. Dunson}, url = {http://www.cs.princeton.edu/~rpa/pubs/rao2016matern.pdf}, year = {2016}, date = {2016-01-01}, journal = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, volume = {79}, number = {3}, pages = {877--897}, abstract = {In many applications involving point pattern data, the Poisson process assumption is unrealistic, with the data exhibiting a more regular spread. Such a repulsion between events is exhibited by trees for example, because of competition for light and nutrients. Other examples include the locations of biological cells and cities, and the times of neuronal spikes. Given the many applications of repulsive point processes, there is a surprisingly limited literature developing flexible, realistic and interpretable models, as well as efficient inferential methods. We address this gap by developing a modelling framework around the Matérn type-III repulsive process. We consider a number of extensions of the original Matérn type-III process for both the homogeneous and inhomogeneous cases. We also derive the probability density of this generalized Matérn process. This allows us to characterize the posterior distribution of the various latent variables, and leads to a novel and efficient Markov chain Monte Carlo algorithm. We apply our ideas to datasets involving the spatial locations of trees, nerve fiber cells and Greyhound bus stations.}, note = {arXiv:1308.1136 [stat.ME]}, keywords = {}, pubstate = {published}, tppubtype = {article} } In many applications involving point pattern data, the Poisson process assumption is unrealistic, with the data exhibiting a more regular spread. Such a repulsion between events is exhibited by trees for example, because of competition for light and nutrients. Other examples include the locations of biological cells and cities, and the times of neuronal spikes. Given the many applications of repulsive point processes, there is a surprisingly limited literature developing flexible, realistic and interpretable models, as well as efficient inferential methods. We address this gap by developing a modelling framework around the Matérn type-III repulsive process. We consider a number of extensions of the original Matérn type-III process for both the homogeneous and inhomogeneous cases. We also derive the probability density of this generalized Matérn process. This allows us to characterize the posterior distribution of the various latent variables, and leads to a novel and efficient Markov chain Monte Carlo algorithm. We apply our ideas to datasets involving the spatial locations of trees, nerve fiber cells and Greyhound bus stations. |

Tarlow, Daniel; Gaunt, Alexander; Adams, Ryan P; Zemel, Richard S Factorizing Shortest Paths with Randomized Optimum Models Book Chapter Perturbations, Optimization, and Statistics, MIT Press, Cambridge, MA, 2016. @inbook{tarlow2016factorizing, title = {Factorizing Shortest Paths with Randomized Optimum Models}, author = {Daniel Tarlow and Alexander Gaunt and Ryan P. Adams and Richard S. Zemel}, url = {http://www.cs.princeton.edu/~rpa/pubs/tarlow2016factorizing.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {Perturbations, Optimization, and Statistics}, publisher = {MIT Press}, address = {Cambridge, MA}, abstract = {Randomized Optimum models (RandOMs) are probabilistic models that de- fine distributions over structured outputs by making use of structured opti- mization procedures within the model definition. This chapter reviews Ran- dOMs and develops a new application of RandOMs to the problem of factorizing shortest paths; that is, given observations of paths that users take to get from one node to another on a graph, learn edge-specific and user- specific trait vectors such that inner products of the two define user-specific edge costs, and the distribution of observed paths can be explained as users taking shortest paths according to noisy samples from their cost function.}, keywords = {}, pubstate = {published}, tppubtype = {inbook} } Randomized Optimum models (RandOMs) are probabilistic models that de- fine distributions over structured outputs by making use of structured opti- mization procedures within the model definition. This chapter reviews Ran- dOMs and develops a new application of RandOMs to the problem of factorizing shortest paths; that is, given observations of paths that users take to get from one node to another on a graph, learn edge-specific and user- specific trait vectors such that inner products of the two define user-specific edge costs, and the distribution of observed paths can be explained as users taking shortest paths according to noisy samples from their cost function. |

Duvenaud, David; Maclaurin, Dougal; Adams, Ryan P Early Stopping is Nonparametric Variational Inference Conference Proceedings of the International Conference on Artificial Intelligence and Statistics (AISTATS), 2016, (arXiv:1504.01344 [stat.ML]). @conference{duvenaud2016early, title = {Early Stopping is Nonparametric Variational Inference}, author = {David Duvenaud and Dougal Maclaurin and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/duvenaud2016early.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {Proceedings of the International Conference on Artificial Intelligence and Statistics (AISTATS)}, abstract = {We show that unconverged stochastic gradient descent can be interpreted as a procedure that samples from a nonparametric variational approximate posterior distribution. This distribution is implicitly defined as the transformation of an initial distribution by a sequence of optimization updates. By tracking the change in entropy over this sequence of transformations during optimization, we form a scalable, unbiased estimate of the variational lower bound on the log marginal likelihood. We can use this bound to optimize hyperparameters instead of using cross-validation. This Bayesian interpretation of SGD suggests improved, overfitting-resistant optimization procedures, and gives a theoretical foundation for popular tricks such as early stopping and ensembling. We investigate the properties of this marginal likelihood estimator on neural network models.}, note = {arXiv:1504.01344 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } We show that unconverged stochastic gradient descent can be interpreted as a procedure that samples from a nonparametric variational approximate posterior distribution. This distribution is implicitly defined as the transformation of an initial distribution by a sequence of optimization updates. By tracking the change in entropy over this sequence of transformations during optimization, we form a scalable, unbiased estimate of the variational lower bound on the log marginal likelihood. We can use this bound to optimize hyperparameters instead of using cross-validation. This Bayesian interpretation of SGD suggests improved, overfitting-resistant optimization procedures, and gives a theoretical foundation for popular tricks such as early stopping and ensembling. We investigate the properties of this marginal likelihood estimator on neural network models. |

Wan, Qian; Adams, Ryan P; Howe, Robert D Variability and Predictability in Tactile Sensing During Grasping Conference Proceedings of the IEEE International Conference on Robotics and Automation (ICRA), 2016. @conference{wan2016grasping, title = {Variability and Predictability in Tactile Sensing During Grasping}, author = {Qian Wan and Ryan P. Adams and Robert D. Howe}, url = {http://www.cs.princeton.edu/~rpa/pubs/qan2016grasping.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {Proceedings of the IEEE International Conference on Robotics and Automation (ICRA)}, abstract = {Robotic manipulation in unstructured environments requires grasping a wide range of objects. Tactile sensing is presumed to provide essential information in this context, but there has been little work examining the tactile sensor signals produced during realistic manipulation tasks. This paper presents tactile sensor data from grasping a generic object in thousands of trials. Position error between the hand and object was varied to model the uncertainty in real-world grasping, and a grasp outcome prediction was done using only tactile sensors. Results show that tactile signals are highly variable despite good repeatability in grasping conditions. The observed variability appears to be intrinsic to the grasping process, due to the mechanical coupling between fingers as they contact the object in parallel, as well as numerous factors such as frictional effects and inaccuracies in the robot hand. Using a simple machine learning algorithm, grasp outcome prediction based purely on tactile sensors is not reliable enough for real-world responsibilities. These results have implications for improved tactile sensor system and controller design, as well as signal processing and machine learning methods.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Robotic manipulation in unstructured environments requires grasping a wide range of objects. Tactile sensing is presumed to provide essential information in this context, but there has been little work examining the tactile sensor signals produced during realistic manipulation tasks. This paper presents tactile sensor data from grasping a generic object in thousands of trials. Position error between the hand and object was varied to model the uncertainty in real-world grasping, and a grasp outcome prediction was done using only tactile sensors. Results show that tactile signals are highly variable despite good repeatability in grasping conditions. The observed variability appears to be intrinsic to the grasping process, due to the mechanical coupling between fingers as they contact the object in parallel, as well as numerous factors such as frictional effects and inaccuracies in the robot hand. Using a simple machine learning algorithm, grasp outcome prediction based purely on tactile sensors is not reliable enough for real-world responsibilities. These results have implications for improved tactile sensor system and controller design, as well as signal processing and machine learning methods. |

Saeedi, Ardavan; Hoffman, Matthew D; Johnson, Matthew J; Adams, Ryan P The Segmented iHMM: A Simple, Efficient Hierarchical Infinite HMM Conference Proceedings of the 33rd International Conference on Machine Learning (ICML), 2016, (arXiv:1602.06349 [stat.ML]). @conference{saeedi2016segmented, title = {The Segmented iHMM: A Simple, Efficient Hierarchical Infinite HMM}, author = {Ardavan Saeedi and Matthew D. Hoffman and Matthew J. Johnson and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/saeedi2016segmented.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {Proceedings of the 33rd International Conference on Machine Learning (ICML)}, abstract = {We propose the segmented iHMM (siHMM), a hierarchical infinite hidden Markov model (iHMM) that supports a simple, efficient inference scheme. The siHMM is well suited to segmentation problems, where the goal is to identify points at which a time series transitions from one relatively stable regime to a new regime. Conventional iHMMs often struggle with such problems, since they have no mechanism for distinguishing between high- and low-level dynamics. Hierarchical HMMs (HHMMs) can do better, but they require much more complex and expensive inference algorithms. The siHMM retains the simplicity and efficiency of the iHMM, but outperforms it on a variety of segmentation problems, achieving performance that matches or exceeds that of a more complicated HHMM.}, note = {arXiv:1602.06349 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } We propose the segmented iHMM (siHMM), a hierarchical infinite hidden Markov model (iHMM) that supports a simple, efficient inference scheme. The siHMM is well suited to segmentation problems, where the goal is to identify points at which a time series transitions from one relatively stable regime to a new regime. Conventional iHMMs often struggle with such problems, since they have no mechanism for distinguishing between high- and low-level dynamics. Hierarchical HMMs (HHMMs) can do better, but they require much more complex and expensive inference algorithms. The siHMM retains the simplicity and efficiency of the iHMM, but outperforms it on a variety of segmentation problems, achieving performance that matches or exceeds that of a more complicated HHMM. |

Hernández-Lobato, Daniel; Hernández-Lobato, José Miguel; Shah, Amar; Adams, Ryan P Predictive Entropy Search for Multi-Objective Bayesian Optimization Conference Proceedings of the 33rd International Conference on Machine Learning (ICML), 2016, (arXiv:1511.05467 [stat.ML]). @conference{lobato2016pesc, title = {Predictive Entropy Search for Multi-Objective Bayesian Optimization}, author = {Daniel Hernández-Lobato and José Miguel Hernández-Lobato and Amar Shah and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/lobato2016pesc.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {Proceedings of the 33rd International Conference on Machine Learning (ICML)}, abstract = {We present PESMO, a Bayesian method for identifying the Pareto set of multi-objective optimization problems, when the functions are expensive to evaluate. The central idea of PESMO is to choose evaluation points so as to maximally reduce the entropy of the posterior distribution over the Pareto set. Critically, the PESMO multi-objective acquisition function can be decomposed as a sum of objective-specific acquisition functions, which enables the algorithm to be used in decoupled scenarios in which the objectives can be evaluated separately and perhaps with different costs. This decoupling capability also makes it possible to identify difficult objectives that require more evaluations. PESMO also offers gains in efficiency, as its cost scales linearly with the number of objectives, in comparison to the exponential cost of other methods. We compare PESMO with other related methods for multi-objective Bayesian optimization on synthetic and real-world problems. The results show that PESMO produces better recommendations with a smaller number of evaluations of the objectives, and that a decoupled evaluation can lead to improvements in performance, particularly when the number of objectives is large.}, note = {arXiv:1511.05467 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } We present PESMO, a Bayesian method for identifying the Pareto set of multi-objective optimization problems, when the functions are expensive to evaluate. The central idea of PESMO is to choose evaluation points so as to maximally reduce the entropy of the posterior distribution over the Pareto set. Critically, the PESMO multi-objective acquisition function can be decomposed as a sum of objective-specific acquisition functions, which enables the algorithm to be used in decoupled scenarios in which the objectives can be evaluated separately and perhaps with different costs. This decoupling capability also makes it possible to identify difficult objectives that require more evaluations. PESMO also offers gains in efficiency, as its cost scales linearly with the number of objectives, in comparison to the exponential cost of other methods. We compare PESMO with other related methods for multi-objective Bayesian optimization on synthetic and real-world problems. The results show that PESMO produces better recommendations with a smaller number of evaluations of the objectives, and that a decoupled evaluation can lead to improvements in performance, particularly when the number of objectives is large. |

Johnson, Matthew J; Duvenaud, David; Wiltschko, Alexander B; Datta, Sandeep Robert; Adams, Ryan P Composing Graphical Models with Neural Networks for Structured Representations and Fast Inference Conference Advances in Neural Information Processing Systems (NIPS) 29, 2016, (arXiv:1603.06277 [stat.ML]). @conference{johnson2016svae, title = {Composing Graphical Models with Neural Networks for Structured Representations and Fast Inference}, author = {Matthew J. Johnson and David Duvenaud and Alexander B. Wiltschko and Sandeep Robert Datta and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/johnson2016svae.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 29}, abstract = {We propose a general modeling and inference framework that composes probabilistic graphical models with deep learning methods and combines their respective strengths. Our model family augments graphical structure in latent variables with neural network observation models. For inference, we extend variational autoencoders to use graphical model approximating distributions with recognition networks that output conjugate potentials. All components of these models are learned simultaneously with a single objective, giving a scalable algorithm that leverages stochastic variational inference, natural gradients, graphical model message passing, and the reparameterization trick. We illustrate this framework with several example models and an application to mouse behavioral phenotyping.}, note = {arXiv:1603.06277 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } We propose a general modeling and inference framework that composes probabilistic graphical models with deep learning methods and combines their respective strengths. Our model family augments graphical structure in latent variables with neural network observation models. For inference, we extend variational autoencoders to use graphical model approximating distributions with recognition networks that output conjugate potentials. All components of these models are learned simultaneously with a single objective, giving a scalable algorithm that leverages stochastic variational inference, natural gradients, graphical model message passing, and the reparameterization trick. We illustrate this framework with several example models and an application to mouse behavioral phenotyping. |

Linderman, Scott W; Adams, Ryan P; Pillow, Jonathan W Bayesian Latent Structure Discovery from Multi-neuron Recordings Conference Advances in Neural Information Processing Systems (NIPS) 29, 2016, (arXiv:1610.08465 [stat.ML]). @conference{linderman2016multi, title = {Bayesian Latent Structure Discovery from Multi-neuron Recordings}, author = {Scott W. Linderman and Ryan P. Adams and Jonathan W. Pillow}, url = {http://www.cs.princeton.edu/~rpa/pubs/linderman2016multi.pdf}, year = {2016}, date = {2016-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 29}, abstract = {Neural circuits contain heterogeneous groups of neurons that differ in type, location, connectivity, and basic response properties. However, traditional methods for dimensionality reduction and clustering are ill-suited to recovering the structure underlying the organization of neural circuits. In particular, they do not take advantage of the rich temporal dependencies in multi-neuron recordings and fail to account for the noise in neural spike trains. Here we describe new tools for inferring latent structure from simultaneously recorded spike train data using a hierarchical extension of a multi-neuron point process model commonly known as the generalized linear model (GLM). Our approach combines the GLM with flexible graph-theoretic priors governing the relationship between latent features and neural connectivity patterns. Fully Bayesian inference via Pólya-gamma augmentation of the resulting model allows us to classify neurons and infer latent dimensions of circuit organization from correlated spike trains. We demonstrate the effectiveness of our method with applications to synthetic data and multi-neuron recordings in primate retina, revealing latent patterns of neural types and locations from spike trains alone.}, note = {arXiv:1610.08465 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Neural circuits contain heterogeneous groups of neurons that differ in type, location, connectivity, and basic response properties. However, traditional methods for dimensionality reduction and clustering are ill-suited to recovering the structure underlying the organization of neural circuits. In particular, they do not take advantage of the rich temporal dependencies in multi-neuron recordings and fail to account for the noise in neural spike trains. Here we describe new tools for inferring latent structure from simultaneously recorded spike train data using a hierarchical extension of a multi-neuron point process model commonly known as the generalized linear model (GLM). Our approach combines the GLM with flexible graph-theoretic priors governing the relationship between latent features and neural connectivity patterns. Fully Bayesian inference via Pólya-gamma augmentation of the resulting model allows us to classify neurons and infer latent dimensions of circuit organization from correlated spike trains. We demonstrate the effectiveness of our method with applications to synthetic data and multi-neuron recordings in primate retina, revealing latent patterns of neural types and locations from spike trains alone. |

## 2015 |

Lehman, Li-Wei; Johnson, Matthew J; Nemati, Shamim; Adams, Ryan P; Mark, Roger Bayesian Nonparametric Learning of Switching Dynamics in Cohort Physiological Time Series: Application in Critical Care Patient Monitoring Book Chapter Advanced State Space Methods for Neural and Clinical Data, Cambridge University Press, Cambridge, UK, 2015. @inbook{lehman2015switching, title = {Bayesian Nonparametric Learning of Switching Dynamics in Cohort Physiological Time Series: Application in Critical Care Patient Monitoring}, author = {Li-Wei Lehman and Matthew J. Johnson and Shamim Nemati and Ryan P. Adams and Roger Mark}, year = {2015}, date = {2015-01-01}, booktitle = {Advanced State Space Methods for Neural and Clinical Data}, publisher = {Cambridge University Press}, address = {Cambridge, UK}, abstract = {The time series of vital signs, such as heart rate (HR) and blood pressure (BP), can exhibit complex dynamic behaviors as a result of internally and externally induced changes in the state of the underlying control systems (Peng et al. 1995; Ivanov et al. 1999; Costa et al. 2002). For instance, time series of BP can exhibit oscillations on the order of seconds (e.g., due to the variations in sympathovagal balance), to minutes (e.g., as a consequence of fever, blood loss, or behavioral factors), to hours (e.g., due to humoral variations, sleep-wake cycle, or circadian effects) (Mancia 2012; Parati et al. 2013). A question of interest is whether “similar” dynamical patterns can be automatically identified across a heterogeneous patient cohort, and be used for prognosis of patients' health and progress. In this work, we present a Bayesian nonparametric switching Markov processes framework with conditionally linear dynamics to learn phenotypic dynamic behaviors from vital sign time series of a patient cohort, and use the learned dynamics to characterize the changing physiological states of patients for critical-care bed-side monitoring (Lehman et al. 2012, 2013, 2014a; Nemati 2012). We assume that although the underlying dynamical system may be nonlinear and nonstationary and the stochastic noise components can be non-Gaussian, the dynamics can be approximated by a collection of linear dynamical systems (Nemati 2012; Nemati et al. 2012). Each such linear “dynamic” (or mode) is a time-dependent rule that describes how the future state of the system evolves from its current state, centered around a given system equilibrium point. Therefore, an ideal algorithm would be able to identify time series segments that follow a “similar” dynamic, and would switch to a different mode upon a change in the state of the underlying system. We explore several variants of the Bayesian nonparametric approach to discovery of shared dynamics among patients via switching Markov processes: hierarchical Dirichlet process (HDP) autoregressive hidden Markov model (HDP-AR-HMM) (Teh et al. 2006; Fox et al. 2008), an explicit-duration HDP-based hidden semi-Markov model (HDP-AR-HSMM) (Johnson & Willsky 2013a), and the beta process autoregressive HMM (BP-AR-HMM) (Fox 2009; Fox et al. 2009, 2014).}, keywords = {}, pubstate = {published}, tppubtype = {inbook} } The time series of vital signs, such as heart rate (HR) and blood pressure (BP), can exhibit complex dynamic behaviors as a result of internally and externally induced changes in the state of the underlying control systems (Peng et al. 1995; Ivanov et al. 1999; Costa et al. 2002). For instance, time series of BP can exhibit oscillations on the order of seconds (e.g., due to the variations in sympathovagal balance), to minutes (e.g., as a consequence of fever, blood loss, or behavioral factors), to hours (e.g., due to humoral variations, sleep-wake cycle, or circadian effects) (Mancia 2012; Parati et al. 2013). A question of interest is whether “similar” dynamical patterns can be automatically identified across a heterogeneous patient cohort, and be used for prognosis of patients' health and progress. In this work, we present a Bayesian nonparametric switching Markov processes framework with conditionally linear dynamics to learn phenotypic dynamic behaviors from vital sign time series of a patient cohort, and use the learned dynamics to characterize the changing physiological states of patients for critical-care bed-side monitoring (Lehman et al. 2012, 2013, 2014a; Nemati 2012). We assume that although the underlying dynamical system may be nonlinear and nonstationary and the stochastic noise components can be non-Gaussian, the dynamics can be approximated by a collection of linear dynamical systems (Nemati 2012; Nemati et al. 2012). Each such linear “dynamic” (or mode) is a time-dependent rule that describes how the future state of the system evolves from its current state, centered around a given system equilibrium point. Therefore, an ideal algorithm would be able to identify time series segments that follow a “similar” dynamic, and would switch to a different mode upon a change in the state of the underlying system. We explore several variants of the Bayesian nonparametric approach to discovery of shared dynamics among patients via switching Markov processes: hierarchical Dirichlet process (HDP) autoregressive hidden Markov model (HDP-AR-HMM) (Teh et al. 2006; Fox et al. 2008), an explicit-duration HDP-based hidden semi-Markov model (HDP-AR-HSMM) (Johnson & Willsky 2013a), and the beta process autoregressive HMM (BP-AR-HMM) (Fox 2009; Fox et al. 2009, 2014). |

Maclaurin, Dougal; Duvenaud, David; Adams, Ryan P Gradient-based Hyperparameter Optimization through Reversible Learning Conference Proceedings of the 32nd International Conference on Machine Learning (ICML), 2015, (arXiv:1502.03492 [stat.ML]). @conference{maclaurin2015reversible, title = {Gradient-based Hyperparameter Optimization through Reversible Learning}, author = {Dougal Maclaurin and David Duvenaud and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/maclaurin2015reversible.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML)}, abstract = {Tuning hyperparameters of learning algorithms is hard because gradients are usually unavailable. We compute exact gradients of cross-validation performance with respect to all hyperparameters by chaining derivatives backwards through the entire training procedure. These gradients allow us to optimize thousands of hyperparameters, including step-size and momentum schedules, weight initialization distributions, richly parameterized regularization schemes, and neural network architectures. We compute hyperparameter gradients by exactly reversing the dynamics of stochastic gradient descent with momentum.}, note = {arXiv:1502.03492 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Tuning hyperparameters of learning algorithms is hard because gradients are usually unavailable. We compute exact gradients of cross-validation performance with respect to all hyperparameters by chaining derivatives backwards through the entire training procedure. These gradients allow us to optimize thousands of hyperparameters, including step-size and momentum schedules, weight initialization distributions, richly parameterized regularization schemes, and neural network architectures. We compute hyperparameter gradients by exactly reversing the dynamics of stochastic gradient descent with momentum. |

Snoek, Jasper; Rippel, Oren; Swersky, Kevin; Kiros, Ryan; Satish, Nadathur; Sundaram, Narayanan; Patwary, Md. Mostofa Ali; Prabhat, ; Adams, Ryan P Scalable Bayesian Optimization Using Deep Neural Networks Conference Proceedings of the 32nd International Conference on Machine Learning (ICML), 2015, (arXiv:1502.05700 [stat.ML]). @conference{snoek2015scalable, title = {Scalable Bayesian Optimization Using Deep Neural Networks}, author = {Jasper Snoek and Oren Rippel and Kevin Swersky and Ryan Kiros and Nadathur Satish and Narayanan Sundaram and Md. Mostofa Ali Patwary and Prabhat and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/snoek2015scalable.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML)}, abstract = {Bayesian optimization is an effective methodology for the global optimization of functions with expensive evaluations. It relies on querying a distribution over functions defined by a relatively cheap surrogate model. An accurate model for this distribution over functions is critical to the effectiveness of the approach, and is typically fit using Gaussian processes (GPs). However, since GPs scale cubically with the number of observations, it has been challenging to handle objectives whose optimization requires many evaluations, and as such, massively parallelizing the optimization. In this work, we explore the use of neural networks as an alternative to GPs to model distributions over functions. We show that performing adaptive basis function regression with a neural network as the parametric form performs competitively with state-of-the-art GP-based approaches, but scales linearly with the number of data rather than cubically. This allows us to achieve a previously intractable degree of parallelism, which we apply to large scale hyperparameter optimization, rapidly finding competitive models on benchmark object recognition tasks using convolutional networks, and image caption generation using neural language models.}, note = {arXiv:1502.05700 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Bayesian optimization is an effective methodology for the global optimization of functions with expensive evaluations. It relies on querying a distribution over functions defined by a relatively cheap surrogate model. An accurate model for this distribution over functions is critical to the effectiveness of the approach, and is typically fit using Gaussian processes (GPs). However, since GPs scale cubically with the number of observations, it has been challenging to handle objectives whose optimization requires many evaluations, and as such, massively parallelizing the optimization. In this work, we explore the use of neural networks as an alternative to GPs to model distributions over functions. We show that performing adaptive basis function regression with a neural network as the parametric form performs competitively with state-of-the-art GP-based approaches, but scales linearly with the number of data rather than cubically. This allows us to achieve a previously intractable degree of parallelism, which we apply to large scale hyperparameter optimization, rapidly finding competitive models on benchmark object recognition tasks using convolutional networks, and image caption generation using neural language models. |

Hernández-Lobato, José Miguel; Adams, Ryan P Probabilistic Backpropagation for Scalable Learning of Bayesian Neural Networks Conference Proceedings of the 32nd International Conference on Machine Learning (ICML), 2015, (arXiv:1502.05336 [stat.ML]). @conference{lobato2015probabilistic, title = {Probabilistic Backpropagation for Scalable Learning of Bayesian Neural Networks}, author = {José Miguel Hernández-Lobato and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/lobato2015probabilistic.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML)}, abstract = {Large multilayer neural networks trained with backpropagation have recently achieved state-of-the-art results in a wide range of problems. However, using backprop for neural net learning still has some disadvantages, e.g., having to tune a large number of hyperparameters to the data, lack of calibrated probabilistic predictions, and a tendency to overfit the training data. In principle, the Bayesian approach to learning neural networks does not have these problems. However, existing Bayesian techniques lack scalability to large dataset and network sizes. In this work we present a novel scalable method for learning Bayesian neural networks, called probabilistic backpropagation (PBP). Similar to classical backpropagation, PBP works by computing a forward propagation of probabilities through the network and then doing a backward computation of gradients. A series of experiments on ten real-world datasets show that PBP is significantly faster than other techniques, while offering competitive predictive abilities. Our experiments also show that PBP provides accurate estimates of the posterior variance on the network weights.}, note = {arXiv:1502.05336 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Large multilayer neural networks trained with backpropagation have recently achieved state-of-the-art results in a wide range of problems. However, using backprop for neural net learning still has some disadvantages, e.g., having to tune a large number of hyperparameters to the data, lack of calibrated probabilistic predictions, and a tendency to overfit the training data. In principle, the Bayesian approach to learning neural networks does not have these problems. However, existing Bayesian techniques lack scalability to large dataset and network sizes. In this work we present a novel scalable method for learning Bayesian neural networks, called probabilistic backpropagation (PBP). Similar to classical backpropagation, PBP works by computing a forward propagation of probabilities through the network and then doing a backward computation of gradients. A series of experiments on ten real-world datasets show that PBP is significantly faster than other techniques, while offering competitive predictive abilities. Our experiments also show that PBP provides accurate estimates of the posterior variance on the network weights. |

Hernández-Lobato, José Miguel; Gelbart, Michael A; Hoffman, Matthew W; Adams, Ryan P; Ghahramani, Zoubin Predictive Entropy Search for Bayesian Optimization with Unknown Constraints Conference Proceedings of the 32nd International Conference on Machine Learning (ICML), 2015, (arXiv:1502.05312 [stat.ML]). @conference{lobato2015predictive, title = {Predictive Entropy Search for Bayesian Optimization with Unknown Constraints}, author = {José Miguel Hernández-Lobato and Michael A. Gelbart and Matthew W. Hoffman and Ryan P. Adams and Zoubin Ghahramani}, url = {http://www.cs.princeton.edu/~rpa/pubs/lobato2015predictive.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML)}, abstract = {Unknown constraints arise in many types of expensive black-box optimization problems. Several methods have been proposed recently for performing Bayesian optimization with constraints, based on the expected improvement (EI) heuristic. However, EI can lead to pathologies when used with constraints. For example, in the case of decoupled constraints—i.e., when one can independently evaluate the objective or the constraints—EI can encounter a pathology that prevents exploration. Additionally, computing EI requires a current best solution, which may not exist if none of the data collected so far satisfy the constraints. By contrast, information based approaches do not suffer from these failure modes. In this paper, we present a new information-based method called Predictive Entropy Search with Constraints (PESC). We analyze the performance of PESC and show that it compares favorably to EI-based approaches on synthetic and benchmark problems, as well as several real-world examples. We demonstrate that PESC is an effective algorithm that provides a promising direction towards a unified solution for constrained Bayesian optimization.}, note = {arXiv:1502.05312 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Unknown constraints arise in many types of expensive black-box optimization problems. Several methods have been proposed recently for performing Bayesian optimization with constraints, based on the expected improvement (EI) heuristic. However, EI can lead to pathologies when used with constraints. For example, in the case of decoupled constraints—i.e., when one can independently evaluate the objective or the constraints—EI can encounter a pathology that prevents exploration. Additionally, computing EI requires a current best solution, which may not exist if none of the data collected so far satisfy the constraints. By contrast, information based approaches do not suffer from these failure modes. In this paper, we present a new information-based method called Predictive Entropy Search with Constraints (PESC). We analyze the performance of PESC and show that it compares favorably to EI-based approaches on synthetic and benchmark problems, as well as several real-world examples. We demonstrate that PESC is an effective algorithm that provides a promising direction towards a unified solution for constrained Bayesian optimization. |

Regier, Jeffrey; Miller, Andrew C; McAuliffe, Jon; Adams, Ryan P; Hoffman, Matthew D; Lang, Dustin; Schlegel, David; Prabhat, Celeste: Variational Inference for a Generative Model of Astronomical Images Conference Proceedings of the 32nd International Conference on Machine Learning (ICML), 2015, (arXiv:1506.01351 [astro-ph.IM]). @conference{regier2015celeste, title = {Celeste: Variational Inference for a Generative Model of Astronomical Images}, author = {Jeffrey Regier and Andrew C. Miller and Jon McAuliffe and Ryan P. Adams and Matthew D. Hoffman and Dustin Lang and David Schlegel and Prabhat}, url = {http://www.cs.princeton.edu/~rpa/pubs/regier2015celeste.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML)}, abstract = {We present a new, fully generative model of optical telescope image sets, along with a variational procedure for inference. Each pixel intensity is treated as a Poisson random variable, with a rate parameter dependent on latent properties of stars and galaxies. Key latent properties are themselves random, with scientific prior distributions constructed from large ancillary data sets. We check our approach on synthetic images. We also run it on images from a major sky survey, where it exceeds the performance of the current state-of-the-art method for locating celestial bodies and measuring their colors.}, note = {arXiv:1506.01351 [astro-ph.IM]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } We present a new, fully generative model of optical telescope image sets, along with a variational procedure for inference. Each pixel intensity is treated as a Poisson random variable, with a rate parameter dependent on latent properties of stars and galaxies. Key latent properties are themselves random, with scientific prior distributions constructed from large ancillary data sets. We check our approach on synthetic images. We also run it on images from a major sky survey, where it exceeds the performance of the current state-of-the-art method for locating celestial bodies and measuring their colors. |

Rippel, Oren; Snoek, Jasper; Adams, Ryan P Spectral Representations for Convolutional Neural Networks Conference Advances in Neural Information Processing Systems (NIPS) 28, 2015, (arXiv:1506.03767 [stat.ML]). @conference{rippel2015spectral, title = {Spectral Representations for Convolutional Neural Networks}, author = {Oren Rippel and Jasper Snoek and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/rippel2015spectral.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 28}, abstract = {Discrete Fourier transforms provide a significant speedup in the computation of convolutions in deep learning. In this work, we demonstrate that, beyond its advantages for efficient computation, the spectral domain also provides a powerful representation in which to model and train convolutional neural networks (CNNs). We employ spectral representations to introduce a number of innovations to CNN design. First, we propose spectral pooling, which performs dimensionality reduction by truncating the representation in the frequency domain. This approach preserves considerably more information per parameter than other pooling strategies and enables flexibility in the choice of pooling output dimensionality. This representation also enables a new form of stochastic regularization by randomized modification of resolution. We show that these methods achieve competitive results on classification and approximation tasks, without using any dropout or max-pooling. Finally, we demonstrate the effectiveness of complex-coefficient spectral parameterization of convolutional filters. While this leaves the underlying model unchanged, it results in a representation that greatly facilitates optimization. We observe on a variety of popular CNN configurations that this leads to significantly faster convergence during training.}, note = {arXiv:1506.03767 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Discrete Fourier transforms provide a significant speedup in the computation of convolutions in deep learning. In this work, we demonstrate that, beyond its advantages for efficient computation, the spectral domain also provides a powerful representation in which to model and train convolutional neural networks (CNNs). We employ spectral representations to introduce a number of innovations to CNN design. First, we propose spectral pooling, which performs dimensionality reduction by truncating the representation in the frequency domain. This approach preserves considerably more information per parameter than other pooling strategies and enables flexibility in the choice of pooling output dimensionality. This representation also enables a new form of stochastic regularization by randomized modification of resolution. We show that these methods achieve competitive results on classification and approximation tasks, without using any dropout or max-pooling. Finally, we demonstrate the effectiveness of complex-coefficient spectral parameterization of convolutional filters. While this leaves the underlying model unchanged, it results in a representation that greatly facilitates optimization. We observe on a variety of popular CNN configurations that this leads to significantly faster convergence during training. |

Wiltschko, Alexander B; Johnson, Matthew J; Iurilli, Giuliano; Peterson, Ralph E; Katon, Jesse M; Pashkovski, Stan L; Abraira, Victoria E; Adams, Ryan P; Datta, Sandeep Robert Mapping Sub-Second Structure in Mouse Behavior Journal Article Neuron, 88 (6), pp. 1121–1135, 2015. @article{wiltschko2015behavior, title = {Mapping Sub-Second Structure in Mouse Behavior}, author = {Alexander B. Wiltschko and Matthew J. Johnson and Giuliano Iurilli and Ralph E. Peterson and Jesse M. Katon and Stan L. Pashkovski and Victoria E. Abraira and Ryan P. Adams and Sandeep Robert Datta}, url = {http://www.cs.princeton.edu/~rpa/pubs/wiltschko2015behavior.pdf}, year = {2015}, date = {2015-01-01}, journal = {Neuron}, volume = {88}, number = {6}, pages = {1121--1135}, abstract = {Complex animal behaviors are likely built from simpler modules, but their systematic identification in mammals remains a significant challenge. Here we use depth imaging to show that 3D mouse pose dynamics are structured at the sub-second timescale. Computational modeling of these fast dynamics effectively describes mouse behavior as a series of reused and stereotyped modules with defined transition probabilities. We demonstrate this combined 3D imaging and machine learning method can be used to unmask potential strategies employed by the brain to adapt to the environment, to capture both predicted and previously hidden phenotypes caused by genetic or neural manipulations, and to systematically expose the global structure of behavior within an experiment. This work reveals that mouse body language is built from identifiable components and is organized in a predictable fashion; deciphering this language establishes an objective framework for characterizing the influence of environmental cues, genes and neural activity on behavior.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Complex animal behaviors are likely built from simpler modules, but their systematic identification in mammals remains a significant challenge. Here we use depth imaging to show that 3D mouse pose dynamics are structured at the sub-second timescale. Computational modeling of these fast dynamics effectively describes mouse behavior as a series of reused and stereotyped modules with defined transition probabilities. We demonstrate this combined 3D imaging and machine learning method can be used to unmask potential strategies employed by the brain to adapt to the environment, to capture both predicted and previously hidden phenotypes caused by genetic or neural manipulations, and to systematically expose the global structure of behavior within an experiment. This work reveals that mouse body language is built from identifiable components and is organized in a predictable fashion; deciphering this language establishes an objective framework for characterizing the influence of environmental cues, genes and neural activity on behavior. |

Miller, Andrew C; Wu, Albert; Regier, Jeffrey; McAuliffe, Jon; Lang, Dustin; Prabhat, ; Schlegel, David; Adams, Ryan P A Gaussian Process Model of Quasar Spectral Energy Distributions Conference Advances in Neural Information Processing Systems (NIPS) 28, 2015. @conference{miller2015quasars, title = {A Gaussian Process Model of Quasar Spectral Energy Distributions}, author = {Andrew C. Miller and Albert Wu and Jeffrey Regier and Jon McAuliffe and Dustin Lang and Prabhat and David Schlegel and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/miller2015quasars.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 28}, abstract = {We propose a method for combining two sources of astronomical data, spectroscopy and photometry, that carry information about sources of light (e.g., stars, galaxies, and quasars) at extremely different spectral resolutions. Our model treats the spectral energy distribution (SED) of the radiation from a source as a latent variable that jointly explains both photometric and spectroscopic observations. We place a flexible, nonparametric prior over the SED of a light source that admits a physically interpretable decomposition, and allows us to tractably perform inference. We use our model to predict the distribution of the redshift of a quasar from five-band (low spectral resolution) photometric data, the so called photo-z'' problem. Our method shows that tools from machine learning and Bayesian statistics allow us to leverage multiple resolutions of information to make accurate predictions with well-characterized uncertainties.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } We propose a method for combining two sources of astronomical data, spectroscopy and photometry, that carry information about sources of light (e.g., stars, galaxies, and quasars) at extremely different spectral resolutions. Our model treats the spectral energy distribution (SED) of the radiation from a source as a latent variable that jointly explains both photometric and spectroscopic observations. We place a flexible, nonparametric prior over the SED of a light source that admits a physically interpretable decomposition, and allows us to tractably perform inference. We use our model to predict the distribution of the redshift of a quasar from five-band (low spectral resolution) photometric data, the so called photo-z'' problem. Our method shows that tools from machine learning and Bayesian statistics allow us to leverage multiple resolutions of information to make accurate predictions with well-characterized uncertainties. |

Linderman, Scott W; Johnson, Matthew J; Adams, Ryan P Dependent Multinomial Models Made Easy: Stick-Breaking with the Polya-gamma Augmentation Conference Advances in Neural Information Processing Systems (NIPS) 28, 2015, (arXiv:1506.05843 [stat.ML]). @conference{linderman2015multinomial, title = {Dependent Multinomial Models Made Easy: Stick-Breaking with the Polya-gamma Augmentation}, author = {Scott W. Linderman and Matthew J. Johnson and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/linderman2015multinomial.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 28}, abstract = {Many practical modeling problems involve discrete data that are best represented as draws from multinomial or categorical distributions. For example, nucleotides in a DNA sequence, children's names in a given state and year, and text documents are all commonly modeled with multinomial distributions. In all of these cases, we expect some form of dependency between the draws: the nucleotide at one position in the DNA strand may depend on the preceding nucleotides, children's names are highly correlated from year to year, and topics in text may be correlated and dynamic. These dependencies are not naturally captured by the typical Dirichlet-multinomial formulation. Here, we leverage a logistic stick-breaking representation and recent innovations in Polya-gamma augmentation to reformulate the multinomial distribution in terms of latent variables with jointly Gaussian likelihoods, enabling us to take advantage of a host of Bayesian inference techniques for Gaussian models with minimal overhead.}, note = {arXiv:1506.05843 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Many practical modeling problems involve discrete data that are best represented as draws from multinomial or categorical distributions. For example, nucleotides in a DNA sequence, children's names in a given state and year, and text documents are all commonly modeled with multinomial distributions. In all of these cases, we expect some form of dependency between the draws: the nucleotide at one position in the DNA strand may depend on the preceding nucleotides, children's names are highly correlated from year to year, and topics in text may be correlated and dynamic. These dependencies are not naturally captured by the typical Dirichlet-multinomial formulation. Here, we leverage a logistic stick-breaking representation and recent innovations in Polya-gamma augmentation to reformulate the multinomial distribution in terms of latent variables with jointly Gaussian likelihoods, enabling us to take advantage of a host of Bayesian inference techniques for Gaussian models with minimal overhead. |

Duvenaud, David; Maclaurin, Dougal; Aguilera-Iparraguirre, Jorge; Gómez-Bombarelli, Rafael; Hirzel, Timothy D; Aspuru-Guzik, Alan; Adams, Ryan P Convolutional Networks on Graphs for Learning Molecular Fingerprints Conference Advances in Neural Information Processing Systems (NIPS) 28, 2015, (arXiv:1509.09292 [stat.ML]). @conference{duvenaud2015fingerprints, title = {Convolutional Networks on Graphs for Learning Molecular Fingerprints}, author = {David Duvenaud and Dougal Maclaurin and Jorge Aguilera-Iparraguirre and Rafael Gómez-Bombarelli and Timothy D. Hirzel and Alan Aspuru-Guzik and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/duvenaud2015fingerprints.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 28}, abstract = {We introduce a convolutional neural network that operates directly on graphs. These networks allow end-to-end learning of prediction pipelines whose inputs are graphs of arbitrary size and shape. The architecture we present generalizes standard molecular feature extraction methods based on circular fingerprints. We show that these data-driven features are more interpretable, and have better predictive performance on a variety of tasks.}, note = {arXiv:1509.09292 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } We introduce a convolutional neural network that operates directly on graphs. These networks allow end-to-end learning of prediction pipelines whose inputs are graphs of arbitrary size and shape. The architecture we present generalizes standard molecular feature extraction methods based on circular fingerprints. We show that these data-driven features are more interpretable, and have better predictive performance on a variety of tasks. |

Linderman, Scott W; Adams, Ryan P Scalable Bayesian Inference for Excitatory Point Process Networks Unpublished 2015, (arXiv:1507.03228 [stat.ML]). @unpublished{linderman2015scalable, title = {Scalable Bayesian Inference for Excitatory Point Process Networks}, author = {Scott W. Linderman and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/linderman2015scalable.pdf}, year = {2015}, date = {2015-01-01}, abstract = {Networks capture our intuition about relationships in the world. They describe the friendships between Facebook users, interactions in financial markets, and synapses connecting neurons in the brain. These networks are richly structured with cliques of friends, sectors of stocks, and a smorgasbord of cell types that govern how neurons connect. Some networks, like social network friendships, can be directly observed, but in many cases we only have an indirect view of the network through the actions of its constituents and an understanding of how the network mediates that activity. In this work, we focus on the problem of latent network discovery in the case where the observable activity takes the form of a mutually-excitatory point process known as a Hawkes process. We build on previous work that has taken a Bayesian approach to this problem, specifying prior distributions over the latent network structure and a likelihood of observed activity given this network. We extend this work by proposing a discrete-time formulation and developing a computationally efficient stochastic variational inference (SVI) algorithm that allows us to scale the approach to long sequences of observations. We demonstrate our algorithm on the calcium imaging data used in the Chalearn neural connectomics challenge.}, note = {arXiv:1507.03228 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {unpublished} } Networks capture our intuition about relationships in the world. They describe the friendships between Facebook users, interactions in financial markets, and synapses connecting neurons in the brain. These networks are richly structured with cliques of friends, sectors of stocks, and a smorgasbord of cell types that govern how neurons connect. Some networks, like social network friendships, can be directly observed, but in many cases we only have an indirect view of the network through the actions of its constituents and an understanding of how the network mediates that activity. In this work, we focus on the problem of latent network discovery in the case where the observable activity takes the form of a mutually-excitatory point process known as a Hawkes process. We build on previous work that has taken a Bayesian approach to this problem, specifying prior distributions over the latent network structure and a likelihood of observed activity given this network. We extend this work by proposing a discrete-time formulation and developing a computationally efficient stochastic variational inference (SVI) algorithm that allows us to scale the approach to long sequences of observations. We demonstrate our algorithm on the calcium imaging data used in the Chalearn neural connectomics challenge. |

Lehman, Li-Wei; Adams, Ryan P; Mayaud, Louis; Moody, George; Malhotra, Atul; Mark, Roger; Nemati, Shamim A Physiological Time Series Dynamics-Based Approach to Patient Monitoring and Outcome Prediction Journal Article IEEE Journal of Biomedical and Health Informatics, 19 (3), pp. 1068–1076, 2015. @article{lehman2015physiological, title = {A Physiological Time Series Dynamics-Based Approach to Patient Monitoring and Outcome Prediction}, author = {Li-Wei Lehman and Ryan P. Adams and Louis Mayaud and George Moody and Atul Malhotra and Roger Mark and Shamim Nemati}, url = {http://www.cs.princeton.edu/~rpa/pubs/lehman2015physiological.pdf}, year = {2015}, date = {2015-01-01}, journal = {IEEE Journal of Biomedical and Health Informatics}, volume = {19}, number = {3}, pages = {1068--1076}, abstract = { Cardiovascular variables such as heart rate (HR) and blood pressure (BP) are regulated by an underlying control system, and therefore the time series of these vital signs exhibit rich dynamical patterns of interaction in response to external perturbations (e.g., drug administration) as well as pathological states (e.g., onset of sepsis and hypotension). A question of interest is whether similar dynamical patterns can be identified across a heterogeneous patient cohort, and be used for prognosis of patients’ health and progress. In this work, we used a switching vector autoregressive (SVAR) framework to systematically learn and identify a collection of vital sign time series dynamics, which are possibly recurrent within the same patient and may be shared across the entire cohort. We show that these dynamical behaviors can be used to characterize the physiological state of a patient. We validate our technique using simulated time series of the cardiovascular system, and human recordings of HR and BP time series from an orthostatic stress study with known postural states. Using the HR and BP dynamics of an intensive care unit (ICU) cohort of over 450 patients from the MIMIC II database, we demonstrate that the discovered cardiovascular dynamics are significantly associated with hospital mortality (dynamic modes 3 and 9}, keywords = {}, pubstate = {published}, tppubtype = {article} } Cardiovascular variables such as heart rate (HR) and blood pressure (BP) are regulated by an underlying control system, and therefore the time series of these vital signs exhibit rich dynamical patterns of interaction in response to external perturbations (e.g., drug administration) as well as pathological states (e.g., onset of sepsis and hypotension). A question of interest is whether similar dynamical patterns can be identified across a heterogeneous patient cohort, and be used for prognosis of patients’ health and progress. In this work, we used a switching vector autoregressive (SVAR) framework to systematically learn and identify a collection of vital sign time series dynamics, which are possibly recurrent within the same patient and may be shared across the entire cohort. We show that these dynamical behaviors can be used to characterize the physiological state of a patient. We validate our technique using simulated time series of the cardiovascular system, and human recordings of HR and BP time series from an orthostatic stress study with known postural states. Using the HR and BP dynamics of an intensive care unit (ICU) cohort of over 450 patients from the MIMIC II database, we demonstrate that the discovered cardiovascular dynamics are significantly associated with hospital mortality (dynamic modes 3 and 9 |

Nemati, Shamim; Adams, Ryan P Identifying Outcome-Discriminative Dynamics in Multivariate Physiological Cohort Time Series Book Chapter Advanced State Space Methods for Neural and Clinical Data, Cambridge University Press, Cambridge, UK, 2015. @inbook{nemati2015identifying, title = {Identifying Outcome-Discriminative Dynamics in Multivariate Physiological Cohort Time Series}, author = {Shamim Nemati and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/nemati2015identifying.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {Advanced State Space Methods for Neural and Clinical Data}, publisher = {Cambridge University Press}, address = {Cambridge, UK}, abstract = {In this chapter, we present a learning algorithm specifically designed to learn dynam- ical features of time series that are directly predictive of the associated labels. Rather than depending on label-free unsupervised learning to discover relevant features of the time series, we build a system that expressly learns the dynamics that are most rele- vant for classifying time series labels. Our goal is to obtain compact representations of nonstationary and multivariate time series (representation learning)(Bengio, Courville & Vincent 2013). To accomplish this we use a connection between dynamic bayesian networks (e.g., the switching VAR model) and artificial neural networks (ANNs) to perform inference and learning in state-space models in a manner analogous to back- propagation in neural networks (Rumelhart, Hinton & Williams 1988). This connection stems from the observation that the directed acyclic graph structure of a state-space model can be unrolled both as a function of time and inference steps to yield a deter- ministic neural network with efficient parameter tying across time (see Fig. 1.2). Thus, the parameters governing the dynamics and observation model of a state-space model can be learned in a manner analogous to that of a neural network. Indeed, the resulting system can be viewed as a compactly-parameterized recurrent neural network (RNN) (Sutskever 2013). Although the standard use of RNNs has been for time series pre- diction (network output is the predicted input time series in the future) or sequential labeling (when output is a label sequence associated with the input data sequence), with additional processing layers one may obtain a time series classifier from this class of models (Graves, Ferna ́ndez, Gomez & Schmidhuber 2006). Nevertheless, RNNs have proven hard to train, since the optimization surface tend to include multiple local min- ima. Moreover, standard RNN are ’black box’ algorithms(as apposed to ’model-based’) and therefore do allow for incorporation of physiological models of the underlying sys- tems. The framework proposed here addresses both these shortcomings. First, knowl- edge of the underlying physiology can be directly incorporated into the state-space mod- els that constitute the basic building blocks of a dynamic Bayesian network. Secondly, equipped with a generative model, we can rely on unsupervised pre-training (via expec- tation maximization) to systematically initialize the parameters of the equivalent RNN; in a manner analogous to pre-training of very large neural networks (deep learning) (Erhan, Bengio, Courville, Manzagol, Vincent & Bengio 2010).}, keywords = {}, pubstate = {published}, tppubtype = {inbook} } In this chapter, we present a learning algorithm specifically designed to learn dynam- ical features of time series that are directly predictive of the associated labels. Rather than depending on label-free unsupervised learning to discover relevant features of the time series, we build a system that expressly learns the dynamics that are most rele- vant for classifying time series labels. Our goal is to obtain compact representations of nonstationary and multivariate time series (representation learning)(Bengio, Courville & Vincent 2013). To accomplish this we use a connection between dynamic bayesian networks (e.g., the switching VAR model) and artificial neural networks (ANNs) to perform inference and learning in state-space models in a manner analogous to back- propagation in neural networks (Rumelhart, Hinton & Williams 1988). This connection stems from the observation that the directed acyclic graph structure of a state-space model can be unrolled both as a function of time and inference steps to yield a deter- ministic neural network with efficient parameter tying across time (see Fig. 1.2). Thus, the parameters governing the dynamics and observation model of a state-space model can be learned in a manner analogous to that of a neural network. Indeed, the resulting system can be viewed as a compactly-parameterized recurrent neural network (RNN) (Sutskever 2013). Although the standard use of RNNs has been for time series pre- diction (network output is the predicted input time series in the future) or sequential labeling (when output is a label sequence associated with the input data sequence), with additional processing layers one may obtain a time series classifier from this class of models (Graves, Ferna ́ndez, Gomez & Schmidhuber 2006). Nevertheless, RNNs have proven hard to train, since the optimization surface tend to include multiple local min- ima. Moreover, standard RNN are ’black box’ algorithms(as apposed to ’model-based’) and therefore do allow for incorporation of physiological models of the underlying sys- tems. The framework proposed here addresses both these shortcomings. First, knowl- edge of the underlying physiology can be directly incorporated into the state-space mod- els that constitute the basic building blocks of a dynamic Bayesian network. Secondly, equipped with a generative model, we can rely on unsupervised pre-training (via expec- tation maximization) to systematically initialize the parameters of the equivalent RNN; in a manner analogous to pre-training of very large neural networks (deep learning) (Erhan, Bengio, Courville, Manzagol, Vincent & Bengio 2010). |

## 2014 |

Nishihara, Robert; Murray, Iain; Adams, Ryan P Parallel MCMC with Generalized Elliptical Slice Sampling Journal Article Journal of Machine Learning Research, 15 (1), pp. 2087-2112, 2014. @article{nishihara2014generalized, title = {Parallel MCMC with Generalized Elliptical Slice Sampling}, author = {Robert Nishihara and Iain Murray and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/nishihara2014generalized.pdf}, year = {2014}, date = {2014-01-01}, journal = {Journal of Machine Learning Research}, volume = {15}, number = {1}, pages = {2087-2112}, abstract = {Probabilistic models are conceptually powerful tools for finding structure in data, but their practical effectiveness is often limited by our ability to perform inference in them. Exact inference is frequently intractable, so approximate inference is often performed using Markov chain Monte Carlo (MCMC). To achieve the best possible results from MCMC, we want to efficiently simulate many steps of a rapidly mixing Markov chain which leaves the target distribution invariant. Of particular interest in this regard is how to take advantage of multi-core computing to speed up MCMC-based inference, both to improve mixing and to distribute the computational load. In this paper, we present a parallelizable Markov chain Monte Carlo algorithm for efficiently sampling from continuous probability distributions that can take advantage of hundreds of cores. This method shares information between parallel Markov chains to build a scale-location mixture of Gaussians approximation to the density function of the target distribution. We combine this approximation with a recently developed method known as elliptical slice sampling to create a Markov chain with no step-size parameters that can mix rapidly without requiring gradient or curvature computations.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Probabilistic models are conceptually powerful tools for finding structure in data, but their practical effectiveness is often limited by our ability to perform inference in them. Exact inference is frequently intractable, so approximate inference is often performed using Markov chain Monte Carlo (MCMC). To achieve the best possible results from MCMC, we want to efficiently simulate many steps of a rapidly mixing Markov chain which leaves the target distribution invariant. Of particular interest in this regard is how to take advantage of multi-core computing to speed up MCMC-based inference, both to improve mixing and to distribute the computational load. In this paper, we present a parallelizable Markov chain Monte Carlo algorithm for efficiently sampling from continuous probability distributions that can take advantage of hundreds of cores. This method shares information between parallel Markov chains to build a scale-location mixture of Gaussians approximation to the density function of the target distribution. We combine this approximation with a recently developed method known as elliptical slice sampling to create a Markov chain with no step-size parameters that can mix rapidly without requiring gradient or curvature computations. |

Duvenaud, David; Rippel, Oren; Adams, Ryan P; Ghahramani, Zoubin Avoiding Pathologies in Very Deep Networks Conference Proceedings of the 17th International Conference on Artificial Intelligence and Statistics (AISTATS), 2014, (arXiv:1402.5836 [stat.ML]). @conference{duvenaud2014pathologies, title = {Avoiding Pathologies in Very Deep Networks}, author = {David Duvenaud and Oren Rippel and Ryan P. Adams and Zoubin Ghahramani}, url = {http://www.cs.princeton.edu/~rpa/pubs/duvenaud2014pathologies.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {Proceedings of the 17th International Conference on Artificial Intelligence and Statistics (AISTATS)}, abstract = {Choosing appropriate architectures and regularization strategies for deep networks is crucial to good predictive performance. To shed light on this problem, we analyze the analogous problem of constructing useful priors on compositions of functions. Specifically, we study the deep Gaussian process, a type of infinitely-wide, deep neural network. We show that in standard architectures, the representational capacity of the network tends to capture fewer degrees of freedom as the number of layers increases, retaining only a single degree of freedom in the limit. We propose an alternate network architecture which does not suffer from this pathology. We also examine deep covariance functions, obtained by composing infinitely many feature transforms. Lastly, we characterize the class of models obtained by performing dropout on Gaussian processes.}, note = {arXiv:1402.5836 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Choosing appropriate architectures and regularization strategies for deep networks is crucial to good predictive performance. To shed light on this problem, we analyze the analogous problem of constructing useful priors on compositions of functions. Specifically, we study the deep Gaussian process, a type of infinitely-wide, deep neural network. We show that in standard architectures, the representational capacity of the network tends to capture fewer degrees of freedom as the number of layers increases, retaining only a single degree of freedom in the limit. We propose an alternate network architecture which does not suffer from this pathology. We also examine deep covariance functions, obtained by composing infinitely many feature transforms. Lastly, we characterize the class of models obtained by performing dropout on Gaussian processes. |

Waterland, Amos; Angelino, Elaine; Adams, Ryan P; Appavoo, Jonathan; Seltzer, Margo ASC: Automatically Scalable Computation Conference Proceedings of the 19th International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS), 2014. @conference{waterland2014scalable, title = {ASC: Automatically Scalable Computation}, author = {Amos Waterland and Elaine Angelino and Ryan P. Adams and Jonathan Appavoo and Margo Seltzer}, url = {http://www.cs.princeton.edu/~rpa/pubs/waterland2014scalable.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {Proceedings of the 19th International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)}, abstract = {We present an architecture designed to transparently and automatically scale the performance of sequential programs as a function of the hardware resources available. The architecture is predicated on a model of computation that views program execution as a walk through the enormous state space composed of the memory and registers of a single- threaded processor. Each instruction execution in this model moves the system from its current point in state space to a deterministic subsequent point. We can parallelize such execution by predictively partitioning the complete path and speculatively executing each partition in parallel. Accurately partitioning the path is a challenging prediction problem. We have implemented our system using a functional simulator that emulates the x86 instruction set, including a collection of state predictors and a mechanism for speculatively executing threads that explore potential states along the execution path. While the overhead of our simulation makes it impractical to measure speedup relative to native x86 execution, experiments on three benchmarks show scalability of up to a factor of 256 on a 1024 core machine when executing unmodified sequential programs.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } We present an architecture designed to transparently and automatically scale the performance of sequential programs as a function of the hardware resources available. The architecture is predicated on a model of computation that views program execution as a walk through the enormous state space composed of the memory and registers of a single- threaded processor. Each instruction execution in this model moves the system from its current point in state space to a deterministic subsequent point. We can parallelize such execution by predictively partitioning the complete path and speculatively executing each partition in parallel. Accurately partitioning the path is a challenging prediction problem. We have implemented our system using a functional simulator that emulates the x86 instruction set, including a collection of state predictors and a mechanism for speculatively executing threads that explore potential states along the execution path. While the overhead of our simulation makes it impractical to measure speedup relative to native x86 execution, experiments on three benchmarks show scalability of up to a factor of 256 on a 1024 core machine when executing unmodified sequential programs. |

Gao, Xi Alice; Mao, Andrew; Chen, Yiling; Adams, Ryan P Trick or Treat: Putting Peer Prediction to the Test Conference Proceedings of the 15th ACM Conference on Economics and Computation (EC), 2014. @conference{gao2014peer, title = {Trick or Treat: Putting Peer Prediction to the Test}, author = {Xi Alice Gao and Andrew Mao and Yiling Chen and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/gao2014peer.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {Proceedings of the 15th ACM Conference on Economics and Computation (EC)}, abstract = {Collecting truthful subjective information from multiple individuals is an important problem in many social and online systems. While peer prediction mechanisms promise to elicit truthful information by rewarding participants with carefully constructed payments, they also admit uninformative equilibria where coordinating participants provide no useful information. To understand how participants behave towards such mechanisms in practice, we conduct the first controlled online experiment of a peer prediction mechanism, engaging the participants in a multiplayer, real-time and repeated game. Using a hidden Markov model to capture players’ strategies from their actions, our results show that participants successfully coordinate on uninformative equilibria and the truthful equilibrium is not focal, even when some uninformative equilibria do not exist or are undesirable. In contrast, most players are consistently truthful in the absence of peer prediction, suggesting that these mechanisms may be harmful when truthful reporting has similar cost to strategic behavior.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Collecting truthful subjective information from multiple individuals is an important problem in many social and online systems. While peer prediction mechanisms promise to elicit truthful information by rewarding participants with carefully constructed payments, they also admit uninformative equilibria where coordinating participants provide no useful information. To understand how participants behave towards such mechanisms in practice, we conduct the first controlled online experiment of a peer prediction mechanism, engaging the participants in a multiplayer, real-time and repeated game. Using a hidden Markov model to capture players’ strategies from their actions, our results show that participants successfully coordinate on uninformative equilibria and the truthful equilibrium is not focal, even when some uninformative equilibria do not exist or are undesirable. In contrast, most players are consistently truthful in the absence of peer prediction, suggesting that these mechanisms may be harmful when truthful reporting has similar cost to strategic behavior. |

Affandi, Raja Hafiz; Fox, Emily B; Adams, Ryan P; Taskar, Ben Learning the Parameters of Determinantal Point Process Kernels Conference Proceedings of the 31st International Conference on Machine Learning (ICML), 2014. @conference{affandi2014determinantal, title = {Learning the Parameters of Determinantal Point Process Kernels}, author = {Raja Hafiz Affandi and Emily B. Fox and Ryan P. Adams and Ben Taskar}, url = {http://www.cs.princeton.edu/~rpa/pubs/affandi2014determinantal.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {Proceedings of the 31st International Conference on Machine Learning (ICML)}, abstract = {Determinantal point processes (DPPs) are well-suited for modeling repulsion and have proven useful in many applications where diversity is desired. While DPPs have many appealing properties, such as efficient sampling, learning the parameters of a DPP is still considered a difficult problem due to the non-convex nature of the likelihood function. In this paper, we propose using Bayesian methods to learn the DPP kernel parameters. These methods are applicable in large-scale and continuous DPP settings even when the exact form of the eigendecomposition is unknown. We demonstrate the utility of our DPP learning methods in studying the progression of diabetic neuropathy based on spatial distribution of nerve fibers, and in studying human perception of diversity in images.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Determinantal point processes (DPPs) are well-suited for modeling repulsion and have proven useful in many applications where diversity is desired. While DPPs have many appealing properties, such as efficient sampling, learning the parameters of a DPP is still considered a difficult problem due to the non-convex nature of the likelihood function. In this paper, we propose using Bayesian methods to learn the DPP kernel parameters. These methods are applicable in large-scale and continuous DPP settings even when the exact form of the eigendecomposition is unknown. We demonstrate the utility of our DPP learning methods in studying the progression of diabetic neuropathy based on spatial distribution of nerve fibers, and in studying human perception of diversity in images. |

Rippel, Oren; Gelbart, Michael A; Adams, Ryan P Learning Ordered Representations with Nested Dropout Conference Proceedings of the 31st International Conference on Machine Learning (ICML), 2014, (arXiv:1402.0915 [stat.ML]). @conference{rippel2014nested, title = {Learning Ordered Representations with Nested Dropout}, author = {Oren Rippel and Michael A. Gelbart and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/rippel2014nested.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {Proceedings of the 31st International Conference on Machine Learning (ICML)}, abstract = {In this paper, we study ordered representations of data in which different dimensions have different degrees of importance. To learn these representations we introduce nested dropout, a procedure for stochastically removing coherent nested sets of hidden units in a neural network. We first present a sequence of theoretical results in the simple case of a semi-linear autoencoder. We rigorously show that the application of nested dropout enforces identifiability of the units, which leads to an exact equivalence with PCA. We then extend the algorithm to deep models and demonstrate the relevance of ordered representations to a number of applications. Specifically, we use the ordered property of the learned codes to construct hash-based data structures that permit very fast retrieval, achieving retrieval in time logarithmic in the database size and independent of the dimensionality of the representation. This allows codes that are hundreds of times longer than currently feasible for retrieval. We therefore avoid the diminished quality associated with short codes, while still performing retrieval that is competitive in speed with existing methods. We also show that ordered representations are a promising way to learn adaptive compression for efficient online data reconstruction.}, note = {arXiv:1402.0915 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } In this paper, we study ordered representations of data in which different dimensions have different degrees of importance. To learn these representations we introduce nested dropout, a procedure for stochastically removing coherent nested sets of hidden units in a neural network. We first present a sequence of theoretical results in the simple case of a semi-linear autoencoder. We rigorously show that the application of nested dropout enforces identifiability of the units, which leads to an exact equivalence with PCA. We then extend the algorithm to deep models and demonstrate the relevance of ordered representations to a number of applications. Specifically, we use the ordered property of the learned codes to construct hash-based data structures that permit very fast retrieval, achieving retrieval in time logarithmic in the database size and independent of the dimensionality of the representation. This allows codes that are hundreds of times longer than currently feasible for retrieval. We therefore avoid the diminished quality associated with short codes, while still performing retrieval that is competitive in speed with existing methods. We also show that ordered representations are a promising way to learn adaptive compression for efficient online data reconstruction. |

Snoek, Jasper; Swersky, Kevin; Zemel, Richard S; Adams, Ryan P Input Warping for Bayesian Optimization of Non-Stationary Functions Conference Proceedings of the 31st International Conference on Machine Learning (ICML), 2014, (arXiv:1402.0929 [stat.ML]). @conference{snoek2014warping, title = {Input Warping for Bayesian Optimization of Non-Stationary Functions}, author = {Jasper Snoek and Kevin Swersky and Richard S. Zemel and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/snoek2014warping.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {Proceedings of the 31st International Conference on Machine Learning (ICML)}, abstract = {Bayesian optimization has proven to be a highly effective methodology for the global optimization of unknown, expensive and multimodal functions. The ability to accurately model distributions over functions is critical to the effectiveness of Bayesian optimization. Although Gaussian processes provide a flexible prior over functions which can be queried efficiently, there are various classes of functions that remain difficult to model. One of the most frequently occurring of these is the class of non-stationary functions. The optimization of the hyperparameters of machine learning algorithms is a problem domain in which parameters are often manually transformed a priori, for example by optimizing in "log-space," to mitigate the effects of spatially-varying length scale. We develop a methodology for automatically learning a wide family of bijective transformations or warpings of the input space using the Beta cumulative distribution function. We further extend the warping framework to multi-task Bayesian optimization so that multiple tasks can be warped into a jointly stationary space. On a set of challenging benchmark optimization tasks, we observe that the inclusion of warping greatly improves on the state-of-the-art, producing better results faster and more reliably.}, note = {arXiv:1402.0929 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Bayesian optimization has proven to be a highly effective methodology for the global optimization of unknown, expensive and multimodal functions. The ability to accurately model distributions over functions is critical to the effectiveness of Bayesian optimization. Although Gaussian processes provide a flexible prior over functions which can be queried efficiently, there are various classes of functions that remain difficult to model. One of the most frequently occurring of these is the class of non-stationary functions. The optimization of the hyperparameters of machine learning algorithms is a problem domain in which parameters are often manually transformed a priori, for example by optimizing in "log-space," to mitigate the effects of spatially-varying length scale. We develop a methodology for automatically learning a wide family of bijective transformations or warpings of the input space using the Beta cumulative distribution function. We further extend the warping framework to multi-task Bayesian optimization so that multiple tasks can be warped into a jointly stationary space. On a set of challenging benchmark optimization tasks, we observe that the inclusion of warping greatly improves on the state-of-the-art, producing better results faster and more reliably. |

Miller, Andrew C; Bornn, Luke; Adams, Ryan P; Goldsberry, Kirk Factorized Point Process Intensities: A Spatial Analysis of Professional Basketball Conference Proceedings of the 31st International Conference on Machine Learning (ICML), 2014, (arXiv:1401.0942 [stat.ML]). @conference{miller2014factorized, title = {Factorized Point Process Intensities: A Spatial Analysis of Professional Basketball}, author = {Andrew C. Miller and Luke Bornn and Ryan P. Adams and Kirk Goldsberry}, url = {http://www.cs.princeton.edu/~rpa/pubs/miller2014factorized.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {Proceedings of the 31st International Conference on Machine Learning (ICML)}, abstract = {We develop a machine learning approach to represent and analyze the underlying spatial structure that governs shot selection among professional basketball players in the NBA. Typically, NBA players are discussed and compared in an heuristic, imprecise manner that relies on unmeasured intuitions about player behavior. This makes it difficult to draw comparisons between players and make accurate player specific predictions. Modeling shot attempt data as a point process, we create a low dimensional representation of offensive player types in the NBA. Using non-negative matrix factorization (NMF), an unsupervised dimensionality reduction technique, we show that a low-rank spatial decomposition summarizes the shooting habits of NBA players. The spatial representations discovered by the algorithm correspond to intuitive descriptions of NBA player types, and can be used to model other spatial effects, such as shooting accuracy.}, note = {arXiv:1401.0942 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } We develop a machine learning approach to represent and analyze the underlying spatial structure that governs shot selection among professional basketball players in the NBA. Typically, NBA players are discussed and compared in an heuristic, imprecise manner that relies on unmeasured intuitions about player behavior. This makes it difficult to draw comparisons between players and make accurate player specific predictions. Modeling shot attempt data as a point process, we create a low dimensional representation of offensive player types in the NBA. Using non-negative matrix factorization (NMF), an unsupervised dimensionality reduction technique, we show that a low-rank spatial decomposition summarizes the shooting habits of NBA players. The spatial representations discovered by the algorithm correspond to intuitive descriptions of NBA player types, and can be used to model other spatial effects, such as shooting accuracy. |

Linderman, Scott W; Adams, Ryan P Discovering Latent Network Structure in Point Process Data Conference Proceedings of the 31st International Conference on Machine Learning (ICML), 2014, (arXiv:1402.0914 [stat.ML]). @conference{linderman2014networks, title = {Discovering Latent Network Structure in Point Process Data}, author = {Scott W. Linderman and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/linderman2014networks.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {Proceedings of the 31st International Conference on Machine Learning (ICML)}, abstract = {Networks play a central role in modern data analysis, enabling us to reason about systems by studying the relationships between their parts. Most often in network analysis, the edges are given. However, in many systems it is difficult or impossible to measure the network directly. Examples of latent networks include economic interactions linking financial instruments and patterns of reciprocity in gang violence. In these cases, we are limited to noisy observations of events associated with each node. To enable analysis of these implicit networks, we develop a probabilistic model that combines mutually-exciting point processes with random graph models. We show how the Poisson superposition principle enables an elegant auxiliary variable formulation and a fully-Bayesian, parallel inference algorithm. We evaluate this new model empirically on several datasets.}, note = {arXiv:1402.0914 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Networks play a central role in modern data analysis, enabling us to reason about systems by studying the relationships between their parts. Most often in network analysis, the edges are given. However, in many systems it is difficult or impossible to measure the network directly. Examples of latent networks include economic interactions linking financial instruments and patterns of reciprocity in gang violence. In these cases, we are limited to noisy observations of events associated with each node. To enable analysis of these implicit networks, we develop a probabilistic model that combines mutually-exciting point processes with random graph models. We show how the Poisson superposition principle enables an elegant auxiliary variable formulation and a fully-Bayesian, parallel inference algorithm. We evaluate this new model empirically on several datasets. |

Maclaurin, Dougal; Adams, Ryan P Firefly Monte Carlo: Exact MCMC with Subsets of Data Conference Proceedings of the 30th Conference on Uncertainty in Artificial Intelligence (UAI), 2014, (arXiv:1403.5693 [stat.ML]). @conference{maclaurin2014firefly, title = {Firefly Monte Carlo: Exact MCMC with Subsets of Data}, author = {Dougal Maclaurin and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/maclaurin2014firefly.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {Proceedings of the 30th Conference on Uncertainty in Artificial Intelligence (UAI)}, abstract = {Markov chain Monte Carlo (MCMC) is a popular and successful general-purpose tool for Bayesian inference. However, MCMC cannot be practically applied to large data sets because of the prohibitive cost of evaluating every likelihood term at every iteration. Here we present Fire- fly Monte Carlo (FlyMC) an auxiliary variable MCMC algorithm that only queries the likelihoods of a potentially small subset of the data at each iteration yet simulates from the exact posterior distribution, in contrast to recent proposals that are approximate even in the asymptotic limit. FlyMC is compatible with a wide variety of modern MCMC algorithms, and only requires a lower bound on the per-datum likelihood factors. In experiments, we find that FlyMC generates samples from the posterior more than an order of magnitude faster than regular MCMC, opening up MCMC methods to larger datasets than were previously considered feasible.}, note = {arXiv:1403.5693 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Markov chain Monte Carlo (MCMC) is a popular and successful general-purpose tool for Bayesian inference. However, MCMC cannot be practically applied to large data sets because of the prohibitive cost of evaluating every likelihood term at every iteration. Here we present Fire- fly Monte Carlo (FlyMC) an auxiliary variable MCMC algorithm that only queries the likelihoods of a potentially small subset of the data at each iteration yet simulates from the exact posterior distribution, in contrast to recent proposals that are approximate even in the asymptotic limit. FlyMC is compatible with a wide variety of modern MCMC algorithms, and only requires a lower bound on the per-datum likelihood factors. In experiments, we find that FlyMC generates samples from the posterior more than an order of magnitude faster than regular MCMC, opening up MCMC methods to larger datasets than were previously considered feasible. |

Gelbart, Michael A; Snoek, Jasper; Adams, Ryan P Bayesian Optimization with Unknown Constraints Conference Proceedings of the 30th Conference on Uncertainty in Artificial Intelligence (UAI), 2014, (arXiv:1403.5607 [stat.ML]). @conference{gelbart2014constraints, title = {Bayesian Optimization with Unknown Constraints}, author = {Michael A. Gelbart and Jasper Snoek and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/gelbart2014constraints.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {Proceedings of the 30th Conference on Uncertainty in Artificial Intelligence (UAI)}, abstract = {Recent work on Bayesian optimization has shown its effectiveness in global optimization of difficult black-box objective functions. Many real-world optimization problems of interest also have constraints which are unknown a priori. In this paper, we study Bayesian optimization for constrained problems in the general case that noise may be present in the constraint functions, and the objective and constraints may be evaluated independently. We provide motivating practical examples, and present a general frame- work to solve such problems. We demonstrate the effectiveness of our approach on optimizing the performance of online latent Dirichlet allocation subject to topic sparsity constraints, tun- ing a neural network given test-time memory constraints, and optimizing Hamiltonian Monte Carlo to achieve maximal effectiveness in a fixed time, subject to passing standard convergence diagnostics.}, note = {arXiv:1403.5607 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Recent work on Bayesian optimization has shown its effectiveness in global optimization of difficult black-box objective functions. Many real-world optimization problems of interest also have constraints which are unknown a priori. In this paper, we study Bayesian optimization for constrained problems in the general case that noise may be present in the constraint functions, and the objective and constraints may be evaluated independently. We provide motivating practical examples, and present a general frame- work to solve such problems. We demonstrate the effectiveness of our approach on optimizing the performance of online latent Dirichlet allocation subject to topic sparsity constraints, tun- ing a neural network given test-time memory constraints, and optimizing Hamiltonian Monte Carlo to achieve maximal effectiveness in a fixed time, subject to passing standard convergence diagnostics. |

Angelino, Elaine; Kohler, Eddie; Waterland, Amos; Seltzer, Margo; Adams, Ryan P Accelerating MCMC via Parallel Predictive Prefetching Conference Proceedings of the 30th Conference on Uncertainty in Artificial Intelligence (UAI), 2014, (arXiv:1403.7265 [stat.ML]). @conference{angelino2014accelerating, title = {Accelerating MCMC via Parallel Predictive Prefetching}, author = {Elaine Angelino and Eddie Kohler and Amos Waterland and Margo Seltzer and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/angelino2014accelerating.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {Proceedings of the 30th Conference on Uncertainty in Artificial Intelligence (UAI)}, abstract = {Parallel predictive prefetching is a new frame- work for accelerating a large class of widely-used Markov chain Monte Carlo (MCMC) algorithms. It speculatively evaluates many potential steps of an MCMC chain in parallel while exploiting fast, iterative approximations to the tar- get density. This can accelerate sampling from target distributions in Bayesian inference problems. Our approach takes advantage of whatever parallel resources are available, but produces results exactly equivalent to standard serial execution. In the initial burn-in phase of chain evaluation, we achieve speedup close to linear in the number of available cores.}, note = {arXiv:1403.7265 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Parallel predictive prefetching is a new frame- work for accelerating a large class of widely-used Markov chain Monte Carlo (MCMC) algorithms. It speculatively evaluates many potential steps of an MCMC chain in parallel while exploiting fast, iterative approximations to the tar- get density. This can accelerate sampling from target distributions in Bayesian inference problems. Our approach takes advantage of whatever parallel resources are available, but produces results exactly equivalent to standard serial execution. In the initial burn-in phase of chain evaluation, we achieve speedup close to linear in the number of available cores. |

Linderman, Scott W; Stock, Christopher H; Adams, Ryan P A Framework for Studying Synaptic Plasticity with Neural Spike Train Data Conference Advances in Neural Information Processing Systems (NIPS) 27, 2014. @conference{linderman2014plasticity, title = {A Framework for Studying Synaptic Plasticity with Neural Spike Train Data}, author = {Scott W. Linderman and Christopher H. Stock and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/linderman2014plasticity.pdf}, year = {2014}, date = {2014-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 27}, abstract = {Learning and memory in the brain are implemented by complex, time-varying changes in neural circuitry. The computational rules according to which synaptic weights change over time are the subject of much research, and are not precisely understood. Until recently, limitations in experimental methods have made it challenging to test hypotheses about synaptic plasticity on a large scale. However, as such data become available and these barriers are lifted, it becomes necessary to develop analysis techniques to validate plasticity models. Here, we present a highly extensible framework for modeling arbitrary synaptic plasticity rules on spike train data in populations of interconnected neurons. We treat synaptic weights as a (potentially nonlinear) dynamical system embedded in a fully-Bayesian generalized linear model (GLM). In addition, we provide an algorithm for inferring synaptic weight trajectories alongside the parameters of the GLM and of the learning rules. Using this method, we perform model comparison of two proposed variants of the well-known spike-timing-dependent plasticity (STDP) rule, where nonlinear effects play a substantial role. On synthetic data generated from the biophysical simulator NEURON, we show that we can recover the weight trajectories, the pattern of connectivity, and the underlying learning rules.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Learning and memory in the brain are implemented by complex, time-varying changes in neural circuitry. The computational rules according to which synaptic weights change over time are the subject of much research, and are not precisely understood. Until recently, limitations in experimental methods have made it challenging to test hypotheses about synaptic plasticity on a large scale. However, as such data become available and these barriers are lifted, it becomes necessary to develop analysis techniques to validate plasticity models. Here, we present a highly extensible framework for modeling arbitrary synaptic plasticity rules on spike train data in populations of interconnected neurons. We treat synaptic weights as a (potentially nonlinear) dynamical system embedded in a fully-Bayesian generalized linear model (GLM). In addition, we provide an algorithm for inferring synaptic weight trajectories alongside the parameters of the GLM and of the learning rules. Using this method, we perform model comparison of two proposed variants of the well-known spike-timing-dependent plasticity (STDP) rule, where nonlinear effects play a substantial role. On synthetic data generated from the biophysical simulator NEURON, we show that we can recover the weight trajectories, the pattern of connectivity, and the underlying learning rules. |

Swersky, Kevin; Snoek, Jasper; Adams, Ryan P Freeze-Thaw Bayesian Optimization Unpublished 2014, (arXiv:1406.3896 [stat.ML]). @unpublished{swersky2014freeze, title = {Freeze-Thaw Bayesian Optimization}, author = {Kevin Swersky and Jasper Snoek and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/swersky2014freeze.pdf}, year = {2014}, date = {2014-01-01}, abstract = {In this paper we develop a dynamic form of Bayesian optimization for machine learning models with the goal of rapidly finding good hyperparameter settings. Our method uses the partial information gained during the training of a machine learning model in order to decide whether to pause training and start a new model, or resume the training of a previously-considered model. We specifically tailor our method to machine learning problems by developing a novel positive-definite covariance kernel to capture a variety of training curves. Furthermore, we develop a Gaussian process prior that scales gracefully with additional temporal observations. Finally, we provide an information-theoretic framework to automate the decision process. Experiments on several common machine learning models show that our approach is extremely effective in practice.}, note = {arXiv:1406.3896 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {unpublished} } In this paper we develop a dynamic form of Bayesian optimization for machine learning models with the goal of rapidly finding good hyperparameter settings. Our method uses the partial information gained during the training of a machine learning model in order to decide whether to pause training and start a new model, or resume the training of a previously-considered model. We specifically tailor our method to machine learning problems by developing a novel positive-definite covariance kernel to capture a variety of training curves. Furthermore, we develop a Gaussian process prior that scales gracefully with additional temporal observations. Finally, we provide an information-theoretic framework to automate the decision process. Experiments on several common machine learning models show that our approach is extremely effective in practice. |

Engelhardt, Barbara E; Adams, Ryan P Bayesian Structured Sparsity from Gaussian Fields Unpublished 2014, (arXiv:1407.2235 [stat.ME]). @unpublished{engelhardt2014sparsity, title = {Bayesian Structured Sparsity from Gaussian Fields}, author = {Barbara E. Engelhardt and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/engelhardt2014sparsity.pdf}, year = {2014}, date = {2014-01-01}, abstract = {Substantial research on structured sparsity has contributed to analysis of many different applications. However, there have been few Bayesian procedures among this work. Here, we develop a Bayesian model for structured sparsity that uses a Gaussian process (GP) to share parameters of the sparsity-inducing prior in proportion to feature similarity as defined by an arbitrary positive definite kernel. For linear regression, this sparsity-inducing prior on regression coefficients is a relaxation of the canonical spike-and-slab prior that flattens the mixture model into a scale mixture of normals. This prior retains the explicit posterior probability on inclusion parameters---now with GP probit prior distributions---but enables tractable computation via elliptical slice sampling for the latent Gaussian field. We motivate development of this prior using the genomic application of association mapping, or identifying genetic variants associated with a continuous trait. Our Bayesian structured sparsity model produced sparse results with substantially improved sensitivity and precision relative to comparable methods. Through simulations, we show that three properties are key to this improvement: i) modeling structure in the covariates, ii) significance testing using the posterior probabilities of inclusion, and iii) model averaging. We present results from applying this model to a large genomic dataset to demonstrate computational tractability.}, note = {arXiv:1407.2235 [stat.ME]}, keywords = {}, pubstate = {published}, tppubtype = {unpublished} } Substantial research on structured sparsity has contributed to analysis of many different applications. However, there have been few Bayesian procedures among this work. Here, we develop a Bayesian model for structured sparsity that uses a Gaussian process (GP) to share parameters of the sparsity-inducing prior in proportion to feature similarity as defined by an arbitrary positive definite kernel. For linear regression, this sparsity-inducing prior on regression coefficients is a relaxation of the canonical spike-and-slab prior that flattens the mixture model into a scale mixture of normals. This prior retains the explicit posterior probability on inclusion parameters---now with GP probit prior distributions---but enables tractable computation via elliptical slice sampling for the latent Gaussian field. We motivate development of this prior using the genomic application of association mapping, or identifying genetic variants associated with a continuous trait. Our Bayesian structured sparsity model produced sparse results with substantially improved sensitivity and precision relative to comparable methods. Through simulations, we show that three properties are key to this improvement: i) modeling structure in the covariates, ii) significance testing using the posterior probabilities of inclusion, and iii) model averaging. We present results from applying this model to a large genomic dataset to demonstrate computational tractability. |

## 2013 |

Tse, Henry T K; Gossett, Daniel R; Moon, Yo Sup; Masaeli, Mahdokht; Sohsman, Marie; Ying, Yong; Mislick, Kimberly; Adams, Ryan P; Rao, Jianyu; DiCarlo, Dino Quantitative Diagnosis of Malignant Pleural Effusions by Single-Cell Mechanophenotyping Journal Article Science Translational Medicine, 5 (212), 2013. @article{tse2013mechano, title = {Quantitative Diagnosis of Malignant Pleural Effusions by Single-Cell Mechanophenotyping}, author = {Henry T.K. Tse and Daniel R. Gossett and Yo Sup Moon and Mahdokht Masaeli and Marie Sohsman and Yong Ying and Kimberly Mislick and Ryan P. Adams and Jianyu Rao and Dino DiCarlo}, url = {http://www.cs.princeton.edu/~rpa/pubs/tse2013mechano.pdf}, year = {2013}, date = {2013-01-01}, journal = {Science Translational Medicine}, volume = {5}, number = {212}, abstract = {Biophysical characteristics of cells are attractive as potential diagnostic markers for cancer. Transformation of cell state or phenotype and the accompanying epigenetic, nuclear, and cytoplasmic modifications lead to measureable changes in cellular architecture. We recently introduced a technique called deformability cytometry (DC) that enables rapid mechanophenotyping of single cells in suspension at rates of 1000 cells/s—a throughput that is com- parable to traditional flow cytometry. We applied this technique to diagnose malignant pleural effusions, in which disseminated tumor cells can be difficult to accurately identify by traditional cytology. An algorithmic diagnostic scoring system was developed on the basis of quantitative features of two-dimensional distributions of single- cell mechanophenotypes from 119 samples. The DC scoring system classified 63% of the samples into two high-confidence regimes with 100% positive predictive value or 100% negative predictive value, and achieved an area under the curve of 0.86. This performance is suitable for a prescreening role to focus cytopathologist anal- ysis time on a smaller fraction of difficult samples. Diagnosis of samples that present a challenge to cytology was also improved. Samples labeled as “atypical cells,” which require additional time and follow-up, were classi- fied in high-confidence regimes in 8 of 15 cases. Further, 10 of 17 cytology-negative samples corresponding to patients with concurrent cancer were correctly classified as malignant or negative, in agreement with 6-month outcomes. This study lays the groundwork for broader validation of label-free quantitative biophysical markers for clinical diagnoses of cancer and inflammation, which could help to reduce laboratory workload and improve clinical decision-making.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Biophysical characteristics of cells are attractive as potential diagnostic markers for cancer. Transformation of cell state or phenotype and the accompanying epigenetic, nuclear, and cytoplasmic modifications lead to measureable changes in cellular architecture. We recently introduced a technique called deformability cytometry (DC) that enables rapid mechanophenotyping of single cells in suspension at rates of 1000 cells/s—a throughput that is com- parable to traditional flow cytometry. We applied this technique to diagnose malignant pleural effusions, in which disseminated tumor cells can be difficult to accurately identify by traditional cytology. An algorithmic diagnostic scoring system was developed on the basis of quantitative features of two-dimensional distributions of single- cell mechanophenotypes from 119 samples. The DC scoring system classified 63% of the samples into two high-confidence regimes with 100% positive predictive value or 100% negative predictive value, and achieved an area under the curve of 0.86. This performance is suitable for a prescreening role to focus cytopathologist anal- ysis time on a smaller fraction of difficult samples. Diagnosis of samples that present a challenge to cytology was also improved. Samples labeled as “atypical cells,” which require additional time and follow-up, were classi- fied in high-confidence regimes in 8 of 15 cases. Further, 10 of 17 cytology-negative samples corresponding to patients with concurrent cancer were correctly classified as malignant or negative, in agreement with 6-month outcomes. This study lays the groundwork for broader validation of label-free quantitative biophysical markers for clinical diagnoses of cancer and inflammation, which could help to reduce laboratory workload and improve clinical decision-making. |

Wilson, Andrew Gordon; Adams, Ryan P Gaussian Process Kernels for Pattern Discovery and Extrapolation Conference Proceedings of the 30th International Conference on Machine Learning (ICML), 2013, (arXiv:1302.4245 [stat.ML]). @conference{wilson2013kernels, title = {Gaussian Process Kernels for Pattern Discovery and Extrapolation}, author = {Andrew Gordon Wilson and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/wilson2013kernels.pdf}, year = {2013}, date = {2013-01-01}, booktitle = {Proceedings of the 30th International Conference on Machine Learning (ICML)}, abstract = {Gaussian processes are rich distributions over functions, which provide a Bayesian nonparametric approach to smoothing and interpolation. We introduce simple closed form ker- nels that can be used with Gaussian processes to discover patterns and enable extrapolation. These kernels are derived by modelling a spectral density – the Fourier transform of a kernel – with a Gaussian mixture. The proposed kernels support a broad class of stationary covariances, but Gaussian process inference remains simple and analytic. We demonstrate the proposed kernels by discovering patterns and performing long range extrapolation on synthetic examples, as well as atmospheric CO2 trends and airline passenger data. We also show that it is possible to reconstruct several popular standard covariances within our framework.}, note = {arXiv:1302.4245 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Gaussian processes are rich distributions over functions, which provide a Bayesian nonparametric approach to smoothing and interpolation. We introduce simple closed form ker- nels that can be used with Gaussian processes to discover patterns and enable extrapolation. These kernels are derived by modelling a spectral density – the Fourier transform of a kernel – with a Gaussian mixture. The proposed kernels support a broad class of stationary covariances, but Gaussian process inference remains simple and analytic. We demonstrate the proposed kernels by discovering patterns and performing long range extrapolation on synthetic examples, as well as atmospheric CO2 trends and airline passenger data. We also show that it is possible to reconstruct several popular standard covariances within our framework. |

Waterland, Amos; Angelino, Elaine; Cubuk, Elkin D; Kaxiras, Efthimios; Adams, Ryan P; Appavoo, Jonathan; Seltzer, Margo Computational Caches Conference Proceedings of the International Systems and Storage Conference (SYSTOR), 2013. @conference{waterland2013caches, title = {Computational Caches}, author = {Amos Waterland and Elaine Angelino and Elkin D. Cubuk and Efthimios Kaxiras and Ryan P. Adams and Jonathan Appavoo and Margo Seltzer}, url = {http://www.cs.princeton.edu/~rpa/pubs/waterland2013caches.pdf}, year = {2013}, date = {2013-01-01}, booktitle = {Proceedings of the International Systems and Storage Conference (SYSTOR)}, abstract = {Caching is a well-known technique for speeding up computation. We cache data from file systems and databases; we cache dynamically generated code blocks; we cache page translations in TLBs. We propose to cache the act of computation, so that we can apply it later and in different contexts. We use a state-space model of computation to support such caching, involving two interrelated parts: speculatively memoized predicted/resultant state pairs that we use to accelerate sequential computation, and trained probabilistic models that we use to generate predicted states from which to speculatively execute. The key techniques that make this approach feasible are designing probabilistic models that automatically focus on regions of program execution state space in which prediction is tractable and identifying state space equivalence classes so that predictions need not be exact.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Caching is a well-known technique for speeding up computation. We cache data from file systems and databases; we cache dynamically generated code blocks; we cache page translations in TLBs. We propose to cache the act of computation, so that we can apply it later and in different contexts. We use a state-space model of computation to support such caching, involving two interrelated parts: speculatively memoized predicted/resultant state pairs that we use to accelerate sequential computation, and trained probabilistic models that we use to generate predicted states from which to speculatively execute. The key techniques that make this approach feasible are designing probabilistic models that automatically focus on regions of program execution state space in which prediction is tractable and identifying state space equivalence classes so that predictions need not be exact. |

Lehman, Li-Wei; Nemati, Shamim; Adams, Ryan P; Moody, George; Malhotra, Atul; Mark, Roger Proceedings of the 35th International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC), 2013. @conference{lehman2013tracking, title = {Tracking Progression of Patient State of Health in Critical Care Using Inferred Shared Dynamics in Physiological Time Series}, author = {Li-Wei Lehman and Shamim Nemati and Ryan P. Adams and George Moody and Atul Malhotra and Roger Mark}, url = {http://www.cs.princeton.edu/~rpa/pubs/lehman2013tracking.pdf}, year = {2013}, date = {2013-01-01}, booktitle = {Proceedings of the 35th International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC)}, abstract = {Physiologic systems generate complex dynamics in their output signals that reflect the changing state of the underlying control systems. In this work, we used a switching vector autoregressive (switching VAR) framework to systematically learn and identify a collection of vital sign dynamics, which can possibly be recurrent within the same patient and shared across the entire cohort. We show that these dynamical behaviors can be used to characterize and elucidate the progression of patients’ states of health over time. Using the mean arterial blood pressure time series of 337 ICU patients during the first 24 hours of their ICU stays, we demonstrated that the learned dynamics from as early as the first 8 hours of patients’ ICU stays can achieve similar hospital mortality prediction performance as the well-known SAPS-I acuity scores, suggesting that the discovered latent dynamics structure may yield more timely insights into the progression of a patient’s state of health than the traditional snapshot-based acuity scores.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Physiologic systems generate complex dynamics in their output signals that reflect the changing state of the underlying control systems. In this work, we used a switching vector autoregressive (switching VAR) framework to systematically learn and identify a collection of vital sign dynamics, which can possibly be recurrent within the same patient and shared across the entire cohort. We show that these dynamical behaviors can be used to characterize and elucidate the progression of patients’ states of health over time. Using the mean arterial blood pressure time series of 337 ICU patients during the first 24 hours of their ICU stays, we demonstrated that the learned dynamics from as early as the first 8 hours of patients’ ICU stays can achieve similar hospital mortality prediction performance as the well-known SAPS-I acuity scores, suggesting that the discovered latent dynamics structure may yield more timely insights into the progression of a patient’s state of health than the traditional snapshot-based acuity scores. |

Nemati, Shamim; Lehman, Li-Wei; Adams, Ryan P Learning Outcome-Discriminative Dynamics in Multivariate Physiological Cohort Time Series Conference Proceedings of the 35th International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC), 2013. @conference{nemati2013learning, title = {Learning Outcome-Discriminative Dynamics in Multivariate Physiological Cohort Time Series}, author = {Shamim Nemati and Li-Wei Lehman and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/nemati2013learning.pdf}, year = {2013}, date = {2013-01-01}, booktitle = {Proceedings of the 35th International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC)}, abstract = {Model identification for physiological systems is complicated by changes between operating regimes and mea- surement artifacts. We present a solution to these problems by assuming that a cohort of physiological time series is gener- ated by switching among a finite collection of physiologically-constrained dynamical models and artifactual segments. We model the resulting time series using the switching linear dynamical systems (SLDS) framework, and present a novel learning algorithm for the class of SLDS, with the objective of identifying time series dynamics that are predictive of physiological regimes or outcomes of interest. We present exploratory results based on a simulation study and a physiological classification example of decoding postural changes from heart rate and blood pressure. We demonstrate a significant improvement in classification over methods based on feature learning via expectation maximization. The proposed learning algorithm is general, and can be extended to other applications involving state-space formulations.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Model identification for physiological systems is complicated by changes between operating regimes and mea- surement artifacts. We present a solution to these problems by assuming that a cohort of physiological time series is gener- ated by switching among a finite collection of physiologically-constrained dynamical models and artifactual segments. We model the resulting time series using the switching linear dynamical systems (SLDS) framework, and present a novel learning algorithm for the class of SLDS, with the objective of identifying time series dynamics that are predictive of physiological regimes or outcomes of interest. We present exploratory results based on a simulation study and a physiological classification example of decoding postural changes from heart rate and blood pressure. We demonstrate a significant improvement in classification over methods based on feature learning via expectation maximization. The proposed learning algorithm is general, and can be extended to other applications involving state-space formulations. |

Dechter, Eyal; Malmaud, Jonathan; Adams, Ryan P; Tenenbaum, Joshua B Bootstrap Learning Via Modular Concept Discovery Conference Proceedings of the 23rd International Joint Conference on Artificial Intelligence, 2013. @conference{dechter2013bootstrap, title = {Bootstrap Learning Via Modular Concept Discovery}, author = {Eyal Dechter and Jonathan Malmaud and Ryan P. Adams and Joshua B. Tenenbaum}, url = {http://www.cs.princeton.edu/~rpa/pubs/dechter2013bootstrap.pdf}, year = {2013}, date = {2013-01-01}, booktitle = {Proceedings of the 23rd International Joint Conference on Artificial Intelligence}, abstract = {Suppose a learner is faced with a domain of problems about which it knows nearly nothing. It does not know the distribution of problems, the space of solutions is not smooth, and the reward signal is uninformative, providing perhaps a few bits of information but not enough to steer the learner effectively. How can such a learner ever get off the ground? A common intuition is that if the solutions to these problems share a common structure, and the learner can solve some simple problems by brute force, it should be able to extract useful components from these solutions and, by composing them, explore the solution space more efficiently. Here, we formalize this intuition, where the solution space is that of typed functional programs and the gained information is stored as a stochastic grammar over programs. We propose an iterative procedure for exploring such spaces: in the first step of each iteration, the learner explores a finite subset of the domain, guided by a stochastic grammar; in the second step, the learner compresses the successful solutions from the first step to estimate a new stochastic grammar. We test this procedure on symbolic regression and Boolean circuit learning and show that the learner discovers modular concepts for these domains. Whereas the learner is able to solve almost none of the posed problems in the procedure’s first iteration, it rapidly becomes able to solve a large number by gaining abstract knowledge of the structure of the solution space.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Suppose a learner is faced with a domain of problems about which it knows nearly nothing. It does not know the distribution of problems, the space of solutions is not smooth, and the reward signal is uninformative, providing perhaps a few bits of information but not enough to steer the learner effectively. How can such a learner ever get off the ground? A common intuition is that if the solutions to these problems share a common structure, and the learner can solve some simple problems by brute force, it should be able to extract useful components from these solutions and, by composing them, explore the solution space more efficiently. Here, we formalize this intuition, where the solution space is that of typed functional programs and the gained information is stored as a stochastic grammar over programs. We propose an iterative procedure for exploring such spaces: in the first step of each iteration, the learner explores a finite subset of the domain, guided by a stochastic grammar; in the second step, the learner compresses the successful solutions from the first step to estimate a new stochastic grammar. We test this procedure on symbolic regression and Boolean circuit learning and show that the learner discovers modular concepts for these domains. Whereas the learner is able to solve almost none of the posed problems in the procedure’s first iteration, it rapidly becomes able to solve a large number by gaining abstract knowledge of the structure of the solution space. |

Swersky, Kevin; Snoek, Jasper; Adams, Ryan P Multi-Task Bayesian Optimization Conference Advances in Neural Information Processing Systems (NIPS) 26, 2013. @conference{swersky2013multi, title = {Multi-Task Bayesian Optimization}, author = {Kevin Swersky and Jasper Snoek and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/swersky2013multi.pdf}, year = {2013}, date = {2013-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 26}, abstract = {Bayesian optimization has recently been proposed as a framework for automatically tuning the hyperparameters of machine learning models and has been shown to yield state-of-the-art performance with impressive ease and efficiency. In this paper, we explore whether it is possible to transfer the knowledge gained from previous optimizations to new tasks in order to find optimal hyperparameter settings more efficiently. Our approach is based on extending multi-task Gaussian processes to the framework of Bayesian optimization. We show that this method significantly speeds up the optimization process when compared to the standard single-task approach. We further propose a straightforward extension of our algorithm in order to jointly minimize the average error across multiple tasks and demonstrate how this can be used to greatly speed up k-fold cross-validation. Lastly, we propose an adaptation of a recently developed acquisition function, entropy search, to the cost-sensitive, multi-task setting. We demonstrate the utility of this new acquisition function by leveraging a small dataset to explore hyperparameter settings for a large dataset. Our algorithm dynamically chooses which dataset to query in order to yield the most information per unit cost.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Bayesian optimization has recently been proposed as a framework for automatically tuning the hyperparameters of machine learning models and has been shown to yield state-of-the-art performance with impressive ease and efficiency. In this paper, we explore whether it is possible to transfer the knowledge gained from previous optimizations to new tasks in order to find optimal hyperparameter settings more efficiently. Our approach is based on extending multi-task Gaussian processes to the framework of Bayesian optimization. We show that this method significantly speeds up the optimization process when compared to the standard single-task approach. We further propose a straightforward extension of our algorithm in order to jointly minimize the average error across multiple tasks and demonstrate how this can be used to greatly speed up k-fold cross-validation. Lastly, we propose an adaptation of a recently developed acquisition function, entropy search, to the cost-sensitive, multi-task setting. We demonstrate the utility of this new acquisition function by leveraging a small dataset to explore hyperparameter settings for a large dataset. Our algorithm dynamically chooses which dataset to query in order to yield the most information per unit cost. |

Napp, Nils; Adams, Ryan P Message Passing Inference with Chemical Reaction Networks Conference Advances in Neural Information Processing Systems (NIPS) 26, 2013. @conference{napp2013reactions, title = {Message Passing Inference with Chemical Reaction Networks}, author = {Nils Napp and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/napp2013reactions.pdf}, year = {2013}, date = {2013-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 26}, abstract = {Recent work on molecular programming has explored new possibilities for computational abstractions with biomolecules, including logic gates, neural networks, and linear systems. In the future such abstractions might enable nanoscale devices that can sense and control the world at a molecular scale. Just as in macroscale robotics, it is critical that such devices can learn about their environment and reason under uncertainty. At this small scale, systems are typically modeled as chemical reaction networks. In this work, we develop a procedure that can take arbitrary probabilistic graphical models, represented as factor graphs over discrete random variables, and compile them into chemical reaction networks that implement inference. In particular, we show that marginalization based on sum-product message passing can be implemented in terms of reactions between chemical species whose concentrations represent probabilities. We show algebraically that the steady state concentration of these species correspond to the marginal distributions of the random variables in the graph and validate the results in simulations. As with standard sum-product inference, this procedure yields exact results for tree-structured graphs, and approximate solutions for loopy graphs.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Recent work on molecular programming has explored new possibilities for computational abstractions with biomolecules, including logic gates, neural networks, and linear systems. In the future such abstractions might enable nanoscale devices that can sense and control the world at a molecular scale. Just as in macroscale robotics, it is critical that such devices can learn about their environment and reason under uncertainty. At this small scale, systems are typically modeled as chemical reaction networks. In this work, we develop a procedure that can take arbitrary probabilistic graphical models, represented as factor graphs over discrete random variables, and compile them into chemical reaction networks that implement inference. In particular, we show that marginalization based on sum-product message passing can be implemented in terms of reactions between chemical species whose concentrations represent probabilities. We show algebraically that the steady state concentration of these species correspond to the marginal distributions of the random variables in the graph and validate the results in simulations. As with standard sum-product inference, this procedure yields exact results for tree-structured graphs, and approximate solutions for loopy graphs. |

Snoek, Jasper; Adams, Ryan P; Zemel, Richard S A Determinantal Point Process Latent Variable Model for Inhibition in Neural Spiking Data Conference Advances in Neural Information Processing Systems (NIPS) 26, 2013. @conference{snoek2013determinantal, title = {A Determinantal Point Process Latent Variable Model for Inhibition in Neural Spiking Data}, author = {Jasper Snoek and Ryan P. Adams and Richard S. Zemel}, url = {http://www.cs.princeton.edu/~rpa/pubs/snoek2013determinantal.pdf}, year = {2013}, date = {2013-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 26}, abstract = {Point processes are popular models of neural spiking behavior as they provide a statistical distribution over temporal sequences of spikes and help to reveal the complexities underlying a series of recorded action potentials. However, the most common neural point process models, the Poisson process and the gamma renewal process, do not capture interactions and correlations that are critical to modeling populations of neurons. We develop a novel model based on a determinantal point process over latent embeddings of neurons that effectively captures and helps visualize complex inhibitory and competitive interaction. We show that this model is a natural extension of the popular generalized linear model to sets of interacting neurons. The model is extended to incorporate gain control or divisive normalization, and the modulation of neural spiking based on periodic phenomena. Applied to neural spike recordings from the rat hippocampus, we see that the model captures inhibitory relationships, a dichotomy of classes of neurons, and a periodic modulation by the theta rhythm known to be present in the data.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Point processes are popular models of neural spiking behavior as they provide a statistical distribution over temporal sequences of spikes and help to reveal the complexities underlying a series of recorded action potentials. However, the most common neural point process models, the Poisson process and the gamma renewal process, do not capture interactions and correlations that are critical to modeling populations of neurons. We develop a novel model based on a determinantal point process over latent embeddings of neurons that effectively captures and helps visualize complex inhibitory and competitive interaction. We show that this model is a natural extension of the popular generalized linear model to sets of interacting neurons. The model is extended to incorporate gain control or divisive normalization, and the modulation of neural spiking based on periodic phenomena. Applied to neural spike recordings from the rat hippocampus, we see that the model captures inhibitory relationships, a dichotomy of classes of neurons, and a periodic modulation by the theta rhythm known to be present in the data. |

Zou, James Y; Hsu, Daniel; Parkes, David; Adams, Ryan P Contrastive Learning Using Spectral Methods Conference Advances in Neural Information Processing Systems (NIPS) 26, 2013. @conference{zou2013contrastive, title = {Contrastive Learning Using Spectral Methods}, author = {James Y. Zou and Daniel Hsu and David Parkes and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/zou2013contrastive.pdf}, year = {2013}, date = {2013-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 26}, abstract = {In many natural settings, the analysis goal is not to characterize a single data set in isolation, but rather to understand the difference between one set of observations and another. For example, given a background corpus of news articles together with writings of a particular author, one may want a topic model that explains word patterns and themes specific to the author. Another example comes from genomics, in which biological signals may be collected from different regions of a genome, and one wants a model that captures the differential statistics observed in these regions. This paper formalizes this notion of contrastive learning for mixture models, and develops spectral algorithms for inferring mixture components specific to a foreground data set when contrasted with a background data set. The method builds on recent moment-based estimators and tensor decompositions for latent variable models, and has the intuitive feature of using background data statistics to appropriately modify moments estimated from foreground data. A key advantage of the method is that the background data need only be coarsely modeled, which is important when the background is too complex, noisy, or not of interest. The method is demonstrated on applications in contrastive topic modeling and genomic sequence analysis.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } In many natural settings, the analysis goal is not to characterize a single data set in isolation, but rather to understand the difference between one set of observations and another. For example, given a background corpus of news articles together with writings of a particular author, one may want a topic model that explains word patterns and themes specific to the author. Another example comes from genomics, in which biological signals may be collected from different regions of a genome, and one wants a model that captures the differential statistics observed in these regions. This paper formalizes this notion of contrastive learning for mixture models, and develops spectral algorithms for inferring mixture components specific to a foreground data set when contrasted with a background data set. The method builds on recent moment-based estimators and tensor decompositions for latent variable models, and has the intuitive feature of using background data statistics to appropriately modify moments estimated from foreground data. A key advantage of the method is that the background data need only be coarsely modeled, which is important when the background is too complex, noisy, or not of interest. The method is demonstrated on applications in contrastive topic modeling and genomic sequence analysis. |

Lovell, Dan; Malmaud, Jonathan; Adams, Ryan P; Mansinghka, Vikash K ClusterCluster: Parallel Markov Chain Monte Carlo for Đirichlet Process Mixtures Unpublished 2013, (arXiv:1304.2302 [stat.ML]). @unpublished{lovell2013cluster, title = {ClusterCluster: Parallel Markov Chain Monte Carlo for Đirichlet Process Mixtures}, author = {Dan Lovell and Jonathan Malmaud and Ryan P. Adams and Vikash K. Mansinghka}, url = {http://www.cs.princeton.edu/~rpa/pubs/lovell2013cluster.pdf}, year = {2013}, date = {2013-01-01}, abstract = {The Dirichlet process (DP) is a fundamental mathematical tool for Bayesian nonparametric modeling, and is widely used in tasks such as density estimation, natural language processing, and time series modeling. Although MCMC inference methods for the DP often provide a gold standard in terms asymptotic accuracy, they can be computationally expensive and are not obviously parallelizable. We propose a reparameterization of the Dirichlet process that induces conditional independencies between the atoms that form the random measure. This conditional independence enables many of the Markov chain transition operators for DP inference to be simulated in parallel across multiple cores. Applied to mixture modeling, our approach enables the Dirichlet process to simultaneously learn clusters that describe the data and superclusters that define the granularity of parallelization. Unlike previous approaches, our technique does not require alteration of the model and leaves the true posterior distribution invariant. It also naturally lends itself to a distributed software implementation in terms of Map-Reduce, which we test in cluster configurations of over 50 machines and 100 cores. We present experiments exploring the parallel efficiency and convergence properties of our approach on both synthetic and real-world data, including runs on 1MM data vectors in 256 dimensions.}, note = {arXiv:1304.2302 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {unpublished} } The Dirichlet process (DP) is a fundamental mathematical tool for Bayesian nonparametric modeling, and is widely used in tasks such as density estimation, natural language processing, and time series modeling. Although MCMC inference methods for the DP often provide a gold standard in terms asymptotic accuracy, they can be computationally expensive and are not obviously parallelizable. We propose a reparameterization of the Dirichlet process that induces conditional independencies between the atoms that form the random measure. This conditional independence enables many of the Markov chain transition operators for DP inference to be simulated in parallel across multiple cores. Applied to mixture modeling, our approach enables the Dirichlet process to simultaneously learn clusters that describe the data and superclusters that define the granularity of parallelization. Unlike previous approaches, our technique does not require alteration of the model and leaves the true posterior distribution invariant. It also naturally lends itself to a distributed software implementation in terms of Map-Reduce, which we test in cluster configurations of over 50 machines and 100 cores. We present experiments exploring the parallel efficiency and convergence properties of our approach on both synthetic and real-world data, including runs on 1MM data vectors in 256 dimensions. |

Rippel, Oren; Adams, Ryan P High-Dimensional Probability Estimation with Deep Density Models Unpublished 2013, (arXiv:1302.5125 [stat.ML]). @unpublished{rippel2013density, title = {High-Dimensional Probability Estimation with Deep Density Models}, author = {Oren Rippel and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/rippel2013density.pdf}, year = {2013}, date = {2013-01-01}, abstract = {One of the fundamental problems in machine learning is the estimation of a probability distribution from data. Many techniques have been proposed to study the structure of data, most often building around the assumption that observations lie on a lower-dimensional manifold of high probability. It has been more difficult, however, to exploit this insight to build explicit, tractable density models for high-dimensional data. In this paper, we introduce the deep density model (DDM), a new approach to density estimation. We exploit insights from deep learning to construct a bijective map to a representation space, under which the transformation of the distribution of the data is approximately factorized and has identical and known marginal densities. The simplicity of the latent distribution under the model allows us to feasibly explore it, and the invertibility of the map to characterize contraction of measure across it. This enables us to compute normalized densities for out-of-sample data. This combination of tractability and flexibility allows us to tackle a variety of probabilistic tasks on high-dimensional datasets, including: rapid computation of normalized densities at test-time without evaluating a partition function; generation of samples without MCMC; and characterization of the joint entropy of the data.}, note = {arXiv:1302.5125 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {unpublished} } One of the fundamental problems in machine learning is the estimation of a probability distribution from data. Many techniques have been proposed to study the structure of data, most often building around the assumption that observations lie on a lower-dimensional manifold of high probability. It has been more difficult, however, to exploit this insight to build explicit, tractable density models for high-dimensional data. In this paper, we introduce the deep density model (DDM), a new approach to density estimation. We exploit insights from deep learning to construct a bijective map to a representation space, under which the transformation of the distribution of the data is approximately factorized and has identical and known marginal densities. The simplicity of the latent distribution under the model allows us to feasibly explore it, and the invertibility of the map to characterize contraction of measure across it. This enables us to compute normalized densities for out-of-sample data. This combination of tractability and flexibility allows us to tackle a variety of probabilistic tasks on high-dimensional datasets, including: rapid computation of normalized densities at test-time without evaluating a partition function; generation of samples without MCMC; and characterization of the joint entropy of the data. |

## 2012 |

Tarlow, Daniel; Adams, Ryan P; Zemel, Richard S Randomized Optimum Models for Structured Prediction Conference Proceedings of the 15th International Conference on Artificial Intelligence and Statistics (AISTATS), 2012. @conference{tarlow2012optimum, title = {Randomized Optimum Models for Structured Prediction}, author = {Daniel Tarlow and Ryan P. Adams and Richard S. Zemel}, url = {http://www.cs.princeton.edu/~rpa/pubs/tarlow2012optimum.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Proceedings of the 15th International Conference on Artificial Intelligence and Statistics (AISTATS)}, abstract = {One approach to modeling structured discrete data is to describe the probability of states via an energy function and Gibbs distribution. A recurring difficulty in these models is the computation of the partition function, which may require an intractable sum. However, in many such models, the mode can be found efficiently even when the partition function is unavailable. Recent work on Perturb-and-MAP (PM) models has exploited this discrepancy to approximate the Gibbs distribution for Markov random fields (MRFs). Here, we explore a broader class of models, called Randomized Optimum Models (RandOMs), which include PM as a special case. This new class of models encompasses not only MRFs, but also more general types of models that have intractable partition functions but permit efficient optimization, such as those based on bipartite matchings, shortest paths, or connected components in a graph. We develop several likelihood-based learning algorithms for RandOMs, and our empirical results indicate that our methods can produce better models than the moment-matching of PM.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } One approach to modeling structured discrete data is to describe the probability of states via an energy function and Gibbs distribution. A recurring difficulty in these models is the computation of the partition function, which may require an intractable sum. However, in many such models, the mode can be found efficiently even when the partition function is unavailable. Recent work on Perturb-and-MAP (PM) models has exploited this discrepancy to approximate the Gibbs distribution for Markov random fields (MRFs). Here, we explore a broader class of models, called Randomized Optimum Models (RandOMs), which include PM as a special case. This new class of models encompasses not only MRFs, but also more general types of models that have intractable partition functions but permit efficient optimization, such as those based on bipartite matchings, shortest paths, or connected components in a graph. We develop several likelihood-based learning algorithms for RandOMs, and our empirical results indicate that our methods can produce better models than the moment-matching of PM. |

Snoek, Jasper; Adams, Ryan P; Larochelle, Hugo On Nonparametric Guidance for Learning Autoencoder Representations Conference Proceedings of the 15th International Conference on Artificial Intelligence and Statistics (AISTATS), 2012, (arXiv:1102.1492v4 [stat.ML]). @conference{snoek2012guidance, title = {On Nonparametric Guidance for Learning Autoencoder Representations}, author = {Jasper Snoek and Ryan P. Adams and Hugo Larochelle}, url = {http://www.cs.princeton.edu/~rpa/pubs/snoek2012guidance.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Proceedings of the 15th International Conference on Artificial Intelligence and Statistics (AISTATS)}, abstract = {Unsupervised discovery of latent representations, in addition to being useful for density modeling, visualisation and exploratory data analysis, is also increasingly important for learning features relevant to discriminative tasks. Autoencoders, in particular, have proven to be an effective way to learn latent codes that reflect meaningful variations in data. A continuing challenge, however, is guiding an autoencoder toward representations that are useful for particular tasks. A complementary challenge is to find codes that are invariant to irrelevant transformations of the data. The most common way of introducing such problem-specific guidance in autoencoders has been through the incorporation of a parametric component that ties the latent representation to the label information. In this work, we argue that a preferable approach relies instead on a nonparametric guidance mechanism. Conceptually, it ensures that there exists a function that can predict the label information, without explicitly instantiating that function. The superiority of this guidance mechanism is confirmed on two datasets. In particular, this approach is able to incorporate invariance information (lighting, elevation, etc.) from the small NORB object recognition dataset and yields state-of-the-art performance for a single layer, non-convolutional network.}, note = {arXiv:1102.1492v4 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Unsupervised discovery of latent representations, in addition to being useful for density modeling, visualisation and exploratory data analysis, is also increasingly important for learning features relevant to discriminative tasks. Autoencoders, in particular, have proven to be an effective way to learn latent codes that reflect meaningful variations in data. A continuing challenge, however, is guiding an autoencoder toward representations that are useful for particular tasks. A complementary challenge is to find codes that are invariant to irrelevant transformations of the data. The most common way of introducing such problem-specific guidance in autoencoders has been through the incorporation of a parametric component that ties the latent representation to the label information. In this work, we argue that a preferable approach relies instead on a nonparametric guidance mechanism. Conceptually, it ensures that there exists a function that can predict the label information, without explicitly instantiating that function. The superiority of this guidance mechanism is confirmed on two datasets. In particular, this approach is able to incorporate invariance information (lighting, elevation, etc.) from the small NORB object recognition dataset and yields state-of-the-art performance for a single layer, non-convolutional network. |

Chua, Jeroen; Givoni, Inmar; Adams, Ryan P; Frey, Brendan Bayesian Painting by Numbers: Flexible Priors for Colour-Invariant Object Recognition Book Chapter Machine Learning for Computer Vision, Springer, Berlin, 2012. @inbook{chua2012painting, title = {Bayesian Painting by Numbers: Flexible Priors for Colour-Invariant Object Recognition}, author = {Jeroen Chua and Inmar Givoni and Ryan P. Adams and Brendan Frey}, url = {http://www.cs.princeton.edu/~rpa/pubs/chua2012painting.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Machine Learning for Computer Vision}, publisher = {Springer}, address = {Berlin}, series = {Studies in Computational Intelligence}, abstract = {Generative models of images should take into account transformations of geometry and reflectance. Then, they can provide explanations of images that are factorized into intrinsic properties that are useful for subsequent tasks, such as object classification. It was previously shown how images and objects within images could be described as compositions of regions called structural elements or "stels". In this way, transformations of the reflectance and illumination of object parts could be accounted for using a hidden variable that is used to "paint" the same stel differently in different images. For example, the stel corresponding to the petals of a flower can be red in one image and yellow in another. Previous stel models have used a fixed number of stels per image and per image class. Here, we introduce a Bayesian stel model, the colour-invariant admixture (CIA) model, which can infer different numbers of stels for different object types, as appropriate. Results on Caltech101 images show that this method is capable of automatically selecting a number of stels that reflects the complexity of the object class and that these stels are useful for object recognition.}, keywords = {}, pubstate = {published}, tppubtype = {inbook} } Generative models of images should take into account transformations of geometry and reflectance. Then, they can provide explanations of images that are factorized into intrinsic properties that are useful for subsequent tasks, such as object classification. It was previously shown how images and objects within images could be described as compositions of regions called structural elements or "stels". In this way, transformations of the reflectance and illumination of object parts could be accounted for using a hidden variable that is used to "paint" the same stel differently in different images. For example, the stel corresponding to the petals of a flower can be red in one image and yellow in another. Previous stel models have used a fixed number of stels per image and per image class. Here, we introduce a Bayesian stel model, the colour-invariant admixture (CIA) model, which can infer different numbers of stels for different object types, as appropriate. Results on Caltech101 images show that this method is capable of automatically selecting a number of stels that reflects the complexity of the object class and that these stels are useful for object recognition. |

Dahl, George E; Adams, Ryan P; Larochelle, Hugo Training Restricted Boltzmann Machines on Word Observations Conference Proceedings of the 29th International Conference on Machine Learning (ICML), 2012, (arXiv:1202.5695 [cs.LG]). @conference{dahl2012training, title = {Training Restricted Boltzmann Machines on Word Observations}, author = {George E. Dahl and Ryan P. Adams and Hugo Larochelle}, url = {http://www.cs.princeton.edu/~rpa/pubs/dahl2012training.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Proceedings of the 29th International Conference on Machine Learning (ICML)}, abstract = {The restricted Boltzmann machine (RBM) is a flexible model for complex data. However, using RBMs for high-dimensional multinomial observations poses significant computational di␣culties. In natural language processing applications, words are naturally modeled by K-ary discrete distributions, where K is determined by the vocabulary size and can easily be in the hundred thousands. The conventional approach to training RBMs on word observations is limited because it requires sampling the states of K-way softmax visible units during block Gibbs updates, an operation that takes time linear in K. In this work, we address this issue with a more general class of Markov chain Monte Carlo operators on the visible units, yielding updates with computational complexity independent of K. We demonstrate the success of our approach by training RBMs on hundreds of millions of word n-grams using larger vocabularies than previously feasible with RBMs and by using the learned features to improve performance on chunking and sentiment classification tasks, achieving state-of-the-art results on the latter.}, note = {arXiv:1202.5695 [cs.LG]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } The restricted Boltzmann machine (RBM) is a flexible model for complex data. However, using RBMs for high-dimensional multinomial observations poses significant computational di␣culties. In natural language processing applications, words are naturally modeled by K-ary discrete distributions, where K is determined by the vocabulary size and can easily be in the hundred thousands. The conventional approach to training RBMs on word observations is limited because it requires sampling the states of K-way softmax visible units during block Gibbs updates, an operation that takes time linear in K. In this work, we address this issue with a more general class of Markov chain Monte Carlo operators on the visible units, yielding updates with computational complexity independent of K. We demonstrate the success of our approach by training RBMs on hundreds of millions of word n-grams using larger vocabularies than previously feasible with RBMs and by using the learned features to improve performance on chunking and sentiment classification tasks, achieving state-of-the-art results on the latter. |

Tarlow, Daniel; Adams, Ryan P Revisiting Uncertainty in Graph Cut Solutions Conference Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2012. @conference{tarlow2012revisiting, title = {Revisiting Uncertainty in Graph Cut Solutions}, author = {Daniel Tarlow and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/tarlow2012revisiting.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, abstract = {Graph cuts is a popular algorithm for finding the MAP assignment of many large-scale graphical models that are common in computer vision. While graph cuts is powerful, it does not provide information about the marginal proba- bilities associated with the solution it finds. To assess uncer- tainty, we are forced to fall back on less efficient and inexact inference algorithms such as loopy belief propagation, or use less principled surrogate representations of uncertainty such as the min-marginal approach of Kohli & Torr (2008). In this work, we give new justification for using min- marginals to compute the uncertainty in conditional ran- dom fields, framing the min-marginal outputs as exact marginals under a specially-chosen generative probabilis- tic model. We leverage this view to learn properly cali- brated marginal probabilities as the result of straightfor- ward maximization of the training likelihood, showing that the necessary subgradients can be computed efficiently us- ing dynamic graph cut operations. We also show how this approach can be extended to compute multi-label marginal distributions, where again dynamic graph cuts enable ef- ficient marginal inference and maximum likelihood learn- ing. We demonstrate empirically that — after proper train- ing — uncertainties based on min-marginals provide better- calibrated probabilities than baselines and that these dis- tributions can be exploited in a decision-theoretic way for improved segmentation in low-level vision.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Graph cuts is a popular algorithm for finding the MAP assignment of many large-scale graphical models that are common in computer vision. While graph cuts is powerful, it does not provide information about the marginal proba- bilities associated with the solution it finds. To assess uncer- tainty, we are forced to fall back on less efficient and inexact inference algorithms such as loopy belief propagation, or use less principled surrogate representations of uncertainty such as the min-marginal approach of Kohli & Torr (2008). In this work, we give new justification for using min- marginals to compute the uncertainty in conditional ran- dom fields, framing the min-marginal outputs as exact marginals under a specially-chosen generative probabilis- tic model. We leverage this view to learn properly cali- brated marginal probabilities as the result of straightfor- ward maximization of the training likelihood, showing that the necessary subgradients can be computed efficiently us- ing dynamic graph cut operations. We also show how this approach can be extended to compute multi-label marginal distributions, where again dynamic graph cuts enable ef- ficient marginal inference and maximum likelihood learn- ing. We demonstrate empirically that — after proper train- ing — uncertainties based on min-marginals provide better- calibrated probabilities than baselines and that these dis- tributions can be exploited in a decision-theoretic way for improved segmentation in low-level vision. |

Chua, Jeroen; Givoni, Inmar; Adams, Ryan P; Frey, Brendan Learning Structural Element Patch Models with Hierarchical Palettes Conference Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2012. @conference{chua2012stel, title = {Learning Structural Element Patch Models with Hierarchical Palettes}, author = {Jeroen Chua and Inmar Givoni and Ryan P. Adams and Brendan Frey}, url = {http://www.cs.princeton.edu/~rpa/pubs/chua2012stel.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, abstract = {Image patches can be factorized into ‘shapelets’ that describe segmentation patterns called structural elements (stels), and palettes that describe how to paint the shapelets. We introduce local palettes for patches, global palettes for entire images and universal palettes for image collections. Using a learned shapelet library, patches from a test image can be analyzed using a variational technique to produce an image descriptor that represents local shapes and colors separately. We show that the shapelet model performs better than SIFT, Gist and the standard stel method on Caltech28 and is very competitive with other methods on Caltech101.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Image patches can be factorized into ‘shapelets’ that describe segmentation patterns called structural elements (stels), and palettes that describe how to paint the shapelets. We introduce local palettes for patches, global palettes for entire images and universal palettes for image collections. Using a learned shapelet library, patches from a test image can be analyzed using a variational technique to produce an image descriptor that represents local shapes and colors separately. We show that the shapelet model performs better than SIFT, Gist and the standard stel method on Caltech28 and is very competitive with other methods on Caltech101. |

Tarlow, Daniel; Swersky, Kevin; Zemel, Richard S; Adams, Ryan P; Frey, Brendan Fast Exact Inference in Recursive Cardinality Models Conference Proceedings of the 28th Conference on Uncertainty in Artificial Intelligence (UAI), 2012. @conference{tarlow2012cardinality, title = {Fast Exact Inference in Recursive Cardinality Models}, author = {Daniel Tarlow and Kevin Swersky and Richard S. Zemel and Ryan P. Adams and Brendan Frey}, url = {http://www.cs.princeton.edu/~rpa/pubs/tarlow2012cardinality.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Proceedings of the 28th Conference on Uncertainty in Artificial Intelligence (UAI)}, abstract = {Cardinality potentials are a generally useful class of high order potential that affect probabilities based on how many of D binary variables are active. Maximum a posteriori (MAP) inference for cardinality potential models is well-understood, with efficient computations taking O(D log D) time. Yet efficient marginalization and sampling have not been addressed as thoroughly in the machine learning community. We show that there exists a simple algorithm for computing marginal probabilities and drawing exact joint samples that runs in O(D log2 D) time, and we show how to frame the algorithm as efficient belief propagation in a low order tree-structured model that includes additional auxiliary variables. We then develop a new, more general class of models, termed Recursive Cardinality models, which take advantage of this efficiency. Finally, we show how to do efficient exact inference in models composed of a tree structure and a cardinality potential. We explore the expressive power of Recursive Cardinality models and empirically demonstrate their utility.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Cardinality potentials are a generally useful class of high order potential that affect probabilities based on how many of D binary variables are active. Maximum a posteriori (MAP) inference for cardinality potential models is well-understood, with efficient computations taking O(D log D) time. Yet efficient marginalization and sampling have not been addressed as thoroughly in the machine learning community. We show that there exists a simple algorithm for computing marginal probabilities and drawing exact joint samples that runs in O(D log2 D) time, and we show how to frame the algorithm as efficient belief propagation in a low order tree-structured model that includes additional auxiliary variables. We then develop a new, more general class of models, termed Recursive Cardinality models, which take advantage of this efficiency. Finally, we show how to do efficient exact inference in models composed of a tree structure and a cardinality potential. We explore the expressive power of Recursive Cardinality models and empirically demonstrate their utility. |

Lehman, Li-Wei; Nemati, Shamim; Adams, Ryan P; Mark, Roger Discovering Shared Dynamics in Physiological Signals: Application to Patient Monitoring in ICU Conference Proceedings of the International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC), 2012. @conference{lehman2012dynamics, title = {Discovering Shared Dynamics in Physiological Signals: Application to Patient Monitoring in ICU}, author = {Li-Wei Lehman and Shamim Nemati and Ryan P. Adams and Roger Mark}, url = {http://www.cs.princeton.edu/~rpa/pubs/lehman2012dynamics.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Proceedings of the International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC)}, abstract = {Modern clinical databases include time series of vital signs, which are often recorded continuously during a hospital stay. Over several days, these recordings may yield many thousands of samples. In this work, we explore the feasibility of characterizing the “state of health” of a patient using the physiological dynamics inferred from these time series. The ultimate objective is to assist clinicians in allocating resources to high-risk patients. We hypothesize that “similar” patients exhibit similar dynamics and the properties and dura- tion of these states are indicative of health and disease. We use Bayesian nonparametric machine learning methods to discover shared dynamics in patients’ blood pressure (BP) time series. Each such “dynamic” captures a distinct pattern of evolution of BP and is possibly recurrent within the same time series and shared across multiple patients. Next, we examine the utility of this low-dimensional representation of BP time series for predicting mortality in patients. Our results are based on an intensive care unit (ICU) cohort of 480 patients (with 16% mortality) and indicate that the dynamics of time series of vital signs can be an independent useful predictor of outcome in ICU.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Modern clinical databases include time series of vital signs, which are often recorded continuously during a hospital stay. Over several days, these recordings may yield many thousands of samples. In this work, we explore the feasibility of characterizing the “state of health” of a patient using the physiological dynamics inferred from these time series. The ultimate objective is to assist clinicians in allocating resources to high-risk patients. We hypothesize that “similar” patients exhibit similar dynamics and the properties and dura- tion of these states are indicative of health and disease. We use Bayesian nonparametric machine learning methods to discover shared dynamics in patients’ blood pressure (BP) time series. Each such “dynamic” captures a distinct pattern of evolution of BP and is possibly recurrent within the same time series and shared across multiple patients. Next, we examine the utility of this low-dimensional representation of BP time series for predicting mortality in patients. Our results are based on an intensive care unit (ICU) cohort of 480 patients (with 16% mortality) and indicate that the dynamics of time series of vital signs can be an independent useful predictor of outcome in ICU. |

Nemati, Shamim; Lehman, Li-Wei; Adams, Ryan P; Malhotra, Atul Discovering Shared Cardiovascular Dynamics within a Patient Cohort Conference Proceedings of the International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC), 2012. @conference{nemati2012cardio, title = {Discovering Shared Cardiovascular Dynamics within a Patient Cohort}, author = {Shamim Nemati and Li-Wei Lehman and Ryan P. Adams and Atul Malhotra}, url = {http://www.cs.princeton.edu/~rpa/pubs/nemati2012cardio.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Proceedings of the International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC)}, abstract = {Cardiovascular variables such as heart rate (HR) and blood pressure (BP) are robustly regulated by an underlying control system. Time series of HR and BP exhibit distinct dynamical patterns of interaction in response to perturbations (e.g., drugs or exercise) as well as in pathological states (e.g., excessive sympathetic activation). A question of interest is whether “similar” dynamical patterns can be identified across a heterogeneous patient cohort. In this work, we present a technique based on switching linear dynamical systems for identification of shared dynamical patterns in the time series of HR and BP recorded from a patient cohort. The technique uses a mixture of linear dynamical systems, the components of which are shared across all patients, to capture both nonlinear dynamics and non-Gaussian perturbations. We present exploratory results based on a simulation study of the cardiovascular system, and real recordings from 10 healthy subjects undergoing a tilt-table test. These results demonstrate the ability of the proposed technique to identify similar dynamical patterns present across multiple time series.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Cardiovascular variables such as heart rate (HR) and blood pressure (BP) are robustly regulated by an underlying control system. Time series of HR and BP exhibit distinct dynamical patterns of interaction in response to perturbations (e.g., drugs or exercise) as well as in pathological states (e.g., excessive sympathetic activation). A question of interest is whether “similar” dynamical patterns can be identified across a heterogeneous patient cohort. In this work, we present a technique based on switching linear dynamical systems for identification of shared dynamical patterns in the time series of HR and BP recorded from a patient cohort. The technique uses a mixture of linear dynamical systems, the components of which are shared across all patients, to capture both nonlinear dynamics and non-Gaussian perturbations. We present exploratory results based on a simulation study of the cardiovascular system, and real recordings from 10 healthy subjects undergoing a tilt-table test. These results demonstrate the ability of the proposed technique to identify similar dynamical patterns present across multiple time series. |

Snoek, Jasper; Adams, Ryan P; Larochelle, Hugo Nonparametric Guidance of Autoencoder Representations Using Label Information Journal Article Journal of Machine Learning Research, 13 , pp. 2567–2588, 2012. @article{snoek2012autoencoder, title = {Nonparametric Guidance of Autoencoder Representations Using Label Information}, author = {Jasper Snoek and Ryan P. Adams and Hugo Larochelle}, url = {http://www.cs.princeton.edu/~rpa/pubs/snoek2012autoencoder.pdf}, year = {2012}, date = {2012-01-01}, journal = {Journal of Machine Learning Research}, volume = {13}, pages = {2567--2588}, abstract = {While unsupervised learning has long been useful for density modeling, exploratory data analysis and visualization, it has become increasingly important for discovering features that will later be used for discriminative tasks. Discriminative algorithms often work best with highly-informative features; remarkably, such features can often be learned without the labels. One particularly effective way to perform such unsupervised learning has been to use autoencoder neural networks, which find latent representations that are constrained but nevertheless informative for reconstruction. However, pure unsupervised learning with autoencoders can find representations that may or may not be useful for the ultimate discriminative task. It is a continuing challenge to guide the training of an autoencoder so that it finds features which will be useful for predicting labels. Similarly, we often have a priori information regarding what statistical variation will be irrelevant to the ultimate discriminative task, and we would like to be able to use this for guidance as well. Although a typical strategy would be to include a parametric discriminative model as part of the autoencoder training, here we propose a nonparametric approach that uses a Gaussian process to guide the representation. By using a nonparametric model, we can ensure that a useful discriminative function exists for a given set of features, without explicitly instantiating it. We demonstrate the superiority of this guidance mechanism on four data sets, including a real-world application to rehabilitation research. We also show how our proposed approach can learn to explicitly ignore statistically significant covariate information that is label-irrelevant, by evaluating on the small NORB image recognition problem in which pose and lighting labels are available.}, keywords = {}, pubstate = {published}, tppubtype = {article} } While unsupervised learning has long been useful for density modeling, exploratory data analysis and visualization, it has become increasingly important for discovering features that will later be used for discriminative tasks. Discriminative algorithms often work best with highly-informative features; remarkably, such features can often be learned without the labels. One particularly effective way to perform such unsupervised learning has been to use autoencoder neural networks, which find latent representations that are constrained but nevertheless informative for reconstruction. However, pure unsupervised learning with autoencoders can find representations that may or may not be useful for the ultimate discriminative task. It is a continuing challenge to guide the training of an autoencoder so that it finds features which will be useful for predicting labels. Similarly, we often have a priori information regarding what statistical variation will be irrelevant to the ultimate discriminative task, and we would like to be able to use this for guidance as well. Although a typical strategy would be to include a parametric discriminative model as part of the autoencoder training, here we propose a nonparametric approach that uses a Gaussian process to guide the representation. By using a nonparametric model, we can ensure that a useful discriminative function exists for a given set of features, without explicitly instantiating it. We demonstrate the superiority of this guidance mechanism on four data sets, including a real-world application to rehabilitation research. We also show how our proposed approach can learn to explicitly ignore statistically significant covariate information that is label-irrelevant, by evaluating on the small NORB image recognition problem in which pose and lighting labels are available. |

Swersky, Kevin; Tarlow, Daniel; Adams, Ryan P; Zemel, Richard S; Frey, Brendan Probabilistic n-choose-k Models for Classification and Ranking Conference Advances in Neural Information Processing Systems (NIPS) 25, 2012. @conference{swersky2012choose, title = {Probabilistic n-choose-k Models for Classification and Ranking}, author = {Kevin Swersky and Daniel Tarlow and Ryan P. Adams and Richard S. Zemel and Brendan Frey}, url = {http://www.cs.princeton.edu/~rpa/pubs/swersky2012choose.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 25}, abstract = {In categorical data there is often structure in the number of variables that take on each label. For example, the total number of objects in an image and the number of highly relevant documents per query in web search both tend to follow a structured distribution. In this paper, we study a probabilistic model that explicitly includes a prior distribution over such counts, along with a count-conditional likelihood that defines probabilities over all subsets of a given size. When labels are binary and the prior over counts is a Poisson-Binomial distribution, a standard logistic regression model is recovered, but for other count distributions, such priors induce global dependencies and combinatorics that appear to complicate learning and inference. However, we demonstrate that simple, efficient learning procedures can be derived for more general forms of this model. We illustrate the utility of the formulation by exploring applications to multi-object classification, learning to rank, and top-K classification.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } In categorical data there is often structure in the number of variables that take on each label. For example, the total number of objects in an image and the number of highly relevant documents per query in web search both tend to follow a structured distribution. In this paper, we study a probabilistic model that explicitly includes a prior distribution over such counts, along with a count-conditional likelihood that defines probabilities over all subsets of a given size. When labels are binary and the prior over counts is a Poisson-Binomial distribution, a standard logistic regression model is recovered, but for other count distributions, such priors induce global dependencies and combinatorics that appear to complicate learning and inference. However, we demonstrate that simple, efficient learning procedures can be derived for more general forms of this model. We illustrate the utility of the formulation by exploring applications to multi-object classification, learning to rank, and top-K classification. |

Zou, James Y; Adams, Ryan P Priors for Diversity in Generative Latent Variable Models Conference Advances in Neural Information Processing Systems (NIPS) 25, 2012. @conference{zou2012diversity, title = {Priors for Diversity in Generative Latent Variable Models}, author = {James Y. Zou and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/zou2012diversity.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 25}, abstract = {Probabilistic latent variable models are one of the cornerstones of machine learning. They offer a convenient and coherent way to specify prior distributions over unobserved structure in data, so that these unknown properties can be inferred via posterior inference. Such models are useful for exploratory analysis and visualization, for building density models of data, and for providing features that can be used for later discriminative tasks. A significant limitation of these models, however, is that draws from the prior are often highly redundant due to i.i.d. assumptions on internal parameters. For example, there is no preference in the prior of a mixture model to make components non-overlapping, or in topic model to ensure that co-occurring words only appear in a small number of topics. In this work, we revisit these independence assumptions for probabilistic latent variable models, replacing the underlying i.i.d. prior with a determinantal point process (DPP). The DPP allows us to specify a preference for diversity in our latent variables using a positive definite kernel function. Using a kernel between probability distributions, we are able to define a DPP on probability measures. We show how to perform MAP inference with DPP priors in latent Dirichlet allocation and in mixture models, leading to better intuition for the latent variable representation and quantitatively improved unsupervised feature extraction, without compromising the generative aspects of the model.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Probabilistic latent variable models are one of the cornerstones of machine learning. They offer a convenient and coherent way to specify prior distributions over unobserved structure in data, so that these unknown properties can be inferred via posterior inference. Such models are useful for exploratory analysis and visualization, for building density models of data, and for providing features that can be used for later discriminative tasks. A significant limitation of these models, however, is that draws from the prior are often highly redundant due to i.i.d. assumptions on internal parameters. For example, there is no preference in the prior of a mixture model to make components non-overlapping, or in topic model to ensure that co-occurring words only appear in a small number of topics. In this work, we revisit these independence assumptions for probabilistic latent variable models, replacing the underlying i.i.d. prior with a determinantal point process (DPP). The DPP allows us to specify a preference for diversity in our latent variables using a positive definite kernel function. Using a kernel between probability distributions, we are able to define a DPP on probability measures. We show how to perform MAP inference with DPP priors in latent Dirichlet allocation and in mixture models, leading to better intuition for the latent variable representation and quantitatively improved unsupervised feature extraction, without compromising the generative aspects of the model. |

Snoek, Jasper; Larochelle, Hugo; Adams, Ryan P Practical Bayesian Optimization of Machine Learning Algorithms Conference Advances in Neural Information Processing Systems (NIPS) 25, 2012, (arXiv:1206.2944 [stat.ML]). @conference{snoek2012practical, title = {Practical Bayesian Optimization of Machine Learning Algorithms}, author = {Jasper Snoek and Hugo Larochelle and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/snoek2012practical.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 25}, abstract = {The use of machine learning algorithms frequently involves careful tuning of learning parameters and model hyperparameters. Unfortunately, this tuning is often a ``black art'' requiring expert experience, rules of thumb, or sometimes brute-force search. There is therefore great appeal for automatic approaches that can optimize the performance of any given learning algorithm to the problem at hand. In this work, we consider this problem through the framework of Bayesian optimization, in which a learning algorithm's generalization performance is modeled as a sample from a Gaussian process (GP). We show that certain choices for the nature of the GP, such as the type of kernel and the treatment of its hyperparameters, can play a crucial role in obtaining a good optimizer that can achieve expert-level performance. We describe new algorithms that take into account the variable cost (duration) of learning algorithm experiments and that can leverage the presence of multiple cores for parallel experimentation. We show that these proposed algorithms improve on previous automatic procedures and can reach or surpass human expert-level optimization for many algorithms including latent Dirichlet allocation, structured SVMs and convolutional neural networks.}, note = {arXiv:1206.2944 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } The use of machine learning algorithms frequently involves careful tuning of learning parameters and model hyperparameters. Unfortunately, this tuning is often a ``black art'' requiring expert experience, rules of thumb, or sometimes brute-force search. There is therefore great appeal for automatic approaches that can optimize the performance of any given learning algorithm to the problem at hand. In this work, we consider this problem through the framework of Bayesian optimization, in which a learning algorithm's generalization performance is modeled as a sample from a Gaussian process (GP). We show that certain choices for the nature of the GP, such as the type of kernel and the treatment of its hyperparameters, can play a crucial role in obtaining a good optimizer that can achieve expert-level performance. We describe new algorithms that take into account the variable cost (duration) of learning algorithm experiments and that can leverage the presence of multiple cores for parallel experimentation. We show that these proposed algorithms improve on previous automatic procedures and can reach or surpass human expert-level optimization for many algorithms including latent Dirichlet allocation, structured SVMs and convolutional neural networks. |

Swersky, Kevin; Tarlow, Daniel; Sutskever, Ilya; Salakhutdinov, Ruslan ; Zemel, Richard S; Adams, Ryan P Cardinality Restricted Boltzmann Machines Conference Advances in Neural Information Processing Systems (NIPS) 25, 2012. @conference{swersky2012cardinality, title = {Cardinality Restricted Boltzmann Machines}, author = {Kevin Swersky and Daniel Tarlow and Ilya Sutskever and Ruslan Salakhutdinov and Richard S. Zemel and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/swersky2012cardinality.pdf}, year = {2012}, date = {2012-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 25}, abstract = {The Restricted Boltzmann Machine (RBM) is a popular density model that is also good for extracting features. A main source of tractability in RBM models is that, given an input, the posterior distribution over hidden variables is factorizable and can be easily computed and sampled from. Sparsity and competition in the hidden representation is beneficial, and while an RBM with competition among its hidden units would acquire some of the attractive properties of sparse coding, such constraints are typically not added, as the resulting posterior over the hid- den units seemingly becomes intractable. In this paper we show that a dynamic programming algorithm can be used to implement exact sparsity in the RBM’s hidden units. We also show how to pass derivatives through the resulting posterior marginals, which makes it possible to fine-tune a pre-trained neural network with sparse hidden layers.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } The Restricted Boltzmann Machine (RBM) is a popular density model that is also good for extracting features. A main source of tractability in RBM models is that, given an input, the posterior distribution over hidden variables is factorizable and can be easily computed and sampled from. Sparsity and competition in the hidden representation is beneficial, and while an RBM with competition among its hidden units would acquire some of the attractive properties of sparse coding, such constraints are typically not added, as the resulting posterior over the hid- den units seemingly becomes intractable. In this paper we show that a dynamic programming algorithm can be used to implement exact sparsity in the RBM’s hidden units. We also show how to pass derivatives through the resulting posterior marginals, which makes it possible to fine-tune a pre-trained neural network with sparse hidden layers. |

## 2011 |

Adams, Ryan P; Zemel, Richard S Ranking via Sinkhorn Propagation Unpublished 2011, (arXiv:1106.1925 [stat.ML]). @unpublished{adams2011sinkhorn, title = {Ranking via Sinkhorn Propagation}, author = {Ryan P. Adams and Richard S. Zemel}, url = {http://www.cs.princeton.edu/~rpa/pubs/adams2011sinkhorn.pdf}, year = {2011}, date = {2011-01-01}, abstract = {It is of increasing importance to develop learning methods for ranking. In contrast to many learning objectives, however, the ranking problem presents difficulties due to the fact that the space of permutations is not smooth. In this paper, we examine the class of emphrank-linear objective functions, which includes popular metrics such as precision and discounted cumulative gain. In particular, we observe that expectations of these gains are completely characterized by the marginals of the corresponding distribution over permutation matrices. Thus, the expectations of rank-linear objectives can always be described through locations in the Birkhoff polytope, i.e., doubly-stochastic matrices (DSMs). We propose a technique for learning DSM-based ranking functions using an iterative projection operator known as Sinkhorn normalization. Gradients of this operator can be computed via backpropagation, resulting in an algorithm we call Sinkhorn propagation, or SinkProp. This approach can be combined with a wide range of gradient-based approaches to rank learning. We demonstrate the utility of SinkProp on several information retrieval data sets.}, note = {arXiv:1106.1925 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {unpublished} } It is of increasing importance to develop learning methods for ranking. In contrast to many learning objectives, however, the ranking problem presents difficulties due to the fact that the space of permutations is not smooth. In this paper, we examine the class of emphrank-linear objective functions, which includes popular metrics such as precision and discounted cumulative gain. In particular, we observe that expectations of these gains are completely characterized by the marginals of the corresponding distribution over permutation matrices. Thus, the expectations of rank-linear objectives can always be described through locations in the Birkhoff polytope, i.e., doubly-stochastic matrices (DSMs). We propose a technique for learning DSM-based ranking functions using an iterative projection operator known as Sinkhorn normalization. Gradients of this operator can be computed via backpropagation, resulting in an algorithm we call Sinkhorn propagation, or SinkProp. This approach can be combined with a wide range of gradient-based approaches to rank learning. We demonstrate the utility of SinkProp on several information retrieval data sets. |

## 2010 |

Adams, Ryan P; Dahl, George E; Murray, Iain Incorporating Side Information into Probabilistic Matrix Factorization Using Gaussian Processes Conference Proceedings of the 26th Conference on Uncertainty in Artificial Intelligence (UAI), 2010, (arXiv:1003.4944 [stat.ML]). @conference{adams2010side, title = {Incorporating Side Information into Probabilistic Matrix Factorization Using Gaussian Processes}, author = {Ryan P. Adams and George E. Dahl and Iain Murray}, url = {http://www.cs.princeton.edu/~rpa/pubs/adams2010side.pdf}, year = {2010}, date = {2010-01-01}, booktitle = {Proceedings of the 26th Conference on Uncertainty in Artificial Intelligence (UAI)}, abstract = {Probabilistic matrix factorization (PMF) is a powerful method for modeling data associated with pairwise relationships, finding use in collaborative filtering, computational biology, and document analysis, among other areas. In many domains, there is additional information that can assist in prediction. For example, when modeling movie ratings, we might know when the rating occurred, where the user lives, or what actors appear in the movie. It is difficult, however, to incorporate this side information into the PMF model. We propose a framework for incorporating side information by coupling together multiple PMF problems via Gaussian process priors. We replace scalar latent features with functions that vary over the space of side information. The GP priors on these functions require them to vary smoothly and share information. We successfully use this new method to predict the scores of professional basketball games, where side information about the venue and date of the game are relevant for the outcome.}, note = {arXiv:1003.4944 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Probabilistic matrix factorization (PMF) is a powerful method for modeling data associated with pairwise relationships, finding use in collaborative filtering, computational biology, and document analysis, among other areas. In many domains, there is additional information that can assist in prediction. For example, when modeling movie ratings, we might know when the rating occurred, where the user lives, or what actors appear in the movie. It is difficult, however, to incorporate this side information into the PMF model. We propose a framework for incorporating side information by coupling together multiple PMF problems via Gaussian process priors. We replace scalar latent features with functions that vary over the space of side information. The GP priors on these functions require them to vary smoothly and share information. We successfully use this new method to predict the scores of professional basketball games, where side information about the venue and date of the game are relevant for the outcome. |

Murray, Iain; Adams, Ryan P; MacKay, David J C Elliptical Slice Sampling Conference Proceedings of the 13th International Conference on Artificial Intelligence and Statistics (AISTATS), 2010, (arXiv:1001.0175 [stat.CO]). @conference{murray2010elliptical, title = {Elliptical Slice Sampling}, author = {Iain Murray and Ryan P. Adams and David J.C. MacKay}, url = {http://www.cs.princeton.edu/~rpa/pubs/murray2010elliptical.pdf}, year = {2010}, date = {2010-01-01}, booktitle = {Proceedings of the 13th International Conference on Artificial Intelligence and Statistics (AISTATS)}, abstract = {Many probabilistic models introduce strong dependencies between variables using a latent multivariate Gaussian distribution or a Gaussian process. We present a new Markov chain Monte Carlo algorithm for performing inference in models with multivariate Gaussian priors. Its key properties are: 1) it has simple, generic code applicable to many models, 2) it has no free parameters, 3) it works well for a variety of Gaussian process based models. These properties make our method ideal for use while model building, removing the need to spend time deriving and tuning updates for more complex algorithms.}, note = {arXiv:1001.0175 [stat.CO]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Many probabilistic models introduce strong dependencies between variables using a latent multivariate Gaussian distribution or a Gaussian process. We present a new Markov chain Monte Carlo algorithm for performing inference in models with multivariate Gaussian priors. Its key properties are: 1) it has simple, generic code applicable to many models, 2) it has no free parameters, 3) it works well for a variety of Gaussian process based models. These properties make our method ideal for use while model building, removing the need to spend time deriving and tuning updates for more complex algorithms. |

Adams, Ryan P; Wallach, Hanna M; Ghahramani, Zoubin Learning the Structure of Deep Sparse Graphical Models Conference Proceedings of the 13th International Conference on Artificial Intelligence and Statistics (AISTATS), 2010, (arXiv:1001.0160 [stat.ML]). @conference{adams2010deep, title = {Learning the Structure of Deep Sparse Graphical Models}, author = {Ryan P. Adams and Hanna M. Wallach and Zoubin Ghahramani}, url = {http://www.cs.princeton.edu/~rpa/pubs/adams2010deep.pdf}, year = {2010}, date = {2010-01-01}, booktitle = {Proceedings of the 13th International Conference on Artificial Intelligence and Statistics (AISTATS)}, abstract = {Deep belief networks are a powerful way to model complex probability distributions. However, learning the structure of a belief network, particularly one with hidden units, is difficult. The Indian buffet process has been used as a nonparametric Bayesian prior on the directed structure of a belief network with a single infinitely wide hidden layer. In this paper, we introduce the cascading Indian buffet process (CIBP), which provides a nonparametric prior on the structure of a layered, directed belief network that is unbounded in both depth and width, yet allows tractable inference. We use the CIBP prior with the nonlinear Gaussian belief network so each unit can additionally vary its behavior between discrete and continuous representations. We provide Markov chain Monte Carlo algorithms for inference in these belief networks and explore the structures learned on several image data sets.}, note = {arXiv:1001.0160 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Deep belief networks are a powerful way to model complex probability distributions. However, learning the structure of a belief network, particularly one with hidden units, is difficult. The Indian buffet process has been used as a nonparametric Bayesian prior on the directed structure of a belief network with a single infinitely wide hidden layer. In this paper, we introduce the cascading Indian buffet process (CIBP), which provides a nonparametric prior on the structure of a layered, directed belief network that is unbounded in both depth and width, yet allows tractable inference. We use the CIBP prior with the nonlinear Gaussian belief network so each unit can additionally vary its behavior between discrete and continuous representations. We provide Markov chain Monte Carlo algorithms for inference in these belief networks and explore the structures learned on several image data sets. |

Murray, Iain; Adams, Ryan P Slice Sampling Covariance Hyperparameters in Latent Gaussian Models Conference Advances in Neural Information Processing Systems (NIPS) 23, 2010, (arXiv:1006.0868 [stat.CO]). @conference{murray2010hyper, title = {Slice Sampling Covariance Hyperparameters in Latent Gaussian Models}, author = {Iain Murray and Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/murray2010hyper.pdf}, year = {2010}, date = {2010-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 23}, abstract = {The Gaussian process (GP) is a popular way to specify dependencies between random variables in a probabilistic model. In the Bayesian framework the covariance structure can be specified using unknown hyperparameters. Integrating over these hyperparameters considers different possible explanations for the data when making predictions. This integration is often performed using Markov chain Monte Carlo (MCMC) sampling. However, with non-Gaussian observations standard hyperparameter sampling approaches require careful tuning and may converge slowly. In this paper we present a slice sampling approach that requires little tuning while mixing well in both strong- and weak-data regimes.}, note = {arXiv:1006.0868 [stat.CO]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } The Gaussian process (GP) is a popular way to specify dependencies between random variables in a probabilistic model. In the Bayesian framework the covariance structure can be specified using unknown hyperparameters. Integrating over these hyperparameters considers different possible explanations for the data when making predictions. This integration is often performed using Markov chain Monte Carlo (MCMC) sampling. However, with non-Gaussian observations standard hyperparameter sampling approaches require careful tuning and may converge slowly. In this paper we present a slice sampling approach that requires little tuning while mixing well in both strong- and weak-data regimes. |

Adams, Ryan P; Ghahramani, Zoubin; Jordan, Michael I Tree-Structured Stick Breaking for Hierarchical Data Conference Advances in Neural Information Processing Systems (NIPS) 23, 2010, (arXiv:1006.1062 [stat.ME]). @conference{adams2010tree, title = {Tree-Structured Stick Breaking for Hierarchical Data}, author = {Ryan P. Adams and Zoubin Ghahramani and Michael I. Jordan}, url = {http://www.cs.princeton.edu/~rpa/pubs/adams2010tree.pdf}, year = {2010}, date = {2010-01-01}, booktitle = {Advances in Neural Information Processing Systems (NIPS) 23}, abstract = {Many data are naturally modeled by an unobserved hierarchical structure. In this paper we propose a flexible nonparametric prior over unknown data hierarchies. The approach uses nested stick-breaking processes to allow for trees of unbounded width and depth, where data can live at any node and are infinitely exchangeable. One can view our model as providing infinite mixtures where the components have a dependency structure corresponding to an evolutionary diffusion down a tree. By using a stick-breaking approach, we can apply Markov chain Monte Carlo methods based on slice sampling to perform Bayesian inference and simulate from the posterior distribution on trees. We apply our method to hierarchical clustering of images and topic modeling of text data.}, note = {arXiv:1006.1062 [stat.ME]}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Many data are naturally modeled by an unobserved hierarchical structure. In this paper we propose a flexible nonparametric prior over unknown data hierarchies. The approach uses nested stick-breaking processes to allow for trees of unbounded width and depth, where data can live at any node and are infinitely exchangeable. One can view our model as providing infinite mixtures where the components have a dependency structure corresponding to an evolutionary diffusion down a tree. By using a stick-breaking approach, we can apply Markov chain Monte Carlo methods based on slice sampling to perform Bayesian inference and simulate from the posterior distribution on trees. We apply our method to hierarchical clustering of images and topic modeling of text data. |

## 2009 |

Adams, Ryan P; Ghahramani, Zoubin Archipelago: Nonparametric Bayesian Semi-Supervised Learning Conference Proceedings of the 26th International Conference on Machine Learning (ICML), 2009. @conference{adams2009archipelago, title = {Archipelago: Nonparametric Bayesian Semi-Supervised Learning}, author = {Ryan P. Adams and Zoubin Ghahramani}, url = {http://www.cs.princeton.edu/~rpa/pubs/adams2009archipelago.pdf}, year = {2009}, date = {2009-01-01}, booktitle = {Proceedings of the 26th International Conference on Machine Learning (ICML)}, abstract = {Semi-supervised learning (SSL), is classification where additional unlabeled data can be used to improve accuracy. Generative approaches are appealing in this situation, as a model of the data's probability density can assist in identifying clusters. Nonparametric Bayesian methods, while ideal in theory due to their principled motivations, have been difficult to apply to SSL in practice. We present a nonparametric Bayesian method that uses Gaussian processes for the generative model, avoiding many of the problems associated with Dirichlet process mixture models. Our model is fully generative and we take advantage of recent advances in Markov chain Monte Carlo algorithms to provide a practical inference method. Our method compares favorably to competing approaches on synthetic and real-world multi-class data.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Semi-supervised learning (SSL), is classification where additional unlabeled data can be used to improve accuracy. Generative approaches are appealing in this situation, as a model of the data's probability density can assist in identifying clusters. Nonparametric Bayesian methods, while ideal in theory due to their principled motivations, have been difficult to apply to SSL in practice. We present a nonparametric Bayesian method that uses Gaussian processes for the generative model, avoiding many of the problems associated with Dirichlet process mixture models. Our model is fully generative and we take advantage of recent advances in Markov chain Monte Carlo algorithms to provide a practical inference method. Our method compares favorably to competing approaches on synthetic and real-world multi-class data. |

Adams, Ryan P; Murray, Iain; MacKay, David J C The Gaussian Process Density Sampler Conference Advances in Neural Information Processing Systems 21 (NIPS), 2009. @conference{adams2009gpds, title = {The Gaussian Process Density Sampler}, author = {Ryan P. Adams and Iain Murray and David J.C. MacKay}, url = {http://www.cs.princeton.edu/~rpa/pubs/adams2009gpds.pdf}, year = {2009}, date = {2009-01-01}, booktitle = {Advances in Neural Information Processing Systems 21 (NIPS)}, abstract = {We present the Gaussian Process Density Sampler (GPDS), an exchangeable generative model for use in nonparametric Bayesian density estimation. Samples drawn from the GPDS are consistent with exact, independent samples from a fixed density function that is a transformation of a function drawn from a Gaussian process prior. Our formulation allows us to infer an unknown density from data using Markov chain Monte Carlo, which gives samples from the posterior distribution over density functions and from the predictive distribution on data space. We can also infer the hyperparameters of the Gaussian process. We compare this density modeling technique to several existing techniques on a toy problem and a skull-reconstruction task.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } We present the Gaussian Process Density Sampler (GPDS), an exchangeable generative model for use in nonparametric Bayesian density estimation. Samples drawn from the GPDS are consistent with exact, independent samples from a fixed density function that is a transformation of a function drawn from a Gaussian process prior. Our formulation allows us to infer an unknown density from data using Markov chain Monte Carlo, which gives samples from the posterior distribution over density functions and from the predictive distribution on data space. We can also infer the hyperparameters of the Gaussian process. We compare this density modeling technique to several existing techniques on a toy problem and a skull-reconstruction task. |

Adams, Ryan P; Murray, Iain; MacKay, David J C Tractable Nonparametric Bayesian Inference in Poisson Processes with Gaussian Process Intensities Conference Proceedings of the 26th International Conference on Machine Learning (ICML), Montréal, Canada, 2009. @conference{adams2009poisson, title = {Tractable Nonparametric Bayesian Inference in Poisson Processes with Gaussian Process Intensities}, author = {Ryan P. Adams and Iain Murray and David J.C. MacKay}, url = {http://www.cs.princeton.edu/~rpa/pubs/adams2009gpds.pdf}, year = {2009}, date = {2009-01-01}, booktitle = {Proceedings of the 26th International Conference on Machine Learning (ICML)}, address = {Montréal, Canada}, abstract = {The inhomogeneous Poisson process is a point process that has varying intensity across its domain (usually time or space). For nonparametric Bayesian modeling, the Gaussian process is a useful way to place a prior distribution on this intensity. The combination of a Poisson process and GP is known as a Gaussian Cox process, or doubly-stochastic Poisson process. Likelihood-based inference in these models requires an intractable integral over an infinite-dimensional random function. In this paper we present the first approach to Gaussian Cox processes in which it is possible to perform inference without introducing approximations or finite-dimensional proxy distributions. We call our method the Sigmoidal Gaussian Cox Process, which uses a generative model for Poisson data to enable tractable inference via Markov chain Monte Carlo. We compare our methods to competing methods on synthetic data and apply it to several real-world data sets.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } The inhomogeneous Poisson process is a point process that has varying intensity across its domain (usually time or space). For nonparametric Bayesian modeling, the Gaussian process is a useful way to place a prior distribution on this intensity. The combination of a Poisson process and GP is known as a Gaussian Cox process, or doubly-stochastic Poisson process. Likelihood-based inference in these models requires an intractable integral over an infinite-dimensional random function. In this paper we present the first approach to Gaussian Cox processes in which it is possible to perform inference without introducing approximations or finite-dimensional proxy distributions. We call our method the Sigmoidal Gaussian Cox Process, which uses a generative model for Poisson data to enable tractable inference via Markov chain Monte Carlo. We compare our methods to competing methods on synthetic data and apply it to several real-world data sets. |

Adams, Ryan P; Murray, Iain; MacKay, David J C Nonparametric Bayesian Density Modeling with Gaussian Processes Unpublished 2009, (arXiv:0912.4896 [stat.CO]). @unpublished{adams2009nonparametric, title = {Nonparametric Bayesian Density Modeling with Gaussian Processes}, author = {Ryan P. Adams and Iain Murray and David J.C. MacKay}, url = {http://www.cs.princeton.edu/~rpa/pubs/adams2009nonparametric.pdf}, year = {2009}, date = {2009-01-01}, abstract = {We present the Gaussian process density sampler (GPDS), an exchangeable generative model for use in nonparametric Bayesian density estimation. Samples drawn from the GPDS are consistent with exact, independent samples from a distribution defined by a density that is a transformation of a function drawn from a Gaussian process prior. Our formulation allows us to infer an unknown density from data using Markov chain Monte Carlo, which gives samples from the posterior distribution over density functions and from the predictive distribution on data space. We describe two such MCMC methods. Both methods also allow inference of the hyperparameters of the Gaussian process.}, note = {arXiv:0912.4896 [stat.CO]}, keywords = {}, pubstate = {published}, tppubtype = {unpublished} } We present the Gaussian process density sampler (GPDS), an exchangeable generative model for use in nonparametric Bayesian density estimation. Samples drawn from the GPDS are consistent with exact, independent samples from a distribution defined by a density that is a transformation of a function drawn from a Gaussian process prior. Our formulation allows us to infer an unknown density from data using Markov chain Monte Carlo, which gives samples from the posterior distribution over density functions and from the predictive distribution on data space. We describe two such MCMC methods. Both methods also allow inference of the hyperparameters of the Gaussian process. |

Adams, Ryan P Kernel Methods for Nonparametric Bayesian Inference of Probability Densities and Point Processes PhD Thesis University of Cambridge, 2009. @phdthesis{adams2009thesis, title = {Kernel Methods for Nonparametric Bayesian Inference of Probability Densities and Point Processes}, author = {Ryan P. Adams}, url = {http://www.cs.princeton.edu/~rpa/pubs/adams2009thesis.pdf}, year = {2009}, date = {2009-01-01}, address = {Cambridge, UK}, school = {University of Cambridge}, abstract = {Nonparametric kernel methods for estimation of probability densities and point process intensities have long been of interest to researchers in statistics and machine learning. Frequentist kernel methods are widely used, but provide only a point estimate of the unknown density. Additionally, in frequentist kernel density methods, it can be difficult to select appropriate kernel parameters. The Bayesian approach to inference potentially resolves both of these deficiencies, by providing a distribution over the unknowns and enabling a principled approach to kernel selection. Constructing a Bayesian nonparametric kernel density method has proven to be difficult, however, due to the need to integrate over an infinite-dimensional random function in order to evaluate the likelihood. To avoid this intractability, all Bayesian kernel density methods to date have either used a crippled model or a finite-dimensional approximation. Recent advances in Markov chain Monte Carlo methods have improved the situation for these doubly-intractable posterior distributions, however. If data can be generated exactly from the model, then it is possible to perform inference without computing the intractable likelihood. I propose two new kernel-based models that enable an exact generative procedure: the Gaussian process density sampler (GPDS) for probability density functions, and the sigmoidal Gaussian Cox process (SGCP) for the Poisson process. With generative priors, I show how it is now possible to construct two dif- ferent kinds of Markov chains for inference in these models. These Markov chains have the desired posterior distribution as their equilibrium distributions, and, despite a parameter space with uncountably many dimensions, require only a finite amount of computation to simulate. The GPDS and SGCP, and the associated inference procedures, are the first kernel-based nonparametric Bayesian methods that allow inference without a finite-dimensional approximation. I also present several additional kernel-based models for data that extend the Gaussian process density sampler and sigmoidal Gaussian Cox process to other situations. The Archipelago model extends the GPDS to address the task of semi-supervised learning, where a flexible density estimate can improve the performance of a classifier when unlabeled data are available. I also generalise the SGCP to enable a nonparametric inhomogeneous Neyman–Scott process, and present a soft-core generalisation of the Mate Ìrn repulsive process that similarly allows non-approximate inference via Markov chain Monte Carlo.}, keywords = {}, pubstate = {published}, tppubtype = {phdthesis} } Nonparametric kernel methods for estimation of probability densities and point process intensities have long been of interest to researchers in statistics and machine learning. Frequentist kernel methods are widely used, but provide only a point estimate of the unknown density. Additionally, in frequentist kernel density methods, it can be difficult to select appropriate kernel parameters. The Bayesian approach to inference potentially resolves both of these deficiencies, by providing a distribution over the unknowns and enabling a principled approach to kernel selection. Constructing a Bayesian nonparametric kernel density method has proven to be difficult, however, due to the need to integrate over an infinite-dimensional random function in order to evaluate the likelihood. To avoid this intractability, all Bayesian kernel density methods to date have either used a crippled model or a finite-dimensional approximation. Recent advances in Markov chain Monte Carlo methods have improved the situation for these doubly-intractable posterior distributions, however. If data can be generated exactly from the model, then it is possible to perform inference without computing the intractable likelihood. I propose two new kernel-based models that enable an exact generative procedure: the Gaussian process density sampler (GPDS) for probability density functions, and the sigmoidal Gaussian Cox process (SGCP) for the Poisson process. With generative priors, I show how it is now possible to construct two dif- ferent kinds of Markov chains for inference in these models. These Markov chains have the desired posterior distribution as their equilibrium distributions, and, despite a parameter space with uncountably many dimensions, require only a finite amount of computation to simulate. The GPDS and SGCP, and the associated inference procedures, are the first kernel-based nonparametric Bayesian methods that allow inference without a finite-dimensional approximation. I also present several additional kernel-based models for data that extend the Gaussian process density sampler and sigmoidal Gaussian Cox process to other situations. The Archipelago model extends the GPDS to address the task of semi-supervised learning, where a flexible density estimate can improve the performance of a classifier when unlabeled data are available. I also generalise the SGCP to enable a nonparametric inhomogeneous Neyman–Scott process, and present a soft-core generalisation of the Mate Ìrn repulsive process that similarly allows non-approximate inference via Markov chain Monte Carlo. |

## 2008 |

Adams, Ryan P; Stegle, Oliver Gaussian Process Product Models for Nonparametric Nonstationarity Conference Proceedings of the 25th International Conference on Machine Learning (ICML), Helsinki, Finland, 2008. @conference{adams2008gppm, title = {Gaussian Process Product Models for Nonparametric Nonstationarity}, author = {Ryan P. Adams and Oliver Stegle}, url = {http://www.cs.princeton.edu/~rpa/pubs/adams2008gppm.pdf}, year = {2008}, date = {2008-01-01}, booktitle = {Proceedings of the 25th International Conference on Machine Learning (ICML)}, pages = {1-8}, address = {Helsinki, Finland}, abstract = {Stationarity is often an unrealistic prior assumption for Gaussian process regression. One solution is to predefine an explicit nonstationary covariance function, but such covariance functions can be difficult to specify and require detailed prior knowledge of the nonstationarity. We propose the Gaussian process product model (GPPM) which models data as the pointwise product of two latent Gaussian processes to nonparametrically infer nonstationary variations of amplitude. This approach differs from other nonparametric approaches to covariance function inference in that it operates on the outputs rather than the inputs, resulting in a significant reduction in computational cost and required data for inference. We present an approximate inference scheme using Expectation Propagation. This variational approximation yields convenient GP hyperparameter selection and compact approximate predictive distributions.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Stationarity is often an unrealistic prior assumption for Gaussian process regression. One solution is to predefine an explicit nonstationary covariance function, but such covariance functions can be difficult to specify and require detailed prior knowledge of the nonstationarity. We propose the Gaussian process product model (GPPM) which models data as the pointwise product of two latent Gaussian processes to nonparametrically infer nonstationary variations of amplitude. This approach differs from other nonparametric approaches to covariance function inference in that it operates on the outputs rather than the inputs, resulting in a significant reduction in computational cost and required data for inference. We present an approximate inference scheme using Expectation Propagation. This variational approximation yields convenient GP hyperparameter selection and compact approximate predictive distributions. |

## 2007 |

Adams, Ryan P; MacKay, David J C Bayesian Online Changepoint Detection Unpublished 2007, (arXiv:0710.3742v1 [stat.ML]). @unpublished{adams2007changepoint, title = {Bayesian Online Changepoint Detection}, author = {Ryan P. Adams and David J.C. MacKay}, url = {http://www.cs.princeton.edu/~rpa/pubs/adams2007changepoint.pdf}, year = {2007}, date = {2007-01-01}, abstract = {Changepoints are abrupt variations in the generative parameters of a data sequence. Online detection of changepoints is useful in modelling and prediction of time series in application areas such as finance, biometrics, and robotics. While frequentist methods have yielded online filtering and prediction techniques, most Bayesian papers have focused on the retrospective segmentation problem. Here we examine the case where the model parameters before and after the changepoint are independent and we derive an online algorithm for exact inference of the most recent changepoint. We compute the probability distribution of the length of the current ``run,'' or time since the last changepoint, using a simple message-passing algorithm. Our implementation is highly modular so that the algorithm may be applied to a variety of types of data. We illustrate this modularity by demonstrating the algorithm on three different real-world data sets.}, note = {arXiv:0710.3742v1 [stat.ML]}, keywords = {}, pubstate = {published}, tppubtype = {unpublished} } Changepoints are abrupt variations in the generative parameters of a data sequence. Online detection of changepoints is useful in modelling and prediction of time series in application areas such as finance, biometrics, and robotics. While frequentist methods have yielded online filtering and prediction techniques, most Bayesian papers have focused on the retrospective segmentation problem. Here we examine the case where the model parameters before and after the changepoint are independent and we derive an online algorithm for exact inference of the most recent changepoint. We compute the probability distribution of the length of the current ``run,'' or time since the last changepoint, using a simple message-passing algorithm. Our implementation is highly modular so that the algorithm may be applied to a variety of types of data. We illustrate this modularity by demonstrating the algorithm on three different real-world data sets. |