Optimizing Expensive Objective Functions
Shields, Benjamin J.; Stevens, Jason; Li, Jun; Parasram, Marvin; Damani, Farhan; Alvarado, Jesus I. Martinez; Janey, Jacob M.; Adams, Ryan P.; Doyle, Abigail G.
Bayesian reaction optimization as a tool for chemical synthesis Journal Article
In: Nature, vol. 590, pp. 89–96, 2021.
@article{shields2021bayesian,
title = {Bayesian reaction optimization as a tool for chemical synthesis},
author = {Benjamin J. Shields and Jason Stevens and Jun Li and Marvin Parasram and Farhan Damani and Jesus I. Martinez Alvarado and Jacob M. Janey and Ryan P. Adams and Abigail G. Doyle},
year = {2021},
date = {2021-04-01},
journal = {Nature},
volume = {590},
pages = {89--96},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Fedorov, Igor; Adams, Ryan P.; Mattina, Matthew; Whatmough, Paul N.
SpArSe: Sparse Architecture Search for CNNs on Resource-Constrained Microcontrollers Conference
Advances in Neural Information Processing Systems 32 (NeurIPS), 2019.
@conference{fedorov2019sparse,
title = {SpArSe: Sparse Architecture Search for CNNs on Resource-Constrained Microcontrollers},
author = {Igor Fedorov and Ryan P. Adams and Matthew Mattina and Paul N. Whatmough},
url = {https://www.cs.princeton.edu/~rpa/pubs/fedorov2019sparse.pdf},
year = {2019},
date = {2019-12-04},
booktitle = {Advances in Neural Information Processing Systems 32 (NeurIPS)},
abstract = {The vast majority of processors in the world are actually microcontroller units (MCUs), which find widespread use performing simple control tasks in applications ranging from automobiles to medical devices and office equipment. The Internet of Things (IoT) promises to inject machine learning into many of these every-day objects via tiny, cheap MCUs. However, these resource-impoverished hardware platforms severely limit the complexity of machine learning models that can be deployed. For example, although convolutional neural networks (CNNs) achieve state-of-the-art results on many visual recognition tasks, CNN inference on MCUs is challenging due to severe finite memory limitations. To circumvent the memory challenge associated with CNNs, various alternatives have been proposed that do fit within the memory budget of an MCU, albeit at the cost of prediction accuracy. This paper challenges the idea that CNNs are not suitable for deployment on MCUs. We demonstrate that it is possible to automatically design CNNs which generalize well, while also being small enough to fit onto memory-limited MCUs. Our Sparse Architecture Search method combines neural architecture search with pruning in a single, unified approach, which learns superior models on four popular IoT datasets. The CNNs we find are more accurate and up to 4.35× smaller than previous approaches, while meeting the strict MCU working memory constraint.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
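The pruning half of the SpArSe idea is easy to sketch. The snippet below is my illustration under assumed names (prune_to_budget, a 4-bytes-per-parameter cost model, a made-up 16 KB budget), not the paper's method, which couples architecture search and pruning inside a multi-objective optimizer:

import numpy as np

def prune_to_budget(weights, budget_bytes, bytes_per_param=4):
    # Zero the smallest-magnitude entries until the nonzero parameters
    # fit the working-memory budget (a stand-in for the MCU constraint).
    max_params = budget_bytes // bytes_per_param
    flat = np.abs(weights).ravel()
    if flat.size <= max_params:
        return weights  # already fits
    # Magnitude of the max_params-th largest weight: keep that and above.
    threshold = np.partition(flat, -max_params)[-max_params]
    return np.where(np.abs(weights) >= threshold, weights, 0.0)

rng = np.random.default_rng(0)
W = rng.normal(size=(128, 128))                     # one dense layer
W_sparse = prune_to_budget(W, budget_bytes=16_384)  # hypothetical 16 KB budget
print(np.count_nonzero(W_sparse))                   # about 4096 parameters kept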
Beatson, Alex; Adams, Ryan P.
Efficient Optimization of Loops and Limits with Randomized Telescoping Sums Conference
Proceedings of the 36th International Conference on Machine Learning (ICML), 2019.
@conference{beatson2019efficient,
title = {Efficient Optimization of Loops and Limits with Randomized Telescoping Sums},
author = {Alex Beatson and Ryan P. Adams},
url = {https://www.cs.princeton.edu/~rpa/pubs/beatson2019efficient.pdf},
year = {2019},
date = {2019-06-13},
booktitle = {Proceedings of the 36th International Conference on Machine Learning (ICML)},
abstract = {We consider optimization problems in which the objective requires an inner loop with many steps or is the limit of a sequence of increasingly costly approximations. Meta-learning, training recurrent neural networks, and optimization of the solutions to differential equations are all examples of optimization problems with this character. In such problems, it can be expensive to compute the objective function value and its gradient, but truncating the loop or using less accurate approximations can induce biases that damage the overall solution. We propose randomized telescope (RT) gradient estimators, which represent the objective as the sum of a telescoping series and sample linear combinations of terms to provide cheap unbiased gradient estimates. We identify conditions under which RT estimators achieve optimization convergence rates independent of the length of the loop or the required accuracy of the approximation. We also derive a method for tuning RT estimators online to maximize a lower bound on the expected decrease in loss per unit of computation. We evaluate our adaptive RT estimators on a range of applications including meta-optimization of learning rates, variational inference of ODE parameters, and training an LSTM to model long sequences.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
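The telescoping construction in this abstract can be shown on a toy problem. The sketch below (mine, not the paper's code) estimates e = sum_n 1/n! by drawing a random truncation N and reweighting each kept term by 1/P(N >= n), which is what makes the estimate unbiased; the paper applies the same construction to gradients of loops and limits rather than to a fixed series:

import math, random

def rt_estimate(p=0.7):
    # One unbiased sample of sum_n Delta_n with Delta_n = 1/n!.
    # N is geometric, so term n survives with probability p**n.
    total, n, survive = 0.0, 0, 1.0
    while True:
        total += (1.0 / math.factorial(n)) / survive  # reweight term n
        if random.random() > p:   # stop with probability 1 - p
            return total
        n += 1
        survive *= p              # running value of P(N >= n)

random.seed(0)
samples = [rt_estimate() for _ in range(100_000)]
print(sum(samples) / len(samples))  # ~2.71828; unbiased for e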
Shahriari, Bobak; Swersky, Kevin; Wang, Ziyu; Adams, Ryan P.; de Freitas, Nando
Taking the Human Out of the Loop: A Review of Bayesian Optimization Journal Article
In: Proceedings of the IEEE, vol. 104, no. 1, pp. 148–175, 2016.
@article{shahriari2016loop,
title = {Taking the Human Out of the Loop: A Review of Bayesian Optimization},
author = {Bobak Shahriari and Kevin Swersky and Ziyu Wang and Ryan P. Adams and Nando de Freitas},
url = {http://www.cs.princeton.edu/~rpa/pubs/shahriari2016loop.pdf},
year = {2016},
date = {2016-01-01},
journal = {Proceedings of the IEEE},
volume = {104},
number = {1},
pages = {148--175},
abstract = {Big Data applications are typically associated with systems
involving large numbers of users, massive complex software
systems, and large-scale heterogeneous computing and storage
architectures. The construction of such systems involves many
distributed design choices. The end products (e.g.,
recommendation systems, medical analysis tools, real-time game
engines, speech recognizers) thus involve many tunable
configuration parameters. These parameters are often specified
and hard-coded into the software by various developers or
teams. If optimized jointly, these parameters can result in
significant improvements. Bayesian optimization is a powerful
tool for the joint optimization of design choices that is
gaining great popularity in recent years. It promises greater
automation so as to increase both product quality and human
productivity. This review paper introduces Bayesian
optimization, highlights some of its methodological aspects,
and showcases a wide range of applications.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
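For readers new to the area, the loop this review describes fits in a page. The sketch below is mine, with a fixed squared-exponential kernel and hand-set hyperparameters that a real system would infer: fit a GP surrogate to the evaluations so far, then pick each new point by maximizing expected improvement over a candidate grid.

import numpy as np
from scipy.stats import norm

def kernel(A, B, ls=0.2, var=1.0):
    d = A[:, None] - B[None, :]
    return var * np.exp(-0.5 * (d / ls) ** 2)

def gp_posterior(X, y, Xs, noise=1e-6):
    # Exact GP predictive mean and standard deviation at Xs.
    K = kernel(X, X) + noise * np.eye(len(X))
    Ks = kernel(X, Xs)
    sol = np.linalg.solve(K, Ks)
    mu = sol.T @ y
    var = np.diag(kernel(Xs, Xs)) - np.sum(Ks * sol, axis=0)
    return mu, np.sqrt(np.maximum(var, 1e-12))

def expected_improvement(mu, sd, best):
    z = (best - mu) / sd          # minimization convention
    return sd * (z * norm.cdf(z) + norm.pdf(z))

f = lambda x: np.sin(3 * x) + x ** 2 - 0.7 * x   # toy "expensive" objective
X = np.array([0.1, 0.9]); y = f(X)
grid = np.linspace(-1, 2, 500)
for _ in range(15):
    mu, sd = gp_posterior(X, y, grid)
    x_next = grid[np.argmax(expected_improvement(mu, sd, y.min()))]
    X, y = np.append(X, x_next), np.append(y, f(x_next))
print(X[np.argmin(y)], y.min())   # best input found and its value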
Hernández-Lobato, Daniel; Hernández-Lobato, José Miguel; Shah, Amar; Adams, Ryan P.
Predictive Entropy Search for Multi-Objective Bayesian Optimization Conference
Proceedings of the 33rd International Conference on Machine Learning (ICML), 2016, (arXiv:1511.05467 [stat.ML]).
@conference{lobato2016pesc,
title = {Predictive Entropy Search for Multi-Objective Bayesian Optimization},
author = {Daniel Hernández-Lobato and José Miguel Hernández-Lobato and Amar Shah and Ryan P. Adams},
url = {http://www.cs.princeton.edu/~rpa/pubs/lobato2016pesc.pdf},
year = {2016},
date = {2016-01-01},
booktitle = {Proceedings of the 33rd International Conference on Machine Learning (ICML)},
abstract = {We present PESMO, a Bayesian method for identifying the Pareto
set of multi-objective optimization problems, when the
functions are expensive to evaluate. The central idea of PESMO
is to choose evaluation points so as to maximally reduce the
entropy of the posterior distribution over the Pareto
set. Critically, the PESMO multi-objective acquisition
function can be decomposed as a sum of objective-specific
acquisition functions, which enables the algorithm to be used
in decoupled scenarios in which the objectives can be
evaluated separately and perhaps with different costs. This
decoupling capability also makes it possible to identify
difficult objectives that require more evaluations. PESMO also
offers gains in efficiency, as its cost scales linearly with
the number of objectives, in comparison to the exponential
cost of other methods. We compare PESMO with other related
methods for multi-objective Bayesian optimization on synthetic
and real-world problems. The results show that PESMO produces
better recommendations with a smaller number of evaluations of
the objectives, and that a decoupled evaluation can lead to
improvements in performance, particularly when the number of
objectives is large.},
note = {arXiv:1511.05467 [stat.ML]},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
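The goal stated in this abstract is identifying the Pareto set of several expensive objectives. As a small supporting sketch (mine, not PESMO itself, which chooses evaluation points by entropy reduction), here is the non-dominated filter used to read a Pareto front out of the evaluations collected so far, assuming every objective is minimized:

import numpy as np

def pareto_mask(Y):
    # Y: (n, k) array of objective values. Returns a boolean mask of the
    # non-dominated rows: no other row is <= everywhere and < somewhere.
    n = Y.shape[0]
    mask = np.ones(n, dtype=bool)
    for i in range(n):
        others = Y[mask & (np.arange(n) != i)]
        dominated = np.any(np.all(others <= Y[i], axis=1) &
                           np.any(others < Y[i], axis=1))
        mask[i] &= not dominated
    return mask

Y = np.array([[1.0, 4.0], [2.0, 2.0], [3.0, 3.0], [4.0, 1.0]])
print(pareto_mask(Y))  # [ True  True False  True ]; (3, 3) is dominated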
Snoek, Jasper; Rippel, Oren; Swersky, Kevin; Kiros, Ryan; Satish, Nadathur; Sundaram, Narayanan; Patwary, Md. Mostofa Ali; Prabhat; Adams, Ryan P.
Scalable Bayesian Optimization Using Deep Neural Networks Conference
Proceedings of the 32nd International Conference on Machine Learning (ICML), 2015, (arXiv:1502.05700 [stat.ML]).
@conference{snoek2015scalable,
title = {Scalable Bayesian Optimization Using Deep Neural Networks},
author = {Jasper Snoek and Oren Rippel and Kevin Swersky and Ryan Kiros and Nadathur Satish and Narayanan Sundaram and Md. Mostofa Ali Patwary and Prabhat and Ryan P. Adams},
url = {http://www.cs.princeton.edu/~rpa/pubs/snoek2015scalable.pdf},
year = {2015},
date = {2015-01-01},
booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML)},
abstract = {Bayesian optimization is an effective methodology for the global
optimization of functions with expensive evaluations. It
relies on querying a distribution over functions defined by a
relatively cheap surrogate model. An accurate model for this
distribution over functions is critical to the effectiveness
of the approach, and is typically fit using Gaussian processes
(GPs). However, since GPs scale cubically with the number of
observations, it has been challenging to handle objectives
whose optimization requires many evaluations, and as such,
massively parallelizing the optimization. In this work, we
explore the use of neural networks as an alternative to GPs to
model distributions over functions. We show that performing
adaptive basis function regression with a neural network as
the parametric form performs competitively with
state-of-the-art GP-based approaches, but scales linearly with
the number of data rather than cubically. This allows us to
achieve a previously intractable degree of parallelism, which
we apply to large scale hyperparameter optimization, rapidly
finding competitive models on benchmark object recognition
tasks using convolutional networks, and image caption
generation using neural language models.},
note = {arXiv:1502.05700 [stat.ML]},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
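The "adaptive basis function regression" surrogate in this abstract amounts to Bayesian linear regression on a network's last hidden layer, which is why inference scales linearly in the number of observations rather than cubically as for an exact GP. A minimal sketch of that idea (mine; a fixed random tanh layer stands in for a trained network, and all hyperparameters are hand-set):

import numpy as np

rng = np.random.default_rng(1)
W, b = rng.normal(size=(50, 1)), rng.normal(size=50)   # random basis

def phi(x):                      # x: (n, 1) -> (n, 50) basis expansion
    return np.tanh(x @ W.T + b)

def blr_posterior(X, y, Xs, alpha=1.0, noise=0.1):
    # Bayesian linear regression on phi: predictive mean and std at Xs.
    P, Ps = phi(X), phi(Xs)
    A = alpha * np.eye(P.shape[1]) + P.T @ P / noise**2   # (50, 50) precision
    m = np.linalg.solve(A, P.T @ y) / noise**2            # posterior mean weights
    mu = Ps @ m
    var = noise**2 + np.sum(Ps * np.linalg.solve(A, Ps.T).T, axis=1)
    return mu, np.sqrt(var)

X = rng.uniform(-2, 2, size=(200, 1))
y = np.sin(2 * X[:, 0]) + 0.1 * rng.normal(size=200)
mu, sd = blr_posterior(X, y, np.array([[0.5]]))
print(mu, sd)   # predictive mean/std at x = 0.5; cost is linear in n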
Hernández-Lobato, José Miguel; Gelbart, Michael A.; Hoffman, Matthew W.; Adams, Ryan P.; Ghahramani, Zoubin
Predictive Entropy Search for Bayesian Optimization with Unknown Constraints Conference
Proceedings of the 32nd International Conference on Machine Learning (ICML), 2015, (arXiv:1502.05312 [stat.ML]).
@conference{lobato2015predictive,
title = {Predictive Entropy Search for Bayesian Optimization with Unknown Constraints},
author = {José Miguel Hernández-Lobato and Michael A. Gelbart and Matthew W. Hoffman and Ryan P. Adams and Zoubin Ghahramani},
url = {http://www.cs.princeton.edu/~rpa/pubs/lobato2015predictive.pdf},
year = {2015},
date = {2015-01-01},
booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML)},
abstract = {Unknown constraints arise in many types of expensive black-box
optimization problems. Several methods have been proposed
recently for performing Bayesian optimization with
constraints, based on the expected improvement (EI)
heuristic. However, EI can lead to pathologies when used with
constraints. For example, in the case of decoupled
constraints—i.e., when one can independently evaluate the
objective or the constraints—EI can encounter a pathology that
prevents exploration. Additionally, computing EI requires a
current best solution, which may not exist if none of the data
collected so far satisfy the constraints. By contrast,
information based approaches do not suffer from these failure
modes. In this paper, we present a new information-based
method called Predictive Entropy Search with Constraints
(PESC). We analyze the performance of PESC and show that it
compares favorably to EI-based approaches on synthetic and
benchmark problems, as well as several real-world examples. We
demonstrate that PESC is an effective algorithm that provides
a promising direction towards a unified solution for
constrained Bayesian optimization.},
note = {arXiv:1502.05312 [stat.ML]},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
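The decoupling this abstract highlights can be stated as a selection rule: because the acquisition decomposes as a sum of per-function terms, each (candidate, function) pair can be scored independently and traded off against that function's evaluation cost. One plausible rule, sketched below with entirely made-up acquisition values (computing the real per-function information gain is the substance of PESC itself):

def next_query(candidates, functions, acq, cost):
    # acq(x, f): per-function acquisition value (placeholder here).
    # cost[f]: evaluation cost of function f. Returns the (x, f) pair
    # with the best acquisition value per unit cost.
    return max(((x, f) for x in candidates for f in functions),
               key=lambda xf: acq(*xf) / cost[xf[1]])

candidates = [0.1, 0.5, 0.9]
functions = ["objective", "constraint"]
cost = {"objective": 5.0, "constraint": 1.0}
acq = lambda x, f: (1 - abs(x - 0.5)) * (2.0 if f == "objective" else 1.5)
print(next_query(candidates, functions, acq, cost))  # -> (0.5, 'constraint')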
Snoek, Jasper; Swersky, Kevin; Zemel, Richard S.; Adams, Ryan P.
Input Warping for Bayesian Optimization of Non-Stationary Functions Conference
Proceedings of the 31st International Conference on Machine Learning (ICML), 2014, (arXiv:1402.0929 [stat.ML]).
@conference{snoek2014warping,
title = {Input Warping for Bayesian Optimization of Non-Stationary Functions},
author = {Jasper Snoek and Kevin Swersky and Richard S. Zemel and Ryan P. Adams},
url = {http://www.cs.princeton.edu/~rpa/pubs/snoek2014warping.pdf},
year = {2014},
date = {2014-01-01},
booktitle = {Proceedings of the 31st International Conference on Machine Learning (ICML)},
abstract = {Bayesian optimization has proven to be a highly effective
methodology for the global optimization of unknown, expensive
and multimodal functions. The ability to accurately model
distributions over functions is critical to the effectiveness
of Bayesian optimization. Although Gaussian processes provide
a flexible prior over functions which can be queried
efficiently, there are various classes of functions that
remain difficult to model. One of the most frequently
occurring of these is the class of non-stationary
functions. The optimization of the hyperparameters of machine
learning algorithms is a problem domain in which parameters
are often manually transformed a priori, for example by
optimizing in "log-space," to mitigate the effects of
spatially-varying length scale. We develop a methodology for
automatically learning a wide family of bijective
transformations or warpings of the input space using the Beta
cumulative distribution function. We further extend the
warping framework to multi-task Bayesian optimization so that
multiple tasks can be warped into a jointly stationary
space. On a set of challenging benchmark optimization tasks,
we observe that the inclusion of warping greatly improves on
the state-of-the-art, producing better results faster and more
reliably.},
note = {arXiv:1402.0929 [stat.ML]},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
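The warping itself is a one-liner: each input dimension is passed through a Beta CDF whose two shape parameters are learned alongside the GP hyperparameters. A minimal sketch (mine, with parameters fixed by hand where the paper infers them):

import numpy as np
from scipy.stats import beta

def warp(X, a, b):
    # Map inputs in [0, 1]^d through a per-dimension Beta CDF.
    # a = b = 1 is the identity; a < 1 < b expands resolution near 0,
    # mimicking the manual "optimize in log-space" transformation.
    return beta.cdf(X, a, b)

X = np.linspace(0.01, 1.0, 5)[:, None]   # one input dimension
print(warp(X, a=0.5, b=3.0).ravel())     # stretched near the origin

The GP kernel is then evaluated on warp(X) instead of X, so a stationary kernel in the warped space models a non-stationary function in the original space.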
Gelbart, Michael A.; Snoek, Jasper; Adams, Ryan P.
Bayesian Optimization with Unknown Constraints Conference
Proceedings of the 30th Conference on Uncertainty in Artificial Intelligence (UAI), 2014, (arXiv:1403.5607 [stat.ML]).
@conference{gelbart2014constraints,
title = {Bayesian Optimization with Unknown Constraints},
author = {Michael A. Gelbart and Jasper Snoek and Ryan P. Adams},
url = {http://www.cs.princeton.edu/~rpa/pubs/gelbart2014constraints.pdf},
year = {2014},
date = {2014-01-01},
booktitle = {Proceedings of the 30th Conference on Uncertainty in Artificial Intelligence (UAI)},
abstract = {Recent work on Bayesian optimization has shown its
effectiveness in global optimization of difficult black-box
objective functions. Many real-world optimization problems of
interest also have constraints which are unknown a priori. In
this paper, we study Bayesian optimization for constrained
problems in the general case that noise may be present in the
constraint functions, and the objective and constraints may be
evaluated independently. We provide motivating practical
examples, and present a general framework to solve such
problems. We demonstrate the effectiveness of our approach on
optimizing the performance of online latent Dirichlet
allocation subject to topic sparsity constraints, tuning a
neural network given test-time memory constraints, and
optimizing Hamiltonian Monte Carlo to achieve maximal
effectiveness in a fixed time, subject to passing standard
convergence diagnostics.},
note = {arXiv:1403.5607 [stat.ML]},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
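One standard acquisition in this setting, which this line of work builds on, weights expected improvement by the modeled probability that the constraint is satisfied. A minimal sketch (mine), taking the Gaussian predictive marginals of the objective and constraint GPs as given:

import numpy as np
from scipy.stats import norm

def constrained_ei(mu_f, sd_f, best, mu_c, sd_c):
    # EI on the objective times P(c(x) <= 0) under the constraint GP.
    # mu_f, sd_f: objective posterior at x; mu_c, sd_c: constraint
    # posterior at x; best: best feasible value so far (minimization).
    z = (best - mu_f) / sd_f
    ei = sd_f * (z * norm.cdf(z) + norm.pdf(z))
    p_feasible = norm.cdf((0.0 - mu_c) / sd_c)
    return ei * p_feasible

print(constrained_ei(mu_f=0.2, sd_f=0.3, best=0.5, mu_c=-0.4, sd_c=0.2))

As the abstract notes, this form needs a best feasible incumbent, which may not exist early on; handling that case is part of what the paper addresses.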
Swersky, Kevin; Snoek, Jasper; Adams, Ryan P.
Freeze-Thaw Bayesian Optimization Unpublished
2014, (arXiv:1406.3896 [stat.ML]).
@unpublished{swersky2014freeze,
title = {Freeze-Thaw Bayesian Optimization},
author = {Kevin Swersky and Jasper Snoek and Ryan P. Adams},
url = {http://www.cs.princeton.edu/~rpa/pubs/swersky2014freeze.pdf},
year = {2014},
date = {2014-01-01},
abstract = {In this paper we develop a dynamic form of Bayesian
optimization for machine learning models with the goal of
rapidly finding good hyperparameter settings. Our method uses
the partial information gained during the training of a
machine learning model in order to decide whether to pause
training and start a new model, or resume the training of a
previously-considered model. We specifically tailor our method
to machine learning problems by developing a novel
positive-definite covariance kernel to capture a variety of
training curves. Furthermore, we develop a Gaussian process
prior that scales gracefully with additional temporal
observations. Finally, we provide an information-theoretic
framework to automate the decision process. Experiments on
several common machine learning models show that our approach
is extremely effective in practice.},
note = {arXiv:1406.3896 [stat.ML]},
keywords = {},
pubstate = {published},
tppubtype = {unpublished}
}
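The training-curve covariance referenced in the abstract can be sketched: integrating exponential decays exp(-lambda * t) against a Gamma(alpha, beta) prior over decay rates lambda gives the closed form k(t, t') = beta^alpha / (t + t' + beta)^alpha, an infinite mixture of decays suited to losses that fall toward an asymptote. A minimal check of that form (my sketch, with hand-picked alpha and beta):

import numpy as np

def freeze_thaw_kernel(t, s, alpha=1.0, beta=1.0):
    # Covariance between training-curve observations at epochs t and s;
    # depends only on t + s, matching monotone decay toward an asymptote.
    return beta**alpha / (np.add.outer(t, s) + beta) ** alpha

epochs = np.arange(1, 6, dtype=float)
K = freeze_thaw_kernel(epochs, epochs)
print(np.all(np.linalg.eigvalsh(K) > -1e-10))  # positive semidefinite: True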
Swersky, Kevin; Snoek, Jasper; Adams, Ryan P.
Multi-Task Bayesian Optimization Conference
Advances in Neural Information Processing Systems (NIPS) 26, 2013.
@conference{swersky2013multi,
title = {Multi-Task Bayesian Optimization},
author = {Kevin Swersky and Jasper Snoek and Ryan P. Adams},
url = {http://www.cs.princeton.edu/~rpa/pubs/swersky2013multi.pdf},
year = {2013},
date = {2013-01-01},
booktitle = {Advances in Neural Information Processing Systems (NIPS) 26},
abstract = {Bayesian optimization has recently been proposed as a framework
for automatically tuning the hyperparameters of machine
learning models and has been shown to yield state-of-the-art
performance with impressive ease and efficiency. In this
paper, we explore whether it is possible to transfer the
knowledge gained from previous optimizations to new tasks in
order to find optimal hyperparameter settings more
efficiently. Our approach is based on extending multi-task
Gaussian processes to the framework of Bayesian
optimization. We show that this method significantly speeds up
the optimization process when compared to the standard
single-task approach. We further propose a straightforward
extension of our algorithm in order to jointly minimize the
average error across multiple tasks and demonstrate how this
can be used to greatly speed up k-fold
cross-validation. Lastly, we propose an adaptation of a
recently developed acquisition function, entropy search, to
the cost-sensitive, multi-task setting. We demonstrate the
utility of this new acquisition function by leveraging a small
dataset to explore hyperparameter settings for a large
dataset. Our algorithm dynamically chooses which dataset to
query in order to yield the most information per unit cost.},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
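A hedged sketch of the surrogate underlying multi-task Bayesian optimization: an intrinsic coregionalization kernel, in which the covariance between observations factors into an input kernel times a learned task-by-task covariance matrix B. All hyperparameters below are fixed by hand for brevity; a real system would learn B along with the rest:

import numpy as np

def input_kernel(X1, X2, ls=0.3):
    d = X1[:, None] - X2[None, :]
    return np.exp(-0.5 * (d / ls) ** 2)

def multitask_kernel(X1, t1, X2, t2, B):
    # K[(x, t), (x', t')] = B[t, t'] * k(x, x').
    # t1, t2: integer task index per observation; B: (n_tasks, n_tasks).
    return B[np.ix_(t1, t2)] * input_kernel(X1, X2)

B = np.array([[1.0, 0.8],
              [0.8, 1.0]])         # tasks modeled as strongly correlated
X = np.array([0.2, 0.5, 0.5]); tasks = np.array([0, 0, 1])
print(multitask_kernel(X, tasks, X, tasks, B).round(3))

Observations on a cheap task shrink the posterior on an expensive task wherever B says the two covary, which is what lets a small dataset guide search on a large one.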
Snoek, Jasper; Larochelle, Hugo; Adams, Ryan P.
Practical Bayesian Optimization of Machine Learning Algorithms Conference
Advances in Neural Information Processing Systems (NIPS) 25, 2012, (arXiv:1206.2944 [stat.ML]).
@conference{snoek2012practical,
title = {Practical Bayesian Optimization of Machine Learning Algorithms},
author = {Jasper Snoek and Hugo Larochelle and Ryan P. Adams},
url = {http://www.cs.princeton.edu/~rpa/pubs/snoek2012practical.pdf},
year = {2012},
date = {2012-01-01},
booktitle = {Advances in Neural Information Processing Systems (NIPS) 25},
abstract = {The use of machine learning algorithms frequently involves
careful tuning of learning parameters and model
hyperparameters. Unfortunately, this tuning is often a ``black
art'' requiring expert experience, rules of thumb, or
sometimes brute-force search. There is therefore great appeal
for automatic approaches that can optimize the performance of
any given learning algorithm to the problem at hand. In this
work, we consider this problem through the framework of
Bayesian optimization, in which a learning algorithm's
generalization performance is modeled as a sample from a
Gaussian process (GP). We show that certain choices for the
nature of the GP, such as the type of kernel and the treatment
of its hyperparameters, can play a crucial role in obtaining a
good optimizer that can achieve expert-level performance. We
describe new algorithms that take into account the variable
cost (duration) of learning algorithm experiments and that can
leverage the presence of multiple cores for parallel
experimentation. We show that these proposed algorithms
improve on previous automatic procedures and can reach or
surpass human expert-level optimization for many algorithms
including latent Dirichlet allocation, structured SVMs and
convolutional neural networks.},
note = {arXiv:1206.2944 [stat.ML]},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
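Two of the paper's practical recommendations are easy to sketch: prefer a Matern 5/2 kernel over the squared exponential for hyperparameter surfaces, and account for evaluation cost by dividing expected improvement by a model of the run's duration ("EI per second"). A minimal illustration of both (mine, with placeholder inputs):

import numpy as np
from scipy.stats import norm

def matern52(X1, X2, ls=0.5, var=1.0):
    # Matern 5/2 kernel: rougher sample paths than the squared exponential.
    r = np.abs(X1[:, None] - X2[None, :]) / ls
    return var * (1 + np.sqrt(5) * r + 5 * r**2 / 3) * np.exp(-np.sqrt(5) * r)

def ei_per_second(mu, sd, best, predicted_seconds):
    # Expected improvement divided by predicted runtime, so cheap
    # configurations win when their raw EI is comparable.
    z = (best - mu) / sd
    ei = sd * (z * norm.cdf(z) + norm.pdf(z))
    return ei / predicted_seconds

print(matern52(np.array([0.0, 1.0]), np.array([0.0, 1.0])).round(3))
print(ei_per_second(mu=0.3, sd=0.2, best=0.4, predicted_seconds=12.0))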