\documentclass[11pt]{article}
\usepackage[english]{babel}
\setlength{\textheight}{257mm}
\setlength{\textwidth}{160mm}
\setlength{\oddsidemargin}{0mm}
\setlength{\evensidemargin}{0mm}
\setlength{\footskip}{12mm}
\setlength{\topmargin}{-10mm}
\setlength{\headsep}{0mm}
\setlength{\parindent}{10mm}
\setlength{\parskip}{7pt}
\begin{document}
\bibliographystyle{plain}
\pagestyle{empty}
%=================================================================================================================
\begin{center}
{\Large \bf The General Method of Moments (GMM) using MATLAB:\\The practical guide based on the CKLS interest rate model}\\[4mm]
{\em Kamil Klad\'{i}vko\footnote{Supported by \textit{IG}410067 and EEA/Norwegian FMP -- Student mobility}}\\[2mm]
Department of Statistics and Probability Calculus,\\ University of Economics, Prague\\
{\tt \small kladivk@vse.cz}\\
\end{center}
\begin{abstract}
\bf \noindent The General Method of Moments (GMM) is an estimation technique which can be used for variety of financial models. We use the CKLS class of interest rate models to demonstrate how GMM works. We discuss the practical implementation in MATLAB. We pay attention to exactly-identified versus over-identified estimation, minimization of objective function and hypothesis testing of the model. We then illustrate problem areas in implementation, specifically the importance of the variance-covariance matrix estimator of moment functions. Our technical note is supplemented with the MATLAB code of discussed topics.
\end{abstract}
\section{Short-Term Interest Rate Stochastic Differential Equation}
The dynamics of a short-term interest rate can be nested within the following stochastic differential equation (SDE):
\begin{equation}
\label{sde} dr_{t} = (\alpha + \beta r)dt + \sigma r^\gamma dZ_{t},
\end{equation}
where $\alpha, \beta, \sigma, \gamma$ are model parameters, $r_{t}$ is the short-term interest rate and
$Z_{t}$ is the standard Brownian motion. CKLS~\cite{ckls} defines the parameter vector $\theta = (\alpha, \beta, \sigma^2, \gamma)$.
The SDE (\ref{sde}) defines a broad class of interest rate processes which include many well-known one-factor interest rate models. These models can be obtained from (\ref{sde}) by placing appropriate restrictions on the parameters (see Table~\ref{tab1}).
CKLS estimated nine different specifications of the interest rate process. In this paper, we implement three of them. The implemented models are summarized in Table~\ref{tab1}. Adding further specifications is a routine task.
\begin{table}[!h]
\begin{center}
\begin{tabular}{lll}
Name&SDE&Restrictions\\\hline
CKLS & $dr_{t} = (\alpha + \beta r_{t})dt + \sigma r_{t}^\gamma dZ_{t}$ & (unrestricted) \\
CIR & $dr_{t} = (\alpha + \beta r_{t})dt + \sigma \sqrt{r_{t}} dZ_{t}$ & $\gamma = \frac{1}{2}$\\
Vasicek & $dr_{t} = (\alpha + \beta r_{t})dt + \sigma dZ_{t}$ & $\gamma = 0$\\
\end{tabular}
\parbox{0.9\textwidth}{\caption{\label{tab1}Implemented short-term interest rate processes. CIR and Vasicek are probably the most commonly used models among the models analyzed in the CKLS paper.}}
\end{center}
\end{table}
\section{GMM Estimation of the Short-Term Interest Rate SDE}
To estimate the parameters of the model, CKLS relied on the General Method of Moments (GMM). We present basic ideas of this estimation technique in this section. Details can be found in~\cite{hansen}.
\subsection{Moment Conditions}
To estimate parameters of a SDE using the GMM we have to discretize the SDE. CKLS applied
Euler discretization scheme on (\ref{sde}):
\begin{eqnarray}
\label{diskretizace} r_{t+1} - r_{t} &=& \alpha + \beta r_{t} + \varepsilon_{t+1},\\
\label{defepsilon} \varepsilon_{t+1} &\equiv& \sigma r_{t}^{\gamma}\mathrm{N}(0,1),
\end{eqnarray}
where $\mathrm{N}(0,1)$ is a random shock with zero mean and unit variance. It does not have to be necessarily drawn from the standard Gaussian distribution, although Euler discretization assumes that.
From (\ref{defepsilon}) we can immediately observe that
\begin{eqnarray}
\label{restriction1} E\left[\varepsilon_{t+1}\right] &=& 0,\\
\label{restriction2} D[\varepsilon_{t+1}] = E[\varepsilon_{t+1}^{2}] &=& \sigma^2r_{t}^{2\gamma}.
\end{eqnarray}
Employing the characteristics (\ref{restriction1}) and (\ref{restriction2}) and using the notion that the error term $\varepsilon_{t+1}$ is uncorrelated with the explanatory variable $r_{t}$ (orthogonality condition), CKLS derived a set of four moment functions:
\begin{equation}
\label{populationmoments}
f_{t}(\theta) =
\left[\begin{array}{c}
\varepsilon_{t+1}\\
\varepsilon_{t+1}^{2} - \sigma^{2}r_{t}^{2\gamma}
\end{array}\right]
\otimes
\left[\begin{array}{c}
1\\
r_{t}
\end{array}\right]
=
\left[\begin{array}{c}
\varepsilon_{t+1}\\
\varepsilon_{t+1}r_{t}\\
\varepsilon_{t+1}^{2} - \sigma^{2}r_{t}^{2\gamma}\\
(\varepsilon_{t+1}^{2} - \sigma^{2}r_{t}^{2\gamma})r_{t}
\end{array}\right].
\end{equation}
The moment functions are constructed so that
\begin{equation}
\label{momentcond}
E[f_{t}(\theta)] = 0.
\end{equation}
Equations (\ref{momentcond}) are called moment conditions of moments $E[f_{t}(\theta)]$ in the GMM terminology. The basic idea of GMM is to pick model parameters that ``fit'' these moment conditions in our data set.
We could easily come up with additional moment conditions. For example, we restrict that $\varepsilon_{t}$ is not autocorrelated, i.e. $E[\varepsilon_{t+1}\varepsilon_{t}] = 0$.
\subsection{Sample Moments}
The corresponding sample moments are defined as
\begin{equation}
\label{samplemoments}
g_{T}(\theta) = \frac{1}{T}\sum_{t=1}^{T} f_{t}(\theta),
\end{equation}
where $T$ is a number of available observations (sample size). Equations (\ref{samplemoments}) are natural sample counterparts of population moments $E[f_{t}(\theta)]$. (The traditional notation used in statistics would denote these estimates $\hat{g}(\theta)$. However using $T$ subscript is a widespread notation due to~\cite{hansen}.)
We find it convenient to reparametrise (\ref{diskretizace}) to keep the implementation code neat. Further, to get the numerical results in accordance with the CKLS paper, we have to employ time step $\Delta t$. Equations (\ref{diskretizace}) and (\ref{defepsilon}) turn into
\begin{eqnarray}
r_{t+1} &=& a + b r_{t} + \varepsilon_{t+1},\\
\varepsilon_{t+1} &\equiv& \sigma r_{t}^{\gamma}\sqrt{\Delta t}\mathrm{N}(0,1),
\end{eqnarray}
where $a = \alpha \Delta t$ and $b = 1 + \beta \Delta t$. Then the sample moments look as follows:
\begin{equation}
\label{ckls_sample_moments}
g_{T}(\theta) = \frac{1}{T}
\left[\begin{array}{l}
\sum_{t=1}^{T}(r_{t+1} - a - br_{t})\\
\sum_{t=1}^{T} ((r_{t+1} - a - br_{t})^{2} - \sigma^{2}r_{t}^{2\gamma}\Delta t)\\
\sum_{t=1}^{T}(r_{t+1} - a - br_{t})r_{t}\\
\sum_{t=1}^{T} ((r_{t+1} - a - br_{t})^{2} - \sigma^{2}r_{t}^{2\gamma}\Delta t)r_{t}\\
\end{array}\right].
\end{equation}
\subsection{Objective Function}
The GMM objective function is defined as
\begin{equation}
\label{objectivefunction}
J_{T} = g_{T}'(\theta)W_{T}g_{T}(\theta),
\end{equation}
where $W_{T}$ is positive-definite weighting matrix. We find our parameter estimates by solving
\begin{equation}
\label{objective}
\hat{\theta} = \arg\limits_{\theta }\min J_{T}.
\end{equation}
We can see that the GMM is a minimum distance estimator. In other words, we are setting the weighted sum of squared sample moments as close to zero as possible. As we know, their population counterparts are constructed to be equal to zero. The weighting matrix tells us how much attention to pay to each moment.
For the unrestricted (CKLS) model we have four moment conditions and four parameters. We say that the CKLS model is exactly identified and we can pick an arbitrary positive-definite $W_{T}$. We can set all sample moments equal exactly zero.
\subsection{Optimal Weighting Matrix $W_{T}$}
The CIR and Vasicek models have three parameters, these models are overidentified. It is not possible (with probability one) to set all sample moment conditions equal to zero. To obtain asymptotically efficient estimates we set
\begin{equation}
W_{T} = \hat{S}^{-1},
\end{equation}
where $\hat{S}$ is an estimate of the spectral density matrix of population moment functions. This choice of the weighting matrix secures the smallest asymptotic covariance matrix of the estimate of $\theta$. This is important result is due to Hansen~\cite{hansen}. We call this the efficient GMM.
The spectral density or long run covariance matrix is defined
\begin{equation}
S = \sum_{j=-\infty}^{\infty} E[f_{t}(\theta)f_{t-j}'(\theta)].
\end{equation}
The spectral density matrix allows for serial correlation and heteroskedasticity in the observations of the moments function. A popular consistent estimate of the spectral density matrix is the Newey-West estimator~\cite{newey}:
\begin{equation}
\label{neweywest}
\widehat{S} = \widehat{S}_{0} + \sum_{j=1}^{k}\left(1 - \frac{j}{k+1}\right)(\widehat{S}_{j} + \widehat{S}_{j}'),
\end{equation}
where
$$
\widehat{S}_{j} = \frac{1}{T}\sum_{t = j+1}^{T}f_{t}(\theta)f'_{t-j}(\theta).
$$
In case of overidentified models, GMM is a two stage estimator. We usually proceed in the following way:
\begin{enumerate}
\item We minimize (\ref{objectivefunction}) using identity weighting matrix ($W_{T} = I$). This means that we consider all moments equally important. We plug estimated parameter vector $\hat{\theta}$ into (\ref{neweywest}) and inverse to get $W_{T}$.
\item Again, we minimize (\ref{objectivefunction}), but this time using the $W_{T}$ from the first step.
\end{enumerate}
\subsection{Testing the Significance of Individual Parameters}
Under the efficient GMM (and under the exactly identified GMM), we have asymptotic normality for the estimated parameters:
\begin{equation}
\sqrt{T}(\hat{\theta} - \theta) \stackrel{d}\longrightarrow \mathrm{N}(0, (d'S^{-1}d)^{-1}),
\end{equation}
where $d$ is the Jacobian of the population moments vector $E[f_{t}(\theta)]$.
For constructing the test statistics, we substitute in the corresponding sample moments and evaluate them at the estimated parameters. For $S$, we substitute in our estimate (\ref{neweywest}) of the spectral density matrix, i.e. our weighting matrix $W_{T}$.\footnote{In case of the the exactly identified model (CKLS), we still calculate the spectral density matrix and
do the inference.}
For example, the estimate of the Jacobian matrix of the CKLS model based on the sample moments (\ref{ckls_sample_moments}) looks as follows:
\begin{equation}
%d(\hat{\theta}) = \frac{\partial g_{T}(\hat{\theta})}{\partial \hat{\theta}} = \\
\hat{d}(\hat{\theta}) =
-\frac{1}{T}\left[\begin{array}{cccc}
T & \sum r_{t} & 0 & 0\\
% druhy radek
2\sum(r_{t+1}-a-br_{t}) &2\sum(r_{t+1}-a-br_{t})r_{t} & \Delta t\sum r_{t}^{2\gamma}&
2\sigma^{2}\Delta t\sum \ln(r_{t})r_{t}^{2\gamma}\\
% treti radek
\sum r_{t} & \sum r_{t}^{2} & 0 & 0 \\
% ctvrty radek
2\sum(r_{t+1}-a-br_{t})r_{t} &2\sum(r_{t+1}-a-br_{t})r_{t}^{2}& \Delta t\sum r_{t}^{2\gamma+1} &
2\sigma^2\Delta t\sum \ln(r_{t}) r_{t}^{2\gamma+1}
\end{array}\right].
\end{equation}
\subsection{Overall Test of Model Specification (Model Adequacy)}
If the model is overidentified, GMM offers an overall test of the model by testing whether the
``extra'' sample moments are sufficiently close to zero relative to their distribution.
The test statistic is asymptotically $\chi^{2}$ distributed:
\begin{equation}
\label{chi2}
T\times J_{T}(\hat{\theta}) =T \times g_{T}'(\hat{\theta})W_{T}(\hat{\theta})g_{T}(\hat{\theta}) \stackrel{d}\longrightarrow \chi^{2}_{m-p},
\end{equation}
where $m$ is a number of moment conditions and $p$ is a number of parameters.
If the test statistic rejects, then the underlying model that generated the system of moment conditions is declared invalid.
\section{MATLAB Implementation}
The estimation routine is executed by running the \texttt{RunEstimation.m} (uses \texttt{GMMestimation.m}, \texttt{GMMobjective.m}, \texttt{GMMweightsNW.m} and \texttt{MomentsJacobia.m}). See code comments for details. The results are displayed in the MATLAB's command window. We rely on \texttt{fminsearch} routine when minimizing the objective function.
The Treasury bill yield data used by CKLS are one-month yields from the Center for Research in Security Prices (CRSP). The data are monthly and cover period from June 1964 to December 1989. The data set is saved in \texttt{ckls.dat}.
\section{Importance of the Spectral Density Matrix Estimation}
One of the main results of CKLS is concerned with the value of $\gamma$. They showed that the value of $\gamma$ is the most important feature differentiating interest rate models. Models which assume $\gamma < 1$, e.g. CIR and Vasicek, are rejected by the overall $\chi^{2}$ test of model adequacy. And those which assume $\gamma \geq 1$ are not rejected.
\subsection{Spectral Density Matrix}
By running the estimation routine with different number of lags (differnet $k$) used in the spectral density matrix estimation (\ref{neweywest}), we can deduce that CKLS used an ordinary sample covariance matrix in their inference ($k = 0$).
By relying on the ordinary sample covariance matrix, they implicitly assume that the moment functions can be considered as independent identically distributed variables.
\subsection{Spectral Density Matrix for the CIR Model}
Table~\ref{autocorrelation} reports sample autocorrelations of the four moment functions for the CIR model. The autocorrelations of the second and the fourth moment functions are obviously significantly different from zero.
\begin{table}[!ht]
\begin{center}
\begin{tabular}{c|cccc}
lag & \multicolumn{4}{c}\hfill autocorrelation of $g_{T}(\hat{\theta})$\\\hline
1 & 0.04 & 0.18 & -0.01 & 0.13 \\
2 & -0.06 & 0.15 & -0.09 & 0.11 \\
3 & -0.10 & 0.16 & -0.09 & 0.09 \\
4 & -0.06 & 0.12 & -0.08 & 0.07 \\
5 & -0.01 & 0.05 & -0.03 & 0.02 \\
\end{tabular}
\parbox{0.9\textwidth}{\caption{\label{autocorrelation}This table reports sample autocorrelations (first five lags) of the four moment functions for the CIR model.}}
\end{center}
\end{table}
Thus we should use e.g. the Newey-West estimator (\ref{neweywest}) of the spectral density matrix $S$. If we do so, and set the number of lags of the Newey-West estimator greater or equal to 12, we no longer can reject the CIR model on the 5\% significance level. The results are reported in Table~\ref{cirresults}.
\begin{table}[!ht]
\begin{center}
\begin{tabular}{c|ccccc}
\#lags & $\alpha$ & $\beta$ & $\sigma^{2}$ & $\chi^{2}$ & p-value \\\hline
0 & 0.0190 & -0.2281 & 0.0073 & 5.82 & 0.0158\\
12& 0.0328 & -0.5145 & 0.0057 & 3.73 & 0.0533\\
\end{tabular}
\parbox{0.9\textwidth}{\caption{\label{cirresults}This table reports GMM estimation results for the CIR model when using different number of lags in the Newey-West estimator of the spectral density matrix. First row reports results with zero lags in the Newey-West estimator, i.e. results obtained with an ordinary sample covariance matrix.}}
\end{center}
\end{table}
Also note that the parameter estimates differ considerably, which questions the robustness of the GMM for the CIR model estimation. Qualitatively similar results can be obtained for the Vasicek model.
\subsection{GMM versus Maximum Likelihood for the CIR Model}
We will compare the GMM to the Maximum Likelihood (ML). The transition density function has a closed form solution for the CIR model and thus the ML estimator is available.
For easily noticeable comparison, we find it convenient to rewrite the CIR model into the following form:
\begin{equation}
\label{cir_proces}dr_{t} = \lambda(\mu - r_{t})dt + \sqrt{r_{t}}\sigma dW_{t},
\end{equation}
where interest rate $r_{t}$ moves in the direction of its mean $\mu$ at speed $\lambda$. We get back to the former notation by setting $\alpha = \lambda\mu$ and $\beta=-\lambda$.
The estimation results of the ML versus the GMM are reported in Table~\ref{cirresultsml}. It is striking that the 12-lags Newey-West GMM estimate of the drift ($\lambda$, $\mu$) is much closer to the ML estimate compared to the GMM estimate without using the Newey-West. This result emphasizes the importance of the optimal weighting matrix estimation.
\begin{table}[!ht]
\begin{center}
\begin{tabular}{cc|ccccc}
Method & \#lags & $\lambda$ & $\mu$ & $\sigma$ \\\hline
GMM& 0 & 0.228 & 0.083 & 0.086 \\
GMM& 12 & 0.515 & 0.064 & 0.075 \\
ML & & 0.599 & 0.069 & 0.098
\end{tabular}
\parbox{0.9\textwidth}{\caption{\label{cirresultsml}This table reports the GMM versus the ML estimation results for the CIR model. The sample mean of interest rate time series is equal to 0.067.}}
\end{center}
\end{table}
\section*{Conclusion}
In this technical note we have outlined, how to implement the General Method of Moments estimator for one-factor interest rate process. We have demonstrated that one should pay close attention to the weighting matrix estimation.
\begin{thebibliography}{x}
\bibitem{ckls} Chan, K.C., Karolyi, Andrew G., Longstaff, Francis A. and Sanders, Anthony B., 1992, ``An Empirical Comparison of Alternative Models of the Short-Term Interest Rate,'' \textit{The Journal of Finance} 47, 1209--1227.
\bibitem{hansen} Hansen, Lars Peter, 1982, ``Large Sample Properties of Generalized Method of Moments Estimators,'' \textit{Econometrica} 50, 1029--1054.
\bibitem{newey} Newey, Whiteny, K. and West, Kenneth, D., 1987, ``A Simple, Positive Semi-Definite, Heteroskedasticity and Autocorrelation Consistent Covariance Matrix,'' \textit{Econometrica} 55, 703--708.
\end{thebibliography}
\end{document}