Distance Metrics
Euclidean distance
\begin{equation}
\sqrt{\sum_{i=1}^n (x_i-y_i)^2}
\end{equation}
Manhattan distance
\begin{equation}
\sum_{i=1}^n |x_i-y_i|
\end{equation}
Hamming distance (x and y are binary vectors)
\begin{equation}
\sum_{i=1}^n |x_i-y_i|
\end{equation}
Minkowski distance
\begin{equation}
\left(\sum_{i=1}^n |x_i-y_i|^p\right)^{1/p}
\end{equation}
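A minimal NumPy sketch of the four distance metrics above (the function names and sample vectors are illustrative, not part of the original):

```python
import numpy as np

def euclidean(x, y):
    # square root of the sum of squared coordinate differences
    return np.sqrt(np.sum((x - y) ** 2))

def manhattan(x, y):
    # sum of absolute coordinate differences
    return np.sum(np.abs(x - y))

def hamming(x, y):
    # for binary vectors: the number of mismatched positions
    return np.sum(x != y)

def minkowski(x, y, p):
    # generalizes Manhattan (p=1) and Euclidean (p=2)
    return np.sum(np.abs(x - y) ** p) ** (1 / p)

x = np.array([1.0, 2.0, 3.0])
y = np.array([4.0, 0.0, 3.0])
print(euclidean(x, y), manhattan(x, y), minkowski(x, y, 2))
```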
Quadratic equation
\begin{equation}
x = \frac{-b \pm \sqrt{b^2-4ac}}{2a}
\end{equation}
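A quick sketch of the quadratic formula, assuming real roots (the coefficients below are arbitrary):

```python
import math

def quadratic_roots(a, b, c):
    # two real roots of a*x^2 + b*x + c = 0 (assumes b^2 - 4ac >= 0)
    disc = math.sqrt(b ** 2 - 4 * a * c)
    return (-b + disc) / (2 * a), (-b - disc) / (2 * a)

print(quadratic_roots(1, -3, 2))  # (2.0, 1.0)
```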
Univariate Statistics
Population mean
\begin{equation}
\mu = \frac{1}{N} \sum_{i=1}^N x_i
\end{equation}
Standard deviation
\begin{equation}
\sigma = \sqrt{\frac{1}{N} \sum_{i=1}^N (x_i - \mu)^2}
\end{equation}
Variance (sample)
\begin{equation}
\sigma_{x}^{2} = \frac{1}{n-1} \sum_{i=1}^{n} (x_i - \bar{x})^2
\end{equation}
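A short NumPy sketch of the three statistics above; `ddof=0` gives the population formulas, `ddof=1` the sample variance (the data array is toy input):

```python
import numpy as np

x = np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])

mu = x.mean()                # population mean
sigma = x.std(ddof=0)        # population standard deviation (divides by N)
var_sample = x.var(ddof=1)   # sample variance (divides by n - 1)

print(mu, sigma, var_sample)
```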
Pre-processing
Standardization (z-score)
\begin{equation}
z = \frac{x - \mu}{\sigma}
\end{equation}
Min-max normalization
\begin{equation}
X_{norm} = \frac{X - X_{min}}{X_{max}-X_{min}}
\end{equation}
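A minimal sketch of both scalings (the array `X` is toy data):

```python
import numpy as np

X = np.array([1.0, 5.0, 10.0, 14.0])

# z-score standardization: zero mean, unit variance
z = (X - X.mean()) / X.std()

# min-max normalization: rescale to the [0, 1] range
X_norm = (X - X.min()) / (X.max() - X.min())

print(z, X_norm)
```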
Comparing two vectors
Pearson Correlation
\begin{equation}
\rho_{X,Y} = \frac{\text{cov}(X,Y)}{\sigma_X \sigma_Y}
\end{equation}
Spearman Correlation (\(d_i\) is the difference between the ranks of \(x_i\) and \(y_i\))
\begin{equation}
\rho_{s} = \rho_{X_{[i]},Y_{[i]}} = 1- {\frac {6 \sum d_i^2}{n(n^2 - 1)}}
\end{equation}
Cosine Similarity
\begin{equation}
\cos(\pmb x, \pmb y) = \frac {\pmb x \cdot \pmb y}{\|\pmb x\| \cdot \|\pmb y\|}
\end{equation}
Covariance
\begin{equation}
S_{xy} = \frac{1}{n-1}\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})
\end{equation}
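A sketch of the four comparisons above using NumPy and SciPy (the vectors are toy data; `scipy.stats` is an added dependency, not mentioned in the original):

```python
import numpy as np
from scipy import stats

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([2.0, 1.0, 4.0, 3.0, 6.0])

pearson_r, _ = stats.pearsonr(x, y)    # covariance scaled by both standard deviations
spearman_r, _ = stats.spearmanr(x, y)  # Pearson correlation of the ranks
cosine = x @ y / (np.linalg.norm(x) * np.linalg.norm(y))
cov_xy = np.cov(x, y)[0, 1]            # sample covariance (divides by n - 1)

print(pearson_r, spearman_r, cosine, cov_xy)
```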
Eigenvector and Eigenvalue
\begin{equation}
\pmb A\pmb{v} = \lambda\pmb{v}
\end{equation}
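A minimal check of the eigen-decomposition with NumPy (the matrix `A` is arbitrary):

```python
import numpy as np

A = np.array([[4.0, 2.0],
              [1.0, 3.0]])

# eigenvalues and eigenvectors of A (columns of vecs are the eigenvectors)
vals, vecs = np.linalg.eig(A)

# verify A v = lambda v for the first eigenpair
v = vecs[:, 0]
print(np.allclose(A @ v, vals[0] * v))  # True
```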
Binomial distribution
\begin{equation}
P(X=k) = {n \choose k} \cdot p^k \cdot (1-p)^{n-k}
\end{equation}
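A small sketch of the binomial probability mass function (the values of `n`, `k`, and `p` are illustrative):

```python
from math import comb

def binom_pmf(k, n, p):
    # probability of exactly k successes in n independent trials
    return comb(n, k) * p ** k * (1 - p) ** (n - k)

print(binom_pmf(k=3, n=10, p=0.5))  # ~0.117
```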
Gaussian distribution
Univariate
\begin{equation}
x \sim N(\mu, \sigma^2)
\end{equation}
\begin{equation}
p(x) = \frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{1}{2}\left(\frac{x-\mu}{\sigma}\right)^2}
\end{equation}
Multivariate
\begin{equation}
\pmb x \sim N(\pmb \mu, \Sigma)
\end{equation}
\begin{equation}
p(\pmb x) = \frac{1}{(2\pi)^{d/2} |\Sigma|^{1/2}} e^{-\frac{1}{2}(\pmb x - \pmb \mu)^T \Sigma^{-1}(\pmb x - \pmb \mu)}
\end{equation}
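A sketch of both densities implemented directly from the formulas above (for real use, `scipy.stats.norm` and `scipy.stats.multivariate_normal` would be the usual choice):

```python
import numpy as np

def gaussian_pdf(x, mu, sigma):
    # univariate normal density
    return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / np.sqrt(2 * np.pi * sigma ** 2)

def multivariate_gaussian_pdf(x, mu, Sigma):
    # d-dimensional normal density
    d = len(mu)
    diff = x - mu
    norm_const = 1.0 / ((2 * np.pi) ** (d / 2) * np.linalg.det(Sigma) ** 0.5)
    return norm_const * np.exp(-0.5 * diff @ np.linalg.inv(Sigma) @ diff)

print(gaussian_pdf(0.0, mu=0.0, sigma=1.0))                             # ~0.3989
print(multivariate_gaussian_pdf(np.zeros(2), np.zeros(2), np.eye(2)))   # ~0.1592
```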
Maximum Likelihood Estimate
Given,
\begin{equation}
D = \left\{ \pmb x_1, \pmb x_2,..., \pmb x_n \right\}
\end{equation}
Assuming the samples are i.i.d.,
\begin{equation}
p(D| \pmb \theta) = p(\pmb x_1 | \pmb \theta) \cdot p(\pmb x_2 | \pmb \theta) \cdots p(\pmb x_n | \pmb \theta)
\end{equation}
\begin{equation}
p(D| \pmb \theta)= \prod_{k=1}^{n} p(\pmb x_k | \pmb \theta)
\end{equation}
The log-likelihood is
\begin{equation}
l(\pmb \theta) = \sum_{k=1}^{n} \ln p(\pmb x_k|\pmb \theta)
\end{equation}
Differentiating and solving for \(\pmb \theta\):
\begin{equation}
\nabla_{\pmb \theta} \equiv \begin{bmatrix}
\frac{\partial }{\partial \theta_1} \\
\frac{\partial }{\partial \theta_2} \\
\vdots\\
\frac{\partial }{\partial \theta_p}\end{bmatrix}
\end{equation}
\begin{equation}
\nabla_{\pmb \theta} l(\pmb\theta) \equiv \begin{bmatrix}
\frac{\partial l(\pmb\theta)}{\partial \theta_1} \\
\frac{\partial l(\pmb\theta)}{\partial \theta_2} \\
\vdots\\
\frac{\partial l(\pmb\theta)}{\partial \theta_p}\end{bmatrix}
= \begin{bmatrix}
0 \\
0 \\
\vdots\\
0\end{bmatrix}
\end{equation}
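A sketch of the MLE recipe for the special case of a univariate Gaussian, where setting the gradient of the log-likelihood to zero has a closed-form solution (the data below are synthetic):

```python
import numpy as np

# i.i.d. samples assumed drawn from a univariate Gaussian (toy data)
rng = np.random.default_rng(0)
x = rng.normal(loc=2.0, scale=1.5, size=1000)

def log_likelihood(x, mu, sigma2):
    # l(theta) = sum_k ln p(x_k | theta) for a Gaussian with theta = (mu, sigma^2)
    return np.sum(-0.5 * np.log(2 * np.pi * sigma2) - (x - mu) ** 2 / (2 * sigma2))

# setting the gradient of l(theta) to zero yields the closed-form estimates
mu_mle = x.mean()
sigma2_mle = np.mean((x - mu_mle) ** 2)   # note: MLE divides by n, not n - 1

print(mu_mle, sigma2_mle, log_likelihood(x, mu_mle, sigma2_mle))
```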
Linear Discriminant Analysis
Within-class scatter matrix
\begin{equation}
S_w = \sum\limits_{i=1}^{c} S_i
\end{equation}
where,
\begin{equation}
S_i = \sum\limits_{\pmb x \in D_i} (\pmb x - \pmb m_i)\;(\pmb x - \pmb m_i)^T
\end{equation}
\begin{equation}
\pmb m_i = \frac{1}{n_i} \sum\limits_{\pmb x \in D_i} \pmb x
\end{equation}
Between-class scatter matrix
\begin{equation}
S_b = \sum\limits_{i=1}^{c} (\pmb m_i - \pmb m) (\pmb m_i - \pmb m)^T
\end{equation}
\begin{equation}
\Phi_{lda}=\arg\max_{\Phi} \frac{|\Phi^TS_b\Phi|}{|\Phi^TS_w\Phi|}
\end{equation}
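A rough NumPy sketch of the scatter matrices and LDA directions as defined above (the data and labels are toy values; the between-class scatter follows the unweighted formula given above):

```python
import numpy as np

# toy data: rows of X are samples, y holds integer class labels (illustrative only)
X = np.array([[1.0, 2.0], [1.5, 1.8], [1.2, 0.5],
              [5.0, 8.0], [6.0, 9.0], [9.0, 11.0]])
y = np.array([0, 0, 0, 1, 1, 1])

d = X.shape[1]
m = X.mean(axis=0)                       # overall mean vector

S_w = np.zeros((d, d))                   # within-class scatter: sum of the S_i
S_b = np.zeros((d, d))                   # between-class scatter (unweighted, as above)
for c in np.unique(y):
    X_c = X[y == c]
    m_c = X_c.mean(axis=0)
    S_w += (X_c - m_c).T @ (X_c - m_c)
    diff = (m_c - m).reshape(-1, 1)
    S_b += diff @ diff.T

# LDA directions: eigenvectors of S_w^{-1} S_b, sorted by decreasing eigenvalue
vals, vecs = np.linalg.eig(np.linalg.inv(S_w) @ S_b)
order = np.argsort(vals.real)[::-1]
Phi = vecs[:, order].real
print(Phi)
```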
Multiple Linear Regression
\begin{equation}
\pmb X \pmb w = \pmb y
\end{equation}
\begin{equation}
\Bigg[ \begin{array}{cc}
x_1 & 1 \\
\vdots & \vdots \\
x_n & 1 \end{array} \Bigg]\bigg[ \begin{array}{c}
w \\
b \end{array} \bigg]=\Bigg[ \begin{array}{c}
y_1 \\
\vdots \\
y_n \end{array} \Bigg]
\end{equation}
\begin{equation}
\pmb w = (\pmb X^T \pmb X)^{-1} \pmb X^T \pmb y
\end{equation}
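A minimal sketch of the normal-equation solution, using toy 1-D data with an appended bias column to match the matrix form above:

```python
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([2.1, 3.9, 6.2, 8.0, 9.8])

# design matrix: one column for x, one column of ones for the intercept b
X = np.column_stack([x, np.ones_like(x)])

# normal-equation solution w = (X^T X)^{-1} X^T y
w = np.linalg.inv(X.T @ X) @ X.T @ y
# np.linalg.lstsq solves the same least-squares problem more stably
w_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)

print(w, w_lstsq)
```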
Contour Regression
\begin{equation}
\pmb X \pmb w = \pmb y
\end{equation}
\begin{equation}
\Bigg[ \begin{array}{cc}
x_1 & 1 \\
\vdots & \vdots \\
x_n & 1 \end{array} \Bigg]\bigg[ \begin{array}{c}
w \\
b \end{array} \bigg]=\Bigg[ \begin{array}{c}
y_1 \\
\vdots \\
y_n \end{array} \Bigg]
\end{equation}
Step 1: Initialize \(\;\pmb w \;\) using \(\;\pmb w = (\pmb X^T \pmb X)^{-1} \pmb X^T \pmb y\)
Step 2: Repeat until convergence
Step 2a: Reorder \(\;\pmb X \;\) based on the latest \(\hat{y}\)
Step 2b: Estimate \(\;\pmb w \;\) using
\begin{equation}
\pmb w = (\pmb X^T \pmb X)^{-1} \pmb X^T (\pmb y+\hat{y}_{[i]})
\end{equation}
Naive Bayes classifier
Posterior probability:
\begin{equation}
P(\omega_j|x) = \frac{p(x|\omega_j) \cdot P(\omega_j)}{p(x)}
\end{equation}
\begin{equation}
\Rightarrow \text{posterior} = \frac{ \text{likelihood} \cdot \text{prior}}{\text{evidence}}
\end{equation}
Decision rule (decide \(\omega_1\) if):
\begin{equation}
\frac{p(x|\omega_1) \cdot P(\omega_1)}{p(x)} > \frac{p(x|\omega_2) \cdot P(\omega_2)}{p(x)}
\end{equation}
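A tiny numeric sketch of the posterior computation and the decision rule above (the likelihoods and priors are made-up numbers):

```python
# toy class-conditional likelihoods p(x | w_j) and priors P(w_j) for two classes
likelihood = {"w1": 0.6, "w2": 0.2}
prior = {"w1": 0.4, "w2": 0.6}

# evidence p(x) normalizes the posteriors so they sum to one
evidence = sum(likelihood[w] * prior[w] for w in likelihood)
posterior = {w: likelihood[w] * prior[w] / evidence for w in likelihood}

# decision rule: choose the class with the larger posterior
decision = "w1" if posterior["w1"] > posterior["w2"] else "w2"
print(posterior, decision)
```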