
GCCA vs PCA

[1]:
from mvlearn.embed import GCCA
import matplotlib.pyplot as plt
import numpy as np
import scipy
%matplotlib inline
import seaborn as sns
from scipy.sparse.linalg import svds
[2]:
def get_train_test(n=100, mu=0, var=1, var2=1, nviews=3, m=1000):
    # Creates train and test data for `nviews` views of m features each:
    # - a shared signal feature drawn with mean mu and scale var
    #   (placed at column i in view i)
    # - an independent noise feature drawn with mean 0 and scale var2
    #   (the last column of each view)
    # - m - 2 independent background noise features ~ N(0, 1)
    np.random.seed(0)

    X_TRAIN = np.random.normal(mu,var,(n,1))
    X_TEST = np.random.normal(mu,var,(n,1))

    Xs_train = []
    Xs_test = []
    for i in range(nviews):
        X_train = np.hstack((np.random.normal(0,1,(n,i)),
                             X_TRAIN,
                             np.random.normal(0,1,(n,m-2-i)),
                             np.random.normal(0,var2,(n,1))
                            ))
        X_test = np.hstack((np.random.normal(0,1,(n,i)),
                            X_TEST,
                            np.random.normal(0,1,(n,m-2-i)),
                            np.random.normal(0,var2,(n,1))
                           ))

        Xs_train.append(X_train)
        Xs_test.append(X_test)

    return Xs_train, Xs_test
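
As a quick sanity check on this generator (not part of the original notebook), the sketch below draws a small dataset and confirms that view i carries the shared column at index i plus one extra noise column at the end; the n=50, m=20 sizes are just illustrative choices.

[ ]:
Xs_tr, Xs_te = get_train_test(n=50, var=10, var2=10, nviews=3, m=20)
for i, X in enumerate(Xs_tr):
    # Column i is the shared signal and the last column is the extra
    # noise feature; both were drawn with scale 10 here, so their sample
    # standard deviations should stand out from the ~1 background columns.
    print(X.shape, X[:, i].std().round(2), X[:, -1].std().round(2))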

Positive Test

Setting:

1 high-variance shared signal feature and 1 high-variance independent noise feature per view (var=10, var2=10)

[3]:
nviews = 3
Xs_train, Xs_test = get_train_test(var=10,var2=10,nviews=nviews,m=1000)
[5]:
gcca = GCCA(n_components=2)
gcca.fit(Xs_train)
Xs_hat = gcca.transform(Xs_test)

Results:

  • GCCA recovers the shared signal: its first component is highly correlated across views on the test data (a single-number summary follows the output below)
[6]:
np.corrcoef(np.array(Xs_hat)[:,:,0])
[6]:
array([[1.        , 0.99698235, 0.99687182],
       [0.99698235, 1.        , 0.99689792],
       [0.99687182, 0.99689792, 1.        ]])
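
To compare the two methods with a single number, one option (not in the original notebook) is the mean off-diagonal entry of such a correlation matrix:

[ ]:
# Mean of the 6 off-diagonal entries of the 3x3 cross-view correlation
# matrix; values near 1 mean the views agree on the recovered component.
corr = np.corrcoef(np.array(Xs_hat)[:, :, 0])
print((corr.sum() - np.trace(corr)) / (corr.size - len(corr)))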
[7]:
Xs_hat = []
for i in range(len(Xs_train)):
    # Top right singular vector of the training view = first principal
    # direction (the data is mean zero by construction, so no centering).
    _, _, vt = svds(Xs_train[i], k=1)
    # Project the test view onto that direction.
    Xs_hat.append(Xs_test[i] @ vt.T)
  • PCA selects the shared dimension but also the high-variance noise dimension, so the cross-view correlations on the test data are weaker and inconsistent in sign (a check of the top loadings follows the output below)
[8]:
np.corrcoef(np.array(Xs_hat)[:,:,0])
[8]:
array([[ 1.        , -0.54014795,  0.51173297],
       [-0.54014795,  1.        , -0.98138902],
       [ 0.51173297, -0.98138902,  1.        ]])
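
A quick way to check this claim (a sketch, not part of the original notebook) is to look at which column each training view's top singular vector loads on most heavily: by construction the shared signal sits at column i in view i and the extra high-variance noise feature at the last column, index 999 here.

[ ]:
# Index of the largest-magnitude loading in each view's top principal
# direction: column i would mean the shared signal, column 999 the
# independent high-variance noise feature.
for i in range(len(Xs_train)):
    _, _, vt = svds(Xs_train[i], k=1)
    print(i, np.argmax(np.abs(vt[0])))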

Negative Test

Setting:

1 low-variance shared signal feature and 1 low-variance independent noise feature per view (var=1, var2=1), so the shared column is no stronger than the background noise

[9]:
nviews = 3
Xs_train, Xs_test = get_train_test(var=1,var2=1,nviews=nviews,m=1000)
[10]:
gcca = GCCA(n_components = 2)
gcca.fit(Xs_train)
Xs_hat = gcca.transform(Xs_test)

Results:

  • GCCA fails to find the weak shared feature, so the cross-view correlations on the test data are low (a comparison with the training projections follows the output below)
[11]:
np.corrcoef(np.array(Xs_hat)[:,:,0])
[11]:
array([[ 1.        ,  0.31254995, -0.02208907],
       [ 0.31254995,  1.        ,  0.13722633],
       [-0.02208907,  0.13722633,  1.        ]])
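
One way to see that GCCA is fitting noise here rather than a real shared direction (a quick check, not in the original notebook) is to compute the same correlation on the training projections; a large gap between the training and test correlations indicates overfitting to the training sample.

[ ]:
# Cross-view correlation of the first GCCA component on the *training*
# data, for comparison with the test-data matrix above.
Xs_hat_train = gcca.transform(Xs_train)
print(np.corrcoef(np.array(Xs_hat_train)[:, :, 0]))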
[12]:
Xs_hat = []
for i in range(len(Xs_train)):
    # Same PCA-style projection as above, now in the low-variance setting.
    _, _, vt = svds(Xs_train[i], k=1)
    Xs_hat.append(Xs_test[i] @ vt.T)
  • PCA likewise fails to find the shared feature, so the cross-view correlations on the test data are low (the loading check after the output below shows where the top principal direction points instead)
[13]:
np.corrcoef(np.array(Xs_hat)[:,:,0])
[13]:
array([[1.        , 0.01016507, 0.0888701 ],
       [0.01016507, 1.        , 0.03812276],
       [0.0888701 , 0.03812276, 1.        ]])
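
The same loading check as in the positive test applies here (again a sketch, not part of the original notebook): with var=var2=1 no column has more variance than the background noise, so the largest-magnitude loading of the top principal direction is not expected to land on the shared column.

[ ]:
# Where does each view's top principal direction point now? With all
# columns at comparable scale, the answer is essentially arbitrary with
# respect to the shared column at index i.
for i in range(len(Xs_train)):
    _, _, vt = svds(Xs_train[i], k=1)
    print(i, np.argmax(np.abs(vt[0])))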