-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathfeats_transformations.py
More file actions
164 lines (109 loc) · 4.07 KB
/
feats_transformations.py
File metadata and controls
164 lines (109 loc) · 4.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import numpy as np
import pandas as pd
from singlecelldata import SingleCell
from sklearn.metrics import pairwise_distances
from scipy.stats import spearmanr
from .feats_utils import FeatureNormalize
### Pairwise Distances
def spearmans(X):
"""
X is a (n x d) matrix where rows are samples
Returns D which is a (n x n) matrix of distances between samples
"""
D, _ = spearmanr(X, axis = 1)
D = 1 - D
return D
def pearsons(X):
"""
X is a (n x d) matrix where rows are samples
Returns D which is a (n x n) matrix of distances between samples
"""
D = pairwise_distances(X, metric = "correlation")
return D
def euclidean(X):
"""
X is a (n x d) matrix where rows are samples
Returns D which is a (n x n) matrix of distances between samples
"""
D = pairwise_distances(X, metric = "euclidean", n_jobs = -1)
return D
### Kernels
def linear_kernel(X):
"""
X is a (n x d) matrix where rows are samples
Returns K which is a (n x n) kernel matrix
"""
K = np.dot(X, X.T)
return K
### Utility Functions
# Converts Distances to Kernels
def dist_to_kernel(D):
gamma = 1.0 / np.amax(D)
S = np.exp(-D * gamma)
return S
# Computes Gram Matrix from Kernel
def GramMatrix(K):
N_one = np.ones(K.shape) / K.shape[0]
K_bar = K - np.dot(N_one, K) - np.dot(K, N_one) + np.dot(np.dot(N_one, K), N_one)
return K_bar
def PCA(sc, n_comp = 1, dist_or_kernel = 'linear'):
"""
Computes and stores the Principal Components of the gene expression data stored in
the SingleCell object.
Parameters
----------
sc : SingleCell
The SingleCell object containing gene expression data.
n_comp : int, optional
The number of Principal Components to compute. Default 1.
dist_or_kernel : str, optional
The distance metric or the kernel to compute. If a distance metric is passed,
it computes the pairwise distance between the cells and then converts the distance
metrics to kernels. If a kernel is passed, it computes the kernel. Valid values
are 'linear' (default) for linear kernel, 'spearmans' for Spearmans distance,
'euclidean' for Euclidean distance and 'pearsons' for Pearsons distance.
Returns
-------
sc : SingleCell
The SingleCell object containing the dimensionnality reduced gene expression data. The
reduced dimensionality is n_comp. The gene names are removed and the features in the
reduced space are named 'PC1', 'PC2' and so on.
"""
sc = FeatureNormalize(sc, 'mean')
X = sc.getCounts()
if (dist_or_kernel == 'linear'):
K = GramMatrix(linear_kernel(X.T))
elif (dist_or_kernel == 'spearmans'):
K = GramMatrix(dist_to_kernel(spearmans(X.T)))
elif (dist_or_kernel == 'euclidean'):
K = GramMatrix(dist_to_kernel(euclidean(X.T)))
elif (dist_or_kernel == 'pearsons'):
K = GramMatrix(dist_to_kernel(pearsons(X.T)))
E_vec, E_val, _ = np.linalg.svd(K)
#print(E_val)
#print(E_vec[:,0])
# Sort Eigenvector
idx = np.argsort(E_val, kind = 'mergesort')
idx = np.flip(idx)
E_val = E_val[idx]
E_vec = E_vec[:, idx]
# Remove zero eigenvectors
idx2 = E_val > 0
E_val = E_val[idx2]
E_vec = E_vec[:, idx2]
# print("Maximum components possible = ", E_val.size)
# Scale eigenvectors so that np.dot(D[:,0].T, D[:, 0]) * E[0] = 1
E_val = np.reshape(E_val, [1, E_val.size])
# E_vec = E_vec / np.linalg.norm(E_vec, axis = 0, keepdims = True)
E_vec = E_vec / np.sqrt(E_val)
X_red = np.dot(E_vec[:, 0:n_comp].T, K)
data = pd.DataFrame(X_red)
celldata = sc.celldata
pcs = []
for i in range(n_comp):
pcs.append('PC' + str(i+1))
genedata = pd.DataFrame(pcs, index = data.index, columns = ['principal_components'])
sc_red = SingleCell(dataset = sc.dataset + "_reduced", data = data, celldata = celldata, genedata = genedata)
# print(E_vec[:,0])
# print(X_red)
return sc_red