_isomap.py
9.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
"""Isomap for manifold learning"""
# Author: Jake Vanderplas -- <vanderplas@astro.washington.edu>
# License: BSD 3 clause (C) 2011
import numpy as np
from ..base import BaseEstimator, TransformerMixin
from ..neighbors import NearestNeighbors, kneighbors_graph
from ..utils.deprecation import deprecated
from ..utils.validation import check_is_fitted
from ..utils.validation import _deprecate_positional_args
from ..utils.graph import graph_shortest_path
from ..decomposition import KernelPCA
from ..preprocessing import KernelCenterer
class Isomap(TransformerMixin, BaseEstimator):
"""Isomap Embedding
Non-linear dimensionality reduction through Isometric Mapping
Read more in the :ref:`User Guide <isomap>`.
Parameters
----------
n_neighbors : integer
number of neighbors to consider for each point.
n_components : integer
number of coordinates for the manifold
eigen_solver : ['auto'|'arpack'|'dense']
'auto' : Attempt to choose the most efficient solver
for the given problem.
'arpack' : Use Arnoldi decomposition to find the eigenvalues
and eigenvectors.
'dense' : Use a direct solver (i.e. LAPACK)
for the eigenvalue decomposition.
tol : float
Convergence tolerance passed to arpack or lobpcg.
not used if eigen_solver == 'dense'.
max_iter : integer
Maximum number of iterations for the arpack solver.
not used if eigen_solver == 'dense'.
path_method : string ['auto'|'FW'|'D']
Method to use in finding shortest path.
'auto' : attempt to choose the best algorithm automatically.
'FW' : Floyd-Warshall algorithm.
'D' : Dijkstra's algorithm.
neighbors_algorithm : string ['auto'|'brute'|'kd_tree'|'ball_tree']
Algorithm to use for nearest neighbors search,
passed to neighbors.NearestNeighbors instance.
n_jobs : int or None, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
metric : string, or callable, default="minkowski"
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by :func:`sklearn.metrics.pairwise_distances` for
its metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square. X may be a :term:`Glossary <sparse graph>`.
.. versionadded:: 0.22
p : int, default=2
Parameter for the Minkowski metric from
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
.. versionadded:: 0.22
metric_params : dict, default=None
Additional keyword arguments for the metric function.
.. versionadded:: 0.22
Attributes
----------
embedding_ : array-like, shape (n_samples, n_components)
Stores the embedding vectors.
kernel_pca_ : object
:class:`~sklearn.decomposition.KernelPCA` object used to implement the
embedding.
nbrs_ : sklearn.neighbors.NearestNeighbors instance
Stores nearest neighbors instance, including BallTree or KDtree
if applicable.
dist_matrix_ : array-like, shape (n_samples, n_samples)
Stores the geodesic distance matrix of training data.
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import Isomap
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = Isomap(n_components=2)
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
References
----------
.. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric
framework for nonlinear dimensionality reduction. Science 290 (5500)
"""
@_deprecate_positional_args
def __init__(self, *, n_neighbors=5, n_components=2, eigen_solver='auto',
tol=0, max_iter=None, path_method='auto',
neighbors_algorithm='auto', n_jobs=None, metric='minkowski',
p=2, metric_params=None):
self.n_neighbors = n_neighbors
self.n_components = n_components
self.eigen_solver = eigen_solver
self.tol = tol
self.max_iter = max_iter
self.path_method = path_method
self.neighbors_algorithm = neighbors_algorithm
self.n_jobs = n_jobs
self.metric = metric
self.p = p
self.metric_params = metric_params
def _fit_transform(self, X):
self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors,
algorithm=self.neighbors_algorithm,
metric=self.metric, p=self.p,
metric_params=self.metric_params,
n_jobs=self.n_jobs)
self.nbrs_.fit(X)
self.n_features_in_ = self.nbrs_.n_features_in_
self.kernel_pca_ = KernelPCA(n_components=self.n_components,
kernel="precomputed",
eigen_solver=self.eigen_solver,
tol=self.tol, max_iter=self.max_iter,
n_jobs=self.n_jobs)
kng = kneighbors_graph(self.nbrs_, self.n_neighbors,
metric=self.metric, p=self.p,
metric_params=self.metric_params,
mode='distance', n_jobs=self.n_jobs)
self.dist_matrix_ = graph_shortest_path(kng,
method=self.path_method,
directed=False)
G = self.dist_matrix_ ** 2
G *= -0.5
self.embedding_ = self.kernel_pca_.fit_transform(G)
# mypy error: Decorated property not supported
@deprecated( # type: ignore
"Attribute `training_data_` was deprecated in version 0.22 and"
" will be removed in 0.24."
)
@property
def training_data_(self):
check_is_fitted(self)
return self.nbrs_._fit_X
def reconstruction_error(self):
"""Compute the reconstruction error for the embedding.
Returns
-------
reconstruction_error : float
Notes
-----
The cost function of an isomap embedding is
``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``
Where D is the matrix of distances for the input data X,
D_fit is the matrix of distances for the output embedding X_fit,
and K is the isomap kernel:
``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``
"""
G = -0.5 * self.dist_matrix_ ** 2
G_center = KernelCenterer().fit_transform(G)
evals = self.kernel_pca_.lambdas_
return np.sqrt(np.sum(G_center ** 2) - np.sum(evals ** 2)) / G.shape[0]
def fit(self, X, y=None):
"""Compute the embedding vectors for data X
Parameters
----------
X : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors}
Sample data, shape = (n_samples, n_features), in the form of a
numpy array, sparse graph, precomputed tree, or NearestNeighbors
object.
y : Ignored
Returns
-------
self : returns an instance of self.
"""
self._fit_transform(X)
return self
def fit_transform(self, X, y=None):
"""Fit the model from data in X and transform X.
Parameters
----------
X : {array-like, sparse graph, BallTree, KDTree}
Training vector, where n_samples in the number of samples
and n_features is the number of features.
y : Ignored
Returns
-------
X_new : array-like, shape (n_samples, n_components)
"""
self._fit_transform(X)
return self.embedding_
def transform(self, X):
"""Transform X.
This is implemented by linking the points X into the graph of geodesic
distances of the training data. First the `n_neighbors` nearest
neighbors of X are found in the training data, and from these the
shortest geodesic distances from each point in X to each point in
the training data are computed in order to construct the kernel.
The embedding of X is the projection of this kernel onto the
embedding vectors of the training set.
Parameters
----------
X : array-like, shape (n_queries, n_features)
If neighbors_algorithm='precomputed', X is assumed to be a
distance matrix or a sparse graph of shape
(n_queries, n_samples_fit).
Returns
-------
X_new : array-like, shape (n_queries, n_components)
"""
check_is_fitted(self)
distances, indices = self.nbrs_.kneighbors(X, return_distance=True)
# Create the graph of shortest distances from X to
# training data via the nearest neighbors of X.
# This can be done as a single array operation, but it potentially
# takes a lot of memory. To avoid that, use a loop:
n_samples_fit = self.nbrs_.n_samples_fit_
n_queries = distances.shape[0]
G_X = np.zeros((n_queries, n_samples_fit))
for i in range(n_queries):
G_X[i] = np.min(self.dist_matrix_[indices[i]] +
distances[i][:, None], 0)
G_X **= 2
G_X *= -0.5
return self.kernel_pca_.transform(G_X)