Commit f84e646

Merge remote-tracking branch 'upstream/master' into dbscan-doc-enh

2 parents: 233c7dc + 4063d9e

25 files changed: +728 -347 lines

doc/modules/classes.rst (+1 -0)

@@ -217,6 +217,7 @@ Samples generator
    decomposition.KernelPCA
    decomposition.FactorAnalysis
    decomposition.FastICA
+   decomposition.TruncatedSVD
    decomposition.NMF
    decomposition.SparsePCA
    decomposition.MiniBatchSparsePCA

doc/modules/cross_validation.rst (+5 -2)

@@ -277,8 +277,11 @@ not waste much data as only one sample is removed from the learning set::
 Leave-P-Out - LPO
 -----------------
 
-:class:`LeavePOut` is very similar to *Leave-One-Out*, as it creates all the
-possible training/test sets by removing :math:`P` samples from the complete set.
+:class:`LeavePOut` is very similar to :class:`LeaveOneOut` as it creates all
+the possible training/test sets by removing :math:`p` samples from the complete
+set. For :math:`n` samples, this produces :math:`{n \choose p}` train-test
+pairs. Unlike :class:`LeaveOneOut` and :class:`KFold`, the test sets will
+overlap for :math:`p > 1`.
 
 Example of Leave-2-Out::
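
Note: to make the new :math:`{n \choose p}` claim concrete, here is a minimal
sketch (assuming the ``sklearn.cross_validation`` API of this release; the
"Example of Leave-2-Out" section in the doc itself shows the full split
listing):

    # Count the train-test pairs LeavePOut generates for n=4, p=2:
    # C(4, 2) = 6, and each sample appears in several test sets (they overlap).
    from sklearn.cross_validation import LeavePOut

    lpo = LeavePOut(4, 2)
    print(len(list(lpo)))  # 6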

doc/modules/decomposition.rst (+80 -0)

@@ -232,6 +232,86 @@ factorization, while larger values shrink many coefficients to zero.
     R. Jenatton, G. Obozinski, F. Bach, 2009
 
 
+.. _LSA:
+
+Truncated singular value decomposition and latent semantic analysis
+===================================================================
+
+:class:`TruncatedSVD` implements a variant of singular value decomposition
+(SVD) that only computes the :math:`k` largest singular values,
+where :math:`k` is a user-specified parameter.
+
+When truncated SVD is applied to term-document matrices
+(as returned by ``CountVectorizer`` or ``TfidfVectorizer``),
+this transformation is known as
+`latent semantic analysis <http://nlp.stanford.edu/IR-book/pdf/18lsi.pdf>`_
+(LSA), because it transforms such matrices
+to a "semantic" space of low dimensionality.
+In particular, LSA is known to combat the effects of synonymy (several
+words sharing one meaning) and polysemy (one word carrying several
+meanings), which cause term-document matrices to be overly sparse
+and exhibit poor similarity under measures such as cosine similarity.
+
+.. note::
+    LSA is also known as latent semantic indexing, LSI,
+    though strictly that refers to its use in persistent indexes
+    for information retrieval purposes.
+
+Mathematically, truncated SVD applied to training samples :math:`X`
+produces a low-rank approximation :math:`X_k`:
+
+.. math::
+    X \approx X_k = U_k \Sigma_k V_k^\top
+
+After this operation, :math:`U_k \Sigma_k`
+is the transformed training set with :math:`k` features
+(called ``n_components`` in the API).
+
+To also transform a test set :math:`X`, we multiply it with :math:`V_k`:
+
+.. math::
+    X' = X V_k
+
+.. note::
+    Most treatments of LSA in the natural language processing (NLP)
+    and information retrieval (IR) literature
+    swap the axes of the matrix :math:`X` so that it has shape
+    ``n_features`` × ``n_samples``.
+    We present LSA in a different way that matches the scikit-learn API better,
+    but the singular values found are the same.
+
+:class:`TruncatedSVD` is very similar to :class:`PCA`, but differs
+in that it works on sample matrices :math:`X` directly
+instead of their covariance matrices.
+When the columnwise (per-feature) means of :math:`X`
+are subtracted from the feature values,
+truncated SVD on the resulting matrix is equivalent to PCA.
+In practical terms, this means
+that the :class:`TruncatedSVD` transformer accepts ``scipy.sparse``
+matrices without the need to densify them,
+as densifying may fill up memory even for medium-sized document collections.
+
+While the :class:`TruncatedSVD` transformer
+works with any (sparse) feature matrix,
+using it on tf–idf matrices is recommended over raw frequency counts
+in an LSA/document processing setting.
+In particular, sublinear scaling and inverse document frequency
+should be turned on (``sublinear_tf=True, use_idf=True``)
+to bring the feature values closer to a Gaussian distribution,
+compensating for LSA's erroneous assumptions about textual data.
+
+.. topic:: Examples:
+
+   * :ref:`example_document_clustering.py`
+
+.. topic:: References:
+
+   * Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze (2008),
+     *Introduction to Information Retrieval*, Cambridge University Press,
+     chapter 18: `Matrix decompositions & latent semantic indexing
+     <http://nlp.stanford.edu/IR-book/pdf/18lsi.pdf>`_
+
+
 .. _DictionaryLearning:
 
 Dictionary Learning
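
Note: as a hedged companion to the new narrative above, a minimal LSA sketch
(the corpus, variable names, and ``n_components`` value are illustrative, not
part of the commit):

    # Reduce a sparse tf-idf matrix to k "semantic" features with TruncatedSVD.
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer

    corpus = ["the cat sat on the mat",
              "a dog chased the cat",
              "stock markets fell sharply today"]

    # sublinear_tf/use_idf turned on, as the new doc section recommends for LSA
    vect = TfidfVectorizer(sublinear_tf=True, use_idf=True)
    X = vect.fit_transform(corpus)      # sparse, shape (n_samples, n_features)

    svd = TruncatedSVD(n_components=2)  # k = 2 (``n_components`` in the API)
    X_k = svd.fit_transform(X)          # dense (3, 2); this is U_k * Sigma_k
    # For unseen documents, transform(X_new) multiplies X_new by
    # svd.components_.T, i.e. the X' = X V_k projection from the formula above.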

doc/whats_new.rst (+8 -0)

@@ -102,6 +102,11 @@ Changelog
 - Refactored and vectorized implementation of :func:`metrics.roc_curve`
   and :func:`metrics.precision_recall_curve`. By `Joel Nothman`_.
 
+- The new estimator :class:`sklearn.decomposition.TruncatedSVD`
+  performs dimensionality reduction using SVD on sparse matrices,
+  and can be used for latent semantic analysis (LSA).
+  By `Lars Buitinck`_.
+
 
 API changes summary
 -------------------

@@ -121,6 +126,9 @@ API changes summary
 - ``gcv_mode="auto"`` no longer tries to perform SVD on a densified
   sparse matrix in :class:`sklearn.linear_model.RidgeCV`.
 
+- Sparse matrix support in :class:`sklearn.decomposition.RandomizedPCA`
+  is now deprecated in favor of the new ``TruncatedSVD``.
+
 
 .. _changes_0_13_1:

examples/document_clustering.py (+23 -3)

@@ -53,10 +53,12 @@
 from __future__ import print_function
 
 from sklearn.datasets import fetch_20newsgroups
+from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.feature_extraction.text import HashingVectorizer
 from sklearn.feature_extraction.text import TfidfTransformer
 from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import Normalizer
 from sklearn import metrics
 
 from sklearn.cluster import KMeans, MiniBatchKMeans

@@ -75,6 +77,9 @@
 
 # parse commandline arguments
 op = OptionParser()
+op.add_option("--lsa",
+              dest="n_components", type="int",
+              help="Preprocess documents with latent semantic analysis.")
 op.add_option("--no-minibatch",
               action="store_false", dest="minibatch", default=True,
               help="Use ordinary k-means algorithm (in batch mode).")

@@ -87,6 +92,9 @@
 op.add_option("--n-features", type=int, default=10000,
               help="Maximum number of features (dimensions)"
                    "to extract from text.")
+op.add_option("--verbose",
+              action="store_true", dest="verbose", default=False,
+              help="Print progress reports inside k-means algorithm.")
 
 print(__doc__)
 op.print_help()

@@ -147,17 +155,29 @@
 print("n_samples: %d, n_features: %d" % X.shape)
 print()
 
+if opts.n_components:
+    print("Performing dimensionality reduction using LSA")
+    t0 = time()
+    lsa = TruncatedSVD(opts.n_components)
+    X = lsa.fit_transform(X)
+    # Vectorizer results are normalized, which makes KMeans behave as
+    # spherical k-means for better results. Since LSA/SVD results are
+    # not normalized, we have to redo the normalization.
+    X = Normalizer(copy=False).fit_transform(X)
+
+    print("done in %fs" % (time() - t0))
+    print()
+
 
 ###############################################################################
 # Do the actual clustering
 
 if opts.minibatch:
     km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
-                         init_size=1000,
-                         batch_size=1000, verbose=1)
+                         init_size=1000, batch_size=1000, verbose=opts.verbose)
 else:
     km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
-                verbose=1)
+                verbose=opts.verbose)
 
 print("Clustering sparse data with %s" % km)
 t0 = time()
sklearn/cluster/_feature_agglomeration.py (+3 -11)

@@ -9,6 +9,7 @@
 
 from ..base import TransformerMixin
 from ..utils import array2d
+from ..utils.fixes import unique
 
 
 ###############################################################################

@@ -60,14 +61,5 @@ def inverse_transform(self, Xred):
             A vector of size nb_samples with the values of Xred assigned to
             each of the cluster of samples.
         """
-        if np.size((Xred.shape)) == 1:
-            X = np.zeros([self.labels_.shape[0]])
-        else:
-            X = np.zeros([Xred.shape[0], self.labels_.shape[0]])
-        unil = np.unique(self.labels_)
-        for i in range(len(unil)):
-            if np.size((Xred.shape)) == 1:
-                X[self.labels_ == unil[i]] = Xred[i]
-            else:
-                X[:, self.labels_ == unil[i]] = array2d(Xred[:, i]).T
-        return X
+        unil, inverse = unique(self.labels_, return_inverse=True)
+        return Xred[..., inverse]
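
Note: the refactor above replaces the per-cluster loop with one fancy-indexing
expression; the ``...`` (Ellipsis) lets the same line handle both 1-D and 2-D
``Xred``. A standalone sketch with illustrative values:

    import numpy as np

    labels = np.array([1, 1, 0, 2, 0])  # cluster label of each original feature
    Xred = np.array([[10., 20., 30.]])  # one column per cluster
    unil, inverse = np.unique(labels, return_inverse=True)
    # inverse[j] indexes the cluster (and thus the Xred column) of feature j,
    # so one fancy-indexing step expands Xred back to the original features.
    print(Xred[..., inverse])  # -> [[20. 20. 10. 30. 10.]]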

sklearn/cluster/_hierarchical.pyx (+2 -2)

@@ -47,8 +47,8 @@ def _hc_get_descendent(int node, children, int n_leaves):
     n_leaves : int
         Number of leaves.
 
-    Return
-    ------
+    Returns
+    -------
     descendent : list of int
     """
     ind = [node]

sklearn/cluster/tests/test_hierarchical.py (+2 -0)

@@ -14,6 +14,7 @@
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_equal
+from sklearn.utils.testing import assert_array_almost_equal
 
 from sklearn.cluster import Ward, WardAgglomeration, ward_tree
 from sklearn.cluster.hierarchical import _hc_cut

@@ -119,6 +120,7 @@ def test_ward_agglomeration():
     assert_true(Xred.shape[1] == 5)
     Xfull = ward.inverse_transform(Xred)
     assert_true(np.unique(Xfull[0]).size == 5)
+    assert_array_almost_equal(ward.transform(Xfull), Xred)
 
 
 def assess_same_labelling(cut1, cut2):

sklearn/covariance/empirical_covariance_.py (+4 -4)

@@ -23,8 +23,8 @@
 def log_likelihood(emp_cov, precision):
     """Computes the log_likelihood of the data
 
-    Params
-    ------
+    Parameters
+    ----------
     emp_cov: 2D ndarray (n_features, n_features)
         Maximum Likelihood Estimator of covariance
     precision: 2D ndarray (n_features, n_features)

@@ -101,8 +101,8 @@ def _set_covariance(self, covariance):
         Storage is done accordingly to `self.store_precision`.
         Precision stored only if invertible.
 
-        Params
-        ------
+        Parameters
+        ----------
         covariance: 2D ndarray, shape (n_features, n_features)
             Estimated covariance matrix to be stored, and from which precision
             is computed.
